diff --git a/.gitignore b/.gitignore index 21bd1f0b6..4ccfa6ce6 100644 --- a/.gitignore +++ b/.gitignore @@ -38,3 +38,4 @@ nv_gpu_build ios_build gpu_build output +third-party/extern_miopen diff --git a/CMakeLists.txt b/CMakeLists.txt index 050da5434..45ea6a020 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,6 +126,8 @@ anakin_option(USE_MKL "Use mkl libs." NO if USE_X86_PLACE) anakin_option(USE_MKLML "Use MKLML libs." YES if USE_X86_PLACE) anakin_option(USE_XBYAK "Use XBYAK libs." YES if USE_X86_PLACE) anakin_option(USE_OPENMP "Use Openmp when in android environment." YES if TARGET_ANDROID) +anakin_option(USE_OPENSSL "Use OpenSSL." YES if AMD_GPU) +anakin_option(USE_MIOPENGEMM "Use MIOpenGEMM ." YES if AMD_GPU) # build components anakin_option(BUILD_WITH_UNIT_TEST "Build anakin unit test components." YES) @@ -148,6 +150,10 @@ if(ENABLE_MIN_DEPENDENCY) set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--version-script,${ANAKIN_ROOT}/cmake/ak_link.lds") endif() +# for AMD profile computing performance +anakin_option(ENABLE_AMD_PROFILING "Enable amd profiling" NO) +anakin_option(ENABLE_AMD_DEBUG "Enable amd debug" NO) + # ---------------------------------------------------------------------------- # section: anakin compiler and linker options # ---------------------------------------------------------------------------- @@ -201,6 +207,7 @@ endif() if(AMD_GPU) include(cmake/amd.cmake) + include(cmake/external/miopen.cmake) endif() # gather all the config options to anakin diff --git a/cmake/amd.cmake b/cmake/amd.cmake index 1ebc7bf56..8d89f917c 100644 --- a/cmake/amd.cmake +++ b/cmake/amd.cmake @@ -30,24 +30,77 @@ macro(amd_set_opencl_path) #endif() endmacro() -macro(amd_build_cl_file file_path dest_path) - FILE(GLOB CL_FILES ${file_path}/*.cl) - message(STATUS "found cl files: ${CL_FILES}") - foreach(src_file ${CL_FILES}) - get_filename_component(src_file_name ${src_file} NAME) - message(STATUS "copy ${src_file} to : ${dest_path}/${src_file_name}") - configure_file( ${absdir}/${src_file} ${dest_path}/${src_file_name} COPYONLY) - endforeach() +macro(amd_find_file regexps) + FILE(GLOB FOUND_FILES ${regexps}) endmacro() +function(add_kernels KERNEL_FILES) + set(INIT_KERNELS_LIST) -macro(amd_build_cl_binary_file file_path dest_path) - FILE(GLOB CL_FILES ${file_path}/*.so) - message(STATUS "found cl files: ${CL_FILES}") - foreach(src_file ${CL_FILES}) - get_filename_component(src_file_name ${src_file} NAME) - message(STATUS "copy ${src_file} to : ${dest_path}/${src_file_name}") - configure_file( ${absdir}/${src_file} ${dest_path}/${src_file_name} COPYONLY) + foreach(KERNEL_FILE ${KERNEL_FILES}) + if("${CMAKE_VERSION}" VERSION_LESS 3.0) + configure_file(${KERNEL_FILE} ${KERNEL_FILE}.delete) + else() + set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${KERNEL_FILE}) + endif() + get_filename_component(BASE_NAME ${KERNEL_FILE} NAME_WE) + string(TOUPPER "${BASE_NAME}" KEY_NAME) + string(MAKE_C_IDENTIFIER "${KEY_NAME}" VAR_NAME) + list(APPEND INIT_KERNELS_LIST " { \"${KEY_NAME}\", std::string(reinterpret_cast(${VAR_NAME}), ${VAR_NAME}_SIZE) }") endforeach() + string(REPLACE ";" ",\n" INIT_KERNELS "${INIT_KERNELS_LIST}") + configure_file("${CMAKE_SOURCE_DIR}/saber/core/impl/amd/utils/amd_kernels.cpp.in" ${PROJECT_BINARY_DIR}/amd_kernels.cpp) +endfunction() + +macro(generate_amd_kernel_src) + set(AMD_KERNELS) + set(AMD_KERNEL_INCLUDES) + + set(FILE_REGEXP + "${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/cl/*.cl" + "${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/cl/*.s" + 
"${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/lib/*.so" + "${CMAKE_SOURCE_DIR}/test/saber/amd/*.cl" + ) + amd_find_file("${FILE_REGEXP}") + list(APPEND AMD_KERNELS ${FOUND_FILES}) + + set(FILE_REGEXP + "${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/cl/*.inc" + ) + amd_find_file("${FILE_REGEXP}") + list(APPEND AMD_KERNEL_INCLUDES ${FOUND_FILES}) + + #message(STATUS =======${AMD_KERNELS}======) + #message(STATUS =======${AMD_KERNEL_INCLUDES}======) + add_kernels("${AMD_KERNELS}") + add_custom_command( + OUTPUT ${PROJECT_BINARY_DIR}/amd_kernels.h + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS ${MIOPEN_PROJECT} ${AMD_KERNELS} ${AMD_KERNEL_INCLUDES} + COMMAND ${MIOPEN_BINARY_DIR}/bin/addkernels -guard GUARD_AMD_KERNELS_HPP_ -target ${PROJECT_BINARY_DIR}/amd_kernels.h -source ${AMD_KERNELS} + COMMENT "Inlining AMD kernels" + ) + add_custom_target(amd_kernels DEPENDS ${PROJECT_BINARY_DIR}/amd_kernels.h) + include_directories($PROJECT_BINARY_DIR}/amd_kernels.h) + list(APPEND ANAKIN_SABER_DEPENDENCIES amd_kernels) + list(APPEND ANAKIN_SABER_BASE_SRC ${PROJECT_BINARY_DIR}/amd_kernels.cpp) +endmacro() + +macro(amd_set_miopengemm_path) + if(NOT DEFINED MIOPENGEMM_INCLUDE_DIR) + set(MIOPENGEMM_INCLUDE_DIR "/opt/rocm/miopengemm/include") + endif() + if(NOT DEFINED MIOPENGEMM_LIBRARY) + set(MIOPENGEMM_LIBRARY "/opt/rocm/miopengemm/lib64/libmiopengemm.so") + endif() + + #FIND_PACKAGE(miopengemm REQUIRED) + #if(miopengemm_FOUND) + # message(STATUS "Found miopengemm in ${MIOPENGEMM_INCLUDE_DIR}") + # message(STATUS "Found miopengemm lib in ${MIOPENGEMM_LIBRARY}") + # include_directories(${MIOPENGEMM_INCLUDE_DIR}) + # LINK_LIBRARIES(${MIOPENGEMM_LIBRARY}) + #endif() endmacro() diff --git a/cmake/config/anakin_config.h.in b/cmake/config/anakin_config.h.in index d96e231bf..92aa8010e 100644 --- a/cmake/config/anakin_config.h.in +++ b/cmake/config/anakin_config.h.in @@ -55,6 +55,7 @@ #cmakedefine USE_GFLAGS +#cmakedefine USE_MIOPENGEMM // plantform to use #cmakedefine USE_GPU_PLACE @@ -83,6 +84,10 @@ // build arm lite #cmakedefine BUILD_LITE +// for AMD profiling computing performance +#cmakedefine ENABLE_AMD_PROFILING + +#cmakedefine ENABLE_AMD_DEBUG #if defined(ANDROID) || defined(__ANDROID__) #define PLATFORM_ANDROID @@ -114,4 +119,3 @@ #endif #endif - diff --git a/cmake/external/miopen.cmake b/cmake/external/miopen.cmake old mode 100644 new mode 100755 index e76acb428..71c2a53f9 --- a/cmake/external/miopen.cmake +++ b/cmake/external/miopen.cmake @@ -14,50 +14,27 @@ include(ExternalProject) set(MIOPEN_PROJECT extern_miopen) -set(MIOPEN_PREFIX_DIR ${ANAKIN_TEMP_THIRD_PARTY_PATH}/miopen) -set(MIOPEN_INSTALL_ROOT ${ANAKIN_THIRD_PARTY_PATH}/miopen) -set(MIOPEN_SOURCE_DIR ${MIOPEN_PREFIX_DIR}/src/${MIOPEN_PROJECT}) -set(MIOPEN_BINARY_DIR ${MIOPEN_PREFIX_DIR}/src/${MIOPEN_PROJECT}-build) +set(MIOPEN_PREFIX_DIR ${ANAKIN_THIRD_PARTY_PATH}) +set(MIOPEN_INSTALL_ROOT ${ANAKIN_ROOT}/output/miopen) +set(MIOPEN_SOURCE_DIR ${ANAKIN_THIRD_PARTY_PATH}/${MIOPEN_PROJECT}) +set(MIOPEN_STAMP_DIR ${ANAKIN_TEMP_THIRD_PARTY_PATH}/${MIOPEN_PROJECT}/stamp) +set(MIOPEN_BINARY_DIR ${ANAKIN_TEMP_THIRD_PARTY_PATH}/${MIOPEN_PROJECT}/build) set(MIOPEN_LIB ${MIOPEN_INSTALL_ROOT}/lib/libMIOpen.so CACHE FILEPATH "miopen library." 
FORCE) -if(NOT Boost_FOUND) - set(BOOST_ROOT ${BOOST_INSTALL_ROOT} CACHE FILEPATH "boost library/" FORCE) -endif() - message(STATUS "Scanning external modules ${Green}MIOPEN${ColourReset} ...") -ExternalProject_Add( - ${MIOPEN_PROJECT}_customize - GIT_REPOSITORY "ssh://git@icode.baidu.com:8235/baidu/third-party/miopen" - GIT_TAG "cbd4e7dbad0599c7327cb43888476ab8d966f285" - PREFIX ${ANAKIN_TEMP_THIRD_PARTY_PATH}/miopen/customize_miopen_file - SOURCE_DIR ${ANAKIN_THIRD_PARTY_PATH}/miopen/customize_miopen_file - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" -) - ExternalProject_Add( ${MIOPEN_PROJECT} - DEPENDS ${MIOPEN_PROJECT}_customize - GIT_REPOSITORY "ssh://git@icode.baidu.com:8235/baidu/third-party/miopen" - GIT_TAG 1.4.2 + GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/Anakin.git + GIT_TAG anakin-amd_miopen PREFIX ${MIOPEN_PREFIX_DIR} - CMAKE_ARGS -DMIOPEN_BACKEND=OpenCL -DCMAKE_INSTALL_PREFIX=${MIOPEN_INSTALL_ROOT} -DCMAKE_INSTALL_LIBDIR=lib -DBOOST_ROOT=${BOOST_ROOT} - #LOG_DOWNLOAD 1 + STAMP_DIR ${MIOPEN_STAMP_DIR} + BINARY_DIR ${MIOPEN_BINARY_DIR} + CMAKE_ARGS -DMIOPEN_BACKEND=OpenCL -DCMAKE_INSTALL_PREFIX=${MIOPEN_INSTALL_ROOT} -DCMAKE_INSTALL_LIBDIR=lib LOG_BUILD 1 ) -ExternalProject_Add_Step( - ${MIOPEN_PROJECT} ${MIOPEN_PROJECT}_customize - DEPENDEES download - DEPENDERS build - COMMAND ${CMAKE_COMMAND} -E copy_directory ${ANAKIN_THIRD_PARTY_PATH}/miopen/customize_miopen_file ${MIOPEN_SOURCE_DIR} - ALWAYS 1 - EXCLUDE_FORM_MAIN 1 - LOG 1 -) include_directories(${MIOPEN_INSTALL_ROOT}/include) add_library(miopen SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET miopen PROPERTY IMPORTED_LOCATION ${MIOPEN_LIB}) diff --git a/cmake/find_modules.cmake b/cmake/find_modules.cmake index 2f0790b5b..62d0fc8ae 100644 --- a/cmake/find_modules.cmake +++ b/cmake/find_modules.cmake @@ -440,10 +440,20 @@ macro(anakin_find_bmlib) list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/app/libbmrt.so) list(APPEND ANAKIN_LINKER_LIBS ${BM_LIBRARIES}) else() - message(FATAL_ERROR "Could not found bm_lib") - endif() + message(FATAL_ERROR "Could not find bm_lib") + endif() endmacro() +macro(anakin_find_openssl) + find_package(OpenSSL REQUIRED) + if(OPENSSL_FOUND) + message(STATUS "Found openssl in ${OPENSSL_INCLUDE_DIR}") + list(APPEND ANAKIN_LINKER_LIBS ${OPENSSL_LIBRARIES}) + include_directories(${OPENSSL_INCLUDE_DIR}) + else() + message(FATAL_ERROR "Could not find openssl!") + endif() +endmacro() macro(anakin_find_nvinfer) find_path(NVINFER_INCLUDE_DIR NvInfer.h PATHS ${ANAKIN_ROOT}/third-party/tensorrt5/include @@ -488,3 +497,14 @@ macro(anakin_find_nvinfer) endif() endmacro() +#find miopengemm +macro(anakin_find_miopengemm) + find_package(miopengemm REQUIRED PATHS /opt/rocm) + if(miopengemm_FOUND) + set(MIOPENGEMM_FOUND TRUE) + message(STATUS "Found miopengemm: ${miopengemm_INCLUDE_DIR}") + include_directories(SYSTEM ${miopengemm_INCLUDE_DIR}) + list(APPEND ANAKIN_LINKER_LIBS ${miopengemm_LIBRARIES}) + endif() +endmacro() + diff --git a/cmake/gather.cmake b/cmake/gather.cmake index 32b03c05f..1754243ea 100644 --- a/cmake/gather.cmake +++ b/cmake/gather.cmake @@ -28,13 +28,9 @@ if(USE_BM_PLACE) anakin_find_bmlib() endif() -# set amd opencl path -if(AMD_GPU) - amd_build_cl_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/cl" "${CMAKE_BINARY_DIR}/cl/amd") - amd_build_cl_binary_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/lib" "${CMAKE_BINARY_DIR}/cl/amd") - amd_build_cl_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/cl" "${PROJECT_SOURCE_DIR}/${AK_OUTPUT_PATH}/unit_test") -
amd_build_cl_binary_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/lib" "${PROJECT_SOURCE_DIR}/${AK_OUTPUT_PATH}/unit_test") - amd_build_cl_file("${CMAKE_SOURCE_DIR}/test/saber/amd" "${PROJECT_SOURCE_DIR}/${AK_OUTPUT_PATH}/unit_test") +# find openssl +if(USE_OPENSSL) + anakin_find_openssl() +endif() + +# find openssl +if(USE_OPENSSL) + anakin_find_openssl() endif() # find opencl @@ -105,3 +106,8 @@ endif() if(USE_TENSORRT) anakin_find_nvinfer() endif() + +# find miopengemm +if(USE_MIOPENGEMM) + anakin_find_miopengemm() +endif() diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt index 86d4b0836..05efe5f0c 100644 --- a/saber/CMakeLists.txt +++ b/saber/CMakeLists.txt @@ -42,6 +42,11 @@ if(USE_GPU_PLACE) else() anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/amd "cpp" ANAKIN_SABER_BASE_SRC) anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/amd "cpp" ANAKIN_SABER_BASE_SRC) + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/amd/utils "cpp" ANAKIN_SABER_BASE_SRC) + if(USE_OPENCL) + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/amd/utils/ocl "cpp" ANAKIN_SABER_BASE_SRC) + endif() + generate_amd_kernel_src() endif() endif() @@ -92,7 +97,7 @@ if(UNIX OR APPLE) if (BUILD_SHARED) ADD_LIBRARY(${ANAKIN_SABER_TEMP_COMMMON_LIB} SHARED ${ANAKIN_SABER_CUDA_C_SRC_OBJS} ${ANAKIN_SABER_BASE_SRC}) #$) - if(USE_X86_PLACE OR USE_CUDA) + if(USE_X86_PLACE OR USE_CUDA OR AMD_GPU) list(LENGTH ANAKIN_SABER_DEPENDENCIES dependencies_len) if(dependencies_len GREATER 0) add_dependencies(${ANAKIN_SABER_TEMP_COMMMON_LIB} ${ANAKIN_SABER_DEPENDENCIES}) diff --git a/saber/core/device.h b/saber/core/device.h index ced61d6a8..f3baef1b0 100644 --- a/saber/core/device.h +++ b/saber/core/device.h @@ -63,7 +63,7 @@ struct Device { typedef TargetWrapper API; - Device(int max_stream = 1); + Device(int max_stream = 4); void get_info(); void create_stream(); diff --git a/saber/core/env.h b/saber/core/env.h index edd72a3a4..ad225956c 100644 --- a/saber/core/env.h +++ b/saber/core/env.h @@ -56,55 +56,6 @@ class Env { Env(){} }; -#ifdef AMD_GPU -typedef std::list cl_event_list; - -template <> -class Env { -public: - typedef TargetWrapper AMD_API; - typedef std::vector> Devs; - static Devs& cur_env() { - static Devs* _g_env = new Devs(); - return *_g_env; - } - - static void env_init(int max_stream = 4); - static bool is_init(); - static cl_platform_id get_platform_id(); - - static void add_event(const char *tag, cl_event_list event); - static void add_event(cl_event_list event) { - add_event(mTag.c_str(), event); - } - - static void pop(); - static void set_tag(const char *tag){ - mTag = std::string(tag); - } - - static const std::string& get_tag(){ - return mTag; - } - - static void start_record(){ - record = true; - } - static void stop_record(){ - record = false; - } -private: - Env(){} - - static cl_platform_id platform_id; - static std::map> eMap; - static std::list tList; - static bool record; - static std::string mTag; - -}; -#endif - } //namespace saber } //namespace anakin diff --git a/saber/core/impl/amd/amd_device.cpp b/saber/core/impl/amd/amd_device.cpp index b3543ac23..6477e3c50 100644 --- a/saber/core/impl/amd/amd_device.cpp +++ b/saber/core/impl/amd/amd_device.cpp @@ -54,16 +54,14 @@ static void get_param(cl_device_id dev, cl_device_info param_name, T** param_val Device::Device(int max_stream) : _max_stream(max_stream) { - if (!Env::is_init()) { - return; - } //get cl device id; int nums = 0; AMD_API::get_device_count(nums); cl_device_id* device_ids = new cl_device_id[nums]; cl_uint 
device_nums; - clGetDeviceIDs(Env::get_platform_id(), CL_DEVICE_TYPE_GPU, (cl_uint)nums, device_ids, + cl_platform_id platform_id = TargetWrapper::get_platform_id(); + clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, (cl_uint)nums, device_ids, &device_nums); id = device_ids[AMD_API::get_device_id()]; delete []device_ids; @@ -71,12 +69,11 @@ Device::Device(int max_stream) : _max_stream(max_stream) { //init context, one by one mapping to device. cl_int errNum; - const cl_context_properties prop[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)Env::get_platform_id(), 0}; + const cl_context_properties prop[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, 0}; context = clCreateContext(prop, 1, &id, NULL, NULL, &errNum); CHECK(errNum == CL_SUCCESS); get_info(); - create_stream(); } void Device::create_stream() { @@ -87,8 +84,13 @@ void Device::create_stream() { typename AMD_API::stream_t stream_data; typename AMD_API::stream_t stream_compute; +#ifdef ENABLE_AMD_PROFILING API::_create_stream_with_flag(&stream_data, context, id, CL_QUEUE_PROFILING_ENABLE); API::_create_stream_with_flag(&stream_compute, context, id, CL_QUEUE_PROFILING_ENABLE); +#else + API::_create_stream_with_flag(&stream_data, context, id, 0); + API::_create_stream_with_flag(&stream_compute, context, id, 0); +#endif _data_stream.push_back(stream_data); _compute_stream.push_back(stream_compute); } diff --git a/saber/core/impl/amd/amd_env.cpp b/saber/core/impl/amd/amd_env.cpp deleted file mode 100644 index 6395de384..000000000 --- a/saber/core/impl/amd/amd_env.cpp +++ /dev/null @@ -1,221 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#include "core/env.h" -namespace anakin { - -namespace saber { - -#ifdef AMD_GPU - -typedef TargetWrapper AMD_API; -typedef Env AMD_ENV; - -cl_platform_id AMD_ENV::platform_id = NULL; - -void AMD_ENV::env_init(int max_stream) { - Devs& devs = cur_env(); - - if (devs.size() > 0) { - return; - } - - platform_id = AMD_API::get_platform_id(); - - int count = 0; - AMD_API::get_device_count(count); - - if (count == 0) { - LOG(WARNING) << "no device found!"; - } else { - LOG(INFO) << "found " << count << " device(s)"; - } - - int cur_id = AMD_API::get_device_id(); - - for (int i = 0; i < count; i++) { - AMD_API::set_device(i); - devs.push_back(Device(max_stream)); - } - - AMD_API::set_device(cur_id); -} - -bool AMD_ENV::is_init() { - CHECK(platform_id != NULL); - return true; -} - -cl_platform_id AMD_ENV::get_platform_id() { - if (!is_init()) { - return NULL; - } - - return platform_id; -} - -bool AMD_ENV::record = false; -std::string AMD_ENV::mTag; -std::list AMD_ENV::tList; -std::map> AMD_ENV::eMap; - -void AMD_ENV::add_event(const char* tag, cl_event_list event) { - if (!record) { - return; - } - - std::map>::iterator it; - it = eMap.find(std::string(tag)); - - if (it != eMap.end()) { - it->second.push_back(event); - } else { - LOG(INFO) << "record [" << tList.size() << "]=" << tag; - tList.push_back(std::string(tag)); - std::list list; - list.push_back(event); - eMap[std::string(tag)] = list; - } -} -void AMD_ENV::pop() { - std::map>::iterator it; - size_t t_size; - size_t e_size; - size_t s_size; - size_t size; - float executionTime = 0; - float waitTime = 0; - float g_execute = 0; - float g_wait = 0; - cl_ulong submit; - cl_ulong start; - cl_ulong end; - cl_ulong wait; - cl_ulong execute; - CHECK(tList.size() == eMap.size()); - t_size = tList.size(); - std::string log; - log.append("\n"); - std::string tmp; - - for (int i = 0 ; i < t_size; i++) { - waitTime = executionTime = 0; - std::string tag = tList.front(); - it = eMap.find(tag); - std::list list = it->second; - e_size = list.size(); - - cl_ulong* s_waits = NULL, *s_executes = NULL; - - for (int j = 0 ; j < e_size; j++) { - cl_event_list eList = list.front(); - - s_size = eList.size(); - - if (s_size > 1) { - - if (s_waits == NULL) { - s_waits = new cl_ulong[s_size]; - s_executes = new cl_ulong[s_size]; - memset(s_waits, 0, s_size * sizeof(cl_ulong)); - memset(s_executes, 0, s_size * sizeof(cl_ulong)); - } - - int tmps = 0; - - for (cl_event_list::iterator ite = eList.begin(); ite != eList.end(); ite++) { - - cl_event event = *ite; - clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submit, NULL); - clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); - clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); - - s_waits[tmps] += (start - submit); - s_executes[tmps] += (end - start); - tmps++; - } - - cl_event eventS = eList.front(); - cl_event eventE = eList.back(); - clGetEventProfilingInfo(eventS, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); - clGetEventProfilingInfo(eventE, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); - execute = end - start; - - clGetEventProfilingInfo(eventS, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submit, NULL); - wait = start - submit; - - clGetEventProfilingInfo(eventS, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &end, NULL); - clGetEventProfilingInfo(eventE, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &start, NULL); - wait += start - end; - } else { - - cl_event event 
= eList.front(); - clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submit, NULL); - clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); - clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); - - wait = start - submit; - execute = end - start; - } - - eList.clear(); - - executionTime += (execute) * 1e-6; - waitTime += (wait) * 1e-6; - list.pop_front(); - //LOG(INFO) << tag << ":" << "["<< wait <<", " << execute <<"]"; - } - - //LOG(INFO) << "[" << i << "]" << tag << " avg - wait :"<< waitTime/e_size << " ms, execute " << executionTime/e_size <<" ms"; - tmp = std::string("[") + std::to_string(i) + std::string("]\t") + \ - tag + std::string("\t") + std::to_string(executionTime / e_size) + std::string(" ms\n"); - - if (s_size > 1) { - for (int s = 0; s < s_size; s++) { - tmp.append(std::string("--[") + std::to_string(i) + std::string("-") + std::to_string( - s) + std::string("]\t\t")); - tmp.append(std::to_string((float)s_executes[s] * 1e-6 / e_size) + std::string(" ms\n")); - } - - delete []s_waits; - delete []s_executes; - s_waits = NULL; - s_executes = NULL; - } - - log.append(tmp); - - g_wait += (waitTime / e_size); - g_execute += (executionTime / e_size); - //LOG(INFO) << "[" << i << "]" << tag << '\t' << " avg - execute " << '\t'<< executionTime/e_size <<" ms"; - - tList.pop_front(); - } - - tmp = std::string("[Total]\t\t") + \ - std::to_string(g_execute) + std::string(" ms\n"); - log.append(tmp); - LOG(INFO) << log; -} - - - -//template void AMD_ENV::evn_init(); - -#endif // AMD_GPU - -} //namespace saber -} //namespace anakin - diff --git a/saber/core/impl/amd/amd_impl.cpp b/saber/core/impl/amd/amd_impl.cpp index a22146104..f6d6d3732 100644 --- a/saber/core/impl/amd/amd_impl.cpp +++ b/saber/core/impl/amd/amd_impl.cpp @@ -1,8 +1,23 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ #include "core/tensor.h" #include "core/common.h" #include "core/buffer.h" #include "core/data_traits.h" #include "env.h" +#include "utils/amd_logger.h" namespace anakin { @@ -49,15 +64,13 @@ const char* opencl_get_error_string(cl_int err) { } return "Unknown cl error"; - } - /** * \brief for AMD device target only, device target is AMD gpu * use opencl api to manage memory * support device to device, device to host, host to device memcpy -*/ + */ typedef TargetWrapper AMD_API; typedef Env AMD_ENV; @@ -65,7 +78,7 @@ int AMD_API::current_device_id_index = 0; std::map AMD_API::buffers; void AMD_API::get_device_count(int& count) { - cl_platform_id id = AMD_ENV::get_platform_id(); + cl_platform_id id = AMD_API::get_platform_id(); cl_uint nums; AMD_CHECK(clGetDeviceIDs(id, CL_DEVICE_TYPE_GPU, 0, NULL, &nums)); count = (int)nums; @@ -77,12 +90,11 @@ void AMD_API::set_device(int id) { } void AMD_API::mem_alloc(TPtr* ptr, size_t n) { - AMD_ENV::is_init(); -#ifdef AMD_GPU_EXTENSION - //LOG(INFO) << "use CL_MEM_USE_PERSISTENT_MEM_AMD to create buffer."; +#ifdef AMD_GPU_EXTENSION + // LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "use CL_MEM_USE_PERSISTENT_MEM_AMD to create buffer."; #else - //LOG(INFO) << "use CL_MEM_ALLOC_HOST_PTR to create buffer."; + // LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "use CL_MEM_ALLOC_HOST_PTR to create buffer."; #endif int index = get_device_id(); @@ -90,24 +102,31 @@ void AMD_API::mem_alloc(TPtr* ptr, size_t n) { cl_context context = AMD_ENV::cur_env()[index].get_context(); cl_int err; - cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE + cl_mem buf = clCreateBuffer( + context, + CL_MEM_READ_WRITE #ifdef AMD_GPU_EXTENSION - | CL_MEM_USE_PERSISTENT_MEM_AMD + | CL_MEM_USE_PERSISTENT_MEM_AMD #else - | CL_MEM_ALLOC_HOST_PTR + | CL_MEM_ALLOC_HOST_PTR #endif - , n, NULL, &err); + , + n, + NULL, + &err); AMD_CHECK(err); *ptr = buf; - LOG(INFO) << __func__ << "device =" << index << " get context :" << context << " buffer :" << buf << - " size :" << n; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << "device =" << index << " get context :" << + context << " buffer :" << buf + << " size :" << n; } void AMD_API::mem_free(TPtr ptr) { if (ptr != nullptr) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " buffer :" << ptr; clReleaseMemObject(ptr); } } @@ -116,27 +135,23 @@ void AMD_API::mem_free(TPtr ptr) { void AMD_API::mem_set(TPtr ptr, int value, size_t n) { if (ptr == nullptr) { - return ; + return; } - AMD_ENV::is_init(); - Device dev = AMD_ENV::cur_env()[current_device_id_index]; - stream_t cm = dev.get_available_stream(); + stream_t cm = dev.get_available_stream(); clEnqueueFillBuffer(cm, ptr, &value, sizeof(int), 0, n, 0, NULL, NULL); } #else -template +template void AMD_API::mem_set(TPtr ptr, U value, size_t n) { if (ptr == nullptr) { - return ; + return; } - AMD_ENV::is_init(); - Device dev = AMD_ENV::cur_env()[current_device_id_index]; stream_t cm = dev.get_available_stream(stream); @@ -148,14 +163,12 @@ void AMD_API::mem_set(TPtr ptr, U value, size_t n) { void AMD_API::create_event(event_t* event, bool flag) { - LOG(INFO) << "create_event break opencl call sequence. Is baidu expect clCreateUserEvent?"; - //do nothing for this. + // do nothing for this. 
*event = nullptr; - //Env::is_init(); - //cl_int err = CL_SUCCESS; - //event = clCreaeUserEvent(AMD_ENV::cur_env()[current_device_id_index].context, &err); - //AMD_CHECK(err); + // cl_int err = CL_SUCCESS; + // event = clCreaeUserEvent(AMD_ENV::cur_env()[current_device_id_index].context, &err); + // AMD_CHECK(err); } void AMD_API::create_stream(stream_t* stream) { @@ -168,7 +181,6 @@ void AMD_API::create_stream(stream_t* stream) { * @param flag input flag, 0: default stream flag, 1: cudaStreamNonBlocking */ void AMD_API::create_stream_with_flag(stream_t* stream, unsigned int flag) { - Env::is_init(); cl_int err = CL_SUCCESS; if (!stream) { @@ -204,13 +216,12 @@ void AMD_API::destroy_stream(stream_t stream) { } void AMD_API::destroy_event(event_t event) { - // LOG(INFO) << __func__ <<" :Does baidu expect this event is an User Event?"; - if (event == nullptr) { - // LOG(INFO) << "event is empty, do nothing"; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "event is empty, do nothing"; return; } + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " " << event; cl_command_type t; AMD_CHECK(clGetEventInfo(event, CL_EVENT_COMMAND_TYPE, sizeof(cl_command_type), &t, NULL)); @@ -221,51 +232,53 @@ void AMD_API::destroy_event(event_t event) { if (refs == 1) { AMD_CHECK(clSetUserEventStatus(event, CL_COMPLETE)); } - - AMD_CHECK(clReleaseEvent(event)); - } else { - // LOG(INFO) << "NOT User Event, do nothing"; } + AMD_CHECK(clReleaseEvent(event)); } -void AMD_API::record_event(event_t event, stream_t stream) { - //LOG(WARNING) << "OpenCL record event when calling clEnqueueXXX, so we use marker to simulate this behavior"; +void AMD_API::record_event(event_t& event, stream_t stream) { + // LOG(WARNING) << "OpenCL record event when calling clEnqueueXXX, so we use marker to simulate + // this behavior"; AMD_CHECK(clEnqueueMarkerWithWaitList(stream, 0, NULL, &event)); - //LOG(INFO) << "marker event "<< event; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "marker event " << event; } void AMD_API::query_event(event_t event) { // TODO - LOG(ERROR) << - "OpenCL us clGetEventInfo to retrive event's specific info. 
so we need to know what info user want to know"; + LOG(WARNING) << "device target AMD \" query_event\" is not implemented"; } void AMD_API::sync_event(event_t event) { - // LOG(INFO) << __func__ ; if (event == nullptr) { - LOG(INFO) << "event is empty, do nothing"; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " event is empty, do nothing"; return; } - // LOG(INFO) << "sync_event E " << event; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "sync_event E " << event; AMD_CHECK(clWaitForEvents(1, &event)); - // LOG(INFO) << "sync_event X " << event; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "sync_event X " << event; } void AMD_API::sync_stream(event_t event, stream_t stream) { - LOG(INFO) << __func__ ; - - if (event != nullptr) { - LOG(INFO) << "event is null"; + if (stream == NULL) { + LOG(INFO) << __func__ << " stream is empty, do nothing"; return; } - LOG(INFO) << "sync_stream E "; - AMD_CHECK(clEnqueueBarrierWithWaitList(stream, 1, &event, NULL)); + LOG(INFO) << "sync_stream E"; + event_t revent; + + if (event == NULL) { + AMD_CHECK(clEnqueueBarrierWithWaitList(stream, 0, NULL, &revent)); + } else { + AMD_CHECK(clEnqueueBarrierWithWaitList(stream, 1, &event, &revent)); + } + clFlush(stream); - LOG(INFO) << "sync_stream D "; + AMD_API::sync_event(revent); + LOG(INFO) << "sync_stream D"; } #if 0 @@ -314,8 +327,14 @@ void AMD_API::sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ const TPtr src, size_t src_offset, int src_id, \ size_t count, __DtoD) { // TODO - LOG(INFO) << __func__ << " D2D dst=" << dst << " dst_id=" << dst_id << " dst_office=" << dst_offset - << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset << " count=" << count; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " D2D dst=" << dst << " dst_id=" << dst_id << + " dst_office=" << dst_offset + << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset + << " count=" << count; + + if (count == 0) { + return; + } cl_mem dst_mem = (cl_mem)dst; cl_mem src_mem = (cl_mem)src; @@ -327,6 +346,7 @@ void AMD_API::sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ &event)); clFlush(cm); clWaitForEvents(1, &event); + clReleaseEvent(event); LOG(INFO) << "OpenCL, sync, D2D, size: " << count; } else { cl_command_queue dst_cm = AMD_ENV::cur_env()[dst_id].get_available_stream(); @@ -343,9 +363,9 @@ void AMD_API::sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ clFlush(src_cm); clFlush(dst_cm); clWaitForEvents(1, &event); + clReleaseEvent(event); LOG(INFO) << "OpenCL, sync, P2P, size: " << count; } - } #endif @@ -360,9 +380,10 @@ void AMD_API::async_memcpy(TPtr dst, int dst_id, const TPtr src, int src_id, \ size_t src_offset = src.offset; //async_memcpy_with_offset(dst, dst_id, 0, src, src_id, 0, count, stream, __DtoD()); - LOG(INFO) << __func__ << " D2D dst=" << (void*)dst_mem << " dst_id=" << dst_id << " dst_office=" << - dst_offset << " src=" << (void*)src_mem << " src_id=" << src_id << " src_offset=" << src_offset << - " count=" << count; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " D2D dst=" << (void*)dst_mem << " dst_id=" << + dst_id << " dst_office=" << + dst_offset << " src=" << (void*)src_mem << " src_id=" << src_id << " src_offset=" << src_offset << + " count=" << count; //cl_mem dst_mem = (cl_mem) dst; //cl_mem src_mem = (cl_mem) src; @@ -395,15 +416,22 @@ void AMD_API::async_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ const TPtr src, size_t src_offset, int src_id, \ size_t count, stream_t stream, __DtoD) { - LOG(INFO) << __func__ << " D2D dst=" << dst << 
" dst_id=" << dst_id << " dst_office=" << dst_offset - << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset << " count=" << count; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " D2D dst=" << dst << " dst_id=" << dst_id << + " dst_office=" << dst_offset + << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset + << " count=" << count; + + if (count == 0) { + return; + } cl_mem dst_mem = (cl_mem)dst; cl_mem src_mem = (cl_mem)src; if (dst_id == src_id) { cl_command_queue cm = AMD_ENV::cur_env()[dst_id].get_available_stream(stream); - AMD_CHECK(clEnqueueCopyBuffer(cm, src_mem, dst_mem, src_offset, dst_offset, count, 0, NULL, NULL)); + AMD_CHECK(clEnqueueCopyBuffer( + cm, src_mem, dst_mem, src_offset, dst_offset, count, 0, NULL, NULL)); clFlush(cm); LOG(INFO) << "OpenCL, sync, D2D, size: " << count; } else { @@ -413,22 +441,19 @@ void AMD_API::async_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ cl_int err; cl_event dst_event; cl_event src_event; - cl_event src_event2; void* host_ptr = clEnqueueMapBuffer(src_cm, src_mem, CL_FALSE, CL_MAP_READ, src_offset, count, 0, NULL, &dst_event, &err); AMD_CHECK(err); AMD_CHECK(clEnqueueWriteBuffer(dst_cm, dst_mem, CL_FALSE, dst_offset, count, host_ptr, 1, &dst_event, &src_event)); - AMD_CHECK(clEnqueueUnmapMemObject(src_cm, src_mem, host_ptr, 1, &src_event2, NULL)); + AMD_CHECK(clEnqueueUnmapMemObject(src_cm, src_mem, host_ptr, 1, &src_event, NULL)); clFlush(src_cm); clFlush(dst_cm); LOG(INFO) << "OpenCL, sync, P2P, size: " << count; } - } #endif - #if 0 void AMD_API::sync_memcpy(TPtr dst, int dst_id, const void* src, int src_id, \ size_t count, __HtoD) { @@ -436,8 +461,9 @@ void AMD_API::sync_memcpy(TPtr dst, int dst_id, const void* src, int src_id, \ cl_mem dst_mem = dst.dmem; size_t dst_offset = dst.offset; - LOG(INFO) << __func__ << " H2D dst=" << (void*)dst_mem << " dst_id=" << dst_id << " dst_office=" << - dst_offset << " src=" << src << " src_id=" << src_id << " count=" << count; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " H2D dst=" << (void*)dst_mem << " dst_id=" << + dst_id << " dst_office=" << + dst_offset << " src=" << src << " src_id=" << src_id << " count=" << count; cl_event event; cl_command_queue dst_cm = AMD_ENV::cur_env()[dst_id].get_available_stream(); @@ -451,18 +477,24 @@ void AMD_API::sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ const void* src, size_t src_offset, int src_id, \ size_t count, __HtoD) { - LOG(INFO) << __func__ << " H2D dst=" << dst << " dst_id=" << dst_id << " dst_office=" << dst_offset - << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset << " count=" << count; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " H2D dst=" << dst << " dst_id=" << dst_id << + " dst_office=" << dst_offset + << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset + << " count=" << count; + + if (count == 0) { + return; + } cl_event event; cl_mem dst_mem = (cl_mem)dst; cl_command_queue dst_cm = AMD_ENV::cur_env()[dst_id].get_available_stream(); - clEnqueueWriteBuffer(dst_cm, dst_mem, CL_TRUE, dst_offset, count, (char*)src + src_offset, 0, NULL, - &event); + AMD_CHECK(clEnqueueWriteBuffer( + dst_cm, dst_mem, CL_TRUE, dst_offset, count, (char*)src + src_offset, 0, NULL, &event)); clFlush(dst_cm); clWaitForEvents(1, &event); + clReleaseEvent(event); LOG(INFO) << "OpenCL, sync, H2D, size: " << count; - } #endif @@ -473,8 +505,9 @@ void AMD_API::async_memcpy(TPtr dst, int dst_id, const void* src, int src_id, \ cl_mem dst_mem = 
dst.dmem; size_t dst_offset = dst.offset; - LOG(INFO) << __func__ << " H2D dst=" << (void*)dst_mem << " dst_id=" << dst_id << " dst_office=" << - dst_offset << " src=" << src << " src_id=" << src_id << " count=" << count; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " H2D dst=" << (void*)dst_mem << " dst_id=" << + dst_id << " dst_office=" << + dst_offset << " src=" << src << " src_id=" << src_id << " count=" << count; cl_command_queue dst_cm = AMD_ENV::cur_env()[dst_id].get_available_stream(stream); clEnqueueWriteBuffer(dst_cm, dst_mem, CL_FALSE, dst_offset, count, src, 0, NULL, NULL); @@ -487,13 +520,19 @@ void AMD_API::async_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ const void* src, size_t src_offset, int src_id, \ size_t count, stream_t stream, __HtoD) { - LOG(INFO) << __func__ << " H2D dst=" << dst << " dst_id=" << dst_id << " dst_office=" << dst_offset - << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset << " count=" << count; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " H2D dst=" << dst << " dst_id=" << dst_id << + " dst_office=" << dst_offset + << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset + << " count=" << count; + + if (count == 0) { + return; + } cl_mem dst_mem = (cl_mem)dst; cl_command_queue dst_cm = AMD_ENV::cur_env()[dst_id].get_available_stream(stream); - clEnqueueWriteBuffer(dst_cm, dst_mem, CL_FALSE, dst_offset, count, (char*)src + src_offset, 0, NULL, - NULL); + AMD_CHECK(clEnqueueWriteBuffer( + dst_cm, dst_mem, CL_FALSE, dst_offset, count, (char*)src + src_offset, 0, NULL, NULL)); clFlush(dst_cm); LOG(INFO) << "OpenCL, async, H2D, size: " << count; } @@ -520,21 +559,27 @@ void AMD_API::sync_memcpy(void* dst, int dst_id, const TPtr src, int src_id, \ void AMD_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \ const TPtr src, size_t src_offset, int src_id, \ size_t count, __DtoH) { - LOG(INFO) << __func__ << " D2H dst=" << dst << " dst_id=" << dst_id << " dst_office=" << dst_offset - << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset << " count=" << count; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " D2H dst=" << dst << " dst_id=" << dst_id << + " dst_office=" << dst_offset + << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset + << " count=" << count; + + if (count == 0) { + return; + } cl_event event; cl_mem src_mem = (cl_mem)src; cl_command_queue src_cm = AMD_ENV::cur_env()[src_id].get_available_stream(); - clEnqueueReadBuffer(src_cm, src_mem, CL_TRUE, src_offset, count, (char*) dst + dst_offset, 0, NULL, - &event); + AMD_CHECK(clEnqueueReadBuffer( + src_cm, src_mem, CL_TRUE, src_offset, count, (char*)dst + dst_offset, 0, NULL, &event)); clFlush(src_cm); clWaitForEvents(1, &event); + clReleaseEvent(event); LOG(INFO) << "OpenCL, sync, D2H, size: " << count; } #endif - #if 0 void AMD_API::async_memcpy(void* dst, int dst_id, const TPtr src, int src_id, \ size_t count, stream_t& stream, __DtoH) { @@ -542,8 +587,9 @@ void AMD_API::async_memcpy(void* dst, int dst_id, const TPtr src, int src_id, \ cl_mem src_mem = src.dmem; size_t src_offset = src.offset; - LOG(INFO) << __func__ << " D2H dst=" << dst << " dst_id=" << dst_id << " src=" << - (void*)src_mem << " src_id=" << src_id << " src_offset=" << src_offset << " count=" << count; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " D2H dst=" << dst << " dst_id=" << dst_id << + " src=" << + (void*)src_mem << " src_id=" << src_id << " src_offset=" << src_offset << " count=" << count; 
//cl_mem src_mem = (cl_mem) src; cl_command_queue src_cm = AMD_ENV::cur_env()[src_id].get_available_stream(stream); @@ -556,14 +602,20 @@ void AMD_API::async_memcpy(void* dst, size_t dst_offset, int dst_id, \ const TPtr src, size_t src_offset, int src_id, \ size_t count, stream_t stream, __DtoH) { - LOG(INFO) << __func__ << " D2H dst=" << dst << " dst_id=" << dst_id << " dst_office=" << dst_offset - << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset << " count=" << count; - LOG(INFO) << __func__ << " stream: " << stream; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " D2H dst=" << dst << " dst_id=" << dst_id << + " dst_office=" << dst_offset + << " src=" << src << " src_id=" << src_id << " src_offset=" << src_offset + << " count=" << count; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " stream: " << stream; + + if (count == 0) { + return; + } cl_mem src_mem = (cl_mem)src; cl_command_queue src_cm = AMD_ENV::cur_env()[src_id].get_available_stream(stream); - clEnqueueReadBuffer(src_cm, src_mem, CL_FALSE, src_offset, count, (char*)dst + dst_offset, 0, NULL, - NULL); + AMD_CHECK(clEnqueueReadBuffer( + src_cm, src_mem, CL_FALSE, src_offset, count, (char*)dst + dst_offset, 0, NULL, NULL)); clFlush(src_cm); LOG(INFO) << "OpenCL, async, D2H, size: " << count; } @@ -598,6 +650,10 @@ void AMD_API::sync_memcpy_p2p(TPtr dst, int dst_dev, const TPtr src, \ void AMD_API::sync_memcpy_p2p(TPtr dst, size_t dst_offset, int dst_id, \ const TPtr src, size_t src_offset, int src_id, size_t count) { + if (count == 0) { + return; + } + cl_command_queue dst_cm = AMD_ENV::cur_env()[dst_id].get_available_stream(); cl_command_queue src_cm = AMD_ENV::cur_env()[src_id].get_available_stream(); @@ -613,11 +669,11 @@ void AMD_API::sync_memcpy_p2p(TPtr dst, size_t dst_offset, int dst_id, \ clFlush(src_cm); clFlush(dst_cm); clWaitForEvents(1, &event); + clReleaseEvent(event); LOG(INFO) << "OpenCL, sync, P2P, size: " << count; } #endif - #if 0 void AMD_API::async_memcpy_p2p(TPtr dst, int dst_dev, const TPtr src, \ int src_dev, size_t count, stream_t& stream) { @@ -648,6 +704,10 @@ void AMD_API::async_memcpy_p2p(TPtr dst, size_t dst_offset, int dst_id, \ const TPtr src, size_t src_offset, int src_id, \ size_t count, stream_t stream) { + if (count == 0) { + return; + } + cl_command_queue dst_cm = AMD_ENV::cur_env()[dst_id].get_available_stream(stream); cl_command_queue src_cm = AMD_ENV::cur_env()[src_id].get_available_stream(stream); @@ -669,13 +729,12 @@ void AMD_API::async_memcpy_p2p(TPtr dst, size_t dst_offset, int dst_id, \ } #endif - /** * \brief device target return currently used device id * @return currently activated device id */ int AMD_API::get_device_id() { - //LOG(INFO) << "get device id = " << current_device_id_index; + // LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "get device id = " << current_device_id_index; return current_device_id_index; } @@ -721,7 +780,7 @@ cl_platform_id AMD_API::get_platform_id() { AMD_CHECK(clGetPlatformInfo(platformIDs[i], CL_PLATFORM_NAME, infoSize, info, NULL)); if (strstr(info, "AMD") != NULL) { - id = platformIDs[i]; + id = platformIDs[i]; errNum = CL_SUCCESS; free(info); break; @@ -795,7 +854,6 @@ void AMD_API::init(){ } */ - typedef TargetWrapper AMDH_API; typedef Env AMDH_ENV; @@ -806,7 +864,7 @@ void AMDH_API::get_device_count(int& count) { } void AMDH_API::set_device(int id) { - //todo + // todo LOG(WARNING) << "host target AMDHX86 \" set_device\" is not implemented"; } @@ -820,21 +878,18 @@ void AMDH_API::mem_free(void* ptr) { } } - void 
AMDH_API::mem_set(void* ptr, int value, size_t n) { memset(ptr, value, n); } void AMDH_API::create_event(event_t* event, bool flag) { - LOG(INFO) << "create_event break opencl call sequence. Is baidu expect clCreateUserEvent?"; - //do nothing for this. + // do nothing for this. *event = nullptr; - //Env::is_init(); - //cl_int err = CL_SUCCESS; - //event = clCreaeUserEvent(AMD_ENV::cur_env()[current_device_id_index].context, &err); - //AMD_CHECK(err); + // cl_int err = CL_SUCCESS; + // event = clCreaeUserEvent(AMD_ENV::cur_env()[current_device_id_index].context, &err); + // AMD_CHECK(err); } void AMDH_API::create_stream(stream_t* stream) { @@ -847,7 +902,6 @@ void AMDH_API::create_stream(stream_t* stream) { * @param flag input flag, 0: default stream flag, 1: cudaStreamNonBlocking */ void AMDH_API::create_stream_with_flag(stream_t* stream, unsigned int flag) { - Env::is_init(); int current_device_id_index = 0; cl_int err = CL_SUCCESS; @@ -872,10 +926,11 @@ void AMDH_API::destroy_stream(stream_t stream) { void AMDH_API::destroy_event(event_t event) { if (event == nullptr) { - // LOG(INFO) << "event is empty, do nothing"; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "event is empty, do nothing"; return; } + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " " << event; cl_command_type t; AMD_CHECK(clGetEventInfo(event, CL_EVENT_COMMAND_TYPE, sizeof(cl_command_type), &t, NULL)); @@ -886,15 +941,12 @@ void AMDH_API::destroy_event(event_t event) { if (refs == 1) { AMD_CHECK(clSetUserEventStatus(event, CL_COMPLETE)); } - - AMD_CHECK(clReleaseEvent(event)); - } else { - // LOG(INFO) << "NOT User Event, do nothing"; } + AMD_CHECK(clReleaseEvent(event)); } -void AMDH_API::record_event(event_t event, stream_t stream) { +void AMDH_API::record_event(event_t& event, stream_t stream) { AMD_CHECK(clEnqueueMarkerWithWaitList(stream, 0, NULL, &event)); } @@ -904,30 +956,33 @@ void AMDH_API::query_event(event_t event) { } void AMDH_API::sync_event(event_t event) { - // LOG(INFO) << __func__ ; - if (event == nullptr) { - LOG(INFO) << "event is empty, do nothing"; + LOG(INFO) << __func__ << " event is empty, do nothing"; return; } - // LOG(INFO) << "sync_event E " << event; + LOG(INFO) << "sync_event E " << event; AMD_CHECK(clWaitForEvents(1, &event)); - // LOG(INFO) << "sync_event X " << event; + LOG(INFO) << "sync_event X " << event; } void AMDH_API::sync_stream(event_t event, stream_t stream) { - LOG(INFO) << __func__ ; - - if (event != nullptr) { - LOG(INFO) << "event is null"; + if (stream == NULL) { + LOG(INFO) << __func__ << " stream is empty, do nothing"; return; } - LOG(INFO) << "sync_stream E "; - AMD_CHECK(clEnqueueBarrierWithWaitList(stream, 1, &event, NULL)); + LOG(INFO) << "sync_stream E"; + event_t revent; + + if (event == NULL) { + AMD_CHECK(clEnqueueBarrierWithWaitList(stream, 0, NULL, &revent)); + } else { + AMD_CHECK(clEnqueueBarrierWithWaitList(stream, 1, &event, &revent)); + } + clFlush(stream); - LOG(INFO) << "sync_stream D "; + LOG(INFO) << "sync_stream D"; } void AMDH_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \ @@ -972,6 +1027,6 @@ template struct Env; #endif // AMD_GPU -} //namespace saber +} // namespace saber -} //namespace anakin +} // namespace anakin diff --git a/saber/core/impl/amd/utils/amd_base.h b/saber/core/impl/amd/utils/amd_base.h new file mode 100644 index 000000000..23c3fc422 --- /dev/null +++ b/saber/core/impl/amd/utils/amd_base.h @@ -0,0 +1,143 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_FUNCS_IMPL_AMD_UTILS_AMDBASE_H +#define ANAKIN_SABER_FUNCS_IMPL_AMD_UTILS_AMDBASE_H +#include +#include +#include "amd_logger.h" +#include + +namespace anakin { +namespace saber { + +typedef struct ExtSolutionConfigTpye { + int in_tile0, in_tile1; + int grp_tile0, grp_tile1; + int out_pix_tile0, out_pix_tile1; + int n_stacks; + int n_out_pix_tiles; + int n_out_tiles_perstack; + int n_in_data_tiles; + int n_read_procs; + int alu_tile0, alu_tile1; + int horiz_out_pix; + int vert_out_pix; +} T_ExtSolutionConfig; + +enum KernelType { + DEFAULT = 0, + SABER = DEFAULT, + SOURCE, + MIOPEN, +}; + +struct KernelInfo { + std::string comp_options; + int wk_dim = 1; + std::vector<size_t> l_wk; + std::vector<size_t> g_wk; + std::vector<size_t> g_wk_offset; + std::string kernel_file; + std::string kernel_name; + KernelType kernel_type = DEFAULT; + friend std::ostream& operator<<(std::ostream& os, const KernelInfo& k); + + KernelInfo& operator=(KernelInfo right) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "assign KernelInfo"; + clone(&right); return *this; + }; + KernelInfo& operator=(KernelInfo* right) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "assign KernelInfo *"; + clone(right); return *this; + }; + KernelInfo& operator=(miopen::solver::KernelInfo& right) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "assign miopen::solver::KernelInfo"; + clone(&right); return *this; + }; + + void printE() { + std::string s = ""; + + LOG(INFO) << "======================================================================"; + + if (kernel_type == KernelType::SOURCE) { + LOG(INFO) << "kernel is cl string"; + } else { + LOG(INFO) << "kernel file name: " << kernel_file; + } + + LOG(INFO) << "kernel name: " << kernel_name; + + for (auto v : l_wk) { + s += std::to_string(v); + s += " "; + } + + LOG(INFO) << "local work size: " << s; + s = ""; + + for (auto v : g_wk) { + s += std::to_string(v); + s += " "; + } + + LOG(INFO) << "global work size: " << s; + LOG(INFO) << "compile option: " << comp_options; + LOG(INFO) << "kernel type: " << kernel_type; + LOG(INFO) << "======================================================================"; + }; + + void print() { +#ifdef ENABLE_DEBUG + printE(); +#endif + }; + +private: + template <typename T> + void copy_vector(std::vector<T>* l, std::vector<T>* r) { + l->clear(); + typename std::vector<T>::iterator i; + + for (i = r->begin(); i < r->end(); i++) { + l->push_back(*i); + } + } + void clone(KernelInfo* ki) { + comp_options.assign(ki->comp_options); + wk_dim = ki->wk_dim; + copy_vector(&l_wk, &ki->l_wk); + copy_vector(&g_wk, &ki->g_wk); + copy_vector(&g_wk_offset, &ki->g_wk_offset); + kernel_file.assign(ki->kernel_file); + kernel_name.assign(ki->kernel_name); + kernel_type = ki->kernel_type; + } + + void clone(miopen::solver::KernelInfo* ki) { + comp_options.assign(ki->comp_options); + wk_dim = ki->g_wk.size(); + copy_vector(&l_wk, &ki->l_wk); + copy_vector(&g_wk, &ki->g_wk); + //copy_vector(&g_wk_offset, &ki->g_wk_offset); + kernel_file.assign(ki->kernel_file); + kernel_name.assign(ki->kernel_name); + kernel_type = MIOPEN; }
+}; + +} // namespace saber +} // namespace anakin +#endif diff --git a/saber/core/impl/amd/utils/amd_cache.h b/saber/core/impl/amd/utils/amd_cache.h new file mode 100644 index 000000000..2ab2f3dcc --- /dev/null +++ b/saber/core/impl/amd/utils/amd_cache.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_FUNCS_IMPL_AMD_UTILS_AMDCACHE_H +#define ANAKIN_SABER_FUNCS_IMPL_AMD_UTILS_AMDCACHE_H +#include +#include +#include +#include +#include "ocl/ocl_kernel.h" +#include "anakin_config.h" + +namespace anakin { +namespace saber { + +template +class AMDCache { +public: + typedef std::map MType; + + static AMDCache* getInstance() { + if (_instance.get() == NULL) { + _instance = std::unique_ptr>(new AMDCache); + } + + return _instance.get(); + } + + size_t getSize() { + std::lock_guard lock(m_mutex); + return cache_size; + } + + // lookup the binary given the file name + T lookup(U key) { + std::lock_guard lock(m_mutex); + typename MType::iterator iter; + iter = cache_map.find(key); + + if (iter != cache_map.end()) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "[" << typeid(T).name() << "] found cache!"; + return iter->second; + } else { + return NULL; + } + } + + // add program to the cache + void add(U key, T t) { + std::lock_guard lock(m_mutex); + + if (!lookup(key)) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "[" << typeid(T).name() << "] insert cache to slot, size: " + << cache_size; + cache_size++; + cache_map.insert(typename MType::value_type(key, t)); + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "[" << typeid(T).name() << "] insert cache to slot done"; + } + } + + // The presumed watermark for the cache volume (256MB). Is it enough? + // We may need more delicate algorithms when necessary later. + // Right now, let's just leave it along. + static const unsigned MAX_CACHE_SIZE = 1024; + + ~AMDCache() { + release(); + } + + AMDCache() { + cache_size = 0; + cache_map.clear(); + }; + +protected: + void release() { + cache_size = 0; + cache_map.clear(); + } + +private: + static typename std::unique_ptr> _instance; + MType cache_map; + unsigned int cache_size; + std::recursive_mutex m_mutex; +}; + +template +std::unique_ptr> AMDCache::_instance = NULL; + +using ProgramCache = AMDCache>; +using KernelCache = AMDCache>; + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/impl/amd/cl/ReluUni.cl b/saber/core/impl/amd/utils/amd_common.h similarity index 65% rename from saber/funcs/impl/amd/cl/ReluUni.cl rename to saber/core/impl/amd/utils/amd_common.h index 5656b0155..5ab91c220 100644 --- a/saber/funcs/impl/amd/cl/ReluUni.cl +++ b/saber/core/impl/amd/utils/amd_common.h @@ -12,11 +12,18 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifndef ANAKIN_SABER_CORE_IMPL_AMD_AMDCOMMON_H +#define ANAKIN_SABER_CORE_IMPL_AMD_AMDCOMMON_H -__attribute__((reqd_work_group_size(256, 1, 1))) __kernel void -ReluUni(const __global float* __restrict in, - __global float* __restrict out, - float slope) -{ - out[get_global_id(0)] = in[get_global_id(0)] * (in[get_global_id(0)] > 0.0f ? 1.0f : slope); -} +#include "anakin_config.h" +#include "saber/core/common.h" +namespace anakin { +namespace saber { +#ifdef USE_OPENCL +typedef cl_command_queue AMDStream_t; +typedef cl_event AMDEvent_t; +#endif + +} // namespace saber +} // namespace anakin +#endif diff --git a/saber/core/impl/amd/utils/amd_error.h b/saber/core/impl/amd/utils/amd_error.h new file mode 100644 index 000000000..a6efd28f0 --- /dev/null +++ b/saber/core/impl/amd/utils/amd_error.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_CORE_IMPL_AMD_UTILS_AMDERROR_H +#define ANAKIN_SABER_CORE_IMPL_AMD_UTILS_AMDERROR_H + +#include +#include +#include +#include + +namespace anakin { +namespace saber { + +struct Exception : std::exception { + std::string message; + int status; + Exception(const std::string& msg = "") : message(msg), status(-1) {} + + Exception(int s, const std::string& msg = "") : message(msg), status(s) {} + + Exception SetMessage(const std::string& file, int line) { + message = file + ":" + std::to_string(line) + ": " + message; + return *this; + } + + const char* what() const noexcept override { + return message.c_str(); + } +}; + +#define AMD_THROW(...) throw anakin::saber::Exception(__VA_ARGS__).SetMessage(__FILE__, __LINE__) + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/core/impl/amd/utils/amd_file_utils.cpp b/saber/core/impl/amd/utils/amd_file_utils.cpp new file mode 100644 index 000000000..3e33041a8 --- /dev/null +++ b/saber/core/impl/amd/utils/amd_file_utils.cpp @@ -0,0 +1,435 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#include "amd_file_utils.h" +#include +#include +#include +#include +#include +#include "amd_logger.h" +#include "amd_error.h" +#include +#include +#include +#include + +namespace anakin { +namespace saber { + +#define SABER_CACHE_DIR "~/.cache/amd_saber/" +#define separator "/" + +std::string md5(std::string s) { + std::array result {}; + MD5(reinterpret_cast(s.data()), s.length(), result.data()); + + std::ostringstream sout; + sout << std::hex << std::setfill('0'); + + for (auto c : result) + sout << std::setw(2) << int {c}; + + return sout.str(); +} + +inline std::string +ReplaceString(std::string subject, const std::string& search, const std::string& replace) { + size_t pos = 0; + + while ((pos = subject.find(search, pos)) != std::string::npos) { + subject.replace(pos, search.length(), replace); + pos += replace.length(); + } + + return subject; +} + +bool is_directory_separator(char c) { + return c == '/'; +} + +bool is_root_separator(const char* str, size_t pos) { + // pos is position of the separator + if (str != nullptr && !is_directory_separator(str[pos])) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "precondition violation"; + } + + // subsequent logic expects pos to be for leftmost slash of a set + while (pos > 0 && is_directory_separator(str[pos - 1])) { + --pos; + } + + // "/" [...] + if (pos == 0) { + return true; + } + + // "//" name "/" + if (pos < 3 || !is_directory_separator(str[0]) || !is_directory_separator(str[1])) { + return false; + } + + std::string tmp(str); + return tmp.find_first_of("/", 2) == pos; +} + +size_t root_directory_start(const char* path, size_t size) { + // return npos if no root_directory found + // case "//" + if (size == 2 && is_directory_separator(path[0]) && is_directory_separator(path[1])) { + return std::string::npos; + } + + // case "//net {/}" + if (size > 3 && is_directory_separator(path[0]) && is_directory_separator(path[1]) + && !is_directory_separator(path[2])) { + std::string str(path); + size_t pos = str.find_first_of("/", 2); + return pos < size ? pos : std::string::npos; + } + + // case "/" + if (size > 0 && is_directory_separator(path[0])) { + return 0; + } + + return std::string::npos; +} + +size_t filename_pos(const char* str, size_t end_pos) { + // end_pos is past-the-end position + // return 0 if str itself is filename (or empty) + + // case: "//" + if (end_pos == 2 && is_directory_separator(str[0]) && is_directory_separator(str[1])) { + return 0; + } + + // case: ends in "/" + if (end_pos && is_directory_separator(str[end_pos - 1])) { + return end_pos - 1; + } + + // set pos to start of last element + std::string filename(str); + size_t pos = (filename.find_last_of('/', end_pos - 1)); + + return (pos == std::string::npos // path itself must be a filename (or empty) + || (pos == 1 && is_directory_separator(str[0]))) // or net + ? 0 // so filename is entire string + : pos + 1; // or starts after delimiter +} + +size_t parent_path_end(const std::string& path) { + size_t end_pos = filename_pos(path.c_str(), path.length()); + + bool filename_was_separator = path.length() && is_directory_separator(path.c_str()[end_pos]); + + // skip separators unless root directory + size_t root_dir_pos = root_directory_start(path.c_str(), end_pos); + + for (; end_pos > 0 && (end_pos - 1) != root_dir_pos + && is_directory_separator(path.c_str()[end_pos - 1]); + --end_pos) { + } + + return (end_pos == 1 && root_dir_pos == 0 && filename_was_separator) ? 
std::string::npos + : end_pos; +} + +void SplitString(const std::string& s, std::vector& v, const std::string& c) { + std::string::size_type pos1, pos2; + pos2 = s.find(c); + pos1 = 0; + + while (std::string::npos != pos2) { + v.push_back(s.substr(pos1, pos2 - pos1)); + + pos1 = pos2 + c.size(); + pos2 = s.find(c, pos1); + } + + if (pos1 != s.length()) { + v.push_back(s.substr(pos1)); + } +} + +std::string temp_directory_path() { + const char* val = 0; + (val = std::getenv("TMPDIR")) || (val = std::getenv("TMP")) || (val = std::getenv("TEMP")) + || (val = std::getenv("TEMPDIR")); + + const char* default_tmp = "/tmp"; + std::string temp_directory((val != 0) ? val : default_tmp); + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "temp directory path :" << temp_directory; + return temp_directory; +} + +bool is_directory(const std::string& path) { + struct stat info; + + if (stat(path.c_str(), &info) == 0) { + if (info.st_mode & S_IFDIR) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "it's a directory"; + return true; + } else if (info.st_mode & S_IFREG) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "it's a file"; + return false; + } else { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "not file or directory"; + } + } else { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "path not exist"; + } + + return false; +} + +std::string filename(const std::string& filename) { + size_t pos = filename_pos(filename.c_str(), filename.length()); + return (filename.length() && pos && is_directory_separator(filename.c_str()[pos]) + && !is_root_separator(filename.c_str(), pos)) + ? "." + : filename.substr(pos, filename.length()); +} + +std::string remove_filename(std::string path) { + return path.erase(parent_path_end(path), path.length()); +} + +int permissions(std::string& p, mode_t mode) { + int result = chmod(p.c_str(), mode); + + if (result != 0) { + result = errno; + } + + return (result); +} + +std::string unique_path() { + int unique_path_len = 19; + std::string str = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + std::string newstr; + int pos, i = 0; + std::string str1 = "-"; + + while (newstr.size() != unique_path_len) { + pos = ((rand() % (str.size() - 1))); + newstr += str.substr(pos, 1); + i++; + + if (i == 4 && newstr.size() != unique_path_len) { + newstr += "-"; + i = 0; + } + } + + return newstr; +} + +bool exists(std::string path) { + int state = access(path.c_str(), R_OK | W_OK); + + if (state == 0) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "file path exist"; + return true; + } else { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "file path not exist, path :" << path; + return false; + } +} + +std::string parent_path(const std::string& path) { + size_t end_pos = parent_path_end(path); + return end_pos == std::string::npos ? path : path.substr(0, end_pos); +} + +int file_status(const std::string& path) { + struct stat info; + + if (stat(path.c_str(), &info) == 0) { + if (info.st_mode & S_IFDIR) { + return 1; // it's a directory + } else if (info.st_mode & S_IFREG) { + return 2; // it's a file + } else { + return 3; // not direcort or file + } + } else { + return 3; + } +} + +bool filename_is_dot(std::string p) { + std::string name = filename(p); + return name.length() == 1 && *name.c_str() == '.'; +} + +bool filename_is_dot_dot(std::string p) { + return p.length() >= 2 && p.c_str()[p.length() - 1] == '.' 
&& p.c_str()[p.length() - 2] == '.'; +} + +bool create_directory(const std::string& p) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "path :" << p; + int flag = mkdir(p.c_str(), S_IRWXU | S_IRWXG | S_IRWXO); + + if (flag == 0) { + return true; + } + + // attempt to create directory failed + if (is_directory(p)) { + return false; + } + + // attempt to create directory failed && it doesn't already exist + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << + "attempt to create directory failed && it doesn't already exist"; + return false; +} + +bool create_directories(const std::string p) { + if (p.length() == 0) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "invalid argument for path"; + return false; + } + + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "path :" << p; + + if (filename_is_dot(p) || filename_is_dot_dot(p)) { + return create_directories(parent_path(p)); + } + + if (is_directory(p)) { + return false; + } + + std::string parent = parent_path(p); + + if (parent == p) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "internal error: path == parent path"; + } + + if (parent.length() > 0) { + // if the parent does not exist, create the parent + if (!is_directory(p)) { + create_directories(parent); + } + } + + // create the directory + return create_directory(p); +} + +// original functions +std::string ComputeCachePath() { + std::string cache_dir = SABER_CACHE_DIR; + + auto p = ReplaceString(cache_dir, "~", getenv("HOME")); + + if (!exists(p)) { + create_directories(p); + } + + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "Get program path :" << p; + + return p; +} + +std::string GetCachePath() { + static const std::string path = ComputeCachePath(); + return path; +} + +std::string +GetCacheFile(const std::string& device, const std::string& name, const std::string& args) { + std::string filename = name + ".o"; + return GetCachePath() + md5(device + ":" + args) + "/" + filename; +} + +std::string +LoadBinaryPath(const std::string& device, const std::string& name, const std::string& args) { + auto f = GetCacheFile(device, name, args); + + if (exists(f)) { + return f; + } else { + return {}; + } +} + +void SaveBinary( + const std::string& binary_path, + const std::string& device, + const std::string& name, + const std::string& args) { + auto p = GetCacheFile(device, name, args); + create_directories(parent_path(p)); + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "target path :" << p; + rename(binary_path.c_str(), p.c_str()); +} + +std::string LoadFile(const std::string& s) { + std::ifstream t(s); + std::stringstream buffer; + buffer << t.rdbuf(); + return buffer.str(); +} + +miopen::Db GetDb(std::string device_name, int max_CU) { + auto p = ReplaceString(miopen::GetDbPath(), "~", getenv("HOME")); + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "db path :" << p; + + if (!exists(p)) { + create_directories(p); + } + + std::string dbFileName = p + "/" + device_name + "_" + std::to_string(max_CU) + ".cd.pad.txt"; + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "db file name :" << dbFileName; + return {dbFileName}; +} + +std::string genTempFilePath(std::string name) { + std::string dir_path; + std::string file_path; + dir_path = + (/*boost::filesystem::*/ temp_directory_path() + "/" + + /*boost::filesystem::*/ unique_path(/*"amd-tmp-%%%%-%%%%-%%%%-%%%%"*/)); + + create_directories(dir_path); + + file_path = dir_path + "/" + name; //(dir_path / name).string(); + + if (!std::ofstream {file_path, std::ios_base::out | std::ios_base::in | std::ios_base::trunc} + .good()) { + AMD_THROW("Failed to create temp file: " + file_path); + } + return 
file_path; +} + +void writeFile(const std::string& content, const std::string& name) { + FILE* fd = std::fopen(name.c_str(), "w"); + + if (std::fwrite(content.c_str(), 1, content.size(), fd) != content.size()) { + std::fclose(fd); + AMD_THROW("Failed to write to src file"); + } + + std::fclose(fd); +} + +} // namespace saber +} // namespace anakin diff --git a/saber/core/impl/amd/utils/amd_file_utils.h b/saber/core/impl/amd/utils/amd_file_utils.h new file mode 100644 index 000000000..2d2e6bdb9 --- /dev/null +++ b/saber/core/impl/amd/utils/amd_file_utils.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_FUNCS_IMPL_UTILS_AMDFILEUTILS_H +#define ANAKIN_SABER_FUNCS_IMPL_UTILS_AMDFILEUTILS_H + +#include +#include +#include +#include +#include +#include +#include + +namespace anakin { +namespace saber { + +std::string temp_directory_path(); +bool is_directory(const std::string& path); +std::string filename(const std::string& filename); +std::string remove_filename(std::string path); +int permissions(std::string& p, mode_t mode); +std::string unique_path(); +bool exists(std::string path); +std::string parent_path(const std::string& path); +bool create_directories(std::string path); + +std::string +GetCacheFile(const std::string& device, const std::string& name, const std::string& args); + +std::string GetCachePath(); + +void SaveBinary( + const std::string& binary_path, + const std::string& device, + const std::string& name, + const std::string& args); + +std::string +LoadBinaryPath(const std::string& device, const std::string& name, const std::string& args); + +std::string LoadFile(const std::string& s); +std::string md5(std::string s); +miopen::Db GetDb(std::string device_name, int max_CU); +std::string genTempFilePath(std::string name); +void writeFile(const std::string& content, const std::string& name); + +} // namespace saber +} // namespace anakin +#endif diff --git a/saber/core/impl/amd/utils/amd_kernel.cpp b/saber/core/impl/amd/utils/amd_kernel.cpp new file mode 100644 index 000000000..424dceda8 --- /dev/null +++ b/saber/core/impl/amd/utils/amd_kernel.cpp @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/
+#include "amd_kernel.h"
+#include "saber/core/env.h"
+#include "saber/core/device.h"
+
+namespace anakin {
+namespace saber {
+
+#ifdef USE_OPENCL
+AMDKernelPtr CreateKernel(int device_id, KernelInfo* ki) {
+    cl_context context  = 0;
+    cl_device_id device = 0;
+
+    Device<AMD> dev = Env<AMD>::cur_env()[device_id]; // anakin device id to AMD device
+    device  = dev.get_device();
+    context = dev.get_context();
+
+    return gen_shared_ocl(new OCLKernel(context, device, ki));
+}
+
+bool LaunchKernel(AMDStream_t stream, amd_kernel_list kernels, bool sync) {
+    LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__;
+
+    float exec_time_ms = 0;
+
+    for (amd_kernel_list::iterator it = kernels.begin(); it != kernels.end(); it++) {
+        cl_event event;
+        LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " E";
+
+        if (!it->get()->Invoke(stream, 0, NULL, (sync ? &event : NULL))) {
+            LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " Failed";
+            return false;
+        }
+
+        if (sync) {
+            TargetWrapper<AMD>::sync_event(event);
+#ifdef ENABLE_AMD_PROFILING
+            cl_ulong start, end;
+            clGetEventProfilingInfo(
+                event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
+            clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
+            exec_time_ms = (end - start) * 1e-6;
+#endif
+        }
+
+        if (sync) {
+            TargetWrapper<AMD>::destroy_event(event);
+        }
+
+        LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " X : " << exec_time_ms << " ms";
+    }
+
+    return true;
+}
+#endif
+
+} // namespace saber
+} // namespace anakin
diff --git a/saber/core/impl/amd/utils/amd_kernel.h b/saber/core/impl/amd/utils/amd_kernel.h
new file mode 100644
index 000000000..28aefc3f2
--- /dev/null
+++ b/saber/core/impl/amd/utils/amd_kernel.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+#ifndef ANAKIN_SABER_CORE_IMPL_AMD_AMDKERNEL_H
+#define ANAKIN_SABER_CORE_IMPL_AMD_AMDKERNEL_H
+
+#include "amd_base.h"
+#include "amd_common.h"
+
+#ifdef USE_OPENCL
+#include "ocl/ocl_kernel.h"
+#endif
+
+namespace anakin {
+namespace saber {
+
+#ifdef USE_OPENCL
+typedef OCLKernel AMDKernel;
+typedef OCLKernelPtr AMDKernelPtr;
+typedef std::list<AMDKernelPtr> amd_kernel_list;
+#endif
+
+extern AMDKernelPtr CreateKernel(int device_id, KernelInfo* ki);
+extern bool LaunchKernel(AMDStream_t stream, amd_kernel_list kernels, bool sync = false);
+
+} // namespace saber
+} // namespace anakin
+#endif
diff --git a/saber/core/impl/amd/utils/amd_kernels.cpp.in b/saber/core/impl/amd/utils/amd_kernels.cpp.in
new file mode 100644
index 000000000..9be1aa4c0
--- /dev/null
+++ b/saber/core/impl/amd/utils/amd_kernels.cpp.in
@@ -0,0 +1,56 @@
+/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+#include "amd_kernels.h"
+#include "utils/logger/logger.h"
+#include <map>
+#include <string>
+#include <algorithm>
+
+namespace anakin {
+namespace saber {
+
+const std::map<std::string, std::string>& kernels() {
+    static const std::map<std::string, std::string> data {${INIT_KERNELS}};
+    return data;
+}
+
+std::string GetKernelSrc(std::string name) {
+    // Use the base name of the string
+    int start  = 0;
+    auto slash = name.find_last_of("/\\");
+
+    if (slash != std::string::npos) {
+        start = slash + 1;
+    }
+
+    int len = name.size();
+    auto ex = name.rfind('.');
+
+    if (ex != std::string::npos) {
+        len = ex - start;
+    }
+
+    auto key = name.substr(start, len);
+    // Convert to uppercase
+    std::transform(key.begin(), key.end(), key.begin(), ::toupper);
+
+    auto it = kernels().find(key);
+
+    if (it == kernels().end()) {
+        LOG(ERROR) << "Failed to load kernel source: " << key;
+        return "";
+    }
+
+    return it->second;
+}
+} // namespace saber
+} // namespace anakin
diff --git a/saber/funcs/impl/amd/cl/MIOpenBiasReLuUni.cl b/saber/core/impl/amd/utils/amd_logger.h
similarity index 57%
rename from saber/funcs/impl/amd/cl/MIOpenBiasReLuUni.cl
rename to saber/core/impl/amd/utils/amd_logger.h
index 676f0df07..a5535ec6a 100644
--- a/saber/funcs/impl/amd/cl/MIOpenBiasReLuUni.cl
+++ b/saber/core/impl/amd/utils/amd_logger.h
@@ -12,14 +12,22 @@
    See the License for the specific language governing permissions and
    limitations under the License.
 */
+#ifndef ANAKIN_SABER_CORE_IMPL_AMD_UTILS_AMDLOGGER_H
+#define ANAKIN_SABER_CORE_IMPL_AMD_UTILS_AMDLOGGER_H
 
-__attribute__((reqd_work_group_size(256, 1, 1))) __kernel void
-MIOpenReLu(const __global float* __restrict in,
-           __global float* __restrict out,
-           __global float* bias,
-           float slope, int N, int C, int H, int W)
-{
-    int gid_x = get_global_id(0);
-    float intermediate = in[gid_x] + bias[(gid_x % (C * H * W)) / (H * W)];
-    out[gid_x] = intermediate * (intermediate > 0.0f ? 1.0f : slope);
-}
+#include "anakin_config.h"
+#include "utils/logger/logger.h"
+
+namespace anakin {
+namespace saber {
+//#define AMD_ENABLE_LOG
+
+#if defined(ENABLE_DEBUG) || defined(ENABLE_AMD_DEBUG)
+#define ENABLE_AMD_DEBUG_LOG 1
+#else
+#define ENABLE_AMD_DEBUG_LOG 0
+#endif
+
+} // namespace saber
+} // namespace anakin
+#endif // ANAKIN_SABER_CORE_IMPL_AMD_UTILS_AMDLOGGER_H
diff --git a/saber/core/impl/amd/utils/ocl/ocl_kernel.cpp b/saber/core/impl/amd/utils/ocl/ocl_kernel.cpp
new file mode 100644
index 000000000..95faabf40
--- /dev/null
+++ b/saber/core/impl/amd/utils/ocl/ocl_kernel.cpp
@@ -0,0 +1,437 @@
+/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/ +#include "ocl_kernel.h" +#include "amd_cache.h" +#include "amd_file_utils.h" +#include "amd_logger.h" + +#include +#include +#include +#include + +namespace anakin { +namespace saber { +//#define ENABLE_LOG +#define SAVE_TO_FILE +#define MAX_LOG_LENGTH 65535 +extern std::string GetKernelSrc(std::string name); + +std::string GetDeviceName(cl_device_id device_id) { + + char deviceName[100]; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(deviceName), deviceName, nullptr); + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "Device Name: " << deviceName; + return std::string(deviceName); +} + +#if 1 + +void SaveProgramBinary(const cl_program program, const std::string& name) { + size_t binary_size; + clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, nullptr); + + std::vector binary(binary_size); + char* src[1] = {binary.data()}; + clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(src), &src, nullptr); + + std::ofstream fout(name.c_str(), std::ios::out | std::ios::binary); + fout.write(binary.data(), binary.size()); + + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "save program to cache file: " << name; +} + +void WriteProgramToFile(cl_program cl_prg, cl_device_id device_id, KernelInfo* ki) { +#ifndef SAVE_TO_FILE + + if (true) { + return; + } + +#endif + + std::string kernelKey; + + if (ki->kernel_type == SOURCE) { + return; + } else { + kernelKey = ki->kernel_file; + } + + std::string deviceName = GetDeviceName(device_id); + auto path = GetCachePath() + unique_path(); + SaveProgramBinary(cl_prg, path); + SaveBinary(path, deviceName, kernelKey, ki->comp_options); +} + +cl_program LoadBinaryProgram( + cl_context context, + cl_device_id device_id, + const char* source_data, + size_t size) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__; + cl_int errNum; + cl_program program = clCreateProgramWithBinary( + context, 1, &device_id, &size, (const unsigned char**)&source_data, NULL, &errNum); + + if (errNum != CL_SUCCESS) { + LOG(ERROR) << __func__ << " error(" << errNum << ")"; + } + + return program; +} + +cl_program LoadProgramFromFileCache(cl_context context, cl_device_id device_id, KernelInfo* ki) { +#ifndef SAVE_TO_FILE + + if (true) { + return NULL; + } + +#endif + + std::string kernelKey; + + if (ki->kernel_type == SOURCE) { + return NULL; + } else { + kernelKey = ki->kernel_file; + } + + std::string deviceName = GetDeviceName(device_id); + + std::string cacheFilePath = LoadBinaryPath(deviceName.c_str(), kernelKey, ki->comp_options); + + if (!cacheFilePath.empty()) { + std::string source = LoadFile(cacheFilePath); + + cl_program program = LoadBinaryProgram(context, device_id, source.data(), source.size()); + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "Create CL program from cache file: " << kernelKey; + return program; + } + + return NULL; +} + +ClProgramPtr LoadProgramFromMemCache(cl_context context, KernelInfo* ki) { + ProgramCache* programCache = ProgramCache::getInstance(); + + std::string kernelKey; + + if (ki->kernel_type == SOURCE) { + return NULL; + } else { + kernelKey.assign(ki->kernel_file + ki->comp_options + ":" + ki->kernel_name); + } + + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " " << kernelKey; + + std::string progKey(kernelKey); + // Consider different compile options for single program + progKey += ki->comp_options; + + // get program from program cache + auto program_ptr = programCache->lookup(std::pair(context, progKey)); + + if (program_ptr != NULL && program_ptr.get() != NULL) { + return program_ptr; + } + + return NULL; +} + 
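+// Note: WriteProgramIntoMemCache below uses the same key scheme as the lookup above:
+// kernel_file + comp_options + ":" + kernel_name, with comp_options appended once more,
+// so the same kernel built with different compile options gets its own cache entry.
+// SOURCE-type kernels are never stored in the in-memory program cache.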
+void WriteProgramIntoMemCache(cl_context context, ClProgramPtr program, KernelInfo* ki) {
+    ProgramCache* programCache = ProgramCache::getInstance();
+
+    std::string kernelKey;
+
+    if (ki->kernel_type == SOURCE) {
+        return;
+    } else {
+        kernelKey.assign(ki->kernel_file + ki->comp_options + ":" + ki->kernel_name);
+    }
+
+    LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " " << kernelKey;
+
+    std::string progKey(kernelKey);
+    // Consider different compile options for single program
+    progKey += ki->comp_options;
+
+    // Save to Program Cache
+    if (programCache->getSize() < programCache->MAX_CACHE_SIZE) {
+        programCache->add(std::pair(context, progKey), program);
+    } else {
+        LOG(INFO) << "Warning: program code cache has been full.\n";
+    }
+}
+bool BuildProgram(cl_program program, cl_device_id device_id, KernelInfo* ki) {
+
+    if (program == NULL) {
+        LOG(ERROR) << "Failed to Build Program, cl_program is not initialized.";
+        return false;
+    }
+
+    cl_int errNum;
+    auto is_asm = miopen::EndsWith(ki->kernel_file, ".s");
+
+    if (is_asm) {
+        errNum = clBuildProgram(program, 1, &device_id, "", NULL, NULL);
+    } else {
+        errNum = clBuildProgram(program, 1, &device_id, ki->comp_options.c_str(), NULL, NULL);
+    }
+
+    if (errNum != CL_SUCCESS) {
+        char buildErrLog[MAX_LOG_LENGTH];
+        clGetProgramBuildInfo(
+            program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buildErrLog), buildErrLog, NULL);
+
+        LOG(ERROR) << "CL program build error log in kernel: " << buildErrLog;
+        return false;
+    }
+
+    return true;
+}
+
+cl_program CreateProgramFromSource(cl_context context, cl_device_id device_id, KernelInfo* ki) {
+
+    cl_program program = NULL;
+    bool is_binary     = false;
+    std::string source;
+
+    if (ki->kernel_type == SOURCE) {
+        source = ki->kernel_file;
+    } else {
+        try {
+            if (ki->kernel_type == MIOPEN) {
+                source = miopen::GetKernelSrc(ki->kernel_file);
+            } else {
+                source = GetKernelSrc(ki->kernel_file);
+            }
+        } catch (...)
{ + LOG(ERROR) << "Can't Load CL Program"; + return NULL; + } + + if (miopen::EndsWith(ki->kernel_file, ".s") || miopen::EndsWith(ki->kernel_file, ".so")) { + is_binary = true; + } + + if (!source.empty()) { + auto is_asm = miopen::EndsWith(ki->kernel_file, ".s"); + + if (is_asm) { + std::string deviceName = GetDeviceName(device_id); + AmdgcnAssemble( + source, std::string(" -mcpu=") + deviceName + " " + ki->comp_options); + } + } else { + std::ifstream kFile(ki->kernel_file, std::ios::in); + + if (!kFile.is_open()) { + LOG(ERROR) << "Failed to open file for reading: " << ki->kernel_file; + return NULL; + } + + source = std::string( + (std::istreambuf_iterator(kFile)), std::istreambuf_iterator()); + kFile.close(); + } + } + + if (source.empty()) { + return NULL; + } + + if (is_binary) { + program = LoadBinaryProgram(context, device_id, source.data(), source.size()); + } else { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "createPrograWithSource"; + std::string params = ki->comp_options; +#if defined(ENABLE_DEBUG) || defined(ENABLE_LOG) + params += " -Werror"; +#ifdef __linux__ + params += miopen::KernelWarningsString(); +#endif +#endif + params += " -cl-std=CL1.2"; + + // load program from header + const char* srcStr = source.data(); + size_t size = source.size(); + + program = clCreateProgramWithSource(context, 1, (const char**)&srcStr, &size, NULL); + + if (program == NULL) { + LOG(ERROR) << "Failed to create CL program from header: " << ki->kernel_file; + return NULL; + } + } + + return program; +} + +#if 0 +ClKernelPtr LoadKernelFromMemCache(cl_program program, KernelInfo* ki) { + std::string kernelKey; + + if (ki->kernel_type == SOURCE) { + return NULL; + } else { + kernelKey.assign(ki->kernel_file + ki->comp_options + ":" + ki->kernel_name); + } + + KernelCache* kernelCache = KernelCache::getInstance(); + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " " << kernelKey; + auto kernel_ptr = kernelCache->lookup(std::pair(program, kernelKey)); + + if (kernel_ptr != NULL && kernel_ptr.get() != NULL) { + return kernel_ptr; + } + + return NULL; +} + +void WriteKernelIntoMemCache(cl_program program, ClKernelPtr kernel, KernelInfo* ki) { + std::string kernelKey; + + if (ki->kernel_type == SOURCE) { + return NULL; + } else { + kernelKey.assign(ki->kernel_file + ki->comp_options + ":" + ki->kernel_name); + } + + KernelCache* kernelCache = KernelCache::getInstance(); + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__ << " " << kernelKey; + + if (kernelCache->getSize() < kernelCache->MAX_CACHE_SIZE) { + kernelCache->add(std::pair(program, kernelKey), kernel); + } else { + LOG(INFO) << "Warning: kernel cache has been full.\n"; + } +} +#endif + +cl_kernel CreateKernelFromSource(cl_program program, KernelInfo* ki) { + cl_kernel kernel = NULL; + + kernel = clCreateKernel(program, ki->kernel_name.c_str(), NULL); + + if (kernel == NULL) { + LOG(ERROR) << "error: failed to create CL kernel.\n"; + return NULL; + } + + return kernel; +} + +ClProgramPtr CreateProgram(cl_context context, cl_device_id device_id, KernelInfo* kernel_info) { + + ClProgramPtr program_ptr = LoadProgramFromMemCache(context, kernel_info); + + if (program_ptr != NULL) { + return program_ptr; + } + + bool is_load_from_file = true; + cl_program program = LoadProgramFromFileCache(context, device_id, kernel_info); + + if (program == NULL) { + program = CreateProgramFromSource(context, device_id, kernel_info); + is_load_from_file = false; + } + + if (!BuildProgram(program, device_id, kernel_info)) { + program = NULL; + return NULL; + } 
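+    // Build succeeded: wrap the raw cl_program in a shared pointer, register it in the
+    // in-memory program cache, and persist the binary to the on-disk cache only when the
+    // program was compiled from source rather than loaded from a cached binary.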
+ + program_ptr = gen_shared_cl_program(program); + WriteProgramIntoMemCache(context, program_ptr, kernel_info); + + if (!is_load_from_file) { + WriteProgramToFile(program, device_id, kernel_info); + } + + return program_ptr; +} + +ClKernelPtr CreateKernel(cl_program program, KernelInfo* kernel_info) { + +#if 0 + //Kernel cache cannot work at multi-thread scenario + ClKernelPtr kernel_ptr = LoadKernelFromMemCache(program, kernel_info); + + if (kernel_ptr != NULL) { + return kernel_ptr; + } + +#endif + + cl_kernel kernel = CreateKernelFromSource(program, kernel_info); + + if (kernel == NULL) { + return NULL; + } + + ClKernelPtr kernel_ptr = gen_shared_cl_kernel(kernel); + //WriteKernelIntoMemCache(program, kernel_ptr, kernel_info); + + return kernel_ptr; +} + +#endif +#if 1 +void OCLKernel::CreateProgram() { + _program = anakin::saber::CreateProgram(_context, _device_id, &kernel_info); +} + +void OCLKernel::CreateKernel() { + _kernel = anakin::saber::CreateKernel(_program.get(), &kernel_info); +} + +bool OCLKernel::run( + cl_command_queue cm, + int wait_events_num, + const cl_event* wait_events, + cl_event* event) { + + cl_int errNum = clEnqueueNDRangeKernel( + cm, + _kernel.get(), + kernel_info.wk_dim, + (kernel_info.g_wk_offset.size() > 0 ? kernel_info.g_wk_offset.data() : NULL), + kernel_info.g_wk.data(), + kernel_info.l_wk.data(), + wait_events_num, + wait_events, + event); + + if (errNum != CL_SUCCESS) { + LOG(ERROR) << "Fail to set execution: " << errNum; + kernel_info.printE(); + return false; + } + + return true; +} + +std::string OCLKernel::GetName() { + return kernel_info.kernel_name; +} + +#endif + +} // namespace saber +} // namespace anakin diff --git a/saber/core/impl/amd/utils/ocl/ocl_kernel.h b/saber/core/impl/amd/utils/ocl/ocl_kernel.h new file mode 100644 index 000000000..8b09f7b24 --- /dev/null +++ b/saber/core/impl/amd/utils/ocl/ocl_kernel.h @@ -0,0 +1,164 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_FUNCS_IMPL_AMD_UTILS_OCLKERNEL_H +#define ANAKIN_SABER_FUNCS_IMPL_AMD_UTILS_OCLKERNEL_H + +#include "shared_pointer.h" +#include "amd_base.h" +#include +#include + +namespace anakin { +namespace saber { + +using ClProgramPtr = SHARED_OBJ(cl_program); +#define gen_shared_cl_program(t) GEN_SHARED_OBJ_WITH_DELETER(cl_program, clReleaseProgram, t) + +using ClKernelPtr = SHARED_OBJ(cl_kernel); +#define gen_shared_cl_kernel(t) GEN_SHARED_OBJ_WITH_DELETER(cl_kernel, clReleaseKernel, t) + +class OCLKernel { +public: + OCLKernel(cl_context ctx, cl_device_id id, anakin::saber::KernelInfo* ki) : + _context(ctx), + _device_id(id) { + if (ki == nullptr) { + throw "KernelInfo can't be null"; + } + + kernel_info = ki; + kernel_info.print(); + + CreateProgram(); + CreateKernel(); + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "create kernel complete"; + } + + bool isInit() { + return _kernel == NULL ? 
false : true; + } + + cl_program getProgram() { + return _program.get(); + }; + cl_kernel getKernel() { + return _kernel.get(); + }; + template + bool SetKernelArgs(const Ts& ... xs) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__; + + if (!isInit()) { + return false; + } + + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "Kernel is init"; + + if (!setKernelArgs(0, xs...)) { + return false; + } + + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "Set Kernel Args complete"; + return true; + } + + bool + Invoke(cl_command_queue cm, int wait_events_num, const cl_event* wait_events, cl_event* event) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__; + + if (!isInit()) { + return false; + } + + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "Kernel is init"; + return run(cm, wait_events_num, wait_events, event); + } + + template + bool + Invoke(cl_command_queue cm, + int wait_events_num, + const cl_event* wait_events, + cl_event* event, + const Ts& ... xs) { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << __func__; + + if (!isInit()) { + return false; + } + + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "Kernel is init"; + + if (!setKernelArgs(0, xs...)) { + return false; + } + + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "Set Kernel Args complete"; + + return run(cm, wait_events_num, wait_events, event); + } + + std::string GetName(); + + ~OCLKernel() {} + +private: + template + bool setKernelArgs(int index, const T& x) { + + cl_int errNum = clSetKernelArg(_kernel.get(), index, sizeof(T), &x); + + if (errNum != CL_SUCCESS) { + + if (errNum == CL_INVALID_ARG_INDEX) { // workaround for miopengemm kenrel + LOG(ERROR) << "set kernel args[" << index << "] err = " << errNum; + return true; + } + + LOG(ERROR) << "set kernel args[" << index << "] err = " << errNum; + return false; + } + + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "Set Kernel Args[" << index << "]"; + return true; + } + template + bool setKernelArgs(int index, const T& x, const Ts& ... xs) { + if (!setKernelArgs(index, x)) { + return false; + } + + return setKernelArgs(index + 1, xs...); + } + + bool + run(cl_command_queue cm, int wait_events_num, const cl_event* wait_events, cl_event* event); + + void CreateProgram(); + void CreateKernel(); + + ClProgramPtr _program; + ClKernelPtr _kernel; + cl_context _context; + cl_device_id _device_id; + KernelInfo kernel_info; +}; + +using OCLKernelPtr = SHARED_OBJ(OCLKernel); +#define gen_shared_ocl(t) GEN_SHARED_OBJ(OCLKernel, t) + +} // namespace saber +} // namespace anakin +#endif diff --git a/saber/core/impl/amd/utils/shared_pointer.h b/saber/core/impl/amd/utils/shared_pointer.h new file mode 100644 index 000000000..1411cf795 --- /dev/null +++ b/saber/core/impl/amd/utils/shared_pointer.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#ifndef ANAKIN_SABER_FUNCS_IMPL_AMD_UTILS_SHAREDPOINTER_H +#define ANAKIN_SABER_FUNCS_IMPL_AMD_UTILS_SHAREDPOINTER_H +#include +#include +#include "anakin_config.h" +#include "amd_logger.h" + +namespace anakin { +namespace saber { + +template +struct SharedDeleter { + template + void operator()(T* t) const { + if (t != NULL) { + try { + LOG_IF_S(INFO, ENABLE_AMD_DEBUG_LOG) << "release shared object :" << typeid(t).name(); + f(t); + t = NULL; + } catch (...) { + LOG(ERROR) << "Catught error for release shared object : " << typeid(t).name(); + } + } + } +}; + +#define SHARED_OBJ(T) std::shared_ptr::type> + +#define GEN_SHARED_OBJ_WITH_DELETER(T, F, t) \ + std::shared_ptr::type>( \ + t, anakin::saber::SharedDeleter()) + +#define GEN_SHARED_OBJ(T, t) std::shared_ptr::type>(t) + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h index 49df342ce..f0b59868a 100644 --- a/saber/core/target_wrapper.h +++ b/saber/core/target_wrapper.h @@ -18,6 +18,9 @@ #include "saber/core/target_traits.h" #include "saber/core/data_traits.h" #include +#ifdef AMD_GPU +#include "saber/core/impl/amd/utils/amd_common.h" +#endif namespace anakin{ @@ -464,8 +467,8 @@ struct TargetWrapper { typedef typename DataTraitBase::PtrDtype TPtr; - typedef cl_event event_t; - typedef cl_command_queue stream_t; + typedef AMDEvent_t event_t; + typedef AMDStream_t stream_t; static void get_device_count(int& count); @@ -496,7 +499,7 @@ struct TargetWrapper { static void destroy_event(event_t event); - static void record_event(event_t event, stream_t stream); + static void record_event(event_t& event, stream_t stream); static void query_event(event_t event); @@ -563,15 +566,15 @@ struct TargetWrapper { //static cl_platform_id platform_id; //static cl_uint device_nums; static int current_device_id_index; - static std::map buffers; + static std::map buffers; //static cl_context* contexts; }; template <> struct TargetWrapper { - typedef cl_event event_t; - typedef cl_command_queue stream_t; + typedef AMDEvent_t event_t; + typedef AMDStream_t stream_t; static void get_device_count(int& count); @@ -587,7 +590,7 @@ struct TargetWrapper { static void destroy_event(event_t event); - static void record_event(event_t event, stream_t stream); + static void record_event(event_t& event, stream_t stream); static void create_stream(stream_t* stream); diff --git a/saber/funcs/activation.h b/saber/funcs/activation.h index b1874e5b6..314ddbc91 100644 --- a/saber/funcs/activation.h +++ b/saber/funcs/activation.h @@ -29,8 +29,8 @@ #include "saber/funcs/impl/x86/saber_activation.h" #endif -#ifdef AMD_GPU -#include "saber/funcs/impl/amd/saber_activation.h" +#ifdef AMD_GPU +//#include "saber/funcs/impl/amd/saber_activation.h" #endif #ifdef USE_ARM_PLACE diff --git a/saber/funcs/impl/amd/amd_utils.cpp b/saber/funcs/impl/amd/amd_utils.cpp deleted file mode 100644 index 3412eea80..000000000 --- a/saber/funcs/impl/amd/amd_utils.cpp +++ /dev/null @@ -1,119 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ -#include "saber/funcs/impl/amd/amd_utils.h" -#include "utils/logger/logger.h" - -namespace anakin { -namespace saber { -#define MAX_LOG_LENGTH 65535 -cl_program CreateCLProgram(cl_context context, cl_device_id device, const char* fileName, - KernelInfo* ki) { - cl_int errNum; - cl_program program; - - std::ifstream kFile(fileName, std::ios::in); - - if (!kFile.is_open()) { - LOG(ERROR) << "Failed to open file for reading: " << fileName; - return NULL; - } - - std::string src( - (std::istreambuf_iterator(kFile)), - std::istreambuf_iterator() - ); - char* srcStr = src.c_str(); - program = clCreateProgramWithSource(context, 1, (const char**)&srcStr, NULL, NULL); - - kFile.close(); - - if (program == NULL) { - LOG(ERROR) << "Failed to create CL program with source file."; - return NULL; - } - - char* comp_options = NULL; - - if (ki != NULL) { - comp_options = ki->comp_options.c_str(); - } - - errNum = clBuildProgram(program, 1, &device, comp_options, NULL, NULL); - - if (errNum != CL_SUCCESS) { - char buildErrLog[MAX_LOG_LENGTH]; - clGetProgramBuildInfo(program, - device, - CL_PROGRAM_BUILD_LOG, - sizeof(buildErrLog), - buildErrLog, - NULL); - - LOG(ERROR) << "CL program build error log in kernel: " << buildErrLog; - clReleaseProgram(program); - return NULL; - } - - return program; -}; - -cl_program CreatProgramFromBinaryFile(cl_context context, cl_device_id device, - const char* binFile) { - cl_program program; - cl_int errNum; - - FILE* fp = fopen(binFile, "rb"); - - if (fp == NULL) { - LOG(ERROR) << "Can't open bin file: " << std::string(binFile); - return NULL; - } - - size_t binSize; - fseek(fp, 0, SEEK_END); - binSize = ftell(fp); - rewind(fp); - - unsigned char* binProgram = new unsigned char[binSize]; - fread(binProgram, 1, binSize, fp); - fclose(fp); - - program = clCreateProgramWithBinary(context, 1, &device, &binSize, - (const unsigned char**)&binProgram, NULL, &errNum); - errNum = clBuildProgram(program, 1, &device, "", NULL, NULL); - - delete[] binProgram; - - if (errNum != CL_SUCCESS) { - char buildErrLog[MAX_LOG_LENGTH]; - clGetProgramBuildInfo(program, - device, - CL_PROGRAM_BUILD_LOG, - sizeof(buildErrLog), - buildErrLog, - NULL); - - LOG(ERROR) << "CL program build error log in kernel: " << buildErrLog; - clReleaseProgram(program); - - return NULL; - } - - return program; -}; - - -} -} diff --git a/saber/funcs/impl/amd/amd_utils.h b/saber/funcs/impl/amd/amd_utils.h deleted file mode 100644 index e55229014..000000000 --- a/saber/funcs/impl/amd/amd_utils.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ -#ifndef ANAKIN_SABER_FUNC_IMPL_AMD_UTILS_H -#define ANAKIN_SABER_FUNC_IMPL_AMD_UTILS_H - -#include - -#include -#include -#include -#include - -#define MLO_POOLING_OP_MAX 0 -#define MLO_POOLING_OP_AVE 1 - -namespace anakin { -namespace saber { - -typedef struct ExtSolutionConfigTpye -{ - int in_tile0, in_tile1; - int grp_tile0, grp_tile1; - int out_pix_tile0, out_pix_tile1; - int n_stacks; - int n_out_pix_tiles; - int n_out_tiles_perstack; - int n_in_data_tiles; - int n_read_procs; - int alu_tile0, alu_tile1; - int horiz_out_pix; - int vert_out_pix; -}T_ExtSolutionConfig; - -struct KernelInfo -{ - std::string comp_options; - std::vector l_wk; - std::vector g_wk; - std::string kernel_file; - std::string kernel_name; - friend std::ostream& operator<<(std::ostream& os, const KernelInfo& k); -}; - -extern cl_program CreateCLProgram(cl_context context, cl_device_id device, const char* fileName, KernelInfo* ki=NULL); -extern cl_program CreatProgramFromBinaryFile(cl_context context, cl_device_id device, const char* binFile); - - -inline cl_int _setKernelArgs(cl_kernel &k,int i){ return CL_SUCCESS;} - -template -inline cl_int _setKernelArgs(cl_kernel &kernel,int i, const T &firstParameter, const Args& ...restOfParameters){ - return clSetKernelArg(kernel, i, sizeof(firstParameter), &firstParameter) | \ - _setKernelArgs(kernel,i+1,restOfParameters...); -} - -template -inline cl_int setKernelArgs(cl_kernel &kernel, const Args& ...args){ - return _setKernelArgs(kernel, 0, args...); -} -} -} -#endif diff --git a/saber/funcs/impl/amd/cl/ConvFwd3x3.cl b/saber/funcs/impl/amd/cl/ConvFwd3x3.cl deleted file mode 100644 index ddc6272c1..000000000 --- a/saber/funcs/impl/amd/cl/ConvFwd3x3.cl +++ /dev/null @@ -1,925 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ -/* Compiler options: --c -emit-llvm -target amdgcn-amd-amdhsa-amdgizcl -x cl -cl-kernel-arg-info - --DMLO_HW_WAVE_SZ=64 --DMLO_DIR_FORWARD=1 --DMLO_FILTER_SIZE0=3 --DMLO_FILTER_SIZE1=3 --DMLO_FILTER_PAD0=1 --DMLO_FILTER_PAD1=1 --DMLO_FILTER_STRIDE0=1 --DMLO_FILTER_STRIDE1=1 --DMLO_N_OUTPUTS=64 --DMLO_N_INPUTS=3 --DMLO_BATCH_SZ=2 --DMLO_OUT_WIDTH=224 --DMLO_OUT_HEIGHT=224 --DMLO_OUT_BATCH_STRIDE=3211264 --DMLO_OUT_CHANNEL_STRIDE=50176 --DMLO_OUT_STRIDE=224 --DMLO_IN_WIDTH=224 --DMLO_IN_HEIGHT=224 --DMLO_IN_BATCH_STRIDE=150528 --DMLO_IN_CHANNEL_STRIDE=50176 --DMLO_IN_STRIDE=224 --DMLO_IN_TILE0=32 --DMLO_IN_TILE1=32 --DMLO_GRP_TILE0=16 --DMLO_GRP_TILE1=16 --DMLO_OUT_TILE0=2 --DMLO_OUT_TILE1=2 --DMLO_N_STACKS=1 --DMLO_N_OUT_TILES=8 --DMLO_N_OUT_TILES_PERSTACK=8 --DMLO_N_IN_TILES_PERSTACK=2 --DMLO_N_READ_PROCS=256 --DMLO_CONV_BIAS=0 --DMLO_ALU_VTILE0=16 --DMLO_ALU_VTILE1=16 - --cl-std=CL1.2 -O3 --mcpu=gfx900 --mllvm --amdgpu-early-inline-all --mllvm -amdgpu-prelink - --D__AMD__=1 --D__gfx900__=1 --D__gfx900=1 --D__OPENCL_VERSION__=120 --D__IMAGE_SUPPORT__=1 - --Xclang --cl-ext=+cl_khr_fp64,+cl_khr_global_int32_base_atomics,+cl_khr_global_int32_extended_atomics,+cl_khr_local_int32_base_atomics,+cl_khr_local_int32_extended_atomics,+cl_khr_int64_base_atomics,+cl_khr_int64_extended_atomics,+cl_khr_3d_image_writes,+cl_khr_byte_addressable_store,+cl_khr_fp16,+cl_khr_gl_sharing,+cl_amd_device_attribute_query,+cl_amd_media_ops,+cl_amd_media_ops2,+cl_khr_subgroups,+cl_amd_copy_buffer_p2p,+cl_amd_assembly_program -include opencl-c.h -*/ - -#define _FLOAT float -#define _FLOAT2 float2 -#define _FLOAT4 float4 -#define _FLOAT8 float8 - -#ifndef FLT_MAX -#define FLT_MAX 3.402823466e+38F /* max value */ -#endif - - - -//HCJ definitions generated by solver -//#define MLO_HW_WAVE_SZ 64 -//#define MLO_DIR_FORWARD 1 -//#define MLO_FILTER_SIZE0 3 -//#define MLO_FILTER_SIZE1 3 -//#define MLO_FILTER_PAD0 1 -//#define MLO_FILTER_PAD1 1 -//#define MLO_FILTER_STRIDE0 1 -//#define MLO_FILTER_STRIDE1 1 -//#define MLO_N_OUTPUTS 64 -//#define MLO_N_INPUTS 3 -//#define MLO_BATCH_SZ 2 -//#define MLO_OUT_WIDTH 224 -//#define MLO_OUT_HEIGHT 224 -//#define MLO_OUT_BATCH_STRIDE 3211264 -//#define MLO_OUT_CHANNEL_STRIDE 50176 -//#define MLO_OUT_STRIDE 224 -//#define MLO_IN_WIDTH 224 -//#define MLO_IN_HEIGHT 224 -//#define MLO_IN_BATCH_STRIDE 150528 -//#define MLO_IN_CHANNEL_STRIDE 50176 -//#define MLO_IN_STRIDE 224 -//#define MLO_IN_TILE0 32 -//#define MLO_IN_TILE1 32 -//#define MLO_GRP_TILE0 16 -//#define MLO_GRP_TILE1 16 -//#define MLO_OUT_TILE0 2 -//#define MLO_OUT_TILE1 2 -//#define MLO_N_STACKS 1 -//#define MLO_N_OUT_TILES 8 -//#define MLO_N_OUT_TILES_PERSTACK 8 -//#define MLO_N_IN_TILES_PERSTACK 2 -//#define MLO_N_READ_PROCS 256 -//#define MLO_CONV_BIAS 0 -//#define MLO_ALU_VTILE0 16 -//#define MLO_ALU_VTILE1 16 -#define MLO_CONV_BIAS 1 - -#define UNUSED __attribute__((__unused__)) - -#ifndef MLO_FILTER_STRIDE0 -#define MLO_FILTER_STRIDE0 1 -#endif -#ifndef MLO_FILTER_STRIDE1 -#define MLO_FILTER_STRIDE1 1 -#endif - -#define MLO_FILTER_SZ (MLO_FILTER_SIZE1 * MLO_FILTER_SIZE0) - -#define MLO_GRP_SZ0 (MLO_GRP_TILE0 * MLO_GRP_TILE1) -#define MLO_GRP_SZ1 1 -#define MLO_GRP_SZ2 1 -#define MLO_GRP_SZ (MLO_GRP_SZ0 * MLO_GRP_SZ1 * MLO_GRP_SZ2) -#define MLO_N_PROC_WAVES ((MLO_GRP_SZ + MLO_N_READ_PROCS - 1) / MLO_N_READ_PROCS) -#define MLO_OUT_TILE_SZ (MLO_OUT_TILE1 * MLO_OUT_TILE0) -#define MLO_ALU_TILE_SZ (MLO_ALU_VTILE1 * MLO_ALU_VTILE0) - -#if MLO_IN_TILE0 < MLO_OUT_WIDTH || MLO_IN_TILE1 < MLO_OUT_HEIGHT -#define 
MLO_LARGE_MAP 1 -#else -#define MLO_LARGE_MAP 0 -#endif - -#if(MLO_IN_WIDTH == MLO_OUT_WIDTH && \ - (MLO_IN_WIDTH / MLO_IN_TILE0) * MLO_IN_TILE0 == MLO_IN_WIDTH && \ - MLO_IN_HEIGHT == MLO_OUT_HEIGHT && \ - (MLO_IN_HEIGHT / MLO_IN_TILE1) * MLO_IN_TILE1 == MLO_IN_HEIGHT) -#define MLO_OUT_ALIGNED 1 -#else -#define MLO_OUT_ALIGNED 0 -#endif - -#define MLO_N_ALUTILES_TOTAL ((MLO_GRP_TILE0 * MLO_GRP_TILE1) / (MLO_ALU_TILE_SZ)) -#define MLO_N_ALUTILES_PERSTACK (MLO_N_ALUTILES_TOTAL / MLO_N_STACKS) -#define MLO_ALUTILES_STACK_SZ (MLO_N_ALUTILES_PERSTACK * MLO_ALU_TILE_SZ) -#define MLO_N_IN_TILES_TOTAL (MLO_N_IN_TILES_PERSTACK * MLO_N_STACKS) -/* -#define MLO_N_OUT_TILES_PERSTACK (MLO_N_OUT_TILES*MLO_N_ALUTILES_PERSTACK) -#if MLO_N_OUT_TILES_PERSTACK > MLO_N_OUTPUTS -#undef MLO_N_OUT_TILES_PERSTACK -#define MLO_N_OUT_TILES_PERSTACK MLO_N_OUTPUTS -#endif -*/ -#define MLO_N_OUT_TILE_BLOCKS0 ((MLO_OUT_WIDTH + MLO_IN_TILE0 - 1) / MLO_IN_TILE0) -#define MLO_N_OUT_TILE_BLOCKS1 ((MLO_OUT_HEIGHT + MLO_IN_TILE1 - 1) / MLO_IN_TILE1) -#define MLO_N_IN_PACKS ((MLO_N_INPUTS + MLO_N_IN_TILES_PERSTACK - 1) / MLO_N_IN_TILES_PERSTACK) - -#define MLO_N_IN_READ (MLO_N_IN_PACKS * MLO_N_IN_TILES_PERSTACK) -#if MLO_N_IN_READ == MLO_N_INPUTS -#define MLO_INPUTS_ALIGNED 1 -#else -#define MLO_INPUTS_ALIGNED 0 -#endif - -#define MLO_N_OUT_PACKS (MLO_N_OUTPUTS / MLO_N_OUT_TILES_PERSTACK) -#if MLO_N_OUT_PACKS * MLO_N_OUT_TILES_PERSTACK == MLO_N_OUTPUTS && \ - MLO_N_OUT_TILES_PERSTACK != MLO_N_OUTPUTS -#define MLO_OUTPUTS_ALIGNED 1 -#else -#define MLO_OUTPUTS_ALIGNED 0 -#endif - -#define MLO_N_BATCH_PACKS (MLO_BATCH_SZ / MLO_N_STACKS) -#if MLO_N_BATCH_PACKS * MLO_N_STACKS == MLO_BATCH_SZ && MLO_N_STACKS != MLO_BATCH_SZ -#define MLO_BATCH_ALIGNED 1 -#else -#define MLO_BATCH_ALIGNED 0 -#endif - -#if MLO_DIR_FORWARD == 1 -#define MLO_IN_LCL_WIDTH \ - ((MLO_IN_TILE0 - 1) * MLO_FILTER_STRIDE0 + MLO_FILTER_SIZE0) // here we use kernel size. it's - // important when padding == 0 2* - // MLO_FILTER_PAD0 -#define MLO_IN_LCL_HEIGHT ((MLO_IN_TILE1 - 1) * MLO_FILTER_STRIDE1 + MLO_FILTER_SIZE1) -#else -#define MLO_IN_LCL_WIDTH \ - ((MLO_IN_TILE0 + MLO_FILTER_SIZE0 - 1 + MLO_FILTER_STRIDE0 - 1) / \ - MLO_FILTER_STRIDE0) // here we use kernel size. 
it's important when padding == 0 2* -// MLO_FILTER_PAD0 -#define MLO_IN_LCL_HEIGHT \ - ((MLO_IN_TILE1 + MLO_FILTER_SIZE1 - 1 + MLO_FILTER_STRIDE1 - 1) / MLO_FILTER_STRIDE1) -#endif -#define MLO_IN_LCL_TILE_SZ (MLO_IN_LCL_WIDTH * MLO_IN_LCL_HEIGHT) -#define MLO_IN_LCL_PERSTACK_SZ (MLO_IN_LCL_TILE_SZ * MLO_N_IN_TILES_PERSTACK) -#define MLO_IN_LCL_SZ (MLO_IN_LCL_PERSTACK_SZ * MLO_N_STACKS) - -#define MLO_WEIGHTS_SZ (MLO_N_OUT_TILES_PERSTACK * MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ) - -#define MLO_PVT_ACCUM_DATA_SZ (MLO_N_OUT_TILES * MLO_OUT_TILE_SZ) -#if MLO_DIR_FORWARD == 1 -#define MLO_PVT_IN_WIDTH ((MLO_OUT_TILE0 - 1) * MLO_FILTER_STRIDE0 + MLO_FILTER_SIZE0) -#define MLO_PVT_IN_HEIGHT ((MLO_OUT_TILE1 - 1) * MLO_FILTER_STRIDE1 + 1) -#else -#define MLO_PVT_IN_WIDTH \ - ((MLO_OUT_TILE0 + MLO_FILTER_SIZE0 - 1 + MLO_FILTER_STRIDE0 - 1) / MLO_FILTER_STRIDE0) -#define MLO_PVT_IN_HEIGHT ((MLO_OUT_TILE1 + MLO_FILTER_STRIDE1 - 1) / MLO_FILTER_STRIDE1) -#endif - -#define MLO_LCL_WEIGHTS 1 - -#define MLO_PADDING_SHIFT1 (MLO_FILTER_SIZE1 - MLO_FILTER_PAD1 - 1) -#define MLO_PADDING_SHIFT0 (MLO_FILTER_SIZE0 - MLO_FILTER_PAD0 - 1) - -#define MLO_PADDING_FIX1 (MLO_FILTER_SIZE1 % MLO_OUT_TILE1) -#define MLO_PADDING_FIX0 (MLO_FILTER_SIZE0 % MLO_OUT_TILE0) - -#if defined(__AMDGCN__) -extern uint __llvm_amdgcn_readfirstlane(uint) __asm("llvm.amdgcn.readfirstlane"); -#define uniform(x) __llvm_amdgcn_readfirstlane(x) -#else -#define uniform(x) (x) -#endif - -static inline uint iDiv(uint v, uint d) -{ - uint r = (uint)((float)v * (1.0f / (float)d) + 0.00001f); - return (r); -} - -static inline uint iMod(uint v, uint u, uint d) -{ - uint r = v - mul24((uint)u, (uint)d); - return (r); -} - -static inline void calculateXYPos(uint linPos, uint width, uint* __restrict x, uint* __restrict y) -{ - (*y) = (uint)((float)linPos * (1.0f / (float)width) + 0.00001f); - (*x) = linPos - mul24((*y), width); -} - -static inline uint calculateOffset(uint stride, uint x, uint y) -{ - uint ret = y * stride + x; - return (ret); -} - -static inline void readDataElem(uint linPos, - __local _FLOAT* lcl_data, - int lcl_base, - UNUSED uint lcl_height, - uint lcl_width, - int lcl_stride, - int lcl_y, - int lcl_x, - const __global _FLOAT* gbl_data, - int gbl_base, - uint gbl_height, - uint gbl_width, - int gbl_stride, - int gbl_y, - int gbl_x, - bool vis, - UNUSED bool debug) -{ - uint x, y; - calculateXYPos(linPos, lcl_width, &x, &y); - int g_x = x + gbl_x; - int g_y = y + gbl_y; - uint gbl_off0 = calculateOffset(gbl_stride, g_x, g_y); - int gbl_off = gbl_off0 + gbl_base; - -#if MLO_LARGE_MAP == 1 - int lcl_off = lcl_base + linPos; - (void)lcl_stride; - (void)lcl_x; - (void)lcl_y; -#else - int l_x = x + lcl_x; - int l_y = y + lcl_y; - int lcl_off = lcl_base + mad24(l_y, lcl_stride, l_x); -#endif - -#if MLO_LARGE_MAP == 1 - vis &= (g_x >= 0 && g_x < gbl_width && g_y >= 0 && g_y < gbl_height); -#else - (void)gbl_width; - (void)gbl_height; -#endif - gbl_off = (vis) ? gbl_off : 0; - _FLOAT gbl_val = gbl_data[gbl_off]; - gbl_val = (vis) ? 
gbl_val : 0; - - lcl_data[lcl_off] = gbl_val; -} - -static inline void readData(uint lcl_id, - int size, - int lcl_p_stride, - __local _FLOAT* lcl_data, - int lcl_base, - uint lcl_height, - uint lcl_width, - int lcl_stride, - int lcl_y, - int lcl_x, - const __global _FLOAT* gbl_data, - int gbl_base, - uint gbl_height, - uint gbl_width, - int gbl_stride, - int gbl_y, - int gbl_x, - bool vis, - bool debug) -{ - - for(uint i = lcl_id; i < size; i += lcl_p_stride) - { - readDataElem(i, - lcl_data, - lcl_base, - lcl_height, - lcl_width, - lcl_stride, - lcl_y, - lcl_x, - gbl_data, - gbl_base, - gbl_height, - gbl_width, - gbl_stride, - gbl_y, - gbl_x, - vis, - debug); - } -} - -static inline void loadData(uint lcl_id, - int lcl_p_stride, - __local _FLOAT* lcl_data, - int lcl_off, - int lcl_size, - uint lcl_height, - uint lcl_width, - int lcl_stride, - int lcl_bot_y, - int lcl_bot_x, - const __global _FLOAT* gbl_data, - int gbl_off, - int gbl_size, - uint gbl_height, - uint glb_width, - int gbl_stride, - int gbl_bot_y, - int gbl_bot_x, - int buf_block_ind, - int max_n_bufs, - int lcl_n_bufs, - bool debug) -{ - - for(uint c = 0; c < lcl_n_bufs; ++c, lcl_off += lcl_size, gbl_off += gbl_size) - { - bool vis = (buf_block_ind + c < max_n_bufs); - readData(lcl_id, - lcl_size, - lcl_p_stride, - lcl_data, - lcl_off, - lcl_height, - lcl_width, - lcl_stride, - lcl_bot_y, - lcl_bot_x, - gbl_data, - gbl_off, - gbl_height, - glb_width, - gbl_stride, - gbl_bot_y, - gbl_bot_x, - vis, - (debug)); - } -} - -static inline void Conv(uint o_map_base, - int in_stg_off, - __private _FLOAT* __restrict pvt_in_stage, - __local _FLOAT* __restrict lcl_indata, - __private _FLOAT* __restrict pvt_wei_stage, - __local _FLOAT* __restrict lcl_wei, - __private _FLOAT* __restrict pvt_accum) -{ - // convolution - - // over all inputs in stack - int in_stg_off1 = in_stg_off; - for(uint i_c = 0; i_c < MLO_N_IN_TILES_PERSTACK; ++i_c, in_stg_off1 += MLO_IN_LCL_TILE_SZ) - { - // preload input - int wei_stg_base_off = mad24(o_map_base, - (uint)(MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ), - mul24(i_c, (uint)MLO_FILTER_SZ)); - int in_stg_off2 = in_stg_off1; - for(uint j = 0; j < MLO_PVT_IN_HEIGHT - 1; ++j, - in_stg_off2 += (((j - MLO_PADDING_SHIFT1 + MLO_PADDING_FIX1) % MLO_FILTER_STRIDE1) - ? 0 - : MLO_IN_LCL_WIDTH)) - { - for(uint i = 0; i < MLO_PVT_IN_WIDTH; ++i) - { - pvt_in_stage[j * MLO_PVT_IN_WIDTH + i] = lcl_indata[in_stg_off2 + i]; - } - } - -// over filter rows -#ifdef __AMDGCN__ -#if MLO_FILTER_SIZE1 < 6 -#pragma unroll -#elif MLO_FILTER_SIZE1 < 9 -#pragma unroll 2 -#endif -#endif -#if MLO_DIR_FORWARD == 1 - for(uint k = 0; k < MLO_FILTER_SIZE1; ++k, in_stg_off2 += MLO_IN_LCL_WIDTH) -#else - for(uint k = 0; k < MLO_FILTER_SIZE1; ++k, - in_stg_off2 += (((k - MLO_PADDING_SHIFT1 + MLO_PADDING_FIX1) % MLO_FILTER_STRIDE1) - ? 
0 - : MLO_IN_LCL_WIDTH)) -#endif - { - int k_act = 0; -#if MLO_DIR_FORWARD == 1 - k_act = k; -#else - // load filter in reverse order - k_act = MLO_FILTER_SIZE1 - 1 - k; -#endif - // load next input row - for(uint i_pvt = 0; i_pvt < MLO_PVT_IN_WIDTH; ++i_pvt) - { - pvt_in_stage[(MLO_PVT_IN_HEIGHT - 1) * MLO_PVT_IN_WIDTH + i_pvt] = - lcl_indata[in_stg_off2 + i_pvt]; - } - - // over all outputs - for(uint o_c = 0; o_c < MLO_N_OUT_TILES; ++o_c) - { - int wei_stg_off = wei_stg_base_off + o_c * MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ + - k_act * MLO_FILTER_SIZE0; - for(uint i = 0; i < MLO_FILTER_SIZE0; ++i) - { - pvt_wei_stage[i] = - lcl_wei[wei_stg_off + - i]; //(float)o_c/(float)MLO_N_OUT_TILES + (float)(i+k)/9; - } - - // actual conv - - for(uint j = 0; j < MLO_OUT_TILE1; ++j) - { -#if MLO_DIR_FORWARD == 0 - if(((j + k + 1 - MLO_PADDING_SHIFT1 + (MLO_FILTER_SIZE1 % MLO_FILTER_STRIDE1)) % - MLO_FILTER_STRIDE1) == 0) -#endif - for(uint i = 0; i < MLO_OUT_TILE0; ++i) - { - for(uint l = 0; l < MLO_FILTER_SIZE0; ++l) - { - - int l_act = 0; -#if MLO_DIR_FORWARD == 1 - l_act = l; - -#else - // in reverse horizontal and vertical orders - l_act = MLO_FILTER_SIZE0 - 1 - l; - -#endif - -#if MLO_DIR_FORWARD == 1 - pvt_accum[(o_c * MLO_OUT_TILE1 + j) * MLO_OUT_TILE0 + i] += - pvt_in_stage[j * MLO_PVT_IN_WIDTH * MLO_FILTER_STRIDE1 + - i * MLO_FILTER_STRIDE0 + l] * - pvt_wei_stage[l_act]; -#else - if(((i + l + 1 - MLO_PADDING_SHIFT0 + - (MLO_FILTER_SIZE0 % MLO_FILTER_STRIDE0)) % - MLO_FILTER_STRIDE0) == 0) - { - pvt_accum[(o_c * MLO_OUT_TILE1 + j) * MLO_OUT_TILE0 + i] += - pvt_in_stage[(j / MLO_FILTER_STRIDE1) * MLO_PVT_IN_WIDTH + - (i + l) / MLO_FILTER_STRIDE0] * - pvt_wei_stage[l_act]; - } -#endif - } - } - } - - } // for(uint o_c = 0; o_c < MLO_N_OUT_TILES; ++o_c) - - // move data up - for(uint j = 0; j < MLO_PVT_IN_HEIGHT - 1; ++j) - { - for(uint i = 0; i < MLO_PVT_IN_WIDTH; ++i) - { - pvt_in_stage[j * MLO_PVT_IN_WIDTH + i] = - pvt_in_stage[(j + 1) * MLO_PVT_IN_WIDTH + i]; - } - } - - } // for(uint k = 0; k < MLO_FILER_SIZE1; ++k,in_stg_off2+=MLO_IN_LCL_WIDTH) - - } // for(uint i_c = 0; i_c < MLO_N_IN_TILES_PERSTACK; ++i_c, in_stg_off1 += - // MLO_IN_LCL_PERSTACK_SZ) -} - -__attribute__((reqd_work_group_size(MLO_GRP_SZ0, MLO_GRP_SZ1, MLO_GRP_SZ2))) __kernel void -ConvFwd3x3(const __global _FLOAT* __restrict in, - const __global _FLOAT* __restrict weights, -#if MLO_CONV_BIAS - const __global _FLOAT* __restrict bias, -#endif - __global _FLOAT* __restrict out, - _FLOAT slope) -{ - -#if 1 - __local _FLOAT lcl_indata[MLO_IN_LCL_SZ]; - __local _FLOAT lcl_wei[MLO_WEIGHTS_SZ]; - __private _FLOAT pvt_accum[MLO_PVT_ACCUM_DATA_SZ]; - __private _FLOAT pvt_in_stage[MLO_PVT_IN_HEIGHT * MLO_PVT_IN_WIDTH]; - __private _FLOAT pvt_wei_stage[MLO_FILTER_SIZE0]; - - uint grp_id0 = get_group_id(0); -#if MLO_N_OUT_TILE_BLOCKS0 & (MLO_N_OUT_TILE_BLOCKS0 - 1) - uint y_tile_blk = iDiv(grp_id0, MLO_N_OUT_TILE_BLOCKS0); - uint x_tile_blk = iMod(grp_id0, y_tile_blk, MLO_N_OUT_TILE_BLOCKS0); -#else - uint y_tile_blk = grp_id0 / MLO_N_OUT_TILE_BLOCKS0; - uint x_tile_blk = grp_id0 & (MLO_N_OUT_TILE_BLOCKS0 - 1); -#endif - uint o_pack = get_group_id(1); // block of outputs - uint b_pack = get_group_id(2); // batch block - - uint lcl_id = get_local_id(0); -#if MLO_ALUTILES_STACK_SZ >= MLO_GRP_SZ - uint stack = 0; - uint alu_stack_id = lcl_id; -#elif MLO_ALUTILES_STACK_SZ & (MLO_ALUTILES_STACK_SZ - 1) - uint stack = iDiv(lcl_id, MLO_ALUTILES_STACK_SZ); // stack - uint alu_stack_id = iMod(lcl_id, stack, MLO_ALUTILES_STACK_SZ); // alu 
index in stack -#else - uint stack = lcl_id / MLO_ALUTILES_STACK_SZ; // stack - uint alu_stack_id = lcl_id & (MLO_ALUTILES_STACK_SZ - 1); // alu index in stack -#if MLO_ALUTILES_STACK_SZ >= 64 - stack = uniform(stack); -#endif -#endif -// ALU plane inside stack -#if MLO_ALU_TILE_SZ & (MLO_ALU_TILE_SZ - 1) - uint alu_out_plane_id = iDiv(alu_stack_id, MLO_ALU_TILE_SZ); // alu output plane index - uint alu_out_id = iMod( - alu_stack_id, alu_out_plane_id, MLO_ALU_TILE_SZ); // alu index inside an ALU output plane -#else - uint alu_out_plane_id = alu_stack_id / MLO_ALU_TILE_SZ; // alu output plane index - uint alu_out_id = alu_stack_id & (MLO_ALU_TILE_SZ - 1); // alu index inside an ALU output plane -#endif -// pos inside ALU tile -#if MLO_ALU_VTILE0 & (MLO_ALU_VTILE0 - 1) - uint alu_tl1 = iDiv(alu_out_id, MLO_ALU_VTILE0); - uint alu_tl0 = iMod(alu_out_id, alu_tl1, MLO_ALU_VTILE0); -#else - uint alu_tl1 = alu_out_id / MLO_ALU_VTILE0; - uint alu_tl0 = alu_out_id & (MLO_ALU_VTILE0 - 1); -#endif - - uint o_map_plane = - o_pack * MLO_N_OUT_TILES_PERSTACK; // first output maps index per full ALU plane stack - uint o_map_base = alu_out_plane_id * MLO_N_OUT_TILES; // local output map offset - uint o_map = o_map_plane + o_map_base; // output map index per ALU plane - uint b_index = b_pack * MLO_N_STACKS; - -#if MLO_LARGE_MAP != 1 -#if MLO_N_READ_PROCS >= MLO_GRP_SZ - uint wave_id = 0; - uint wave_lcl_id = lcl_id; -#elif MLO_N_READ_PROCS & (MLO_N_READ_PROCS - 1) - uint wave_id = iDiv(lcl_id, MLO_N_READ_PROCS); - uint wave_lcl_id = iMod(lcl_id, wave_id, MLO_N_READ_PROCS); -#else - uint wave_id = lcl_id / MLO_N_READ_PROCS; - uint wave_lcl_id = lcl_id & (MLO_N_READ_PROCS - 1); -#if MLO_N_READ_PROCS >= 64 - wave_id = uniform(wave_id); -#endif -#endif -#endif - -#if MLO_DIR_FORWARD == 1 - uint x_grp = x_tile_blk * MLO_IN_TILE0 * MLO_FILTER_STRIDE0; - uint y_grp = y_tile_blk * MLO_IN_TILE1 * MLO_FILTER_STRIDE1; -#if MLO_LARGE_MAP == 1 - uint x_in_grp = x_grp - MLO_FILTER_PAD0; - uint y_in_grp = y_grp - MLO_FILTER_PAD1; -#endif - uint x_in_lcl = alu_tl0 * MLO_OUT_TILE0 * MLO_FILTER_STRIDE0; - uint y_in_lcl = alu_tl1 * MLO_OUT_TILE1 * MLO_FILTER_STRIDE1; -#else - uint x_grp = x_tile_blk * (MLO_IN_TILE0 / MLO_FILTER_STRIDE0); - uint y_grp = y_tile_blk * (MLO_IN_TILE1 / MLO_FILTER_STRIDE1); -#if MLO_LARGE_MAP == 1 - uint x_in_grp = x_grp - (MLO_FILTER_PAD0 / MLO_FILTER_STRIDE0); - uint y_in_grp = y_grp - (MLO_FILTER_PAD1 / MLO_FILTER_STRIDE1); -#endif - uint x_in_lcl = alu_tl0 * (MLO_OUT_TILE0 / MLO_FILTER_STRIDE0); - uint y_in_lcl = alu_tl1 * (MLO_OUT_TILE1 / MLO_FILTER_STRIDE1); -#endif - - // base offset to read data from local input data - uint in_stg_off = stack * MLO_IN_LCL_PERSTACK_SZ + (y_in_lcl)*MLO_IN_LCL_WIDTH + x_in_lcl; - - uint in_off = b_index * MLO_IN_BATCH_STRIDE; - -#if MLO_DIR_FORWARD == 1 - uint wei_off = mul24(o_map_plane, (uint)(MLO_N_INPUTS * MLO_FILTER_SZ)); -#else - uint wei_off = mul24(o_map_plane, (uint)MLO_FILTER_SZ); -#endif - -#if MLO_LARGE_MAP == 0 - for(uint i = lcl_id; i < MLO_IN_LCL_SZ; i += MLO_GRP_SZ) - { - lcl_indata[i] = 0; - } -#endif - - for(uint i = 0; i < MLO_PVT_ACCUM_DATA_SZ; ++i) - { - pvt_accum[i] = 0; - } - - for(uint ic = 0; ic < MLO_N_INPUTS; ic += MLO_N_IN_TILES_PERSTACK, - in_off += MLO_IN_CHANNEL_STRIDE * MLO_N_IN_TILES_PERSTACK, - wei_off += MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ -#if MLO_DIR_FORWARD == 0 - * - MLO_N_OUTPUTS -#endif - ) - { - barrier(CLK_LOCAL_MEM_FENCE); - -// small map has been read in full continiously into the lDS buffer within padded rect, 
-// padding has been done on initilization. -// large map calculates padding on the fly and fills it with 0. - -#if 1 // all inputs - -#if MLO_LARGE_MAP == 1 - int in_lcl_off1 = 0; - int in_off1 = in_off; - for(uint i_b = 0; i_b < MLO_N_STACKS; - ++i_b, in_off1 += MLO_IN_BATCH_STRIDE, in_lcl_off1 += MLO_IN_LCL_PERSTACK_SZ) - { - bool vis = true; -#if MLO_BATCH_ALIGNED == 0 - vis &= (b_index + i_b < MLO_BATCH_SZ); -#endif - - // over all inputs in stack - int in_off2 = in_off1; - int in_lcl_off2 = in_lcl_off1; - for(uint i_c = 0; i_c < MLO_N_IN_TILES_PERSTACK; - ++i_c, in_off2 += MLO_IN_CHANNEL_STRIDE, in_lcl_off2 += MLO_IN_LCL_TILE_SZ) - { -#if MLO_INPUTS_ALIGNED == 0 - vis &= (ic + i_c < MLO_N_INPUTS); -#endif - - uint elem_id = lcl_id; - int lcl_p_stride = MLO_GRP_SZ0; - int lcl_base = 0; - int lcl_y = 0; - int lcl_x = 0; - int gbl_base = in_off2; - - readData(elem_id, - (MLO_IN_LCL_HEIGHT * MLO_IN_LCL_WIDTH), - lcl_p_stride, - &lcl_indata[in_lcl_off2], - lcl_base, - MLO_IN_LCL_HEIGHT, - MLO_IN_LCL_WIDTH, - MLO_IN_LCL_WIDTH, - lcl_y, - lcl_x, - &in[0], - gbl_base, - MLO_IN_HEIGHT, - MLO_IN_WIDTH, - MLO_IN_STRIDE, - y_in_grp, - x_in_grp, - vis, - true); - } - } -#else - for(uint i = wave_id; i < MLO_N_IN_TILES_TOTAL; i += MLO_N_PROC_WAVES) - { -#if MLO_N_IN_TILES_PERSTACK & (MLO_N_IN_TILES_PERSTACK - 1) - uint i_b = iDiv(i, MLO_N_IN_TILES_PERSTACK); - uint i_c = iMod(i, i_b, MLO_N_IN_TILES_PERSTACK); -#else - uint i_b = i / MLO_N_IN_TILES_PERSTACK; - uint i_c = i & (MLO_N_IN_TILES_PERSTACK - 1); -#endif - - bool vis = true; - -#if MLO_BATCH_ALIGNED == 0 - vis &= (b_index + i_b < MLO_BATCH_SZ); -#endif - -#if MLO_INPUTS_ALIGNED == 0 - vis &= (ic + i_c < MLO_N_INPUTS); -#endif - int in_off2 = in_off + i_b * MLO_IN_BATCH_STRIDE + i_c * MLO_IN_CHANNEL_STRIDE; - int in_lcl_off2 = i_b * MLO_IN_LCL_PERSTACK_SZ + i_c * MLO_IN_LCL_TILE_SZ; - - uint elem_id = wave_lcl_id; - int lcl_p_stride = MLO_N_READ_PROCS; - int lcl_base = 0; -#if MLO_DIR_FORWARD == 1 - int lcl_y = MLO_FILTER_PAD1; - int lcl_x = MLO_FILTER_PAD0; -#else - int lcl_y = (MLO_FILTER_PAD1 / MLO_FILTER_STRIDE0); - int lcl_x = (MLO_FILTER_PAD0 / MLO_FILTER_STRIDE1); -#endif - int gbl_base = in_off2; - - readData(elem_id, - (MLO_IN_HEIGHT * MLO_IN_WIDTH), - lcl_p_stride, - &lcl_indata[in_lcl_off2], - lcl_base, - MLO_IN_HEIGHT, - MLO_IN_WIDTH, - MLO_IN_LCL_WIDTH, - lcl_y, - lcl_x, - &in[0], - gbl_base, - MLO_IN_HEIGHT, - MLO_IN_WIDTH, - MLO_IN_STRIDE, - y_grp, - x_grp, - vis, - true); - } -#endif - -// read inputs and weights -// put weights into LDS - -#if 1 // only weights - - for(uint i = lcl_id; i < MLO_WEIGHTS_SZ; i += MLO_GRP_SZ) - { -#if MLO_DIR_FORWARD == 1 -// here is [tops][bottoms] -#if(MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ) & ((MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ) - 1) - uint lcl_o = iDiv(i, (MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ)); - uint gbl_i = iMod(i, lcl_o, (MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ)); -#else - uint lcl_o = i / (MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ); - uint gbl_i = i & ((MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ) - 1); -#endif - uint gbl_we_off = wei_off + lcl_o * MLO_N_INPUTS * MLO_FILTER_SZ + gbl_i; - bool within_range = gbl_we_off < (MLO_N_OUTPUTS * MLO_N_INPUTS * MLO_FILTER_SZ); - - gbl_we_off = (within_range) ? gbl_we_off : 0; - _FLOAT wei = weights[gbl_we_off]; - wei = (within_range) ? 
wei : 0; - lcl_wei[i] = wei; -#else -// outputs are botoms(inputs)) -// inputs are tops(outputs) -#if(MLO_N_OUT_TILES_PERSTACK * MLO_FILTER_SZ) & ((MLO_N_OUT_TILES_PERSTACK * MLO_FILTER_SZ) - 1) - uint lcl_o = iDiv(i, (MLO_N_OUT_TILES_PERSTACK * MLO_FILTER_SZ)); - uint gbl_i = iMod(i, lcl_o, (MLO_N_OUT_TILES_PERSTACK * MLO_FILTER_SZ)); -#else - uint lcl_o = i / (MLO_N_OUT_TILES_PERSTACK * MLO_FILTER_SZ); - uint gbl_i = i & ((MLO_N_OUT_TILES_PERSTACK * MLO_FILTER_SZ) - 1); -#endif -#if MLO_FILTER_SZ & (MLO_FILTER_SZ - 1) - uint lcl_c = iDiv(gbl_i, MLO_FILTER_SZ); - uint lcl_i = iMod(gbl_i, lcl_c, MLO_FILTER_SZ); -#else - uint lcl_c = gbl_i / MLO_FILTER_SZ; - uint lcl_i = gbl_i & (MLO_FILTER_SZ - 1); -#endif - - uint lcl_we_off = mad24( - mad24(lcl_c, (uint)MLO_N_IN_TILES_PERSTACK, lcl_o), (uint)MLO_FILTER_SZ, lcl_i); - uint gbl_we_off = mad24( - mad24(lcl_o, (uint)MLO_N_OUTPUTS, lcl_c), (uint)MLO_FILTER_SZ, wei_off + lcl_i); - bool within_range = gbl_we_off < (MLO_N_OUTPUTS * MLO_N_INPUTS * MLO_FILTER_SZ); - gbl_we_off = (within_range) ? gbl_we_off : 0; - _FLOAT wei = weights[gbl_we_off]; - wei = (within_range) ? wei : 0; - lcl_wei[lcl_we_off] = wei; - -#endif - } - -#endif - -// over all batch stacks - -#endif // all input - - barrier(CLK_LOCAL_MEM_FENCE); - - // convolution - Conv(o_map_base, in_stg_off, pvt_in_stage, lcl_indata, pvt_wei_stage, lcl_wei, pvt_accum); - - // barrier(CLK_LOCAL_MEM_FENCE); - } -// write results out -#if MLO_DIR_FORWARD == 1 -#if MLO_FILTER_STRIDE0 == 1 - int x_out_grp = x_grp; -#else - int x_out_grp = x_tile_blk * MLO_IN_TILE0; -#endif -#if MLO_FILTER_STRIDE1 == 1 - int y_out_grp = y_grp; -#else - int y_out_grp = y_tile_blk * MLO_IN_TILE1; -#endif -#else - int x_out_grp = x_grp * MLO_FILTER_STRIDE0; - int y_out_grp = y_grp * MLO_FILTER_STRIDE1; -#endif - int x_out_lcl = alu_tl0 * MLO_OUT_TILE0; - int y_out_lcl = alu_tl1 * MLO_OUT_TILE1; - - uint out_off = (b_index + stack) * MLO_OUT_BATCH_STRIDE + o_map * MLO_OUT_CHANNEL_STRIDE + - (y_out_grp + y_out_lcl) * MLO_OUT_STRIDE + x_out_grp + x_out_lcl; -// over all local stacks -#if MLO_BATCH_ALIGNED == 0 - if(b_index + stack < MLO_BATCH_SZ) -#endif - { - - // over all local outputs - int out_off1 = out_off; - for(uint o = 0; o < MLO_N_OUT_TILES; ++o, out_off1 += MLO_OUT_CHANNEL_STRIDE) - { -#if MLO_OUTPUTS_ALIGNED == 0 - if(o_map + o < MLO_N_OUTPUTS) -#endif - { - // over output tile - int out_off2 = out_off1; -#if MLO_OUT_TILE0 == 1 - for(int j = 0; j < MLO_OUT_TILE1 && y_out_grp + y_out_lcl + j < MLO_OUT_HEIGHT; - ++j, out_off2 += MLO_OUT_STRIDE) - { - for(int i = 0; i < MLO_OUT_TILE0 && x_out_grp + x_out_lcl + i < MLO_OUT_WIDTH && - out_off2 + i < MLO_OUT_BATCH_STRIDE * MLO_BATCH_SZ; - ++i) - { -#else - for(uint j = 0; j < MLO_OUT_TILE1; ++j, out_off2 += MLO_OUT_STRIDE) - { - if(y_out_grp + y_out_lcl + j < MLO_OUT_HEIGHT) - for(uint i = 0; i < MLO_OUT_TILE0; ++i) - { - if (x_out_grp + x_out_lcl + i < MLO_OUT_WIDTH && - out_off2 + i < MLO_OUT_BATCH_STRIDE * MLO_BATCH_SZ) - { -#endif - out[out_off2 + i] = pvt_accum[o * MLO_OUT_TILE_SZ + j * MLO_OUT_TILE0 + i] -#if MLO_CONV_BIAS - + bias[o_map + o] -#endif - ; - //ReLU fusion - out[out_off2 + i] *= (out[out_off2 + i] > 0.0f ? 
1.0f : slope); - } - } - } - } - } - } -#endif - - /*uint tid = get_local_id(0); - uint gid = get_group_id(0); - - __global _FLOAT* q = out + 256 * gid + tid; // linear address (for testing) - *q = 256 * gid + tid;*/ -} - diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M1.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC6M1.cl deleted file mode 100644 index 960a4e0a2..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M1.cl +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ -#define STRIDE (25088) -#define ITER (196) - -void reduce(__local float* buffer, int tid) -{ - if (tid < 64) - { - buffer[tid] += buffer[tid + 64]; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 32) - { - buffer[tid << 1] += buffer[(tid << 1) + 1]; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 16) - { - buffer[tid << 2] += buffer[(tid << 2) + 2]; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 8) - { - buffer[tid << 3] += buffer[(tid << 3) + 4]; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 4) - { - buffer[tid << 4] += buffer[(tid << 4) + 8]; - } - barrier(CLK_LOCAL_MEM_FENCE); -} - -__attribute__((reqd_work_group_size(128, 1, 1))) -__kernel void InnerProduct( - __global const float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - __local float shared_a[129]; - __local float shared_b[8][65]; - - __local float result[2][129]; - - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __global const float* pA = (__global const float*)(a + (grid_x >> 9) * STRIDE); - __global const float* pB = (__global const float*)(b + ((grid_x & 511) << 3) * STRIDE); - - int offset = ((grid_x << 6) + ((lid_x >> 6) * 12544) + (lid_x & 63)) % STRIDE; - - int temp_offset = offset; - for (int l = 0; l < 2; l++, offset = temp_offset) - { - float sum = 0.0f; - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - shared_a[lid_x] = pA[offset]; - - for (int j = l * 4; j < (l + 1) * 4; j++) - { - shared_b[(lid_x >> 6 << 2) + (j & 3)][(lid_x & 63)] = pB[offset + j * STRIDE]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (int k = 0; k < 4; k++) - { - sum += shared_a[(lid_x >> 6 << 6) + ((lid_x & 15) << 2) + k] * shared_b[(lid_x >> 6 << 2) + ((lid_x & 63) >> 4)][((lid_x & 15) << 2) + k]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - result[l][lid_x] = sum; - barrier(CLK_LOCAL_MEM_FENCE); - - reduce(result[l], lid_x); - } - - if (lid_x < 8) - { - c[(grid_x << 3) + lid_x] = result[lid_x >> 2][(lid_x & 3) << 4] + bias[(grid_x << 3) + lid_x]; - } -} - diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M2.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC6M2.cl deleted file mode 100644 index 568c241e6..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M2.cl +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
- - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ -#define STRIDE (25088) -#define CSTRIDE (4096) -#define ITER (392) - -void reduce(__local float* buffer, int tid) -{ - if (tid < 32) - { - buffer[tid << 1] += buffer[(tid << 1) + 1]; - } - if (tid < 16) - { - buffer[tid << 2] += buffer[(tid << 2) + 2]; - } -} - -__attribute__((reqd_work_group_size(64, 1, 1))) -__kernel void InnerProduct( - __global const float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - __local float shared_a[2][66]; - __local float shared_b[8][66]; - __local float result[65]; - - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __global const float* pA = (__global const float*)(a); - __global const float* pB = (__global const float*)(b + (grid_x << 3) * STRIDE); - - int offset = ((grid_x << 6)) % STRIDE; - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 2; j++) - { - shared_a[j][lid_x + (lid_x >> 5)] = pA[offset + lid_x + j * STRIDE]; - } - - for (int j = 0; j < 8; j++) - { - shared_b[j][lid_x + (lid_x >> 5)] = pB[offset + lid_x + j * STRIDE]; - } - - for (int k = 0; k < 16; k++) - { - sum += shared_a[lid_x >> 5][((lid_x & 3) << 4) + k + ((((lid_x & 3) << 4) + k) >> 5)] * shared_b[(lid_x & 31) >> 2][((lid_x & 3) << 4) + k + ((((lid_x & 3) << 4) + k) >> 5)]; - } - } - - result[lid_x] = sum; - reduce(result, lid_x); - - if (lid_x < 2) - { - float8 out; - float* pOut = (float*)&out; - - for (int i = 0; i < 8; i++) - { - pOut[i] = result[((lid_x * 8 + i) << 2)] + bias[(grid_x << 3) + i]; - } - - __global float8* pC = (__global float8*)(c + (grid_x << 3) + lid_x * CSTRIDE); - *pC = out; - } -} diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M32.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC6M32.cl deleted file mode 100644 index a135816b7..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M32.cl +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ -///////////////////////////////////////////////////////// -// FC6 batch 32 Version 3 2018.6.25 - -#define STRIDE (25088) -#define CSTRIDE (4096) -#define ITER (1568) - -__attribute__((reqd_work_group_size(256, 1, 1))) -__kernel void InnerProduct( - __global const float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - __local float shared_a[1024]; - __local float shared_b[1024]; - __local float4* pShared_a = (__local float4*)shared_a; - __local float4* pShared_b = (__local float4*)shared_b; - - float4 sha; - float4 shb; - - float4 sum = 0.0f; - float* pSum = (float*)&sum; - - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __global const float* pA = (__global const float*)(a); - __global const float* pB = (__global const float*)(b + (grid_x << 5) * STRIDE); - __global float4* pC = (__global float4*)(c + (lid_x >> 3) * CSTRIDE + (grid_x << 5) + ((lid_x & 7) << 2)); - - int offset = (grid_x << 5) % STRIDE; - - for (int i = 0; i < ITER; i++, offset = (offset + 16) % STRIDE) - { - for (int j = 0; j < 2; j++) - { - shared_a[((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32 + ((((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32) >> 5 << 2)] = pA[((j << 4) + (lid_x >> 4)) * STRIDE + (lid_x & 15) + offset]; - shared_b[((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32 + ((((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32) >> 5 << 2)] = pB[((j << 4) + (lid_x >> 4)) * STRIDE + (lid_x & 15) + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (int k = 0; k < 16; k++) - { - sha = shared_a[(lid_x >> 3) + k * 32 + (((lid_x >> 3) + k * 32) >> 5 << 2)]; - shb = pShared_b[(((lid_x & 7))) + k * 8 + (((((lid_x & 7))) + k * 8) >> 3)]; - sum += sha * shb; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - for (int i = 0; i < 4; i++) - { - shared_a[(lid_x >> 3) * 32 + ((lid_x & 7) << 2) + i] = pSum[i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - float4 result; - float* pResult = (float*)&result; - - for (int i = 0; i < 4; i++) - { - pResult[i] = shared_a[(lid_x << 2) + i] + bias[(grid_x << 5) + ((lid_x & 7) << 2) + i]; - } - - *pC = result; -} - diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M4.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC6M4.cl deleted file mode 100644 index 3b3fec19b..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M4.cl +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
-*/ -#define STRIDE (25088) -#define CSTRIDE (4096) -#define ITER (392) - -void reduce(__local float* buffer, int tid) -{ - if (tid < 32) - { - buffer[tid << 1] += buffer[(tid << 1) + 1]; - } - if (tid < 16) - { - buffer[tid << 2] += buffer[(tid << 2) + 2]; - } -} - -__attribute__((reqd_work_group_size(64, 1, 1))) -__kernel void InnerProduct( - __global const float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - __local float shared_a[4][66]; - __local float shared_b[4][66]; - __local float result[65]; - - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __global const float* pA = (__global const float*)(a); - __global const float* pB = (__global const float*)(b + (grid_x << 2) * STRIDE); - - int offset = ((grid_x << 6)) % STRIDE; - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 4; j++) - { - shared_a[j][lid_x + (lid_x >> 5)] = pA[offset + lid_x + j * STRIDE]; - shared_b[j][lid_x + (lid_x >> 5)] = pB[offset + lid_x + j * STRIDE]; - } - - for (int k = 0; k < 16; k++) - { - sum += shared_a[lid_x >> 4][((lid_x & 3) << 4) + k + ((((lid_x & 3) << 4) + k) >> 5)] * shared_b[(lid_x & 15) >> 2][((lid_x & 3) << 4) + k + ((((lid_x & 3) << 4) + k) >> 5)]; - } - } - - result[lid_x] = sum; - reduce(result, lid_x); - - if (lid_x < 4) - { - float4 out; - float* pOut = (float*)&out; - - for (int i = 0; i < 4; i++) - { - pOut[i] = result[((lid_x * 4 + i) << 2)] + bias[(grid_x << 2) + i]; - } - - __global float4* pC = (__global float4*)(c + (grid_x << 2) + lid_x * CSTRIDE); - *pC = out; - } -} diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M8.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC6M8.cl deleted file mode 100644 index 29cdb4a6a..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M8.cl +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ -#define STRIDE (25088) -#define CSTRIDE (4096) -#define ITER (392) - -__attribute__((reqd_work_group_size(64, 1, 1))) -__kernel void InnerProduct( - __global const float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - __local float shared_a[8][66]; - __local float shared_b[8][66]; - __local float result[65]; - - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __global const float* pA = (__global const float*)(a); - __global const float* pB = (__global const float*)(b + (grid_x << 3) * STRIDE); - - int offset = ((grid_x << 6)) % STRIDE; - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 8; j++) - { - shared_a[j][lid_x + (lid_x >> 5)] = pA[offset + lid_x + j * STRIDE]; - } - - for (int j = 0; j < 8; j++) - { - shared_b[j][lid_x + (lid_x >> 5)] = pB[offset + lid_x + j * STRIDE]; - } - - for (int k = 0; k < 64; k++) - { - sum += shared_a[lid_x >> 3][k + (k >> 5)] * shared_b[(lid_x & 7)][k + (k >> 5)]; - } - } - - result[lid_x] = sum; - - if (lid_x < 8) - { - float8 out; - float* pOut = (float*)&out; - - for (int i = 0; i < 8; i++) - { - pOut[i] = result[((lid_x * 8 + i))] + bias[(grid_x << 3) + i]; - } - - __global float8* pC = (__global float8*)(c + (grid_x << 3) + lid_x * CSTRIDE); - *pC = out; - } -} diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M1.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC7M1.cl deleted file mode 100644 index f166c449a..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M1.cl +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#define STRIDE (4096) -#define ITER (64) - -__attribute__((reqd_work_group_size(256, 1, 1))) -__kernel void InnerProduct( - __constant float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - __local float shared_b[64][65]; - - __local float result[256]; - - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __constant float* pA = (__constant float*)a; - __global const float* pB = (__global const float*)(b + ((grid_x << 6) + (lid_x >> 6 << 4)) * STRIDE + (lid_x & 63)); - - int offset = (grid_x << 7) % STRIDE; - - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 16; j++) - { - shared_b[(lid_x >> 6 << 4) + j][(lid_x & 63)] = pB[offset + j * STRIDE]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (int k = 0; k < 16; k++) - { - sum += pA[offset + (lid_x >> 6 << 4) + k] * shared_b[lid_x & 63][(lid_x >> 6 << 4) + k];//shared_a[(lid_x >> 6 << 4) + k] - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - result[lid_x] = sum; - barrier(CLK_LOCAL_MEM_FENCE); - - if (lid_x < 64) - { - result[lid_x] += result[lid_x + 64] + result[lid_x + 128] + result[lid_x + 192]; - c[(grid_x << 6) + lid_x] = result[lid_x] + bias[(grid_x << 6) + lid_x]; - } -} - diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M2.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC7M2.cl deleted file mode 100644 index fd4442971..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M2.cl +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ -#define STRIDE (4096) -#define ITER (64) - -__attribute__((reqd_work_group_size(64, 1, 1))) -__kernel void InnerProduct( - __constant float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - __local float shared_b[16][65]; - - __local float result[64]; - - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __constant float* pA = (__constant float*)(a + (grid_x >> 8 << 12)); //correct - __global const float* pB = (__global const float*)(b + ((grid_x & 255) << 4) * STRIDE + lid_x); - - int offset = ((grid_x & 255) << 6) % STRIDE; - - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 16; j++) - { - shared_b[j][lid_x] = pB[offset + j * STRIDE]; - } - //barrier(CLK_LOCAL_MEM_FENCE); - - for (int k = 0; k < 16; k++) - { - sum += pA[offset + (lid_x >> 4 << 4) + k] * shared_b[(lid_x & 15)][(lid_x >> 4 << 4) + k]; - } - //barrier(CLK_LOCAL_MEM_FENCE); - } - - result[lid_x] = sum; - //barrier(CLK_LOCAL_MEM_FENCE); - - if (lid_x < 16) - { - result[lid_x] += result[lid_x + 16] + result[lid_x + 32] + result[lid_x + 48]; - c[(grid_x << 4) + lid_x] = result[lid_x] + bias[((grid_x & 255) << 4) + lid_x]; - } -} - diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M32.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC7M32.cl deleted file mode 100644 index 88a8659d1..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M32.cl +++ /dev/null @@ -1,83 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ -///////////////////////////////////////////////////////// -// FC7 batch 32 Version 3 2018.6.25 - -#define STRIDE (4096) -#define CSTRIDE (4096) -#define ITER (256) - -__attribute__((reqd_work_group_size(256, 1, 1))) -__kernel void InnerProduct( - __global const float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - __local float shared_a[1024]; - __local float shared_b[1024]; - __local float4* pShared_a = (__local float4*)shared_a; - __local float4* pShared_b = (__local float4*)shared_b; - - float4 sha; - float4 shb; - - float4 sum = 0.0f; - float* pSum = (float*)&sum; - - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __global const float* pA = (__global const float*)(a); - __global const float* pB = (__global const float*)(b + (grid_x << 5) * STRIDE); - __global float4* pC = (__global float4*)(c + (lid_x >> 3) * CSTRIDE + (grid_x << 5) + ((lid_x & 7) << 2)); - - int offset = (grid_x << 6) % STRIDE; - - for (int i = 0; i < ITER; i++, offset = (offset + 16) % STRIDE) - { - for (int j = 0; j < 2; j++) - { - shared_a[((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32 + ((((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32) >> 5 << 2)] = pA[((j << 4) + (lid_x >> 4)) * STRIDE + (lid_x & 15) + offset]; - shared_b[((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32 + ((((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32) >> 5 << 2)] = pB[((j << 4) + (lid_x >> 4)) * STRIDE + (lid_x & 15) + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (int k = 0; k < 16; k++) - { - sha = shared_a[(lid_x >> 3) + k * 32 + (((lid_x >> 3) + k * 32) >> 5 << 2)]; - shb = pShared_b[(((lid_x & 7))) + k * 8 + (((((lid_x & 7))) + k * 8) >> 3)]; - sum += sha * shb; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - for (int i = 0; i < 4; i++) - { - shared_a[(lid_x >> 3) * 32 + ((lid_x & 7) << 2) + i] = pSum[i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - float4 result; - float* pResult = (float*)&result; - - for (int i = 0; i < 4; i++) - { - pResult[i] = shared_a[(lid_x << 2) + i] + bias[(grid_x << 5) + ((lid_x & 7) << 2) + i]; - } - - *pC = result; -} diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M4.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC7M4.cl deleted file mode 100644 index f7eb1091b..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M4.cl +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
-*/ -#define STRIDE (4096) -#define CSTRIDE (4096) -#define ITER (64) - -void reduce(__local float* buffer, int tid) -{ - if (tid < 32) - { - buffer[tid << 1] += buffer[(tid << 1) + 1]; - } -} - -__attribute__((reqd_work_group_size(64, 1, 1))) -__kernel void InnerProduct( - __global const float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - __local float shared_a[4][66]; - __local float shared_b[8][66]; - __local float result[65]; - - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __global const float* pA = (__global const float*)(a); - __global const float* pB = (__global const float*)(b + (grid_x << 3) * STRIDE); - - int offset = ((grid_x << 6)) % STRIDE; - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 4; j++) - { - shared_a[j][lid_x + (lid_x >> 5)] = pA[offset + lid_x + j * STRIDE]; - } - - for (int j = 0; j < 8; j++) - { - shared_b[j][lid_x + (lid_x >> 5)] = pB[offset + lid_x + j * STRIDE]; - } - - for (int k = 0; k < 32; k++) - { - sum += shared_a[lid_x >> 4][((lid_x & 1) << 5) + k + ((((lid_x & 1) << 5) + k) >> 5)] * shared_b[(lid_x & 15) >> 1][((lid_x & 1) << 5) + k + ((((lid_x & 1) << 5) + k) >> 5)]; - } - } - - result[lid_x] = sum; - reduce(result, lid_x); - - if (lid_x < 4) - { - float8 out; - float* pOut = (float*)&out; - - for (int i = 0; i < 8; i++) - { - pOut[i] = result[((lid_x * 8 + i) << 1)] + bias[(grid_x << 3) + i]; - } - - __global float8* pC = (__global float8*)(c + (grid_x << 3) + lid_x * CSTRIDE); - *pC = out; - } -} diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M8.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC7M8.cl deleted file mode 100644 index ba62863d5..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M8.cl +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ -#define STRIDE (4096) -#define CSTRIDE (4096) -#define ITER (64) -#define NOVECTOR - -__attribute__((reqd_work_group_size(64, 1, 1))) -__kernel void InnerProduct( - __global const float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - __local float shared_a[8][66]; - __local float shared_b[8][66]; - __local float result[65]; - - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __global const float* pA = (__global const float*)(a); - __global const float* pB = (__global const float*)(b + (grid_x << 3) * STRIDE); - - int offset = ((grid_x << 6)) % STRIDE; - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 8; j++) - { - shared_a[j][lid_x + (lid_x >> 5)] = pA[offset + lid_x + j * STRIDE]; - } - - for (int j = 0; j < 8; j++) - { - shared_b[j][lid_x + (lid_x >> 5)] = pB[offset + lid_x + j * STRIDE]; - } - - for (int k = 0; k < 64; k++) - { - sum += shared_a[lid_x >> 3][k + (k >> 5)] * shared_b[(lid_x & 7)][k + (k >> 5)]; - } - } - - result[lid_x] = sum; - if (lid_x < 8) - { - float8 out; - float* pOut = (float*)&out; - - for (int i = 0; i < 8; i++) - { - pOut[i] = result[((lid_x * 8 + i))] + bias[(grid_x << 3) + i]; - } - - __global float8* pC = (__global float8*)(c + (grid_x << 3) + lid_x * CSTRIDE); - *pC = out; - } -} diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC8.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC8.cl deleted file mode 100644 index 8d9257251..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC8.cl +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ -#define STRIDE (4096) -#define ITER (16) - -#define OUTPUT 1000 - -__attribute__((reqd_work_group_size(256, 1, 1))) -__kernel void InnerProduct( - __constant float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __local float shared_b[64][65]; - __local float result[256]; - - __constant float* pA = (__constant float*)a; - __global const float* pB = (__global const float*)(b + ((grid_x << 4)) * STRIDE + (lid_x & 63)); //correct - - int offset = ((grid_x << 6) + (lid_x >> 6 << 10)) % STRIDE; - - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 16; j++) - { - shared_b[(lid_x >> 6 << 4) + j][(lid_x & 63)] = (offset + j * STRIDE + ((grid_x << 4)) * STRIDE + (lid_x & 63) < OUTPUT * STRIDE ? 
pB[(offset + j * STRIDE)] : 0.0f); //correct - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (int k = 0; k < 16; k++) - { - sum += pA[(offset + ((lid_x & 3) << 4) + k) % STRIDE] * shared_b[(lid_x >> 2)][((lid_x & 3) << 4) + k]; //correct - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - result[lid_x] = sum; - barrier(CLK_LOCAL_MEM_FENCE); - - if (lid_x < 64) - { - result[(lid_x << 2)] += result[(lid_x << 2) + 1] + result[(lid_x << 2) + 2] + result[(lid_x << 2) + 3]; - barrier(CLK_LOCAL_MEM_FENCE); - result[(lid_x << 2)] += result[(lid_x << 2) + 64] + result[(lid_x << 2) + 128] + result[(lid_x << 2) + 192]; - - if (lid_x < 16 && (grid_x << 4) + lid_x < OUTPUT) - { - c[(grid_x << 4) + lid_x] = bias[(grid_x << 4) + lid_x] + result[(lid_x << 2)]; - } - } -} - diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M1.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC8M1.cl deleted file mode 100644 index 7535432d2..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M1.cl +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#define STRIDE (4096) -#define ITER (16) - -#define OUTPUT 1000 - -__attribute__((reqd_work_group_size(256, 1, 1))) -__kernel void InnerProduct( - __constant float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __local float shared_b[64][65]; - __local float result[256]; - - __constant float* pA = (__constant float*)a; - __global const float* pB = (__global const float*)(b + ((grid_x << 4)) * STRIDE + (lid_x & 63)); //correct - - int offset = ((grid_x << 6) + (lid_x >> 6 << 10)) % STRIDE; - - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 16; j++) - { - shared_b[(lid_x >> 6 << 4) + j][(lid_x & 63)] = (offset + j * STRIDE + ((grid_x << 4)) * STRIDE + (lid_x & 63) < OUTPUT * STRIDE ? 
pB[(offset + j * STRIDE)] : 0.0f); //correct - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (int k = 0; k < 16; k++) - { - sum += pA[(offset + ((lid_x & 3) << 4) + k) % STRIDE] * shared_b[(lid_x >> 2)][((lid_x & 3) << 4) + k]; //correct - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - result[lid_x] = sum; - barrier(CLK_LOCAL_MEM_FENCE); - - if (lid_x < 64) - { - result[(lid_x << 2)] += result[(lid_x << 2) + 1] + result[(lid_x << 2) + 2] + result[(lid_x << 2) + 3]; - barrier(CLK_LOCAL_MEM_FENCE); - result[(lid_x << 2)] += result[(lid_x << 2) + 64] + result[(lid_x << 2) + 128] + result[(lid_x << 2) + 192]; - - if (lid_x < 16 && (grid_x << 4) + lid_x < OUTPUT) - { - c[(grid_x << 4) + lid_x] = bias[(grid_x << 4) + lid_x] + result[(lid_x << 2)]; - } - } -} - diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M2.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC8M2.cl deleted file mode 100644 index 9e8c270b7..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M2.cl +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#define STRIDE (4096) -#define HSTRIDE (2048) -#define ITER (32) - -#define OUTPUT 1000 - -void reduce(__local float* buffer, int tid) -{ - if (tid < 64) - { - buffer[tid] += buffer[tid + 64]; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 32) - { - buffer[tid << 1] += buffer[(tid << 1) + 1]; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 16) - { - buffer[tid << 2] += buffer[(tid << 2) + 2]; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 8) - { - buffer[tid << 3] += buffer[(tid << 3) + 4]; - } - barrier(CLK_LOCAL_MEM_FENCE); -} - -__attribute__((reqd_work_group_size(128, 1, 1))) -__kernel void InnerProduct( - __constant float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __local float shared_b[16][65]; - __local float result[129]; - - __constant float* pA = (__constant float*)(a + (grid_x >> 7 << 12)); - __global const float* pB = (__global const float*)(b); - - int offset = (((grid_x & 127) << 6)) % HSTRIDE; - - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % HSTRIDE) - { - for (int j = 0; j < 8; j++) - { - shared_b[(lid_x >> 6 << 3) + j][(lid_x & 63)] = (offset + (lid_x >> 6 << 11) + (j + ((grid_x & 127) << 3)) * STRIDE + (lid_x & 63) < OUTPUT * STRIDE ? 
pB[offset + (lid_x >> 6 << 11) + (j + ((grid_x & 127) << 3)) * STRIDE + (lid_x & 63)] : 0.0f); //correct - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (int k = 0; k < 8; k++) - { - sum += pA[(offset + ((lid_x & 7) << 3) + k) % HSTRIDE + (lid_x >> 6 << 11)] * shared_b[(lid_x >> 6 << 3) + ((lid_x & 63) >> 3)][((lid_x & 7) << 3) + k]; //correct - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - result[lid_x] = sum; - barrier(CLK_LOCAL_MEM_FENCE); - - reduce(result, lid_x); - - if (lid_x < 8 && ((grid_x & 127) << 3) + lid_x < OUTPUT) - { - int out_offset = ((grid_x >> 7) * OUTPUT + ((grid_x & 127) << 3) + lid_x); - c[out_offset] = bias[((grid_x & 127) << 3) + lid_x] + result[(lid_x << 3)]; - } -} - diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M32.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC8M32.cl deleted file mode 100644 index 99687a01b..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M32.cl +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#define STRIDE (4096) -#define ITER (64) - -#define OUTPUT 1000 - -void reduce(__local float* buffer, int tid) -{ - if (tid < 32) - { - buffer[tid << 1] += buffer[(tid << 1) + 1]; - } -} - -__attribute__((reqd_work_group_size(64, 1, 1))) -__kernel void InnerProduct( - __global const float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __local float shared_a[4][65]; - __local float shared_b[8][65]; - __local float result[65]; - - __global const float* pA = (__global const float*)(a + (grid_x >> 7 << 14)); //correct - __global const float* pB = (__global const float*)(b); //correct - - int offset = (((grid_x & 127) << 6)) % STRIDE; - - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 4; j++) - { - shared_a[j][lid_x] = pA[offset + j * STRIDE + lid_x]; - } - - for (int j = 0; j < 8; j++) - { - shared_b[j][(lid_x)] = ((j + ((grid_x & 127) << 3)) * STRIDE + (offset + lid_x) < OUTPUT * STRIDE ? pB[(j + ((grid_x & 127) << 3)) * STRIDE + (offset + lid_x)] : 0.0f); //correct - } - - for (int k = 0; k < 32; k++) - { - sum += shared_a[lid_x >> 4][((lid_x & 1) << 5) + k] * shared_b[((lid_x & 15) >> 1)][((lid_x & 1) << 5) + k]; //correct - } - } - - result[lid_x] = sum; - reduce(result, lid_x); - - if (lid_x < 32 && ((grid_x & 127) << 3) + (lid_x & 7) < OUTPUT) - { - int out_offset = ((grid_x >> 7 << 2) + (lid_x >> 3)) * OUTPUT + ((grid_x & 127) << 3) + (lid_x & 7); - c[out_offset] = bias[((grid_x & 127) << 3) + (lid_x & 7)] + result[(lid_x << 1)]; - } -} - diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M4.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC8M4.cl deleted file mode 100644 index 952e71fd5..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M4.cl +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
- - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#define STRIDE (4096) -#define ITER (64) - -#define OUTPUT 1000 - -void reduce(__local float* buffer, int tid) -{ - if (tid < 32) - { - buffer[tid << 1] += buffer[(tid << 1) + 1]; - } - if (tid < 16) - { - buffer[tid << 2] += buffer[(tid << 2) + 2]; - } - if (tid < 8) - { - buffer[tid << 3] += buffer[(tid << 3) + 4]; - } -} - -__attribute__((reqd_work_group_size(64, 1, 1))) -__kernel void InnerProduct( - __constant float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __local float shared_b[8][65]; - __local float result[64]; - - __constant float* pA = (__constant float*)(a + (grid_x >> 7 << 12)); //correct - __global const float* pB = (__global const float*)(b); //correct - - int offset = (((grid_x & 127) << 6)) % STRIDE; - - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - for (int j = 0; j < 8; j++) - { - shared_b[j][(lid_x)] = (offset + (j + ((grid_x & 127) << 3)) * STRIDE + (lid_x) < OUTPUT * STRIDE ? pB[offset + (j + ((grid_x & 127) << 3)) * STRIDE + (lid_x)] : 0.0f); //correct - } - - for (int k = 0; k < 8; k++) - { - sum += pA[(offset + ((lid_x & 7) << 3) + k)] * shared_b[((lid_x) >> 3)][((lid_x & 7) << 3) + k]; //correct - } - } - - result[lid_x] = sum; - - reduce(result, lid_x); - - if (lid_x < 8 && ((grid_x & 127) << 3) + lid_x < OUTPUT) - { - int out_offset = ((grid_x >> 7) * OUTPUT + ((grid_x & 127) << 3) + lid_x); - c[out_offset] = bias[((grid_x & 127) << 3) + lid_x] + result[(lid_x << 3)]; - } -} - diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M8.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC8M8.cl deleted file mode 100644 index 3e72c5054..000000000 --- a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M8.cl +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#define STRIDE (4096) -#define ITER (64) - -#define OUTPUT 1000 - -void reduce(__local float* buffer, int tid) -{ - if (tid < 32) - { - buffer[tid << 1] += buffer[(tid << 1) + 1]; - } - if (tid < 16) - { - buffer[tid << 2] += buffer[(tid << 2) + 2]; - } - if (tid < 8) - { - buffer[tid << 3] += buffer[(tid << 3) + 4]; - } -} - -__attribute__((reqd_work_group_size(64, 1, 1))) -__kernel void InnerProduct( - __global const float *a, - __global const float *b, - __global const float *bias, - __global float *c) -{ - int gid_x = get_global_id(0); - int lid_x = get_local_id(0); - int grid_x = get_group_id(0); - - __local float shared_a[65]; - __local float shared_b[8][65]; - __local float result[64]; - - __global const float* pA = (__global const float*)(a + (grid_x >> 7 << 12)); //correct - __global const float* pB = (__global const float*)(b); //correct - - int offset = (((grid_x & 127) << 6)) % STRIDE; - - float sum = 0.0f; - - for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) - { - shared_a[lid_x] = pA[offset + lid_x]; - - for (int j = 0; j < 8; j++) - { - shared_b[j][(lid_x)] = (offset + (j + ((grid_x & 127) << 3)) * STRIDE + (lid_x) < OUTPUT * STRIDE ? pB[offset + (j + ((grid_x & 127) << 3)) * STRIDE + (lid_x)] : 0.0f); //correct - } - - for (int k = 0; k < 8; k++) - { -#if 1 - sum = mad(shared_a[((lid_x & 7) << 3) + k], shared_b[((lid_x) >> 3)][((lid_x & 7) << 3) + k], sum); //correct -#else - sum += shared_a[((lid_x & 7) << 3) + k] * shared_b[((lid_x) >> 3)][((lid_x & 7) << 3) + k]; //correct -#endif - } - } - - result[lid_x] = sum; - - reduce(result, lid_x); - - if (lid_x < 8 && ((grid_x & 127) << 3) + lid_x < OUTPUT) - { - int out_offset = ((grid_x >> 7) * OUTPUT + ((grid_x & 127) << 3) + lid_x); - c[out_offset] = bias[((grid_x & 127) << 3) + lid_x] + result[(lid_x << 3)]; - } -} - diff --git a/saber/funcs/impl/amd/cl/MIOpenBiasReLuPooling.cl b/saber/funcs/impl/amd/cl/MIOpenBiasReLuPooling.cl deleted file mode 100644 index a17634bc9..000000000 --- a/saber/funcs/impl/amd/cl/MIOpenBiasReLuPooling.cl +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#define _FLOAT float -#define _FLOAT2 float2 -#define _FLOAT4 float4 -#define _FLOAT8 float8 -#define _INT_MASK_GLOBAL uchar -#define _INT_MASK_LOCAL uchar - -#ifndef FLT_MAX -#define FLT_MAX 3.402823466e+38F /* max value */ -#endif - -#define UNUSED __attribute__((__unused__)) - -#define MLO_POOLING_OP_MAX 0 -#define MLO_POOLING_OP_AVE 1 -#define MLO_POOLING_OP_STC 2 - -#define MLO_POOLING_GROUP_SZ2 1 - -#ifndef MLO_POOLING_OP_ID -#define MLO_POOLING_OP_ID 0 -#endif -// max -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX -#define MLO_POOLING_OP(A, B) fmax(A, B); -#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE -#define MLO_POOLING_OP(A, B) (A + B); -#endif - -/********************************************************************************* - -**********************************************************************************/ - -#define THREAD_PER_WAVE 64 -#define WAVE_PER_4SIMD 40 - -#define MLO_BOT_DATA_SZ0 2 -#define MLO_BOT_DATA_SZ1 2 - -//#define LOCAL_MEMORY - -__attribute__((reqd_work_group_size(256, 1, 1))) __kernel void -mloPooling(const __global _FLOAT* bot, - __global _FLOAT* top, - __global _FLOAT* bias, - float slope) -{ - uint gid = get_global_id(0); - uint ob = BATCH_NUM * MLO_POOLING_N_OUTPUTS; // output * batch_sz - uint bot_off = 0; - uint top_off = gid; - - _FLOAT2 bot_data[MLO_BOT_DATA_SZ1]; - _FLOAT res; - -#ifdef LOCAL_MEMORY - __local _FLOAT write_combine[256]; - __local _FLOAT4* p_write_combine = (__local _FLOAT4*)write_combine; - __global _FLOAT4* p_top; -#endif - - uint loop_num = ((ob * MLO_POOLING_TOP_STRIDE * MLO_POOLING_TOP_HEIGHT + THREAD_PER_WAVE * CU_NUM * WAVE_PER_4SIMD - 1) / (THREAD_PER_WAVE * CU_NUM * WAVE_PER_4SIMD)); - uint top_loop_stride = THREAD_PER_WAVE * CU_NUM * WAVE_PER_4SIMD; - - for (int index = 0; index < loop_num && top_off < ob * MLO_POOLING_TOP_STRIDE * MLO_POOLING_TOP_HEIGHT; index++, top_off += top_loop_stride) - { - uint bot_b = (top_off / MLO_POOLING_TOP_BATCH_STRIDE); - uint bot_c = (top_off % MLO_POOLING_TOP_BATCH_STRIDE / MLO_POOLING_TOP_CHANNEL_STRIDE); - uint bot_y = (top_off % MLO_POOLING_TOP_CHANNEL_STRIDE / MLO_POOLING_TOP_STRIDE) << 1; - uint bot_x = (top_off % MLO_POOLING_TOP_STRIDE) << 1; - - bot_off = bot_b * MLO_POOLING_BOT_BATCH_STRIDE + bot_c * MLO_POOLING_BOT_CHANNEL_STRIDE + bot_y * MLO_POOLING_BOT_STRIDE + bot_x; -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX - res = -FLT_MAX; -#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - res = 0; -#endif - -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - uint pool_size = 0; -#endif - for(uint j = 0; j < MLO_BOT_DATA_SZ1; ++j) - { - uint bot_gbl_off = bot_off + j * MLO_POOLING_BOT_STRIDE; - __global _FLOAT2* read = (__global _FLOAT2*)(bot + bot_gbl_off); - bot_data[j] = *read; - bot_data[j] += bias[bot_c]; - bot_data[j].s0 *= (bot_data[j].s0 > 0.0f ? 1.0f : slope); - bot_data[j].s1 *= (bot_data[j].s1 > 0.0f ? 
1.0f : slope); - -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - pool_size += (uint)vis; -#endif - res = MLO_POOLING_OP(res, bot_data[j].s0); - - -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - pool_size += (uint)vis; -#endif - res = MLO_POOLING_OP(res, bot_data[j].s1); - } - -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - res *= 1.f / (_FLOAT)pool_size; -#endif - -#ifdef LOCAL_MEMORY - write_combine[get_local_id(0)] = res; - if (get_local_id(0) % 4 == 0) - { - p_top = (__global _FLOAT4*)(top + top_off); - *p_top = p_write_combine[get_local_id(0) / 4]; - } -#else - top[top_off] = res; -#endif - } -} diff --git a/saber/funcs/impl/amd/cl/Pooling.cl b/saber/funcs/impl/amd/cl/Pooling.cl deleted file mode 100644 index 31927ad16..000000000 --- a/saber/funcs/impl/amd/cl/Pooling.cl +++ /dev/null @@ -1,164 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#define _FLOAT float -#define _FLOAT2 float2 -#define _FLOAT4 float4 -#define _FLOAT8 float8 -#define _INT_MASK_GLOBAL uchar -#define _INT_MASK_LOCAL uchar - -#ifndef FLT_MAX -#define FLT_MAX 3.402823466e+38F /* max value */ -#endif - -#define UNUSED __attribute__((__unused__)) - -#define MLO_POOLING_OP_MAX 0 -#define MLO_POOLING_OP_AVE 1 -#define MLO_POOLING_OP_STC 2 - -#define MLO_POOLING_GROUP_SZ2 1 - -#ifndef MLO_POOLING_OP_ID -#define MLO_POOLING_OP_ID 0 -#endif -// max -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX -#define MLO_POOLING_OP(A, B) fmax(A, B); -#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE -#define MLO_POOLING_OP(A, B) (A + B); -#endif - -/********************************************************************************* - -**********************************************************************************/ - -#define THREAD_PER_WAVE 64 -#define WAVE_PER_4SIMD 40 - -#define MLO_BOT_DATA_SZ0 2 -#define MLO_BOT_DATA_SZ1 2 - -//#define LOCAL_MEMORY - -__attribute__((reqd_work_group_size(256, 1, 1))) __kernel void -mloPooling(const __global _FLOAT* bot, - __global _FLOAT* top ) -// __global _INT_MASK_GLOBAL* mask) -{ - uint gid = get_global_id(0); - uint ob = BATCH_NUM * MLO_POOLING_N_OUTPUTS; // output * batch_sz - uint bot_off = 0; - uint top_off = gid; - - _FLOAT2 bot_data[MLO_BOT_DATA_SZ1]; - _FLOAT res; - -#ifdef LOCAL_MEMORY - __local _FLOAT write_combine[256]; - __local _FLOAT4* p_write_combine = (__local _FLOAT4*)write_combine; - __global _FLOAT4* p_top; -#endif - - uint loop_num = ((ob * MLO_POOLING_TOP_STRIDE * MLO_POOLING_TOP_HEIGHT + THREAD_PER_WAVE * CU_NUM * WAVE_PER_4SIMD - 1) / (THREAD_PER_WAVE * CU_NUM * WAVE_PER_4SIMD)); - uint top_loop_stride = THREAD_PER_WAVE * CU_NUM * WAVE_PER_4SIMD; - - for (int index = 0; index < loop_num && top_off < ob * MLO_POOLING_TOP_STRIDE * MLO_POOLING_TOP_HEIGHT; index++, top_off += top_loop_stride) - { - uint bot_b = (top_off / MLO_POOLING_TOP_BATCH_STRIDE); - uint bot_c = (top_off % MLO_POOLING_TOP_BATCH_STRIDE / MLO_POOLING_TOP_CHANNEL_STRIDE); - uint bot_y = (top_off % MLO_POOLING_TOP_CHANNEL_STRIDE / MLO_POOLING_TOP_STRIDE) 
<< 1; - uint bot_x = (top_off % MLO_POOLING_TOP_STRIDE) << 1; - - bot_off = bot_b * MLO_POOLING_BOT_BATCH_STRIDE + bot_c * MLO_POOLING_BOT_CHANNEL_STRIDE + bot_y * MLO_POOLING_BOT_STRIDE + bot_x; -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX - res = -FLT_MAX; -#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - res = 0; -#endif - -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - uint pool_size = 0; -#endif - for(uint j = 0; j < MLO_BOT_DATA_SZ1; ++j) - { - //int run_y = (int)j; - - uint bot_gbl_off = bot_off + j * MLO_POOLING_BOT_STRIDE; - __global _FLOAT2* read = (__global _FLOAT2*)(bot + bot_gbl_off); - bot_data[j] = *read; - - //int run_x = (int)bot_x; -//#if 1 -// bool vis = true; -//#else -// bool vis = ((run_y >= 0 && run_y < MLO_POOLING_BOT_HEIGHT) && -// (run_x >= 0 && run_x < MLO_POOLING_BOT_WIDTH)) -// ? true -// : false; -//#endif -// bot_data[j].s0 = (vis) ? bot_data[j].s0 : -//#if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX -// -FLT_MAX -//#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE -// 0 -//#endif -// ; - -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - pool_size += (uint)vis; -#endif - res = MLO_POOLING_OP(res, bot_data[j].s0); - - - //run_x++; -//#if 1 -//#else -// vis = ((run_y >= 0 && run_y < MLO_POOLING_BOT_HEIGHT) && -// (run_x >= 0 && run_x < MLO_POOLING_BOT_WIDTH)) -// ? true -// : false; -//#endif -// bot_data[j].s1 = (vis) ? bot_data[j].s1 : -//#if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX -// -FLT_MAX -//#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE -// 0 -//#endif -// ; - -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - pool_size += (uint)vis; -#endif - res = MLO_POOLING_OP(res, bot_data[j].s1); - } - -#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE - res *= 1.f / (_FLOAT)pool_size; -#endif - -#ifdef LOCAL_MEMORY - write_combine[get_local_id(0)] = res; - if (get_local_id(0) % 4 == 0) - { - p_top = (__global _FLOAT4*)(top + top_off); - *p_top = p_write_combine[get_local_id(0) / 4]; - } -#else - top[top_off] = res; -#endif - } -} diff --git a/saber/funcs/impl/amd/cl/Relu.cl b/saber/funcs/impl/amd/cl/Relu.cl deleted file mode 100644 index 87f0e3e42..000000000 --- a/saber/funcs/impl/amd/cl/Relu.cl +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
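The pooling kernels removed here both walk the output tensor in NCHW order: each work-item maps its top offset back to a 2x2 input window (the << 1 shifts), and the fused variant additionally adds a per-channel bias and applies a leaky-ReLU slope before pooling. Below is a minimal host-side sketch of that computation, assuming NCHW layout, a 2x2 window with stride 2, and the max-pooling path (MLO_POOLING_OP_ID == 0); the function and parameter names are illustrative, not part of the deleted sources.

#include <algorithm>
#include <cstddef>
#include <vector>

// Reference for the fused bias + leaky-ReLU + 2x2/stride-2 max pooling that the
// deleted kernels implement on the GPU. Layout is NCHW; `slope` is the negative
// slope applied before pooling, matching the kernel's "x * (x > 0 ? 1 : slope)".
void pool2x2_bias_leaky_relu_ref(const std::vector<float>& bot,   // N*C*H*W input
                                 const std::vector<float>& bias,  // C per-channel bias
                                 std::vector<float>& top,         // N*C*(H/2)*(W/2) output
                                 int N, int C, int H, int W, float slope) {
    const int OH = H / 2, OW = W / 2;
    top.assign(static_cast<size_t>(N) * C * OH * OW, 0.f);
    for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
    for (int oy = 0; oy < OH; ++oy)
    for (int ox = 0; ox < OW; ++ox) {
        float res = -3.402823466e+38F;  // -FLT_MAX, identity for max pooling
        for (int ky = 0; ky < 2; ++ky)
        for (int kx = 0; kx < 2; ++kx) {
            size_t in_idx = ((static_cast<size_t>(n) * C + c) * H + oy * 2 + ky) * W
                            + ox * 2 + kx;
            float v = bot[in_idx] + bias[c];   // fused bias add
            v *= (v > 0.f ? 1.f : slope);      // fused leaky ReLU
            res = std::max(res, v);
        }
        top[((static_cast<size_t>(n) * C + c) * OH + oy) * OW + ox] = res;
    }
}

The MLO_POOLING_OP_AVE path of the kernel accumulates the same four values and scales by the window size instead of taking the maximum.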
-*/ -#define _FLOAT float -#define _FLOAT2 float2 -#define _FLOAT4 float4 -#define _FLOAT8 float8 - -#ifndef FLT_MAX -#define FLT_MAX 3.402823466e+38F /* max value */ -#endif - -#define UNUSED __attribute__((__unused__)) - -#ifndef MLO_FILTER_STRIDE0 -#define MLO_FILTER_STRIDE0 1 -#endif -#ifndef MLO_FILTER_STRIDE1 -#define MLO_FILTER_STRIDE1 1 -#endif - -#define MLO_GRP_SZ0 (MLO_GRP_TILE0 * MLO_GRP_TILE1) -#define MLO_GRP_SZ1 1 -#define MLO_GRP_SZ2 1 -#define MLO_GRP_SZ (MLO_GRP_SZ0 * MLO_GRP_SZ1 * MLO_GRP_SZ2) -#define MLO_ALU_TILE_SZ (MLO_ALU_VTILE1 * MLO_ALU_VTILE0) - -#define MLO_N_ALUTILES_TOTAL ((MLO_GRP_TILE0 * MLO_GRP_TILE1) / (MLO_ALU_TILE_SZ)) -#define MLO_N_ALUTILES_PERSTACK (MLO_N_ALUTILES_TOTAL / MLO_N_STACKS) -#define MLO_ALUTILES_STACK_SZ (MLO_N_ALUTILES_PERSTACK * MLO_ALU_TILE_SZ) - -#define MLO_N_OUT_TILE_BLOCKS0 ((MLO_OUT_WIDTH + MLO_IN_TILE0 - 1) / MLO_IN_TILE0) -#define MLO_N_OUT_TILE_BLOCKS1 ((MLO_OUT_HEIGHT + MLO_IN_TILE1 - 1) / MLO_IN_TILE1) - -#define MLO_N_OUT_PACKS (MLO_N_OUTPUTS / MLO_N_OUT_TILES_PERSTACK) -#if MLO_N_OUT_PACKS * MLO_N_OUT_TILES_PERSTACK == MLO_N_OUTPUTS && \ - MLO_N_OUT_TILES_PERSTACK != MLO_N_OUTPUTS -#define MLO_OUTPUTS_ALIGNED 1 -#else -#define MLO_OUTPUTS_ALIGNED 0 -#endif - -#define MLO_N_BATCH_PACKS (MLO_BATCH_SZ / MLO_N_STACKS) -#if MLO_N_BATCH_PACKS * MLO_N_STACKS == MLO_BATCH_SZ && MLO_N_STACKS != MLO_BATCH_SZ -#define MLO_BATCH_ALIGNED 1 -#else -#define MLO_BATCH_ALIGNED 0 -#endif - -#if defined(__AMDGCN__) -extern uint __llvm_amdgcn_readfirstlane(uint) __asm("llvm.amdgcn.readfirstlane"); -#define uniform(x) __llvm_amdgcn_readfirstlane(x) -#else -#define uniform(x) (x) -#endif - -static inline uint iDiv(uint v, uint d) -{ - uint r = (uint)((float)v * (1.0f / (float)d) + 0.00001f); - return (r); -} - -static inline uint iMod(uint v, uint u, uint d) -{ - uint r = v - mul24((uint)u, (uint)d); - return (r); -} - -__attribute__((reqd_work_group_size(MLO_GRP_SZ0, MLO_GRP_SZ1, MLO_GRP_SZ2))) __kernel void -Relu(const __global _FLOAT* __restrict in, - __global _FLOAT* __restrict out, - _FLOAT slope) -{ - uint grp_id0 = get_group_id(0); -#if MLO_N_OUT_TILE_BLOCKS0 & (MLO_N_OUT_TILE_BLOCKS0 - 1) - uint y_tile_blk = iDiv(grp_id0, MLO_N_OUT_TILE_BLOCKS0); - uint x_tile_blk = iMod(grp_id0, y_tile_blk, MLO_N_OUT_TILE_BLOCKS0); -#else - uint y_tile_blk = grp_id0 / MLO_N_OUT_TILE_BLOCKS0; - uint x_tile_blk = grp_id0 & (MLO_N_OUT_TILE_BLOCKS0 - 1); -#endif - uint o_pack = get_group_id(1); // block of outputs - uint b_pack = get_group_id(2); // batch block - - uint lcl_id = get_local_id(0); -#if MLO_ALUTILES_STACK_SZ >= MLO_GRP_SZ - uint stack = 0; - uint alu_stack_id = lcl_id; -#elif MLO_ALUTILES_STACK_SZ & (MLO_ALUTILES_STACK_SZ - 1) - uint stack = iDiv(lcl_id, MLO_ALUTILES_STACK_SZ); // stack - uint alu_stack_id = iMod(lcl_id, stack, MLO_ALUTILES_STACK_SZ); // alu index in stack -#else - uint stack = lcl_id / MLO_ALUTILES_STACK_SZ; // stack - uint alu_stack_id = lcl_id & (MLO_ALUTILES_STACK_SZ - 1); // alu index in stack -#if MLO_ALUTILES_STACK_SZ >= 64 - stack = uniform(stack); -#endif -#endif -// ALU plane inside stack -#if MLO_ALU_TILE_SZ & (MLO_ALU_TILE_SZ - 1) - uint alu_out_plane_id = iDiv(alu_stack_id, MLO_ALU_TILE_SZ); // alu output plane index - uint alu_out_id = iMod( - alu_stack_id, alu_out_plane_id, MLO_ALU_TILE_SZ); // alu index inside an ALU output plane -#else - uint alu_out_plane_id = alu_stack_id / MLO_ALU_TILE_SZ; // alu output plane index - uint alu_out_id = alu_stack_id & (MLO_ALU_TILE_SZ - 1); // alu index inside an ALU output 
plane -#endif -// pos inside ALU tile -#if MLO_ALU_VTILE0 & (MLO_ALU_VTILE0 - 1) - uint alu_tl1 = iDiv(alu_out_id, MLO_ALU_VTILE0); - uint alu_tl0 = iMod(alu_out_id, alu_tl1, MLO_ALU_VTILE0); -#else - uint alu_tl1 = alu_out_id / MLO_ALU_VTILE0; - uint alu_tl0 = alu_out_id & (MLO_ALU_VTILE0 - 1); -#endif - - uint o_map_plane = - o_pack * MLO_N_OUT_TILES_PERSTACK; // first output maps index per full ALU plane stack - uint o_map_base = alu_out_plane_id * MLO_N_OUT_TILES; // local output map offset - uint o_map = o_map_plane + o_map_base; // output map index per ALU plane - uint b_index = b_pack * MLO_N_STACKS; - - uint x_grp = x_tile_blk * MLO_IN_TILE0 * MLO_FILTER_STRIDE0; - uint y_grp = y_tile_blk * MLO_IN_TILE1 * MLO_FILTER_STRIDE1; - -// write results out -#if MLO_FILTER_STRIDE0 == 1 - int x_out_grp = x_grp; -#else - int x_out_grp = x_tile_blk * MLO_IN_TILE0; -#endif -#if MLO_FILTER_STRIDE1 == 1 - int y_out_grp = y_grp; -#else - int y_out_grp = y_tile_blk * MLO_IN_TILE1; -#endif - int x_out_lcl = alu_tl0 * MLO_OUT_TILE0; - int y_out_lcl = alu_tl1 * MLO_OUT_TILE1; - - uint out_off = (b_index + stack) * MLO_OUT_BATCH_STRIDE + o_map * MLO_OUT_CHANNEL_STRIDE + - (y_out_grp + y_out_lcl) * MLO_OUT_STRIDE + x_out_grp + x_out_lcl; -// over all local stacks -#if MLO_BATCH_ALIGNED == 0 - if(b_index + stack < MLO_BATCH_SZ) -#endif - { - // over all local outputs - int out_off1 = out_off; - for(uint o = 0; o < MLO_N_OUT_TILES; ++o, out_off1 += MLO_OUT_CHANNEL_STRIDE) - { -#if MLO_OUTPUTS_ALIGNED == 0 - if(o_map + o < MLO_N_OUTPUTS) -#endif - { - // over output tile - int out_off2 = out_off1; - for(uint j = 0; j < MLO_OUT_TILE1; ++j, out_off2 += MLO_OUT_STRIDE) - { - if(y_out_grp + y_out_lcl + j < MLO_OUT_HEIGHT) - { - for (uint i = 0; i < MLO_OUT_TILE0; ++i) - { - if (x_out_grp + x_out_lcl + i < MLO_OUT_WIDTH && out_off2 + i < MLO_OUT_BATCH_STRIDE * MLO_BATCH_SZ) - { - //ReLU - //out[out_off2 + i] = max(in[out_off2 + i], 0.0f); - out[out_off2 + i] = (in[out_off2 + i] > 0) ? in[out_off2 + i] : in[out_off2 + i] * slope; - } - } - } - } - } - } - } -} diff --git a/saber/funcs/impl/amd/cl/Softmax.cl b/saber/funcs/impl/amd/cl/Softmax.cl deleted file mode 100644 index 32e61fa13..000000000 --- a/saber/funcs/impl/amd/cl/Softmax.cl +++ /dev/null @@ -1,220 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -/* Steps to compute softmax: - * 1. Compute the max per channel. - * 2. Subtract the max from each value in the channel. - * 3. Compute the exponent of all the values. - * 4. Compute the sum of the vales per channel. - * 5. Normalize based on the sum. - * - * We use CSR-{Vector / Stream} apprach to pick an algorithm depending on the - * number of channels each workgroup has to work with. - * J. L. Greathouse, M. Daga, Efficient sparse matrix-vector multiplication - * on GPUs using the CSR storage format, in: Proc. Int'l Conf. 
High Performance - * Computing, Networking, Storage and Analysis (SC'14) -*/ - -kernel void Softmax(global float* y, const int c, const int grid_size, const int spatial_dim) -{ -#if NUM_BATCH == 1 // CSR-Vector like appraoch - - /* Entire workgroup works on one spatial_dim. - * We use logarthmic reductions to compute max and sum per channel. - * This approach reads in the same data thrice from DRAM but is still better - * than launching three different kernels. - * The workgroup begins by computing the nth image and s (spatial_dim) it - * is working on and iterates over the entire grid until finished. - */ - - local float l_helper[256]; - - int gid = get_group_id(0); - int lid = get_local_id(0); - - // Total number of workgroups launched can be less than the gridsize, hence iterate over. - for(gid = get_group_id(0); gid < grid_size; gid += get_num_groups(0)) - { - - int n = gid / spatial_dim; // nth image - int s = gid % spatial_dim; // spatial dimension (h*w) - - l_helper[lid] = -FLT_MAX; - - float t_helper = -FLT_MAX; // thread_local helper var - - // Compute max per channel - // Iterate over all the channels one thread is supposed to loop over - // and compute max - for(int i = lid; i < c; i += get_local_size(0)) - { - t_helper = max(y[mad24(n, c, i) * spatial_dim + s], t_helper); - } - - // Now we have to compute the max from 256 values (one per each thread) - l_helper[lid] = t_helper; - barrier(CLK_LOCAL_MEM_FENCE); - - // Logarithmic reduction to compute the max. - for(int i = (get_local_size(0) >> 1); i > 0; i >>= 1) - { - if(lid < i) - { - l_helper[lid] = max(l_helper[lid], l_helper[lid + i]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - float channel_max = l_helper[0]; - t_helper = 0.; - - // Subtract channel_max from each value - for(int i = lid; i < c; i += get_local_size(0)) - { - float value = y[mad24(n, c, i) * spatial_dim + s]; - - // Compute exponent of each value - // Then sum all the values touched by this thread - t_helper += exp(value - channel_max); - } - - l_helper[lid] = t_helper; - barrier(CLK_LOCAL_MEM_FENCE); - - // Compute sum of 256 values (one for each thread) - // Logarithmic reduction to compute the sum - for(int i = (get_local_size(0) >> 1); i > 0; i >>= 1) - { - if(lid < i) - { - l_helper[lid] += l_helper[lid + i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - float channel_sum = l_helper[0]; - - // Normalize each value in the channel by the channel_sum - for(int i = lid; i < c; i += get_local_size(0)) - { - float value = y[mad24(n, c, i) * spatial_dim + s]; - - // Subtracting max again because we do not write the output of - // value-max to DRAM above. Doing a subtraction again is much - // faster than writing uncoalesced to DRAM - value = exp(value - channel_max); - - y[mad24(n, c, i) * spatial_dim + s] = value / channel_sum; - } - } - -#else // CSR-Stream like approach - - /* Each workgroup is computing the softmax for NUM_BATCH spatial_dims ala CSR-Stream. - * The number of threads iterting over channels to compute softmax for one batch is BATCH_SIZE. - * The number of values each thread works on is U_BATCH_SIZE (read micro batch size). - * Each batch in the workgroup works on its nth image and s (spatial_dim). - * E.g. a 256 thread workgroup with c=31 has 8 batches and a batchsize of 32. - * The number of workgroups launched are exactly the number as required - * hence, there is no for-loop. 
- */ - - local float l_helper[256]; - - int gid = get_group_id(0); - int lid = get_local_id(0); - - // ID of the thread within the batch - int batch_lid = lid & (BATCH_SIZE - 1); // thread specific channel_st - int batch = lid / BATCH_SIZE; // which spatial_dim or pixel - - // Batch specific n and s - int batch_n = (NUM_BATCH * gid + batch) / spatial_dim; // nth image - int batch_s = (NUM_BATCH * gid + batch) % spatial_dim; // which spatial_dim/pixel - - l_helper[lid] = -FLT_MAX; - - float t_helper = -FLT_MAX; // thread_local helper var - - // stores all the values touched by one thread so that we do not have load - // again as the CSR-Vector approach - float value[U_BATCH_SIZE]; - for(int i = 0; i < U_BATCH_SIZE; i++) - { - value[i] = -FLT_MAX; - } - - // Compute max per channel - // BATCH_SIZE threads iterate over the channels - for(int i = batch_lid; i < c; i += BATCH_SIZE) - { - if(mad24(batch_n, c, i) * spatial_dim + batch_s < c * grid_size) - value[i / BATCH_SIZE] = y[mad24(batch_n, c, i) * spatial_dim + batch_s]; - t_helper = max(value[i / BATCH_SIZE], t_helper); - } - - // Now we have to compute the max from 256 values (one per each thread) - l_helper[lid] = t_helper; - barrier(CLK_LOCAL_MEM_FENCE); - - // Logarithmic reduction to compute the max. - for(int i = (BATCH_SIZE >> 1); i > 0; i >>= 1) - { - if(batch_lid < i) - { - l_helper[lid] = max(l_helper[lid], l_helper[lid + i]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - float channel_max = l_helper[batch * BATCH_SIZE]; - t_helper = 0.; - - // Subtract channel_max from each value - for(int i = batch_lid; i < c; i += BATCH_SIZE) - { - - // Compute exponent of each value - // Then sum all the values touched by this thread - t_helper += exp(value[i / BATCH_SIZE] - channel_max); - } - - l_helper[lid] = t_helper; - barrier(CLK_LOCAL_MEM_FENCE); - - // Compute sum of 256 values (one for each thread) - // Logarithmic reduction to compute the sum - for(int i = (BATCH_SIZE >> 1); i > 0; i >>= 1) - { - if(batch_lid < i) - { - l_helper[lid] += l_helper[lid + i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - float channel_sum = l_helper[batch * BATCH_SIZE]; - - // Normalize each value in the channel by the channel_sum - for(int i = batch_lid; i < c; i += BATCH_SIZE) - { - value[i / BATCH_SIZE] = exp(value[i / BATCH_SIZE] - channel_max); - - if(mad24(batch_n, c, i) * spatial_dim + batch_s < c * grid_size) - y[mad24(batch_n, c, i) * spatial_dim + batch_s] = value[i / BATCH_SIZE] / channel_sum; - } - -#endif // CSR-Vector vs CSR-Stream -} diff --git a/saber/funcs/impl/amd/saber_activation.cpp b/saber/funcs/impl/amd/saber_activation.cpp deleted file mode 100644 index 151bc5f78..000000000 --- a/saber/funcs/impl/amd/saber_activation.cpp +++ /dev/null @@ -1,308 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
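The deleted Softmax.cl normalizes across the channel dimension for every (batch, spatial) position with the usual max-subtract, exponentiate, sum, divide sequence; the CSR-Vector and CSR-Stream branches only change how the 256 work-items of a group are shared across those positions. A scalar C++ reference of the same math, assuming the kernel's flattened in-place indexing y[(n*c + i)*spatial_dim + s]; the function name is illustrative.

#include <algorithm>
#include <cmath>
#include <vector>

// In-place channel softmax over NCHW data flattened as y[(n*c + i)*spatial_dim + s],
// i.e. the indexing scheme the deleted kernel uses (spatial_dim = h*w).
void softmax_channel_ref(std::vector<float>& y, int num, int c, int spatial_dim) {
    for (int n = 0; n < num; ++n) {
        for (int s = 0; s < spatial_dim; ++s) {
            // 1. max per channel (for numerical stability)
            float channel_max = -3.402823466e+38F;
            for (int i = 0; i < c; ++i)
                channel_max = std::max(channel_max, y[(n * c + i) * spatial_dim + s]);
            // 2-4. subtract the max, exponentiate, accumulate the sum
            float channel_sum = 0.f;
            for (int i = 0; i < c; ++i)
                channel_sum += std::exp(y[(n * c + i) * spatial_dim + s] - channel_max);
            // 5. normalize by the channel sum
            for (int i = 0; i < c; ++i) {
                float& v = y[(n * c + i) * spatial_dim + s];
                v = std::exp(v - channel_max) / channel_sum;
            }
        }
    }
}

The CSR-Vector branch re-reads the channel from DRAM on each pass, while the CSR-Stream branch keeps a thread's values in the value[U_BATCH_SIZE] registers; the host chooses between them through the NUM_BATCH define.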
-*/ -#include "saber/funcs/base.h" -#include "saber/funcs/impl/amd/saber_activation.h" -#include "saber/funcs/impl/amd/amd_utils.h" - -namespace anakin{ -namespace saber { - -typedef TargetWrapper AMD_API; - -template -SaberStatus SaberActivation::init( - const std::vector*>& inputs, - std::vector*>& outputs, - ActivationParam ¶m, - Context &ctx) { - this->_ctx = &ctx; - return create(inputs, outputs, param, ctx); -} - -template -SaberStatus SaberActivation::create( - const std::vector*>& inputs, - std::vector*>& outputs, - ActivationParam ¶m, - Context &ctx) { - - this->_ctx = &ctx; - - cl_context context = 0; - cl_device_id device = 0; - - Device dev = Env::cur_env()[inputs[0]->device_id()]; //anakin device id to AMD device - device = dev.get_device(); - context = dev.get_context(); - - //LOG(INFO) << "device id= " << device << " conext = " << context; - - KernelInfo kernelInfo; - - switch (param.active){ - case Active_relu: - //TODO - //Rewrite here once solver is ready.////////////// - T_ExtSolutionConfig extSolution; - //LOG(INFO) << inputs[0]->width() << " x " << inputs[0]->height() << " x " << inputs[0]->channel(); - switch(inputs[0]->width()) - { - case 224: - kernelInfo.l_wk = {256, 1, 1}; - kernelInfo.g_wk = {12544, 8, 1}; - - extSolution.in_tile0 = 32; - extSolution.in_tile1 = 32; - extSolution.grp_tile0 = 16; - extSolution.grp_tile1 = 16; - extSolution.out_pix_tile0 = 2; - extSolution.out_pix_tile1 = 2; - extSolution.n_stacks = 1; - extSolution.n_out_pix_tiles = 8; - extSolution.n_out_tiles_perstack = 8; - extSolution.n_in_data_tiles = 2; - extSolution.n_read_procs = 256; - extSolution.alu_tile0 = 16; - extSolution.alu_tile1 = 16; - break; - - case 112: - kernelInfo.l_wk = {256, 1, 1}; - kernelInfo.g_wk = {4096, 16, 1}; - - extSolution.in_tile0 = 32; - extSolution.in_tile1 = 32; - extSolution.grp_tile0 = 16; - extSolution.grp_tile1 = 16; - extSolution.out_pix_tile0 = 2; - extSolution.out_pix_tile1 = 2; - extSolution.n_stacks = 1; - extSolution.n_out_pix_tiles = 8; - extSolution.n_out_tiles_perstack = 8; - extSolution.n_in_data_tiles = 2; - extSolution.n_read_procs = 256; - extSolution.alu_tile0 = 16; - extSolution.alu_tile1 = 16; - break; - - case 56: - kernelInfo.l_wk = {256, 1, 1}; - kernelInfo.g_wk = {1024, 32, 1}; - - extSolution.in_tile0 = 32; - extSolution.in_tile1 = 32; - extSolution.grp_tile0 = 16; - extSolution.grp_tile1 = 16; - extSolution.out_pix_tile0 = 2; - extSolution.out_pix_tile1 = 2; - extSolution.n_stacks = 1; - extSolution.n_out_pix_tiles = 8; - extSolution.n_out_tiles_perstack = 8; - extSolution.n_in_data_tiles = 2; - extSolution.n_read_procs = 256; - extSolution.alu_tile0 = 16; - extSolution.alu_tile1 = 16; - break; - - case 28: - kernelInfo.l_wk = {256, 1, 1}; - kernelInfo.g_wk = {256, 64, 1}; - - extSolution.in_tile0 = 32; - extSolution.in_tile1 = 32; - extSolution.grp_tile0 = 16; - extSolution.grp_tile1 = 16; - extSolution.out_pix_tile0 = 2; - extSolution.out_pix_tile1 = 2; - extSolution.n_stacks = 1; - extSolution.n_out_pix_tiles = 8; - extSolution.n_out_tiles_perstack = 8; - extSolution.n_in_data_tiles = 2; - extSolution.n_read_procs = 256; - extSolution.alu_tile0 = 16; - extSolution.alu_tile1 = 16; - break; - case 14: - kernelInfo.l_wk = {64, 1, 1}; - kernelInfo.g_wk = {64, 64, 1}; - - extSolution.in_tile0 = 16; - extSolution.in_tile1 = 16; - extSolution.grp_tile0 = 8; - extSolution.grp_tile1 = 8; - extSolution.out_pix_tile0 = 2; - extSolution.out_pix_tile1 = 2; - extSolution.n_stacks = 1; - extSolution.n_out_pix_tiles = 8; - 
extSolution.n_out_tiles_perstack = 8; - extSolution.n_in_data_tiles = 2; - extSolution.n_read_procs = 64; - extSolution.alu_tile0 = 8; - extSolution.alu_tile1 = 8; - break; - case 1: - if (inputs[0]->channel() == 4096) { - kernelInfo.l_wk = {256, 1, 1}; - kernelInfo.g_wk = {4096, 1, 1}; - }else if(inputs[0]->channel() == 1000) { - kernelInfo.l_wk = {256, 1, 1}; - kernelInfo.g_wk = {1024, 1, 1}; - } - break; - } - - kernelInfo.comp_options = - std::string(" -DMLO_HW_WAVE_SZ=64") + // (fixed) wave=64 - std::string(" -DMLO_DIR_FORWARD=1") + // (fixed) forward - std::string(" -DMLO_FILTER_STRIDE0=1") + // (fixed temp) - std::string(" -DMLO_FILTER_STRIDE1=1") + // (fixed temp) - std::string(" -DMLO_N_OUTPUTS=") + std::to_string(outputs[0]->channel()) + - std::string(" -DMLO_N_INPUTS=") + std::to_string(inputs[0]->channel()) + - std::string(" -DMLO_BATCH_SZ=") + std::to_string(inputs[0]->num()) + - std::string(" -DMLO_OUT_WIDTH=") + std::to_string(outputs[0]->width()) + - std::string(" -DMLO_OUT_HEIGHT=") + std::to_string(outputs[0]->height()) + - std::string(" -DMLO_OUT_BATCH_STRIDE=") + std::to_string(outputs[0]->width() * outputs[0]->height() * outputs[0]->channel()) + - std::string(" -DMLO_OUT_CHANNEL_STRIDE=") + std::to_string(outputs[0]->width() * outputs[0]->height()) + - std::string(" -DMLO_OUT_STRIDE=") + std::to_string(outputs[0]->width()) + - std::string(" -DMLO_IN_WIDTH=") + std::to_string(inputs[0]->width()) + - std::string(" -DMLO_IN_HEIGHT=") + std::to_string(inputs[0]->height()) + - std::string(" -DMLO_IN_BATCH_STRIDE=") + std::to_string(inputs[0]->width() * inputs[0]->height() * inputs[0]->channel()) + - std::string(" -DMLO_IN_CHANNEL_STRIDE=") + std::to_string(inputs[0]->width() * inputs[0]->height()) + - std::string(" -DMLO_IN_STRIDE=") + std::to_string(inputs[0]->width()) + - std::string(" -DMLO_CONV_BIAS=0") ; // (for now not support) - - if(inputs[0]->width() == 1) { - - kernelInfo.kernel_file = "ReluUni.cl"; - kernelInfo.kernel_name = "ReluUni"; - - } else { - //set comp_options... 
- - kernelInfo.comp_options += - std::string(" -DMLO_IN_TILE0=") + std::to_string(extSolution.in_tile0) + - std::string(" -DMLO_IN_TILE1=") + std::to_string(extSolution.in_tile1) + - std::string(" -DMLO_GRP_TILE0=") + std::to_string(extSolution.grp_tile0) + - std::string(" -DMLO_GRP_TILE1=") + std::to_string(extSolution.grp_tile1) + - std::string(" -DMLO_OUT_TILE0=") + std::to_string(extSolution.out_pix_tile0) + - std::string(" -DMLO_OUT_TILE1=") + std::to_string(extSolution.out_pix_tile1) + - std::string(" -DMLO_N_STACKS=") + std::to_string(extSolution.n_stacks) + - std::string(" -DMLO_N_OUT_TILES=") + std::to_string(extSolution.n_out_pix_tiles) + - std::string(" -DMLO_N_OUT_TILES_PERSTACK=") + std::to_string(extSolution.n_out_tiles_perstack) + - std::string(" -DMLO_N_IN_TILES_PERSTACK=") + std::to_string(extSolution.n_in_data_tiles) + - std::string(" -DMLO_N_READ_PROCS=") + std::to_string(extSolution.n_read_procs) + - std::string(" -DMLO_ALU_VTILE0=") + std::to_string(extSolution.alu_tile0) + - std::string(" -DMLO_ALU_VTILE1=") + std::to_string(extSolution.alu_tile1); - - kernelInfo.kernel_file = "Relu.cl"; - kernelInfo.kernel_name = "Relu"; - } - break; - - case Active_sigmoid: - - break; - - case Active_tanh: - - break; - - case Active_clipped_relu: - - break; - - case Active_elu: - - break; - } - std::copy(kernelInfo.g_wk.begin(), kernelInfo.g_wk.end(), _globalWorkSize); - std::copy(kernelInfo.l_wk.begin(), kernelInfo.l_wk.end(), _localWorkSize); - - //LOG(INFO) << "kernel file name: " << kernelInfo.kernel_file; - //LOG(INFO) << "kernel name: " << kernelInfo.kernel_name; - //LOG(INFO) << "local work size: " << kernelInfo.l_wk[0] << " " << kernelInfo.l_wk[1] << " " << kernelInfo.l_wk[2]; - //LOG(INFO) << "global work size: " << kernelInfo.g_wk[0] << " " << kernelInfo.g_wk[1] << " " << kernelInfo.g_wk[2]; - //LOG(INFO) << "compile option: " << kernelInfo.comp_options; - - //To create the program - cl_program program = CreateCLProgram(context, device, kernelInfo.kernel_file.c_str(), &kernelInfo); - if (program == NULL) - { - LOG(ERROR) << "Failed to load program"; - return SaberInvalidValue; - } - - //LOG(INFO) << "COMPILE OCL KERNEL CODE"; - - //To create kernel - _kernel = clCreateKernel(program, kernelInfo.kernel_name.c_str(), NULL); - if (_kernel == NULL) - { - LOG(ERROR) << "Failed to create kernel"; - return SaberInvalidValue; - } - - //LOG(INFO) << "COMPLETE CREATE KERNEL"; - - return SaberSuccess; -} - -template -SaberStatus SaberActivation::dispatch( - const std::vector*>& inputs, - std::vector*>& outputs, - ActivationParam ¶m) { - - cl_int errNum = 0; - //To get the commpute command queue - AMD_API::stream_t cm = this->_ctx->get_compute_stream(); - - //To set the argument - cl_mem memObjects[2] = { 0, 0 }; - memObjects[0] = (cl_mem)inputs[0]->data(); - memObjects[1] = (cl_mem)outputs[0]->mutable_data(); - - errNum = clSetKernelArg(_kernel, 0, sizeof(cl_mem), &memObjects[0]); - errNum |= clSetKernelArg(_kernel, 1, sizeof(cl_mem), &memObjects[1]); - errNum |= clSetKernelArg(_kernel, 2, sizeof(float), ¶m.negative_slope); - if (errNum != CL_SUCCESS) - { - LOG(ERROR) << "Fail to set kernel arguments"; - return SaberInvalidValue; - } - cl_event event; - //LOG(INFO) << "COMPLETE SET ARGUMENT"; - errNum = clEnqueueNDRangeKernel(cm, _kernel, 3, NULL, - _globalWorkSize, _localWorkSize, - 0, NULL, &event); - if (errNum != CL_SUCCESS) - { - LOG(ERROR) << "Fail to set execution: " << errNum; - return SaberInvalidValue; - } - //LOG(INFO) << "COMPLETE EXECUTION"; - cl_event_list list; - 
list.push_back(event); - Env::add_event(list); - - return SaberSuccess; -} - -template class SaberActivation; -DEFINE_OP_TEMPLATE(SaberActivation, ActivationParam, AMD, AK_INT8); -DEFINE_OP_TEMPLATE(SaberActivation, ActivationParam, AMD, AK_HALF); - -} -} // namespace anakin diff --git a/saber/funcs/impl/amd/saber_activation.h b/saber/funcs/impl/amd/saber_activation.h deleted file mode 100644 index fb569ba62..000000000 --- a/saber/funcs/impl/amd/saber_activation.h +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_AMD_SABER_ACTIVATION_H -#define ANAKIN_SABER_FUNCS_IMPL_AMD_SABER_ACTIVATION_H - -#include "saber/funcs/impl/impl_activation.h" - -namespace anakin{ - -namespace saber{ - -template -class SaberActivation : - public ImplBase< - AMD, OpDtype, - ActivationParam > { -public: - typedef typename DataTrait::Dtype OpDataType; - SaberActivation() = default; - ~SaberActivation() {} - - virtual SaberStatus init(const std::vector *>& inputs, - std::vector *>& outputs, - ActivationParam& param, Context& ctx) override; - - virtual SaberStatus create(const std::vector *>& inputs, - std::vector *>& outputs, - ActivationParam& param, Context &ctx) override; - - virtual SaberStatus dispatch(const std::vector*>& inputs, - std::vector*>& outputs, - ActivationParam& param) override; - - private: - cl_kernel _kernel; - size_t _globalWorkSize[3]; - size_t _localWorkSize[3]; -}; - -} - -} -#endif //ANAKIN_SABER_FUNCS_IMPL_AMD_SABER_ACTIVATION_H \ No newline at end of file diff --git a/saber/funcs/impl/amd/saber_softmax.cpp b/saber/funcs/impl/amd/saber_softmax.cpp deleted file mode 100644 index b96a8f8a4..000000000 --- a/saber/funcs/impl/amd/saber_softmax.cpp +++ /dev/null @@ -1,204 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
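Both deleted saber implementations share one host-side pattern: compile the .cl source with shape-specific -D options, create the kernel once in create(), then bind the tensors' cl_mem handles and enqueue an NDRange on the context's compute stream in dispatch(). A condensed, self-contained sketch of that flow using the raw OpenCL API rather than the CreateCLProgram helper used above; the function name and parameter list are illustrative.

#include <CL/cl.h>
#include <string>

// Condensed version of the create()/dispatch() flow in the deleted saber_* files:
// compile the kernel source with shape-specific -D defines, bind cl_mem buffers,
// and enqueue the NDRange. Error handling is reduced to a single status return.
cl_int run_elementwise_kernel(cl_context ctx, cl_device_id dev, cl_command_queue queue,
                              const char* source, const char* kernel_name,
                              const std::string& build_opts,  // e.g. " -DMLO_N_OUTPUTS=64 ..."
                              cl_mem in, cl_mem out, float slope,
                              const size_t gws[3], const size_t lws[3]) {
    cl_int err = CL_SUCCESS;
    cl_program prog = clCreateProgramWithSource(ctx, 1, &source, nullptr, &err);
    if (err != CL_SUCCESS) return err;
    err = clBuildProgram(prog, 1, &dev, build_opts.c_str(), nullptr, nullptr);
    if (err != CL_SUCCESS) return err;
    cl_kernel k = clCreateKernel(prog, kernel_name, &err);
    if (err != CL_SUCCESS) return err;

    err  = clSetKernelArg(k, 0, sizeof(cl_mem), &in);
    err |= clSetKernelArg(k, 1, sizeof(cl_mem), &out);
    err |= clSetKernelArg(k, 2, sizeof(float), &slope);
    if (err != CL_SUCCESS) return err;

    cl_event ev;
    return clEnqueueNDRangeKernel(queue, k, 3, nullptr, gws, lws, 0, nullptr, &ev);
}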
-*/ -#include "saber/funcs/base.h" -#include "saber/funcs/impl/amd/saber_softmax.h" -#include "saber/funcs/impl/amd/amd_utils.h" - -namespace anakin{ -namespace saber { - -typedef TargetWrapper AMD_API; - -template -SaberStatus SaberSoftmax::init( - const std::vector*>& inputs, - std::vector*>& outputs, - SoftmaxParam ¶m, - Context &ctx) { - this->_ctx = &ctx; - return create(inputs, outputs, param, ctx); -} - - -int nextPow2(int v) -{ - - if(v == 1) - { - return (v << 1); - } - else - { - v--; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - v++; - return v; - } -} - - -template -SaberStatus SaberSoftmax::create( - const std::vector*>& inputs, - std::vector*>& outputs, - SoftmaxParam ¶m, - Context &ctx) { - - cl_context context = 0; - cl_device_id device = 0; - - Device dev = Env::cur_env()[inputs[0]->device_id()]; //anakin device id to AMD device - device = dev.get_device(); - context = dev.get_context(); - - //LOG(INFO) << "device id= " << device << " conext = " << context; - - //To create vld, gld and compile options - KernelInfo kernerlInfo; - - //To set local work size - kernerlInfo.l_wk = {256, 1, 1}; - - //int num_batch = inputs[0]->num(); - int c = inputs[0]->channel(); - int num_batch = c; - - //To set comp_options - kernerlInfo.comp_options = std::string(" -DNUM_BATCH=") + std::to_string(num_batch); - - int grid_size = inputs[0]->num() * inputs[0]->width() * inputs[0]->height(); - - if(num_batch == 1) - { // CSR-Vector like approach - - // Control the max. number of workgroups launched so that we do not - // start getting workgroup scheduling overheads - size_t workgroups = std::min(grid_size, 64 * 40 * 8); - kernerlInfo.g_wk = {workgroups * kernerlInfo.l_wk[0], 1, 1}; - } - else - { // CSR-Stream like approach - - // num_threads iterating over channels for one spatial_dim - int batch_size = 256 / num_batch; - // num_channels each threads iterates over to cover all the channels - int u_batch_size = c > batch_size ? nextPow2(c / batch_size) : 1; - - size_t workgroups = - grid_size % num_batch == 0 ? 
grid_size / num_batch : grid_size / num_batch + 1; - kernerlInfo.g_wk = {workgroups * kernerlInfo.l_wk[0], 1, 1}; - - kernerlInfo.comp_options += " -DBATCH_SIZE=" + std::to_string(batch_size) + " -DU_BATCH_SIZE=" + - std::to_string(u_batch_size); - } - - kernerlInfo.kernel_file = "Softmax.cl"; - kernerlInfo.kernel_name = "Softmax"; - - //LOG(INFO) << "kernel file name: " << kernerlInfo.kernel_file; - //LOG(INFO) << "kernel name: " << kernerlInfo.kernel_name; - //LOG(INFO) << "local work size: " << kernerlInfo.l_wk[0] << " " << kernerlInfo.l_wk[1] << " " << kernerlInfo.l_wk[2]; - //LOG(INFO) << "global work size: " << kernerlInfo.g_wk[0] << " " << kernerlInfo.g_wk[1] << " " << kernerlInfo.g_wk[2]; - //LOG(INFO) << "compile option: " << kernerlInfo.comp_options; - - std::copy(kernerlInfo.g_wk.begin(), kernerlInfo.g_wk.end(), _globalWorkSize); - std::copy(kernerlInfo.l_wk.begin(), kernerlInfo.l_wk.end(), _localWorkSize); - - std::string kernel_file = kernerlInfo.kernel_file; - std::string kernel_name = kernerlInfo.kernel_name; - - //To create the program - cl_program program = CreateCLProgram(context, device, kernerlInfo.kernel_file.c_str(), &kernerlInfo); - if (program == NULL) - { - LOG(ERROR) << "Failed to load program"; - return SaberInvalidValue; - } - - //LOG(INFO) << "COMPILE OCL KERNEL CODE"; - - //To create kernel - _kernel = clCreateKernel(program, kernerlInfo.kernel_name.c_str(), NULL); - if (_kernel == NULL) - { - LOG(ERROR) << "Failed to create kernel"; - return SaberInvalidValue; - } - - //LOG(INFO) << "COMPLETE CREATE KERNEL"; - this->_ctx = &ctx; - return SaberSuccess; -} - -template -SaberStatus SaberSoftmax::dispatch( - const std::vector*>& inputs, - std::vector*>& outputs, - SoftmaxParam ¶m) { - - //LOG(INFO) << "SaberSoftmax::dispatch"; - cl_int errNum = 0; - - //To set the argument - cl_mem memObjects[1] = { 0 }; - outputs[0]->copy_from(*inputs[0]); - memObjects[0] = (cl_mem)outputs[0]->data(); - int arg1_channel = inputs[0]->channel(); - int arg2_grid_size = inputs[0]->num() * inputs[0]->width() * inputs[0]->height(); - int arg3_stride_c = inputs[0]->width() * inputs[0]->height(); - - errNum = clSetKernelArg(_kernel, 0, sizeof(cl_mem), &memObjects[0]); - errNum |= clSetKernelArg(_kernel, 1, sizeof(int), &arg1_channel); - errNum |= clSetKernelArg(_kernel, 2, sizeof(int), &arg2_grid_size); - errNum |= clSetKernelArg(_kernel, 3, sizeof(int), &arg3_stride_c); - - if (errNum != CL_SUCCESS) - { - LOG(ERROR) << "Fail to set kernel arguments: " << errNum; - return SaberInvalidValue; - } - - //LOG(INFO) << "COMPLETE SET ARGUMENT"; - - //To get the commpute command queue - AMD_API::stream_t cm = this->_ctx->get_compute_stream(); - - cl_event event; - errNum = clEnqueueNDRangeKernel(cm, _kernel, 3, NULL, - _globalWorkSize, _localWorkSize, - 0, NULL, &event); - if (errNum != CL_SUCCESS) - { - LOG(ERROR) << "Fail to set execution: " << errNum; - return SaberInvalidValue; - } - //LOG(INFO) << "COMPLETE EXECUTION"; - - cl_event_list list; - list.push_back(event); - Env::add_event(list); - return SaberSuccess; -} - -template class SaberSoftmax; -DEFINE_OP_TEMPLATE(SaberSoftmax, SoftmaxParam, AMD, AK_INT8); -DEFINE_OP_TEMPLATE(SaberSoftmax, SoftmaxParam, AMD, AK_HALF); - -} -} // namespace anakin diff --git a/saber/funcs/impl/amd/saber_softmax.h b/saber/funcs/impl/amd/saber_softmax.h deleted file mode 100644 index aea4ca81f..000000000 --- a/saber/funcs/impl/amd/saber_softmax.h +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
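The launch configuration for the softmax kernel comes down to a little integer arithmetic: 256-thread workgroups, a NUM_BATCH define equal to the channel count, and either a capped workgroup count (CSR-Vector) or one workgroup per NUM_BATCH spatial positions (CSR-Stream), with nextPow2 rounding the per-thread micro-batch (U_BATCH_SIZE) up to a power of two. A small sketch that mirrors that sizing logic, assuming the same 256-thread groups; the function names are illustrative.

#include <algorithm>
#include <cstddef>

// Same behaviour as the deleted nextPow2(); its result feeds -DU_BATCH_SIZE=...
int next_pow2(int v) {
    if (v == 1) return v << 1;
    v--;
    v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16;
    return v + 1;
}

// Mirrors the launch-size arithmetic in the deleted SaberSoftmax::create().
// NUM_BATCH == 1 selects the CSR-Vector kernel branch, anything larger CSR-Stream.
size_t softmax_global_work_size(int channels, int grid_size) {
    const size_t local_size = 256;
    const int num_batch = channels;             // emitted as -DNUM_BATCH=...
    size_t workgroups;
    if (num_batch == 1) {
        // CSR-Vector branch: cap the launch to limit workgroup scheduling overhead
        workgroups = std::min<size_t>(grid_size, 64 * 40 * 8);
    } else {
        // CSR-Stream branch: one workgroup covers num_batch spatial positions
        workgroups = (grid_size + num_batch - 1) / num_batch;
    }
    return workgroups * local_size;             // g_wk = {this, 1, 1}
}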
- Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_AMD_SABER_SOFTMAX_H -#define ANAKIN_SABER_FUNCS_IMPL_AMD_SABER_SOFTMAX_H - -#include "saber/funcs/impl/impl_softmax.h" - -namespace anakin{ - -namespace saber{ - -template -class SaberSoftmax : - public ImplBase< - AMD, OpDtype, - SoftmaxParam > { -public: - typedef typename DataTrait::Dtype OpDataType; - SaberSoftmax() = default; - ~SaberSoftmax() {} - - virtual SaberStatus init(const std::vector *>& inputs, - std::vector *>& outputs, - SoftmaxParam& param, Context& ctx) override; - - virtual SaberStatus create(const std::vector *>& inputs, - std::vector *>& outputs, - SoftmaxParam& param, Context &ctx) override; - - virtual SaberStatus dispatch(const std::vector*>& inputs, - std::vector*>& outputs, - SoftmaxParam& param) override; - - private: - cl_kernel _kernel; - size_t _globalWorkSize[3]; - size_t _localWorkSize[3]; -}; - -} - -} -#endif //ANAKIN_SABER_FUNCS_IMPL_AMD_SABER_SOFTMAX_H \ No newline at end of file diff --git a/saber/funcs/softmax.h b/saber/funcs/softmax.h index 9c8354588..30ab88614 100644 --- a/saber/funcs/softmax.h +++ b/saber/funcs/softmax.h @@ -24,7 +24,7 @@ #include "saber/funcs/impl/cuda/vender_softmax.h" #endif #ifdef AMD_GPU -#include "saber/funcs/impl/amd/saber_softmax.h" +//#include "saber/funcs/impl/amd/saber_softmax.h" #endif #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/saber_softmax.h" diff --git a/test/framework/net/net_exec_map_rnn.cpp b/test/framework/net/net_exec_map_rnn.cpp index 896d02fdb..567642fff 100644 --- a/test/framework/net/net_exec_map_rnn.cpp +++ b/test/framework/net/net_exec_map_rnn.cpp @@ -1,3 +1,17 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
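On the AMD path the test now calls clFinish on the context's compute stream before touching results, where the CUDA path calls cudaDeviceSynchronize(); both block the host until all queued device work has drained. A small stand-alone sketch of that synchronize-then-read-back step with the raw OpenCL API, assuming the output already lives in a cl_mem buffer of known element count; names here are illustrative, not Anakin API.

#include <CL/cl.h>
#include <vector>

// After prediction, block on the compute queue (the OpenCL counterpart of
// cudaDeviceSynchronize()) and only then copy results back to the host.
std::vector<float> read_back(cl_command_queue queue, cl_mem dev_out, size_t count) {
    clFinish(queue);                                 // wait for all enqueued kernels
    std::vector<float> host(count);
    clEnqueueReadBuffer(queue, dev_out, CL_TRUE,     // CL_TRUE: blocking read
                        0, count * sizeof(float), host.data(), 0, nullptr, nullptr);
    return host;
}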
+*/ #include #include "net_test.h" #include "saber/funcs/timer.h" @@ -17,14 +31,15 @@ #if defined(NVIDIA_GPU) using Target = NV; using Target_H = NVHX86; -#else if defined(USE_X86_PLACE) +#elif defined(USE_X86_PLACE) using Target = X86; using Target_H = X86; #include "mkl_service.h" +#elif defined(AMD_GPU) +using Target = AMD; +using Target_H = AMDHX86; #endif - - std::string FLAGS_data_file; std::string FLAGS_model_file; int FLAGS_num = 1; @@ -270,7 +285,7 @@ void one_thread_run(std::string path, int thread_id) { h_tensor_in_1->reshape(Shape({week_fea.size(), 10, 1, 1})); h_tensor_in_2->reshape(Shape({time_fea.size(), 10, 1, 1})); h_tensor_in_0->set_seq_offset({seq_offset}); -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(AMD_GPU) Tensor4d h_tensor_0; Tensor4d h_tensor_1; Tensor4d h_tensor_2; @@ -314,9 +329,13 @@ void one_thread_run(std::string path, int thread_id) { #endif net_executer.prediction(); -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(AMD_GPU) +#ifdef AMD_GPU + clFinish(ctx.get_compute_stream()); +#else cudaDeviceSynchronize(); - auto dev_out=net_executer.get_out("final_output.tmp_1_gout"); +#endif + auto dev_out = net_executer.get_out("final_output.tmp_1_gout"); Tensor out(dev_out->valid_shape()); out.copy_from(*dev_out); int size = out.valid_size(); @@ -331,8 +350,9 @@ void one_thread_run(std::string path, int thread_id) { printf("%f\n", static_cast(out.data())[seq_start + seq_len - 1]); } + #else - auto out =net_executer.get_out("final_output.tmp_1_gout"); + auto out = net_executer.get_out("final_output.tmp_1_gout"); int size = out->valid_size(); for (int seq_id = 0; seq_id < seq_offset.size() - 1; seq_id++) { @@ -345,6 +365,7 @@ void one_thread_run(std::string path, int thread_id) { printf("%f\n", static_cast(out->data())[seq_start + seq_len - 1]); } + #endif @@ -432,4 +453,3 @@ int main(int argc, const char** argv) { RUN_ALL_TESTS(argv[0]); return 0; } - diff --git a/tools/amd_gpu_build.sh b/tools/amd_gpu_build.sh index 8ca17f36e..60ef4a2d1 100755 --- a/tools/amd_gpu_build.sh +++ b/tools/amd_gpu_build.sh @@ -1,9 +1,25 @@ +# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. #!/bin/bash # This script shows how one can build a anakin for the gpu platform ANAKIN_ROOT="$( cd "$(dirname "$0")"/.. ; pwd -P)" echo "-- Anakin root dir is: $ANAKIN_ROOT" +echo "delete cache files" +rm -rf ~/.cache/amd_saber/ + # build the target into gpu_build. BUILD_ROOT=$ANAKIN_ROOT/amd_gpu_build @@ -29,6 +45,7 @@ cmake \ -DUSE_PYTHON=OFF \ -DENABLE_DEBUG=YES \ -DENABLE_VERBOSE_MSG=NO \ + -DENABLE_AMD_PROFILING=NO \ -DDISABLE_ALL_WARNINGS=YES \ -DENABLE_NOISY_WARNINGS=NO \ -DUSE_OPENMP=NO\