diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 00000000000..0968e75f87c
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,62 @@
+
+# When a toolchain file is in use, define the library output root and turn the
+# toolchain file location into an absolute path.
+if(CMAKE_TOOLCHAIN_FILE)
+    set(LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_BINARY_DIR}
+        CACHE PATH "root for library output, set this to change where android libs are compiled to")
+
+    # get_filename_component(... ABSOLUTE) only resolves relative to the source
+    # directory, so the file is looked up by name with find_file() instead.
+    get_filename_component(CMAKE_TOOLCHAIN_FILE_NAME ${CMAKE_TOOLCHAIN_FILE} NAME)
+    find_file(CMAKE_TOOLCHAIN_FILE
+        ${CMAKE_TOOLCHAIN_FILE_NAME}
+        PATHS ${CMAKE_SOURCE_DIR}
+        NO_DEFAULT_PATH)
+    message(STATUS "CMAKE_TOOLCHAIN_FILE = ${CMAKE_TOOLCHAIN_FILE}")
+endif()
+
+# Install into <build dir>/install unless a prefix has already been defined.
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+    set(CMAKE_INSTALL_PREFIX
+        "${CMAKE_BINARY_DIR}/install"
+        CACHE PATH "Installation Directory")
+endif()
+message(STATUS "CMAKE_INSTALL_PREFIX = ${CMAKE_INSTALL_PREFIX}")
+
+# cmake_minimum_required() has to run before project(): it establishes the
+# policy defaults that project() relies on while it sets up the toolchain.
+cmake_minimum_required(VERSION 2.8.10)
+
+project(ncnn)
+
+# Default to an optimized build, but keep a build type the user selected
+# explicitly (e.g. -DCMAKE_BUILD_TYPE=debug or relwithdebinfo) instead of
+# silently overwriting it.
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE release)
+endif()
+
+# User-configurable feature switches.
+option(NCNN_OPENMP "openmp support" ON)
+option(NCNN_STDIO "load model from external file" ON)
+option(NCNN_STRING "plain and verbose string" ON)
+option(NCNN_OPENCV "minimal opencv structure emulation" OFF)
+
+if(NCNN_OPENMP)
+    find_package(OpenMP)
+    # Only append the OpenMP flags when detection succeeded. On failure the
+    # result variables may be empty or hold a NOTFOUND placeholder, which must
+    # not end up on the compiler command line.
+    # OPENMP_FOUND is the legacy spelling, OpenMP_FOUND the newer one.
+    if(OPENMP_FOUND OR OpenMP_FOUND)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    endif()
+endif()
+
+# Directory-wide compiler flags: warnings, position-independent code,
+# aggressive optimization and hidden symbol visibility.
+add_definitions(
+    -Wall
+    -Wextra
+    -fPIC
+    -Ofast
+    -ffast-math
+    -fvisibility=hidden
+    -fvisibility-inlines-hidden
+)
+
+# Currently disabled:
+# add_definitions(-march=native)
+# add_definitions(-flto)
+
+# On Android and Xcode/iOS, shared libraries are disabled and C++ sources are
+# compiled without RTTI and exceptions.
+if(ANDROID OR IOS)
+    set_property(GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS FALSE)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti -fno-exceptions")
+endif()
+
+##############################################
+
+# The library sources are always built; the tools directory is skipped when
+# targeting Android or iOS. The examples directory is currently not built.
+# add_subdirectory(examples)
+add_subdirectory(src)
+if(NOT (ANDROID OR IOS))
+    add_subdirectory(tools)
+endif()
diff --git a/Info.plist b/Info.plist
new file mode 100644
index 00000000000..f90da17ba35
--- /dev/null
+++ b/Info.plist
@@ -0,0 +1,18 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>CFBundleName</key>
+    <string>ncnn</string>
+    <key>CFBundleIdentifier</key>
+    <string>com.tencent.ncnn</string>
+    <key>CFBundleVersion</key>
+    <string>1.0</string>
+    <key>CFBundleShortVersionString</key>
+    <string>1.0</string>
+    <key>CFBundleSignature</key>
+    <string>????</string>
+    <key>CFBundlePackageType</key>
+    <string>FMWK</string>
+</dict>
+</plist>
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 00000000000..2eb0363c72e
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,86 @@
+Tencent is pleased to support the open source community by making ncnn available.
+Copyright (C) 2017 THL A29 Limited, a Tencent company.  All rights reserved.
+If you have downloaded a copy of the ncnn binary from Tencent, please note that the ncnn binary is licensed under the BSD 3-Clause License.
+If you have downloaded a copy of the ncnn source code from Tencent, please note that ncnn source code is licensed under the BSD 3-Clause License, except for the third-party components listed below which are subject to different license terms.  Your integration of ncnn into your own projects may require compliance with the BSD 3-Clause License, as well as the other licenses applicable to the third-party components included within ncnn.
+A copy of the BSD 3-Clause License is included in this file.
+
+Other dependencies and licenses:
+
+Open Source Software Licensed Under the zlib License:
+The below software in this distribution may have been modified by THL A29 Limited (“Tencent Modifications”). All Tencent Modifications are Copyright (C) 2017 THL A29 Limited.
+----------------------------------------------------------------------------------------
+1. neon_mathfun.h
+Copyright (C) 2011 Julien Pommier
+
+2. sse_mathfun.h
+Copyright (C) 2007 Julien Pommier
+
+3. avx_mathfun.h
+Copyright (C) 2012 Giovanni Garberoglio
+Interdisciplinary Laboratory for Computational Science (LISC)
+Fondazione Bruno Kessler and University of Trento
+via Sommarive, 18
+I-38123 Trento (Italy)
+
+
+Terms of the zlib License:
+---------------------------------------------------
+Copyright (c) <year> <copyright holders>
+
+This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+
+
+Open Source Software Licensed Under the BSD 2-Clause License:
+The below software in this distribution may have been modified by THL A29 Limited (“Tencent Modifications”). All Tencent Modifications are Copyright (C) 2017 THL A29 Limited.
+----------------------------------------------------------------------------------------
+1. squeezenet  1.1
+Copyright (c) 2016 Forrest N. Iandola and Matthew W. Moskewicz and Khalid Ashraf and Song Han and William J. Dally and Kurt Keutzer
+All rights reserved.
+
+2. caffe.proto  master
+All contributions by the University of California:
+Copyright (c) 2014-2017 The Regents of the University of California (Regents)
+All rights reserved.
+
+All other contributions:
+Copyright (c) 2014-2017, the respective contributors
+All rights reserved.
+
+
+Terms of the BSD 2-Clause License:
+--------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+
+Open Source Software Licensed Under the BSD 3-Clause License:
+The below software in this distribution may have been modified by THL A29 Limited (“Tencent Modifications”). All Tencent Modifications are Copyright (C) 2017 THL A29 Limited.
+----------------------------------------------------------------------------------------
+1. android.toolchain.cmake  master
+Copyright (c) 2010-2011, Ethan Rublee
+Copyright (c) 2011-2014, Andrey Kamaev
+All rights reserved.
+
+
+Terms of the BSD 3-Clause License:
+--------------------------------------------------------------------
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+Neither the name of [copyright holder] nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
new file mode 100644
index 00000000000..935f35c35db
--- /dev/null
+++ b/README.md
@@ -0,0 +1,44 @@
+# ncnn
+
+---
+
+ncnn 是一个为手机端极致优化的高性能神经网络前向计算框架。ncnn 从设计之初深刻考虑手机端的部署和使用。无第三方依赖，跨平台，手机端 cpu 的速度快于目前所有已知的开源框架。基于 ncnn，开发者能够将深度学习算法轻松移植到手机端高效执行，开发出人工智能 APP，将 AI 带到你的指尖。ncnn 目前已在腾讯多款应用中使用，如 QQ，Qzone，微信，天天P图等。
+
+ncnn is a high-performance neural network inference framework optimized for mobile platforms. From the very beginning of its design, ncnn has carefully considered deployment and use on mobile phones. It has no third-party dependencies, is cross-platform, and runs faster on mobile phone CPUs than all other known open source frameworks. Using the efficient ncnn implementation, developers can easily deploy deep learning models to mobile platforms, create intelligent apps, and bring artificial intelligence to your fingertips. ncnn is currently used in many Tencent applications, such as QQ, Qzone, WeChat, Pitu and so on.
+
+---
+
+### 功能概述
+
+* 支持卷积神经网络，支持多输入和多分支结构，可计算部分分支
+* 无任何第三方库依赖，不依赖 BLAS/NNPACK 等计算框架
+* 纯 C++ 实现，跨平台，支持 android ios 等
+* ARM NEON 汇编级良心优化，计算速度极快
+* 精细的内存管理和数据结构设计，内存占用极低
+* 支持多核并行计算加速，ARM big.LITTLE cpu 调度优化
+* 整体库体积小于 500K，并可轻松精简到小于 300K
+* 可扩展的模型设计，支持 8bit 量化和半精度浮点存储，可导入 caffe 模型
+* 支持直接内存零拷贝引用加载网络模型
+* 可注册自定义层实现并扩展
+* 恩，很强就是了，不怕被塞卷 QvQ
+
+### Features
+
+* Supports convolutional neural networks, including multi-input and multi-branch structures; can compute only part of the branches
+* No third-party library dependencies; does not rely on BLAS/NNPACK or other computing frameworks
+* Pure C++ implementation, cross-platform, supports Android, iOS and so on
+* Carefully optimized at the ARM NEON assembly level; extremely fast computation
+* Sophisticated memory management and data structure design, very low memory footprint
+* Supports multi-core parallel computing acceleration, with ARM big.LITTLE CPU scheduling optimization
+* The overall library size is less than 500K, and can easily be reduced to less than 300K
+* Extensible model design; supports 8-bit quantization and half-precision floating point storage; can import caffe models
+* Supports loading network models by direct zero-copy memory reference
+* Custom layer implementations can be registered and extended
+* Well, it is strong, not afraid of being stuffed with 卷   QvQ
+
+---
+
+### License
+
+BSD 3 Clause
+
diff --git a/android.toolchain.cmake b/android.toolchain.cmake
new file mode 100644
index 00000000000..900ca8c91c3
--- /dev/null
+++ b/android.toolchain.cmake
@@ -0,0 +1,1735 @@
+# Copyright (c) 2010-2011, Ethan Rublee
+# Copyright (c) 2011-2014, Andrey Kamaev
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1.  Redistributions of source code must retain the above copyright notice,
+#     this list of conditions and the following disclaimer.
+#
+# 2.  Redistributions in binary form must reproduce the above copyright notice,
+#     this list of conditions and the following disclaimer in the documentation
+#     and/or other materials provided with the distribution.
+#
+# 3.  Neither the name of the copyright holder nor the names of its
+#     contributors may be used to endorse or promote products derived from this
+#     software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+# ------------------------------------------------------------------------------
+#  Android CMake toolchain file, for use with the Android NDK r5-r10d
+#  Requires cmake 2.6.3 or newer (2.8.9 or newer is recommended).
+#  See home page: https://github.com/taka-no-me/android-cmake
+#
+#  Usage Linux:
+#   $ export ANDROID_NDK=/absolute/path/to/the/android-ndk
+#   $ mkdir build && cd build
+#   $ cmake -DCMAKE_TOOLCHAIN_FILE=path/to/the/android.toolchain.cmake ..
+#   $ make -j8
+#
+#  Usage Windows:
+#     You need native port of make to build your project.
+#     Android NDK r7 (and newer) already has make.exe on board.
+#     For older NDK you have to install it separately.
+#     For example, this one: http://gnuwin32.sourceforge.net/packages/make.htm
+#
+#   $ SET ANDROID_NDK=C:\absolute\path\to\the\android-ndk
+#   $ mkdir build && cd build
+#   $ cmake.exe -G"MinGW Makefiles"
+#       -DCMAKE_TOOLCHAIN_FILE=path\to\the\android.toolchain.cmake
+#       -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%\prebuilt\windows\bin\make.exe" ..
+#   $ cmake.exe --build .
+#
+#
+#  Options (can be set as cmake parameters: -D<option_name>=<value>):
+#    ANDROID_NDK=/opt/android-ndk - path to the NDK root.
+#      Can be set as environment variable. Can be set only at first cmake run.
+#
+#    ANDROID_ABI=armeabi-v7a - specifies the target Application Binary
+#      Interface (ABI). This option nearly matches to the APP_ABI variable
+#      used by ndk-build tool from Android NDK.
+#
+#      Possible targets are:
+#        "armeabi" - ARMv5TE based CPU with software floating point operations
+#        "armeabi-v7a" - ARMv7 based devices with hardware FPU instructions
+#            this ABI target is used by default
+#        "armeabi-v7a-hard with NEON" - ARMv7 based devices with hardware FPU instructions and hardfp
+#        "armeabi-v7a with NEON" - same as armeabi-v7a, but
+#            sets NEON as floating-point unit
+#        "armeabi-v7a with VFPV3" - same as armeabi-v7a, but
+#            sets VFPV3 as floating-point unit (has 32 registers instead of 16)
+#        "armeabi-v6 with VFP" - tuned for ARMv6 processors having VFP
+#        "x86" - IA-32 instruction set
+#        "mips" - MIPS32 instruction set
+#
+#      64-bit ABIs for NDK r10 and newer:
+#        "arm64-v8a" - ARMv8 AArch64 instruction set
+#        "x86_64" - Intel64 instruction set (r1)
+#        "mips64" - MIPS64 instruction set (r6)
+#
+#    ANDROID_NATIVE_API_LEVEL=android-8 - level of Android API compile for.
+#      Option is read-only when standalone toolchain is used.
+#      Note: building for "android-L" requires explicit configuration.
+#
+#    ANDROID_TOOLCHAIN_NAME=arm-linux-androideabi-4.9 - the name of compiler
+#      toolchain to be used. The list of possible values depends on the NDK
+#      version. For NDK r10c the possible values are:
+#
+#        * aarch64-linux-android-4.9
+#        * aarch64-linux-android-clang3.4
+#        * aarch64-linux-android-clang3.5
+#        * arm-linux-androideabi-4.6
+#        * arm-linux-androideabi-4.8
+#        * arm-linux-androideabi-4.9 (default)
+#        * arm-linux-androideabi-clang3.4
+#        * arm-linux-androideabi-clang3.5
+#        * mips64el-linux-android-4.9
+#        * mips64el-linux-android-clang3.4
+#        * mips64el-linux-android-clang3.5
+#        * mipsel-linux-android-4.6
+#        * mipsel-linux-android-4.8
+#        * mipsel-linux-android-4.9
+#        * mipsel-linux-android-clang3.4
+#        * mipsel-linux-android-clang3.5
+#        * x86-4.6
+#        * x86-4.8
+#        * x86-4.9
+#        * x86-clang3.4
+#        * x86-clang3.5
+#        * x86_64-4.9
+#        * x86_64-clang3.4
+#        * x86_64-clang3.5
+#
+#    ANDROID_FORCE_ARM_BUILD=OFF - set ON to generate 32-bit ARM instructions
+#      instead of Thumb. Is not available for "armeabi-v6 with VFP"
+#      (is forced to be ON) ABI.
+#
+#    ANDROID_NO_UNDEFINED=ON - set ON to show all undefined symbols as linker
+#      errors even if they are not used.
+#
+#    ANDROID_SO_UNDEFINED=OFF - set ON to allow undefined symbols in shared
+#      libraries. Automatically turned for NDK r5x and r6x due to GLESv2
+#      problems.
+#
+#    ANDROID_STL=gnustl_static - specify the runtime to use.
+#
+#      Possible values are:
+#        none           -> Do not configure the runtime.
+#        system         -> Use the default minimal system C++ runtime library.
+#                          Implies -fno-rtti -fno-exceptions.
+#                          Is not available for standalone toolchain.
+#        system_re      -> Use the default minimal system C++ runtime library.
+#                          Implies -frtti -fexceptions.
+#                          Is not available for standalone toolchain.
+#        gabi++_static  -> Use the GAbi++ runtime as a static library.
+#                          Implies -frtti -fno-exceptions.
+#                          Available for NDK r7 and newer.
+#                          Is not available for standalone toolchain.
+#        gabi++_shared  -> Use the GAbi++ runtime as a shared library.
+#                          Implies -frtti -fno-exceptions.
+#                          Available for NDK r7 and newer.
+#                          Is not available for standalone toolchain.
+#        stlport_static -> Use the STLport runtime as a static library.
+#                          Implies -fno-rtti -fno-exceptions for NDK before r7.
+#                          Implies -frtti -fno-exceptions for NDK r7 and newer.
+#                          Is not available for standalone toolchain.
+#        stlport_shared -> Use the STLport runtime as a shared library.
+#                          Implies -fno-rtti -fno-exceptions for NDK before r7.
+#                          Implies -frtti -fno-exceptions for NDK r7 and newer.
+#                          Is not available for standalone toolchain.
+#        gnustl_static  -> Use the GNU STL as a static library.
+#                          Implies -frtti -fexceptions.
+#        gnustl_shared  -> Use the GNU STL as a shared library.
+#                          Implies -frtti -fno-exceptions.
+#                          Available for NDK r7b and newer.
+#                          Silently degrades to gnustl_static if not available.
+#        c++_static     -> Use the LLVM libc++ runtime as a static library.
+#                          Implies -frtti -fexceptions.
+#        c++_shared     -> Use the LLVM libc++ runtime as a shared library.
+#                          Implies -frtti -fno-exceptions.
+#
+#    ANDROID_STL_FORCE_FEATURES=ON - turn rtti and exceptions support based on
+#      chosen runtime. If disabled, then the user is responsible for settings
+#      these options.
+#
+#  What?:
+#    android-cmake toolchain searches for NDK/toolchain in the following order:
+#      ANDROID_NDK - cmake parameter
+#      ANDROID_NDK - environment variable
+#      ANDROID_STANDALONE_TOOLCHAIN - cmake parameter
+#      ANDROID_STANDALONE_TOOLCHAIN - environment variable
+#      ANDROID_NDK - default locations
+#      ANDROID_STANDALONE_TOOLCHAIN - default locations
+#
+#    Make sure to do the following in your scripts:
+#      SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${my_cxx_flags}" )
+#      SET( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${my_cxx_flags}" )
+#    The flags will be prepopulated with critical flags, so don't lose them.
+#    Also be aware that toolchain also sets configuration-specific compiler
+#    flags and linker flags.
+#
+#    ANDROID and BUILD_ANDROID will be set to true, you may test any of these
+#    variables to make necessary Android-specific configuration changes.
+#
+#    Also ARMEABI or ARMEABI_V7A or ARMEABI_V7A_HARD or X86 or MIPS or ARM64_V8A or X86_64 or MIPS64
+#    will be set true, mutually exclusive. NEON option will be set true
+#    if VFP is set to NEON.
+#
+# ------------------------------------------------------------------------------
+
+# Oldest CMake release this toolchain file declares support for.
+cmake_minimum_required( VERSION 2.6.3 )
+
+# Once CMAKE_CROSSCOMPILING is defined, the rest of this file is skipped.
+if( DEFINED CMAKE_CROSSCOMPILING )
+ # subsequent toolchain loading is not really needed
+ return()
+endif()
+
+# Intentionally empty: merely referencing the variable counts as a use.
+if( CMAKE_TOOLCHAIN_FILE )
+ # touch toolchain variable to suppress "unused variable" warning
+endif()
+
+# inherit settings in recursive loads
+# Inside a try-compile project, settings are read from
+# android.toolchain.config.cmake one directory above the current source
+# directory; OPTIONAL makes a missing file a no-op.
+get_property( _CMAKE_IN_TRY_COMPILE GLOBAL PROPERTY IN_TRY_COMPILE )
+if( _CMAKE_IN_TRY_COMPILE )
+ include( "${CMAKE_CURRENT_SOURCE_DIR}/../android.toolchain.config.cmake" OPTIONAL )
+endif()
+
+# this one is important
+# CMake versions greater than 3.0.99 get the system name "Android"; older
+# versions fall back to "Linux".
+if( CMAKE_VERSION VERSION_GREATER "3.0.99" )
+ set( CMAKE_SYSTEM_NAME Android )
+else()
+ set( CMAKE_SYSTEM_NAME Linux )
+endif()
+
+# this one not so much
+set( CMAKE_SYSTEM_VERSION 1 )
+
+# rpath makes little sense for Android: clear the runtime-path linker flag and
+# skip rpath handling by default (the cache entry can still be overridden).
+set( CMAKE_SHARED_LIBRARY_RUNTIME_C_FLAG "" )
+set( CMAKE_SKIP_RPATH TRUE CACHE BOOL "If set, runtime paths are not added when using shared libraries." )
+
+# NDK search paths
+# Version suffixes appended to "android-ndk" when probing the default
+# locations; the trailing empty entry stands for a directory without suffix.
+set( ANDROID_SUPPORTED_NDK_VERSIONS
+     ${ANDROID_EXTRA_NDK_VERSIONS}
+     -r10d -r10c -r10b -r10
+     -r9d -r9c -r9b -r9
+     -r8e -r8d -r8c -r8b -r8
+     -r7c -r7b -r7
+     -r6b -r6
+     -r5c -r5b -r5
+     "" )
+
+# Default parent directories searched for an NDK, unless the caller provided
+# ANDROID_NDK_SEARCH_PATHS.
+if( NOT DEFINED ANDROID_NDK_SEARCH_PATHS )
+  if( NOT CMAKE_HOST_WIN32 )
+    # non-Windows hosts: /opt and $HOME/NVPACK
+    file( TO_CMAKE_PATH "$ENV{HOME}" ANDROID_NDK_SEARCH_PATHS )
+    set( ANDROID_NDK_SEARCH_PATHS /opt "${ANDROID_NDK_SEARCH_PATHS}/NVPACK" )
+  else()
+    # Windows hosts: %PROGRAMFILES% and %SystemDrive%/NVPACK
+    file( TO_CMAKE_PATH "$ENV{PROGRAMFILES}" ANDROID_NDK_SEARCH_PATHS )
+    set( ANDROID_NDK_SEARCH_PATHS "${ANDROID_NDK_SEARCH_PATHS}" "$ENV{SystemDrive}/NVPACK" )
+  endif()
+endif()
+
+# Default location of a standalone toolchain.
+if( NOT DEFINED ANDROID_STANDALONE_TOOLCHAIN_SEARCH_PATH )
+  set( ANDROID_STANDALONE_TOOLCHAIN_SEARCH_PATH /opt/android-toolchain )
+endif()
+
+# known ABIs, one list per target architecture
+set( ANDROID_SUPPORTED_ABIS_arm
+     "armeabi-v7a"
+     "armeabi"
+     "armeabi-v7a with NEON"
+     "armeabi-v7a-hard with NEON"
+     "armeabi-v7a with VFPV3"
+     "armeabi-v6 with VFP" )
+set( ANDROID_SUPPORTED_ABIS_arm64  "arm64-v8a" )
+set( ANDROID_SUPPORTED_ABIS_x86    "x86" )
+set( ANDROID_SUPPORTED_ABIS_x86_64 "x86_64" )
+set( ANDROID_SUPPORTED_ABIS_mips   "mips" )
+set( ANDROID_SUPPORTED_ABIS_mips64 "mips64" )
+
+# API level defaults: a generic value plus per-architecture values
+set( ANDROID_DEFAULT_NDK_API_LEVEL        8 )
+set( ANDROID_DEFAULT_NDK_API_LEVEL_arm64  21 )
+set( ANDROID_DEFAULT_NDK_API_LEVEL_x86    9 )
+set( ANDROID_DEFAULT_NDK_API_LEVEL_x86_64 21 )
+set( ANDROID_DEFAULT_NDK_API_LEVEL_mips   9 )
+set( ANDROID_DEFAULT_NDK_API_LEVEL_mips64 21 )
+
+
+# __LIST_FILTER( listvar regex )
+#   Removes from the list variable named `listvar` every element that matches
+#   `regex`. Nothing happens when that variable evaluates to false (for
+#   example when it is empty or undefined).
+#   This is a macro: `listvar` is substituted textually and the loop variable
+#   __val is created in the caller's scope.
+macro( __LIST_FILTER listvar regex )
+  if( ${listvar} )
+    # the foreach iterates over the elements expanded up front, so removing
+    # items from the list inside the loop is safe
+    foreach( __val ${${listvar}} )
+      if( __val MATCHES "${regex}" )
+        list( REMOVE_ITEM ${listvar} "${__val}" )
+      endif()
+    endforeach()
+  endif()
+endmacro()
+
+# __INIT_VARIABLE( var_name [PATH] [candidate...] [VALUES literal...] )
+#   Gives the variable named `var_name` a value if it is currently empty, by
+#   taking the first usable candidate from the remaining arguments:
+#     ENV_<name>  - the value of the environment variable <name>
+#     <variable>  - the value of that variable, if it is defined
+#     <literal>   - used verbatim, but only for arguments after VALUES that
+#                   do not name a defined variable
+#   A candidate is usable when it is non-empty and, if the PATH keyword is
+#   present, exists on disk.
+#   With PATH, a current value that does not exist has its cache entry
+#   removed first, and the final value is normalized with
+#   file( TO_CMAKE_PATH ).
+#   This is a macro: the temporaries live in the caller's scope; __value,
+#   __values and __test_path are unset again before returning.
+macro( __INIT_VARIABLE var_name )
+  # first pass: only detect whether the PATH keyword was passed
+  set( __test_path 0 )
+  foreach( __var ${ARGN} )
+    if( __var STREQUAL "PATH" )
+      set( __test_path 1 )
+      break()
+    endif()
+  endforeach()
+
+  # drop a cached path that does not exist so it can be looked up again below
+  if( __test_path AND NOT EXISTS "${${var_name}}" )
+    unset( ${var_name} CACHE )
+  endif()
+
+  # the leading space keeps the comparison valid for an empty value
+  if( " ${${var_name}}" STREQUAL " " )
+    set( __values 0 )
+    foreach( __var ${ARGN} )
+      if( __var STREQUAL "VALUES" )
+        # arguments from here on may be literal values
+        set( __values 1 )
+      elseif( NOT __var STREQUAL "PATH" )
+        if( __var MATCHES "^ENV_.*$" )
+          # strip the ENV_ marker and read the environment variable
+          string( REPLACE "ENV_" "" __var "${__var}" )
+          set( __value "$ENV{${__var}}" )
+        elseif( DEFINED ${__var} )
+          # a defined variable name wins over a literal, even after VALUES
+          set( __value "${${__var}}" )
+        elseif( __values )
+          set( __value "${__var}" )
+        else()
+          set( __value "" )
+        endif()
+
+        # accept the first non-empty candidate (that exists, in PATH mode)
+        if( NOT " ${__value}" STREQUAL " " AND (NOT __test_path OR EXISTS "${__value}") )
+          set( ${var_name} "${__value}" )
+          break()
+        endif()
+      endif()
+    endforeach()
+    unset( __value )
+    unset( __values )
+  endif()
+
+  if( __test_path )
+    # normalize to a cmake-style path with forward slashes
+    file( TO_CMAKE_PATH "${${var_name}}" ${var_name} )
+  endif()
+  unset( __test_path )
+endmacro()
+
+# __DETECT_NATIVE_API_LEVEL( _var _path )
+#   Reads the file `_path`, looks for a "#define __ANDROID_API__ <number>"
+#   line and stores <number> in the variable named `_var`.
+#   When no such line exists an error is reported with SEND_ERROR (processing
+#   continues) and `_var` ends up empty.
+macro( __DETECT_NATIVE_API_LEVEL _var _path )
+  set( __ndkApiLevelRegex "^[\t ]*#define[\t ]+__ANDROID_API__[\t ]+([0-9]+)[\t ]*.*$" )
+  # keep only the lines of the file that match the define
+  file( STRINGS ${_path} __apiFileContent REGEX "${__ndkApiLevelRegex}" )
+  if( NOT __apiFileContent )
+    message( SEND_ERROR "Could not get Android native API level. Probably you have specified invalid level value, or your copy of NDK/toolchain is broken." )
+  endif()
+  # reduce the matched line to the captured number
+  string( REGEX REPLACE "${__ndkApiLevelRegex}" "\\1" ${_var} "${__apiFileContent}" )
+  unset( __apiFileContent )
+  unset( __ndkApiLevelRegex )
+endmacro()
+
+# __DETECT_TOOLCHAIN_MACHINE_NAME( _var _root )
+#   Derives the machine name of the toolchain rooted at `_root` from the name
+#   of its gcc executable: "<_root>/bin/<machine>-gcc<TOOL_OS_SUFFIX>" gives
+#   <machine>. The result is stored in the variable named `_var`; it is empty
+#   when `_root` does not exist or the name cannot be determined.
+macro( __DETECT_TOOLCHAIN_MACHINE_NAME _var _root )
+ if( EXISTS "${_root}" )
+    # candidate compiler executables, as file names relative to bin/
+    file( GLOB __gccExePath RELATIVE "${_root}/bin/" "${_root}/bin/*-gcc${TOOL_OS_SUFFIX}" )
+    # ignore names that start with a dot
+    __LIST_FILTER( __gccExePath "^[.].*" )
+    list( LENGTH __gccExePath __gccExePathsCount )
+    # anything other than exactly one candidate is ambiguous; the warning is
+    # skipped while running inside try-compile
+    if( NOT __gccExePathsCount EQUAL 1  AND NOT _CMAKE_IN_TRY_COMPILE )
+      message( WARNING "Could not determine machine name for compiler from ${_root}" )
+      set( ${_var} "" )
+    else()
+      # strip the extension, then the "-gcc" part
+      get_filename_component( __gccExeName "${__gccExePath}" NAME_WE )
+      string( REPLACE "-gcc" "" ${_var} "${__gccExeName}" )
+    endif()
+    unset( __gccExePath )
+    unset( __gccExePathsCount )
+    unset( __gccExeName )
+  else()
+    set( ${_var} "" )
+  endif()
+endmacro()
+
+
+# fight against cygwin
+# ANDROID_FORBID_SYGWIN (advanced option, ON by default) refuses to configure
+# under Cygwin and strips cygwin entries from PATH on Windows hosts.
+set( ANDROID_FORBID_SYGWIN TRUE CACHE BOOL "Prevent cmake from working under cygwin and using cygwin tools")
+mark_as_advanced( ANDROID_FORBID_SYGWIN )
+
+if( ANDROID_FORBID_SYGWIN AND CYGWIN )
+  message( FATAL_ERROR "Android NDK and android-cmake toolchain are not welcome Cygwin. It is unlikely that this cmake toolchain will work under cygwin. But if you want to try then you can set cmake variable ANDROID_FORBID_SYGWIN to FALSE and rerun cmake." )
+endif()
+
+if( ANDROID_FORBID_SYGWIN AND CMAKE_HOST_WIN32 )
+  # remove cygwin from PATH
+  set( __filtered_path "$ENV{PATH}" )
+  __LIST_FILTER( __filtered_path "cygwin" )
+  set( ENV{PATH} "${__filtered_path}" )
+  unset( __filtered_path )
+endif()
+
+
+# detect current host platform
+# Prefer the 64-bit host toolchain on x86_64 hosts and on Apple hosts, unless
+# ANDROID_NDK_HOST_X64 has already been defined.
+if( NOT DEFINED ANDROID_NDK_HOST_X64 AND (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "amd64|x86_64|AMD64" OR CMAKE_HOST_APPLE) )
+  set( ANDROID_NDK_HOST_X64 1 CACHE BOOL "Try to use 64-bit compiler toolchain" )
+  mark_as_advanced( ANDROID_NDK_HOST_X64 )
+endif()
+
+# Host directory names used by the NDK: ..._NAME is the 64-bit variant,
+# ..._NAME2 the 32-bit one. Only Windows executables carry a suffix.
+set( TOOL_OS_SUFFIX "" )
+if( CMAKE_HOST_WIN32 )
+  set( TOOL_OS_SUFFIX ".exe" )
+  set( ANDROID_NDK_HOST_SYSTEM_NAME  "windows-x86_64" )
+  set( ANDROID_NDK_HOST_SYSTEM_NAME2 "windows" )
+elseif( CMAKE_HOST_APPLE )
+  set( ANDROID_NDK_HOST_SYSTEM_NAME  "darwin-x86_64" )
+  set( ANDROID_NDK_HOST_SYSTEM_NAME2 "darwin-x86" )
+elseif( CMAKE_HOST_UNIX )
+  set( ANDROID_NDK_HOST_SYSTEM_NAME  "linux-x86_64" )
+  set( ANDROID_NDK_HOST_SYSTEM_NAME2 "linux-x86" )
+else()
+  message( FATAL_ERROR "Cross-compilation on your platform is not supported by this cmake toolchain" )
+endif()
+
+# fall back to the 32-bit host name when the 64-bit toolchain is not wanted
+if( NOT ANDROID_NDK_HOST_X64 )
+  set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+endif()
+
+# Locate the Android NDK or a standalone toolchain. Explicitly provided
+# locations (cmake or environment variables) are tried first, then the default
+# search paths.
+
+# see if we have path to Android NDK
+if( NOT ANDROID_NDK AND NOT ANDROID_STANDALONE_TOOLCHAIN )
+  __INIT_VARIABLE( ANDROID_NDK PATH ENV_ANDROID_NDK )
+endif()
+
+if( NOT ANDROID_NDK )
+  # see if we have path to Android standalone toolchain
+  __INIT_VARIABLE( ANDROID_STANDALONE_TOOLCHAIN PATH ENV_ANDROID_STANDALONE_TOOLCHAIN )
+
+  if( NOT ANDROID_STANDALONE_TOOLCHAIN )
+    # try to find the Android NDK in one of the default locations:
+    # every search path combined with every known "android-ndk<suffix>" name
+    set( __ndkSearchPaths )
+    foreach( __ndkSearchPath ${ANDROID_NDK_SEARCH_PATHS} )
+      foreach( suffix ${ANDROID_SUPPORTED_NDK_VERSIONS} )
+        list( APPEND __ndkSearchPaths "${__ndkSearchPath}/android-ndk${suffix}" )
+      endforeach()
+    endforeach()
+    __INIT_VARIABLE( ANDROID_NDK PATH VALUES ${__ndkSearchPaths} )
+    unset( __ndkSearchPaths )
+
+    if( NOT ANDROID_NDK )
+      # try to find an Android standalone toolchain in the default location
+      __INIT_VARIABLE( ANDROID_STANDALONE_TOOLCHAIN PATH ANDROID_STANDALONE_TOOLCHAIN_SEARCH_PATH )
+
+      if( ANDROID_STANDALONE_TOOLCHAIN )
+        message( STATUS "Using default path for standalone toolchain ${ANDROID_STANDALONE_TOOLCHAIN}" )
+        message( STATUS "  If you prefer to use a different location, please define the variable: ANDROID_STANDALONE_TOOLCHAIN" )
+      endif()
+    else()
+      message( STATUS "Using default path for Android NDK: ${ANDROID_NDK}" )
+      message( STATUS "  If you prefer to use a different location, please define a cmake or environment variable: ANDROID_NDK" )
+    endif()
+  endif()
+endif()
+
+# remember found paths
+# Exactly one of the two build modes is selected here:
+#   BUILD_WITH_ANDROID_NDK          - a full NDK was found
+#   BUILD_WITH_STANDALONE_TOOLCHAIN - a standalone toolchain was found
+# If neither is available, configuration stops with a fatal error.
+if( ANDROID_NDK )
+ get_filename_component( ANDROID_NDK "${ANDROID_NDK}" ABSOLUTE )
+ set( ANDROID_NDK "${ANDROID_NDK}" CACHE INTERNAL "Path of the Android NDK" FORCE )
+ set( BUILD_WITH_ANDROID_NDK True )
+ # read the release name (e.g. "r10d") from the first matching line of
+ # RELEASE.TXT; without that file the NDK is treated as unreleased "r1x"
+ if( EXISTS "${ANDROID_NDK}/RELEASE.TXT" )
+  file( STRINGS "${ANDROID_NDK}/RELEASE.TXT" ANDROID_NDK_RELEASE_FULL LIMIT_COUNT 1 REGEX "r[0-9]+[a-z]?" )
+  string( REGEX MATCH "r([0-9]+)([a-z]?)" ANDROID_NDK_RELEASE "${ANDROID_NDK_RELEASE_FULL}" )
+ else()
+  set( ANDROID_NDK_RELEASE "r1x" )
+  set( ANDROID_NDK_RELEASE_FULL "unreleased" )
+ endif()
+ # numeric release = <number>*1000 + position of the release letter in the
+ # alphabet (a=1, b=2, ...; 0 when there is no letter), the letter being taken
+ # from CMAKE_MATCH_2. The leading space in the lookup string makes the
+ # positions 1-based; the string must be the complete alphabet so that every
+ # letter, including "r", is found.
+ string( REGEX REPLACE "r([0-9]+)([a-z]?)" "\\1*1000" ANDROID_NDK_RELEASE_NUM "${ANDROID_NDK_RELEASE}" )
+ string( FIND " abcdefghijklmnopqrstuvwxyz" "${CMAKE_MATCH_2}" __ndkReleaseLetterNum )
+ math( EXPR ANDROID_NDK_RELEASE_NUM "${ANDROID_NDK_RELEASE_NUM}+${__ndkReleaseLetterNum}" )
+elseif( ANDROID_STANDALONE_TOOLCHAIN )
+ get_filename_component( ANDROID_STANDALONE_TOOLCHAIN "${ANDROID_STANDALONE_TOOLCHAIN}" ABSOLUTE )
+ # try to detect change
+ # CMAKE_AR set on a previous run must start with the toolchain path;
+ # otherwise the toolchain location was changed, which is rejected
+ if( CMAKE_AR )
+  string( LENGTH "${ANDROID_STANDALONE_TOOLCHAIN}" __length )
+  string( SUBSTRING "${CMAKE_AR}" 0 ${__length} __androidStandaloneToolchainPreviousPath )
+  if( NOT __androidStandaloneToolchainPreviousPath STREQUAL ANDROID_STANDALONE_TOOLCHAIN )
+   message( FATAL_ERROR "It is not possible to change path to the Android standalone toolchain on subsequent run." )
+  endif()
+  unset( __androidStandaloneToolchainPreviousPath )
+  unset( __length )
+ endif()
+ set( ANDROID_STANDALONE_TOOLCHAIN "${ANDROID_STANDALONE_TOOLCHAIN}" CACHE INTERNAL "Path of the Android standalone toolchain" FORCE )
+ set( BUILD_WITH_STANDALONE_TOOLCHAIN True )
+else()
+ # the first NDK search path is used as the example location in the message
+ list(GET ANDROID_NDK_SEARCH_PATHS 0 ANDROID_NDK_SEARCH_PATH)
+ message( FATAL_ERROR "Could not find neither Android NDK nor Android standalone toolchain.
+    You should either set an environment variable:
+      export ANDROID_NDK=~/my-android-ndk
+    or
+      export ANDROID_STANDALONE_TOOLCHAIN=~/my-android-toolchain
+    or put the toolchain or NDK in the default path:
+      sudo ln -s ~/my-android-ndk ${ANDROID_NDK_SEARCH_PATH}/android-ndk
+      sudo ln -s ~/my-android-toolchain ${ANDROID_STANDALONE_TOOLCHAIN_SEARCH_PATH}" )
+endif()
+
+# android NDK layout: pick RELEASE / LINARO / ANDROID and derive where the toolchains live
+if( BUILD_WITH_ANDROID_NDK )
+ if( NOT DEFINED ANDROID_NDK_LAYOUT )
+  # try to automatically detect the layout from marker files/directories around ANDROID_NDK
+  if( EXISTS "${ANDROID_NDK}/RELEASE.TXT")
+   set( ANDROID_NDK_LAYOUT "RELEASE" )
+  elseif( EXISTS "${ANDROID_NDK}/../../linux-x86/toolchain/" )
+   set( ANDROID_NDK_LAYOUT "LINARO" )
+  elseif( EXISTS "${ANDROID_NDK}/../../gcc/" )
+   set( ANDROID_NDK_LAYOUT "ANDROID" )
+  endif()
+ endif()
+ set( ANDROID_NDK_LAYOUT "${ANDROID_NDK_LAYOUT}" CACHE STRING "The inner layout of NDK" )
+ mark_as_advanced( ANDROID_NDK_LAYOUT )
+ if( ANDROID_NDK_LAYOUT STREQUAL "LINARO" )
+  set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} ) # only 32-bit at the moment
+  set( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK}/../../${ANDROID_NDK_HOST_SYSTEM_NAME}/toolchain" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH  "" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH2 "" )
+ elseif( ANDROID_NDK_LAYOUT STREQUAL "ANDROID" )
+  set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} ) # only 32-bit at the moment
+  set( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK}/../../gcc/${ANDROID_NDK_HOST_SYSTEM_NAME}/arm" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH  "" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH2 "" )
+ else() # "RELEASE" layout; also taken when no layout was detected or an unknown value was supplied
+  set( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK}/toolchains" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH  "/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH2 "/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME2}" )
+ endif()
+ get_filename_component( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK_TOOLCHAINS_PATH}" ABSOLUTE )
+
+ # try to detect change of NDK: a previously set CMAKE_AR must start with the current toolchains path
+ if( CMAKE_AR )
+  string( LENGTH "${ANDROID_NDK_TOOLCHAINS_PATH}" __length )
+  string( SUBSTRING "${CMAKE_AR}" 0 ${__length} __androidNdkPreviousPath )
+  if( NOT __androidNdkPreviousPath STREQUAL ANDROID_NDK_TOOLCHAINS_PATH )
+   message( FATAL_ERROR "It is not possible to change the path to the NDK on subsequent CMake run. You must remove all generated files from your build folder first.
+   " )
+  endif()
+  unset( __androidNdkPreviousPath )
+  unset( __length )
+ endif()
+endif()
+
+
+# get all the details about standalone toolchain: API level, machine name, target arch and gcc version
+if( BUILD_WITH_STANDALONE_TOOLCHAIN )
+ __DETECT_NATIVE_API_LEVEL( ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot/usr/include/android/api-level.h" )
+ set( ANDROID_STANDALONE_TOOLCHAIN_API_LEVEL ${ANDROID_SUPPORTED_NATIVE_API_LEVELS} )
+ set( __availableToolchains "standalone" )
+ __DETECT_TOOLCHAIN_MACHINE_NAME( __availableToolchainMachines "${ANDROID_STANDALONE_TOOLCHAIN}" )
+ if( NOT __availableToolchainMachines )
+  message( FATAL_ERROR "Could not determine machine name of your toolchain. Probably your Android standalone toolchain is broken." )
+ endif()
+ if( __availableToolchainMachines MATCHES x86_64 ) # map the machine name onto the arch names used by this file
+  set( __availableToolchainArchs "x86_64" )
+ elseif( __availableToolchainMachines MATCHES i686 )
+  set( __availableToolchainArchs "x86" )
+ elseif( __availableToolchainMachines MATCHES aarch64 )
+  set( __availableToolchainArchs "arm64" )
+ elseif( __availableToolchainMachines MATCHES arm )
+  set( __availableToolchainArchs "arm" )
+ elseif( __availableToolchainMachines MATCHES mips64el )
+  set( __availableToolchainArchs "mips64" )
+ elseif( __availableToolchainMachines MATCHES mipsel )
+  set( __availableToolchainArchs "mips" )
+ endif()
+ execute_process( COMMAND "${ANDROID_STANDALONE_TOOLCHAIN}/bin/${__availableToolchainMachines}-gcc${TOOL_OS_SUFFIX}" -dumpversion
+                  OUTPUT_VARIABLE __availableToolchainCompilerVersions OUTPUT_STRIP_TRAILING_WHITESPACE )
+ string( REGEX MATCH "[0-9]+[.][0-9]+([.][0-9]+)?" __availableToolchainCompilerVersions "${__availableToolchainCompilerVersions}" ) # keep only the numeric version part
+ if( EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/bin/clang${TOOL_OS_SUFFIX}" )
+  list( APPEND __availableToolchains "standalone-clang" ) # second entry; the three lists below repeat the gcc values so all lists stay index-aligned
+  list( APPEND __availableToolchainMachines ${__availableToolchainMachines} )
+  list( APPEND __availableToolchainArchs ${__availableToolchainArchs} )
+  list( APPEND __availableToolchainCompilerVersions ${__availableToolchainCompilerVersions} )
+ endif()
+endif()
+
+macro( __GLOB_NDK_TOOLCHAINS __availableToolchainsVar __availableToolchainsLst __toolchain_subpath ) # appends every usable toolchain from the list named by __availableToolchainsLst to the list named by __availableToolchainsVar and to the parallel machine/arch/version lists
+ foreach( __toolchain ${${__availableToolchainsLst}} )
+  if( "${__toolchain}" MATCHES "-clang3[.][0-9]$" AND NOT EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/${__toolchain}${__toolchain_subpath}" )
+   set( __toolchainVersionRegex "^TOOLCHAIN_VERSION[\t ]+:=[\t ]+(.*)$" ) # clang entry without its own prebuilt dir: find the gcc toolchain it pairs with
+   file( STRINGS "${ANDROID_NDK_TOOLCHAINS_PATH}/${__toolchain}/setup.mk" __toolchainVersionStr REGEX "${__toolchainVersionRegex}" )
+   if( __toolchainVersionStr )
+    string( REGEX REPLACE "${__toolchainVersionRegex}" "\\1" __toolchainVersionStr "${__toolchainVersionStr}" )
+    string( REGEX REPLACE "-clang3[.][0-9]$" "-${__toolchainVersionStr}" __gcc_toolchain "${__toolchain}" )
+   else()
+    string( REGEX REPLACE "-clang3[.][0-9]$" "-4.6" __gcc_toolchain "${__toolchain}" ) # setup.mk names no version: fall back to the 4.6 gcc toolchain
+   endif()
+   unset( __toolchainVersionStr )
+   unset( __toolchainVersionRegex )
+  else()
+   set( __gcc_toolchain "${__toolchain}" )
+  endif()
+  __DETECT_TOOLCHAIN_MACHINE_NAME( __machine "${ANDROID_NDK_TOOLCHAINS_PATH}/${__gcc_toolchain}${__toolchain_subpath}" )
+  if( __machine )
+   string( REGEX MATCH "[0-9]+[.][0-9]+([.][0-9x]+)?$" __version "${__gcc_toolchain}" ) # version is the numeric suffix of the gcc toolchain name
+   if( __machine MATCHES x86_64 )
+    set( __arch "x86_64" )
+   elseif( __machine MATCHES i686 )
+    set( __arch "x86" )
+   elseif( __machine MATCHES aarch64 )
+    set( __arch "arm64" )
+   elseif( __machine MATCHES arm )
+    set( __arch "arm" )
+   elseif( __machine MATCHES mips64el )
+    set( __arch "mips64" )
+   elseif( __machine MATCHES mipsel )
+    set( __arch "mips" )
+   else()
+    set( __arch "" )
+   endif()
+   #message("machine: !${__machine}!\narch: !${__arch}!\nversion: !${__version}!\ntoolchain: !${__toolchain}!\n")
+   if (__arch) # toolchains for an unrecognised machine are skipped entirely
+    list( APPEND __availableToolchainMachines "${__machine}" )
+    list( APPEND __availableToolchainArchs "${__arch}" )
+    list( APPEND __availableToolchainCompilerVersions "${__version}" )
+    list( APPEND ${__availableToolchainsVar} "${__toolchain}" )
+   endif()
+  endif()
+  unset( __gcc_toolchain )
+ endforeach()
+endmacro()
+
+# get all the details about NDK: supported API levels and the list of usable toolchains
+if( BUILD_WITH_ANDROID_NDK )
+ file( GLOB ANDROID_SUPPORTED_NATIVE_API_LEVELS RELATIVE "${ANDROID_NDK}/platforms" "${ANDROID_NDK}/platforms/android-*" )
+ string( REPLACE "android-" "" ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_SUPPORTED_NATIVE_API_LEVELS}" ) # "android-9;android-14" -> "9;14"
+ set( __availableToolchains "" )
+ set( __availableToolchainMachines "" )
+ set( __availableToolchainArchs "" )
+ set( __availableToolchainCompilerVersions "" )
+ if( ANDROID_TOOLCHAIN_NAME AND EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_TOOLCHAIN_NAME}/" )
+  # do not go through all toolchains if we know the name
+  set( __availableToolchainsLst "${ANDROID_TOOLCHAIN_NAME}" )
+  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
+  if( NOT __availableToolchains AND NOT ANDROID_NDK_TOOLCHAINS_SUBPATH STREQUAL ANDROID_NDK_TOOLCHAINS_SUBPATH2 ) # nothing found: retry with the second prebuilt subpath
+   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH2}" )
+   if( __availableToolchains )
+    set( ANDROID_NDK_TOOLCHAINS_SUBPATH ${ANDROID_NDK_TOOLCHAINS_SUBPATH2} )
+   endif()
+  endif()
+ endif()
+ if( NOT __availableToolchains ) # no (working) named toolchain: scan every directory under the toolchains path
+  file( GLOB __availableToolchainsLst RELATIVE "${ANDROID_NDK_TOOLCHAINS_PATH}" "${ANDROID_NDK_TOOLCHAINS_PATH}/*" )
+  if( __availableToolchainsLst )
+   list(SORT __availableToolchainsLst) # we need clang to go after gcc
+  endif()
+  __LIST_FILTER( __availableToolchainsLst "^[.]" )
+  __LIST_FILTER( __availableToolchainsLst "llvm" )
+  __LIST_FILTER( __availableToolchainsLst "renderscript" )
+  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
+  if( NOT __availableToolchains AND NOT ANDROID_NDK_TOOLCHAINS_SUBPATH STREQUAL ANDROID_NDK_TOOLCHAINS_SUBPATH2 ) # same retry with the second prebuilt subpath
+   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH2}" )
+   if( __availableToolchains )
+    set( ANDROID_NDK_TOOLCHAINS_SUBPATH ${ANDROID_NDK_TOOLCHAINS_SUBPATH2} )
+   endif()
+  endif()
+ endif()
+ if( NOT __availableToolchains )
+  message( FATAL_ERROR "Could not find any working toolchain in the NDK. Probably your Android NDK is broken." )
+ endif()
+endif()
+
+# build list of available ABIs: union of ANDROID_SUPPORTED_ABIS_<arch> over every distinct detected toolchain arch
+set( ANDROID_SUPPORTED_ABIS "" )
+set( __uniqToolchainArchNames "${__availableToolchainArchs}" ) # quoted so the variable stays defined (empty) when no arch was detected
+list( REMOVE_DUPLICATES __uniqToolchainArchNames )
+list( SORT __uniqToolchainArchNames )
+foreach( __arch ${__uniqToolchainArchNames} )
+ list( APPEND ANDROID_SUPPORTED_ABIS ${ANDROID_SUPPORTED_ABIS_${__arch}} )
+endforeach()
+unset( __uniqToolchainArchNames )
+if( NOT ANDROID_SUPPORTED_ABIS )
+ message( FATAL_ERROR "No one of known Android ABIs is supported by this cmake toolchain." )
+endif()
+
+# choose target ABI
+__INIT_VARIABLE( ANDROID_ABI VALUES ${ANDROID_SUPPORTED_ABIS} ) # NOTE(review): presumably defaults ANDROID_ABI from the supported list when it is unset - confirm in __INIT_VARIABLE
+# verify that target ABI is supported
+list( FIND ANDROID_SUPPORTED_ABIS "${ANDROID_ABI}" __androidAbiIdx )
+if( __androidAbiIdx EQUAL -1 )
+ string( REPLACE ";" "\", \"" PRINTABLE_ANDROID_SUPPORTED_ABIS  "${ANDROID_SUPPORTED_ABIS}" ) # render the list as "a", "b", ... for the message
+ message( FATAL_ERROR "Specified ANDROID_ABI = \"${ANDROID_ABI}\" is not supported by this cmake toolchain or your NDK/toolchain.
+   Supported values are: \"${PRINTABLE_ANDROID_SUPPORTED_ABIS}\"
+   " )
+endif()
+unset( __androidAbiIdx )
+
+# translate ANDROID_ABI into the NDK ABI name, arch name, LLVM triple, system processor and per-ABI feature flags
+if( ANDROID_ABI STREQUAL "x86" )
+ set( ANDROID_NDK_ABI_NAME "x86" )
+ set( ANDROID_ARCH_NAME "x86" )
+ set( ANDROID_LLVM_TRIPLE "i686-none-linux-android" )
+ set( CMAKE_SYSTEM_PROCESSOR "i686" )
+ set( X86 true )
+elseif( ANDROID_ABI STREQUAL "x86_64" )
+ set( ANDROID_NDK_ABI_NAME "x86_64" )
+ set( ANDROID_ARCH_NAME "x86_64" )
+ set( ANDROID_LLVM_TRIPLE "x86_64-none-linux-android" )
+ set( CMAKE_SYSTEM_PROCESSOR "x86_64" )
+ set( X86 true )
+ set( X86_64 true )
+elseif( ANDROID_ABI STREQUAL "mips64" )
+ set( ANDROID_NDK_ABI_NAME "mips64" )
+ set( ANDROID_ARCH_NAME "mips64" )
+ set( ANDROID_LLVM_TRIPLE "mips64el-none-linux-android" )
+ set( CMAKE_SYSTEM_PROCESSOR "mips64" )
+ set( MIPS64 true )
+elseif( ANDROID_ABI STREQUAL "mips" )
+ set( ANDROID_NDK_ABI_NAME "mips" )
+ set( ANDROID_ARCH_NAME "mips" )
+ set( ANDROID_LLVM_TRIPLE "mipsel-none-linux-android" )
+ set( CMAKE_SYSTEM_PROCESSOR "mips" )
+ set( MIPS true )
+elseif( ANDROID_ABI STREQUAL "arm64-v8a" )
+ set( ANDROID_NDK_ABI_NAME "arm64-v8a" )
+ set( ANDROID_ARCH_NAME "arm64" )
+ set( ANDROID_LLVM_TRIPLE "aarch64-none-linux-android" )
+ set( CMAKE_SYSTEM_PROCESSOR "aarch64" )
+ set( ARM64_V8A true )
+ set( VFPV3 true )
+ set( NEON true )
+elseif( ANDROID_ABI STREQUAL "armeabi" )
+ set( ANDROID_NDK_ABI_NAME "armeabi" )
+ set( ANDROID_ARCH_NAME "arm" )
+ set( ANDROID_LLVM_TRIPLE "armv5te-none-linux-androideabi" )
+ set( CMAKE_SYSTEM_PROCESSOR "armv5te" )
+ set( ARMEABI true )
+elseif( ANDROID_ABI STREQUAL "armeabi-v6 with VFP" )
+ set( ANDROID_NDK_ABI_NAME "armeabi" )
+ set( ANDROID_ARCH_NAME "arm" )
+ set( ANDROID_LLVM_TRIPLE "armv5te-none-linux-androideabi" )
+ set( CMAKE_SYSTEM_PROCESSOR "armv6" )
+ set( ARMEABI_V6 true )
+ # v6 builds always fall back to the plain armeabi platform, so flag that as well
+ set( ARMEABI true )
+elseif( ANDROID_ABI STREQUAL "armeabi-v7a" )
+ set( ANDROID_NDK_ABI_NAME "armeabi-v7a" )
+ set( ANDROID_ARCH_NAME "arm" )
+ set( ANDROID_LLVM_TRIPLE "armv7-none-linux-androideabi" )
+ set( CMAKE_SYSTEM_PROCESSOR "armv7-a" )
+ set( ARMEABI_V7A true )
+elseif( ANDROID_ABI STREQUAL "armeabi-v7a with VFPV3" )
+ set( ANDROID_NDK_ABI_NAME "armeabi-v7a" )
+ set( ANDROID_ARCH_NAME "arm" )
+ set( ANDROID_LLVM_TRIPLE "armv7-none-linux-androideabi" )
+ set( CMAKE_SYSTEM_PROCESSOR "armv7-a" )
+ set( ARMEABI_V7A true )
+ set( VFPV3 true )
+elseif( ANDROID_ABI STREQUAL "armeabi-v7a with NEON" )
+ set( ANDROID_NDK_ABI_NAME "armeabi-v7a" )
+ set( ANDROID_ARCH_NAME "arm" )
+ set( ANDROID_LLVM_TRIPLE "armv7-none-linux-androideabi" )
+ set( CMAKE_SYSTEM_PROCESSOR "armv7-a" )
+ set( ARMEABI_V7A true )
+ set( VFPV3 true )
+ set( NEON true )
+elseif( ANDROID_ABI STREQUAL "armeabi-v7a-hard with NEON" )
+ set( ANDROID_NDK_ABI_NAME "armeabi-v7a-hard" )
+ set( ANDROID_ARCH_NAME "arm" )
+ set( ANDROID_LLVM_TRIPLE "armv7-none-linux-androideabi" )
+ set( CMAKE_SYSTEM_PROCESSOR "armv7-a" )
+ set( ARMEABI_V7A_HARD true )
+ set( VFPV3 true )
+ set( NEON true )
+else()
+ message( SEND_ERROR "Unknown ANDROID_ABI=\"${ANDROID_ABI}\" is specified." )
+endif()
+
+if( CMAKE_BINARY_DIR AND EXISTS "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeSystem.cmake" )
+ # really dirty hack: append an override to the already generated CMakeSystem.cmake because
+ # it is not possible to change CMAKE_SYSTEM_PROCESSOR after the first run...
+ file( APPEND "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeSystem.cmake" "SET(CMAKE_SYSTEM_PROCESSOR \"${CMAKE_SYSTEM_PROCESSOR}\")\n" )
+endif()
+
+if( ANDROID_ARCH_NAME STREQUAL "arm" AND NOT ARMEABI_V6 ) # the ARM-vs-Thumb option is only offered for 32-bit arm ABIs other than v6
+ __INIT_VARIABLE( ANDROID_FORCE_ARM_BUILD VALUES OFF )
+ set( ANDROID_FORCE_ARM_BUILD ${ANDROID_FORCE_ARM_BUILD} CACHE BOOL "Use 32-bit ARM instructions instead of Thumb-1" FORCE )
+ mark_as_advanced( ANDROID_FORCE_ARM_BUILD )
+else()
+ unset( ANDROID_FORCE_ARM_BUILD CACHE ) # drop a stale cache entry when the option does not apply
+endif()
+
+# choose toolchain: validate the requested ANDROID_TOOLCHAIN_NAME, or auto-select one for ANDROID_ARCH_NAME
+if( ANDROID_TOOLCHAIN_NAME )
+ list( FIND __availableToolchains "${ANDROID_TOOLCHAIN_NAME}" __toolchainIdx )
+ if( __toolchainIdx EQUAL -1 )
+  list( SORT __availableToolchains )
+  string( REPLACE ";" "\n  * " toolchains_list "${__availableToolchains}" ) # format the list as a bulleted block for the error text
+  set( toolchains_list "  * ${toolchains_list}")
+  message( FATAL_ERROR "Specified toolchain \"${ANDROID_TOOLCHAIN_NAME}\" is missing in your NDK or broken. Please verify that your NDK is working or select another compiler toolchain.
+To configure the toolchain set CMake variable ANDROID_TOOLCHAIN_NAME to one of the following values:\n${toolchains_list}\n" )
+ endif()
+ list( GET __availableToolchainArchs ${__toolchainIdx} __toolchainArch )
+ if( NOT __toolchainArch STREQUAL ANDROID_ARCH_NAME )
+  message( SEND_ERROR "Selected toolchain \"${ANDROID_TOOLCHAIN_NAME}\" is not able to compile binaries for the \"${ANDROID_ARCH_NAME}\" platform." )
+ endif()
+else() # no toolchain requested: pick the entry with the greatest compiler version among those matching ANDROID_ARCH_NAME
+ set( __toolchainIdx -1 )
+ set( __applicableToolchains "" )
+ set( __toolchainMaxVersion "0.0.0" )
+ list( LENGTH __availableToolchains __availableToolchainsCount )
+ math( EXPR __availableToolchainsCount "${__availableToolchainsCount}-1" ) # foreach( RANGE n ) iterates 0..n inclusive
+ foreach( __idx RANGE ${__availableToolchainsCount} )
+  list( GET __availableToolchainArchs ${__idx} __toolchainArch )
+  if( __toolchainArch STREQUAL ANDROID_ARCH_NAME )
+   list( GET __availableToolchainCompilerVersions ${__idx} __toolchainVersion )
+   string( REPLACE "x" "99" __toolchainVersion "${__toolchainVersion}") # an "x" version component is compared as 99
+   if( __toolchainVersion VERSION_GREATER __toolchainMaxVersion )
+    set( __toolchainMaxVersion "${__toolchainVersion}" )
+    set( __toolchainIdx ${__idx} )
+   endif()
+  endif()
+ endforeach()
+ unset( __availableToolchainsCount )
+ unset( __toolchainMaxVersion )
+ unset( __toolchainVersion )
+endif()
+unset( __toolchainArch )
+if( __toolchainIdx EQUAL -1 )
+ message( FATAL_ERROR "No one of available compiler toolchains is able to compile for ${ANDROID_ARCH_NAME} platform." )
+endif()
+list( GET __availableToolchains ${__toolchainIdx} ANDROID_TOOLCHAIN_NAME ) # publish name, machine and compiler version of the selected entry
+list( GET __availableToolchainMachines ${__toolchainIdx} ANDROID_TOOLCHAIN_MACHINE_NAME )
+list( GET __availableToolchainCompilerVersions ${__toolchainIdx} ANDROID_COMPILER_VERSION )
+
+unset( __toolchainIdx )
+unset( __availableToolchains )
+unset( __availableToolchainMachines )
+unset( __availableToolchainArchs )
+unset( __availableToolchainCompilerVersions )
+
+# choose native API level
+__INIT_VARIABLE( ANDROID_NATIVE_API_LEVEL ENV_ANDROID_NATIVE_API_LEVEL ANDROID_API_LEVEL ENV_ANDROID_API_LEVEL ANDROID_STANDALONE_TOOLCHAIN_API_LEVEL ANDROID_DEFAULT_NDK_API_LEVEL_${ANDROID_ARCH_NAME} ANDROID_DEFAULT_NDK_API_LEVEL )
+string( REPLACE "android-" "" ANDROID_NATIVE_API_LEVEL "${ANDROID_NATIVE_API_LEVEL}" ) # accept both "android-N" and "N"
+string( STRIP "${ANDROID_NATIVE_API_LEVEL}" ANDROID_NATIVE_API_LEVEL )
+# adjust API level: use the highest supported level that does not exceed the request and is not below the arch default
+set( __real_api_level ${ANDROID_DEFAULT_NDK_API_LEVEL_${ANDROID_ARCH_NAME}} )
+foreach( __level ${ANDROID_SUPPORTED_NATIVE_API_LEVELS} )
+ if( (__level LESS ANDROID_NATIVE_API_LEVEL OR __level STREQUAL ANDROID_NATIVE_API_LEVEL) AND NOT __level LESS __real_api_level )
+  set( __real_api_level ${__level} )
+ endif()
+endforeach()
+if( __real_api_level AND NOT ANDROID_NATIVE_API_LEVEL STREQUAL __real_api_level )
+ message( STATUS "Adjusting Android API level 'android-${ANDROID_NATIVE_API_LEVEL}' to 'android-${__real_api_level}'")
+ set( ANDROID_NATIVE_API_LEVEL ${__real_api_level} )
+endif()
+unset(__real_api_level)
+# validate that the (possibly adjusted) level is one the NDK/toolchain provides
+list( FIND ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_NATIVE_API_LEVEL}" __levelIdx )
+if( __levelIdx EQUAL -1 )
+ message( SEND_ERROR "Specified Android native API level 'android-${ANDROID_NATIVE_API_LEVEL}' is not supported by your NDK/toolchain." )
+else()
+ if( BUILD_WITH_ANDROID_NDK )
+  __DETECT_NATIVE_API_LEVEL( __realApiLevel "${ANDROID_NDK}/platforms/android-${ANDROID_NATIVE_API_LEVEL}/arch-${ANDROID_ARCH_NAME}/usr/include/android/api-level.h" )
+  if( NOT __realApiLevel EQUAL ANDROID_NATIVE_API_LEVEL AND NOT __realApiLevel GREATER 9000 ) # NOTE(review): levels above 9000 are exempt from the check - reason not visible here
+   message( SEND_ERROR "Specified Android API level (${ANDROID_NATIVE_API_LEVEL}) does not match to the level found (${__realApiLevel}). Probably your copy of NDK is broken." )
+  endif()
+  unset( __realApiLevel )
+ endif()
+ set( ANDROID_NATIVE_API_LEVEL "${ANDROID_NATIVE_API_LEVEL}" CACHE STRING "Android API level for native code" FORCE )
+ set( CMAKE_ANDROID_API ${ANDROID_NATIVE_API_LEVEL} )
+ if( CMAKE_VERSION VERSION_GREATER "2.8" )
+  list( SORT ANDROID_SUPPORTED_NATIVE_API_LEVELS ) # NOTE(review): string sort, so multi-digit levels may not end up in numeric order
+  set_property( CACHE ANDROID_NATIVE_API_LEVEL PROPERTY STRINGS ${ANDROID_SUPPORTED_NATIVE_API_LEVELS} ) # offered as choices for the cache entry
+ endif()
+endif()
+unset( __levelIdx )
+
+
+# remember target ABI in the cache and offer the ABIs of the selected arch as choices
+set( ANDROID_ABI "${ANDROID_ABI}" CACHE STRING "The target ABI for Android. If arm, then armeabi-v7a is recommended for hardware floating point." FORCE )
+if( CMAKE_VERSION VERSION_GREATER "2.8" )
+ list( SORT ANDROID_SUPPORTED_ABIS_${ANDROID_ARCH_NAME} )
+ set_property( CACHE ANDROID_ABI PROPERTY STRINGS ${ANDROID_SUPPORTED_ABIS_${ANDROID_ARCH_NAME}} )
+endif()
+
+
+# runtime choice (STL, rtti, exceptions): default ANDROID_STL to gnustl_static and reject unsupported values
+if( NOT ANDROID_STL )
+  set( ANDROID_STL gnustl_static )
+endif()
+set( ANDROID_STL "${ANDROID_STL}" CACHE STRING "C++ runtime" )
+set( ANDROID_STL_FORCE_FEATURES ON CACHE BOOL "automatically configure rtti and exceptions support based on C++ runtime" )
+mark_as_advanced( ANDROID_STL ANDROID_STL_FORCE_FEATURES )
+
+if( BUILD_WITH_ANDROID_NDK ) # the NDK accepts the full set of runtimes listed in the message below
+ if( NOT "${ANDROID_STL}" MATCHES "^(none|system|system_re|gabi\\+\\+_static|gabi\\+\\+_shared|stlport_static|stlport_shared|gnustl_static|gnustl_shared|c\\+\\+_static|c\\+\\+_shared)$")
+  message( FATAL_ERROR "ANDROID_STL is set to invalid value \"${ANDROID_STL}\".
+The possible values are:
+  none           -> Do not configure the runtime.
+  system         -> Use the default minimal system C++ runtime library.
+  system_re      -> Same as system but with rtti and exceptions.
+  gabi++_static  -> Use the GAbi++ runtime as a static library.
+  gabi++_shared  -> Use the GAbi++ runtime as a shared library.
+  stlport_static -> Use the STLport runtime as a static library.
+  stlport_shared -> Use the STLport runtime as a shared library.
+  gnustl_static  -> (default) Use the GNU STL as a static library.
+  gnustl_shared  -> Use the GNU STL as a shared library.
+  c++_shared     -> Use the LLVM libc++ runtime as a shared library.
+  c++_static     -> Use the LLVM libc++ runtime as a static library.
+" )
+ endif()
+elseif( BUILD_WITH_STANDALONE_TOOLCHAIN ) # a standalone toolchain only accepts none, gnustl_* and c++_*
+ if( NOT "${ANDROID_STL}" MATCHES "^(none|gnustl_static|gnustl_shared|c\\+\\+_static|c\\+\\+_shared)$")
+  message( FATAL_ERROR "ANDROID_STL is set to invalid value \"${ANDROID_STL}\".
+The possible values are:
+  none           -> Do not configure the runtime.
+  gnustl_static  -> (default) Use the GNU STL as a static library.
+  gnustl_shared  -> Use the GNU STL as a shared library.
+  c++_shared     -> Use the LLVM libc++ runtime as a shared library.
+  c++_static     -> Use the LLVM libc++ runtime as a static library.
+" )
+ endif()
+endif()
+
+unset( ANDROID_RTTI ) # start from a clean slate; the STL setup below sets these again as needed
+unset( ANDROID_EXCEPTIONS )
+unset( ANDROID_STL_INCLUDE_DIRS )
+unset( __libstl )
+unset( __libsupcxx )
+
+if( NOT _CMAKE_IN_TRY_COMPILE AND ANDROID_NDK_RELEASE STREQUAL "r7b" AND ARMEABI_V7A AND NOT VFPV3 AND ANDROID_STL MATCHES "gnustl" ) # warn once, not in every try_compile sub-project
+ message( WARNING  "The GNU STL armeabi-v7a binaries from NDK r7b can crash non-NEON devices. The files provided with NDK r7b were not configured properly, resulting in crashes on Tegra2-based devices and others when trying to use certain floating-point functions (e.g., cosf, sinf, expf).
+You are strongly recommended to switch to another NDK release.
+" )
+endif()
+
+if( NOT _CMAKE_IN_TRY_COMPILE AND X86 AND ANDROID_STL MATCHES "gnustl" AND ANDROID_NDK_RELEASE STREQUAL "r6" ) # the message embeds the suggested header patch verbatim
+  message( WARNING  "The x86 system header file from NDK r6 has incorrect definition for ptrdiff_t. You are recommended to upgrade to a newer NDK release or manually patch the header:
+See https://android.googlesource.com/platform/development.git f907f4f9d4e56ccc8093df6fee54454b8bcab6c2
+  diff --git a/ndk/platforms/android-9/arch-x86/include/machine/_types.h b/ndk/platforms/android-9/arch-x86/include/machine/_types.h
+  index 5e28c64..65892a1 100644
+  --- a/ndk/platforms/android-9/arch-x86/include/machine/_types.h
+  +++ b/ndk/platforms/android-9/arch-x86/include/machine/_types.h
+  @@ -51,7 +51,11 @@ typedef long int       ssize_t;
+   #endif
+   #ifndef _PTRDIFF_T
+   #define _PTRDIFF_T
+  -typedef long           ptrdiff_t;
+  +#  ifdef __ANDROID__
+  +     typedef int            ptrdiff_t;
+  +#  else
+  +     typedef long           ptrdiff_t;
+  +#  endif
+   #endif
+" )
+endif()
+
+
+# setup paths and STL for standalone toolchain: toolchain roots, sysroot, STL include dirs and the STL / libsupc++ libraries
+if( BUILD_WITH_STANDALONE_TOOLCHAIN )
+ set( ANDROID_TOOLCHAIN_ROOT "${ANDROID_STANDALONE_TOOLCHAIN}" )
+ set( ANDROID_CLANG_TOOLCHAIN_ROOT "${ANDROID_STANDALONE_TOOLCHAIN}" )
+ set( ANDROID_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot" )
+
+ if( NOT ANDROID_STL STREQUAL "none" )
+  set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_STANDALONE_TOOLCHAIN}/include/c++/${ANDROID_COMPILER_VERSION}" )
+  if( NOT EXISTS "${ANDROID_STL_INCLUDE_DIRS}" )
+   # old location ( pre r8c )
+   set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/include/c++/${ANDROID_COMPILER_VERSION}" )
+  endif()
+  if( (ARMEABI_V7A OR ARMEABI_V7A_HARD) AND EXISTS "${ANDROID_STL_INCLUDE_DIRS}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/${CMAKE_SYSTEM_PROCESSOR}/bits" ) # add the most specific machine-dependent header dir that exists
+   list( APPEND ANDROID_STL_INCLUDE_DIRS "${ANDROID_STL_INCLUDE_DIRS}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/${CMAKE_SYSTEM_PROCESSOR}" )
+  elseif( ARMEABI AND NOT ANDROID_FORCE_ARM_BUILD AND EXISTS "${ANDROID_STL_INCLUDE_DIRS}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/thumb/bits" )
+   list( APPEND ANDROID_STL_INCLUDE_DIRS "${ANDROID_STL_INCLUDE_DIRS}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/thumb" )
+  else()
+   list( APPEND ANDROID_STL_INCLUDE_DIRS "${ANDROID_STL_INCLUDE_DIRS}/${ANDROID_TOOLCHAIN_MACHINE_NAME}" )
+  endif()
+  # always search static GNU STL to get the location of libsupc++.a
+  if( (ARMEABI_V7A OR ARMEABI_V7A_HARD) AND NOT ANDROID_FORCE_ARM_BUILD AND EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/thumb/libstdc++.a" )
+   set( __libstl "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/thumb" )
+  elseif( (ARMEABI_V7A OR ARMEABI_V7A_HARD) AND EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libstdc++.a" )
+   set( __libstl "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}" )
+  elseif( ARMEABI AND NOT ANDROID_FORCE_ARM_BUILD AND EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/thumb/libstdc++.a" )
+   set( __libstl "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/thumb" )
+  elseif( EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libstdc++.a" )
+   set( __libstl "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib" )
+  endif()
+  if( __libstl ) # turn the directory found above into the two library file paths
+   set( __libsupcxx "${__libstl}/libsupc++.a" )
+   set( __libstl    "${__libstl}/libstdc++.a" )
+  endif()
+  if( NOT EXISTS "${__libsupcxx}" )
+   message( FATAL_ERROR "The required libsupc++.a is missing in your standalone toolchain.
+ Usually it happens because of bug in make-standalone-toolchain.sh script from NDK r7, r7b and r7c.
+ You need to either upgrade to newer NDK or manually copy
+     $ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a
+ to
+     ${__libsupcxx}
+   " )
+  endif()
+  if( ANDROID_STL STREQUAL "gnustl_shared" ) # shared runtime requested: point __libstl at libgnustl_shared.so when one is found
+   if( (ARMEABI_V7A OR ARMEABI_V7A_HARD) AND EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libgnustl_shared.so" )
+    set( __libstl "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libgnustl_shared.so" )
+   elseif( ARMEABI AND NOT ANDROID_FORCE_ARM_BUILD AND EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/thumb/libgnustl_shared.so" )
+    set( __libstl "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/thumb/libgnustl_shared.so" )
+   elseif( EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libgnustl_shared.so" )
+    set( __libstl "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libgnustl_shared.so" )
+   endif()
+  endif()
+ endif()
+endif()
+
+# clang: decide whether the selected toolchain is clang-based and work out its version, root and driver name
+if( "${ANDROID_TOOLCHAIN_NAME}" STREQUAL "standalone-clang" )
+ set( ANDROID_COMPILER_IS_CLANG 1 )
+ execute_process( COMMAND "${ANDROID_CLANG_TOOLCHAIN_ROOT}/bin/clang${TOOL_OS_SUFFIX}" --version OUTPUT_VARIABLE ANDROID_CLANG_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE )
+ string( REGEX MATCH "[0-9]+[.][0-9]+" ANDROID_CLANG_VERSION "${ANDROID_CLANG_VERSION}") # keep only major.minor
+elseif( "${ANDROID_TOOLCHAIN_NAME}" MATCHES "-clang3[.][0-9]?$" ) # NOTE(review): digit is optional here but required by the MATCH on the next line - confirm intent
+ string( REGEX MATCH "3[.][0-9]$" ANDROID_CLANG_VERSION "${ANDROID_TOOLCHAIN_NAME}")
+ string( REGEX REPLACE "-clang${ANDROID_CLANG_VERSION}$" "-${ANDROID_COMPILER_VERSION}" ANDROID_GCC_TOOLCHAIN_NAME "${ANDROID_TOOLCHAIN_NAME}" ) # name of the gcc toolchain that accompanies this clang
+ if( NOT EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/llvm-${ANDROID_CLANG_VERSION}${ANDROID_NDK_TOOLCHAINS_SUBPATH}/bin/clang${TOOL_OS_SUFFIX}" )
+  message( FATAL_ERROR "Could not find the Clang compiler driver" )
+ endif()
+ set( ANDROID_COMPILER_IS_CLANG 1 )
+ set( ANDROID_CLANG_TOOLCHAIN_ROOT "${ANDROID_NDK_TOOLCHAINS_PATH}/llvm-${ANDROID_CLANG_VERSION}${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
+else()
+ set( ANDROID_GCC_TOOLCHAIN_NAME "${ANDROID_TOOLCHAIN_NAME}" )
+ unset( ANDROID_COMPILER_IS_CLANG CACHE )
+endif()
+
+string( REPLACE "." "" _clang_name "clang${ANDROID_CLANG_VERSION}" ) # try a versioned driver name first, e.g. "clang3.4" -> "clang34"
+if( NOT EXISTS "${ANDROID_CLANG_TOOLCHAIN_ROOT}/bin/${_clang_name}${TOOL_OS_SUFFIX}" )
+ set( _clang_name "clang" ) # fall back to the unversioned driver
+endif()
+
+
+# setup paths and STL for NDK
+if( BUILD_WITH_ANDROID_NDK )
+ set( ANDROID_TOOLCHAIN_ROOT "${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
+ set( ANDROID_SYSROOT "${ANDROID_NDK}/platforms/android-${ANDROID_NATIVE_API_LEVEL}/arch-${ANDROID_ARCH_NAME}" )
+
+ if( ANDROID_STL STREQUAL "none" )
+  # do nothing
+ elseif( ANDROID_STL STREQUAL "system" )
+  set( ANDROID_RTTI             OFF )
+  set( ANDROID_EXCEPTIONS       OFF )
+  set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_NDK}/sources/cxx-stl/system/include" )
+ elseif( ANDROID_STL STREQUAL "system_re" )
+  set( ANDROID_RTTI             ON )
+  set( ANDROID_EXCEPTIONS       ON )
+  set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_NDK}/sources/cxx-stl/system/include" )
+ elseif( ANDROID_STL MATCHES "gabi" )
+  if( ANDROID_NDK_RELEASE_NUM LESS 7000 ) # before r7
+   message( FATAL_ERROR "gabi++ is not available in your NDK. You have to upgrade to NDK r7 or newer to use gabi++.")
+  endif()
+  set( ANDROID_RTTI             ON )
+  set( ANDROID_EXCEPTIONS       OFF )
+  set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_NDK}/sources/cxx-stl/gabi++/include" )
+  set( __libstl                 "${ANDROID_NDK}/sources/cxx-stl/gabi++/libs/${ANDROID_NDK_ABI_NAME}/libgabi++_static.a" )
+ elseif( ANDROID_STL MATCHES "stlport" )
+  if( NOT ANDROID_NDK_RELEASE_NUM LESS 8004 ) # before r8d
+   set( ANDROID_EXCEPTIONS       ON )
+  else()
+   set( ANDROID_EXCEPTIONS       OFF )
+  endif()
+  if( ANDROID_NDK_RELEASE_NUM LESS 7000 ) # before r7
+   set( ANDROID_RTTI            OFF )
+  else()
+   set( ANDROID_RTTI            ON )
+  endif()
+  set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_NDK}/sources/cxx-stl/stlport/stlport" )
+  set( __libstl                 "${ANDROID_NDK}/sources/cxx-stl/stlport/libs/${ANDROID_NDK_ABI_NAME}/libstlport_static.a" )
+ elseif( ANDROID_STL MATCHES "gnustl" )
+  set( ANDROID_EXCEPTIONS       ON )
+  set( ANDROID_RTTI             ON )
+  if( EXISTS "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}" )
+   if( ARMEABI_V7A AND ANDROID_COMPILER_VERSION VERSION_EQUAL "4.7" AND ANDROID_NDK_RELEASE STREQUAL "r8d" )
+    # gnustl binary for 4.7 compiler is buggy :(
+    # TODO: look for right fix
+    set( __libstl                "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/4.6" )
+   else()
+    set( __libstl                "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}" )
+   endif()
+  else()
+   set( __libstl                "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++" )
+  endif()
+  set( ANDROID_STL_INCLUDE_DIRS "${__libstl}/include" "${__libstl}/libs/${ANDROID_NDK_ABI_NAME}/include" "${__libstl}/include/backward" )
+  if( EXISTS "${__libstl}/libs/${ANDROID_NDK_ABI_NAME}/libgnustl_static.a" )
+   set( __libstl                "${__libstl}/libs/${ANDROID_NDK_ABI_NAME}/libgnustl_static.a" )
+  else()
+   set( __libstl                "${__libstl}/libs/${ANDROID_NDK_ABI_NAME}/libstdc++.a" )
+  endif()
+ elseif( ANDROID_STL MATCHES "c\\+\\+" )
+  set( ANDROID_EXCEPTIONS       ON )
+  set( ANDROID_RTTI             ON )
+  set( __libstl                "${ANDROID_NDK}/sources/cxx-stl/llvm-libc++" )
+  set( __libstl                "${__libstl}/libs/${ANDROID_NDK_ABI_NAME}/libc++_static.a" )
+  set( __libgnustl             "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}" )
+  set( ANDROID_STL_INCLUDE_DIRS "${__libgnustl}/include" "${__libgnustl}/libs/${ANDROID_NDK_ABI_NAME}/include" "${__libgnustl}/include/backward" )
+ else()
+  message( FATAL_ERROR "Unknown runtime: ${ANDROID_STL}" )
+ endif()
+
+ # find libsupc++.a - rtti & exceptions
+ if( ANDROID_STL STREQUAL "system_re" OR ANDROID_STL MATCHES "gnustl" )
+  set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" ) # r8b or newer
+  if( NOT EXISTS "${__libsupcxx}" )
+   set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" ) # r7-r8
+  endif()
+  if( NOT EXISTS "${__libsupcxx}" ) # before r7
+   if( ARMEABI_V7A )
+    if( ANDROID_FORCE_ARM_BUILD )
+     set( __libsupcxx "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libsupc++.a" )
+    else()
+     set( __libsupcxx "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/thumb/libsupc++.a" )
+    endif()
+   elseif( ARMEABI AND NOT ANDROID_FORCE_ARM_BUILD )
+    set( __libsupcxx "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/thumb/libsupc++.a" )
+   else()
+    set( __libsupcxx "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libsupc++.a" )
+   endif()
+  endif()
+  # message() has no "ERROR" mode: the word used to be printed as part of the text
+  # ("ERRORCould not find ...") in an unmarked notice. Emit a real warning instead.
+  # Configuration still continues, as before; the link rules further down only use
+  # the archive when it exists.
+  if( NOT EXISTS "${__libsupcxx}" )
+   message( WARNING "Could not find libsupc++.a for a chosen platform. Either your NDK is not supported or is broken." )
+  endif()
+ endif()
+endif()
+
+
+# Shared STL requested: retarget __libstl from the "_static.a" archive to its
+# "_shared.so" sibling and abort the configuration when that file is missing.
+if( DEFINED __libstl AND ANDROID_STL MATCHES "shared" )
+  string( REPLACE "_static.a" "_shared.so" __libstl "${__libstl}" )
+  if( EXISTS "${__libstl}" )
+    # shared runtime located - nothing else to do here
+  else()
+    message( FATAL_ERROR "Unable to find shared library ${__libstl}" )
+  endif()
+endif()
+
+
+# ccache support
+# _ndk_ccache receives the requested ccache program through __INIT_VARIABLE
+# (NOTE(review): presumably taken from the NDK_CCACHE variable or the NDK_CCACHE
+# environment variable - confirm against the __INIT_VARIABLE definition).
+__INIT_VARIABLE( _ndk_ccache NDK_CCACHE ENV_NDK_CCACHE )
+if( _ndk_ccache )
+ # Drop a cached NDK_CCACHE that does not point at an existing file so that
+ # find_program() searches again. The path must be passed as "${NDK_CCACHE}":
+ # if(EXISTS) does not dereference a bare variable name, so `EXISTS NDK_CCACHE`
+ # looked for a file literally called "NDK_CCACHE" and threw the cached result
+ # away on every run.
+ if( DEFINED NDK_CCACHE AND NOT EXISTS "${NDK_CCACHE}" )
+  unset( NDK_CCACHE CACHE )
+ endif()
+ find_program( NDK_CCACHE "${_ndk_ccache}" DOC "The path to ccache binary")
+else()
+ # ccache not requested: make sure no stale cache entry survives
+ unset( NDK_CCACHE CACHE )
+endif()
+unset( _ndk_ccache )
+
+
+# setup the cross-compiler
+# Runs only while CMAKE_C_COMPILER is still unset. Every tool is stored in the cache.
+# The __android_* variables are scratch values and are removed at the end.
+if( NOT CMAKE_C_COMPILER )
+  # common prefix of all GCC toolchain programs: <root>/bin/<machine>-
+  set( __android_tool "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-" )
+
+  # resolve the real compiler drivers first (clang or gcc)
+  if( ANDROID_COMPILER_IS_CLANG )
+    set( __android_cc  "${ANDROID_CLANG_TOOLCHAIN_ROOT}/bin/${_clang_name}${TOOL_OS_SUFFIX}" )
+    set( __android_cxx "${ANDROID_CLANG_TOOLCHAIN_ROOT}/bin/${_clang_name}++${TOOL_OS_SUFFIX}" )
+  else()
+    set( __android_cc  "${__android_tool}gcc${TOOL_OS_SUFFIX}" )
+    set( __android_cxx "${__android_tool}g++${TOOL_OS_SUFFIX}" )
+  endif()
+
+  if( NDK_CCACHE AND NOT ANDROID_SYSROOT MATCHES "[ ;\"]" )
+    # ccache becomes the compiler, the real driver is handed over as its first argument
+    set( CMAKE_C_COMPILER        "${NDK_CCACHE}"    CACHE PATH "ccache as C compiler" )
+    set( CMAKE_CXX_COMPILER      "${NDK_CCACHE}"    CACHE PATH "ccache as C++ compiler" )
+    set( CMAKE_C_COMPILER_ARG1   "${__android_cc}"  CACHE PATH "C compiler" )
+    set( CMAKE_CXX_COMPILER_ARG1 "${__android_cxx}" CACHE PATH "C++ compiler" )
+  else()
+    set( CMAKE_C_COMPILER   "${__android_cc}"  CACHE PATH "C compiler" )
+    set( CMAKE_CXX_COMPILER "${__android_cxx}" CACHE PATH "C++ compiler" )
+  endif()
+
+  # assembler and binutils always come from the GCC toolchain
+  set( CMAKE_ASM_COMPILER "${__android_tool}gcc${TOOL_OS_SUFFIX}"   CACHE PATH "assembler" )
+  set( CMAKE_STRIP        "${__android_tool}strip${TOOL_OS_SUFFIX}" CACHE PATH "strip" )
+  if( EXISTS "${__android_tool}gcc-ar${TOOL_OS_SUFFIX}" )
+    # Use gcc-ar if we have it for better LTO support.
+    set( CMAKE_AR "${__android_tool}gcc-ar${TOOL_OS_SUFFIX}" CACHE PATH "archive" )
+  else()
+    set( CMAKE_AR "${__android_tool}ar${TOOL_OS_SUFFIX}"     CACHE PATH "archive" )
+  endif()
+  set( CMAKE_LINKER  "${__android_tool}ld${TOOL_OS_SUFFIX}"      CACHE PATH "linker" )
+  set( CMAKE_NM      "${__android_tool}nm${TOOL_OS_SUFFIX}"      CACHE PATH "nm" )
+  set( CMAKE_OBJCOPY "${__android_tool}objcopy${TOOL_OS_SUFFIX}" CACHE PATH "objcopy" )
+  set( CMAKE_OBJDUMP "${__android_tool}objdump${TOOL_OS_SUFFIX}" CACHE PATH "objdump" )
+  set( CMAKE_RANLIB  "${__android_tool}ranlib${TOOL_OS_SUFFIX}"  CACHE PATH "ranlib" )
+
+  unset( __android_cxx )
+  unset( __android_cc )
+  unset( __android_tool )
+endif()
+
+# "<machine>-" prefix of the cross binutils (internal CMake variable consulted
+# when it looks for toolchain programs)
+set( _CMAKE_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_MACHINE_NAME}-" )
+
+# CMake older than 2.8.5 gets "-c" as the first assembler argument
+if( CMAKE_VERSION VERSION_LESS 2.8.5 )
+  set( CMAKE_ASM_COMPILER_ARG1 "-c" )
+endif()
+
+# install_name_tool is mandatory whenever APPLE is set
+if( APPLE )
+  find_program( CMAKE_INSTALL_NAME_TOOL NAMES install_name_tool )
+  if( CMAKE_INSTALL_NAME_TOOL )
+    mark_as_advanced( CMAKE_INSTALL_NAME_TOOL )
+  else()
+    message( FATAL_ERROR "Could not find install_name_tool, please check your installation." )
+  endif()
+endif()
+
+# Force set compilers because standard identification works badly for us
+# Both drivers are registered as GNU through CMakeForceCompiler; when clang is in
+# use the compiler id is overwritten right after the corresponding macro call.
+include( CMakeForceCompiler )
+
+# ---- C ----
+CMAKE_FORCE_C_COMPILER( "${CMAKE_C_COMPILER}" GNU )
+if( ANDROID_COMPILER_IS_CLANG )
+  set( CMAKE_C_COMPILER_ID Clang )
+endif()
+set( CMAKE_C_PLATFORM_ID Linux )
+# pointers are 4 bytes wide unless one of the 64-bit ABIs is selected
+set( CMAKE_C_SIZEOF_DATA_PTR 4 )
+if( ARM64_V8A OR X86_64 OR MIPS64 )
+  set( CMAKE_C_SIZEOF_DATA_PTR 8 )
+endif()
+set( CMAKE_C_COMPILER_ABI ELF )
+set( CMAKE_C_HAS_ISYSROOT 1 )
+
+# ---- C++ ----
+CMAKE_FORCE_CXX_COMPILER( "${CMAKE_CXX_COMPILER}" GNU )
+if( ANDROID_COMPILER_IS_CLANG )
+  set( CMAKE_CXX_COMPILER_ID Clang)
+endif()
+set( CMAKE_CXX_PLATFORM_ID Linux )
+set( CMAKE_CXX_SIZEOF_DATA_PTR ${CMAKE_C_SIZEOF_DATA_PTR} )
+set( CMAKE_CXX_COMPILER_ABI ELF )
+set( CMAKE_CXX_HAS_ISYSROOT 1 )
+set( CMAKE_CXX_SOURCE_FILE_EXTENSIONS cc cp cxx cpp CPP c++ C )
+
+# ---- ASM ----
+# force ASM compiler (required for CMake < 2.8.5)
+set( CMAKE_ASM_COMPILER_ID GNU )
+set( CMAKE_ASM_COMPILER_ID_RUN TRUE )
+set( CMAKE_ASM_COMPILER_FORCED TRUE )
+set( CMAKE_ASM_COMPILER_WORKS TRUE )
+set( CMAKE_COMPILER_IS_GNUASM 1)
+set( CMAKE_ASM_SOURCE_FILE_EXTENSIONS s S asm )
+
+# Publish the compiler version for C, C++ and ASM: the clang version when clang
+# is the compiler, the GCC toolchain version otherwise.
+if( ANDROID_COMPILER_IS_CLANG )
+  foreach( lang C CXX ASM )
+    set( CMAKE_${lang}_COMPILER_VERSION ${ANDROID_CLANG_VERSION} )
+  endforeach()
+else()
+  foreach( lang C CXX ASM )
+    set( CMAKE_${lang}_COMPILER_VERSION ${ANDROID_COMPILER_VERSION} )
+  endforeach()
+endif()
+
+# flags and definitions
+# Remove a previously added -DANDROID before adding it again.
+remove_definitions( -DANDROID )
+add_definitions( -DANDROID )
+
+# ANDROID_CXX_FLAGS starts out as the --sysroot option. A sysroot path that
+# contains a space, a semicolon or a double quote needs special treatment.
+if( NOT ANDROID_SYSROOT MATCHES "[ ;\"]" )
+  set( ANDROID_CXX_FLAGS "--sysroot=${ANDROID_SYSROOT}" )
+else()
+  if( NOT CMAKE_HOST_WIN32 )
+    # non-Windows hosts: wrap the whole option in single quotes
+    set( ANDROID_CXX_FLAGS "'--sysroot=${ANDROID_SYSROOT}'" )
+  else()
+    # try to convert path to 8.3 form with a tiny helper batch file
+    file( WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/cvt83.cmd" "@echo %~s1" )
+    execute_process( COMMAND "$ENV{ComSpec}" /c "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/cvt83.cmd" "${ANDROID_SYSROOT}"
+                     OUTPUT_VARIABLE __path OUTPUT_STRIP_TRAILING_WHITESPACE
+                     RESULT_VARIABLE __result ERROR_QUIET )
+    if( NOT __result EQUAL 0 )
+      # conversion failed: fall back to a double-quoted path
+      set( ANDROID_CXX_FLAGS "--sysroot=\"${ANDROID_SYSROOT}\"" )
+    else()
+      # conversion succeeded: ANDROID_SYSROOT itself is replaced by the short path
+      file( TO_CMAKE_PATH "${__path}" ANDROID_SYSROOT )
+      set( ANDROID_CXX_FLAGS "--sysroot=${ANDROID_SYSROOT}" )
+    endif()
+  endif()
+  if( NOT _CMAKE_IN_TRY_COMPILE )
+    # quotes can break try_compile and compiler identification
+    message(WARNING "Path to your Android NDK (or toolchain) has non-alphanumeric symbols.\nThe build might be broken.\n")
+  endif()
+endif()
+
+# NDK flags
+# Per-architecture compiler flags. ANDROID_CXX_FLAGS is extended for all build
+# types; ANDROID_CXX_FLAGS_RELEASE and ANDROID_CXX_FLAGS_DEBUG are assigned here.
+# Options in the NOT ANDROID_COMPILER_IS_CLANG branches are added for GCC only.
+if (ARM64_V8A )
+ set( ANDROID_CXX_FLAGS         "${ANDROID_CXX_FLAGS} -funwind-tables" )
+ set( ANDROID_CXX_FLAGS_RELEASE "-fomit-frame-pointer -fstrict-aliasing" )
+ set( ANDROID_CXX_FLAGS_DEBUG   "-fno-omit-frame-pointer -fno-strict-aliasing" )
+ if( NOT ANDROID_COMPILER_IS_CLANG )
+  set( ANDROID_CXX_FLAGS_RELEASE "${ANDROID_CXX_FLAGS_RELEASE} -funswitch-loops -finline-limit=300" )
+ endif()
+elseif( ARMEABI OR ARMEABI_V7A OR ARMEABI_V7A_HARD)
+ set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -funwind-tables" )
+ if( NOT ANDROID_FORCE_ARM_BUILD AND NOT ARMEABI_V6 )
+  # default: thumb code for Release, arm code for Debug
+  set( ANDROID_CXX_FLAGS_RELEASE "-mthumb -fomit-frame-pointer -fno-strict-aliasing" )
+  set( ANDROID_CXX_FLAGS_DEBUG   "-marm -fno-omit-frame-pointer -fno-strict-aliasing" )
+  if( NOT ANDROID_COMPILER_IS_CLANG )
+   set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -finline-limit=64" )
+  endif()
+ else()
+  # always compile ARMEABI_V6 in arm mode; otherwise there is no difference from ARMEABI
+  set( ANDROID_CXX_FLAGS_RELEASE "-marm -fomit-frame-pointer -fstrict-aliasing" )
+  set( ANDROID_CXX_FLAGS_DEBUG   "-marm -fno-omit-frame-pointer -fno-strict-aliasing" )
+  if( NOT ANDROID_COMPILER_IS_CLANG )
+   set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -funswitch-loops -finline-limit=300" )
+  endif()
+ endif()
+elseif( X86 OR X86_64 )
+ set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -funwind-tables" )
+ if( NOT ANDROID_COMPILER_IS_CLANG )
+  set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -funswitch-loops -finline-limit=300" )
+ endif()
+ set( ANDROID_CXX_FLAGS_RELEASE "-fomit-frame-pointer -fstrict-aliasing" )
+ set( ANDROID_CXX_FLAGS_DEBUG   "-fno-omit-frame-pointer -fno-strict-aliasing" )
+elseif( MIPS OR MIPS64 )
+ set( ANDROID_CXX_FLAGS         "${ANDROID_CXX_FLAGS} -fno-strict-aliasing -finline-functions -funwind-tables -fmessage-length=0" )
+ set( ANDROID_CXX_FLAGS_RELEASE "-fomit-frame-pointer" )
+ set( ANDROID_CXX_FLAGS_DEBUG   "-fno-omit-frame-pointer" )
+ if( NOT ANDROID_COMPILER_IS_CLANG )
+  set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -fno-inline-functions-called-once -fgcse-after-reload -frerun-cse-after-loop -frename-registers" )
+  set( ANDROID_CXX_FLAGS_RELEASE "${ANDROID_CXX_FLAGS_RELEASE} -funswitch-loops -finline-limit=300" )
+ endif()
+else()
+ # Fallback for any other architecture: no build-type specific flags.
+ # This used to be spelled `elseif()`; an empty condition is always false, so
+ # the two assignments below could never run.
+ set( ANDROID_CXX_FLAGS_RELEASE "" )
+ set( ANDROID_CXX_FLAGS_DEBUG   "" )
+endif()
+
+# -fsigned-char: good/necessary when porting desktop libraries
+set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -fsigned-char" )
+
+# compilers from 4.6 on also get -no-canonical-prefixes,
+# see https://android-review.googlesource.com/#/c/47564/
+if( NOT ANDROID_COMPILER_VERSION VERSION_LESS "4.6" )
+  set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -no-canonical-prefixes" )
+endif()
+
+# -Wno-psabi goes in front of everything, except for x86 targets and clang
+if( NOT (X86 OR ANDROID_COMPILER_IS_CLANG) )
+  set( ANDROID_CXX_FLAGS "-Wno-psabi ${ANDROID_CXX_FLAGS}" )
+endif()
+
+# ABI-specific flags
+# Step 1: architecture and float ABI for the selected ARM flavour.
+if( ARMEABI_V7A_HARD )
+  set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -march=armv7-a -mfloat-abi=hard -mhard-float -D_NDK_MATH_NO_SOFTFP=1" )
+elseif( ARMEABI_V7A )
+  set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -march=armv7-a -mfloat-abi=softfp" )
+elseif( ARMEABI_V6 )
+  set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -march=armv6 -mfloat-abi=softfp -mfpu=vfp" ) # vfp == vfpv2
+elseif( ARMEABI )
+  set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -march=armv5te -mtune=xscale -msoft-float" )
+endif()
+
+# Step 2: both armv7-a flavours share the FPU choice - NEON first, then VFPV3,
+# vfpv3-d16 when neither is requested.
+if( ARMEABI_V7A_HARD OR ARMEABI_V7A )
+  if( NEON )
+    set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -mfpu=neon" )
+  elseif( VFPV3 )
+    set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -mfpu=vfpv3" )
+  else()
+    set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -mfpu=vfpv3-d16" )
+  endif()
+endif()
+
+# C++ link rule templates. With gnustl and at least one runtime archive on disk
+# the C compiler driver performs the link, otherwise the C++ driver does. The
+# three templates differ in nothing but that driver placeholder.
+if( ANDROID_STL MATCHES "gnustl" AND (EXISTS "${__libstl}" OR EXISTS "${__libsupcxx}") )
+  set( __android_link_driver "<CMAKE_C_COMPILER>" )
+else()
+  set( __android_link_driver "<CMAKE_CXX_COMPILER>" )
+endif()
+set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${__android_link_driver} <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+set( CMAKE_CXX_CREATE_SHARED_MODULE  "${__android_link_driver} <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+set( CMAKE_CXX_LINK_EXECUTABLE       "${__android_link_driver} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
+unset( __android_link_driver )
+
+# STL
+# Append the runtime archives that exist on disk to the link rule templates.
+
+# the STL library itself goes onto every C++ rule
+if( EXISTS "${__libstl}" )
+  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} \"${__libstl}\"" )
+  set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} \"${__libstl}\"" )
+  set( CMAKE_CXX_LINK_EXECUTABLE       "${CMAKE_CXX_LINK_EXECUTABLE} \"${__libstl}\"" )
+endif()
+
+# libsupc++ goes onto the C++ rules and onto freshly written C rules
+if( EXISTS "${__libsupcxx}" )
+  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} \"${__libsupcxx}\"" )
+  set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} \"${__libsupcxx}\"" )
+  set( CMAKE_CXX_LINK_EXECUTABLE       "${CMAKE_CXX_LINK_EXECUTABLE} \"${__libsupcxx}\"" )
+  # C objects: template and archive are written in a single assignment
+  set( CMAKE_C_CREATE_SHARED_LIBRARY "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_C_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_C_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES> \"${__libsupcxx}\"" )
+  set( CMAKE_C_CREATE_SHARED_MODULE  "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_C_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_C_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES> \"${__libsupcxx}\"" )
+  set( CMAKE_C_LINK_EXECUTABLE       "<CMAKE_C_COMPILER> <FLAGS> <CMAKE_C_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> \"${__libsupcxx}\"" )
+endif()
+
+# gnustl with at least one archive present: the math library is added last,
+# "-lm" unless ANDROID_LIBM_PATH names an existing file
+if( (EXISTS "${__libstl}" OR EXISTS "${__libsupcxx}") AND ANDROID_STL MATCHES "gnustl" )
+  if( NOT EXISTS "${ANDROID_LIBM_PATH}" )
+    set( ANDROID_LIBM_PATH -lm )
+  endif()
+  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} ${ANDROID_LIBM_PATH}" )
+  set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} ${ANDROID_LIBM_PATH}" )
+  set( CMAKE_CXX_LINK_EXECUTABLE       "${CMAKE_CXX_LINK_EXECUTABLE} ${ANDROID_LIBM_PATH}" )
+endif()
+
+# variables controlling optional build flags
+# Each switch is initialised through __INIT_VARIABLE (NOTE(review): VALUES
+# presumably supplies the default used when the variable is not set yet - confirm
+# against the macro definition) and then published as an advanced BOOL cache entry.
+if( ANDROID_NDK_RELEASE_NUM LESS 7000 ) # before r7
+ # libGLESv2.so in NDK's prior to r7 refers to missing external symbols.
+ # So this flag option is required for all projects using OpenGL from native.
+ __INIT_VARIABLE( ANDROID_SO_UNDEFINED                      VALUES ON )
+else()
+ __INIT_VARIABLE( ANDROID_SO_UNDEFINED                      VALUES OFF )
+endif()
+__INIT_VARIABLE( ANDROID_NO_UNDEFINED                       VALUES ON )
+__INIT_VARIABLE( ANDROID_FUNCTION_LEVEL_LINKING             VALUES ON )
+__INIT_VARIABLE( ANDROID_GOLD_LINKER                        VALUES ON )
+__INIT_VARIABLE( ANDROID_NOEXECSTACK                        VALUES ON )
+__INIT_VARIABLE( ANDROID_RELRO                              VALUES ON )
+
+set( ANDROID_NO_UNDEFINED           ${ANDROID_NO_UNDEFINED}           CACHE BOOL "Show all undefined symbols as linker errors" )
+set( ANDROID_SO_UNDEFINED           ${ANDROID_SO_UNDEFINED}           CACHE BOOL "Allows or disallows undefined symbols in shared libraries" )
+set( ANDROID_FUNCTION_LEVEL_LINKING ${ANDROID_FUNCTION_LEVEL_LINKING} CACHE BOOL "Put each function in separate section and enable garbage collection of unused input sections at link time" )
+set( ANDROID_GOLD_LINKER            ${ANDROID_GOLD_LINKER}            CACHE BOOL "Enables gold linker" )
+# The help text below used to be a copy of the ANDROID_SO_UNDEFINED description;
+# it now describes what the switch controls (the noexecstack compiler/linker flags).
+set( ANDROID_NOEXECSTACK            ${ANDROID_NOEXECSTACK}            CACHE BOOL "Marks the stack of produced binaries as non-executable" )
+set( ANDROID_RELRO                  ${ANDROID_RELRO}                  CACHE BOOL "Enables RELRO - a memory corruption mitigation technique" )
+mark_as_advanced( ANDROID_NO_UNDEFINED ANDROID_SO_UNDEFINED ANDROID_FUNCTION_LEVEL_LINKING ANDROID_GOLD_LINKER ANDROID_NOEXECSTACK ANDROID_RELRO )
+
+# linker flags
+# ANDROID_LINKER_FLAGS is rebuilt from scratch; options are appended in this order.
+set( ANDROID_LINKER_FLAGS "" )
+
+# armeabi-v7a:
+# this is *required* to use the following linker flags that routes around
+# a CPU bug in some Cortex-A8 implementations:
+if( ARMEABI_V7A )
+  set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--fix-cortex-a8" )
+endif()
+
+# hard-float ABI: silence mismatch warnings and link libm_hard
+if( ARMEABI_V7A_HARD )
+  set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--no-warn-mismatch -lm_hard" )
+endif()
+
+# undefined symbols become link errors
+if( ANDROID_NO_UNDEFINED )
+  if( NOT MIPS )
+    set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--no-undefined" )
+  elseif( NOT ANDROID_SYSROOT MATCHES "[ ;\"]" )
+    # there is some sysroot-related problem in mips linker...
+    set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--no-undefined -Wl,-rpath-link,${ANDROID_SYSROOT}/usr/lib" )
+  endif()
+endif()
+
+# tolerate undefined symbols coming from shared libraries
+if( ANDROID_SO_UNDEFINED )
+  set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,-allow-shlib-undefined" )
+endif()
+
+# one section per function/data item at compile time, unused sections dropped at link time
+if( ANDROID_FUNCTION_LEVEL_LINKING )
+  set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--gc-sections" )
+  set( ANDROID_CXX_FLAGS    "${ANDROID_CXX_FLAGS} -fdata-sections -ffunction-sections" )
+endif()
+
+# Explicit linker selection, applied to the 4.6 compiler only:
+#  - gold when requested, for arm/x86 ABIs, on a UNIX host or with an NDK newer than r8b
+#  - bfd for any other case with an NDK newer than r8b
+#  - on r8b with armeabi just a warning about the known bfd relocation problem
+if( ANDROID_COMPILER_VERSION VERSION_EQUAL "4.6" )
+  if( ANDROID_GOLD_LINKER
+      AND (CMAKE_HOST_UNIX OR ANDROID_NDK_RELEASE_NUM GREATER 8002)
+      AND (ARMEABI OR ARMEABI_V7A OR ARMEABI_V7A_HARD OR X86) )
+    set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -fuse-ld=gold" )
+  elseif( ANDROID_NDK_RELEASE_NUM GREATER 8002 ) # after r8b
+    set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -fuse-ld=bfd" )
+  elseif( ARMEABI AND NOT _CMAKE_IN_TRY_COMPILE AND ANDROID_NDK_RELEASE STREQUAL "r8b" )
+    message( WARNING "The default bfd linker from arm GCC 4.6 toolchain can fail with 'unresolvable R_ARM_THM_CALL relocation' error message. See https://code.google.com/p/android/issues/detail?id=35342
+  On Linux and OS X host platform you can workaround this problem using gold linker (default).
+  Rerun cmake with -DANDROID_GOLD_LINKER=ON option in case of problems.
+" )
+  endif()
+endif() # version 4.6
+
+# non-executable stack: one linker option plus a compiler-specific compile option
+if( ANDROID_NOEXECSTACK )
+  set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,-z,noexecstack" )
+  if( NOT ANDROID_COMPILER_IS_CLANG )
+    set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -Wa,--noexecstack" )
+  else()
+    set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -Xclang -mnoexecstack" )
+  endif()
+endif()
+
+# RELRO together with immediate binding
+if( ANDROID_RELRO )
+  set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,-z,relro -Wl,-z,now" )
+endif()
+
+# clang: the target triple goes in front of all other flags; NDK builds also
+# name the GCC toolchain root before it
+if( ANDROID_COMPILER_IS_CLANG )
+  if( BUILD_WITH_ANDROID_NDK )
+    set( ANDROID_CXX_FLAGS "-gcc-toolchain ${ANDROID_TOOLCHAIN_ROOT} -target ${ANDROID_LLVM_TRIPLE} -Qunused-arguments ${ANDROID_CXX_FLAGS}" )
+  else()
+    set( ANDROID_CXX_FLAGS "-target ${ANDROID_LLVM_TRIPLE} -Qunused-arguments ${ANDROID_CXX_FLAGS}" )
+  endif()
+endif()
+
+# cache flags
+# Default values for the user-editable flag variables. These set() calls carry no
+# FORCE, so a value that is already in the cache is left alone.
+set( CMAKE_CXX_FLAGS           ""                        CACHE STRING "c++ flags" )
+set( CMAKE_C_FLAGS             ""                        CACHE STRING "c flags" )
+set( CMAKE_CXX_FLAGS_RELEASE   "-O3 -DNDEBUG"            CACHE STRING "c++ Release flags" )
+set( CMAKE_C_FLAGS_RELEASE     "-O3 -DNDEBUG"            CACHE STRING "c Release flags" )
+set( CMAKE_CXX_FLAGS_DEBUG     "-O0 -g -DDEBUG -D_DEBUG" CACHE STRING "c++ Debug flags" )
+set( CMAKE_C_FLAGS_DEBUG       "-O0 -g -DDEBUG -D_DEBUG" CACHE STRING "c Debug flags" )
+set( CMAKE_SHARED_LINKER_FLAGS ""                        CACHE STRING "shared linker flags" )
+set( CMAKE_MODULE_LINKER_FLAGS ""                        CACHE STRING "module linker flags" )
+set( CMAKE_EXE_LINKER_FLAGS    "-Wl,-z,nocopyreloc"      CACHE STRING "executable linker flags" )
+
+# put flags to cache (for debug purpose only)
+# INTERNAL entries are always overwritten, so these reflect the current run.
+set( ANDROID_CXX_FLAGS         "${ANDROID_CXX_FLAGS}"         CACHE INTERNAL "Android specific c/c++ flags" )
+set( ANDROID_CXX_FLAGS_RELEASE "${ANDROID_CXX_FLAGS_RELEASE}" CACHE INTERNAL "Android specific c/c++ Release flags" )
+set( ANDROID_CXX_FLAGS_DEBUG   "${ANDROID_CXX_FLAGS_DEBUG}"   CACHE INTERNAL "Android specific c/c++ Debug flags" )
+set( ANDROID_LINKER_FLAGS      "${ANDROID_LINKER_FLAGS}"      CACHE INTERNAL "Android specific c/c++ linker flags" )
+
+# finish flags
+# Non-cache assignments: the Android flags are put in front of the user flags.
+# The same ANDROID_CXX_FLAGS* values are used for both C and C++.
+set( CMAKE_CXX_FLAGS           "${ANDROID_CXX_FLAGS} ${CMAKE_CXX_FLAGS}" )
+set( CMAKE_C_FLAGS             "${ANDROID_CXX_FLAGS} ${CMAKE_C_FLAGS}" )
+set( CMAKE_CXX_FLAGS_RELEASE   "${ANDROID_CXX_FLAGS_RELEASE} ${CMAKE_CXX_FLAGS_RELEASE}" )
+set( CMAKE_C_FLAGS_RELEASE     "${ANDROID_CXX_FLAGS_RELEASE} ${CMAKE_C_FLAGS_RELEASE}" )
+set( CMAKE_CXX_FLAGS_DEBUG     "${ANDROID_CXX_FLAGS_DEBUG} ${CMAKE_CXX_FLAGS_DEBUG}" )
+set( CMAKE_C_FLAGS_DEBUG       "${ANDROID_CXX_FLAGS_DEBUG} ${CMAKE_C_FLAGS_DEBUG}" )
+set( CMAKE_SHARED_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}" )
+set( CMAKE_MODULE_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FLAGS}" )
+set( CMAKE_EXE_LINKER_FLAGS    "${ANDROID_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}" )
+
+# NDK r8 on mips only: pass an explicit linker script from the toolchain directory
+# (mipself.xsc for shared libraries and modules, mipself.x for executables).
+if( MIPS AND BUILD_WITH_ANDROID_NDK AND ANDROID_NDK_RELEASE STREQUAL "r8" )
+ set( CMAKE_SHARED_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_SHARED_LINKER_FLAGS}" )
+ set( CMAKE_MODULE_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_MODULE_LINKER_FLAGS}" )
+ set( CMAKE_EXE_LINKER_FLAGS    "-Wl,-T,${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.x ${CMAKE_EXE_LINKER_FLAGS}" )
+endif()
+
+# pie/pic
+# Plain -fpic is used when any of these holds: the native API level is below 16,
+# ANDROID_APP_PIE is defined and switched off, or CMake is not newer than 2.8.8.
+# Otherwise position independent code is enabled and executables are linked as PIE.
+if( (ANDROID_NATIVE_API_LEVEL LESS 16) OR (DEFINED ANDROID_APP_PIE AND NOT ANDROID_APP_PIE) OR (NOT CMAKE_VERSION VERSION_GREATER 2.8.8) )
+  set( CMAKE_POSITION_INDEPENDENT_CODE FALSE )
+  set( CMAKE_C_FLAGS   "-fpic ${CMAKE_C_FLAGS}" )
+  set( CMAKE_CXX_FLAGS "-fpic ${CMAKE_CXX_FLAGS}" )
+else()
+  set( CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fPIE -pie")
+  set( CMAKE_POSITION_INDEPENDENT_CODE TRUE )
+endif()
+
+# configure rtti
+# Only acts when ANDROID_STL_FORCE_FEATURES is on and ANDROID_RTTI is defined;
+# the option is put in front of the existing C++ flags.
+if( ANDROID_STL_FORCE_FEATURES AND DEFINED ANDROID_RTTI )
+  if( NOT ANDROID_RTTI )
+    set( CMAKE_CXX_FLAGS "-fno-rtti ${CMAKE_CXX_FLAGS}" )
+  else()
+    set( CMAKE_CXX_FLAGS "-frtti ${CMAKE_CXX_FLAGS}" )
+  endif()
+endif()
+
+# configure exceptions
+# Same gating as above, driven by ANDROID_EXCEPTIONS; applied to C and C++ flags.
+if( ANDROID_STL_FORCE_FEATURES AND DEFINED ANDROID_EXCEPTIONS )
+  if( NOT ANDROID_EXCEPTIONS )
+    set( CMAKE_C_FLAGS   "-fno-exceptions ${CMAKE_C_FLAGS}" )
+    set( CMAKE_CXX_FLAGS "-fno-exceptions ${CMAKE_CXX_FLAGS}" )
+  else()
+    set( CMAKE_C_FLAGS   "-fexceptions ${CMAKE_C_FLAGS}" )
+    set( CMAKE_CXX_FLAGS "-fexceptions ${CMAKE_CXX_FLAGS}" )
+  endif()
+endif()
+
+# global includes and link directories
+# The link directory is <install prefix>/libs/<abi>, made absolute first to
+# avoid CMP0015 policy warning.
+get_filename_component( __android_install_path "${CMAKE_INSTALL_PREFIX}/libs/${ANDROID_NDK_ABI_NAME}" ABSOLUTE )
+link_directories( "${__android_install_path}" )
+# sysroot headers and the STL headers are registered as system include directories
+include_directories( SYSTEM
+                     "${ANDROID_SYSROOT}/usr/include"
+                     ${ANDROID_STL_INCLUDE_DIRS} )
+
+# detect if need link crtbegin_so.o explicitly
+# Unless ANDROID_EXPLICIT_CRT_LINK is already defined, a trial link is run at
+# configure time: the C++ shared library rule is expanded by hand with
+# crtbegin_so.o as the only object, and the exit status decides the setting.
+if( NOT DEFINED ANDROID_EXPLICIT_CRT_LINK )
+ # expand every <PLACEHOLDER> of the rule template into a concrete value
+ set( __cmd "${CMAKE_CXX_CREATE_SHARED_LIBRARY}" )
+ string( REPLACE "<CMAKE_CXX_COMPILER>" "${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1}" __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_C_COMPILER>"   "${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_ARG1}"   __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_SHARED_LIBRARY_CXX_FLAGS>" "${CMAKE_CXX_FLAGS}" __cmd "${__cmd}" )
+ string( REPLACE "<LANGUAGE_COMPILE_FLAGS>" "" __cmd "${__cmd}" )
+ string( REPLACE "<LINK_FLAGS>" "${CMAKE_SHARED_LINKER_FLAGS}" __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS>" "-shared" __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG>" "" __cmd "${__cmd}" )
+ string( REPLACE "<TARGET_SONAME>" "" __cmd "${__cmd}" )
+ string( REPLACE "<TARGET>" "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/toolchain_crtlink_test.so" __cmd "${__cmd}" )
+ string( REPLACE "<OBJECTS>" "\"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" __cmd "${__cmd}" )
+ string( REPLACE "<LINK_LIBRARIES>" "" __cmd "${__cmd}" )
+ # turn the command line into a list of arguments
+ separate_arguments( __cmd )
+ # separate_arguments also splits paths that contain spaces; splitting each known
+ # root path the same way and substituting the original back re-joins them
+ foreach( __var ANDROID_NDK ANDROID_NDK_TOOLCHAINS_PATH ANDROID_STANDALONE_TOOLCHAIN )
+  if( ${__var} )
+   set( __tmp "${${__var}}" )
+   separate_arguments( __tmp )
+   string( REPLACE "${__tmp}" "${${__var}}" __cmd "${__cmd}")
+  endif()
+ endforeach()
+ # quote characters are removed: the arguments are passed to the process directly
+ string( REPLACE "'" "" __cmd "${__cmd}" )
+ string( REPLACE "\"" "" __cmd "${__cmd}" )
+ execute_process( COMMAND ${__cmd} RESULT_VARIABLE __cmd_result OUTPUT_QUIET ERROR_QUIET )
+ # a zero exit status of the trial link switches explicit crt linking on
+ if( __cmd_result EQUAL 0 )
+  set( ANDROID_EXPLICIT_CRT_LINK ON )
+ else()
+  set( ANDROID_EXPLICIT_CRT_LINK OFF )
+ endif()
+endif()
+
+# when enabled, crtbegin_so.o is appended to the shared library and module rules
+if( ANDROID_EXPLICIT_CRT_LINK )
+ set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
+ set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
+endif()
+
+# setup output directories
+# default install location (cache entry, a value already cached is kept)
+set( CMAKE_INSTALL_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/user" CACHE STRING "path for installing" )
+
+# Android-style output layout: used when LIBRARY_OUTPUT_PATH_ROOT is defined, or
+# when an AndroidManifest.xml sits in the source directory, or in its parent next
+# to a jni/ directory.
+if( DEFINED LIBRARY_OUTPUT_PATH_ROOT
+    OR EXISTS "${CMAKE_SOURCE_DIR}/AndroidManifest.xml"
+    OR (EXISTS "${CMAKE_SOURCE_DIR}/../AndroidManifest.xml" AND EXISTS "${CMAKE_SOURCE_DIR}/../jni/") )
+  set( LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_SOURCE_DIR} CACHE PATH "Root for binaries output, set this to change where Android libs are installed to" )
+  if( NOT _CMAKE_IN_TRY_COMPILE )
+    # libraries: <root>/libs/<abi>
+    set( LIBRARY_OUTPUT_PATH "${LIBRARY_OUTPUT_PATH_ROOT}/libs/${ANDROID_NDK_ABI_NAME}" CACHE PATH "Output directory for Android libs" )
+    # executables: <root>/bin, with an extra <abi> level when jni/CMakeLists.txt exists
+    if( NOT EXISTS "${CMAKE_SOURCE_DIR}/jni/CMakeLists.txt" )
+      set( EXECUTABLE_OUTPUT_PATH "${LIBRARY_OUTPUT_PATH_ROOT}/bin" CACHE PATH "Output directory for applications" )
+    else()
+      set( EXECUTABLE_OUTPUT_PATH "${LIBRARY_OUTPUT_PATH_ROOT}/bin/${ANDROID_NDK_ABI_NAME}" CACHE PATH "Output directory for applications" )
+    endif()
+  endif()
+endif()
+
+# copy shared stl library to build directory
+# Done at configure time, outside of try_compile, when __libstl is a ".so" file
+# and LIBRARY_OUTPUT_PATH is defined. A failed copy is reported with SEND_ERROR.
+if( DEFINED LIBRARY_OUTPUT_PATH AND NOT _CMAKE_IN_TRY_COMPILE AND __libstl MATCHES "[.]so$" )
+  get_filename_component( __libstlname "${__libstl}" NAME )
+  execute_process( COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${__libstl}" "${LIBRARY_OUTPUT_PATH}/${__libstlname}"
+                   RESULT_VARIABLE __fileCopyProcess )
+  if( __fileCopyProcess EQUAL 0 AND EXISTS "${LIBRARY_OUTPUT_PATH}/${__libstlname}" )
+    # copy succeeded and the file is in place
+  else()
+    message( SEND_ERROR "Failed copying of ${__libstl} to the ${LIBRARY_OUTPUT_PATH}/${__libstlname}" )
+  endif()
+  unset( __libstlname )
+  unset( __fileCopyProcess )
+endif()
+
+
+# set these global flags for cmake client scripts to change behavior
+set( BUILD_ANDROID True )
+set( ANDROID True )
+
+# where is the target environment
+set( CMAKE_FIND_ROOT_PATH
+     "${ANDROID_TOOLCHAIN_ROOT}/bin"
+     "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}"
+     "${ANDROID_SYSROOT}"
+     "${CMAKE_INSTALL_PREFIX}"
+     "${CMAKE_INSTALL_PREFIX}/share" )
+
+# only search for libraries and includes in the ndk toolchain
+# (programs are restricted to the same root paths as well)
+set( CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY )
+set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY )
+set( CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY )
+
+
+# macro to find packages on the host OS
+# Usage: find_host_package( <arguments of find_package> )
+# For the duration of the call the root-path restriction is lifted and WIN32 /
+# APPLE / UNIX are set to describe the host; afterwards they go back to the
+# values used for the target (UNIX only, root paths ONLY).
+macro( find_host_package )
+  # host platform identity
+  if( CMAKE_HOST_WIN32 )
+    set( WIN32 1 )
+    unset( UNIX )
+  elseif( CMAKE_HOST_APPLE )
+    set( APPLE 1 )
+    unset( UNIX )
+  endif()
+  # search outside of CMAKE_FIND_ROOT_PATH
+  set( CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER )
+  set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER )
+  set( CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER )
+
+  find_package( ${ARGN} )
+
+  # back to the target settings
+  set( CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY )
+  set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY )
+  set( CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY )
+  unset( WIN32 )
+  unset( APPLE )
+  set( UNIX 1 )
+endmacro()
+
+
+# macro to find programs on the host OS
+# Usage: find_host_program( <arguments of find_program> )
+# For the duration of the call the root-path restriction is lifted and WIN32 /
+# APPLE / UNIX are set to describe the host; afterwards they go back to the
+# values used for the target (UNIX only, root paths ONLY).
+macro( find_host_program )
+  # host platform identity
+  if( CMAKE_HOST_WIN32 )
+    set( WIN32 1 )
+    unset( UNIX )
+  elseif( CMAKE_HOST_APPLE )
+    set( APPLE 1 )
+    unset( UNIX )
+  endif()
+  # search outside of CMAKE_FIND_ROOT_PATH
+  set( CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER )
+  set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER )
+  set( CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER )
+
+  find_program( ${ARGN} )
+
+  # back to the target settings
+  set( CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY )
+  set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY )
+  set( CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY )
+  unset( WIN32 )
+  unset( APPLE )
+  set( UNIX 1 )
+endmacro()
+
+
+# export toolchain settings for the try_compile() command
+# Outside of try_compile, every listed variable that is defined is written to
+# android.toolchain.config.cmake as a `set( NAME value CACHE INTERNAL "" )` line.
+# A value containing a space is wrapped in double quotes.
+if( NOT _CMAKE_IN_TRY_COMPILE )
+  set( __toolchain_config "")
+  foreach( __var
+           NDK_CCACHE  LIBRARY_OUTPUT_PATH_ROOT  ANDROID_FORBID_SYGWIN
+           ANDROID_NDK_HOST_X64  ANDROID_NDK  ANDROID_NDK_LAYOUT
+           ANDROID_STANDALONE_TOOLCHAIN  ANDROID_TOOLCHAIN_NAME
+           ANDROID_ABI  ANDROID_NATIVE_API_LEVEL
+           ANDROID_STL  ANDROID_STL_FORCE_FEATURES
+           ANDROID_FORCE_ARM_BUILD
+           ANDROID_NO_UNDEFINED  ANDROID_SO_UNDEFINED
+           ANDROID_FUNCTION_LEVEL_LINKING  ANDROID_GOLD_LINKER
+           ANDROID_NOEXECSTACK  ANDROID_RELRO
+           ANDROID_LIBM_PATH  ANDROID_EXPLICIT_CRT_LINK  ANDROID_APP_PIE )
+    if( DEFINED ${__var} )
+      # pick the textual form of the value first, then emit one line
+      if( ${__var} MATCHES " " )
+        set( __toolchain_value "\"${${__var}}\"" )
+      else()
+        set( __toolchain_value "${${__var}}" )
+      endif()
+      set( __toolchain_config "${__toolchain_config}set( ${__var} ${__toolchain_value} CACHE INTERNAL \"\" )\n" )
+    endif()
+  endforeach()
+  unset( __toolchain_value )
+  file( WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/android.toolchain.config.cmake" "${__toolchain_config}" )
+  unset( __toolchain_config )
+endif()
+
+
+# force cmake to produce / instead of \ in build commands for Ninja generator
+# Applies to Ninja generators on Windows hosts only. The statement order matters:
+# the MinGW marker and CMAKE_CROSSCOMPILING are set before the languages are enabled.
+if( CMAKE_GENERATOR MATCHES "Ninja" AND CMAKE_HOST_WIN32 )
+ # it is a bad hack after all
+ # CMake generates Ninja makefiles with UNIX paths only if it thinks that we are going to build with MinGW
+ set( CMAKE_COMPILER_IS_MINGW TRUE ) # tell CMake that we are MinGW
+ set( CMAKE_CROSSCOMPILING TRUE )    # stop recursion
+ enable_language( C )
+ enable_language( CXX )
+ # CMAKE_COMPILER_IS_MINGW deliberately stays set (see the disabled line below); only MINGW is cleared
+ # unset( CMAKE_COMPILER_IS_MINGW ) # can't unset because CMake does not convert back-slashes in response files without it
+ unset( MINGW )
+endif()
+
+
+# Variables controlling behavior or set by cmake toolchain:
+#   ANDROID_ABI : "armeabi-v7a" (default), "armeabi", "armeabi-v7a with NEON", "armeabi-v7a-hard with NEON", "armeabi-v7a with VFPV3", "armeabi-v6 with VFP", "x86", "mips", "arm64-v8a", "x86_64", "mips64"
+#   ANDROID_NATIVE_API_LEVEL : 3,4,5,8,9,14,15,16,17,18,19,21 (depends on NDK version)
+#   ANDROID_STL : gnustl_static/gnustl_shared/stlport_static/stlport_shared/gabi++_static/gabi++_shared/system_re/system/none
+#   ANDROID_FORBID_SYGWIN : ON/OFF
+#   ANDROID_NO_UNDEFINED : ON/OFF
+#   ANDROID_SO_UNDEFINED : OFF/ON  (default depends on NDK version)
+#   ANDROID_FUNCTION_LEVEL_LINKING : ON/OFF
+#   ANDROID_GOLD_LINKER : ON/OFF
+#   ANDROID_NOEXECSTACK : ON/OFF
+#   ANDROID_RELRO : ON/OFF
+#   ANDROID_FORCE_ARM_BUILD : ON/OFF
+#   ANDROID_STL_FORCE_FEATURES : ON/OFF
+#   ANDROID_LIBM_PATH : path to libm.so (set to something like $(TOP)/out/target/product/<product_name>/obj/lib/libm.so) to workaround unresolved `sincos`
+# Can be set only at the first run:
+#   ANDROID_NDK : path to your NDK install
+#   NDK_CCACHE : path to your ccache executable
+#   ANDROID_TOOLCHAIN_NAME : the NDK name of compiler toolchain
+#   ANDROID_NDK_HOST_X64 : try to use x86_64 toolchain (default for x64 host systems)
+#   ANDROID_NDK_LAYOUT : the inner NDK structure (RELEASE, LINARO, ANDROID)
+#   LIBRARY_OUTPUT_PATH_ROOT : <any valid path>
+#   ANDROID_STANDALONE_TOOLCHAIN
+#
+# Primary read-only variables:
+#   ANDROID : always TRUE
+#   ARMEABI : TRUE for arm v6 and older devices
+#   ARMEABI_V6 : TRUE for arm v6
+#   ARMEABI_V7A : TRUE for arm v7a
+#   ARMEABI_V7A_HARD : TRUE for arm v7a with hardfp
+#   ARM64_V8A : TRUE for arm64-v8a
+#   NEON : TRUE if NEON unit is enabled
+#   VFPV3 : TRUE if VFP version 3 is enabled
+#   X86 : TRUE if configured for x86
+#   X86_64 : TRUE if configured for x86_64
+#   MIPS : TRUE if configured for mips
+#   MIPS64 : TRUE if configured for mips64
+#   BUILD_WITH_ANDROID_NDK : TRUE if NDK is used
+#   BUILD_WITH_STANDALONE_TOOLCHAIN : TRUE if standalone toolchain is used
+#   ANDROID_NDK_HOST_SYSTEM_NAME : "windows", "linux-x86" or "darwin-x86" depending on host platform
+#   ANDROID_NDK_ABI_NAME : "armeabi", "armeabi-v7a", "armeabi-v7a-hard", "x86", "mips", "arm64-v8a", "x86_64", "mips64" depending on ANDROID_ABI
+#   ANDROID_NDK_RELEASE : from r5 to r10d; set only for NDK
+#   ANDROID_NDK_RELEASE_NUM : numeric ANDROID_NDK_RELEASE version (1000*major+minor)
+#   ANDROID_ARCH_NAME : "arm", "x86", "mips", "arm64", "x86_64", "mips64" depending on ANDROID_ABI
+#   ANDROID_SYSROOT : path to the compiler sysroot
+#   TOOL_OS_SUFFIX : "" or ".exe" depending on host platform
+#   ANDROID_COMPILER_IS_CLANG : TRUE if clang compiler is used
+#
+# Secondary (less stable) read-only variables:
+#   ANDROID_COMPILER_VERSION : GCC version used (not Clang version)
+#   ANDROID_CLANG_VERSION : version of clang compiler if clang is used
+#   ANDROID_CXX_FLAGS : C/C++ compiler flags required by Android platform
+#   ANDROID_SUPPORTED_ABIS : list of currently allowed values for ANDROID_ABI
+#   ANDROID_TOOLCHAIN_MACHINE_NAME : "arm-linux-androideabi", "arm-eabi" or "i686-android-linux"
+#   ANDROID_TOOLCHAIN_ROOT : path to the top level of toolchain (standalone or placed inside NDK)
+#   ANDROID_CLANG_TOOLCHAIN_ROOT : path to clang tools
+#   ANDROID_SUPPORTED_NATIVE_API_LEVELS : list of native API levels found inside NDK
+#   ANDROID_STL_INCLUDE_DIRS : stl include paths
+#   ANDROID_RTTI : if rtti is enabled by the runtime
+#   ANDROID_EXCEPTIONS : if exceptions are enabled by the runtime
+#   ANDROID_GCC_TOOLCHAIN_NAME : read-only, differs from ANDROID_TOOLCHAIN_NAME only if clang is used
+#
+# Defaults:
+#   ANDROID_DEFAULT_NDK_API_LEVEL
+#   ANDROID_DEFAULT_NDK_API_LEVEL_${ARCH}
+#   ANDROID_NDK_SEARCH_PATHS
+#   ANDROID_SUPPORTED_ABIS_${ARCH}
+#   ANDROID_SUPPORTED_NDK_VERSIONS
diff --git a/build.sh b/build.sh
new file mode 100644
index 00000000000..ac4bfa7a81b
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+# Configure, build and install the library for each mobile target in its own build directory.
+##### android armv7
+mkdir -p build-android-armv7
+pushd build-android-armv7 || exit 1
+cmake -DCMAKE_TOOLCHAIN_FILE=../android.toolchain.cmake -DANDROID_ABI="armeabi-v7a with NEON" -DANDROID_NATIVE_API_LEVEL=android-9 -DANDROID_FORCE_ARM_BUILD=OFF -DANDROID_STL_FORCE_FEATURES=OFF ..
+make
+make install
+popd
+
+##### android aarch64
+mkdir -p build-android-aarch64
+pushd build-android-aarch64 || exit 1
+cmake -DCMAKE_TOOLCHAIN_FILE=../android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_NATIVE_API_LEVEL=android-21 -DANDROID_FORCE_ARM_BUILD=OFF -DANDROID_STL_FORCE_FEATURES=OFF ..
+make
+make install
+popd
+
+##### ios armv7 arm64
+mkdir -p build-ios
+pushd build-ios || exit 1
+cmake -DCMAKE_TOOLCHAIN_FILE=../iosxc.toolchain.cmake ..
+make
+make install
+popd
+
+##### ios simulator i386 x86_64
+mkdir -p build-ios-sim
+pushd build-ios-sim || exit 1
+cmake -DCMAKE_TOOLCHAIN_FILE=../iossimxc.toolchain.cmake ..
+make
+make install
+popd
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
new file mode 100644
index 00000000000..2373832bd09
--- /dev/null
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,9 @@
+
+# squeezenet example: requires the OpenCV modules listed here and links against ncnn.
+find_package(OpenCV REQUIRED core highgui imgproc)
+
+# Headers are searched in ../src of the source tree first, then ../src of the build tree.
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src ${CMAKE_CURRENT_BINARY_DIR}/../src)
+
+add_executable(squeezenet squeezenet.cpp)
+target_link_libraries(squeezenet ncnn ${OpenCV_LIBS})
diff --git a/examples/squeezencnn/AndroidManifest.xml b/examples/squeezencnn/AndroidManifest.xml
new file mode 100644
index 00000000000..5624e012f60
--- /dev/null
+++ b/examples/squeezencnn/AndroidManifest.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+      package="com.tencent.squeezencnn"
+      android:versionCode="1"
+      android:versionName="1.1">
+    <application android:label="@string/app_name" >
+        <activity android:name="MainActivity"
+                  android:label="@string/app_name">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+    </application>
+</manifest> 
diff --git a/examples/squeezencnn/ant.properties b/examples/squeezencnn/ant.properties
new file mode 100644
index 00000000000..9281e74f1c2
--- /dev/null
+++ b/examples/squeezencnn/ant.properties
@@ -0,0 +1,21 @@
+# This file is used to override default values used by the Ant build system.
+#
+# This file must be checked into Version Control Systems, as it is
+# integral to the build system of your project.
+
+# This file is only used by the Ant script.
+
+# You can use this to override default values such as
+#  'source.dir' for the location of your java source folder and
+#  'out.dir' for the location of your output folder.
+
+# You can also use it to define how the release builds are signed by declaring
+# the following properties:
+#  'key.store' for the location of your keystore and
+#  'key.alias' for the name of the key to use.
+# The password will be asked during the build when you use the 'release' target.
+
+# Signing credentials must not be committed: set key.store / key.alias in your local copy
+# and let Ant ask for the passwords when the 'release' target runs, for example:
+#key.store=/path/to/your-release-key.keystore
+#key.alias=your-key-alias
diff --git a/examples/squeezencnn/assets/squeezenet_v1.1.bin b/examples/squeezencnn/assets/squeezenet_v1.1.bin
new file mode 120000
index 00000000000..655c56c35be
--- /dev/null
+++ b/examples/squeezencnn/assets/squeezenet_v1.1.bin
@@ -0,0 +1 @@
+../../squeezenet_v1.1.bin
\ No newline at end of file
diff --git a/examples/squeezencnn/assets/squeezenet_v1.1.param.bin b/examples/squeezencnn/assets/squeezenet_v1.1.param.bin
new file mode 100644
index 00000000000..c419dc9e1bd
Binary files /dev/null and b/examples/squeezencnn/assets/squeezenet_v1.1.param.bin differ
diff --git a/examples/squeezencnn/assets/synset_words.txt b/examples/squeezencnn/assets/synset_words.txt
new file mode 120000
index 00000000000..f84db6c2fce
--- /dev/null
+++ b/examples/squeezencnn/assets/synset_words.txt
@@ -0,0 +1 @@
+../../synset_words.txt
\ No newline at end of file
diff --git a/examples/squeezencnn/build.xml b/examples/squeezencnn/build.xml
new file mode 100644
index 00000000000..47b725e7431
--- /dev/null
+++ b/examples/squeezencnn/build.xml
@@ -0,0 +1,92 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project name="squeezencnn" default="help">
+
+    <!-- The local.properties file is created and updated by the 'android' tool.
+         It contains the path to the SDK. It should *NOT* be checked into
+         Version Control Systems. -->
+    <property file="local.properties" />
+
+    <!-- The ant.properties file can be created by you. It is only edited by the
+         'android' tool to add properties to it.
+         This is the place to change some Ant specific build properties.
+         Here are some properties you may want to change/update:
+
+         source.dir
+             The name of the source directory. Default is 'src'.
+         out.dir
+             The name of the output directory. Default is 'bin'.
+
+         For other overridable properties, look at the beginning of the rules
+         files in the SDK, at tools/ant/build.xml
+
+         Properties related to the SDK location or the project target should
+         be updated using the 'android' tool with the 'update' action.
+
+         This file is an integral part of the build system for your
+         application and should be checked into Version Control Systems.
+
+         -->
+    <property file="ant.properties" />
+
+    <!-- if sdk.dir was not set from one of the property file, then
+         get it from the ANDROID_HOME env var.
+         This must be done before we load project.properties since
+         the proguard config can use sdk.dir -->
+    <property environment="env" />
+    <condition property="sdk.dir" value="${env.ANDROID_HOME}">
+        <isset property="env.ANDROID_HOME" />
+    </condition>
+
+    <!-- The project.properties file is created and updated by the 'android'
+         tool, as well as ADT.
+
+         This contains project specific properties such as project target, and library
+         dependencies. Lower level build properties are stored in ant.properties
+         (or in .classpath for Eclipse projects).
+
+         This file is an integral part of the build system for your
+         application and should be checked into Version Control Systems. -->
+    <loadproperties srcFile="project.properties" />
+
+    <!-- quick check on sdk.dir -->
+    <fail
+            message="sdk.dir is missing. Make sure to generate local.properties using 'android update project' or to inject it through the ANDROID_HOME environment variable."
+            unless="sdk.dir"
+    />
+
+    <!--
+        Import per project custom build rules if present at the root of the project.
+        This is the place to put custom intermediary targets such as:
+            -pre-build
+            -pre-compile
+            -post-compile (This is typically used for code obfuscation.
+                           Compiled code location: ${out.classes.absolute.dir}
+                           If this is not done in place, override ${out.dex.input.absolute.dir})
+            -post-package
+            -post-build
+            -pre-clean
+    -->
+    <import file="custom_rules.xml" optional="true" />
+
+    <!-- Import the actual build file.
+
+         To customize existing targets, there are two options:
+         - Customize only one target:
+             - copy/paste the target into this file, *before* the
+               <import> task.
+             - customize it to your needs.
+         - Customize the whole content of build.xml
+             - copy/paste the content of the rules files (minus the top node)
+               into this file, replacing the <import> task.
+             - customize to your needs.
+
+         ***********************
+         ****** IMPORTANT ******
+         ***********************
+         In all cases you must update the value of version-tag below to read 'custom' instead of an integer,
+         in order to avoid having your file be overridden by tools such as "android update project"
+    -->
+    <!-- version-tag: 1 -->
+    <import file="${sdk.dir}/tools/ant/build.xml" />
+
+</project>
diff --git a/examples/squeezencnn/jni/Android.mk b/examples/squeezencnn/jni/Android.mk
new file mode 100644
index 00000000000..11f00a3b9a1
--- /dev/null
+++ b/examples/squeezencnn/jni/Android.mk
@@ -0,0 +1,30 @@
+LOCAL_PATH := $(call my-dir)
+
+# ncnn install tree; defaults to build.sh's armv7 output, override with NCNN_INSTALL_PATH=<dir>
+NCNN_INSTALL_PATH ?= $(abspath $(LOCAL_PATH)/../../../build-android-armv7/install)
+
+include $(CLEAR_VARS)
+LOCAL_MODULE := ncnn
+LOCAL_SRC_FILES := $(NCNN_INSTALL_PATH)/lib/libncnn.a
+include $(PREBUILT_STATIC_LIBRARY)
+
+include $(CLEAR_VARS)
+# JNI wrapper: a shared library that statically links the prebuilt ncnn module declared above
+LOCAL_MODULE := squeezencnn
+LOCAL_SRC_FILES := squeezencnn_jni.cpp
+
+LOCAL_C_INCLUDES := $(NCNN_INSTALL_PATH)/include
+
+LOCAL_STATIC_LIBRARIES := ncnn
+
+LOCAL_CFLAGS := -O2 -fvisibility=hidden -fomit-frame-pointer -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math
+LOCAL_CPPFLAGS := -O2 -fvisibility=hidden -fvisibility-inlines-hidden -fomit-frame-pointer -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math
+LOCAL_LDFLAGS += -Wl,--gc-sections
+
+LOCAL_CFLAGS += -fopenmp
+LOCAL_CPPFLAGS += -fopenmp
+LOCAL_LDFLAGS += -fopenmp
+
+LOCAL_LDLIBS := -lz -llog -ljnigraphics
+
+include $(BUILD_SHARED_LIBRARY)
diff --git a/examples/squeezencnn/jni/Application.mk b/examples/squeezencnn/jni/Application.mk
new file mode 100644
index 00000000000..a98c0484adc
--- /dev/null
+++ b/examples/squeezencnn/jni/Application.mk
@@ -0,0 +1,7 @@
+
+# APP_STL := stlport_static
+APP_STL := gnustl_static
+# APP_ABI := armeabi armeabi-v7a
+APP_ABI := armeabi-v7a
+APP_PLATFORM := android-9
+NDK_TOOLCHAIN_VERSION := 4.9
diff --git a/examples/squeezencnn/jni/squeezencnn_jni.cpp b/examples/squeezencnn/jni/squeezencnn_jni.cpp
new file mode 100644
index 00000000000..036f1435845
--- /dev/null
+++ b/examples/squeezencnn/jni/squeezencnn_jni.cpp
@@ -0,0 +1,181 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include <android/bitmap.h>
+#include <android/log.h>
+
+#include <jni.h>
+
+#include <string>
+#include <vector>
+
+// ncnn
+#include "net.h"
+
+#include "squeezenet_v1.1.id.h"
+
+#include <sys/time.h>
+#include <unistd.h>
+
+static struct timeval tv_begin;
+static struct timeval tv_end;
+static double elapsed; // duration of the last measured region, in milliseconds
+
+// Record the start time of the region being measured.
+static void bench_start()
+{
+    gettimeofday(&tv_begin, NULL);
+}
+// Log the milliseconds elapsed since bench_start() to the Android log, followed by comment.
+static void bench_end(const char* comment)
+{
+    gettimeofday(&tv_end, NULL);
+    // double literals keep the whole computation in double precision
+    elapsed = (tv_end.tv_sec - tv_begin.tv_sec) * 1000.0 + (tv_end.tv_usec - tv_begin.tv_usec) / 1000.0;
+    __android_log_print(ANDROID_LOG_DEBUG, "SqueezeNcnn", "%.2fms   %s", elapsed, comment);
+}
+
+static std::vector<unsigned char> squeezenet_param;
+static std::vector<unsigned char> squeezenet_bin;
+static std::vector<std::string> squeezenet_words;
+static ncnn::Net squeezenet;
+
+// Split str on every occurrence of delimiter; the result always holds at least one
+// element, and an empty delimiter yields str itself as the only element.
+static std::vector<std::string> split_string(const std::string& str, const std::string& delimiter)
+{
+    std::vector<std::string> strings;
+    if (delimiter.empty())
+        return std::vector<std::string>(1, str);
+    std::string::size_type prev = 0;
+    std::string::size_type pos;
+    while ((pos = str.find(delimiter, prev)) != std::string::npos)
+    {
+        strings.push_back(str.substr(prev, pos - prev));
+        prev = pos + delimiter.size(); // skip the whole delimiter, not just one character
+    }
+    strings.push_back(str.substr(prev)); // tail after the last delimiter (or all of str)
+    return strings;
+}
+
+extern "C" {
+
+// public native boolean Init(byte[] param, byte[] bin, byte[] words);
+// Copies param and bin into the file-scope buffers and passes them to squeezenet.load_param() /
+// load_model(), then splits words into squeezenet_words line by line. Always returns JNI_TRUE.
+JNIEXPORT jboolean JNICALL Java_com_tencent_squeezencnn_SqueezeNcnn_Init(JNIEnv* env, jobject thiz, jbyteArray param, jbyteArray bin, jbyteArray words)
+{
+    // init param
+    {
+        int len = env->GetArrayLength(param);
+        squeezenet_param.resize(len);
+        env->GetByteArrayRegion(param, 0, len, (jbyte*)squeezenet_param.data());
+        int ret = squeezenet.load_param(squeezenet_param.data());
+        __android_log_print(ANDROID_LOG_DEBUG, "SqueezeNcnn", "load_param %d %d", ret, len);
+    }
+    // init bin
+    {
+        int len = env->GetArrayLength(bin);
+        squeezenet_bin.resize(len);
+        env->GetByteArrayRegion(bin, 0, len, (jbyte*)squeezenet_bin.data());
+        int ret = squeezenet.load_model(squeezenet_bin.data());
+        __android_log_print(ANDROID_LOG_DEBUG, "SqueezeNcnn", "load_model %d %d", ret, len);
+    }
+    // init words
+    {
+        int len = env->GetArrayLength(words);
+        std::string words_buffer(len, '\0');
+        if (len > 0) // fill via a non-const pointer; std::string::data() is const before C++17
+            env->GetByteArrayRegion(words, 0, len, (jbyte*)&words_buffer[0]);
+        squeezenet_words = split_string(words_buffer, "\n");
+    }
+
+    return JNI_TRUE;
+}
+
+// public native String Detect(Bitmap bitmap);
+// Classifies a 227x227 RGBA_8888 bitmap and returns "<label> = <score>" for the best-scoring
+// class, or NULL when the bitmap is unusable or no label is available for the result.
+JNIEXPORT jstring JNICALL Java_com_tencent_squeezencnn_SqueezeNcnn_Detect(JNIEnv* env, jobject thiz, jobject bitmap)
+{
+    bench_start();
+
+    // ncnn from bitmap
+    ncnn::Mat in;
+    {
+        AndroidBitmapInfo info;
+        if (AndroidBitmap_getInfo(env, bitmap, &info) < 0)
+            return NULL;
+        int width = info.width;
+        int height = info.height;
+        if (width != 227 || height != 227)
+            return NULL;
+        if (info.format != ANDROID_BITMAP_FORMAT_RGBA_8888)
+            return NULL;
+
+        void* indata;
+        if (AndroidBitmap_lockPixels(env, bitmap, &indata) < 0)
+            return NULL;
+        in = ncnn::Mat::from_pixels((const unsigned char*)indata, ncnn::Mat::PIXEL_RGBA2BGR, width, height);
+        AndroidBitmap_unlockPixels(env, bitmap);
+    }
+
+    // squeezenet
+    std::vector<float> cls_scores;
+    {
+        const float mean_vals[3] = {104.f, 117.f, 123.f};
+        in.substract_mean_normalize(mean_vals, 0);
+
+        ncnn::Extractor ex = squeezenet.create_extractor();
+        ex.set_light_mode(true);
+        ex.set_num_threads(4);
+        ex.input(squeezenet_v1_1_param_id::BLOB_data, in);
+
+        ncnn::Mat out;
+        ex.extract(squeezenet_v1_1_param_id::BLOB_prob, out);
+        cls_scores.resize(out.c);
+        for (int j=0; j<out.c; j++)
+        {
+            const float* prob = out.data + out.cstep * j;
+            cls_scores[j] = prob[0];
+        }
+    }
+
+    // return top class
+    int top_class = 0;
+    float max_score = 0.f;
+    for (size_t i=0; i<cls_scores.size(); i++)
+    {
+        float s = cls_scores[i];
+        if (s > max_score)
+        {
+            top_class = i;
+            max_score = s;
+        }
+    }
+
+    // the label list may be empty (Init not run) or shorter than the class count: never index past it
+    if (cls_scores.empty() || (size_t)top_class >= squeezenet_words.size())
+        return NULL;
+
+    const std::string& word = squeezenet_words[top_class];
+    char tmp[32];
+    snprintf(tmp, sizeof(tmp), "%.3f", max_score);
+    // +10 to skip leading n03179701; labels shorter than that are used unchanged
+    std::string result_str = (word.size() >= 10 ? std::string(word.c_str() + 10) : word) + " = " + tmp;
+    jstring result = env->NewStringUTF(result_str.c_str());
+    bench_end("detect");
+    return result;
+}
+
+}
diff --git a/examples/squeezencnn/jni/squeezenet_v1.1.id.h b/examples/squeezencnn/jni/squeezenet_v1.1.id.h
new file mode 100644
index 00000000000..94ae7f5c5b3
--- /dev/null
+++ b/examples/squeezencnn/jni/squeezenet_v1.1.id.h
@@ -0,0 +1,163 @@
+#ifndef NCNN_INCLUDE_GUARD_squeezenet_v1_1_id_h
+#define NCNN_INCLUDE_GUARD_squeezenet_v1_1_id_h
+namespace squeezenet_v1_1_param_id {
+const int LAYER_data = 0;
+const int BLOB_data = 0;
+const int LAYER_conv1 = 1;
+const int BLOB_conv1 = 1;
+const int LAYER_relu_conv1 = 2;
+const int BLOB_conv1_relu_conv1 = 2;
+const int LAYER_pool1 = 3;
+const int BLOB_pool1 = 3;
+const int LAYER_fire2_squeeze1x1 = 4;
+const int BLOB_fire2_squeeze1x1 = 4;
+const int LAYER_fire2_relu_squeeze1x1 = 5;
+const int BLOB_fire2_squeeze1x1_fire2_relu_squeeze1x1 = 5;
+const int LAYER_splitncnn_0 = 6;
+const int BLOB_fire2_squeeze1x1_fire2_relu_squeeze1x1_splitncnn_0 = 6;
+const int BLOB_fire2_squeeze1x1_fire2_relu_squeeze1x1_splitncnn_1 = 7;
+const int LAYER_fire2_expand1x1 = 7;
+const int BLOB_fire2_expand1x1 = 8;
+const int LAYER_fire2_relu_expand1x1 = 8;
+const int BLOB_fire2_expand1x1_fire2_relu_expand1x1 = 9;
+const int LAYER_fire2_expand3x3 = 9;
+const int BLOB_fire2_expand3x3 = 10;
+const int LAYER_fire2_relu_expand3x3 = 10;
+const int BLOB_fire2_expand3x3_fire2_relu_expand3x3 = 11;
+const int LAYER_fire2_concat = 11;
+const int BLOB_fire2_concat = 12;
+const int LAYER_fire3_squeeze1x1 = 12;
+const int BLOB_fire3_squeeze1x1 = 13;
+const int LAYER_fire3_relu_squeeze1x1 = 13;
+const int BLOB_fire3_squeeze1x1_fire3_relu_squeeze1x1 = 14;
+const int LAYER_splitncnn_1 = 14;
+const int BLOB_fire3_squeeze1x1_fire3_relu_squeeze1x1_splitncnn_0 = 15;
+const int BLOB_fire3_squeeze1x1_fire3_relu_squeeze1x1_splitncnn_1 = 16;
+const int LAYER_fire3_expand1x1 = 15;
+const int BLOB_fire3_expand1x1 = 17;
+const int LAYER_fire3_relu_expand1x1 = 16;
+const int BLOB_fire3_expand1x1_fire3_relu_expand1x1 = 18;
+const int LAYER_fire3_expand3x3 = 17;
+const int BLOB_fire3_expand3x3 = 19;
+const int LAYER_fire3_relu_expand3x3 = 18;
+const int BLOB_fire3_expand3x3_fire3_relu_expand3x3 = 20;
+const int LAYER_fire3_concat = 19;
+const int BLOB_fire3_concat = 21;
+const int LAYER_pool3 = 20;
+const int BLOB_pool3 = 22;
+const int LAYER_fire4_squeeze1x1 = 21;
+const int BLOB_fire4_squeeze1x1 = 23;
+const int LAYER_fire4_relu_squeeze1x1 = 22;
+const int BLOB_fire4_squeeze1x1_fire4_relu_squeeze1x1 = 24;
+const int LAYER_splitncnn_2 = 23;
+const int BLOB_fire4_squeeze1x1_fire4_relu_squeeze1x1_splitncnn_0 = 25;
+const int BLOB_fire4_squeeze1x1_fire4_relu_squeeze1x1_splitncnn_1 = 26;
+const int LAYER_fire4_expand1x1 = 24;
+const int BLOB_fire4_expand1x1 = 27;
+const int LAYER_fire4_relu_expand1x1 = 25;
+const int BLOB_fire4_expand1x1_fire4_relu_expand1x1 = 28;
+const int LAYER_fire4_expand3x3 = 26;
+const int BLOB_fire4_expand3x3 = 29;
+const int LAYER_fire4_relu_expand3x3 = 27;
+const int BLOB_fire4_expand3x3_fire4_relu_expand3x3 = 30;
+const int LAYER_fire4_concat = 28;
+const int BLOB_fire4_concat = 31;
+const int LAYER_fire5_squeeze1x1 = 29;
+const int BLOB_fire5_squeeze1x1 = 32;
+const int LAYER_fire5_relu_squeeze1x1 = 30;
+const int BLOB_fire5_squeeze1x1_fire5_relu_squeeze1x1 = 33;
+const int LAYER_splitncnn_3 = 31;
+const int BLOB_fire5_squeeze1x1_fire5_relu_squeeze1x1_splitncnn_0 = 34;
+const int BLOB_fire5_squeeze1x1_fire5_relu_squeeze1x1_splitncnn_1 = 35;
+const int LAYER_fire5_expand1x1 = 32;
+const int BLOB_fire5_expand1x1 = 36;
+const int LAYER_fire5_relu_expand1x1 = 33;
+const int BLOB_fire5_expand1x1_fire5_relu_expand1x1 = 37;
+const int LAYER_fire5_expand3x3 = 34;
+const int BLOB_fire5_expand3x3 = 38;
+const int LAYER_fire5_relu_expand3x3 = 35;
+const int BLOB_fire5_expand3x3_fire5_relu_expand3x3 = 39;
+const int LAYER_fire5_concat = 36;
+const int BLOB_fire5_concat = 40;
+const int LAYER_pool5 = 37;
+const int BLOB_pool5 = 41;
+const int LAYER_fire6_squeeze1x1 = 38;
+const int BLOB_fire6_squeeze1x1 = 42;
+const int LAYER_fire6_relu_squeeze1x1 = 39;
+const int BLOB_fire6_squeeze1x1_fire6_relu_squeeze1x1 = 43;
+const int LAYER_splitncnn_4 = 40;
+const int BLOB_fire6_squeeze1x1_fire6_relu_squeeze1x1_splitncnn_0 = 44;
+const int BLOB_fire6_squeeze1x1_fire6_relu_squeeze1x1_splitncnn_1 = 45;
+const int LAYER_fire6_expand1x1 = 41;
+const int BLOB_fire6_expand1x1 = 46;
+const int LAYER_fire6_relu_expand1x1 = 42;
+const int BLOB_fire6_expand1x1_fire6_relu_expand1x1 = 47;
+const int LAYER_fire6_expand3x3 = 43;
+const int BLOB_fire6_expand3x3 = 48;
+const int LAYER_fire6_relu_expand3x3 = 44;
+const int BLOB_fire6_expand3x3_fire6_relu_expand3x3 = 49;
+const int LAYER_fire6_concat = 45;
+const int BLOB_fire6_concat = 50;
+const int LAYER_fire7_squeeze1x1 = 46;
+const int BLOB_fire7_squeeze1x1 = 51;
+const int LAYER_fire7_relu_squeeze1x1 = 47;
+const int BLOB_fire7_squeeze1x1_fire7_relu_squeeze1x1 = 52;
+const int LAYER_splitncnn_5 = 48;
+const int BLOB_fire7_squeeze1x1_fire7_relu_squeeze1x1_splitncnn_0 = 53;
+const int BLOB_fire7_squeeze1x1_fire7_relu_squeeze1x1_splitncnn_1 = 54;
+const int LAYER_fire7_expand1x1 = 49;
+const int BLOB_fire7_expand1x1 = 55;
+const int LAYER_fire7_relu_expand1x1 = 50;
+const int BLOB_fire7_expand1x1_fire7_relu_expand1x1 = 56;
+const int LAYER_fire7_expand3x3 = 51;
+const int BLOB_fire7_expand3x3 = 57;
+const int LAYER_fire7_relu_expand3x3 = 52;
+const int BLOB_fire7_expand3x3_fire7_relu_expand3x3 = 58;
+const int LAYER_fire7_concat = 53;
+const int BLOB_fire7_concat = 59;
+const int LAYER_fire8_squeeze1x1 = 54;
+const int BLOB_fire8_squeeze1x1 = 60;
+const int LAYER_fire8_relu_squeeze1x1 = 55;
+const int BLOB_fire8_squeeze1x1_fire8_relu_squeeze1x1 = 61;
+const int LAYER_splitncnn_6 = 56;
+const int BLOB_fire8_squeeze1x1_fire8_relu_squeeze1x1_splitncnn_0 = 62;
+const int BLOB_fire8_squeeze1x1_fire8_relu_squeeze1x1_splitncnn_1 = 63;
+const int LAYER_fire8_expand1x1 = 57;
+const int BLOB_fire8_expand1x1 = 64;
+const int LAYER_fire8_relu_expand1x1 = 58;
+const int BLOB_fire8_expand1x1_fire8_relu_expand1x1 = 65;
+const int LAYER_fire8_expand3x3 = 59;
+const int BLOB_fire8_expand3x3 = 66;
+const int LAYER_fire8_relu_expand3x3 = 60;
+const int BLOB_fire8_expand3x3_fire8_relu_expand3x3 = 67;
+const int LAYER_fire8_concat = 61;
+const int BLOB_fire8_concat = 68;
+const int LAYER_fire9_squeeze1x1 = 62;
+const int BLOB_fire9_squeeze1x1 = 69;
+const int LAYER_fire9_relu_squeeze1x1 = 63;
+const int BLOB_fire9_squeeze1x1_fire9_relu_squeeze1x1 = 70;
+const int LAYER_splitncnn_7 = 64;
+const int BLOB_fire9_squeeze1x1_fire9_relu_squeeze1x1_splitncnn_0 = 71;
+const int BLOB_fire9_squeeze1x1_fire9_relu_squeeze1x1_splitncnn_1 = 72;
+const int LAYER_fire9_expand1x1 = 65;
+const int BLOB_fire9_expand1x1 = 73;
+const int LAYER_fire9_relu_expand1x1 = 66;
+const int BLOB_fire9_expand1x1_fire9_relu_expand1x1 = 74;
+const int LAYER_fire9_expand3x3 = 67;
+const int BLOB_fire9_expand3x3 = 75;
+const int LAYER_fire9_relu_expand3x3 = 68;
+const int BLOB_fire9_expand3x3_fire9_relu_expand3x3 = 76;
+const int LAYER_fire9_concat = 69;
+const int BLOB_fire9_concat = 77;
+const int LAYER_drop9 = 70;
+const int BLOB_fire9_concat_drop9 = 78;
+const int LAYER_conv10 = 71;
+const int BLOB_conv10 = 79;
+const int LAYER_relu_conv10 = 72;
+const int BLOB_conv10_relu_conv10 = 80;
+const int LAYER_pool10 = 73;
+const int BLOB_pool10 = 81;
+const int LAYER_prob = 74;
+const int BLOB_prob = 82;
+} // namespace squeezenet_v1_1_param_id
+#endif // NCNN_INCLUDE_GUARD_squeezenet_v1_1_id_h
diff --git a/examples/squeezencnn/local.properties b/examples/squeezencnn/local.properties
new file mode 100644
index 00000000000..916b3624c19
--- /dev/null
+++ b/examples/squeezencnn/local.properties
@@ -0,0 +1,10 @@
+# This file is automatically generated by Android Tools.
+# Do not modify this file -- YOUR CHANGES WILL BE ERASED!
+#
+# This file must *NOT* be checked into Version Control Systems,
+# as it contains information specific to your local configuration.
+
+# location of the SDK. This is only used by Ant
+# For customization when using a Version Control System, please read the
+# header note.
+sdk.dir=/home/nihui/osd/android-sdk-linux
diff --git a/examples/squeezencnn/proguard-project.txt b/examples/squeezencnn/proguard-project.txt
new file mode 100644
index 00000000000..f2fe1559a21
--- /dev/null
+++ b/examples/squeezencnn/proguard-project.txt
@@ -0,0 +1,20 @@
+# To enable ProGuard in your project, edit project.properties
+# to define the proguard.config property as described in that file.
+#
+# Add project specific ProGuard rules here.
+# By default, the flags in this file are appended to flags specified
+# in ${sdk.dir}/tools/proguard/proguard-android.txt
+# You can edit the include path and order by changing the ProGuard
+# include property in project.properties.
+#
+# For more details, see
+#   http://developer.android.com/guide/developing/tools/proguard.html
+
+# Add any project specific keep options here:
+
+# If your project uses WebView with JS, uncomment the following
+# and specify the fully qualified class name to the JavaScript interface
+# class:
+#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
+#   public *;
+#}
diff --git a/examples/squeezencnn/project.properties b/examples/squeezencnn/project.properties
new file mode 100644
index 00000000000..c6998b3d101
--- /dev/null
+++ b/examples/squeezencnn/project.properties
@@ -0,0 +1,14 @@
+# This file is automatically generated by Android Tools.
+# Do not modify this file -- YOUR CHANGES WILL BE ERASED!
+#
+# This file must be checked in Version Control Systems.
+#
+# To customize properties used by the Ant build system edit
+# "ant.properties", and override values to adapt the script to your
+# project structure.
+#
+# To enable ProGuard to shrink and obfuscate your code, uncomment this (available properties: sdk.dir, user.home):
+#proguard.config=${sdk.dir}/tools/proguard/proguard-android.txt:proguard-project.txt
+
+# Project target.
+target=android-9
diff --git a/examples/squeezencnn/res/layout/main.xml b/examples/squeezencnn/res/layout/main.xml
new file mode 100644
index 00000000000..37cf35675ba
--- /dev/null
+++ b/examples/squeezencnn/res/layout/main.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="utf-8"?>
+<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    android:orientation="vertical"
+    android:layout_width="fill_parent"
+    android:layout_height="fill_parent">
+
+    <LinearLayout
+        android:orientation="horizontal"
+        android:layout_width="fill_parent"
+        android:layout_height="wrap_content">
+
+    <Button
+        android:id="@+id/buttonImage"
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"
+        android:text="选图" />
+    <Button
+        android:id="@+id/buttonDetect"
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"
+        android:text="识别" />
+    </LinearLayout>
+
+    <TextView
+        android:id="@+id/infoResult"
+        android:layout_width="fill_parent"
+        android:layout_height="wrap_content"
+        android:text="" />
+
+    <ImageView
+        android:id="@+id/imageView"
+        android:layout_width="fill_parent"
+        android:layout_height="fill_parent"
+        android:layout_weight="1" />
+
+</LinearLayout>
diff --git a/examples/squeezencnn/res/values/strings.xml b/examples/squeezencnn/res/values/strings.xml
new file mode 100644
index 00000000000..283e0263a07
--- /dev/null
+++ b/examples/squeezencnn/res/values/strings.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources>
+    <string name="app_name">squeezencnn</string>
+</resources>
diff --git a/examples/squeezencnn/src/com/tencent/squeezencnn/MainActivity.java b/examples/squeezencnn/src/com/tencent/squeezencnn/MainActivity.java
new file mode 100644
index 00000000000..666aaee840e
--- /dev/null
+++ b/examples/squeezencnn/src/com/tencent/squeezencnn/MainActivity.java
@@ -0,0 +1,189 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+package com.tencent.squeezencnn;
+
+import android.app.Activity;
+import android.os.Bundle;
+
+import android.content.Context;
+import android.content.Intent;
+import android.database.Cursor;
+import android.graphics.Bitmap;
+import android.graphics.BitmapFactory;
+import android.net.Uri;
+import android.provider.MediaStore;
+import android.util.Log;
+import android.view.View;
+import android.widget.Button;
+import android.widget.ImageView;
+import android.widget.TextView;
+
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import com.tencent.squeezencnn.SqueezeNcnn;
+
+public class MainActivity extends Activity
+{
+    private static final int SELECT_IMAGE = 1;
+
+    private TextView infoResult;
+    private ImageView imageView;
+    private Bitmap yourSelectedImage = null;
+
+    private SqueezeNcnn squeezencnn = new SqueezeNcnn();
+
+    /** Called when the activity is first created: loads the model and wires up the two buttons. */
+    @Override
+    public void onCreate(Bundle savedInstanceState)
+    {
+        super.onCreate(savedInstanceState);
+        setContentView(R.layout.main);
+
+        try
+        {
+            initSqueezeNcnn();
+        }
+        catch (IOException e)
+        {
+            // pass the exception along so the log shows which asset failed and why
+            Log.e("MainActivity", "initSqueezeNcnn error", e);
+        }
+        infoResult = (TextView) findViewById(R.id.infoResult);
+        imageView = (ImageView) findViewById(R.id.imageView);
+
+        Button buttonImage = (Button) findViewById(R.id.buttonImage);
+        buttonImage.setOnClickListener(new View.OnClickListener() {
+            @Override
+            public void onClick(View arg0) {
+                Intent i = new Intent(Intent.ACTION_PICK);
+                i.setType("image/*");
+                startActivityForResult(i, SELECT_IMAGE);
+            }
+        });
+
+        Button buttonDetect = (Button) findViewById(R.id.buttonDetect);
+        buttonDetect.setOnClickListener(new View.OnClickListener() {
+            @Override
+            public void onClick(View arg0) {
+                if (yourSelectedImage == null)
+                    return;
+
+                String result = squeezencnn.Detect(yourSelectedImage);
+
+                if (result == null)
+                {
+                    infoResult.setText("detect failed");
+                }
+                else
+                {
+                    infoResult.setText(result);
+                }
+            }
+        });
+    }
+
+    /** Reads the three model assets fully into memory and passes them to the native Init(). */
+    private void initSqueezeNcnn() throws IOException
+    {
+        byte[] param = readAsset("squeezenet_v1.1.param.bin");
+        byte[] bin = readAsset("squeezenet_v1.1.bin");
+        byte[] words = readAsset("synset_words.txt");
+        squeezencnn.Init(param, bin, words);
+    }
+
+    /** Reads a whole asset; loops because one read() may return fewer bytes than requested. */
+    private byte[] readAsset(String name) throws IOException
+    {
+        InputStream in = getAssets().open(name);
+        try
+        {
+            byte[] data = new byte[in.available()];
+            int filled = 0;
+            while (filled < data.length)
+            {
+                int n = in.read(data, filled, data.length - filled);
+                if (n < 0) throw new IOException("unexpected end of asset " + name);
+                filled += n;
+            }
+            return data;
+        }
+        finally
+        {
+            in.close();
+        }
+    }
+
+    /** Loads the picked image as a 227x227 ARGB_8888 bitmap and shows it; bad picks are ignored. */
+    @Override
+    protected void onActivityResult(int requestCode, int resultCode, Intent data)
+    {
+        super.onActivityResult(requestCode, resultCode, data);
+        if (resultCode != RESULT_OK || data == null || requestCode != SELECT_IMAGE)
+            return;
+
+        try
+        {
+            Uri selectedImage = data.getData();
+            // decodeUri() and copy() return null when the image cannot be decoded or copied
+            Bitmap bitmap = (selectedImage == null) ? null : decodeUri(selectedImage);
+            Bitmap rgba = (bitmap == null) ? null : bitmap.copy(Bitmap.Config.ARGB_8888, true);
+            if (rgba == null)
+            {
+                Log.e("MainActivity", "cannot decode selected image");
+                return;
+            }
+            // resize to 227x227
+            yourSelectedImage = Bitmap.createScaledBitmap(rgba, 227, 227, false);
+            imageView.setImageBitmap(yourSelectedImage);
+        }
+        catch (FileNotFoundException e)
+        {
+            Log.e("MainActivity", "FileNotFoundException");
+        }
+    }
+
+    /** Decodes the picked image, subsampled by a power of two; may return null if decoding fails. */
+    private Bitmap decodeUri(Uri selectedImage) throws FileNotFoundException
+    {
+        // Decode image size
+        BitmapFactory.Options o = new BitmapFactory.Options();
+        o.inJustDecodeBounds = true;
+        decodeAndClose(selectedImage, o);
+        // Find the correct scale value. It should be the power of 2.
+        final int REQUIRED_SIZE = 400;
+        int width_tmp = o.outWidth, height_tmp = o.outHeight;
+        int scale = 1;
+        while (width_tmp / 2 >= REQUIRED_SIZE && height_tmp / 2 >= REQUIRED_SIZE) {
+            width_tmp /= 2;
+            height_tmp /= 2;
+            scale *= 2;
+        }
+        // Decode with inSampleSize
+        BitmapFactory.Options o2 = new BitmapFactory.Options();
+        o2.inSampleSize = scale;
+        return decodeAndClose(selectedImage, o2);
+    }
+
+    /** Opens the stream behind uri, decodes it with opts and always closes the stream afterwards. */
+    private Bitmap decodeAndClose(Uri uri, BitmapFactory.Options opts) throws FileNotFoundException
+    {
+        InputStream in = getContentResolver().openInputStream(uri);
+        try { return BitmapFactory.decodeStream(in, null, opts); }
+        finally { try { if (in != null) in.close(); } catch (IOException e) { /* best effort */ } }
+    }
+}
diff --git a/examples/squeezencnn/src/com/tencent/squeezencnn/SqueezeNcnn.java b/examples/squeezencnn/src/com/tencent/squeezencnn/SqueezeNcnn.java
new file mode 100644
index 00000000000..ac0b5973229
--- /dev/null
+++ b/examples/squeezencnn/src/com/tencent/squeezencnn/SqueezeNcnn.java
@@ -0,0 +1,29 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+package com.tencent.squeezencnn;
+
+import android.graphics.Bitmap;
+import android.content.Context;
+
+/**
+ * Java-side declaration of the native entry points implemented in the
+ * "squeezencnn" shared library.
+ */
+public class SqueezeNcnn
+{
+    // Load the native library once, when this class is first initialized,
+    // so the native methods below can be resolved.
+    static {
+        System.loadLibrary("squeezencnn");
+    }
+
+    /**
+     * Native initialization call.
+     *
+     * NOTE(review): the three buffers are presumably the raw contents of the
+     * network param file, the weight file and the label list; confirm against
+     * the JNI implementation.
+     *
+     * @return value reported by the native side (presumably true on success)
+     */
+    public native boolean Init(byte[] param, byte[] bin, byte[] words);
+
+    /**
+     * Native call that processes {@code bitmap} and returns a string result.
+     *
+     * NOTE(review): expected bitmap size and format are defined by the native
+     * code, which is not visible here; verify before changing callers.
+     */
+    public native String Detect(Bitmap bitmap);
+}
diff --git a/examples/squeezenet.cpp b/examples/squeezenet.cpp
new file mode 100644
index 00000000000..bab2a35ba94
--- /dev/null
+++ b/examples/squeezenet.cpp
@@ -0,0 +1,95 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include <stdio.h>
+#include <algorithm>
+#include <vector>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+
+#include "net.h"
+
+// Run the SqueezeNet v1.1 network on a BGR image and collect one score per
+// output channel of the "prob" blob.
+//
+// bgr         input image; its pixel buffer is resized to 227x227 for the net
+// cls_scores  receives out.c scores (cleared on failure)
+//
+// Returns 0 on success, -1 when the model files cannot be loaded or the
+// network fails to produce the "prob" output.
+static int detect_squeezenet(const cv::Mat& bgr, std::vector<float>& cls_scores)
+{
+    cls_scores.clear();
+
+    // model files are looked up relative to the current working directory
+    ncnn::Net squeezenet;
+    if (squeezenet.load_param("squeezenet_v1.1.param") != 0)
+    {
+        fprintf(stderr, "load_param squeezenet_v1.1.param failed\n");
+        return -1;
+    }
+    if (squeezenet.load_model("squeezenet_v1.1.bin") != 0)
+    {
+        fprintf(stderr, "load_model squeezenet_v1.1.bin failed\n");
+        return -1;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, 227, 227);
+
+    // per-channel mean subtraction; 0 is passed for the normalization values
+    const float mean_vals[3] = {104.f, 117.f, 123.f};
+    in.substract_mean_normalize(mean_vals, 0);
+
+    ncnn::Extractor ex = squeezenet.create_extractor();
+    ex.set_light_mode(true);
+
+    if (ex.input("data", in) != 0)
+    {
+        fprintf(stderr, "extractor input data failed\n");
+        return -1;
+    }
+
+    ncnn::Mat out;
+    if (ex.extract("prob", out) != 0)
+    {
+        fprintf(stderr, "extractor extract prob failed\n");
+        return -1;
+    }
+
+    // take the first element of every output channel as that class score
+    cls_scores.resize(out.c);
+    for (int j=0; j<out.c; j++)
+    {
+        const float* prob = out.data + out.cstep * j;
+        cls_scores[j] = prob[0];
+    }
+
+    return 0;
+}
+
+// Print the topk highest scores to stderr as "index = score" lines, best first.
+//
+// cls_scores  one score per class; the position in the vector is the class index
+// topk        number of entries to print; clamped to the number of scores,
+//             nothing is printed when it is zero or negative
+//
+// Always returns 0.
+static int print_topk(const std::vector<float>& cls_scores, int topk)
+{
+    const int size = static_cast<int>(cls_scores.size());
+
+    // vec.begin() + topk must stay inside [begin, end], otherwise
+    // std::partial_sort and the print loop have undefined behaviour
+    topk = std::min(topk, size);
+    if (topk <= 0)
+    {
+        return 0;
+    }
+
+    // pair every score with its index so the index survives the sort
+    std::vector< std::pair<float, int> > vec;
+    vec.resize(size);
+    for (int i=0; i<size; i++)
+    {
+        vec[i] = std::make_pair(cls_scores[i], i);
+    }
+
+    // only the first topk entries need to be ordered, descending by score
+    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
+                      std::greater< std::pair<float, int> >());
+
+    // print topk and score
+    for (int i=0; i<topk; i++)
+    {
+        float score = vec[i].first;
+        int index = vec[i].second;
+        fprintf(stderr, "%d = %f\n", index, score);
+    }
+
+    return 0;
+}
+
+// Entry point: classify the image given as the first command line argument
+// and print the three best class indices with their scores.
+//
+// Returns 0 on success, -1 when the argument is missing, the image cannot be
+// read, or detection fails.
+int main(int argc, char** argv)
+{
+    // argv[1] is only valid when at least one argument was supplied
+    if (argc < 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* imagepath = argv[1];
+
+    cv::Mat m = cv::imread(imagepath, CV_LOAD_IMAGE_COLOR);
+    if (m.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", imagepath);
+        return -1;
+    }
+
+    std::vector<float> cls_scores;
+    if (detect_squeezenet(m, cls_scores) != 0)
+    {
+        fprintf(stderr, "detect_squeezenet failed\n");
+        return -1;
+    }
+
+    print_topk(cls_scores, 3);
+
+    return 0;
+}
+
diff --git a/examples/squeezenet_v1.1.bin b/examples/squeezenet_v1.1.bin
new file mode 100644
index 00000000000..2b39bf8c42d
Binary files /dev/null and b/examples/squeezenet_v1.1.bin differ
diff --git a/examples/squeezenet_v1.1.caffemodel b/examples/squeezenet_v1.1.caffemodel
new file mode 100644
index 00000000000..9d2fc33abf6
Binary files /dev/null and b/examples/squeezenet_v1.1.caffemodel differ
diff --git a/examples/squeezenet_v1.1.param b/examples/squeezenet_v1.1.param
new file mode 100644
index 00000000000..6c1bd296e22
--- /dev/null
+++ b/examples/squeezenet_v1.1.param
@@ -0,0 +1,76 @@
+75 83
+Input            data             0 1 data 3 227 227
+Convolution      conv1            1 1 data conv1 64 3 1 2 0 1 1728
+ReLU             relu_conv1       1 1 conv1 conv1_relu_conv1 0.000000
+Pooling          pool1            1 1 conv1_relu_conv1 pool1 0 3 2 0 0
+Convolution      fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1 16 1 1 1 0 1 1024
+ReLU             fire2/relu_squeeze1x1 1 1 fire2/squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1 0.000000
+Split            splitncnn_0      1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1
+Convolution      fire2/expand1x1  1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1 64 1 1 1 0 1 1024
+ReLU             fire2/relu_expand1x1 1 1 fire2/expand1x1 fire2/expand1x1_fire2/relu_expand1x1 0.000000
+Convolution      fire2/expand3x3  1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3 64 3 1 1 1 1 9216
+ReLU             fire2/relu_expand3x3 1 1 fire2/expand3x3 fire2/expand3x3_fire2/relu_expand3x3 0.000000
+Concat           fire2/concat     2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat
+Convolution      fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1 16 1 1 1 0 1 2048
+ReLU             fire3/relu_squeeze1x1 1 1 fire3/squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1 0.000000
+Split            splitncnn_1      1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1
+Convolution      fire3/expand1x1  1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1 64 1 1 1 0 1 1024
+ReLU             fire3/relu_expand1x1 1 1 fire3/expand1x1 fire3/expand1x1_fire3/relu_expand1x1 0.000000
+Convolution      fire3/expand3x3  1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3 64 3 1 1 1 1 9216
+ReLU             fire3/relu_expand3x3 1 1 fire3/expand3x3 fire3/expand3x3_fire3/relu_expand3x3 0.000000
+Concat           fire3/concat     2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat
+Pooling          pool3            1 1 fire3/concat pool3 0 3 2 0 0
+Convolution      fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1 32 1 1 1 0 1 4096
+ReLU             fire4/relu_squeeze1x1 1 1 fire4/squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1 0.000000
+Split            splitncnn_2      1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1
+Convolution      fire4/expand1x1  1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1 128 1 1 1 0 1 4096
+ReLU             fire4/relu_expand1x1 1 1 fire4/expand1x1 fire4/expand1x1_fire4/relu_expand1x1 0.000000
+Convolution      fire4/expand3x3  1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3 128 3 1 1 1 1 36864
+ReLU             fire4/relu_expand3x3 1 1 fire4/expand3x3 fire4/expand3x3_fire4/relu_expand3x3 0.000000
+Concat           fire4/concat     2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat
+Convolution      fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1 32 1 1 1 0 1 8192
+ReLU             fire5/relu_squeeze1x1 1 1 fire5/squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1 0.000000
+Split            splitncnn_3      1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1
+Convolution      fire5/expand1x1  1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1 128 1 1 1 0 1 4096
+ReLU             fire5/relu_expand1x1 1 1 fire5/expand1x1 fire5/expand1x1_fire5/relu_expand1x1 0.000000
+Convolution      fire5/expand3x3  1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3 128 3 1 1 1 1 36864
+ReLU             fire5/relu_expand3x3 1 1 fire5/expand3x3 fire5/expand3x3_fire5/relu_expand3x3 0.000000
+Concat           fire5/concat     2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat
+Pooling          pool5            1 1 fire5/concat pool5 0 3 2 0 0
+Convolution      fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1 48 1 1 1 0 1 12288
+ReLU             fire6/relu_squeeze1x1 1 1 fire6/squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1 0.000000
+Split            splitncnn_4      1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1
+Convolution      fire6/expand1x1  1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1 192 1 1 1 0 1 9216
+ReLU             fire6/relu_expand1x1 1 1 fire6/expand1x1 fire6/expand1x1_fire6/relu_expand1x1 0.000000
+Convolution      fire6/expand3x3  1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3 192 3 1 1 1 1 82944
+ReLU             fire6/relu_expand3x3 1 1 fire6/expand3x3 fire6/expand3x3_fire6/relu_expand3x3 0.000000
+Concat           fire6/concat     2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat
+Convolution      fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1 48 1 1 1 0 1 18432
+ReLU             fire7/relu_squeeze1x1 1 1 fire7/squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1 0.000000
+Split            splitncnn_5      1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1
+Convolution      fire7/expand1x1  1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1 192 1 1 1 0 1 9216
+ReLU             fire7/relu_expand1x1 1 1 fire7/expand1x1 fire7/expand1x1_fire7/relu_expand1x1 0.000000
+Convolution      fire7/expand3x3  1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3 192 3 1 1 1 1 82944
+ReLU             fire7/relu_expand3x3 1 1 fire7/expand3x3 fire7/expand3x3_fire7/relu_expand3x3 0.000000
+Concat           fire7/concat     2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat
+Convolution      fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1 64 1 1 1 0 1 24576
+ReLU             fire8/relu_squeeze1x1 1 1 fire8/squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1 0.000000
+Split            splitncnn_6      1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1
+Convolution      fire8/expand1x1  1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1 256 1 1 1 0 1 16384
+ReLU             fire8/relu_expand1x1 1 1 fire8/expand1x1 fire8/expand1x1_fire8/relu_expand1x1 0.000000
+Convolution      fire8/expand3x3  1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3 256 3 1 1 1 1 147456
+ReLU             fire8/relu_expand3x3 1 1 fire8/expand3x3 fire8/expand3x3_fire8/relu_expand3x3 0.000000
+Concat           fire8/concat     2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat
+Convolution      fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1 64 1 1 1 0 1 32768
+ReLU             fire9/relu_squeeze1x1 1 1 fire9/squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1 0.000000
+Split            splitncnn_7      1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1
+Convolution      fire9/expand1x1  1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1 256 1 1 1 0 1 16384
+ReLU             fire9/relu_expand1x1 1 1 fire9/expand1x1 fire9/expand1x1_fire9/relu_expand1x1 0.000000
+Convolution      fire9/expand3x3  1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3 256 3 1 1 1 1 147456
+ReLU             fire9/relu_expand3x3 1 1 fire9/expand3x3 fire9/expand3x3_fire9/relu_expand3x3 0.000000
+Concat           fire9/concat     2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat
+Dropout          drop9            1 1 fire9/concat fire9/concat_drop9
+Convolution      conv10           1 1 fire9/concat_drop9 conv10 1000 1 1 1 1 1 512000
+ReLU             relu_conv10      1 1 conv10 conv10_relu_conv10 0.000000
+Pooling          pool10           1 1 conv10_relu_conv10 pool10 1 0 1 0 1
+Softmax          prob             1 1 pool10 prob
diff --git a/examples/squeezenet_v1.1.prototxt b/examples/squeezenet_v1.1.prototxt
new file mode 100644
index 00000000000..7dc9853b4e5
--- /dev/null
+++ b/examples/squeezenet_v1.1.prototxt
@@ -0,0 +1,548 @@
+name: "squeezenet_v1.1_deploy"
+
+layer {
+  name: "data"
+  type: "Input"
+  top: "data"
+  input_param { shape: { dim: 1 dim: 3 dim: 227 dim: 227 } }
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  convolution_param {
+    num_output: 64
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "relu_conv1"
+  type: "ReLU"
+  bottom: "conv1"
+  top: "conv1"
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "conv1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fire2/squeeze1x1"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "fire2/squeeze1x1"
+  convolution_param {
+    num_output: 16
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire2/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire2/squeeze1x1"
+  top: "fire2/squeeze1x1"
+}
+layer {
+  name: "fire2/expand1x1"
+  type: "Convolution"
+  bottom: "fire2/squeeze1x1"
+  top: "fire2/expand1x1"
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire2/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire2/expand1x1"
+  top: "fire2/expand1x1"
+}
+layer {
+  name: "fire2/expand3x3"
+  type: "Convolution"
+  bottom: "fire2/squeeze1x1"
+  top: "fire2/expand3x3"
+  convolution_param {
+    num_output: 64
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire2/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire2/expand3x3"
+  top: "fire2/expand3x3"
+}
+layer {
+  name: "fire2/concat"
+  type: "Concat"
+  bottom: "fire2/expand1x1"
+  bottom: "fire2/expand3x3"
+  top: "fire2/concat"
+}
+layer {
+  name: "fire3/squeeze1x1"
+  type: "Convolution"
+  bottom: "fire2/concat"
+  top: "fire3/squeeze1x1"
+  convolution_param {
+    num_output: 16
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire3/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire3/squeeze1x1"
+  top: "fire3/squeeze1x1"
+}
+layer {
+  name: "fire3/expand1x1"
+  type: "Convolution"
+  bottom: "fire3/squeeze1x1"
+  top: "fire3/expand1x1"
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire3/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire3/expand1x1"
+  top: "fire3/expand1x1"
+}
+layer {
+  name: "fire3/expand3x3"
+  type: "Convolution"
+  bottom: "fire3/squeeze1x1"
+  top: "fire3/expand3x3"
+  convolution_param {
+    num_output: 64
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire3/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire3/expand3x3"
+  top: "fire3/expand3x3"
+}
+layer {
+  name: "fire3/concat"
+  type: "Concat"
+  bottom: "fire3/expand1x1"
+  bottom: "fire3/expand3x3"
+  top: "fire3/concat"
+}
+layer {
+  name: "pool3"
+  type: "Pooling"
+  bottom: "fire3/concat"
+  top: "pool3"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fire4/squeeze1x1"
+  type: "Convolution"
+  bottom: "pool3"
+  top: "fire4/squeeze1x1"
+  convolution_param {
+    num_output: 32
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire4/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire4/squeeze1x1"
+  top: "fire4/squeeze1x1"
+}
+layer {
+  name: "fire4/expand1x1"
+  type: "Convolution"
+  bottom: "fire4/squeeze1x1"
+  top: "fire4/expand1x1"
+  convolution_param {
+    num_output: 128
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire4/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire4/expand1x1"
+  top: "fire4/expand1x1"
+}
+layer {
+  name: "fire4/expand3x3"
+  type: "Convolution"
+  bottom: "fire4/squeeze1x1"
+  top: "fire4/expand3x3"
+  convolution_param {
+    num_output: 128
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire4/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire4/expand3x3"
+  top: "fire4/expand3x3"
+}
+layer {
+  name: "fire4/concat"
+  type: "Concat"
+  bottom: "fire4/expand1x1"
+  bottom: "fire4/expand3x3"
+  top: "fire4/concat"
+}
+layer {
+  name: "fire5/squeeze1x1"
+  type: "Convolution"
+  bottom: "fire4/concat"
+  top: "fire5/squeeze1x1"
+  convolution_param {
+    num_output: 32
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire5/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire5/squeeze1x1"
+  top: "fire5/squeeze1x1"
+}
+layer {
+  name: "fire5/expand1x1"
+  type: "Convolution"
+  bottom: "fire5/squeeze1x1"
+  top: "fire5/expand1x1"
+  convolution_param {
+    num_output: 128
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire5/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire5/expand1x1"
+  top: "fire5/expand1x1"
+}
+layer {
+  name: "fire5/expand3x3"
+  type: "Convolution"
+  bottom: "fire5/squeeze1x1"
+  top: "fire5/expand3x3"
+  convolution_param {
+    num_output: 128
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire5/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire5/expand3x3"
+  top: "fire5/expand3x3"
+}
+layer {
+  name: "fire5/concat"
+  type: "Concat"
+  bottom: "fire5/expand1x1"
+  bottom: "fire5/expand3x3"
+  top: "fire5/concat"
+}
+layer {
+  name: "pool5"
+  type: "Pooling"
+  bottom: "fire5/concat"
+  top: "pool5"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fire6/squeeze1x1"
+  type: "Convolution"
+  bottom: "pool5"
+  top: "fire6/squeeze1x1"
+  convolution_param {
+    num_output: 48
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire6/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire6/squeeze1x1"
+  top: "fire6/squeeze1x1"
+}
+layer {
+  name: "fire6/expand1x1"
+  type: "Convolution"
+  bottom: "fire6/squeeze1x1"
+  top: "fire6/expand1x1"
+  convolution_param {
+    num_output: 192
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire6/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire6/expand1x1"
+  top: "fire6/expand1x1"
+}
+layer {
+  name: "fire6/expand3x3"
+  type: "Convolution"
+  bottom: "fire6/squeeze1x1"
+  top: "fire6/expand3x3"
+  convolution_param {
+    num_output: 192
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire6/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire6/expand3x3"
+  top: "fire6/expand3x3"
+}
+layer {
+  name: "fire6/concat"
+  type: "Concat"
+  bottom: "fire6/expand1x1"
+  bottom: "fire6/expand3x3"
+  top: "fire6/concat"
+}
+layer {
+  name: "fire7/squeeze1x1"
+  type: "Convolution"
+  bottom: "fire6/concat"
+  top: "fire7/squeeze1x1"
+  convolution_param {
+    num_output: 48
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire7/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire7/squeeze1x1"
+  top: "fire7/squeeze1x1"
+}
+layer {
+  name: "fire7/expand1x1"
+  type: "Convolution"
+  bottom: "fire7/squeeze1x1"
+  top: "fire7/expand1x1"
+  convolution_param {
+    num_output: 192
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire7/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire7/expand1x1"
+  top: "fire7/expand1x1"
+}
+layer {
+  name: "fire7/expand3x3"
+  type: "Convolution"
+  bottom: "fire7/squeeze1x1"
+  top: "fire7/expand3x3"
+  convolution_param {
+    num_output: 192
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire7/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire7/expand3x3"
+  top: "fire7/expand3x3"
+}
+layer {
+  name: "fire7/concat"
+  type: "Concat"
+  bottom: "fire7/expand1x1"
+  bottom: "fire7/expand3x3"
+  top: "fire7/concat"
+}
+layer {
+  name: "fire8/squeeze1x1"
+  type: "Convolution"
+  bottom: "fire7/concat"
+  top: "fire8/squeeze1x1"
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire8/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire8/squeeze1x1"
+  top: "fire8/squeeze1x1"
+}
+layer {
+  name: "fire8/expand1x1"
+  type: "Convolution"
+  bottom: "fire8/squeeze1x1"
+  top: "fire8/expand1x1"
+  convolution_param {
+    num_output: 256
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire8/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire8/expand1x1"
+  top: "fire8/expand1x1"
+}
+layer {
+  name: "fire8/expand3x3"
+  type: "Convolution"
+  bottom: "fire8/squeeze1x1"
+  top: "fire8/expand3x3"
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire8/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire8/expand3x3"
+  top: "fire8/expand3x3"
+}
+layer {
+  name: "fire8/concat"
+  type: "Concat"
+  bottom: "fire8/expand1x1"
+  bottom: "fire8/expand3x3"
+  top: "fire8/concat"
+}
+layer {
+  name: "fire9/squeeze1x1"
+  type: "Convolution"
+  bottom: "fire8/concat"
+  top: "fire9/squeeze1x1"
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire9/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire9/squeeze1x1"
+  top: "fire9/squeeze1x1"
+}
+layer {
+  name: "fire9/expand1x1"
+  type: "Convolution"
+  bottom: "fire9/squeeze1x1"
+  top: "fire9/expand1x1"
+  convolution_param {
+    num_output: 256
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire9/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire9/expand1x1"
+  top: "fire9/expand1x1"
+}
+layer {
+  name: "fire9/expand3x3"
+  type: "Convolution"
+  bottom: "fire9/squeeze1x1"
+  top: "fire9/expand3x3"
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire9/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire9/expand3x3"
+  top: "fire9/expand3x3"
+}
+layer {
+  name: "fire9/concat"
+  type: "Concat"
+  bottom: "fire9/expand1x1"
+  bottom: "fire9/expand3x3"
+  top: "fire9/concat"
+}
+layer {
+  name: "drop9"
+  type: "Dropout"
+  bottom: "fire9/concat"
+  top: "fire9/concat"
+  dropout_param {
+    dropout_ratio: 0.5
+  }
+}
+layer {
+  name: "conv10"
+  type: "Convolution"
+  bottom: "fire9/concat"
+  top: "conv10"
+  convolution_param {
+    num_output: 1000
+    pad: 1
+    kernel_size: 1
+  }
+}
+layer {
+  name: "relu_conv10"
+  type: "ReLU"
+  bottom: "conv10"
+  top: "conv10"
+}
+layer {
+  name: "pool10"
+  type: "Pooling"
+  bottom: "conv10"
+  top: "pool10"
+  pooling_param {
+    pool: AVE
+    global_pooling: true
+  }
+}
+layer {
+  name: "prob"
+  type: "Softmax"
+  bottom: "pool10"
+  top: "prob"
+}
diff --git a/examples/synset_words.txt b/examples/synset_words.txt
new file mode 100644
index 00000000000..a9e8c7f50d1
--- /dev/null
+++ b/examples/synset_words.txt
@@ -0,0 +1,1000 @@
+n01440764 tench, Tinca tinca
+n01443537 goldfish, Carassius auratus
+n01484850 great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias
+n01491361 tiger shark, Galeocerdo cuvieri
+n01494475 hammerhead, hammerhead shark
+n01496331 electric ray, crampfish, numbfish, torpedo
+n01498041 stingray
+n01514668 cock
+n01514859 hen
+n01518878 ostrich, Struthio camelus
+n01530575 brambling, Fringilla montifringilla
+n01531178 goldfinch, Carduelis carduelis
+n01532829 house finch, linnet, Carpodacus mexicanus
+n01534433 junco, snowbird
+n01537544 indigo bunting, indigo finch, indigo bird, Passerina cyanea
+n01558993 robin, American robin, Turdus migratorius
+n01560419 bulbul
+n01580077 jay
+n01582220 magpie
+n01592084 chickadee
+n01601694 water ouzel, dipper
+n01608432 kite
+n01614925 bald eagle, American eagle, Haliaeetus leucocephalus
+n01616318 vulture
+n01622779 great grey owl, great gray owl, Strix nebulosa
+n01629819 European fire salamander, Salamandra salamandra
+n01630670 common newt, Triturus vulgaris
+n01631663 eft
+n01632458 spotted salamander, Ambystoma maculatum
+n01632777 axolotl, mud puppy, Ambystoma mexicanum
+n01641577 bullfrog, Rana catesbeiana
+n01644373 tree frog, tree-frog
+n01644900 tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui
+n01664065 loggerhead, loggerhead turtle, Caretta caretta
+n01665541 leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea
+n01667114 mud turtle
+n01667778 terrapin
+n01669191 box turtle, box tortoise
+n01675722 banded gecko
+n01677366 common iguana, iguana, Iguana iguana
+n01682714 American chameleon, anole, Anolis carolinensis
+n01685808 whiptail, whiptail lizard
+n01687978 agama
+n01688243 frilled lizard, Chlamydosaurus kingi
+n01689811 alligator lizard
+n01692333 Gila monster, Heloderma suspectum
+n01693334 green lizard, Lacerta viridis
+n01694178 African chameleon, Chamaeleo chamaeleon
+n01695060 Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis
+n01697457 African crocodile, Nile crocodile, Crocodylus niloticus
+n01698640 American alligator, Alligator mississipiensis
+n01704323 triceratops
+n01728572 thunder snake, worm snake, Carphophis amoenus
+n01728920 ringneck snake, ring-necked snake, ring snake
+n01729322 hognose snake, puff adder, sand viper
+n01729977 green snake, grass snake
+n01734418 king snake, kingsnake
+n01735189 garter snake, grass snake
+n01737021 water snake
+n01739381 vine snake
+n01740131 night snake, Hypsiglena torquata
+n01742172 boa constrictor, Constrictor constrictor
+n01744401 rock python, rock snake, Python sebae
+n01748264 Indian cobra, Naja naja
+n01749939 green mamba
+n01751748 sea snake
+n01753488 horned viper, cerastes, sand viper, horned asp, Cerastes cornutus
+n01755581 diamondback, diamondback rattlesnake, Crotalus adamanteus
+n01756291 sidewinder, horned rattlesnake, Crotalus cerastes
+n01768244 trilobite
+n01770081 harvestman, daddy longlegs, Phalangium opilio
+n01770393 scorpion
+n01773157 black and gold garden spider, Argiope aurantia
+n01773549 barn spider, Araneus cavaticus
+n01773797 garden spider, Aranea diademata
+n01774384 black widow, Latrodectus mactans
+n01774750 tarantula
+n01775062 wolf spider, hunting spider
+n01776313 tick
+n01784675 centipede
+n01795545 black grouse
+n01796340 ptarmigan
+n01797886 ruffed grouse, partridge, Bonasa umbellus
+n01798484 prairie chicken, prairie grouse, prairie fowl
+n01806143 peacock
+n01806567 quail
+n01807496 partridge
+n01817953 African grey, African gray, Psittacus erithacus
+n01818515 macaw
+n01819313 sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita
+n01820546 lorikeet
+n01824575 coucal
+n01828970 bee eater
+n01829413 hornbill
+n01833805 hummingbird
+n01843065 jacamar
+n01843383 toucan
+n01847000 drake
+n01855032 red-breasted merganser, Mergus serrator
+n01855672 goose
+n01860187 black swan, Cygnus atratus
+n01871265 tusker
+n01872401 echidna, spiny anteater, anteater
+n01873310 platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus
+n01877812 wallaby, brush kangaroo
+n01882714 koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus
+n01883070 wombat
+n01910747 jellyfish
+n01914609 sea anemone, anemone
+n01917289 brain coral
+n01924916 flatworm, platyhelminth
+n01930112 nematode, nematode worm, roundworm
+n01943899 conch
+n01944390 snail
+n01945685 slug
+n01950731 sea slug, nudibranch
+n01955084 chiton, coat-of-mail shell, sea cradle, polyplacophore
+n01968897 chambered nautilus, pearly nautilus, nautilus
+n01978287 Dungeness crab, Cancer magister
+n01978455 rock crab, Cancer irroratus
+n01980166 fiddler crab
+n01981276 king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica
+n01983481 American lobster, Northern lobster, Maine lobster, Homarus americanus
+n01984695 spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish
+n01985128 crayfish, crawfish, crawdad, crawdaddy
+n01986214 hermit crab
+n01990800 isopod
+n02002556 white stork, Ciconia ciconia
+n02002724 black stork, Ciconia nigra
+n02006656 spoonbill
+n02007558 flamingo
+n02009229 little blue heron, Egretta caerulea
+n02009912 American egret, great white heron, Egretta albus
+n02011460 bittern
+n02012849 crane
+n02013706 limpkin, Aramus pictus
+n02017213 European gallinule, Porphyrio porphyrio
+n02018207 American coot, marsh hen, mud hen, water hen, Fulica americana
+n02018795 bustard
+n02025239 ruddy turnstone, Arenaria interpres
+n02027492 red-backed sandpiper, dunlin, Erolia alpina
+n02028035 redshank, Tringa totanus
+n02033041 dowitcher
+n02037110 oystercatcher, oyster catcher
+n02051845 pelican
+n02056570 king penguin, Aptenodytes patagonica
+n02058221 albatross, mollymawk
+n02066245 grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus
+n02071294 killer whale, killer, orca, grampus, sea wolf, Orcinus orca
+n02074367 dugong, Dugong dugon
+n02077923 sea lion
+n02085620 Chihuahua
+n02085782 Japanese spaniel
+n02085936 Maltese dog, Maltese terrier, Maltese
+n02086079 Pekinese, Pekingese, Peke
+n02086240 Shih-Tzu
+n02086646 Blenheim spaniel
+n02086910 papillon
+n02087046 toy terrier
+n02087394 Rhodesian ridgeback
+n02088094 Afghan hound, Afghan
+n02088238 basset, basset hound
+n02088364 beagle
+n02088466 bloodhound, sleuthhound
+n02088632 bluetick
+n02089078 black-and-tan coonhound
+n02089867 Walker hound, Walker foxhound
+n02089973 English foxhound
+n02090379 redbone
+n02090622 borzoi, Russian wolfhound
+n02090721 Irish wolfhound
+n02091032 Italian greyhound
+n02091134 whippet
+n02091244 Ibizan hound, Ibizan Podenco
+n02091467 Norwegian elkhound, elkhound
+n02091635 otterhound, otter hound
+n02091831 Saluki, gazelle hound
+n02092002 Scottish deerhound, deerhound
+n02092339 Weimaraner
+n02093256 Staffordshire bullterrier, Staffordshire bull terrier
+n02093428 American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier
+n02093647 Bedlington terrier
+n02093754 Border terrier
+n02093859 Kerry blue terrier
+n02093991 Irish terrier
+n02094114 Norfolk terrier
+n02094258 Norwich terrier
+n02094433 Yorkshire terrier
+n02095314 wire-haired fox terrier
+n02095570 Lakeland terrier
+n02095889 Sealyham terrier, Sealyham
+n02096051 Airedale, Airedale terrier
+n02096177 cairn, cairn terrier
+n02096294 Australian terrier
+n02096437 Dandie Dinmont, Dandie Dinmont terrier
+n02096585 Boston bull, Boston terrier
+n02097047 miniature schnauzer
+n02097130 giant schnauzer
+n02097209 standard schnauzer
+n02097298 Scotch terrier, Scottish terrier, Scottie
+n02097474 Tibetan terrier, chrysanthemum dog
+n02097658 silky terrier, Sydney silky
+n02098105 soft-coated wheaten terrier
+n02098286 West Highland white terrier
+n02098413 Lhasa, Lhasa apso
+n02099267 flat-coated retriever
+n02099429 curly-coated retriever
+n02099601 golden retriever
+n02099712 Labrador retriever
+n02099849 Chesapeake Bay retriever
+n02100236 German short-haired pointer
+n02100583 vizsla, Hungarian pointer
+n02100735 English setter
+n02100877 Irish setter, red setter
+n02101006 Gordon setter
+n02101388 Brittany spaniel
+n02101556 clumber, clumber spaniel
+n02102040 English springer, English springer spaniel
+n02102177 Welsh springer spaniel
+n02102318 cocker spaniel, English cocker spaniel, cocker
+n02102480 Sussex spaniel
+n02102973 Irish water spaniel
+n02104029 kuvasz
+n02104365 schipperke
+n02105056 groenendael
+n02105162 malinois
+n02105251 briard
+n02105412 kelpie
+n02105505 komondor
+n02105641 Old English sheepdog, bobtail
+n02105855 Shetland sheepdog, Shetland sheep dog, Shetland
+n02106030 collie
+n02106166 Border collie
+n02106382 Bouvier des Flandres, Bouviers des Flandres
+n02106550 Rottweiler
+n02106662 German shepherd, German shepherd dog, German police dog, alsatian
+n02107142 Doberman, Doberman pinscher
+n02107312 miniature pinscher
+n02107574 Greater Swiss Mountain dog
+n02107683 Bernese mountain dog
+n02107908 Appenzeller
+n02108000 EntleBucher
+n02108089 boxer
+n02108422 bull mastiff
+n02108551 Tibetan mastiff
+n02108915 French bulldog
+n02109047 Great Dane
+n02109525 Saint Bernard, St Bernard
+n02109961 Eskimo dog, husky
+n02110063 malamute, malemute, Alaskan malamute
+n02110185 Siberian husky
+n02110341 dalmatian, coach dog, carriage dog
+n02110627 affenpinscher, monkey pinscher, monkey dog
+n02110806 basenji
+n02110958 pug, pug-dog
+n02111129 Leonberg
+n02111277 Newfoundland, Newfoundland dog
+n02111500 Great Pyrenees
+n02111889 Samoyed, Samoyede
+n02112018 Pomeranian
+n02112137 chow, chow chow
+n02112350 keeshond
+n02112706 Brabancon griffon
+n02113023 Pembroke, Pembroke Welsh corgi
+n02113186 Cardigan, Cardigan Welsh corgi
+n02113624 toy poodle
+n02113712 miniature poodle
+n02113799 standard poodle
+n02113978 Mexican hairless
+n02114367 timber wolf, grey wolf, gray wolf, Canis lupus
+n02114548 white wolf, Arctic wolf, Canis lupus tundrarum
+n02114712 red wolf, maned wolf, Canis rufus, Canis niger
+n02114855 coyote, prairie wolf, brush wolf, Canis latrans
+n02115641 dingo, warrigal, warragal, Canis dingo
+n02115913 dhole, Cuon alpinus
+n02116738 African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus
+n02117135 hyena, hyaena
+n02119022 red fox, Vulpes vulpes
+n02119789 kit fox, Vulpes macrotis
+n02120079 Arctic fox, white fox, Alopex lagopus
+n02120505 grey fox, gray fox, Urocyon cinereoargenteus
+n02123045 tabby, tabby cat
+n02123159 tiger cat
+n02123394 Persian cat
+n02123597 Siamese cat, Siamese
+n02124075 Egyptian cat
+n02125311 cougar, puma, catamount, mountain lion, painter, panther, Felis concolor
+n02127052 lynx, catamount
+n02128385 leopard, Panthera pardus
+n02128757 snow leopard, ounce, Panthera uncia
+n02128925 jaguar, panther, Panthera onca, Felis onca
+n02129165 lion, king of beasts, Panthera leo
+n02129604 tiger, Panthera tigris
+n02130308 cheetah, chetah, Acinonyx jubatus
+n02132136 brown bear, bruin, Ursus arctos
+n02133161 American black bear, black bear, Ursus americanus, Euarctos americanus
+n02134084 ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus
+n02134418 sloth bear, Melursus ursinus, Ursus ursinus
+n02137549 mongoose
+n02138441 meerkat, mierkat
+n02165105 tiger beetle
+n02165456 ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle
+n02167151 ground beetle, carabid beetle
+n02168699 long-horned beetle, longicorn, longicorn beetle
+n02169497 leaf beetle, chrysomelid
+n02172182 dung beetle
+n02174001 rhinoceros beetle
+n02177972 weevil
+n02190166 fly
+n02206856 bee
+n02219486 ant, emmet, pismire
+n02226429 grasshopper, hopper
+n02229544 cricket
+n02231487 walking stick, walkingstick, stick insect
+n02233338 cockroach, roach
+n02236044 mantis, mantid
+n02256656 cicada, cicala
+n02259212 leafhopper
+n02264363 lacewing, lacewing fly
+n02268443 dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk
+n02268853 damselfly
+n02276258 admiral
+n02277742 ringlet, ringlet butterfly
+n02279972 monarch, monarch butterfly, milkweed butterfly, Danaus plexippus
+n02280649 cabbage butterfly
+n02281406 sulphur butterfly, sulfur butterfly
+n02281787 lycaenid, lycaenid butterfly
+n02317335 starfish, sea star
+n02319095 sea urchin
+n02321529 sea cucumber, holothurian
+n02325366 wood rabbit, cottontail, cottontail rabbit
+n02326432 hare
+n02328150 Angora, Angora rabbit
+n02342885 hamster
+n02346627 porcupine, hedgehog
+n02356798 fox squirrel, eastern fox squirrel, Sciurus niger
+n02361337 marmot
+n02363005 beaver
+n02364673 guinea pig, Cavia cobaya
+n02389026 sorrel
+n02391049 zebra
+n02395406 hog, pig, grunter, squealer, Sus scrofa
+n02396427 wild boar, boar, Sus scrofa
+n02397096 warthog
+n02398521 hippopotamus, hippo, river horse, Hippopotamus amphibius
+n02403003 ox
+n02408429 water buffalo, water ox, Asiatic buffalo, Bubalus bubalis
+n02410509 bison
+n02412080 ram, tup
+n02415577 bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis
+n02417914 ibex, Capra ibex
+n02422106 hartebeest
+n02422699 impala, Aepyceros melampus
+n02423022 gazelle
+n02437312 Arabian camel, dromedary, Camelus dromedarius
+n02437616 llama
+n02441942 weasel
+n02442845 mink
+n02443114 polecat, fitch, foulmart, foumart, Mustela putorius
+n02443484 black-footed ferret, ferret, Mustela nigripes
+n02444819 otter
+n02445715 skunk, polecat, wood pussy
+n02447366 badger
+n02454379 armadillo
+n02457408 three-toed sloth, ai, Bradypus tridactylus
+n02480495 orangutan, orang, orangutang, Pongo pygmaeus
+n02480855 gorilla, Gorilla gorilla
+n02481823 chimpanzee, chimp, Pan troglodytes
+n02483362 gibbon, Hylobates lar
+n02483708 siamang, Hylobates syndactylus, Symphalangus syndactylus
+n02484975 guenon, guenon monkey
+n02486261 patas, hussar monkey, Erythrocebus patas
+n02486410 baboon
+n02487347 macaque
+n02488291 langur
+n02488702 colobus, colobus monkey
+n02489166 proboscis monkey, Nasalis larvatus
+n02490219 marmoset
+n02492035 capuchin, ringtail, Cebus capucinus
+n02492660 howler monkey, howler
+n02493509 titi, titi monkey
+n02493793 spider monkey, Ateles geoffroyi
+n02494079 squirrel monkey, Saimiri sciureus
+n02497673 Madagascar cat, ring-tailed lemur, Lemur catta
+n02500267 indri, indris, Indri indri, Indri brevicaudatus
+n02504013 Indian elephant, Elephas maximus
+n02504458 African elephant, Loxodonta africana
+n02509815 lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens
+n02510455 giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca
+n02514041 barracouta, snoek
+n02526121 eel
+n02536864 coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch
+n02606052 rock beauty, Holocanthus tricolor
+n02607072 anemone fish
+n02640242 sturgeon
+n02641379 gar, garfish, garpike, billfish, Lepisosteus osseus
+n02643566 lionfish
+n02655020 puffer, pufferfish, blowfish, globefish
+n02666196 abacus
+n02667093 abaya
+n02669723 academic gown, academic robe, judge's robe
+n02672831 accordion, piano accordion, squeeze box
+n02676566 acoustic guitar
+n02687172 aircraft carrier, carrier, flattop, attack aircraft carrier
+n02690373 airliner
+n02692877 airship, dirigible
+n02699494 altar
+n02701002 ambulance
+n02704792 amphibian, amphibious vehicle
+n02708093 analog clock
+n02727426 apiary, bee house
+n02730930 apron
+n02747177 ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin
+n02749479 assault rifle, assault gun
+n02769748 backpack, back pack, knapsack, packsack, rucksack, haversack
+n02776631 bakery, bakeshop, bakehouse
+n02777292 balance beam, beam
+n02782093 balloon
+n02783161 ballpoint, ballpoint pen, ballpen, Biro
+n02786058 Band Aid
+n02787622 banjo
+n02788148 bannister, banister, balustrade, balusters, handrail
+n02790996 barbell
+n02791124 barber chair
+n02791270 barbershop
+n02793495 barn
+n02794156 barometer
+n02795169 barrel, cask
+n02797295 barrow, garden cart, lawn cart, wheelbarrow
+n02799071 baseball
+n02802426 basketball
+n02804414 bassinet
+n02804610 bassoon
+n02807133 bathing cap, swimming cap
+n02808304 bath towel
+n02808440 bathtub, bathing tub, bath, tub
+n02814533 beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon
+n02814860 beacon, lighthouse, beacon light, pharos
+n02815834 beaker
+n02817516 bearskin, busby, shako
+n02823428 beer bottle
+n02823750 beer glass
+n02825657 bell cote, bell cot
+n02834397 bib
+n02835271 bicycle-built-for-two, tandem bicycle, tandem
+n02837789 bikini, two-piece
+n02840245 binder, ring-binder
+n02841315 binoculars, field glasses, opera glasses
+n02843684 birdhouse
+n02859443 boathouse
+n02860847 bobsled, bobsleigh, bob
+n02865351 bolo tie, bolo, bola tie, bola
+n02869837 bonnet, poke bonnet
+n02870880 bookcase
+n02871525 bookshop, bookstore, bookstall
+n02877765 bottlecap
+n02879718 bow
+n02883205 bow tie, bow-tie, bowtie
+n02892201 brass, memorial tablet, plaque
+n02892767 brassiere, bra, bandeau
+n02894605 breakwater, groin, groyne, mole, bulwark, seawall, jetty
+n02895154 breastplate, aegis, egis
+n02906734 broom
+n02909870 bucket, pail
+n02910353 buckle
+n02916936 bulletproof vest
+n02917067 bullet train, bullet
+n02927161 butcher shop, meat market
+n02930766 cab, hack, taxi, taxicab
+n02939185 caldron, cauldron
+n02948072 candle, taper, wax light
+n02950826 cannon
+n02951358 canoe
+n02951585 can opener, tin opener
+n02963159 cardigan
+n02965783 car mirror
+n02966193 carousel, carrousel, merry-go-round, roundabout, whirligig
+n02966687 carpenter's kit, tool kit
+n02971356 carton
+n02974003 car wheel
+n02977058 cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM
+n02978881 cassette
+n02979186 cassette player
+n02980441 castle
+n02981792 catamaran
+n02988304 CD player
+n02992211 cello, violoncello
+n02992529 cellular telephone, cellular phone, cellphone, cell, mobile phone
+n02999410 chain
+n03000134 chainlink fence
+n03000247 chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour
+n03000684 chain saw, chainsaw
+n03014705 chest
+n03016953 chiffonier, commode
+n03017168 chime, bell, gong
+n03018349 china cabinet, china closet
+n03026506 Christmas stocking
+n03028079 church, church building
+n03032252 cinema, movie theater, movie theatre, movie house, picture palace
+n03041632 cleaver, meat cleaver, chopper
+n03042490 cliff dwelling
+n03045698 cloak
+n03047690 clog, geta, patten, sabot
+n03062245 cocktail shaker
+n03063599 coffee mug
+n03063689 coffeepot
+n03065424 coil, spiral, volute, whorl, helix
+n03075370 combination lock
+n03085013 computer keyboard, keypad
+n03089624 confectionery, confectionary, candy store
+n03095699 container ship, containership, container vessel
+n03100240 convertible
+n03109150 corkscrew, bottle screw
+n03110669 cornet, horn, trumpet, trump
+n03124043 cowboy boot
+n03124170 cowboy hat, ten-gallon hat
+n03125729 cradle
+n03126707 crane
+n03127747 crash helmet
+n03127925 crate
+n03131574 crib, cot
+n03133878 Crock Pot
+n03134739 croquet ball
+n03141823 crutch
+n03146219 cuirass
+n03160309 dam, dike, dyke
+n03179701 desk
+n03180011 desktop computer
+n03187595 dial telephone, dial phone
+n03188531 diaper, nappy, napkin
+n03196217 digital clock
+n03197337 digital watch
+n03201208 dining table, board
+n03207743 dishrag, dishcloth
+n03207941 dishwasher, dish washer, dishwashing machine
+n03208938 disk brake, disc brake
+n03216828 dock, dockage, docking facility
+n03218198 dogsled, dog sled, dog sleigh
+n03220513 dome
+n03223299 doormat, welcome mat
+n03240683 drilling platform, offshore rig
+n03249569 drum, membranophone, tympan
+n03250847 drumstick
+n03255030 dumbbell
+n03259280 Dutch oven
+n03271574 electric fan, blower
+n03272010 electric guitar
+n03272562 electric locomotive
+n03290653 entertainment center
+n03291819 envelope
+n03297495 espresso maker
+n03314780 face powder
+n03325584 feather boa, boa
+n03337140 file, file cabinet, filing cabinet
+n03344393 fireboat
+n03345487 fire engine, fire truck
+n03347037 fire screen, fireguard
+n03355925 flagpole, flagstaff
+n03372029 flute, transverse flute
+n03376595 folding chair
+n03379051 football helmet
+n03384352 forklift
+n03388043 fountain
+n03388183 fountain pen
+n03388549 four-poster
+n03393912 freight car
+n03394916 French horn, horn
+n03400231 frying pan, frypan, skillet
+n03404251 fur coat
+n03417042 garbage truck, dustcart
+n03424325 gasmask, respirator, gas helmet
+n03425413 gas pump, gasoline pump, petrol pump, island dispenser
+n03443371 goblet
+n03444034 go-kart
+n03445777 golf ball
+n03445924 golfcart, golf cart
+n03447447 gondola
+n03447721 gong, tam-tam
+n03450230 gown
+n03452741 grand piano, grand
+n03457902 greenhouse, nursery, glasshouse
+n03459775 grille, radiator grille
+n03461385 grocery store, grocery, food market, market
+n03467068 guillotine
+n03476684 hair slide
+n03476991 hair spray
+n03478589 half track
+n03481172 hammer
+n03482405 hamper
+n03483316 hand blower, blow dryer, blow drier, hair dryer, hair drier
+n03485407 hand-held computer, hand-held microcomputer
+n03485794 handkerchief, hankie, hanky, hankey
+n03492542 hard disc, hard disk, fixed disk
+n03494278 harmonica, mouth organ, harp, mouth harp
+n03495258 harp
+n03496892 harvester, reaper
+n03498962 hatchet
+n03527444 holster
+n03529860 home theater, home theatre
+n03530642 honeycomb
+n03532672 hook, claw
+n03534580 hoopskirt, crinoline
+n03535780 horizontal bar, high bar
+n03538406 horse cart, horse-cart
+n03544143 hourglass
+n03584254 iPod
+n03584829 iron, smoothing iron
+n03590841 jack-o'-lantern
+n03594734 jean, blue jean, denim
+n03594945 jeep, landrover
+n03595614 jersey, T-shirt, tee shirt
+n03598930 jigsaw puzzle
+n03599486 jinrikisha, ricksha, rickshaw
+n03602883 joystick
+n03617480 kimono
+n03623198 knee pad
+n03627232 knot
+n03630383 lab coat, laboratory coat
+n03633091 ladle
+n03637318 lampshade, lamp shade
+n03642806 laptop, laptop computer
+n03649909 lawn mower, mower
+n03657121 lens cap, lens cover
+n03658185 letter opener, paper knife, paperknife
+n03661043 library
+n03662601 lifeboat
+n03666591 lighter, light, igniter, ignitor
+n03670208 limousine, limo
+n03673027 liner, ocean liner
+n03676483 lipstick, lip rouge
+n03680355 Loafer
+n03690938 lotion
+n03691459 loudspeaker, speaker, speaker unit, loudspeaker system, speaker system
+n03692522 loupe, jeweler's loupe
+n03697007 lumbermill, sawmill
+n03706229 magnetic compass
+n03709823 mailbag, postbag
+n03710193 mailbox, letter box
+n03710637 maillot
+n03710721 maillot, tank suit
+n03717622 manhole cover
+n03720891 maraca
+n03721384 marimba, xylophone
+n03724870 mask
+n03729826 matchstick
+n03733131 maypole
+n03733281 maze, labyrinth
+n03733805 measuring cup
+n03742115 medicine chest, medicine cabinet
+n03743016 megalith, megalithic structure
+n03759954 microphone, mike
+n03761084 microwave, microwave oven
+n03763968 military uniform
+n03764736 milk can
+n03769881 minibus
+n03770439 miniskirt, mini
+n03770679 minivan
+n03773504 missile
+n03775071 mitten
+n03775546 mixing bowl
+n03776460 mobile home, manufactured home
+n03777568 Model T
+n03777754 modem
+n03781244 monastery
+n03782006 monitor
+n03785016 moped
+n03786901 mortar
+n03787032 mortarboard
+n03788195 mosque
+n03788365 mosquito net
+n03791053 motor scooter, scooter
+n03792782 mountain bike, all-terrain bike, off-roader
+n03792972 mountain tent
+n03793489 mouse, computer mouse
+n03794056 mousetrap
+n03796401 moving van
+n03803284 muzzle
+n03804744 nail
+n03814639 neck brace
+n03814906 necklace
+n03825788 nipple
+n03832673 notebook, notebook computer
+n03837869 obelisk
+n03838899 oboe, hautboy, hautbois
+n03840681 ocarina, sweet potato
+n03841143 odometer, hodometer, mileometer, milometer
+n03843555 oil filter
+n03854065 organ, pipe organ
+n03857828 oscilloscope, scope, cathode-ray oscilloscope, CRO
+n03866082 overskirt
+n03868242 oxcart
+n03868863 oxygen mask
+n03871628 packet
+n03873416 paddle, boat paddle
+n03874293 paddlewheel, paddle wheel
+n03874599 padlock
+n03876231 paintbrush
+n03877472 pajama, pyjama, pj's, jammies
+n03877845 palace
+n03884397 panpipe, pandean pipe, syrinx
+n03887697 paper towel
+n03888257 parachute, chute
+n03888605 parallel bars, bars
+n03891251 park bench
+n03891332 parking meter
+n03895866 passenger car, coach, carriage
+n03899768 patio, terrace
+n03902125 pay-phone, pay-station
+n03903868 pedestal, plinth, footstall
+n03908618 pencil box, pencil case
+n03908714 pencil sharpener
+n03916031 perfume, essence
+n03920288 Petri dish
+n03924679 photocopier
+n03929660 pick, plectrum, plectron
+n03929855 pickelhaube
+n03930313 picket fence, paling
+n03930630 pickup, pickup truck
+n03933933 pier
+n03935335 piggy bank, penny bank
+n03937543 pill bottle
+n03938244 pillow
+n03942813 ping-pong ball
+n03944341 pinwheel
+n03947888 pirate, pirate ship
+n03950228 pitcher, ewer
+n03954731 plane, carpenter's plane, woodworking plane
+n03956157 planetarium
+n03958227 plastic bag
+n03961711 plate rack
+n03967562 plow, plough
+n03970156 plunger, plumber's helper
+n03976467 Polaroid camera, Polaroid Land camera
+n03976657 pole
+n03977966 police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria
+n03980874 poncho
+n03982430 pool table, billiard table, snooker table
+n03983396 pop bottle, soda bottle
+n03991062 pot, flowerpot
+n03992509 potter's wheel
+n03995372 power drill
+n03998194 prayer rug, prayer mat
+n04004767 printer
+n04005630 prison, prison house
+n04008634 projectile, missile
+n04009552 projector
+n04019541 puck, hockey puck
+n04023962 punching bag, punch bag, punching ball, punchball
+n04026417 purse
+n04033901 quill, quill pen
+n04033995 quilt, comforter, comfort, puff
+n04037443 racer, race car, racing car
+n04039381 racket, racquet
+n04040759 radiator
+n04041544 radio, wireless
+n04044716 radio telescope, radio reflector
+n04049303 rain barrel
+n04065272 recreational vehicle, RV, R.V.
+n04067472 reel
+n04069434 reflex camera
+n04070727 refrigerator, icebox
+n04074963 remote control, remote
+n04081281 restaurant, eating house, eating place, eatery
+n04086273 revolver, six-gun, six-shooter
+n04090263 rifle
+n04099969 rocking chair, rocker
+n04111531 rotisserie
+n04116512 rubber eraser, rubber, pencil eraser
+n04118538 rugby ball
+n04118776 rule, ruler
+n04120489 running shoe
+n04125021 safe
+n04127249 safety pin
+n04131690 saltshaker, salt shaker
+n04133789 sandal
+n04136333 sarong
+n04141076 sax, saxophone
+n04141327 scabbard
+n04141975 scale, weighing machine
+n04146614 school bus
+n04147183 schooner
+n04149813 scoreboard
+n04152593 screen, CRT screen
+n04153751 screw
+n04154565 screwdriver
+n04162706 seat belt, seatbelt
+n04179913 sewing machine
+n04192698 shield, buckler
+n04200800 shoe shop, shoe-shop, shoe store
+n04201297 shoji
+n04204238 shopping basket
+n04204347 shopping cart
+n04208210 shovel
+n04209133 shower cap
+n04209239 shower curtain
+n04228054 ski
+n04229816 ski mask
+n04235860 sleeping bag
+n04238763 slide rule, slipstick
+n04239074 sliding door
+n04243546 slot, one-armed bandit
+n04251144 snorkel
+n04252077 snowmobile
+n04252225 snowplow, snowplough
+n04254120 soap dispenser
+n04254680 soccer ball
+n04254777 sock
+n04258138 solar dish, solar collector, solar furnace
+n04259630 sombrero
+n04263257 soup bowl
+n04264628 space bar
+n04265275 space heater
+n04266014 space shuttle
+n04270147 spatula
+n04273569 speedboat
+n04275548 spider web, spider's web
+n04277352 spindle
+n04285008 sports car, sport car
+n04286575 spotlight, spot
+n04296562 stage
+n04310018 steam locomotive
+n04311004 steel arch bridge
+n04311174 steel drum
+n04317175 stethoscope
+n04325704 stole
+n04326547 stone wall
+n04328186 stopwatch, stop watch
+n04330267 stove
+n04332243 strainer
+n04335435 streetcar, tram, tramcar, trolley, trolley car
+n04336792 stretcher
+n04344873 studio couch, day bed
+n04346328 stupa, tope
+n04347754 submarine, pigboat, sub, U-boat
+n04350905 suit, suit of clothes
+n04355338 sundial
+n04355933 sunglass
+n04356056 sunglasses, dark glasses, shades
+n04357314 sunscreen, sunblock, sun blocker
+n04366367 suspension bridge
+n04367480 swab, swob, mop
+n04370456 sweatshirt
+n04371430 swimming trunks, bathing trunks
+n04371774 swing
+n04372370 switch, electric switch, electrical switch
+n04376876 syringe
+n04380533 table lamp
+n04389033 tank, army tank, armored combat vehicle, armoured combat vehicle
+n04392985 tape player
+n04398044 teapot
+n04399382 teddy, teddy bear
+n04404412 television, television system
+n04409515 tennis ball
+n04417672 thatch, thatched roof
+n04418357 theater curtain, theatre curtain
+n04423845 thimble
+n04428191 thresher, thrasher, threshing machine
+n04429376 throne
+n04435653 tile roof
+n04442312 toaster
+n04443257 tobacco shop, tobacconist shop, tobacconist
+n04447861 toilet seat
+n04456115 torch
+n04458633 totem pole
+n04461696 tow truck, tow car, wrecker
+n04462240 toyshop
+n04465501 tractor
+n04467665 trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi
+n04476259 tray
+n04479046 trench coat
+n04482393 tricycle, trike, velocipede
+n04483307 trimaran
+n04485082 tripod
+n04486054 triumphal arch
+n04487081 trolleybus, trolley coach, trackless trolley
+n04487394 trombone
+n04493381 tub, vat
+n04501370 turnstile
+n04505470 typewriter keyboard
+n04507155 umbrella
+n04509417 unicycle, monocycle
+n04515003 upright, upright piano
+n04517823 vacuum, vacuum cleaner
+n04522168 vase
+n04523525 vault
+n04525038 velvet
+n04525305 vending machine
+n04532106 vestment
+n04532670 viaduct
+n04536866 violin, fiddle
+n04540053 volleyball
+n04542943 waffle iron
+n04548280 wall clock
+n04548362 wallet, billfold, notecase, pocketbook
+n04550184 wardrobe, closet, press
+n04552348 warplane, military plane
+n04553703 washbasin, handbasin, washbowl, lavabo, wash-hand basin
+n04554684 washer, automatic washer, washing machine
+n04557648 water bottle
+n04560804 water jug
+n04562935 water tower
+n04579145 whiskey jug
+n04579432 whistle
+n04584207 wig
+n04589890 window screen
+n04590129 window shade
+n04591157 Windsor tie
+n04591713 wine bottle
+n04592741 wing
+n04596742 wok
+n04597913 wooden spoon
+n04599235 wool, woolen, woollen
+n04604644 worm fence, snake fence, snake-rail fence, Virginia fence
+n04606251 wreck
+n04612504 yawl
+n04613696 yurt
+n06359193 web site, website, internet site, site
+n06596364 comic book
+n06785654 crossword puzzle, crossword
+n06794110 street sign
+n06874185 traffic light, traffic signal, stoplight
+n07248320 book jacket, dust cover, dust jacket, dust wrapper
+n07565083 menu
+n07579787 plate
+n07583066 guacamole
+n07584110 consomme
+n07590611 hot pot, hotpot
+n07613480 trifle
+n07614500 ice cream, icecream
+n07615774 ice lolly, lolly, lollipop, popsicle
+n07684084 French loaf
+n07693725 bagel, beigel
+n07695742 pretzel
+n07697313 cheeseburger
+n07697537 hotdog, hot dog, red hot
+n07711569 mashed potato
+n07714571 head cabbage
+n07714990 broccoli
+n07715103 cauliflower
+n07716358 zucchini, courgette
+n07716906 spaghetti squash
+n07717410 acorn squash
+n07717556 butternut squash
+n07718472 cucumber, cuke
+n07718747 artichoke, globe artichoke
+n07720875 bell pepper
+n07730033 cardoon
+n07734744 mushroom
+n07742313 Granny Smith
+n07745940 strawberry
+n07747607 orange
+n07749582 lemon
+n07753113 fig
+n07753275 pineapple, ananas
+n07753592 banana
+n07754684 jackfruit, jak, jack
+n07760859 custard apple
+n07768694 pomegranate
+n07802026 hay
+n07831146 carbonara
+n07836838 chocolate sauce, chocolate syrup
+n07860988 dough
+n07871810 meat loaf, meatloaf
+n07873807 pizza, pizza pie
+n07875152 potpie
+n07880968 burrito
+n07892512 red wine
+n07920052 espresso
+n07930864 cup
+n07932039 eggnog
+n09193705 alp
+n09229709 bubble
+n09246464 cliff, drop, drop-off
+n09256479 coral reef
+n09288635 geyser
+n09332890 lakeside, lakeshore
+n09399592 promontory, headland, head, foreland
+n09421951 sandbar, sand bar
+n09428293 seashore, coast, seacoast, sea-coast
+n09468604 valley, vale
+n09472597 volcano
+n09835506 ballplayer, baseball player
+n10148035 groom, bridegroom
+n10565667 scuba diver
+n11879895 rapeseed
+n11939491 daisy
+n12057211 yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum
+n12144580 corn
+n12267677 acorn
+n12620546 hip, rose hip, rosehip
+n12768682 buckeye, horse chestnut, conker
+n12985857 coral fungus
+n12998815 agaric
+n13037406 gyromitra
+n13040303 stinkhorn, carrion fungus
+n13044778 earthstar
+n13052670 hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa
+n13054560 bolete
+n13133613 ear, spike, capitulum
+n15075141 toilet tissue, toilet paper, bathroom tissue
diff --git a/ios.toolchain.cmake b/ios.toolchain.cmake
new file mode 100644
index 00000000000..05176def861
--- /dev/null
+++ b/ios.toolchain.cmake
@@ -0,0 +1,193 @@
+# This file is based off of the Platform/Darwin.cmake and Platform/UnixPaths.cmake
+# files which are included with CMake 2.8.4
+# It has been altered for iOS development
+
+# Options:
+#
+# IOS_PLATFORM = iPhoneOS (default) or iPhoneSimulator
+#   This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders
+#   iPhoneOS - the default, used to build for iPhone and iPad physical devices, which have an arm arch.
+#   iPhoneSimulator - used to build for the Simulator platforms, which have an x86 arch.
+#
+# CMAKE_IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder
+#   By default this location is automatically chosen based on the IOS_PLATFORM value above.
+#   If set manually, it will override the default location and force the use of a particular Developer Platform
+#
+# CMAKE_IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder
+#   By default this location is automatically chosen based on the CMAKE_IOS_DEVELOPER_ROOT value.
+#   In this case it will always be the most up-to-date SDK found in the CMAKE_IOS_DEVELOPER_ROOT path.
+#   If set manually, this will force the use of a specific SDK version
+
+# Macros:
+#
+# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE)
+#  A convenience macro for setting xcode specific properties on targets
+#  example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1")
+#
+# find_host_package (PROGRAM ARGS)
+#  A macro that runs find_package() against the host system, not within the iOS environment.
+#  Thanks to the android-cmake project for providing the command
+
+# Standard settings: name the target system Darwin and flag the build as UNIX/APPLE/IOS
+set(CMAKE_SYSTEM_NAME Darwin)
+set(CMAKE_SYSTEM_VERSION 1)
+set(UNIX True)
+set(APPLE True)
+set(IOS True)
+
+# Required as of cmake 2.8.10: force the cached deployment target to the empty string
+set(CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE)
+
+# Determine the cmake host system version (uname -r) so we know where to find the iOS SDKs; runs the uname found by find_program via execute_process (exec_program is deprecated)
+find_program (CMAKE_UNAME uname /bin /usr/bin /usr/local/bin)
+if (CMAKE_UNAME)
+	execute_process (COMMAND "${CMAKE_UNAME}" -r OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE)
+	string (REGEX REPLACE "^([0-9]+)\\.([0-9]+).*$" "\\1" DARWIN_MAJOR_VERSION "${CMAKE_HOST_SYSTEM_VERSION}")
+endif ()
+
+# Force the compilers to clang for iOS
+include(CMakeForceCompiler)
+cmake_force_c_compiler(/usr/bin/clang Apple)
+cmake_force_cxx_compiler(/usr/bin/clang++ Apple)
+set(CMAKE_AR ar CACHE FILEPATH "" FORCE)
+
+# Mark both compilers as working so the platform compiler checks are skipped when cross compiling
+set(CMAKE_CXX_COMPILER_WORKS TRUE)
+set(CMAKE_C_COMPILER_WORKS TRUE)
+
+# All iOS/Darwin specific settings - some may be redundant
+set(CMAKE_SHARED_LIBRARY_PREFIX "lib")
+set(CMAKE_SHARED_LIBRARY_SUFFIX ".dylib")
+set(CMAKE_SHARED_MODULE_PREFIX "lib")
+set(CMAKE_SHARED_MODULE_SUFFIX ".so")
+set(CMAKE_MODULE_EXISTS 1)
+set(CMAKE_DL_LIBS "")
+
+set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ")
+set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ")
+set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}")
+set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}")
+
+# Hidden visibility is required for cxx on iOS. NOTE(review): CMAKE_OSX_SYSROOT is only assigned further down in this file, so it may be empty when these lines first run - confirm
+set(CMAKE_C_FLAGS_INIT "-isysroot ${CMAKE_OSX_SYSROOT} -miphoneos-version-min=6.0")
+set(CMAKE_CXX_FLAGS_INIT "-stdlib=libc++ -fvisibility=hidden -fvisibility-inlines-hidden -isysroot ${CMAKE_OSX_SYSROOT} -miphoneos-version-min=6.0")
+
+set(CMAKE_C_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}")
+set(CMAKE_CXX_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}")
+
+set(CMAKE_PLATFORM_HAS_INSTALLNAME 1)
+set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names")
+set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names")
+set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,")
+set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,")
+set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a")
+
+# hack: a newer cmake (which uses CMAKE_INSTALL_NAME_TOOL) may run on an old build tree
+# (where install_name_tool was hardcoded) in which CMAKE_INSTALL_NAME_TOOL is not in the cache,
+# without failing in CMakeFindBinUtils.cmake (because that file isn't rerun);
+# in that case look up install_name_tool here, so it behaves as it did before, Alex
+if(NOT DEFINED CMAKE_INSTALL_NAME_TOOL)
+	find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool)
+endif()
+
+# Setup iOS platform unless specified manually with IOS_PLATFORM (defaults to iPhoneOS)
+if (NOT DEFINED IOS_PLATFORM)
+	set (IOS_PLATFORM "iPhoneOS")
+endif ()
+set (IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
+
+# Check the platform selection and setup for developer root; IOS_PLATFORM is quoted so an empty value reaches the FATAL_ERROR below instead of breaking the if() syntax
+if ("${IOS_PLATFORM}" STREQUAL "iPhoneOS")
+	set (IOS_PLATFORM_LOCATION "iPhoneOS.platform")
+
+	# This causes the installers to properly locate the output libraries
+	set (CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos")
+elseif ("${IOS_PLATFORM}" STREQUAL "iPhoneSimulator")
+	set (IOS_PLATFORM_LOCATION "iPhoneSimulator.platform")
+
+	# This causes the installers to properly locate the output libraries
+	set (CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator")
+else ()
+	message (FATAL_ERROR "Unsupported IOS_PLATFORM value selected. Please choose iPhoneOS or iPhoneSimulator")
+endif ()
+
+# Setup iOS developer location unless specified manually with CMAKE_IOS_DEVELOPER_ROOT
+# Xcode 4.3 changed the installation location: try the post-4.3 path first, then the pre-4.3 one
+set(XCODE_POST_43_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
+set(XCODE_PRE_43_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
+if(NOT DEFINED CMAKE_IOS_DEVELOPER_ROOT)
+	if(EXISTS ${XCODE_POST_43_ROOT})
+		set(CMAKE_IOS_DEVELOPER_ROOT ${XCODE_POST_43_ROOT})
+	elseif(EXISTS ${XCODE_PRE_43_ROOT})
+		set(CMAKE_IOS_DEVELOPER_ROOT ${XCODE_PRE_43_ROOT})
+	endif()
+endif()
+set(CMAKE_IOS_DEVELOPER_ROOT ${CMAKE_IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform")
+
+# Find and use the most recent iOS sdk unless specified manually with CMAKE_IOS_SDK_ROOT
+if(NOT DEFINED CMAKE_IOS_SDK_ROOT)
+	file(GLOB _CMAKE_IOS_SDKS "${CMAKE_IOS_DEVELOPER_ROOT}/SDKs/*")
+	if(NOT _CMAKE_IOS_SDKS)
+		message(FATAL_ERROR "No iOS SDK's found in default search path ${CMAKE_IOS_DEVELOPER_ROOT}. Manually set CMAKE_IOS_SDK_ROOT or install the iOS SDK.")
+	endif()
+	# Pick the last entry in sorted order. NOTE(review): list(SORT) compares as strings, so e.g. 10.0 would order before 9.3 - confirm this is acceptable
+	list(SORT _CMAKE_IOS_SDKS)
+	list(REVERSE _CMAKE_IOS_SDKS)
+	list(GET _CMAKE_IOS_SDKS 0 CMAKE_IOS_SDK_ROOT)
+	message(STATUS "Toolchain using default iOS SDK: ${CMAKE_IOS_SDK_ROOT}")
+endif()
+set(CMAKE_IOS_SDK_ROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK")
+
+# Default the sysroot to the SDK selected above
+set(CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")
+
+# set the architecture for iOS (armv7 for iPhoneOS, i386 otherwise); IOS_PLATFORM is quoted so an empty value cannot break the if() syntax
+# NOTE: Currently both ARCHS_STANDARD_32_BIT and ARCHS_UNIVERSAL_IPHONE_OS set armv7 only, so set both manually
+if ("${IOS_PLATFORM}" STREQUAL "iPhoneOS")
+	set (IOS_ARCH armv7)
+else ()
+	set (IOS_ARCH i386)
+endif ()
+
+set (CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS")
+
+# Set the find root to the iOS developer roots and to user defined paths (cache type spelled STRING; lowercase "string" is not a valid cache type)
+set (CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS find search path root")
+
+# look for frameworks before anything else
+set(CMAKE_FIND_FRAMEWORK FIRST)
+
+# default framework search directories, all located under the selected iOS SDK
+set(CMAKE_SYSTEM_FRAMEWORK_PATH
+	${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks
+	${CMAKE_IOS_SDK_ROOT}/System/Library/PrivateFrameworks
+	${CMAKE_IOS_SDK_ROOT}/Developer/Library/Frameworks
+)
+
+# only search the iOS sdks, not the remainder of the host filesystem
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+
+# Convenience macro: sets the XCODE_ATTRIBUTE_<XCODE_PROPERTY> property of TARGET to XCODE_VALUE
+macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE)
+	set_property(TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE})
+endmacro()
+
+
+# Runs find_package() against the host: sets the find-root modes to NEVER and IOS to FALSE for the call, then sets the modes to ONLY and IOS to TRUE
+macro(find_host_package)
+	set(IOS FALSE)
+	set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+	set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
+	set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
+
+	find_package(${ARGN})
+
+	set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
+	set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+	set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+	set(IOS TRUE)
+endmacro()
+
diff --git a/iossimxc.toolchain.cmake b/iossimxc.toolchain.cmake
new file mode 100644
index 00000000000..27bda76a25b
--- /dev/null
+++ b/iossimxc.toolchain.cmake
@@ -0,0 +1,40 @@
+# Toolchain file: cross-compile for the iOS Simulator (i386 + x86_64) with a
+# prefixed clang (i386-apple-darwin11-*). CMAKE_IOS_SDK_ROOT below is a
+# machine-specific path and has to be adapted to the local SDK location.
+# Standard settings
+set (CMAKE_SYSTEM_NAME Darwin)
+set (CMAKE_SYSTEM_VERSION 1)
+set (UNIX True)
+set (APPLE True)
+set (IOS True)
+
+# suppress -rdynamic
+# set(CMAKE_SYSTEM_NAME Generic)
+
+set(CMAKE_C_COMPILER i386-apple-darwin11-clang)
+set(CMAKE_CXX_COMPILER i386-apple-darwin11-clang++)
+
+set(_CMAKE_TOOLCHAIN_PREFIX i386-apple-darwin11-)
+
+set(CMAKE_IOS_SDK_ROOT "/home/nihui/osd/cctools-port/usage_examples/ios_toolchain/target-sim/SDK/")
+
+# Use the SDK root as the sysroot
+set(CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS Simulator support")
+
+# set the architecture for iOS; single-architecture alternatives:
+# set(IOS_ARCH i386)
+# set(IOS_ARCH x86_64)
+set(IOS_ARCH i386;x86_64)
+
+set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS Simulator")
+
+# Set the find root to the iOS SDK root and to user defined paths (CMAKE_IOS_DEVELOPER_ROOT is not set by this file)
+set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS Simulator find search path root")
+
+# search frameworks first, before ordinary libraries and headers
+set(CMAKE_FIND_FRAMEWORK FIRST)
+
+# set up the default search directories for frameworks
+set(CMAKE_SYSTEM_FRAMEWORK_PATH
+    ${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks
+)
diff --git a/iosxc.toolchain.cmake b/iosxc.toolchain.cmake
new file mode 100644
index 00000000000..a4e9751b5a5
--- /dev/null
+++ b/iosxc.toolchain.cmake
@@ -0,0 +1,39 @@
+# Toolchain file: cross-compile for iOS devices (armv7 + arm64) with a
+# prefixed clang (arm-apple-darwin11-*). CMAKE_IOS_SDK_ROOT below is a
+# machine-specific path and has to be adapted to the local SDK location.
+# Standard settings
+set (CMAKE_SYSTEM_NAME Darwin)
+set (CMAKE_SYSTEM_VERSION 1)
+set (UNIX True)
+set (APPLE True)
+set (IOS True)
+
+# suppress -rdynamic
+# set(CMAKE_SYSTEM_NAME Generic)
+
+set(CMAKE_C_COMPILER arm-apple-darwin11-clang)
+set(CMAKE_CXX_COMPILER arm-apple-darwin11-clang++)
+
+set(_CMAKE_TOOLCHAIN_PREFIX arm-apple-darwin11-)
+
+set(CMAKE_IOS_SDK_ROOT "/home/nihui/osd/cctools-port/usage_examples/ios_toolchain/target/SDK/")
+
+# Use the SDK root as the sysroot
+set(CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")
+
+# set the architecture for iOS; single-architecture alternative:
+# set(IOS_ARCH arm64)
+set(IOS_ARCH armv7;arm64)
+
+set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS")
+
+# Set the find root to the iOS SDK root and to user defined paths (CMAKE_IOS_DEVELOPER_ROOT is not set by this file)
+set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS find search path root")
+
+# search frameworks first, before ordinary libraries and headers
+set(CMAKE_FIND_FRAMEWORK FIRST)
+
+# set up the default search directories for frameworks
+set(CMAKE_SYSTEM_FRAMEWORK_PATH
+    ${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks
+)
diff --git a/package.sh b/package.sh
new file mode 100644
index 00000000000..ff743923b4c
--- /dev/null
+++ b/package.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Package prebuilt ncnn artifacts (android static libs, ios framework) from the existing build-*/install trees.
+set -e  # stop at the first failed step instead of zipping an incomplete package
+
+NAME=ncnn
+
+##### package android lib
+ANDROIDPKGNAME="${NAME}-android-lib"
+rm -rf "$ANDROIDPKGNAME"
+mkdir -p "$ANDROIDPKGNAME/armeabi-v7a" "$ANDROIDPKGNAME/arm64-v8a" "$ANDROIDPKGNAME/include"
+cp "build-android-armv7/install/lib/lib${NAME}.a" "$ANDROIDPKGNAME/armeabi-v7a/"
+cp "build-android-aarch64/install/lib/lib${NAME}.a" "$ANDROIDPKGNAME/arm64-v8a/"
+cp build-android-aarch64/install/include/* "$ANDROIDPKGNAME/include/"
+rm -f "$ANDROIDPKGNAME.zip"
+zip -9 -r "$ANDROIDPKGNAME.zip" "$ANDROIDPKGNAME"
+
+##### package ios framework
+IOSPKGNAME="${NAME}.framework"
+rm -rf "$IOSPKGNAME"
+mkdir -p "$IOSPKGNAME/Versions/A/Headers" "$IOSPKGNAME/Versions/A/Resources"
+# relative symlinks: Current -> A, and top-level entries -> Versions/Current/...
+ln -s A "$IOSPKGNAME/Versions/Current"
+ln -s Versions/Current/Headers "$IOSPKGNAME/Headers"
+ln -s Versions/Current/Resources "$IOSPKGNAME/Resources"
+ln -s "Versions/Current/${NAME}" "$IOSPKGNAME/${NAME}"
+# merge the build-ios and build-ios-sim static libraries into one multi-architecture file
+lipo -create \
+    "build-ios/install/lib/lib${NAME}.a" \
+    "build-ios-sim/install/lib/lib${NAME}.a" \
+    -o "$IOSPKGNAME/Versions/A/${NAME}"
+cp -r build-ios/install/include/* "$IOSPKGNAME/Versions/A/Headers/"
+cp Info.plist "${IOSPKGNAME}/Versions/A/Resources/"
+rm -f "$IOSPKGNAME.zip"
+zip -9 -y -r "$IOSPKGNAME.zip" "$IOSPKGNAME"
+
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 00000000000..d9e491eb059
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,135 @@
+
+##############################################
+
+# Generate platform.h from its template into the build tree.
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/platform.h.in ${CMAKE_CURRENT_BINARY_DIR}/platform.h)
+
+# Header search path: sources, generated headers, then layer headers.
+include_directories(
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${CMAKE_CURRENT_BINARY_DIR}
+    ${CMAKE_CURRENT_SOURCE_DIR}/layer
+)
+
+# Core sources; ncnn_add_layer() appends the enabled layer sources later.
+set(ncnn_SRCS blob.cpp cpu.cpp layer.cpp mat.cpp)
+list(APPEND ncnn_SRCS
+    mat_pixel.cpp net.cpp opencv.cpp
+)
+
+# ncnn_add_layer(<Class> [ON|OFF])
+#   ncnn_add_layer(Convolution)   # option WITH_LAYER_convolution, default ON
+#   ncnn_add_layer(ArgMax OFF)    # option WITH_LAYER_argmax, default OFF
+#
+# Appends the layer sources to ncnn_SRCS and one entry to the generated
+# layer_declaration.h / layer_registry.h. A disabled layer still gets a registry
+# entry (creator 0), so an entry's position depends only on the call order.
+# Stays a macro because it modifies ncnn_SRCS in the caller's scope.
+macro(ncnn_add_layer class)
+    string(TOLOWER ${class} name)
+
+    # WITH_LAYER_xxx option
+    if(${ARGC} EQUAL 2)
+        option(WITH_LAYER_${name} "build with layer ${name}" ${ARGV1})
+    else()
+        option(WITH_LAYER_${name} "build with layer ${name}" ON)
+    endif()
+
+    # STATUS keeps this informational line out of the notice stream (stderr)
+    message(STATUS "WITH_LAYER_${name} = ${WITH_LAYER_${name}}")
+
+    # creator symbol written to the registry; 0 marks a disabled layer
+    set(ncnn_layer_creator 0)
+
+    if(WITH_LAYER_${name})
+        list(APPEND ncnn_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/layer/${name}.cpp")
+        set(ncnn_layer_creator ${class}_layer_creator)
+
+        # pick the arch specific directory: arm for android armv7-a / aarch64 and
+        # for ios when every listed architecture is an arm one, x86 otherwise
+        if((ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a"))
+            OR (ANDROID AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64"))
+            OR (IOS AND ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "^(arm[^;]*;?)+$")))
+            set(ncnn_layer_arch arm)
+        else()
+            set(ncnn_layer_arch x86)
+        endif()
+
+        # an optimized implementation, when present, is built in addition to the
+        # generic source and takes over the registry slot
+        set(ncnn_layer_arch_src "${CMAKE_CURRENT_SOURCE_DIR}/layer/${ncnn_layer_arch}/${name}_${ncnn_layer_arch}.cpp")
+        if(EXISTS "${ncnn_layer_arch_src}")
+            list(APPEND ncnn_SRCS "${ncnn_layer_arch_src}")
+            set(WITH_LAYER_${name}_${ncnn_layer_arch} 1)
+            set(ncnn_layer_creator ${class}_${ncnn_layer_arch}_layer_creator)
+        endif()
+
+        # forward declaration of the creator, included by layer.cpp
+        file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/layer_declaration.h
+            "extern Layer* ${ncnn_layer_creator}();\n")
+    endif()
+
+    # registry initializer, with or without the type name depending on NCNN_STRING
+    file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/layer_registry.h
+        "#if NCNN_STRING\n{\"${class}\",${ncnn_layer_creator}},\n#else\n{${ncnn_layer_creator}},\n#endif\n")
+endmacro()
+
+# start from empty generated headers; ncnn_add_layer() only ever appends to them
+file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/layer_declaration.h)
+file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/layer_registry.h)
+
+# layer implementation; registry index = call position, so keep the LayerType (layer.h) order
+ncnn_add_layer(AbsVal)
+ncnn_add_layer(ArgMax OFF)
+ncnn_add_layer(BatchNorm)
+ncnn_add_layer(Bias)
+ncnn_add_layer(BNLL)
+ncnn_add_layer(Concat)
+ncnn_add_layer(Convolution)
+ncnn_add_layer(Crop)
+ncnn_add_layer(Deconvolution)
+ncnn_add_layer(Dropout)
+ncnn_add_layer(ELU)
+ncnn_add_layer(Eltwise)
+ncnn_add_layer(Embed OFF)
+ncnn_add_layer(Exp)
+ncnn_add_layer(Flatten)
+ncnn_add_layer(InnerProduct)
+ncnn_add_layer(Input)
+ncnn_add_layer(Log)
+ncnn_add_layer(LRN)
+ncnn_add_layer(MemoryData OFF)
+ncnn_add_layer(MVN)
+ncnn_add_layer(Pooling)
+ncnn_add_layer(Power)
+ncnn_add_layer(PReLU)
+ncnn_add_layer(Proposal OFF)
+ncnn_add_layer(Reduction OFF)
+ncnn_add_layer(ReLU)
+ncnn_add_layer(Reshape OFF)
+ncnn_add_layer(ROIPooling OFF)
+ncnn_add_layer(Scale)
+ncnn_add_layer(Sigmoid)
+ncnn_add_layer(Slice)
+ncnn_add_layer(Softmax)
+ncnn_add_layer(Split)
+ncnn_add_layer(SPP OFF)
+ncnn_add_layer(TanH)
+ncnn_add_layer(Threshold)
+ncnn_add_layer(Tile OFF)
+ncnn_add_layer(RNN OFF)
+ncnn_add_layer(LSTM OFF)
+
+add_library(ncnn STATIC ${ncnn_SRCS}) # core sources plus everything ncnn_add_layer() appended
+# install the static archive and the public headers (platform.h is taken from the build tree)
+install(TARGETS ncnn ARCHIVE DESTINATION lib)
+install(FILES
+    blob.h
+    cpu.h
+    layer.h
+    mat.h
+    net.h
+    opencv.h
+    ${CMAKE_CURRENT_BINARY_DIR}/platform.h
+    DESTINATION include
+)
diff --git a/src/blob.cpp b/src/blob.cpp
new file mode 100644
index 00000000000..8af899fb799
--- /dev/null
+++ b/src/blob.cpp
@@ -0,0 +1,24 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "blob.h"
+
+namespace ncnn {
+
+// producer starts at -1, which is not a valid layer index
+Blob::Blob()
+    : producer(-1)
+{}
+
+} // namespace ncnn
diff --git a/src/blob.h b/src/blob.h
new file mode 100644
index 00000000000..31f2c1d48d7
--- /dev/null
+++ b/src/blob.h
@@ -0,0 +1,43 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BLOB_H
+#define NCNN_BLOB_H
+
+#include <string>
+#include <vector>
+#include "platform.h"
+
+namespace ncnn {
+
+class Blob
+{
+public:
+    // create a blob with producer = -1 and an empty consumer list
+    Blob();
+
+public:
+#if NCNN_STRING
+    // blob name (only present when NCNN_STRING is enabled)
+    std::string name;
+#endif // NCNN_STRING
+    // index of the layer that produces this blob as output
+    int producer;
+    // indices of the layers that need this blob as input
+    std::vector<int> consumers;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_BLOB_H
diff --git a/src/cpu.cpp b/src/cpu.cpp
new file mode 100644
index 00000000000..c43832a165c
--- /dev/null
+++ b/src/cpu.cpp
@@ -0,0 +1,471 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "cpu.h"
+#include <stdio.h>
+#include <string.h>
+#include <vector>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef __ANDROID__
+#include <sys/syscall.h>
+#include <unistd.h>
+#endif
+
+#if __APPLE__
+#include "TargetConditionals.h"
+#if TARGET_OS_IPHONE
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <mach/machine.h>
+#define __IOS__ 1
+#endif
+#endif
+
+namespace ncnn {
+
+#ifdef __ANDROID__
+
+// Extract the ELF hardware-capability bitmap (AT_HWCAP) from /proc/self/auxv.
+// Returns 0 when the file cannot be read or carries no AT_HWCAP record.
+static unsigned int get_elf_hwcap_from_proc_self_auxv()
+{
+    FILE* fp = fopen("/proc/self/auxv", "rb");
+    if (!fp)
+        return 0;
+
+#define AT_HWCAP 16
+#define AT_HWCAP2 26
+
+    // An auxv record is a pair of native words (Elf32_auxv_t / Elf64_auxv_t):
+    // 2 x 4 bytes on 32-bit arm but 2 x 8 bytes on aarch64. unsigned long has
+    // the right width on both; fixed 32-bit fields misread 64-bit records.
+    struct { unsigned long tag; unsigned long value; } entry;
+
+    unsigned int result = 0;
+    while (fread((char*)&entry, sizeof(entry), 1, fp) == 1)
+    {
+        // the all-zero record terminates the vector
+        if (entry.tag == 0 && entry.value == 0)
+            break;
+
+        if (entry.tag == AT_HWCAP)
+        {
+            // every HWCAP_* bit tested in this file fits in the low 32 bits
+            result = (unsigned int)entry.value;
+            break;
+        }
+    }
+
+    fclose(fp);
+
+    return result;
+}
+
+static unsigned int g_hwcaps = get_elf_hwcap_from_proc_self_auxv();
+// bit masks applied to g_hwcaps by the cpu_support_arm_*() queries
+#if __aarch64__
+// values from arch/arm64/include/uapi/asm/hwcap.h
+#define HWCAP_ASIMD     (1 << 1)
+#define HWCAP_ASIMDHP   (1 << 10)
+#else
+// values from arch/arm/include/uapi/asm/hwcap.h
+#define HWCAP_NEON      (1 << 12)
+#define HWCAP_VFPv4     (1 << 16)
+#endif
+
+#endif // __ANDROID__
+
+#if __IOS__
+// sysctl "hw.cputype"; the return code is ignored, the result starts at 0
+static cpu_type_t get_hw_cputype()
+{
+    cpu_type_t cputype = 0;
+    size_t size = sizeof(cputype);
+    sysctlbyname("hw.cputype", &cputype, &size, NULL, 0);
+    return cputype;
+}
+// sysctl "hw.cpusubtype"; the return code is ignored, the result starts at 0
+static cpu_subtype_t get_hw_cpusubtype()
+{
+    cpu_subtype_t cpusubtype = 0;
+    size_t size = sizeof(cpusubtype);
+    sysctlbyname("hw.cpusubtype", &cpusubtype, &size, NULL, 0);
+    return cpusubtype;
+}
+static cpu_type_t g_hw_cputype = get_hw_cputype();
+static cpu_subtype_t g_hw_cpusubtype = get_hw_cpusubtype();
+#endif // __IOS__
+// Nonzero when NEON (armv7) or ASIMD (aarch64) is reported: android returns the raw masked hwcap bit, ios 0/1, other targets always 0.
+int cpu_support_arm_neon()
+{
+#ifdef __ANDROID__
+#if __aarch64__
+    return g_hwcaps & HWCAP_ASIMD;
+#else
+    return g_hwcaps & HWCAP_NEON;
+#endif
+#elif __IOS__
+#if __aarch64__
+    return g_hw_cputype == CPU_TYPE_ARM64;
+#else
+    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7; // NOTE(review): strict '>' excludes CPU_SUBTYPE_ARM_V7 itself - confirm intended
+#endif
+#else
+    return 0;
+#endif
+}
+// Nonzero when VFPv4 (fp16 + fma per cpu.h) is reported: android returns the raw masked hwcap bit, ios 0/1, other targets always 0.
+int cpu_support_arm_vfpv4()
+{
+#ifdef __ANDROID__
+#if __aarch64__
+    // on aarch64 the ASIMD bit is used: neon always enables fma and fp16
+    return g_hwcaps & HWCAP_ASIMD;
+#else
+    return g_hwcaps & HWCAP_VFPv4;
+#endif
+#elif __IOS__
+#if __aarch64__
+    return g_hw_cputype == CPU_TYPE_ARM64;
+#else
+    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S; // NOTE(review): strict '>' excludes CPU_SUBTYPE_ARM_V7S itself - confirm intended
+#endif
+#else
+    return 0;
+#endif
+}
+// aarch64 ASIMD half precision: only android/aarch64 reports it (raw masked HWCAP_ASIMDHP bit); every other branch returns 0.
+int cpu_support_arm_asimdhp()
+{
+#ifdef __ANDROID__
+#if __aarch64__
+    return g_hwcaps & HWCAP_ASIMDHP;
+#else
+    return 0;
+#endif
+#elif __IOS__
+#if __aarch64__
+    return 0;
+#else
+    return 0;
+#endif
+#else
+    return 0;
+#endif
+}
+
+// Count logical cpus: "processor" lines of /proc/cpuinfo on android,
+// sysctl hw.ncpu on ios, 1 elsewhere. Never returns less than 1.
+static int get_cpucount()
+{
+#ifdef __ANDROID__
+    // get cpu count from /proc/cpuinfo
+    FILE* fp = fopen("/proc/cpuinfo", "rb");
+    if (!fp)
+        return 1;
+
+    int count = 0;
+    char line[1024];
+    while (fgets(line, sizeof(line), fp))
+    {
+        // strncmp stops at the terminator, so a line shorter than 9 bytes is
+        // never compared against stale buffer contents (memcmp would be)
+        if (strncmp(line, "processor", 9) == 0)
+        {
+            count++;
+        }
+    }
+
+    fclose(fp);
+
+    if (count < 1)
+        count = 1;
+
+    return count;
+#elif __IOS__
+    int count = 0;
+    size_t len = sizeof(count);
+    sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
+
+    if (count < 1)
+        count = 1;
+
+    return count;
+#else
+    return 1;
+#endif
+}
+
+static int g_cpucount = get_cpucount();
+// public accessor for the count cached above during static initialization
+int get_cpu_count()
+{
+    return g_cpucount;
+}
+
+#ifdef __ANDROID__
+// Largest frequency (first column, presumably kHz) found in the cpufreq
+// time_in_state table of cpu `cpuid`; -1 when the table cannot be opened.
+static int get_max_freq_khz(int cpuid)
+{
+    // per-cpu statistics file; cpuid is substituted into the path
+    char statfile[256];
+    sprintf(statfile, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
+
+    FILE* fp = fopen(statfile, "rb");
+    if (fp == NULL)
+        return -1;
+
+    // rows are two integers; the second column is parsed and discarded
+    int best = 0;
+    int freq = 0;
+
+    while (fscanf(fp, "%d %*d", &freq) == 1)
+    {
+        if (freq > best)
+            best = freq;
+    }
+
+    fclose(fp);
+
+    return best;
+}
+
+static int set_sched_affinity(const std::vector<int>& cpuids)
+{
+    // Pin the calling thread to the cpus listed in cpuids; returns 0 on success, -1 when the syscall fails.
+    // local cpu_set_t definition, ref http://stackoverflow.com/questions/16319725/android-set-thread-affinity
+#define CPU_SETSIZE 1024
+#define __NCPUBITS  (8 * sizeof (unsigned long))
+typedef struct
+{
+   unsigned long __bits[CPU_SETSIZE / __NCPUBITS];
+} cpu_set_t;
+
+#define CPU_SET(cpu, cpusetp) \
+  ((cpusetp)->__bits[(cpu)/__NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS)))
+
+#define CPU_ZERO(cpusetp) \
+  memset((cpusetp), 0, sizeof(cpu_set_t))
+
+    // gettid() identifies the calling thread, so only that thread is re-pinned
+    pid_t pid = gettid();
+
+    cpu_set_t mask;
+    CPU_ZERO(&mask);
+    for (int i=0; i<(int)cpuids.size(); i++)
+    {
+        CPU_SET(cpuids[i], &mask); // NOTE(review): no bounds check - an id >= CPU_SETSIZE would index past __bits
+    }
+
+    int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
+    if (syscallret)
+    {
+        fprintf(stderr, "syscall error %d\n", syscallret);
+        return -1;
+    }
+
+    return 0;
+}
+// Refill cpuids with 0..n-1 ordered by descending max frequency and report in *little_cluster_offset where the slower cluster starts (0 = no split found). Always returns 0.
+static int sort_cpuid_by_max_frequency(std::vector<int>& cpuids, int* little_cluster_offset)
+{
+    const int cpu_count = cpuids.size();
+
+    *little_cluster_offset = 0;
+
+    if (cpu_count == 0)
+        return 0;
+
+    std::vector<int> cpu_max_freq_khz;
+    cpu_max_freq_khz.resize(cpu_count);
+
+    for (int i=0; i<cpu_count; i++)
+    {
+        int max_freq_khz = get_max_freq_khz(i);
+
+//         printf("%d max freq = %d khz\n", i, max_freq_khz);
+
+        cpuids[i] = i;
+        cpu_max_freq_khz[i] = max_freq_khz;
+    }
+
+    // sort cpuid as big core first
+    // simple exchange sort, the two arrays are permuted together
+    for (int i=0; i<cpu_count; i++)
+    {
+        for (int j=i+1; j<cpu_count; j++)
+        {
+            if (cpu_max_freq_khz[i] < cpu_max_freq_khz[j])
+            {
+                // swap
+                int tmp = cpuids[i];
+                cpuids[i] = cpuids[j];
+                cpuids[j] = tmp;
+
+                tmp = cpu_max_freq_khz[i];
+                cpu_max_freq_khz[i] = cpu_max_freq_khz[j];
+                cpu_max_freq_khz[j] = tmp;
+            }
+        }
+    }
+
+    // SMP: when the midpoint of fastest and slowest equals the slowest, no cluster split is reported
+    int mid_max_freq_khz = (cpu_max_freq_khz.front() + cpu_max_freq_khz.back()) / 2;
+    if (mid_max_freq_khz == cpu_max_freq_khz.back())
+        return 0;
+
+    for (int i=0; i<cpu_count; i++)
+    {
+        if (cpu_max_freq_khz[i] < mid_max_freq_khz)
+        {
+            *little_cluster_offset = i;
+            break;
+        }
+    }
+
+    return 0;
+}
+#endif // __ANDROID__
+
+static int g_powersave = 0;
+// last mode accepted by set_cpu_powersave(); stays 0 until a call succeeds
+int get_cpu_powersave()
+{
+    return g_powersave;
+}
+
+// Bind the worker threads to a cpu cluster (android only): 0 = all cores,
+// 1 = little cluster, 2 = big cluster. Returns 0 on success, -1 for an unknown
+// mode, a cluster mode on a cpu without clusters, a failed syscall, or off android.
+int set_cpu_powersave(int powersave)
+{
+#ifdef __ANDROID__
+    static std::vector<int> sorted_cpuids;
+    static int little_cluster_offset = 0;
+
+    if (sorted_cpuids.empty())
+    {
+        // 0 ~ g_cpucount
+        sorted_cpuids.resize(g_cpucount);
+        for (int i=0; i<g_cpucount; i++)
+            sorted_cpuids[i] = i;
+
+        // descent sort by max frequency
+        sort_cpuid_by_max_frequency(sorted_cpuids, &little_cluster_offset);
+    }
+
+    if (powersave < 0 || powersave > 2)
+    {
+        fprintf(stderr, "powersave %d not supported\n", powersave);
+        return -1;
+    }
+
+    // Only the cluster modes need a big/little split; mode 0 (all cores) is
+    // valid on any cpu and must not be refused on a uniform (SMP) one.
+    if (powersave != 0 && little_cluster_offset == 0)
+    {
+        fprintf(stderr, "SMP cpu powersave not supported\n");
+        return -1;
+    }
+
+    // prepare affinity cpuid
+    std::vector<int> cpuids;
+    if (powersave == 0)
+    {
+        cpuids = sorted_cpuids;
+    }
+    else if (powersave == 1)
+    {
+        cpuids = std::vector<int>(sorted_cpuids.begin() + little_cluster_offset, sorted_cpuids.end());
+    }
+    else
+    {
+        cpuids = std::vector<int>(sorted_cpuids.begin(), sorted_cpuids.begin() + little_cluster_offset);
+    }
+
+#ifdef _OPENMP
+    // set affinity for each thread
+    int num_threads = cpuids.size();
+    omp_set_num_threads(num_threads);
+    std::vector<int> ssarets(num_threads, 0);
+    #pragma omp parallel for
+    for (int i=0; i<num_threads; i++)
+    {
+        ssarets[i] = set_sched_affinity(cpuids);
+    }
+    for (int i=0; i<num_threads; i++)
+    {
+        if (ssarets[i] != 0)
+            return -1;
+    }
+#else
+    int ssaret = set_sched_affinity(cpuids);
+    if (ssaret != 0)
+        return -1;
+#endif
+
+    g_powersave = powersave;
+
+    return 0;
+#elif __IOS__
+    // thread affinity not supported on ios
+    return -1;
+#else
+    // TODO
+    return -1;
+#endif
+}
+// Wrapper over omp_get_num_threads() (size of the current team, i.e. 1 outside a parallel region); 1 without OpenMP.
+int get_omp_num_threads()
+{
+#ifdef _OPENMP
+    return omp_get_num_threads();
+#else
+    return 1;
+#endif
+}
+// Wrapper over omp_set_num_threads(); does nothing when built without OpenMP.
+void set_omp_num_threads(int num_threads)
+{
+#ifdef _OPENMP
+    omp_set_num_threads(num_threads);
+#else
+    (void)num_threads;
+#endif
+}
+// Wrapper over omp_get_dynamic(); returns 0 when built without OpenMP.
+int get_omp_dynamic()
+{
+#ifdef _OPENMP
+    return omp_get_dynamic();
+#else
+    return 0;
+#endif
+}
+// Wrapper over omp_set_dynamic(); does nothing when built without OpenMP.
+void set_omp_dynamic(int dynamic)
+{
+#ifdef _OPENMP
+    omp_set_dynamic(dynamic);
+#else
+    (void)dynamic;
+#endif
+}
+
+} // namespace ncnn
diff --git a/src/cpu.h b/src/cpu.h
new file mode 100644
index 00000000000..9209cec8a09
--- /dev/null
+++ b/src/cpu.h
@@ -0,0 +1,51 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_CPU_H
+#define NCNN_CPU_H
+
+namespace ncnn {
+
+// test optional cpu features
+// neon = armv7 neon or aarch64 asimd
+int cpu_support_arm_neon();
+// vfpv4 = armv7 fp16 + fma
+int cpu_support_arm_vfpv4();
+// asimdhp = aarch64 asimd half precision
+int cpu_support_arm_asimdhp();
+
+// cpu info
+int get_cpu_count();
+
+// bind all threads on little clusters if powersave enabled
+// affects HMP arch cpu like ARM big.LITTLE
+// only implemented on android at the moment
+// switching powersave is expensive and not thread-safe
+// 0 = all cores enabled(default)
+// 1 = only little clusters enabled
+// 2 = only big clusters enabled
+// return 0 if success for setter function
+int get_cpu_powersave();
+int set_cpu_powersave(int powersave);
+
+// misc function wrapper for openmp routines
+int get_omp_num_threads();
+void set_omp_num_threads(int num_threads);
+
+int get_omp_dynamic();
+void set_omp_dynamic(int dynamic);
+
+} // namespace ncnn
+
+#endif // NCNN_CPU_H
diff --git a/src/layer.cpp b/src/layer.cpp
new file mode 100644
index 00000000000..c7843074588
--- /dev/null
+++ b/src/layer.cpp
@@ -0,0 +1,130 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "layer.h"
+
+#include <stdio.h>
+#include <string.h>
+
+namespace ncnn {
+
+// both capability flags start false; subclasses opt in from their constructors
+Layer::Layer()
+    : one_blob_only(false),
+      support_inplace(false)
+{}
+
+// the base class does no explicit cleanup
+Layer::~Layer()
+{}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+int Layer::load_param(FILE* /*paramfp*/)
+{
+    return 0;
+}
+#endif // NCNN_STRING
+// default loaders read nothing from the stream and report success (0)
+int Layer::load_param_bin(FILE* /*paramfp*/)
+{
+    return 0;
+}
+
+int Layer::load_model(FILE* /*binfp*/)
+{
+    return 0;
+}
+#endif // NCNN_STDIO
+// memory-based counterparts: equally no-ops, the mem reference is not touched
+int Layer::load_param(const unsigned char*& /*mem*/)
+{
+    return 0;
+}
+
+int Layer::load_model(const unsigned char*& /*mem*/)
+{
+    return 0;
+}
+// The base class has no inference of its own: both forward() overloads return -1 unless overridden.
+int Layer::forward(const std::vector<Mat>& /*bottom_blobs*/, std::vector<Mat>& /*top_blobs*/) const
+{
+    return -1;
+}
+
+int Layer::forward(const Mat& /*bottom_blob*/, Mat& /*top_blob*/) const
+{
+    return -1;
+}
+// Default in-place inference: run forward() into temporaries, then overwrite the inputs (also when forward() fails).
+int Layer::forward_inplace(std::vector<Mat>& bottom_top_blobs) const
+{
+    std::vector<Mat> outputs;
+    const int status = forward(bottom_top_blobs, outputs);
+    bottom_top_blobs = outputs;
+    return status;
+}
+// Single-blob variant: run forward() into a temporary Mat, then overwrite the input (also when forward() fails).
+int Layer::forward_inplace(Mat& bottom_top_blob) const
+{
+    Mat output;
+    const int status = forward(bottom_top_blob, output);
+    bottom_top_blob = output;
+    return status;
+}
+
+#include "layer_declaration.h"
+// rows are pulled in from the build-generated layer_registry.h, one per ncnn_add_layer() call
+static const layer_registry_entry layer_registry[] =
+{
+#include "layer_registry.h"
+};
+// number of rows; valid layer indices are 0 .. count-1
+static const int layer_registry_entry_count = sizeof(layer_registry) / sizeof(layer_registry_entry);
+
+#if NCNN_STRING
+// Linear search of the registry by type name; logs and returns -1 if absent.
+int layer_to_index(const char* type)
+{
+    int index = 0;
+    while (index < layer_registry_entry_count)
+    {
+        if (strcmp(layer_registry[index].name, type) == 0)
+            return index;
+        index++;
+    }
+    fprintf(stderr, "layer %s not exists\n", type);
+    return -1;
+}
+#endif // NCNN_STRING
+
+// Instantiate the layer registered at `index`. Logs and returns 0 when the
+// index is outside the registry or the entry has no creator (layer disabled).
+Layer* create_layer(int index)
+{
+    const bool in_range = index >= 0 && index < layer_registry_entry_count;
+    if (!in_range)
+    {
+        fprintf(stderr, "layer index %d not exists\n", index);
+        return 0;
+    }
+    if (layer_registry[index].creator == 0)
+    {
+        fprintf(stderr, "layer index %d not enabled\n", index);
+        return 0;
+    }
+    return layer_registry[index].creator();
+}
+
+} // namespace ncnn
diff --git a/src/layer.h b/src/layer.h
new file mode 100644
index 00000000000..13bb2e5520f
--- /dev/null
+++ b/src/layer.h
@@ -0,0 +1,163 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_H
+#define NCNN_LAYER_H
+
+#include <stdio.h>
+#include <string>
+#include <vector>
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+class Layer
+{
+public:
+    // base constructor: clears one_blob_only and support_inplace
+    Layer();
+    // virtual destructor, so layers can be deleted through a Layer*
+    virtual ~Layer();
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // load layer specific parameter from plain param file
+    // return 0 if success (the base implementation reads nothing and returns 0)
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    // load layer specific parameter from binary param file
+    // return 0 if success (the base implementation reads nothing and returns 0)
+    virtual int load_param_bin(FILE* paramfp);
+
+    // load layer specific weight data from model file
+    // return 0 if success (the base implementation reads nothing and returns 0)
+    virtual int load_model(FILE* binfp);
+#endif // NCNN_STDIO
+
+    // load layer specific parameter from memory
+    // memory pointer is 32-bit aligned; it is taken by reference
+    // return 0 if success
+    virtual int load_param(const unsigned char*& mem);
+
+    // load layer specific weight data from memory
+    // memory pointer is 32-bit aligned; it is taken by reference
+    // return 0 if success
+    virtual int load_model(const unsigned char*& mem);
+
+public:
+    // one input and one output blob
+    bool one_blob_only;
+
+    // support inplace inference
+    bool support_inplace;
+
+public:
+    // implement inference
+    // return 0 if success (the base implementations return -1)
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+    // implement inplace inference (the base implementations delegate to forward())
+    // return 0 if success
+    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs) const;
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+
+public:
+#if NCNN_STRING
+    // layer type name
+    std::string type;
+    // layer name
+    std::string name;
+#endif // NCNN_STRING
+    // indices of the blobs this layer needs as input
+    std::vector<int> bottoms;
+    // indices of the blobs this layer produces as output
+    std::vector<int> tops;
+};
+
+namespace LayerType { // layer type ids; presumably passed to create_layer(), so they must follow the ncnn_add_layer() registration order
+enum
+{
+    AbsVal      = 0,
+    ArgMax      = 1,
+    BatchNorm   = 2,
+    Bias        = 3,
+    BNLL        = 4,
+    Concat      = 5,
+    Convolution = 6,
+    Crop        = 7,
+    Deconvolution = 8,
+    Dropout     = 9,
+    ELU         = 10,
+    Eltwise     = 11,
+    Embed       = 12,
+    Exp         = 13,
+    Flatten     = 14,
+    InnerProduct = 15,
+    Input       = 16,
+    Log         = 17,
+    LRN         = 18,
+    MemoryData  = 19,
+    MVN         = 20,
+    Pooling     = 21,
+    Power       = 22,
+    PReLU       = 23,
+    Proposal    = 24,
+    Reduction   = 25,
+    ReLU        = 26,
+    Reshape     = 27,
+    ROIPooling  = 28,
+    Scale       = 29,
+    Sigmoid     = 30,
+    Slice       = 31,
+    Softmax     = 32,
+    Split       = 33,
+    SPP         = 34,
+    TanH        = 35,
+    Threshold   = 36,
+    Tile        = 37,
+    RNN         = 38,
+    LSTM        = 39,
+    // NOTE(review): presumably a flag bit marking user-defined layer types - not used in the code shown here
+    CustomBit   = (1<<8),
+};
+} // namespace LayerType
+
+// layer factory function: takes no arguments and returns a new Layer
+typedef Layer* (*layer_creator_func)();
+// one row of the layer_registry table defined in layer.cpp
+struct layer_registry_entry
+{
+#if NCNN_STRING
+    // layer type name
+    const char* name;
+#endif // NCNN_STRING
+    // layer factory entry; null for a layer that was not built in
+    layer_creator_func creator;
+};
+
+#if NCNN_STRING
+// get layer type index from type name, -1 if the name is unknown
+int layer_to_index(const char* type);
+#endif // NCNN_STRING
+// create layer from layer type index, 0 if the index is invalid or the layer is disabled
+Layer* create_layer(int index);
+// defines <name>_layer_creator(), which returns a heap-allocated <name>
+#define DEFINE_LAYER_CREATOR(name) \
+    Layer* name##_layer_creator() { return new name; }
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_H
diff --git a/src/layer/absval.cpp b/src/layer/absval.cpp
new file mode 100644
index 00000000000..ed8f7e66a5f
--- /dev/null
+++ b/src/layer/absval.cpp
@@ -0,0 +1,76 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "absval.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(AbsVal)
+
+AbsVal::AbsVal()
+{
+    one_blob_only = true; // NOTE(review): presumably selects the single-input forward overloads - confirm in Layer
+    support_inplace = true; // NOTE(review): presumably allows callers to use forward_inplace - confirm in Layer
+}
+
+int AbsVal::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    // Stores the absolute value of every element of bottom_blob in top_blob.
+    // Returns 0 on success, -100 when top_blob is empty after create().
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    int size = w * h;
+
+    top_blob.create(w, h, channels);
+    if (top_blob.empty())
+        return -100;
+
+    // each channel is processed independently of the others
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float* ptr = bottom_blob.channel(q);
+        float* outptr = top_blob.channel(q);
+
+        for (int i=0; i<size; i++)
+            outptr[i] = ptr[i] < 0 ? -ptr[i] : ptr[i];
+    }
+
+    return 0;
+}
+
+int AbsVal::forward_inplace(Mat& bottom_top_blob) const
+{
+    // Negates every negative element of bottom_top_blob in place; always returns 0.
+    const int channels = bottom_top_blob.c;
+    const int size = bottom_top_blob.w * bottom_top_blob.h;
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        float* data = bottom_top_blob.channel(q);
+
+        for (int i = 0; i < size; i++)
+        {
+            const float v = data[i];
+            if (v < 0)
+                data[i] = -v;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/absval.h b/src/layer/absval.h
new file mode 100644
index 00000000000..28fdceb51e1
--- /dev/null
+++ b/src/layer/absval.h
@@ -0,0 +1,36 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_ABSVAL_H
+#define LAYER_ABSVAL_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// Layer that takes the element-wise absolute value of its input.
+class AbsVal : public Layer
+{
+public:
+    AbsVal();
+
+    // writes the absolute values of bottom_blob into top_blob
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    // replaces each negative element of bottom_top_blob by its negation
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_ABSVAL_H
diff --git a/src/layer/argmax.cpp b/src/layer/argmax.cpp
new file mode 100644
index 00000000000..a0fb82fea66
--- /dev/null
+++ b/src/layer/argmax.cpp
@@ -0,0 +1,108 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "argmax.h"
+#include <algorithm>
+#include <functional>
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(ArgMax)
+
+ArgMax::ArgMax() : out_max_val(0), topk(1) // defined defaults (indices only, top 1) until a load_param overwrites them
+{
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+int ArgMax::load_param(FILE* paramfp)
+{
+    // Text form: two whitespace-separated ints, out_max_val first and topk second.
+    // Returns 0 when both were parsed; otherwise logs to stderr and returns -1.
+    const int nscan = fscanf(paramfp, "%d %d", &out_max_val, &topk);
+    if (nscan == 2)
+        return 0;
+
+    fprintf(stderr, "ArgMax load_param failed %d\n", nscan);
+    return -1;
+}
+#endif // NCNN_STRING
+int ArgMax::load_param_bin(FILE* paramfp)
+{
+    // two raw ints, out_max_val then topk; a short read (truncated file) returns -1
+    if (fread(&out_max_val, sizeof(int), 1, paramfp) != 1
+            || fread(&topk, sizeof(int), 1, paramfp) != 1)
+        return -1;
+    return 0;
+}
+#endif // NCNN_STDIO
+
+int ArgMax::load_param(const unsigned char*& mem)
+{
+    // Reads out_max_val as an int at mem and topk as an int at mem + 4, then advances mem by 8.
+    const unsigned char* cursor = mem;
+    out_max_val = *(int*)(cursor);
+    topk = *(int*)(cursor + 4);
+    mem = cursor + 8;
+
+    return 0;
+}
+
+int ArgMax::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    // Picks the topk largest elements of bottom_blob, read as one flat float array.
+    // Output floats: topk indices, or topk values followed by topk indices when out_max_val is set.
+    // Returns 0 on success, -1 if topk is outside [0, size], -100 if top_blob is empty after create().
+    int size = bottom_blob.total();
+
+    // vec.begin() + topk below must not leave [begin, end]
+    if (topk < 0 || topk > size)
+        return -1;
+
+    top_blob.create(topk, out_max_val ? 2 : 1);
+    if (top_blob.empty())
+        return -100;
+
+    const float* ptr = bottom_blob;
+
+    // pair every value with its flat index so the index survives the sort
+    std::vector< std::pair<float, int> > vec;
+    vec.resize(size);
+    for (int i=0; i<size; i++)
+        vec[i] = std::make_pair(ptr[i], i);
+
+    // only the first topk entries get ordered (descending by value, then index)
+    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
+                        std::greater< std::pair<float, int> >());
+
+    float* outptr = top_blob;
+    if (out_max_val)
+    {
+        float* valptr = outptr + topk;
+        for (int i=0; i<topk; i++)
+        {
+            outptr[i] = vec[i].first;
+            valptr[i] = vec[i].second;
+        }
+    }
+    else
+    {
+        for (int i=0; i<topk; i++)
+            outptr[i] = vec[i].second;
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/argmax.h b/src/layer/argmax.h
new file mode 100644
index 00000000000..2b7086fba2b
--- /dev/null
+++ b/src/layer/argmax.h
@@ -0,0 +1,44 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_ARGMAX_H
+#define LAYER_ARGMAX_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+class ArgMax : public Layer
+{
+public:
+    ArgMax();
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    virtual int load_param(FILE* paramfp); // text parameters, parsed with fscanf
+#endif // NCNN_STRING
+    virtual int load_param_bin(FILE* paramfp); // binary parameters, read with fread
+#endif // NCNN_STDIO
+    virtual int load_param(const unsigned char*& mem); // in-memory parameters; advances mem past them
+
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+public:
+    int out_max_val; // non-zero: forward also outputs the selected values
+    int topk; // how many of the largest elements forward selects
+};
+
+} // namespace ncnn
+
+#endif // LAYER_ARGMAX_H
diff --git a/src/layer/arm/absval_arm.cpp b/src/layer/arm/absval_arm.cpp
new file mode 100644
index 00000000000..db487ef8f35
--- /dev/null
+++ b/src/layer/arm/absval_arm.cpp
@@ -0,0 +1,152 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "absval_arm.h"
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(AbsVal_arm) // factory function for this layer class
+
+int AbsVal_arm::forward(const Mat& bottom_blob, Mat& top_blob) const // top_blob = |bottom_blob|, element-wise
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    int size = w * h; // element count handled per channel
+
+    top_blob.create(w, h, channels);
+    if (top_blob.empty())
+        return -100; // output is empty after create()
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float* ptr = bottom_blob.channel(q);
+        float* outptr = top_blob.channel(q);
+
+#if __ARM_NEON
+        int nn = size >> 2; // number of 4-float vector steps
+        int remain = size - (nn << 2); // elements left for the scalar tail
+#else
+        int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+        for (; nn>0; nn--)
+        {
+            float32x4_t _p = vld1q_f32(ptr);
+            float32x4_t _outp = vabsq_f32(_p);
+            vst1q_f32(outptr, _outp);
+
+            ptr += 4;
+            outptr += 4;
+        }
+#else
+        if (nn > 0) // the asm loop decrements before testing, so it must not start with nn == 0
+        {
+        asm volatile(
+            "0:                             \n"
+            "vld1.f32   {d0-d1}, [%1]!      \n"
+            "vabs.f32   q0, q0              \n"
+            "subs       %0, #1              \n"
+            "vst1.f32   {d0-d1}, [%2]!      \n"
+            "bne        0b                  \n"
+            : "=r"(nn),     // %0
+              "=r"(ptr),    // %1
+              "=r"(outptr)  // %2
+            : "0"(nn),
+              "1"(ptr),
+              "2"(outptr)
+            : "cc", "memory", "q0"
+        );
+        }
+#endif // __aarch64__
+#endif // __ARM_NEON
+        for (; remain>0; remain--)
+        {
+            *outptr = *ptr > 0 ? *ptr : -*ptr; // NOTE(review): under strict IEEE rules +0.0f becomes -0.0f here - confirm intended
+
+            ptr++;
+            outptr++;
+        }
+    }
+
+    return 0;
+}
+
+int AbsVal_arm::forward_inplace(Mat& bottom_top_blob) const // replaces each element by its absolute value
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int channels = bottom_top_blob.c;
+    int size = w * h; // element count handled per channel
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        float* ptr = bottom_top_blob.channel(q);
+
+#if __ARM_NEON
+        int nn = size >> 2; // number of 4-float vector steps
+        int remain = size - (nn << 2); // elements left for the scalar tail
+#else
+        int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+        for (; nn>0; nn--)
+        {
+            float32x4_t _p = vld1q_f32(ptr);
+            _p = vabsq_f32(_p);
+            vst1q_f32(ptr, _p);
+
+            ptr += 4;
+        }
+#else
+        if (nn > 0) // the asm loop decrements before testing, so it must not start with nn == 0
+        {
+        asm volatile(
+            "0:                             \n"
+            "vld1.f32   {d0-d1}, [%1]       \n"
+            "vabs.f32   q0, q0              \n"
+            "subs       %0, #1              \n"
+            "vst1.f32   {d0-d1}, [%1]!      \n"
+            "bne        0b                  \n"
+            : "=r"(nn),     // %0
+              "=r"(ptr)     // %1
+            : "0"(nn),
+              "1"(ptr)
+            : "cc", "memory", "q0"
+        );
+        }
+#endif // __aarch64__
+#endif // __ARM_NEON
+        for (; remain>0; remain--)
+        {
+            *ptr = *ptr > 0 ? *ptr : -*ptr; // NOTE(review): under strict IEEE rules +0.0f becomes -0.0f here - confirm intended
+
+            ptr++;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/absval_arm.h b/src/layer/arm/absval_arm.h
new file mode 100644
index 00000000000..787da11af91
--- /dev/null
+++ b/src/layer/arm/absval_arm.h
@@ -0,0 +1,34 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_ABSVAL_ARM_H
+#define LAYER_ABSVAL_ARM_H
+
+#include "absval.h"
+
+namespace ncnn {
+
+// AbsVal with ARM-specific overrides of both forward passes.
+class AbsVal_arm : public AbsVal
+{
+public:
+    // writes the absolute values of bottom_blob into top_blob
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    // takes absolute values of bottom_top_blob in place
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_ABSVAL_ARM_H
diff --git a/src/layer/arm/batchnorm_arm.cpp b/src/layer/arm/batchnorm_arm.cpp
new file mode 100644
index 00000000000..0469410a0a6
--- /dev/null
+++ b/src/layer/arm/batchnorm_arm.cpp
@@ -0,0 +1,186 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "batchnorm_arm.h"
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(BatchNorm_arm) // factory function for this layer class
+
+int BatchNorm_arm::forward(const Mat& bottom_blob, Mat& top_blob) const // top_blob = b * bottom_blob + a, per channel
+{
+    // a = bias - slope * mean / sqrt(var)
+    // b = slope / sqrt(var)
+    // value = b * value + a   (a and b are read per channel from a_data / b_data below)
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int size = w * h; // element count handled per channel
+
+    top_blob.create(w, h, channels); // channels is not a local; presumably a BatchNorm member - confirm
+    if (top_blob.empty())
+        return -100; // output is empty after create()
+
+    const float* a_data_ptr = a_data;
+    const float* b_data_ptr = b_data;
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float* ptr = bottom_blob.channel(q);
+        float* outptr = top_blob.channel(q);
+
+        float a = a_data_ptr[q];
+        float b = b_data_ptr[q];
+
+#if __ARM_NEON
+        int nn = size >> 2; // number of 4-float vector steps
+        int remain = size - (nn << 2); // elements left for the scalar tail
+#else
+        int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+        float32x4_t _a = vdupq_n_f32(a);
+        float32x4_t _b = vdupq_n_f32(b);
+        for (; nn>0; nn--)
+        {
+            float32x4_t _p = vld1q_f32(ptr);
+            float32x4_t _outp = _a;
+            _outp = vfmaq_f32(_outp, _p, _b);
+            vst1q_f32(outptr, _outp);
+
+            ptr += 4;
+            outptr += 4;
+        }
+#else
+        if (nn > 0) // asm loop tests after decrementing; NOTE(review): the :128 hints assume 16-byte aligned channels - confirm
+        {
+        asm volatile(
+            "vdup.f32   q1, %6              \n"
+            "vdup.f32   q2, %7              \n"
+            "0:                             \n"
+            "pld        [%1, #128]          \n"
+            "vld1.f32   {d0-d1}, [%1 :128]! \n"
+            "vorr.32    q3, q1, q1          \n"
+            "vmla.f32   q3, q0, q2          \n"
+            "subs       %0, #1              \n"
+            "vst1.f32   {d6-d7}, [%2 :128]! \n"
+            "bne        0b                  \n"
+            : "=r"(nn),     // %0
+              "=r"(ptr),    // %1
+              "=r"(outptr)  // %2
+            : "0"(nn),
+              "1"(ptr),
+              "2"(outptr),
+              "r"(a),       // %6
+              "r"(b)        // %7
+            : "cc", "memory", "q0", "q1", "q2", "q3"
+        );
+        }
+#endif // __aarch64__
+#endif // __ARM_NEON
+        for (; remain>0; remain--)
+        {
+            *outptr = b * *ptr + a;
+
+            ptr++;
+            outptr++;
+        }
+    }
+
+    return 0;
+}
+
+int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob) const // value = b * value + a, written back in place
+{
+    // a = bias - slope * mean / sqrt(var)
+    // b = slope / sqrt(var)
+    // value = b * value + a   (a and b are read per channel from a_data / b_data below)
+
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int size = w * h; // element count handled per channel
+
+    const float* a_data_ptr = a_data;
+    const float* b_data_ptr = b_data;
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++) // channels is not a local; presumably a BatchNorm member - confirm
+    {
+        float* ptr = bottom_top_blob.channel(q);
+
+        float a = a_data_ptr[q];
+        float b = b_data_ptr[q];
+
+#if __ARM_NEON
+        int nn = size >> 2; // number of 4-float vector steps
+        int remain = size - (nn << 2); // elements left for the scalar tail
+#else
+        int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+        float32x4_t _a = vdupq_n_f32(a);
+        float32x4_t _b = vdupq_n_f32(b);
+        for (; nn>0; nn--)
+        {
+            float32x4_t _p = vld1q_f32(ptr);
+            float32x4_t _outp = _a;
+            _outp = vfmaq_f32(_outp, _p, _b);
+            vst1q_f32(ptr, _outp);
+
+            ptr += 4;
+        }
+#else
+        if (nn > 0) // asm loop tests after decrementing; NOTE(review): the :128 hints assume 16-byte aligned channels - confirm
+        {
+        asm volatile(
+            "vdup.f32   q1, %4              \n"
+            "vdup.f32   q2, %5              \n"
+            "0:                             \n"
+            "pld        [%1, #128]          \n"
+            "vld1.f32   {d0-d1}, [%1 :128]  \n"
+            "vorr.32    q3, q1, q1          \n"
+            "vmla.f32   q3, q0, q2          \n"
+            "subs       %0, #1              \n"
+            "vst1.f32   {d6-d7}, [%1 :128]! \n"
+            "bne        0b                  \n"
+            : "=r"(nn),     // %0
+              "=r"(ptr)     // %1
+            : "0"(nn),
+              "1"(ptr),
+              "r"(a),       // %4
+              "r"(b)        // %5
+            : "cc", "memory", "q0", "q1", "q2", "q3"
+        );
+        }
+#endif // __aarch64__
+#endif // __ARM_NEON
+        for (; remain>0; remain--)
+        {
+            *ptr = b * *ptr + a;
+
+            ptr++;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/batchnorm_arm.h b/src/layer/arm/batchnorm_arm.h
new file mode 100644
index 00000000000..448b5a49834
--- /dev/null
+++ b/src/layer/arm/batchnorm_arm.h
@@ -0,0 +1,32 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_BATCHNORM_ARM_H
+#define LAYER_BATCHNORM_ARM_H
+
+#include "batchnorm.h"
+
+namespace ncnn {
+
+class BatchNorm_arm : public BatchNorm // BatchNorm with ARM-specific forward passes
+{
+public:
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; // result goes to top_blob
+
+    virtual int forward_inplace(Mat& bottom_top_blob) const; // result overwrites the input
+};
+
+} // namespace ncnn
+
+#endif // LAYER_BATCHNORM_ARM_H
diff --git a/src/layer/arm/bias_arm.cpp b/src/layer/arm/bias_arm.cpp
new file mode 100644
index 00000000000..e32e8f39652
--- /dev/null
+++ b/src/layer/arm/bias_arm.cpp
@@ -0,0 +1,122 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "bias_arm.h"
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Bias_arm) // factory function for this layer class
+
+int Bias_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    // Adds the per-channel bias to bottom_blob and stores the sums in top_blob.
+    // Returns 0 on success, -100 when top_blob is empty after create().
+    const int w = bottom_blob.w;
+    const int h = bottom_blob.h;
+    const int channels = bottom_blob.c;
+    const int size = w * h;
+
+    top_blob.create(w, h, channels);
+    if (top_blob.empty())
+        return -100;
+
+    const float* bias_ptr = bias_data;
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float* src = bottom_blob.channel(q);
+        float* dst = top_blob.channel(q);
+        const float bias = bias_ptr[q];
+
+        // elements of this channel still to be processed
+        int remain = size;
+
+#if __ARM_NEON
+        // vector part: nn steps of four floats each
+        int nn = size >> 2;
+        remain -= nn << 2;
+
+        const float32x4_t _bias = vdupq_n_f32(bias);
+        while (nn-- > 0)
+        {
+            const float32x4_t _p = vld1q_f32(src);
+            const float32x4_t _sum = vaddq_f32(_p, _bias);
+            vst1q_f32(dst, _sum);
+            src += 4;
+            dst += 4;
+        }
+#endif // __ARM_NEON
+
+        // scalar part: the whole channel without NEON, the tail otherwise
+        while (remain-- > 0)
+        {
+            *dst = *src + bias;
+            src++;
+            dst++;
+        }
+    }
+
+    return 0;
+}
+
+int Bias_arm::forward_inplace(Mat& bottom_top_blob) const
+{
+    // Adds the per-channel bias to every element of bottom_top_blob in place.
+    // Always returns 0.
+    const int w = bottom_top_blob.w;
+    const int h = bottom_top_blob.h;
+    const int channels = bottom_top_blob.c;
+    const int size = w * h;
+
+    const float* bias_ptr = bias_data;
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        float* data = bottom_top_blob.channel(q);
+        const float bias = bias_ptr[q];
+
+        // elements of this channel still to be processed
+        int remain = size;
+
+#if __ARM_NEON
+        // vector part: nn steps of four floats each
+        int nn = size >> 2;
+        remain -= nn << 2;
+
+        const float32x4_t _bias = vdupq_n_f32(bias);
+        while (nn-- > 0)
+        {
+            const float32x4_t _p = vld1q_f32(data);
+            const float32x4_t _sum = vaddq_f32(_p, _bias);
+            vst1q_f32(data, _sum);
+            data += 4;
+        }
+#endif // __ARM_NEON
+
+        // scalar part: the whole channel without NEON, the tail otherwise
+        while (remain-- > 0)
+        {
+            *data = *data + bias;
+            data++;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/bias_arm.h b/src/layer/arm/bias_arm.h
new file mode 100644
index 00000000000..27f13f8ea2e
--- /dev/null
+++ b/src/layer/arm/bias_arm.h
@@ -0,0 +1,32 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_BIAS_ARM_H
+#define LAYER_BIAS_ARM_H
+
+#include "bias.h"
+
+namespace ncnn {
+
+class Bias_arm : public Bias // Bias with ARM-specific forward passes
+{
+public:
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; // result goes to top_blob
+
+    virtual int forward_inplace(Mat& bottom_top_blob) const; // result overwrites the input
+};
+
+} // namespace ncnn
+
+#endif // LAYER_BIAS_ARM_H
diff --git a/src/layer/arm/convolution_1x1.h b/src/layer/arm/convolution_1x1.h
new file mode 100644
index 00000000000..32778526bad
--- /dev/null
+++ b/src/layer/arm/convolution_1x1.h
@@ -0,0 +1,543 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    // 1x1 convolution, stride 1: top_blob[p] = bias[p] + sum over q of kernel[p*inch + q] * bottom_blob[q].
+    // top_blob is not created here; its existing w/h/c define the output extent.
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+
+    #pragma omp parallel for
+    for (int p=0; p<outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+
+        const float bias0 = bias ? bias[p] : 0.f; // a null bias pointer means "no bias"
+
+        out.fill(bias0);
+
+        int q = 0;
+
+        for (; q+3<inch; q+=4) // accumulate four input channels per pass over the output
+        {
+            float* outptr = out;
+
+            const float* img0 = bottom_blob.channel(q);
+            const float* img1 = bottom_blob.channel(q+1);
+            const float* img2 = bottom_blob.channel(q+2);
+            const float* img3 = bottom_blob.channel(q+3);
+
+            const float* kernel0 = kernel + p*inch  + q;
+            const float k0 = kernel0[0];
+            const float k1 = kernel0[1];
+            const float k2 = kernel0[2];
+            const float k3 = kernel0[3];
+
+            const float* r0 = img0;
+            const float* r1 = img1;
+            const float* r2 = img2;
+            const float* r3 = img3;
+
+            int size = outw * outh;
+
+#if __ARM_NEON
+            int nn = size >> 3; // number of 8-float vector steps
+            int remain = size & 7; // elements left for the scalar tail
+#else
+            int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+            float32x4_t _k0 = vdupq_n_f32(k0);
+            float32x4_t _k1 = vdupq_n_f32(k1);
+            float32x4_t _k2 = vdupq_n_f32(k2);
+            float32x4_t _k3 = vdupq_n_f32(k3);
+#if __aarch64__
+            for (; nn>0; nn--)
+            {
+                float32x4_t _p = vld1q_f32(r0);
+                float32x4_t _pn = vld1q_f32(r0+4);
+
+                float32x4_t _outp = vld1q_f32(outptr);
+                float32x4_t _outpn = vld1q_f32(outptr+4);
+
+                _outp = vfmaq_f32(_outp, _p, _k0);
+                _outpn = vfmaq_f32(_outpn, _pn, _k0);
+
+                float32x4_t _p1 = vld1q_f32(r1);
+                float32x4_t _p1n = vld1q_f32(r1+4);
+
+                _outp = vfmaq_f32(_outp, _p1, _k1);
+                _outpn = vfmaq_f32(_outpn, _p1n, _k1);
+
+                float32x4_t _p2 = vld1q_f32(r2);
+                float32x4_t _p2n = vld1q_f32(r2+4);
+
+                _outp = vfmaq_f32(_outp, _p2, _k2);
+                _outpn = vfmaq_f32(_outpn, _p2n, _k2);
+
+                float32x4_t _p3 = vld1q_f32(r3);
+                float32x4_t _p3n = vld1q_f32(r3+4);
+
+                _outp = vfmaq_f32(_outp, _p3, _k3);
+                _outpn = vfmaq_f32(_outpn, _p3n, _k3);
+
+                vst1q_f32(outptr, _outp);
+                vst1q_f32(outptr+4, _outpn);
+
+                r0 += 8;
+                r1 += 8;
+                r2 += 8;
+                r3 += 8;
+                outptr += 8;
+            }
+#else
+            if (nn > 0) // NOTE(review): the asm loads r0 one 32-byte step ahead (undone by the final sub) - confirm the over-read is safe
+            {
+            asm volatile(
+                "pld        [%2, #256]          \n"
+                "vld1.f32   {d4-d7}, [%2 :128]! \n"
+                "0:                             \n"
+                "pld        [%1, #256]          \n"
+                "vld1.f32   {d0-d3}, [%1 :128]  \n"
+                "vmla.f32   q0, q2, %q12        \n"
+                "vmla.f32   q1, q3, %q12        \n"
+                "pld        [%3, #256]          \n"
+                "vld1.f32   {d4-d7}, [%3 :128]! \n"
+                "vmla.f32   q0, q2, %q13        \n"
+                "vmla.f32   q1, q3, %q13        \n"
+                "pld        [%4, #256]          \n"
+                "vld1.f32   {d4-d7}, [%4 :128]! \n"
+                "vmla.f32   q0, q2, %q14        \n"
+                "vmla.f32   q1, q3, %q14        \n"
+                "pld        [%5, #256]          \n"
+                "vld1.f32   {d4-d7}, [%5 :128]! \n"
+                "vmla.f32   q0, q2, %q15        \n"
+                "vmla.f32   q1, q3, %q15        \n"
+                "pld        [%2, #256]          \n"
+                "vld1.f32   {d4-d7}, [%2 :128]! \n"
+                "subs       %0, #1              \n"
+                "vst1.f32   {d0-d3}, [%1 :128]! \n"
+                "bne        0b                  \n"
+                "sub        %2, #32             \n"
+                : "=r"(nn),     // %0
+                  "=r"(outptr), // %1
+                  "=r"(r0),     // %2
+                  "=r"(r1),     // %3
+                  "=r"(r2),     // %4
+                  "=r"(r3)      // %5
+                : "0"(nn),
+                  "1"(outptr),
+                  "2"(r0),
+                  "3"(r1),
+                  "4"(r2),
+                  "5"(r3),
+                  "w"(_k0),     // %12
+                  "w"(_k1),     // %13
+                  "w"(_k2),     // %14
+                  "w"(_k3)      // %15
+                : "cc", "memory", "q0", "q1", "q2", "q3"
+            );
+            }
+#endif // __aarch64__
+#endif // __ARM_NEON
+            for (; remain>0; remain--)
+            {
+                float sum = *r0 * k0;
+                float sum1 = *r1 * k1;
+                float sum2 = *r2 * k2;
+                float sum3 = *r3 * k3;
+
+                *outptr += sum + sum1 + sum2 + sum3;
+
+                r0++;
+                r1++;
+                r2++;
+                r3++;
+                outptr++;
+            }
+
+        }
+
+        for (; q<inch; q++) // input channels left over after the groups of four
+        {
+            float* outptr = out;
+
+            const float* img0 = bottom_blob.channel(q);
+
+            const float* kernel0 = kernel + p*inch  + q;
+            const float k0 = kernel0[0];
+
+            const float* r0 = img0;
+
+            int size = outw * outh;
+
+#if __ARM_NEON
+            int nn = size >> 3; // number of 8-float vector steps
+            int remain = size & 7; // elements left for the scalar tail
+#else
+            int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+            float32x4_t _k0 = vdupq_n_f32(k0);
+#if __aarch64__
+            for (; nn>0; nn--)
+            {
+                float32x4_t _p = vld1q_f32(r0);
+                float32x4_t _outp = vld1q_f32(outptr);
+
+                float32x4_t _pn = vld1q_f32(r0+4);
+                float32x4_t _outpn = vld1q_f32(outptr+4);
+
+                _outp = vfmaq_f32(_outp, _p, _k0);
+                _outpn = vfmaq_f32(_outpn, _pn, _k0);
+
+                vst1q_f32(outptr, _outp);
+                vst1q_f32(outptr+4, _outpn);
+
+                r0 += 8;
+                outptr += 8;
+            }
+#else
+            if (nn > 0) // NOTE(review): same one-step-ahead r0 load as in the four-channel loop - confirm the over-read is safe
+            {
+            asm volatile(
+                "pld        [%2, #256]          \n"
+                "vld1.f32   {d4-d7}, [%2 :128]! \n"
+                "0:                             \n"
+                "pld        [%1, #256]          \n"
+                "vld1.f32   {d0-d3}, [%1 :128]  \n"
+                "vmla.f32   q0, q2, %q6         \n"
+                "vmla.f32   q1, q3, %q6         \n"
+                "pld        [%2, #256]          \n"
+                "vld1.f32   {d4-d7}, [%2 :128]! \n"
+                "subs       %0, #1              \n"
+                "vst1.f32   {d0-d3}, [%1 :128]! \n"
+                "bne        0b                  \n"
+                "sub        %2, #32             \n"
+                : "=r"(nn),     // %0
+                  "=r"(outptr), // %1
+                  "=r"(r0)      // %2
+                : "0"(nn),
+                  "1"(outptr),
+                  "2"(r0),
+                  "w"(_k0)      // %6
+                : "cc", "memory", "q0", "q1", "q2", "q3"
+            );
+            }
+#endif // __aarch64__
+#endif // __ARM_NEON
+            for (; remain>0; remain--)
+            {
+                float sum = *r0 * k0;
+
+                *outptr += sum;
+
+                r0++;
+                outptr++;
+            }
+
+        }
+    }
+
+}
+
+static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    // 1x1 convolution with stride 2: out[p](y,x) = bias[p] + sum over q of kernel[p*inch+q] * in[q](2y,2x).
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+    // The output extent is taken from top_blob as passed in; nothing is allocated here.
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+    // After one output row the input pointers have advanced 2*outw floats: skip the rest of that row plus the next row.
+    const int tailstep = w - 2*outw + w;
+    // A null bias pointer is treated as "no bias" (0.f) below.
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+    // Each p iteration only writes through out = top_blob.channel(p).
+    #pragma omp parallel for
+    for (int p=0; p<outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+
+        const float bias0 = bias ? bias[p] : 0.f;
+        // Seed the channel with the bias; the loops below accumulate into it.
+        out.fill(bias0);
+
+        int q = 0;
+        // Main pass: four input channels are folded into the output per sweep.
+        for (; q+3<inch; q+=4)
+        {
+            float* outptr = out;
+
+            const float* img0 = bottom_blob.channel(q);
+            const float* img1 = bottom_blob.channel(q+1);
+            const float* img2 = bottom_blob.channel(q+2);
+            const float* img3 = bottom_blob.channel(q+3);
+
+            const float* kernel0 = kernel + p*inch  + q;
+            const float k0 = kernel0[0];
+            const float k1 = kernel0[1];
+            const float k2 = kernel0[2];
+            const float k3 = kernel0[3];
+
+            const float* r0 = img0;
+            const float* r1 = img1;
+            const float* r2 = img2;
+            const float* r3 = img3;
+
+            for (int i = 0; i < outh; i++)
+            {
+#if __ARM_NEON
+                int nn = outw >> 3; // number of 8-output vector blocks in this row
+                int remain = outw & 7; // leftover outputs handled by the scalar loop
+#else
+                int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+                float32x4_t _k0 = vdupq_n_f32(k0);
+                float32x4_t _k1 = vdupq_n_f32(k1);
+                float32x4_t _k2 = vdupq_n_f32(k2);
+                float32x4_t _k3 = vdupq_n_f32(k3);
+#if __aarch64__
+                for (; nn>0; nn--)
+                {
+                    float32x4x2_t _px2 = vld2q_f32(r0); // de-interleave: val[0] = even samples, val[1] = odd
+                    float32x4_t _p = _px2.val[0]; // stride 2 keeps only the even samples
+                    float32x4_t _outp = vld1q_f32(outptr);
+
+                    float32x4x2_t _pnx2 = vld2q_f32(r0+8);
+                    float32x4_t _pn = _pnx2.val[0];
+                    float32x4_t _outpn = vld1q_f32(outptr+4);
+
+                    _outp = vmlaq_f32(_outp, _p, _k0);
+                    _outpn = vmlaq_f32(_outpn, _pn, _k0);
+
+                    float32x4x2_t _p1x2 = vld2q_f32(r1);
+                    float32x4_t _p1 = _p1x2.val[0];
+                    float32x4x2_t _p1nx2 = vld2q_f32(r1+8);
+                    float32x4_t _p1n = _p1nx2.val[0];
+
+                    _outp = vmlaq_f32(_outp, _p1, _k1);
+                    _outpn = vmlaq_f32(_outpn, _p1n, _k1);
+
+                    float32x4x2_t _p2x2 = vld2q_f32(r2);
+                    float32x4_t _p2 = _p2x2.val[0];
+                    float32x4x2_t _p2nx2 = vld2q_f32(r2+8);
+                    float32x4_t _p2n = _p2nx2.val[0];
+
+                    _outp = vmlaq_f32(_outp, _p2, _k2);
+                    _outpn = vmlaq_f32(_outpn, _p2n, _k2);
+
+                    float32x4x2_t _p3x2 = vld2q_f32(r3);
+                    float32x4_t _p3 = _p3x2.val[0];
+                    float32x4x2_t _p3nx2 = vld2q_f32(r3+8);
+                    float32x4_t _p3n = _p3nx2.val[0];
+
+                    _outp = vmlaq_f32(_outp, _p3, _k3);
+                    _outpn = vmlaq_f32(_outpn, _p3n, _k3);
+                    // Write each accumulator back to the slot it was loaded from (outptr and outptr+4).
+                    vst1q_f32(outptr, _outp);
+                    vst1q_f32(outptr+4, _outpn);
+
+                    r0 += 16;
+                    r1 += 16;
+                    r2 += 16;
+                    r3 += 16;
+                    outptr += 8;
+                }
+#else
+                if (nn > 0) // NOTE(review): the asm preloads 16 floats of r0 past the last block it consumes - confirm the input buffer tolerates that read
+                {
+                asm volatile(
+                    "pld        [%2, #512]          \n"
+                    "vld2.f32   {d4-d7}, [%2]!      \n"// q2 = r0 samples 0,2,4,6 (q3 = odd samples, unused)
+                    "vld2.f32   {d16-d19}, [%2]!    \n"// q8 = r0 samples 8,10,12,14 (q9 unused)
+                    "0:                             \n"
+                    "pld        [%1, #256]          \n"
+                    "vld1.f32   {d0-d3}, [%1]       \n"// q0,q1 = 8 output accumulators
+                    "vmla.f32   q0, q2, %q12        \n"
+                    "vmla.f32   q1, q8, %q12        \n"
+                    "pld        [%3, #512]          \n"
+                    "vld2.f32   {d4-d7}, [%3]!      \n"
+                    "vld2.f32   {d16-d19}, [%3]!    \n"
+                    "vmla.f32   q0, q2, %q13        \n"
+                    "vmla.f32   q1, q8, %q13        \n"
+                    "pld        [%4, #512]          \n"
+                    "vld2.f32   {d4-d7}, [%4]!      \n"
+                    "vld2.f32   {d16-d19}, [%4]!    \n"
+                    "vmla.f32   q0, q2, %q14        \n"
+                    "vmla.f32   q1, q8, %q14        \n"
+                    "pld        [%5, #512]          \n"
+                    "vld2.f32   {d4-d7}, [%5]!      \n"
+                    "vld2.f32   {d16-d19}, [%5]!    \n"
+                    "vmla.f32   q0, q2, %q15        \n"
+                    "vmla.f32   q1, q8, %q15        \n"
+                    "pld        [%2, #512]          \n"
+                    "vld2.f32   {d4-d7}, [%2]!      \n"// preload r0 for the next iteration
+                    "vld2.f32   {d16-d19}, [%2]!    \n"
+                    "subs       %0, #1              \n"
+                    "vst1.f32   {d0-d3}, [%1]!      \n"
+                    "bne        0b                  \n"
+                    "sub        %2, #64             \n"// undo the preload made by the final iteration
+                    : "=r"(nn),     // %0
+                      "=r"(outptr), // %1
+                      "=r"(r0),     // %2
+                      "=r"(r1),     // %3
+                      "=r"(r2),     // %4
+                      "=r"(r3)      // %5
+                    : "0"(nn),
+                      "1"(outptr),
+                      "2"(r0),
+                      "3"(r1),
+                      "4"(r2),
+                      "5"(r3),
+                      "w"(_k0),     // %12
+                      "w"(_k1),     // %13
+                      "w"(_k2),     // %14
+                      "w"(_k3)      // %15
+                    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
+                );
+                }
+#endif // __aarch64__
+#endif // __ARM_NEON
+                for (; remain>0; remain--)
+                {
+                    float sum = *r0 * k0;
+                    float sum1 = *r1 * k1;
+                    float sum2 = *r2 * k2;
+                    float sum3 = *r3 * k3;
+
+                    *outptr += sum + sum1 + sum2 + sum3;
+
+                    r0 += 2;
+                    r1 += 2;
+                    r2 += 2;
+                    r3 += 2;
+                    outptr++;
+                }
+
+                r0 += tailstep;
+                r1 += tailstep;
+                r2 += tailstep;
+                r3 += tailstep;
+            }
+
+        }
+        // Leftover input channels (inch not a multiple of 4), one at a time.
+        for (; q<inch; q++)
+        {
+            float* outptr = out;
+
+            const float* img0 = bottom_blob.channel(q);
+
+            const float* kernel0 = kernel + p*inch  + q;
+            const float k0 = kernel0[0];
+
+            const float* r0 = img0;
+
+            for (int i = 0; i < outh; i++)
+            {
+#if __ARM_NEON
+                int nn = outw >> 3;
+                int remain = outw & 7;
+#else
+                int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+                float32x4_t _k0 = vdupq_n_f32(k0);
+#if __aarch64__
+                for (; nn>0; nn--)
+                {
+                    float32x4x2_t _px2 = vld2q_f32(r0);
+                    float32x4_t _p = _px2.val[0];
+                    float32x4_t _outp = vld1q_f32(outptr);
+
+                    float32x4x2_t _pnx2 = vld2q_f32(r0+8);
+                    float32x4_t _pn = _pnx2.val[0];
+                    float32x4_t _outpn = vld1q_f32(outptr+4);
+
+                    _outp = vmlaq_f32(_outp, _p, _k0);
+                    _outpn = vmlaq_f32(_outpn, _pn, _k0);
+
+                    vst1q_f32(outptr, _outp);
+                    vst1q_f32(outptr+4, _outpn);
+
+                    r0 += 16;
+                    outptr += 8;
+                }
+#else
+                if (nn > 0)
+                {
+                asm volatile(
+                    "pld        [%2, #512]          \n"
+                    "vld2.f32   {d4-d7}, [%2]!      \n"
+                    "vld2.f32   {d16-d19}, [%2]!    \n"
+                    "0:                             \n"
+                    "pld        [%1, #256]          \n"
+                    "vld1.f32   {d0-d3}, [%1]       \n"
+                    "vmla.f32   q0, q2, %q6         \n"
+                    "vmla.f32   q1, q8, %q6         \n"
+                    "pld        [%2, #512]          \n"
+                    "vld2.f32   {d4-d7}, [%2]!      \n"
+                    "vld2.f32   {d16-d19}, [%2]!    \n"
+                    "subs       %0, #1              \n"
+                    "vst1.f32   {d0-d3}, [%1]!      \n"
+                    "bne        0b                  \n"
+                    "sub        %2, #64             \n"// undo the preload made by the final iteration
+                    : "=r"(nn),     // %0
+                      "=r"(outptr), // %1
+                      "=r"(r0)      // %2
+                    : "0"(nn),
+                      "1"(outptr),
+                      "2"(r0),
+                      "w"(_k0)      // %6
+                    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
+                );
+                }
+#endif // __aarch64__
+#endif // __ARM_NEON
+                for (; remain>0; remain--)
+                {
+                    float sum = *r0 * k0;
+
+                    *outptr += sum;
+
+                    r0 += 2;
+                    outptr++;
+                }
+
+                r0 += tailstep;
+            }
+
+        }
+    }
+
+}
diff --git a/src/layer/arm/convolution_2x2.h b/src/layer/arm/convolution_2x2.h
new file mode 100644
index 00000000000..fc4ed6672a8
--- /dev/null
+++ b/src/layer/arm/convolution_2x2.h
@@ -0,0 +1,381 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    // 2x2 convolution with stride 1, accumulated over all input channels into each output channel.
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+    // The output extent is taken from top_blob as passed in; nothing is allocated here.
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+    // Weights: 4 floats per (p,q) pair at kernel[(p*inch+q)*4]; [0],[1] apply to row y, [2],[3] to row y+1.
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+    // Each p iteration only writes through out = top_blob.channel(p).
+    #pragma omp parallel for
+    for (int p=0; p<outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+
+        const float bias0 = bias ? bias[p] : 0.f;
+        // Seed the channel with the bias (0.f when bias is null); the loops below accumulate into it.
+        out.fill(bias0);
+
+        int q = 0;
+        // Main pass: two input channels are folded into the output per sweep.
+        for (; q+1<inch; q+=2)
+        {
+            float* outptr = out;
+
+            const float* img0 = bottom_blob.channel(q);
+            const float* img1 = bottom_blob.channel(q+1);
+
+            const float* kernel0 = kernel + p*inch*4  + q*4;
+            const float* kernel1 = kernel0 + 4;
+
+            const float* r00 = img0;
+            const float* r01 = img0 + w;
+
+            const float* r10 = img1;
+            const float* r11 = img1 + w;
+
+#if __ARM_NEON
+            float32x4_t _k0 = vld1q_f32(kernel0);
+            float32x4_t _k1 = vld1q_f32(kernel1);
+#endif // __ARM_NEON
+
+            for (int i = 0; i < outh; i++)
+            {
+#if __ARM_NEON
+                int nn = outw >> 2; // number of 4-output vector blocks in this row
+                int remain = outw & 3; // leftover outputs handled one at a time
+#else
+                int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+                for (; nn>0; nn--)
+                {
+                    float32x4_t _r000 = vld1q_f32(r00);
+                    float32x4_t _r010 = vld1q_f32(r01);
+                    float32x4_t _r001 = vld1q_f32(r00 + 1);
+                    float32x4_t _r011 = vld1q_f32(r01 + 1);
+
+                    float32x4_t _r100 = vld1q_f32(r10);
+                    float32x4_t _r110 = vld1q_f32(r11);
+                    float32x4_t _r101 = vld1q_f32(r10 + 1);
+                    float32x4_t _r111 = vld1q_f32(r11 + 1);
+
+                    float32x4_t _sum = vld1q_f32(outptr);
+
+                    _sum = vmlaq_lane_f32(_sum, _r000, vget_low_f32(_k0), 0);
+                    _sum = vmlaq_lane_f32(_sum, _r001, vget_low_f32(_k0), 1);
+                    _sum = vmlaq_lane_f32(_sum, _r010, vget_high_f32(_k0), 0);
+                    _sum = vmlaq_lane_f32(_sum, _r011, vget_high_f32(_k0), 1);
+
+                    _sum = vmlaq_lane_f32(_sum, _r100, vget_low_f32(_k1), 0);
+                    _sum = vmlaq_lane_f32(_sum, _r101, vget_low_f32(_k1), 1);
+                    _sum = vmlaq_lane_f32(_sum, _r110, vget_high_f32(_k1), 0);
+                    _sum = vmlaq_lane_f32(_sum, _r111, vget_high_f32(_k1), 1);
+
+                    vst1q_f32(outptr, _sum);
+
+                    r00 += 4;
+                    r01 += 4;
+                    r10 += 4;
+                    r11 += 4;
+                    outptr += 4;
+                }
+#else
+                if (nn > 0) // NOTE(review): the asm loads 4 floats per row pointer past the last block it consumes - confirm the input buffer tolerates that read
+                {
+                asm volatile(
+                    "pld        [%1, #128]          \n"
+                    "vld1.f32   {d0-d1}, [%1]!      \n"// q0  = r00[x..x+3]
+                    "pld        [%2, #128]          \n"
+                    "vld1.f32   {d4-d5}, [%2]!      \n"// q2  = r01[x..x+3]
+
+                    "pld        [%3, #128]          \n"
+                    "vld1.f32   {d24-d25}, [%3]!    \n"// q12 = r10[x..x+3]
+                    "pld        [%4, #128]          \n"
+                    "vld1.f32   {d28-d29}, [%4]!    \n"// q14 = r11[x..x+3]
+
+                    "0:                             \n"
+                    "pld        [%5, #128]          \n"
+                    "vld1.f32   {d18-d19}, [%5]     \n"// q9 = sum
+
+                    "vmul.f32   q8, q0, %e12[0]     \n"// q8 and q9 are two partial sums, added before the store
+                    "vmla.f32   q9, q2, %f12[0]     \n"
+
+                    "pld        [%1, #128]          \n"
+                    "vld1.f32   {d2-d3}, [%1]!      \n"// next 4 floats, needed to build the x+1 shifted vector
+
+                    "pld        [%2, #128]          \n"
+                    "vld1.f32   {d6-d7}, [%2]!      \n"
+
+                    "vext.f32   q10, q0, q1, #1     \n"// q10 = r00[x+1..x+4]
+                    "vext.f32   q11, q2, q3, #1     \n"// q11 = r01[x+1..x+4]
+
+                    "vmla.f32   q8, q12, %e13[0]    \n"
+                    "vmla.f32   q9, q14, %f13[0]    \n"
+
+                    "pld        [%3, #128]          \n"
+                    "vld1.f32   {d26-d27}, [%3]!    \n"
+
+                    "pld        [%4, #128]          \n"
+                    "vld1.f32   {d30-d31}, [%4]!    \n"
+
+                    "vmla.f32   q8, q10, %e12[1]    \n"
+                    "vmla.f32   q9, q11, %f12[1]    \n"
+
+                    "vext.f32   q10, q12, q13, #1   \n"
+                    "vext.f32   q11, q14, q15, #1   \n"
+
+                    "vmla.f32   q8, q10, %e13[1]    \n"
+                    "vmla.f32   q9, q11, %f13[1]    \n"
+
+                    "vorr       q0, q1, q1          \n"// carry the look-ahead vectors into the next iteration
+                    "vorr       q2, q3, q3          \n"
+
+                    "vadd.f32   q8, q8, q9          \n"
+
+                    "vorr       q12, q13, q13       \n"
+                    "vorr       q14, q15, q15       \n"
+
+                    "subs       %0, #1              \n"
+
+                    "vst1.f32   {d16-d17}, [%5]!    \n"
+
+                    "bne        0b                  \n"
+                    "sub        %1, #16             \n"// undo the look-ahead load of the final iteration
+                    "sub        %2, #16             \n"
+                    "sub        %3, #16             \n"
+                    "sub        %4, #16             \n"
+                    : "=r"(nn),     // %0
+                      "=r"(r00),    // %1
+                      "=r"(r01),    // %2
+                      "=r"(r10),    // %3
+                      "=r"(r11),    // %4
+                      "=r"(outptr)  // %5
+                    : "0"(nn),
+                      "1"(r00),
+                      "2"(r01),
+                      "3"(r10),
+                      "4"(r11),
+                      "5"(outptr),
+                      "w"(_k0),     // %12
+                      "w"(_k1)      // %13
+                    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+                );
+                }
+#endif // __aarch64__
+#endif // __ARM_NEON
+
+                for (; remain>0; remain--)
+                {
+#if __ARM_NEON
+                    float32x2_t _r00 = vld1_f32(r00);
+                    float32x2_t _r01 = vld1_f32(r01);
+                    float32x4_t _r00r1 = vcombine_f32(_r00, _r01); // 2x2 window packed in the same order as _k0
+                    float32x4_t _s0s1 = vmulq_f32(_r00r1, _k0);
+
+                    float32x2_t _r10 = vld1_f32(r10);
+                    float32x2_t _r11 = vld1_f32(r11);
+                    float32x4_t _r10r1 = vcombine_f32(_r10, _r11);
+                    _s0s1 = vmlaq_f32(_s0s1, _r10r1, _k1);
+
+                    float32x2_t _s = vadd_f32(vget_low_f32(_s0s1), vget_high_f32(_s0s1)); // horizontal sum of the 4 lanes
+                    _s = vpadd_f32(_s, _s);
+                    *outptr += vget_lane_f32(_s, 0);
+#else
+                    float sum = 0.f;
+
+                    sum += r00[0] * kernel0[0];
+                    sum += r00[1] * kernel0[1];
+                    sum += r01[0] * kernel0[2];
+                    sum += r01[1] * kernel0[3];
+
+                    sum += r10[0] * kernel1[0];
+                    sum += r10[1] * kernel1[1];
+                    sum += r11[0] * kernel1[2];
+                    sum += r11[1] * kernel1[3];
+
+                    *outptr += sum;
+#endif // __ARM_NEON
+
+                    r00 += 1;
+                    r01 += 1;
+                    r10 += 1;
+                    r11 += 1;
+                    outptr++;
+                }
+                // The row pointers have advanced by outw; one more float moves each to the start of its next row.
+                r00 += 1;
+                r01 += 1;
+                r10 += 1;
+                r11 += 1;
+            }
+        }
+        // Leftover input channel when inch is odd.
+        for (; q<inch; q++)
+        {
+            float* outptr = out;
+
+            const float* img0 = bottom_blob.channel(q);
+
+            const float* kernel0 = kernel + p*inch*4  + q*4;
+
+            const float* r0 = img0;
+            const float* r1 = img0 + w;
+
+#if __ARM_NEON
+            float32x4_t _k0 = vdupq_n_f32(kernel0[0]);
+            float32x4_t _k1 = vdupq_n_f32(kernel0[1]);
+            float32x4_t _k2 = vdupq_n_f32(kernel0[2]);
+            float32x4_t _k3 = vdupq_n_f32(kernel0[3]);
+            float32x4_t _k0123 = vld1q_f32(kernel0); // all four taps in one vector, used by the per-output tail loop
+#endif // __ARM_NEON
+
+            for (int i = 0; i < outh; i++)
+            {
+#if __ARM_NEON
+                int nn = outw >> 2;
+                int remain = outw & 3;
+#else
+                int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+                for (; nn>0; nn--)
+                {
+                    float32x4_t _r00 = vld1q_f32(r0);
+                    float32x4_t _r10 = vld1q_f32(r1);
+                    float32x4_t _r01 = vld1q_f32(r0 + 1);
+                    float32x4_t _r11 = vld1q_f32(r1 + 1);
+
+                    float32x4_t _sum = vld1q_f32(outptr);
+                    float32x4_t _sum2;
+
+                    _sum = vmlaq_f32(_sum, _r00, _k0);
+                    _sum2 = vmulq_f32(_r01, _k1);
+                    _sum = vmlaq_f32(_sum, _r10, _k2);
+                    _sum2 = vmlaq_f32(_sum2, _r11, _k3);
+
+                    _sum = vaddq_f32(_sum, _sum2);
+
+                    vst1q_f32(outptr, _sum);
+
+                    r0 += 4;
+                    r1 += 4;
+                    outptr += 4;
+                }
+#else
+                if (nn > 0)
+                {
+                asm volatile(
+                    "pld        [%1, #128]          \n"
+                    "vld1.f32   {d0-d1}, [%1]!      \n"
+                    "pld        [%2, #128]          \n"
+                    "vld1.f32   {d4-d5}, [%2]!      \n"
+
+                    "0:                             \n"
+                    "pld        [%3, #128]          \n"
+                    "vld1.f32   {d18-d19}, [%3]     \n"// q9 = sum
+
+                    "vmul.f32   q8, q0, %q8         \n"
+                    "vmla.f32   q9, q2, %q10        \n"
+
+                    "pld        [%1, #128]          \n"
+                    "vld1.f32   {d2-d3}, [%1]!      \n"
+                    "vext.f32   q10, q0, q1, #1     \n"
+
+                    "vmla.f32   q8, q10, %q9        \n"
+
+                    "pld        [%2, #128]          \n"
+                    "vld1.f32   {d6-d7}, [%2]!      \n"
+                    "vext.f32   q11, q2, q3, #1     \n"
+
+                    "vmla.f32   q9, q11, %q11       \n"
+
+                    "vorr       q0, q1, q1          \n"
+                    "vadd.f32   q8, q8, q9          \n"
+                    "vorr       q2, q3, q3          \n"
+
+                    "subs       %0, #1              \n"
+                    "vst1.f32   {d16-d17}, [%3]!    \n"
+                    "bne        0b                  \n"
+                    "sub        %1, #16             \n"// undo the look-ahead load of the final iteration
+                    "sub        %2, #16             \n"
+                    : "=r"(nn),     // %0
+                      "=r"(r0),     // %1
+                      "=r"(r1),     // %2
+                      "=r"(outptr)  // %3
+                    : "0"(nn),
+                      "1"(r0),
+                      "2"(r1),
+                      "3"(outptr),
+                      "w"(_k0),     // %8
+                      "w"(_k1),     // %9
+                      "w"(_k2),     // %10
+                      "w"(_k3)      // %11
+                    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+                );
+                }
+#endif // __aarch64__
+#endif // __ARM_NEON
+
+                // Tail (every output when NEON is unavailable): one output per iteration. With NEON the
+                // 2x2 window is packed as [r0[0], r0[1], r1[0], r1[1]], multiplied lane-wise by _k0123
+                // (loaded once per input channel, before the row loop) and then summed across lanes.
+                for (; remain>0; remain--)
+                {
+#if __ARM_NEON
+                    float32x2_t _r0 = vld1_f32(r0);
+                    float32x2_t _r1 = vld1_f32(r1);
+                    float32x4_t _r0r1 = vcombine_f32(_r0, _r1);
+                    float32x4_t _s0s1 = vmulq_f32(_r0r1, _k0123);
+                    float32x2_t _s = vadd_f32(vget_low_f32(_s0s1), vget_high_f32(_s0s1));
+                    _s = vpadd_f32(_s, _s);
+                    *outptr += vget_lane_f32(_s, 0);
+#else
+                    float sum = 0.f;
+                    sum += r0[0] * kernel0[0];
+                    sum += r0[1] * kernel0[1];
+                    sum += r1[0] * kernel0[2];
+                    sum += r1[1] * kernel0[3];
+                    *outptr += sum;
+#endif
+
+                    r0 += 1;
+                    r1 += 1;
+                    outptr++;
+                }
+                // The row pointers have advanced by outw; one more float moves each to the start of its next row.
+                r0 += 1;
+                r1 += 1;
+
+            }
+
+        }
+    }
+
+}
diff --git a/src/layer/arm/convolution_3x3.h b/src/layer/arm/convolution_3x3.h
new file mode 100644
index 00000000000..c998c514a1c
--- /dev/null
+++ b/src/layer/arm/convolution_3x3.h
@@ -0,0 +1,753 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+// 3x3 convolution, stride 1. top_blob supplies the output size (it is not resized
+// here): each output channel p is filled with bias[p] (0.f when _bias converts to a
+// null pointer), then every input channel q accumulates its response in place.
+// _kernel is read as consecutive 9-float filters; the one for (p, q) starts at
+// p*inch*9 + q*9. Row pointers advance outw + 2 floats per row, so the code relies
+// on outw == w - 2 (no padding is applied here).
+static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+    // Each iteration writes only top_blob.channel(p); iterations are shared out across OpenMP threads.
+    #pragma omp parallel for
+    for (int p=0; p<outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+
+        const float bias0 = bias ? bias[p] : 0.f;
+        // Seed the channel with its bias; everything below only adds to it.
+        out.fill(bias0);
+
+        const float* kernel0 = kernel + p*inch*9;
+
+        for (int q=0; q<inch; q++)
+        {
+            float* outptr = out;
+            float* outptr2 = outptr + outw;
+
+            const float* img0 = bottom_blob.channel(q);
+            // r0..r3: four consecutive input rows feeding output rows i and i+1.
+            const float* r0 = img0;
+            const float* r1 = img0 + w;
+            const float* r2 = img0 + w*2;
+            const float* r3 = img0 + w*3;
+            // Filter taps: NEON loads each 3-float kernel row 4-wide (lane 3 never contributes); scalar code indexes k0..k2.
+#if __ARM_NEON
+            float32x4_t _k0123 = vld1q_f32(kernel0);
+            float32x4_t _k3456 = vld1q_f32(kernel0+3);
+            float32x4_t _k6789 = vld1q_f32(kernel0+6); // NOTE(review): also reads kernel0[9], one float past this filter - confirm the buffer has slack
+#else
+            const float* k0 = kernel0;
+            const float* k1 = kernel0 + 3;
+            const float* k2 = kernel0 + 6;
+#endif // __ARM_NEON
+
+            int i = 0;
+            // Main pass: two output rows per iteration.
+            for (; i+1 < outh; i+=2)
+            {
+#if __ARM_NEON
+                int nn = outw >> 2; // blocks of 4 output columns
+                int remain = outw & 3; // leftover columns handled one at a time
+#else
+                int remain = outw;
+#endif // __ARM_NEON
+#if __ARM_NEON
+#if __aarch64__
+                for (; nn>0; nn--)
+                {
+                    float32x4_t _sum1 = vld1q_f32(outptr);
+                    float32x4_t _sum2 = vdupq_n_f32(0.f); // second accumulator, merged into _sum1 below
+                    float32x4_t _sum3 = vld1q_f32(outptr2);
+                    float32x4_t _sum4 = vdupq_n_f32(0.f); // second accumulator, merged into _sum3 below
+                    // _rX1 / _rX2 are row X shifted left by 1 / 2 columns (built from 8 loaded floats).
+                    float32x4_t _r00 = vld1q_f32(r0);
+                    float32x4_t _r00n = vld1q_f32(r0 + 4);
+                    float32x4_t _r01 = vextq_f32(_r00, _r00n, 1);
+                    float32x4_t _r02 = vextq_f32(_r00, _r00n, 2);
+
+                    float32x4_t _r10 = vld1q_f32(r1);
+                    float32x4_t _r10n = vld1q_f32(r1 + 4);
+                    float32x4_t _r11 = vextq_f32(_r10, _r10n, 1);
+                    float32x4_t _r12 = vextq_f32(_r10, _r10n, 2);
+
+                    float32x4_t _r20 = vld1q_f32(r2);
+                    float32x4_t _r20n = vld1q_f32(r2 + 4);
+                    float32x4_t _r21 = vextq_f32(_r20, _r20n, 1);
+                    float32x4_t _r22 = vextq_f32(_r20, _r20n, 2);
+
+                    float32x4_t _r30 = vld1q_f32(r3);
+                    float32x4_t _r30n = vld1q_f32(r3 + 4);
+                    float32x4_t _r31 = vextq_f32(_r30, _r30n, 1);
+                    float32x4_t _r32 = vextq_f32(_r30, _r30n, 2);
+                    // Output row i: kernel rows 0..2 against input rows r0..r2.
+                    _sum1 = vfmaq_laneq_f32(_sum1, _r00, _k0123, 0);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r01, _k0123, 1);
+                    _sum1 = vfmaq_laneq_f32(_sum1, _r02, _k0123, 2);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r10, _k3456, 0);
+                    _sum1 = vfmaq_laneq_f32(_sum1, _r11, _k3456, 1);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r12, _k3456, 2);
+                    _sum1 = vfmaq_laneq_f32(_sum1, _r20, _k6789, 0);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r21, _k6789, 1);
+                    _sum1 = vfmaq_laneq_f32(_sum1, _r22, _k6789, 2);
+                    // Output row i+1: the same kernel against input rows r1..r3.
+                    _sum3 = vfmaq_laneq_f32(_sum3, _r10, _k0123, 0);
+                    _sum4 = vfmaq_laneq_f32(_sum4, _r11, _k0123, 1);
+                    _sum3 = vfmaq_laneq_f32(_sum3, _r12, _k0123, 2);
+                    _sum4 = vfmaq_laneq_f32(_sum4, _r20, _k3456, 0);
+                    _sum3 = vfmaq_laneq_f32(_sum3, _r21, _k3456, 1);
+                    _sum4 = vfmaq_laneq_f32(_sum4, _r22, _k3456, 2);
+                    _sum3 = vfmaq_laneq_f32(_sum3, _r30, _k6789, 0);
+                    _sum4 = vfmaq_laneq_f32(_sum4, _r31, _k6789, 1);
+                    _sum3 = vfmaq_laneq_f32(_sum3, _r32, _k6789, 2);
+
+                    _sum1 = vaddq_f32(_sum1, _sum2);
+                    _sum3 = vaddq_f32(_sum3, _sum4);
+
+                    vst1q_f32(outptr, _sum1);
+                    vst1q_f32(outptr2, _sum3);
+
+                    r0 += 4;
+                    r1 += 4;
+                    r2 += 4;
+                    r3 += 4;
+                    outptr += 4;
+                    outptr2 += 4;
+                }
+#else
+                if (nn > 0)
+                {
+                asm volatile(
+                    "veor       q6, q6              \n"
+                    "veor       q15, q15            \n"
+                    // Prologue: load r0[0..5] into q9 + d20, then step r0 by 4 floats.
+                    "pld        [%3, #192]          \n"
+                    "vld1.f32   {d18-d20}, [%3 :64] \n"// r0
+                    "add        %3, #16             \n"
+                    // NOTE(review): only %1, %3 and %5 carry the :64 hint; they start on even rows - confirm 8-byte alignment holds.
+                    "veor       q13, q13            \n"
+                    "veor       q14, q14            \n"
+                    // q11 / q12 = the loaded row shifted left by 1 / 2 columns.
+                    "vext.32    q11, q9, q10, #1    \n"
+                    "vext.32    q12, q9, q10, #2    \n"
+
+                    "0:                             \n"
+
+                    "pld        [%1, #128]          \n"
+                    "vld1.f32   {d14-d15}, [%1 :64] \n"// _sum
+                    // %e14[0], %e14[1], %f14[0] pick lanes 0, 1, 2 of _k0123 (likewise for %15 and %16).
+                    "vmla.f32   q7, q9, %e14[0]     \n"
+                    "vmla.f32   q6, q11, %e14[1]    \n"
+                    "vmla.f32   q13, q12, %f14[0]   \n"
+
+                    "pld        [%4, #192]          \n"
+                    "vld1.f32   {d18-d20}, [%4]     \n"// r1
+                    "add        %4, #16             \n"
+
+                    "vmla.f32   q7, q9, %e15[0]     \n"
+
+                    "vext.32    q11, q9, q10, #1    \n"
+                    "vext.32    q12, q9, q10, #2    \n"
+
+                    "vmla.f32   q6, q11, %e15[1]    \n"
+                    "vmla.f32   q13, q12, %f15[0]   \n"
+
+                    "pld        [%2, #128]          \n"
+                    "vld1.f32   {d16-d17}, [%2]     \n"// _sum2
+
+                    "vmla.f32   q8, q9, %e14[0]     \n"
+                    "vmla.f32   q14, q11, %e14[1]   \n"
+                    "vmla.f32   q15, q12, %f14[0]   \n"
+
+                    "pld        [%5, #192]          \n"
+                    "vld1.f32   {d18-d20}, [%5 :64] \n"// r2
+                    "add        %5, #16             \n"
+
+                    "vmla.f32   q7, q9, %e16[0]     \n"
+
+                    "vext.32    q11, q9, q10, #1    \n"
+                    "vext.32    q12, q9, q10, #2    \n"
+
+                    "vmla.f32   q6, q11, %e16[1]    \n"
+                    "vmla.f32   q13, q12, %f16[0]   \n"
+
+                    "vmla.f32   q8, q9, %e15[0]     \n"
+                    "vmla.f32   q14, q11, %e15[1]   \n"
+                    "vmla.f32   q15, q12, %f15[0]   \n"
+
+                    "pld        [%6, #192]          \n"
+                    "vld1.f32   {d18-d20}, [%6]     \n"// r3
+                    "add        %6, #16             \n"
+
+                    "vmla.f32   q8, q9, %e16[0]     \n"
+
+                    "vext.32    q11, q9, q10, #1    \n"
+                    "vext.32    q12, q9, q10, #2    \n"
+
+                    "vmla.f32   q14, q11, %e16[1]   \n"
+                    "vmla.f32   q15, q12, %f16[0]   \n"
+
+                    "vadd.f32   q7, q7, q6          \n"
+                    "veor       q6, q6              \n"
+                    // Look-ahead: fetch the next r0 block while the partial sums are folded together.
+                    "pld        [%3, #192]          \n"
+                    "vld1.f32   {d18-d20}, [%3 :64] \n"// r0
+
+                    "vadd.f32   q8, q8, q14         \n"
+                    "veor       q14, q14            \n"
+                    "vadd.f32   q7, q7, q13         \n"
+                    "veor       q13, q13            \n"
+                    "vadd.f32   q8, q8, q15         \n"
+                    "veor       q15, q15            \n"
+
+                    "vext.32    q11, q9, q10, #1    \n"
+                    "vext.32    q12, q9, q10, #2    \n"
+
+                    "add        %3, #16             \n"
+
+                    "vst1.f32   {d14-d15}, [%1]!    \n"
+                    "vst1.f32   {d16-d17}, [%2]!    \n"
+
+                    "subs       %0, #1              \n"
+                    "bne        0b                  \n"
+                    // Undo the pointer step of the final look-ahead load.
+                    "sub        %3, #16             \n"
+                    : "=r"(nn),         // %0
+                      "=r"(outptr),     // %1
+                      "=r"(outptr2),    // %2
+                      "=r"(r0),         // %3
+                      "=r"(r1),         // %4
+                      "=r"(r2),         // %5
+                      "=r"(r3)          // %6
+                    : "0"(nn),
+                      "1"(outptr),
+                      "2"(outptr2),
+                      "3"(r0),
+                      "4"(r1),
+                      "5"(r2),
+                      "6"(r3),
+                      "w"(_k0123),      // %14
+                      "w"(_k3456),      // %15
+                      "w"(_k6789)       // %16
+                    : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+                );
+                }
+#endif // __aarch64__
+#endif // __ARM_NEON
+                for (; remain>0; remain--)
+                {
+#if __ARM_NEON
+                    float32x4_t _r00 = vld1q_f32(r0); // 4-wide load; only lanes 0..2 are taps
+                    float32x4_t _r10 = vld1q_f32(r1);
+                    float32x4_t _r20 = vld1q_f32(r2);
+                    float32x4_t _r30 = vld1q_f32(r3);
+                    // NOTE(review): the 4th float of each load may lie past the row/buffer end - confirm the allocation has slack.
+                    float32x4_t _sum = vmulq_f32(_r00, _k0123);
+                    _sum = vmlaq_f32(_sum, _r10, _k3456);
+                    _sum = vmlaq_f32(_sum, _r20, _k6789);
+
+                    float32x4_t _sum2 = vmulq_f32(_r10, _k0123);
+                    _sum2 = vmlaq_f32(_sum2, _r20, _k3456);
+                    _sum2 = vmlaq_f32(_sum2, _r30, _k6789);
+                    // Lane 3 (a discarded product) is replaced by the current output so the horizontal add accumulates.
+                    _sum = vsetq_lane_f32(*outptr, _sum, 3);
+                    _sum2 = vsetq_lane_f32(*outptr2, _sum2, 3);
+
+#if __aarch64__
+                    *outptr = vaddvq_f32(_sum);
+                    *outptr2 = vaddvq_f32(_sum2);
+#else
+                    float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
+                    float32x2_t _ss2 = vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));
+
+                    float32x2_t _sss2 = vpadd_f32(_ss, _ss2);
+
+                    *outptr = vget_lane_f32(_sss2, 0);
+                    *outptr2 = vget_lane_f32(_sss2, 1);
+#endif // __aarch64__
+#else
+                    float sum = 0;
+                    float sum2 = 0;
+
+                    sum += r0[0] * k0[0];
+                    sum += r0[1] * k0[1];
+                    sum += r0[2] * k0[2];
+                    sum += r1[0] * k1[0];
+                    sum += r1[1] * k1[1];
+                    sum += r1[2] * k1[2];
+                    sum += r2[0] * k2[0];
+                    sum += r2[1] * k2[1];
+                    sum += r2[2] * k2[2];
+
+                    sum2 += r1[0] * k0[0];
+                    sum2 += r1[1] * k0[1];
+                    sum2 += r1[2] * k0[2];
+                    sum2 += r2[0] * k1[0];
+                    sum2 += r2[1] * k1[1];
+                    sum2 += r2[2] * k1[2];
+                    sum2 += r3[0] * k2[0];
+                    sum2 += r3[1] * k2[1];
+                    sum2 += r3[2] * k2[2];
+
+                    *outptr += sum;
+                    *outptr2 += sum2;
+#endif
+                    r0++;
+                    r1++;
+                    r2++;
+                    r3++;
+                    outptr++;
+                    outptr2++;
+                }
+                // Skip the 2 trailing columns plus one whole row: the next pair starts two input rows down.
+                r0 += 2 + w;
+                r1 += 2 + w;
+                r2 += 2 + w;
+                r3 += 2 + w;
+                // Each output pointer already advanced one row; step over the row the other pointer wrote.
+                outptr += outw;
+                outptr2 += outw;
+            }
+            // Tail: at most one leftover output row (when outh is odd).
+            for (; i < outh; i++)
+            {
+#if __ARM_NEON
+                int nn = outw >> 2;
+                int remain = outw & 3;
+#else
+                int remain = outw;
+#endif // __ARM_NEON
+#if __ARM_NEON
+#if __aarch64__
+                for (; nn>0; nn--)
+                {
+                    float32x4_t _sum1 = vld1q_f32(outptr);
+                    float32x4_t _sum2 = vdupq_n_f32(0.f);
+
+                    float32x4_t _r00 = vld1q_f32(r0);
+                    float32x4_t _r00n = vld1q_f32(r0 + 4);
+                    float32x4_t _r01 = vextq_f32(_r00, _r00n, 1);
+                    float32x4_t _r02 = vextq_f32(_r00, _r00n, 2);
+
+                    float32x4_t _r10 = vld1q_f32(r1);
+                    float32x4_t _r10n = vld1q_f32(r1 + 4);
+                    float32x4_t _r11 = vextq_f32(_r10, _r10n, 1);
+                    float32x4_t _r12 = vextq_f32(_r10, _r10n, 2);
+
+                    float32x4_t _r20 = vld1q_f32(r2);
+                    float32x4_t _r20n = vld1q_f32(r2 + 4);
+                    float32x4_t _r21 = vextq_f32(_r20, _r20n, 1);
+                    float32x4_t _r22 = vextq_f32(_r20, _r20n, 2);
+
+                    _sum1 = vfmaq_laneq_f32(_sum1, _r00, _k0123, 0);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r01, _k0123, 1);
+                    _sum1 = vfmaq_laneq_f32(_sum1, _r02, _k0123, 2);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r10, _k3456, 0);
+                    _sum1 = vfmaq_laneq_f32(_sum1, _r11, _k3456, 1);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r12, _k3456, 2);
+                    _sum1 = vfmaq_laneq_f32(_sum1, _r20, _k6789, 0);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r21, _k6789, 1);
+                    _sum1 = vfmaq_laneq_f32(_sum1, _r22, _k6789, 2);
+
+                    _sum1 = vaddq_f32(_sum1, _sum2);
+
+                    vst1q_f32(outptr, _sum1);
+
+                    r0 += 4;
+                    r1 += 4;
+                    r2 += 4;
+                    outptr += 4;
+                }
+#else
+                if (nn > 0)
+                {
+                asm volatile(
+                    "pld        [%2, #192]          \n"
+                    "vld1.f32   {d16-d18}, [%2]     \n"// r0
+                    "add        %2, #16             \n"
+                    // q13 / q14 collect the shifted-by-1 / shifted-by-2 products; q7 holds the output block.
+                    "veor       q13, q13            \n"
+                    "veor       q14, q14            \n"
+
+                    "vext.32    q10, q8, q9, #1     \n"
+                    "vext.32    q11, q8, q9, #2     \n"
+
+                    "0:                             \n"
+
+                    "pld        [%1, #128]          \n"
+                    "vld1.f32   {d14-d15}, [%1]     \n"// _sum
+
+                    "vmla.f32   q7, q8, %e10[0]     \n"
+                    "vmla.f32   q13, q10, %e10[1]   \n"
+                    "vmla.f32   q14, q11, %f10[0]   \n"
+
+                    "pld        [%3, #192]          \n"
+                    "vld1.f32   {d16-d18}, [%3]     \n"// r1
+                    "add        %3, #16             \n"
+
+                    "vmla.f32   q7, q8, %e11[0]     \n"
+
+                    "vext.32    q10, q8, q9, #1     \n"
+                    "vext.32    q11, q8, q9, #2     \n"
+
+                    "vmla.f32   q13, q10, %e11[1]   \n"
+                    "vmla.f32   q14, q11, %f11[0]   \n"
+
+                    "pld        [%4, #192]          \n"
+                    "vld1.f32   {d16-d18}, [%4]     \n"// r2
+                    "add        %4, #16             \n"
+
+                    "vmla.f32   q7, q8, %e12[0]     \n"
+
+                    "vext.32    q10, q8, q9, #1     \n"
+                    "vext.32    q11, q8, q9, #2     \n"
+
+                    "vmla.f32   q13, q10, %e12[1]   \n"
+                    "vmla.f32   q14, q11, %f12[0]   \n"
+                    // Look-ahead: fetch the next r0 block before the partial sums are folded together.
+                    "pld        [%2, #192]          \n"
+                    "vld1.f32   {d16-d18}, [%2]     \n"// r0
+                    "add        %2, #16             \n"
+
+                    "vadd.f32   q7, q7, q13         \n"
+                    "veor       q13, q13            \n"
+                    "vadd.f32   q7, q7, q14         \n"
+                    "veor       q14, q14            \n"
+
+                    "vext.32    q10, q8, q9, #1     \n"
+                    "vext.32    q11, q8, q9, #2     \n"
+
+                    "vst1.f32   {d14-d15}, [%1]!    \n"
+
+                    "subs       %0, #1              \n"
+                    "bne        0b                  \n"
+                    // Undo the pointer step of the final look-ahead load.
+                    "sub        %2, #16             \n"
+                    : "=r"(nn),         // %0
+                      "=r"(outptr),     // %1
+                      "=r"(r0),         // %2
+                      "=r"(r1),         // %3
+                      "=r"(r2)          // %4
+                    : "0"(nn),
+                      "1"(outptr),
+                      "2"(r0),
+                      "3"(r1),
+                      "4"(r2),
+                      "w"(_k0123),      // %10
+                      "w"(_k3456),      // %11
+                      "w"(_k6789)       // %12
+                    : "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+                );
+                }
+#endif // __aarch64__
+#endif // __ARM_NEON
+                for (; remain>0; remain--)
+                {
+#if __ARM_NEON
+                    float32x4_t _r00 = vld1q_f32(r0); // 4-wide load; only lanes 0..2 are taps
+                    float32x4_t _r10 = vld1q_f32(r1);
+                    float32x4_t _r20 = vld1q_f32(r2);
+
+                    float32x4_t _sum = vmulq_f32(_r00, _k0123);
+                    _sum = vmlaq_f32(_sum, _r10, _k3456);
+                    _sum = vmlaq_f32(_sum, _r20, _k6789);
+                    // Lane 3 is replaced by the current output so the horizontal add accumulates.
+                    _sum = vsetq_lane_f32(*outptr, _sum, 3);
+
+#if __aarch64__
+                    *outptr = vaddvq_f32(_sum);
+#else
+                    float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
+                    _ss = vpadd_f32(_ss, _ss);
+
+                    *outptr = vget_lane_f32(_ss, 0);
+#endif // __aarch64__
+#else
+                    float sum = 0;
+
+                    sum += r0[0] * k0[0];
+                    sum += r0[1] * k0[1];
+                    sum += r0[2] * k0[2];
+                    sum += r1[0] * k1[0];
+                    sum += r1[1] * k1[1];
+                    sum += r1[2] * k1[2];
+                    sum += r2[0] * k2[0];
+                    sum += r2[1] * k2[1];
+                    sum += r2[2] * k2[2];
+
+                    *outptr += sum;
+#endif
+                    r0++;
+                    r1++;
+                    r2++;
+                    outptr++;
+                }
+                // Skip the 2 trailing columns to reach the start of the next input row.
+                r0 += 2;
+                r1 += 2;
+                r2 += 2;
+            }
+            // Move on to the filter for the next input channel.
+            kernel0 += 9;
+        }
+    }
+}
+
+// 3x3 convolution, stride 2. top_blob supplies the output size (it is not resized
+// here): each output channel p is filled with bias[p] (0.f when _bias converts to a
+// null pointer), then every input channel q accumulates its response in place.
+// _kernel is read as consecutive 9-float filters; the one for (p, q) starts at
+// p*inch*9 + q*9. One output row is produced per pass over three input rows.
+static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+    // A row pass advances the input pointers 2*outw floats; tailstep tops that up to 2*w (two input rows).
+    const int tailstep = w - 2*outw + w;
+
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+    // Each iteration writes only top_blob.channel(p); iterations are shared out across OpenMP threads.
+    #pragma omp parallel for
+    for (int p=0; p<outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+
+        const float bias0 = bias ? bias[p] : 0.f;
+        // Seed the channel with its bias; everything below only adds to it.
+        out.fill(bias0);
+
+        const float* kernel0 = kernel + p*inch*9;
+
+        for (int q=0; q<inch; q++)
+        {
+            float* outptr = out;
+
+            const float* img0 = bottom_blob.channel(q);
+
+            const float* r0 = img0;
+            const float* r1 = img0 + w;
+            const float* r2 = img0 + w*2;
+
+            const float* k0 = kernel0;
+            const float* k1 = kernel0 + 3;
+            const float* k2 = kernel0 + 6;
+            // NEON loads each 3-float kernel row 4-wide; lane 3 never contributes to the result.
+#if __ARM_NEON
+            float32x4_t _k0123 = vld1q_f32(k0);
+            float32x4_t _k3456 = vld1q_f32(k1);
+            float32x4_t _k6789 = vld1q_f32(k2); // NOTE(review): also reads kernel0[9], one float past this filter - confirm the buffer has slack
+#endif // __ARM_NEON
+
+            for (int i = 0; i < outh; i++)
+            {
+#if __ARM_NEON
+                int nn = outw >> 2; // blocks of 4 output columns
+                int remain = outw & 3; // leftover columns handled one at a time
+#else
+                int remain = outw;
+#endif // __ARM_NEON
+#if __ARM_NEON
+#if __aarch64__
+                for (; nn>0; nn--)
+                {
+                    float32x4_t _outp = vld1q_f32(outptr);
+                    // vld2q de-interleaves: val[0] holds the even columns, val[1] the odd ones.
+                    float32x4x2_t _r0 = vld2q_f32(r0);
+                    float32x4x2_t _r0n = vld2q_f32(r0+8);
+                    // Only lane 0 of _r0n.val[0] (column 8) is consumed. NOTE(review): the load still touches r0[8..15] - confirm it stays in bounds.
+                    float32x4_t _r00 = _r0.val[0];// 0 2 4 6
+                    float32x4_t _r01 = _r0.val[1];// 1 3 5 7
+                    float32x4_t _r02 = vextq_f32(_r00, _r0n.val[0], 1);// 2 4 6 8
+
+                    _outp = vfmaq_laneq_f32(_outp, _r00, _k0123, 0);
+                    _outp = vfmaq_laneq_f32(_outp, _r01, _k0123, 1);
+                    _outp = vfmaq_laneq_f32(_outp, _r02, _k0123, 2);
+
+                    float32x4x2_t _r1 = vld2q_f32(r1);
+                    float32x4x2_t _r1n = vld2q_f32(r1+8);
+
+                    float32x4_t _r10 = _r1.val[0];
+                    float32x4_t _r11 = _r1.val[1];
+                    float32x4_t _r12 = vextq_f32(_r10, _r1n.val[0], 1);
+
+                    _outp = vfmaq_laneq_f32(_outp, _r10, _k3456, 0);
+                    _outp = vfmaq_laneq_f32(_outp, _r11, _k3456, 1);
+                    _outp = vfmaq_laneq_f32(_outp, _r12, _k3456, 2);
+
+                    float32x4x2_t _r2 = vld2q_f32(r2);
+                    float32x4x2_t _r2n = vld2q_f32(r2+8);
+
+                    float32x4_t _r20 = _r2.val[0];
+                    float32x4_t _r21 = _r2.val[1];
+                    float32x4_t _r22 = vextq_f32(_r20, _r2n.val[0], 1);
+
+                    _outp = vfmaq_laneq_f32(_outp, _r20, _k6789, 0);
+                    _outp = vfmaq_laneq_f32(_outp, _r21, _k6789, 1);
+                    _outp = vfmaq_laneq_f32(_outp, _r22, _k6789, 2);
+
+                    vst1q_f32(outptr, _outp);
+
+                    r0 += 8;
+                    r1 += 8;
+                    r2 += 8;
+                    outptr += 4;
+                }
+#else
+                if (nn > 0)
+                {
+                asm volatile(
+                    "pld        [%2, #256]          \n"
+                    "vld2.f32   {d4-d7}, [%2]!      \n"
+                    // q2 / q3 now hold the even / odd columns of r0[0..7]; q10 / q11 are partial sums.
+                    "veor       q10, q10            \n"
+                    "veor       q11, q11            \n"
+
+                    "0:                             \n"
+                    "pld        [%1, #128]          \n"
+                    "vld1.f32   {d0-d1}, [%1]       \n"
+                    // %e10[0], %e10[1], %f10[0] pick lanes 0, 1, 2 of _k0123 (likewise for %11 and %12).
+                    "vmla.f32   q0, q2, %e10[0]     \n"
+                    "vmla.f32   q10, q3, %e10[1]    \n"
+                    // Peek at the next 8 floats (no write-back) to build columns 2 4 6 8 in q1.
+                    "pld        [%2, #256]          \n"
+                    "vld2.f32   {d16-d19}, [%2]     \n"
+                    "vext.32    q1, q2, q8, #1      \n"
+
+                    "vmla.f32   q11, q1, %f10[0]    \n"
+
+                    "pld        [%3, #256]          \n"
+                    "vld2.f32   {d4-d7}, [%3]!      \n"
+
+                    "vmla.f32   q0, q2, %e11[0]     \n"
+                    "vmla.f32   q10, q3, %e11[1]    \n"
+
+                    "pld        [%3, #256]          \n"
+                    "vld2.f32   {d16-d19}, [%3]     \n"
+                    "vext.32    q1, q2, q8, #1      \n"
+
+                    "vmla.f32   q11, q1, %f11[0]    \n"
+
+                    "pld        [%4, #256]          \n"
+                    "vld2.f32   {d4-d7}, [%4]!      \n"
+
+                    "vmla.f32   q0, q2, %e12[0]     \n"
+                    "vmla.f32   q10, q3, %e12[1]    \n"
+
+                    "pld        [%4, #256]          \n"
+                    "vld2.f32   {d16-d19}, [%4]     \n"
+                    "vext.32    q1, q2, q8, #1      \n"
+
+                    "vmla.f32   q11, q1, %f12[0]    \n"
+                    // Look-ahead: fetch the next r0 block (with write-back) for the following iteration.
+                    "pld        [%2, #256]          \n"
+                    "vld2.f32   {d4-d7}, [%2]!      \n"
+
+                    "vadd.f32   q0, q0, q10         \n"
+                    "veor       q10, q10            \n"
+                    "vadd.f32   q0, q0, q11         \n"
+                    "veor       q11, q11            \n"
+
+                    "subs       %0, #1              \n"
+                    "vst1.f32   {d0-d1}, [%1]!      \n"
+                    "bne        0b                  \n"
+                    "sub        %2, #32             \n" // undo the write-back of the final look-ahead load
+                    : "=r"(nn),     // %0
+                      "=r"(outptr), // %1
+                      "=r"(r0),     // %2
+                      "=r"(r1),     // %3
+                      "=r"(r2)      // %4
+                    : "0"(nn),
+                      "1"(outptr),
+                      "2"(r0),
+                      "3"(r1),
+                      "4"(r2),
+                      "w"(_k0123),  // %10
+                      "w"(_k3456),  // %11
+                      "w"(_k6789)   // %12
+                    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+                );
+                }
+#endif // __aarch64__
+#endif // __ARM_NEON
+                for (; remain>0; remain--)
+                {
+#if __ARM_NEON
+                    float32x4_t _r00 = vld1q_f32(r0); // 4-wide load; only lanes 0..2 are taps
+                    float32x4_t _r10 = vld1q_f32(r1);
+                    float32x4_t _r20 = vld1q_f32(r2);
+
+                    float32x4_t _sum = vmulq_f32(_r00, _k0123);
+                    _sum = vmlaq_f32(_sum, _r10, _k3456);
+                    _sum = vmlaq_f32(_sum, _r20, _k6789);
+                    // Lane 3 is replaced by the current output so the horizontal add accumulates.
+                    _sum = vsetq_lane_f32(*outptr, _sum, 3);
+
+#if __aarch64__
+                    *outptr = vaddvq_f32(_sum);
+#else
+                    float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
+                    _ss = vpadd_f32(_ss, _ss);
+
+                    *outptr = vget_lane_f32(_ss, 0);
+#endif // __aarch64__
+#else
+                    float sum = 0;
+
+                    sum += r0[0] * k0[0];
+                    sum += r0[1] * k0[1];
+                    sum += r0[2] * k0[2];
+                    sum += r1[0] * k1[0];
+                    sum += r1[1] * k1[1];
+                    sum += r1[2] * k1[2];
+                    sum += r2[0] * k2[0];
+                    sum += r2[1] * k2[1];
+                    sum += r2[2] * k2[2];
+
+                    *outptr += sum;
+#endif // __ARM_NEON
+                    // Stride 2: step two input columns per output column.
+                    r0 += 2;
+                    r1 += 2;
+                    r2 += 2;
+                    outptr++;
+                }
+                // Jump to the start of the row two input rows below.
+                r0 += tailstep;
+                r1 += tailstep;
+                r2 += tailstep;
+            }
+            // Move on to the filter for the next input channel.
+            kernel0 += 9;
+        }
+    }
+}
diff --git a/src/layer/arm/convolution_4x4.h b/src/layer/arm/convolution_4x4.h
new file mode 100644
index 00000000000..18ef572f094
--- /dev/null
+++ b/src/layer/arm/convolution_4x4.h
@@ -0,0 +1,340 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+// 4x4 convolution with stride 4. Each channel p of top_blob is first filled with bias[p]
+// (0.f when the bias pointer is null); then, for every input channel q, the product with
+// the 16 weights at kernel + p*inch*16 + q*16 is accumulated on top of it.
+// top_blob is never (re)allocated here, only its w/h/c are read - presumably sized by the caller.
+static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+    // floats from the end of one output row's inputs to the next 4-row band (w*3 iff w == outw*4)
+    const int tailstep = w*4 - outw*4;
+
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+
+    #pragma omp parallel for
+    for (int p=0; p<outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+
+        const float bias0 = bias ? bias[p] : 0.f;
+
+        out.fill(bias0);
+
+        for (int q=0; q<inch; q++)
+        {
+            float* outptr = out;
+
+            const float* img0 = bottom_blob.channel(q);
+
+            const float* kernel0 = kernel + p*inch*16  + q*16;
+
+            const float* r0 = img0;
+            const float* r1 = img0 + w;
+            const float* r2 = img0 + w*2;
+            const float* r3 = img0 + w*3;
+
+#if __ARM_NEON
+            float32x4_t _k0123 = vld1q_f32(kernel0);
+            float32x4_t _k4567 = vld1q_f32(kernel0+4);
+            float32x4_t _k891011 = vld1q_f32(kernel0+8);
+            float32x4_t _k12131415 = vld1q_f32(kernel0+12);
+#else
+            const float* k0 = kernel0;
+            const float* k1 = kernel0 + 4;
+            const float* k2 = kernel0 + 8;
+            const float* k3 = kernel0 + 12;
+#endif // __ARM_NEON
+
+            for (int i = 0; i < outh; i++)
+            {
+#if __ARM_NEON
+                int nn = outw >> 2;
+                int remain = outw - (nn << 2);
+#else
+                int remain = outw;
+#endif // __ARM_NEON
+#if __ARM_NEON
+#if __aarch64__
+                for (; nn>0; nn--)
+                {
+                    float32x4_t _r00 = vld1q_f32(r0);
+                    float32x4_t _r10 = vld1q_f32(r1);
+                    float32x4_t _r20 = vld1q_f32(r2);
+                    float32x4_t _r30 = vld1q_f32(r3);
+
+                    float32x4_t _r01 = vld1q_f32(r0 + 4);
+                    float32x4_t _r11 = vld1q_f32(r1 + 4);
+                    float32x4_t _r21 = vld1q_f32(r2 + 4);
+                    float32x4_t _r31 = vld1q_f32(r3 + 4);
+
+                    float32x4_t _r02 = vld1q_f32(r0 + 8);
+                    float32x4_t _r12 = vld1q_f32(r1 + 8);
+                    float32x4_t _r22 = vld1q_f32(r2 + 8);
+                    float32x4_t _r32 = vld1q_f32(r3 + 8);
+
+                    float32x4_t _r03 = vld1q_f32(r0 + 12);
+                    float32x4_t _r13 = vld1q_f32(r1 + 12);
+                    float32x4_t _r23 = vld1q_f32(r2 + 12);
+                    float32x4_t _r33 = vld1q_f32(r3 + 12);
+
+                    float32x4_t _sum0 = vmulq_f32(_r00, _k0123);
+                    float32x4_t _sum1 = vmulq_f32(_r01, _k0123);
+                    float32x4_t _sum2 = vmulq_f32(_r02, _k0123);
+                    float32x4_t _sum3 = vmulq_f32(_r03, _k0123);
+
+                    _sum0 = vfmaq_f32(_sum0, _r10, _k4567);
+                    _sum1 = vfmaq_f32(_sum1, _r11, _k4567);
+                    _sum2 = vfmaq_f32(_sum2, _r12, _k4567);
+                    _sum3 = vfmaq_f32(_sum3, _r13, _k4567);
+
+                    _sum0 = vfmaq_f32(_sum0, _r20, _k891011);
+                    _sum1 = vfmaq_f32(_sum1, _r21, _k891011);
+                    _sum2 = vfmaq_f32(_sum2, _r22, _k891011);
+                    _sum3 = vfmaq_f32(_sum3, _r23, _k891011);
+
+                    _sum0 = vfmaq_f32(_sum0, _r30, _k12131415);
+                    _sum1 = vfmaq_f32(_sum1, _r31, _k12131415);
+                    _sum2 = vfmaq_f32(_sum2, _r32, _k12131415);
+                    _sum3 = vfmaq_f32(_sum3, _r33, _k12131415);
+
+                    float32x4_t _s01 = vpaddq_f32(_sum0, _sum1);
+                    float32x4_t _s23 = vpaddq_f32(_sum2, _sum3);
+                    float32x4_t _sum = vpaddq_f32(_s01, _s23);
+
+                    // add to the bias / earlier input channels already stored in outptr
+                    float32x4_t _outp = vld1q_f32(outptr);
+                    _outp = vaddq_f32(_outp, _sum);
+                    vst1q_f32(outptr, _outp);
+
+                    r0 += 16;
+                    r1 += 16;
+                    r2 += 16;
+                    r3 += 16;
+                    outptr += 4;
+                }
+#else
+                if (nn > 0)
+                {
+                asm volatile(
+                    "pld        [%1, #128]          \n"
+
+                    "0:                             \n"
+
+                    "pld        [%2, #512]          \n"
+                    "pld        [%3, #512]          \n"
+
+                    "vld1.f32   {d14-d15}, [%1]     \n"// q7 = outptr
+
+                    "vld1.f32   {d16-d17}, [%2]!    \n"// q8  = r0
+                    "vld1.f32   {d18-d19}, [%3]!    \n"// q9  = r1
+
+                    "pld        [%4, #512]          \n"
+                    "pld        [%5, #512]          \n"
+
+                    "vmul.f32   q12, q8, %q12       \n"
+                    "vmul.f32   q13, q9, %q13       \n"
+
+                    "vld1.f32   {d20-d21}, [%4]!    \n"// q10 = r2
+                    "vld1.f32   {d22-d23}, [%5]!    \n"// q11 = r3
+
+                    "vmla.f32   q12, q10, %q14      \n"
+                    "vmla.f32   q13, q11, %q15      \n"
+
+                    "vadd.f32   q5, q12, q13        \n"
+
+                    "vld1.f32   {d16-d17}, [%2]!    \n"// q8  = r0
+                    "vld1.f32   {d18-d19}, [%3]!    \n"// q9  = r1
+
+                    "vmul.f32   q12, q8, %q12       \n"
+                    "vmul.f32   q13, q9, %q13       \n"
+
+                    "vld1.f32   {d20-d21}, [%4]!    \n"// q10 = r2
+                    "vld1.f32   {d22-d23}, [%5]!    \n"// q11 = r3
+
+                    "vmla.f32   q12, q10, %q14      \n"
+                    "vmla.f32   q13, q11, %q15      \n"
+
+                    "vadd.f32   q6, q12, q13        \n"
+
+                    "vld1.f32   {d16-d17}, [%2]!    \n"// q8  = r0
+                    "vld1.f32   {d18-d19}, [%3]!    \n"// q9  = r1
+
+                    "vmul.f32   q12, q8, %q12       \n"
+                    "vmul.f32   q13, q9, %q13       \n"
+
+                    "vld1.f32   {d20-d21}, [%4]!    \n"// q10 = r2
+                    "vld1.f32   {d22-d23}, [%5]!    \n"// q11 = r3
+
+                    "vmla.f32   q12, q10, %q14      \n"
+                    "vmla.f32   q13, q11, %q15      \n"
+
+                    "vadd.f32   q14, q12, q13       \n"
+
+                    "vld1.f32   {d16-d17}, [%2]!    \n"// q8  = r0
+                    "vld1.f32   {d18-d19}, [%3]!    \n"// q9  = r1
+
+                    "vmul.f32   q12, q8, %q12       \n"
+                    "vmul.f32   q13, q9, %q13       \n"
+
+                    "vld1.f32   {d20-d21}, [%4]!    \n"// q10 = r2
+                    "vld1.f32   {d22-d23}, [%5]!    \n"// q11 = r3
+
+                    "vmla.f32   q12, q10, %q14      \n"
+                    "vmla.f32   q13, q11, %q15      \n"
+
+                    "vadd.f32   q15, q12, q13       \n"
+
+                    "vadd.f32   d10, d10, d11       \n"
+                    "vadd.f32   d28, d28, d29       \n"
+                    "vadd.f32   d11, d12, d13       \n"
+                    "vadd.f32   d29, d30, d31       \n"
+
+                    "vpadd.f32  d10, d10, d11       \n"
+                    "vpadd.f32  d11, d28, d29       \n"
+
+                    "vadd.f32   q7, q7, q5          \n"
+
+                    "vst1.f32   {d14-d15}, [%1]!    \n"
+
+                    "pld        [%1, #128]          \n"
+
+                    "subs       %0, #1              \n"
+                    "bne        0b                  \n"
+                    : "=r"(nn),         // %0
+                      "=r"(outptr),     // %1
+                      "=r"(r0),         // %2
+                      "=r"(r1),         // %3
+                      "=r"(r2),         // %4
+                      "=r"(r3)          // %5
+                    : "0"(nn),
+                      "1"(outptr),
+                      "2"(r0),
+                      "3"(r1),
+                      "4"(r2),
+                      "5"(r3),
+                      "w"(_k0123),      // %12
+                      "w"(_k4567),      // %13
+                      "w"(_k891011),    // %14
+                      "w"(_k12131415)   // %15
+                    : "cc", "memory", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+                );
+                }
+#endif // __aarch64__
+#endif // __ARM_NEON
+                for (; remain>0; remain--)
+                {
+#if __ARM_NEON
+#if __aarch64__
+                    float32x4_t _r0 = vld1q_f32(r0);
+                    float32x4_t _r1 = vld1q_f32(r1);
+                    float32x4_t _r2 = vld1q_f32(r2);
+                    float32x4_t _r3 = vld1q_f32(r3);
+
+                    float32x4_t _sum = vmulq_f32(_r0, _k0123);
+                    _sum = vmlaq_f32(_sum, _r1, _k4567);
+                    _sum = vmlaq_f32(_sum, _r2, _k891011);
+                    _sum = vmlaq_f32(_sum, _r3, _k12131415);
+
+                    *outptr += vaddvq_f32(_sum);
+#else
+                    float sum = 0.f;
+                    // plain loads (no writeback): r0-r3 advance once below for every code path
+                    asm volatile(
+                        "vld1.f32   {d16-d17}, [%0]     \n"// q8  = r0
+                        "vld1.f32   {d18-d19}, [%1]     \n"// q9  = r1
+
+                        "vmul.f32   q12, q8, %q9        \n"
+                        "vmul.f32   q13, q9, %q10       \n"
+
+                        "vld1.f32   {d20-d21}, [%2]     \n"// q10 = r2
+                        "vld1.f32   {d22-d23}, [%3]     \n"// q11 = r3
+
+                        "vmla.f32   q12, q10, %q11      \n"
+                        "vmla.f32   q13, q11, %q12      \n"
+
+                        "vadd.f32   q5, q12, q13        \n"
+                        "vadd.f32   d10, d10, d11       \n"
+                        "vpadd.f32  d10, d10, d10       \n"
+                        "vmov.f32   %4, d10[0]          \n"
+                        : "=r"(r0),         // %0
+                          "=r"(r1),         // %1
+                          "=r"(r2),         // %2
+                          "=r"(r3),         // %3
+                          "=r"(sum)         // %4
+                        : "0"(r0),
+                          "1"(r1),
+                          "2"(r2),
+                          "3"(r3),
+                          "w"(_k0123),      // %9
+                          "w"(_k4567),      // %10
+                          "w"(_k891011),    // %11
+                          "w"(_k12131415)   // %12
+                        : "cc", "memory", "q5", "q6", "q8", "q9", "q10", "q11", "q12", "q13"
+                    );
+
+                    *outptr += sum;
+#endif // __aarch64__
+#else
+                    float sum = 0;
+
+                    sum += r0[0] * k0[0];
+                    sum += r0[1] * k0[1];
+                    sum += r0[2] * k0[2];
+                    sum += r0[3] * k0[3];
+
+                    sum += r1[0] * k1[0];
+                    sum += r1[1] * k1[1];
+                    sum += r1[2] * k1[2];
+                    sum += r1[3] * k1[3];
+
+                    sum += r2[0] * k2[0];
+                    sum += r2[1] * k2[1];
+                    sum += r2[2] * k2[2];
+                    sum += r2[3] * k2[3];
+
+                    sum += r3[0] * k3[0];
+                    sum += r3[1] * k3[1];
+                    sum += r3[2] * k3[2];
+                    sum += r3[3] * k3[3];
+
+                    *outptr += sum;
+#endif // __ARM_NEON
+                    r0 += 4;
+                    r1 += 4;
+                    r2 += 4;
+                    r3 += 4;
+                    outptr++;
+                }
+
+                r0 += tailstep;
+                r1 += tailstep;
+                r2 += tailstep;
+                r3 += tailstep;
+            }
+        }
+    }
+}
+
diff --git a/src/layer/arm/convolution_5x5.h b/src/layer/arm/convolution_5x5.h
new file mode 100644
index 00000000000..2c44a3b4cff
--- /dev/null
+++ b/src/layer/arm/convolution_5x5.h
@@ -0,0 +1,1251 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+
+    #pragma omp parallel for
+    for (int p=0; p<outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+
+        const float bias0 = bias ? bias[p] : 0.f;
+
+        out.fill(bias0);
+
+        for (int q=0; q<inch; q++)
+        {
+            float* outptr = out;
+            float* outptr2 = outptr + outw;
+
+            const float* img0 = bottom_blob.channel(q);
+
+            const float* kernel0 = kernel + p*inch*25  + q*25;
+
+            const float* r0 = img0;
+            const float* r1 = img0 + w;
+            const float* r2 = img0 + w*2;
+            const float* r3 = img0 + w*3;
+            const float* r4 = img0 + w*4;
+            const float* r5 = img0 + w*5;
+
+            const float* k0 = kernel0;
+            const float* k1 = kernel0 + 5;
+            const float* k2 = kernel0 + 10;
+            const float* k3 = kernel0 + 15;
+            const float* k4 = kernel0 + 20;
+
+#if __ARM_NEON
+            float32x4_t _k0123 = vld1q_f32(kernel0);
+            float32x4_t _k4567 = vld1q_f32(kernel0+4);
+            float32x4_t _k891011 = vld1q_f32(kernel0+8);
+            float32x4_t _k12131415 = vld1q_f32(kernel0+12);
+            float32x4_t _k16171819 = vld1q_f32(kernel0+16);
+            float32x4_t _k20212223 = vld1q_f32(kernel0+20);
+            float32x4_t _k24242424 = vdupq_n_f32(kernel0[24]);
+#endif // __ARM_NEON
+
+            int i = 0;
+
+            for (; i+1 < outh; i+=2)
+            {
+
+#if __ARM_NEON
+                int nn = outw >> 2;
+                int remain = outw - (nn << 2);
+#else
+                int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+                for (; nn>0; nn--)
+                {
+                    float32x4_t _sum = vdupq_n_f32(0.f);
+                    float32x4_t _sum2 = vdupq_n_f32(0.f);
+
+                    float32x4_t _r00 = vld1q_f32(r0);
+                    float32x4_t _r04 = vld1q_f32(r0 + 4);
+                    float32x4_t _r01 = vextq_f32(_r00, _r04, 1);
+                    float32x4_t _r02 = vextq_f32(_r00, _r04, 2);
+                    float32x4_t _r03 = vextq_f32(_r00, _r04, 3);
+
+                    float32x4_t _r10 = vld1q_f32(r1);
+                    float32x4_t _r14 = vld1q_f32(r1 + 4);
+                    float32x4_t _r11 = vextq_f32(_r10, _r14, 1);
+                    float32x4_t _r12 = vextq_f32(_r10, _r14, 2);
+                    float32x4_t _r13 = vextq_f32(_r10, _r14, 3);
+
+                    float32x4_t _r20 = vld1q_f32(r2);
+                    float32x4_t _r24 = vld1q_f32(r2 + 4);
+                    float32x4_t _r21 = vextq_f32(_r20, _r24, 1);
+                    float32x4_t _r22 = vextq_f32(_r20, _r24, 2);
+                    float32x4_t _r23 = vextq_f32(_r20, _r24, 3);
+
+                    float32x4_t _r30 = vld1q_f32(r3);
+                    float32x4_t _r34 = vld1q_f32(r3 + 4);
+                    float32x4_t _r31 = vextq_f32(_r30, _r34, 1);
+                    float32x4_t _r32 = vextq_f32(_r30, _r34, 2);
+                    float32x4_t _r33 = vextq_f32(_r30, _r34, 3);
+
+                    float32x4_t _r40 = vld1q_f32(r4);
+                    float32x4_t _r44 = vld1q_f32(r4 + 4);
+                    float32x4_t _r41 = vextq_f32(_r40, _r44, 1);
+                    float32x4_t _r42 = vextq_f32(_r40, _r44, 2);
+                    float32x4_t _r43 = vextq_f32(_r40, _r44, 3);
+
+                    float32x4_t _r50 = vld1q_f32(r5);
+                    float32x4_t _r54 = vld1q_f32(r5 + 4);
+                    float32x4_t _r51 = vextq_f32(_r50, _r54, 1);
+                    float32x4_t _r52 = vextq_f32(_r50, _r54, 2);
+                    float32x4_t _r53 = vextq_f32(_r50, _r54, 3);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0);
+
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r10, _k0123, 0);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r11, _k0123, 1);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r12, _k0123, 2);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r13, _k0123, 3);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r14, _k4567, 0);
+
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r20, _k4567, 1);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r21, _k4567, 2);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r22, _k4567, 3);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r23, _k891011, 0);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r24, _k891011, 1);
+
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r30, _k891011, 2);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r31, _k891011, 3);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r32, _k12131415, 0);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r33, _k12131415, 1);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r34, _k12131415, 2);
+
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r40, _k12131415, 3);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r41, _k16171819, 0);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r42, _k16171819, 1);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r43, _k16171819, 2);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r44, _k16171819, 3);
+
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r50, _k20212223, 0);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r51, _k20212223, 1);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r52, _k20212223, 2);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r53, _k20212223, 3);
+                    _sum2 = vfmaq_laneq_f32(_sum2, _r54, _k24242424, 0);
+
+                    vst1q_f32(outptr, _sum);
+                    vst1q_f32(outptr2, _sum2);
+
+                    r0 += 4;
+                    r1 += 4;
+                    r2 += 4;
+                    r3 += 4;
+                    r4 += 4;
+                    r5 += 4;
+                    outptr += 4;
+                    outptr2 += 4;
+                }
+#else
+                if (nn > 0)
+                {
+                asm volatile(
+//                     "veor       q13, q13            \n"
+//                     "veor       q14, q14            \n"
+
+                    "pld        [%1, #128]          \n"
+
+                    "vld1.f32   {d14-d15}, [%1]     \n"// q7 = out
+
+                    "0:                             \n"
+
+                    // q11 = rx1 / rx3
+                    // q12 = rx2
+
+                    // q13 q14 = intermediate sum register
+
+                    "pld        [%2, #128]          \n"
+
+                    "vld1.f32   {d16-d17}, [%2]     \n"// q8 = out2
+
+
+                    "pld        [%4, #256]          \n"
+
+                    // r1
+                    "vld1.f32   {d18-d21}, [%4]     \n"// q9 q10 = r10 r14
+                    "add        %4, #16             \n"
+
+                    "vext.32    q11, q9, q10, #1    \n"// r11
+                    "vmul.f32   q13, q9, %e19[1]    \n"
+                    "vmla.f32   q8, q9, %e18[0]     \n"
+
+                    "vext.32    q12, q9, q10, #2    \n"// r12
+                    "vmla.f32   q7, q11, %f19[0]    \n"
+                    "vmul.f32   q14, q11, %e18[1]   \n"
+
+                    "vext.32    q11, q9, q10, #3    \n"// r13
+                    "vmla.f32   q13, q12, %f19[1]   \n"
+                    "vmla.f32   q8, q12, %f18[0]    \n"
+
+                    "vmla.f32   q7, q11, %e20[0]    \n"
+                    "vmla.f32   q14, q11, %f18[1]   \n"
+
+                    "pld        [%5, #256]          \n"
+
+                    "vmla.f32   q13, q10, %e20[1]   \n"
+                    "vmla.f32   q8, q10, %e19[0]    \n"
+
+                    // r2
+                    "vld1.f32   {d18-d21}, [%5]     \n"// q9 q10 = r20 r24
+                    "add        %5, #16             \n"
+
+                    "vext.32    q11, q9, q10, #1    \n"// r21
+                    "vmla.f32   q7, q9, %f20[0]     \n"
+                    "vmla.f32   q14, q9, %e19[1]    \n"
+
+                    "vext.32    q12, q9, q10, #2    \n"// r22
+                    "vmla.f32   q13, q11, %f20[1]   \n"
+                    "vmla.f32   q8, q11, %f19[0]    \n"
+
+                    "vext.32    q11, q9, q10, #3    \n"// r23
+                    "vmla.f32   q7, q12, %e21[0]    \n"
+                    "vmla.f32   q14, q12, %f19[1]   \n"
+
+                    "vmla.f32   q13, q11, %e21[1]   \n"
+                    "vmla.f32   q8, q11, %e20[0]    \n"
+
+                    "pld        [%6, #256]          \n"
+
+                    "vmla.f32   q7, q10, %f21[0]    \n"
+                    "vmla.f32   q14, q10, %e20[1]   \n"
+
+                    // r3
+                    "vld1.f32   {d18-d21}, [%6]     \n"// q9 q10 = r30 r34
+                    "add        %6, #16             \n"
+
+                    "vext.32    q11, q9, q10, #1    \n"// r31
+                    "vmla.f32   q13, q9, %f21[1]    \n"
+                    "vmla.f32   q8, q9, %f20[0]     \n"
+
+                    "vext.32    q12, q9, q10, #2    \n"// r32
+                    "vmla.f32   q7, q11, %e22[0]    \n"
+                    "vmla.f32   q14, q11, %f20[1]   \n"
+
+                    "vext.32    q11, q9, q10, #3    \n"// r33
+                    "vmla.f32   q13, q12, %e22[1]   \n"
+                    "vmla.f32   q8, q12, %e21[0]    \n"
+
+                    "vmla.f32   q7, q11, %f22[0]    \n"
+                    "vmla.f32   q14, q11, %e21[1]   \n"
+
+                    "pld        [%7, #256]          \n"
+
+                    "vmla.f32   q13, q10, %f22[1]   \n"
+                    "vmla.f32   q8, q10, %f21[0]    \n"
+
+                    // r4
+                    "vld1.f32   {d18-d21}, [%7]     \n"// q9 q10 = r40 r44
+                    "add        %7, #16             \n"
+
+                    "vext.32    q11, q9, q10, #1    \n"// r41
+                    "vmla.f32   q7, q9, %e23[0]     \n"
+                    "vmla.f32   q14, q9, %f21[1]    \n"
+
+                    "vext.32    q12, q9, q10, #2    \n"// r42
+                    "vmla.f32   q13, q11, %e23[1]   \n"
+                    "vmla.f32   q8, q11, %e22[0]    \n"
+
+                    "vext.32    q11, q9, q10, #3    \n"// r43
+                    "vmla.f32   q7, q12, %f23[0]    \n"
+                    "vmla.f32   q14, q12, %e22[1]   \n"
+
+                    "vmla.f32   q13, q11, %f23[1]   \n"
+                    "vmla.f32   q8, q11, %f22[0]    \n"
+
+                    "pld        [%3, #256]          \n"
+
+                    "vmla.f32   q7, q10, %e24[0]    \n"
+                    "vmla.f32   q14, q10, %f22[1]   \n"
+
+                    // r0 and r5
+                    "vld1.f32   {d18-d21}, [%3]     \n"// q9 q10 = r00 r04
+                    "add        %3, #16             \n"
+
+                    "vext.32    q11, q9, q10, #1    \n"// r01
+                    "vmla.f32   q13, q11, %e18[1]   \n"
+
+                    "vext.32    q12, q9, q10, #2    \n"// r02
+                    "vmla.f32   q7, q12, %f18[0]    \n"
+
+                    "vext.32    q11, q9, q10, #3    \n"// r03
+
+                    "pld        [%8, #256]          \n"
+
+                    "vmla.f32   q13, q11, %f18[1]   \n"
+
+                    // r5
+                    "vld1.f32   {d22-d25}, [%8]     \n"// q11 q12 = r50 r54
+                    "add        %8, #16             \n"
+
+                    "vmla.f32   q8, q11, %e23[0]    \n"
+                    "vmla.f32   q14, q12, %e24[0]   \n"
+
+                    "vmla.f32   q7, q9, %e18[0]     \n"
+                    "vmla.f32   q13, q10, %e19[0]   \n"
+
+                    "vext.32    q9, q11, q12, #1    \n"// r51
+                    "vext.32    q10, q11, q12, #2   \n"// r52
+
+                    "vmla.f32   q14, q9, %e23[1]    \n"
+
+                    "vext.32    q9, q11, q12, #3    \n"// r53
+                    "vmla.f32   q8, q10, %f23[0]    \n"
+
+                    "vmla.f32   q14, q9, %f23[1]    \n"
+
+                    "vadd.f32   q7, q7, q13         \n"
+
+//                     "veor       q13, q13            \n"
+
+                    "vst1.f32   {d14-d15}, [%1]!    \n"
+
+                    "vadd.f32   q8, q8, q14         \n"
+
+                    "pld        [%1, #128]          \n"
+
+                    "vld1.f32   {d14-d15}, [%1]     \n"// q7 = out
+
+//                     "veor       q14, q14            \n"
+
+                    "vst1.f32   {d16-d17}, [%2]!    \n"
+
+                    "subs       %0, #1              \n"
+                    "bne        0b                  \n"
+                    : "=r"(nn),         // %0
+                      "=r"(outptr),     // %1
+                      "=r"(outptr2),    // %2
+                      "=r"(r0),         // %3
+                      "=r"(r1),         // %4
+                      "=r"(r2),         // %5
+                      "=r"(r3),         // %6
+                      "=r"(r4),         // %7
+                      "=r"(r5)          // %8
+                    : "0"(nn),
+                      "1"(outptr),
+                      "2"(outptr2),
+                      "3"(r0),
+                      "4"(r1),
+                      "5"(r2),
+                      "6"(r3),
+                      "7"(r4),
+                      "8"(r5),
+                      "w"(_k0123),      // %18
+                      "w"(_k4567),      // %19
+                      "w"(_k891011),    // %20
+                      "w"(_k12131415),  // %21
+                      "w"(_k16171819),  // %22
+                      "w"(_k20212223),  // %23
+                      "w"(_k24242424)   // %24
+                    : "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+                );
+                }
+#endif // __aarch64__
+#endif // __ARM_NEON
+                for (; remain>0; remain--)
+                {
+                    float sum = 0;
+                    float sum2 = 0;
+#if __ARM_NEON
+                    float32x4_t _r1 = vld1q_f32(r1);
+                    float32x4_t _k1 = vld1q_f32(k1);
+                    float32x4_t _sum = vmulq_f32(_r1, _k1);
+                    float32x4_t _sum2 = vmulq_f32(_r1, _k0123);
+
+                    float32x4_t _r2 = vld1q_f32(r2);
+                    float32x4_t _k2 = vld1q_f32(k2);
+                    _sum = vmlaq_f32(_sum, _r2, _k2);
+                    _sum2 = vmlaq_f32(_sum2, _r2, _k1);
+
+                    float32x4_t _r3 = vld1q_f32(r3);
+                    _sum = vmlaq_f32(_sum, _r3, _k20212223);
+                    _sum2 = vmlaq_f32(_sum2, _r3, _k2);
+
+                    float32x4_t _r4 = vld1q_f32(r4);
+                    float32x4_t _k4 = vld1q_f32(k4);
+                    _sum = vmlaq_f32(_sum, _r4, _k4);
+                    _sum2 = vmlaq_f32(_sum2, _r4, _k20212223);
+
+                    float32x4_t _r0 = vld1q_f32(r0);
+                    _sum = vmlaq_f32(_sum, _r0, _k0123);
+                    float32x4_t _r5 = vld1q_f32(r5);
+                    _sum2 = vmlaq_f32(_sum2, _r5, _k4);
+
+                    float32x4_t _k_t4;
+                    _k_t4 = vsetq_lane_f32(k0[4], _k_t4, 0);
+                    _k_t4 = vsetq_lane_f32(k1[4], _k_t4, 1);
+                    _k_t4 = vsetq_lane_f32(k2[4], _k_t4, 2);
+                    _k_t4 = vsetq_lane_f32(k3[4], _k_t4, 3);
+
+                    float32x4_t _r_t4;
+
+                    _r_t4 = vsetq_lane_f32(r0[4], _r_t4, 0);
+                    _r_t4 = vsetq_lane_f32(r1[4], _r_t4, 1);
+                    _r_t4 = vsetq_lane_f32(r2[4], _r_t4, 2);
+                    _r_t4 = vsetq_lane_f32(r3[4], _r_t4, 3);
+                    _sum = vmlaq_f32(_sum, _r_t4, _k_t4);
+
+                    sum = r4[4] * k4[4];
+
+                    _r_t4 = vextq_f32(_r_t4, _r_t4, 1);
+                    _r_t4 = vsetq_lane_f32(r4[4], _r_t4, 3);
+                    _sum2 = vmlaq_f32(_sum2, _r_t4, _k_t4);
+
+                    sum2 = r5[4] * k4[4];
+
+                    float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
+                    float32x2_t _ss2 = vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));
+                    float32x2_t _ss_ss2 = vpadd_f32(_ss, _ss2);
+
+                    sum += vget_lane_f32(_ss_ss2, 0);
+                    sum2 += vget_lane_f32(_ss_ss2, 1);
+#else
+                    sum += r0[0] * k0[0];
+                    sum += r0[1] * k0[1];
+                    sum += r0[2] * k0[2];
+                    sum += r0[3] * k0[3];
+                    sum += r0[4] * k0[4];
+
+                    sum += r1[0] * k1[0];
+                    sum += r1[1] * k1[1];
+                    sum += r1[2] * k1[2];
+                    sum += r1[3] * k1[3];
+                    sum += r1[4] * k1[4];
+
+                    sum += r2[0] * k2[0];
+                    sum += r2[1] * k2[1];
+                    sum += r2[2] * k2[2];
+                    sum += r2[3] * k2[3];
+                    sum += r2[4] * k2[4];
+
+                    sum += r3[0] * k3[0];
+                    sum += r3[1] * k3[1];
+                    sum += r3[2] * k3[2];
+                    sum += r3[3] * k3[3];
+                    sum += r3[4] * k3[4];
+
+                    sum += r4[0] * k4[0];
+                    sum += r4[1] * k4[1];
+                    sum += r4[2] * k4[2];
+                    sum += r4[3] * k4[3];
+                    sum += r4[4] * k4[4];
+
+                    sum2 += r1[0] * k0[0];
+                    sum2 += r1[1] * k0[1];
+                    sum2 += r1[2] * k0[2];
+                    sum2 += r1[3] * k0[3];
+                    sum2 += r1[4] * k0[4];
+
+                    sum2 += r2[0] * k1[0];
+                    sum2 += r2[1] * k1[1];
+                    sum2 += r2[2] * k1[2];
+                    sum2 += r2[3] * k1[3];
+                    sum2 += r2[4] * k1[4];
+
+                    sum2 += r3[0] * k2[0];
+                    sum2 += r3[1] * k2[1];
+                    sum2 += r3[2] * k2[2];
+                    sum2 += r3[3] * k2[3];
+                    sum2 += r3[4] * k2[4];
+
+                    sum2 += r4[0] * k3[0];
+                    sum2 += r4[1] * k3[1];
+                    sum2 += r4[2] * k3[2];
+                    sum2 += r4[3] * k3[3];
+                    sum2 += r4[4] * k3[4];
+
+                    sum2 += r5[0] * k4[0];
+                    sum2 += r5[1] * k4[1];
+                    sum2 += r5[2] * k4[2];
+                    sum2 += r5[3] * k4[3];
+                    sum2 += r5[4] * k4[4];
+#endif // __ARM_NEON
+                    *outptr += sum;
+                    *outptr2 += sum2;
+
+                    r0++;
+                    r1++;
+                    r2++;
+                    r3++;
+                    r4++;
+                    r5++;
+                    outptr++;
+                    outptr2++;
+                }
+
+                r0 += 4 + w;
+                r1 += 4 + w;
+                r2 += 4 + w;
+                r3 += 4 + w;
+                r4 += 4 + w;
+                r5 += 4 + w;
+
+                outptr += outw;
+                outptr2 += outw;
+            }
+
+            for (; i < outh; i++)
+            {
+
+#if __ARM_NEON
+                int nn = outw >> 2;
+                int remain = outw - (nn << 2);
+#else
+                int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+                for (; nn>0; nn--)
+                {
+                    float32x4_t _sum = vdupq_n_f32(0.f);
+
+                    float32x4_t _r00 = vld1q_f32(r0);
+                    float32x4_t _r04 = vld1q_f32(r0 + 4);
+                    float32x4_t _r01 = vextq_f32(_r00, _r04, 1);
+                    float32x4_t _r02 = vextq_f32(_r00, _r04, 2);
+                    float32x4_t _r03 = vextq_f32(_r00, _r04, 3);
+
+                    float32x4_t _r10 = vld1q_f32(r1);
+                    float32x4_t _r14 = vld1q_f32(r1 + 4);
+                    float32x4_t _r11 = vextq_f32(_r10, _r14, 1);
+                    float32x4_t _r12 = vextq_f32(_r10, _r14, 2);
+                    float32x4_t _r13 = vextq_f32(_r10, _r14, 3);
+
+                    float32x4_t _r20 = vld1q_f32(r2);
+                    float32x4_t _r24 = vld1q_f32(r2 + 4);
+                    float32x4_t _r21 = vextq_f32(_r20, _r24, 1);
+                    float32x4_t _r22 = vextq_f32(_r20, _r24, 2);
+                    float32x4_t _r23 = vextq_f32(_r20, _r24, 3);
+
+                    float32x4_t _r30 = vld1q_f32(r3);
+                    float32x4_t _r34 = vld1q_f32(r3 + 4);
+                    float32x4_t _r31 = vextq_f32(_r30, _r34, 1);
+                    float32x4_t _r32 = vextq_f32(_r30, _r34, 2);
+                    float32x4_t _r33 = vextq_f32(_r30, _r34, 3);
+
+                    float32x4_t _r40 = vld1q_f32(r4);
+                    float32x4_t _r44 = vld1q_f32(r4 + 4);
+                    float32x4_t _r41 = vextq_f32(_r40, _r44, 1);
+                    float32x4_t _r42 = vextq_f32(_r40, _r44, 2);
+                    float32x4_t _r43 = vextq_f32(_r40, _r44, 3);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0);
+
+                    vst1q_f32(outptr, _sum);
+
+                    r0 += 4;
+                    r1 += 4;
+                    r2 += 4;
+                    r3 += 4;
+                    r4 += 4;
+                    outptr += 4;
+                }
+#else
+                if (nn > 0)
+                {
+                asm volatile(
+//                     "veor       q15, q15            \n"// _sum3 = 0;
+
+                    "pld        [%1, #128]          \n"
+
+                    "pld        [%2, #256]          \n"
+
+                    "vld1.f32   {d16-d19}, [%2]     \n"// _r00 = vld1q_f32(r0+j);
+                    "add        %2, #16             \n"
+
+                    "0:                             \n"
+
+                    "vld1.f32   {d14-d15}, [%1]     \n"// _sum = vld1q_f32(outptr+j);
+                    "veor       q13, q13            \n"// _sum2 = 0;
+                    "veor       q14, q14            \n"// _sum3 = 0;
+
+                    "vext.32    q10, q8, q9, #1     \n"// _r01
+                    "vext.32    q11, q8, q9, #2     \n"// _r02
+                    "vext.32    q12, q8, q9, #3     \n"// _r03
+
+                    "vmla.f32   q7, q8, %e14[0]     \n"
+                    "vmla.f32   q13, q10, %e14[1]   \n"
+
+                    "pld        [%3, #256]          \n"
+
+                    "vmla.f32   q14, q11, %f14[0]   \n"
+                    "vmul.f32   q15, q12, %f14[1]   \n"
+                    "vmla.f32   q7, q9, %e15[0]     \n"
+
+                    "vld1.f32   {d16-d19}, [%3]     \n"
+                    "add        %3, #16             \n"
+                    "vext.32    q10, q8, q9, #1     \n"
+                    "vext.32    q11, q8, q9, #2     \n"
+                    "vext.32    q12, q8, q9, #3     \n"
+
+                    "vmla.f32   q7, q8, %e15[1]     \n"
+                    "vmla.f32   q13, q10, %f15[0]   \n"
+
+                    "pld        [%4, #256]          \n"
+
+                    "vmla.f32   q14, q11, %f15[1]   \n"
+                    "vmla.f32   q15, q12, %e16[0]   \n"
+                    "vmla.f32   q7, q9, %e16[1]     \n"
+
+                    "vld1.f32   {d16-d19}, [%4]     \n"
+                    "add        %4, #16             \n"
+                    "vext.32    q10, q8, q9, #1     \n"
+                    "vext.32    q11, q8, q9, #2     \n"
+                    "vext.32    q12, q8, q9, #3     \n"
+
+                    "vmla.f32   q7, q8, %f16[0]     \n"
+                    "vmla.f32   q13, q10, %f16[1]   \n"
+
+                    "pld        [%5, #256]          \n"
+
+                    "vmla.f32   q14, q11, %e17[0]   \n"
+                    "vmla.f32   q15, q12, %e17[1]   \n"
+                    "vmla.f32   q7, q9, %f17[0]     \n"
+
+                    "vld1.f32   {d16-d19}, [%5]     \n"
+                    "add        %5, #16             \n"
+                    "vext.32    q10, q8, q9, #1     \n"
+                    "vext.32    q11, q8, q9, #2     \n"
+                    "vext.32    q12, q8, q9, #3     \n"
+
+                    "vmla.f32   q7, q8, %f17[1]     \n"
+                    "vmla.f32   q13, q10, %e18[0]   \n"
+
+                    "pld        [%6, #256]          \n"
+
+                    "vmla.f32   q14, q11, %e18[1]   \n"
+                    "vmla.f32   q15, q12, %f18[0]   \n"
+                    "vmla.f32   q7, q9, %f18[1]     \n"
+
+                    "vld1.f32   {d16-d19}, [%6]     \n"
+                    "add        %6, #16             \n"
+                    "vext.32    q10, q8, q9, #1     \n"
+                    "vext.32    q11, q8, q9, #2     \n"
+                    "vext.32    q12, q8, q9, #3     \n"
+
+                    "vmla.f32   q7, q8, %e19[0]     \n"
+                    "vmla.f32   q13, q10, %e19[1]   \n"
+                    "vmla.f32   q14, q11, %f19[0]   \n"
+                    "vmla.f32   q15, q12, %f19[1]   \n"
+                    "vmla.f32   q7, q9, %e20[0]     \n"
+
+                    "vadd.f32   q14, q14, q15       \n"
+                    "vadd.f32   q7, q7, q13         \n"
+//                     "veor       q15, q15            \n"// _sum3 = 0;
+
+                    "pld        [%2, #256]          \n"
+
+                    "vadd.f32   q7, q7, q14         \n"
+
+                    "vld1.f32   {d16-d19}, [%2]     \n"// _r00 = vld1q_f32(r0+j);
+                    "add        %2, #16             \n"
+
+                    "vst1.f32   {d14-d15}, [%1]!    \n"
+
+                    "pld        [%1, #128]          \n"
+
+                    "subs       %0, #1              \n"
+                    "bne        0b                  \n"
+
+                    "sub        %2, #16             \n"
+                    : "=r"(nn),         // %0
+                      "=r"(outptr),     // %1
+                      "=r"(r0),         // %2
+                      "=r"(r1),         // %3
+                      "=r"(r2),         // %4
+                      "=r"(r3),         // %5
+                      "=r"(r4)          // %6
+                    : "0"(nn),
+                      "1"(outptr),
+                      "2"(r0),
+                      "3"(r1),
+                      "4"(r2),
+                      "5"(r3),
+                      "6"(r4),
+                      "w"(_k0123),      // %14
+                      "w"(_k4567),      // %15
+                      "w"(_k891011),    // %16
+                      "w"(_k12131415),  // %17
+                      "w"(_k16171819),  // %18
+                      "w"(_k20212223),  // %19
+                      "w"(_k24242424)   // %20
+                    : "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+                );
+                }
+#endif // __aarch64__
+#endif // __ARM_NEON
+                for (; remain>0; remain--)
+                {
+                    float sum = 0;
+#if __ARM_NEON
+                    float32x4_t _r0 = vld1q_f32(r0);
+                    float32x4_t _sum = vmulq_f32(_r0, _k0123);
+
+                    float32x4_t _r1 = vld1q_f32(r1);
+                    _sum = vmlaq_f32(_sum, _r1, vld1q_f32(k1));
+
+                    float32x4_t _r2 = vld1q_f32(r2);
+                    _sum = vmlaq_f32(_sum, _r2, vld1q_f32(k2));
+
+                    float32x4_t _r3 = vld1q_f32(r3);
+                    _sum = vmlaq_f32(_sum, _r3, _k20212223);
+
+                    float32x4_t _r4 = vld1q_f32(r4);
+                    _sum = vmlaq_f32(_sum, _r4, vld1q_f32(k4));
+
+                    float32x4_t _k_t4;
+                    _k_t4 = vsetq_lane_f32(k0[4], _k_t4, 0);
+                    _k_t4 = vsetq_lane_f32(k1[4], _k_t4, 1);
+                    _k_t4 = vsetq_lane_f32(k2[4], _k_t4, 2);
+                    _k_t4 = vsetq_lane_f32(k3[4], _k_t4, 3);
+
+                    float32x4_t _r_t4;
+
+                    _r_t4 = vsetq_lane_f32(r0[4], _r_t4, 0);
+                    _r_t4 = vsetq_lane_f32(r1[4], _r_t4, 1);
+                    _r_t4 = vsetq_lane_f32(r2[4], _r_t4, 2);
+                    _r_t4 = vsetq_lane_f32(r3[4], _r_t4, 3);
+                    _sum = vmlaq_f32(_sum, _r_t4, _k_t4);
+
+                    sum = r4[4] * k4[4];
+
+                    float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
+                    _ss = vpadd_f32(_ss, _ss);
+
+                    sum += vget_lane_f32(_ss, 0);
+#else
+                    sum += r0[0] * k0[0];
+                    sum += r0[1] * k0[1];
+                    sum += r0[2] * k0[2];
+                    sum += r0[3] * k0[3];
+                    sum += r0[4] * k0[4];
+
+                    sum += r1[0] * k1[0];
+                    sum += r1[1] * k1[1];
+                    sum += r1[2] * k1[2];
+                    sum += r1[3] * k1[3];
+                    sum += r1[4] * k1[4];
+
+                    sum += r2[0] * k2[0];
+                    sum += r2[1] * k2[1];
+                    sum += r2[2] * k2[2];
+                    sum += r2[3] * k2[3];
+                    sum += r2[4] * k2[4];
+
+                    sum += r3[0] * k3[0];
+                    sum += r3[1] * k3[1];
+                    sum += r3[2] * k3[2];
+                    sum += r3[3] * k3[3];
+                    sum += r3[4] * k3[4];
+
+                    sum += r4[0] * k4[0];
+                    sum += r4[1] * k4[1];
+                    sum += r4[2] * k4[2];
+                    sum += r4[3] * k4[3];
+                    sum += r4[4] * k4[4];
+#endif
+                    *outptr += sum;
+
+                    r0++;
+                    r1++;
+                    r2++;
+                    r3++;
+                    r4++;
+                    outptr++;
+                }
+
+                r0 += 4;
+                r1 += 4;
+                r2 += 4;
+                r3 += 4;
+                r4 += 4;
+
+            }
+
+        }
+    }
+
+}
+
+// 5x5 convolution, stride 2. For every output channel p the plane is seeded with
+// bias[p] (0 when _bias converts to a null pointer) and then each input channel q
+// adds its correlation with the 25 floats at kernel + (p*inch + q)*25.
+// top_blob is only read for its w/h/c here, so the caller is expected to have
+// allocated it already; the input height is never consulted.
+static void conv5x5s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+    // r0..r4 advance 2*outw floats per output row; tailstep completes the move to 2*w, i.e. two input rows down
+    const int tailstep = w - 2*outw + w;
+
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+
+    #pragma omp parallel for
+    for (int p=0; p<outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+
+        const float bias0 = bias ? bias[p] : 0.f;
+        // seed the plane with the bias; every input channel below accumulates on top of it
+        out.fill(bias0);
+
+        for (int q=0; q<inch; q++)
+        {
+            float* outptr = out;
+
+            const float* img0 = bottom_blob.channel(q);
+
+            const float* kernel0 = kernel + p*inch*25  + q*25;
+            // five consecutive input rows that feed the current output row
+            const float* r0 = img0;
+            const float* r1 = img0 + w;
+            const float* r2 = img0 + w*2;
+            const float* r3 = img0 + w*3;
+            const float* r4 = img0 + w*4;
+            // one pointer per kernel row (5 taps each)
+            const float* k0 = kernel0;
+            const float* k1 = kernel0 + 5;
+            const float* k2 = kernel0 + 10;
+            const float* k3 = kernel0 + 15;
+            const float* k4 = kernel0 + 20;
+
+#if __ARM_NEON
+            float32x4_t _k0123 = vld1q_f32(kernel0);
+            float32x4_t _k4567 = vld1q_f32(kernel0+4);
+            float32x4_t _k891011 = vld1q_f32(kernel0+8);
+            float32x4_t _k12131415 = vld1q_f32(kernel0+12);
+            float32x4_t _k16171819 = vld1q_f32(kernel0+16);
+            float32x4_t _k20212223 = vld1q_f32(kernel0+20);
+            float32x4_t _k24242424 = vdupq_n_f32(kernel0[24]);
+#endif // __ARM_NEON
+
+            for (int i = 0; i < outh; i++)
+            {
+#if __ARM_NEON
+                int nn = outw >> 2;
+                int remain = outw - (nn << 2);
+#else
+                int remain = outw;
+#endif // __ARM_NEON
+#if __ARM_NEON
+#if __aarch64__
+                for (; nn>0; nn--)
+                {
+                    float32x4_t _sum = vld1q_f32(outptr);// running total: bias + earlier input channels (same as the armv7 asm)
+
+                    float32x4x2_t _r00_02461357 = vld2q_f32(r0);
+                    float32x4x2_t _r00nx2 = vld2q_f32(r0 + 8);
+                    float32x4_t _r0_8101214 = _r00nx2.val[0];// 8 10 12 14
+                    float32x4_t _r0_9111315 = _r00nx2.val[1];// 9 11 13 15
+                    float32x4_t _r00 = _r00_02461357.val[0];// 0 2 4 6
+                    float32x4_t _r01 = _r00_02461357.val[1];// 1 3 5 7
+                    float32x4_t _r02 = vextq_f32(_r00, _r0_8101214, 1);// 2 4 6 8
+                    float32x4_t _r03 = vextq_f32(_r01, _r0_9111315, 1);// 3 5 7 9
+                    float32x4_t _r04 = vextq_f32(_r00, _r0_8101214, 2);// 4 6 8 10
+                    // NOTE(review): the rN + 8 loads fetch rN[8..15] but only rN[8..10] are consumed - confirm rows have that much readable slack
+                    float32x4x2_t _r10_02461357 = vld2q_f32(r1);
+                    float32x4x2_t _r10nx2 = vld2q_f32(r1 + 8);
+                    float32x4_t _r1_8101214 = _r10nx2.val[0];
+                    float32x4_t _r1_9111315 = _r10nx2.val[1];
+                    float32x4_t _r10 = _r10_02461357.val[0];
+                    float32x4_t _r11 = _r10_02461357.val[1];
+                    float32x4_t _r12 = vextq_f32(_r10, _r1_8101214, 1);
+                    float32x4_t _r13 = vextq_f32(_r11, _r1_9111315, 1);
+                    float32x4_t _r14 = vextq_f32(_r10, _r1_8101214, 2);
+
+                    float32x4x2_t _r20_02461357 = vld2q_f32(r2);
+                    float32x4x2_t _r20nx2 = vld2q_f32(r2 + 8);
+                    float32x4_t _r2_8101214 = _r20nx2.val[0];
+                    float32x4_t _r2_9111315 = _r20nx2.val[1];
+                    float32x4_t _r20 = _r20_02461357.val[0];
+                    float32x4_t _r21 = _r20_02461357.val[1];
+                    float32x4_t _r22 = vextq_f32(_r20, _r2_8101214, 1);
+                    float32x4_t _r23 = vextq_f32(_r21, _r2_9111315, 1);
+                    float32x4_t _r24 = vextq_f32(_r20, _r2_8101214, 2);
+
+                    float32x4x2_t _r30_02461357 = vld2q_f32(r3);
+                    float32x4x2_t _r30nx2 = vld2q_f32(r3 + 8);
+                    float32x4_t _r3_8101214 = _r30nx2.val[0];
+                    float32x4_t _r3_9111315 = _r30nx2.val[1];
+                    float32x4_t _r30 = _r30_02461357.val[0];
+                    float32x4_t _r31 = _r30_02461357.val[1];
+                    float32x4_t _r32 = vextq_f32(_r30, _r3_8101214, 1);
+                    float32x4_t _r33 = vextq_f32(_r31, _r3_9111315, 1);
+                    float32x4_t _r34 = vextq_f32(_r30, _r3_8101214, 2);
+
+                    float32x4x2_t _r40_02461357 = vld2q_f32(r4);
+                    float32x4x2_t _r40nx2 = vld2q_f32(r4 + 8);
+                    float32x4_t _r4_8101214 = _r40nx2.val[0];
+                    float32x4_t _r4_9111315 = _r40nx2.val[1];
+                    float32x4_t _r40 = _r40_02461357.val[0];
+                    float32x4_t _r41 = _r40_02461357.val[1];
+                    float32x4_t _r42 = vextq_f32(_r40, _r4_8101214, 1);
+                    float32x4_t _r43 = vextq_f32(_r41, _r4_9111315, 1);
+                    float32x4_t _r44 = vextq_f32(_r40, _r4_8101214, 2);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0);
+
+                    vst1q_f32(outptr, _sum);
+
+                    r0 += 8;
+                    r1 += 8;
+                    r2 += 8;
+                    r3 += 8;
+                    r4 += 8;
+                    outptr += 4;
+                }
+#else
+                if (nn > 0)// the asm loop tests nn only at its bottom (subs/bne), so it must not be entered with nn == 0
+                {
+                asm volatile(
+                    "veor       q15, q15            \n"// _sump4 = 0;
+                    "pld        [%1, #128]          \n"
+                    "veor       q13, q13            \n"// _sump2 = 0;
+                    "pld        [%2, #256]          \n"
+                    "veor       q14, q14            \n"// _sump3 = 0;
+
+                    "vld2.f32   {d16-d19}, [%2]!    \n"// q8  = 0  2  4  6   q9  = 1  3  5  7
+
+                    "pld        [%2, #256]          \n"
+
+                    "vld2.f32   {d20-d23}, [%2]     \n"// q10 = 8 10 12 14   q11 = 9 11 13 15
+
+                    "0:                             \n"
+
+                    "vld1.f32   {d14-d15}, [%1]     \n"// q7 = outptr
+
+                    "vext.32    q12, q8, q10, #1    \n"// q12 = 2 4 6 8
+                    "vext.32    q11, q9, q11, #1    \n"// q11 = 3 5 7 9
+                    "vext.32    q10, q8, q10, #2    \n"// q10 = 4 6 8 10
+
+                    "vmla.f32   q7, q8, %e14[0]     \n"
+                    "vmla.f32   q13, q9, %e14[1]    \n"
+
+                    "pld        [%3, #256]          \n"
+
+                    "vmla.f32   q14, q12, %f14[0]   \n"
+                    "vmla.f32   q15, q11, %f14[1]   \n"
+                    "vmla.f32   q7, q10, %e15[0]    \n"
+
+                    "vld2.f32   {d16-d19}, [%3]!    \n"
+
+                    "pld        [%3, #256]          \n"
+
+                    "vld2.f32   {d20-d23}, [%3]     \n"
+                    "vext.32    q12, q8, q10, #1    \n"
+                    "vext.32    q11, q9, q11, #1    \n"
+                    "vext.32    q10, q8, q10, #2    \n"
+
+                    "vmla.f32   q7, q8, %e15[1]     \n"
+                    "vmla.f32   q13, q9, %f15[0]    \n"
+
+                    "pld        [%4, #256]          \n"
+
+                    "vmla.f32   q14, q12, %f15[1]   \n"
+                    "vmla.f32   q15, q11, %e16[0]   \n"
+                    "vmla.f32   q7, q10, %e16[1]    \n"
+
+                    "vld2.f32   {d16-d19}, [%4]!    \n"
+
+                    "pld        [%4, #256]          \n"
+
+                    "vld2.f32   {d20-d23}, [%4]     \n"
+                    "vext.32    q12, q8, q10, #1    \n"
+                    "vext.32    q11, q9, q11, #1    \n"
+                    "vext.32    q10, q8, q10, #2    \n"
+
+                    "vmla.f32   q7, q8, %f16[0]     \n"
+                    "vmla.f32   q13, q9, %f16[1]    \n"
+
+                    "pld        [%5, #256]          \n"
+
+                    "vmla.f32   q14, q12, %e17[0]   \n"
+                    "vmla.f32   q15, q11, %e17[1]   \n"
+                    "vmla.f32   q7, q10, %f17[0]    \n"
+
+                    "vld2.f32   {d16-d19}, [%5]!    \n"
+
+                    "pld        [%5, #256]          \n"
+
+                    "vld2.f32   {d20-d23}, [%5]     \n"
+                    "vext.32    q12, q8, q10, #1    \n"
+                    "vext.32    q11, q9, q11, #1    \n"
+                    "vext.32    q10, q8, q10, #2    \n"
+
+                    "vmla.f32   q7, q8, %f17[1]     \n"
+                    "vmla.f32   q13, q9, %e18[0]    \n"
+
+                    "pld        [%6, #256]          \n"
+
+                    "vmla.f32   q14, q12, %e18[1]   \n"
+                    "vmla.f32   q15, q11, %f18[0]   \n"
+                    "vmla.f32   q7, q10, %f18[1]    \n"
+
+                    "vld2.f32   {d16-d19}, [%6]!    \n"
+
+                    "pld        [%6, #256]          \n"
+
+                    "vld2.f32   {d20-d23}, [%6]     \n"
+                    "vext.32    q12, q8, q10, #1    \n"
+                    "vext.32    q11, q9, q11, #1    \n"
+                    "vext.32    q10, q8, q10, #2    \n"
+
+                    "vmla.f32   q7, q8, %e19[0]     \n"
+                    "vmla.f32   q13, q9, %e19[1]    \n"
+                    "vmla.f32   q14, q12, %f19[0]   \n"
+                    "vmla.f32   q15, q11, %f19[1]   \n"
+                    "vmla.f32   q7, q10, %e20[0]    \n"
+
+                    "pld        [%2, #256]          \n"
+
+                    "vld2.f32   {d16-d19}, [%2]!    \n"// q8  = 0  2  4  6   q9  = 1  3  5  7
+
+                    "vadd.f32   q14, q14, q15       \n"
+                    "vadd.f32   q7, q7, q13         \n"
+                    "veor       q15, q15            \n"// _sump4 = 0;
+                    "veor       q13, q13            \n"// _sump2 = 0;
+
+                    "pld        [%2, #256]          \n"
+
+                    "vadd.f32   q7, q7, q14         \n"
+
+                    "vld2.f32   {d20-d23}, [%2]     \n"// q10 = 8 10 12 14   q11 = 9 11 13 15
+
+                    "veor       q14, q14            \n"// _sump3 = 0;
+
+                    "vst1.f32   {d14-d15}, [%1]!    \n"
+
+                    "pld        [%1, #128]          \n"
+
+                    "subs       %0, #1              \n"
+                    "bne        0b                  \n"
+
+                    "sub        %2, #32             \n"// undo the post-increment of the look-ahead r0 load
+                    : "=r"(nn),         // %0
+                      "=r"(outptr),     // %1
+                      "=r"(r0),         // %2
+                      "=r"(r1),         // %3
+                      "=r"(r2),         // %4
+                      "=r"(r3),         // %5
+                      "=r"(r4)          // %6
+                    : "0"(nn),
+                      "1"(outptr),
+                      "2"(r0),
+                      "3"(r1),
+                      "4"(r2),
+                      "5"(r3),
+                      "6"(r4),
+                      "w"(_k0123),      // %14
+                      "w"(_k4567),      // %15
+                      "w"(_k891011),    // %16
+                      "w"(_k12131415),  // %17
+                      "w"(_k16171819),  // %18
+                      "w"(_k20212223),  // %19
+                      "w"(_k24242424)   // %20
+                    : "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+                );
+                }
+#endif // __aarch64__
+#endif // __ARM_NEON
+                for (; remain>0; remain--)
+                {
+                    float sum = 0;
+#if __ARM_NEON
+                    float32x4_t _r0 = vld1q_f32(r0);
+                    float32x4_t _sum = vmulq_f32(_r0, _k0123);
+
+                    float32x4_t _r1 = vld1q_f32(r1);
+                    _sum = vmlaq_f32(_sum, _r1, vld1q_f32(k1));
+
+                    float32x4_t _r2 = vld1q_f32(r2);
+                    _sum = vmlaq_f32(_sum, _r2, vld1q_f32(k2));
+
+                    float32x4_t _r3 = vld1q_f32(r3);
+                    _sum = vmlaq_f32(_sum, _r3, vld1q_f32(k3));
+
+                    float32x4_t _r4 = vld1q_f32(r4);
+                    _sum = vmlaq_f32(_sum, _r4, _k20212223);
+                    // column 4 of each kernel row does not fit the 4-lane vectors above; add it in scalar form
+                    sum += r0[4] * k0[4];
+                    sum += r1[4] * k1[4];
+                    sum += r2[4] * k2[4];
+                    sum += r3[4] * k3[4];
+                    sum += r4[4] * k4[4];
+                    // horizontal add of the four lanes of _sum
+                    float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
+                    _ss = vpadd_f32(_ss, _ss);
+
+                    sum += vget_lane_f32(_ss, 0);
+#else
+                    sum += r0[0] * k0[0];
+                    sum += r0[1] * k0[1];
+                    sum += r0[2] * k0[2];
+                    sum += r0[3] * k0[3];
+                    sum += r0[4] * k0[4];
+
+                    sum += r1[0] * k1[0];
+                    sum += r1[1] * k1[1];
+                    sum += r1[2] * k1[2];
+                    sum += r1[3] * k1[3];
+                    sum += r1[4] * k1[4];
+
+                    sum += r2[0] * k2[0];
+                    sum += r2[1] * k2[1];
+                    sum += r2[2] * k2[2];
+                    sum += r2[3] * k2[3];
+                    sum += r2[4] * k2[4];
+
+                    sum += r3[0] * k3[0];
+                    sum += r3[1] * k3[1];
+                    sum += r3[2] * k3[2];
+                    sum += r3[3] * k3[3];
+                    sum += r3[4] * k3[4];
+
+                    sum += r4[0] * k4[0];
+                    sum += r4[1] * k4[1];
+                    sum += r4[2] * k4[2];
+                    sum += r4[3] * k4[3];
+                    sum += r4[4] * k4[4];
+#endif
+                    *outptr += sum;
+
+                    r0 += 2;
+                    r1 += 2;
+                    r2 += 2;
+                    r3 += 2;
+                    r4 += 2;
+                    outptr++;
+                }
+
+                r0 += tailstep;
+                r1 += tailstep;
+                r2 += tailstep;
+                r3 += tailstep;
+                r4 += tailstep;
+            }
+        }
+    }
+}
diff --git a/src/layer/arm/convolution_7x7.h b/src/layer/arm/convolution_7x7.h
new file mode 100644
index 00000000000..7c018b1b39d
--- /dev/null
+++ b/src/layer/arm/convolution_7x7.h
@@ -0,0 +1,1073 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+// Direct 7x7 convolution with stride 1, accumulated in place into top_blob.
+//
+// For every output channel p the output plane is first filled with bias[p]
+// (or 0 when _bias converts to a null pointer). Then, for every input
+// channel q, the 49 weights at kernel + (p*inch + q)*49 are multiplied with
+// the 7x7 input window at each output position and added onto the output.
+//
+// Parameters:
+//   bottom_blob  input; its w and c are read, rows are addressed w floats apart
+//   top_blob     output; its w/h/c are taken as the output size, and no
+//                allocation or resizing is performed here
+//   _kernel      weights, indexed as [p][q][7 rows][7 columns] of floats
+//   _bias        per-output-channel bias, indexed by p when non-null
+//
+// After each output row the input row pointers are advanced by 6 on top of
+// the outw positions already consumed, i.e. the code assumes the input row
+// width equals outw + 6 (no padding handled here) -- TODO confirm the caller
+// guarantees this.
+//
+// NOTE(review): both vector paths load 12 input floats per row for each
+// group of 4 outputs although only 10 are multiplied in, and load 8 floats
+// for every 7-float kernel row. The extra lanes never reach the result, but
+// the loads themselves touch memory past the last element that is needed.
+// Presumably the Mat storage leaves enough slack for this -- confirm.
+static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+
+    // output channels are independent of each other, so they are split across threads
+    #pragma omp parallel for
+    for (int p=0; p<outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+
+        const float bias0 = bias ? bias[p] : 0.f;
+
+        // start from the bias; every input channel below adds its contribution
+        out.fill(bias0);
+
+        for (int q=0; q<inch; q++)
+        {
+            float* outptr = out;
+
+            const float* img0 = bottom_blob.channel(q);
+
+            // 49 weights for the (output p, input q) pair
+            const float* kernel0 = kernel + p*inch*49  + q*49;
+
+            // seven consecutive input rows covered by the kernel window
+            const float* r0 = img0;
+            const float* r1 = img0 + w;
+            const float* r2 = img0 + w*2;
+            const float* r3 = img0 + w*3;
+            const float* r4 = img0 + w*4;
+            const float* r5 = img0 + w*5;
+            const float* r6 = img0 + w*6;
+
+            // one pointer per kernel row
+            const float* k0 = kernel0;
+            const float* k1 = kernel0 + 7;
+            const float* k2 = kernel0 + 14;
+            const float* k3 = kernel0 + 21;
+            const float* k4 = kernel0 + 28;
+            const float* k5 = kernel0 + 35;
+            const float* k6 = kernel0 + 42;
+
+            for (int i = 0; i < outh; i++)
+            {
+
+#if __ARM_NEON
+                // nn groups of 4 outputs go through the vector path, the rest is scalar
+                int nn = outw >> 2;
+                int remain = outw - (nn << 2);
+#else
+                int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+                for (; nn>0; nn--)
+                {
+                    // running output values for 4 adjacent columns
+                    float32x4_t _sum = vld1q_f32(outptr);
+
+                    // kernel row 0; only lanes 0..2 of the second vector are used
+                    float32x4_t _k0123 = vld1q_f32(k0);
+                    float32x4_t _k4567 = vld1q_f32(k0 + 4);
+
+                    // shifted views of the input row, one per kernel column
+                    float32x4_t _r00 = vld1q_f32(r0);// 0 1 2 3
+                    float32x4_t _r04 = vld1q_f32(r0 + 4);// 4 5 6 7
+                    float32x4_t _r00n = vld1q_f32(r0 + 8);// 8 9 10 11
+                    float32x4_t _r01 = vextq_f32(_r00, _r04, 1);// 1 2 3 4
+                    float32x4_t _r02 = vextq_f32(_r00, _r04, 2);// 2 3 4 5
+                    float32x4_t _r03 = vextq_f32(_r00, _r04, 3);// 3 4 5 6
+                    float32x4_t _r05 = vextq_f32(_r04, _r00n, 1);// 5 6 7 8
+                    float32x4_t _r06 = vextq_f32(_r04, _r00n, 2);// 6 7 8 9
+
+                    _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r05, _k4567, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r06, _k4567, 2);
+
+                    // kernel row 1 / input row 1
+                    float32x4_t _k78910 = vld1q_f32(k1);
+                    float32x4_t _k11121314 = vld1q_f32(k1 + 4);
+
+                    float32x4_t _r10 = vld1q_f32(r1);
+                    float32x4_t _r14 = vld1q_f32(r1 + 4);
+                    float32x4_t _r10n = vld1q_f32(r1 + 8);
+                    float32x4_t _r11 = vextq_f32(_r10, _r14, 1);
+                    float32x4_t _r12 = vextq_f32(_r10, _r14, 2);
+                    float32x4_t _r13 = vextq_f32(_r10, _r14, 3);
+                    float32x4_t _r15 = vextq_f32(_r14, _r10n, 1);
+                    float32x4_t _r16 = vextq_f32(_r14, _r10n, 2);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r10, _k78910, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r11, _k78910, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r12, _k78910, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r13, _k78910, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r14, _k11121314, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r15, _k11121314, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r16, _k11121314, 2);
+
+                    // kernel row 2 / input row 2
+                    float32x4_t _k14151617 = vld1q_f32(k2);
+                    float32x4_t _k18192021 = vld1q_f32(k2 + 4);
+
+                    float32x4_t _r20 = vld1q_f32(r2);
+                    float32x4_t _r24 = vld1q_f32(r2 + 4);
+                    float32x4_t _r20n = vld1q_f32(r2 + 8);
+                    float32x4_t _r21 = vextq_f32(_r20, _r24, 1);
+                    float32x4_t _r22 = vextq_f32(_r20, _r24, 2);
+                    float32x4_t _r23 = vextq_f32(_r20, _r24, 3);
+                    float32x4_t _r25 = vextq_f32(_r24, _r20n, 1);
+                    float32x4_t _r26 = vextq_f32(_r24, _r20n, 2);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r20, _k14151617, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r21, _k14151617, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r22, _k14151617, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r23, _k14151617, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r24, _k18192021, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r25, _k18192021, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r26, _k18192021, 2);
+
+                    // kernel row 3 / input row 3
+                    float32x4_t _k21222324 = vld1q_f32(k3);
+                    float32x4_t _k25262728 = vld1q_f32(k3 + 4);
+
+                    float32x4_t _r30 = vld1q_f32(r3);
+                    float32x4_t _r34 = vld1q_f32(r3 + 4);
+                    float32x4_t _r30n = vld1q_f32(r3 + 8);
+                    float32x4_t _r31 = vextq_f32(_r30, _r34, 1);
+                    float32x4_t _r32 = vextq_f32(_r30, _r34, 2);
+                    float32x4_t _r33 = vextq_f32(_r30, _r34, 3);
+                    float32x4_t _r35 = vextq_f32(_r34, _r30n, 1);
+                    float32x4_t _r36 = vextq_f32(_r34, _r30n, 2);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r30, _k21222324, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r31, _k21222324, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r32, _k21222324, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r33, _k21222324, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r34, _k25262728, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r35, _k25262728, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r36, _k25262728, 2);
+
+                    // kernel row 4 / input row 4
+                    float32x4_t _k28293031 = vld1q_f32(k4);
+                    float32x4_t _k32333435 = vld1q_f32(k4 + 4);
+
+                    float32x4_t _r40 = vld1q_f32(r4);
+                    float32x4_t _r44 = vld1q_f32(r4 + 4);
+                    float32x4_t _r40n = vld1q_f32(r4 + 8);
+                    float32x4_t _r41 = vextq_f32(_r40, _r44, 1);
+                    float32x4_t _r42 = vextq_f32(_r40, _r44, 2);
+                    float32x4_t _r43 = vextq_f32(_r40, _r44, 3);
+                    float32x4_t _r45 = vextq_f32(_r44, _r40n, 1);
+                    float32x4_t _r46 = vextq_f32(_r44, _r40n, 2);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r40, _k28293031, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r41, _k28293031, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r42, _k28293031, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r43, _k28293031, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r44, _k32333435, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r45, _k32333435, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r46, _k32333435, 2);
+
+                    // kernel row 5 / input row 5
+                    float32x4_t _k35363738 = vld1q_f32(k5);
+                    float32x4_t _k39404142 = vld1q_f32(k5 + 4);
+
+                    float32x4_t _r50 = vld1q_f32(r5);
+                    float32x4_t _r54 = vld1q_f32(r5 + 4);
+                    float32x4_t _r50n = vld1q_f32(r5 + 8);
+                    float32x4_t _r51 = vextq_f32(_r50, _r54, 1);
+                    float32x4_t _r52 = vextq_f32(_r50, _r54, 2);
+                    float32x4_t _r53 = vextq_f32(_r50, _r54, 3);
+                    float32x4_t _r55 = vextq_f32(_r54, _r50n, 1);
+                    float32x4_t _r56 = vextq_f32(_r54, _r50n, 2);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r50, _k35363738, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r51, _k35363738, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r52, _k35363738, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r53, _k35363738, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r54, _k39404142, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r55, _k39404142, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r56, _k39404142, 2);
+
+                    // kernel row 6 / input row 6; the second load covers
+                    // kernel0[46..49], lane 3 lies beyond this 49-float kernel and is unused
+                    float32x4_t _k42434445 = vld1q_f32(k6);
+                    float32x4_t _k46474849 = vld1q_f32(k6 + 4);
+
+                    float32x4_t _r60 = vld1q_f32(r6);
+                    float32x4_t _r64 = vld1q_f32(r6 + 4);
+                    float32x4_t _r60n = vld1q_f32(r6 + 8);
+                    float32x4_t _r61 = vextq_f32(_r60, _r64, 1);
+                    float32x4_t _r62 = vextq_f32(_r60, _r64, 2);
+                    float32x4_t _r63 = vextq_f32(_r60, _r64, 3);
+                    float32x4_t _r65 = vextq_f32(_r64, _r60n, 1);
+                    float32x4_t _r66 = vextq_f32(_r64, _r60n, 2);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r60, _k42434445, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r61, _k42434445, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r62, _k42434445, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r63, _k42434445, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r64, _k46474849, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r65, _k46474849, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r66, _k46474849, 2);
+
+                    vst1q_f32(outptr, _sum);
+
+                    // step 4 output columns to the right
+                    r0 += 4;
+                    r1 += 4;
+                    r2 += 4;
+                    r3 += 4;
+                    r4 += 4;
+                    r5 += 4;
+                    r6 += 4;
+                    outptr += 4;
+                }
+#else
+                // ARMv7 path: same computation as the intrinsics above, hand scheduled.
+                // q12 holds the loaded output values, q13-q15 are three extra
+                // accumulators started at zero; the multiply-accumulates rotate over
+                // the four registers and they are summed before the store.
+                // %9 walks the kernel 28 bytes (7 floats) per row and is rewound by
+                // 168 bytes (6 rows) before the last row is consumed.
+                // Each row pointer %2..%8 is post-incremented by one 4-float load.
+                // The body is a do-while style loop, hence the nn > 0 guard.
+                if (nn > 0)
+                {
+                asm volatile(
+                    "0:                             \n"
+
+                    "pld        [%1, #256]          \n"
+                    "vld1.f32   {d24-d25}, [%1]     \n"// _sum
+                    "veor       q13, q13            \n"// _sum2 = 0;
+                    "veor       q14, q14            \n"// _sum3 = 0;
+                    "veor       q15, q15            \n"// _sum4 = 0;
+
+                    "pld        [%9, #256]          \n"
+                    "vld1.f32   {d8-d11}, [%9]      \n"// q4 q5 = k0123 k4567
+                    "add        %9, #28             \n"
+
+                    "pld        [%2, #128]          \n"
+                    "vld1.f32   {d0-d1}, [%2]!      \n"// q0 = 0  1  2  3
+                    "vmla.f32   q12, q0, d8[0]      \n"
+
+                    "pld        [%2, #256]          \n"
+                    "vld1.f32   {d4-d7}, [%2]       \n"// q2 = 4  5  6  7  q3 = 8  9 10 11
+                    "vmla.f32   q13, q2, d10[0]     \n"
+
+                    "vext.32    q1, q0, q2, #1      \n"// q1 = 1  2  3  4
+                    "vext.32    q10, q2, q3, #1     \n"// q10= 5  6  7  8
+                    "vmla.f32   q14, q1, d8[1]      \n"
+                    "vmla.f32   q15, q10, d10[1]    \n"
+
+                    "vext.32    q8, q0, q2, #2      \n"// q8 = 2  3  4  5
+                    "vext.32    q11, q2, q3, #2     \n"// q11= 6  7  8  9
+                    "vmla.f32   q12, q8, d9[0]      \n"
+                    "vmla.f32   q13, q11, d11[0]    \n"
+
+                    "vext.32    q9, q0, q2, #3      \n"// q9 = 3  4  5  6
+                    "vmla.f32   q14, q9, d9[1]      \n"
+
+                    "pld        [%9, #256]          \n"
+                    "vld1.f32   {d12-d15}, [%9]     \n"// q6 q7 = k78910 k11121314
+                    "add        %9, #28             \n"
+
+                    "pld        [%3, #128]          \n"
+                    "vld1.f32   {d0-d1}, [%3]!      \n"
+                    "vmla.f32   q15, q0, d12[0]     \n"
+
+                    "pld        [%3, #256]          \n"
+                    "vld1.f32   {d4-d7}, [%3]       \n"
+                    "vmla.f32   q12, q2, d14[0]     \n"
+
+                    "vext.32    q1, q0, q2, #1      \n"
+                    "vext.32    q10, q2, q3, #1     \n"
+                    "vmla.f32   q13, q1, d12[1]     \n"
+                    "vmla.f32   q14, q10, d14[1]    \n"
+
+                    "vext.32    q8, q0, q2, #2      \n"
+                    "vext.32    q11, q2, q3, #2     \n"
+                    "vmla.f32   q15, q8, d13[0]     \n"
+                    "vmla.f32   q12, q11, d15[0]    \n"
+
+                    "vext.32    q9, q0, q2, #3      \n"
+                    "vmla.f32   q13, q9, d13[1]     \n"
+
+                    "pld        [%9, #256]          \n"
+                    "vld1.f32   {d8-d11}, [%9]      \n"// q4 q5 = k14151617 k18192021
+                    "add        %9, #28             \n"
+
+                    "pld        [%4, #128]          \n"
+                    "vld1.f32   {d0-d1}, [%4]!      \n"
+                    "vmla.f32   q14, q0, d8[0]      \n"
+
+                    "pld        [%4, #256]          \n"
+                    "vld1.f32   {d4-d7}, [%4]       \n"
+                    "vmla.f32   q15, q2, d10[0]     \n"
+
+                    "vext.32    q1, q0, q2, #1      \n"
+                    "vext.32    q10, q2, q3, #1     \n"
+                    "vmla.f32   q12, q1, d8[1]      \n"
+                    "vmla.f32   q13, q10, d10[1]    \n"
+
+                    "vext.32    q8, q0, q2, #2      \n"
+                    "vext.32    q11, q2, q3, #2     \n"
+                    "vmla.f32   q14, q8, d9[0]      \n"
+                    "vmla.f32   q15, q11, d11[0]    \n"
+
+                    "vext.32    q9, q0, q2, #3      \n"
+                    "vmla.f32   q12, q9, d9[1]      \n"
+
+                    "pld        [%9, #256]          \n"
+                    "vld1.f32   {d12-d15}, [%9]     \n"// q6 q7 = k21222324 k25262728
+                    "add        %9, #28             \n"
+
+                    "pld        [%5, #128]          \n"
+                    "vld1.f32   {d0-d1}, [%5]!      \n"
+                    "vmla.f32   q13, q0, d12[0]     \n"
+
+                    "pld        [%5, #256]          \n"
+                    "vld1.f32   {d4-d7}, [%5]       \n"
+                    "vmla.f32   q14, q2, d14[0]     \n"
+
+                    "vext.32    q1, q0, q2, #1      \n"
+                    "vext.32    q10, q2, q3, #1     \n"
+                    "vmla.f32   q15, q1, d12[1]     \n"
+                    "vmla.f32   q12, q10, d14[1]    \n"
+
+                    "vext.32    q8, q0, q2, #2      \n"
+                    "vext.32    q11, q2, q3, #2     \n"
+                    "vmla.f32   q13, q8, d13[0]     \n"
+                    "vmla.f32   q14, q11, d15[0]    \n"
+
+                    "vext.32    q9, q0, q2, #3      \n"
+                    "vmla.f32   q15, q9, d13[1]     \n"
+
+                    "pld        [%9, #256]          \n"
+                    "vld1.f32   {d8-d11}, [%9]      \n"// q4 q5 = k28293031 k32333435
+                    "add        %9, #28             \n"
+
+                    "pld        [%6, #128]          \n"
+                    "vld1.f32   {d0-d1}, [%6]!      \n"
+                    "vmla.f32   q12, q0, d8[0]      \n"
+
+                    "pld        [%6, #256]          \n"
+                    "vld1.f32   {d4-d7}, [%6]       \n"
+                    "vmla.f32   q13, q2, d10[0]     \n"
+
+                    "vext.32    q1, q0, q2, #1      \n"
+                    "vext.32    q10, q2, q3, #1     \n"
+                    "vmla.f32   q14, q1, d8[1]      \n"
+                    "vmla.f32   q15, q10, d10[1]    \n"
+
+                    "vext.32    q8, q0, q2, #2      \n"
+                    "vext.32    q11, q2, q3, #2     \n"
+                    "vmla.f32   q12, q8, d9[0]      \n"
+                    "vmla.f32   q13, q11, d11[0]    \n"
+
+                    "vext.32    q9, q0, q2, #3      \n"
+                    "vmla.f32   q14, q9, d9[1]      \n"
+
+                    "pld        [%9, #256]          \n"
+                    "vld1.f32   {d12-d15}, [%9]     \n"// q6 q7 = k35363738 k39404142
+                    "add        %9, #28             \n"
+
+                    "pld        [%7, #128]          \n"
+                    "vld1.f32   {d0-d1}, [%7]!      \n"
+                    "vmla.f32   q15, q0, d12[0]     \n"
+
+                    "pld        [%7, #256]          \n"
+                    "vld1.f32   {d4-d7}, [%7]       \n"
+                    "vmla.f32   q12, q2, d14[0]     \n"
+
+                    "vext.32    q1, q0, q2, #1      \n"
+                    "vext.32    q10, q2, q3, #1     \n"
+                    "vmla.f32   q13, q1, d12[1]     \n"
+                    "vmla.f32   q14, q10, d14[1]    \n"
+
+                    "vext.32    q8, q0, q2, #2      \n"
+                    "vext.32    q11, q2, q3, #2     \n"
+                    "vmla.f32   q15, q8, d13[0]     \n"
+                    "vmla.f32   q12, q11, d15[0]    \n"
+
+                    "vext.32    q9, q0, q2, #3      \n"
+                    "vmla.f32   q13, q9, d13[1]     \n"
+
+                    "pld        [%9, #256]          \n"
+                    "vld1.f32   {d8-d11}, [%9]      \n"// q4 q5 = k42434445 k46474849
+                    "sub        %9, #168            \n"// restore k0
+
+                    "pld        [%8, #128]          \n"
+                    "vld1.f32   {d0-d1}, [%8]!      \n"
+                    "vmla.f32   q14, q0, d8[0]      \n"
+
+                    "pld        [%8, #256]          \n"
+                    "vld1.f32   {d4-d7}, [%8]       \n"
+                    "vmla.f32   q15, q2, d10[0]     \n"
+
+                    "vext.32    q1, q0, q2, #1      \n"
+                    "vext.32    q10, q2, q3, #1     \n"
+                    "vmla.f32   q12, q1, d8[1]      \n"
+                    "vmla.f32   q13, q10, d10[1]    \n"
+
+                    "vext.32    q8, q0, q2, #2      \n"
+                    "vext.32    q11, q2, q3, #2     \n"
+                    "vmla.f32   q14, q8, d9[0]      \n"
+                    "vmla.f32   q15, q11, d11[0]    \n"
+
+                    "vext.32    q9, q0, q2, #3      \n"
+                    "vmla.f32   q12, q9, d9[1]      \n"
+
+                    "vadd.f32   q13, q13, q14       \n"// fold the four accumulators
+                    "vadd.f32   q13, q13, q15       \n"
+                    "vadd.f32   q12, q12, q13       \n"
+
+                    "vst1.f32   {d24-d25}, [%1]!    \n"// store 4 outputs, outptr += 4
+
+                    "subs       %0, #1              \n"
+                    "bne        0b                  \n"
+                    : "=r"(nn),         // %0
+                      "=r"(outptr),     // %1
+                      "=r"(r0),         // %2
+                      "=r"(r1),         // %3
+                      "=r"(r2),         // %4
+                      "=r"(r3),         // %5
+                      "=r"(r4),         // %6
+                      "=r"(r5),         // %7
+                      "=r"(r6),         // %8
+                      "=r"(k0)          // %9
+                    : "0"(nn),
+                      "1"(outptr),
+                      "2"(r0),
+                      "3"(r1),
+                      "4"(r2),
+                      "5"(r3),
+                      "6"(r4),
+                      "7"(r5),
+                      "8"(r6),
+                      "9"(k0)
+                    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+                );
+                }
+#endif // __aarch64__
+#endif // __ARM_NEON
+
+                // scalar tail: the outw % 4 leftover columns on NEON builds,
+                // or every column when NEON is not available
+                for (; remain>0; remain--)
+                {
+                    float sum = 0;
+
+                    sum += r0[0] * k0[0];
+                    sum += r0[1] * k0[1];
+                    sum += r0[2] * k0[2];
+                    sum += r0[3] * k0[3];
+                    sum += r0[4] * k0[4];
+                    sum += r0[5] * k0[5];
+                    sum += r0[6] * k0[6];
+
+                    sum += r1[0] * k1[0];
+                    sum += r1[1] * k1[1];
+                    sum += r1[2] * k1[2];
+                    sum += r1[3] * k1[3];
+                    sum += r1[4] * k1[4];
+                    sum += r1[5] * k1[5];
+                    sum += r1[6] * k1[6];
+
+                    sum += r2[0] * k2[0];
+                    sum += r2[1] * k2[1];
+                    sum += r2[2] * k2[2];
+                    sum += r2[3] * k2[3];
+                    sum += r2[4] * k2[4];
+                    sum += r2[5] * k2[5];
+                    sum += r2[6] * k2[6];
+
+                    sum += r3[0] * k3[0];
+                    sum += r3[1] * k3[1];
+                    sum += r3[2] * k3[2];
+                    sum += r3[3] * k3[3];
+                    sum += r3[4] * k3[4];
+                    sum += r3[5] * k3[5];
+                    sum += r3[6] * k3[6];
+
+                    sum += r4[0] * k4[0];
+                    sum += r4[1] * k4[1];
+                    sum += r4[2] * k4[2];
+                    sum += r4[3] * k4[3];
+                    sum += r4[4] * k4[4];
+                    sum += r4[5] * k4[5];
+                    sum += r4[6] * k4[6];
+
+                    sum += r5[0] * k5[0];
+                    sum += r5[1] * k5[1];
+                    sum += r5[2] * k5[2];
+                    sum += r5[3] * k5[3];
+                    sum += r5[4] * k5[4];
+                    sum += r5[5] * k5[5];
+                    sum += r5[6] * k5[6];
+
+                    sum += r6[0] * k6[0];
+                    sum += r6[1] * k6[1];
+                    sum += r6[2] * k6[2];
+                    sum += r6[3] * k6[3];
+                    sum += r6[4] * k6[4];
+                    sum += r6[5] * k6[5];
+                    sum += r6[6] * k6[6];
+
+                    *outptr += sum;
+
+                    r0++;
+                    r1++;
+                    r2++;
+                    r3++;
+                    r4++;
+                    r5++;
+                    r6++;
+                    outptr++;
+                }
+
+                // skip the 6 trailing input columns of this row so that every
+                // pointer lands on the start of the next input row
+                r0 += 6;
+                r1 += 6;
+                r2 += 6;
+                r3 += 6;
+                r4 += 6;
+                r5 += 6;
+                r6 += 6;
+
+            }
+
+        }
+    }
+
+}
+
+static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const int tailstep = w - 2*outw + w;
+
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+
+    #pragma omp parallel for
+    for (int p=0; p<outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+
+        const float bias0 = bias ? bias[p] : 0.f;
+
+        out.fill(bias0);
+
+        for (int q=0; q<inch; q++)
+        {
+            float* outptr = out;
+            float* outptr2 = out + outw;
+
+            const float* img0 = bottom_blob.channel(q);
+
+            const float* kernel0 = kernel + p*inch*49  + q*49;
+
+            const float* r0 = img0;
+            const float* r1 = img0 + w;
+            const float* r2 = img0 + w*2;
+            const float* r3 = img0 + w*3;
+            const float* r4 = img0 + w*4;
+            const float* r5 = img0 + w*5;
+            const float* r6 = img0 + w*6;
+
+            const float* k0 = kernel0;
+            const float* k1 = kernel0 + 7;
+            const float* k2 = kernel0 + 14;
+            const float* k3 = kernel0 + 21;
+            const float* k4 = kernel0 + 28;
+            const float* k5 = kernel0 + 35;
+            const float* k6 = kernel0 + 42;
+
+            int i = 0;
+
+            for (; i < outh; i++)
+            {
+
+#if __ARM_NEON
+                int nn = outw >> 2;
+                int remain = outw - (nn << 2);
+#else
+                int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+                for (; nn>0; nn--)
+                {
+                    float32x4_t _sum = vld1q_f32(outptr);
+
+                    float32x4_t _k0123 = vld1q_f32(k0);
+                    float32x4_t _k4567 = vld1q_f32(k0 + 4);
+
+                    float32x4x2_t _r00_02461357 = vld2q_f32(r0);
+                    float32x4x2_t _r00nx2 = vld2q_f32(r0 + 8);
+                    float32x4_t _r0_8101214 = _r00nx2.val[0];// 8 10 12 14
+                    float32x4_t _r0_9111315 = _r00nx2.val[1];// 9 11 13 15
+                    float32x4_t _r00 = _r00_02461357.val[0];// 0 2 4 6
+                    float32x4_t _r01 = _r00_02461357.val[1];// 1 3 5 7
+                    float32x4_t _r02 = vextq_f32(_r00, _r0_8101214, 1);// 2 4 6 8
+                    float32x4_t _r03 = vextq_f32(_r01, _r0_9111315, 1);// 3 5 7 9
+                    float32x4_t _r04 = vextq_f32(_r00, _r0_8101214, 2);// 4 6 8 10
+                    float32x4_t _r05 = vextq_f32(_r01, _r0_9111315, 2);// 5 7 9 11
+                    float32x4_t _r06 = vextq_f32(_r00, _r0_8101214, 3);// 6 8 10 12
+
+                    _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r05, _k4567, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r06, _k4567, 2);
+
+                    float32x4_t _k78910 = vld1q_f32(k1);
+                    float32x4_t _k11121314 = vld1q_f32(k1 + 4);
+
+                    float32x4x2_t _r10_02461357 = vld2q_f32(r1);
+                    float32x4x2_t _r10nx2 = vld2q_f32(r1 + 8);
+                    float32x4_t _r1_8101214 = _r10nx2.val[0];
+                    float32x4_t _r1_9111315 = _r10nx2.val[1];
+                    float32x4_t _r10 = _r10_02461357.val[0];
+                    float32x4_t _r11 = _r10_02461357.val[1];
+                    float32x4_t _r12 = vextq_f32(_r10, _r1_8101214, 1);
+                    float32x4_t _r13 = vextq_f32(_r11, _r1_9111315, 1);
+                    float32x4_t _r14 = vextq_f32(_r10, _r1_8101214, 2);
+                    float32x4_t _r15 = vextq_f32(_r11, _r1_9111315, 2);
+                    float32x4_t _r16 = vextq_f32(_r10, _r1_8101214, 3);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r10, _k78910, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r11, _k78910, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r12, _k78910, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r13, _k78910, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r14, _k11121314, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r15, _k11121314, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r16, _k11121314, 2);
+
+                    float32x4_t _k14151617 = vld1q_f32(k2);
+                    float32x4_t _k18192021 = vld1q_f32(k2 + 4);
+
+                    float32x4x2_t _r20_02461357 = vld2q_f32(r2);
+                    float32x4x2_t _r20nx2 = vld2q_f32(r2 + 8);
+                    float32x4_t _r2_8101214 = _r20nx2.val[0];
+                    float32x4_t _r2_9111315 = _r20nx2.val[1];
+                    float32x4_t _r20 = _r20_02461357.val[0];
+                    float32x4_t _r21 = _r20_02461357.val[1];
+                    float32x4_t _r22 = vextq_f32(_r20, _r2_8101214, 1);
+                    float32x4_t _r23 = vextq_f32(_r21, _r2_9111315, 1);
+                    float32x4_t _r24 = vextq_f32(_r20, _r2_8101214, 2);
+                    float32x4_t _r25 = vextq_f32(_r21, _r2_9111315, 2);
+                    float32x4_t _r26 = vextq_f32(_r20, _r2_8101214, 3);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r20, _k14151617, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r21, _k14151617, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r22, _k14151617, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r23, _k14151617, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r24, _k18192021, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r25, _k18192021, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r26, _k18192021, 2);
+
+                    float32x4_t _k21222324 = vld1q_f32(k3);
+                    float32x4_t _k25262728 = vld1q_f32(k3 + 4);
+
+                    float32x4x2_t _r30_02461357 = vld2q_f32(r3);
+                    float32x4x2_t _r30nx2 = vld2q_f32(r3 + 8);
+                    float32x4_t _r3_8101214 = _r30nx2.val[0];
+                    float32x4_t _r3_9111315 = _r30nx2.val[1];
+                    float32x4_t _r30 = _r30_02461357.val[0];
+                    float32x4_t _r31 = _r30_02461357.val[1];
+                    float32x4_t _r32 = vextq_f32(_r30, _r3_8101214, 1);
+                    float32x4_t _r33 = vextq_f32(_r31, _r3_9111315, 1);
+                    float32x4_t _r34 = vextq_f32(_r30, _r3_8101214, 2);
+                    float32x4_t _r35 = vextq_f32(_r31, _r3_9111315, 2);
+                    float32x4_t _r36 = vextq_f32(_r30, _r3_8101214, 3);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r30, _k21222324, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r31, _k21222324, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r32, _k21222324, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r33, _k21222324, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r34, _k25262728, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r35, _k25262728, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r36, _k25262728, 2);
+
+                    float32x4_t _k28293031 = vld1q_f32(k4);
+                    float32x4_t _k32333435 = vld1q_f32(k4 + 4);
+
+                    float32x4x2_t _r40_02461357 = vld2q_f32(r4);
+                    float32x4x2_t _r40nx2 = vld2q_f32(r4 + 8);
+                    float32x4_t _r4_8101214 = _r40nx2.val[0];
+                    float32x4_t _r4_9111315 = _r40nx2.val[1];
+                    float32x4_t _r40 = _r40_02461357.val[0];
+                    float32x4_t _r41 = _r40_02461357.val[1];
+                    float32x4_t _r42 = vextq_f32(_r40, _r4_8101214, 1);
+                    float32x4_t _r43 = vextq_f32(_r41, _r4_9111315, 1);
+                    float32x4_t _r44 = vextq_f32(_r40, _r4_8101214, 2);
+                    float32x4_t _r45 = vextq_f32(_r41, _r4_9111315, 2);
+                    float32x4_t _r46 = vextq_f32(_r40, _r4_8101214, 3);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r40, _k28293031, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r41, _k28293031, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r42, _k28293031, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r43, _k28293031, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r44, _k32333435, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r45, _k32333435, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r46, _k32333435, 2);
+
+                    float32x4_t _k35363738 = vld1q_f32(k5);
+                    float32x4_t _k39404142 = vld1q_f32(k5 + 4);
+
+                    float32x4x2_t _r50_02461357 = vld2q_f32(r5);
+                    float32x4x2_t _r50nx2 = vld2q_f32(r5 + 8);
+                    float32x4_t _r5_8101214 = _r50nx2.val[0];
+                    float32x4_t _r5_9111315 = _r50nx2.val[1];
+                    float32x4_t _r50 = _r50_02461357.val[0];
+                    float32x4_t _r51 = _r50_02461357.val[1];
+                    float32x4_t _r52 = vextq_f32(_r50, _r5_8101214, 1);
+                    float32x4_t _r53 = vextq_f32(_r51, _r5_9111315, 1);
+                    float32x4_t _r54 = vextq_f32(_r50, _r5_8101214, 2);
+                    float32x4_t _r55 = vextq_f32(_r51, _r5_9111315, 2);
+                    float32x4_t _r56 = vextq_f32(_r50, _r5_8101214, 3);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r50, _k35363738, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r51, _k35363738, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r52, _k35363738, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r53, _k35363738, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r54, _k39404142, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r55, _k39404142, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r56, _k39404142, 2);
+
+                    float32x4_t _k42434445 = vld1q_f32(k6);
+                    float32x4_t _k46474849 = vld1q_f32(k6 + 4);
+
+                    float32x4x2_t _r60_02461357 = vld2q_f32(r6);
+                    float32x4x2_t _r60nx2 = vld2q_f32(r6 + 8);
+                    float32x4_t _r6_8101214 = _r60nx2.val[0];
+                    float32x4_t _r6_9111315 = _r60nx2.val[1];
+                    float32x4_t _r60 = _r60_02461357.val[0];
+                    float32x4_t _r61 = _r60_02461357.val[1];
+                    float32x4_t _r62 = vextq_f32(_r60, _r6_8101214, 1);
+                    float32x4_t _r63 = vextq_f32(_r61, _r6_9111315, 1);
+                    float32x4_t _r64 = vextq_f32(_r60, _r6_8101214, 2);
+                    float32x4_t _r65 = vextq_f32(_r61, _r6_9111315, 2);
+                    float32x4_t _r66 = vextq_f32(_r60, _r6_8101214, 3);
+
+                    _sum = vfmaq_laneq_f32(_sum, _r60, _k42434445, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r61, _k42434445, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r62, _k42434445, 2);
+                    _sum = vfmaq_laneq_f32(_sum, _r63, _k42434445, 3);
+                    _sum = vfmaq_laneq_f32(_sum, _r64, _k46474849, 0);
+                    _sum = vfmaq_laneq_f32(_sum, _r65, _k46474849, 1);
+                    _sum = vfmaq_laneq_f32(_sum, _r66, _k46474849, 2);
+
+                    vst1q_f32(outptr, _sum);
+
+                    r0 += 8;
+                    r1 += 8;
+                    r2 += 8;
+                    r3 += 8;
+                    r4 += 8;
+                    r5 += 8;
+                    r6 += 8;
+                    outptr += 4;
+                }
+#else
+                if (nn > 0)
+                {
+                asm volatile(
+                    "0:                             \n"
+
+                    "pld        [%1, #256]          \n"
+                    "vld1.f32   {d26-d27}, [%1]     \n"// _sum
+                    "veor       q14, q14            \n"// _sum2 = 0;
+                    "veor       q15, q15            \n"// _sum3 = 0;
+
+                    "pld        [%9, #256]          \n"
+                    "vld1.f32   {d8-d11}, [%9]      \n"// q4 q5 = k0123 k4567
+                    "add        %9, #28             \n"
+
+                    "pld        [%2, #512]          \n"
+                    "vld2.f32   {d0-d3}, [%2]!      \n"// q0 = 0  2  4  6  q1 = 1  3  5  7
+                    "vmla.f32   q13, q0, d8[0]      \n"
+                    "vmla.f32   q14, q1, d8[1]      \n"
+
+                    "vld2.f32   {d4-d7}, [%2]       \n"// q2 = 8 10 12 14  q3 = 9 11 13 15
+                    "vext.32    q8, q0, q2, #1      \n"// q8 = 2  4  6  8
+                    "vext.32    q9, q1, q3, #1      \n"// q9 = 3  5  7  9
+                    "vmla.f32   q15, q8, d9[0]      \n"
+                    "vmla.f32   q13, q9, d9[1]      \n"
+
+                    "vext.32    q10, q0, q2, #2     \n"// q10= 4  6  8 10
+                    "vext.32    q11, q1, q3, #2     \n"// q11= 5  7  9 11
+                    "vmla.f32   q14, q10, d10[0]    \n"
+                    "vmla.f32   q15, q11, d10[1]    \n"
+
+                    "vext.32    q12, q0, q2, #3     \n"// q12= 6  8 10 12
+                    "vmla.f32   q13, q12, d11[0]    \n"
+
+                    "pld        [%9, #256]          \n"
+                    "vld1.f32   {d12-d15}, [%9]     \n"// q6 q7 = k78910 k11121314
+                    "add        %9, #28             \n"
+
+                    "pld        [%3, #512]          \n"
+                    "vld2.f32   {d0-d3}, [%3]!      \n"
+                    "vmla.f32   q14, q0, d12[0]     \n"
+                    "vmla.f32   q15, q1, d12[1]     \n"
+
+                    "vld2.f32   {d4-d7}, [%3]       \n"
+                    "vext.32    q8, q0, q2, #1      \n"
+                    "vext.32    q9, q1, q3, #1      \n"
+                    "vmla.f32   q13, q8, d13[0]     \n"
+                    "vmla.f32   q14, q9, d13[1]     \n"
+
+                    "vext.32    q10, q0, q2, #2     \n"
+                    "vext.32    q11, q1, q3, #2     \n"
+                    "vmla.f32   q15, q10, d14[0]    \n"
+                    "vmla.f32   q13, q11, d14[1]    \n"
+
+                    "vext.32    q12, q0, q2, #3     \n"
+                    "vmla.f32   q14, q12, d15[0]    \n"
+
+                    "pld        [%9, #256]          \n"
+                    "vld1.f32   {d8-d11}, [%9]      \n"// q4 q5 = k14151617 k18192021
+                    "add        %9, #28             \n"
+
+                    "pld        [%4, #512]          \n"
+                    "vld2.f32   {d0-d3}, [%4]!      \n"
+                    "vmla.f32   q15, q0, d8[0]      \n"
+                    "vmla.f32   q13, q1, d8[1]      \n"
+
+                    "vld2.f32   {d4-d7}, [%4]       \n"
+                    "vext.32    q8, q0, q2, #1      \n"
+                    "vext.32    q9, q1, q3, #1      \n"
+                    "vmla.f32   q14, q8, d9[0]      \n"
+                    "vmla.f32   q15, q9, d9[1]      \n"
+
+                    "vext.32    q10, q0, q2, #2     \n"
+                    "vext.32    q11, q1, q3, #2     \n"
+                    "vmla.f32   q13, q10, d10[0]    \n"
+                    "vmla.f32   q14, q11, d10[1]    \n"
+
+                    "vext.32    q12, q0, q2, #3     \n"
+                    "vmla.f32   q15, q12, d11[0]    \n"
+
+                    "pld        [%9, #256]          \n"
+                    "vld1.f32   {d12-d15}, [%9]     \n"// q6 q7 = k21222324 k25262728
+                    "add        %9, #28             \n"
+
+                    "pld        [%5, #512]          \n"
+                    "vld2.f32   {d0-d3}, [%5]!      \n"
+                    "vmla.f32   q13, q0, d12[0]     \n"
+                    "vmla.f32   q14, q1, d12[1]     \n"
+
+                    "vld2.f32   {d4-d7}, [%5]       \n"
+                    "vext.32    q8, q0, q2, #1      \n"
+                    "vext.32    q9, q1, q3, #1      \n"
+                    "vmla.f32   q15, q8, d13[0]     \n"
+                    "vmla.f32   q13, q9, d13[1]     \n"
+
+                    "vext.32    q10, q0, q2, #2     \n"
+                    "vext.32    q11, q1, q3, #2     \n"
+                    "vmla.f32   q14, q10, d14[0]    \n"
+                    "vmla.f32   q15, q11, d14[1]    \n"
+
+                    "vext.32    q12, q0, q2, #3     \n"
+                    "vmla.f32   q13, q12, d15[0]    \n"
+
+                    "pld        [%9, #256]          \n"
+                    "vld1.f32   {d8-d11}, [%9]      \n"// q4 q5 = k28293031 k32333435
+                    "add        %9, #28             \n"
+
+                    "pld        [%6, #512]          \n"
+                    "vld2.f32   {d0-d3}, [%6]!      \n"
+                    "vmla.f32   q14, q0, d8[0]      \n"
+                    "vmla.f32   q15, q1, d8[1]      \n"
+
+                    "vld2.f32   {d4-d7}, [%6]       \n"
+                    "vext.32    q8, q0, q2, #1      \n"
+                    "vext.32    q9, q1, q3, #1      \n"
+                    "vmla.f32   q13, q8, d9[0]      \n"
+                    "vmla.f32   q14, q9, d9[1]      \n"
+
+                    "vext.32    q10, q0, q2, #2     \n"
+                    "vext.32    q11, q1, q3, #2     \n"
+                    "vmla.f32   q15, q10, d10[0]    \n"
+                    "vmla.f32   q13, q11, d10[1]    \n"
+
+                    "vext.32    q12, q0, q2, #3     \n"
+                    "vmla.f32   q14, q12, d11[0]    \n"
+
+                    "pld        [%9, #256]          \n"
+                    "vld1.f32   {d12-d15}, [%9]     \n"// q6 q7 = k35363738 k39404142
+                    "add        %9, #28             \n"
+
+                    "pld        [%7, #512]          \n"
+                    "vld2.f32   {d0-d3}, [%7]!      \n"
+                    "vmla.f32   q15, q0, d12[0]     \n"
+                    "vmla.f32   q13, q1, d12[1]     \n"
+
+                    "vld2.f32   {d4-d7}, [%7]       \n"
+                    "vext.32    q8, q0, q2, #1      \n"
+                    "vext.32    q9, q1, q3, #1      \n"
+                    "vmla.f32   q14, q8, d13[0]     \n"
+                    "vmla.f32   q15, q9, d13[1]     \n"
+
+                    "vext.32    q10, q0, q2, #2     \n"
+                    "vext.32    q11, q1, q3, #2     \n"
+                    "vmla.f32   q13, q10, d14[0]    \n"
+                    "vmla.f32   q14, q11, d14[1]    \n"
+
+                    "vext.32    q12, q0, q2, #3     \n"
+                    "vmla.f32   q15, q12, d15[0]    \n"
+
+                    "pld        [%9, #256]          \n"
+                    "vld1.f32   {d8-d11}, [%9]      \n"// q4 q5 = k42434445 k46474849
+                    "sub        %9, #168            \n"// restore k0
+
+                    "pld        [%8, #512]          \n"
+                    "vld2.f32   {d0-d3}, [%8]!      \n"
+                    "vmla.f32   q13, q0, d8[0]      \n"
+                    "vmla.f32   q14, q1, d8[1]      \n"
+
+                    "vld2.f32   {d4-d7}, [%8]       \n"
+                    "vext.32    q8, q0, q2, #1      \n"
+                    "vext.32    q9, q1, q3, #1      \n"
+                    "vmla.f32   q15, q8, d9[0]      \n"
+                    "vmla.f32   q13, q9, d9[1]      \n"
+
+                    "vext.32    q10, q0, q2, #2     \n"
+                    "vext.32    q11, q1, q3, #2     \n"
+                    "vmla.f32   q14, q10, d10[0]    \n"
+                    "vmla.f32   q15, q11, d10[1]    \n"
+
+                    "vext.32    q12, q0, q2, #3     \n"
+                    "vmla.f32   q13, q12, d11[0]    \n"
+
+                    "vadd.f32   q14, q14, q15       \n"
+                    "vadd.f32   q13, q13, q14       \n"
+
+                    "vst1.f32   {d26-d27}, [%1]!    \n"
+
+                    "subs       %0, #1              \n"
+                    "bne        0b                  \n"
+                    : "=r"(nn),         // %0
+                      "=r"(outptr),     // %1
+                      "=r"(r0),         // %2
+                      "=r"(r1),         // %3
+                      "=r"(r2),         // %4
+                      "=r"(r3),         // %5
+                      "=r"(r4),         // %6
+                      "=r"(r5),         // %7
+                      "=r"(r6),         // %8
+                      "=r"(k0)          // %9
+                    : "0"(nn),
+                      "1"(outptr),
+                      "2"(r0),
+                      "3"(r1),
+                      "4"(r2),
+                      "5"(r3),
+                      "6"(r4),
+                      "7"(r5),
+                      "8"(r6),
+                      "9"(k0)
+                    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+                );
+                }
+#endif // __aarch64__
+#endif // __ARM_NEON
+
+                for (; remain>0; remain--)
+                {
+                    float sum = 0;
+
+                    sum += r0[0] * k0[0];
+                    sum += r0[1] * k0[1];
+                    sum += r0[2] * k0[2];
+                    sum += r0[3] * k0[3];
+                    sum += r0[4] * k0[4];
+                    sum += r0[5] * k0[5];
+                    sum += r0[6] * k0[6];
+
+                    sum += r1[0] * k1[0];
+                    sum += r1[1] * k1[1];
+                    sum += r1[2] * k1[2];
+                    sum += r1[3] * k1[3];
+                    sum += r1[4] * k1[4];
+                    sum += r1[5] * k1[5];
+                    sum += r1[6] * k1[6];
+
+                    sum += r2[0] * k2[0];
+                    sum += r2[1] * k2[1];
+                    sum += r2[2] * k2[2];
+                    sum += r2[3] * k2[3];
+                    sum += r2[4] * k2[4];
+                    sum += r2[5] * k2[5];
+                    sum += r2[6] * k2[6];
+
+                    sum += r3[0] * k3[0];
+                    sum += r3[1] * k3[1];
+                    sum += r3[2] * k3[2];
+                    sum += r3[3] * k3[3];
+                    sum += r3[4] * k3[4];
+                    sum += r3[5] * k3[5];
+                    sum += r3[6] * k3[6];
+
+                    sum += r4[0] * k4[0];
+                    sum += r4[1] * k4[1];
+                    sum += r4[2] * k4[2];
+                    sum += r4[3] * k4[3];
+                    sum += r4[4] * k4[4];
+                    sum += r4[5] * k4[5];
+                    sum += r4[6] * k4[6];
+
+                    sum += r5[0] * k5[0];
+                    sum += r5[1] * k5[1];
+                    sum += r5[2] * k5[2];
+                    sum += r5[3] * k5[3];
+                    sum += r5[4] * k5[4];
+                    sum += r5[5] * k5[5];
+                    sum += r5[6] * k5[6];
+
+                    sum += r6[0] * k6[0];
+                    sum += r6[1] * k6[1];
+                    sum += r6[2] * k6[2];
+                    sum += r6[3] * k6[3];
+                    sum += r6[4] * k6[4];
+                    sum += r6[5] * k6[5];
+                    sum += r6[6] * k6[6];
+
+                    *outptr += sum;
+
+                    r0 += 2;
+                    r1 += 2;
+                    r2 += 2;
+                    r3 += 2;
+                    r4 += 2;
+                    r5 += 2;
+                    r6 += 2;
+                    outptr++;
+                }
+
+                r0 += tailstep;
+                r1 += tailstep;
+                r2 += tailstep;
+                r3 += tailstep;
+                r4 += tailstep;
+                r5 += tailstep;
+                r6 += tailstep;
+
+            }
+
+        }
+    }
+
+}
diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp
new file mode 100644
index 00000000000..61d00e783f0
--- /dev/null
+++ b/src/layer/arm/convolution_arm.cpp
@@ -0,0 +1,120 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "convolution_arm.h"
+
+namespace ncnn {
+
+#include "convolution_1x1.h"
+#include "convolution_2x2.h"
+#include "convolution_3x3.h"
+#include "convolution_4x4.h"
+#include "convolution_5x5.h"
+#include "convolution_7x7.h"
+
+DEFINE_LAYER_CREATOR(Convolution_arm)
+
+// Forward pass that dispatches to a specialised conv<K>x<K>s<S>_neon routine
+// when one is registered for this layer's (kernel_size, stride) pair, and
+// otherwise defers to the generic Convolution::forward.
+//
+// bottom_blob  input blob; it is zero-padded by `pad` on all four sides first
+//              when pad > 0
+// top_blob     output blob, created here as outw x outh x num_output
+//
+// Returns 0 on success, -100 when the padded input or the output blob is
+// empty after creation, or the result of Convolution::forward on the
+// fallback path.
+int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    // The dispatch table is indexed with kernel_size-1 and stride-1, so both
+    // values must be validated on the low side as well as the high side;
+    // a value below 1 would otherwise index outside the table.
+    const bool kernel_in_table = kernel_size >= 1 && kernel_size <= 7;
+    const bool stride_in_table = stride >= 1 && stride <= 4;
+    if (!kernel_in_table || !stride_in_table || dilation != 1)
+    {
+        return Convolution::forward(bottom_blob, top_blob);
+    }
+
+    typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&);
+
+    // Rows: kernel_size 1..7, columns: stride 1..4.
+    // A null entry means no specialised routine exists for that combination.
+    // The table never changes, so it is built once instead of on every call.
+    static const conv_func conv_func_table[7][4] =
+    {
+        { conv1x1s1_neon, conv1x1s2_neon, 0, 0              }, // kernel_size = 1
+        { conv2x2s1_neon, 0,              0, 0              }, // kernel_size = 2
+        { conv3x3s1_neon, conv3x3s2_neon, 0, 0              }, // kernel_size = 3
+        { 0,              0,              0, conv4x4s4_neon }, // kernel_size = 4
+        { conv5x5s1_neon, conv5x5s2_neon, 0, 0              }, // kernel_size = 5
+        { 0,              0,              0, 0              }, // kernel_size = 6
+        { conv7x7s1_neon, conv7x7s2_neon, 0, 0              }  // kernel_size = 7
+    };
+
+    const conv_func conv = conv_func_table[kernel_size - 1][stride - 1];
+    if (!conv)
+    {
+        return Convolution::forward(bottom_blob, top_blob);
+    }
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+
+    // Without padding the input is used as is; with padding a bordered copy
+    // filled with 0.f is produced and its dimensions are used from here on.
+    Mat bottom_blob_bordered = bottom_blob;
+    if (pad > 0)
+    {
+        copy_make_border(bottom_blob, bottom_blob_bordered, pad, pad, pad, pad, BORDER_CONSTANT, 0.f);
+        if (bottom_blob_bordered.empty())
+            return -100;
+
+        w = bottom_blob_bordered.w;
+        h = bottom_blob_bordered.h;
+    }
+
+    const int outw = (w - kernel_size) / stride + 1;
+    const int outh = (h - kernel_size) / stride + 1;
+
+    top_blob.create(outw, outh, num_output);
+    if (top_blob.empty())
+        return -100;
+
+    // The selected routine receives the layer's weights and bias and writes
+    // its result into top_blob.
+    conv(bottom_blob_bordered, top_blob, weight_data, bias_data);
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/convolution_arm.h b/src/layer/arm/convolution_arm.h
new file mode 100644
index 00000000000..6f2bf05de3d
--- /dev/null
+++ b/src/layer/arm/convolution_arm.h
@@ -0,0 +1,30 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_CONVOLUTION_ARM_H
+#define LAYER_CONVOLUTION_ARM_H
+
+#include "convolution.h"
+
+namespace ncnn {
+
+// ARM-specific convolution layer. It derives from the generic Convolution
+// layer and overrides the single-blob forward pass.
+class Convolution_arm : public Convolution
+{
+public:
+    // Forward pass on one input blob producing one output blob. Parameter
+    // names are singular to match the definition in convolution_arm.cpp.
+    // Returns 0 on success and a non-zero code on failure.
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_CONVOLUTION_ARM_H
diff --git a/src/layer/arm/eltwise_arm.cpp b/src/layer/arm/eltwise_arm.cpp
new file mode 100644
index 00000000000..eb5f81947d2
--- /dev/null
+++ b/src/layer/arm/eltwise_arm.cpp
@@ -0,0 +1,574 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "eltwise_arm.h"
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Eltwise_arm)
+
+int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    int size = w * h;
+
+    Mat& top_blob = top_blobs[0];
+    top_blob.create(w, h, channels);
+    if (top_blob.empty())
+        return -100;
+
+    if (op_type == Operation_PROD)
+    {
+        // first blob
+        const Mat& bottom_blob1 = bottom_blobs[1];
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            const float* ptr1 = bottom_blob1.channel(q);
+            float* outptr = top_blob.channel(q);
+
+#if __ARM_NEON
+            int nn = size >> 2;
+            int remain = size - (nn << 2);
+#else
+            int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+            for (; nn>0; nn--)
+            {
+                float32x4_t _ptr = vld1q_f32(ptr);
+                float32x4_t _ptr1 = vld1q_f32(ptr1);
+                float32x4_t _p = vmulq_f32(_ptr, _ptr1);
+                vst1q_f32(outptr, _p);
+
+                ptr += 4;
+                ptr1 += 4;
+                outptr += 4;
+            }
+#else
+            if (nn > 0)
+            {
+            asm volatile(
+                "0:                             \n"
+                "pld        [%1, #128]          \n"
+                "pld        [%2, #128]          \n"
+                "vld1.f32   {d0-d1}, [%1 :128]! \n"
+                "vld1.f32   {d2-d3}, [%2 :128]! \n"
+                "vmul.f32   q0, q0, q1          \n"
+                "subs       %0, #1              \n"
+                "vst1.f32   {d0-d1}, [%3 :128]! \n"
+                "bne        0b                  \n"
+                : "=r"(nn),     // %0
+                  "=r"(ptr),    // %1
+                  "=r"(ptr1),   // %2
+                  "=r"(outptr)  // %3
+                : "0"(nn),
+                  "1"(ptr),
+                  "2"(ptr1),
+                  "3"(outptr)
+                : "cc", "memory", "q0", "q1"
+            );
+            }
+#endif // __aarch64__
+#endif // __ARM_NEON
+            for (; remain>0; remain--)
+            {
+                *outptr = *ptr * *ptr1;
+
+                ptr++;
+                ptr1++;
+                outptr++;
+            }
+        }
+
+        for (size_t b=2; b<bottom_blobs.size(); b++)
+        {
+            const Mat& bottom_blob1 = bottom_blobs[b];
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob1.channel(q);
+                float* outptr = top_blob.channel(q);
+
+#if __ARM_NEON
+                int nn = size >> 2;
+                int remain = size - (nn << 2);
+#else
+                int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+                for (; nn>0; nn--)
+                {
+                    float32x4_t _ptr = vld1q_f32(ptr);
+                    float32x4_t _p = vld1q_f32(outptr);
+                    _p = vmulq_f32(_ptr, _p);
+                    vst1q_f32(outptr, _p);
+
+                    ptr += 4;
+                    outptr += 4;
+                }
+#else
+                if (nn > 0)
+                {
+                asm volatile(
+                    "0:                             \n"
+                    "pld        [%1, #128]          \n"
+                    "pld        [%2, #128]          \n"
+                    "vld1.f32   {d0-d1}, [%1 :128]! \n"
+                    "vld1.f32   {d2-d3}, [%2 :128]  \n"
+                    "vmul.f32   q0, q0, q1          \n"
+                    "subs       %0, #1              \n"
+                    "vst1.f32   {d0-d1}, [%2 :128]! \n"
+                    "bne        0b                  \n"
+                    : "=r"(nn),     // %0
+                      "=r"(ptr),    // %1
+                      "=r"(outptr)  // %2
+                    : "0"(nn),
+                      "1"(ptr),
+                      "2"(outptr)
+                    : "cc", "memory", "q0", "q1"
+                );
+                }
+#endif // __aarch64__
+#endif // __ARM_NEON
+                for (; remain>0; remain--)
+                {
+                    *outptr *= *ptr;
+
+                    ptr++;
+                    outptr++;
+                }
+            }
+        }
+    }
+    else if (op_type == Operation_SUM)
+    {
+        if (num_coeff == 0)
+        {
+            // first blob
+            const Mat& bottom_blob1 = bottom_blobs[1];
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob.channel(q);
+                const float* ptr1 = bottom_blob1.channel(q);
+                float* outptr = top_blob.channel(q);
+
+#if __ARM_NEON
+                int nn = size >> 2;
+                int remain = size - (nn << 2);
+#else
+                int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+                for (; nn>0; nn--)
+                {
+                    float32x4_t _ptr = vld1q_f32(ptr);
+                    float32x4_t _ptr1 = vld1q_f32(ptr1);
+                    float32x4_t _p = vaddq_f32(_ptr, _ptr1);
+                    vst1q_f32(outptr, _p);
+
+                    ptr += 4;
+                    ptr1 += 4;
+                    outptr += 4;
+                }
+#else
+                if (nn > 0)
+                {
+                asm volatile(
+                    "0:                             \n"
+                    "pld        [%1, #128]          \n"
+                    "pld        [%2, #128]          \n"
+                    "vld1.f32   {d0-d1}, [%1 :128]! \n"
+                    "vld1.f32   {d2-d3}, [%2 :128]! \n"
+                    "vadd.f32   q0, q0, q1          \n"
+                    "subs       %0, #1              \n"
+                    "vst1.f32   {d0-d1}, [%3 :128]! \n"
+                    "bne        0b                  \n"
+                    : "=r"(nn),     // %0
+                      "=r"(ptr),    // %1
+                      "=r"(ptr1),   // %2
+                      "=r"(outptr)  // %3
+                    : "0"(nn),
+                      "1"(ptr),
+                      "2"(ptr1),
+                      "3"(outptr)
+                    : "cc", "memory", "q0", "q1"
+                );
+                }
+#endif // __aarch64__
+#endif // __ARM_NEON
+                for (; remain>0; remain--)
+                {
+                    *outptr = *ptr + *ptr1;
+
+                    ptr++;
+                    ptr1++;
+                    outptr++;
+                }
+            }
+
+            for (size_t b=2; b<bottom_blobs.size(); b++)
+            {
+                const Mat& bottom_blob1 = bottom_blobs[b];
+                #pragma omp parallel for
+                for (int q=0; q<channels; q++)
+                {
+                    const float* ptr = bottom_blob1.channel(q);
+                    float* outptr = top_blob.channel(q);
+
+#if __ARM_NEON
+                    int nn = size >> 2;
+                    int remain = size - (nn << 2);
+#else
+                    int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+                    for (; nn>0; nn--)
+                    {
+                        float32x4_t _ptr = vld1q_f32(ptr);
+                        float32x4_t _p = vld1q_f32(outptr);
+                        _p = vaddq_f32(_ptr, _p);
+                        vst1q_f32(outptr, _p);
+
+                        ptr += 4;
+                        outptr += 4;
+                    }
+#else
+                    if (nn > 0)
+                    {
+                    asm volatile(
+                        "0:                             \n"
+                        "pld        [%1, #128]          \n"
+                        "pld        [%2, #128]          \n"
+                        "vld1.f32   {d0-d1}, [%1 :128]! \n"
+                        "vld1.f32   {d2-d3}, [%2 :128]  \n"
+                        "vadd.f32   q0, q0, q1          \n"
+                        "subs       %0, #1              \n"
+                        "vst1.f32   {d0-d1}, [%2 :128]! \n"
+                        "bne        0b                  \n"
+                        : "=r"(nn),     // %0
+                          "=r"(ptr),    // %1
+                          "=r"(outptr)  // %2
+                        : "0"(nn),
+                          "1"(ptr),
+                          "2"(outptr)
+                        : "cc", "memory", "q0", "q1"
+                    );
+                    }
+#endif // __aarch64__
+#endif // __ARM_NEON
+                    for (; remain>0; remain--)
+                    {
+                        *outptr += *ptr;
+
+                        ptr++;
+                        outptr++;
+                    }
+                }
+            }
+        }
+        else
+        {
+            const float* coeffs_ptr = coeffs;
+
+            // first blob
+            const Mat& bottom_blob1 = bottom_blobs[1];
+            float coeff0 = coeffs_ptr[0];
+            float coeff1 = coeffs_ptr[1];
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob.channel(q);
+                const float* ptr1 = bottom_blob1.channel(q);
+                float* outptr = top_blob.channel(q);
+
+#if __ARM_NEON
+                int nn = size >> 2;
+                int remain = size - (nn << 2);
+#else
+                int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+                float32x4_t _coeff0 = vdupq_n_f32(coeff0);
+                float32x4_t _coeff1 = vdupq_n_f32(coeff1);
+#if __aarch64__
+                for (; nn>0; nn--)
+                {
+                    float32x4_t _ptr = vld1q_f32(ptr);
+                    float32x4_t _ptr1 = vld1q_f32(ptr1);
+                    float32x4_t _p = vmulq_f32(_ptr, _coeff0);
+                    _p = vmlaq_f32(_p, _ptr1, _coeff1);
+                    vst1q_f32(outptr, _p);
+
+                    ptr += 4;
+                    ptr1 += 4;
+                    outptr += 4;
+                }
+#else
+                if (nn > 0)
+                {
+                asm volatile(
+                    "0:                             \n"
+                    "pld        [%1, #128]          \n"
+                    "pld        [%2, #128]          \n"
+                    "vld1.f32   {d0-d1}, [%1 :128]! \n"
+                    "vld1.f32   {d2-d3}, [%2 :128]! \n"
+                    "vmul.f32   q0, q0, %q8         \n"
+                    "vmla.f32   q0, q1, %q9         \n"
+                    "subs       %0, #1              \n"
+                    "vst1.f32   {d0-d1}, [%3 :128]! \n"
+                    "bne        0b                  \n"
+                    : "=r"(nn),     // %0
+                      "=r"(ptr),    // %1
+                      "=r"(ptr1),   // %2
+                      "=r"(outptr)  // %3
+                    : "0"(nn),
+                      "1"(ptr),
+                      "2"(ptr1),
+                      "3"(outptr),
+                      "w"(_coeff0), // %8
+                      "w"(_coeff1)  // %9
+                    : "cc", "memory", "q0", "q1"
+                );
+                }
+#endif // __aarch64__
+#endif // __ARM_NEON
+                for (; remain>0; remain--)
+                {
+                    *outptr = *ptr * coeff0 + *ptr1 * coeff1;
+
+                    ptr++;
+                    ptr1++;
+                    outptr++;
+                }
+            }
+
+            for (size_t b=2; b<bottom_blobs.size(); b++)
+            {
+                const Mat& bottom_blob1 = bottom_blobs[b];
+                float coeff = coeffs_ptr[b];
+                #pragma omp parallel for
+                for (int q=0; q<channels; q++)
+                {
+                    const float* ptr = bottom_blob1.channel(q);
+                    float* outptr = top_blob.channel(q);
+
+#if __ARM_NEON
+                    int nn = size >> 2;
+                    int remain = size - (nn << 2);
+#else
+                    int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+                    float32x4_t _coeff = vdupq_n_f32(coeff);
+#if __aarch64__
+                    for (; nn>0; nn--)
+                    {
+                        float32x4_t _ptr = vld1q_f32(ptr);
+                        float32x4_t _p = vld1q_f32(outptr);
+                        _p = vmlaq_f32(_p, _ptr, _coeff);
+                        vst1q_f32(outptr, _p);
+
+                        ptr += 4;
+                        outptr += 4;
+                    }
+#else
+                    if (nn > 0)
+                    {
+                    asm volatile(
+                        "0:                             \n"
+                        "pld        [%1, #128]          \n"
+                        "pld        [%2, #128]          \n"
+                        "vld1.f32   {d0-d1}, [%1 :128]! \n"
+                        "vld1.f32   {d2-d3}, [%2 :128]  \n"
+                        "vmla.f32   q1, q0, %q6         \n"
+                        "subs       %0, #1              \n"
+                        "vst1.f32   {d0-d1}, [%2 :128]! \n"
+                        "bne        0b                  \n"
+                        : "=r"(nn),     // %0
+                          "=r"(ptr),    // %1
+                          "=r"(outptr)  // %2
+                        : "0"(nn),
+                          "1"(ptr),
+                          "2"(outptr),
+                          "w"(_coeff)   // %6
+                        : "cc", "memory", "q0", "q1"
+                    );
+                    }
+#endif // __aarch64__
+#endif // __ARM_NEON
+                    for (; remain>0; remain--)
+                    {
+                        *outptr += *ptr * coeff;
+
+                        ptr++;
+                        outptr++;
+                    }
+                }
+            }
+        }
+    }
+    else if (op_type == Operation_MAX)
+    {
+        // first blob
+        const Mat& bottom_blob1 = bottom_blobs[1];
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            const float* ptr1 = bottom_blob1.channel(q);
+            float* outptr = top_blob.channel(q);
+
+#if __ARM_NEON
+            int nn = size >> 2;
+            int remain = size - (nn << 2);
+#else
+            int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+            for (; nn>0; nn--)
+            {
+                float32x4_t _ptr = vld1q_f32(ptr);
+                float32x4_t _ptr1 = vld1q_f32(ptr1);
+                float32x4_t _p = vmaxq_f32(_ptr, _ptr1);
+                vst1q_f32(outptr, _p);
+
+                ptr += 4;
+                ptr1 += 4;
+                outptr += 4;
+            }
+#else
+            if (nn > 0)
+            {
+            asm volatile(
+                "0:                             \n"
+                "pld        [%1, #128]          \n"
+                "pld        [%2, #128]          \n"
+                "vld1.f32   {d0-d1}, [%1 :128]! \n"
+                "vld1.f32   {d2-d3}, [%2 :128]! \n"
+                "vmax.f32   q0, q0, q1          \n"
+                "subs       %0, #1              \n"
+                "vst1.f32   {d0-d1}, [%3 :128]! \n"
+                "bne        0b                  \n"
+                : "=r"(nn),     // %0
+                  "=r"(ptr),    // %1
+                  "=r"(ptr1),   // %2
+                  "=r"(outptr)  // %3
+                : "0"(nn),
+                  "1"(ptr),
+                  "2"(ptr1),
+                  "3"(outptr)
+                : "cc", "memory", "q0", "q1"
+            );
+            }
+#endif // __aarch64__
+#endif // __ARM_NEON
+            for (; remain>0; remain--)
+            {
+                *outptr = std::max(*ptr, *ptr1);
+
+                ptr++;
+                ptr1++;
+                outptr++;
+            }
+        }
+
+        for (size_t b=2; b<bottom_blobs.size(); b++)
+        {
+            const Mat& bottom_blob1 = bottom_blobs[b];
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob1.channel(q);
+                float* outptr = top_blob.channel(q);
+
+#if __ARM_NEON
+                int nn = size >> 2;
+                int remain = size - (nn << 2);
+#else
+                int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+                for (; nn>0; nn--)
+                {
+                    float32x4_t _ptr = vld1q_f32(ptr);
+                    float32x4_t _p = vld1q_f32(outptr);
+                    _p = vmaxq_f32(_ptr, _p);
+                    vst1q_f32(outptr, _p);
+
+                    ptr += 4;
+                    outptr += 4;
+                }
+#else
+                if (nn > 0)
+                {
+                asm volatile(
+                    "0:                             \n"
+                    "pld        [%1, #128]          \n"
+                    "pld        [%2, #128]          \n"
+                    "vld1.f32   {d0-d1}, [%1 :128]! \n"
+                    "vld1.f32   {d2-d3}, [%2 :128]  \n"
+                    "vmax.f32   q0, q0, q1          \n"
+                    "subs       %0, #1              \n"
+                    "vst1.f32   {d0-d1}, [%2 :128]! \n"
+                    "bne        0b                  \n"
+                    : "=r"(nn),     // %0
+                      "=r"(ptr),    // %1
+                      "=r"(outptr)  // %2
+                    : "0"(nn),
+                      "1"(ptr),
+                      "2"(outptr)
+                    : "cc", "memory", "q0", "q1"
+                );
+                }
+#endif // __aarch64__
+#endif // __ARM_NEON
+                for (; remain>0; remain--)
+                {
+                    *outptr = std::max(*ptr, *outptr);
+
+                    ptr++;
+                    outptr++;
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/eltwise_arm.h b/src/layer/arm/eltwise_arm.h
new file mode 100644
index 00000000000..060fac695fc
--- /dev/null
+++ b/src/layer/arm/eltwise_arm.h
@@ -0,0 +1,30 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_ELTWISE_ARM_H
+#define LAYER_ELTWISE_ARM_H
+
+#include "eltwise.h"
+
+namespace ncnn {
+
+// ARM-targeted variant of the Eltwise layer. It derives from Eltwise and
+// only re-declares the multi-input forward(); no new data members are added.
+class Eltwise_arm : public Eltwise
+{
+public:
+    // Reads the input blobs from bottom_blobs and writes the result to top_blobs.
+    // The definition lives outside this header.
+    // NOTE(review): presumably returns 0 on success like the other layers — confirm.
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_ELTWISE_ARM_H
diff --git a/src/layer/arm/innerproduct_arm.cpp b/src/layer/arm/innerproduct_arm.cpp
new file mode 100644
index 00000000000..50e6cdaf5c9
--- /dev/null
+++ b/src/layer/arm/innerproduct_arm.cpp
@@ -0,0 +1,136 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "innerproduct_arm.h"
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(InnerProduct_arm)
+
+// Fully-connected (inner product) forward pass.
+//
+// For every output index p in [0, num_output) this computes
+//     top_blob[p] = (bias_term ? bias_data[p] : 0) + sum(input[i] * weight_p[i])
+// over all w * h * c input elements. The weights of output p are the
+// size * channels consecutive floats starting at weight_data + size * channels * p
+// and are consumed channel by channel in the same order as the input.
+//
+// bottom_blob: input blob.
+// top_blob:    created here as a 1 x 1 x num_output blob (one scalar per channel).
+// Returns 0 on success, -100 if top_blob could not be allocated.
+int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    int size = w * h;
+
+    top_blob.create(1, 1, num_output);
+    if (top_blob.empty())
+        return -100;
+
+    // num_output
+    const float* weight_data_ptr = weight_data;
+    #pragma omp parallel for
+    for (int p=0; p<num_output; p++)
+    {
+        float* outptr = top_blob.channel(p);
+        float sum = 0.f;
+
+        if (bias_term)
+            sum = bias_data.data[p];
+
+        // weights of output p; note that this pointer shadows the int w (blob width)
+        // declared at function scope. It is advanced continuously across channels.
+        const float* w = weight_data_ptr + size * channels * p;
+
+#if __ARM_NEON
+        // two independent vector accumulators, reduced to a scalar after the channel loop
+        float32x4_t _sum = vdupq_n_f32(0.f);
+        float32x4_t _sum2 = vdupq_n_f32(0.f);
+#endif // __ARM_NEON
+
+        // channels
+        for (int q=0; q<channels; q++)
+        {
+            const float* m = bottom_blob.channel(q);
+
+#if __ARM_NEON
+            // 8 floats per vector iteration, the rest is handled by the scalar tail
+            int nn = size >> 3;
+            int remain = size & 7;
+#else
+            int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+            for (; nn>0; nn--)
+            {
+                float32x4_t _m = vld1q_f32(m);
+                float32x4_t _w = vld1q_f32(w);
+                _sum = vfmaq_f32(_sum, _m, _w);
+
+                _m = vld1q_f32(m + 4);
+                _w = vld1q_f32(w + 4);
+                _sum2 = vfmaq_f32(_sum2, _m, _w);
+
+                m += 8;
+                w += 8;
+            }
+#else
+            if (nn > 0)
+            {
+            // armv7 version of the loop above: q0/q1 hold 8 inputs, q2/q3 hold 8 weights.
+            // Only the input load carries a :128 alignment hint; the weight load does not,
+            // presumably because w is advanced by the scalar tail and may be unaligned.
+            asm volatile(
+                "0:                             \n"
+                "pld        [%1, #256]          \n"
+                "vld1.f32   {d0-d3}, [%1 :128]! \n"
+                "pld        [%2, #256]          \n"
+                "vld1.f32   {d4-d7}, [%2]!      \n"
+                "vmla.f32   %q3, q0, q2         \n"
+                "subs       %0, #1              \n"
+                "vmla.f32   %q4, q1, q3         \n"
+                "bne        0b                  \n"
+                : "=r"(nn),     // %0
+                  "=r"(m),      // %1
+                  "=r"(w),      // %2
+                  "=w"(_sum),   // %3
+                  "=w"(_sum2)   // %4
+                : "0"(nn),
+                  "1"(m),
+                  "2"(w),
+                  "3"(_sum),
+                  "4"(_sum2)
+                : "cc", "memory", "q0", "q1", "q2", "q3"
+            );
+            }
+#endif // __aarch64__
+#endif // __ARM_NEON
+            // scalar tail (or the whole channel when NEON is not available)
+            for (; remain>0; remain--)
+            {
+                sum += *m * *w;
+
+                m++;
+                w++;
+            }
+        }
+
+#if __ARM_NEON
+        // horizontal add of both vector accumulators into the scalar sum
+        _sum = vaddq_f32(_sum, _sum2);
+#if __aarch64__
+        sum += vaddvq_f32(_sum);
+#else
+        float32x2_t _sumss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
+        _sumss = vpadd_f32(_sumss, _sumss);
+        sum += vget_lane_f32(_sumss, 0);
+#endif // __aarch64__
+#endif // __ARM_NEON
+
+        outptr[0] = sum;
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/innerproduct_arm.h b/src/layer/arm/innerproduct_arm.h
new file mode 100644
index 00000000000..5fdf3fe20f8
--- /dev/null
+++ b/src/layer/arm/innerproduct_arm.h
@@ -0,0 +1,30 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_INNERPRODUCT_ARM_H
+#define LAYER_INNERPRODUCT_ARM_H
+
+#include "innerproduct.h"
+
+namespace ncnn {
+
+// ARM-targeted variant of the InnerProduct layer. It derives from InnerProduct
+// and only re-declares the single-input forward(); no new data members are added.
+class InnerProduct_arm : public InnerProduct
+{
+public:
+    // Computes top_blob from bottom_blob; defined in innerproduct_arm.cpp, where it
+    // returns 0 on success and -100 when the output blob cannot be allocated.
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_INNERPRODUCT_ARM_H
diff --git a/src/layer/arm/lrn_arm.cpp b/src/layer/arm/lrn_arm.cpp
new file mode 100644
index 00000000000..901bc6e0243
--- /dev/null
+++ b/src/layer/arm/lrn_arm.cpp
@@ -0,0 +1,227 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "lrn_arm.h"
+#include <math.h>
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#include "neon_mathfun.h"
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(LRN_arm)
+
+// Local Response Normalization, applied in place.
+//
+// Every element x of bottom_top_blob is replaced by
+//     x * pow(1 + alpha / local_size * S, -beta)
+// where S is a sum of squared inputs over a neighbourhood selected by region_type:
+//   NormRegion_ACROSS_CHANNELS - the same spatial position in channels
+//                                q - local_size/2 .. q + local_size/2, clipped to the
+//                                valid channel range (local_size channels for odd local_size)
+//   NormRegion_WITHIN_CHANNEL  - a local_size x local_size spatial window around the
+//                                element inside its own channel, zero-padded at the borders
+//
+// Returns 0 on success and -100 when a temporary blob cannot be allocated.
+int LRN_arm::forward_inplace(Mat& bottom_top_blob) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int channels = bottom_top_blob.c;
+    int size = w * h;
+
+    // squared values with local_size padding
+    Mat square_blob;
+    square_blob.create(w, h, channels);
+    if (square_blob.empty())
+        return -100;
+
+    // step 1: square_blob = bottom_top_blob * bottom_top_blob, element-wise
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float* ptr = bottom_top_blob.channel(q);
+        float* outptr = square_blob.channel(q);
+
+#if __ARM_NEON
+        int nn = size >> 2;
+        int remain = size - (nn << 2);
+#else
+        int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+        for (; nn>0; nn--)
+        {
+            float32x4_t _p = vld1q_f32(ptr);
+            float32x4_t _outp = vmulq_f32(_p, _p);
+            vst1q_f32(outptr, _outp);
+
+            ptr += 4;
+            outptr += 4;
+        }
+#endif // __ARM_NEON
+        for (; remain>0; remain--)
+        {
+            *outptr = *ptr * *ptr;
+
+            ptr++;
+            outptr++;
+        }
+    }
+
+    // NOTE(review): the same alpha / local_size factor is used for both region types,
+    // although the within-channel window holds local_size * local_size samples —
+    // confirm against the reference LRN implementation.
+    float alpha_div_size = alpha / local_size;
+
+    if (region_type == NormRegion_ACROSS_CHANNELS)
+    {
+        Mat square_sum;
+        square_sum.create(w, h, channels);
+        if (square_sum.empty())
+            return -100;
+        square_sum.fill(0.f);
+
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            // square sum over the channel window centred on q.
+            // The upper bound must mirror the lower one (local_size / 2 on each side);
+            // running up to q + local_size would pull in channels outside the window.
+            for (int p=q - local_size / 2; p<=q + local_size / 2; p++)
+            {
+                if (p < 0 || p >= channels)
+                    continue;
+
+                const float* sptr = square_blob.channel(p);
+                float* ssptr = square_sum.channel(q);
+
+#if __ARM_NEON
+                int nn = size >> 2;
+                int remain = size - (nn << 2);
+#else
+                int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+                for (; nn>0; nn--)
+                {
+                    float32x4_t _sp = vld1q_f32(sptr);
+                    float32x4_t _ssp = vld1q_f32(ssptr);
+                    _ssp = vaddq_f32(_ssp, _sp);
+                    vst1q_f32(ssptr, _ssp);
+
+                    sptr += 4;
+                    ssptr += 4;
+                }
+#endif // __ARM_NEON
+                for (; remain>0; remain--)
+                {
+                    *ssptr += *sptr;
+                    sptr++;
+                    ssptr++;
+                }
+            }
+
+            // normalize channel q with its accumulated square sum
+            float* ptr = bottom_top_blob.channel(q);
+            float* ssptr = square_sum.channel(q);
+
+#if __ARM_NEON
+            int nn = size >> 2;
+            int remain = size - (nn << 2);
+#else
+            int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+            float32x4_t _v1 = vdupq_n_f32(1.f);
+            float32x4_t _ads = vdupq_n_f32(alpha_div_size);
+            float32x4_t _mb = vdupq_n_f32(-beta);
+            for (; nn>0; nn--)
+            {
+                // p *= pow(1 + alpha_div_size * ss, -beta), using the approximate pow_ps
+                float32x4_t _p = vld1q_f32(ptr);
+                float32x4_t _ssp = vld1q_f32(ssptr);
+                _ssp = vmulq_f32(_ssp, _ads);
+                _ssp = vaddq_f32(_ssp, _v1);
+                _ssp = pow_ps(_ssp, _mb);
+                _p = vmulq_f32(_p, _ssp);
+                vst1q_f32(ptr, _p);
+
+                ssptr += 4;
+                ptr += 4;
+            }
+#endif // __ARM_NEON
+            for (; remain>0; remain--)
+            {
+                *ptr = *ptr * pow(1.f + alpha_div_size * *ssptr, -beta);
+
+                ssptr++;
+                ptr++;
+            }
+        }
+    }
+    else if (region_type == NormRegion_WITHIN_CHANNEL)
+    {
+        int outw = w;
+        int outh = h;
+
+        // zero-pad the squared blob so that every output position has a full window;
+        // from here on w and h describe the padded blob, outw and outh the original one
+        Mat square_blob_bordered = square_blob;
+        int pad = local_size / 2;
+        if (pad > 0)
+        {
+            copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f);
+            if (square_blob_bordered.empty())
+                return -100;
+
+            w = square_blob_bordered.w;
+            h = square_blob_bordered.h;
+        }
+
+        const int maxk = local_size * local_size;
+
+        // norm window offsets: element offsets of the local_size x local_size window
+        // relative to its top-left corner inside a padded row of width w
+        std::vector<int> _space_ofs(maxk);
+        int* space_ofs = &_space_ofs[0];
+        {
+            int p1 = 0;
+            int p2 = 0;
+            int gap = w - local_size;
+            for (int i = 0; i < local_size; i++)
+            {
+                for (int j = 0; j < local_size; j++)
+                {
+                    space_ofs[p1] = p2;
+                    p1++;
+                    p2++;
+                }
+                p2 += gap;
+            }
+        }
+
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            float* ptr = bottom_top_blob.channel(q);
+            // start of padded row i; the window of output (i, j) begins at column j of it
+            const float* sptr = square_blob_bordered.channel(q);
+
+            for (int i = 0; i < outh; i++)
+            {
+                for (int j = 0; j < outw; j++)
+                {
+                    float ss = 0.f;
+
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        // the window has to follow the column index; without + j every
+                        // pixel of a row would be normalized by the window at column 0
+                        float val = sptr[ space_ofs[k] + j ];
+                        ss += val;
+                    }
+
+                    ptr[j] = ptr[j] * pow(1.f + alpha_div_size * ss, -beta);
+                }
+
+                ptr += outw;
+                sptr += w;
+            }
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/lrn_arm.h b/src/layer/arm/lrn_arm.h
new file mode 100644
index 00000000000..6b052945e8b
--- /dev/null
+++ b/src/layer/arm/lrn_arm.h
@@ -0,0 +1,30 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_LRN_ARM_H
+#define LAYER_LRN_ARM_H
+
+#include "lrn.h"
+
+namespace ncnn {
+
+// ARM-targeted variant of the LRN layer. It derives from LRN and only
+// re-declares forward_inplace(); no new data members are added.
+class LRN_arm : public LRN
+{
+public:
+    // Normalizes bottom_top_blob in place; defined in lrn_arm.cpp, where it returns
+    // 0 on success and -100 when a temporary blob cannot be allocated.
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_LRN_ARM_H
diff --git a/src/layer/arm/neon_mathfun.h b/src/layer/arm/neon_mathfun.h
new file mode 100644
index 00000000000..a13371fdbf3
--- /dev/null
+++ b/src/layer/arm/neon_mathfun.h
@@ -0,0 +1,316 @@
+/* NEON implementation of sin, cos, exp and log
+ *
+ *   Inspired by Intel Approximate Math library, and based on the
+ *   corresponding algorithms of the cephes math library
+ */
+
+/* Copyright (C) 2011  Julien Pommier
+ *
+ *  This software is provided 'as-is', without any express or implied
+ *  warranty.  In no event will the authors be held liable for any damages
+ *  arising from the use of this software.
+ *
+ *  Permission is granted to anyone to use this software for any purpose,
+ *  including commercial applications, and to alter it and redistribute it
+ *  freely, subject to the following restrictions:
+ *
+ *  1. The origin of this software must not be misrepresented; you must not
+ *     claim that you wrote the original software. If you use this software
+ *     in a product, an acknowledgment in the product documentation would be
+ *     appreciated but is not required.
+ *  2. Altered source versions must be plainly marked as such, and must not be
+ *     misrepresented as being the original software.
+ *  3. This notice may not be removed or altered from any source distribution.
+ *
+ *  (this is the zlib license)
+ */
+
+#include <arm_neon.h>
+
+/* Constants used by log_ps. */
+/* every bit set except the 8 exponent bits of an IEEE-754 single:
+ * ANDing with it clears the exponent and keeps sign and mantissa */
+#define c_inv_mant_mask ~0x7f800000u
+/* sqrt(1/2), the threshold between the two range-reduction cases */
+#define c_cephes_SQRTHF 0.707106781186547524
+/* polynomial coefficients, consumed in the order p0 .. p8 */
+#define c_cephes_log_p0 7.0376836292E-2
+#define c_cephes_log_p1 - 1.1514610310E-1
+#define c_cephes_log_p2 1.1676998740E-1
+#define c_cephes_log_p3 - 1.2420140846E-1
+#define c_cephes_log_p4 + 1.4249322787E-1
+#define c_cephes_log_p5 - 1.6668057665E-1
+#define c_cephes_log_p6 + 2.0000714765E-1
+#define c_cephes_log_p7 - 2.4999993993E-1
+#define c_cephes_log_p8 + 3.3333331174E-1
+/* q1 + q2 == ln(2) (0.693147...), split into a small and a large part */
+#define c_cephes_log_q1 -2.12194440e-4
+#define c_cephes_log_q2 0.693359375
+
+/* natural logarithm computed for 4 simultaneous float
+ *   return NaN for x <= 0
+ *
+ *   x is split into exponent e and a mantissa in [0.5, 1); the result is
+ *   a polynomial in the reduced mantissa plus e * ln(2).
+ */
+static inline float32x4_t log_ps(float32x4_t x)
+{
+    float32x4_t one = vdupq_n_f32(1);
+
+    /* negative lanes are clamped to 0 here, so the mask below covers every x <= 0 */
+    x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */
+    uint32x4_t invalid_mask = vcleq_f32(x, vdupq_n_f32(0));
+
+    int32x4_t ux = vreinterpretq_s32_f32(x);
+
+    /* biased exponent field of x (the bias 0x7f is removed further down) */
+    int32x4_t emm0 = vshrq_n_s32(ux, 23);
+
+    /* keep only the fractional part */
+    /* i.e. replace the exponent bits with those of 0.5f, giving a value in [0.5, 1) */
+    ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask));
+    ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f)));
+    x = vreinterpretq_f32_s32(ux);
+
+    emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f));
+    float32x4_t e = vcvtq_f32_s32(emm0);
+
+    e = vaddq_f32(e, one);
+
+    /* part2:
+     *     if( x < SQRTHF ) {
+     *       e -= 1;
+     *       x = x + x - 1.0;
+     *     } else { x = x - 1.0; }
+     */
+    /* branch-free form of the above: tmp is x where the mask is set and 0 elsewhere */
+    uint32x4_t mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF));
+    float32x4_t tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
+    x = vsubq_f32(x, one);
+    e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask)));
+    x = vaddq_f32(x, tmp);
+
+    float32x4_t z = vmulq_f32(x,x);
+
+    /* Horner evaluation of the polynomial p0 .. p8 in x, followed by one more * x */
+    float32x4_t y = vdupq_n_f32(c_cephes_log_p0);
+    y = vmulq_f32(y, x);
+    y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1));
+    y = vmulq_f32(y, x);
+    y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2));
+    y = vmulq_f32(y, x);
+    y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3));
+    y = vmulq_f32(y, x);
+    y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4));
+    y = vmulq_f32(y, x);
+    y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5));
+    y = vmulq_f32(y, x);
+    y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6));
+    y = vmulq_f32(y, x);
+    y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7));
+    y = vmulq_f32(y, x);
+    y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8));
+    y = vmulq_f32(y, x);
+
+    y = vmulq_f32(y, z);
+
+
+    /* y += e * q1 (small part of e * ln(2)) */
+    tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1));
+    y = vaddq_f32(y, tmp);
+
+
+    /* y -= 0.5 * x^2 */
+    tmp = vmulq_f32(z, vdupq_n_f32(0.5f));
+    y = vsubq_f32(y, tmp);
+
+    /* result = x + y + e * q2 (large part of e * ln(2)) */
+    tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2));
+    x = vaddq_f32(x, y);
+    x = vaddq_f32(x, tmp);
+    /* OR-ing all-ones into the invalid lanes turns them into a NaN bit pattern */
+    x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN
+    return x;
+}
+
+/* Constants used by exp_ps. */
+/* the argument is clamped to [c_exp_lo, c_exp_hi] before evaluation */
+#define c_exp_hi 88.3762626647949f
+#define c_exp_lo -88.3762626647949f
+
+/* log2(e) */
+#define c_cephes_LOG2EF 1.44269504088896341
+/* C1 + C2 == ln(2) (0.693147...), split into a large and a small part */
+#define c_cephes_exp_C1 0.693359375
+#define c_cephes_exp_C2 -2.12194440e-4
+
+/* polynomial coefficients, consumed in the order p0 .. p5 */
+#define c_cephes_exp_p0 1.9875691500E-4
+#define c_cephes_exp_p1 1.3981999507E-3
+#define c_cephes_exp_p2 8.3334519073E-3
+#define c_cephes_exp_p3 4.1665795894E-2
+#define c_cephes_exp_p4 1.6666665459E-1
+#define c_cephes_exp_p5 5.0000001201E-1
+
+/* exp() computed for 4 float at once
+ *
+ *   x is written as g + n*ln(2); exp(g) is approximated by a polynomial
+ *   and the result is scaled by 2^n, built directly from exponent bits.
+ */
+static inline float32x4_t exp_ps(float32x4_t x)
+{
+    float32x4_t tmp, fx;
+
+    float32x4_t one = vdupq_n_f32(1);
+    x = vminq_f32(x, vdupq_n_f32(c_exp_hi));
+    x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo));
+
+    /* express exp(x) as exp(g + n*log(2)) */
+    /* fx = x * log2(e) + 0.5 */
+    fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF));
+
+    /* perform a floorf */
+    /* the int conversion truncates toward zero; the correction below handles negatives */
+    tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
+
+    /* if greater, subtract 1 */
+    /* mask ends up holding the bit pattern of 1.0f in the lanes where tmp > fx */
+    uint32x4_t mask = vcgtq_f32(tmp, fx);
+    mask = vandq_u32(mask, vreinterpretq_u32_f32(one));
+
+
+    fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
+
+    /* g = x - n*ln(2), subtracted in two steps (C1, then C2) */
+    tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1));
+    float32x4_t z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2));
+    x = vsubq_f32(x, tmp);
+    x = vsubq_f32(x, z);
+
+    /* each coefficient is broadcast to all four lanes */
+    static const float cephes_exp_p[6] = { c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2, c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5 };
+    float32x4_t y = vld1q_dup_f32(cephes_exp_p+0);
+    float32x4_t c1 = vld1q_dup_f32(cephes_exp_p+1);
+    float32x4_t c2 = vld1q_dup_f32(cephes_exp_p+2);
+    float32x4_t c3 = vld1q_dup_f32(cephes_exp_p+3);
+    float32x4_t c4 = vld1q_dup_f32(cephes_exp_p+4);
+    float32x4_t c5 = vld1q_dup_f32(cephes_exp_p+5);
+
+    /* Horner evaluation of p0 .. p5 in x; z = x^2 is needed afterwards */
+    y = vmulq_f32(y, x);
+    z = vmulq_f32(x, x);
+
+    y = vaddq_f32(y, c1);
+    y = vmulq_f32(y, x);
+    y = vaddq_f32(y, c2);
+    y = vmulq_f32(y, x);
+    y = vaddq_f32(y, c3);
+    y = vmulq_f32(y, x);
+    y = vaddq_f32(y, c4);
+    y = vmulq_f32(y, x);
+    y = vaddq_f32(y, c5);
+
+    /* y = poly(x) * x^2 + x + 1 */
+    y = vmulq_f32(y, z);
+    y = vaddq_f32(y, x);
+    y = vaddq_f32(y, one);
+
+    /* build 2^n */
+    /* (n + 0x7f) shifted into the exponent field is the float 2^n */
+    int32x4_t mm;
+    mm = vcvtq_s32_f32(fx);
+    mm = vaddq_s32(mm, vdupq_n_s32(0x7f));
+    mm = vshlq_n_s32(mm, 23);
+    float32x4_t pow2n = vreinterpretq_f32_s32(mm);
+
+    y = vmulq_f32(y, pow2n);
+    return y;
+}
+
+/* Constants used by sincos_ps. */
+/* three negated terms DP1..DP3, applied one after another in the argument reduction */
+#define c_minus_cephes_DP1 -0.78515625
+#define c_minus_cephes_DP2 -2.4187564849853515625e-4
+#define c_minus_cephes_DP3 -3.77489497744594108e-8
+/* coefficients of the sine polynomial */
+#define c_sincof_p0 -1.9515295891E-4
+#define c_sincof_p1  8.3321608736E-3
+#define c_sincof_p2 -1.6666654611E-1
+/* coefficients of the cosine polynomial */
+#define c_coscof_p0  2.443315711809948E-005
+#define c_coscof_p1 -1.388731625493765E-003
+#define c_coscof_p2  4.166664568298827E-002
+#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI
+
+/* evaluation of 4 sines & cosines at once.
+ *
+ *   The code is the exact rewriting of the cephes sinf function.
+ *   Precision is excellent as long as x < 8192 (I did not bother to
+ *   take into account the special handling they have for greater values
+ *   -- it does not return garbage for arguments over 8192, though, but
+ *   the extra precision is missing).
+ *
+ *   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+ *   surprising but correct result.
+ *
+ *   Note also that when you compute sin(x), cos(x) is available at
+ *   almost no extra price so both sin_ps and cos_ps make use of
+ *   sincos_ps..
+ *
+ *   x:    four input angles
+ *   ysin: receives the four sines
+ *   ycos: receives the four cosines
+ */
+static inline void sincos_ps(float32x4_t x, float32x4_t *ysin, float32x4_t *ycos)
+{
+    // any x
+    float32x4_t xmm1, xmm2, xmm3, y;
+
+    uint32x4_t emm2;
+
+    /* remember which lanes were negative, then continue with |x| */
+    uint32x4_t sign_mask_sin, sign_mask_cos;
+    sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0));
+    x = vabsq_f32(x);
+
+    /* scale by 4/Pi */
+    y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI));
+
+    /* store the integer part of y in emm2 */
+    emm2 = vcvtq_u32_f32(y);
+    /* j=(j+1) & (~1) (see the cephes sources) */
+    emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
+    emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
+    y = vcvtq_f32_u32(emm2);
+
+    /* get the polynomial selection mask
+     *     there is one polynomial for 0 <= x <= Pi/4
+     *     and another one for Pi/4<x<=Pi/2
+     *
+     *     Both branches will be computed.
+     */
+    uint32x4_t poly_mask = vtstq_u32(emm2, vdupq_n_u32(2));
+
+    /* The magic pass: "Extended precision modular arithmetic"
+     *     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+    /* the constants are stored negated, hence the additions below */
+    xmm1 = vmulq_n_f32(y, c_minus_cephes_DP1);
+    xmm2 = vmulq_n_f32(y, c_minus_cephes_DP2);
+    xmm3 = vmulq_n_f32(y, c_minus_cephes_DP3);
+    x = vaddq_f32(x, xmm1);
+    x = vaddq_f32(x, xmm2);
+    x = vaddq_f32(x, xmm3);
+
+    /* sign of the sine: input sign XOR bit 2 of j; sign of the cosine: bit 2 of j - 2 */
+    sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, vdupq_n_u32(4)));
+    sign_mask_cos = vtstq_u32(vsubq_u32(emm2, vdupq_n_u32(2)), vdupq_n_u32(4));
+
+    /* Evaluate both polynomials on the reduced argument:
+     *     y1 uses the coscof coefficients (cosine approximation),
+     *     y2 uses the sincof coefficients (sine approximation) */
+    float32x4_t z = vmulq_f32(x,x);
+    float32x4_t y1, y2;
+
+    y1 = vmulq_n_f32(z, c_coscof_p0);
+    y2 = vmulq_n_f32(z, c_sincof_p0);
+    y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p1));
+    y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p1));
+    y1 = vmulq_f32(y1, z);
+    y2 = vmulq_f32(y2, z);
+    y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p2));
+    y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p2));
+    y1 = vmulq_f32(y1, z);
+    y2 = vmulq_f32(y2, z);
+    y1 = vmulq_f32(y1, z);
+    y2 = vmulq_f32(y2, x);
+    /* y1 = poly * z^2 - z/2 + 1,  y2 = poly * z * x + x */
+    y1 = vsubq_f32(y1, vmulq_f32(z, vdupq_n_f32(0.5f)));
+    y2 = vaddq_f32(y2, x);
+    y1 = vaddq_f32(y1, vdupq_n_f32(1));
+
+    /* select the correct result from the two polynomials */
+    float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
+    float32x4_t yc = vbslq_f32(poly_mask, y2, y1);
+    /* apply the signs: sine is negated where its mask is set, cosine where its mask is clear */
+    *ysin = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
+    *ycos = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
+}
+
+/* Sine of four floats at once: runs sincos_ps and keeps only the sine half. */
+static inline float32x4_t sin_ps(float32x4_t x)
+{
+    float32x4_t sine;
+    float32x4_t cosine_unused;
+
+    sincos_ps(x, &sine, &cosine_unused);
+
+    return sine;
+}
+
+/* Cosine of four floats at once: runs sincos_ps and keeps only the cosine half. */
+static inline float32x4_t cos_ps(float32x4_t x)
+{
+    float32x4_t sine_unused;
+    float32x4_t cosine;
+
+    sincos_ps(x, &sine_unused, &cosine);
+
+    return cosine;
+}
+
+/* Approximate element-wise a / b, computed as a * (1 / b).
+ * The reciprocal estimate is refined with a single Newton-Raphson step;
+ * a second step would tighten the result further but is not performed. */
+static inline float32x4_t div_ps(float32x4_t a, float32x4_t b)
+{
+    const float32x4_t estimate = vrecpeq_f32(b);
+    const float32x4_t correction = vrecpsq_f32(b, estimate);
+    const float32x4_t inv_b = vmulq_f32(correction, estimate);
+
+    return vmulq_f32(a, inv_b);
+}
+
+/* Element-wise power a^b for four floats, evaluated as exp(b * log(a))
+ * with the approximations log_ps and exp_ps. */
+static inline float32x4_t pow_ps(float32x4_t a, float32x4_t b)
+{
+    const float32x4_t log_a = log_ps(a);
+    const float32x4_t scaled = vmulq_f32(b, log_a);
+
+    return exp_ps(scaled);
+}
diff --git a/src/layer/arm/pooling_2x2.h b/src/layer/arm/pooling_2x2.h
new file mode 100644
index 00000000000..77fadb77909
--- /dev/null
+++ b/src/layer/arm/pooling_2x2.h
@@ -0,0 +1,112 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+// 2x2 max pooling with stride 2 over every channel of bottom_blob.
+//
+// top_blob is not allocated here: its w/h are read as the number of outputs per
+// row and the number of output rows, and results are written through
+// top_blob.channel(q). Each output is the maximum of a 2x2 input window, so one
+// output row of outw values consumes 2*outw columns from two consecutive input
+// rows.
+static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob)
+{
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+
+    // After one output row the row pointers sit 2*outw columns into their input
+    // row; adding tailstep moves them exactly two input rows below where they
+    // started. This equals w when w == 2*outw and stays correct for wider rows.
+    const int tailstep = w - 2*outw + w;
+
+    #pragma omp parallel for
+    for (int q=0; q<inch; q++)
+    {
+        const float* img0 = bottom_blob.channel(q);
+        float* outptr = top_blob.channel(q);
+
+        // top and bottom input rows of the current window row
+        const float* r0 = img0;
+        const float* r1 = img0 + w;
+
+        for (int i = 0; i < outh; i++)
+        {
+#if __ARM_NEON
+            // nn groups of 4 outputs are vectorized, the rest is done in scalar code
+            int nn = outw >> 2;
+            int remain = outw - (nn << 2);
+#else
+            int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+            for (; nn>0; nn--)
+            {
+                float32x4_t _r00 = vld1q_f32(r0);
+                float32x4_t _r10 = vld1q_f32(r1);
+                float32x4_t _r01 = vld1q_f32(r0 + 4);
+                float32x4_t _r11 = vld1q_f32(r1 + 4);
+
+                // vertical max of the two rows, 8 columns wide
+                float32x4_t _max0 = vmaxq_f32(_r00, _r10);
+                float32x4_t _max1 = vmaxq_f32(_r01, _r11);
+
+                // pairwise max of adjacent columns folds 8 columns into 4 outputs
+                float32x4_t _max = vpmaxq_f32(_max0, _max1);
+
+                vst1q_f32(outptr, _max);
+
+                r0 += 8;
+                r1 += 8;
+                outptr += 4;
+            }
+#else
+            if (nn > 0)
+            {
+            // same computation as the aarch64 loop: q0,q1 <- 8 floats of r0,
+            // q2,q3 <- 8 floats of r1, vertical vmax, pairwise vpmax, store 4 outputs
+            asm volatile(
+                "0:                             \n"
+                "pld        [%1, #256]          \n"
+                "pld        [%2, #256]          \n"
+                "vld1.f32   {d0-d3}, [%1]!      \n"
+                "vld1.f32   {d4-d7}, [%2]!      \n"
+                "vmax.f32   q0, q0, q2          \n"
+                "vmax.f32   q1, q1, q3          \n"
+                "vpmax.f32  d4, d0, d1          \n"
+                "vpmax.f32  d5, d2, d3          \n"
+                "subs       %0, #1              \n"
+                "vst1.f32   {d4-d5}, [%3]!      \n"
+                "bne        0b                  \n"
+                : "=r"(nn),     // %0
+                  "=r"(r0),     // %1
+                  "=r"(r1),     // %2
+                  "=r"(outptr)  // %3
+                : "0"(nn),
+                  "1"(r0),
+                  "2"(r1),
+                  "3"(outptr)
+                : "cc", "memory", "q0", "q1", "q2", "q3"
+            );
+            }
+#endif // __aarch64__
+#endif // __ARM_NEON
+            // scalar tail (or the whole row when NEON is unavailable)
+            for (; remain>0; remain--)
+            {
+                float max0 = std::max(r0[0], r0[1]);
+                float max1 = std::max(r1[0], r1[1]);
+
+                *outptr = std::max(max0, max1);
+
+                r0 += 2;
+                r1 += 2;
+                outptr++;
+            }
+
+            r0 += tailstep;
+            r1 += tailstep;
+        }
+    }
+}
diff --git a/src/layer/arm/pooling_3x3.h b/src/layer/arm/pooling_3x3.h
new file mode 100644
index 00000000000..47dad16d22a
--- /dev/null
+++ b/src/layer/arm/pooling_3x3.h
@@ -0,0 +1,170 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+// 3x3 max pooling with stride 2 over every channel of bottom_blob.
+//
+// top_blob is not allocated here: its w/h are read as the number of outputs per
+// row and the number of output rows, and results are written through
+// top_blob.channel(q). Output (i, j) is the maximum over input rows 2i..2i+2 and
+// columns 2j..2j+2.
+//
+// NOTE(review): both NEON paths load the *next* group of 8 floats per row while
+// processing the current one (look-ahead), although only its first element is
+// needed; on aarch64 the first group is also loaded when nn == 0. Confirm that
+// the Mat allocation leaves enough readable slack after the last row.
+static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob)
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    // after one output row the row pointers have advanced 2*outw columns; adding
+    // tailstep places them two input rows below where the row started
+    const int tailstep = w - 2*outw + w;
+
+    #pragma omp parallel for
+    for (int q=0; q<inch; q++)
+    {
+        const float* img0 = bottom_blob.channel(q);
+        float* outptr = top_blob.channel(q);
+
+        // the three input rows covered by the current window row
+        const float* r0 = img0;
+        const float* r1 = img0 + w;
+        const float* r2 = img0 + w*2;
+
+        for (int i = 0; i < outh; i++)
+        {
+#if __ARM_NEON
+            // nn groups of 4 outputs are vectorized, the rest is done in scalar code
+            int nn = outw >> 2;
+            int remain = outw - (nn << 2);
+#else
+            int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+            // de-interleaving load: val[0] = columns 0 2 4 6, val[1] = columns 1 3 5 7
+            float32x4x2_t _r0 = vld2q_f32(r0);
+            float32x4x2_t _r1 = vld2q_f32(r1);
+            float32x4x2_t _r2 = vld2q_f32(r2);
+            for (; nn>0; nn--)
+            {
+                // look-ahead: columns 8..15, needed for the third tap of the last window
+                float32x4x2_t _r0n = vld2q_f32(r0+8);
+                float32x4x2_t _r1n = vld2q_f32(r1+8);
+                float32x4x2_t _r2n = vld2q_f32(r2+8);
+
+                // max of the first two taps (columns 2j and 2j+1) per row
+                float32x4_t _max0 = vmaxq_f32(_r0.val[0], _r0.val[1]);
+                float32x4_t _max1 = vmaxq_f32(_r1.val[0], _r1.val[1]);
+                float32x4_t _max2 = vmaxq_f32(_r2.val[0], _r2.val[1]);
+
+                // third tap: columns 2 4 6 8, i.e. even columns shifted by one lane
+                float32x4_t _r02 = vextq_f32(_r0.val[0], _r0n.val[0], 1);
+                float32x4_t _r12 = vextq_f32(_r1.val[0], _r1n.val[0], 1);
+                float32x4_t _r22 = vextq_f32(_r2.val[0], _r2n.val[0], 1);
+
+                _max0 = vmaxq_f32(_max0, _r02);
+                _max1 = vmaxq_f32(_max1, _r12);
+                _max2 = vmaxq_f32(_max2, _r22);
+
+                // combine the three row maxima into the window maximum
+                float32x4_t _max = vmaxq_f32(vmaxq_f32(_max0, _max1), _max2);
+
+                vst1q_f32(outptr, _max);
+
+                // the look-ahead group becomes the current group
+                _r0 = _r0n;
+                _r1 = _r1n;
+                _r2 = _r2n;
+
+                r0 += 8;
+                r1 += 8;
+                r2 += 8;
+                outptr += 4;
+            }
+#else
+            if (nn > 0)
+            {
+            // Same scheme as the aarch64 loop. q0-q5 hold the current group of the
+            // three rows, q6-q11 the look-ahead group, q12-q14 the per-row maxima.
+            // Every load post-increments its pointer, so after the loop each row
+            // pointer is one group (32 bytes) too far and is rewound.
+            asm volatile(
+                "pld        [%1, #256]          \n"
+                "vld2.f32   {d0-d3}, [%1]!      \n"// q0 = 0 2 4 6  q1 = 1 3 5 7
+                "pld        [%2, #256]          \n"
+                "vld2.f32   {d4-d7}, [%2]!      \n"
+                "pld        [%3, #256]          \n"
+                "vld2.f32   {d8-d11}, [%3]!     \n"
+                "0:                             \n"
+                "pld        [%1, #256]          \n"
+                "vld2.f32   {d12-d15}, [%1]!    \n"// q6 = 8 10 12 14  q7 = 9 11 13 15
+
+                "vmax.f32   q12, q0, q1         \n"
+                "vmax.f32   q13, q2, q3         \n"
+
+                "pld        [%2, #256]          \n"
+                "vld2.f32   {d16-d19}, [%2]!    \n"
+
+                "vmax.f32   q14, q4, q5         \n"
+                "vext.32    q0, q0, q6, #1      \n"
+
+                "pld        [%3, #256]          \n"
+                "vld2.f32   {d20-d23}, [%3]!    \n"
+
+                "vext.32    q2, q2, q8, #1      \n"
+
+                "vmax.f32   q12, q12, q0        \n"
+                "vext.32    q4, q4, q10, #1     \n"
+
+                "vmax.f32   q13, q13, q2        \n"
+                "vmax.f32   q14, q14, q4        \n"
+                "vmax.f32   q12, q12, q13       \n"
+
+                "vorr       q0, q6, q6          \n"
+                "vorr       q1, q7, q7          \n"
+                "vmax.f32   q12, q12, q14       \n"
+
+                "vorr       q2, q8, q8          \n"
+                "vorr       q3, q9, q9          \n"
+                "vorr       q4, q10, q10        \n"
+                "vorr       q5, q11, q11        \n"
+
+                "subs       %0, #1              \n"
+                "vst1.f32   {d24-d25}, [%4]!    \n"
+                "bne        0b                  \n"
+                "sub        %1, #32             \n"
+                "sub        %2, #32             \n"
+                "sub        %3, #32             \n"
+                : "=r"(nn),     // %0
+                  "=r"(r0),     // %1
+                  "=r"(r1),     // %2
+                  "=r"(r2),     // %3
+                  "=r"(outptr)  // %4
+                : "0"(nn),
+                  "1"(r0),
+                  "2"(r1),
+                  "3"(r2),
+                  "4"(outptr)
+                : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14"
+            );
+            }
+#endif // __aarch64__
+#endif // __ARM_NEON
+            // scalar tail (or the whole row when NEON is unavailable)
+            for (; remain>0; remain--)
+            {
+                float max0 = std::max(std::max(r0[0], r0[1]), r0[2]);
+                float max1 = std::max(std::max(r1[0], r1[1]), r1[2]);
+                float max2 = std::max(std::max(r2[0], r2[1]), r2[2]);
+
+                *outptr = std::max(std::max(max0, max1), max2);
+
+                r0 += 2;
+                r1 += 2;
+                r2 += 2;
+                outptr++;
+            }
+
+            r0 += tailstep;//1 + w;
+            r1 += tailstep;//1 + w;
+            r2 += tailstep;//1 + w;
+        }
+    }
+}
diff --git a/src/layer/arm/pooling_arm.cpp b/src/layer/arm/pooling_arm.cpp
new file mode 100644
index 00000000000..59c1c997f9e
--- /dev/null
+++ b/src/layer/arm/pooling_arm.cpp
@@ -0,0 +1,96 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "pooling_arm.h"
+
+namespace ncnn {
+
+#include "pooling_2x2.h"
+#include "pooling_3x3.h"
+
+DEFINE_LAYER_CREATOR(Pooling_arm)
+
+// NEON-accelerated pooling entry point.
+//
+// Only max pooling with stride 2 and a 2x2 or 3x3 kernel (and no global pooling)
+// is handled here; every other configuration is delegated to Pooling::forward.
+// Returns 0 on success and -100 when an intermediate or output blob ends up
+// empty after allocation.
+// NOTE(review): the explicit `pad` border is filled with 0.f even though this is
+// max pooling - confirm this matches the reference Pooling::forward behaviour.
+int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    const bool kernel_supported = (kernel_size == 2 || kernel_size == 3);
+    const bool mode_supported = (pooling_type == PoolMethod_MAX && stride == 2 && global_pooling != 1);
+    if (!mode_supported || !kernel_supported)
+        return Pooling::forward(bottom_blob, top_blob);
+
+    const int channels = bottom_blob.c;
+    int in_w = bottom_blob.w;
+    int in_h = bottom_blob.h;
+
+    // step 1: symmetric constant border requested by the layer parameters
+    Mat padded = bottom_blob;
+    if (pad > 0)
+    {
+        copy_make_border(bottom_blob, padded, pad, pad, pad, pad, BORDER_CONSTANT, 0.f);
+        if (padded.empty())
+            return -100;
+
+        in_w = padded.w;
+        in_h = padded.h;
+    }
+
+    const int span_w = in_w - kernel_size;
+    const int span_h = in_h - kernel_size;
+
+    int outw = span_w / stride + 1;
+    int outh = span_h / stride + 1;
+
+    // step 2: when the windows do not tile the input exactly, replicate the
+    // right/bottom edge so that one extra (partial) window fits
+    const int wtail = span_w % stride;
+    const int htail = span_h % stride;
+    if (wtail != 0 || htail != 0)
+    {
+        const int wtailpad = (wtail != 0) ? kernel_size - wtail : 0;
+        const int htailpad = (htail != 0) ? kernel_size - htail : 0;
+
+        Mat extended;
+        copy_make_border(padded, extended, 0, htailpad, 0, wtailpad, BORDER_REPLICATE, 0.f);
+        if (extended.empty())
+            return -100;
+
+        padded = extended;
+
+        outw += (wtail != 0) ? 1 : 0;
+        outh += (htail != 0) ? 1 : 0;
+    }
+
+    top_blob.create(outw, outh, channels);
+    if (top_blob.empty())
+        return -100;
+
+    // step 3: run the kernel that matches the window size
+    if (kernel_size == 2)
+    {
+        pooling2x2s2_max_neon(padded, top_blob);
+    }
+    else if (kernel_size == 3)
+    {
+        pooling3x3s2_max_neon(padded, top_blob);
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/pooling_arm.h b/src/layer/arm/pooling_arm.h
new file mode 100644
index 00000000000..b7d774fa273
--- /dev/null
+++ b/src/layer/arm/pooling_arm.h
@@ -0,0 +1,30 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_POOLING_ARM_H
+#define LAYER_POOLING_ARM_H
+
+#include "pooling.h"
+
+namespace ncnn {
+
+// ARM specialization of the Pooling layer. Only forward() is overridden; all
+// other members are inherited from Pooling.
+class Pooling_arm : public Pooling
+{
+public:
+    // Computes the pooled top_blob from bottom_blob; returns 0 on success and a
+    // non-zero error code on failure.
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_POOLING_ARM_H
diff --git a/src/layer/arm/prelu_arm.cpp b/src/layer/arm/prelu_arm.cpp
new file mode 100644
index 00000000000..72d9ae0fba6
--- /dev/null
+++ b/src/layer/arm/prelu_arm.cpp
@@ -0,0 +1,182 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "prelu_arm.h"
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(PReLU_arm)
+
+// Out-of-place PReLU: for every element x of bottom_blob writes
+// (x < 0 ? x * slope : x) to the same position of top_blob.
+//
+// The slope is per channel when num_slope > 1, otherwise slope_data[0] is shared
+// by all channels. top_blob is created with the same w/h/c as bottom_blob.
+// Returns 0 on success, -100 when top_blob is empty after create().
+int PReLU_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    int size = w * h;
+
+    top_blob.create(w, h, channels);
+    if (top_blob.empty())
+        return -100;
+
+    const float* slope_data_ptr = slope_data;
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float* ptr = bottom_blob.channel(q);
+        float* outptr = top_blob.channel(q);
+        float slope = num_slope > 1 ? slope_data_ptr[q] : slope_data_ptr[0];
+
+#if __ARM_NEON
+        // nn groups of 4 floats are vectorized, the rest is done in scalar code
+        int nn = size >> 2;
+        int remain = size - (nn << 2);
+#else
+        int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+        float32x4_t _zero = vdupq_n_f32(0.f);
+        float32x4_t _slope = vdupq_n_f32(slope);
+        for (; nn>0; nn--)
+        {
+            float32x4_t _p = vld1q_f32(ptr);
+            // lanes <= 0 take x * slope, the others keep x (a zero lane stays zero)
+            uint32x4_t _lemask = vcleq_f32(_p, _zero);
+            float32x4_t _ps = vmulq_f32(_p, _slope);
+            float32x4_t _outp = vbslq_f32(_lemask, _ps, _p);
+            vst1q_f32(outptr, _outp);
+
+            ptr += 4;
+            outptr += 4;
+        }
+#else
+        if (nn > 0)
+        {
+        // q1 = 0, q2 = slope, q0 = input, q3 = (input <= 0) mask, q4 = input * slope.
+        // Input (%1) and output (%2) are distinct pointers here, so BOTH the load
+        // and the store need the "!" writeback; without it on the load every
+        // iteration would re-read the first four floats and the scalar tail
+        // below would start from the wrong input address.
+        asm volatile(
+            "veor       q1, q0, q0          \n"
+            "vdup.f32   q2, %6              \n"
+            "0:                             \n"
+            "pld        [%1, #128]          \n"
+            "vld1.f32   {d0-d1}, [%1 :128]! \n"
+            "vcle.f32   q3, q0, q1          \n"
+            "vmul.f32   q4, q0, q2          \n"
+            "vbit.32    q0, q4, q3          \n"
+            "subs       %0, #1              \n"
+            "vst1.f32   {d0-d1}, [%2 :128]! \n"
+            "bne        0b                  \n"
+            : "=r"(nn),     // %0
+              "=r"(ptr),    // %1
+              "=r"(outptr)  // %2
+            : "0"(nn),
+              "1"(ptr),
+              "2"(outptr),
+              "r"(slope)    // %6
+            : "cc", "memory", "q0", "q1", "q2", "q3", "q4"
+        );
+        }
+#endif // __aarch64__
+#endif // __ARM_NEON
+        // scalar tail (or the whole channel when NEON is unavailable)
+        for (; remain>0; remain--)
+        {
+            if (*ptr < 0)
+                *outptr = *ptr * slope;
+            else
+                *outptr = *ptr;
+
+            ptr++;
+            outptr++;
+        }
+    }
+
+    return 0;
+}
+
+// In-place PReLU: every negative element x of bottom_top_blob is replaced by
+// x * slope, other elements are left unchanged.
+//
+// The slope is per channel when num_slope > 1, otherwise slope_data[0] is shared
+// by all channels. Always returns 0.
+int PReLU_arm::forward_inplace(Mat& bottom_top_blob) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int channels = bottom_top_blob.c;
+    int size = w * h;
+
+    const float* slope_data_ptr = slope_data;
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        float* ptr = bottom_top_blob.channel(q);
+        float slope = num_slope > 1 ? slope_data_ptr[q] : slope_data_ptr[0];
+
+#if __ARM_NEON
+        // nn groups of 4 floats are vectorized, the rest is done in scalar code
+        int nn = size >> 2;
+        int remain = size - (nn << 2);
+#else
+        int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+        float32x4_t _zero = vdupq_n_f32(0.f);
+        float32x4_t _slope = vdupq_n_f32(slope);
+        for (; nn>0; nn--)
+        {
+            float32x4_t _p = vld1q_f32(ptr);
+            // lanes <= 0 take x * slope, the others keep x (a zero lane stays zero)
+            uint32x4_t _lemask = vcleq_f32(_p, _zero);
+            float32x4_t _ps = vmulq_f32(_p, _slope);
+            _p = vbslq_f32(_lemask, _ps, _p);
+            vst1q_f32(ptr, _p);
+
+            ptr += 4;
+        }
+#else
+        if (nn > 0)
+        {
+        // q1 = 0, q2 = slope, q0 = data, q3 = (data <= 0) mask, q4 = data * slope.
+        // Load and store share the pointer %1: the load deliberately has no
+        // writeback, the store's "!" advances the pointer once per iteration.
+        asm volatile(
+            "veor       q1, q0, q0          \n"
+            "vdup.f32   q2, %4              \n"
+            "0:                             \n"
+            "pld        [%1, #128]          \n"
+            "vld1.f32   {d0-d1}, [%1 :128]  \n"
+            "vcle.f32   q3, q0, q1          \n"
+            "vmul.f32   q4, q0, q2          \n"
+            "vbit.32    q0, q4, q3          \n"
+            "subs       %0, #1              \n"
+            "vst1.f32   {d0-d1}, [%1 :128]! \n"
+            "bne        0b                  \n"
+            : "=r"(nn),     // %0
+              "=r"(ptr)     // %1
+            : "0"(nn),
+              "1"(ptr),
+              "r"(slope)    // %4
+            : "cc", "memory", "q0", "q1", "q2", "q3", "q4"
+        );
+        }
+#endif // __aarch64__
+#endif // __ARM_NEON
+        // scalar tail (or the whole channel when NEON is unavailable)
+        for (; remain>0; remain--)
+        {
+            if (*ptr < 0)
+                *ptr *= slope;
+
+            ptr++;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/prelu_arm.h b/src/layer/arm/prelu_arm.h
new file mode 100644
index 00000000000..fbd32f7fe6c
--- /dev/null
+++ b/src/layer/arm/prelu_arm.h
@@ -0,0 +1,32 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_PRELU_ARM_H
+#define LAYER_PRELU_ARM_H
+
+#include "prelu.h"
+
+namespace ncnn {
+
+// ARM specialization of the PReLU layer. Overrides both the out-of-place and the
+// in-place forward pass; all other members are inherited from PReLU.
+class PReLU_arm : public PReLU
+{
+public:
+    // Writes the activation of bottom_blob into top_blob; returns 0 on success.
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+    // Applies the activation directly to bottom_top_blob; returns 0 on success.
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_PRELU_ARM_H
diff --git a/src/layer/arm/relu_arm.cpp b/src/layer/arm/relu_arm.cpp
new file mode 100644
index 00000000000..5477c37afe1
--- /dev/null
+++ b/src/layer/arm/relu_arm.cpp
@@ -0,0 +1,295 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "relu_arm.h"
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(ReLU_arm)
+
+// Out-of-place ReLU / leaky ReLU.
+//
+// With slope == 0 every element x of bottom_blob is written to top_blob as
+// max(x, 0); otherwise negative elements are written as x * slope and the
+// others are copied unchanged. top_blob is created with the same w/h/c as
+// bottom_blob. Returns 0 on success, -100 when top_blob is empty after create().
+int ReLU_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    int size = w * h;
+
+    top_blob.create(w, h, channels);
+    if (top_blob.empty())
+        return -100;
+
+    if (slope == 0.f)
+    {
+        // plain ReLU
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            float* outptr = top_blob.channel(q);
+
+#if __ARM_NEON
+            // nn groups of 4 floats are vectorized, the rest is done in scalar code
+            int nn = size >> 2;
+            int remain = size - (nn << 2);
+#else
+            int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+            float32x4_t _zero = vdupq_n_f32(0.f);
+            for (; nn>0; nn--)
+            {
+                float32x4_t _p = vld1q_f32(ptr);
+                float32x4_t _outp = vmaxq_f32(_p, _zero);
+                vst1q_f32(outptr, _outp);
+
+                ptr += 4;
+                outptr += 4;
+            }
+#else
+            if (nn > 0)
+            {
+            // q1 = 0, q0 = max(input, 0); load and store each advance their own pointer
+            asm volatile(
+                "veor       q1, q0, q0          \n"
+                "0:                             \n"
+                "pld        [%1, #128]          \n"
+                "vld1.f32   {d0-d1}, [%1 :128]! \n"
+                "vmax.f32   q0, q0, q1          \n"
+                "subs       %0, #1              \n"
+                "vst1.f32   {d0-d1}, [%2 :128]! \n"
+                "bne        0b                  \n"
+                : "=r"(nn),     // %0
+                  "=r"(ptr),    // %1
+                  "=r"(outptr)  // %2
+                : "0"(nn),
+                  "1"(ptr),
+                  "2"(outptr)
+                : "cc", "memory", "q0", "q1"
+            );
+            }
+#endif // __aarch64__
+#endif // __ARM_NEON
+            // scalar tail (or the whole channel when NEON is unavailable)
+            for (; remain>0; remain--)
+            {
+                *outptr = std::max(*ptr, 0.f);
+
+                ptr++;
+                outptr++;
+            }
+        }
+    }
+    else
+    {
+        // leaky ReLU
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            float* outptr = top_blob.channel(q);
+
+#if __ARM_NEON
+            // nn groups of 4 floats are vectorized, the rest is done in scalar code
+            int nn = size >> 2;
+            int remain = size - (nn << 2);
+#else
+            int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+            float32x4_t _zero = vdupq_n_f32(0.f);
+            float32x4_t _slope = vdupq_n_f32(slope);
+            for (; nn>0; nn--)
+            {
+                float32x4_t _p = vld1q_f32(ptr);
+                // lanes <= 0 take x * slope, the others keep x (a zero lane stays zero)
+                uint32x4_t _lemask = vcleq_f32(_p, _zero);
+                float32x4_t _ps = vmulq_f32(_p, _slope);
+                float32x4_t _outp = vbslq_f32(_lemask, _ps, _p);
+                vst1q_f32(outptr, _outp);
+
+                ptr += 4;
+                outptr += 4;
+            }
+#else
+            if (nn > 0)
+            {
+            // q1 = 0, q2 = slope, q0 = input, q3 = (input <= 0) mask, q4 = input * slope.
+            // Input (%1) and output (%2) are distinct pointers here, so BOTH the
+            // load and the store need the "!" writeback; without it on the load
+            // every iteration would re-read the first four floats and the scalar
+            // tail below would start from the wrong input address.
+            asm volatile(
+                "veor       q1, q0, q0          \n"
+                "vdup.f32   q2, %6              \n"
+                "0:                             \n"
+                "pld        [%1, #128]          \n"
+                "vld1.f32   {d0-d1}, [%1 :128]! \n"
+                "vcle.f32   q3, q0, q1          \n"
+                "vmul.f32   q4, q0, q2          \n"
+                "vbit.32    q0, q4, q3          \n"
+                "subs       %0, #1              \n"
+                "vst1.f32   {d0-d1}, [%2 :128]! \n"
+                "bne        0b                  \n"
+                : "=r"(nn),     // %0
+                  "=r"(ptr),    // %1
+                  "=r"(outptr)  // %2
+                : "0"(nn),
+                  "1"(ptr),
+                  "2"(outptr),
+                  "r"(slope)    // %6
+                : "cc", "memory", "q0", "q1", "q2", "q3", "q4"
+            );
+            }
+#endif // __aarch64__
+#endif // __ARM_NEON
+            // scalar tail (or the whole channel when NEON is unavailable)
+            for (; remain>0; remain--)
+            {
+                if (*ptr < 0)
+                    *outptr = *ptr * slope;
+                else
+                    *outptr = *ptr;
+
+                ptr++;
+                outptr++;
+            }
+        }
+    }
+
+    return 0;
+}
+
+// In-place ReLU / leaky ReLU.
+//
+// With slope == 0 every element x of bottom_top_blob becomes max(x, 0);
+// otherwise negative elements are multiplied by slope and the others are left
+// unchanged. Always returns 0.
+int ReLU_arm::forward_inplace(Mat& bottom_top_blob) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int channels = bottom_top_blob.c;
+    int size = w * h;
+
+    if (slope == 0.f)
+    {
+        // plain ReLU
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            float* ptr = bottom_top_blob.channel(q);
+
+#if __ARM_NEON
+            // nn groups of 4 floats are vectorized, the rest is done in scalar code
+            int nn = size >> 2;
+            int remain = size - (nn << 2);
+#else
+            int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+            float32x4_t _zero = vdupq_n_f32(0.f);
+            for (; nn>0; nn--)
+            {
+                float32x4_t _p = vld1q_f32(ptr);
+                _p = vmaxq_f32(_p, _zero);
+                vst1q_f32(ptr, _p);
+
+                ptr += 4;
+            }
+#else
+            if (nn > 0)
+            {
+            // q1 = 0, q0 = max(data, 0). Load and store share the pointer %1: the
+            // load has no writeback, the store's "!" advances it once per iteration.
+            asm volatile(
+                "veor       q1, q0, q0          \n"
+                "0:                             \n"
+                "pld        [%1, #128]          \n"
+                "vld1.f32   {d0-d1}, [%1 :128]  \n"
+                "vmax.f32   q0, q0, q1          \n"
+                "subs       %0, #1              \n"
+                "vst1.f32   {d0-d1}, [%1 :128]! \n"
+                "bne        0b                  \n"
+                : "=r"(nn),     // %0
+                  "=r"(ptr)     // %1
+                : "0"(nn),
+                  "1"(ptr)
+                : "cc", "memory", "q0", "q1"
+            );
+            }
+#endif // __aarch64__
+#endif // __ARM_NEON
+            // scalar tail (or the whole channel when NEON is unavailable)
+            for (; remain>0; remain--)
+            {
+                *ptr = std::max(*ptr, 0.f);
+
+                ptr++;
+            }
+        }
+    }
+    else
+    {
+        // leaky ReLU
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            float* ptr = bottom_top_blob.channel(q);
+
+#if __ARM_NEON
+            // nn groups of 4 floats are vectorized, the rest is done in scalar code
+            int nn = size >> 2;
+            int remain = size - (nn << 2);
+#else
+            int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+            float32x4_t _zero = vdupq_n_f32(0.f);
+            float32x4_t _slope = vdupq_n_f32(slope);
+            for (; nn>0; nn--)
+            {
+                float32x4_t _p = vld1q_f32(ptr);
+                // lanes <= 0 take x * slope, the others keep x (a zero lane stays zero)
+                uint32x4_t _lemask = vcleq_f32(_p, _zero);
+                float32x4_t _ps = vmulq_f32(_p, _slope);
+                _p = vbslq_f32(_lemask, _ps, _p);
+                vst1q_f32(ptr, _p);
+
+                ptr += 4;
+            }
+#else
+            if (nn > 0)
+            {
+            // q1 = 0, q2 = slope, q0 = data, q3 = (data <= 0) mask, q4 = data * slope.
+            // Load and store share the pointer %1: the load has no writeback, the
+            // store's "!" advances it once per iteration.
+            asm volatile(
+                "veor       q1, q0, q0          \n"
+                "vdup.f32   q2, %4              \n"
+                "0:                             \n"
+                "pld        [%1, #128]          \n"
+                "vld1.f32   {d0-d1}, [%1 :128]  \n"
+                "vcle.f32   q3, q0, q1          \n"
+                "vmul.f32   q4, q0, q2          \n"
+                "vbit.32    q0, q4, q3          \n"
+                "subs       %0, #1              \n"
+                "vst1.f32   {d0-d1}, [%1 :128]! \n"
+                "bne        0b                  \n"
+                : "=r"(nn),     // %0
+                  "=r"(ptr)     // %1
+                : "0"(nn),
+                  "1"(ptr),
+                  "r"(slope)    // %4
+                : "cc", "memory", "q0", "q1", "q2", "q3", "q4"
+            );
+            }
+#endif // __aarch64__
+#endif // __ARM_NEON
+            // scalar tail (or the whole channel when NEON is unavailable)
+            for (; remain>0; remain--)
+            {
+                if (*ptr < 0)
+                    *ptr *= slope;
+
+                ptr++;
+            }
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/relu_arm.h b/src/layer/arm/relu_arm.h
new file mode 100644
index 00000000000..294a28b8fd8
--- /dev/null
+++ b/src/layer/arm/relu_arm.h
@@ -0,0 +1,32 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_RELU_ARM_H
+#define LAYER_RELU_ARM_H
+
+#include "relu.h"
+
+namespace ncnn {
+
+// ARM specialization of the ReLU layer. Overrides both the out-of-place and the
+// in-place forward pass; all other members are inherited from ReLU.
+class ReLU_arm : public ReLU
+{
+public:
+    // Writes the activation of bottom_blob into top_blob; returns 0 on success.
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+    // Applies the activation directly to bottom_top_blob; returns 0 on success.
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_RELU_ARM_H
diff --git a/src/layer/arm/scale_arm.cpp b/src/layer/arm/scale_arm.cpp
new file mode 100644
index 00000000000..754001d301f
--- /dev/null
+++ b/src/layer/arm/scale_arm.cpp
@@ -0,0 +1,211 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "scale_arm.h"
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Scale_arm)
+
+int Scale_arm::forward(const Mat& bottom_blob, Mat& top_blob) const // top = bottom * scale[q] (+ bias[q]) for each channel q; 0 on success
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    int size = w * h; // number of floats walked per channel
+    // the output is created with the same w/h/c as the input; -100 is returned when it ends up empty
+    top_blob.create(w, h, channels);
+    if (top_blob.empty())
+        return -100;
+    // bias_term picks between the scale-and-add branch and the scale-only branch
+    if (bias_term)
+    {
+        const float* scale_ptr = scale_data;
+        const float* bias_ptr = bias_data;
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            float* outptr = top_blob.channel(q);
+            // one scale and one bias value per channel, both indexed by q
+            float s = scale_ptr[q];
+            float bias = bias_ptr[q];
+            // NEON builds: nn groups of 4 floats for the vector loop, remain = leftover floats for the scalar loop
+#if __ARM_NEON
+            int nn = size >> 2;
+            int remain = size - (nn << 2);
+#else
+            int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+            float32x4_t _s = vdupq_n_f32(s);
+            float32x4_t _bias = vdupq_n_f32(bias);
+            for (; nn>0; nn--)
+            {
+                float32x4_t _p = vld1q_f32(ptr);
+                _p = vmlaq_f32(_bias, _p, _s); // _bias + _p * _s on four lanes
+                vst1q_f32(outptr, _p);
+
+                ptr += 4;
+                outptr += 4;
+            }
+#endif // __ARM_NEON
+            // scalar loop: leftovers after the vector loop, or the whole channel on non-NEON builds
+            for (; remain>0; remain--)
+            {
+                *outptr = *ptr * s + bias;
+
+                ptr++;
+                outptr++;
+            }
+        }
+    }
+    else
+    {
+        const float* scale_ptr = scale_data; // scale-only branch: bias_data is never read here
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            float* outptr = top_blob.channel(q);
+
+            float s = scale_ptr[q];
+            // same 4-float / leftover split as in the bias branch
+#if __ARM_NEON
+            int nn = size >> 2;
+            int remain = size - (nn << 2);
+#else
+            int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+            float32x4_t _s = vdupq_n_f32(s);
+            for (; nn>0; nn--)
+            {
+                float32x4_t _p = vld1q_f32(ptr);
+                _p = vmulq_f32(_p, _s);
+                vst1q_f32(outptr, _p);
+
+                ptr += 4;
+                outptr += 4;
+            }
+#endif // __ARM_NEON
+            // scalar loop: leftovers after the vector loop, or the whole channel on non-NEON builds
+            for (; remain>0; remain--)
+            {
+                *outptr = *ptr * s;
+
+                ptr++;
+                outptr++;
+            }
+        }
+    }
+
+    return 0;
+}
+
+int Scale_arm::forward_inplace(Mat& bottom_top_blob) const
+{
+    // Multiplies each channel of bottom_top_blob by its own scale factor, in place,
+    // and additionally adds a per-channel bias when bias_term is set. Always returns 0.
+    const int channels = bottom_top_blob.c;
+    const int size = bottom_top_blob.w * bottom_top_blob.h;
+
+    if (!bias_term)
+    {
+        // scale-only path
+        const float* scales = scale_data;
+
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            float* p = bottom_top_blob.channel(q);
+            const float scale = scales[q];
+
+#if __ARM_NEON
+            // the vector loop takes groups of four floats, the scalar loop the leftovers
+            int quads = size >> 2;
+            int tail = size - (quads << 2);
+
+            const float32x4_t _scale = vdupq_n_f32(scale);
+            while (quads > 0)
+            {
+                // value * _scale, four lanes at a time
+                vst1q_f32(p, vmulq_f32(vld1q_f32(p), _scale));
+                p += 4;
+                quads--;
+            }
+#else
+            int tail = size;
+#endif // __ARM_NEON
+
+            // scalar remainder (the entire channel on non-NEON builds)
+            while (tail > 0)
+            {
+                *p *= scale;
+                p++;
+                tail--;
+            }
+        }
+
+        return 0;
+    }
+
+    // scale-and-bias path
+    const float* scales = scale_data;
+    const float* biases = bias_data;
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        float* p = bottom_top_blob.channel(q);
+        const float scale = scales[q];
+        const float offset = biases[q];
+
+#if __ARM_NEON
+        // the vector loop takes groups of four floats, the scalar loop the leftovers
+        int quads = size >> 2;
+        int tail = size - (quads << 2);
+
+        const float32x4_t _scale = vdupq_n_f32(scale);
+        const float32x4_t _offset = vdupq_n_f32(offset);
+        while (quads > 0)
+        {
+            // _offset + value * _scale, four lanes at a time
+            vst1q_f32(p, vmlaq_f32(_offset, vld1q_f32(p), _scale));
+            p += 4;
+            quads--;
+        }
+#else
+        int tail = size;
+#endif // __ARM_NEON
+
+        // scalar remainder (the entire channel on non-NEON builds)
+        while (tail > 0)
+        {
+            *p = *p * scale + offset;
+            p++;
+            tail--;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/scale_arm.h b/src/layer/arm/scale_arm.h
new file mode 100644
index 00000000000..11a739b90d5
--- /dev/null
+++ b/src/layer/arm/scale_arm.h
@@ -0,0 +1,32 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SCALE_ARM_H
+#define LAYER_SCALE_ARM_H
+
+#include "scale.h"
+
+namespace ncnn {
+
+class Scale_arm : public Scale // Scale subclass under layer/arm; adds no data members, only overrides the two forward entry points
+{
+public:
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; // out-of-place: per-channel scale (and optional bias) from bottom_blob into top_blob
+    // in-place form: the same arithmetic applied directly to bottom_top_blob
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SCALE_ARM_H
diff --git a/src/layer/arm/sigmoid_arm.cpp b/src/layer/arm/sigmoid_arm.cpp
new file mode 100644
index 00000000000..754f9cf5bc5
--- /dev/null
+++ b/src/layer/arm/sigmoid_arm.cpp
@@ -0,0 +1,127 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "sigmoid_arm.h"
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#include "neon_mathfun.h"
+#endif // __ARM_NEON
+
+#include <math.h>
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Sigmoid_arm)
+
+int Sigmoid_arm::forward(const Mat& bottom_blob, Mat& top_blob) const // writes 1 / (1 + exp(-x)) for every input float x into top_blob; 0 on success
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    int size = w * h; // floats per channel
+    // the output is created with the same w/h/c as the input; -100 is returned when it ends up empty
+    top_blob.create(w, h, channels);
+    if (top_blob.empty())
+        return -100;
+    // each iteration reads and writes only channel q
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float* ptr = bottom_blob.channel(q);
+        float* outptr = top_blob.channel(q);
+        // NEON builds: nn groups of 4 floats for the vector loop, remain = leftover floats for the scalar loop
+#if __ARM_NEON
+        int nn = size >> 2;
+        int remain = size - (nn << 2);
+#else
+        int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+        float32x4_t _one = vdupq_n_f32(1.f);
+        for (; nn>0; nn--)
+        {
+            float32x4_t _p = vld1q_f32(ptr);
+            _p = vnegq_f32(_p);
+            _p = exp_ps(_p); // exp_ps comes from neon_mathfun.h; presumably a 4-lane exp() - confirm there
+            _p = vaddq_f32(_p, _one); // _p now holds the denominator 1 + exp(-x)
+            float32x4_t _outp = vrecpeq_f32(_p); // reciprocal estimate of the denominator
+            _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp); // one refinement step: est * (2 - denominator * est)
+//             _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp);
+            vst1q_f32(outptr, _outp);
+
+            ptr += 4;
+            outptr += 4;
+        }
+#endif // __ARM_NEON
+        for (; remain>0; remain--)
+        {
+            *outptr = 1.f / (1.f + exp(-*ptr)); // scalar path uses a plain division instead of the estimate
+
+            ptr++;
+            outptr++;
+        }
+    }
+
+    return 0;
+}
+
+int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob) const // replaces every float x of the blob with 1 / (1 + exp(-x)); always returns 0
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int channels = bottom_top_blob.c;
+    int size = w * h; // floats per channel
+    // each iteration reads and writes only channel q
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        float* ptr = bottom_top_blob.channel(q);
+        // NEON builds: nn groups of 4 floats for the vector loop, remain = leftover floats for the scalar loop
+#if __ARM_NEON
+        int nn = size >> 2;
+        int remain = size - (nn << 2);
+#else
+        int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+        float32x4_t _one = vdupq_n_f32(1.f);
+        for (; nn>0; nn--)
+        {
+            float32x4_t _p = vld1q_f32(ptr);
+            _p = vnegq_f32(_p);
+            _p = exp_ps(_p); // exp_ps comes from neon_mathfun.h; presumably a 4-lane exp() - confirm there
+            _p = vaddq_f32(_p, _one); // _p now holds the denominator 1 + exp(-x)
+            float32x4_t _outp = vrecpeq_f32(_p); // estimate goes into its own register so the denominator stays available
+            _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp); // refinement: est * (2 - denominator * est), same as forward()
+            // (writing the estimate back into _p before this step would evaluate est * (2 - est * est), which is not 1/denominator)
+            vst1q_f32(ptr, _outp);
+
+            ptr += 4;
+        }
+#endif // __ARM_NEON
+        for (; remain>0; remain--)
+        {
+            *ptr = 1.f / (1.f + exp(-*ptr)); // scalar path uses a plain division instead of the estimate
+
+            ptr++;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/sigmoid_arm.h b/src/layer/arm/sigmoid_arm.h
new file mode 100644
index 00000000000..7fe558db561
--- /dev/null
+++ b/src/layer/arm/sigmoid_arm.h
@@ -0,0 +1,32 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SIGMOID_ARM_H
+#define LAYER_SIGMOID_ARM_H
+
+#include "sigmoid.h"
+
+namespace ncnn {
+
+class Sigmoid_arm : public Sigmoid // Sigmoid subclass under layer/arm; adds no data members, only overrides the two forward entry points
+{
+public:
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; // out-of-place: 1 / (1 + exp(-x)) of bottom_blob written to top_blob
+    // in-place form: the same function applied directly to bottom_top_blob
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SIGMOID_ARM_H
diff --git a/src/layer/arm/slice_arm.cpp b/src/layer/arm/slice_arm.cpp
new file mode 100644
index 00000000000..b4a7801b8f0
--- /dev/null
+++ b/src/layer/arm/slice_arm.cpp
@@ -0,0 +1,102 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "slice_arm.h"
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Slice_arm)
+
+int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const // splits bottom_blobs[0] along channels into top_blobs; 0 on success, -100 if an output is empty after create
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    // q is the first input channel that has not been handed out yet
+    int q = 0;
+    const int* slices_ptr = (const int*)slices.data;
+    for (size_t i=0; i<top_blobs.size(); i++)
+    {
+        int slice = slices_ptr[i];
+        if (slice == -233)
+        {
+            slice = (channels - q) / (top_blobs.size() - i); // -233 marker: share the remaining channels evenly among the remaining outputs
+        }
+
+        Mat& top_blob = top_blobs[i];
+        top_blob.create(w, h, slice);
+        if (top_blob.empty())
+            return -100;
+        // the slice is copied as one flat run of cstep * slice floats (assumes top_blob uses the same cstep - TODO confirm in Mat)
+        int size = bottom_blob.cstep * slice;
+
+        const float* ptr = bottom_blob.channel(q);
+        float* outptr = top_blob.data;
+        // NEON builds: nn groups of 8 floats for the vector copy, remain = leftover floats for the scalar copy
+#if __ARM_NEON
+        int nn = size >> 3;
+        int remain = size - (nn << 3);
+#else
+        int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+        for (; nn>0; nn--)
+        {
+            float32x4_t _p = vld1q_f32(ptr);
+            float32x4_t _p2 = vld1q_f32(ptr+4);
+            vst1q_f32(outptr, _p);
+            vst1q_f32(outptr+4, _p2);
+
+            ptr += 8;
+            outptr += 8;
+        }
+#else
+        if (nn > 0) // the asm loop decrements before testing, so it must not be entered with nn == 0
+        {
+        asm volatile(
+            "0:                             \n"
+            "pld        [%1, #256]          \n"
+            "vld1.f32   {d0-d3}, [%1 :128]! \n"
+            "subs       %0, #1              \n"
+            "vst1.f32   {d0-d3}, [%2 :128]! \n"
+            "bne        0b                  \n"
+            : "=r"(nn),     // %0
+              "=r"(ptr),    // %1
+              "=r"(outptr)  // %2
+            : "0"(nn),
+              "1"(ptr),
+              "2"(outptr)
+            : "cc", "memory", "q0", "q1" // d0-d3 span q0 and q1, so both must be declared as clobbered
+        );
+        }
+#endif // __aarch64__
+#endif // __ARM_NEON
+        for (; remain>0; remain--)
+        {
+            *outptr++ = *ptr++;
+        }
+
+        q += slice;
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/slice_arm.h b/src/layer/arm/slice_arm.h
new file mode 100644
index 00000000000..16e97dc8226
--- /dev/null
+++ b/src/layer/arm/slice_arm.h
@@ -0,0 +1,30 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SLICE_ARM_H
+#define LAYER_SLICE_ARM_H
+
+#include "slice.h"
+
+namespace ncnn {
+
+class Slice_arm : public Slice // Slice subclass under layer/arm; adds no data members, only overrides the multi-blob forward
+{
+public:
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const; // reads bottom_blobs[0], fills every entry of top_blobs
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SLICE_ARM_H
diff --git a/src/layer/arm/softmax_arm.cpp b/src/layer/arm/softmax_arm.cpp
new file mode 100644
index 00000000000..09ed21a7824
--- /dev/null
+++ b/src/layer/arm/softmax_arm.cpp
@@ -0,0 +1,302 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "softmax_arm.h"
+#include <float.h>
+#include <math.h>
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#include "neon_mathfun.h"
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Softmax_arm)
+
+int Softmax_arm::forward(const Mat& bottom_blob, Mat& top_blob) const // softmax across channels at each (w,h) position; 0 on success, -100 when a Mat is empty after create
+{
+    // value = exp( value - max over channels at the same position )
+    // sum   = those values added up over channels, per position
+    // value = value / sum
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    int size = w * h; // floats per channel, also the length of max and sum
+
+    top_blob.create(w, h, channels);
+    if (top_blob.empty())
+        return -100;
+    // pass 1 (serial): max[i] = largest value found at position i over all channels
+    Mat max;
+    max.create(w, h);
+    if (max.empty())
+        return -100;
+    max.fill(-FLT_MAX);
+    for (int q=0; q<channels; q++)
+    {
+        const float* ptr = bottom_blob.channel(q);
+        float* maxptr = max;
+
+        for (int i=0; i<size; i++)
+        {
+            maxptr[i] = std::max(maxptr[i], ptr[i]);
+        }
+    }
+    // pass 2 (parallel over channels): top = exp(bottom - max); max is only read here
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float* ptr = bottom_blob.channel(q);
+        float* outptr = top_blob.channel(q);
+        float* maxptr = max;
+        // NEON builds: nn groups of 4 floats for the vector loop, remain = leftover floats for the scalar loop
+#if __ARM_NEON
+        int nn = size >> 2;
+        int remain = size - (nn << 2);
+#else
+        int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+        for (; nn>0; nn--)
+        {
+            float32x4_t _p = vld1q_f32(ptr);
+            float32x4_t _max = vld1q_f32(maxptr);
+            // exp_ps comes from neon_mathfun.h; presumably a 4-lane exp() - confirm there
+            _p = exp_ps(vsubq_f32(_p, _max));
+
+            vst1q_f32(outptr, _p);
+
+            ptr += 4;
+            maxptr += 4;
+            outptr += 4;
+        }
+#endif // __ARM_NEON
+
+        for (; remain>0; remain--)
+        {
+            *outptr = exp(*ptr - *maxptr);
+
+            ptr++;
+            maxptr++;
+            outptr++;
+        }
+    }
+    // pass 3 (serial): sum[i] = exponentials at position i added over all channels
+    Mat sum;
+    sum.create(w, h);
+    if (sum.empty())
+        return -100;
+    sum.fill(0.f);
+    for (int q=0; q<channels; q++)
+    {
+        const float* outptr = top_blob.channel(q);
+        float* sumptr = sum;
+
+        for (int i=0; i<size; i++)
+        {
+            sumptr[i] += outptr[i];
+        }
+    }
+    // pass 4 (parallel over channels): divide every exponential by the sum at its position
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        float* outptr = top_blob.channel(q);
+        float* sumptr = sum;
+
+#if __ARM_NEON
+        int nn = size >> 2;
+        int remain = size - (nn << 2);
+#else
+        int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+        for (; nn>0; nn--)
+        {
+            float32x4_t _p = vld1q_f32(outptr);
+            float32x4_t _sum = vld1q_f32(sumptr);
+#if __aarch64__
+            _p = vdivq_f32(_p, _sum);
+#else
+            _p = div_ps(_p, _sum); // div_ps is not defined in this file (presumably neon_mathfun.h); used where vdivq_f32 is not selected
+#endif // __aarch64__
+            vst1q_f32(outptr, _p);
+
+            outptr += 4;
+            sumptr += 4;
+        }
+#endif // __ARM_NEON
+
+        for (; remain>0; remain--)
+        {
+            *outptr /= *sumptr;
+
+            outptr++;
+            sumptr++;
+        }
+    }
+
+    return 0;
+}
+
+int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const // in-place softmax across channels at each (w,h) position; 0 on success, -100 when a scratch Mat is empty after create
+{
+    // value = exp( value - max over channels at the same position )
+    // sum   = those values added up over channels, per position
+    // value = value / sum
+
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int channels = bottom_top_blob.c;
+    int size = w * h; // floats per channel, also the length of max and sum
+    // pass 1 (serial): max[i] = largest value found at position i over all channels
+    Mat max;
+    max.create(w, h);
+    if (max.empty())
+        return -100;
+    max.fill(-FLT_MAX);
+    for (int q=0; q<channels; q++)
+    {
+        float* ptr = bottom_top_blob.channel(q);
+        float* maxptr = max;
+
+        for (int i=0; i<size; i++)
+        {
+            maxptr[i] = std::max(maxptr[i], ptr[i]);
+        }
+    }
+    // pass 2 (parallel over channels): overwrite each value with exp(value - max); max is only read here
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        float* ptr = bottom_top_blob.channel(q);
+        float* maxptr = max;
+        // NEON builds: nn groups of 4 floats for the vector loop, remain = leftover floats for the scalar loop
+#if __ARM_NEON
+        int nn = size >> 2;
+        int remain = size - (nn << 2);
+#else
+        int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+        for (; nn>0; nn--)
+        {
+            float32x4_t _p = vld1q_f32(ptr);
+            float32x4_t _max = vld1q_f32(maxptr);
+            // exp_ps comes from neon_mathfun.h; presumably a 4-lane exp() - confirm there
+            _p = exp_ps(vsubq_f32(_p, _max));
+
+            vst1q_f32(ptr, _p);
+
+            ptr += 4;
+            maxptr += 4;
+        }
+#endif // __ARM_NEON
+
+        for (; remain>0; remain--)
+        {
+            *ptr = exp(*ptr - *maxptr);
+
+            ptr++;
+            maxptr++;
+        }
+    }
+    // pass 3 (serial over channels, vectorised inside): sum[i] = exponentials at position i added over all channels
+    Mat sum;
+    sum.create(w, h);
+    if (sum.empty())
+        return -100;
+    sum.fill(0.f);
+    for (int q=0; q<channels; q++)
+    {
+        float* ptr = bottom_top_blob.channel(q);
+        float* sumptr = sum;
+
+#if __ARM_NEON
+        int nn = size >> 2;
+        int remain = size - (nn << 2);
+#else
+        int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+        for (; nn>0; nn--)
+        {
+            float32x4_t _p = vld1q_f32(ptr);
+            float32x4_t _sum = vld1q_f32(sumptr);
+            _sum = vaddq_f32(_sum, _p);
+            vst1q_f32(sumptr, _sum);
+
+            ptr += 4;
+            sumptr += 4;
+        }
+#endif // __ARM_NEON
+
+        for (; remain>0; remain--)
+        {
+            *sumptr += *ptr;
+
+            ptr++;
+            sumptr++;
+        }
+    }
+    // pass 4 (parallel over channels): divide every exponential by the sum at its position
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        float* ptr = bottom_top_blob.channel(q);
+        float* sumptr = sum;
+
+#if __ARM_NEON
+        int nn = size >> 2;
+        int remain = size - (nn << 2);
+#else
+        int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+        for (; nn>0; nn--)
+        {
+            float32x4_t _p = vld1q_f32(ptr);
+            float32x4_t _sum = vld1q_f32(sumptr);
+#if __aarch64__
+            _p = vdivq_f32(_p, _sum);
+#else
+            _p = div_ps(_p, _sum); // div_ps is not defined in this file (presumably neon_mathfun.h); used where vdivq_f32 is not selected
+#endif // __aarch64__
+            vst1q_f32(ptr, _p);
+
+            ptr += 4;
+            sumptr += 4;
+        }
+#endif // __ARM_NEON
+
+        for (; remain>0; remain--)
+        {
+            *ptr /= *sumptr;
+
+            ptr++;
+            sumptr++;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/softmax_arm.h b/src/layer/arm/softmax_arm.h
new file mode 100644
index 00000000000..3eea580ebeb
--- /dev/null
+++ b/src/layer/arm/softmax_arm.h
@@ -0,0 +1,32 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SOFTMAX_ARM_H
+#define LAYER_SOFTMAX_ARM_H
+
+#include "softmax.h"
+
+namespace ncnn {
+
+class Softmax_arm : public Softmax // Softmax subclass under layer/arm; adds no data members, only overrides the two forward entry points
+{
+public:
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; // out-of-place: softmax of bottom_blob across channels written to top_blob
+    // in-place form: the same computation applied directly to bottom_top_blob
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SOFTMAX_ARM_H
diff --git a/src/layer/batchnorm.cpp b/src/layer/batchnorm.cpp
new file mode 100644
index 00000000000..ab6c3e25c58
--- /dev/null
+++ b/src/layer/batchnorm.cpp
@@ -0,0 +1,227 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "batchnorm.h"
+#include <math.h>
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(BatchNorm)
+
+BatchNorm::BatchNorm()
+{
+    // both capability flags are declared outside this file (presumably on Layer) and are switched on here
+    support_inplace = true;
+    one_blob_only = true;
+}
+
+// empty body: the destructor performs no explicit cleanup
+BatchNorm::~BatchNorm() {}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+int BatchNorm::load_param(FILE* paramfp)
+{
+    // text param stream: a single decimal integer giving the channel count
+    const int matched = fscanf(paramfp, "%d", &channels);
+    if (matched == 1)
+        return 0;
+
+    // anything other than exactly one converted field is reported and turned into -1
+    fprintf(stderr, "BatchNorm load_param failed %d\n", matched);
+    return -1;
+}
+#endif // NCNN_STRING
+int BatchNorm::load_param_bin(FILE* paramfp)
+{
+    // Binary param stream: the channel count is stored as one raw int.
+    // A short read used to be ignored, leaving channels unset while 0 was returned; it is now reported as -1.
+    return fread(&channels, sizeof(int), 1, paramfp) == 1 ? 0 : -1;
+}
+
+int BatchNorm::load_model(FILE* binfp) // reads slope, mean, var, bias (channels floats each, in that order), then derives a_data/b_data
+{
+    int nread; // every array is requested as a single fread item, so a successful read yields exactly 1
+    // returns 0 on success, -100 when a Mat is empty after create, -1 on a failed read
+    slope_data.create(channels);
+    if (slope_data.empty())
+        return -100;
+    nread = fread(slope_data, channels * sizeof(float), 1, binfp);
+    if (nread != 1)
+    {
+        fprintf(stderr, "BatchNorm read slope_data failed %d\n", nread);
+        return -1;
+    }
+
+    mean_data.create(channels);
+    if (mean_data.empty())
+        return -100;
+    nread = fread(mean_data, channels * sizeof(float), 1, binfp);
+    if (nread != 1)
+    {
+        fprintf(stderr, "BatchNorm read mean_data failed %d\n", nread);
+        return -1;
+    }
+
+    var_data.create(channels);
+    if (var_data.empty())
+        return -100;
+    nread = fread(var_data, channels * sizeof(float), 1, binfp);
+    if (nread != 1)
+    {
+        fprintf(stderr, "BatchNorm read var_data failed %d\n", nread);
+        return -1;
+    }
+
+    bias_data.create(channels);
+    if (bias_data.empty())
+        return -100;
+    nread = fread(bias_data, channels * sizeof(float), 1, binfp);
+    if (nread != 1)
+    {
+        fprintf(stderr, "BatchNorm read bias_data failed %d\n", nread);
+        return -1;
+    }
+    // fold the four arrays into the two coefficients used by forward(): value = b * value + a
+    a_data.create(channels);
+    if (a_data.empty())
+        return -100;
+    b_data.create(channels);
+    if (b_data.empty())
+        return -100;
+    const float* slope_data_ptr = slope_data;
+    const float* mean_data_ptr = mean_data;
+    const float* var_data_ptr = var_data;
+    const float* bias_data_ptr = bias_data;
+    float* a_data_ptr = a_data;
+    float* b_data_ptr = b_data;
+    for (int i=0; i<channels; i++)
+    {
+        float sqrt_var = sqrt(var_data_ptr[i]); // NOTE(review): no epsilon is added here, so a stored variance of 0 divides by zero - confirm the model writer guards this
+        a_data_ptr[i] = bias_data_ptr[i] - slope_data_ptr[i] * mean_data_ptr[i] / sqrt_var;
+        b_data_ptr[i] = slope_data_ptr[i] / sqrt_var;
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+int BatchNorm::load_param(const unsigned char*& mem)
+{
+    const int* header = (const int*)mem; // the buffer starts with the channel count
+    channels = header[0];
+    mem += 4; // move the caller's cursor past that int
+    return 0;
+}
+
+int BatchNorm::load_model(const unsigned char*& mem)
+{
+    // slope, mean, var and bias sit back to back in the buffer, channels floats each;
+    // every Mat is built from a pointer into that memory and mem is advanced past it.
+    slope_data = Mat(channels, (float*)mem);
+    mem += channels * sizeof(float);
+    mean_data = Mat(channels, (float*)mem);
+    mem += channels * sizeof(float);
+    var_data = Mat(channels, (float*)mem);
+    mem += channels * sizeof(float);
+    bias_data = Mat(channels, (float*)mem);
+    mem += channels * sizeof(float);
+
+    // the derived coefficients get Mats of their own; -100 when one is empty after create
+    a_data.create(channels);
+    if (a_data.empty())
+        return -100;
+    b_data.create(channels);
+    if (b_data.empty())
+        return -100;
+    const float* slope = slope_data;
+    const float* mean = mean_data;
+    const float* var = var_data;
+    const float* bias = bias_data;
+    float* a = a_data;
+    float* b = b_data;
+    for (int c = 0; c < channels; ++c)
+    {
+        float sqrt_var = sqrt(var[c]);
+        b[c] = slope[c] / sqrt_var;
+        a[c] = bias[c] - slope[c] * mean[c] / sqrt_var;
+    }
+
+    return 0;
+}
+
+int BatchNorm::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    // Applies out = b * in + a channel by channel, where
+    //   a = bias - slope * mean / sqrt(var)   and   b = slope / sqrt(var)
+    // were prepared by load_model(). Returns -100 when top_blob is empty
+    // after create, 0 otherwise.
+    const int w = bottom_blob.w;
+    const int h = bottom_blob.h;
+    const int size = w * h;
+
+    top_blob.create(w, h, channels);
+    if (top_blob.empty())
+        return -100;
+
+    const float* offsets = a_data;
+    const float* gains = b_data;
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float gain = gains[q];
+        const float offset = offsets[q];
+
+        const float* src = bottom_blob.channel(q);
+        float* dst = top_blob.channel(q);
+        for (int left = size; left > 0; left--)
+        {
+            *dst++ = gain * (*src++) + offset;
+        }
+    }
+
+    return 0;
+}
+
+int BatchNorm::forward_inplace(Mat& bottom_top_blob) const
+{
+    // In-place counterpart of forward(), using the coefficients
+    //   a = bias - slope * mean / sqrt(var)
+    //   b = slope / sqrt(var)
+    // so that every value v of channel q becomes b[q] * v + a[q]. Always returns 0.
+    const int size = bottom_top_blob.w * bottom_top_blob.h;
+
+    const float* offsets = a_data;
+    const float* gains = b_data;
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float gain = gains[q];
+        const float offset = offsets[q];
+
+        // walk the channel once, overwriting each value
+        float* p = bottom_top_blob.channel(q);
+        for (int left = size; left > 0; left--)
+        {
+            *p = gain * (*p) + offset;
+            p++;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/batchnorm.h b/src/layer/batchnorm.h
new file mode 100644
index 00000000000..131ba4b8c9c
--- /dev/null
+++ b/src/layer/batchnorm.h
@@ -0,0 +1,58 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_BATCHNORM_H
+#define LAYER_BATCHNORM_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+class BatchNorm : public Layer // per-channel affine layer: value = b * value + a, with a/b derived from slope, mean, var and bias
+{
+public:
+    BatchNorm();
+    virtual ~BatchNorm();
+    // loaders: the FILE* variants exist only when NCNN_STDIO is set; the text-param one additionally needs NCNN_STRING
+#if NCNN_STDIO
+#if NCNN_STRING
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    virtual int load_param_bin(FILE* paramfp);
+    virtual int load_model(FILE* binfp);
+#endif // NCNN_STDIO
+    virtual int load_param(const unsigned char*& mem); // memory variants take the read cursor by reference and advance it
+    virtual int load_model(const unsigned char*& mem);
+    // out-of-place evaluation: reads bottom_blob, writes top_blob
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    // in-place evaluation: bottom_top_blob is both input and output
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+
+public:
+    // param
+    int channels; // channel count; every array below is created or wrapped with this many floats
+
+    // model
+    Mat slope_data;
+    Mat mean_data;
+    Mat var_data;
+    Mat bias_data;
+    // filled by load_model(): a = bias - slope * mean / sqrt(var), b = slope / sqrt(var)
+    Mat a_data;
+    Mat b_data;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_BATCHNORM_H
diff --git a/src/layer/bias.cpp b/src/layer/bias.cpp
new file mode 100644
index 00000000000..ab6d9e29a01
--- /dev/null
+++ b/src/layer/bias.cpp
@@ -0,0 +1,139 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "bias.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Bias)
+
+// Set the layer flags: Bias is marked as a single-blob layer that supports
+// in-place execution.
+Bias::Bias()
+{
+    support_inplace = true;
+    one_blob_only = true;
+}
+
+// Nothing is released explicitly here.
+Bias::~Bias()
+{
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+// Parse the text-format parameter: a single decimal integer stored into
+// bias_data_size. Returns 0 on success and -1 (after logging the fscanf
+// result to stderr) when the integer cannot be read.
+int Bias::load_param(FILE* paramfp)
+{
+    const int matched = fscanf(paramfp, "%d", &bias_data_size);
+    if (matched == 1)
+        return 0;
+
+    fprintf(stderr, "Bias load_param failed %d\n", matched);
+    return -1;
+}
+#endif // NCNN_STRING
+// Read the binary-format parameter: bias_data_size stored as one raw int.
+// Returns 0 on success and -1 on a short read, so a truncated param file is
+// reported instead of leaving bias_data_size unset while claiming success.
+int Bias::load_param_bin(FILE* paramfp)
+{
+    size_t nread = fread(&bias_data_size, sizeof(int), 1, paramfp);
+    if (nread != 1)
+    {
+        fprintf(stderr, "Bias load_param_bin failed %d\n", (int)nread);
+        return -1;
+    }
+
+    return 0;
+}
+
+// Load bias_data_size floats from the binary model file into bias_data.
+// Returns 0 on success, -100 when bias_data is still empty after create()
+// and -1 when the file does not supply the whole array.
+int Bias::load_model(FILE* binfp)
+{
+    bias_data.create(bias_data_size);
+    if (bias_data.empty())
+        return -100;
+
+    // the array is requested as a single item, so fread yields 1 only when it is complete
+    const int items = fread(bias_data, bias_data_size * sizeof(float), 1, binfp);
+    if (items == 1)
+        return 0;
+
+    fprintf(stderr, "Bias read bias_data failed %d\n", items);
+    return -1;
+}
+#endif // NCNN_STDIO
+
+// Read bias_data_size from an in-memory parameter stream and advance the
+// caller's cursor past the 4 bytes consumed. Always returns 0.
+int Bias::load_param(const unsigned char*& mem)
+{
+    const unsigned char* cursor = mem;
+
+    bias_data_size = *(const int*)(cursor);
+    mem = cursor + 4;
+
+    return 0;
+}
+
+// Build bias_data from the bias_data_size floats found at the cursor, then
+// move the cursor past them. Always returns 0.
+// NOTE(review): Mat(size, ptr) presumably references the caller's memory
+// rather than copying it - confirm against the Mat constructor.
+int Bias::load_model(const unsigned char*& mem)
+{
+    float* values = (float*)mem;
+    bias_data = Mat(bias_data_size, values);
+
+    mem += sizeof(float) * bias_data_size;
+
+    return 0;
+}
+
+// Out-of-place bias add: top_blob is created with the shape of bottom_blob
+// and every element of channel q becomes the input element plus bias_data[q].
+// Returns 0 on success, -100 when top_blob is empty after create().
+int Bias::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    const int width = bottom_blob.w;
+    const int height = bottom_blob.h;
+    const int channels = bottom_blob.c;
+    const int size = width * height;
+
+    top_blob.create(width, height, channels);
+    if (top_blob.empty())
+        return -100;
+
+    const float* bias_ptr = bias_data;
+
+    // each iteration touches only its own channel, so the loop is shared between OpenMP threads
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float bias = bias_ptr[q];
+        const float* in = bottom_blob.channel(q);
+        float* out = top_blob.channel(q);
+
+        for (int i=0; i<size; i++)
+        {
+            out[i] = in[i] + bias;
+        }
+    }
+
+    return 0;
+}
+
+// In-place bias add: adds bias_data[q] to every element of channel q of
+// bottom_top_blob. Always returns 0.
+int Bias::forward_inplace(Mat& bottom_top_blob) const
+{
+    const int channels = bottom_top_blob.c;
+    const int size = bottom_top_blob.w * bottom_top_blob.h;
+    const float* bias_ptr = bias_data;
+
+    // each iteration touches only its own channel, so the loop is shared between OpenMP threads
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float bias = bias_ptr[q];
+        float* ptr = bottom_top_blob.channel(q);
+
+        for (int i=0; i<size; i++)
+        {
+            ptr[i] = ptr[i] + bias;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/bias.h b/src/layer/bias.h
new file mode 100644
index 00000000000..3d46a029973
--- /dev/null
+++ b/src/layer/bias.h
@@ -0,0 +1,52 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_BIAS_H
+#define LAYER_BIAS_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+class Bias : public Layer // adds one bias value per channel to a blob
+{
+public:
+    Bias(); // sets one_blob_only and support_inplace
+    virtual ~Bias();
+    // FILE*-based loaders need NCNN_STDIO; the text-format one also needs NCNN_STRING
+#if NCNN_STDIO
+#if NCNN_STRING
+    virtual int load_param(FILE* paramfp); // parses bias_data_size with fscanf "%d"
+#endif // NCNN_STRING
+    virtual int load_param_bin(FILE* paramfp); // reads bias_data_size as one raw int
+    virtual int load_model(FILE* binfp); // reads bias_data_size floats into bias_data
+#endif // NCNN_STDIO
+    virtual int load_param(const unsigned char*& mem); // in-memory variants: consume the bytes at mem
+    virtual int load_model(const unsigned char*& mem); // and advance mem past them
+    // top_blob = bottom_blob + bias_data[channel]; returns -100 when top_blob is empty after create()
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    // the same addition applied directly to bottom_top_blob
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+
+public:
+    // param: number of floats in bias_data
+    int bias_data_size;
+
+    // model: bias values, indexed by channel in forward() and forward_inplace()
+    Mat bias_data;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_BIAS_H
diff --git a/src/layer/bnll.cpp b/src/layer/bnll.cpp
new file mode 100644
index 00000000000..fa988832c00
--- /dev/null
+++ b/src/layer/bnll.cpp
@@ -0,0 +1,81 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "bnll.h"
+#include <math.h>
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(BNLL)
+
+// Set the layer flags: BNLL is marked as a single-blob layer that supports
+// in-place execution.
+BNLL::BNLL()
+{
+    support_inplace = true;
+    one_blob_only = true;
+}
+
+// Out-of-place BNLL: every output element is log(1 + exp(x)) of the input x.
+// For x > 0 it is evaluated as x + log(1 + exp(-x)), which is the same value
+// written so that exp() only ever receives a non-positive argument.
+// Returns 0 on success, -100 when top_blob is empty after create().
+int BNLL::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    const int width = bottom_blob.w;
+    const int height = bottom_blob.h;
+    const int channels = bottom_blob.c;
+    const int size = width * height;
+
+    top_blob.create(width, height, channels);
+    if (top_blob.empty())
+        return -100;
+
+    // each iteration touches only its own channel, so the loop is shared between OpenMP threads
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float* in = bottom_blob.channel(q);
+        float* out = top_blob.channel(q);
+
+        for (int i=0; i<size; i++)
+        {
+            const float x = in[i];
+            out[i] = (x > 0) ? x + log(1.f + exp(-x)) : log(1.f + exp(x));
+        }
+    }
+
+    return 0;
+}
+
+// In-place BNLL: replaces every element x of bottom_top_blob with
+// log(1 + exp(x)), evaluated as x + log(1 + exp(-x)) when x > 0 so that exp()
+// only ever receives a non-positive argument. Always returns 0.
+int BNLL::forward_inplace(Mat& bottom_top_blob) const
+{
+    const int channels = bottom_top_blob.c;
+    const int size = bottom_top_blob.w * bottom_top_blob.h;
+
+    // each iteration touches only its own channel, so the loop is shared between OpenMP threads
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        float* ptr = bottom_top_blob.channel(q);
+
+        for (int i=0; i<size; i++)
+        {
+            const float x = ptr[i];
+            ptr[i] = (x > 0) ? x + log(1.f + exp(-x)) : log(1.f + exp(x));
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/bnll.h b/src/layer/bnll.h
new file mode 100644
index 00000000000..490dbdedf5e
--- /dev/null
+++ b/src/layer/bnll.h
@@ -0,0 +1,36 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_BNLL_H
+#define LAYER_BNLL_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+class BNLL : public Layer // element-wise log(1 + exp(x)) activation; declares no parameters or weights
+{
+public:
+    BNLL(); // sets one_blob_only and support_inplace
+    // writes the activation of bottom_blob into top_blob; returns -100 when top_blob is empty after create()
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    // applies the activation to bottom_top_blob element by element
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+
+public:
+};
+
+} // namespace ncnn
+
+#endif // LAYER_BNLL_H
diff --git a/src/layer/concat.cpp b/src/layer/concat.cpp
new file mode 100644
index 00000000000..6c1b9dc89d2
--- /dev/null
+++ b/src/layer/concat.cpp
@@ -0,0 +1,64 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "concat.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Concat)
+// The constructor body is empty: Concat sets no layer flags of its own.
+Concat::Concat()
+{
+}
+
+// Concatenate all bottom blobs, in order, along the channel axis into
+// top_blobs[0]. The output takes its width and height from bottom_blobs[0]
+// and has as many channels as all inputs together.
+// Returns 0 on success, -100 when the output is empty after create().
+int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+{
+    const int w = bottom_blobs[0].w;
+    const int h = bottom_blobs[0].h;
+    const size_t blob_count = bottom_blobs.size();
+
+    // the output channel count is the sum over all inputs
+    int top_channels = 0;
+    for (size_t b=0; b<blob_count; b++)
+    {
+        top_channels += bottom_blobs[b].c;
+    }
+
+    Mat& top_blob = top_blobs[0];
+    top_blob.create(w, h, top_channels);
+    if (top_blob.empty())
+        return -100;
+
+    // each input is copied as cstep * channels consecutive floats, starting at
+    // the output channel that follows the previously copied inputs
+    int channel_offset = 0;
+    for (size_t b=0; b<blob_count; b++)
+    {
+        const Mat& src = bottom_blobs[b];
+        const int src_channels = src.c;
+        const int count = src.cstep * src_channels;
+
+        const float* from = src;
+        float* to = top_blob.channel(channel_offset);
+        for (int i=0; i<count; i++)
+        {
+            to[i] = from[i];
+        }
+
+        channel_offset += src_channels;
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/concat.h b/src/layer/concat.h
new file mode 100644
index 00000000000..b71e4479849
--- /dev/null
+++ b/src/layer/concat.h
@@ -0,0 +1,34 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_CONCAT_H
+#define LAYER_CONCAT_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+class Concat : public Layer // joins several blobs along the channel axis; declares no parameters or weights
+{
+public:
+    Concat();
+    // copies all bottom_blobs, in order, into top_blobs[0]; returns -100 when it is empty after create()
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+
+public:
+};
+
+} // namespace ncnn
+
+#endif // LAYER_CONCAT_H
diff --git a/src/layer/convolution.cpp b/src/layer/convolution.cpp
new file mode 100644
index 00000000000..f638b7f6fdb
--- /dev/null
+++ b/src/layer/convolution.cpp
@@ -0,0 +1,350 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "convolution.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Convolution)
+
+// Set the layer flags: Convolution is marked as a single-blob layer without
+// in-place support.
+Convolution::Convolution()
+{
+    support_inplace = false;
+    one_blob_only = true;
+}
+
+// Nothing is released explicitly here.
+Convolution::~Convolution()
+{
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+// Parse the seven text-format parameters, in file order: num_output,
+// kernel_size, dilation, stride, pad, bias_term and weight_data_size.
+// Returns 0 on success and -1 (after logging the fscanf result to stderr)
+// when fewer than seven integers could be read.
+int Convolution::load_param(FILE* paramfp)
+{
+    const int matched = fscanf(paramfp, "%d %d %d %d %d %d %d",
+                               &num_output, &kernel_size, &dilation, &stride, &pad, &bias_term,
+                               &weight_data_size);
+    if (matched == 7)
+        return 0;
+
+    fprintf(stderr, "Convolution load_param failed %d\n", matched);
+    return -1;
+}
+#endif // NCNN_STRING
+// Read the seven binary-format parameters, each stored as one raw int, in
+// the order num_output, kernel_size, dilation, stride, pad, bias_term,
+// weight_data_size.
+// Returns 0 on success. On the first short read it logs the index of the
+// field that could not be read and returns -1, so a truncated param file is
+// reported instead of leaving the remaining fields unset.
+int Convolution::load_param_bin(FILE* paramfp)
+{
+    int* const fields[] = {
+        &num_output, &kernel_size, &dilation, &stride, &pad, &bias_term,
+        &weight_data_size
+    };
+    const int nfields = sizeof(fields) / sizeof(fields[0]);
+
+    for (int i = 0; i < nfields; i++)
+    {
+        size_t nread = fread(fields[i], sizeof(int), 1, paramfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Convolution load_param_bin failed %d\n", i);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+int Convolution::load_model(FILE* binfp) // loads weights and optional bias; returns 0, -1 on a short read, -100 on an empty blob
+{
+    int nread;
+    // 4-byte header, readable either as four separate bytes (f0..f3) or as one 32-bit tag
+    union
+    {
+        struct
+        {
+            unsigned char f0;
+            unsigned char f1;
+            unsigned char f2;
+            unsigned char f3;
+        };
+        unsigned int tag;
+    } flag_struct;
+    // every fread below requests its payload as a single item, so any result other than 1 is a short read
+    nread = fread(&flag_struct, sizeof(flag_struct), 1, binfp);
+    if (nread != 1)
+    {
+        fprintf(stderr, "Convolution read flag_struct failed %d\n", nread);
+        return -1;
+    }
+    // sum of the header bytes: zero only when all four bytes are zero
+    unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;
+    // allocated up front for every encoding; the half-precision branch assigns weight_data again
+    weight_data.create(weight_data_size);
+    if (weight_data.empty())
+        return -100;
+    // the header selects one of three weight encodings
+    if (flag_struct.tag == 0x01306B47)
+    {
+        // half-precision weight data; size presumably rounded up to a multiple of 4 bytes by alignSize - TODO confirm
+        int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned short), 4);
+        std::vector<unsigned short> float16_weights;
+        float16_weights.resize(align_weight_data_size); // NOTE(review): a byte count used as element count, so twice the bytes read below
+        nread = fread(float16_weights.data(), align_weight_data_size, 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Convolution read float16_weights failed %d\n", nread);
+            return -1;
+        }
+        // convert the first weight_data_size 16-bit values into weight_data
+        weight_data = Mat::from_float16(float16_weights.data(), weight_data_size);
+        if (weight_data.empty())
+            return -100;
+    }
+    else if (flag != 0)
+    {
+        // quantized weight data: a 256-entry float table, then one table index byte per weight
+        float quantization_value[256];
+        nread = fread(quantization_value, 256 * sizeof(float), 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Convolution read quantization_value failed %d\n", nread);
+            return -1;
+        }
+        // the index bytes are read with the size passed through alignSize(..., 4) as well
+        int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned char), 4);
+        std::vector<unsigned char> index_array;
+        index_array.resize(align_weight_data_size);
+        nread = fread(index_array.data(), align_weight_data_size, 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Convolution read index_array failed %d\n", nread);
+            return -1;
+        }
+        // expand each index into its table value
+        float* weight_data_ptr = weight_data;
+        for (int i = 0; i < weight_data_size; i++)
+        {
+            weight_data_ptr[i] = quantization_value[ index_array[i] ];
+        }
+    }
+    else if (flag_struct.f0 == 0)
+    {
+        // raw weight data: weight_data_size floats read straight into weight_data
+        nread = fread(weight_data, weight_data_size * sizeof(float), 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Convolution read weight_data failed %d\n", nread);
+            return -1;
+        }
+    }
+    // optional bias: num_output floats follow the weights when bias_term is non-zero
+    if (bias_term)
+    {
+        bias_data.create(num_output);
+        if (bias_data.empty())
+            return -100;
+        nread = fread(bias_data, num_output * sizeof(float), 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Convolution read bias_data failed %d\n", nread);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+// Read the seven int parameters from an in-memory parameter stream, in the
+// order num_output, kernel_size, dilation, stride, pad, bias_term,
+// weight_data_size (one value every 4 bytes), and advance the caller's
+// cursor by the 28 bytes consumed. Always returns 0.
+int Convolution::load_param(const unsigned char*& mem)
+{
+    const unsigned char* cursor = mem;
+
+    num_output       = *(const int*)(cursor + 0);
+    kernel_size      = *(const int*)(cursor + 4);
+    dilation         = *(const int*)(cursor + 8);
+    stride           = *(const int*)(cursor + 12);
+    pad              = *(const int*)(cursor + 16);
+    bias_term        = *(const int*)(cursor + 20);
+    weight_data_size = *(const int*)(cursor + 24);
+
+    mem = cursor + 28;
+
+    return 0;
+}
+
+// Decode the weights and the optional bias from an in-memory model stream,
+// advancing the caller's cursor past everything consumed.
+//
+// The first 4 bytes select the weight encoding:
+//   tag == 0x01306B47     16-bit values converted by Mat::from_float16
+//   any non-zero byte     256-entry float table, then one index byte per weight
+//   all four bytes zero   raw floats
+// When bias_term is non-zero, num_output bias floats follow the weights.
+//
+// Returns 0 on success, -100 when weight_data is empty after being filled.
+int Convolution::load_model(const unsigned char*& mem)
+{
+    union
+    {
+        struct
+        {
+            unsigned char f0;
+            unsigned char f1;
+            unsigned char f2;
+            unsigned char f3;
+        };
+        unsigned int tag;
+    } header;
+
+    memcpy(&header, mem, sizeof(header));
+    mem += sizeof(header);
+
+    // zero only when all four header bytes are zero
+    const unsigned int byte_sum = header.f0 + header.f1 + header.f2 + header.f3;
+    const bool is_float16 = (header.tag == 0x01306B47);
+
+    if (is_float16)
+    {
+        // the cursor is advanced before the emptiness check, as the stream is consumed either way
+        weight_data = Mat::from_float16((unsigned short*)mem, weight_data_size);
+        mem += alignSize(weight_data_size * sizeof(unsigned short), 4);
+        if (weight_data.empty())
+            return -100;
+    }
+    else if (byte_sum != 0)
+    {
+        const float* table = (const float*)mem;
+        mem += 256 * sizeof(float);
+
+        const unsigned char* indices = (const unsigned char*)mem;
+        mem += alignSize(weight_data_size * sizeof(unsigned char), 4);
+
+        weight_data.create(weight_data_size);
+        if (weight_data.empty())
+            return -100;
+
+        // expand each index byte into its table value
+        float* dst = weight_data;
+        for (int i = 0; i < weight_data_size; i++)
+        {
+            dst[i] = table[ indices[i] ];
+        }
+    }
+    else
+    {
+        // byte_sum == 0 here, which implies f0 == 0: raw float weights
+        weight_data = Mat(weight_data_size, (float*)mem);
+        mem += weight_data_size * sizeof(float);
+    }
+
+    if (bias_term)
+    {
+        bias_data = Mat(num_output, (float*)mem);
+        mem += num_output * sizeof(float);
+    }
+
+    return 0;
+}
+
+// Direct 2D convolution of bottom_blob with the square
+// kernel_size x kernel_size kernels in weight_data; when bias_term is
+// non-zero, bias_data[p] is the starting value of every sum in output
+// channel p.
+//
+// The input is bordered with zeros by pad on all four sides when pad > 0.
+// With kernel_extent = dilation * (kernel_size - 1) + 1 the output is
+//   ((w - kernel_extent) / stride + 1) x ((h - kernel_extent) / stride + 1)
+// with num_output channels, w and h being the bordered input size.
+// weight_data is read as [num_output][channels][kernel_size * kernel_size].
+//
+// Returns 0 on success, -100 when the bordered input or top_blob is empty.
+int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    // convolv with NxN kernel
+    // value = value + bias
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+
+    Mat bottom_blob_bordered = bottom_blob;
+    if (pad > 0)
+    {
+        copy_make_border(bottom_blob, bottom_blob_bordered, pad, pad, pad, pad, BORDER_CONSTANT, 0.f);
+        if (bottom_blob_bordered.empty())
+            return -100;
+
+        // from here on w and h describe the bordered input
+        w = bottom_blob_bordered.w;
+        h = bottom_blob_bordered.h;
+    }
+
+    const int kernel_extent = dilation * (kernel_size - 1) + 1;
+
+    int outw = (w - kernel_extent) / stride + 1;
+    int outh = (h - kernel_extent) / stride + 1;
+
+    top_blob.create(outw, outh, num_output);
+    if (top_blob.empty())
+        return -100;
+
+    const int maxk = kernel_size * kernel_size;
+
+    // kernel offsets: space_ofs[k] is the distance, in floats, from the
+    // top-left tap of a window to tap k inside one input channel
+    std::vector<int> _space_ofs(maxk);
+    int* space_ofs = &_space_ofs[0];
+    {
+        int p1 = 0;
+        int p2 = 0;
+        // One kernel row advances p2 by kernel_size * dilation, and two
+        // consecutive kernel rows are dilation input rows (w * dilation
+        // floats) apart, so this is the remainder needed to reach the start
+        // of the next kernel row. Subtracting kernel_extent instead is only
+        // correct for dilation == 1 and skews every row after the first
+        // for larger dilations.
+        int gap = w * dilation - kernel_size * dilation;
+        for (int i = 0; i < kernel_size; i++)
+        {
+            for (int j = 0; j < kernel_size; j++)
+            {
+                space_ofs[p1] = p2;
+                p1++;
+                p2 += dilation;
+            }
+            p2 += gap;
+        }
+    }
+
+    // num_output: every output channel is computed independently, so the
+    // loop is shared between OpenMP threads
+    const float* weight_data_ptr = weight_data;
+    #pragma omp parallel for
+    for (int p=0; p<num_output; p++)
+    {
+        float* outptr = top_blob.channel(p);
+
+        for (int i = 0; i < outh; i++)
+        {
+            for (int j = 0; j < outw; j++)
+            {
+                float sum = 0.f;
+
+                if (bias_term)
+                    sum = bias_data.data[p];
+
+                // first weight of output channel p; advanced by maxk per input channel
+                const float* kptr = weight_data_ptr + maxk * channels * p;
+
+                // channels
+                for (int q=0; q<channels; q++)
+                {
+                    const Mat m = bottom_blob_bordered.channel(q);
+                    // top-left input element of the window for output (i, j)
+                    const float* sptr = m.data + m.w * i*stride + j*stride;
+
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        float val = sptr[ space_ofs[k] ];
+                        float wt = kptr[k];
+                        sum += val * wt;
+                    }
+
+                    kptr += maxk;
+                }
+
+                outptr[j] = sum;
+            }
+
+            outptr += outw;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/convolution.h b/src/layer/convolution.h
new file mode 100644
index 00000000000..99a8a5b544b
--- /dev/null
+++ b/src/layer/convolution.h
@@ -0,0 +1,58 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_CONVOLUTION_H
+#define LAYER_CONVOLUTION_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+class Convolution : public Layer // 2D convolution with square kernels, optional dilation, stride, zero padding and bias
+{
+public:
+    Convolution(); // sets one_blob_only and clears support_inplace
+    virtual ~Convolution();
+    // FILE*-based loaders need NCNN_STDIO; the text-format one also needs NCNN_STRING
+#if NCNN_STDIO
+#if NCNN_STRING
+    virtual int load_param(FILE* paramfp); // parses the seven params below with fscanf
+#endif // NCNN_STRING
+    virtual int load_param_bin(FILE* paramfp); // reads the seven params below as raw ints
+    virtual int load_model(FILE* binfp); // reads a 4-byte encoding header, the weights, then the optional bias
+#endif // NCNN_STDIO
+    virtual int load_param(const unsigned char*& mem); // in-memory variants: consume the bytes at mem
+    virtual int load_model(const unsigned char*& mem); // and advance mem past them
+    // convolves the (optionally bordered) input into top_blob; returns -100 when a blob is empty after allocation
+    virtual int forward(const Mat& bottom_blobs, Mat& top_blobs) const;
+
+public:
+    // param (listed in the order the loaders read them)
+    int num_output; // channel count of the output blob
+    int kernel_size; // side length of the square kernel
+    int dilation; // step between neighbouring kernel taps
+    int stride; // step between neighbouring output positions
+    int pad; // zero border added on all four sides when > 0
+    int bias_term; // non-zero: bias_data is loaded and used
+    // number of weights held by weight_data
+    int weight_data_size;
+
+    // model: weight_data is read as [num_output][channels][kernel_size * kernel_size]; bias_data has num_output floats
+    Mat weight_data;
+    Mat bias_data;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_CONVOLUTION_H
diff --git a/src/layer/crop.cpp b/src/layer/crop.cpp
new file mode 100644
index 00000000000..e54d0cf1b5e
--- /dev/null
+++ b/src/layer/crop.cpp
@@ -0,0 +1,85 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "crop.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Crop)
+// The constructor body is empty: Crop sets no layer flags of its own.
+Crop::Crop()
+{
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+// Parse the two text-format parameters, woffset then hoffset.
+// Returns 0 on success and -1 (after logging the fscanf result to stderr)
+// when fewer than two integers could be read.
+int Crop::load_param(FILE* paramfp)
+{
+    const int matched = fscanf(paramfp, "%d %d", &woffset, &hoffset);
+    if (matched == 2)
+        return 0;
+
+    fprintf(stderr, "Crop load_param failed %d\n", matched);
+    return -1;
+}
+#endif // NCNN_STRING
+// Read the two binary-format parameters, woffset then hoffset, each stored
+// as one raw int.
+// Returns 0 on success. On the first short read it logs the index of the
+// field that could not be read and returns -1, so a truncated param file is
+// reported instead of leaving an offset unset.
+int Crop::load_param_bin(FILE* paramfp)
+{
+    int* const fields[] = { &woffset, &hoffset };
+    const int nfields = sizeof(fields) / sizeof(fields[0]);
+
+    for (int i = 0; i < nfields; i++)
+    {
+        size_t nread = fread(fields[i], sizeof(int), 1, paramfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Crop load_param_bin failed %d\n", i);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+// Read woffset and hoffset from an in-memory parameter stream (one int every
+// 4 bytes) and advance the caller's cursor by the 8 bytes consumed.
+// Always returns 0.
+int Crop::load_param(const unsigned char*& mem)
+{
+    const unsigned char* cursor = mem;
+
+    woffset = *(const int*)(cursor + 0);
+    hoffset = *(const int*)(cursor + 4);
+
+    mem = cursor + 8;
+
+    return 0;
+}
+
+// Cut bottom_blobs[0] down to the width and height of bottom_blobs[1] (the
+// reference blob) and store the result in top_blobs[0].
+// hoffset and woffset are passed to copy_cut_border as the top and left
+// border; the bottom and right border are whatever is left of the input
+// beyond offset plus reference size.
+// Returns 0 on success, -100 when the result is empty.
+int Crop::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+{
+    const Mat& src = bottom_blobs[0];
+    const Mat& reference = bottom_blobs[1];
+
+    const int top = hoffset;
+    const int left = woffset;
+    const int bottom = src.h - reference.h - hoffset;
+    const int right = src.w - reference.w - woffset;
+
+    Mat& top_blob = top_blobs[0];
+    copy_cut_border(src, top_blob, top, bottom, left, right);
+
+    return top_blob.empty() ? -100 : 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/crop.h b/src/layer/crop.h
new file mode 100644
index 00000000000..9712fbbae0f
--- /dev/null
+++ b/src/layer/crop.h
@@ -0,0 +1,44 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_CROP_H
+#define LAYER_CROP_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+class Crop : public Layer // cuts the first input blob down to the width and height of the second one
+{
+public:
+    Crop();
+    // FILE*-based loaders need NCNN_STDIO; the text-format one also needs NCNN_STRING
+#if NCNN_STDIO
+#if NCNN_STRING
+    virtual int load_param(FILE* paramfp); // parses woffset and hoffset with fscanf
+#endif // NCNN_STRING
+    virtual int load_param_bin(FILE* paramfp); // reads woffset and hoffset as raw ints
+#endif // NCNN_STDIO
+    virtual int load_param(const unsigned char*& mem); // reads both offsets at mem and advances mem by 8 bytes
+    // bottom_blobs[0] is the input, bottom_blobs[1] the size reference; returns -100 when top_blobs[0] ends up empty
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+
+public:
+    int woffset; // passed to copy_cut_border as the left border
+    int hoffset; // passed to copy_cut_border as the top border
+};
+
+} // namespace ncnn
+
+#endif // LAYER_CROP_H
diff --git a/src/layer/deconvolution.cpp b/src/layer/deconvolution.cpp
new file mode 100644
index 00000000000..c6dc53984b5
--- /dev/null
+++ b/src/layer/deconvolution.cpp
@@ -0,0 +1,348 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "deconvolution.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Deconvolution)
+
+Deconvolution::Deconvolution()
+{
+    // layer capability flags: in-place execution is switched off, one_blob_only is switched on
+    support_inplace = false;
+    one_blob_only = true;
+}
+
+// Nothing to release explicitly; the destructor body is empty.
+Deconvolution::~Deconvolution()
+{
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+// Parse the seven integer parameters from the text param stream, in this order:
+// num_output, kernel_size, dilation, stride, pad, bias_term, weight_data_size.
+// Returns 0 on success, -1 when fewer than seven fields could be scanned.
+int Deconvolution::load_param(FILE* paramfp)
+{
+    const int nfields = fscanf(paramfp, "%d %d %d %d %d %d %d",
+                               &num_output, &kernel_size, &dilation, &stride, &pad, &bias_term,
+                               &weight_data_size);
+    if (nfields == 7)
+        return 0;
+
+    fprintf(stderr, "Deconvolution load_param failed %d\n", nfields);
+    return -1;
+}
+#endif // NCNN_STRING
+// Read the seven integer parameters from a binary param stream, in this order:
+// num_output, kernel_size, dilation, stride, pad, bias_term, weight_data_size.
+// Returns 0 on success, -1 when the stream ends early or a read error occurs.
+int Deconvolution::load_param_bin(FILE* paramfp)
+{
+    // the seven fields are stored back to back, so a single checked read covers all of them
+    int fields[7];
+    size_t nread = fread(fields, sizeof(int), 7, paramfp);
+    if (nread != 7)
+    {
+        // a short read must not be reported as success: the parameters would stay uninitialized
+        fprintf(stderr, "Deconvolution load_param_bin failed %d\n", (int)nread);
+        return -1;
+    }
+
+    num_output = fields[0];
+    kernel_size = fields[1];
+    dilation = fields[2];
+    stride = fields[3];
+    pad = fields[4];
+    bias_term = fields[5];
+    weight_data_size = fields[6];
+
+    return 0;
+}
+
+// Load weights (and the bias when bias_term is non-zero) from a binary model stream.
+// The stream starts with a 4-byte header that selects the weight encoding:
+//   tag == 0x01306B47       -> half-precision weights, byte count padded to a multiple of 4
+//   sum of header bytes != 0 -> 256-entry float table followed by one byte index per weight (padded to 4)
+//   all header bytes zero    -> raw float weights
+// Returns 0 on success, -1 on a failed read, -100 when an allocation comes back empty.
+int Deconvolution::load_model(FILE* binfp)
+{
+    int nread;
+
+    union
+    {
+        struct
+        {
+            unsigned char f0;
+            unsigned char f1;
+            unsigned char f2;
+            unsigned char f3;
+        };
+        unsigned int tag;
+    } flag_struct;
+
+    nread = fread(&flag_struct, sizeof(flag_struct), 1, binfp);
+    if (nread != 1)
+    {
+        fprintf(stderr, "Deconvolution read flag_struct failed %d\n", nread);
+        return -1;
+    }
+
+    unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;
+
+    weight_data.create(weight_data_size);
+    if (weight_data.empty())
+        return -100;
+
+    if (flag_struct.tag == 0x01306B47)
+    {
+        // half-precision weight data
+        // align_weight_data_size is a byte count; the staging vector holds 2-byte elements,
+        // so it needs half as many elements as there are bytes to read
+        int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned short), 4);
+        std::vector<unsigned short> float16_weights;
+        float16_weights.resize(align_weight_data_size / sizeof(unsigned short));
+        nread = fread(float16_weights.data(), align_weight_data_size, 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Deconvolution read float16_weights failed %d\n", nread);
+            return -1;
+        }
+
+        weight_data = Mat::from_float16(float16_weights.data(), weight_data_size);
+        if (weight_data.empty())
+            return -100;
+    }
+    else if (flag != 0)
+    {
+        // quantized weight data
+        float quantization_value[256];
+        nread = fread(quantization_value, 256 * sizeof(float), 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Deconvolution read quantization_value failed %d\n", nread);
+            return -1;
+        }
+
+        // one byte index per weight; the stored byte count is padded to a multiple of 4
+        int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned char), 4);
+        std::vector<unsigned char> index_array;
+        index_array.resize(align_weight_data_size);
+        nread = fread(index_array.data(), align_weight_data_size, 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Deconvolution read index_array failed %d\n", nread);
+            return -1;
+        }
+
+        // expand every index through the lookup table
+        float* weight_data_ptr = weight_data;
+        for (int i = 0; i < weight_data_size; i++)
+        {
+            weight_data_ptr[i] = quantization_value[ index_array[i] ];
+        }
+    }
+    else if (flag_struct.f0 == 0)
+    {
+        // raw weight data
+        nread = fread(weight_data, weight_data_size * sizeof(float), 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Deconvolution read weight_data failed %d\n", nread);
+            return -1;
+        }
+    }
+
+    if (bias_term)
+    {
+        // one float of bias per output channel
+        bias_data.create(num_output);
+        if (bias_data.empty())
+            return -100;
+        nread = fread(bias_data, num_output * sizeof(float), 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Deconvolution read bias_data failed %d\n", nread);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+// Read the seven integer parameters from a memory buffer and advance mem past them.
+// Field order matches the FILE-based loaders:
+// num_output, kernel_size, dilation, stride, pad, bias_term, weight_data_size.
+// Always returns 0.
+int Deconvolution::load_param(const unsigned char*& mem)
+{
+    int* const fields[7] = {
+        &num_output, &kernel_size, &dilation, &stride, &pad, &bias_term, &weight_data_size
+    };
+
+    for (int i = 0; i < 7; i++)
+    {
+        // memcpy instead of *(int*)mem: no const is cast away and an unaligned buffer stays well defined
+        memcpy(fields[i], mem, sizeof(int));
+        mem += 4;
+    }
+
+    return 0;
+}
+
+// Load weights (and the bias when bias_term is non-zero) from a memory buffer, advancing mem
+// past everything consumed. The 4-byte header selects the weight encoding exactly as in the
+// FILE-based loader. Raw weights and bias are wrapped with Mat(size, pointer) on the caller's buffer.
+// Returns 0 on success, -100 when a weight Mat comes back empty.
+int Deconvolution::load_model(const unsigned char*& mem)
+{
+    union
+    {
+        struct
+        {
+            unsigned char f0;
+            unsigned char f1;
+            unsigned char f2;
+            unsigned char f3;
+        };
+        unsigned int tag;
+    } flag_struct;
+
+    memcpy(&flag_struct, mem, sizeof(flag_struct));
+    mem += sizeof(flag_struct);
+
+    const unsigned int byte_sum = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;
+    const bool is_half = (flag_struct.tag == 0x01306B47);
+
+    if (is_half)
+    {
+        // half-precision weights; the stored byte count is padded to a multiple of 4
+        unsigned short* half_src = (unsigned short*)mem;
+        mem += alignSize(weight_data_size * sizeof(unsigned short), 4);
+
+        weight_data = Mat::from_float16(half_src, weight_data_size);
+        if (weight_data.empty())
+            return -100;
+    }
+    else if (byte_sum != 0)
+    {
+        // quantized weights: 256 floats of lookup table, then one byte index per weight (padded to 4)
+        const float* table = (const float*)mem;
+        mem += 256 * sizeof(float);
+
+        const unsigned char* indices = (const unsigned char*)mem;
+        mem += alignSize(weight_data_size * sizeof(unsigned char), 4);
+
+        weight_data.create(weight_data_size);
+        if (weight_data.empty())
+            return -100;
+
+        float* dst = weight_data;
+        for (int n = 0; n < weight_data_size; ++n)
+            dst[n] = table[ indices[n] ];
+    }
+    else if (flag_struct.f0 == 0)
+    {
+        // raw float weights
+        float* raw = (float*)mem;
+        mem += weight_data_size * sizeof(float);
+        weight_data = Mat(weight_data_size, raw);
+    }
+
+    if (bias_term)
+    {
+        // one float of bias per output channel
+        float* bias_src = (float*)mem;
+        mem += num_output * sizeof(float);
+        bias_data = Mat(num_output, bias_src);
+    }
+
+    return 0;
+}
+
+// Transposed (backward strided) convolution with a square kernel_size x kernel_size kernel.
+// Every output channel is filled with its bias (0 when bias_term is zero); then each input
+// pixel scatters val * kernel into the output window anchored at (i*stride, j*stride).
+// When pad > 0 a border of pad pixels is cut from all four sides of the result.
+// Returns 0 on success, -100 when an output allocation comes back empty.
+int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+
+    // span of the dilated kernel along one axis
+    const int kernel_extent = dilation * (kernel_size - 1) + 1;
+
+    int outw = (w - 1) * stride + kernel_extent;
+    int outh = (h - 1) * stride + kernel_extent;
+
+    Mat top_blob_bordered;
+    top_blob_bordered.create(outw, outh, num_output);
+    if (top_blob_bordered.empty())
+        return -100;
+
+    const int maxk = kernel_size * kernel_size;
+
+    // kernel offsets: element (i, j) of the kernel lands at i * dilation * outw + j * dilation
+    std::vector<int> _space_ofs(maxk);
+    int* space_ofs = &_space_ofs[0];
+    {
+        int p1 = 0;
+        int p2 = 0;
+        // one kernel row advances p2 by kernel_size * dilation, while the next kernel row starts
+        // dilation output rows further down; subtracting kernel_extent here instead would shift
+        // every following row by (dilation - 1) elements whenever dilation > 1
+        int gap = outw * dilation - kernel_size * dilation;
+        for (int i = 0; i < kernel_size; i++)
+        {
+            for (int j = 0; j < kernel_size; j++)
+            {
+                space_ofs[p1] = p2;
+                p1++;
+                p2 += dilation;
+            }
+            p2 += gap;
+        }
+    }
+
+    // num_output
+    const float* weight_data_ptr = weight_data;
+    #pragma omp parallel for
+    for (int p=0; p<num_output; p++)
+    {
+        Mat out = top_blob_bordered.channel(p);
+
+        const float bias = bias_term ? bias_data.data[p] : 0.f;
+
+        out.fill(bias);
+
+        for (int i = 0; i < h; i++)
+        {
+            for (int j = 0; j < w; j++)
+            {
+                // top-left corner of the output window written by input pixel (i, j)
+                float* outptr = out.data + out.w * i*stride + j*stride;
+
+                // weights for output channel p: maxk values per input channel, stored consecutively
+                const float* kptr = weight_data_ptr + maxk * channels * p;
+
+                // channels
+                for (int q=0; q<channels; q++)
+                {
+                    const Mat m = bottom_blob.channel(q);
+                    float val = *(m.data + m.w * i + j);
+
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        // named kval so it no longer shadows the input width w
+                        float kval = kptr[k];
+                        outptr[ space_ofs[k] ] += val * kval;
+                    }
+
+                    kptr += maxk;
+                }
+            }
+        }
+    }
+
+    top_blob = top_blob_bordered;
+
+    if (pad > 0)
+    {
+        // strip pad pixels from every side of the full-size result
+        copy_cut_border(top_blob_bordered, top_blob, pad, pad, pad, pad);
+        if (top_blob.empty())
+            return -100;
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/deconvolution.h b/src/layer/deconvolution.h
new file mode 100644
index 00000000000..9095eefe34a
--- /dev/null
+++ b/src/layer/deconvolution.h
@@ -0,0 +1,58 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_DECONVOLUTION_H
+#define LAYER_DECONVOLUTION_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// Deconvolution layer: transposed convolution with a square kernel, optional dilation,
+// stride, symmetric border cut (pad) and optional per-output-channel bias.
+class Deconvolution : public Layer
+{
+public:
+    Deconvolution();
+    virtual ~Deconvolution();
+
+    // FILE*-based loaders are only declared when NCNN_STDIO is enabled;
+    // the text load_param(FILE*) overload additionally requires NCNN_STRING.
+#if NCNN_STDIO
+#if NCNN_STRING
+    // text form: seven integers parsed with fscanf
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    // binary form: seven ints read with fread
+    virtual int load_param_bin(FILE* paramfp);
+    // reads the weight header, the weights and (when bias_term is set) the bias
+    virtual int load_model(FILE* binfp);
+#endif // NCNN_STDIO
+    // memory-buffer variants; mem is advanced past the bytes consumed
+    virtual int load_param(const unsigned char*& mem);
+    virtual int load_model(const unsigned char*& mem);
+
+    // single-blob forward; despite the plural parameter names this takes one input and one output Mat
+    virtual int forward(const Mat& bottom_blobs, Mat& top_blobs) const;
+
+public:
+    // param
+    int num_output;     // number of output channels
+    int kernel_size;    // side length of the square kernel
+    int dilation;       // spacing between kernel taps
+    int stride;         // output step per input pixel
+    int pad;            // border cut from every side of the output when > 0
+    int bias_term;      // non-zero when bias_data is loaded and applied
+
+    int weight_data_size;   // number of weight values in weight_data
+
+    // model
+    Mat weight_data;    // maxk * channels values per output channel, as indexed in forward
+    Mat bias_data;      // num_output values, only loaded when bias_term is non-zero
+};
+
+} // namespace ncnn
+
+#endif // LAYER_DECONVOLUTION_H
diff --git a/src/layer/dropout.cpp b/src/layer/dropout.cpp
new file mode 100644
index 00000000000..d5faff336d4
--- /dev/null
+++ b/src/layer/dropout.cpp
@@ -0,0 +1,38 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "dropout.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Dropout)
+
+Dropout::Dropout()
+{
+    // layer capability flags: both in-place execution and one_blob_only are switched on
+    support_inplace = true;
+    one_blob_only = true;
+}
+
+// Pass-through: the output is assigned from the input without modifying any value.
+// Always returns 0.
+int Dropout::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    top_blob = bottom_blob;
+    return 0;
+}
+
+// In-place pass-through: the blob is left untouched (the parameter is intentionally unnamed).
+// Always returns 0.
+int Dropout::forward_inplace(Mat& /*bottom_top_blob*/) const
+{
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/dropout.h b/src/layer/dropout.h
new file mode 100644
index 00000000000..72c5c867474
--- /dev/null
+++ b/src/layer/dropout.h
@@ -0,0 +1,35 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_DROPOUT_H
+#define LAYER_DROPOUT_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// Dropout layer: both forward variants in dropout.cpp leave the data unchanged.
+// It declares no parameters or model data of its own.
+class Dropout : public Layer
+{
+public:
+    Dropout();
+
+    // assigns bottom_blob to top_blob
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+    // leaves the blob as it is
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+
+};
+
+} // namespace ncnn
+
+#endif // LAYER_DROPOUT_H
diff --git a/src/layer/eltwise.cpp b/src/layer/eltwise.cpp
new file mode 100644
index 00000000000..02f6402de63
--- /dev/null
+++ b/src/layer/eltwise.cpp
@@ -0,0 +1,246 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "eltwise.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Eltwise)
+
+// The constructor sets nothing: op_type, num_coeff and coeffs are filled in by the load_param variants.
+Eltwise::Eltwise()
+{
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+// Parse "op_type num_coeff" followed by num_coeff float coefficients from the text param stream.
+// Returns 0 on success, -1 on a parse failure, -100 when the coefficient Mat comes back empty.
+int Eltwise::load_param(FILE* paramfp)
+{
+    const int nheader = fscanf(paramfp, "%d %d", &op_type, &num_coeff);
+    if (nheader != 2)
+    {
+        fprintf(stderr, "Eltwise load_param failed %d\n", nheader);
+        return -1;
+    }
+
+    // no coefficients to read: coeffs is left as it was
+    if (num_coeff <= 0)
+        return 0;
+
+    coeffs.create(num_coeff);
+    if (coeffs.empty())
+        return -100;
+
+    float* dst = coeffs;
+    for (int idx = 0; idx < num_coeff; ++idx)
+    {
+        const int ncoeff = fscanf(paramfp, "%f", &dst[idx]);
+        if (ncoeff != 1)
+        {
+            fprintf(stderr, "Eltwise load_param failed %d\n", ncoeff);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+#endif // NCNN_STRING
+// Read op_type and num_coeff, then num_coeff float coefficients, from a binary param stream.
+// Returns 0 on success, -1 when the stream ends early or a read error occurs,
+// -100 when the coefficient Mat comes back empty.
+int Eltwise::load_param_bin(FILE* paramfp)
+{
+    size_t nread = fread(&op_type, sizeof(int), 1, paramfp);
+    if (nread == 1)
+        nread = fread(&num_coeff, sizeof(int), 1, paramfp);
+    if (nread != 1)
+    {
+        // a short read must not be reported as success: the parameters would stay uninitialized
+        fprintf(stderr, "Eltwise load_param_bin failed %d\n", (int)nread);
+        return -1;
+    }
+
+    if (num_coeff > 0)
+    {
+        coeffs.create(num_coeff);
+        if (coeffs.empty())
+            return -100;
+        float* coeffs_ptr = coeffs;
+        nread = fread(coeffs_ptr, sizeof(float), num_coeff, paramfp);
+        if (nread != (size_t)num_coeff)
+        {
+            fprintf(stderr, "Eltwise load_param_bin failed %d\n", (int)nread);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+// Read op_type and num_coeff from a memory buffer, then wrap the following num_coeff floats
+// as the coefficient Mat (constructed on the caller's buffer via Mat(size, pointer)).
+// mem is advanced past everything consumed. Always returns 0.
+int Eltwise::load_param(const unsigned char*& mem)
+{
+    // memcpy instead of *(int*)mem: no const is cast away and an unaligned buffer stays well defined
+    memcpy(&op_type, mem, sizeof(int));
+    mem += 4;
+
+    memcpy(&num_coeff, mem, sizeof(int));
+    mem += 4;
+
+    coeffs = Mat(num_coeff, (float*)mem);
+    mem += num_coeff * sizeof(float);
+
+    return 0;
+}
+
+// Element-wise combination of two or more input blobs into top_blobs[0].
+// The output takes its dimensions from the first input. op_type selects the operation:
+//   Operation_PROD - product of all inputs
+//   Operation_SUM  - plain sum when num_coeff == 0, otherwise sum of input[b] * coeffs[b]
+//   Operation_MAX  - maximum over all inputs
+// Any other op_type leaves the freshly created output unwritten.
+// Returns 0 on success, -1 when fewer than two inputs (or fewer coefficients than inputs) are
+// supplied, -100 when the output allocation comes back empty.
+int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+{
+    // every operation below reads bottom_blobs[0] and bottom_blobs[1]
+    if (bottom_blobs.size() < 2)
+        return -1;
+
+    // the weighted sum reads coeffs[b] for every input index b
+    if (op_type == Operation_SUM && num_coeff != 0)
+    {
+        if (num_coeff < 0 || (size_t)num_coeff < bottom_blobs.size())
+            return -1;
+    }
+
+    const Mat& bottom_blob = bottom_blobs[0];
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    int size = w * h;
+
+    Mat& top_blob = top_blobs[0];
+    top_blob.create(w, h, channels);
+    if (top_blob.empty())
+        return -100;
+
+    if (op_type == Operation_PROD)
+    {
+        // first blob
+        const Mat& bottom_blob1 = bottom_blobs[1];
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            const float* ptr1 = bottom_blob1.channel(q);
+            float* outptr = top_blob.channel(q);
+
+            for (int i=0; i<size; i++)
+            {
+                outptr[i] = ptr[i] * ptr1[i];
+            }
+        }
+
+        // remaining blobs are folded into the running product
+        for (size_t b=2; b<bottom_blobs.size(); b++)
+        {
+            const Mat& bottom_blob1 = bottom_blobs[b];
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob1.channel(q);
+                float* outptr = top_blob.channel(q);
+
+                for (int i=0; i<size; i++)
+                {
+                    outptr[i] *= ptr[i];
+                }
+            }
+        }
+    }
+    else if (op_type == Operation_SUM)
+    {
+        if (num_coeff == 0)
+        {
+            // first blob
+            const Mat& bottom_blob1 = bottom_blobs[1];
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob.channel(q);
+                const float* ptr1 = bottom_blob1.channel(q);
+                float* outptr = top_blob.channel(q);
+
+                for (int i=0; i<size; i++)
+                {
+                    outptr[i] = ptr[i] + ptr1[i];
+                }
+            }
+
+            // remaining blobs are folded into the running sum
+            for (size_t b=2; b<bottom_blobs.size(); b++)
+            {
+                const Mat& bottom_blob1 = bottom_blobs[b];
+                #pragma omp parallel for
+                for (int q=0; q<channels; q++)
+                {
+                    const float* ptr = bottom_blob1.channel(q);
+                    float* outptr = top_blob.channel(q);
+
+                    for (int i=0; i<size; i++)
+                    {
+                        outptr[i] += ptr[i];
+                    }
+                }
+            }
+        }
+        else
+        {
+            const float* coeffs_ptr = coeffs;
+
+            // first blob
+            const Mat& bottom_blob1 = bottom_blobs[1];
+            float coeff0 = coeffs_ptr[0];
+            float coeff1 = coeffs_ptr[1];
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob.channel(q);
+                const float* ptr1 = bottom_blob1.channel(q);
+                float* outptr = top_blob.channel(q);
+
+                for (int i=0; i<size; i++)
+                {
+                    outptr[i] = ptr[i] * coeff0 + ptr1[i] * coeff1;
+                }
+            }
+
+            // remaining blobs are scaled by their own coefficient and accumulated
+            for (size_t b=2; b<bottom_blobs.size(); b++)
+            {
+                const Mat& bottom_blob1 = bottom_blobs[b];
+                float coeff = coeffs_ptr[b];
+                #pragma omp parallel for
+                for (int q=0; q<channels; q++)
+                {
+                    const float* ptr = bottom_blob1.channel(q);
+                    float* outptr = top_blob.channel(q);
+
+                    for (int i=0; i<size; i++)
+                    {
+                        outptr[i] += ptr[i] * coeff;
+                    }
+                }
+            }
+        }
+    }
+    else if (op_type == Operation_MAX)
+    {
+        // first blob
+        const Mat& bottom_blob1 = bottom_blobs[1];
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            const float* ptr1 = bottom_blob1.channel(q);
+            float* outptr = top_blob.channel(q);
+
+            for (int i=0; i<size; i++)
+            {
+                outptr[i] = std::max(ptr[i], ptr1[i]);
+            }
+        }
+
+        // remaining blobs are folded into the running maximum
+        for (size_t b=2; b<bottom_blobs.size(); b++)
+        {
+            const Mat& bottom_blob1 = bottom_blobs[b];
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob1.channel(q);
+                float* outptr = top_blob.channel(q);
+
+                for (int i=0; i<size; i++)
+                {
+                    outptr[i] = std::max(outptr[i], ptr[i]);
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/eltwise.h b/src/layer/eltwise.h
new file mode 100644
index 00000000000..a1c48af5586
--- /dev/null
+++ b/src/layer/eltwise.h
@@ -0,0 +1,48 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_ELTWISE_H
+#define LAYER_ELTWISE_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// Eltwise layer: combines several input blobs element by element (product, sum or maximum).
+class Eltwise : public Layer
+{
+public:
+    Eltwise();
+
+    // FILE*-based loaders are only declared when NCNN_STDIO is enabled;
+    // the text load_param(FILE*) overload additionally requires NCNN_STRING.
+#if NCNN_STDIO
+#if NCNN_STRING
+    // text form: "op_type num_coeff" followed by num_coeff floats
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    // binary form: two ints followed by num_coeff floats
+    virtual int load_param_bin(FILE* paramfp);
+#endif // NCNN_STDIO
+    // memory-buffer form; mem is advanced past the bytes consumed
+    virtual int load_param(const unsigned char*& mem);
+
+    // reads every blob in bottom_blobs and writes the result to top_blobs[0]
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+
+    // values accepted for op_type
+    enum { Operation_PROD = 0, Operation_SUM = 1, Operation_MAX = 2 };
+
+public:
+    // param
+    int op_type;    // one of the Operation_* values above
+    int num_coeff;  // number of entries in coeffs; 0 selects the unweighted sum
+    Mat coeffs;     // per-input weights used by Operation_SUM when num_coeff is non-zero
+};
+
+} // namespace ncnn
+
+#endif // LAYER_ELTWISE_H
diff --git a/src/layer/elu.cpp b/src/layer/elu.cpp
new file mode 100644
index 00000000000..1c601ebaf45
--- /dev/null
+++ b/src/layer/elu.cpp
@@ -0,0 +1,109 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "elu.h"
+#include <math.h>
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(ELU)
+
+ELU::ELU()
+{
+    // layer capability flags: both in-place execution and one_blob_only are switched on
+    support_inplace = true;
+    one_blob_only = true;
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+// Parse the single float parameter alpha from the text param stream.
+// Returns 0 on success, -1 when the value could not be scanned.
+int ELU::load_param(FILE* paramfp)
+{
+    const int nfields = fscanf(paramfp, "%f", &alpha);
+    if (nfields == 1)
+        return 0;
+
+    fprintf(stderr, "ELU load_param failed %d\n", nfields);
+    return -1;
+}
+#endif // NCNN_STRING
+// Read the single float parameter alpha from a binary param stream.
+// Returns 0 on success, -1 when the stream ends early or a read error occurs.
+int ELU::load_param_bin(FILE* paramfp)
+{
+    size_t nread = fread(&alpha, sizeof(float), 1, paramfp);
+    if (nread != 1)
+    {
+        // a short read must not be reported as success: alpha would stay uninitialized
+        fprintf(stderr, "ELU load_param_bin failed %d\n", (int)nread);
+        return -1;
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+// Read the single float parameter alpha from a memory buffer and advance mem by 4 bytes.
+// Always returns 0.
+int ELU::load_param(const unsigned char*& mem)
+{
+    // memcpy instead of *(float*)mem: no const is cast away and an unaligned buffer stays well defined
+    memcpy(&alpha, mem, sizeof(float));
+    mem += 4;
+
+    return 0;
+}
+
+// Out-of-place ELU: for x < 0 the output is alpha * (exp(x) - 1), otherwise it is x.
+// The output has the same width, height and channel count as the input.
+// Returns 0 on success, -100 when the output allocation comes back empty.
+int ELU::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    const int width = bottom_blob.w;
+    const int height = bottom_blob.h;
+    const int channels = bottom_blob.c;
+    const int plane = width * height;
+
+    top_blob.create(width, height, channels);
+    if (top_blob.empty())
+        return -100;
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float* src = bottom_blob.channel(q);
+        float* dst = top_blob.channel(q);
+
+        for (int n=0; n<plane; n++)
+        {
+            const float x = src[n];
+            if (x < 0.f)
+                dst[n] = alpha * (exp(x) - 1.f);
+            else
+                dst[n] = x;
+        }
+    }
+
+    return 0;
+}
+
+// In-place ELU: values below zero are replaced by alpha * (exp(x) - 1); all others are left as is.
+// Always returns 0.
+int ELU::forward_inplace(Mat& bottom_top_blob) const
+{
+    const int channels = bottom_top_blob.c;
+    const int plane = bottom_top_blob.w * bottom_top_blob.h;
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        float* data = bottom_top_blob.channel(q);
+
+        for (int n=0; n<plane; n++)
+        {
+            const float x = data[n];
+            if (x < 0.f)
+                data[n] = alpha * (exp(x) - 1.f);
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/elu.h b/src/layer/elu.h
new file mode 100644
index 00000000000..945483c6828
--- /dev/null
+++ b/src/layer/elu.h
@@ -0,0 +1,45 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_ELU_H
+#define LAYER_ELU_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// ELU activation layer: negative inputs map to alpha * (exp(x) - 1), other inputs pass through.
+class ELU : public Layer
+{
+public:
+    ELU();
+
+    // FILE*-based loaders are only declared when NCNN_STDIO is enabled;
+    // the text load_param(FILE*) overload additionally requires NCNN_STRING.
+#if NCNN_STDIO
+#if NCNN_STRING
+    // text form: one float
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    // binary form: one float read with fread
+    virtual int load_param_bin(FILE* paramfp);
+#endif // NCNN_STDIO
+    // memory-buffer form; mem is advanced by 4 bytes
+    virtual int load_param(const unsigned char*& mem);
+
+    // writes the activation of bottom_blob into a newly created top_blob
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+    // applies the activation to the blob in place
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+
+public:
+    // scale applied to (exp(x) - 1) for negative inputs
+    float alpha;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_ELU_H
diff --git a/src/layer/embed.cpp b/src/layer/embed.cpp
new file mode 100644
index 00000000000..51bbbd02ab9
--- /dev/null
+++ b/src/layer/embed.cpp
@@ -0,0 +1,241 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "embed.h"
+#include <string.h>
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Embed)
+
+Embed::Embed()
+{
+    // layer capability flags: in-place execution is switched off, one_blob_only is switched on
+    support_inplace = false;
+    one_blob_only = true;
+}
+
+// Nothing to release explicitly; the destructor body is empty.
+Embed::~Embed()
+{
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+// Parse the four integer parameters from the text param stream, in this order:
+// num_output, input_dim, bias_term, weight_data_size.
+// Returns 0 on success, -1 when fewer than four fields could be scanned.
+int Embed::load_param(FILE* paramfp)
+{
+    const int nfields = fscanf(paramfp, "%d %d %d %d",
+                               &num_output, &input_dim, &bias_term, &weight_data_size);
+    if (nfields == 4)
+        return 0;
+
+    fprintf(stderr, "Embed load_param failed %d\n", nfields);
+    return -1;
+}
+#endif // NCNN_STRING
+// Read the four integer parameters from a binary param stream, in this order:
+// num_output, input_dim, bias_term, weight_data_size.
+// Returns 0 on success, -1 when the stream ends early or a read error occurs.
+int Embed::load_param_bin(FILE* paramfp)
+{
+    // the four fields are stored back to back, so a single checked read covers all of them
+    int fields[4];
+    size_t nread = fread(fields, sizeof(int), 4, paramfp);
+    if (nread != 4)
+    {
+        // a short read must not be reported as success: the parameters would stay uninitialized
+        fprintf(stderr, "Embed load_param_bin failed %d\n", (int)nread);
+        return -1;
+    }
+
+    num_output = fields[0];
+    input_dim = fields[1];
+    bias_term = fields[2];
+    weight_data_size = fields[3];
+
+    return 0;
+}
+
+// Load the embedding table (and the bias when bias_term is non-zero) from a binary model stream.
+// A 4-byte header selects the weight encoding: a non-zero byte sum means a 256-entry float
+// lookup table followed by one byte index per weight; an all-zero header means raw floats.
+// Returns 0 on success, -1 on a failed read, -100 when an allocation comes back empty.
+// NOTE(review): unlike Deconvolution::load_model, the index array is read without padding the
+// byte count to a multiple of 4 - confirm this matches the model writer.
+int Embed::load_model(FILE* binfp)
+{
+    struct
+    {
+        unsigned char f0;
+        unsigned char f1;
+        unsigned char f2;
+        unsigned char f3;
+    } flag_struct;
+
+    int nitems = fread(&flag_struct, sizeof(flag_struct), 1, binfp);
+    if (nitems != 1)
+    {
+        fprintf(stderr, "Embed read flag_struct failed %d\n", nitems);
+        return -1;
+    }
+
+    const unsigned int byte_sum = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;
+
+    weight_data.create(weight_data_size);
+    if (weight_data.empty())
+        return -100;
+
+    if (byte_sum != 0)
+    {
+        // quantized weights: lookup table first
+        float table[256];
+        nitems = fread(table, 256 * sizeof(float), 1, binfp);
+        if (nitems != 1)
+        {
+            fprintf(stderr, "Embed read quantization_value failed %d\n", nitems);
+            return -1;
+        }
+
+        // then one byte index per weight
+        std::vector<unsigned char> indices(weight_data_size);
+        nitems = fread(indices.data(), weight_data_size * sizeof(unsigned char), 1, binfp);
+        if (nitems != 1)
+        {
+            fprintf(stderr, "Embed read index_array failed %d\n", nitems);
+            return -1;
+        }
+
+        // expand every index through the lookup table
+        float* dst = weight_data;
+        for (int n = 0; n < weight_data_size; ++n)
+            dst[n] = table[ indices[n] ];
+    }
+    else if (flag_struct.f0 == 0)
+    {
+        // raw float weights
+        nitems = fread(weight_data, weight_data_size * sizeof(float), 1, binfp);
+        if (nitems != 1)
+        {
+            fprintf(stderr, "Embed read weight_data failed %d\n", nitems);
+            return -1;
+        }
+    }
+
+    if (!bias_term)
+        return 0;
+
+    // one float of bias per output element
+    bias_data.create(num_output);
+    if (bias_data.empty())
+        return -100;
+
+    nitems = fread(bias_data, num_output * sizeof(float), 1, binfp);
+    if (nitems != 1)
+    {
+        fprintf(stderr, "Embed read bias_data failed %d\n", nitems);
+        return -1;
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+// Read the four integer parameters from a memory buffer and advance mem past them.
+// Field order matches the FILE-based loaders: num_output, input_dim, bias_term, weight_data_size.
+// Always returns 0.
+int Embed::load_param(const unsigned char*& mem)
+{
+    int* const fields[4] = { &num_output, &input_dim, &bias_term, &weight_data_size };
+
+    for (int i = 0; i < 4; i++)
+    {
+        // memcpy instead of *(int*)mem: no const is cast away and an unaligned buffer stays well defined
+        memcpy(fields[i], mem, sizeof(int));
+        mem += 4;
+    }
+
+    return 0;
+}
+
+// Load the embedding table (and the bias when bias_term is non-zero) from a memory buffer,
+// advancing mem past everything consumed. The 4-byte header selects the encoding exactly as in
+// the FILE-based loader. Raw weights and bias are wrapped with Mat(size, pointer) on the caller's buffer.
+// Returns 0 on success, -100 when the weight allocation comes back empty.
+int Embed::load_model(const unsigned char*& mem)
+{
+    struct
+    {
+        unsigned char f0;
+        unsigned char f1;
+        unsigned char f2;
+        unsigned char f3;
+    } flag_struct;
+
+    memcpy(&flag_struct, mem, sizeof(flag_struct));
+    mem += sizeof(flag_struct);
+
+    const unsigned int byte_sum = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;
+
+    if (byte_sum != 0)
+    {
+        // quantized weights: 256 floats of lookup table, then one byte index per weight
+        const float* table = (const float*)mem;
+        mem += 256 * sizeof(float);
+
+        const unsigned char* indices = (const unsigned char*)mem;
+        mem += weight_data_size * sizeof(unsigned char);
+
+        weight_data.create(weight_data_size);
+        if (weight_data.empty())
+            return -100;
+
+        float* dst = weight_data;
+        for (int n = 0; n < weight_data_size; ++n)
+            dst[n] = table[ indices[n] ];
+    }
+    else if (flag_struct.f0 == 0)
+    {
+        // raw float weights
+        float* raw = (float*)mem;
+        mem += weight_data_size * sizeof(float);
+        weight_data = Mat(weight_data_size, raw);
+    }
+
+    if (bias_term)
+    {
+        // one float of bias per output element
+        float* bias_src = (float*)mem;
+        mem += num_output * sizeof(float);
+        bias_data = Mat(num_output, bias_src);
+    }
+
+    return 0;
+}
+
+// Embedding lookup: every element of the input is truncated to an integer word index and
+// replaced by the matching row of num_output floats from weight_data, plus bias_data when
+// bias_term is non-zero. The output is created as num_output x words x 1.
+// Out-of-range indices are clamped into [0, input_dim - 1] instead of reading outside the table.
+// Returns 0 on success, -100 when the output allocation comes back empty.
+int Embed::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    int words = bottom_blob.total();
+
+    top_blob.create(num_output, words, 1);
+    if (top_blob.empty())
+        return -100;
+
+    // num_output
+    const float* word_ptr = bottom_blob;
+    const float* dict_ptr = weight_data;
+    #pragma omp parallel for
+    for (int q=0; q<words; q++)
+    {
+        // output row q
+        float* outptr = top_blob.data + top_blob.w * q;
+
+        int word_index = (int)word_ptr[q];
+
+        // keep the lookup inside the dictionary; the upper bound is only applied when
+        // input_dim carries a usable value
+        if (word_index < 0)
+            word_index = 0;
+        else if (input_dim > 0 && word_index >= input_dim)
+            word_index = input_dim - 1;
+
+        const float* em = dict_ptr + num_output * word_index;
+
+        memcpy(outptr, em, num_output * sizeof(float));
+
+        if (bias_term)
+        {
+            for (int p=0; p<num_output; p++)
+            {
+                outptr[p] += bias_data.data[p];
+            }
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/embed.h b/src/layer/embed.h
new file mode 100644
index 00000000000..9d5e1fd0700
--- /dev/null
+++ b/src/layer/embed.h
@@ -0,0 +1,55 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_EMBED_H
+#define LAYER_EMBED_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// Embedding lookup layer: maps each input word index to a row of
+// num_output floats taken from weight_data, optionally adding bias_data.
+class Embed : public Layer
+{
+public:
+    Embed();
+    virtual ~Embed();
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // read the layer parameters from a text param stream
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    // read the layer parameters from a binary param stream
+    virtual int load_param_bin(FILE* paramfp);
+    // read weight (and optional bias) data from a model stream
+    virtual int load_model(FILE* binfp);
+#endif // NCNN_STDIO
+    // in-memory variants: mem is a cursor that is advanced past the bytes consumed
+    virtual int load_param(const unsigned char*& mem);
+    virtual int load_model(const unsigned char*& mem);
+
+    // produce one num_output-wide row per input word
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+public:
+    // param
+    // number of floats in each embedding row
+    int num_output;
+    // number of rows in the table (upper bound for a word index, per the
+    // note in forward)
+    int input_dim;
+    // non-zero when bias_data is loaded and added to every output row
+    int bias_term;
+
+    // total number of weight elements stored in weight_data
+    int weight_data_size;
+
+    // model
+    Mat weight_data;
+    Mat bias_data;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_EMBED_H
diff --git a/src/layer/exp.cpp b/src/layer/exp.cpp
new file mode 100644
index 00000000000..15b269551ba
--- /dev/null
+++ b/src/layer/exp.cpp
@@ -0,0 +1,148 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "exp.h"
+#include <math.h>
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Exp)
+
+// Exp consumes a single blob and can also run in place
+// (see forward_inplace).
+Exp::Exp()
+{
+    support_inplace = true;
+    one_blob_only = true;
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+// Parse "base scale shift" from a text param stream.
+// Returns 0 on success, -1 (after logging to stderr) when fewer than
+// three values could be parsed.
+int Exp::load_param(FILE* paramfp)
+{
+    const int nscan = fscanf(paramfp, "%f %f %f", &base, &scale, &shift);
+    if (nscan == 3)
+        return 0;
+
+    fprintf(stderr, "Exp load_param failed %d\n", nscan);
+    return -1;
+}
+#endif // NCNN_STRING
+// Read base, scale and shift (one float each, in that order) from a
+// binary param stream.
+// Returns 0 on success, -1 (after logging to stderr) when the stream ends
+// early or a read fails, so truncated files are no longer accepted with
+// uninitialized parameters.
+int Exp::load_param_bin(FILE* paramfp)
+{
+    // || short-circuits left to right, so the reads stay in file order
+    if (fread(&base, sizeof(float), 1, paramfp) != 1
+            || fread(&scale, sizeof(float), 1, paramfp) != 1
+            || fread(&shift, sizeof(float), 1, paramfp) != 1)
+    {
+        fprintf(stderr, "Exp load_param_bin failed\n");
+        return -1;
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+// Read base, scale and shift from an in-memory param buffer and advance
+// the cursor past the three 4-byte values consumed.
+int Exp::load_param(const unsigned char*& mem)
+{
+    float* values = (float*)(mem);
+
+    base = values[0];
+    scale = values[1];
+    shift = values[2];
+
+    mem += 3 * 4;
+
+    return 0;
+}
+
+// Element-wise exponential into a new blob of the same w/h/c:
+//   base == -1  ->  exp(shift + x * scale)
+//   otherwise   ->  pow(base, shift + x * scale)
+// Returns 0 on success, -100 when top_blob cannot be allocated.
+int Exp::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    const int channels = bottom_blob.c;
+    const int plane = bottom_blob.w * bottom_blob.h;
+
+    top_blob.create(bottom_blob.w, bottom_blob.h, channels);
+    if (top_blob.empty())
+        return -100;
+
+    if (base == -1.f)
+    {
+        // natural exponential
+        #pragma omp parallel for
+        for (int q = 0; q < channels; q++)
+        {
+            const float* src = bottom_blob.channel(q);
+            float* dst = top_blob.channel(q);
+
+            for (int k = 0; k < plane; k++)
+                dst[k] = exp(shift + src[k] * scale);
+        }
+
+        return 0;
+    }
+
+    // explicit base
+    #pragma omp parallel for
+    for (int q = 0; q < channels; q++)
+    {
+        const float* src = bottom_blob.channel(q);
+        float* dst = top_blob.channel(q);
+
+        for (int k = 0; k < plane; k++)
+            dst[k] = pow(base, (shift + src[k] * scale));
+    }
+
+    return 0;
+}
+
+// In-place variant of forward: every element x of the blob is replaced by
+// exp(shift + x * scale) when base == -1, else pow(base, shift + x * scale).
+// Always returns 0.
+int Exp::forward_inplace(Mat& bottom_top_blob) const
+{
+    const int channels = bottom_top_blob.c;
+    const int plane = bottom_top_blob.w * bottom_top_blob.h;
+
+    if (base == -1.f)
+    {
+        // natural exponential
+        #pragma omp parallel for
+        for (int q = 0; q < channels; q++)
+        {
+            float* data = bottom_top_blob.channel(q);
+
+            for (int k = 0; k < plane; k++)
+                data[k] = exp(shift + data[k] * scale);
+        }
+
+        return 0;
+    }
+
+    // explicit base
+    #pragma omp parallel for
+    for (int q = 0; q < channels; q++)
+    {
+        float* data = bottom_top_blob.channel(q);
+
+        for (int k = 0; k < plane; k++)
+            data[k] = pow(base, (shift + data[k] * scale));
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/exp.h b/src/layer/exp.h
new file mode 100644
index 00000000000..685edf308e7
--- /dev/null
+++ b/src/layer/exp.h
@@ -0,0 +1,47 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_EXP_H
+#define LAYER_EXP_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// Element-wise exponential layer: y = base ^ (shift + x * scale), where
+// base == -1 selects the natural exponential exp(shift + x * scale).
+class Exp : public Layer
+{
+public:
+    Exp();
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // read "base scale shift" from a text param stream
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    // read base, scale, shift as three floats from a binary param stream
+    virtual int load_param_bin(FILE* paramfp);
+#endif // NCNN_STDIO
+    // read the same three floats from memory, advancing mem past them
+    virtual int load_param(const unsigned char*& mem);
+
+    // write the result into a newly created top_blob
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+    // overwrite the blob with the result
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+
+public:
+    // base of the power; -1 means use exp()
+    float base;
+    // multiplier applied to each input element
+    float scale;
+    // offset added after scaling
+    float shift;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_EXP_H
diff --git a/src/layer/flatten.cpp b/src/layer/flatten.cpp
new file mode 100644
index 00000000000..7418c9b3808
--- /dev/null
+++ b/src/layer/flatten.cpp
@@ -0,0 +1,53 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "flatten.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Flatten)
+
+// Flatten takes a single blob and always writes to a separate output
+// (no in-place execution).
+Flatten::Flatten()
+{
+    support_inplace = false;
+    one_blob_only = true;
+}
+
+// Copy every channel of bottom_blob back to back into a one-dimensional
+// top_blob of w * h * c elements (channel q lands at offset w * h * q).
+// Returns 0 on success, -100 when top_blob cannot be allocated.
+int Flatten::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    const int channels = bottom_blob.c;
+    const int plane = bottom_blob.w * bottom_blob.h;
+
+    top_blob.create(plane * channels);
+    if (top_blob.empty())
+        return -100;
+
+    #pragma omp parallel for
+    for (int q = 0; q < channels; q++)
+    {
+        const float* src = bottom_blob.channel(q);
+        float* dst = top_blob.data + plane * q;
+
+        for (int remaining = plane; remaining > 0; remaining--)
+        {
+            *dst++ = *src++;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/flatten.h b/src/layer/flatten.h
new file mode 100644
index 00000000000..10c5fe26083
--- /dev/null
+++ b/src/layer/flatten.h
@@ -0,0 +1,32 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_FLATTEN_H
+#define LAYER_FLATTEN_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// Layer that reshapes a (w, h, c) blob into a one-dimensional blob of
+// w * h * c elements, channel after channel. It has no parameters.
+class Flatten : public Layer
+{
+public:
+    Flatten();
+
+    // copy bottom_blob into a flat, newly created top_blob
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_FLATTEN_H
diff --git a/src/layer/innerproduct.cpp b/src/layer/innerproduct.cpp
new file mode 100644
index 00000000000..87936a26a8d
--- /dev/null
+++ b/src/layer/innerproduct.cpp
@@ -0,0 +1,273 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "innerproduct.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(InnerProduct)
+
+// InnerProduct takes a single blob and writes to a separate output
+// (no in-place execution).
+InnerProduct::InnerProduct()
+{
+    support_inplace = false;
+    one_blob_only = true;
+}
+
+// Nothing to release explicitly here; the destructor body is empty.
+InnerProduct::~InnerProduct() {}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+// Parse "num_output bias_term weight_data_size" from a text param stream.
+// Returns 0 on success, -1 (after logging to stderr) when fewer than
+// three values could be parsed.
+int InnerProduct::load_param(FILE* paramfp)
+{
+    const int nscan = fscanf(paramfp, "%d %d %d",
+                             &num_output, &bias_term, &weight_data_size);
+    if (nscan == 3)
+        return 0;
+
+    fprintf(stderr, "InnerProduct load_param failed %d\n", nscan);
+    return -1;
+}
+#endif // NCNN_STRING
+// Read num_output, bias_term and weight_data_size (one int each, in that
+// order) from a binary param stream.
+// Returns 0 on success, -1 (after logging to stderr) when the stream ends
+// early or a read fails, so truncated files are no longer accepted with
+// uninitialized parameters.
+int InnerProduct::load_param_bin(FILE* paramfp)
+{
+    // || short-circuits left to right, so the reads stay in file order
+    if (fread(&num_output, sizeof(int), 1, paramfp) != 1
+            || fread(&bias_term, sizeof(int), 1, paramfp) != 1
+            || fread(&weight_data_size, sizeof(int), 1, paramfp) != 1)
+    {
+        fprintf(stderr, "InnerProduct load_param_bin failed\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+// Load the weight matrix and optional bias vector from a model stream.
+//
+// The stream starts with a 4-byte flag that selects the weight encoding:
+//   tag == 0x01306B47    half-precision weights, padded to a 4-byte boundary
+//   any flag byte != 0   256-entry float table followed by one byte index
+//                        per weight, padded to a 4-byte boundary
+//   all flag bytes == 0  raw float weights
+// When bias_term is set, num_output raw floats follow.
+//
+// Returns 0 on success, -1 on a short read (logged to stderr), -100 when a
+// Mat cannot be allocated.
+int InnerProduct::load_model(FILE* binfp)
+{
+    int nread;
+
+    union
+    {
+        struct
+        {
+            unsigned char f0;
+            unsigned char f1;
+            unsigned char f2;
+            unsigned char f3;
+        };
+        unsigned int tag;
+    } flag_struct;
+
+    nread = fread(&flag_struct, sizeof(flag_struct), 1, binfp);
+    if (nread != 1)
+    {
+        fprintf(stderr, "InnerProduct read flag_struct failed %d\n", nread);
+        return -1;
+    }
+
+    unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;
+
+    if (flag_struct.tag == 0x01306B47)
+    {
+        // half-precision weight data
+        // align_weight_data_size is a byte count; the staging vector holds
+        // unsigned short, so size it in elements rather than bytes
+        int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned short), 4);
+        std::vector<unsigned short> float16_weights;
+        float16_weights.resize(align_weight_data_size / sizeof(unsigned short));
+        nread = fread(float16_weights.data(), align_weight_data_size, 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "InnerProduct read float16_weights failed %d\n", nread);
+            return -1;
+        }
+
+        // from_float16 produces the Mat itself, so nothing is pre-allocated
+        weight_data = Mat::from_float16(float16_weights.data(), weight_data_size);
+        if (weight_data.empty())
+            return -100;
+    }
+    else if (flag != 0)
+    {
+        // quantized weight data
+        weight_data.create(weight_data_size);
+        if (weight_data.empty())
+            return -100;
+
+        float quantization_value[256];
+        nread = fread(quantization_value, 256 * sizeof(float), 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "InnerProduct read quantization_value failed %d\n", nread);
+            return -1;
+        }
+
+        int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned char), 4);
+        std::vector<unsigned char> index_array;
+        index_array.resize(align_weight_data_size);
+        nread = fread(index_array.data(), align_weight_data_size, 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "InnerProduct read index_array failed %d\n", nread);
+            return -1;
+        }
+
+        // expand each byte index through the lookup table
+        float* weight_data_ptr = weight_data;
+        for (int i = 0; i < weight_data_size; i++)
+        {
+            weight_data_ptr[i] = quantization_value[ index_array[i] ];
+        }
+    }
+    else if (flag_struct.f0 == 0)
+    {
+        // raw weight data
+        weight_data.create(weight_data_size);
+        if (weight_data.empty())
+            return -100;
+
+        nread = fread(weight_data, weight_data_size * sizeof(float), 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "InnerProduct read weight_data failed %d\n", nread);
+            return -1;
+        }
+    }
+
+    if (bias_term)
+    {
+        bias_data.create(num_output);
+        if (bias_data.empty())
+            return -100;
+        nread = fread(bias_data, num_output * sizeof(float), 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "InnerProduct read bias_data failed %d\n", nread);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+// Read num_output, bias_term and weight_data_size from an in-memory param
+// buffer and advance the cursor past the three 4-byte values consumed.
+int InnerProduct::load_param(const unsigned char*& mem)
+{
+    int* values = (int*)(mem);
+
+    num_output = values[0];
+    bias_term = values[1];
+    weight_data_size = values[2];
+
+    mem += 3 * 4;
+
+    return 0;
+}
+
+// Load the weight matrix and optional bias vector from an in-memory model
+// buffer. mem is a cursor: every section read below advances it, so the
+// statement order mirrors the on-disk layout.
+// Returns 0 on success, -100 when a Mat cannot be allocated.
+int InnerProduct::load_model(const unsigned char*& mem)
+{
+    // 4-byte header selecting the weight encoding; viewed both as four
+    // separate bytes and as one 32-bit tag
+    union
+    {
+        struct
+        {
+            unsigned char f0;
+            unsigned char f1;
+            unsigned char f2;
+            unsigned char f3;
+        };
+        unsigned int tag;
+    } flag_struct;
+
+    memcpy(&flag_struct, mem, sizeof(flag_struct));
+    mem += sizeof(flag_struct);
+
+    // non-zero as soon as any of the four flag bytes is set
+    unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;
+
+    if (flag_struct.tag == 0x01306B47)
+    {
+        // half-precision weight data
+        weight_data = Mat::from_float16((unsigned short*)mem, weight_data_size);
+        // skip the fp16 payload; alignSize(..., 4) presumably rounds the
+        // byte count up to a multiple of 4 - confirm against its definition
+        mem += alignSize(weight_data_size * sizeof(unsigned short), 4);
+        if (weight_data.empty())
+            return -100;
+    }
+    else if (flag != 0)
+    {
+        // quantized weight data
+        // 256-entry float lookup table, read directly from the buffer
+        const float* quantization_value = (const float*)mem;
+        mem += 256 * sizeof(float);
+
+        // one byte index per weight, followed by alignment padding
+        const unsigned char* index_array = (const unsigned char*)mem;
+        mem += alignSize(weight_data_size * sizeof(unsigned char), 4);
+
+        weight_data.create(weight_data_size);
+        if (weight_data.empty())
+            return -100;
+        // expand each index through the lookup table
+        float* weight_data_ptr = weight_data;
+        for (int i = 0; i < weight_data_size; i++)
+        {
+            weight_data_ptr[i] = quantization_value[ index_array[i] ];
+        }
+    }
+    else if (flag_struct.f0 == 0)
+    {
+        // raw weight data
+        // NOTE(review): this Mat constructor presumably wraps the caller's
+        // buffer without copying, so mem would have to outlive the layer -
+        // confirm against Mat
+        weight_data = Mat(weight_data_size, (float*)mem);
+        mem += weight_data_size * sizeof(float);
+    }
+
+    if (bias_term)
+    {
+        // num_output raw floats follow the weights
+        bias_data = Mat(num_output, (float*)mem);
+        mem += num_output * sizeof(float);
+    }
+
+    return 0;
+}
+
+// Fully connected layer: for each output p, compute the dot product of the
+// whole input (all channels, w * h elements each) with the p-th block of
+// w * h * c weights, starting from bias_data[p] when bias_term is set.
+// top_blob is created as (1, 1, num_output).
+// Returns 0 on success, -100 when top_blob cannot be allocated.
+int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    const int channels = bottom_blob.c;
+    const int plane = bottom_blob.w * bottom_blob.h;
+
+    top_blob.create(1, 1, num_output);
+    if (top_blob.empty())
+        return -100;
+
+    const float* weights = weight_data;
+    #pragma omp parallel for
+    for (int p = 0; p < num_output; p++)
+    {
+        float* outptr = top_blob.channel(p);
+
+        float acc = 0.f;
+        if (bias_term)
+            acc = bias_data.data[p];
+
+        // weights of output p are stored channel after channel
+        const float* kernel = weights + plane * channels * p;
+        for (int q = 0; q < channels; q++)
+        {
+            const float* in = bottom_blob.channel(q);
+
+            for (int k = 0; k < plane; k++)
+            {
+                acc += in[k] * kernel[k];
+            }
+
+            kernel += plane;
+        }
+
+        outptr[0] = acc;
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/innerproduct.h b/src/layer/innerproduct.h
new file mode 100644
index 00000000000..9bcbe714fb8
--- /dev/null
+++ b/src/layer/innerproduct.h
@@ -0,0 +1,54 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_INNERPRODUCT_H
+#define LAYER_INNERPRODUCT_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// Fully connected layer: each of the num_output outputs is the dot product
+// of the whole input blob with one block of weights, plus an optional bias.
+class InnerProduct : public Layer
+{
+public:
+    InnerProduct();
+    virtual ~InnerProduct();
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // read "num_output bias_term weight_data_size" from a text param stream
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    // read the same three ints from a binary param stream
+    virtual int load_param_bin(FILE* paramfp);
+    // read weights (fp16, quantized or raw float) and optional bias from a model stream
+    virtual int load_model(FILE* binfp);
+#endif // NCNN_STDIO
+    // in-memory variants: mem is a cursor that is advanced past the bytes consumed
+    virtual int load_param(const unsigned char*& mem);
+    virtual int load_model(const unsigned char*& mem);
+
+    // compute the num_output dot products into a (1, 1, num_output) top_blob
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+public:
+    // param
+    // number of outputs produced by forward
+    int num_output;
+    // non-zero when bias_data is loaded and used as the initial sum
+    int bias_term;
+
+    // total number of weight elements stored in weight_data
+    int weight_data_size;
+
+    // model
+    Mat weight_data;
+    Mat bias_data;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_INNERPRODUCT_H
diff --git a/src/layer/input.cpp b/src/layer/input.cpp
new file mode 100644
index 00000000000..e45eb016e0c
--- /dev/null
+++ b/src/layer/input.cpp
@@ -0,0 +1,78 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "input.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Input)
+
+// Input takes a single blob and supports in-place execution.
+Input::Input()
+{
+    support_inplace = true;
+    one_blob_only = true;
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+// Parse the three size values from a text param stream.
+// Returns 0 on success, -1 (after logging to stderr) when fewer than
+// three values could be parsed.
+int Input::load_param(FILE* paramfp)
+{
+    const int nscan = fscanf(paramfp, "%d %d %d",
+                             &size[0], &size[1], &size[2]);
+    if (nscan == 3)
+        return 0;
+
+    fprintf(stderr, "Input load_param failed %d\n", nscan);
+    return -1;
+}
+#endif // NCNN_STRING
+// Read size[0], size[1] and size[2] (one int each, in that order) from a
+// binary param stream.
+// Returns 0 on success, -1 (after logging to stderr) when the stream ends
+// early or a read fails, so truncated files are no longer accepted with
+// uninitialized sizes.
+int Input::load_param_bin(FILE* paramfp)
+{
+    for (int i = 0; i < 3; i++)
+    {
+        if (fread(&size[i], sizeof(int), 1, paramfp) != 1)
+        {
+            fprintf(stderr, "Input load_param_bin failed %d\n", i);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+// Read the three size values from an in-memory param buffer and advance
+// the cursor past the three 4-byte values consumed.
+int Input::load_param(const unsigned char*& mem)
+{
+    int* values = (int*)(mem);
+
+    for (int i = 0; i < 3; i++)
+    {
+        size[i] = values[i];
+    }
+
+    mem += 3 * 4;
+
+    return 0;
+}
+
+// No-op: neither blob is touched. Always returns 0.
+int Input::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    (void)bottom_blob;
+    (void)top_blob;
+
+    return 0;
+}
+
+// No-op: the blob is left exactly as it is. Always returns 0.
+int Input::forward_inplace(Mat& bottom_top_blob) const
+{
+    (void)bottom_top_blob;
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/input.h b/src/layer/input.h
new file mode 100644
index 00000000000..b31db4727d3
--- /dev/null
+++ b/src/layer/input.h
@@ -0,0 +1,45 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_INPUT_H
+#define LAYER_INPUT_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// Placeholder layer for a network input: it only stores three size values
+// read from the param file; both forward variants do nothing.
+class Input : public Layer
+{
+public:
+    Input();
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // read the three size values from a text param stream
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    // read the three size values from a binary param stream
+    virtual int load_param_bin(FILE* paramfp);
+#endif // NCNN_STDIO
+    // read the three size values from memory, advancing mem past them
+    virtual int load_param(const unsigned char*& mem);
+
+    // no-op, returns 0
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+    // no-op, returns 0
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+
+public:
+    // three ints in param-file order; presumably the input blob's
+    // dimensions - TODO confirm which axis each entry denotes
+    int size[3];
+};
+
+} // namespace ncnn
+
+#endif // LAYER_INPUT_H
diff --git a/src/layer/log.cpp b/src/layer/log.cpp
new file mode 100644
index 00000000000..7ae7a05a579
--- /dev/null
+++ b/src/layer/log.cpp
@@ -0,0 +1,152 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "log.h"
+#include <math.h>
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Log)
+
+// Log consumes a single blob and can also run in place
+// (see forward_inplace).
+Log::Log()
+{
+    support_inplace = true;
+    one_blob_only = true;
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+// Parse "base scale shift" from a text param stream.
+// Returns 0 on success, -1 (after logging to stderr) when fewer than
+// three values could be parsed.
+int Log::load_param(FILE* paramfp)
+{
+    const int nscan = fscanf(paramfp, "%f %f %f", &base, &scale, &shift);
+    if (nscan == 3)
+        return 0;
+
+    fprintf(stderr, "Log load_param failed %d\n", nscan);
+    return -1;
+}
+#endif // NCNN_STRING
+// Read base, scale and shift (one float each, in that order) from a
+// binary param stream.
+// Returns 0 on success, -1 (after logging to stderr) when the stream ends
+// early or a read fails, so truncated files are no longer accepted with
+// uninitialized parameters.
+int Log::load_param_bin(FILE* paramfp)
+{
+    // || short-circuits left to right, so the reads stay in file order
+    if (fread(&base, sizeof(float), 1, paramfp) != 1
+            || fread(&scale, sizeof(float), 1, paramfp) != 1
+            || fread(&shift, sizeof(float), 1, paramfp) != 1)
+    {
+        fprintf(stderr, "Log load_param_bin failed\n");
+        return -1;
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+// Read base, scale and shift from an in-memory param buffer and advance
+// the cursor past the three 4-byte values consumed.
+int Log::load_param(const unsigned char*& mem)
+{
+    float* values = (float*)(mem);
+
+    base = values[0];
+    scale = values[1];
+    shift = values[2];
+
+    mem += 3 * 4;
+
+    return 0;
+}
+
+// Element-wise logarithm into a new blob of the same w/h/c:
+//   base == -1  ->  log(shift + x * scale)
+//   otherwise   ->  log(shift + x * scale) / log(base)
+// Returns 0 on success, -100 when top_blob cannot be allocated.
+int Log::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    const int channels = bottom_blob.c;
+    const int plane = bottom_blob.w * bottom_blob.h;
+
+    top_blob.create(bottom_blob.w, bottom_blob.h, channels);
+    if (top_blob.empty())
+        return -100;
+
+    if (base == -1.f)
+    {
+        // natural logarithm
+        #pragma omp parallel for
+        for (int q = 0; q < channels; q++)
+        {
+            const float* src = bottom_blob.channel(q);
+            float* dst = top_blob.channel(q);
+
+            for (int k = 0; k < plane; k++)
+                dst[k] = log(shift + src[k] * scale);
+        }
+
+        return 0;
+    }
+
+    // change of base: multiply by 1 / log(base), computed once
+    float log_base_inv = 1.f / log(base);
+
+    #pragma omp parallel for
+    for (int q = 0; q < channels; q++)
+    {
+        const float* src = bottom_blob.channel(q);
+        float* dst = top_blob.channel(q);
+
+        for (int k = 0; k < plane; k++)
+            dst[k] = log(shift + src[k] * scale) * log_base_inv;
+    }
+
+    return 0;
+}
+
+// In-place variant of forward: every element x of the blob is replaced by
+// log(shift + x * scale), divided by log(base) unless base == -1.
+// Always returns 0.
+int Log::forward_inplace(Mat& bottom_top_blob) const
+{
+    const int channels = bottom_top_blob.c;
+    const int plane = bottom_top_blob.w * bottom_top_blob.h;
+
+    if (base == -1.f)
+    {
+        // natural logarithm
+        #pragma omp parallel for
+        for (int q = 0; q < channels; q++)
+        {
+            float* data = bottom_top_blob.channel(q);
+
+            for (int k = 0; k < plane; k++)
+                data[k] = log(shift + data[k] * scale);
+        }
+
+        return 0;
+    }
+
+    // change of base: multiply by 1 / log(base), computed once
+    float log_base_inv = 1.f / log(base);
+
+    #pragma omp parallel for
+    for (int q = 0; q < channels; q++)
+    {
+        float* data = bottom_top_blob.channel(q);
+
+        for (int k = 0; k < plane; k++)
+            data[k] = log(shift + data[k] * scale) * log_base_inv;
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/log.h b/src/layer/log.h
new file mode 100644
index 00000000000..eacbd193e27
--- /dev/null
+++ b/src/layer/log.h
@@ -0,0 +1,47 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_LOG_H
+#define LAYER_LOG_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// Element-wise logarithm layer: y = log(shift + x * scale) / log(base),
+// where base == -1 selects the natural logarithm (no division).
+class Log : public Layer
+{
+public:
+    Log();
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // read "base scale shift" from a text param stream
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    // read base, scale, shift as three floats from a binary param stream
+    virtual int load_param_bin(FILE* paramfp);
+#endif // NCNN_STDIO
+    // read the same three floats from memory, advancing mem past them
+    virtual int load_param(const unsigned char*& mem);
+
+    // write the result into a newly created top_blob
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+    // overwrite the blob with the result
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+
+public:
+    // base of the logarithm; -1 means use log() directly
+    float base;
+    // multiplier applied to each input element
+    float scale;
+    // offset added after scaling
+    float shift;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_LOG_H
diff --git a/src/layer/lrn.cpp b/src/layer/lrn.cpp
new file mode 100644
index 00000000000..8268050c91d
--- /dev/null
+++ b/src/layer/lrn.cpp
@@ -0,0 +1,330 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "lrn.h"
+#include <math.h>
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(LRN)
+
+LRN::LRN()
+{
+    // Both flags are inherited members; the LRN class declares neither itself.
+    this->support_inplace = true;
+    this->one_blob_only = true;
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+int LRN::load_param(FILE* paramfp)
+{
+    // Text form: two integers (region type, window size) then two floats (alpha, beta).
+    const int nscan = fscanf(paramfp, "%d %d %f %f",
+                             &region_type, &local_size, &alpha, &beta);
+
+    // fscanf reports how many fields it converted; all four are required.
+    if (nscan == 4)
+        return 0;
+
+    fprintf(stderr, "LRN load_param failed %d\n", nscan);
+    return -1;
+}
+#endif // NCNN_STRING
+// Reads region_type, local_size, alpha and beta (in that order) as raw
+// 4-byte values from paramfp.
+// Returns 0 on success and -1 when the stream ends or fails before all four
+// values were read, so a truncated file no longer leaves fields uninitialised
+// while reporting success.
+int LRN::load_param_bin(FILE* paramfp)
+{
+    // || short-circuits, so the reads happen strictly in file order and stop
+    // at the first one that does not deliver its single item
+    if (fread(&region_type, sizeof(int), 1, paramfp) != 1
+            || fread(&local_size, sizeof(int), 1, paramfp) != 1
+            || fread(&alpha, sizeof(float), 1, paramfp) != 1
+            || fread(&beta, sizeof(float), 1, paramfp) != 1)
+    {
+        fprintf(stderr, "LRN load_param_bin failed\n");
+        return -1;
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+int LRN::load_param(const unsigned char*& mem)
+{
+    // Four consecutive 4-byte fields: two ints followed by two floats.
+    region_type = *(const int*)(mem + 0);
+    local_size = *(const int*)(mem + 4);
+    alpha = *(const float*)(mem + 8);
+    beta = *(const float*)(mem + 12);
+
+    // leave the caller's cursor just past the 16 bytes consumed
+    mem += 16;
+
+    return 0;
+}
+
+// Local response normalization of bottom_blob into a newly created top_blob
+// of the same w/h/c:
+//
+//   out = in * (1 + alpha / n * sum(in^2 over the local region)) ^ -beta
+//
+// NormRegion_ACROSS_CHANNELS: the region is local_size neighbouring channels
+//   at the same position, n = local_size.
+// NormRegion_WITHIN_CHANNEL: the region is a local_size x local_size spatial
+//   window inside the channel (zero padded at the borders),
+//   n = local_size * local_size.
+//
+// Returns 0 on success and -100 when a blob is still empty after creation.
+// For any other region_type top_blob is created but not written.
+int LRN::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    int size = w * h;
+
+    top_blob.create(w, h, channels);
+    if (top_blob.empty())
+        return -100;
+
+    // element-wise squares of the input, shared by both region types
+    Mat square_blob;
+    square_blob.create(w, h, channels);
+    if (square_blob.empty())
+        return -100;
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float* ptr = bottom_blob.channel(q);
+        float* outptr = square_blob.channel(q);
+
+        for (int i=0; i<size; i++)
+        {
+            outptr[i] = ptr[i] * ptr[i];
+        }
+    }
+
+    if (region_type == NormRegion_ACROSS_CHANNELS)
+    {
+        const float alpha_div_size = alpha / local_size;
+
+        // channels in front of q that belong to its window
+        const int pad = local_size / 2;
+
+        // top_blob doubles as the accumulator for the square sums
+        top_blob.fill(0.f);
+
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            // square sum over exactly local_size channels around q;
+            // the window is [q - pad, q - pad + local_size), clipped to the
+            // valid channel range (out-of-range channels contribute zero)
+            float* outptr = top_blob.channel(q);
+            const int p_begin = q - pad;
+            const int p_end = p_begin + local_size;
+            for (int p=p_begin; p<p_end; p++)
+            {
+                if (p < 0 || p >= channels)
+                    continue;
+
+                const float* sptr = square_blob.channel(p);
+                for (int i=0; i<size; i++)
+                {
+                    outptr[i] += sptr[i];
+                }
+            }
+
+            const float* ptr = bottom_blob.channel(q);
+            for (int i=0; i<size; i++)
+            {
+                outptr[i] = ptr[i] * pow(1.f + alpha_div_size * outptr[i], -beta);
+            }
+        }
+    }
+    else if (region_type == NormRegion_WITHIN_CHANNEL)
+    {
+        int outw = w;
+        int outh = h;
+
+        // zero pad the squares so every output position has a full window
+        Mat square_blob_bordered = square_blob;
+        int pad = local_size / 2;
+        if (pad > 0)
+        {
+            copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f);
+            if (square_blob_bordered.empty())
+                return -100;
+
+            w = square_blob_bordered.w;
+            h = square_blob_bordered.h;
+        }
+
+        const int maxk = local_size * local_size;
+
+        // the window holds maxk values, so that is the region size alpha is
+        // averaged over
+        const float alpha_div_size = alpha / maxk;
+
+        // norm window offsets, relative to the window's top-left element in
+        // the bordered blob
+        std::vector<int> _space_ofs(maxk);
+        int* space_ofs = &_space_ofs[0];
+        {
+            int p1 = 0;
+            int p2 = 0;
+            int gap = w - local_size;
+            for (int i = 0; i < local_size; i++)
+            {
+                for (int j = 0; j < local_size; j++)
+                {
+                    space_ofs[p1] = p2;
+                    p1++;
+                    p2++;
+                }
+                p2 += gap;
+            }
+        }
+
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            const float* sptr = square_blob_bordered.channel(q);
+            float* outptr = top_blob.channel(q);
+
+            for (int i = 0; i < outh; i++)
+            {
+                for (int j = 0; j < outw; j++)
+                {
+                    // the window for output column j starts at bordered
+                    // column j of the current row
+                    const float* window = sptr + j;
+
+                    float ss = 0.f;
+
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        ss += window[ space_ofs[k] ];
+                    }
+
+                    outptr[j] = ptr[j] * pow(1.f + alpha_div_size * ss, -beta);
+                }
+
+                ptr += outw;
+                sptr += w;
+                outptr += outw;
+            }
+        }
+    }
+
+    return 0;
+}
+
+// In-place local response normalization of bottom_top_blob:
+//
+//   x = x * (1 + alpha / n * sum(x^2 over the local region)) ^ -beta
+//
+// NormRegion_ACROSS_CHANNELS: the region is local_size neighbouring channels
+//   at the same position, n = local_size.
+// NormRegion_WITHIN_CHANNEL: the region is a local_size x local_size spatial
+//   window inside the channel (zero padded at the borders),
+//   n = local_size * local_size.
+//
+// All sums are taken from a copy of the squared input, so overwriting the
+// blob while iterating does not affect later results.
+// Returns 0 on success and -100 when a temporary blob is still empty after
+// creation. For any other region_type the blob is left unchanged.
+int LRN::forward_inplace(Mat& bottom_top_blob) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int channels = bottom_top_blob.c;
+    int size = w * h;
+
+    // element-wise squares of the input, shared by both region types
+    Mat square_blob;
+    square_blob.create(w, h, channels);
+    if (square_blob.empty())
+        return -100;
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float* ptr = bottom_top_blob.channel(q);
+        float* outptr = square_blob.channel(q);
+
+        for (int i=0; i<size; i++)
+        {
+            outptr[i] = ptr[i] * ptr[i];
+        }
+    }
+
+    if (region_type == NormRegion_ACROSS_CHANNELS)
+    {
+        const float alpha_div_size = alpha / local_size;
+
+        // channels in front of q that belong to its window
+        const int pad = local_size / 2;
+
+        Mat square_sum;
+        square_sum.create(w, h, channels);
+        if (square_sum.empty())
+            return -100;
+        square_sum.fill(0.f);
+
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            // square sum over exactly local_size channels around q;
+            // the window is [q - pad, q - pad + local_size), clipped to the
+            // valid channel range (out-of-range channels contribute zero)
+            float* ssptr = square_sum.channel(q);
+            const int p_begin = q - pad;
+            const int p_end = p_begin + local_size;
+            for (int p=p_begin; p<p_end; p++)
+            {
+                if (p < 0 || p >= channels)
+                    continue;
+
+                const float* sptr = square_blob.channel(p);
+                for (int i=0; i<size; i++)
+                {
+                    ssptr[i] += sptr[i];
+                }
+            }
+
+            float* ptr = bottom_top_blob.channel(q);
+            for (int i=0; i<size; i++)
+            {
+                ptr[i] = ptr[i] * pow(1.f + alpha_div_size * ssptr[i], -beta);
+            }
+        }
+    }
+    else if (region_type == NormRegion_WITHIN_CHANNEL)
+    {
+        int outw = w;
+        int outh = h;
+
+        // zero pad the squares so every output position has a full window
+        Mat square_blob_bordered = square_blob;
+        int pad = local_size / 2;
+        if (pad > 0)
+        {
+            copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f);
+            if (square_blob_bordered.empty())
+                return -100;
+
+            w = square_blob_bordered.w;
+            h = square_blob_bordered.h;
+        }
+
+        const int maxk = local_size * local_size;
+
+        // the window holds maxk values, so that is the region size alpha is
+        // averaged over
+        const float alpha_div_size = alpha / maxk;
+
+        // norm window offsets, relative to the window's top-left element in
+        // the bordered blob
+        std::vector<int> _space_ofs(maxk);
+        int* space_ofs = &_space_ofs[0];
+        {
+            int p1 = 0;
+            int p2 = 0;
+            int gap = w - local_size;
+            for (int i = 0; i < local_size; i++)
+            {
+                for (int j = 0; j < local_size; j++)
+                {
+                    space_ofs[p1] = p2;
+                    p1++;
+                    p2++;
+                }
+                p2 += gap;
+            }
+        }
+
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            float* ptr = bottom_top_blob.channel(q);
+            const float* sptr = square_blob_bordered.channel(q);
+
+            for (int i = 0; i < outh; i++)
+            {
+                for (int j = 0; j < outw; j++)
+                {
+                    // the window for output column j starts at bordered
+                    // column j of the current row
+                    const float* window = sptr + j;
+
+                    float ss = 0.f;
+
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        ss += window[ space_ofs[k] ];
+                    }
+
+                    ptr[j] = ptr[j] * pow(1.f + alpha_div_size * ss, -beta);
+                }
+
+                ptr += outw;
+                sptr += w;
+            }
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/lrn.h b/src/layer/lrn.h
new file mode 100644
index 00000000000..8075f0d26f8
--- /dev/null
+++ b/src/layer/lrn.h
@@ -0,0 +1,51 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_LRN_H
+#define LAYER_LRN_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// Local response normalization layer. The implementation scales every input
+// value by (1 + alpha / n * sum of squares over a local region) ^ -beta, where
+// the region is chosen by region_type.
+class LRN : public Layer
+{
+public:
+    LRN();
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // Parses "region_type local_size alpha beta" as text; returns -1 unless all four were read.
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    // Reads the same four values as raw binary from a FILE*.
+    virtual int load_param_bin(FILE* paramfp);
+#endif // NCNN_STDIO
+    // Reads the same four values (4 bytes each) from memory and advances mem by 16.
+    virtual int load_param(const unsigned char*& mem);
+
+    // Normalizes bottom_blob into a newly created top_blob; -100 if a blob is empty after creation.
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+    // Normalizes bottom_top_blob in place; -100 if a temporary blob is empty after creation.
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+
+    // Accepted values of region_type.
+    enum { NormRegion_ACROSS_CHANNELS = 0, NormRegion_WITHIN_CHANNEL = 1 };
+
+public:
+    // param
+    // one of the NormRegion_* values above
+    int region_type;
+    // extent of the local region (channel count, or side length of the spatial window)
+    int local_size;
+    // scale applied to the region's square sum
+    float alpha;
+    // exponent; the implementation raises to the power -beta
+    float beta;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_LRN_H
diff --git a/src/layer/lstm.cpp b/src/layer/lstm.cpp
new file mode 100644
index 00000000000..1c9892cefc2
--- /dev/null
+++ b/src/layer/lstm.cpp
@@ -0,0 +1,241 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "lstm.h"
+#include <math.h>
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(LSTM)
+
+LSTM::LSTM()
+{
+    // Both flags are inherited members; the LSTM class declares neither itself.
+    this->support_inplace = false;
+    this->one_blob_only = false;
+}
+
+// No explicit cleanup is performed here.
+LSTM::~LSTM() {}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+int LSTM::load_param(FILE* paramfp)
+{
+    // Text form: two integers, num_output followed by weight_data_size.
+    const int nscan = fscanf(paramfp, "%d %d", &num_output, &weight_data_size);
+
+    // both fields must have been converted
+    if (nscan == 2)
+        return 0;
+
+    fprintf(stderr, "LSTM load_param failed %d\n", nscan);
+    return -1;
+}
+#endif // NCNN_STRING
+// Reads num_output and weight_data_size (in that order) as raw ints from
+// paramfp.
+// Returns 0 on success and -1 when the stream ends or fails before both
+// values were read, instead of reporting success with uninitialised fields.
+int LSTM::load_param_bin(FILE* paramfp)
+{
+    // || short-circuits, so the second read only happens after the first succeeded
+    if (fread(&num_output, sizeof(int), 1, paramfp) != 1
+            || fread(&weight_data_size, sizeof(int), 1, paramfp) != 1)
+    {
+        fprintf(stderr, "LSTM load_param_bin failed\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+// Loads the three weight blobs from binfp, in file order:
+//   weight_hc_data  (size*4) x num_output floats
+//   weight_xc_data  (size*4) x num_output floats
+//   bias_c_data     4 x num_output floats
+// where size = weight_data_size / 2 / num_output / 4.
+//
+// Returns 0 on success, -1 when num_output is not positive or a read comes up
+// short, and -100 when a blob is still empty after creation.
+int LSTM::load_model(FILE* binfp)
+{
+    // num_output is a divisor below; zero would trap and a negative value
+    // cannot describe a blob
+    if (num_output <= 0)
+    {
+        fprintf(stderr, "LSTM load_model invalid num_output %d\n", num_output);
+        return -1;
+    }
+
+    int nread;
+
+    int size = weight_data_size / 2 / num_output / 4;
+
+    // raw weight data
+    weight_hc_data.create(size * 4, num_output);
+    if (weight_hc_data.empty())
+        return -100;
+    // each blob is fetched as one item of its full byte size, so fread
+    // returns 1 only when the complete blob was read
+    nread = fread(weight_hc_data.data, size * 4 * num_output * sizeof(float), 1, binfp);
+    if (nread != 1)
+    {
+        fprintf(stderr, "LSTM read weight_hc_data failed %d\n", nread);
+        return -1;
+    }
+
+    weight_xc_data.create(size * 4, num_output);
+    if (weight_xc_data.empty())
+        return -100;
+    nread = fread(weight_xc_data.data, size * 4 * num_output * sizeof(float), 1, binfp);
+    if (nread != 1)
+    {
+        fprintf(stderr, "LSTM read weight_xc_data failed %d\n", nread);
+        return -1;
+    }
+
+    bias_c_data.create(4, num_output);
+    if (bias_c_data.empty())
+        return -100;
+    nread = fread(bias_c_data.data, 4 * num_output * sizeof(float), 1, binfp);
+    if (nread != 1)
+    {
+        fprintf(stderr, "LSTM read bias_c_data failed %d\n", nread);
+        return -1;
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+int LSTM::load_param(const unsigned char*& mem)
+{
+    // Two consecutive 4-byte ints: num_output, then weight_data_size.
+    num_output = *(const int*)(mem + 0);
+    weight_data_size = *(const int*)(mem + 4);
+
+    // leave the caller's cursor just past the 8 bytes consumed
+    mem += 8;
+
+    return 0;
+}
+
+// Points the three weight blobs at consecutive float arrays starting at mem
+// and advances mem past them. The blobs are constructed directly on the
+// caller's buffer (the pointer is handed to the Mat constructor):
+//   weight_hc_data  (size*4) x num_output floats
+//   weight_xc_data  (size*4) x num_output floats
+//   bias_c_data     4 x num_output floats
+// where size = weight_data_size / 2 / num_output / 4.
+//
+// Returns 0 on success and -1 (leaving mem untouched) when num_output is not
+// positive.
+int LSTM::load_model(const unsigned char*& mem)
+{
+    // num_output is a divisor below; zero would trap and a negative value
+    // cannot describe a blob
+    if (num_output <= 0)
+        return -1;
+
+    int size = weight_data_size / 2 / num_output / 4;
+
+    // raw weight data
+    weight_hc_data = Mat(size * 4, num_output, (float*)mem);
+    mem += size * 4 * num_output * sizeof(float);
+
+    weight_xc_data = Mat(size * 4, num_output, (float*)mem);
+    mem += size * 4 * num_output * sizeof(float);
+
+    bias_c_data = Mat(4, num_output, (float*)mem);
+    mem += 4 * num_output * sizeof(float);
+
+    return 0;
+}
+
+// Runs the LSTM over T time steps.
+//
+// bottom_blobs[0]: input, one channel per time step (T = c), w values each.
+// bottom_blobs[1]: continuation indicator, one value per time step; zero
+//                  marks the start of a new sequence.
+// top_blobs[0]:    created as num_output x 1 x T, holds the hidden state of
+//                  every step.
+//
+// Per step:
+//   gates   = W_hc * h_cont_{t-1} + W_xc * x_t + b_c      (order I F O G)
+//   i, o    = sigmoid(I), sigmoid(O)
+//   f       = cont_t ? sigmoid(F) : 0     (a new sequence forgets the cell)
+//   g       = tanh(G)
+//   c_t     = f * c_{t-1} + i * g
+//   h_t     = o * tanh(c_t)
+//
+// Returns 0 on success and -100 when a blob is still empty after creation.
+int LSTM::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+{
+    // size x 1 x T
+    const Mat& input_blob = bottom_blobs[0];
+
+    // T, 0 or 1 each
+    const Mat& cont_blob = bottom_blobs[1];
+
+    int T = input_blob.c;
+    int size = input_blob.w;
+
+    // initial hidden state
+    Mat hidden(num_output);
+    if (hidden.empty())
+        return -100;
+    hidden.fill(0.f);
+
+    // internal cell state; zeroed so the first step never multiplies
+    // uninitialised memory (0 * NaN would poison every later step)
+    Mat cell(num_output);
+    if (cell.empty())
+        return -100;
+    cell.fill(0.f);
+
+    // 4 x num_output
+    Mat gates(4, num_output);
+    if (gates.empty())
+        return -100;
+
+    Mat& top_blob = top_blobs[0];
+    top_blob.create(num_output, 1, T);
+    if (top_blob.empty())
+        return -100;
+
+    // unroll
+    for (int t=0; t<T; t++)
+    {
+        // clip hidden by continuation indicator
+        // h_cont_{t-1} = cont_t * h_{t-1}
+        // h_cont_{t-1} = h_{t-1} if cont_t == 1
+        //                0       otherwise
+        // calculate hidden
+        // gate_input_t := W_hc * h_conted_{t-1} + W_xc * x_t + b_c
+        const float cont = cont_blob.data[t];
+        const Mat x = input_blob.channel(t);
+        float* hidden_data = hidden;
+        for (int q=0; q<num_output; q++)
+        {
+            // NOTE(review): the recurrent term below multiplies every W_hc
+            // column by the single value hidden[q] rather than by hidden[i];
+            // kept as is because the weight layout is not visible here.
+            // Confirm against the model converter.
+            float h_cont = cont ? hidden_data[q] : 0.f;
+
+            const float* x_data = x;
+            const float* bias_c_data_ptr = bias_c_data.data + 4 * q;
+            float* gates_data = gates.data + 4 * q;
+
+            // gate I F O G: each weight row holds the four gates back to
+            // back, size values per gate
+            const float* weight_hc_data_I = weight_hc_data.data + weight_hc_data.w * q;
+            const float* weight_xc_data_I = weight_xc_data.data + weight_xc_data.w * q;
+            const float* weight_hc_data_F = weight_hc_data.data + weight_hc_data.w * q + size;
+            const float* weight_xc_data_F = weight_xc_data.data + weight_xc_data.w * q + size;
+            const float* weight_hc_data_O = weight_hc_data.data + weight_hc_data.w * q + size*2;
+            const float* weight_xc_data_O = weight_xc_data.data + weight_xc_data.w * q + size*2;
+            const float* weight_hc_data_G = weight_hc_data.data + weight_hc_data.w * q + size*3;
+            const float* weight_xc_data_G = weight_xc_data.data + weight_xc_data.w * q + size*3;
+
+            float I = bias_c_data_ptr[0];
+            float F = bias_c_data_ptr[1];
+            float O = bias_c_data_ptr[2];
+            float G = bias_c_data_ptr[3];
+            for (int i=0; i<size; i++)
+            {
+                I += weight_hc_data_I[i] * h_cont + weight_xc_data_I[i] * x_data[i];
+                F += weight_hc_data_F[i] * h_cont + weight_xc_data_F[i] * x_data[i];
+                O += weight_hc_data_O[i] * h_cont + weight_xc_data_O[i] * x_data[i];
+                G += weight_hc_data_G[i] * h_cont + weight_xc_data_G[i] * x_data[i];
+            }
+
+            gates_data[0] = I;
+            gates_data[1] = F;
+            gates_data[2] = O;
+            gates_data[3] = G;
+        }
+
+        // lstm unit
+        // sigmoid(I)
+        // sigmoid(F)
+        // sigmoid(O)
+        // tanh(G)
+        // c_t := f_t .* c_{t-1} + i_t .* g_t
+        // h_t := o_t .* tanh[c_t]
+        float* cell_data = cell;
+        Mat output = top_blob.channel(t);
+        float* output_data = output;
+        for (int q=0; q<num_output; q++)
+        {
+            const float* gates_data = gates.data + 4 * q;
+
+            float I = gates_data[0];
+            float F = gates_data[1];
+            float O = gates_data[2];
+            float G = gates_data[3];
+
+            I = 1.f / (1.f + exp(-I));
+            // the forget gate only acts while the sequence continues; at a
+            // sequence start (cont == 0) the previous cell state is dropped
+            F = cont ? 1.f / (1.f + exp(-F)) : 0.f;
+            O = 1.f / (1.f + exp(-O));
+            G = tanh(G);
+
+            float cell_t = F * cell_data[q] + I * G;
+            float H = O * tanh(cell_t);
+
+            cell_data[q] = cell_t;
+            hidden_data[q] = H;
+            output_data[q] = H;
+        }
+
+        // no cell output here
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/lstm.h b/src/layer/lstm.h
new file mode 100644
index 00000000000..e215a4e9a52
--- /dev/null
+++ b/src/layer/lstm.h
@@ -0,0 +1,53 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_LSTM_H
+#define LAYER_LSTM_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// LSTM layer: takes an input sequence blob plus a continuation-indicator blob
+// and produces the hidden state of every time step.
+class LSTM : public Layer
+{
+public:
+    LSTM();
+    virtual ~LSTM();
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // Parses "num_output weight_data_size" as text; returns -1 unless both were read.
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    // Reads the same two ints as raw binary from a FILE*.
+    virtual int load_param_bin(FILE* paramfp);
+    // Reads weight_hc_data, weight_xc_data and bias_c_data (in that order) from a FILE*.
+    virtual int load_model(FILE* binfp);
+#endif // NCNN_STDIO
+    // Reads the two ints from memory and advances mem by 8.
+    virtual int load_param(const unsigned char*& mem);
+    // Builds the three weight blobs on top of the buffer at mem and advances mem past them.
+    virtual int load_model(const unsigned char*& mem);
+
+    // bottom_blobs[0] is the input sequence, bottom_blobs[1] the continuation
+    // indicator; top_blobs[0] receives the per-step hidden state.
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+
+public:
+    // param
+    // number of hidden units; also the width of the output blob
+    int num_output;
+    // used only to derive the per-gate weight count: weight_data_size / 2 / num_output / 4
+    int weight_data_size;
+
+    // model
+    // weights applied to the previous hidden state, (size*4) x num_output
+    Mat weight_hc_data;
+    // weights applied to the current input, (size*4) x num_output
+    Mat weight_xc_data;
+    // gate biases, 4 x num_output (gate order I F O G)
+    Mat bias_c_data;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_LSTM_H
diff --git a/src/layer/memorydata.cpp b/src/layer/memorydata.cpp
new file mode 100644
index 00000000000..5c05ef9a8fe
--- /dev/null
+++ b/src/layer/memorydata.cpp
@@ -0,0 +1,78 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "memorydata.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(MemoryData)
+
+MemoryData::MemoryData()
+{
+    // Both flags are inherited members; the MemoryData class declares neither itself.
+    this->support_inplace = true;
+    this->one_blob_only = true;
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+int MemoryData::load_param(FILE* paramfp)
+{
+    // Text form: three integers in the order channels, width, height.
+    const int nscan = fscanf(paramfp, "%d %d %d",
+                             &channels, &width, &height);
+
+    // all three fields must have been converted
+    if (nscan == 3)
+        return 0;
+
+    fprintf(stderr, "MemoryData load_param failed %d\n", nscan);
+    return -1;
+}
+#endif // NCNN_STRING
+// Reads channels, width and height (in that order) as raw ints from paramfp.
+// Returns 0 on success and -1 when the stream ends or fails before all three
+// values were read, instead of reporting success with uninitialised fields.
+int MemoryData::load_param_bin(FILE* paramfp)
+{
+    // || short-circuits, so the reads happen strictly in file order and stop
+    // at the first one that does not deliver its single item
+    if (fread(&channels, sizeof(int), 1, paramfp) != 1
+            || fread(&width, sizeof(int), 1, paramfp) != 1
+            || fread(&height, sizeof(int), 1, paramfp) != 1)
+    {
+        fprintf(stderr, "MemoryData load_param_bin failed\n");
+        return -1;
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+int MemoryData::load_param(const unsigned char*& mem)
+{
+    // Three consecutive 4-byte ints: channels, width, height.
+    channels = *(const int*)(mem + 0);
+    width = *(const int*)(mem + 4);
+    height = *(const int*)(mem + 8);
+
+    // leave the caller's cursor just past the 12 bytes consumed
+    mem += 12;
+
+    return 0;
+}
+
+int MemoryData::forward(const Mat& /*bottom_blob*/, Mat& /*top_blob*/) const
+{
+    // Nothing is computed: neither blob is touched, success is reported.
+    const int status = 0;
+    return status;
+}
+
+int MemoryData::forward_inplace(Mat& /*bottom_top_blob*/) const
+{
+    // Nothing is computed: the blob is left untouched, success is reported.
+    const int status = 0;
+    return status;
+}
+
+} // namespace ncnn
diff --git a/src/layer/memorydata.h b/src/layer/memorydata.h
new file mode 100644
index 00000000000..5643bb5821e
--- /dev/null
+++ b/src/layer/memorydata.h
@@ -0,0 +1,48 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_MEMORYDATA_H
+#define LAYER_MEMORYDATA_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// MemoryData layer: stores three integer parameters and performs no
+// computation; both forward variants return 0 without touching their blobs.
+class MemoryData : public Layer
+{
+public:
+    MemoryData();
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // Parses "channels width height" as text; returns -1 unless all three were read.
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    // Reads the same three ints as raw binary from a FILE*.
+    virtual int load_param_bin(FILE* paramfp);
+#endif // NCNN_STDIO
+    // Reads the three ints from memory and advances mem by 12.
+    virtual int load_param(const unsigned char*& mem);
+
+    // No-op; always returns 0.
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+    // No-op; always returns 0.
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+
+public:
+    // loaded in the order channels, width, height; not read by the methods of this class
+    int channels;
+    int width;
+    int height;
+
+};
+
+} // namespace ncnn
+
+#endif // LAYER_MEMORYDATA_H
diff --git a/src/layer/mvn.cpp b/src/layer/mvn.cpp
new file mode 100644
index 00000000000..f8f4c59ba11
--- /dev/null
+++ b/src/layer/mvn.cpp
@@ -0,0 +1,213 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "mvn.h"
+#include <math.h>
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(MVN)
+
+MVN::MVN()
+{
+    // Both flags are inherited members; the MVN class declares neither itself.
+    this->support_inplace = false;
+    this->one_blob_only = true;
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+int MVN::load_param(FILE* paramfp)
+{
+    // Text form: two integer switches (normalize_variance, across_channels) then the float eps.
+    const int nscan = fscanf(paramfp, "%d %d %f",
+                             &normalize_variance, &across_channels, &eps);
+
+    // all three fields must have been converted
+    if (nscan == 3)
+        return 0;
+
+    fprintf(stderr, "MVN load_param failed %d\n", nscan);
+    return -1;
+}
+#endif // NCNN_STRING
+// Reads normalize_variance, across_channels and eps (in that order) as raw
+// 4-byte values from paramfp.
+// Returns 0 on success and -1 when the stream ends or fails before all three
+// values were read, instead of reporting success with uninitialised fields.
+int MVN::load_param_bin(FILE* paramfp)
+{
+    // || short-circuits, so the reads happen strictly in file order and stop
+    // at the first one that does not deliver its single item
+    if (fread(&normalize_variance, sizeof(int), 1, paramfp) != 1
+            || fread(&across_channels, sizeof(int), 1, paramfp) != 1
+            || fread(&eps, sizeof(float), 1, paramfp) != 1)
+    {
+        fprintf(stderr, "MVN load_param_bin failed\n");
+        return -1;
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+int MVN::load_param(const unsigned char*& mem)
+{
+    // Three consecutive 4-byte fields: two ints followed by one float.
+    normalize_variance = *(const int*)(mem + 0);
+    across_channels = *(const int*)(mem + 4);
+    eps = *(const float*)(mem + 8);
+
+    // leave the caller's cursor just past the 12 bytes consumed
+    mem += 12;
+
+    return 0;
+}
+
+// Mean-variance normalization of bottom_blob into a newly created top_blob of
+// the same w/h/c.
+//
+// Step 1: subtract the mean, taken over the whole blob when across_channels
+//         is set and per channel otherwise.
+// Step 2 (only when normalize_variance is set): divide the mean-subtracted
+//         values by sqrt(mean of their squares) + eps, using the same scope
+//         (whole blob or per channel) as step 1.
+//
+// Returns 0 on success and -100 when a blob is still empty after creation.
+int MVN::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    int size = w * h;
+
+    top_blob.create(w, h, channels);
+    if (top_blob.empty())
+        return -100;
+
+    // prepare sum per channel
+    Mat sum(channels);
+    if (sum.empty())
+        return -100;
+    float* sum_ptr = sum;
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float* ptr = bottom_blob.channel(q);
+
+        float channel_sum = 0.f;
+        for (int i=0; i<size; i++)
+        {
+            channel_sum += ptr[i];
+        }
+
+        sum_ptr[q] = channel_sum;
+    }
+
+    if (across_channels)
+    {
+        // compute mean across channels
+        float mean = 0.f;
+        for (int q=0; q<channels; q++)
+        {
+            mean += sum_ptr[q];
+        }
+        mean = mean / (channels * size);
+
+        // subtract mean
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            float* outptr = top_blob.channel(q);
+
+            for (int i=0; i<size; i++)
+            {
+                outptr[i] = ptr[i] - mean;
+            }
+        }
+    }
+    else
+    {
+        // subtract mean
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            float* outptr = top_blob.channel(q);
+            float mean = sum_ptr[q] / size;
+
+            for (int i=0; i<size; i++)
+            {
+                outptr[i] = ptr[i] - mean;
+            }
+        }
+    }
+
+    if (normalize_variance)
+    {
+        // prepare squared sum per channel, taken from the mean-subtracted
+        // values already stored in top_blob
+        Mat sqsum(channels);
+        if (sqsum.empty())
+            return -100;
+        float* sqsum_ptr = sqsum;
+
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            const float* ptr = top_blob.channel(q);
+
+            float channel_sqsum = 0.f;
+            for (int i=0; i<size; i++)
+            {
+                channel_sqsum += ptr[i] * ptr[i];
+            }
+
+            sqsum_ptr[q] = channel_sqsum;
+        }
+
+        if (across_channels)
+        {
+            // compute squared mean across channels
+            float sqmean = 0.f;
+            for (int q=0; q<channels; q++)
+            {
+                sqmean += sqsum_ptr[q];
+            }
+            sqmean = sqmean / (channels * size);
+
+            // normalize variance
+            float norm_var = sqrt(sqmean) + eps;
+            float norm_var_inv = 1.f / norm_var;
+
+            // apply normalize_variance to the mean-subtracted output; scaling
+            // the raw input here would throw the mean subtraction away
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                float* outptr = top_blob.channel(q);
+
+                for (int i=0; i<size; i++)
+                {
+                    outptr[i] = outptr[i] * norm_var_inv;
+                }
+            }
+        }
+        else
+        {
+            // apply normalize_variance to the mean-subtracted output; scaling
+            // the raw input here would throw the mean subtraction away
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                float* outptr = top_blob.channel(q);
+                float sqmean = sqsum_ptr[q] / size;
+                float norm_var = sqrt(sqmean) + eps;
+                float norm_var_inv = 1.f / norm_var;
+
+                for (int i=0; i<size; i++)
+                {
+                    outptr[i] = outptr[i] * norm_var_inv;
+                }
+            }
+        }
+
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/mvn.h b/src/layer/mvn.h
new file mode 100644
index 00000000000..61b08e60ef1
--- /dev/null
+++ b/src/layer/mvn.h
@@ -0,0 +1,45 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_MVN_H
+#define LAYER_MVN_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// Mean-variance normalization layer: subtracts the mean and, optionally,
+// rescales by the square root of the mean square plus eps.
+class MVN : public Layer
+{
+public:
+    MVN();
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // Parses "normalize_variance across_channels eps" as text; returns -1 unless all three were read.
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    // Reads the same three values as raw binary from a FILE*.
+    virtual int load_param_bin(FILE* paramfp);
+#endif // NCNN_STDIO
+    // Reads the three values (4 bytes each) from memory and advances mem by 12.
+    virtual int load_param(const unsigned char*& mem);
+
+    // Normalizes bottom_blob into a newly created top_blob; -100 if a blob is empty after creation.
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+public:
+    // non-zero: also apply the variance rescaling step
+    int normalize_variance;
+    // non-zero: statistics are taken over the whole blob instead of per channel
+    int across_channels;
+    // added to the square root of the mean square before dividing
+    float eps;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_MVN_H
diff --git a/src/layer/pooling.cpp b/src/layer/pooling.cpp
new file mode 100644
index 00000000000..6c4686909a0
--- /dev/null
+++ b/src/layer/pooling.cpp
@@ -0,0 +1,262 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "pooling.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Pooling)
+
+Pooling::Pooling()
+{
+    support_inplace = false; // this file defines no forward_inplace() for Pooling
+    one_blob_only = true;    // forward() works on one input Mat and one output Mat
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+int Pooling::load_param(FILE* paramfp)
+{
+    // text form: five whitespace-separated ints, in member declaration order
+    const int parsed = fscanf(paramfp, "%d %d %d %d %d",
+                              &pooling_type, &kernel_size, &stride, &pad, &global_pooling);
+    if (parsed == 5)
+        return 0;
+
+    // fewer than five conversions: log the count fscanf() reported and fail
+    fprintf(stderr, "Pooling load_param failed %d\n", parsed);
+    return -1;
+}
+#endif // NCNN_STRING
+int Pooling::load_param_bin(FILE* paramfp)
+{
+    // Binary layout: pooling_type, kernel_size, stride, pad and global_pooling,
+    // stored as five consecutive raw ints. Each fread() returns the number of
+    // items it read (1 on success), so the running total is 5 for a full record.
+    size_t nread = fread(&pooling_type, sizeof(int), 1, paramfp);
+    nread += fread(&kernel_size, sizeof(int), 1, paramfp);
+    nread += fread(&stride, sizeof(int), 1, paramfp);
+    nread += fread(&pad, sizeof(int), 1, paramfp);
+    nread += fread(&global_pooling, sizeof(int), 1, paramfp);
+    if (nread != 5)
+        fprintf(stderr, "Pooling load_param_bin failed %d\n", (int)nread);
+    return nread == 5 ? 0 : -1;
+}
+#endif // NCNN_STDIO
+
+int Pooling::load_param(const unsigned char*& mem)
+{
+    // In-memory parameter record: five 4-byte ints laid out back to back.
+    // Every field is read at its fixed offset from the cursor on entry, then
+    // the cursor (passed by reference, so the caller sees the move) is
+    // advanced once past the whole record.
+    const unsigned char* const rec = mem;
+
+    pooling_type   = *(int*)(rec + 0);
+    kernel_size    = *(int*)(rec + 4);
+    stride         = *(int*)(rec + 8);
+    pad            = *(int*)(rec + 12);
+    global_pooling = *(int*)(rec + 16);
+
+    // 5 fields * 4 bytes
+    mem = rec + 20;
+
+    return 0;
+}
+
+int Pooling::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    // Max or average pooling over kernel_size x kernel_size windows, stepped by stride.
+    // With global_pooling set, every channel is reduced to a single 1x1 value instead.
+    // Returns 0 on success and -100 when a padded copy or the output could not be created.
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+
+//     fprintf(stderr, "Pooling     input %d x %d  pad = %d  ksize=%d  stride=%d\n", w, h, pad, kernel_size, stride);
+    if (global_pooling)
+    {
+        top_blob.create(1, 1, channels);
+        if (top_blob.empty())
+            return -100;
+
+        int size = w * h;
+
+        if (pooling_type == PoolMethod_MAX)
+        {
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob.channel(q);
+                float* outptr = top_blob.channel(q);
+
+                float max = ptr[0];
+                for (int i=0; i<size; i++)
+                {
+                    max = std::max(max, ptr[i]);
+                }
+
+                outptr[0] = max;
+            }
+        }
+        else if (pooling_type == PoolMethod_AVE)
+        {
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob.channel(q);
+                float* outptr = top_blob.channel(q);
+
+                float sum = 0.f;
+                for (int i=0; i<size; i++)
+                {
+                    sum += ptr[i];
+                }
+
+                outptr[0] = sum / size;
+            }
+        }
+
+        return 0;
+    }
+    // explicit padding: a zero-valued border of pad pixels on all four sides
+    Mat bottom_blob_bordered = bottom_blob;
+    if (pad > 0)
+    {
+        copy_make_border(bottom_blob, bottom_blob_bordered, pad, pad, pad, pad, BORDER_CONSTANT, 0.f);
+        if (bottom_blob_bordered.empty())
+            return -100;
+
+        w = bottom_blob_bordered.w;
+        h = bottom_blob_bordered.h;
+    }
+
+    int outw = (w - kernel_size) / stride + 1;
+    int outh = (h - kernel_size) / stride + 1;
+    // a non-zero remainder means one more, partially covered, window gets its own output element
+    int wtail = (w - kernel_size) % stride;
+    int htail = (h - kernel_size) % stride;
+    if (wtail != 0 || htail != 0)
+    {
+        int wtailpad = 0;
+        int htailpad = 0;
+        if (wtail != 0)
+            wtailpad = stride - wtail; // grow to exactly one more stride step so the extra window fits
+        if (htail != 0)
+            htailpad = stride - htail; // (kernel_size - tail under-pads whenever kernel_size < stride)
+
+        Mat bottom_blob_bordered2;
+        copy_make_border(bottom_blob_bordered, bottom_blob_bordered2, 0, htailpad, 0, wtailpad, BORDER_REPLICATE, 0.f);
+        if (bottom_blob_bordered2.empty())
+            return -100;
+
+        bottom_blob_bordered = bottom_blob_bordered2;
+
+        w = bottom_blob_bordered.w;
+        h = bottom_blob_bordered.h;
+
+        if (wtail != 0)
+            outw += 1;
+        if (htail != 0)
+            outh += 1;
+    }
+
+    top_blob.create(outw, outh, channels);
+    if (top_blob.empty())
+        return -100;
+
+    const int maxk = kernel_size * kernel_size;
+
+    // kernel offsets: space_ofs[k] is the distance from a window's first element to its k-th element
+    std::vector<int> _space_ofs(maxk);
+    int* space_ofs = &_space_ofs[0];
+    {
+        int p1 = 0;
+        int p2 = 0;
+        int gap = w - kernel_size;
+        for (int i = 0; i < kernel_size; i++)
+        {
+            for (int j = 0; j < kernel_size; j++)
+            {
+                space_ofs[p1] = p2;
+                p1++;
+                p2++;
+            }
+            p2 += gap;
+        }
+    }
+
+    if (pooling_type == PoolMethod_MAX)
+    {
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            const Mat m(w, h, bottom_blob_bordered.channel(q));
+            float* outptr = top_blob.channel(q);
+
+            for (int i = 0; i < outh; i++)
+            {
+                for (int j = 0; j < outw; j++)
+                {
+                    const float* sptr = m.data + m.w * i*stride + j*stride;
+
+                    float max = sptr[0];
+
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        float val = sptr[ space_ofs[k] ];
+                        max = std::max(max, val);
+                    }
+
+                    outptr[j] = max;
+                }
+
+                outptr += outw;
+            }
+        }
+    }
+    else if (pooling_type == PoolMethod_AVE)
+    {
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            const Mat m(w, h, bottom_blob_bordered.channel(q));
+            float* outptr = top_blob.channel(q);
+
+            for (int i = 0; i < outh; i++)
+            {
+                for (int j = 0; j < outw; j++)
+                {
+                    const float* sptr = m.data + m.w * i*stride + j*stride;
+
+                    float sum = 0;
+
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        float val = sptr[ space_ofs[k] ];
+                        sum += val;
+                    }
+
+                    outptr[j] = sum / maxk;
+                }
+
+                outptr += outw;
+            }
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/pooling.h b/src/layer/pooling.h
new file mode 100644
index 00000000000..8c46ad959e6
--- /dev/null
+++ b/src/layer/pooling.h
@@ -0,0 +1,50 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_POOLING_H
+#define LAYER_POOLING_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+class Pooling : public Layer
+{
+public:
+    Pooling();
+    // parameter loaders: FILE* variants (only with NCNN_STDIO) and an in-memory buffer variant
+#if NCNN_STDIO
+#if NCNN_STRING
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    virtual int load_param_bin(FILE* paramfp);
+#endif // NCNN_STDIO
+    virtual int load_param(const unsigned char*& mem);
+    // returns 0 on success, -100 when a Mat could not be created
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    // values that forward() accepts in pooling_type
+    enum { PoolMethod_MAX = 0, PoolMethod_AVE = 1 };
+
+public:
+    // param
+    int pooling_type;   // PoolMethod_MAX or PoolMethod_AVE
+    int kernel_size;    // side length of the square pooling window
+    int stride;         // step between neighbouring windows, same in both directions
+    int pad;            // zero-valued border added on all four sides before pooling
+    int global_pooling; // non-zero: reduce every channel to one value, window parameters unused
+};
+
+} // namespace ncnn
+
+#endif // LAYER_POOLING_H
diff --git a/src/layer/power.cpp b/src/layer/power.cpp
new file mode 100644
index 00000000000..c01d039f2d0
--- /dev/null
+++ b/src/layer/power.cpp
@@ -0,0 +1,115 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "power.h"
+#include <math.h>
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Power)
+
+Power::Power()
+{
+    support_inplace = true; // forward_inplace() is implemented further down in this file
+    one_blob_only = true;   // forward() works on one input Mat and one output Mat
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+int Power::load_param(FILE* paramfp)
+{
+    // text form: power, scale and shift as three whitespace-separated floats
+    const int parsed = fscanf(paramfp, "%f %f %f", &power, &scale, &shift);
+    if (parsed == 3)
+        return 0;
+
+    // fewer than three conversions: log the count fscanf() reported and fail
+    fprintf(stderr, "Power load_param failed %d\n", parsed);
+    return -1;
+}
+#endif // NCNN_STRING
+int Power::load_param_bin(FILE* paramfp)
+{
+    // three consecutive raw floats; each fread() returns 1 on success, so a full record sums to 3
+    size_t nread = fread(&power, sizeof(float), 1, paramfp);
+    nread += fread(&scale, sizeof(float), 1, paramfp);
+    nread += fread(&shift, sizeof(float), 1, paramfp);
+    if (nread != 3)
+        fprintf(stderr, "Power load_param_bin failed %d\n", (int)nread);
+    return nread == 3 ? 0 : -1;
+}
+#endif // NCNN_STDIO
+
+int Power::load_param(const unsigned char*& mem)
+{
+    // In-memory record: power, scale and shift as three 4-byte floats back to back.
+    // Fields are read at fixed offsets, then the caller's cursor skips the record.
+    const unsigned char* const rec = mem;
+
+    power = *(float*)(rec + 0);
+    scale = *(float*)(rec + 4);
+    shift = *(float*)(rec + 8);
+    mem = rec + 12;
+
+    return 0;
+}
+
+int Power::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    // element-wise: out = (shift + in * scale) ^ power, written to a separate blob
+    const int channels = bottom_blob.c;
+    const int plane = bottom_blob.w * bottom_blob.h;
+
+    // output gets the same w/h/c as the input; a Mat that stays empty maps to -100
+    top_blob.create(bottom_blob.w, bottom_blob.h, channels);
+    if (top_blob.empty())
+        return -100;
+
+    // one channel per loop iteration; OpenMP may spread the iterations over threads
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float* src = bottom_blob.channel(q);
+        float* dst = top_blob.channel(q);
+        const float* const src_end = src + plane;
+
+        for (; src < src_end; ++src, ++dst)
+            *dst = pow((shift + *src * scale), power);
+    }
+
+    return 0;
+}
+
+int Power::forward_inplace(Mat& bottom_top_blob) const
+{
+    // element-wise (shift + x * scale) ^ power, each element overwritten where it lives
+    const int channels = bottom_top_blob.c;
+    const int plane = bottom_top_blob.w * bottom_top_blob.h;
+
+    // one channel per loop iteration; OpenMP may spread the iterations over threads
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        float* cur = bottom_top_blob.channel(q);
+        float* const end = cur + plane;
+        for (; cur < end; ++cur)
+        {
+            *cur = pow((shift + *cur * scale), power);
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/power.h b/src/layer/power.h
new file mode 100644
index 00000000000..6c5f96e2066
--- /dev/null
+++ b/src/layer/power.h
@@ -0,0 +1,47 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_POWER_H
+#define LAYER_POWER_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+class Power : public Layer
+{
+public:
+    Power();
+    // parameter loaders: FILE* variants (only with NCNN_STDIO) and an in-memory buffer variant
+#if NCNN_STDIO
+#if NCNN_STRING
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    virtual int load_param_bin(FILE* paramfp);
+#endif // NCNN_STDIO
+    virtual int load_param(const unsigned char*& mem);
+    // computes (shift + x * scale) ^ power per element into top_blob; -100 if top_blob cannot be created
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    // same computation, overwriting the blob it is given
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+
+public:
+    float power; // exponent
+    float scale; // factor applied to the input before shift is added
+    float shift; // offset added after scaling, before the exponent is applied
+};
+
+} // namespace ncnn
+
+#endif // LAYER_POWER_H
diff --git a/src/layer/prelu.cpp b/src/layer/prelu.cpp
new file mode 100644
index 00000000000..38ffe8d9e22
--- /dev/null
+++ b/src/layer/prelu.cpp
@@ -0,0 +1,139 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "prelu.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(PReLU)
+
+PReLU::PReLU()
+{
+    support_inplace = true; // forward_inplace() is implemented further down in this file
+    one_blob_only = true;   // forward() works on one input Mat and one output Mat
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+int PReLU::load_param(FILE* paramfp)
+{
+    // text form: a single int, the number of slope values load_model() will read
+    const int parsed = fscanf(paramfp, "%d", &num_slope);
+    if (parsed == 1)
+        return 0;
+
+    // nothing converted (or EOF): log what fscanf() reported and fail
+    fprintf(stderr, "PReLU load_param failed %d\n", parsed);
+    return -1;
+}
+#endif // NCNN_STRING
+int PReLU::load_param_bin(FILE* paramfp)
+{
+    // num_slope is one raw int; fread() returns 1 only when it was read completely,
+    // so a truncated file is reported as -1 instead of leaving num_slope unset
+    return fread(&num_slope, sizeof(int), 1, paramfp) == 1 ? 0 : -1;
+}
+
+int PReLU::load_model(FILE* binfp)
+{
+    // make room for num_slope floats; -100 is returned when the Mat stays empty
+    slope_data.create(num_slope);
+    if (slope_data.empty())
+        return -100;
+
+    // the whole table is requested as a single item, so fread() yields 1 or 0
+    const size_t table_bytes = num_slope * sizeof(float);
+    int nread = fread(slope_data, table_bytes, 1, binfp);
+    if (nread == 1)
+        return 0;
+
+    fprintf(stderr, "PReLU read slope_data failed %d\n", nread);
+    return -1;
+}
+#endif // NCNN_STDIO
+
+int PReLU::load_param(const unsigned char*& mem)
+{
+    const int* fields = (const int*)mem; // the record holds a single 4-byte int
+    num_slope = fields[0];
+    mem += 4;                            // move the caller's cursor past it
+    return 0;
+}
+
+int PReLU::load_model(const unsigned char*& mem)
+{
+    // NOTE(review): Mat(n, ptr) looks like it wraps the caller's buffer instead of copying - confirm lifetime
+    float* weights = (float*)mem;
+    slope_data = Mat(num_slope, weights);
+    mem += num_slope * sizeof(float);
+    return 0;
+}
+
+int PReLU::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    // out = in * slope    when in < 0
+    // out = in            otherwise
+    const int channels = bottom_blob.c;
+    const int plane = bottom_blob.w * bottom_blob.h;
+
+    top_blob.create(bottom_blob.w, bottom_blob.h, channels);
+    if (top_blob.empty())
+        return -100;
+
+    const float* slopes = slope_data;
+    // with more than one slope each channel uses its own entry, otherwise entry 0 is shared
+    const bool per_channel = num_slope > 1;
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float* src = bottom_blob.channel(q);
+        float* dst = top_blob.channel(q);
+        const float slope = slopes[per_channel ? q : 0];
+
+        for (int i=0; i<plane; i++)
+        {
+            const float v = src[i];
+            dst[i] = (v < 0) ? v * slope : v;
+        }
+    }
+
+    return 0;
+}
+
+int PReLU::forward_inplace(Mat& bottom_top_blob) const
+{
+    // negative elements are multiplied by the slope in place; all others stay untouched
+    const int channels = bottom_top_blob.c;
+    const int plane = bottom_top_blob.w * bottom_top_blob.h;
+
+    const float* slopes = slope_data;
+    // with more than one slope each channel uses its own entry, otherwise entry 0 is shared
+    const bool per_channel = num_slope > 1;
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        float* cur = bottom_top_blob.channel(q);
+        float* const end = cur + plane;
+        const float slope = slopes[per_channel ? q : 0];
+
+        for (; cur < end; ++cur)
+            if (*cur < 0)
+                *cur *= slope;
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/prelu.h b/src/layer/prelu.h
new file mode 100644
index 00000000000..19fd4a034d4
--- /dev/null
+++ b/src/layer/prelu.h
@@ -0,0 +1,48 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_PRELU_H
+#define LAYER_PRELU_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+class PReLU : public Layer
+{
+public:
+    PReLU();
+    // loaders: FILE* variants (only with NCNN_STDIO) and in-memory buffer variants
+#if NCNN_STDIO
+#if NCNN_STRING
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    virtual int load_param_bin(FILE* paramfp);
+    virtual int load_model(FILE* binfp);
+#endif // NCNN_STDIO
+    virtual int load_param(const unsigned char*& mem);
+    virtual int load_model(const unsigned char*& mem);
+    // multiplies negative elements by a slope, copies the others; -100 if top_blob cannot be created
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    // same computation, overwriting the blob it is given
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+
+public:
+    int num_slope;  // number of floats in slope_data; > 1 selects one slope per channel
+    Mat slope_data; // slope table filled by load_model()
+};
+
+} // namespace ncnn
+
+#endif // LAYER_PRELU_H
diff --git a/src/layer/proposal.cpp b/src/layer/proposal.cpp
new file mode 100644
index 00000000000..f29da2c01d0
--- /dev/null
+++ b/src/layer/proposal.cpp
@@ -0,0 +1,352 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "proposal.h"
+#include <math.h>
+#include <algorithm>
+#include <vector>
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Proposal)
+
+class Rect
+{
+public:
+    // axis-aligned box: (x, y) is the corner with the smallest coordinates
+    float x, y, width, height;
+
+    // default box is empty and sits at the origin
+    Rect() : x(0.f), y(0.f), width(0.f), height(0.f) {}
+    Rect(float _x, float _y, float _width, float _height) :x(_x), y(_y), width(_width), height(_height) {}
+
+    float area() const { return width * height; }
+
+    // area shared with rhs, or 0 when the boxes do not overlap (touching edges count as no overlap)
+    float inter_area(const Rect& rhs) const
+    {
+        // overlap interval on each axis
+        const float left = std::max(x, rhs.x);
+        const float right = std::min(x + width, rhs.x + rhs.width);
+        const float top = std::max(y, rhs.y);
+        const float bottom = std::min(y + height, rhs.y + rhs.height);
+
+        const bool disjoint = (right <= left) || (bottom <= top);
+        if (disjoint)
+            return 0.f;
+
+        return (right - left) * (bottom - top);
+    }
+};
+
+class ProposalBox // a candidate box together with its score
+{
+public:
+    Rect box;
+    float score;
+    float area() const { return box.area(); }
+    float inter_area(const ProposalBox& rhs) const { return box.inter_area(rhs.box); }
+    bool operator<(const ProposalBox& rhs) const { return score > rhs.score; } // reversed on purpose: sorting with this operator puts the highest score first
+};
+
+static std::vector<int> nms(const std::vector<ProposalBox>& boxes, float nms_thresh)
+{
+    // Greedy non-maximum suppression. The caller passes boxes that are already sorted;
+    // the returned indices refer to positions in that same vector.
+    const int count = boxes.size();
+
+    // cache every area once, the pair loop below needs each of them many times
+    std::vector<float> box_area(count);
+    for (int n=0; n<count; n++)
+        box_area[n] = boxes[n].area();
+
+    // dropped[n] != 0 once box n overlapped an earlier kept box too much
+    std::vector<int> dropped(count, 0);
+    std::vector<int> kept;
+
+    for (int cur=0; cur<count; cur++)
+    {
+        if (dropped[cur])
+            continue;
+
+        kept.push_back(cur);
+
+        // compare the kept box against every later box that is still alive
+        for (int other=cur+1; other<count; other++)
+        {
+            if (dropped[other])
+                continue;
+
+            // overlap = intersection / union
+            const float inter = boxes[cur].inter_area(boxes[other]);
+            const float uni = box_area[cur] + box_area[other] - inter;
+            if (inter / uni > nms_thresh)
+            {
+                dropped[other] = 1;
+            }
+        }
+    }
+
+    return kept;
+}
+
+Proposal::Proposal()
+{ // sets no layer flags; forward() in this file takes vectors of blobs
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+int Proposal::load_param(FILE* paramfp)
+{
+    // text form, whitespace separated and in this order:
+    //   feat_stride base_size pre_nms_topN after_nms_topN nms_thresh min_size
+    // (ratio and scale are not parsed; they are commented out in the class too)
+    const int parsed = fscanf(paramfp, "%d %d %d %d %f %d",
+                              &feat_stride, &base_size, &pre_nms_topN, &after_nms_topN,
+                              &nms_thresh, &min_size);
+    if (parsed == 6)
+        return 0;
+
+    // fewer than six conversions: log the count fscanf() reported and fail
+    fprintf(stderr, "Proposal load_param failed %d\n", parsed);
+    return -1;
+}
+#endif // NCNN_STRING
+int Proposal::load_param_bin(FILE* paramfp)
+{
+    // Binary layout, in file order:
+    //   int feat_stride, int base_size, int pre_nms_topN, int after_nms_topN,
+    //   float nms_thresh, int min_size
+    // Every fread() asks for one item and returns how many it got, so the
+    // running total equals 6 only when the whole record was present.
+    size_t nread = fread(&feat_stride, sizeof(int), 1, paramfp);
+    nread += fread(&base_size, sizeof(int), 1, paramfp);
+    nread += fread(&pre_nms_topN, sizeof(int), 1, paramfp);
+    nread += fread(&after_nms_topN, sizeof(int), 1, paramfp);
+    nread += fread(&nms_thresh, sizeof(float), 1, paramfp);
+    nread += fread(&min_size, sizeof(int), 1, paramfp);
+
+    // a truncated file used to be accepted silently, leaving members unset
+    if (nread != 6)
+        fprintf(stderr, "Proposal load_param_bin failed %d\n", (int)nread);
+    return nread == 6 ? 0 : -1;
+}
+#endif // NCNN_STDIO
+
+int Proposal::load_param(const unsigned char*& mem)
+{
+    // In-memory parameter record: six 4-byte values laid out back to back
+    // (nms_thresh is the only float, the other five are ints).
+    // Each field is read at its fixed offset from the cursor on entry; the
+    // cursor, passed by reference, is then moved past the whole record.
+    const unsigned char* const rec = mem;
+
+    feat_stride    = *(int*)(rec + 0);
+    base_size      = *(int*)(rec + 4);
+
+    // ratio and scale are not part of the record (commented out in the class too)
+
+    pre_nms_topN   = *(int*)(rec + 8);
+    after_nms_topN = *(int*)(rec + 12);
+    nms_thresh     = *(float*)(rec + 16);
+    min_size       = *(int*)(rec + 20);
+
+    // 6 fields * 4 bytes
+    mem = rec + 24;
+
+    // this loader has no failure path
+
+    return 0;
+}
+
+int Proposal::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+{
+    // bottom_blobs: [0] per-anchor score maps, [1] bbox deltas (4 channels per anchor), [2] im_info
+    // top_blobs:    [0] picked boxes, one (x, y, width, height) row each, [1] optional scores
+    // Returns 0 on success and -100 when an output Mat could not be created.
+    const Mat& score_blob = bottom_blobs[0];
+    const Mat& bbox_blob = bottom_blobs[1];
+    const Mat& im_info_blob = bottom_blobs[2];
+
+    int w = score_blob.w;
+    int h = score_blob.h;
+
+    // for each (H, W) location i
+    // generate A anchor boxes centered on cell i
+    // apply predicted bbox deltas at cell i to each of the A anchors
+    Rect base_anchor(0, 0, base_size - 1, base_size - 1);
+
+    // generate all ratio anchors
+    float ratios[3] = { 0.5, 1, 2 };
+    Rect ratio_anchors[3];
+    {
+        int size = base_anchor.area(); // NOTE(review): this is (base_size-1)^2; py-faster-rcnn generate_anchors uses base_size^2 - confirm
+        float cx = base_anchor.x + 0.5f * base_anchor.width;
+        float cy = base_anchor.y + 0.5f * base_anchor.height;
+        for (int i=0; i<3; i++)
+        {
+            float aw = (int)(sqrt(size / ratios[i]) + 0.5f);
+            float ah = (int)(aw * ratios[i] + 0.5f);
+            float ax = cx - 0.5f * (aw - 1);
+            float ay = cy - 0.5f * (ah - 1);
+            ratio_anchors[i] = Rect(ax, ay, aw, ah);
+        }
+    }
+
+    // generate all scale anchors
+    float scales[3] = { 8, 16, 32 };
+    Rect anchors[3*3];
+    {
+        for (int i=0; i<3; i++)
+        {
+            const Rect& ra = ratio_anchors[i];
+            float cx = ra.x + 0.5f * ra.width;
+            float cy = ra.y + 0.5f * ra.height;
+            for (int j=0; j<3; j++)
+            {
+                float aw = ra.width * scales[j];
+                float ah = ra.height * scales[j];
+                float ax = cx - 0.5f * (aw - 1);
+                float ay = cy - 0.5f * (ah - 1);
+                anchors[i*3+j] = Rect(ax, ay, aw, ah);
+            }
+        }
+    }
+
+    // generate proposals from bbox deltas and shifted anchors
+    // clip predicted boxes to image
+    std::vector<Rect > proposals;
+    int num_anchors = 3*3;
+    proposals.resize(num_anchors * h * w);
+    float im_w = ((const float*)im_info_blob.data)[1];
+    float im_h = ((const float*)im_info_blob.data)[0];
+
+    #pragma omp parallel for
+    for (int k = 0; k < num_anchors; k++)
+    {
+        const float* bbox_xptr = (const float*)(bbox_blob.data + bbox_blob.cstep * (k * 4 + 0));
+        const float* bbox_yptr = (const float*)(bbox_blob.data + bbox_blob.cstep * (k * 4 + 1));
+        const float* bbox_wptr = (const float*)(bbox_blob.data + bbox_blob.cstep * (k * 4 + 2));
+        const float* bbox_hptr = (const float*)(bbox_blob.data + bbox_blob.cstep * (k * 4 + 3));
+        // shifted anchor
+        Rect sa = anchors[k];
+        for (int i = 0; i < h; i++)
+        {
+            for (int j = 0; j < w; j++)
+            {
+                // apply bbox deltas
+                float dx = bbox_xptr[j];
+                float dy = bbox_yptr[j];
+                float dw = bbox_wptr[j];
+                float dh = bbox_hptr[j];
+
+                float cx = sa.x + 0.5f * sa.width;
+                float cy = sa.y + 0.5f * sa.height;
+
+                cx += sa.width * dx;
+                cy += sa.height * dy;
+                float aw = sa.width * exp(dw);
+                float ah = sa.height * exp(dh);
+                float ax = cx - 0.5f * aw;
+                float ay = cy - 0.5f * ah;
+
+                // clip box
+                ax = std::max(std::min(ax, im_w - 1), 0.f);
+                ay = std::max(std::min(ay, im_h - 1), 0.f);
+                aw = std::max(std::min(aw, im_w - ax), 0.f);
+                ah = std::max(std::min(ah, im_h - ay), 0.f);
+
+                proposals[k * h * w + i * w + j] = Rect(ax, ay, aw, ah);
+
+                sa.x += feat_stride;
+            }
+
+            bbox_xptr += w;
+            bbox_yptr += w;
+            bbox_wptr += w;
+            bbox_hptr += w;
+
+            sa.x = anchors[k].x;
+            sa.y += feat_stride;
+        }
+    }
+
+    // remove predicted boxes with either height or width < threshold
+    // NOTE convert min_size to input image scale stored in im_info[2]
+    std::vector<ProposalBox> proposal_boxes;
+    float im_scale = ((const float*)im_info_blob.data)[2];
+    float min_boxsize = min_size * im_scale;
+
+    // scores are read per channel through cstep, exactly like the bbox deltas above;
+    // a score blob with 2*A channels holds A background maps followed by A foreground maps
+    const int fg_offset = score_blob.c >= 2 * num_anchors ? num_anchors : 0;
+    for (int k = 0; k < num_anchors; k++)
+    {
+        const float* scoreptr = (const float*)(score_blob.data + score_blob.cstep * (k + fg_offset));
+        for (int i = 0; i < h * w; i++)
+        {
+            const Rect& p = proposals[k * h * w + i];
+            if (p.width >= min_boxsize && p.height >= min_boxsize)
+            {
+                ProposalBox pb;
+                pb.box = p;
+                pb.score = scoreptr[i];
+                proposal_boxes.push_back(pb);
+            }
+        }
+    }
+    proposals.clear();
+
+    // sort all (proposal, score) pairs by score from highest to lowest
+    std::sort(proposal_boxes.begin(), proposal_boxes.end());
+    // take top pre_nms_topN
+    if (pre_nms_topN > 0 && pre_nms_topN < (int)proposal_boxes.size())
+        proposal_boxes.resize(pre_nms_topN);
+    // apply nms with nms_thresh
+    std::vector<int> picked = nms(proposal_boxes, nms_thresh);
+    // take after_nms_topN
+    int picked_count = std::min((int)picked.size(), after_nms_topN);
+    // return the top proposals (-> RoIs top)
+    Mat& roi_blob = top_blobs[0];
+    roi_blob.create(4, picked_count, 1);
+    if (roi_blob.empty())
+        return -100;
+    float* outptr = roi_blob;
+    for (int i=0; i<picked_count; i++)
+    {
+        outptr[0] = proposal_boxes[ picked[i] ].box.x;
+        outptr[1] = proposal_boxes[ picked[i] ].box.y;
+        outptr[2] = proposal_boxes[ picked[i] ].box.width;
+        outptr[3] = proposal_boxes[ picked[i] ].box.height;
+
+        outptr += 4;
+    }
+
+    if (top_blobs.size() > 1)
+    {
+        Mat& roi_score_blob = top_blobs[1];
+        roi_score_blob.create(picked_count, 1, 1);
+        if (roi_score_blob.empty())
+            return -100;
+        float* outptr = roi_score_blob;
+        for (int i=0; i<picked_count; i++)
+        {
+            outptr[i] = proposal_boxes[ picked[i] ].score;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/proposal.h b/src/layer/proposal.h
new file mode 100644
index 00000000000..893a52e0e5b
--- /dev/null
+++ b/src/layer/proposal.h
@@ -0,0 +1,51 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_PROPOSAL_H
+#define LAYER_PROPOSAL_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+class Proposal : public Layer
+{
+public:
+    Proposal();
+    // parameter loaders: FILE* variants (only with NCNN_STDIO) and an in-memory buffer variant
+#if NCNN_STDIO
+#if NCNN_STRING
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    virtual int load_param_bin(FILE* paramfp);
+#endif // NCNN_STDIO
+    virtual int load_param(const unsigned char*& mem);
+    // reads bottom_blobs[0..2] (scores, bbox deltas, im_info); writes boxes to top_blobs[0] and, if present, scores to top_blobs[1]
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+
+public:
+    // param
+    int feat_stride;    // anchor shift applied per feature-map cell in x and y
+    int base_size;      // the base anchor is built from base_size - 1
+//     float ratio;
+//     float scale;
+    int pre_nms_topN;   // keep at most this many best-scored boxes before NMS; values <= 0 keep all
+    int after_nms_topN; // upper bound on the number of boxes written to the output
+    float nms_thresh;   // overlap ratio above which the lower-scored box is suppressed
+    int min_size;       // multiplied by im_info[2]; boxes narrower or lower than the result are dropped
+};
+
+} // namespace ncnn
+
+#endif // LAYER_PROPOSAL_H
diff --git a/src/layer/reduction.cpp b/src/layer/reduction.cpp
new file mode 100644
index 00000000000..180888ae0c5
--- /dev/null
+++ b/src/layer/reduction.cpp
@@ -0,0 +1,386 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "reduction.h"
+#include <math.h>
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Reduction)
+
+Reduction::Reduction()
+{
+    support_inplace = false; // forward() writes into a separate top blob
+    one_blob_only = true;    // forward() takes one bottom blob and one top blob
+}
+
+Reduction::~Reduction()
+{ // intentionally empty
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+int Reduction::load_param(FILE* paramfp)
+{
+    // text form: "<operation> <dim> <coeff>"; all three fields must parse
+    const int parsed = fscanf(paramfp, "%d %d %f", &operation, &dim, &coeff);
+    if (parsed == 3)
+        return 0;
+
+    // fewer than three fields converted: log the count and fail
+    fprintf(stderr, "Reduction load_param failed %d\n", parsed);
+    return -1;
+}
+#endif // NCNN_STRING
+int Reduction::load_param_bin(FILE* paramfp)
+{
+    // binary form: int operation, int dim, float coeff, in that order
+    if (fread(&operation, sizeof(int), 1, paramfp) != 1) return -1;
+    if (fread(&dim, sizeof(int), 1, paramfp) != 1) return -1;
+    if (fread(&coeff, sizeof(float), 1, paramfp) != 1) return -1;
+
+    // a short read would leave fields unset, so it fails with -1 like load_param(FILE*) does
+    return 0;
+}
+#endif // NCNN_STDIO
+
+int Reduction::load_param(const unsigned char*& mem)
+{
+    // three packed 4-byte fields: operation, dim, coeff
+    const unsigned char* cursor = mem;
+
+    operation = *(int*)(cursor);
+    dim = *(int*)(cursor + 4);
+    coeff = *(float*)(cursor + 8);
+
+    // leave the caller's pointer just past the 12 bytes consumed
+    mem = cursor + 12;
+    return 0;
+}
+
+int Reduction::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    int size = w * h; // elements in one channel plane
+    // Reduces bottom_blob with `operation` and scales by coeff: dim 0 -> 1 value, dim 1 -> one per channel, dim 2 -> one per row.
+    if (dim == 0)
+    {
+        top_blob.create(1);
+    }
+    else if (dim == 1)
+    {
+        top_blob.create(channels);
+    }
+    else if (dim == 2)
+    {
+        top_blob.create(h, channels); // NOTE(review): 2-D blob, yet the dim == 2 branches address it via channel(q) -- confirm that resolves to row q
+    }
+    if (top_blob.empty())
+        return -100; // nothing usable was allocated
+
+    if (operation == ReductionOp_SUM)
+    {
+        if (dim == 0)
+        {
+            Mat sums(channels);
+            if (sums.empty())
+                return -100;
+            float* sums_ptr = sums;
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob.channel(q);
+
+                float sum = 0.f;
+                for (int i=0; i<size; i++)
+                {
+                    sum += ptr[i];
+                }
+
+                sums_ptr[q] = sum;
+            }
+            // fold the per-channel partials: sums holds `channels` entries, not `size`
+            float* outptr = top_blob;
+
+            float sum = 0.f;
+            for (int i=0; i<channels; i++)
+            {
+                sum += sums_ptr[i];
+            }
+
+            outptr[0] = sum * coeff;
+        }
+        else if (dim == 1)
+        {
+            float* outptr = top_blob;
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob.channel(q);
+
+                float sum = 0.f;
+                for (int i=0; i<size; i++)
+                {
+                    sum += ptr[i];
+                }
+
+                outptr[q] = sum * coeff;
+            }
+        }
+        else if (dim == 2)
+        {
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob.channel(q);
+                float* outptr = top_blob.channel(q);
+
+                for (int i=0; i<h; i++)
+                {
+                    float sum = 0.f;
+                    for (int j=0; j<w; j++)
+                    {
+                        sum += ptr[j];
+                    }
+
+                    outptr[i] = sum * coeff;
+
+                    ptr += w;
+                }
+            }
+        }
+    }
+    else if (operation == ReductionOp_ASUM)
+    {
+        if (dim == 0)
+        {
+            Mat sums(channels);
+            if (sums.empty())
+                return -100;
+            float* sums_ptr = sums;
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob.channel(q);
+
+                float sum = 0.f;
+                for (int i=0; i<size; i++)
+                {
+                    sum += fabs(ptr[i]);
+                }
+
+                sums_ptr[q] = sum;
+            }
+            // fold the per-channel partials: sums holds `channels` entries, not `size`
+            float* outptr = top_blob;
+
+            float sum = 0.f;
+            for (int i=0; i<channels; i++)
+            {
+                sum += sums_ptr[i];
+            }
+
+            outptr[0] = sum * coeff;
+        }
+        else if (dim == 1)
+        {
+            float* outptr = top_blob;
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob.channel(q);
+
+                float sum = 0.f;
+                for (int i=0; i<size; i++)
+                {
+                    sum += fabs(ptr[i]);
+                }
+
+                outptr[q] = sum * coeff;
+            }
+        }
+        else if (dim == 2)
+        {
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob.channel(q);
+                float* outptr = top_blob.channel(q);
+
+                for (int i=0; i<h; i++)
+                {
+                    float sum = 0.f;
+                    for (int j=0; j<w; j++)
+                    {
+                        sum += fabs(ptr[j]);
+                    }
+
+                    outptr[i] = sum * coeff;
+
+                    ptr += w;
+                }
+            }
+        }
+    }
+    else if (operation == ReductionOp_SUMSQ)
+    {
+        if (dim == 0)
+        {
+            Mat sums(channels);
+            if (sums.empty())
+                return -100;
+            float* sums_ptr = sums;
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob.channel(q);
+
+                float sum = 0.f;
+                for (int i=0; i<size; i++)
+                {
+                    sum += ptr[i] * ptr[i];
+                }
+
+                sums_ptr[q] = sum;
+            }
+            // fold the per-channel partials: sums holds `channels` entries, not `size`
+            float* outptr = top_blob;
+
+            float sum = 0.f;
+            for (int i=0; i<channels; i++)
+            {
+                sum += sums_ptr[i];
+            }
+
+            outptr[0] = sum * coeff;
+        }
+        else if (dim == 1)
+        {
+            float* outptr = top_blob;
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob.channel(q);
+
+                float sum = 0.f;
+                for (int i=0; i<size; i++)
+                {
+                    sum += ptr[i] * ptr[i];
+                }
+
+                outptr[q] = sum * coeff;
+            }
+        }
+        else if (dim == 2)
+        {
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob.channel(q);
+                float* outptr = top_blob.channel(q);
+
+                for (int i=0; i<h; i++)
+                {
+                    float sum = 0.f;
+                    for (int j=0; j<w; j++)
+                    {
+                        sum += ptr[j] * ptr[j]; // column j of the current row (ptr already points at row i)
+                    }
+
+                    outptr[i] = sum * coeff;
+
+                    ptr += w;
+                }
+            }
+        }
+    }
+    else if (operation == ReductionOp_MEAN)
+    {
+        if (dim == 0)
+        {
+            Mat sums(channels);
+            if (sums.empty())
+                return -100;
+            float* sums_ptr = sums;
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob.channel(q);
+
+                float sum = 0.f;
+                for (int i=0; i<size; i++)
+                {
+                    sum += ptr[i];
+                }
+
+                sums_ptr[q] = sum;
+            }
+            // fold the per-channel partials: sums holds `channels` entries, not `size`
+            float* outptr = top_blob;
+
+            float sum = 0.f;
+            for (int i=0; i<channels; i++)
+            {
+                sum += sums_ptr[i];
+            }
+
+            outptr[0] = sum / (channels * size) * coeff;
+        }
+        else if (dim == 1)
+        {
+            float* outptr = top_blob;
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob.channel(q);
+
+                float sum = 0.f;
+                for (int i=0; i<size; i++)
+                {
+                    sum += ptr[i];
+                }
+
+                outptr[q] = sum / size * coeff;
+            }
+        }
+        else if (dim == 2)
+        {
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const float* ptr = bottom_blob.channel(q);
+                float* outptr = top_blob.channel(q);
+
+                for (int i=0; i<h; i++)
+                {
+                    float sum = 0.f;
+                    for (int j=0; j<w; j++)
+                    {
+                        sum += ptr[j];
+                    }
+
+                    outptr[i] = sum / w * coeff;
+
+                    ptr += w;
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/reduction.h b/src/layer/reduction.h
new file mode 100644
index 00000000000..7be3b110775
--- /dev/null
+++ b/src/layer/reduction.h
@@ -0,0 +1,49 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_REDUCTION_H
+#define LAYER_REDUCTION_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+class Reduction : public Layer
+{
+public:
+    Reduction();
+    virtual ~Reduction();
+    // parameter loaders: text stream (fscanf), binary stream (fread), and in-memory buffer
+#if NCNN_STDIO
+#if NCNN_STRING
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    virtual int load_param_bin(FILE* paramfp);
+#endif // NCNN_STDIO
+    virtual int load_param(const unsigned char*& mem);
+
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    // values accepted by `operation`: sum, sum of absolute values, sum of squares, mean
+    enum { ReductionOp_SUM = 0, ReductionOp_ASUM = 1, ReductionOp_SUMSQ = 2, ReductionOp_MEAN = 3 };
+
+public:
+    // param
+    int operation; // one of the ReductionOp_* values
+    int dim;       // 0: whole blob -> one value, 1: one value per channel, 2: one value per row
+    float coeff;   // multiplier applied to every reduced value
+};
+
+} // namespace ncnn
+
+#endif // LAYER_REDUCTION_H
diff --git a/src/layer/relu.cpp b/src/layer/relu.cpp
new file mode 100644
index 00000000000..58ab630380a
--- /dev/null
+++ b/src/layer/relu.cpp
@@ -0,0 +1,145 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "relu.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(ReLU)
+
+ReLU::ReLU()
+{
+    support_inplace = true; // forward_inplace() is implemented for this layer
+    one_blob_only = true;   // forward() takes one bottom blob and one top blob
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+int ReLU::load_param(FILE* paramfp)
+{
+    // text form: "<slope>"
+    const int parsed = fscanf(paramfp, "%f", &slope);
+    if (parsed == 1)
+        return 0;
+
+    // the field did not convert: log fscanf's result and fail
+    fprintf(stderr, "ReLU load_param failed %d\n", parsed);
+    return -1;
+}
+#endif // NCNN_STRING
+int ReLU::load_param_bin(FILE* paramfp)
+{
+    // binary form: a single float slope; a short read fails with -1 instead of leaving slope unset
+    if (fread(&slope, sizeof(float), 1, paramfp) != 1) return -1;
+    return 0;
+}
+#endif // NCNN_STDIO
+
+int ReLU::load_param(const unsigned char*& mem)
+{
+    const float* field = (const float*)(mem); // one 4-byte float: the slope
+    slope = field[0];
+    mem += 4; // step the caller's pointer past the consumed field
+    return 0;
+}
+
+int ReLU::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    int size = w * h;
+    // Writes the rectified copy of bottom_blob into top_blob (same w, h, c). Returns 0, or -100 if top_blob is empty after create().
+    top_blob.create(w, h, channels);
+    if (top_blob.empty())
+        return -100;
+    // slope == 0: negatives become 0; otherwise negatives are multiplied by slope
+    if (slope == 0.f)
+    {
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            float* outptr = top_blob.channel(q);
+
+            for (int i=0; i<size; i++)
+            {
+                if (ptr[i] < 0)
+                    outptr[i] = 0;
+                else
+                    outptr[i] = ptr[i];
+            }
+        }
+    }
+    else
+    {
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            float* outptr = top_blob.channel(q);
+            // top_blob was just (re)created, so the scaled value must come from the input, not from outptr itself
+            for (int i=0; i<size; i++)
+            {
+                if (ptr[i] < 0)
+                    outptr[i] = ptr[i] * slope;
+                else
+                    outptr[i] = ptr[i];
+            }
+        }
+    }
+
+    return 0;
+}
+
+int ReLU::forward_inplace(Mat& bottom_top_blob) const
+{
+    // Rectifies every channel plane of the blob in place; always returns 0.
+    const int channels = bottom_top_blob.c;
+    const int plane = bottom_top_blob.w * bottom_top_blob.h;
+
+    if (slope != 0.f)
+    {
+        // leaky variant: negative values are multiplied by slope
+        #pragma omp parallel for
+        for (int c=0; c<channels; c++)
+        {
+            float* values = bottom_top_blob.channel(c);
+            for (int k=0; k<plane; k++)
+            {
+                const float v = values[k];
+                if (v < 0)
+                    values[k] = v * slope;
+            }
+        }
+
+        return 0;
+    }
+
+    // plain variant: negative values are clamped to zero
+    #pragma omp parallel for
+    for (int c=0; c<channels; c++)
+    {
+        float* values = bottom_top_blob.channel(c);
+        for (int k=0; k<plane; k++)
+        {
+            if (values[k] < 0)
+                values[k] = 0;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/relu.h b/src/layer/relu.h
new file mode 100644
index 00000000000..3b9f8214ead
--- /dev/null
+++ b/src/layer/relu.h
@@ -0,0 +1,45 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_RELU_H
+#define LAYER_RELU_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+class ReLU : public Layer
+{
+public:
+    ReLU();
+    // parameter loaders: text stream (fscanf), binary stream (fread), and in-memory buffer
+#if NCNN_STDIO
+#if NCNN_STRING
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    virtual int load_param_bin(FILE* paramfp);
+#endif // NCNN_STDIO
+    virtual int load_param(const unsigned char*& mem);
+    // out-of-place pass: creates top_blob with bottom_blob's shape and fills it
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    // in-place pass: rewrites the negative entries of bottom_top_blob
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+
+public:
+    float slope; // factor applied to negative inputs; 0 clamps them to zero
+};
+
+} // namespace ncnn
+
+#endif // LAYER_RELU_H
diff --git a/src/layer/reshape.cpp b/src/layer/reshape.cpp
new file mode 100644
index 00000000000..b4cbfe7c0ba
--- /dev/null
+++ b/src/layer/reshape.cpp
@@ -0,0 +1,151 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "reshape.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Reshape)
+
+Reshape::Reshape()
+{
+    support_inplace = false; // forward() assigns a separate top blob
+    one_blob_only = true;    // forward() takes one bottom blob and one top blob
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+int Reshape::load_param(FILE* paramfp)
+{
+    // text form: "<w> <h> <c>"
+    const int parsed = fscanf(paramfp, "%d %d %d", &w, &h, &c);
+    if (parsed != 3)
+    {
+        fprintf(stderr, "Reshape load_param failed %d\n", parsed);
+        return -1;
+    }
+    // -233 marks an axis as absent; the lowest absent axis decides the rank
+    if (w == -233)
+        ndim = 0;
+    else if (h == -233)
+        ndim = 1;
+    else if (c == -233)
+        ndim = 2;
+    else
+        ndim = 3;
+    return 0;
+}
+#endif // NCNN_STRING
+int Reshape::load_param_bin(FILE* paramfp)
+{
+    // binary form: three ints w, h, c; a short read fails with -1 instead of being ignored
+    if (fread(&w, sizeof(int), 1, paramfp) != 1) return -1;
+    if (fread(&h, sizeof(int), 1, paramfp) != 1) return -1;
+    if (fread(&c, sizeof(int), 1, paramfp) != 1) return -1;
+
+    // -233 marks an axis as absent; later checks override earlier ones
+    ndim = 3;
+    if (c == -233)
+        ndim = 2;
+    if (h == -233)
+        ndim = 1;
+    if (w == -233)
+        ndim = 0;
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+int Reshape::load_param(const unsigned char*& mem)
+{
+    // three packed 4-byte ints: w, h, c
+    const unsigned char* cursor = mem;
+    w = *(int*)(cursor);
+    h = *(int*)(cursor + 4);
+    c = *(int*)(cursor + 8);
+    // leave the caller's pointer just past the 12 bytes consumed
+    mem = cursor + 12;
+
+    // -233 marks an axis as absent; the lowest absent axis decides the rank
+    if (w == -233)
+        ndim = 0;
+    else if (h == -233)
+        ndim = 1;
+    else if (c == -233)
+        ndim = 2;
+    else
+        ndim = 3;
+    return 0;
+}
+
+int Reshape::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    int total = bottom_blob.total();
+    // Per axis: 0 keeps the input's extent, -1 is inferred from the element count and the other axes.
+    if (ndim == 1)
+    {
+        int _w = w;
+        if (_w == 0)
+            _w = bottom_blob.w;
+        if (_w == -1)
+            _w = total;
+
+        top_blob = bottom_blob.reshape(_w);
+    }
+    else if (ndim == 2)
+    {
+        int _w = w;
+        int _h = h;
+
+        if (_w == 0)
+            _w = bottom_blob.w;
+        if (_h == 0)
+            _h = bottom_blob.h;
+
+        if (_w == -1)
+            _w = total / _h;
+        if (_h == -1)
+            _h = total / _w;
+
+        top_blob = bottom_blob.reshape(_w, _h);
+    }
+    else if (ndim == 3)
+    {
+        int _w = w;
+        int _h = h;
+        int _c = c;
+
+        if (_w == 0)
+            _w = bottom_blob.w;
+        if (_h == 0)
+            _h = bottom_blob.h;
+        if (_c == 0)
+            _c = bottom_blob.c;
+
+        if (_w == -1)
+            _w = total / _c / _h;
+        if (_h == -1)
+            _h = total / _c / _w;
+        if (_c == -1)
+            _c = total / _h / _w;
+
+        top_blob = bottom_blob.reshape(_w, _h, _c);
+    }
+    // No usable output (presumably a failed reshape(), or an ndim outside 1..3 that assigned nothing): report it.
+    if (top_blob.empty())
+        return -100;
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/reshape.h b/src/layer/reshape.h
new file mode 100644
index 00000000000..df847a58f02
--- /dev/null
+++ b/src/layer/reshape.h
@@ -0,0 +1,46 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_RESHAPE_H
+#define LAYER_RESHAPE_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+class Reshape : public Layer
+{
+public:
+    Reshape();
+    // parameter loaders: text stream (fscanf), binary stream (fread), and in-memory buffer
+#if NCNN_STDIO
+#if NCNN_STRING
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    virtual int load_param_bin(FILE* paramfp);
+#endif // NCNN_STDIO
+    virtual int load_param(const unsigned char*& mem);
+    // assigns bottom_blob.reshape(...) with the resolved extents to top_blob
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+private:
+    int w;    // target extents: 0 keeps the input's extent, -1 is inferred, -233 marks the axis as absent
+    int h;
+    int c;
+    int ndim; // rank of the target shape (0..3), derived from the -233 markers by the loaders
+};
+
+} // namespace ncnn
+
+#endif // LAYER_RESHAPE_H
diff --git a/src/layer/rnn.cpp b/src/layer/rnn.cpp
new file mode 100644
index 00000000000..36e6f7dcec2
--- /dev/null
+++ b/src/layer/rnn.cpp
@@ -0,0 +1,224 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "rnn.h"
+#include <math.h>
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(RNN)
+
+RNN::RNN()
+{
+    support_inplace = false; // forward() creates its own top blob
+    one_blob_only = false;   // forward() reads two bottom blobs
+}
+
+RNN::~RNN()
+{ // intentionally empty
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+int RNN::load_param(FILE* paramfp)
+{
+    // text form: "<num_output> <weight_data_size>"
+    const int parsed = fscanf(paramfp, "%d %d", &num_output, &weight_data_size);
+    if (parsed == 2)
+        return 0;
+
+    // fewer than two fields converted: log the count and fail
+    fprintf(stderr, "RNN load_param failed %d\n", parsed);
+    return -1;
+}
+#endif // NCNN_STRING
+int RNN::load_param_bin(FILE* paramfp)
+{
+    // binary form: int num_output, int weight_data_size; a short read fails with -1
+    if (fread(&num_output, sizeof(int), 1, paramfp) != 1) return -1;
+    if (fread(&weight_data_size, sizeof(int), 1, paramfp) != 1) return -1;
+
+    return 0; // both fields were read in full
+}
+
+int RNN::load_model(FILE* binfp)
+{
+    // Loads the weights from binfp; returns 0, -100 (allocation failed) or -1 (short read / unusable num_output).
+    if (num_output <= 0) // num_output is a divisor below; never divide by zero
+        return -1;
+    int size = (weight_data_size - num_output * num_output) / 2 / num_output;
+    // raw weight data, read in file order: W_hh, W_xh, W_ho, b_h, b_o
+    weight_hh_data.create(size, num_output);
+    if (weight_hh_data.empty())
+        return -100;
+    int nread = fread(weight_hh_data.data, size * num_output * sizeof(float), 1, binfp);
+    if (nread != 1)
+    {
+        fprintf(stderr, "RNN read weight_hh_data failed %d\n", nread);
+        return -1;
+    }
+
+    weight_xh_data.create(size, num_output);
+    if (weight_xh_data.empty())
+        return -100;
+    nread = fread(weight_xh_data.data, size * num_output * sizeof(float), 1, binfp);
+    if (nread != 1)
+    {
+        fprintf(stderr, "RNN read weight_xh_data failed %d\n", nread);
+        return -1;
+    }
+
+    weight_ho_data.create(num_output, num_output);
+    if (weight_ho_data.empty())
+        return -100;
+    nread = fread(weight_ho_data.data, num_output * num_output * sizeof(float), 1, binfp);
+    if (nread != 1)
+    {
+        fprintf(stderr, "RNN read weight_ho_data failed %d\n", nread);
+        return -1;
+    }
+
+    bias_h_data.create(num_output);
+    if (bias_h_data.empty())
+        return -100;
+    nread = fread(bias_h_data.data, num_output * sizeof(float), 1, binfp);
+    if (nread != 1)
+    {
+        fprintf(stderr, "RNN read bias_h_data failed %d\n", nread);
+        return -1;
+    }
+
+    bias_o_data.create(num_output);
+    if (bias_o_data.empty())
+        return -100;
+    nread = fread(bias_o_data.data, num_output * sizeof(float), 1, binfp);
+    if (nread != 1)
+    {
+        fprintf(stderr, "RNN read bias_o_data failed %d\n", nread);
+        return -1;
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+int RNN::load_param(const unsigned char*& mem)
+{
+    // two packed 4-byte ints: num_output, weight_data_size
+    const unsigned char* cursor = mem;
+    num_output = *(int*)(cursor);
+    weight_data_size = *(int*)(cursor + 4);
+
+    mem = cursor + 8; // caller's pointer now sits just past both fields
+    return 0;
+}
+
+int RNN::load_model(const unsigned char*& mem)
+{
+    // num_output is a divisor below; reject a non-positive value instead of dividing by zero
+    if (num_output <= 0)
+        return -1;
+    int size = (weight_data_size - num_output * num_output) / 2 / num_output;
+
+    // raw weight data: each Mat is constructed over `mem`, in the order W_hh, W_xh, W_ho, b_h, b_o
+    weight_hh_data = Mat(size, num_output, (float*)mem);
+    mem += size * num_output * sizeof(float);
+    weight_xh_data = Mat(size, num_output, (float*)mem);
+    mem += size * num_output * sizeof(float);
+    weight_ho_data = Mat(num_output, num_output, (float*)mem);
+    mem += num_output * num_output * sizeof(float);
+
+    bias_h_data = Mat(num_output, (float*)mem);
+    mem += num_output * sizeof(float);
+    bias_o_data = Mat(num_output, (float*)mem);
+    mem += num_output * sizeof(float);
+
+    return 0;
+}
+
+int RNN::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+{
+    // size x 1 x T
+    const Mat& input_blob = bottom_blobs[0];
+
+    // T, 0 or 1 each
+    const Mat& cont_blob = bottom_blobs[1];
+
+    int T = input_blob.c;
+    int size = input_blob.w;
+
+    // initial hidden state
+    Mat hidden(num_output);
+    if (hidden.empty())
+        return -100;
+    hidden.fill(0.f);
+    // output: one num_output-wide row per time step
+    Mat& top_blob = top_blobs[0];
+    top_blob.create(num_output, 1, T);
+    if (top_blob.empty())
+        return -100;
+
+    // unroll
+    for (int t=0; t<T; t++)
+    {
+        // clip hidden by continuation indicator
+        // h_cont_{t-1} = cont_t * h_{t-1}
+        // h_cont_{t-1} = h_{t-1} if cont_t == 1
+        //                0       otherwise
+        // calculate hidden
+        // h_t = tanh( W_hh * h_cont_{t-1} + W_xh * x_t + b_h )
+        const float cont = cont_blob.data[t];
+        const Mat x = input_blob.channel(t);
+        float* hidden_data = hidden;
+        for (int q=0; q<num_output; q++)
+        {
+            float h_cont = cont ? hidden_data[q] : 0.f;
+            // NOTE(review): every W_hh weight of row q is multiplied by the scalar h_cont (hidden[q]), not by a hidden vector -- confirm against the formula above
+            const float* weight_hh_data_ptr = weight_hh_data.data + weight_hh_data.w * q;
+            const float* weight_xh_data_ptr = weight_xh_data.data + weight_xh_data.w * q;
+            const float* x_data = x;
+
+            float s0 = bias_h_data.data[q];
+            for (int i=0; i<size; i++)
+            {
+                s0 += weight_hh_data_ptr[i] * h_cont + weight_xh_data_ptr[i] * x_data[i];
+            }
+
+            hidden_data[q] = tanh(s0);
+        }
+
+        // calculate output
+        // o_t = tanh( W_ho * h_t + b_o )
+        Mat output = top_blob.channel(t);
+        float* output_data = output;
+        for (int q=0; q<num_output; q++)
+        {
+            const float* weight_ho_data_ptr = weight_ho_data.data + weight_ho_data.w * q;
+            // hidden and each W_ho row hold num_output entries, so the dot product runs over num_output (not the input width)
+            float s0 = bias_o_data.data[q];
+            for (int i=0; i<num_output; i++)
+            {
+                s0 += weight_ho_data_ptr[i] * hidden_data[i];
+            }
+
+            output_data[q] = tanh(s0);
+        }
+
+        // no hidden output here
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/rnn.h b/src/layer/rnn.h
new file mode 100644
index 00000000000..a3afd4f3b16
--- /dev/null
+++ b/src/layer/rnn.h
@@ -0,0 +1,55 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_RNN_H
+#define LAYER_RNN_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+class RNN : public Layer
+{
+public:
+    RNN();
+    virtual ~RNN();
+    // load_param* read the two scalar parameters; load_model* read the five weight/bias blobs
+#if NCNN_STDIO
+#if NCNN_STRING
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    virtual int load_param_bin(FILE* paramfp);
+    virtual int load_model(FILE* binfp);
+#endif // NCNN_STDIO
+    virtual int load_param(const unsigned char*& mem);
+    virtual int load_model(const unsigned char*& mem);
+    // bottom_blobs[0]: input sequence, bottom_blobs[1]: per-step continuation flags; top_blobs[0] receives the output
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+
+public:
+    // param
+    int num_output;       // length of the hidden state and of each output row
+    int weight_data_size; // total weight count; load_model derives the W_hh / W_xh row width from it
+
+    // model
+    Mat weight_hh_data; // created as (size, num_output)
+    Mat weight_xh_data; // created as (size, num_output)
+    Mat weight_ho_data; // created as (num_output, num_output)
+    Mat bias_h_data;    // num_output entries
+    Mat bias_o_data;    // num_output entries
+};
+
+} // namespace ncnn
+
+#endif // LAYER_RNN_H
diff --git a/src/layer/roipooling.cpp b/src/layer/roipooling.cpp
new file mode 100644
index 00000000000..77670a45f7e
--- /dev/null
+++ b/src/layer/roipooling.cpp
@@ -0,0 +1,143 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "roipooling.h"
+#include <math.h>
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(ROIPooling)
+
+ROIPooling::ROIPooling()
+{ // NOTE(review): sets neither one_blob_only nor support_inplace, unlike the sibling layers -- presumably relies on Layer's defaults; confirm
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+int ROIPooling::load_param(FILE* paramfp)
+{
+    // text form: "<pooled_width> <pooled_height> <spatial_scale>"
+    const int parsed = fscanf(paramfp, "%d %d %f", &pooled_width, &pooled_height, &spatial_scale);
+    if (parsed == 3)
+        return 0;
+
+    // fewer than three fields converted: log the count and fail
+    fprintf(stderr, "ROIPooling load_param failed %d\n", parsed);
+
+    return -1;
+}
+#endif // NCNN_STRING
+int ROIPooling::load_param_bin(FILE* paramfp)
+{
+    // binary form: int pooled_width, int pooled_height, float spatial_scale, in that order
+    if (fread(&pooled_width, sizeof(int), 1, paramfp) != 1) return -1;
+    if (fread(&pooled_height, sizeof(int), 1, paramfp) != 1) return -1;
+    if (fread(&spatial_scale, sizeof(float), 1, paramfp) != 1) return -1;
+
+    // a short read would leave fields unset, so it fails with -1 like load_param(FILE*) does
+    return 0;
+}
+#endif // NCNN_STDIO
+
+int ROIPooling::load_param(const unsigned char*& mem)
+{
+    // three packed 4-byte fields: pooled_width, pooled_height, spatial_scale
+    const unsigned char* cursor = mem;
+
+    pooled_width = *(int*)(cursor);
+    pooled_height = *(int*)(cursor + 4);
+    spatial_scale = *(float*)(cursor + 8);
+
+    // leave the caller's pointer just past the 12 bytes consumed
+    mem = cursor + 12;
+    return 0;
+}
+
+int ROIPooling::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+
+    const Mat& roi_blob = bottom_blobs[1];
+    int num_roi = roi_blob.c;
+
+    Mat& top_blob = top_blobs[0];
+    top_blob.create(pooled_width, pooled_height, channels);
+    if (top_blob.empty())
+        return -100;
+
+    // For each ROI R = [x y w h]: max pool over R. All ROIs write the same top_blob, so they are
+    // processed in order (the last ROI wins) and the threads split the channels of one ROI instead.
+    for (int n = 0; n < num_roi; n++)
+    {
+        const float* roi_ptr = roi_blob.data + 4 * n;
+        // NOTE(review): assumes ROIs are packed 4 floats apart while the count comes from roi_blob.c -- confirm layout
+        int roi_x = round(roi_ptr[0] * spatial_scale);
+        int roi_y = round(roi_ptr[1] * spatial_scale);
+        int roi_w = round(roi_ptr[2] * spatial_scale);
+        int roi_h = round(roi_ptr[3] * spatial_scale);
+
+        float bin_size_w = (float)roi_w / (float)pooled_width;
+        float bin_size_h = (float)roi_h / (float)pooled_height;
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            float* outptr = top_blob.channel(q);
+
+            for (int ph = 0; ph < pooled_height; ph++)
+            {
+                for (int pw = 0; pw < pooled_width; pw++)
+                {
+                    // Compute pooling region for this output unit:
+                    //  start (included) = floor(ph * roi_height / pooled_height)
+                    //  end (excluded) = ceil((ph + 1) * roi_height / pooled_height)
+                    int hstart = roi_y + floor((float)(ph) * bin_size_h);
+                    int wstart = roi_x + floor((float)(pw) * bin_size_w);
+                    int hend = roi_y + ceil((float)(ph + 1) * bin_size_h);
+                    int wend = roi_x + ceil((float)(pw + 1) * bin_size_w);
+
+                    hstart = std::min(std::max(hstart, 0), h);
+                    wstart = std::min(std::max(wstart, 0), w);
+                    hend = std::min(std::max(hend, 0), h);
+                    wend = std::min(std::max(wend, 0), w);
+
+                    bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+                    float max = is_empty ? 0.f : ptr[hstart * w + wstart];
+
+                    for (int y = hstart; y < hend; y++)
+                    {
+                        for (int x = wstart; x < wend; x++)
+                        {
+                            int index = y * w + x;
+                            max = std::max(max, ptr[index]);
+                        }
+                    }
+
+                    outptr[pw] = max;
+                }
+
+                outptr += pooled_width;
+            }
+        }
+
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/roipooling.h b/src/layer/roipooling.h
new file mode 100644
index 00000000000..560bd941824
--- /dev/null
+++ b/src/layer/roipooling.h
@@ -0,0 +1,45 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_ROIPOOLING_H
+#define LAYER_ROIPOOLING_H
+
+#include "layer.h"
+
+namespace ncnn {
+// ROI pooling layer; stores the pooled output grid size and a spatial scale factor.
+class ROIPooling : public Layer
+{
+public:
+    ROIPooling();
+    // Parameter loaders: FILE*-based (load_param / load_param_bin) and in-memory buffer.
+#if NCNN_STDIO
+#if NCNN_STRING
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    virtual int load_param_bin(FILE* paramfp);
+#endif // NCNN_STDIO
+    virtual int load_param(const unsigned char*& mem);
+    // Forward pass over vectors of input and output blobs.
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+
+public:
+    int pooled_width;    // number of output bins per row
+    int pooled_height;   // number of output rows of bins
+    float spatial_scale; // NOTE(review): presumably scales ROI coordinates onto the feature map - confirm
+};
+
+} // namespace ncnn
+
+#endif // LAYER_ROIPOOLING_H
diff --git a/src/layer/scale.cpp b/src/layer/scale.cpp
new file mode 100644
index 00000000000..61bda8cb2d0
--- /dev/null
+++ b/src/layer/scale.cpp
@@ -0,0 +1,204 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "scale.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Scale)
+// Flag the layer as single-blob and in-place capable.
+Scale::Scale()
+{
+    support_inplace = true;
+    one_blob_only = true;
+}
+// Empty destructor body; nothing is released explicitly here.
+Scale::~Scale()
+{
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+// Parse "scale_data_size bias_term" from the text param stream; 0 on success, -1 otherwise.
+int Scale::load_param(FILE* paramfp)
+{
+    const int fields_read = fscanf(paramfp, "%d %d", &scale_data_size, &bias_term);
+    if (fields_read == 2)
+        return 0;
+
+    // log how many fields were parsed before failing
+    fprintf(stderr, "Scale load_param failed %d\n", fields_read);
+    return -1;
+}
+#endif // NCNN_STRING
+// Read scale_data_size then bias_term as raw ints; returns 0, or -1 on a short read.
+int Scale::load_param_bin(FILE* paramfp)
+{
+    if (fread(&scale_data_size, sizeof(int), 1, paramfp) != 1
+        || fread(&bias_term, sizeof(int), 1, paramfp) != 1)
+        return -1;
+    return 0;
+}
+// Read scale_data_size floats into scale_data and, when bias_term is set, as many into
+// bias_data. Returns 0 on success, -100 on allocation failure, -1 on a short read.
+int Scale::load_model(FILE* binfp)
+{
+    scale_data.create(1, scale_data_size);
+    if (scale_data.empty()) // allocation failed: never fread into a null buffer
+        return -100;
+    int nread = fread(scale_data, scale_data_size * sizeof(float), 1, binfp);
+    if (nread != 1)
+    {
+        fprintf(stderr, "Scale read scale_data failed %d\n", nread);
+        return -1;
+    }
+
+    if (bias_term)
+    {
+        bias_data.create(scale_data_size);
+        if (bias_data.empty())
+            return -100;
+        nread = fread(bias_data, scale_data_size * sizeof(float), 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Scale read bias_data failed %d\n", nread);
+            return -1;
+        }
+    }
+    return 0;
+}
+#endif // NCNN_STDIO
+// Decode scale_data_size then bias_term (4 bytes each) and advance mem past both.
+int Scale::load_param(const unsigned char*& mem)
+{
+    const unsigned char* cursor = mem;
+
+    scale_data_size = *(int*)(cursor);
+    bias_term = *(int*)(cursor + 4);
+
+    mem = cursor + 8; // two 4-byte fields consumed
+    return 0;
+}
+// Build scale_data (and bias_data when bias_term is set) from the floats at mem, advancing mem.
+int Scale::load_model(const unsigned char*& mem)
+{
+    const size_t block_bytes = scale_data_size * sizeof(float);
+
+    scale_data = Mat(1, scale_data_size, (float*)mem);
+    mem += block_bytes;
+
+    if (!bias_term)
+        return 0;
+    bias_data = Mat(scale_data_size, (float*)mem);
+    mem += block_bytes;
+    return 0;
+}
+// Per-channel affine transform written to a newly created top_blob:
+//   top[q][i] = bottom[q][i] * scale_data[q]  (+ bias_data[q] when bias_term is set)
+// Returns 0 on success, -100 when top_blob cannot be created.
+int Scale::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    const int channels = bottom_blob.c;
+    const int w = bottom_blob.w;
+    const int h = bottom_blob.h;
+    const int size = w * h;
+
+    // the output has the same dimensions as the input
+    top_blob.create(w, h, channels);
+    if (top_blob.empty())
+        return -100;
+
+    const float* scale_ptr = scale_data;
+
+    if (!bias_term)
+    {
+        // multiply only
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            const float* src = bottom_blob.channel(q);
+            float* dst = top_blob.channel(q);
+            const float factor = scale_ptr[q];
+
+            for (int i=0; i<size; i++)
+                dst[i] = src[i] * factor;
+        }
+
+        return 0;
+    }
+
+    // multiply, then add the channel's bias
+    const float* bias_ptr = bias_data;
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float* src = bottom_blob.channel(q);
+        float* dst = top_blob.channel(q);
+        const float factor = scale_ptr[q];
+        const float offset = bias_ptr[q];
+
+        for (int i=0; i<size; i++)
+            dst[i] = src[i] * factor + offset;
+    }
+
+    return 0;
+}
+// In-place per-channel transform: every element of channel q is multiplied by
+// scale_data[q], and bias_data[q] is added when bias_term is set. Always returns 0.
+int Scale::forward_inplace(Mat& bottom_top_blob) const
+{
+    const int channels = bottom_top_blob.c;
+    const int size = bottom_top_blob.w * bottom_top_blob.h;
+
+    const float* scale_ptr = scale_data;
+
+    if (!bias_term)
+    {
+        // multiply only
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            float* data = bottom_top_blob.channel(q);
+            const float factor = scale_ptr[q];
+
+            for (int i=0; i<size; i++)
+            {
+                data[i] *= factor;
+            }
+        }
+
+        return 0;
+    }
+
+    // multiply, then add the channel's bias
+    const float* bias_ptr = bias_data;
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        float* data = bottom_top_blob.channel(q);
+        const float factor = scale_ptr[q];
+        const float offset = bias_ptr[q];
+
+        for (int i=0; i<size; i++)
+        {
+            data[i] = data[i] * factor + offset;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/scale.h b/src/layer/scale.h
new file mode 100644
index 00000000000..31625a4798b
--- /dev/null
+++ b/src/layer/scale.h
@@ -0,0 +1,54 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SCALE_H
+#define LAYER_SCALE_H
+
+#include "layer.h"
+
+namespace ncnn {
+// Per-channel scale layer: multiplies channel q by scale_data[q], optionally adding bias_data[q].
+class Scale : public Layer
+{
+public:
+    Scale();
+    virtual ~Scale();
+    // Loaders for the two params and the weight data, from a FILE* or from a memory buffer.
+#if NCNN_STDIO
+#if NCNN_STRING
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    virtual int load_param_bin(FILE* paramfp);
+    virtual int load_model(FILE* binfp);
+#endif // NCNN_STDIO
+    virtual int load_param(const unsigned char*& mem);
+    virtual int load_model(const unsigned char*& mem);
+    // Writes the transformed input into top_blob.
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    // Applies the same transform directly to bottom_top_blob.
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+
+public:
+    // param
+    int scale_data_size; // number of floats in scale_data (and in bias_data)
+    int bias_term;       // non-zero when bias_data is loaded and applied
+
+    // model
+    Mat scale_data; // per-channel multipliers
+    Mat bias_data;  // per-channel offsets, only loaded when bias_term is set
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SCALE_H
diff --git a/src/layer/sigmoid.cpp b/src/layer/sigmoid.cpp
new file mode 100644
index 00000000000..1d398854e18
--- /dev/null
+++ b/src/layer/sigmoid.cpp
@@ -0,0 +1,75 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "sigmoid.h"
+#include <math.h>
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Sigmoid)
+// Flag the layer as single-blob and in-place capable.
+Sigmoid::Sigmoid()
+{
+    support_inplace = true;
+    one_blob_only = true;
+}
+// Write 1 / (1 + exp(-x)) for every element x of bottom_blob into top_blob; -100 if create fails.
+int Sigmoid::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    const int channels = bottom_blob.c;
+    const int w = bottom_blob.w;
+    const int h = bottom_blob.h;
+    const int size = w * h;
+
+    top_blob.create(w, h, channels);
+    if (top_blob.empty())
+        return -100;
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float* src = bottom_blob.channel(q);
+        float* dst = top_blob.channel(q);
+
+        for (int i=0; i<size; i++)
+        {
+            dst[i] = 1.f / (1.f + exp(-src[i]));
+        }
+    }
+
+    return 0;
+}
+// Replace every element x of bottom_top_blob with 1 / (1 + exp(-x)); always returns 0.
+int Sigmoid::forward_inplace(Mat& bottom_top_blob) const
+{
+    const int channels = bottom_top_blob.c;
+    const int size = bottom_top_blob.w * bottom_top_blob.h;
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        float* data = bottom_top_blob.channel(q);
+
+        for (int i=0; i<size; i++)
+        {
+            const float x = data[i];
+            data[i] = 1.f / (1.f + exp(-x));
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/sigmoid.h b/src/layer/sigmoid.h
new file mode 100644
index 00000000000..8873afb382f
--- /dev/null
+++ b/src/layer/sigmoid.h
@@ -0,0 +1,36 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SIGMOID_H
+#define LAYER_SIGMOID_H
+
+#include "layer.h"
+
+namespace ncnn {
+// Element-wise sigmoid activation layer.
+class Sigmoid : public Layer
+{
+public:
+    Sigmoid();
+    // Writes the activation of bottom_blob into top_blob.
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    // Applies the activation directly to bottom_top_blob.
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+
+public: // no layer-specific members
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SIGMOID_H
diff --git a/src/layer/slice.cpp b/src/layer/slice.cpp
new file mode 100644
index 00000000000..b78208d80c4
--- /dev/null
+++ b/src/layer/slice.cpp
@@ -0,0 +1,115 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "slice.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Slice)
+// Constructor with an empty body; no layer flags are set here.
+Slice::Slice()
+{
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+// Text params: num_slice, then num_slice integer slice sizes stored into slices.
+int Slice::load_param(FILE* paramfp)
+{
+    int parsed = fscanf(paramfp, "%d", &num_slice);
+    if (parsed != 1)
+    {
+        fprintf(stderr, "Slice load_param failed %d\n", parsed);
+        return -1;
+    }
+
+    slices.create(num_slice);
+    if (slices.empty())
+        return -100;
+    // the Mat storage is used as a table of ints
+    int* entry = (int*)slices.data;
+    for (int* const end = entry + num_slice; entry < end; entry++)
+    {
+        parsed = fscanf(paramfp, "%d", entry);
+        if (parsed == 1)
+            continue;
+        fprintf(stderr, "Slice load_param failed %d\n", parsed);
+        return -1;
+    }
+    return 0;
+}
+#endif // NCNN_STRING
+// Binary params: int num_slice, then num_slice ints. Returns -1 on a short read, -100 on alloc failure.
+int Slice::load_param_bin(FILE* paramfp)
+{
+    if (fread(&num_slice, sizeof(int), 1, paramfp) != 1)
+        return -1;
+    slices.create(num_slice);
+    if (slices.empty())
+        return -100;
+    if (fread(slices.data, sizeof(int), num_slice, paramfp) != (size_t)num_slice)
+        return -1;
+    return 0;
+}
+#endif // NCNN_STDIO
+// Read num_slice from mem, then build slices from the num_slice ints that follow it.
+int Slice::load_param(const unsigned char*& mem)
+{
+    const unsigned char* cursor = mem;
+    num_slice = *(int*)(cursor);
+    cursor += 4;
+    // the int table is handed to Mat through a float pointer
+    slices = Mat(num_slice, (float*)cursor);
+    mem = cursor + num_slice * sizeof(int);
+    return 0;
+}
+// Split bottom_blobs[0] along channels: output i gets slices[i] channels (-233 = even share of
+// the remainder). Returns 0, -100 on allocation failure, or -1 on an inconsistent slice table.
+int Slice::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+
+    if (num_slice < 0 || top_blobs.size() > (size_t)num_slice) // one table entry per output
+        return -1;
+
+    int q = 0;
+    const int* slices_ptr = (const int*)slices.data;
+    for (size_t i=0; i<top_blobs.size(); i++)
+    {
+        int slice = slices_ptr[i];
+        if (slice == -233)
+            slice = (channels - q) / (int)(top_blobs.size() - i);
+        if (slice < 0 || slice > channels - q) // would read past the input channels
+            return -1;
+
+        Mat& top_blob = top_blobs[i];
+        top_blob.create(w, h, slice);
+        if (top_blob.empty())
+            return -100;
+
+        // flat copy of cstep * slice floats starting at input channel q
+        int size = bottom_blob.cstep * slice;
+        const float* ptr = bottom_blob.channel(q);
+        float* outptr = top_blob.data;
+        for (int j=0; j<size; j++)
+            outptr[j] = ptr[j];
+        q += slice;
+    }
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/slice.h b/src/layer/slice.h
new file mode 100644
index 00000000000..b3ba88a43ef
--- /dev/null
+++ b/src/layer/slice.h
@@ -0,0 +1,44 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SLICE_H
+#define LAYER_SLICE_H
+
+#include "layer.h"
+
+namespace ncnn {
+// Channel-wise slicing layer: splits its first input blob into several output blobs.
+class Slice : public Layer
+{
+public:
+    Slice();
+    // Parameter loaders: FILE*-based (load_param / load_param_bin) and in-memory buffer.
+#if NCNN_STDIO
+#if NCNN_STRING
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    virtual int load_param_bin(FILE* paramfp);
+#endif // NCNN_STDIO
+    virtual int load_param(const unsigned char*& mem);
+    // Fills each entry of top_blobs with a run of channels taken from bottom_blobs[0].
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+
+public:
+    int num_slice; // number of entries stored in slices
+    Mat slices;    // slice sizes, stored as ints inside the Mat's buffer
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SLICE_H
diff --git a/src/layer/softmax.cpp b/src/layer/softmax.cpp
new file mode 100644
index 00000000000..d695ce96949
--- /dev/null
+++ b/src/layer/softmax.cpp
@@ -0,0 +1,174 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "softmax.h"
+#include <float.h>
+#include <math.h>
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Softmax)
+// Flag the layer as single-blob and in-place capable.
+Softmax::Softmax()
+{
+    support_inplace = true;
+    one_blob_only = true;
+}
+// Softmax over the channel axis, computed independently at each of the w * h positions:
+//   top[q][i] = exp(bottom[q][i] - m[i]) / sum over q' of exp(bottom[q'][i] - m[i])
+// where m[i] is the largest value found at position i across all channels.
+// Returns 0 on success, -100 when top_blob or a scratch Mat cannot be created.
+int Softmax::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    const int channels = bottom_blob.c;
+    const int w = bottom_blob.w;
+    const int h = bottom_blob.h;
+    const int size = w * h;
+
+    top_blob.create(w, h, channels);
+    if (top_blob.empty())
+        return -100;
+
+    // pass 1: per-position maximum over the channels
+    Mat peak;
+    peak.create(w, h);
+    if (peak.empty())
+        return -100;
+    peak.fill(-FLT_MAX);
+    for (int q=0; q<channels; q++)
+    {
+        const float* src = bottom_blob.channel(q);
+        float* peak_ptr = peak;
+
+        for (int i=0; i<size; i++)
+            peak_ptr[i] = std::max(peak_ptr[i], src[i]);
+    }
+
+    // pass 2: exponentiate; subtracting the maximum first keeps every
+    // exponent at or below zero
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float* src = bottom_blob.channel(q);
+        float* dst = top_blob.channel(q);
+        float* peak_ptr = peak;
+
+        for (int i=0; i<size; i++)
+        {
+            dst[i] = exp(src[i] - peak_ptr[i]);
+        }
+    }
+
+    // pass 3: per-position sum of the exponentials
+    Mat total;
+    total.create(w, h);
+    if (total.empty())
+        return -100;
+    total.fill(0.f);
+    for (int q=0; q<channels; q++)
+    {
+        const float* dst = top_blob.channel(q);
+        float* total_ptr = total;
+
+        for (int i=0; i<size; i++)
+            total_ptr[i] += dst[i];
+    }
+
+    // pass 4: divide by the sum so the channels at each position add up to one
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        float* dst = top_blob.channel(q);
+        float* total_ptr = total;
+
+        for (int i=0; i<size; i++)
+        {
+            dst[i] /= total_ptr[i];
+        }
+    }
+
+    return 0;
+}
+// In-place softmax over the channel axis, computed independently at each of the w * h
+// positions: every value becomes exp(value - m[i]) divided by the sum of those
+// exponentials over all channels, where m[i] is the per-position maximum.
+// Returns 0 on success, -100 when a scratch Mat cannot be created.
+int Softmax::forward_inplace(Mat& bottom_top_blob) const
+{
+    const int channels = bottom_top_blob.c;
+    const int w = bottom_top_blob.w;
+    const int h = bottom_top_blob.h;
+    const int size = w * h;
+
+    // pass 1: per-position maximum over the channels
+    Mat peak;
+    peak.create(w, h);
+    if (peak.empty())
+        return -100;
+    peak.fill(-FLT_MAX);
+    for (int q=0; q<channels; q++)
+    {
+        const float* data = bottom_top_blob.channel(q);
+        float* peak_ptr = peak;
+
+        for (int i=0; i<size; i++)
+            peak_ptr[i] = std::max(peak_ptr[i], data[i]);
+    }
+
+    // pass 2: exponentiate; subtracting the maximum first keeps every
+    // exponent at or below zero
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        float* data = bottom_top_blob.channel(q);
+        float* peak_ptr = peak;
+
+        for (int i=0; i<size; i++)
+        {
+            data[i] = exp(data[i] - peak_ptr[i]);
+        }
+    }
+
+    // pass 3: per-position sum of the exponentials
+    Mat total;
+    total.create(w, h);
+    if (total.empty())
+        return -100;
+    total.fill(0.f);
+    for (int q=0; q<channels; q++)
+    {
+        const float* data = bottom_top_blob.channel(q);
+        float* total_ptr = total;
+
+        for (int i=0; i<size; i++)
+            total_ptr[i] += data[i];
+    }
+
+    // pass 4: divide by the sum so the channels at each position add up to one
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        float* data = bottom_top_blob.channel(q);
+        float* total_ptr = total;
+
+        for (int i=0; i<size; i++)
+        {
+            data[i] /= total_ptr[i];
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/softmax.h b/src/layer/softmax.h
new file mode 100644
index 00000000000..37591d9e214
--- /dev/null
+++ b/src/layer/softmax.h
@@ -0,0 +1,36 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SOFTMAX_H
+#define LAYER_SOFTMAX_H
+
+#include "layer.h"
+
+namespace ncnn {
+// Softmax layer; normalizes across channels at each spatial position.
+class Softmax : public Layer
+{
+public:
+    Softmax();
+    // Writes the normalized values of bottom_blob into top_blob.
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    // Normalizes bottom_top_blob directly.
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+
+public: // no layer-specific members
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SOFTMAX_H
diff --git a/src/layer/split.cpp b/src/layer/split.cpp
new file mode 100644
index 00000000000..fb8ee34d5cf
--- /dev/null
+++ b/src/layer/split.cpp
@@ -0,0 +1,36 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "split.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Split)
+// Constructor with an empty body; no layer flags are set here.
+Split::Split()
+{
+}
+// Assign bottom_blobs[0] to every entry of top_blobs; always returns 0.
+int Split::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const
+{
+    const Mat& source = bottom_blobs[0];
+    const size_t count = top_blobs.size();
+
+    for (size_t k = 0; k != count; ++k)
+        top_blobs[k] = source;
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/split.h b/src/layer/split.h
new file mode 100644
index 00000000000..0239d9d56c2
--- /dev/null
+++ b/src/layer/split.h
@@ -0,0 +1,34 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SPLIT_H
+#define LAYER_SPLIT_H
+
+#include "layer.h"
+
+namespace ncnn {
+// Fan-out layer: every output blob is assigned the first input blob.
+class Split : public Layer
+{
+public:
+    Split();
+    // Assigns bottom_blobs[0] to each entry of top_blobs.
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+
+public: // no layer-specific members
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SPLIT_H
diff --git a/src/layer/spp.cpp b/src/layer/spp.cpp
new file mode 100644
index 00000000000..4bb78b7d268
--- /dev/null
+++ b/src/layer/spp.cpp
@@ -0,0 +1,196 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "spp.h"
+#include <math.h>
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(SPP)
+// Flag the layer as single-blob and not in-place capable.
+SPP::SPP()
+{
+    support_inplace = false;
+    one_blob_only = true;
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+// Parse "pooling_type pyramid_height" from the text param stream.
+// Returns 0 on success, -1 (after logging to stderr) when both fields cannot be read.
+int SPP::load_param(FILE* paramfp)
+{
+    const int fields_read = fscanf(paramfp, "%d %d", &pooling_type, &pyramid_height);
+    if (fields_read == 2)
+        return 0;
+
+    // log how many fields were parsed before failing
+    fprintf(stderr, "SPP load_param failed %d\n", fields_read);
+    return -1;
+}
+#endif // NCNN_STRING
+// Read pooling_type then pyramid_height as raw ints; returns 0, or -1 on a short read.
+int SPP::load_param_bin(FILE* paramfp)
+{
+    if (fread(&pooling_type, sizeof(int), 1, paramfp) != 1
+        || fread(&pyramid_height, sizeof(int), 1, paramfp) != 1)
+        return -1;
+    return 0;
+}
+#endif // NCNN_STDIO
+// Decode pooling_type then pyramid_height (4 bytes each) and advance mem past both.
+int SPP::load_param(const unsigned char*& mem)
+{
+    const unsigned char* cursor = mem;
+
+    pooling_type = *(int*)(cursor);
+    pyramid_height = *(int*)(cursor + 4);
+
+    mem = cursor + 8; // two 4-byte fields consumed
+    return 0;
+}
+// Spatial pyramid pooling: level p (0 <= p < pyramid_height) pools each channel into
+// 2^p x 2^p bins (max or average, per pooling_type); all levels are written back to back
+// through top_blob's data pointer. Returns 0 on success, -100 on allocation failure.
+int SPP::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    // bins per channel: 1 + 4 + 16 + ... + 4^(pyramid_height-1) = (4^pyramid_height - 1) / 3
+    int pyramid_num_bins = ((1 << (pyramid_height * 2)) - 1) / 3;
+    top_blob.create(pyramid_num_bins, 1, bottom_blob.c); // room for every input channel's bins
+    if (top_blob.empty())
+        return -100;
+
+    float* pyramid_ptr = top_blob; // write cursor, advanced after every level
+    // all spatial pyramids
+    for (int p = 0; p < pyramid_height; p++)
+    {
+        int w = bottom_blob.w;
+        int h = bottom_blob.h;
+        int channels = bottom_blob.c;
+
+        int num_bins = 1 << p;
+
+        int kernel_h = ceil(h / (float)num_bins);
+        int stride_h = kernel_h;
+        int remainder_h = stride_h * num_bins - h;
+        int pad_h = (remainder_h + 1) / 2;
+
+        int kernel_w = ceil(w / (float)num_bins);
+        int stride_w = kernel_w;
+        int remainder_w = stride_w * num_bins - w;
+        int pad_w = (remainder_w + 1) / 2;
+
+        // each bin is the max or average of one kernel_h x kernel_w window (zero border included)
+
+        int outw = num_bins;
+        int outh = num_bins;
+
+        Mat bottom_blob_bordered = bottom_blob;
+        if (pad_h > 0 || pad_w > 0)
+        {
+            copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
+            if (bottom_blob_bordered.empty())
+                return -100;
+
+            w = bottom_blob_bordered.w;
+            h = bottom_blob_bordered.h;
+        }
+
+        const int maxk = kernel_h * kernel_w;
+
+        // kernel offsets
+        std::vector<int> _space_ofs(maxk);
+        int* space_ofs = &_space_ofs[0];
+        {
+            int p1 = 0;
+            int p2 = 0;
+            int gap = w - kernel_w;
+            for (int i = 0; i < kernel_h; i++)
+            {
+                for (int j = 0; j < kernel_w; j++)
+                {
+                    space_ofs[p1] = p2;
+                    p1++;
+                    p2++;
+                }
+                p2 += gap;
+            }
+        }
+
+        if (pooling_type == PoolMethod_MAX)
+        {
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const Mat m(w, h, bottom_blob_bordered.channel(q));
+                float* outptr = pyramid_ptr + outh * outw * q;
+
+                for (int i = 0; i < outh; i++)
+                {
+                    for (int j = 0; j < outw; j++)
+                    {
+                        const float* sptr = m.data + m.w * i*stride_h + j*stride_w;
+
+                        float max = sptr[0];
+
+                        for (int k = 0; k < maxk; k++)
+                        {
+                            float val = sptr[ space_ofs[k] ];
+                            max = std::max(max, val);
+                        }
+
+                        outptr[j] = max;
+                    }
+
+                    outptr += outw;
+                }
+            }
+        }
+        else if (pooling_type == PoolMethod_AVE)
+        {
+            #pragma omp parallel for
+            for (int q=0; q<channels; q++)
+            {
+                const Mat m(w, h, bottom_blob_bordered.channel(q));
+                float* outptr = pyramid_ptr + outh * outw * q;
+
+                for (int i = 0; i < outh; i++)
+                {
+                    for (int j = 0; j < outw; j++)
+                    {
+                        const float* sptr = m.data + m.w * i*stride_h + j*stride_w;
+
+                        float sum = 0;
+
+                        for (int k = 0; k < maxk; k++)
+                        {
+                            float val = sptr[ space_ofs[k] ];
+                            sum += val;
+                        }
+
+                        outptr[j] = sum / maxk;
+                    }
+
+                    outptr += outw;
+                }
+            }
+        }
+
+        pyramid_ptr += channels * outh * outw;
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/spp.h b/src/layer/spp.h
new file mode 100644
index 00000000000..7788e975f31
--- /dev/null
+++ b/src/layer/spp.h
@@ -0,0 +1,47 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SPP_H
+#define LAYER_SPP_H
+
+#include "layer.h"
+
+namespace ncnn {
+// Spatial pyramid pooling layer: pools the input at pyramid_height levels of increasing resolution.
+class SPP : public Layer
+{
+public:
+    SPP();
+    // Parameter loaders: FILE*-based (load_param / load_param_bin) and in-memory buffer.
+#if NCNN_STDIO
+#if NCNN_STRING
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    virtual int load_param_bin(FILE* paramfp);
+#endif // NCNN_STDIO
+    virtual int load_param(const unsigned char*& mem);
+    // Writes the pooled bins of every pyramid level into top_blob.
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+    // Accepted values of pooling_type.
+    enum { PoolMethod_MAX = 0, PoolMethod_AVE = 1 };
+
+public:
+    // param
+    int pooling_type;   // PoolMethod_MAX or PoolMethod_AVE
+    int pyramid_height; // number of pyramid levels; level p has 2^p x 2^p bins per channel
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SPP_H
diff --git a/src/layer/tanh.cpp b/src/layer/tanh.cpp
new file mode 100644
index 00000000000..a276e1b0b47
--- /dev/null
+++ b/src/layer/tanh.cpp
@@ -0,0 +1,75 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "tanh.h"
+#include <math.h>
+
+namespace ncnn {
+
+// layer creator hook for TanH (macro is defined outside this file)
+DEFINE_LAYER_CREATOR(TanH)
+
+// Marks the layer with the one_blob_only and support_inplace flags.
+TanH::TanH()
+{
+    support_inplace = true;
+    one_blob_only = true;
+}
+
+// Writes tanh() of every element of bottom_blob into top_blob, which is
+// (re)created with the same w/h/c as the input.
+// Returns 0 on success and -100 when top_blob is empty after create().
+int TanH::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    const int width = bottom_blob.w;
+    const int height = bottom_blob.h;
+    const int nchannels = bottom_blob.c;
+    const int plane = width * height;
+
+    top_blob.create(width, height, nchannels);
+    if (top_blob.empty())
+    {
+        return -100;
+    }
+
+    // each channel is processed independently
+    #pragma omp parallel for
+    for (int ch = 0; ch < nchannels; ch++)
+    {
+        const float* src = bottom_blob.channel(ch);
+        float* dst = top_blob.channel(ch);
+
+        for (int remaining = plane; remaining > 0; remaining--)
+        {
+            *dst++ = tanh(*src++);
+        }
+    }
+
+    return 0;
+}
+
+// Replaces every element of bottom_top_blob with its tanh(). Always returns 0.
+int TanH::forward_inplace(Mat& bottom_top_blob) const
+{
+    const int nchannels = bottom_top_blob.c;
+    const int plane = bottom_top_blob.w * bottom_top_blob.h;
+
+    // each channel is processed independently
+    #pragma omp parallel for
+    for (int ch = 0; ch < nchannels; ch++)
+    {
+        float* data = bottom_top_blob.channel(ch);
+
+        for (int remaining = plane; remaining > 0; remaining--)
+        {
+            *data = tanh(*data);
+            data++;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/tanh.h b/src/layer/tanh.h
new file mode 100644
index 00000000000..e3a36e81b84
--- /dev/null
+++ b/src/layer/tanh.h
@@ -0,0 +1,36 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_TANH_H
+#define LAYER_TANH_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// Element-wise tanh layer. It has no parameters of its own, so it declares
+// no load_param overloads and no data members.
+class TanH : public Layer
+{
+public:
+    TanH();
+
+    // out-of-place variant: results go to top_blob
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+    // in-place variant: bottom_top_blob is overwritten
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+
+public:
+};
+
+} // namespace ncnn
+
+#endif // LAYER_TANH_H
diff --git a/src/layer/threshold.cpp b/src/layer/threshold.cpp
new file mode 100644
index 00000000000..19a3cbdec56
--- /dev/null
+++ b/src/layer/threshold.cpp
@@ -0,0 +1,104 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "threshold.h"
+
+namespace ncnn {
+
+// layer creator hook for Threshold (macro is defined outside this file)
+DEFINE_LAYER_CREATOR(Threshold)
+
+// Marks the layer with the one_blob_only and support_inplace flags.
+// Note: threshold itself is not initialised here; it is set by load_param*.
+Threshold::Threshold()
+{
+    support_inplace = true;
+    one_blob_only = true;
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+// Reads the threshold as one "%f" field from paramfp.
+// Returns 0 on success; on a failed scan logs to stderr and returns -1.
+int Threshold::load_param(FILE* paramfp)
+{
+    const int matched = fscanf(paramfp, "%f", &threshold);
+    if (matched == 1)
+    {
+        return 0;
+    }
+
+    fprintf(stderr, "Threshold load_param failed %d\n", matched);
+    return -1;
+}
+#endif // NCNN_STRING
+// Reads the threshold as one raw float from paramfp.
+// Returns 0 on success. A short read (EOF or I/O error) is reported on
+// stderr and returns -1, mirroring the text loader, instead of silently
+// succeeding with an unset threshold.
+int Threshold::load_param_bin(FILE* paramfp)
+{
+    size_t nread = fread(&threshold, sizeof(float), 1, paramfp);
+    if (nread != 1)
+    {
+        fprintf(stderr, "Threshold load_param_bin failed %d\n", (int)nread);
+        return -1;
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+// Reads the threshold from the memory buffer and advances mem by 4 bytes.
+// Always returns 0.
+int Threshold::load_param(const unsigned char*& mem)
+{
+    const float* field = (const float*)mem;
+    threshold = field[0];
+
+    // step over the field that was just consumed
+    mem += 4;
+
+    return 0;
+}
+
+// Binarises bottom_blob into a newly created top_blob of the same w/h/c:
+// elements strictly greater than threshold become 1.f, all others 0.f.
+// Returns 0 on success and -100 when top_blob is empty after create().
+int Threshold::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    const int width = bottom_blob.w;
+    const int height = bottom_blob.h;
+    const int nchannels = bottom_blob.c;
+    const int plane = width * height;
+
+    top_blob.create(width, height, nchannels);
+    if (top_blob.empty())
+    {
+        return -100;
+    }
+
+    // each channel is processed independently
+    #pragma omp parallel for
+    for (int ch = 0; ch < nchannels; ch++)
+    {
+        const float* src = bottom_blob.channel(ch);
+        float* dst = top_blob.channel(ch);
+
+        for (int idx = 0; idx < plane; idx++)
+        {
+            if (src[idx] > threshold)
+                dst[idx] = 1.f;
+            else
+                dst[idx] = 0.f;
+        }
+    }
+
+    return 0;
+}
+
+// In-place binarisation: elements strictly greater than threshold become
+// 1.f, all others 0.f. Always returns 0.
+int Threshold::forward_inplace(Mat& bottom_top_blob) const
+{
+    const int nchannels = bottom_top_blob.c;
+    const int plane = bottom_top_blob.w * bottom_top_blob.h;
+
+    // each channel is processed independently
+    #pragma omp parallel for
+    for (int ch = 0; ch < nchannels; ch++)
+    {
+        float* data = bottom_top_blob.channel(ch);
+
+        for (int idx = 0; idx < plane; idx++)
+        {
+            if (data[idx] > threshold)
+                data[idx] = 1.f;
+            else
+                data[idx] = 0.f;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/threshold.h b/src/layer/threshold.h
new file mode 100644
index 00000000000..2d552258733
--- /dev/null
+++ b/src/layer/threshold.h
@@ -0,0 +1,45 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_THRESHOLD_H
+#define LAYER_THRESHOLD_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// Threshold layer: maps each input element to 1.f when it is strictly
+// greater than `threshold`, otherwise to 0.f.
+class Threshold : public Layer
+{
+public:
+    Threshold();
+
+    // Parameter loaders. The FILE* variants are only declared when NCNN_STDIO
+    // is enabled, and the text one additionally requires NCNN_STRING.
+#if NCNN_STDIO
+#if NCNN_STRING
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    virtual int load_param_bin(FILE* paramfp);
+#endif // NCNN_STDIO
+    // Memory-buffer loader; advances mem past the consumed bytes.
+    virtual int load_param(const unsigned char*& mem);
+
+    // out-of-place variant: results go to top_blob
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+    // in-place variant: bottom_top_blob is overwritten
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+
+public:
+    // comparison value; set by the load_param* functions
+    float threshold;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_THRESHOLD_H
diff --git a/src/layer/tile.cpp b/src/layer/tile.cpp
new file mode 100644
index 00000000000..87c50a0470e
--- /dev/null
+++ b/src/layer/tile.cpp
@@ -0,0 +1,145 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "tile.h"
+
+namespace ncnn {
+
+// layer creator hook for Tile (macro is defined outside this file)
+DEFINE_LAYER_CREATOR(Tile)
+
+// Sets one_blob_only and clears support_inplace (the output has a
+// different size than the input, see forward()).
+Tile::Tile()
+{
+    support_inplace = false;
+    one_blob_only = true;
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+// Reads dim and tiles as two "%d" fields from paramfp.
+// Returns 0 on success; on a failed scan logs to stderr and returns -1.
+int Tile::load_param(FILE* paramfp)
+{
+    const int matched = fscanf(paramfp, "%d %d", &dim, &tiles);
+    if (matched == 2)
+    {
+        return 0;
+    }
+
+    fprintf(stderr, "Tile load_param failed %d\n", matched);
+    return -1;
+}
+#endif // NCNN_STRING
+// Reads dim and tiles as two raw ints (in that order) from paramfp.
+// Returns 0 on success. A short read (EOF or I/O error) on either field is
+// reported on stderr and returns -1, mirroring the text loader, instead of
+// silently succeeding with unset parameters.
+int Tile::load_param_bin(FILE* paramfp)
+{
+    size_t nread = fread(&dim, sizeof(int), 1, paramfp);
+
+    // only attempt the second field when the first one was read completely
+    if (nread == 1)
+        nread += fread(&tiles, sizeof(int), 1, paramfp);
+
+    if (nread != 2)
+    {
+        fprintf(stderr, "Tile load_param_bin failed %d\n", (int)nread);
+        return -1;
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+// Reads dim and then tiles from the memory buffer, advancing mem by 4 bytes
+// after each field. Always returns 0.
+int Tile::load_param(const unsigned char*& mem)
+{
+    const int* field = (const int*)mem;
+    dim = field[0];
+    mem += 4;
+
+    field = (const int*)mem;
+    tiles = field[0];
+    mem += 4;
+
+    return 0;
+}
+
+// Repeats bottom_blob `tiles` times along the axis selected by `dim`:
+//   dim == 0 : top_blob is w x h with channels * tiles channels
+//   dim == 1 : top_blob is w x (h * tiles) with the same channel count
+//   dim == 2 : top_blob is (w * tiles) x h with the same channel count
+// Returns 0 on success, -100 when top_blob is empty after create(), and -1
+// for any other dim value. (An unsupported dim used to return 0 without
+// ever creating top_blob, handing an untouched blob to the next layer.)
+int Tile::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+
+    if (dim == 0)
+    {
+        top_blob.create(w, h, channels * tiles);
+        if (top_blob.empty())
+            return -100;
+
+        // The whole input is copied as one run of cstep * channels floats.
+        // NOTE(review): this relies on top_blob having the same cstep as
+        // bottom_blob (both are w x h) -- confirm against Mat::create.
+        const float* ptr = bottom_blob;
+        int size = bottom_blob.cstep * channels;
+
+        #pragma omp parallel for
+        for (int p=0; p<tiles; p++)
+        {
+            // tile p starts at output channel p * channels
+            float* outptr = top_blob.channel(p * channels);
+
+            for (int i=0; i<size; i++)
+            {
+                outptr[i] = ptr[i];
+            }
+        }
+    }
+    else if (dim == 1)
+    {
+        top_blob.create(w, h * tiles, channels);
+        if (top_blob.empty())
+            return -100;
+
+        int size = w * h;
+
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            float* outptr = top_blob.channel(q);
+
+            // write the w * h plane of this channel `tiles` times back to back
+            for (int p=0; p<tiles; p++)
+            {
+                for (int i=0; i<size; i++)
+                {
+                    outptr[i] = ptr[i];
+                }
+
+                outptr += size;
+            }
+        }
+    }
+    else if (dim == 2)
+    {
+        top_blob.create(w * tiles, h, channels);
+        if (top_blob.empty())
+            return -100;
+
+        #pragma omp parallel for
+        for (int q=0; q<channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            float* outptr = top_blob.channel(q);
+
+            // every input row is repeated `tiles` times within its output row
+            for (int i = 0; i < h; i++)
+            {
+                for (int p=0; p<tiles; p++)
+                {
+                    for (int j = 0; j < w; j++)
+                    {
+                        outptr[j] = ptr[j];
+                    }
+
+                    outptr += w;
+                }
+
+                ptr += w;
+            }
+        }
+    }
+    else
+    {
+        // unsupported axis: fail explicitly rather than report success
+        // with top_blob left untouched
+        return -1;
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/tile.h b/src/layer/tile.h
new file mode 100644
index 00000000000..6bb2bb3dd5c
--- /dev/null
+++ b/src/layer/tile.h
@@ -0,0 +1,44 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_TILE_H
+#define LAYER_TILE_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// Tile layer: repeats the input blob `tiles` times along the axis chosen
+// by `dim` (see Tile::forward for the handled dim values).
+class Tile : public Layer
+{
+public:
+    Tile();
+
+    // Parameter loaders. The FILE* variants are only declared when NCNN_STDIO
+    // is enabled, and the text one additionally requires NCNN_STRING.
+#if NCNN_STDIO
+#if NCNN_STRING
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    virtual int load_param_bin(FILE* paramfp);
+#endif // NCNN_STDIO
+    // Memory-buffer loader; advances mem past the consumed bytes.
+    virtual int load_param(const unsigned char*& mem);
+
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+public:
+    // axis selector: 0, 1 and 2 are handled by forward()
+    int dim;
+    // repetition count along the selected axis
+    int tiles;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_TILE_H
diff --git a/src/layer/x86/avx_mathfun.h b/src/layer/x86/avx_mathfun.h
new file mode 100644
index 00000000000..2a9c6b1a4d6
--- /dev/null
+++ b/src/layer/x86/avx_mathfun.h
@@ -0,0 +1,712 @@
+/* 
+   AVX implementation of sin, cos, sincos, exp and log
+
+   Based on "sse_mathfun.h", by Julien Pommier
+   http://gruntthepeon.free.fr/ssemath/
+
+   Copyright (C) 2012 Giovanni Garberoglio
+   Interdisciplinary Laboratory for Computational Science (LISC)
+   Fondazione Bruno Kessler and University of Trento
+   via Sommarive, 18
+   I-38123 Trento (Italy)
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  (this is the zlib license)
+*/
+
+#include <immintrin.h>
+
+/* yes I know, the top of this file is quite ugly */
+/* 32-byte alignment decoration, split into a prefix part (empty here) and a
+   suffix part (GCC-style attribute syntax) */
+# define ALIGN32_BEG
+# define ALIGN32_END __attribute__((aligned(32)))
+
+/* short aliases, because the intrinsic vector type names are ugly to write */
+typedef __m256  v8sf; // vector of 8 float (avx)
+typedef __m256i v8si; // vector of 8 int   (avx)
+typedef __m128i v4si; // vector of 4 int   (sse)
+
+/* Declares _pi32avx_<Name>: a 32-byte aligned array of 4 ints, all equal to
+   Val. These are loaded as v4si in the SSE2 fallback paths of the
+   trigonometric functions. */
+#define _PI32AVX_CONST(Name, Val)                                            \
+  static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { Val, Val, Val, Val }
+
+_PI32AVX_CONST(1, 1);
+_PI32AVX_CONST(inv1, ~1);
+_PI32AVX_CONST(2, 2);
+_PI32AVX_CONST(4, 4);
+
+
+/* declare some AVX constants -- why can't I figure a better way to do that? */
+/* _PS256_CONST(Name, Val)            -> _ps256_<Name>:    8 floats, all Val
+   _PI32_CONST256(Name, Val)          -> _pi32_256_<Name>: 8 ints, all Val
+   _PS256_CONST_TYPE(Name, Type, Val) -> _ps256_<Name>:    8 elements of Type,
+                                         used for raw bit patterns
+   Every array is 32-byte aligned so that it can be read as one 256-bit vector. */
+#define _PS256_CONST(Name, Val)                                            \
+  static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
+#define _PI32_CONST256(Name, Val)                                            \
+  static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
+#define _PS256_CONST_TYPE(Name, Type, Val)                                 \
+  static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
+
+_PS256_CONST(1  , 1.0f);
+_PS256_CONST(0p5, 0.5f);
+/* the smallest non denormalized float number */
+_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
+/* note: despite its name, 0x7f800000 selects the exponent bits of an
+   IEEE-754 single; inv_mant_mask keeps the sign and mantissa bits */
+_PS256_CONST_TYPE(mant_mask, int, 0x7f800000);
+_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
+
+/* 0x80000000 is an unsigned literal that does not fit in int; without the
+   explicit cast the brace initializer is a narrowing conversion, which is
+   ill-formed since C++11 */
+_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000);
+_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
+
+_PI32_CONST256(0, 0);
+_PI32_CONST256(1, 1);
+_PI32_CONST256(inv1, ~1);
+_PI32_CONST256(2, 2);
+_PI32_CONST256(4, 4);
+_PI32_CONST256(0x7f, 0x7f);
+
+/* coefficients used by log256_ps */
+_PS256_CONST(cephes_SQRTHF, 0.707106781186547524);
+_PS256_CONST(cephes_log_p0, 7.0376836292E-2);
+_PS256_CONST(cephes_log_p1, - 1.1514610310E-1);
+_PS256_CONST(cephes_log_p2, 1.1676998740E-1);
+_PS256_CONST(cephes_log_p3, - 1.2420140846E-1);
+_PS256_CONST(cephes_log_p4, + 1.4249322787E-1);
+_PS256_CONST(cephes_log_p5, - 1.6668057665E-1);
+_PS256_CONST(cephes_log_p6, + 2.0000714765E-1);
+_PS256_CONST(cephes_log_p7, - 2.4999993993E-1);
+_PS256_CONST(cephes_log_p8, + 3.3333331174E-1);
+_PS256_CONST(cephes_log_q1, -2.12194440e-4);
+_PS256_CONST(cephes_log_q2, 0.693359375);
+
+#ifndef __AVX2__
+
+/* Overlays one 256-bit integer vector with its two 128-bit halves, so that
+   a v8si can be split into / rebuilt from two v4si values. */
+typedef union imm_xmm_union {
+  v8si imm;
+  v4si xmm[2];
+} imm_xmm_union;
+
+/* Split imm_ (v8si) into xmm0_ (u.xmm[0]) and xmm1_ (u.xmm[1]) through a
+   32-byte aligned temporary union. */
+#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) {    \
+    imm_xmm_union u __attribute__((aligned(32)));  \
+    u.imm = imm_;				   \
+    xmm0_ = u.xmm[0];                            \
+    xmm1_ = u.xmm[1];                            \
+}
+
+/* Inverse of COPY_IMM_TO_XMM: combine xmm0_ and xmm1_ (v4si) into imm_ (v8si). */
+#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) {                       \
+    imm_xmm_union u __attribute__((aligned(32))); \
+    u.xmm[0]=xmm0_; u.xmm[1]=xmm1_; imm_ = u.imm; \
+  }
+
+
+/* Generates a static inline _mm256_<fn>(v8si, int) that applies the SSE2
+   intrinsic _mm_<fn> separately to each 128-bit half of x.
+   NOTE(review): the generated names match real AVX2 intrinsics; if a
+   toolchain's <immintrin.h> declares those even when __AVX2__ is not
+   defined, these definitions may conflict -- confirm on target compilers. */
+#define AVX2_BITOP_USING_SSE2(fn) \
+static inline v8si _mm256_##fn(v8si x, int a) \
+{ \
+  /* use SSE2 instruction to perform the bitop AVX2 */ \
+  v4si x1, x2; \
+  v8si ret; \
+  COPY_IMM_TO_XMM(x, x1, x2); \
+  x1 = _mm_##fn(x1,a); \
+  x2 = _mm_##fn(x2,a); \
+  COPY_XMM_TO_IMM(x1, x2, ret); \
+  return(ret); \
+}
+
+#warning "Using SSE2 to perform AVX2 bitshift ops"
+AVX2_BITOP_USING_SSE2(slli_epi32)
+AVX2_BITOP_USING_SSE2(srli_epi32)
+
+/* Same idea for two-operand integer operations: generates a static inline
+   _mm256_<fn>(v8si, v8si) built from two _mm_<fn> calls on the halves.
+   Note that the *_si128 instantiations below produce the names
+   _mm256_and_si128 / _mm256_andnot_si128, which exist only as these
+   fallback helpers. */
+#define AVX2_INTOP_USING_SSE2(fn) \
+static inline v8si _mm256_##fn(v8si x, v8si y) \
+{ \
+  /* use SSE2 instructions to perform the AVX2 integer operation */ \
+  v4si x1, x2; \
+  v4si y1, y2; \
+  v8si ret; \
+  COPY_IMM_TO_XMM(x, x1, x2); \
+  COPY_IMM_TO_XMM(y, y1, y2); \
+  x1 = _mm_##fn(x1,y1); \
+  x2 = _mm_##fn(x2,y2); \
+  COPY_XMM_TO_IMM(x1, x2, ret); \
+  return(ret); \
+}
+
+#warning "Using SSE2 to perform AVX2 integer ops"
+AVX2_INTOP_USING_SSE2(and_si128)
+AVX2_INTOP_USING_SSE2(andnot_si128)
+AVX2_INTOP_USING_SSE2(cmpeq_epi32)
+AVX2_INTOP_USING_SSE2(sub_epi32)
+AVX2_INTOP_USING_SSE2(add_epi32)
+
+#endif /* __AVX2__ */
+
+
+/* natural logarithm computed for 8 simultaneous floats
+   returns NaN for x <= 0
+
+   Declared static inline (like the integer helpers above) so that this
+   header can be included from several translation units without
+   duplicate-symbol link errors.
+*/
+static inline v8sf log256_ps(v8sf x) {
+  v8si imm0;
+  v8sf one = *(v8sf*)_ps256_1;
+
+  /* lanes with x <= 0 get an all-ones mask, OR-ed into the result at the end */
+  v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
+
+  x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos);  /* cut off denormalized stuff */
+
+  /* extract the biased exponent (integer shift: AVX2, or the SSE2 fallback) */
+  imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23);
+
+  /* keep only the fractional part: clear the exponent bits, then OR in the
+     bit pattern of 0.5 */
+  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask);
+  x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5);
+
+  /* remove the exponent bias (integer subtract: AVX2, or the SSE2 fallback) */
+  imm0 = _mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f);
+  v8sf e = _mm256_cvtepi32_ps(imm0);
+
+  e = _mm256_add_ps(e, one);
+
+  /* part2:
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+  v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS);
+  v8sf tmp = _mm256_and_ps(x, mask);
+  x = _mm256_sub_ps(x, one);
+  e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
+  x = _mm256_add_ps(x, tmp);
+
+  v8sf z = _mm256_mul_ps(x,x);
+
+  /* polynomial in x with coefficients p0..p8, evaluated by Horner's scheme */
+  v8sf y = *(v8sf*)_ps256_cephes_log_p0;
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8);
+  y = _mm256_mul_ps(y, x);
+
+  y = _mm256_mul_ps(y, z);
+
+  /* add the exponent contribution in two parts (e * q1 here, e * q2 below) */
+  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1);
+  y = _mm256_add_ps(y, tmp);
+
+
+  tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
+  y = _mm256_sub_ps(y, tmp);
+
+  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2);
+  x = _mm256_add_ps(x, y);
+  x = _mm256_add_ps(x, tmp);
+  x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
+  return x;
+}
+
+/* input clamp range applied by exp256_ps before any other computation */
+_PS256_CONST(exp_hi,	88.3762626647949f);
+_PS256_CONST(exp_lo,	-88.3762626647949f);
+
+/* LOG2EF scales x before rounding to an integer n; C1 and C2 are the two
+   multipliers used to subtract n's contribution from x in exp256_ps */
+_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
+_PS256_CONST(cephes_exp_C1, 0.693359375);
+_PS256_CONST(cephes_exp_C2, -2.12194440e-4);
+
+/* polynomial coefficients used by exp256_ps */
+_PS256_CONST(cephes_exp_p0, 1.9875691500E-4);
+_PS256_CONST(cephes_exp_p1, 1.3981999507E-3);
+_PS256_CONST(cephes_exp_p2, 8.3334519073E-3);
+_PS256_CONST(cephes_exp_p3, 4.1665795894E-2);
+_PS256_CONST(cephes_exp_p4, 1.6666665459E-1);
+_PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
+
+/* exponential computed for 8 simultaneous floats
+   the input is clamped to [exp_lo, exp_hi] first
+
+   Declared static inline (like the integer helpers above) so that this
+   header can be included from several translation units without
+   duplicate-symbol link errors.
+*/
+static inline v8sf exp256_ps(v8sf x) {
+  v8sf tmp = _mm256_setzero_ps(), fx;
+  v8si imm0;
+  v8sf one = *(v8sf*)_ps256_1;
+
+  x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi);
+  x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo);
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF);
+  fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5);
+
+  /* n = floor(fx) */
+  tmp = _mm256_floor_ps(fx);
+
+  /* if greater, subtract 1 (kept from the truncation-based variant of this
+     code; it is a no-op whenever tmp <= fx) */
+  v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
+  mask = _mm256_and_ps(mask, one);
+  fx = _mm256_sub_ps(tmp, mask);
+
+  /* g = x - n*C1 - n*C2 */
+  tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1);
+  v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2);
+  x = _mm256_sub_ps(x, tmp);
+  x = _mm256_sub_ps(x, z);
+
+  z = _mm256_mul_ps(x,x);
+
+  /* polynomial approximation of exp(g): 1 + g + g^2 * P(g) */
+  v8sf y = *(v8sf*)_ps256_cephes_exp_p0;
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, x);
+  y = _mm256_add_ps(y, one);
+
+  /* build 2^n: add the exponent bias and shift into the exponent field */
+  imm0 = _mm256_cvttps_epi32(fx);
+  // another two AVX2 instructions (or their SSE2 fallbacks)
+  imm0 = _mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
+  imm0 = _mm256_slli_epi32(imm0, 23);
+  v8sf pow2n = _mm256_castsi256_ps(imm0);
+  y = _mm256_mul_ps(y, pow2n);
+  return y;
+}
+
+/* negated DP1..DP3 multipliers for the "extended precision modular
+   arithmetic" step: x = ((x - y * DP1) - y * DP2) - y * DP3 */
+_PS256_CONST(minus_cephes_DP1, -0.78515625);
+_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+/* sine polynomial coefficients */
+_PS256_CONST(sincof_p0, -1.9515295891E-4);
+_PS256_CONST(sincof_p1,  8.3321608736E-3);
+_PS256_CONST(sincof_p2, -1.6666654611E-1);
+/* cosine polynomial coefficients */
+_PS256_CONST(coscof_p0,  2.443315711809948E-005);
+_PS256_CONST(coscof_p1, -1.388731625493765E-003);
+_PS256_CONST(coscof_p2,  4.166664568298827E-002);
+_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
+
+
+/* evaluation of 8 sines at once using AVX intrinsics
+
+   The code is the exact rewriting of the cephes sinf function.
+   Precision is excellent as long as x < 8192 (I did not bother to
+   take into account the special handling they have for greater values
+   -- it does not return garbage for arguments over 8192, though, but
+   the extra precision is missing).
+
+   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+   surprising but correct result.
+
+   Declared static inline (like the integer helpers above) so that this
+   header can be included from several translation units without
+   duplicate-symbol link errors.
+*/
+static inline v8sf sin256_ps(v8sf x) { // any x
+  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
+  v8si imm0, imm2;
+
+#ifndef __AVX2__
+  v4si imm0_1, imm0_2;
+  v4si imm2_1, imm2_2;
+#endif
+
+  sign_bit = x;
+  /* take the absolute value */
+  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
+  /* extract the sign bit (upper one) */
+  sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask);
+
+  /* scale by 4/Pi */
+  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);
+
+  /*
+    Here we start a series of integer operations, which are in the
+    realm of AVX2.
+    If we don't have AVX2, let's perform them using SSE2 directives
+  */
+
+#ifdef __AVX2__
+  /* store the integer part of y in imm2 */
+  imm2 = _mm256_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  /* the 256-bit integer AND is _mm256_and_si256; there is no
+     _mm256_and_si128 intrinsic */
+  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
+  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
+  y = _mm256_cvtepi32_ps(imm2);
+
+  /* get the swap sign flag */
+  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
+  imm0 = _mm256_slli_epi32(imm0, 29);
+  /* get the polynomial selection mask: all-ones where (j & 2) == 0.
+     Both polynomials are always computed; the mask picks one per lane. */
+  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
+  imm2 = _mm256_cmpeq_epi32(imm2,*(v8si*)_pi32_256_0);
+#else
+  /* we use SSE2 routines to perform the integer ops */
+  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2);
+
+  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
+  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);
+
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);
+
+  COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
+  y = _mm256_cvtepi32_ps(imm2);
+
+  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
+  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);
+
+  imm0_1 = _mm_slli_epi32(imm0_1, 29);
+  imm0_2 = _mm_slli_epi32(imm0_2, 29);
+
+  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
+
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);
+
+  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
+  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
+
+  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+#endif
+
+  v8sf swap_sign_bit = _mm256_castsi256_ps(imm0);
+  v8sf poly_mask = _mm256_castsi256_ps(imm2);
+  sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);
+
+  /* The magic pass: "Extended precision modular arithmetic"
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
+  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
+  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
+  xmm1 = _mm256_mul_ps(y, xmm1);
+  xmm2 = _mm256_mul_ps(y, xmm2);
+  xmm3 = _mm256_mul_ps(y, xmm3);
+  x = _mm256_add_ps(x, xmm1);
+  x = _mm256_add_ps(x, xmm2);
+  x = _mm256_add_ps(x, xmm3);
+
+  /* Evaluate the cosine polynomial (used in lanes where poly_mask is clear) */
+  y = *(v8sf*)_ps256_coscof_p0;
+  v8sf z = _mm256_mul_ps(x,x);
+
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_mul_ps(y, z);
+  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
+  y = _mm256_sub_ps(y, tmp);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);
+
+  /* Evaluate the sine polynomial (used in lanes where poly_mask is set) */
+
+  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_mul_ps(y2, x);
+  y2 = _mm256_add_ps(y2, x);
+
+  /* select the correct result from the two polynomials */
+  xmm3 = poly_mask;
+  y2 = _mm256_and_ps(xmm3, y2);
+  y = _mm256_andnot_ps(xmm3, y);
+  y = _mm256_add_ps(y,y2);
+  /* update the sign */
+  y = _mm256_xor_ps(y, sign_bit);
+
+  return y;
+}
+
+/* evaluation of 8 cosines at once; almost the same as sin256_ps
+
+   Declared static inline (like the integer helpers above) so that this
+   header can be included from several translation units without
+   duplicate-symbol link errors.
+*/
+static inline v8sf cos256_ps(v8sf x) { // any x
+  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
+  v8si imm0, imm2;
+
+#ifndef __AVX2__
+  v4si imm0_1, imm0_2;
+  v4si imm2_1, imm2_2;
+#endif
+
+  /* take the absolute value */
+  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
+
+  /* scale by 4/Pi */
+  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);
+
+#ifdef __AVX2__
+  /* store the integer part of y in imm2 */
+  imm2 = _mm256_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  /* the 256-bit integer AND / ANDNOT are _mm256_and_si256 and
+     _mm256_andnot_si256; there are no *_si128 variants with a
+     _mm256_ prefix */
+  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
+  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
+  y = _mm256_cvtepi32_ps(imm2);
+  imm2 = _mm256_sub_epi32(imm2, *(v8si*)_pi32_256_2);
+
+  /* get the swap sign flag: (~imm2) & 4, moved to the sign bit position */
+  imm0 = _mm256_andnot_si256(imm2, *(v8si*)_pi32_256_4);
+  imm0 = _mm256_slli_epi32(imm0, 29);
+  /* get the polynomial selection mask: all-ones where (imm2 & 2) == 0 */
+  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
+  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
+#else
+
+  /* we use SSE2 routines to perform the integer ops */
+  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2);
+
+  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
+  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);
+
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);
+
+  COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
+  y = _mm256_cvtepi32_ps(imm2);
+
+  imm2_1 = _mm_sub_epi32(imm2_1, *(v4si*)_pi32avx_2);
+  imm2_2 = _mm_sub_epi32(imm2_2, *(v4si*)_pi32avx_2);
+
+  imm0_1 = _mm_andnot_si128(imm2_1, *(v4si*)_pi32avx_4);
+  imm0_2 = _mm_andnot_si128(imm2_2, *(v4si*)_pi32avx_4);
+
+  imm0_1 = _mm_slli_epi32(imm0_1, 29);
+  imm0_2 = _mm_slli_epi32(imm0_2, 29);
+
+  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
+
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);
+
+  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
+  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
+
+  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+#endif
+
+  v8sf sign_bit = _mm256_castsi256_ps(imm0);
+  v8sf poly_mask = _mm256_castsi256_ps(imm2);
+
+  /* The magic pass: "Extended precision modular arithmetic"
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
+  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
+  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
+  xmm1 = _mm256_mul_ps(y, xmm1);
+  xmm2 = _mm256_mul_ps(y, xmm2);
+  xmm3 = _mm256_mul_ps(y, xmm3);
+  x = _mm256_add_ps(x, xmm1);
+  x = _mm256_add_ps(x, xmm2);
+  x = _mm256_add_ps(x, xmm3);
+
+  /* Evaluate the cosine polynomial (used in lanes where poly_mask is clear) */
+  y = *(v8sf*)_ps256_coscof_p0;
+  v8sf z = _mm256_mul_ps(x,x);
+
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_mul_ps(y, z);
+  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
+  y = _mm256_sub_ps(y, tmp);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);
+
+  /* Evaluate the sine polynomial (used in lanes where poly_mask is set) */
+
+  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_mul_ps(y2, x);
+  y2 = _mm256_add_ps(y2, x);
+
+  /* select the correct result from the two polynomials */
+  xmm3 = poly_mask;
+  y2 = _mm256_and_ps(xmm3, y2);
+  y = _mm256_andnot_ps(xmm3, y);
+  y = _mm256_add_ps(y,y2);
+  /* update the sign */
+  y = _mm256_xor_ps(y, sign_bit);
+
+  return y;
+}
+
+/* sincos256_ps: sine and cosine of the eight floats in x, written to *s and *c. One range reduction
+   and one pair of polynomial evaluations serve both results, so it costs little more than a sine. */
+static inline void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
+  /* xmm3's zero initialiser is never read: it is overwritten before its first use below */
+  v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
+  v8si imm0, imm2, imm4;
+
+#ifndef __AVX2__
+  v4si imm0_1, imm0_2; /* 128-bit halves: without AVX2 the integer steps run on SSE2 */
+  v4si imm2_1, imm2_2;
+  v4si imm4_1, imm4_2;
+#endif
+
+  sign_bit_sin = x;
+  /* take the absolute value */
+  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
+  /* extract the sign bit (upper one) */
+  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask);
+
+  /* scale by 4/Pi */
+  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);
+
+#ifdef __AVX2__
+  /* store the integer part of y in imm2 */
+  imm2 = _mm256_cvttps_epi32(y);
+
+  /* j=(j+1) & (~1) (see the cephes sources); the 256-bit bitwise ops are the _si256 intrinsics */
+  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
+  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
+
+  y = _mm256_cvtepi32_ps(imm2);
+  imm4 = imm2; /* keep j for the cosine sign computed further down */
+
+  /* get the swap sign flag for the sine */
+  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
+  imm0 = _mm256_slli_epi32(imm0, 29);
+  /* imm0 now carries bit 2 of j in bit 31, the float sign position */
+
+  /* get the polynom selection mask for the sine */
+  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
+  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
+  /* imm2 is now all-ones in the lanes where (j & 2) == 0 */
+#else
+  /* we use SSE2 routines to perform the integer ops */
+  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2);
+
+  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
+  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);
+
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);
+
+  COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
+  y = _mm256_cvtepi32_ps(imm2);
+
+  imm4_1 = imm2_1; /* keep j for the cosine sign computed further down */
+  imm4_2 = imm2_2;
+
+  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
+  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);
+
+  imm0_1 = _mm_slli_epi32(imm0_1, 29);
+  imm0_2 = _mm_slli_epi32(imm0_2, 29);
+
+  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
+
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);
+
+  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
+  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
+
+  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+#endif
+  v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
+  v8sf poly_mask = _mm256_castsi256_ps(imm2);
+
+  /* The magic pass: "Extended precision modular arithmetic"
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
+  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
+  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
+  xmm1 = _mm256_mul_ps(y, xmm1);
+  xmm2 = _mm256_mul_ps(y, xmm2);
+  xmm3 = _mm256_mul_ps(y, xmm3);
+  x = _mm256_add_ps(x, xmm1);
+  x = _mm256_add_ps(x, xmm2);
+  x = _mm256_add_ps(x, xmm3);
+
+#ifdef __AVX2__
+  imm4 = _mm256_sub_epi32(imm4, *(v8si*)_pi32_256_2);
+  imm4 = _mm256_andnot_si256(imm4, *(v8si*)_pi32_256_4); /* (~(j - 2)) & 4 */
+  imm4 = _mm256_slli_epi32(imm4, 29);
+#else
+  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si*)_pi32avx_2);
+  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si*)_pi32avx_2);
+
+  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si*)_pi32avx_4); /* (~(j - 2)) & 4 */
+  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si*)_pi32avx_4);
+
+  imm4_1 = _mm_slli_epi32(imm4_1, 29);
+  imm4_2 = _mm_slli_epi32(imm4_2, 29);
+
+  COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
+#endif
+
+  v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);
+
+  sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+  /* Evaluate the first polynom (coscof coefficients): y = 1 - z/2 + z*z*P(z), with z = x*x */
+  v8sf z = _mm256_mul_ps(x,x);
+  y = *(v8sf*)_ps256_coscof_p0;
+
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_mul_ps(y, z);
+  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
+  y = _mm256_sub_ps(y, tmp);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);
+
+  /* Evaluate the second polynom (sincof coefficients): y2 = x + x*z*Q(z) */
+
+  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
+  y2 = _mm256_mul_ps(y2, z);
+  y2 = _mm256_mul_ps(y2, x);
+  y2 = _mm256_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms: the sine takes y2 where poly_mask is set */
+  xmm3 = poly_mask;
+  v8sf ysin2 = _mm256_and_ps(xmm3, y2);
+  v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
+  y2 = _mm256_sub_ps(y2,ysin2); /* what the sine did not take is left for the cosine */
+  y = _mm256_sub_ps(y, ysin1);
+
+  xmm1 = _mm256_add_ps(ysin1,ysin2);
+  xmm2 = _mm256_add_ps(y,y2);
+
+  /* update the sign */
+  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
+  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
+}
+
diff --git a/src/layer/x86/convolution_3x3.h b/src/layer/x86/convolution_3x3.h
new file mode 100644
index 00000000000..0c240a6c719
--- /dev/null
+++ b/src/layer/x86/convolution_3x3.h
@@ -0,0 +1,141 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+// 3x3, stride-1 convolution written as plain scalar loops: each output channel starts at its bias, then every input channel's 3x3 window products are accumulated.
+static void conv3x3s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    int w = bottom_blob.w;
+    // the input height is not needed: the row loops are driven by outh
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+    // weights are read as a flat array: 9 floats per (output channel, input channel) pair
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+    // one output channel per parallel iteration
+    #pragma omp parallel for
+    for (int p=0; p<outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+
+        const float bias0 = bias ? bias[p] : 0.f; // a null bias pointer means "no bias"
+
+        out.fill(bias0);
+
+        for (int q=0; q<inch; q++)
+        {
+            float* outptr = out;
+            float* outptr2 = outptr + outw; // second output row of the current pair
+
+            const float* img0 = bottom_blob.channel(q);
+
+            const float* kernel0 = kernel + p*inch*9  + q*9;
+            // four consecutive input rows: r0..r2 feed outptr, r1..r3 feed outptr2
+            const float* r0 = img0;
+            const float* r1 = img0 + w;
+            const float* r2 = img0 + w*2;
+            const float* r3 = img0 + w*3;
+            // the three rows of the 3x3 weight window
+            const float* k0 = kernel0;
+            const float* k1 = kernel0 + 3;
+            const float* k2 = kernel0 + 6;
+
+            int i = 0;
+            // main loop: two output rows per iteration
+            for (; i+1 < outh; i+=2)
+            {
+
+                int remain = outw;
+
+                for (; remain>0; remain--)
+                {
+                    float sum = 0;
+                    float sum2 = 0;
+
+                    sum += r0[0] * k0[0];
+                    sum += r0[1] * k0[1];
+                    sum += r0[2] * k0[2];
+                    sum += r1[0] * k1[0];
+                    sum += r1[1] * k1[1];
+                    sum += r1[2] * k1[2];
+                    sum += r2[0] * k2[0];
+                    sum += r2[1] * k2[1];
+                    sum += r2[2] * k2[2];
+
+                    sum2 += r1[0] * k0[0];
+                    sum2 += r1[1] * k0[1];
+                    sum2 += r1[2] * k0[2];
+                    sum2 += r2[0] * k1[0];
+                    sum2 += r2[1] * k1[1];
+                    sum2 += r2[2] * k1[2];
+                    sum2 += r3[0] * k2[0];
+                    sum2 += r3[1] * k2[1];
+                    sum2 += r3[2] * k2[2];
+
+                    *outptr += sum;
+                    *outptr2 += sum2;
+
+                    r0++;
+                    r1++;
+                    r2++;
+                    r3++;
+                    outptr++;
+                    outptr2++;
+                }
+                // skip the 2 trailing input columns plus one whole row (assumes w == outw + 2 -- TODO confirm)
+                r0 += 2 + w;
+                r1 += 2 + w;
+                r2 += 2 + w;
+                r3 += 2 + w;
+
+                outptr += outw;
+                outptr2 += outw;
+            }
+            // leftover single row when outh is odd
+            for (; i < outh; i++)
+            {
+                int remain = outw;
+
+                for (; remain>0; remain--)
+                {
+                    float sum = 0;
+
+                    sum += r0[0] * k0[0];
+                    sum += r0[1] * k0[1];
+                    sum += r0[2] * k0[2];
+                    sum += r1[0] * k1[0];
+                    sum += r1[1] * k1[1];
+                    sum += r1[2] * k1[2];
+                    sum += r2[0] * k2[0];
+                    sum += r2[1] * k2[1];
+                    sum += r2[2] * k2[2];
+
+                    *outptr += sum;
+
+                    r0++;
+                    r1++;
+                    r2++;
+                    outptr++;
+                }
+
+                r0 += 2;
+                r1 += 2;
+                r2 += 2;
+            }
+
+        }
+    }
+
+}
diff --git a/src/layer/x86/convolution_5x5.h b/src/layer/x86/convolution_5x5.h
new file mode 100644
index 00000000000..19edd85a45e
--- /dev/null
+++ b/src/layer/x86/convolution_5x5.h
@@ -0,0 +1,215 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+// 5x5, stride-1 convolution written as plain scalar loops: each output channel starts at its bias, then every input channel's 5x5 window products are accumulated.
+static void conv5x5s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    int w = bottom_blob.w;
+    // the input height is not needed: the row loops are driven by outh
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+    // weights are read as a flat array: 25 floats per (output channel, input channel) pair
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+    // one output channel per parallel iteration
+    #pragma omp parallel for
+    for (int p=0; p<outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+
+        const float bias0 = bias ? bias[p] : 0.f; // a null bias pointer means "no bias"
+
+        out.fill(bias0);
+
+        for (int q=0; q<inch; q++)
+        {
+            float* outptr = out;
+            float* outptr2 = outptr + outw; // second output row of the current pair
+
+            const float* img0 = bottom_blob.channel(q);
+
+            const float* kernel0 = kernel + p*inch*25  + q*25;
+            // six consecutive input rows: r0..r4 feed outptr, r1..r5 feed outptr2
+            const float* r0 = img0;
+            const float* r1 = img0 + w;
+            const float* r2 = img0 + w*2;
+            const float* r3 = img0 + w*3;
+            const float* r4 = img0 + w*4;
+            const float* r5 = img0 + w*5;
+            // the five rows of the 5x5 weight window
+            const float* k0 = kernel0;
+            const float* k1 = kernel0 + 5;
+            const float* k2 = kernel0 + 10;
+            const float* k3 = kernel0 + 15;
+            const float* k4 = kernel0 + 20;
+
+            int i = 0;
+            // main loop: two output rows per iteration
+            for (; i+1 < outh; i+=2)
+            {
+
+                int remain = outw;
+
+                for (; remain>0; remain--)
+                {
+                    float sum = 0;
+                    float sum2 = 0;
+
+                    sum += r0[0] * k0[0];
+                    sum += r0[1] * k0[1];
+                    sum += r0[2] * k0[2];
+                    sum += r0[3] * k0[3];
+                    sum += r0[4] * k0[4];
+
+                    sum += r1[0] * k1[0];
+                    sum += r1[1] * k1[1];
+                    sum += r1[2] * k1[2];
+                    sum += r1[3] * k1[3];
+                    sum += r1[4] * k1[4];
+
+                    sum += r2[0] * k2[0];
+                    sum += r2[1] * k2[1];
+                    sum += r2[2] * k2[2];
+                    sum += r2[3] * k2[3];
+                    sum += r2[4] * k2[4];
+
+                    sum += r3[0] * k3[0];
+                    sum += r3[1] * k3[1];
+                    sum += r3[2] * k3[2];
+                    sum += r3[3] * k3[3];
+                    sum += r3[4] * k3[4];
+
+                    sum += r4[0] * k4[0];
+                    sum += r4[1] * k4[1];
+                    sum += r4[2] * k4[2];
+                    sum += r4[3] * k4[3];
+                    sum += r4[4] * k4[4];
+                    // second output row: same weights, input window shifted down by one row
+                    sum2 += r1[0] * k0[0];
+                    sum2 += r1[1] * k0[1];
+                    sum2 += r1[2] * k0[2];
+                    sum2 += r1[3] * k0[3];
+                    sum2 += r1[4] * k0[4];
+
+                    sum2 += r2[0] * k1[0];
+                    sum2 += r2[1] * k1[1];
+                    sum2 += r2[2] * k1[2];
+                    sum2 += r2[3] * k1[3];
+                    sum2 += r2[4] * k1[4];
+
+                    sum2 += r3[0] * k2[0];
+                    sum2 += r3[1] * k2[1];
+                    sum2 += r3[2] * k2[2];
+                    sum2 += r3[3] * k2[3];
+                    sum2 += r3[4] * k2[4];
+
+                    sum2 += r4[0] * k3[0];
+                    sum2 += r4[1] * k3[1];
+                    sum2 += r4[2] * k3[2];
+                    sum2 += r4[3] * k3[3];
+                    sum2 += r4[4] * k3[4];
+
+                    sum2 += r5[0] * k4[0];
+                    sum2 += r5[1] * k4[1];
+                    sum2 += r5[2] * k4[2];
+                    sum2 += r5[3] * k4[3];
+                    sum2 += r5[4] * k4[4];
+
+                    *outptr += sum;
+                    *outptr2 += sum2;
+
+                    r0++;
+                    r1++;
+                    r2++;
+                    r3++;
+                    r4++;
+                    r5++;
+                    outptr++;
+                    outptr2++;
+                }
+                // skip the 4 trailing input columns plus one whole row (assumes w == outw + 4 -- TODO confirm)
+                r0 += 4 + w;
+                r1 += 4 + w;
+                r2 += 4 + w;
+                r3 += 4 + w;
+                r4 += 4 + w;
+                r5 += 4 + w;
+
+                outptr += outw;
+                outptr2 += outw;
+            }
+            // leftover single row when outh is odd
+            for (; i < outh; i++)
+            {
+
+                int remain = outw;
+
+                for (; remain>0; remain--)
+                {
+                    float sum = 0;
+
+                    sum += r0[0] * k0[0];
+                    sum += r0[1] * k0[1];
+                    sum += r0[2] * k0[2];
+                    sum += r0[3] * k0[3];
+                    sum += r0[4] * k0[4];
+
+                    sum += r1[0] * k1[0];
+                    sum += r1[1] * k1[1];
+                    sum += r1[2] * k1[2];
+                    sum += r1[3] * k1[3];
+                    sum += r1[4] * k1[4];
+
+                    sum += r2[0] * k2[0];
+                    sum += r2[1] * k2[1];
+                    sum += r2[2] * k2[2];
+                    sum += r2[3] * k2[3];
+                    sum += r2[4] * k2[4];
+
+                    sum += r3[0] * k3[0];
+                    sum += r3[1] * k3[1];
+                    sum += r3[2] * k3[2];
+                    sum += r3[3] * k3[3];
+                    sum += r3[4] * k3[4];
+
+                    sum += r4[0] * k4[0];
+                    sum += r4[1] * k4[1];
+                    sum += r4[2] * k4[2];
+                    sum += r4[3] * k4[3];
+                    sum += r4[4] * k4[4];
+
+                    *outptr += sum;
+
+                    r0++;
+                    r1++;
+                    r2++;
+                    r3++;
+                    r4++;
+                    outptr++;
+                }
+
+                r0 += 4;
+                r1 += 4;
+                r2 += 4;
+                r3 += 4;
+                r4 += 4;
+
+            }
+
+        }
+    }
+
+}
diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp
new file mode 100644
index 00000000000..75e0e4121c6
--- /dev/null
+++ b/src/layer/x86/convolution_x86.cpp
@@ -0,0 +1,109 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "convolution_x86.h"
+
+namespace ncnn {
+
+#include "convolution_3x3.h"
+#include "convolution_5x5.h"
+
+DEFINE_LAYER_CREATOR(Convolution_x86)
+// Convolves bottom_blob into top_blob with a specialised kernel when the table below has one, else defers to Convolution::forward.
+int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    // Returns 0 on success and -100 when the padded input or the output blob is empty after creation;
+    // on the generic path the result of Convolution::forward is returned unchanged.
+
+    if (kernel_size < 1 || kernel_size > 5 || stride < 1 || stride > 5 || dilation != 1)
+    {
+        return Convolution::forward(bottom_blob, top_blob); // would index outside the 5x5 table
+    }
+
+    typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&);
+
+    // kernel_size x stride: row = kernel_size-1, column = stride-1, 0 = no specialised kernel
+    static const conv_func conv_func_table[5][5] =
+    {
+        {
+            0,
+            0,
+            0,
+            0,
+            0
+        }, // kernel_size = 1
+        {
+            0,
+            0,
+            0,
+            0,
+            0
+        }, // kernel_size = 2
+        {
+            conv3x3s1_sse, // stride = 1
+            0,
+            0,
+            0,
+            0
+        }, // kernel_size = 3
+        {
+            0,
+            0,
+            0,
+            0,
+            0
+        }, // kernel_size = 4
+        {
+            conv5x5s1_sse, // stride = 1
+            0,
+            0,
+            0,
+            0
+        }  // kernel_size = 5
+    };
+
+    conv_func conv = conv_func_table[kernel_size-1][stride-1];
+    if (!conv)
+    {
+        return Convolution::forward(bottom_blob, top_blob);
+    }
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    // the channel count is not needed here: the kernels read it from the blob themselves
+
+    Mat bottom_blob_bordered = bottom_blob;
+    if (pad > 0)
+    {
+        copy_make_border(bottom_blob, bottom_blob_bordered, pad, pad, pad, pad, BORDER_CONSTANT, 0.f);
+        if (bottom_blob_bordered.empty())
+            return -100;
+
+        w = bottom_blob_bordered.w;
+        h = bottom_blob_bordered.h;
+    }
+
+    int outw = (w - kernel_size) / stride + 1;
+    int outh = (h - kernel_size) / stride + 1;
+
+    top_blob.create(outw, outh, num_output);
+    if (top_blob.empty())
+        return -100;
+
+    conv(bottom_blob_bordered, top_blob, weight_data, bias_data);
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/x86/convolution_x86.h b/src/layer/x86/convolution_x86.h
new file mode 100644
index 00000000000..019ba9ea5bd
--- /dev/null
+++ b/src/layer/x86/convolution_x86.h
@@ -0,0 +1,30 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_CONVOLUTION_X86_H
+#define LAYER_CONVOLUTION_X86_H
+
+#include "convolution.h"
+
+namespace ncnn {
+// Convolution layer subclass that supplies its own forward(); everything else is inherited from Convolution.
+class Convolution_x86 : public Convolution
+{
+public:
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const; // names match the definition in convolution_x86.cpp
+};
+
+} // namespace ncnn
+
+#endif // LAYER_CONVOLUTION_X86_H
diff --git a/src/layer/x86/sse_mathfun.h b/src/layer/x86/sse_mathfun.h
new file mode 100644
index 00000000000..2a7862ac963
--- /dev/null
+++ b/src/layer/x86/sse_mathfun.h
@@ -0,0 +1,711 @@
+/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
+
+   Inspired by Intel Approximate Math library, and based on the
+   corresponding algorithms of the cephes math library
+
+   The default is to use the SSE1 version. If you define USE_SSE2,
+   the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
+   not expect any significant performance improvement with SSE2.
+*/
+
+/* Copyright (C) 2007  Julien Pommier
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  (this is the zlib license)
+*/
+
+#include <xmmintrin.h>
+
+/* yes I know, the top of this file is quite ugly */
+/* ALIGN16_BEG / ALIGN16_END: per-compiler spelling of a 16-byte alignment request, placed around a declaration */
+#ifdef _MSC_VER /* visual c++ */
+# define ALIGN16_BEG __declspec(align(16))
+# define ALIGN16_END 
+#else /* gcc or icc */
+# define ALIGN16_BEG
+# define ALIGN16_END __attribute__((aligned(16)))
+#endif
+
+/* __m128 is ugly to write */
+typedef __m128 v4sf;  // vector of 4 float (sse1)
+
+#ifdef USE_SSE2
+# include <emmintrin.h>
+typedef __m128i v4si; // vector of 4 int (sse2)
+#else
+typedef __m64 v2si;   // vector of 2 int (mmx)
+#endif
+/* Each macro below defines a static const array holding four copies of Val, named _ps_<Name> or _pi32_<Name> */
+/* declare some SSE constants -- why can't I figure a better way to do that? */
+#define _PS_CONST(Name, Val)                                            \
+  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+#define _PI32_CONST(Name, Val)                                            \
+  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+#define _PS_CONST_TYPE(Name, Type, Val)                                 \
+  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+
+_PS_CONST(1  , 1.0f);
+_PS_CONST(0p5, 0.5f);
+/* the smallest non denormalized float number */
+_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
+_PS_CONST_TYPE(mant_mask, int, 0x7f800000); /* bits 23..30, i.e. the single-precision exponent field despite the name */
+_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); /* every bit except 23..30: sign and fraction */
+
+_PS_CONST_TYPE(sign_mask, int, (int)0x80000000); /* bit 31 only */
+_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000); /* every bit except bit 31 */
+/* small integers used as increments and bit masks by the functions further down */
+_PI32_CONST(1, 1);
+_PI32_CONST(inv1, ~1);
+_PI32_CONST(2, 2);
+_PI32_CONST(4, 4);
+_PI32_CONST(0x7f, 0x7f); /* 127; log_ps subtracts it from, and exp_ps adds it to, a biased exponent */
+/* constants read by log_ps */
+_PS_CONST(cephes_SQRTHF, 0.707106781186547524); /* sqrt(0.5) */
+_PS_CONST(cephes_log_p0, 7.0376836292E-2);
+_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
+_PS_CONST(cephes_log_p2, 1.1676998740E-1);
+_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
+_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
+_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
+_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
+_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
+_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
+_PS_CONST(cephes_log_q1, -2.12194440e-4); /* q1 + q2 ~= ln(2), kept as two parts */
+_PS_CONST(cephes_log_q2, 0.693359375);
+/* MMX-only helpers: move data between one __m128 and two __m64 halves through a union */
+#ifndef USE_SSE2
+typedef union xmm_mm_union {
+  __m128 xmm;
+  __m64 mm[2];
+} xmm_mm_union;
+
+#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) {          \
+    xmm_mm_union u; u.xmm = xmm_;                   \
+    mm0_ = u.mm[0];                                 \
+    mm1_ = u.mm[1];                                 \
+}
+
+#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                         \
+    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;      \
+  }
+
+#endif // USE_SSE2
+
+/* log_ps: natural logarithm of the 4 packed floats in x. Lanes with x <= 0 come back with all bits
+   set (a NaN). static inline, so including this header from several source files cannot clash at link time.
+*/
+static inline v4sf log_ps(v4sf x) {
+#ifdef USE_SSE2
+  v4si emm0;
+#else
+  v2si mm0, mm1;
+#endif
+  v4sf one = *(v4sf*)_ps_1;
+  /* remember which lanes are <= 0 before x is modified */
+  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
+
+  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */
+
+#ifndef USE_SSE2
+  /* part 1: x = frexpf(x, &e); */
+  COPY_XMM_TO_MM(x, mm0, mm1);
+  mm0 = _mm_srli_pi32(mm0, 23);
+  mm1 = _mm_srli_pi32(mm1, 23);
+#else
+  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); /* biased exponent of each lane */
+#endif
+  /* keep only the fractional part, then OR in the bit pattern of 0.5 */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
+  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
+
+#ifndef USE_SSE2
+  /* now e=mm0:mm1 contain the really base-2 exponent */
+  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
+  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
+  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
+  _mm_empty(); /* bye bye mmx */
+#else
+  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
+  v4sf e = _mm_cvtepi32_ps(emm0);
+#endif
+
+  e = _mm_add_ps(e, one);
+
+  /* part2:
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
+  v4sf tmp = _mm_and_ps(x, mask);
+  x = _mm_sub_ps(x, one);
+  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
+  x = _mm_add_ps(x, tmp);
+
+  /* y = x^3 * P(x): P is evaluated by Horner's rule from p0 down to p8 */
+  v4sf z = _mm_mul_ps(x,x);
+
+  v4sf y = *(v4sf*)_ps_cephes_log_p0;
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
+  y = _mm_mul_ps(y, x);
+
+  y = _mm_mul_ps(y, z);
+
+  /* add the exponent contribution in two parts: e*q1 here, e*q2 at the end */
+  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
+  y = _mm_add_ps(y, tmp);
+
+  /* y -= z/2 */
+  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+
+  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
+  x = _mm_add_ps(x, y);
+  x = _mm_add_ps(x, tmp);
+  x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
+  return x;
+}
+
+_PS_CONST(exp_hi,	88.3762626647949f); /* exp_ps clamps its argument to [exp_lo, exp_hi] */
+_PS_CONST(exp_lo,	-88.3762626647949f);
+
+_PS_CONST(cephes_LOG2EF, 1.44269504088896341); /* log2(e) */
+_PS_CONST(cephes_exp_C1, 0.693359375); /* C1 + C2 ~= ln(2); exp_ps subtracts fx*C1 and then fx*C2 */
+_PS_CONST(cephes_exp_C2, -2.12194440e-4);
+/* polynomial coefficients, in the order exp_ps consumes them (p0 first) */
+_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
+_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
+_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
+_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
+_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
+_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
+/* exp_ps: e^x for the 4 packed floats in x, with x clamped to [exp_lo, exp_hi] first. static inline, so including this header from several source files cannot clash at link time. */
+static inline v4sf exp_ps(v4sf x) {
+  v4sf tmp = _mm_setzero_ps(), fx;
+#ifdef USE_SSE2
+  v4si emm0;
+#else
+  v2si mm0, mm1;
+#endif
+  v4sf one = *(v4sf*)_ps_1;
+
+  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
+  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
+  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
+
+  /* how to perform a floorf with SSE: just below */
+#ifndef USE_SSE2
+  /* step 1 : cast to int */
+  tmp = _mm_movehl_ps(tmp, fx);
+  mm0 = _mm_cvttps_pi32(fx);
+  mm1 = _mm_cvttps_pi32(tmp);
+  /* step 2 : cast back to float */
+  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
+#else
+  emm0 = _mm_cvttps_epi32(fx);
+  tmp  = _mm_cvtepi32_ps(emm0);
+#endif
+  /* truncation rounds toward zero, so where the result is greater than fx, subtract 1 */
+  v4sf mask = _mm_cmpgt_ps(tmp, fx);
+  mask = _mm_and_ps(mask, one);
+  fx = _mm_sub_ps(tmp, mask);
+  /* x -= fx*C1 + fx*C2, done as two subtractions */
+  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
+  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
+  x = _mm_sub_ps(x, tmp);
+  x = _mm_sub_ps(x, z);
+
+  z = _mm_mul_ps(x,x);
+  /* y = 1 + x + x*x*P(x), P evaluated by Horner's rule from p0 down to p5 */
+  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, x);
+  y = _mm_add_ps(y, one);
+
+  /* build 2^n: add 0x7f to n and shift the sum into the exponent field (bit 23 upward) */
+#ifndef USE_SSE2
+  z = _mm_movehl_ps(z, fx);
+  mm0 = _mm_cvttps_pi32(fx);
+  mm1 = _mm_cvttps_pi32(z);
+  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
+  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
+  mm0 = _mm_slli_pi32(mm0, 23);
+  mm1 = _mm_slli_pi32(mm1, 23);
+
+  v4sf pow2n;
+  COPY_MM_TO_XMM(mm0, mm1, pow2n);
+  _mm_empty();
+#else
+  emm0 = _mm_cvttps_epi32(fx);
+  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
+  emm0 = _mm_slli_epi32(emm0, 23);
+  v4sf pow2n = _mm_castsi128_ps(emm0);
+#endif
+  y = _mm_mul_ps(y, pow2n);
+  return y;
+}
+
+_PS_CONST(minus_cephes_DP1, -0.78515625); /* DP1 + DP2 + DP3 ~= Pi/4, stored negated and split into three parts */
+_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+_PS_CONST(sincof_p0, -1.9515295891E-4); /* sincof_*: coefficients of the polynomial that sin_ps multiplies by x */
+_PS_CONST(sincof_p1,  8.3321608736E-3);
+_PS_CONST(sincof_p2, -1.6666654611E-1);
+_PS_CONST(coscof_p0,  2.443315711809948E-005); /* coscof_*: coefficients of the polynomial in z = x*x that ends with - z/2 + 1 */
+_PS_CONST(coscof_p1, -1.388731625493765E-003);
+_PS_CONST(coscof_p2,  4.166664568298827E-002);
+_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
+
+
+/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics so
+   it also runs on old Athlon XPs and the Pentium III of your
+   grandmother.
+
+   The code is the exact rewriting of the cephes sinf function.
+   Precision is excellent as long as x < 8192 (I did not bother to
+   take into account the special handling they have for greater values
+   -- it does not return garbage for arguments over 8192, though, but
+   the extra precision is missing).
+
+   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+   surprising but correct result.
+
+   Performance is also surprisingly good, 1.33 times faster than the
+   macos vsinf SSE2 function, and 1.5 times faster than the
+   __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
+   too bad for an SSE1 function (with no special tuning) !
+   However the latter libraries probably have a much better handling of NaN,
+   Inf, denormalized and other special arguments..
+
+   On my core 1 duo, the execution of this function takes approximately 95 cycles.
+
+   From what I have observed on the experiments with Intel AMath lib, switching to an
+   SSE2 version would improve the perf by only 10%.
+
+   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
+   deliver full speed.
+*/
+v4sf sin_ps(v4sf x) { // sine of each lane of x; any x
+  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
+  /* outline: |x| * 4/Pi gives an even index j, x is reduced by j, both polynomials are evaluated, one is kept per lane */
+#ifdef USE_SSE2
+  v4si emm0, emm2;
+#else
+  v2si mm0, mm1, mm2, mm3;
+#endif
+  sign_bit = x;
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+  /* extract the sign bit (upper one) */
+  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+
+#ifdef USE_SSE2
+  /* store the integer part of y in emm2 */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+  y = _mm_cvtepi32_ps(emm2);
+
+  /* get the swap sign flag */
+  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  /* get the polynomial selection mask 
+     there is one polynomial for 0 <= x <= Pi/4
+     and another one for Pi/4<x<=Pi/2
+
+     Both branches will be computed.
+  */
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  
+  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
+  v4sf poly_mask = _mm_castsi128_ps(emm2);
+  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+  
+#else
+  /* store the integer part of y in mm2:mm3 (low two lanes : high two lanes) */
+  xmm2 = _mm_movehl_ps(xmm2, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm2);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+  /* get the swap sign flag */
+  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+  mm0 = _mm_slli_pi32(mm0, 29);
+  mm1 = _mm_slli_pi32(mm1, 29);
+  /* get the polynomial selection mask */
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+  v4sf swap_sign_bit, poly_mask;
+  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+  _mm_empty(); /* good-bye mmx */
+#endif
+  
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+
+  /* Evaluate the first polynomial (coscof): y = 1 - z/2 + z*z*((p0*z + p1)*z + p2), with z = x*x */
+  y = *(v4sf*)_ps_coscof_p0;
+  v4sf z = _mm_mul_ps(x,x);
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(v4sf*)_ps_1);
+  
+  /* Evaluate the second polynomial (sincof): y2 = x + x*z*((p0*z + p1)*z + p2) */
+
+  v4sf y2 = *(v4sf*)_ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynomials */  
+  xmm3 = poly_mask;
+  y2 = _mm_and_ps(xmm3, y2); /* y2 survives in lanes where the mask is set */
+  y = _mm_andnot_ps(xmm3, y); /* y survives in lanes where the mask is clear */
+  y = _mm_add_ps(y,y2);
+  /* update the sign */
+  y = _mm_xor_ps(y, sign_bit);
+  return y;
+}
+
+/* almost the same as sin_ps: the input sign is dropped and the octant index is shifted by 2 before the sign/mask extraction */
+v4sf cos_ps(v4sf x) { // cosine of each lane of x; any x
+  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+#ifdef USE_SSE2
+  v4si emm0, emm2;
+#else
+  v2si mm0, mm1, mm2, mm3;
+#endif
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+  
+#ifdef USE_SSE2
+  /* store the integer part of y in emm2 */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+  y = _mm_cvtepi32_ps(emm2);
+
+  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
+  
+  /* get the swap sign flag */
+  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  /* get the polynomial selection mask */
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  
+  v4sf sign_bit = _mm_castsi128_ps(emm0);
+  v4sf poly_mask = _mm_castsi128_ps(emm2);
+#else
+  /* store the integer part of y in mm2:mm3 (low two lanes : high two lanes) */
+  xmm2 = _mm_movehl_ps(xmm2, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm2);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+
+  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
+
+  /* get the swap sign flag in mm0:mm1 and the 
+     polynomial selection mask in mm2:mm3 */
+
+  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
+  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
+  mm0 = _mm_slli_pi32(mm0, 29);
+  mm1 = _mm_slli_pi32(mm1, 29);
+
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+
+  v4sf sign_bit, poly_mask;
+  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+  _mm_empty(); /* good-bye mmx */
+#endif
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+  
+  /* Evaluate the first polynomial (coscof): y = 1 - z/2 + z*z*((p0*z + p1)*z + p2), with z = x*x */
+  y = *(v4sf*)_ps_coscof_p0;
+  v4sf z = _mm_mul_ps(x,x);
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(v4sf*)_ps_1);
+  
+  /* Evaluate the second polynomial (sincof): y2 = x + x*z*((p0*z + p1)*z + p2) */
+
+  v4sf y2 = *(v4sf*)_ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynomials */  
+  xmm3 = poly_mask;
+  y2 = _mm_and_ps(xmm3, y2); /* y2 survives in lanes where the mask is set */
+  y = _mm_andnot_ps(xmm3, y); /* y survives in lanes where the mask is clear */
+  y = _mm_add_ps(y,y2);
+  /* update the sign */
+  y = _mm_xor_ps(y, sign_bit);
+
+  return y;
+}
+
+/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
+   it is almost as fast, and gives you a free cosine with your sine (results go to *s and *c) */
+void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
+  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
+#ifdef USE_SSE2
+  v4si emm0, emm2, emm4;
+#else
+  v2si mm0, mm1, mm2, mm3, mm4, mm5;
+#endif
+  sign_bit_sin = x;
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+  /* extract the sign bit (upper one) */
+  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+    
+#ifdef USE_SSE2
+  /* store the integer part of y in emm2 */
+  emm2 = _mm_cvttps_epi32(y);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+  y = _mm_cvtepi32_ps(emm2);
+  /* keep a copy of j: the cosine sign is derived from it after the range reduction */
+  emm4 = emm2;
+
+  /* get the swap sign flag for the sine */
+  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);
+
+  /* get the polynomial selection mask for the sine */
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  v4sf poly_mask = _mm_castsi128_ps(emm2);
+#else
+  /* store the integer part of y in mm2:mm3 */
+  xmm3 = _mm_movehl_ps(xmm3, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm3);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+  /* keep a copy of j in mm4:mm5: the cosine sign is derived from it later */
+  mm4 = mm2;
+  mm5 = mm3;
+
+  /* get the swap sign flag for the sine */
+  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+  mm0 = _mm_slli_pi32(mm0, 29);
+  mm1 = _mm_slli_pi32(mm1, 29);
+  v4sf swap_sign_bit_sin;
+  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
+
+  /* get the polynomial selection mask for the sine */
+
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+  v4sf poly_mask;
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+#endif
+
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+  /* sign flag for the cosine, computed from the saved copy of j (same steps as cos_ps) */
+#ifdef USE_SSE2
+  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
+  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
+  emm4 = _mm_slli_epi32(emm4, 29);
+  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
+#else
+  /* get the sign flag for the cosine */
+  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
+  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
+  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
+  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
+  mm4 = _mm_slli_pi32(mm4, 29);
+  mm5 = _mm_slli_pi32(mm5, 29);
+  v4sf sign_bit_cos;
+  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
+  _mm_empty(); /* good-bye mmx */
+#endif
+
+  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+  
+  /* Evaluate the first polynomial (coscof): y = 1 - z/2 + z*z*((p0*z + p1)*z + p2), with z = x*x */
+  v4sf z = _mm_mul_ps(x,x);
+  y = *(v4sf*)_ps_coscof_p0;
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(v4sf*)_ps_1);
+  
+  /* Evaluate the second polynomial (sincof): y2 = x + x*z*((p0*z + p1)*z + p2) */
+
+  v4sf y2 = *(v4sf*)_ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynomials */  
+  xmm3 = poly_mask;
+  v4sf ysin2 = _mm_and_ps(xmm3, y2);
+  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
+  y2 = _mm_sub_ps(y2,ysin2);
+  y = _mm_sub_ps(y, ysin1);
+  /* the sine takes the masked parts (ysin1, ysin2); the cosine takes what is left of y and y2 */
+  xmm1 = _mm_add_ps(ysin1,ysin2);
+  xmm2 = _mm_add_ps(y,y2);
+ 
+  /* update the sign */
+  *s = _mm_xor_ps(xmm1, sign_bit_sin);
+  *c = _mm_xor_ps(xmm2, sign_bit_cos);
+}
+
diff --git a/src/mat.cpp b/src/mat.cpp
new file mode 100644
index 00000000000..0ed909eb6d7
--- /dev/null
+++ b/src/mat.cpp
@@ -0,0 +1,520 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "mat.h"
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+#include "cpu.h"
+
+namespace ncnn {
+
+void Mat::substract_mean_normalize(const float* mean_vals, const float* norm_vals)
+{
+    int size = w * h;
+    // in place, per channel q (plane of `size` floats at data + cstep * q); a null array skips that step
+    if (mean_vals && !norm_vals)
+    {
+        // subtract mean only
+        #pragma omp parallel for
+        for (int q=0; q<c; q++)
+        {
+            float* ptr = data + cstep * q;
+            const float mean = mean_vals[q];
+
+#if __ARM_NEON
+            int nn = size >> 2; // number of 4-float groups for the vector loop
+            int remain = size - (nn << 2); // leftover floats for the scalar loop
+#else
+            int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+            float32x4_t _mean = vdupq_n_f32(mean);
+            for (; nn>0; nn--)
+            {
+                float32x4_t _ptr = vld1q_f32(ptr);
+                _ptr = vsubq_f32(_ptr, _mean);
+                vst1q_f32(ptr, _ptr);
+                ptr += 4;
+            }
+#else
+            if (nn > 0)
+            {
+            asm volatile(
+                "vdup.f32   q1, %4              \n"
+                "0:                             \n"
+                "pld        [%1, #128]          \n"
+                "vld1.f32   {d0-d1}, [%1 :128]  \n"
+                "vsub.f32   q0, q0, q1          \n"
+                "subs       %0, #1              \n"
+                "vst1.f32   {d0-d1}, [%1 :128]! \n"
+                "bne        0b                  \n"
+                : "=r"(nn),     // %0
+                  "=r"(ptr)     // %1
+                : "0"(nn),
+                  "1"(ptr),
+                  "r"(mean)     // %4
+                : "cc", "memory", "q0", "q1"
+            );
+            }
+#endif // __aarch64__
+#endif // __ARM_NEON
+            for (; remain>0; remain--) // scalar tail (the whole plane when NEON is not compiled in)
+            {
+                *ptr -= mean;
+                ptr++;
+            }
+        }
+    }
+    else if (!mean_vals && norm_vals)
+    {
+        // normalize only
+        #pragma omp parallel for
+        for (int q=0; q<c; q++)
+        {
+            float* ptr = data + cstep * q;
+            const float norm = norm_vals[q];
+
+#if __ARM_NEON
+            int nn = size >> 2;
+            int remain = size - (nn << 2);
+#else
+            int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+            float32x4_t _norm = vdupq_n_f32(norm);
+            for (; nn>0; nn--)
+            {
+                float32x4_t _ptr = vld1q_f32(ptr);
+                _ptr = vmulq_f32(_ptr, _norm);
+                vst1q_f32(ptr, _ptr);
+                ptr += 4;
+            }
+#else
+            if (nn > 0)
+            {
+            asm volatile(
+                "vdup.f32   q1, %4              \n"
+                "0:                             \n"
+                "pld        [%1, #128]          \n"
+                "vld1.f32   {d0-d1}, [%1 :128]  \n"
+                "vmul.f32   q0, q0, q1          \n"
+                "subs       %0, #1              \n"
+                "vst1.f32   {d0-d1}, [%1 :128]! \n"
+                "bne        0b                  \n"
+                : "=r"(nn),     // %0
+                  "=r"(ptr)     // %1
+                : "0"(nn),
+                  "1"(ptr),
+                  "r"(norm)     // %4
+                : "cc", "memory", "q0", "q1"
+            );
+            }
+#endif // __aarch64__
+#endif // __ARM_NEON
+            for (; remain>0; remain--) // scalar tail
+            {
+                *ptr *= norm;
+                ptr++;
+            }
+        }
+    }
+    else if (mean_vals && norm_vals)
+    {
+        // subtract mean, then normalize
+        #pragma omp parallel for
+        for (int q=0; q<c; q++)
+        {
+            float* ptr = data + cstep * q;
+            const float mean = mean_vals[q];
+            const float norm = norm_vals[q];
+
+#if __ARM_NEON
+            int nn = size >> 2;
+            int remain = size - (nn << 2);
+#else
+            int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+            float32x4_t _mean = vdupq_n_f32(mean);
+            float32x4_t _norm = vdupq_n_f32(norm);
+            for (; nn>0; nn--)
+            {
+                float32x4_t _ptr = vld1q_f32(ptr);
+                _ptr = vsubq_f32(_ptr, _mean);
+                _ptr = vmulq_f32(_ptr, _norm);
+                vst1q_f32(ptr, _ptr);
+                ptr += 4;
+            }
+#else
+            if (nn > 0)
+            {
+            asm volatile(
+                "vdup.f32   q1, %4              \n"
+                "vdup.f32   q2, %5              \n"
+                "0:                             \n"
+                "pld        [%1, #128]          \n"
+                "vld1.f32   {d0-d1}, [%1 :128]  \n"
+                "vsub.f32   q0, q0, q1          \n"
+                "vmul.f32   q0, q0, q2          \n"
+                "subs       %0, #1              \n"
+                "vst1.f32   {d0-d1}, [%1 :128]! \n"
+                "bne        0b                  \n"
+                : "=r"(nn),     // %0
+                  "=r"(ptr)     // %1
+                : "0"(nn),
+                  "1"(ptr),
+                  "r"(mean),    // %4
+                  "r"(norm)     // %5
+                : "cc", "memory", "q0", "q1", "q2"
+            );
+            }
+#endif // __aarch64__
+#endif // __ARM_NEON
+            for (; remain>0; remain--) // scalar tail
+            {
+                *ptr = (*ptr - mean) * norm;
+                ptr++;
+            }
+        }
+    }
+}
+
+// convert a half precision (1:5:10) bit pattern to float; handles zero, denormal, infinity/NaN and normalized inputs
+static float half2float(unsigned short value)
+{
+    // 1 : 5 : 10
+    unsigned int sign = (value & 0x8000u) >> 15;
+    unsigned int exponent = (value & 0x7c00u) >> 10;
+    unsigned int significand = value & 0x03FFu;
+
+    // the fields are held as unsigned int so that no shift below is evaluated in signed int (sign << 31 would overflow it)
+
+    // 1 : 8 : 23
+    union
+    {
+        unsigned int u;
+        float f;
+    } tmp;
+    if (exponent == 0)
+    {
+        if (significand == 0)
+        {
+            // zero
+            tmp.u = (sign << 31);
+        }
+        else
+        {
+            // denormal
+            // exponent is 0 in this branch; from here on it counts the normalization shifts
+            // find non-zero bit
+            while ((significand & 0x200u) == 0)
+            {
+                significand <<= 1;
+                exponent++;
+            }
+            significand <<= 1;
+            significand &= 0x3FFu;
+            tmp.u = (sign << 31) | (((127u - 15u) - exponent) << 23) | (significand << 13);
+        }
+    }
+    else if (exponent == 0x1F)
+    {
+        // infinity or NaN
+        tmp.u = (sign << 31) | (0xFFu << 23) | (significand << 13);
+    }
+    else
+    {
+        // normalized
+        tmp.u = (sign << 31) | ((exponent + (127u - 15u)) << 23) | (significand << 13);
+    }
+
+    return tmp.f;
+}
+
+Mat Mat::from_float16(const unsigned short* data, int size)
+{
+    Mat m(size);
+    if (m.empty())
+        return m;
+    // converts `size` half precision values read from data into the floats of m, which is returned
+    float* ptr = m.data;
+
+#if __ARM_NEON && (__ARM_FP & 2)
+    int nn = cpu_support_arm_vfpv4() ? size >> 2 : 0; // 4 values per asm iteration, only when the runtime check passes
+    int remain = size - (nn << 2);
+#else
+    int remain = size;
+#endif // __ARM_NEON && (__ARM_FP & 2)
+
+#if __ARM_NEON && (__ARM_FP & 2)
+#if __aarch64__
+    if (nn > 0)
+    {
+    asm volatile(
+        "0:                             \n"
+        "ldr    d0, [%1], #8            \n"
+        "fcvtl  v1.4s, v0.4h            \n"
+        "subs   %w0, %w0, #1            \n"
+        "str    q1, [%2], #16           \n"
+        "bne    0b                      \n"
+        : "=r"(nn),     // %0
+          "=r"(data),   // %1
+          "=r"(ptr)     // %2
+        : "0"(nn),
+          "1"(data),
+          "2"(ptr)
+        : "cc", "memory", "v0", "v1"
+    );
+    }
+#else
+    if (nn > 0)
+    {
+    asm volatile(
+        "0:                             \n"
+        "pld        [%1, #64]           \n"
+        "vld1.s16   {d0}, [%1 :64]!     \n"
+        "vcvt.f32.f16 q1, d0            \n"
+        "subs       %0, #1              \n"
+        "vst1.f32   {d2-d3}, [%2 :128]! \n"
+        "bne        0b                  \n"
+        : "=r"(nn),     // %0
+          "=r"(data),   // %1
+          "=r"(ptr)     // %2
+        : "0"(nn),
+          "1"(data),
+          "2"(ptr)
+        : "cc", "memory", "q0", "q1"
+    );
+    }
+#endif // __aarch64__
+#endif // __ARM_NEON && (__ARM_FP & 2)
+    for (; remain>0; remain--) // scalar path: everything the asm loop above did not convert
+    {
+        *ptr = half2float(*data);
+
+        data++;
+        ptr++;
+    }
+
+    return m;
+}
+
+static void copy_make_border_image(const Mat& src, Mat& dst, int top, int left, int type, float v)
+{
+    const int out_w = dst.w;
+    const int out_h = dst.h;
+    // fills every row of dst: src sits at column `left`, row `top`; the rest is border chosen by `type`
+    const float* in_row = src.data;
+    float* out_row = dst.data;
+
+    if (type == BORDER_CONSTANT)
+    {
+        int row = 0;
+        // rows above the image: all v
+        for (; row < top; row++)
+        {
+            int col = 0;
+            for (; col < out_w; col++)
+            {
+                out_row[col] = v;
+            }
+            out_row += out_w;
+        }
+        // rows that contain the image: v | source row | v
+        for (; row < (top + src.h); row++)
+        {
+            int col = 0;
+            for (; col < left; col++)
+            {
+                out_row[col] = v;
+            }
+            for (; col < (left + src.w); col++)
+            {
+                out_row[col] = in_row[col - left];
+            }
+            for (; col < out_w; col++)
+            {
+                out_row[col] = v;
+            }
+            in_row += src.w;
+            out_row += out_w;
+        }
+        // rows below the image: all v
+        for (; row < out_h; row++)
+        {
+            int col = 0;
+            for (; col < out_w; col++)
+            {
+                out_row[col] = v;
+            }
+            out_row += out_w;
+        }
+    }
+    else if (type == BORDER_REPLICATE)
+    {
+        int row = 0;
+        // rows above the image: repeat the first source row (in_row is not advanced)
+        for (; row < top; row++)
+        {
+            int col = 0;
+            for (; col < left; col++)
+            {
+                out_row[col] = in_row[0];
+            }
+            for (; col < (left + src.w); col++)
+            {
+                out_row[col] = in_row[col - left];
+            }
+            for (; col < out_w; col++)
+            {
+                out_row[col] = in_row[src.w - 1];
+            }
+            out_row += out_w;
+        }
+        // rows that contain the image: first pixel | source row | last pixel
+        for (; row < (top + src.h); row++)
+        {
+            int col = 0;
+            for (; col < left; col++)
+            {
+                out_row[col] = in_row[0];
+            }
+            for (; col < (left + src.w); col++)
+            {
+                out_row[col] = in_row[col - left];
+            }
+            for (; col < out_w; col++)
+            {
+                out_row[col] = in_row[src.w - 1];
+            }
+            in_row += src.w;
+            out_row += out_w;
+        }
+        // rows below the image: step back to the last source row and repeat it
+        in_row -= src.w;
+        for (; row < out_h; row++)
+        {
+            int col = 0;
+            for (; col < left; col++)
+            {
+                out_row[col] = in_row[0];
+            }
+            for (; col < (left + src.w); col++)
+            {
+                out_row[col] = in_row[col - left];
+            }
+            for (; col < out_w; col++)
+            {
+                out_row[col] = in_row[src.w - 1];
+            }
+            out_row += out_w;
+        }
+    }
+}
+
+// Allocates dst as src enlarged by the four margins, then fills it plane by plane.
+void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v)
+{
+    const int padded_w = src.w + left + right;
+    const int padded_h = src.h + top + bottom;
+    if (src.dims == 2)
+    {
+        // single image: pad it directly
+        dst.create(padded_w, padded_h);
+        if (!dst.empty())
+            copy_make_border_image(src, dst, top, left, type, v);
+        return;
+    }
+
+    // any dimensionality other than 2 or 3 leaves dst untouched
+    if (src.dims != 3)
+        return;
+
+    const int channels = src.c;
+    dst.create(padded_w, padded_h, channels);
+    if (dst.empty())
+        return;
+
+    // pad every channel plane independently
+    #pragma omp parallel for
+    for (int q = 0; q < channels; q++)
+    {
+        const Mat src_plane = src.channel(q);
+        Mat dst_plane = dst.channel(q);
+        copy_make_border_image(src_plane, dst_plane, top, left, type, v);
+    }
+}
+
+// Copies into dst the dst.w x dst.h window of src that starts at column `left`, row `top`.
+static void copy_cut_border_image(const Mat& src, Mat& dst, int top, int left)
+{
+    const int out_w = dst.w;
+    const int out_h = dst.h;
+    const float* in_row = src.data + src.w * top + left;
+    float* out_row = dst.data;
+
+    int row = 0;
+    while (row < out_h)
+    {
+        for (int col = 0; col < out_w; col++)
+            out_row[col] = in_row[col];
+        out_row += out_w;
+        in_row += src.w;
+        row++;
+    }
+}
+
+// Allocates dst as src shrunk by the four margins, then copies the kept window plane by plane.
+void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
+{
+    const int cut_w = src.w - left - right;
+    const int cut_h = src.h - top - bottom;
+    if (src.dims == 2)
+    {
+        // single image: crop it directly
+        dst.create(cut_w, cut_h);
+        if (!dst.empty())
+            copy_cut_border_image(src, dst, top, left);
+        return;
+    }
+
+    // any dimensionality other than 2 or 3 leaves dst untouched
+    if (src.dims != 3)
+        return;
+
+    const int channels = src.c;
+    dst.create(cut_w, cut_h, channels);
+    if (dst.empty())
+        return;
+
+    // crop every channel plane independently
+    #pragma omp parallel for
+    for (int q = 0; q < channels; q++)
+    {
+        const Mat src_plane = src.channel(q);
+        Mat dst_plane = dst.channel(q);
+        copy_cut_border_image(src_plane, dst_plane, top, left);
+    }
+}
+
+} // namespace ncnn
diff --git a/src/mat.h b/src/mat.h
new file mode 100644
index 00000000000..e07723adc55
--- /dev/null
+++ b/src/mat.h
@@ -0,0 +1,512 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_MAT_H
+#define NCNN_MAT_H
+
+#include <stdlib.h>
+#include <string.h>
+
+namespace ncnn {
+
+// the three dimension matrix
+class Mat
+{
+public:
+    // empty
+    Mat();
+    // vec
+    Mat(int w);
+    // image
+    Mat(int w, int h);
+    // dim
+    Mat(int w, int h, int c);
+    // copy
+    Mat(const Mat& m);
+    // external vec
+    Mat(int w, float* data);
+    // external image
+    Mat(int w, int h, float* data);
+    // external dim
+    Mat(int w, int h, int c, float* data);
+    // release
+    ~Mat();
+    // assign
+    Mat& operator=(const Mat& m);
+    // set all
+    void fill(float v);
+    // deep copy
+    Mat clone() const;
+    // reshape vec
+    Mat reshape(int w) const;
+    // reshape image
+    Mat reshape(int w, int h) const;
+    // reshape dim
+    Mat reshape(int w, int h, int c) const;
+    // allocate vec
+    void create(int w);
+    // allocate image
+    void create(int w, int h);
+    // allocate dim
+    void create(int w, int h, int c);
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // data reference
+    Mat channel(int c);
+    const Mat channel(int c) const;
+    float* row(int y);
+    const float* row(int y) const;
+    operator float*();
+    operator const float*() const;
+
+    enum
+    {
+        PIXEL_CONVERT_SHIFT = 16,
+        PIXEL_FORMAT_MASK = 0x0000ffff,
+        PIXEL_CONVERT_MASK = 0xffff0000,
+
+        PIXEL_RGB       = 1,
+        PIXEL_BGR       = (1 << 1),
+        PIXEL_GRAY      = (1 << 2),
+        PIXEL_RGBA      = (1 << 3),
+
+        PIXEL_RGB2BGR   = PIXEL_RGB | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2GRAY  = PIXEL_RGB | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_BGR2RGB   = PIXEL_BGR | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2GRAY  = PIXEL_BGR | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_GRAY2RGB  = PIXEL_GRAY | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2BGR  = PIXEL_GRAY | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_RGBA2RGB  = PIXEL_RGBA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2BGR  = PIXEL_RGBA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2GRAY = PIXEL_RGBA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+    };
+    // convenient construct from pixel data
+    static Mat from_pixels(const unsigned char* pixels, int type, int w, int h);
+    // convenient construct from pixel data and resize to specific size
+    static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height);
+
+    // convenient export to pixel data
+    void to_pixels(unsigned char* pixels, int type);
+    // convenient export to pixel data and resize to specific size
+    void to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height);
+
+    // subtract channel-wise mean values, then multiply by normalize values; pass 0 to skip either step
+    void substract_mean_normalize(const float* mean_vals, const float* norm_vals);
+
+    // convenient construct from half precision floating point data
+    static Mat from_float16(const unsigned short* data, int size);
+
+    // the dimensionality
+    int dims;
+    // pointer to the data
+    float* data;
+
+    // pointer to the reference counter;
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    int w;
+    int h;
+    int c;
+
+    size_t cstep;
+};
+
+// misc function
+// image pixel bilinear resize
+void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+
+// mat process
+enum
+{
+    BORDER_CONSTANT = 0,
+    BORDER_REPLICATE = 1,
+};
+void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v);
+void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right);
+
+// the alignment of all the allocated buffers
+#define MALLOC_ALIGN    16
+
+// Rounds ptr up to the next address that is a multiple of n bytes.
+// n is the alignment and must be a power of two
+template<typename _Tp> static inline _Tp* alignPtr(_Tp* ptr, int n=(int)sizeof(_Tp))
+{
+    const size_t bumped = (size_t)ptr + n - 1;
+    return (_Tp*)(bumped & -n);
+}
+
+// Rounds a buffer size up to a multiple of n bytes.
+// Returns the smallest value that is >= sz and divisible by n.
+// n is the alignment and must be a power of two
+static inline size_t alignSize(size_t sz, int n)
+{
+    const size_t bumped = sz + n - 1;
+    return bumped & -n;
+}
+
+// Returns MALLOC_ALIGN-aligned storage of `size` bytes (0 on failure); pair with fastFree.
+static inline void* fastMalloc(size_t size)
+{
+    unsigned char* raw = (unsigned char*)malloc(size + sizeof(void*) + MALLOC_ALIGN);
+    if (raw == 0) return 0;
+    unsigned char** user = alignPtr((unsigned char**)raw + 1, MALLOC_ALIGN);
+    user[-1] = raw; // the slot just below the returned pointer remembers what malloc gave us
+    return user;
+}
+
+// Frees storage obtained from fastMalloc; a null pointer is ignored.
+static inline void fastFree(void* ptr)
+{
+    if (!ptr)
+        return;
+    // the slot just below ptr holds the address malloc originally returned
+    free(((unsigned char**)ptr)[-1]);
+}
+
+// exchange-add operation for atomic operations on reference counters
+#if defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32)
+// atomic increment on the linux version of the Intel(tm) compiler
+#  define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast<void*>(reinterpret_cast<volatile void*>(addr)), delta)
+#elif defined __GNUC__
+#  if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
+#    ifdef __ATOMIC_ACQ_REL
+#      define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
+#    else
+#      define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
+#    endif
+#  else
+#    if defined __ATOMIC_ACQ_REL && !defined __clang__
+// version for gcc >= 4.7
+#      define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
+#    else
+#      define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
+#    endif
+#  endif
+#elif defined _MSC_VER && !defined RC_INVOKED
+#  include <intrin.h>
+#  define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
+#else
+static inline int NCNN_XADD(int* addr, int delta) { int tmp = *addr; *addr += delta; return tmp; } // non-atomic fallback: adds delta, returns the previous value
+#endif
+
+// Construct an empty Mat: no dimensions, no data, no reference counter.
+inline Mat::Mat()
+    : dims(0),
+      data(0),
+      refcount(0),
+      w(0),
+      h(0),
+      c(0),
+      cstep(0)
+{
+}
+
+// Construct a 1-dimensional Mat; storage is set up by create(_w).
+inline Mat::Mat(int _w)
+    : dims(0),
+      data(0),
+      refcount(0)
+{
+    // refcount starts at 0 so the release() inside create() frees nothing
+    create(_w);
+}
+
+// Construct a 2-dimensional Mat; storage is set up by create(_w, _h).
+inline Mat::Mat(int _w, int _h)
+    : dims(0),
+      data(0),
+      refcount(0)
+{
+    // refcount starts at 0 so the release() inside create() frees nothing
+    create(_w, _h);
+}
+
+// Construct a 3-dimensional Mat; storage is set up by create(_w, _h, _c).
+inline Mat::Mat(int _w, int _h, int _c)
+    : dims(0),
+      data(0),
+      refcount(0)
+{
+    // refcount starts at 0 so the release() inside create() frees nothing
+    create(_w, _h, _c);
+}
+
+// Copy constructor: shares the data of m (no pixel copy) and, when m carries a
+// reference counter, increments it.
+inline Mat::Mat(const Mat& m)
+    : dims(m.dims),
+      data(m.data),
+      refcount(m.refcount),
+      w(m.w),
+      h(m.h),
+      c(m.c),
+      cstep(m.cstep)
+{
+    if (refcount)
+    {
+        NCNN_XADD(refcount, 1);
+    }
+}
+
+// Wrap an existing buffer as a 1-dimensional Mat of _w floats.
+// No reference counter is attached (refcount is 0).
+inline Mat::Mat(int _w, float* _data)
+    : dims(1),
+      data(_data),
+      refcount(0),
+      w(_w),
+      h(1),
+      c(1),
+      cstep(_w)
+{
+}
+
+// Wrap an existing buffer as a 2-dimensional Mat of _w x _h floats.
+// No reference counter is attached (refcount is 0).
+inline Mat::Mat(int _w, int _h, float* _data)
+    : dims(2),
+      data(_data),
+      refcount(0),
+      w(_w),
+      h(_h),
+      c(1),
+      cstep(_w * _h)
+{
+}
+
+// Wrap an existing buffer as a 3-dimensional Mat of _w x _h x _c floats.
+// No reference counter is attached (refcount is 0).
+inline Mat::Mat(int _w, int _h, int _c, float* _data)
+    : dims(3),
+      data(_data),
+      refcount(0),
+      w(_w),
+      h(_h),
+      c(_c),
+      // channel stride in floats: the byte size of one plane rounded up to 16, divided by 4
+      cstep(alignSize(_w * _h * sizeof(float), 16) >> 2)
+{
+}
+
+// Destructor: hands the cleanup to release().
+inline Mat::~Mat()
+{
+    this->release();
+}
+
+// Assignment: share the data of m instead of copying it. Self-assignment is a no-op.
+inline Mat& Mat::operator=(const Mat& m)
+{
+    if (this != &m)
+    {
+        // take the new reference before release() drops the old one
+        if (m.refcount)
+        {
+            NCNN_XADD(m.refcount, 1);
+        }
+
+        release();
+
+        refcount = m.refcount;
+        data = m.data;
+        dims = m.dims;
+
+        w = m.w;
+        h = m.h;
+        c = m.c;
+        cstep = m.cstep;
+    }
+
+    return *this;
+}
+
+// Set the first total() floats of data to _v.
+inline void Mat::fill(float _v)
+{
+    const size_t count = total();
+    float* p = data;
+    for (size_t left = count; left > 0; left--)
+    {
+        *p = _v;
+        p++;
+    }
+}
+
+// Deep copy: create a new Mat with the same dims/shape and memcpy total() floats
+// into it. An empty Mat clones to a default-constructed Mat.
+inline Mat Mat::clone() const
+{
+    if (empty())
+        return Mat();
+
+    Mat copy;
+    switch (dims)
+    {
+    case 1:
+        copy.create(w);
+        break;
+    case 2:
+        copy.create(w, h);
+        break;
+    case 3:
+        copy.create(w, h, c);
+        break;
+    default:
+        // any other dims value: nothing is created
+        break;
+    }
+
+    const size_t count = total();
+    if (count > 0)
+    {
+        memcpy(copy.data, data, count * sizeof(float));
+    }
+
+    return copy;
+}
+
+// Return a Mat sharing this Mat's data, described as 1-dimensional with _w elements.
+// No data is copied or reallocated.
+inline Mat Mat::reshape(int _w) const
+{
+    Mat view = *this;
+
+    view.dims = 1;
+    view.w = _w;
+    view.h = 1;
+    view.c = 1;
+    view.cstep = _w;
+
+    return view;
+}
+
+// Return a Mat sharing this Mat's data, described as 2-dimensional _w x _h.
+// No data is copied or reallocated.
+inline Mat Mat::reshape(int _w, int _h) const
+{
+    Mat view = *this;
+
+    view.dims = 2;
+    view.w = _w;
+    view.h = _h;
+    view.c = 1;
+    view.cstep = _w * _h;
+
+    return view;
+}
+
+// Return a Mat sharing this Mat's data, described as 3-dimensional _w x _h x _c.
+// No data is copied or reallocated; only the shape fields and cstep are rewritten.
+inline Mat Mat::reshape(int _w, int _h, int _c) const
+{
+    Mat view = *this;
+
+    view.dims = 3;
+    view.w = _w;
+    view.h = _h;
+    view.c = _c;
+
+    // channel stride in floats: the byte size of one plane rounded up to 16, divided by 4
+    const size_t planebytes = alignSize(_w * _h * sizeof(float), 16);
+    view.cstep = planebytes >> 2;
+
+    return view;
+}
+
+// Drop the current contents and allocate storage for a 1-dimensional Mat of _w floats.
+// The reference counter lives in the same allocation, right after the float payload,
+// and starts at 1. When the shape has zero elements nothing is allocated. When the
+// allocation fails the Mat is reset to the empty state (data == 0, refcount == 0),
+// so callers can detect the failure with empty().
+inline void Mat::create(int _w)
+{
+    release();
+
+    dims = 1;
+
+    w = _w;
+    h = 1;
+    c = 1;
+
+    cstep = w;
+
+    if (total() > 0)
+    {
+        size_t totalsize = total() * sizeof(float);
+        data = (float*)fastMalloc(totalsize + (int)sizeof(*refcount));
+        if (!data)
+        {
+            // never derive refcount from a null base pointer; refcount is still 0
+            // here, so release() only clears the shape fields
+            release();
+            return;
+        }
+        refcount = (int*)(((unsigned char*)data) + totalsize);
+        *refcount = 1;
+    }
+}
+
+// Drop the current contents and allocate storage for a 2-dimensional Mat of
+// _w x _h floats. The reference counter lives in the same allocation, right after
+// the float payload, and starts at 1. When the shape has zero elements nothing is
+// allocated. When the allocation fails the Mat is reset to the empty state
+// (data == 0, refcount == 0), so callers can detect the failure with empty().
+inline void Mat::create(int _w, int _h)
+{
+    release();
+
+    dims = 2;
+
+    w = _w;
+    h = _h;
+    c = 1;
+
+    // widen before multiplying so that large shapes do not overflow int
+    cstep = (size_t)w * h;
+
+    if (total() > 0)
+    {
+        size_t totalsize = total() * sizeof(float);
+        data = (float*)fastMalloc(totalsize + (int)sizeof(*refcount));
+        if (!data)
+        {
+            // never derive refcount from a null base pointer; refcount is still 0
+            // here, so release() only clears the shape fields
+            release();
+            return;
+        }
+        refcount = (int*)(((unsigned char*)data) + totalsize);
+        *refcount = 1;
+    }
+}
+
+// Drop the current contents and allocate storage for a 3-dimensional Mat of
+// _w x _h x _c floats. Each channel plane is padded so that its byte size is a
+// multiple of 16 (cstep is the resulting stride in floats). The reference counter
+// lives in the same allocation, right after the float payload, and starts at 1.
+// When the shape has zero elements nothing is allocated. When the allocation fails
+// the Mat is reset to the empty state (data == 0, refcount == 0), so callers can
+// detect the failure with empty().
+inline void Mat::create(int _w, int _h, int _c)
+{
+    release();
+
+    dims = 3;
+
+    w = _w;
+    h = _h;
+    c = _c;
+
+    // widen before multiplying so that large planes do not overflow int
+    cstep = alignSize((size_t)w * h * sizeof(float), 16) >> 2;
+
+    if (total() > 0)
+    {
+        size_t totalsize = total() * sizeof(float);
+        data = (float*)fastMalloc(totalsize + (int)sizeof(*refcount));
+        if (!data)
+        {
+            // never derive refcount from a null base pointer; refcount is still 0
+            // here, so release() only clears the shape fields
+            release();
+            return;
+        }
+        refcount = (int*)(((unsigned char*)data) + totalsize);
+        *refcount = 1;
+    }
+}
+
+// Increment the reference counter, if this Mat has one.
+inline void Mat::addref()
+{
+    if (!refcount)
+        return;
+
+    NCNN_XADD(refcount, 1);
+}
+
+// Drop this Mat's reference. The data is freed when the counter value before the
+// decrement was 1. A Mat without a counter never frees its data. Afterwards every
+// field is reset to the empty state.
+inline void Mat::release()
+{
+    if (refcount)
+    {
+        // NCNN_XADD yields the counter value from before the decrement
+        const int previous = NCNN_XADD(refcount, -1);
+        if (previous == 1)
+        {
+            fastFree(data);
+        }
+    }
+
+    refcount = 0;
+    data = 0;
+    dims = 0;
+
+    w = 0;
+    h = 0;
+    c = 0;
+    cstep = 0;
+}
+
+// True when there is no data pointer or the shape holds zero elements.
+inline bool Mat::empty() const
+{
+    if (data == 0)
+        return true;
+
+    return total() == 0;
+}
+
+// Number of floats spanned by the Mat: channel stride times channel count.
+inline size_t Mat::total() const
+{
+    const size_t count = cstep * c;
+    return count;
+}
+
+// Return a w x h Mat that points at channel c inside this Mat's data.
+// The result carries no reference counter. Note the parameter c hides the member c.
+inline Mat Mat::channel(int c)
+{
+    float* plane = data + cstep * c;
+    return Mat(w, h, plane);
+}
+
+// Const overload: return a w x h Mat that points at channel c inside this Mat's data.
+// The result carries no reference counter. Note the parameter c hides the member c.
+inline const Mat Mat::channel(int c) const
+{
+    float* plane = data + cstep * c;
+    return Mat(w, h, plane);
+}
+
+// Pointer to the start of row y, computed as data + w * y.
+inline float* Mat::row(int y)
+{
+    const int offset = w * y;
+    return data + offset;
+}
+
+// Const overload: pointer to the start of row y, computed as data + w * y.
+inline const float* Mat::row(int y) const
+{
+    const int offset = w * y;
+    return data + offset;
+}
+
+// Implicit conversion to the raw float pointer.
+inline Mat::operator float*()
+{
+    return this->data;
+}
+
+// Implicit conversion to the raw float pointer (read-only view).
+inline Mat::operator const float*() const
+{
+    return this->data;
+}
+
+} // namespace ncnn
+
+#endif // NCNN_MAT_H
diff --git a/src/mat_pixel.cpp b/src/mat_pixel.cpp
new file mode 100644
index 00000000000..3a649b89809
--- /dev/null
+++ b/src/mat_pixel.cpp
@@ -0,0 +1,2084 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "mat.h"
+#include <limits.h>
+#include <algorithm>
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+// Convert w*h interleaved 3-byte pixels into a 3-channel float Mat:
+// byte 0 of each pixel goes to channel 0, byte 1 to channel 1, byte 2 to channel 2.
+// Values are widened to float unchanged. Returns the (empty) Mat as-is when
+// m.empty() is true after construction.
+static Mat from_rgb(const unsigned char* rgb, int w, int h)
+{
+    Mat m(w, h, 3);
+    if (m.empty())
+        return m;
+
+    float* ptr0 = m.channel(0);
+    float* ptr1 = m.channel(1);
+    float* ptr2 = m.channel(2);
+
+    int size = w * h;
+
+    // with NEON, nn counts groups of 8 pixels and remain is the scalar tail
+#if __ARM_NEON
+    int nn = size >> 3;
+    int remain = size - (nn << 3);
+#else
+    int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+    // intrinsics path: de-interleave 8 pixels, widen u8 -> u16 -> u32, convert to float
+    for (; nn>0; nn--)
+    {
+        uint8x8x3_t _rgb = vld3_u8(rgb);
+        uint16x8_t _r16 = vmovl_u8(_rgb.val[0]);
+        uint16x8_t _g16 = vmovl_u8(_rgb.val[1]);
+        uint16x8_t _b16 = vmovl_u8(_rgb.val[2]);
+
+        float32x4_t _rlow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_r16)));
+        float32x4_t _rhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_r16)));
+        float32x4_t _glow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_g16)));
+        float32x4_t _ghigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_g16)));
+        float32x4_t _blow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_b16)));
+        float32x4_t _bhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_b16)));
+
+        vst1q_f32(ptr0, _rlow);
+        vst1q_f32(ptr0+4, _rhigh);
+        vst1q_f32(ptr1, _glow);
+        vst1q_f32(ptr1+4, _ghigh);
+        vst1q_f32(ptr2, _blow);
+        vst1q_f32(ptr2+4, _bhigh);
+
+        rgb += 3*8;
+        ptr0 += 8;
+        ptr1 += 8;
+        ptr2 += 8;
+    }
+#else
+    // inline-asm path: same 8-pixel step; the loop decrements nn until it reaches
+    // zero, hence the nn > 0 guard. Pointers are post-incremented by the asm.
+    if (nn > 0)
+    {
+    asm volatile(
+        "0:                             \n"
+        "pld        [%1, #256]          \n"
+        "vld3.u8    {d0-d2}, [%1]!      \n"
+        "vmovl.u8   q8, d0              \n"
+        "vmovl.u8   q9, d1              \n"
+        "vmovl.u8   q10, d2             \n"
+        "vmovl.u16  q0, d16             \n"
+        "vmovl.u16  q1, d17             \n"
+        "vmovl.u16  q2, d18             \n"
+        "vmovl.u16  q3, d19             \n"
+        "vmovl.u16  q8, d20             \n"
+        "vmovl.u16  q9, d21             \n"
+        "vcvt.f32.u32   q0, q0          \n"
+        "vcvt.f32.u32   q1, q1          \n"
+        "vcvt.f32.u32   q2, q2          \n"
+        "vcvt.f32.u32   q3, q3          \n"
+        "vcvt.f32.u32   q8, q8          \n"
+        "subs       %0, #1              \n"
+        "vst1.f32   {d0-d3}, [%2 :128]! \n"
+        "vcvt.f32.u32   q9, q9          \n"
+        "vst1.f32   {d4-d7}, [%3 :128]! \n"
+        "vst1.f32   {d16-d19}, [%4 :128]!\n"
+        "bne        0b                  \n"
+        : "=r"(nn),     // %0
+          "=r"(rgb),    // %1
+          "=r"(ptr0),   // %2
+          "=r"(ptr1),   // %3
+          "=r"(ptr2)    // %4
+        : "0"(nn),
+          "1"(rgb),
+          "2"(ptr0),
+          "3"(ptr1),
+          "4"(ptr2)
+        : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
+    );
+    }
+#endif // __aarch64__
+#endif // __ARM_NEON
+    // scalar tail (or the whole image when NEON is unavailable)
+    for (; remain>0; remain--)
+    {
+        *ptr0 = rgb[0];
+        *ptr1 = rgb[1];
+        *ptr2 = rgb[2];
+
+        rgb += 3;
+        ptr0++;
+        ptr1++;
+        ptr2++;
+    }
+
+    return m;
+}
+
+// Write channels 0, 1, 2 of m as interleaved 3-byte pixels into rgb.
+// Each float is truncated to int and clamped to [0, 255].
+// m.w * m.h pixels are written.
+static void to_rgb(const Mat& m, unsigned char* rgb)
+{
+    const float* src0 = m.channel(0);
+    const float* src1 = m.channel(1);
+    const float* src2 = m.channel(2);
+
+    const int count = m.w * m.h;
+
+#define SATURATE_CAST_UCHAR(X) (unsigned char)std::min(std::max((int)(X), 0), 255)
+
+    for (int i = 0; i < count; i++, rgb += 3)
+    {
+        rgb[0] = SATURATE_CAST_UCHAR(src0[i]);
+        rgb[1] = SATURATE_CAST_UCHAR(src1[i]);
+        rgb[2] = SATURATE_CAST_UCHAR(src2[i]);
+    }
+
+#undef SATURATE_CAST_UCHAR
+}
+
+// Convert w*h single-byte pixels into a float Mat created as (w, h, 1).
+// Values are widened to float unchanged. Returns the (empty) Mat as-is when
+// m.empty() is true after construction.
+static Mat from_gray(const unsigned char* gray, int w, int h)
+{
+    Mat m(w, h, 1);
+    if (m.empty())
+        return m;
+
+    float* ptr = m;
+
+    int size = w * h;
+
+    // with NEON, nn counts groups of 16 pixels and remain is the scalar tail
+#if __ARM_NEON
+    int nn = size >> 4;
+    int remain = size - (nn << 4);
+#else
+    int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+    // intrinsics path: load 16 bytes, widen u8 -> u16 -> u32, convert to float
+    for (; nn>0; nn--)
+    {
+        uint8x16_t _gray = vld1q_u8(gray);
+        uint16x8_t _gray16_0 = vmovl_u8(vget_low_u8(_gray));
+        uint16x8_t _gray16_1 = vmovl_u8(vget_high_u8(_gray));
+
+        float32x4_t _graylow_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_0)));
+        float32x4_t _grayhigh_0 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_0)));
+        float32x4_t _graylow_1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_1)));
+        float32x4_t _grayhigh_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_1)));
+
+        vst1q_f32(ptr, _graylow_0);
+        vst1q_f32(ptr+4, _grayhigh_0);
+        vst1q_f32(ptr+8, _graylow_1);
+        vst1q_f32(ptr+12, _grayhigh_1);
+
+        gray += 16;
+        ptr += 16;
+    }
+#else
+    // inline-asm path: same 16-pixel step; the loop decrements nn until it reaches
+    // zero, hence the nn > 0 guard. Pointers are post-incremented by the asm.
+    if (nn > 0)
+    {
+    asm volatile(
+        "0:                             \n"
+        "pld        [%1, #128]          \n"
+        "vld1.u8    {d0,d1}, [%1]!      \n"
+        "vmovl.u8   q8, d0              \n"
+        "vmovl.u8   q9, d1              \n"
+        "vmovl.u16  q0, d16             \n"
+        "vmovl.u16  q1, d17             \n"
+        "vmovl.u16  q2, d18             \n"
+        "vmovl.u16  q3, d19             \n"
+        "vcvt.f32.u32   q0, q0          \n"
+        "vcvt.f32.u32   q1, q1          \n"
+        "vcvt.f32.u32   q2, q2          \n"
+        "vcvt.f32.u32   q3, q3          \n"
+        "subs       %0, #1              \n"
+        "vst1.f32   {d0-d3}, [%2 :128]! \n"
+        "vst1.f32   {d4-d7}, [%2 :128]! \n"
+        "bne        0b                  \n"
+        : "=r"(nn),     // %0
+          "=r"(gray),   // %1
+          "=r"(ptr)     // %2
+        : "0"(nn),
+          "1"(gray),
+          "2"(ptr)
+        : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
+    );
+    }
+#endif // __aarch64__
+#endif // __ARM_NEON
+    // scalar tail (or the whole image when NEON is unavailable)
+    for (; remain>0; remain--)
+    {
+        *ptr = *gray;
+
+        gray++;
+        ptr++;
+    }
+
+    return m;
+}
+
+// Write the first m.w * m.h floats of m as single bytes into gray.
+// Each float is truncated to int and clamped to [0, 255].
+static void to_gray(const Mat& m, unsigned char* gray)
+{
+    const float* src = m;
+
+    const int count = m.w * m.h;
+
+#define SATURATE_CAST_UCHAR(X) (unsigned char)std::min(std::max((int)(X), 0), 255)
+
+    for (int i = 0; i < count; i++, gray++)
+    {
+        *gray = SATURATE_CAST_UCHAR(src[i]);
+    }
+
+#undef SATURATE_CAST_UCHAR
+}
+
+// Convert w*h interleaved 4-byte pixels into a 4-channel float Mat:
+// byte k of each pixel goes to channel k. Values are widened to float unchanged.
+// Returns the (empty) Mat as-is when m.empty() is true after construction.
+static Mat from_rgba(const unsigned char* rgba, int w, int h)
+{
+    Mat m(w, h, 4);
+    if (m.empty())
+        return m;
+
+    float* ptr0 = m.channel(0);
+    float* ptr1 = m.channel(1);
+    float* ptr2 = m.channel(2);
+    float* ptr3 = m.channel(3);
+
+    int size = w * h;
+
+    // with NEON, nn counts groups of 8 pixels and remain is the scalar tail
+#if __ARM_NEON
+    int nn = size >> 3;
+    int remain = size - (nn << 3);
+#else
+    int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+    // intrinsics path: de-interleave 8 pixels, widen through 16- and 32-bit lanes
+    // (reinterpreted as signed; the widened bytes are non-negative), convert to float
+    for (; nn>0; nn--)
+    {
+        uint8x8x4_t _rgba = vld4_u8(rgba);
+        int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
+        int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
+        int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
+        int16x8_t _a16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[3]));
+
+        float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
+        float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
+        float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
+        float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
+        float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
+        float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
+        float32x4_t _alow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_a16)));
+        float32x4_t _ahigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_a16)));
+
+        vst1q_f32(ptr0, _rlow);
+        vst1q_f32(ptr0+4, _rhigh);
+        vst1q_f32(ptr1, _glow);
+        vst1q_f32(ptr1+4, _ghigh);
+        vst1q_f32(ptr2, _blow);
+        vst1q_f32(ptr2+4, _bhigh);
+        vst1q_f32(ptr3, _alow);
+        vst1q_f32(ptr3+4, _ahigh);
+
+        rgba += 4*8;
+        ptr0 += 8;
+        ptr1 += 8;
+        ptr2 += 8;
+        ptr3 += 8;
+    }
+#else
+    // inline-asm path: same 8-pixel step; the loop decrements nn until it reaches
+    // zero, hence the nn > 0 guard. Pointers are post-incremented by the asm.
+    if (nn > 0)
+    {
+    asm volatile(
+        "0:                             \n"
+        "pld        [%1, #256]          \n"
+        "vld4.u8    {d0-d3}, [%1]!      \n"
+        "vmovl.u8   q8, d0              \n"
+        "vmovl.u8   q9, d1              \n"
+        "vmovl.u8   q10, d2             \n"
+        "vmovl.u8   q11, d3             \n"
+        "vmovl.u16  q0, d16             \n"
+        "vmovl.u16  q1, d17             \n"
+        "vmovl.u16  q2, d18             \n"
+        "vmovl.u16  q3, d19             \n"
+        "vmovl.u16  q8, d20             \n"
+        "vmovl.u16  q9, d21             \n"
+        "vmovl.u16  q10, d22            \n"
+        "vmovl.u16  q11, d23            \n"
+        "vcvt.f32.u32   q0, q0          \n"
+        "vcvt.f32.u32   q1, q1          \n"
+        "vcvt.f32.u32   q2, q2          \n"
+        "vcvt.f32.u32   q3, q3          \n"
+        "vcvt.f32.u32   q8, q8          \n"
+        "vcvt.f32.u32   q9, q9          \n"
+        "subs       %0, #1              \n"
+        "vst1.f32   {d0-d3}, [%2 :128]! \n"
+        "vcvt.f32.u32   q10, q10        \n"
+        "vcvt.f32.u32   q11, q11        \n"
+        "vst1.f32   {d4-d7}, [%3 :128]! \n"
+        "vst1.f32   {d16-d19}, [%4 :128]!\n"
+        "vst1.f32   {d20-d23}, [%5 :128]!\n"
+        "bne        0b                  \n"
+        : "=r"(nn),     // %0
+          "=r"(rgba),   // %1
+          "=r"(ptr0),   // %2
+          "=r"(ptr1),   // %3
+          "=r"(ptr2),   // %4
+          "=r"(ptr3)    // %5
+        : "0"(nn),
+          "1"(rgba),
+          "2"(ptr0),
+          "3"(ptr1),
+          "4"(ptr2),
+          "5"(ptr3)
+        : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+    );
+    }
+#endif // __aarch64__
+#endif // __ARM_NEON
+    // scalar tail (or the whole image when NEON is unavailable)
+    for (; remain>0; remain--)
+    {
+        *ptr0 = rgba[0];
+        *ptr1 = rgba[1];
+        *ptr2 = rgba[2];
+        *ptr3 = rgba[3];
+
+        rgba += 4;
+        ptr0++;
+        ptr1++;
+        ptr2++;
+        ptr3++;
+    }
+
+    return m;
+}
+
+// Write channels 0..3 of m as interleaved 4-byte pixels into rgba.
+// Each float is truncated to int and clamped to [0, 255].
+// m.w * m.h pixels are written.
+static void to_rgba(const Mat& m, unsigned char* rgba)
+{
+    const float* src0 = m.channel(0);
+    const float* src1 = m.channel(1);
+    const float* src2 = m.channel(2);
+    const float* src3 = m.channel(3);
+
+    const int count = m.w * m.h;
+
+#define SATURATE_CAST_UCHAR(X) (unsigned char)std::min(std::max((int)(X), 0), 255)
+
+    for (int i = 0; i < count; i++, rgba += 4)
+    {
+        rgba[0] = SATURATE_CAST_UCHAR(src0[i]);
+        rgba[1] = SATURATE_CAST_UCHAR(src1[i]);
+        rgba[2] = SATURATE_CAST_UCHAR(src2[i]);
+        rgba[3] = SATURATE_CAST_UCHAR(src3[i]);
+    }
+
+#undef SATURATE_CAST_UCHAR
+}
+
+// Convert w*h interleaved 3-byte pixels into a 3-channel float Mat with the channel
+// order reversed: byte 0 of each pixel goes to channel 2, byte 1 to channel 1,
+// byte 2 to channel 0. Values are widened to float unchanged. Returns the (empty)
+// Mat as-is when m.empty() is true after construction.
+static Mat from_rgb2bgr(const unsigned char* rgb, int w, int h)
+{
+    Mat m(w, h, 3);
+    if (m.empty())
+        return m;
+
+    float* ptr0 = m.channel(0);
+    float* ptr1 = m.channel(1);
+    float* ptr2 = m.channel(2);
+
+    int size = w * h;
+
+    // with NEON, nn counts groups of 8 pixels and remain is the scalar tail
+#if __ARM_NEON
+    int nn = size >> 3;
+    int remain = size - (nn << 3);
+#else
+    int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+    // intrinsics path: de-interleave 8 pixels, widen u8 -> u16 -> u32, convert to
+    // float; the first input byte plane is stored through ptr2, the last through ptr0
+    for (; nn>0; nn--)
+    {
+        uint8x8x3_t _rgb = vld3_u8(rgb);
+        uint16x8_t _r16 = vmovl_u8(_rgb.val[0]);
+        uint16x8_t _g16 = vmovl_u8(_rgb.val[1]);
+        uint16x8_t _b16 = vmovl_u8(_rgb.val[2]);
+
+        float32x4_t _rlow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_r16)));
+        float32x4_t _rhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_r16)));
+        float32x4_t _glow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_g16)));
+        float32x4_t _ghigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_g16)));
+        float32x4_t _blow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_b16)));
+        float32x4_t _bhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_b16)));
+
+        vst1q_f32(ptr2, _rlow);
+        vst1q_f32(ptr2+4, _rhigh);
+        vst1q_f32(ptr1, _glow);
+        vst1q_f32(ptr1+4, _ghigh);
+        vst1q_f32(ptr0, _blow);
+        vst1q_f32(ptr0+4, _bhigh);
+
+        rgb += 3*8;
+        ptr0 += 8;
+        ptr1 += 8;
+        ptr2 += 8;
+    }
+#else
+    // inline-asm path: same 8-pixel step with the stores going to %4, %3, %2 in
+    // that order; the loop decrements nn until it reaches zero, hence the guard
+    if (nn > 0)
+    {
+    asm volatile(
+        "0:                             \n"
+        "pld        [%1, #256]          \n"
+        "vld3.u8    {d0-d2}, [%1]!      \n"
+        "vmovl.u8   q8, d0              \n"
+        "vmovl.u8   q9, d1              \n"
+        "vmovl.u8   q10, d2             \n"
+        "vmovl.u16  q0, d16             \n"
+        "vmovl.u16  q1, d17             \n"
+        "vmovl.u16  q2, d18             \n"
+        "vmovl.u16  q3, d19             \n"
+        "vmovl.u16  q8, d20             \n"
+        "vmovl.u16  q9, d21             \n"
+        "vcvt.f32.u32   q0, q0          \n"
+        "vcvt.f32.u32   q1, q1          \n"
+        "vcvt.f32.u32   q2, q2          \n"
+        "vcvt.f32.u32   q3, q3          \n"
+        "vcvt.f32.u32   q8, q8          \n"
+        "subs       %0, #1              \n"
+        "vst1.f32   {d0-d3}, [%4 :128]! \n"
+        "vcvt.f32.u32   q9, q9          \n"
+        "vst1.f32   {d4-d7}, [%3 :128]! \n"
+        "vst1.f32   {d16-d19}, [%2 :128]!\n"
+        "bne        0b                  \n"
+        : "=r"(nn),     // %0
+          "=r"(rgb),    // %1
+          "=r"(ptr0),   // %2
+          "=r"(ptr1),   // %3
+          "=r"(ptr2)    // %4
+        : "0"(nn),
+          "1"(rgb),
+          "2"(ptr0),
+          "3"(ptr1),
+          "4"(ptr2)
+        : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
+    );
+    }
+#endif // __aarch64__
+#endif // __ARM_NEON
+    // scalar tail (or the whole image when NEON is unavailable)
+    for (; remain>0; remain--)
+    {
+        *ptr0 = rgb[2];
+        *ptr1 = rgb[1];
+        *ptr2 = rgb[0];
+
+        rgb += 3;
+        ptr0++;
+        ptr1++;
+        ptr2++;
+    }
+
+    return m;
+}
+
+// Write channels 0, 1, 2 of m as interleaved 3-byte pixels into rgb with the
+// channel order reversed: channel 0 lands in byte 2, channel 2 in byte 0.
+// Each float is truncated to int and clamped to [0, 255].
+static void to_bgr2rgb(const Mat& m, unsigned char* rgb)
+{
+    const float* src0 = m.channel(0);
+    const float* src1 = m.channel(1);
+    const float* src2 = m.channel(2);
+
+    const int count = m.w * m.h;
+
+#define SATURATE_CAST_UCHAR(X) (unsigned char)std::min(std::max((int)(X), 0), 255)
+
+    for (int i = 0; i < count; i++, rgb += 3)
+    {
+        rgb[2] = SATURATE_CAST_UCHAR(src0[i]);
+        rgb[1] = SATURATE_CAST_UCHAR(src1[i]);
+        rgb[0] = SATURATE_CAST_UCHAR(src2[i]);
+    }
+
+#undef SATURATE_CAST_UCHAR
+}
+
+// Convert w*h interleaved 3-byte RGB pixels into a single-plane float Mat created
+// as (w, h, 1). Each output is the integer weighted sum
+// (r * R2Y + g * G2Y + b * B2Y) >> Y_shift, stored as float. Returns the (empty)
+// Mat as-is when m.empty() is true after construction.
+static Mat from_rgb2gray(const unsigned char* rgb, int w, int h)
+{
+    // coeffs for r g b = 0.299f, 0.587f, 0.114f
+    // expressed as integer weights relative to 1 << Y_shift (77 + 150 + 29 == 256)
+    const unsigned char Y_shift = 8;//14
+    const unsigned char R2Y = 77;
+    const unsigned char G2Y = 150;
+    const unsigned char B2Y = 29;
+
+    Mat m(w, h, 1);
+    if (m.empty())
+        return m;
+
+    float* ptr = m;
+
+    int size = w * h;
+
+    // with NEON, nn counts groups of 8 pixels and remain is the scalar tail
+#if __ARM_NEON
+    int nn = size >> 3;
+    int remain = size - (nn << 3);
+#else
+    int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+    // intrinsics path: 8 pixels per step, accumulating the weighted sum in u16 lanes
+    uint8x8_t _R2Y = vdup_n_u8(R2Y);
+    uint8x8_t _G2Y = vdup_n_u8(G2Y);
+    uint8x8_t _B2Y = vdup_n_u8(B2Y);
+    for (; nn>0; nn--)
+    {
+        uint8x8x3_t _rgb = vld3_u8(rgb);
+
+        uint16x8_t _y16 = vmull_u8(_rgb.val[0], _R2Y);
+        _y16 = vmlal_u8(_y16, _rgb.val[1], _G2Y);
+        _y16 = vmlal_u8(_y16, _rgb.val[2], _B2Y);
+        _y16 = vshrq_n_u16(_y16, Y_shift);
+
+        float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
+        float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));
+
+        vst1q_f32(ptr, _ylow);
+        vst1q_f32(ptr+4, _yhigh);
+
+        rgb += 3*8;
+        ptr += 8;
+    }
+#else
+    // inline-asm path: the three weights are broadcast into d16-d18 once, then the
+    // loop handles 8 pixels per iteration; the shift amount #8 is hard-coded and
+    // must stay in sync with Y_shift. %3-%5 are the tied inputs, so the weights
+    // are operands %6-%8.
+    if (nn > 0)
+    {
+    asm volatile(
+        "vdup.u8    d16, %6             \n"
+        "vdup.u8    d17, %7             \n"
+        "vdup.u8    d18, %8             \n"
+        "0:                             \n"
+        "pld        [%1, #256]          \n"
+        "vld3.u8    {d0-d2}, [%1]!      \n"
+        "vmull.u8   q2, d0, d16         \n"
+        "vmlal.u8   q2, d1, d17         \n"
+        "vmlal.u8   q2, d2, d18         \n"
+        "vshr.u16   q2, q2, #8          \n" // Y_shift
+        "vmovl.u16  q0, d4              \n"
+        "vmovl.u16  q1, d5              \n"
+        "vcvt.f32.u32   q0, q0          \n"
+        "vcvt.f32.u32   q1, q1          \n"
+        "subs       %0, #1              \n"
+        "vst1.f32   {d0-d3}, [%2 :128]! \n"
+        "bne        0b                  \n"
+        : "=r"(nn),     // %0
+          "=r"(rgb),    // %1
+          "=r"(ptr)     // %2
+        : "0"(nn),
+          "1"(rgb),
+          "2"(ptr),
+          "r"(R2Y),     // %6
+          "r"(G2Y),     // %7
+          "r"(B2Y)      // %8
+        : "cc", "memory", "q0", "q1", "q2", "q8", "q9"
+    );
+    }
+#endif // __aarch64__
+#endif // __ARM_NEON
+    // scalar tail (or the whole image when NEON is unavailable)
+    for (; remain>0; remain--)
+    {
+        *ptr = (rgb[0] * R2Y + rgb[1] * G2Y + rgb[2] * B2Y) >> Y_shift;
+
+        rgb += 3;
+        ptr++;
+    }
+
+    return m;
+}
+
+// Convert w*h interleaved 3-byte BGR pixels into a single-plane float Mat created
+// as (w, h, 1). Each output is the integer weighted sum
+// (bgr[2] * R2Y + bgr[1] * G2Y + bgr[0] * B2Y) >> Y_shift, stored as float.
+// Returns the (empty) Mat as-is when m.empty() is true after construction.
+static Mat from_bgr2gray(const unsigned char* bgr, int w, int h)
+{
+    // coeffs for r g b = 0.299f, 0.587f, 0.114f
+    // expressed as integer weights relative to 1 << Y_shift (77 + 150 + 29 == 256)
+    const unsigned char Y_shift = 8;//14
+    const unsigned char R2Y = 77;
+    const unsigned char G2Y = 150;
+    const unsigned char B2Y = 29;
+
+    Mat m(w, h, 1);
+    if (m.empty())
+        return m;
+
+    float* ptr = m;
+
+    int size = w * h;
+
+    // with NEON, nn counts groups of 8 pixels and remain is the scalar tail
+#if __ARM_NEON
+    int nn = size >> 3;
+    int remain = size - (nn << 3);
+#else
+    int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+    // intrinsics path: 8 pixels per step; despite its name, _rgb holds the planes in
+    // input (BGR) order, so val[2] is weighted by R2Y and val[0] by B2Y
+    uint8x8_t _R2Y = vdup_n_u8(R2Y);
+    uint8x8_t _G2Y = vdup_n_u8(G2Y);
+    uint8x8_t _B2Y = vdup_n_u8(B2Y);
+    for (; nn>0; nn--)
+    {
+        uint8x8x3_t _rgb = vld3_u8(bgr);
+
+        uint16x8_t _y16 = vmull_u8(_rgb.val[2], _R2Y);
+        _y16 = vmlal_u8(_y16, _rgb.val[1], _G2Y);
+        _y16 = vmlal_u8(_y16, _rgb.val[0], _B2Y);
+        _y16 = vshrq_n_u16(_y16, Y_shift);
+
+        float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
+        float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));
+
+        vst1q_f32(ptr, _ylow);
+        vst1q_f32(ptr+4, _yhigh);
+
+        bgr += 3*8;
+        ptr += 8;
+    }
+#else
+    // inline-asm path: the three weights are broadcast into d16-d18 once, then the
+    // loop handles 8 pixels per iteration (d2 is weighted by R2Y, d0 by B2Y); the
+    // shift amount #8 is hard-coded and must stay in sync with Y_shift. %3-%5 are
+    // the tied inputs, so the weights are operands %6-%8.
+    if (nn > 0)
+    {
+    asm volatile(
+        "vdup.u8    d16, %6             \n"
+        "vdup.u8    d17, %7             \n"
+        "vdup.u8    d18, %8             \n"
+        "0:                             \n"
+        "pld        [%1, #256]          \n"
+        "vld3.u8    {d0-d2}, [%1]!      \n"
+        "vmull.u8   q2, d2, d16         \n"
+        "vmlal.u8   q2, d1, d17         \n"
+        "vmlal.u8   q2, d0, d18         \n"
+        "vshr.u16   q2, q2, #8          \n" // Y_shift
+        "vmovl.u16  q0, d4              \n"
+        "vmovl.u16  q1, d5              \n"
+        "vcvt.f32.u32   q0, q0          \n"
+        "vcvt.f32.u32   q1, q1          \n"
+        "subs       %0, #1              \n"
+        "vst1.f32   {d0-d3}, [%2 :128]! \n"
+        "bne        0b                  \n"
+        : "=r"(nn),     // %0
+          "=r"(bgr),    // %1
+          "=r"(ptr)     // %2
+        : "0"(nn),
+          "1"(bgr),
+          "2"(ptr),
+          "r"(R2Y),     // %6
+          "r"(G2Y),     // %7
+          "r"(B2Y)      // %8
+        : "cc", "memory", "q0", "q1", "q2", "q8", "q9"
+    );
+    }
+#endif // __aarch64__
+#endif // __ARM_NEON
+    // scalar tail (or the whole image when NEON is unavailable)
+    for (; remain>0; remain--)
+    {
+        *ptr = (bgr[2] * R2Y + bgr[1] * G2Y + bgr[0] * B2Y) >> Y_shift;
+
+        bgr += 3;
+        ptr++;
+    }
+
+    return m;
+}
+
+// Convert w*h single-byte pixels into a 3-channel float Mat in which every channel
+// receives the same value for a given pixel. Values are widened to float unchanged.
+// Returns the (empty) Mat as-is when m.empty() is true after construction.
+static Mat from_gray2rgb(const unsigned char* gray, int w, int h)
+{
+    Mat m(w, h, 3);
+    if (m.empty())
+        return m;
+
+    float* ptr0 = m.channel(0);
+    float* ptr1 = m.channel(1);
+    float* ptr2 = m.channel(2);
+
+    int size = w * h;
+
+    // with NEON, nn counts groups of 16 pixels and remain is the scalar tail
+#if __ARM_NEON
+    int nn = size >> 4;
+    int remain = size - (nn << 4);
+#else
+    int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+    // intrinsics path: convert 16 bytes to float once, then store the same four
+    // vectors into each of the three channels
+    for (; nn>0; nn--)
+    {
+        uint8x16_t _gray = vld1q_u8(gray);
+        uint16x8_t _gray16_0 = vmovl_u8(vget_low_u8(_gray));
+        uint16x8_t _gray16_1 = vmovl_u8(vget_high_u8(_gray));
+
+        float32x4_t _graylow_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_0)));
+        float32x4_t _grayhigh_0 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_0)));
+        float32x4_t _graylow_1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_1)));
+        float32x4_t _grayhigh_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_1)));
+
+        vst1q_f32(ptr0, _graylow_0);
+        vst1q_f32(ptr0+4, _grayhigh_0);
+        vst1q_f32(ptr0+8, _graylow_1);
+        vst1q_f32(ptr0+12, _grayhigh_1);
+
+        vst1q_f32(ptr1, _graylow_0);
+        vst1q_f32(ptr1+4, _grayhigh_0);
+        vst1q_f32(ptr1+8, _graylow_1);
+        vst1q_f32(ptr1+12, _grayhigh_1);
+
+        vst1q_f32(ptr2, _graylow_0);
+        vst1q_f32(ptr2+4, _grayhigh_0);
+        vst1q_f32(ptr2+8, _graylow_1);
+        vst1q_f32(ptr2+12, _grayhigh_1);
+
+        gray += 16;
+        ptr0 += 16;
+        ptr1 += 16;
+        ptr2 += 16;
+    }
+#else
+    // inline-asm path: same 16-pixel step, the converted registers q0-q3 are stored
+    // three times (to %2, %3, %4); the loop decrements nn until it reaches zero,
+    // hence the nn > 0 guard
+    if (nn > 0)
+    {
+    asm volatile(
+        "0:                             \n"
+        "pld        [%1, #128]          \n"
+        "vld1.u8    {d0,d1}, [%1]!      \n"
+        "vmovl.u8   q8, d0              \n"
+        "vmovl.u8   q9, d1              \n"
+        "vmovl.u16  q0, d16             \n"
+        "vmovl.u16  q1, d17             \n"
+        "vmovl.u16  q2, d18             \n"
+        "vmovl.u16  q3, d19             \n"
+        "vcvt.f32.u32   q0, q0          \n"
+        "vcvt.f32.u32   q1, q1          \n"
+        "vcvt.f32.u32   q2, q2          \n"
+        "vcvt.f32.u32   q3, q3          \n"
+        "subs       %0, #1              \n"
+        "vst1.f32   {d0-d3}, [%2 :128]! \n"
+        "vst1.f32   {d4-d7}, [%2 :128]! \n"
+        "vst1.f32   {d0-d3}, [%3 :128]! \n"
+        "vst1.f32   {d4-d7}, [%3 :128]! \n"
+        "vst1.f32   {d0-d3}, [%4 :128]! \n"
+        "vst1.f32   {d4-d7}, [%4 :128]! \n"
+        "bne        0b                  \n"
+        : "=r"(nn),     // %0
+          "=r"(gray),   // %1
+          "=r"(ptr0),   // %2
+          "=r"(ptr1),   // %3
+          "=r"(ptr2)    // %4
+        : "0"(nn),
+          "1"(gray),
+          "2"(ptr0),
+          "3"(ptr1),
+          "4"(ptr2)
+        : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
+    );
+    }
+#endif // __aarch64__
+#endif // __ARM_NEON
+    // scalar tail (or the whole image when NEON is unavailable)
+    for (; remain>0; remain--)
+    {
+        *ptr0 = *gray;
+        *ptr1 = *gray;
+        *ptr2 = *gray;
+
+        gray++;
+        ptr0++;
+        ptr1++;
+        ptr2++;
+    }
+
+    return m;
+}
+
+// Convert w*h interleaved 4-byte pixels into a 3-channel float Mat, dropping the
+// fourth byte of every pixel: byte 0 goes to channel 0, byte 1 to channel 1,
+// byte 2 to channel 2. Values are widened to float unchanged. Returns the (empty)
+// Mat as-is when m.empty() is true after construction.
+static Mat from_rgba2rgb(const unsigned char* rgba, int w, int h)
+{
+    Mat m(w, h, 3);
+    if (m.empty())
+        return m;
+
+    float* ptr0 = m.channel(0);
+    float* ptr1 = m.channel(1);
+    float* ptr2 = m.channel(2);
+
+    int size = w * h;
+
+    // with NEON, nn counts groups of 8 pixels and remain is the scalar tail
+#if __ARM_NEON
+    int nn = size >> 3;
+    int remain = size - (nn << 3);
+#else
+    int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+    // intrinsics path: de-interleave 8 pixels, widen through 16- and 32-bit lanes
+    // (reinterpreted as signed; the widened bytes are non-negative), convert to float
+    for (; nn>0; nn--)
+    {
+        uint8x8x4_t _rgba = vld4_u8(rgba);
+        int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
+        int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
+        int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
+
+        float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
+        float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
+        float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
+        float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
+        float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
+        float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
+
+        vst1q_f32(ptr0, _rlow);
+        vst1q_f32(ptr0+4, _rhigh);
+        vst1q_f32(ptr1, _glow);
+        vst1q_f32(ptr1+4, _ghigh);
+        vst1q_f32(ptr2, _blow);
+        vst1q_f32(ptr2+4, _bhigh);
+
+        rgba += 4*8;
+        ptr0 += 8;
+        ptr1 += 8;
+        ptr2 += 8;
+    }
+#else
+    // inline-asm path: same 8-pixel step; the loop decrements nn until it reaches
+    // zero, hence the nn > 0 guard. Pointers are post-incremented by the asm.
+    if (nn > 0)
+    {
+    asm volatile(
+        "0:                             \n"
+        "pld        [%1, #256]          \n"
+        "vld4.u8    {d0-d3}, [%1]!      \n"
+        "vmovl.u8   q8, d0              \n"
+        "vmovl.u8   q9, d1              \n"
+        "vmovl.u8   q10, d2             \n"
+        "vmovl.u16  q0, d16             \n"
+        "vmovl.u16  q1, d17             \n"
+        "vmovl.u16  q2, d18             \n"
+        "vmovl.u16  q3, d19             \n"
+        "vmovl.u16  q8, d20             \n"
+        "vmovl.u16  q9, d21             \n"
+        "vcvt.f32.u32   q0, q0          \n"
+        "vcvt.f32.u32   q1, q1          \n"
+        "vcvt.f32.u32   q2, q2          \n"
+        "vcvt.f32.u32   q3, q3          \n"
+        "vcvt.f32.u32   q8, q8          \n"
+        "subs       %0, #1              \n"
+        "vst1.f32   {d0-d3}, [%2 :128]! \n"
+        "vcvt.f32.u32   q9, q9          \n"
+        "vst1.f32   {d4-d7}, [%3 :128]! \n"
+        "vst1.f32   {d16-d19}, [%4 :128]!\n"
+        "bne        0b                  \n"
+        : "=r"(nn),     // %0
+          "=r"(rgba),   // %1
+          "=r"(ptr0),   // %2
+          "=r"(ptr1),   // %3
+          "=r"(ptr2)    // %4
+        : "0"(nn),
+          "1"(rgba),
+          "2"(ptr0),
+          "3"(ptr1),
+          "4"(ptr2)
+        // q10 is written by "vmovl.u8 q10, d2" above and therefore must be listed,
+        // otherwise the compiler may keep a live value in it across the asm
+        : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
+    );
+    }
+#endif // __aarch64__
+#endif // __ARM_NEON
+    // scalar tail (or the whole image when NEON is unavailable)
+    for (; remain>0; remain--)
+    {
+        *ptr0 = rgba[0];
+        *ptr1 = rgba[1];
+        *ptr2 = rgba[2];
+
+        rgba += 4;
+        ptr0++;
+        ptr1++;
+        ptr2++;
+    }
+
+    return m;
+}
+
+// Converts interleaved 8-bit RGBA pixels into a 3-channel float Mat whose
+// channels are stored in B, G, R order (channel 0 = B, 1 = G, 2 = R).
+// The 4th byte of every pixel (alpha) is stepped over and never stored.
+//   rgba : source pixels, 4 bytes per pixel; w*h pixels are consumed
+//   w, h : dimensions handed to the Mat constructor
+// Returns the filled Mat, or the empty Mat when m.empty() is true after
+// construction.
+static Mat from_rgba2bgr(const unsigned char* rgba, int w, int h)
+{
+    Mat m(w, h, 3);
+    if (m.empty())
+        return m;
+
+    float* ptr0 = m.channel(0);
+    float* ptr1 = m.channel(1);
+    float* ptr2 = m.channel(2);
+
+    int size = w * h;
+
+    // the NEON paths handle 8 pixels per iteration; the scalar loop at the
+    // end handles the remaining size % 8 pixels (or all of them without NEON)
+#if __ARM_NEON
+    int nn = size >> 3;
+    int remain = size - (nn << 3);
+#else
+    int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+    for (; nn>0; nn--)
+    {
+        // de-interleaving load: val[0]=R, val[1]=G, val[2]=B, val[3]=A
+        uint8x8x4_t _rgba = vld4_u8(rgba);
+        // widen u8 -> 16 bit; values are 0..255 so the signed view is lossless
+        int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
+        int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
+        int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
+
+        // widen 16 -> 32 bit and convert to float, 4 lanes at a time
+        float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
+        float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
+        float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
+        float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
+        float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
+        float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
+
+        // channel swap happens here: R goes to ptr2, B goes to ptr0
+        vst1q_f32(ptr2, _rlow);
+        vst1q_f32(ptr2+4, _rhigh);
+        vst1q_f32(ptr1, _glow);
+        vst1q_f32(ptr1+4, _ghigh);
+        vst1q_f32(ptr0, _blow);
+        vst1q_f32(ptr0+4, _bhigh);
+
+        rgba += 4*8;
+        ptr0 += 8;
+        ptr1 += 8;
+        ptr2 += 8;
+    }
+#else
+    // 32-bit ARM: same computation as the intrinsic loop above, hand written.
+    // The do-while shaped loop runs at least once, hence the nn > 0 guard.
+    if (nn > 0)
+    {
+    asm volatile(
+        "0:                             \n"
+        "pld        [%1, #256]          \n"
+        "vld4.u8    {d0-d3}, [%1]!      \n" // d0=R d1=G d2=B d3=A, 8 pixels
+        "vmovl.u8   q8, d0              \n" // q8  = R as u16
+        "vmovl.u8   q9, d1              \n" // q9  = G as u16
+        "vmovl.u8   q10, d2             \n" // q10 = B as u16
+        "vmovl.u16  q0, d16             \n" // q0,q1 = R as u32
+        "vmovl.u16  q1, d17             \n"
+        "vmovl.u16  q2, d18             \n" // q2,q3 = G as u32
+        "vmovl.u16  q3, d19             \n"
+        "vmovl.u16  q8, d20             \n" // q8,q9 = B as u32
+        "vmovl.u16  q9, d21             \n"
+        "vcvt.f32.u32   q0, q0          \n"
+        "vcvt.f32.u32   q1, q1          \n"
+        "vcvt.f32.u32   q2, q2          \n"
+        "vcvt.f32.u32   q3, q3          \n"
+        "vcvt.f32.u32   q8, q8          \n"
+        "subs       %0, #1              \n"
+        "vst1.f32   {d0-d3}, [%4 :128]! \n" // R -> ptr2; :128 requires 16-byte aligned destination
+        "vcvt.f32.u32   q9, q9          \n"
+        "vst1.f32   {d4-d7}, [%3 :128]! \n" // G -> ptr1
+        "vst1.f32   {d16-d19}, [%2 :128]!\n" // B -> ptr0
+        "bne        0b                  \n"
+        : "=r"(nn),     // %0
+          "=r"(rgba),   // %1
+          "=r"(ptr0),   // %2
+          "=r"(ptr1),   // %3
+          "=r"(ptr2)    // %4
+        : "0"(nn),
+          "1"(rgba),
+          "2"(ptr0),
+          "3"(ptr1),
+          "4"(ptr2)
+        : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
+    );
+    }
+#endif // __aarch64__
+#endif // __ARM_NEON
+    // scalar tail: one pixel per iteration, same R<->B swap as above
+    for (; remain>0; remain--)
+    {
+        *ptr0 = rgba[2];
+        *ptr1 = rgba[1];
+        *ptr2 = rgba[0];
+
+        rgba += 4;
+        ptr0++;
+        ptr1++;
+        ptr2++;
+    }
+
+    return m;
+}
+
+// Converts interleaved 8-bit RGBA pixels into a 1-channel float Mat holding
+// an integer-valued luma: (R*77 + G*150 + B*29) >> 8. The 4th byte of every
+// pixel (alpha) is stepped over and never used.
+//   rgba : source pixels, 4 bytes per pixel; w*h pixels are consumed
+//   w, h : dimensions handed to the Mat constructor
+// Returns the filled Mat, or the empty Mat when m.empty() is true after
+// construction.
+static Mat from_rgba2gray(const unsigned char* rgba, int w, int h)
+{
+    // coeffs for r g b = 0.299f, 0.587f, 0.114f
+    // stored as 8-bit fixed point (scaled by 1 << Y_shift); 77+150+29 == 256
+    const unsigned char Y_shift = 8;//14
+    const unsigned char R2Y = 77;
+    const unsigned char G2Y = 150;
+    const unsigned char B2Y = 29;
+
+    Mat m(w, h, 1);
+    if (m.empty())
+        return m;
+
+    float* ptr = m;
+
+    int size = w * h;
+
+    // the NEON paths handle 8 pixels per iteration; the scalar loop at the
+    // end handles the remaining size % 8 pixels (or all of them without NEON)
+#if __ARM_NEON
+    int nn = size >> 3;
+    int remain = size - (nn << 3);
+#else
+    int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+    uint8x8_t _R2Y = vdup_n_u8(R2Y);
+    uint8x8_t _G2Y = vdup_n_u8(G2Y);
+    uint8x8_t _B2Y = vdup_n_u8(B2Y);
+    for (; nn>0; nn--)
+    {
+        // de-interleaving load: val[0]=R, val[1]=G, val[2]=B, val[3]=A
+        uint8x8x4_t _rgba = vld4_u8(rgba);
+
+        // widening multiply-accumulate in u16; max is 255*256 so it cannot overflow
+        uint16x8_t _y16 = vmull_u8(_rgba.val[0], _R2Y);
+        _y16 = vmlal_u8(_y16, _rgba.val[1], _G2Y);
+        _y16 = vmlal_u8(_y16, _rgba.val[2], _B2Y);
+        _y16 = vshrq_n_u16(_y16, Y_shift);
+
+        // widen to 32 bit and convert to float, 4 lanes at a time
+        float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
+        float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));
+
+        vst1q_f32(ptr, _ylow);
+        vst1q_f32(ptr+4, _yhigh);
+
+        rgba += 4*8;
+        ptr += 8;
+    }
+#else
+    // 32-bit ARM: same computation as the intrinsic loop above, hand written.
+    // The do-while shaped loop runs at least once, hence the nn > 0 guard.
+    if (nn > 0)
+    {
+    asm volatile(
+        "vdup.u8    d16, %6             \n" // d16 = R2Y in every lane
+        "vdup.u8    d17, %7             \n" // d17 = G2Y
+        "vdup.u8    d18, %8             \n" // d18 = B2Y
+        "0:                             \n"
+        "pld        [%1, #256]          \n"
+        "vld4.u8    {d0-d3}, [%1]!      \n" // d0=R d1=G d2=B d3=A, 8 pixels
+        "vmull.u8   q2, d0, d16         \n"
+        "vmlal.u8   q2, d1, d17         \n"
+        "vmlal.u8   q2, d2, d18         \n"
+        "vshr.u16   q2, q2, #8          \n" // Y_shift
+        "vmovl.u16  q0, d4              \n"
+        "vmovl.u16  q1, d5              \n"
+        "vcvt.f32.u32   q0, q0          \n"
+        "vcvt.f32.u32   q1, q1          \n"
+        "subs       %0, #1              \n"
+        "vst1.f32   {d0-d3}, [%2 :128]! \n" // :128 requires 16-byte aligned destination
+        "bne        0b                  \n"
+        : "=r"(nn),     // %0
+          "=r"(rgba),   // %1
+          "=r"(ptr)     // %2
+        : "0"(nn),
+          "1"(rgba),
+          "2"(ptr),
+          "r"(R2Y),     // %6
+          "r"(G2Y),     // %7
+          "r"(B2Y)      // %8
+        : "cc", "memory", "q0", "q1", "q2", "q8", "q9"
+    );
+    }
+#endif // __aarch64__
+#endif // __ARM_NEON
+    // scalar tail: one pixel per iteration, same fixed-point formula
+    for (; remain>0; remain--)
+    {
+        *ptr = (rgba[0] * R2Y + rgba[1] * G2Y + rgba[2] * B2Y) >> Y_shift;
+
+        rgba += 4;
+        ptr++;
+    }
+
+    return m;
+}
+
+// Bilinear resize of an interleaved 3-channel 8-bit image using 11-bit
+// fixed-point interpolation weights.
+//   src        : source pixels, srcw*3 bytes per row, srch rows
+//   srcw, srch : source size in pixels
+//   dst        : destination buffer, w*3 bytes per row, h rows are written
+//   w, h       : destination size in pixels
+// Each destination row is produced in two passes: a horizontal pass that
+// blends the two neighbouring source pixels of every destination column into
+// 16-bit row buffers (rows0/rows1, one per contributing source row), and a
+// vertical pass that blends those two row buffers into 8-bit output.
+// NOTE(review): the index clamping below yields srcw - 2 / srch - 2, so the
+// function appears to require srcw >= 2 and srch >= 2 -- confirm with callers.
+// NOTE(review): the NEON horizontal pass loads 8 bytes per pixel although only
+// 6 are used, so it reads slightly past the last source pixel.
+void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
+{
+    const int INTER_RESIZE_COEF_BITS=11;
+    const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
+
+    double scale_x = (double)srcw / w;
+    double scale_y = (double)srch / h;
+
+    // a single allocation holds all four lookup tables:
+    //   xofs   : w ints, byte offset of the left source pixel per column
+    //   yofs   : h ints, source row index * 3 per destination row
+    //   ialpha : w pairs of shorts, horizontal weights (left, right)
+    //   ibeta  : h pairs of shorts, vertical weights (top, bottom)
+    int* buf = new int[w + h + w + h];
+
+    int* xofs = buf;//new int[w];
+    int* yofs = buf + w;//new int[h];
+
+    short* ialpha = (short*)(buf + w + h);//new short[w * 2];
+    short* ibeta = (short*)(buf + w + h + w);//new short[h * 2];
+
+    float fx;
+    float fy;
+    int sx;
+    int sy;
+
+// round to nearest and clamp into the range of short
+#define SATURATE_CAST_SHORT(X) ((short)std::min(std::max((int)((X) + ((X) >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX))
+
+    for (int dx = 0; dx < w; dx++)
+    {
+        fx = (float)((dx + 0.5) * scale_x - 0.5);
+        sx = fx;// truncates toward zero
+        fx -= sx;
+
+        // When upscaling, the first columns map to a negative source
+        // coordinate. Truncation leaves sx == 0 with a negative fraction,
+        // which would extrapolate (weight > 1 and a negative weight) and
+        // overflow the 8-bit result. Clamp to the border pixel instead.
+        if (fx < 0.f)
+        {
+            sx = 0;
+            fx = 0.f;
+        }
+
+        // right border: use the last pixel pair with full weight on the right one
+        if (sx >= srcw - 1)
+        {
+            sx = srcw - 2;
+            fx = 1.f;
+        }
+
+        xofs[dx] = sx*3;
+
+        float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
+        float a1 =        fx  * INTER_RESIZE_COEF_SCALE;
+
+        ialpha[dx*2    ] = SATURATE_CAST_SHORT(a0);
+        ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
+    }
+
+    for (int dy = 0; dy < h; dy++)
+    {
+        fy = (float)((dy + 0.5) * scale_y - 0.5);
+        sy = fy;// truncates toward zero
+        fy -= sy;
+
+        // top border: same clamp as for the left border above
+        if (fy < 0.f)
+        {
+            sy = 0;
+            fy = 0.f;
+        }
+
+        // bottom border: use the last row pair with full weight on the lower one
+        if (sy >= srch - 1)
+        {
+            sy = srch - 2;
+            fy = 1.f;
+        }
+
+        // stored pre-multiplied by the channel count; srcw * yofs is a byte offset
+        yofs[dy] = sy*3;
+
+        float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
+        float b1 =        fy  * INTER_RESIZE_COEF_SCALE;
+
+        ibeta[dy*2    ] = SATURATE_CAST_SHORT(b0);
+        ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
+    }
+
+#undef SATURATE_CAST_SHORT
+
+    // loop body
+    // row buffers hold w*3 shorts each; sized in floats, plus padding because
+    // the NEON horizontal pass stores 4 shorts per pixel
+    Mat rowsbuf0((w*3 >> 1) + 3);
+    Mat rowsbuf1((w*3 >> 1) + 3);
+    short* rows0 = (short*)rowsbuf0.data;
+    short* rows1 = (short*)rowsbuf1.data;
+
+    // yofs value whose upper row is already available in rows1; -1 = none
+    int prev_sy1 = -1;
+
+    for (int dy = 0; dy < h; dy++ )
+    {
+        int sy = yofs[dy];
+
+        if (sy == prev_sy1)
+        {
+            // hresize one row
+            // the previous lower row becomes the upper row; only the new
+            // lower row has to be computed
+            short* rows0_old = rows0;
+            rows0 = rows1;
+            rows1 = rows0_old;
+            const unsigned char *S1 = src + srcw * (sy+3);
+
+            const short* ialphap = ialpha;
+            short* rows1p = rows1;
+            for ( int dx = 0; dx < w; dx++ )
+            {
+                int sx = xofs[dx];
+                short a0 = ialphap[0];
+                short a1 = ialphap[1];
+
+                const unsigned char* S1p = S1 + sx;
+#if __ARM_NEON
+                int16x4_t _a0 = vdup_n_s16(a0);
+                int16x4_t _a1 = vdup_n_s16(a1);
+                uint8x8_t _S1 = vld1_u8(S1p);
+                int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
+                int16x4_t _S1low = vget_low_s16(_S116);
+                // lanes 3..6 = the right-hand pixel (bytes 3,4,5) plus one spare lane
+                int16x4_t _S1high = vext_s16(_S1low, vget_high_s16(_S116), 3);
+                int32x4_t _rows1 = vmull_s16(_S1low, _a0);
+                _rows1 = vmlal_s16(_rows1, _S1high, _a1);
+                int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
+                // 4th lane is overwritten by the next pixel or lands in the padding
+                vst1_s16(rows1p, _rows1_sr4);
+#else
+                rows1p[0] = (S1p[0]*a0 + S1p[3]*a1) >> 4;
+                rows1p[1] = (S1p[1]*a0 + S1p[4]*a1) >> 4;
+                rows1p[2] = (S1p[2]*a0 + S1p[5]*a1) >> 4;
+#endif // __ARM_NEON
+
+                ialphap += 2;
+                rows1p += 3;
+            }
+        }
+        else
+        {
+            // hresize two rows
+            const unsigned char *S0 = src + srcw * (sy);
+            const unsigned char *S1 = src + srcw * (sy+3);
+
+            const short* ialphap = ialpha;
+            short* rows0p = rows0;
+            short* rows1p = rows1;
+            for ( int dx = 0; dx < w; dx++ )
+            {
+                int sx = xofs[dx];
+                short a0 = ialphap[0];
+                short a1 = ialphap[1];
+
+                const unsigned char* S0p = S0 + sx;
+                const unsigned char* S1p = S1 + sx;
+#if __ARM_NEON
+                int16x4_t _a0 = vdup_n_s16(a0);
+                int16x4_t _a1 = vdup_n_s16(a1);
+                uint8x8_t _S0 = vld1_u8(S0p);
+                uint8x8_t _S1 = vld1_u8(S1p);
+                int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
+                int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
+                int16x4_t _S0low = vget_low_s16(_S016);
+                int16x4_t _S1low = vget_low_s16(_S116);
+                int16x4_t _S0high = vext_s16(_S0low, vget_high_s16(_S016), 3);
+                int16x4_t _S1high = vext_s16(_S1low, vget_high_s16(_S116), 3);
+                int32x4_t _rows0 = vmull_s16(_S0low, _a0);
+                int32x4_t _rows1 = vmull_s16(_S1low, _a0);
+                _rows0 = vmlal_s16(_rows0, _S0high, _a1);
+                _rows1 = vmlal_s16(_rows1, _S1high, _a1);
+                int16x4_t _rows0_sr4 = vshrn_n_s32(_rows0, 4);
+                int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
+                vst1_s16(rows0p, _rows0_sr4);
+                vst1_s16(rows1p, _rows1_sr4);
+#else
+                rows0p[0] = (S0p[0]*a0 + S0p[3]*a1) >> 4;
+                rows0p[1] = (S0p[1]*a0 + S0p[4]*a1) >> 4;
+                rows0p[2] = (S0p[2]*a0 + S0p[5]*a1) >> 4;
+                rows1p[0] = (S1p[0]*a0 + S1p[3]*a1) >> 4;
+                rows1p[1] = (S1p[1]*a0 + S1p[4]*a1) >> 4;
+                rows1p[2] = (S1p[2]*a0 + S1p[5]*a1) >> 4;
+#endif // __ARM_NEON
+
+                ialphap += 2;
+                rows0p += 3;
+                rows1p += 3;
+            }
+        }
+
+        // yofs values are pre-multiplied by 3, so the next source row is
+        // sy + 3; comparing against sy + 1 could never match and the
+        // single-row path above was never taken
+        prev_sy1 = sy + 3;
+
+        // vresize
+        short b0 = ibeta[0];
+        short b1 = ibeta[1];
+
+        short* rows0p = rows0;
+        short* rows1p = rows1;
+        unsigned char* Dp = dst + w * 3 * (dy);
+
+        // NEON handles 8 output bytes per iteration, the scalar loop the rest
+#if __ARM_NEON
+        int nn = (w * 3) >> 3;
+#else
+        int nn = 0;
+#endif
+        int remain = (w * 3) - (nn << 3);
+
+        // fixed-point bookkeeping: rows were shifted right by 4, each product
+        // is shifted by 16 and the rounded sum by 2, i.e. 22 bits in total,
+        // which removes both 11-bit weight scales
+#if __ARM_NEON
+#if __aarch64__
+        int16x4_t _b0 = vdup_n_s16(b0);
+        int16x4_t _b1 = vdup_n_s16(b1);
+        int32x4_t _v2 = vdupq_n_s32(2);
+        for (; nn>0; nn--)
+        {
+            int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
+            int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
+            int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
+            int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
+
+            int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
+            int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
+            int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
+            int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
+
+            // acc = 2 + (p0 >> 16) + (p1 >> 16); the 2 is the rounding term
+            int32x4_t _acc = _v2;
+            _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
+            _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
+
+            int32x4_t _acc_1 = _v2;
+            _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
+            _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
+
+            int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
+            int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
+
+            // saturating narrow to u8
+            uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
+
+            vst1_u8(Dp, _D);
+
+            Dp += 8;
+            rows0p += 8;
+            rows1p += 8;
+        }
+#else
+        // 32-bit ARM: software-pipelined version of the loop above. The next
+        // 8 shorts of each row are loaded one iteration ahead, so both row
+        // pointers are moved back by 16 bytes after the loop.
+        if (nn > 0)
+        {
+        asm volatile(
+            "vdup.s16   d16, %8         \n"
+            "mov        r4, #2          \n"
+            "vdup.s16   d17, %9         \n"
+            "vdup.s32   q12, r4         \n"
+            "pld        [%0, #128]      \n"
+            "vld1.s16   {d2-d3}, [%0 :128]!\n"
+            "pld        [%1, #128]      \n"
+            "vld1.s16   {d6-d7}, [%1 :128]!\n"
+            "0:                         \n"
+            "vmull.s16  q0, d2, d16     \n"
+            "vmull.s16  q1, d3, d16     \n"
+            "vorr.s32   q10, q12, q12   \n"
+            "vorr.s32   q11, q12, q12   \n"
+            "vmull.s16  q2, d6, d17     \n"
+            "vmull.s16  q3, d7, d17     \n"
+            "vsra.s32   q10, q0, #16    \n"
+            "vsra.s32   q11, q1, #16    \n"
+            "pld        [%0, #128]      \n"
+            "vld1.s16   {d2-d3}, [%0 :128]!\n"
+            "vsra.s32   q10, q2, #16    \n"
+            "vsra.s32   q11, q3, #16    \n"
+            "pld        [%1, #128]      \n"
+            "vld1.s16   {d6-d7}, [%1 :128]!\n"
+            "vshrn.s32  d20, q10, #2    \n"
+            "vshrn.s32  d21, q11, #2    \n"
+            "vqmovun.s16 d20, q10        \n"
+            "vst1.8     {d20}, [%2]!    \n"
+            "subs       %3, #1          \n"
+            "bne        0b              \n"
+            "sub        %0, #16         \n"
+            "sub        %1, #16         \n"
+            : "=r"(rows0p), // %0
+              "=r"(rows1p), // %1
+              "=r"(Dp),     // %2
+              "=r"(nn)      // %3
+            : "0"(rows0p),
+              "1"(rows1p),
+              "2"(Dp),
+              "3"(nn),
+              "r"(b0),      // %8
+              "r"(b1)       // %9
+            : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
+        );
+        }
+#endif // __aarch64__
+#endif // __ARM_NEON
+        for ( ; remain; --remain )
+        {
+//             D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
+            *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2);
+        }
+
+        ibeta += 2;
+    }
+
+    delete[] buf;
+}
+
+// Bilinear resize of a single-channel 8-bit image using 11-bit fixed-point
+// interpolation weights.
+//   src        : source pixels, srcw bytes per row, srch rows
+//   srcw, srch : source size in pixels
+//   dst        : destination buffer, w bytes per row, h rows are written
+//   w, h       : destination size in pixels
+// Each destination row is produced in two passes: a horizontal pass that
+// blends the two neighbouring source pixels of every destination column into
+// 16-bit row buffers (rows0/rows1, one per contributing source row), and a
+// vertical pass that blends those two row buffers into 8-bit output.
+// NOTE(review): the index clamping below yields srcw - 2 / srch - 2, so the
+// function appears to require srcw >= 2 and srch >= 2 -- confirm with callers.
+void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
+{
+    const int INTER_RESIZE_COEF_BITS=11;
+    const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
+
+    double scale_x = (double)srcw / w;
+    double scale_y = (double)srch / h;
+
+    // a single allocation holds all four lookup tables:
+    //   xofs   : w ints, index of the left source pixel per column
+    //   yofs   : h ints, index of the upper source row per destination row
+    //   ialpha : w pairs of shorts, horizontal weights (left, right)
+    //   ibeta  : h pairs of shorts, vertical weights (top, bottom)
+    int* buf = new int[w + h + w + h];
+
+    int* xofs = buf;//new int[w];
+    int* yofs = buf + w;//new int[h];
+
+    short* ialpha = (short*)(buf + w + h);//new short[w * 2];
+    short* ibeta = (short*)(buf + w + h + w);//new short[h * 2];
+
+    float fx;
+    float fy;
+    int sx;
+    int sy;
+
+// round to nearest and clamp into the range of short
+#define SATURATE_CAST_SHORT(X) ((short)std::min(std::max((int)((X) + ((X) >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX))
+
+    for (int dx = 0; dx < w; dx++)
+    {
+        fx = (float)((dx + 0.5) * scale_x - 0.5);
+        sx = fx;// truncates toward zero
+        fx -= sx;
+
+        // When upscaling, the first columns map to a negative source
+        // coordinate. Truncation leaves sx == 0 with a negative fraction,
+        // which would extrapolate (weight > 1 and a negative weight) and
+        // overflow the 8-bit result. Clamp to the border pixel instead.
+        if (fx < 0.f)
+        {
+            sx = 0;
+            fx = 0.f;
+        }
+
+        // right border: use the last pixel pair with full weight on the right one
+        if (sx >= srcw - 1)
+        {
+            sx = srcw - 2;
+            fx = 1.f;
+        }
+
+        xofs[dx] = sx;
+
+        float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
+        float a1 =        fx  * INTER_RESIZE_COEF_SCALE;
+
+        ialpha[dx*2    ] = SATURATE_CAST_SHORT(a0);
+        ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
+    }
+
+    for (int dy = 0; dy < h; dy++)
+    {
+        fy = (float)((dy + 0.5) * scale_y - 0.5);
+        sy = fy;// truncates toward zero
+        fy -= sy;
+
+        // top border: same clamp as for the left border above
+        if (fy < 0.f)
+        {
+            sy = 0;
+            fy = 0.f;
+        }
+
+        // bottom border: use the last row pair with full weight on the lower one
+        if (sy >= srch - 1)
+        {
+            sy = srch - 2;
+            fy = 1.f;
+        }
+
+        yofs[dy] = sy;
+
+        float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
+        float b1 =        fy  * INTER_RESIZE_COEF_SCALE;
+
+        ibeta[dy*2    ] = SATURATE_CAST_SHORT(b0);
+        ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
+    }
+
+#undef SATURATE_CAST_SHORT
+
+    // loop body
+    // row buffers hold w shorts each; the Mat size is given in floats
+    Mat rowsbuf0((w >> 1) + 1);
+    Mat rowsbuf1((w >> 1) + 1);
+    short* rows0 = (short*)rowsbuf0.data;
+    short* rows1 = (short*)rowsbuf1.data;
+
+    // source row index whose horizontal pass is already available in rows1
+    int prev_sy1 = -1;
+
+    for (int dy = 0; dy < h; dy++ )
+    {
+        int sy = yofs[dy];
+
+        if (sy == prev_sy1)
+        {
+            // hresize one row
+            // the previous lower row becomes the upper row; only the new
+            // lower row has to be computed
+            short* rows0_old = rows0;
+            rows0 = rows1;
+            rows1 = rows0_old;
+            const unsigned char *S1 = src + srcw * (sy+1);
+
+            const short* ialphap = ialpha;
+            short* rows1p = rows1;
+            for ( int dx = 0; dx < w; dx++ )
+            {
+                int sx = xofs[dx];
+                short a0 = ialphap[0];
+                short a1 = ialphap[1];
+
+                const unsigned char* S1p = S1 + sx;
+                rows1p[dx] = (S1p[0]*a0 + S1p[1]*a1) >> 4;
+
+                ialphap += 2;
+            }
+        }
+        else
+        {
+            // hresize two rows
+            const unsigned char *S0 = src + srcw * (sy);
+            const unsigned char *S1 = src + srcw * (sy+1);
+
+            const short* ialphap = ialpha;
+            short* rows0p = rows0;
+            short* rows1p = rows1;
+            for ( int dx = 0; dx < w; dx++ )
+            {
+                int sx = xofs[dx];
+                short a0 = ialphap[0];
+                short a1 = ialphap[1];
+
+                const unsigned char* S0p = S0 + sx;
+                const unsigned char* S1p = S1 + sx;
+                rows0p[dx] = (S0p[0]*a0 + S0p[1]*a1) >> 4;
+                rows1p[dx] = (S1p[0]*a0 + S1p[1]*a1) >> 4;
+
+                ialphap += 2;
+            }
+        }
+
+        prev_sy1 = sy + 1;
+
+        // vresize
+        short b0 = ibeta[0];
+        short b1 = ibeta[1];
+
+        short* rows0p = rows0;
+        short* rows1p = rows1;
+        unsigned char* Dp = dst + w * (dy);
+
+        // NEON handles 8 output bytes per iteration, the scalar loop the rest
+#if __ARM_NEON
+        int nn = w >> 3;
+#else
+        int nn = 0;
+#endif
+        int remain = w - (nn << 3);
+
+        // fixed-point bookkeeping: rows were shifted right by 4, each product
+        // is shifted by 16 and the rounded sum by 2, i.e. 22 bits in total,
+        // which removes both 11-bit weight scales
+#if __ARM_NEON
+#if __aarch64__
+        int16x4_t _b0 = vdup_n_s16(b0);
+        int16x4_t _b1 = vdup_n_s16(b1);
+        int32x4_t _v2 = vdupq_n_s32(2);
+        for (; nn>0; nn--)
+        {
+            int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
+            int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
+            int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
+            int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
+
+            int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
+            int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
+            int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
+            int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
+
+            // acc = 2 + (p0 >> 16) + (p1 >> 16); the 2 is the rounding term
+            int32x4_t _acc = _v2;
+            _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
+            _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
+
+            int32x4_t _acc_1 = _v2;
+            _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
+            _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
+
+            int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
+            int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
+
+            // saturating narrow to u8
+            uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
+
+            vst1_u8(Dp, _D);
+
+            Dp += 8;
+            rows0p += 8;
+            rows1p += 8;
+        }
+#else
+        // 32-bit ARM: software-pipelined version of the loop above. The next
+        // 8 shorts of each row are loaded one iteration ahead, so both row
+        // pointers are moved back by 16 bytes after the loop.
+        if (nn > 0)
+        {
+        asm volatile(
+            "vdup.s16   d16, %8         \n"
+            "mov        r4, #2          \n"
+            "vdup.s16   d17, %9         \n"
+            "vdup.s32   q12, r4         \n"
+            "pld        [%0, #128]      \n"
+            "vld1.s16   {d2-d3}, [%0 :128]!\n"
+            "pld        [%1, #128]      \n"
+            "vld1.s16   {d6-d7}, [%1 :128]!\n"
+            "0:                         \n"
+            "vmull.s16  q0, d2, d16     \n"
+            "vmull.s16  q1, d3, d16     \n"
+            "vorr.s32   q10, q12, q12   \n"
+            "vorr.s32   q11, q12, q12   \n"
+            "vmull.s16  q2, d6, d17     \n"
+            "vmull.s16  q3, d7, d17     \n"
+            "vsra.s32   q10, q0, #16    \n"
+            "vsra.s32   q11, q1, #16    \n"
+            "pld        [%0, #128]      \n"
+            "vld1.s32   {d2-d3}, [%0 :128]!\n"
+            "vsra.s32   q10, q2, #16    \n"
+            "vsra.s32   q11, q3, #16    \n"
+            "pld        [%1, #128]      \n"
+            "vld1.s32   {d6-d7}, [%1 :128]!\n"
+            "vshrn.s32  d20, q10, #2    \n"
+            "vshrn.s32  d21, q11, #2    \n"
+            "vqmovun.s16 d20, q10        \n"
+            "vst1.8     {d20}, [%2]!    \n"
+            "subs       %3, #1          \n"
+            "bne        0b              \n"
+            "sub        %0, #16         \n"
+            "sub        %1, #16         \n"
+            : "=r"(rows0p), // %0
+              "=r"(rows1p), // %1
+              "=r"(Dp),     // %2
+              "=r"(nn)      // %3
+            : "0"(rows0p),
+              "1"(rows1p),
+              "2"(Dp),
+              "3"(nn),
+              "r"(b0),      // %8
+              "r"(b1)       // %9
+            : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
+        );
+        }
+#endif // __aarch64__
+#endif // __ARM_NEON
+        for ( ; remain; --remain )
+        {
+//             D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
+            *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2);
+        }
+
+        ibeta += 2;
+    }
+
+    delete[] buf;
+}
+
+// Bilinear resize of an interleaved 4-channel 8-bit image using 11-bit
+// fixed-point interpolation weights.
+//   src        : source pixels, srcw*4 bytes per row, srch rows
+//   srcw, srch : source size in pixels
+//   dst        : destination buffer, w*4 bytes per row, h rows are written
+//   w, h       : destination size in pixels
+// Each destination row is produced in two passes: a horizontal pass that
+// blends the two neighbouring source pixels of every destination column into
+// 16-bit row buffers (rows0/rows1, one per contributing source row), and a
+// vertical pass that blends those two row buffers into 8-bit output.
+// NOTE(review): the index clamping below yields srcw - 2 / srch - 2, so the
+// function appears to require srcw >= 2 and srch >= 2 -- confirm with callers.
+void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
+{
+    const int INTER_RESIZE_COEF_BITS=11;
+    const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
+
+    double scale_x = (double)srcw / w;
+    double scale_y = (double)srch / h;
+
+    // a single allocation holds all four lookup tables:
+    //   xofs   : w ints, byte offset of the left source pixel per column
+    //   yofs   : h ints, source row index * 4 per destination row
+    //   ialpha : w pairs of shorts, horizontal weights (left, right)
+    //   ibeta  : h pairs of shorts, vertical weights (top, bottom)
+    int* buf = new int[w + h + w + h];
+
+    int* xofs = buf;//new int[w];
+    int* yofs = buf + w;//new int[h];
+
+    short* ialpha = (short*)(buf + w + h);//new short[w * 2];
+    short* ibeta = (short*)(buf + w + h + w);//new short[h * 2];
+
+    float fx;
+    float fy;
+    int sx;
+    int sy;
+
+// round to nearest and clamp into the range of short
+#define SATURATE_CAST_SHORT(X) ((short)std::min(std::max((int)((X) + ((X) >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX))
+
+    for (int dx = 0; dx < w; dx++)
+    {
+        fx = (float)((dx + 0.5) * scale_x - 0.5);
+        sx = fx;// truncates toward zero
+        fx -= sx;
+
+        // When upscaling, the first columns map to a negative source
+        // coordinate. Truncation leaves sx == 0 with a negative fraction,
+        // which would extrapolate (weight > 1 and a negative weight) and
+        // overflow the 8-bit result. Clamp to the border pixel instead.
+        if (fx < 0.f)
+        {
+            sx = 0;
+            fx = 0.f;
+        }
+
+        // right border: use the last pixel pair with full weight on the right one
+        if (sx >= srcw - 1)
+        {
+            sx = srcw - 2;
+            fx = 1.f;
+        }
+
+        xofs[dx] = sx*4;
+
+        float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
+        float a1 =        fx  * INTER_RESIZE_COEF_SCALE;
+
+        ialpha[dx*2    ] = SATURATE_CAST_SHORT(a0);
+        ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
+    }
+
+    for (int dy = 0; dy < h; dy++)
+    {
+        fy = (float)((dy + 0.5) * scale_y - 0.5);
+        sy = fy;// truncates toward zero
+        fy -= sy;
+
+        // top border: same clamp as for the left border above
+        if (fy < 0.f)
+        {
+            sy = 0;
+            fy = 0.f;
+        }
+
+        // bottom border: use the last row pair with full weight on the lower one
+        if (sy >= srch - 1)
+        {
+            sy = srch - 2;
+            fy = 1.f;
+        }
+
+        // stored pre-multiplied by the channel count; srcw * yofs is a byte offset
+        yofs[dy] = sy*4;
+
+        float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
+        float b1 =        fy  * INTER_RESIZE_COEF_SCALE;
+
+        ibeta[dy*2    ] = SATURATE_CAST_SHORT(b0);
+        ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
+    }
+
+#undef SATURATE_CAST_SHORT
+
+    // loop body
+    // row buffers hold w*4 shorts each; the Mat size is given in floats,
+    // plus some padding
+    Mat rowsbuf0((w*4 >> 1) + 4);
+    Mat rowsbuf1((w*4 >> 1) + 4);
+    short* rows0 = (short*)rowsbuf0.data;
+    short* rows1 = (short*)rowsbuf1.data;
+
+    // yofs value whose upper row is already available in rows1; -1 = none
+    int prev_sy1 = -1;
+
+    for (int dy = 0; dy < h; dy++ )
+    {
+        int sy = yofs[dy];
+
+        if (sy == prev_sy1)
+        {
+            // hresize one row
+            // the previous lower row becomes the upper row; only the new
+            // lower row has to be computed
+            short* rows0_old = rows0;
+            rows0 = rows1;
+            rows1 = rows0_old;
+            const unsigned char *S1 = src + srcw * (sy+4);
+
+            const short* ialphap = ialpha;
+            short* rows1p = rows1;
+            for ( int dx = 0; dx < w; dx++ )
+            {
+                int sx = xofs[dx];
+                short a0 = ialphap[0];
+                short a1 = ialphap[1];
+
+                const unsigned char* S1p = S1 + sx;
+#if __ARM_NEON
+                int16x4_t _a0 = vdup_n_s16(a0);
+                int16x4_t _a1 = vdup_n_s16(a1);
+                // 8 bytes = left pixel (low half) and right pixel (high half)
+                uint8x8_t _S1 = vld1_u8(S1p);
+                int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
+                int16x4_t _S1low = vget_low_s16(_S116);
+                int16x4_t _S1high = vget_high_s16(_S116);
+                int32x4_t _rows1 = vmull_s16(_S1low, _a0);
+                _rows1 = vmlal_s16(_rows1, _S1high, _a1);
+                int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
+                vst1_s16(rows1p, _rows1_sr4);
+#else
+                rows1p[0] = (S1p[0]*a0 + S1p[4]*a1) >> 4;
+                rows1p[1] = (S1p[1]*a0 + S1p[5]*a1) >> 4;
+                rows1p[2] = (S1p[2]*a0 + S1p[6]*a1) >> 4;
+                rows1p[3] = (S1p[3]*a0 + S1p[7]*a1) >> 4;
+#endif // __ARM_NEON
+
+                ialphap += 2;
+                rows1p += 4;
+            }
+        }
+        else
+        {
+            // hresize two rows
+            const unsigned char *S0 = src + srcw * (sy);
+            const unsigned char *S1 = src + srcw * (sy+4);
+
+            const short* ialphap = ialpha;
+            short* rows0p = rows0;
+            short* rows1p = rows1;
+            for ( int dx = 0; dx < w; dx++ )
+            {
+                int sx = xofs[dx];
+                short a0 = ialphap[0];
+                short a1 = ialphap[1];
+
+                const unsigned char* S0p = S0 + sx;
+                const unsigned char* S1p = S1 + sx;
+#if __ARM_NEON
+                int16x4_t _a0 = vdup_n_s16(a0);
+                int16x4_t _a1 = vdup_n_s16(a1);
+                uint8x8_t _S0 = vld1_u8(S0p);
+                uint8x8_t _S1 = vld1_u8(S1p);
+                int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
+                int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
+                int16x4_t _S0low = vget_low_s16(_S016);
+                int16x4_t _S1low = vget_low_s16(_S116);
+                int16x4_t _S0high = vget_high_s16(_S016);
+                int16x4_t _S1high = vget_high_s16(_S116);
+                int32x4_t _rows0 = vmull_s16(_S0low, _a0);
+                int32x4_t _rows1 = vmull_s16(_S1low, _a0);
+                _rows0 = vmlal_s16(_rows0, _S0high, _a1);
+                _rows1 = vmlal_s16(_rows1, _S1high, _a1);
+                int16x4_t _rows0_sr4 = vshrn_n_s32(_rows0, 4);
+                int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
+                vst1_s16(rows0p, _rows0_sr4);
+                vst1_s16(rows1p, _rows1_sr4);
+#else
+                rows0p[0] = (S0p[0]*a0 + S0p[4]*a1) >> 4;
+                rows0p[1] = (S0p[1]*a0 + S0p[5]*a1) >> 4;
+                rows0p[2] = (S0p[2]*a0 + S0p[6]*a1) >> 4;
+                rows0p[3] = (S0p[3]*a0 + S0p[7]*a1) >> 4;
+                rows1p[0] = (S1p[0]*a0 + S1p[4]*a1) >> 4;
+                rows1p[1] = (S1p[1]*a0 + S1p[5]*a1) >> 4;
+                rows1p[2] = (S1p[2]*a0 + S1p[6]*a1) >> 4;
+                rows1p[3] = (S1p[3]*a0 + S1p[7]*a1) >> 4;
+#endif // __ARM_NEON
+
+                ialphap += 2;
+                rows0p += 4;
+                rows1p += 4;
+            }
+        }
+
+        // yofs values are pre-multiplied by 4, so the next source row is
+        // sy + 4; comparing against sy + 1 could never match and the
+        // single-row path above was never taken
+        prev_sy1 = sy + 4;
+
+        // vresize
+        short b0 = ibeta[0];
+        short b1 = ibeta[1];
+
+        short* rows0p = rows0;
+        short* rows1p = rows1;
+        unsigned char* Dp = dst + w * 4 * (dy);
+
+        // NEON handles 8 output bytes per iteration, the scalar loop the rest
+#if __ARM_NEON
+        int nn = (w * 4) >> 3;
+#else
+        int nn = 0;
+#endif
+        int remain = (w * 4) - (nn << 3);
+
+        // fixed-point bookkeeping: rows were shifted right by 4, each product
+        // is shifted by 16 and the rounded sum by 2, i.e. 22 bits in total,
+        // which removes both 11-bit weight scales
+#if __ARM_NEON
+#if __aarch64__
+        int16x4_t _b0 = vdup_n_s16(b0);
+        int16x4_t _b1 = vdup_n_s16(b1);
+        int32x4_t _v2 = vdupq_n_s32(2);
+        for (; nn>0; nn--)
+        {
+            int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
+            int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
+            int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
+            int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
+
+            int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
+            int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
+            int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
+            int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
+
+            // acc = 2 + (p0 >> 16) + (p1 >> 16); the 2 is the rounding term
+            int32x4_t _acc = _v2;
+            _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
+            _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
+
+            int32x4_t _acc_1 = _v2;
+            _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
+            _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
+
+            int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
+            int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
+
+            // saturating narrow to u8
+            uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
+
+            vst1_u8(Dp, _D);
+
+            Dp += 8;
+            rows0p += 8;
+            rows1p += 8;
+        }
+#else
+        // 32-bit ARM: software-pipelined version of the loop above. The next
+        // 8 shorts of each row are loaded one iteration ahead, so both row
+        // pointers are moved back by 16 bytes after the loop.
+        if (nn > 0)
+        {
+        asm volatile(
+            "vdup.s16   d16, %8         \n"
+            "mov        r4, #2          \n"
+            "vdup.s16   d17, %9         \n"
+            "vdup.s32   q12, r4         \n"
+            "pld        [%0, #128]      \n"
+            "vld1.s16   {d2-d3}, [%0 :128]!\n"
+            "pld        [%1, #128]      \n"
+            "vld1.s16   {d6-d7}, [%1 :128]!\n"
+            "0:                         \n"
+            "vmull.s16  q0, d2, d16     \n"
+            "vmull.s16  q1, d3, d16     \n"
+            "vorr.s32   q10, q12, q12   \n"
+            "vorr.s32   q11, q12, q12   \n"
+            "vmull.s16  q2, d6, d17     \n"
+            "vmull.s16  q3, d7, d17     \n"
+            "vsra.s32   q10, q0, #16    \n"
+            "vsra.s32   q11, q1, #16    \n"
+            "pld        [%0, #128]      \n"
+            "vld1.s32   {d2-d3}, [%0 :128]!\n"
+            "vsra.s32   q10, q2, #16    \n"
+            "vsra.s32   q11, q3, #16    \n"
+            "pld        [%1, #128]      \n"
+            "vld1.s32   {d6-d7}, [%1 :128]!\n"
+            "vshrn.s32  d20, q10, #2    \n"
+            "vshrn.s32  d21, q11, #2    \n"
+            "vqmovun.s16 d20, q10        \n"
+            "vst1.8     {d20}, [%2]!    \n"
+            "subs       %3, #1          \n"
+            "bne        0b              \n"
+            "sub        %0, #16         \n"
+            "sub        %1, #16         \n"
+            : "=r"(rows0p), // %0
+              "=r"(rows1p), // %1
+              "=r"(Dp),     // %2
+              "=r"(nn)      // %3
+            : "0"(rows0p),
+              "1"(rows1p),
+              "2"(Dp),
+              "3"(nn),
+              "r"(b0),      // %8
+              "r"(b1)       // %9
+            : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
+        );
+        }
+#endif // __aarch64__
+#endif // __ARM_NEON
+        for ( ; remain; --remain )
+        {
+//             D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
+            *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2);
+        }
+
+        ibeta += 2;
+    }
+
+    delete[] buf;
+}
+
+Mat Mat::from_pixels(const unsigned char* pixels, int type, int w, int h)
+{
+    // Build a Mat from a w x h pixel buffer by handing it to the loader that
+    // matches type. Types with bits under PIXEL_CONVERT_MASK go to the
+    // from_X2Y loaders, all others to the plain loaders.
+    // An unrecognized type produces an empty Mat.
+    const bool wants_conversion = (type & PIXEL_CONVERT_MASK) != 0;
+
+    if (!wants_conversion)
+    {
+        if (type == PIXEL_RGB || type == PIXEL_BGR)
+            return from_rgb(pixels, w, h);
+
+        if (type == PIXEL_GRAY)
+            return from_gray(pixels, w, h);
+
+        if (type == PIXEL_RGBA)
+            return from_rgba(pixels, w, h);
+
+        return Mat();
+    }
+
+    if (type == PIXEL_RGB2BGR || type == PIXEL_BGR2RGB)
+        return from_rgb2bgr(pixels, w, h);
+
+    if (type == PIXEL_RGB2GRAY)
+        return from_rgb2gray(pixels, w, h);
+
+    if (type == PIXEL_BGR2GRAY)
+        return from_bgr2gray(pixels, w, h);
+
+    if (type == PIXEL_GRAY2RGB || type == PIXEL_GRAY2BGR)
+        return from_gray2rgb(pixels, w, h);
+
+    if (type == PIXEL_RGBA2RGB)
+        return from_rgba2rgb(pixels, w, h);
+
+    if (type == PIXEL_RGBA2BGR)
+        return from_rgba2bgr(pixels, w, h);
+
+    if (type == PIXEL_RGBA2GRAY)
+        return from_rgba2gray(pixels, w, h);
+
+    return Mat();
+}
+
+Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height)
+{
+    // Resize the source pixels to target_width x target_height with the
+    // bilinear helper for the source layout, then load the result through
+    // from_pixels. Source layouts other than RGB/BGR, GRAY and RGBA give an
+    // empty Mat.
+    if (w == target_width && h == target_height)
+        return Mat::from_pixels(pixels, type, w, h);
+
+    const int src_format = type & PIXEL_FORMAT_MASK;
+
+    // bytes per pixel of the source layout; 0 marks an unhandled layout
+    int channels = 0;
+    if (src_format == PIXEL_RGB || src_format == PIXEL_BGR)
+        channels = 3;
+    else if (src_format == PIXEL_GRAY)
+        channels = 1;
+    else if (src_format == PIXEL_RGBA)
+        channels = 4;
+
+    if (channels == 0)
+        return Mat();
+
+    // scratch buffer holding the resized pixels until they are loaded
+    unsigned char* resized = new unsigned char[target_width * target_height * channels];
+
+    if (channels == 3)
+        resize_bilinear_c3(pixels, w, h, resized, target_width, target_height);
+    else if (channels == 1)
+        resize_bilinear_c1(pixels, w, h, resized, target_width, target_height);
+    else
+        resize_bilinear_c4(pixels, w, h, resized, target_width, target_height);
+
+    Mat m;
+    m = Mat::from_pixels(resized, type, target_width, target_height);
+
+    delete[] resized;
+
+    return m;
+}
+
+void Mat::to_pixels(unsigned char* pixels, int type)
+{
+    // Write this Mat into pixels using the writer that matches type.
+    // Nothing is written for a type none of the branches recognizes.
+    const bool wants_conversion = (type & PIXEL_CONVERT_MASK) != 0;
+
+    if (wants_conversion)
+    {
+        // the only conversion handled on output is the RGB <-> BGR swap
+        if (type == PIXEL_RGB2BGR || type == PIXEL_BGR2RGB)
+            to_bgr2rgb(*this, pixels);
+
+        return;
+    }
+
+    if (type == PIXEL_RGB || type == PIXEL_BGR)
+    {
+        to_rgb(*this, pixels);
+    }
+    else if (type == PIXEL_GRAY)
+    {
+        to_gray(*this, pixels);
+    }
+    else if (type == PIXEL_RGBA)
+    {
+        to_rgba(*this, pixels);
+    }
+}
+
+void Mat::to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height)
+{
+    // Write this Mat as pixels at its own w x h into a scratch buffer, then
+    // resize that buffer into pixels with the bilinear helper for the output
+    // layout. Nothing is written for layouts other than RGB/BGR, GRAY, RGBA.
+    if (w == target_width && h == target_height)
+    {
+        to_pixels(pixels, type);
+        return;
+    }
+
+    // layout of the bytes to_pixels will produce
+    int dst_format;
+    if (type & PIXEL_CONVERT_MASK)
+        dst_format = type >> PIXEL_CONVERT_SHIFT;
+    else
+        dst_format = type & PIXEL_FORMAT_MASK;
+
+    // bytes per pixel of the output layout; 0 marks an unhandled layout
+    int channels = 0;
+    if (dst_format == PIXEL_RGB || dst_format == PIXEL_BGR)
+        channels = 3;
+    else if (dst_format == PIXEL_GRAY)
+        channels = 1;
+    else if (dst_format == PIXEL_RGBA)
+        channels = 4;
+
+    if (channels == 0)
+        return;
+
+    unsigned char* native = new unsigned char[w * h * channels];
+
+    to_pixels(native, type);
+
+    if (channels == 3)
+        resize_bilinear_c3(native, w, h, pixels, target_width, target_height);
+    else if (channels == 1)
+        resize_bilinear_c1(native, w, h, pixels, target_width, target_height);
+    else
+        resize_bilinear_c4(native, w, h, pixels, target_width, target_height);
+
+    delete[] native;
+}
+
+} // namespace ncnn
diff --git a/src/net.cpp b/src/net.cpp
new file mode 100644
index 00000000000..2eef581715d
--- /dev/null
+++ b/src/net.cpp
@@ -0,0 +1,774 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif // _OPENMP
+
+namespace ncnn {
+
+// Construct an empty network: no layers, blobs or custom layer registrations.
+Net::Net()
+{
+}
+
+// Destroy the network, deleting every layer it still holds via clear().
+Net::~Net()
+{
+    clear();
+}
+
+#if NCNN_STRING
+// Register a creator function for a custom layer under a type name, replacing
+// the creator of an already registered name.
+// Returns 0 on success, -1 when the name is rejected as a built-in type.
+// The registry stores the type pointer itself, not a copy of the text, so the
+// string must stay valid for as long as the registration is in use.
+int Net::register_custom_layer(const char* type, layer_creator_func creator)
+{
+    int typeindex = layer_to_index(type);
+    // NOTE(review): every nonzero result is treated as "built-in", while
+    // custom_layer_to_index in this file reports "not found" as -1 -- confirm
+    // that layer_to_index really returns 0 for unknown names
+    if (typeindex != 0)
+    {
+        fprintf(stderr, "can not register build-in layer type %s\n", type);
+        return -1;
+    }
+
+    int custom_index = custom_layer_to_index(type);
+    if (custom_index == -1)
+    {
+        // new name: append a fresh registry entry
+        struct layer_registry_entry entry = { type, creator };
+        custom_layer_registry.push_back(entry);
+    }
+    else
+    {
+        // known name: replace the entry in place
+        fprintf(stderr, "overwrite existing custom layer type %s\n", type);
+        custom_layer_registry[custom_index].name = type;
+        custom_layer_registry[custom_index].creator = creator;
+    }
+
+    return 0;
+}
+#endif // NCNN_STRING
+
+int Net::register_custom_layer(int index, layer_creator_func creator)
+{
+    // Register a creator for a custom layer addressed by index. The index must
+    // carry LayerType::CustomBit; the remaining bits select the registry slot.
+    // Returns 0 on success, -1 when the custom bit is missing.
+    const int slot = index & ~LayerType::CustomBit;
+    if (slot == index)
+    {
+        fprintf(stderr, "can not register build-in layer index %d\n", slot);
+        return -1;
+    }
+
+    const int registered = (int)custom_layer_registry.size();
+    if (slot >= registered)
+    {
+        // grow the registry up to the slot, padding with creator-less entries
+#if NCNN_STRING
+        struct layer_registry_entry placeholder = { "", 0 };
+#else
+        struct layer_registry_entry placeholder = { 0 };
+#endif // NCNN_STRING
+        custom_layer_registry.resize(slot + 1, placeholder);
+    }
+
+    layer_registry_entry& target = custom_layer_registry[slot];
+    if (target.creator)
+    {
+        fprintf(stderr, "overwrite existing custom layer index %d\n", slot);
+    }
+
+    target.creator = creator;
+    return 0;
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+int Net::load_param(FILE* fp)
+{
+    // Parse a plain-text param stream:
+    //   <layer_count> <blob_count>
+    //   <type> <name> <bottom_count> <top_count> <bottom names> <top names> <layer params>
+    // Returns 0 once the stream has been read to its end.
+    // Returns -1 when the count header is unreadable, and (after clearing the
+    // net) when a layer type is unknown or the stream holds more layers or
+    // blobs than the header declared.
+    int layer_count = 0;
+    int blob_count = 0;
+    if (fscanf(fp, "%d %d", &layer_count, &blob_count) != 2 || layer_count < 0 || blob_count < 0)
+    {
+        fprintf(stderr, "load_param: invalid layer/blob count header\n");
+        return -1;
+    }
+
+    layers.resize(layer_count);
+    blobs.resize(blob_count);
+
+    int layer_index = 0;
+    int blob_index = 0;
+    while (!feof(fp))
+    {
+        char layer_type[256];
+        char layer_name[256];
+        int bottom_count = 0;
+        int top_count = 0;
+        // width 255 leaves room for the terminating NUL in the 256-byte buffers
+        int nscan = fscanf(fp, "%255s %255s %d %d", layer_type, layer_name, &bottom_count, &top_count);
+        if (nscan == EOF)
+        {
+            // end of stream, or a read error that feof() would never report
+            break;
+        }
+        if (nscan != 4)
+        {
+            continue;
+        }
+
+        if (layer_index >= layer_count || bottom_count < 0 || top_count < 0)
+        {
+            fprintf(stderr, "load_param: layer %s does not fit the declared counts\n", layer_name);
+            clear();
+            return -1;
+        }
+
+        // built-in layer types first, then the custom registry
+        int typeindex = layer_to_index(layer_type);
+        Layer* layer = create_layer(typeindex);
+        if (!layer)
+        {
+            typeindex = custom_layer_to_index(layer_type);
+            layer = create_custom_layer(typeindex);
+        }
+        if (!layer)
+        {
+            fprintf(stderr, "layer %s not exists or registered\n", layer_type);
+            clear();
+            return -1;
+        }
+
+        layer->type = std::string(layer_type);
+        layer->name = std::string(layer_name);
+
+        // set when the stream names more blobs than blob_count allows
+        bool blob_overflow = false;
+
+        layer->bottoms.resize(bottom_count);
+        for (int i=0; i<bottom_count; i++)
+        {
+            char bottom_name[256];
+            nscan = fscanf(fp, "%255s", bottom_name);
+            if (nscan != 1)
+            {
+                continue;
+            }
+
+            int bottom_blob_index = find_blob_index_by_name(bottom_name);
+            if (bottom_blob_index == -1)
+            {
+                // first mention of this name: claim the next free blob slot
+                if (blob_index >= blob_count)
+                {
+                    blob_overflow = true;
+                    break;
+                }
+
+                bottom_blob_index = blob_index;
+                blobs[blob_index].name = std::string(bottom_name);
+                blob_index++;
+            }
+
+            blobs[bottom_blob_index].consumers.push_back(layer_index);
+
+            layer->bottoms[i] = bottom_blob_index;
+        }
+
+        layer->tops.resize(top_count);
+        for (int i=0; i<top_count && !blob_overflow; i++)
+        {
+            char blob_name[256];
+            nscan = fscanf(fp, "%255s", blob_name);
+            if (nscan != 1)
+            {
+                continue;
+            }
+
+            if (blob_index >= blob_count)
+            {
+                blob_overflow = true;
+                break;
+            }
+
+            // every top takes a fresh blob slot produced by this layer
+            Blob& blob = blobs[blob_index];
+            blob.name = std::string(blob_name);
+            blob.producer = layer_index;
+
+            layer->tops[i] = blob_index;
+
+            blob_index++;
+        }
+
+        if (blob_overflow)
+        {
+            fprintf(stderr, "load_param: more blobs than the declared %d\n", blob_count);
+            delete layer;
+            clear();
+            return -1;
+        }
+
+        // layer specific params
+        int lr = layer->load_param(fp);
+        if (lr != 0)
+        {
+            fprintf(stderr, "layer load_param failed\n");
+            // the layer was never stored in layers, so free it here
+            delete layer;
+            continue;
+        }
+
+        layers[layer_index] = layer;
+
+        layer_index++;
+    }
+
+    return 0;
+}
+
+int Net::load_param(const char* protopath)
+{
+    // Open protopath and parse it as a plain-text param file.
+    // Returns -1 when the file cannot be opened, otherwise the parse result.
+    FILE* fp = fopen(protopath, "rb");
+    if (fp)
+    {
+        const int status = load_param(fp);
+        fclose(fp);
+        return status;
+    }
+
+    fprintf(stderr, "fopen %s failed\n", protopath);
+    return -1;
+}
+#endif // NCNN_STRING
+
+// Read one int in the host's native representation; false on a short read.
+static bool read_param_int(FILE* fp, int& value)
+{
+    return fread(&value, sizeof(int), 1, fp) == 1;
+}
+
+int Net::load_param_bin(FILE* fp)
+{
+    // Parse a binary param stream: layer count, blob count, then per layer the
+    // type index, bottom count, top count, bottom blob indices, top blob
+    // indices and the layer specific params.
+    // Returns 0 on success. Returns -1 when the count header is unreadable,
+    // and (after clearing the net) when the stream is truncated, a layer type
+    // is unknown, or a blob index lies outside the declared blob count.
+    int layer_count = 0;
+    int blob_count = 0;
+    if (!read_param_int(fp, layer_count) || !read_param_int(fp, blob_count) || layer_count < 0 || blob_count < 0)
+    {
+        fprintf(stderr, "load_param_bin: invalid layer/blob count header\n");
+        return -1;
+    }
+
+    layers.resize(layer_count);
+    blobs.resize(blob_count);
+
+    for (int i=0; i<layer_count; i++)
+    {
+        int typeindex = 0;
+        int bottom_count = 0;
+        int top_count = 0;
+        if (!read_param_int(fp, typeindex) || !read_param_int(fp, bottom_count) || !read_param_int(fp, top_count)
+            || bottom_count < 0 || top_count < 0)
+        {
+            fprintf(stderr, "load_param_bin: truncated or invalid header of layer %d\n", i);
+            clear();
+            return -1;
+        }
+
+        // built-in layer types first, then the custom registry
+        Layer* layer = create_layer(typeindex);
+        if (!layer)
+        {
+            int custom_index = typeindex & ~LayerType::CustomBit;
+            layer = create_custom_layer(custom_index);
+        }
+        if (!layer)
+        {
+            fprintf(stderr, "layer %d not exists or registered\n", typeindex);
+            clear();
+            return -1;
+        }
+
+        // set when a blob index cannot be read or is out of range
+        bool bad_blob = false;
+
+        layer->bottoms.resize(bottom_count);
+        for (int j=0; j<bottom_count; j++)
+        {
+            int bottom_blob_index = -1;
+            if (!read_param_int(fp, bottom_blob_index) || bottom_blob_index < 0 || bottom_blob_index >= blob_count)
+            {
+                bad_blob = true;
+                break;
+            }
+
+            blobs[bottom_blob_index].consumers.push_back(i);
+
+            layer->bottoms[j] = bottom_blob_index;
+        }
+
+        layer->tops.resize(top_count);
+        for (int j=0; j<top_count && !bad_blob; j++)
+        {
+            int top_blob_index = -1;
+            if (!read_param_int(fp, top_blob_index) || top_blob_index < 0 || top_blob_index >= blob_count)
+            {
+                bad_blob = true;
+                break;
+            }
+
+            blobs[top_blob_index].producer = i;
+
+            layer->tops[j] = top_blob_index;
+        }
+
+        if (bad_blob)
+        {
+            fprintf(stderr, "load_param_bin: invalid blob index in layer %d\n", i);
+            delete layer;
+            clear();
+            return -1;
+        }
+
+        // layer specific params
+        int lr = layer->load_param_bin(fp);
+        if (lr != 0)
+        {
+            fprintf(stderr, "layer load_param failed\n");
+            // the layer was never stored in layers, so free it here
+            delete layer;
+            continue;
+        }
+
+        layers[i] = layer;
+    }
+
+    return 0;
+}
+
+int Net::load_param_bin(const char* protopath)
+{
+    // Open protopath and parse it as a binary param file.
+    // Returns -1 when the file cannot be opened, otherwise the parse result.
+    FILE* fp = fopen(protopath, "rb");
+    if (fp)
+    {
+        const int status = load_param_bin(fp);
+        fclose(fp);
+        return status;
+    }
+
+    fprintf(stderr, "fopen %s failed\n", protopath);
+    return -1;
+}
+
+int Net::load_model(FILE* fp)
+{
+    // Let every layer, in order, read its weight data from fp.
+    // Returns 0 on success, -1 at the first layer that is missing or fails.
+    int ret = 0;
+
+    for (size_t i=0; i<layers.size(); i++)
+    {
+        Layer* layer = layers[i];
+
+        // a slot stays null when load_param gave up on that layer; its weights
+        // cannot be skipped, so everything after it would be misread
+        if (!layer)
+        {
+            fprintf(stderr, "layer load_model %d failed, layer was not loaded\n", (int)i);
+            ret = -1;
+            break;
+        }
+
+        int lret = layer->load_model(fp);
+        if (lret != 0)
+        {
+            fprintf(stderr, "layer load_model %d failed\n", (int)i);
+            ret = -1;
+            break;
+        }
+    }
+
+    return ret;
+}
+
+int Net::load_model(const char* modelpath)
+{
+    // Open modelpath and load the weight data of all layers from it.
+    // Returns -1 when the file cannot be opened, otherwise the load result.
+    FILE* fp = fopen(modelpath, "rb");
+    if (fp)
+    {
+        const int status = load_model(fp);
+        fclose(fp);
+        return status;
+    }
+
+    fprintf(stderr, "fopen %s failed\n", modelpath);
+    return -1;
+}
+#endif // NCNN_STDIO
+
+int Net::load_param(const unsigned char* _mem)
+{
+    // Parse the binary param layout directly from memory: layer count, blob
+    // count, then per layer the type index, bottom count, top count, bottom
+    // blob indices, top blob indices and the layer specific params.
+    // Returns the number of bytes consumed. Returns 0 when _mem is not 32-bit
+    // aligned or the counts are negative, and (after clearing the net) when a
+    // layer type is unknown or a blob index is out of range.
+    // The buffer length is unknown here, so reads past its end cannot be detected.
+    if ((size_t)_mem & 0x3)
+    {
+        // reject unaligned memory
+        fprintf(stderr, "memory not 32-bit aligned at %p\n", (const void*)_mem);
+        return 0;
+    }
+
+    const unsigned char* mem = _mem;
+    int layer_count = *(const int*)(mem);
+    mem += 4;
+
+    int blob_count = *(const int*)(mem);
+    mem += 4;
+
+    if (layer_count < 0 || blob_count < 0)
+    {
+        fprintf(stderr, "load_param: invalid layer/blob count header\n");
+        return 0;
+    }
+
+    layers.resize(layer_count);
+    blobs.resize(blob_count);
+
+    for (int i=0; i<layer_count; i++)
+    {
+        int typeindex = *(const int*)mem;
+        mem += 4;
+
+        int bottom_count = *(const int*)mem;
+        mem += 4;
+
+        int top_count = *(const int*)mem;
+        mem += 4;
+
+        if (bottom_count < 0 || top_count < 0)
+        {
+            fprintf(stderr, "load_param: invalid header of layer %d\n", i);
+            clear();
+            return 0;
+        }
+
+        // built-in layer types first, then the custom registry
+        Layer* layer = create_layer(typeindex);
+        if (!layer)
+        {
+            int custom_index = typeindex & ~LayerType::CustomBit;
+            layer = create_custom_layer(custom_index);
+        }
+        if (!layer)
+        {
+            fprintf(stderr, "layer %d not exists or registered\n", typeindex);
+            clear();
+            return 0;
+        }
+
+        // set when a blob index lies outside [0, blob_count)
+        bool bad_blob = false;
+
+        layer->bottoms.resize(bottom_count);
+        for (int j=0; j<bottom_count; j++)
+        {
+            int bottom_blob_index = *(const int*)mem;
+            mem += 4;
+
+            if (bottom_blob_index < 0 || bottom_blob_index >= blob_count)
+            {
+                bad_blob = true;
+                break;
+            }
+
+            blobs[bottom_blob_index].consumers.push_back(i);
+
+            layer->bottoms[j] = bottom_blob_index;
+        }
+
+        layer->tops.resize(top_count);
+        for (int j=0; j<top_count && !bad_blob; j++)
+        {
+            int top_blob_index = *(const int*)mem;
+            mem += 4;
+
+            if (top_blob_index < 0 || top_blob_index >= blob_count)
+            {
+                bad_blob = true;
+                break;
+            }
+
+            blobs[top_blob_index].producer = i;
+
+            layer->tops[j] = top_blob_index;
+        }
+
+        if (bad_blob)
+        {
+            fprintf(stderr, "load_param: invalid blob index in layer %d\n", i);
+            delete layer;
+            clear();
+            return 0;
+        }
+
+        // layer specific params
+        int lr = layer->load_param(mem);
+        if (lr != 0)
+        {
+            fprintf(stderr, "layer load_param failed\n");
+            // the layer was never stored in layers, so free it here
+            delete layer;
+            continue;
+        }
+
+        layers[i] = layer;
+    }
+
+    return (int)(mem - _mem);
+}
+
+int Net::load_model(const unsigned char* _mem)
+{
+    // Let every layer, in order, take its weight data from memory.
+    // Returns the byte distance between the read position and _mem on
+    // success, 0 when _mem is not 32-bit aligned, and -1 at the first layer
+    // that is missing or fails.
+    if ((size_t)_mem & 0x3)
+    {
+        // reject unaligned memory
+        fprintf(stderr, "memory not 32-bit aligned at %p\n", (const void*)_mem);
+        return 0;
+    }
+
+    const unsigned char* mem = _mem;
+    for (size_t i=0; i<layers.size(); i++)
+    {
+        Layer* layer = layers[i];
+
+        // a slot stays null when load_param gave up on that layer
+        if (!layer)
+        {
+            fprintf(stderr, "layer load_model failed, layer %d was not loaded\n", (int)i);
+            return -1;
+        }
+
+        int lret = layer->load_model(mem);
+        if (lret != 0)
+        {
+            fprintf(stderr, "layer load_model failed\n");
+            return -1;
+        }
+    }
+
+    return (int)(mem - _mem);
+}
+
+void Net::clear()
+{
+    // Drop all blobs, then delete every layer and empty the layer list.
+    blobs.clear();
+
+    const size_t layer_count = layers.size();
+    for (size_t idx = 0; idx != layer_count; ++idx)
+    {
+        Layer* owned = layers[idx];
+        delete owned;
+    }
+
+    layers.clear();
+}
+
+Extractor Net::create_extractor() const
+{
+    // The extractor gets one Mat slot per blob currently known to this net.
+    const int blob_count = blobs.size();
+    return Extractor(this, blob_count);
+}
+
+#if NCNN_STRING
+int Net::find_blob_index_by_name(const char* name) const
+{
+    // Linear search for the first blob called name.
+    // Returns its index, or -1 (after logging to stderr) when there is none.
+    const size_t blob_count = blobs.size();
+    size_t pos = 0;
+    while (pos < blob_count)
+    {
+        if (blobs[pos].name == name)
+            return pos;
+
+        pos++;
+    }
+
+    fprintf(stderr, "find_blob_index_by_name %s failed\n", name);
+    return -1;
+}
+
+int Net::find_layer_index_by_name(const char* name) const
+{
+    // Linear search for the first layer called name.
+    // Returns its index, or -1 (after logging to stderr) when there is none.
+    for (size_t i=0; i<layers.size(); i++)
+    {
+        const Layer* layer = layers[i];
+
+        // slots stay null for layers whose params failed to load
+        if (!layer)
+            continue;
+
+        if (layer->name == name)
+        {
+            return (int)i;
+        }
+    }
+
+    fprintf(stderr, "find_layer_index_by_name %s failed\n", name);
+    return -1;
+}
+
+int Net::custom_layer_to_index(const char* type)
+{
+    // Look up a custom layer type name in the registry.
+    // Returns the registry index of the first match, or -1 (after logging to
+    // stderr) when the name is not registered.
+    const int entry_count = custom_layer_registry.size();
+    int found = -1;
+    for (int idx = 0; idx < entry_count && found == -1; idx++)
+    {
+        if (strcmp(type, custom_layer_registry[idx].name) == 0)
+            found = idx;
+    }
+
+    if (found != -1)
+        return found;
+
+    fprintf(stderr, "custom layer %s not exists\n", type);
+    return -1;
+}
+#endif // NCNN_STRING
+
+Layer* Net::create_custom_layer(int index)
+{
+    // Instantiate the custom layer registered at registry slot index.
+    // Returns 0 (after logging to stderr) when the slot is out of range or
+    // holds no creator.
+    const int custom_layer_registry_entry_count = custom_layer_registry.size();
+    if (index < 0 || index >= custom_layer_registry_entry_count)
+    {
+        fprintf(stderr, "custom layer index %d not exists\n", index);
+        return 0;
+    }
+
+    layer_creator_func layer_creator = custom_layer_registry[index].creator;
+
+    // register_custom_layer(int, ...) pads the registry with creator-less
+    // entries, so an in-range slot may still be unregistered
+    if (!layer_creator)
+    {
+        fprintf(stderr, "custom layer index %d not registered\n", index);
+        return 0;
+    }
+
+    return layer_creator();
+}
+
+// Compute the top blob(s) of layer layer_index into blob_mats, first recursing
+// into the producer layer of every bottom blob whose Mat is still empty
+// (dims == 0).
+// In lightmode each bottom Mat is released from blob_mats once taken, and
+// layers with support_inplace run forward_inplace on the taken Mat (cloned
+// first when its refcount shows other holders).
+// Returns 0 on success, otherwise the first nonzero code a layer returned.
+int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, bool lightmode) const
+{
+    const Layer* layer = layers[layer_index];
+
+//     fprintf(stderr, "forward_layer %d %s\n", layer_index, layer->name.c_str());
+
+    if (layer->one_blob_only)
+    {
+        // single bottom, single top: only index 0 of each list is used
+        // load bottom blob
+        int bottom_blob_index = layer->bottoms[0];
+        int top_blob_index = layer->tops[0];
+
+        // NOTE(review): producer is used as a layer index without a range
+        // check -- confirm it is always valid for blobs that were not fed in
+        if (blob_mats[bottom_blob_index].dims == 0)
+        {
+            int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, lightmode);
+            if (ret != 0)
+                return ret;
+        }
+
+        Mat bottom_blob = blob_mats[bottom_blob_index];
+
+        if (lightmode)
+        {
+            // delete after taken in light mode
+            blob_mats[bottom_blob_index].release();
+            // deep copy for inplace forward if data is shared
+            // NOTE(review): refcount is dereferenced unconditionally -- confirm
+            // a Mat reaching this point can never carry a null refcount
+            if (layer->support_inplace && *bottom_blob.refcount != 1)
+            {
+                bottom_blob = bottom_blob.clone();
+            }
+        }
+
+        // forward
+        if (lightmode && layer->support_inplace)
+        {
+            // the bottom Mat doubles as the output
+            Mat& bottom_top_blob = bottom_blob;
+            int ret = layer->forward_inplace(bottom_top_blob);
+            if (ret != 0)
+                return ret;
+
+            // store top blob
+            blob_mats[top_blob_index] = bottom_top_blob;
+        }
+        else
+        {
+            Mat top_blob;
+            int ret = layer->forward(bottom_blob, top_blob);
+            if (ret != 0)
+                return ret;
+
+            // store top blob
+            blob_mats[top_blob_index] = top_blob;
+        }
+
+    }
+    else
+    {
+        // general case: any number of bottoms and tops
+        // load bottom blobs
+        std::vector<Mat> bottom_blobs;
+        bottom_blobs.resize(layer->bottoms.size());
+        for (size_t i=0; i<layer->bottoms.size(); i++)
+        {
+            int bottom_blob_index = layer->bottoms[i];
+
+            if (blob_mats[bottom_blob_index].dims == 0)
+            {
+                int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, lightmode);
+                if (ret != 0)
+                    return ret;
+            }
+
+            bottom_blobs[i] = blob_mats[bottom_blob_index];
+
+            if (lightmode)
+            {
+                // delete after taken in light mode
+                blob_mats[bottom_blob_index].release();
+                // deep copy for inplace forward if data is shared
+                if (layer->support_inplace && *bottom_blobs[i].refcount != 1)
+                {
+                    bottom_blobs[i] = bottom_blobs[i].clone();
+                }
+            }
+        }
+
+        // forward
+        if (lightmode && layer->support_inplace)
+        {
+            // the bottom Mats double as the outputs
+            std::vector<Mat>& bottom_top_blobs = bottom_blobs;
+            int ret = layer->forward_inplace(bottom_top_blobs);
+            if (ret != 0)
+                return ret;
+
+            // store top blobs
+            // top i is taken from bottom_top_blobs[i]
+            for (size_t i=0; i<layer->tops.size(); i++)
+            {
+                int top_blob_index = layer->tops[i];
+
+                blob_mats[top_blob_index] = bottom_top_blobs[i];
+            }
+        }
+        else
+        {
+            std::vector<Mat> top_blobs;
+            top_blobs.resize(layer->tops.size());
+            int ret = layer->forward(bottom_blobs, top_blobs);
+            if (ret != 0)
+                return ret;
+
+            // store top blobs
+            for (size_t i=0; i<layer->tops.size(); i++)
+            {
+                int top_blob_index = layer->tops[i];
+
+                blob_mats[top_blob_index] = top_blobs[i];
+            }
+        }
+    }
+
+//     fprintf(stderr, "forward_layer %d %s done\n", layer_index, layer->name.c_str());
+//     const Mat& blob = blob_mats[layer->tops[0]];
+//     fprintf(stderr, "[%-2d %-16s %-16s]  %d    blobs count = %-3d   size = %-3d x %-3d\n", layer_index, layer->type.c_str(), layer->name.c_str(), layer->tops[0], blob.c, blob.h, blob.w);
+
+    return 0;
+}
+
+// Bind the extractor to _net with blob_count empty Mat slots, light mode off
+// and no thread count override (num_threads == 0).
+Extractor::Extractor(const Net* _net, int blob_count)
+    : net(_net), lightmode(false), num_threads(0)
+{
+    blob_mats.resize(blob_count);
+}
+
+// Turn light mode on or off; the flag is handed to Net::forward_layer by
+// every later extract call.
+void Extractor::set_light_mode(bool enable)
+{
+    lightmode = enable;
+}
+
+// Set the OpenMP thread count used while this extractor runs the network;
+// 0 leaves the current OpenMP settings untouched.
+void Extractor::set_num_threads(int _num_threads)
+{
+    num_threads = _num_threads;
+}
+
+int Extractor::input(int blob_index, const Mat& in)
+{
+    // Store in as the value of blob blob_index.
+    // Returns 0 on success, -1 when the index is out of range.
+    const int slot_count = (int)blob_mats.size();
+    const bool in_range = blob_index >= 0 && blob_index < slot_count;
+    if (!in_range)
+        return -1;
+
+    blob_mats[blob_index] = in;
+    return 0;
+}
+
+int Extractor::extract(int blob_index, Mat& feat)
+{
+    // Fetch blob blob_index into feat, running its producer layer (and
+    // whatever that depends on) first when the blob's Mat is still empty.
+    // Returns -1 for an out-of-range index, otherwise the forward_layer
+    // status (0 when nothing had to be computed).
+    if (blob_index < 0 || blob_index >= (int)blob_mats.size())
+        return -1;
+
+    int ret = 0;
+
+    if (blob_mats[blob_index].dims == 0)
+    {
+        int layer_index = net->blobs[blob_index].producer;
+
+#ifdef _OPENMP
+        // apply the per-extractor thread count for the duration of the run
+        int dynamic_current = 0;
+        int num_threads_current = 1;
+        if (num_threads)
+        {
+            dynamic_current = omp_get_dynamic();
+            // omp_get_num_threads() is 1 outside a parallel region, so saving
+            // it would pin all later work to one thread on restore; the
+            // thread-count setting itself is what omp_get_max_threads() reports
+            num_threads_current = omp_get_max_threads();
+            omp_set_dynamic(0);
+            omp_set_num_threads(num_threads);
+        }
+#endif
+
+        ret = net->forward_layer(layer_index, blob_mats, lightmode);
+
+#ifdef _OPENMP
+        // put the caller's OpenMP settings back
+        if (num_threads)
+        {
+            omp_set_dynamic(dynamic_current);
+            omp_set_num_threads(num_threads_current);
+        }
+#endif
+    }
+
+    feat = blob_mats[blob_index];
+
+    return ret;
+}
+
+#if NCNN_STRING
+int Extractor::input(const char* blob_name, const Mat& in)
+{
+    // Store in as the value of the blob called blob_name.
+    // Returns 0 on success, -1 when the net has no blob of that name.
+    const int slot = net->find_blob_index_by_name(blob_name);
+    if (slot != -1)
+    {
+        blob_mats[slot] = in;
+        return 0;
+    }
+
+    return -1;
+}
+
+int Extractor::extract(const char* blob_name, Mat& feat)
+{
+    // Fetch the blob called blob_name into feat, running its producer layer
+    // (and whatever that depends on) first when the blob's Mat is still empty.
+    // Returns -1 when the net has no blob of that name, otherwise the
+    // forward_layer status (0 when nothing had to be computed).
+    int blob_index = net->find_blob_index_by_name(blob_name);
+    if (blob_index == -1)
+        return -1;
+
+    int ret = 0;
+
+    if (blob_mats[blob_index].dims == 0)
+    {
+        int layer_index = net->blobs[blob_index].producer;
+
+#ifdef _OPENMP
+        // apply the per-extractor thread count for the duration of the run
+        int dynamic_current = 0;
+        int num_threads_current = 1;
+        if (num_threads)
+        {
+            dynamic_current = omp_get_dynamic();
+            // omp_get_num_threads() is 1 outside a parallel region, so saving
+            // it would pin all later work to one thread on restore; the
+            // thread-count setting itself is what omp_get_max_threads() reports
+            num_threads_current = omp_get_max_threads();
+            omp_set_dynamic(0);
+            omp_set_num_threads(num_threads);
+        }
+#endif
+
+        ret = net->forward_layer(layer_index, blob_mats, lightmode);
+
+#ifdef _OPENMP
+        // put the caller's OpenMP settings back
+        if (num_threads)
+        {
+            omp_set_dynamic(dynamic_current);
+            omp_set_num_threads(num_threads_current);
+        }
+#endif
+    }
+
+    feat = blob_mats[blob_index];
+
+    return ret;
+}
+#endif // NCNN_STRING
+
+} // namespace ncnn
diff --git a/src/net.h b/src/net.h
new file mode 100644
index 00000000000..bd8bac319ac
--- /dev/null
+++ b/src/net.h
@@ -0,0 +1,142 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_NET_H
+#define NCNN_NET_H
+
+#include <stdio.h>
+#include <vector>
+#include "blob.h"
+#include "layer.h"
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+class Extractor;
+// A network: the list of layers, the blobs connecting them, and the registry
+// of user supplied layer creators. Inference runs through an Extractor
+// obtained from create_extractor().
+class Net
+{
+public:
+    // empty init
+    Net();
+    // clear and destroy
+    ~Net();
+
+#if NCNN_STRING
+    // register custom layer by layer type name
+    // the name pointer is stored as-is and must stay valid
+    // return 0 if success
+    int register_custom_layer(const char* type, layer_creator_func creator);
+#endif // NCNN_STRING
+    // register custom layer by layer type
+    // index must carry LayerType::CustomBit
+    // return 0 if success
+    int register_custom_layer(int index, layer_creator_func creator);
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // load network structure from plain param file
+    // return 0 if success
+    int load_param(FILE* fp);
+    int load_param(const char* protopath);
+#endif // NCNN_STRING
+    // load network structure from binary param file
+    // return 0 if success
+    int load_param_bin(FILE* fp);
+    int load_param_bin(const char* protopath);
+
+    // load network weight data from model file
+    // return 0 if success
+    int load_model(FILE* fp);
+    int load_model(const char* modelpath);
+#endif // NCNN_STDIO
+
+    // load network structure from external memory
+    // memory pointer must be 32-bit aligned
+    // return bytes consumed, 0 if the pointer is not aligned
+    int load_param(const unsigned char* mem);
+
+    // reference network weight data from external memory
+    // weight data is not copied but referenced
+    // so external memory should be retained when used
+    // memory pointer must be 32-bit aligned
+    // return bytes consumed, 0 if the pointer is not aligned
+    int load_model(const unsigned char* mem);
+
+    // unload network structure and weight data
+    void clear();
+
+    // construct an Extractor from network
+    Extractor create_extractor() const;
+
+protected:
+    friend class Extractor;
+#if NCNN_STRING
+    // name lookups, return -1 if the name is unknown
+    int find_blob_index_by_name(const char* name) const;
+    int find_layer_index_by_name(const char* name) const;
+    int custom_layer_to_index(const char* type);
+#endif // NCNN_STRING
+    // instantiate the custom layer at a registry index, 0 if out of range
+    Layer* create_custom_layer(int index);
+    // run one layer, computing missing bottom blobs first
+    // return 0 if success
+    int forward_layer(int layer_index, std::vector<Mat>& blob_mats, bool lightmode) const;
+
+protected:
+    // blob metadata, indexed by blob index
+    std::vector<Blob> blobs;
+    // layers of the net, deleted by clear()
+    std::vector<Layer*> layers;
+
+    // creators registered through register_custom_layer
+    std::vector<layer_registry_entry> custom_layer_registry;
+};
+
+// One inference session on a Net: holds the Mat of every blob that was fed in
+// or computed so far. Created through Net::create_extractor().
+class Extractor
+{
+public:
+    // enable light mode
+    // intermediate blob will be recycled when enabled
+    // disabled by default, but recommend to enable
+    void set_light_mode(bool enable);
+
+    // set thread count for this extractor
+    // this will overwrite the global setting
+    // default count is system depended
+    void set_num_threads(int num_threads);
+
+#if NCNN_STRING
+    // set input by blob name
+    // return 0 if success
+    int input(const char* blob_name, const Mat& in);
+
+    // get result by blob name
+    // return 0 if success
+    int extract(const char* blob_name, Mat& feat);
+#endif // NCNN_STRING
+
+    // set input by blob index
+    // return 0 if success
+    int input(int blob_index, const Mat& in);
+
+    // get result by blob index
+    // return 0 if success
+    int extract(int blob_index, Mat& feat);
+
+protected:
+    // only Net::create_extractor may construct an Extractor
+    friend Extractor Net::create_extractor() const;
+    Extractor(const Net* net, int blob_count);
+
+private:
+    // the network this extractor runs, not owned
+    const Net* net;
+    // one Mat per blob, empty until fed in or computed
+    std::vector<Mat> blob_mats;
+    // passed to Net::forward_layer on every extract
+    bool lightmode;
+    // thread count override, 0 means none
+    int num_threads;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_NET_H
diff --git a/src/opencv.cpp b/src/opencv.cpp
new file mode 100644
index 00000000000..d95ede6e5f2
--- /dev/null
+++ b/src/opencv.cpp
@@ -0,0 +1,121 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "opencv.h"
+
+#if NCNN_OPENCV
+
+#include <stdio.h>
+
+namespace cv {
+
+Mat imread(const std::string& path, int flags)
+{
+    // Read a binary PGM (P5) or PPM (P6) file with max value 255.
+    // Returns an empty Mat when the file cannot be opened, the header is not
+    // recognized, the image cannot be allocated, or the pixel data is short.
+    // flags is accepted for signature compatibility and ignored.
+    (void)flags;
+
+    // read pgm/ppm
+    FILE* fp = fopen(path.c_str(), "rb");
+    if (!fp)
+        return Mat();
+
+    Mat m;
+
+    char magic[3];
+    int w = 0;
+    int h = 0;
+    int maxval = 0;
+    int nscan = fscanf(fp, "%2s %d %d %d", magic, &w, &h, &maxval);
+
+    // Exactly one whitespace byte separates the header from the pixel data.
+    // It is consumed by hand because a whitespace directive in the format
+    // would also swallow leading pixel bytes with whitespace values.
+    int separator = (nscan == 4) ? fgetc(fp) : EOF;
+
+    if (nscan == 4 && separator != EOF && maxval == 255
+        && magic[0] == 'P' && (magic[1] == '5' || magic[1] == '6'))
+    {
+        if (magic[1] == '5')
+        {
+            m.create(h, w, CV_8UC1);
+        }
+        else if (magic[1] == '6')
+        {
+            m.create(h, w, CV_8UC3);
+        }
+        if (m.empty())
+        {
+            fclose(fp);
+            return Mat();
+        }
+
+        const size_t wanted = m.total();
+        const size_t got = fread(m.data, 1, wanted, fp);
+        if (got != wanted)
+        {
+            // truncated file: do not hand out a partly filled image
+            fclose(fp);
+            return Mat();
+        }
+    }
+
+    fclose(fp);
+
+    return m;
+}
+
+void imwrite(const std::string& path, const Mat& m)
+{
+    // Write m as a binary PGM (1 channel) or PPM (3 channels) file.
+    // For any other channel count no header is emitted before the data.
+    // Does nothing when the file cannot be opened.
+    FILE* fp = fopen(path.c_str(), "wb");
+    if (!fp)
+        return;
+
+    const int channel_count = m.channels();
+    if (channel_count == 1)
+        fprintf(fp, "P5\n%d %d\n255\n", m.cols, m.rows);
+    else if (channel_count == 3)
+        fprintf(fp, "P6\n%d %d\n255\n", m.cols, m.rows);
+
+    // pixel data follows the header directly
+    fwrite(m.data, 1, m.total(), fp);
+
+    fclose(fp);
+}
+
+void resize(const Mat& src, Mat& dst, const Size& size, float sw, float sh, int flags)
+{
+    // Bilinear resize of a 1, 3 or 4 channel image into dst.
+    // size gives the target dimensions; when either of them is 0 both are
+    // derived from the source dimensions scaled by sw and sh.
+    // dst receives a clone of src when the dimensions already match, and is
+    // left untouched when the target is empty, the allocation fails, or the
+    // channel count is not supported.
+    // flags is accepted for signature compatibility and ignored.
+    (void)flags;
+
+    int srcw = src.cols;
+    int srch = src.rows;
+
+    int w = size.width;
+    int h = size.height;
+
+    if (w == 0 || h == 0)
+    {
+        w = srcw * sw;
+        h = srch * sh;
+    }
+
+    if (w == 0 || h == 0)
+        return;
+
+    if (w == srcw && h == srch)
+    {
+        dst = src.clone();
+        return;
+    }
+
+    cv::Mat tmp(h, w, src.c);
+    if (tmp.empty())
+        return;
+
+    if (src.c == 1)
+        ncnn::resize_bilinear_c1(src.data, srcw, srch, tmp.data, w, h);
+    else if (src.c == 3)
+        ncnn::resize_bilinear_c3(src.data, srcw, srch, tmp.data, w, h);
+    else if (src.c == 4)
+        ncnn::resize_bilinear_c4(src.data, srcw, srch, tmp.data, w, h);
+    else
+        return; // tmp was never filled, do not publish it
+
+    dst = tmp;
+}
+
+} // namespace cv
+
+#endif // NCNN_OPENCV
diff --git a/src/opencv.h b/src/opencv.h
new file mode 100644
index 00000000000..2b6f91b9ca6
--- /dev/null
+++ b/src/opencv.h
@@ -0,0 +1,264 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_OPENCV_H
+#define NCNN_OPENCV_H
+
+#include "platform.h"
+
+#if NCNN_OPENCV
+
+#include <algorithm>
+#include <string>
+#include "mat.h"
+
+// minimal opencv style data structure implementation
+namespace cv
+{
+
+// Integer image extent; default-constructs to 0 x 0.
+struct Size
+{
+    int width;
+    int height;
+    Size() : width(0), height(0) {}
+    Size(int w_, int h_) : width(w_), height(h_) {}
+};
+
+// Axis-aligned rectangle: origin (x, y) plus width and height, all of type _Tp.
+template<typename _Tp>
+struct Rect_
+{
+    _Tp x;
+    _Tp y;
+    _Tp width;
+    _Tp height;
+
+    // default: every field zero
+    Rect_() : x(0), y(0), width(0), height(0) {}
+
+    Rect_(_Tp x_, _Tp y_, _Tp w_, _Tp h_) : x(x_), y(y_), width(w_), height(h_) {}
+
+    // width * height, with no clamping of negative extents
+    _Tp area() const { return width * height; }
+};
+
+// In-place intersection; a becomes all-zero when the overlap has no positive area.
+template<typename _Tp> static inline Rect_<_Tp>& operator &= ( Rect_<_Tp>& a, const Rect_<_Tp>& b )
+{
+    const _Tp left = std::max(a.x, b.x), top = std::max(a.y, b.y);
+    const _Tp right = std::min(a.x + a.width, b.x + b.width);
+    const _Tp bottom = std::min(a.y + a.height, b.y + b.height);
+    if (right - left <= 0 || bottom - top <= 0)
+        return a = Rect_<_Tp>();
+    return a = Rect_<_Tp>(left, top, right - left, bottom - top);
+}
+
+// In-place union: a becomes the bounding box of a and b (no special case for empty).
+template<typename _Tp> static inline Rect_<_Tp>& operator |= ( Rect_<_Tp>& a, const Rect_<_Tp>& b )
+{
+    const _Tp left = std::min(a.x, b.x), top = std::min(a.y, b.y);
+    const _Tp right = std::max(a.x + a.width, b.x + b.width);
+    const _Tp bottom = std::max(a.y + a.height, b.y + b.height);
+    return a = Rect_<_Tp>(left, top, right - left, bottom - top);
+}
+
+template<typename _Tp> static inline Rect_<_Tp> operator & (const Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    Rect_<_Tp> overlap(a); // work on a copy so neither operand is modified
+    return overlap &= b;   // reuse the in-place intersection
+}
+
+template<typename _Tp> static inline Rect_<_Tp> operator | (const Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    Rect_<_Tp> bounds(a); // work on a copy so neither operand is modified
+    return bounds |= b;   // reuse the in-place union
+}
+
+typedef Rect_<int> Rect;
+typedef Rect_<float> Rect2f;
+
+// 2-D point with coordinates of type _Tp; default-constructs to (0, 0).
+template<typename _Tp>
+struct Point_
+{
+    _Tp x;
+    _Tp y;
+    Point_() : x(0), y(0) {}
+    Point_(_Tp x_, _Tp y_) : x(x_), y(y_) {}
+};
+
+typedef Point_<int> Point;
+typedef Point_<float> Point2f;
+
+#define CV_8UC1 1 // type flags are stored in Mat::c and used as the per-pixel byte count
+#define CV_8UC3 3
+#define CV_8UC4 4
+#define CV_32FC1 4 // NOTE(review): same value as CV_8UC4, so Mat cannot tell the two apart - confirm intended
+
+// Minimal stand-in for cv::Mat: a byte buffer of rows x cols pixels, c bytes per
+// pixel, rows stored back to back. Copies are shallow and share the buffer via
+// a reference count; a Mat wrapping user memory (refcount == 0) never frees it.
+struct Mat
+{
+    Mat() : data(0), refcount(0), rows(0), cols(0), c(0) {}
+
+    // allocate a _rows x _cols image with `flags` bytes per pixel; pixels are not initialized
+    Mat(int _rows, int _cols, int flags) : data(0), refcount(0), rows(0), cols(0), c(0)
+    {
+        create(_rows, _cols, flags);
+    }
+
+    // shallow copy: shares m's buffer and bumps its reference count
+    Mat(const Mat& m) : data(m.data), refcount(m.refcount), rows(m.rows), cols(m.cols), c(m.c)
+    {
+        if (refcount)
+            NCNN_XADD(refcount, 1);
+    }
+
+    // wrap caller-owned pixels; the buffer is neither copied nor freed by Mat
+    Mat(int _rows, int _cols, int flags, void* _data)
+        : data((unsigned char*)_data), refcount(0), rows(_rows), cols(_cols), c(flags) {}
+
+    ~Mat() { release(); }
+
+    // shallow assignment; m is retained before the old buffer is dropped
+    Mat& operator=(const Mat& m)
+    {
+        if (this == &m)
+            return *this;
+
+        if (m.refcount)
+            NCNN_XADD(m.refcount, 1);
+
+        release();
+
+        data = m.data;
+        refcount = m.refcount;
+        rows = m.rows;
+        cols = m.cols;
+        c = m.c;
+
+        return *this;
+    }
+
+    // drop the current buffer and allocate a fresh one without initializing it; the
+    // Mat is left empty() when a dimension is not positive or allocation fails
+    void create(int _rows, int _cols, int flags)
+    {
+        release();
+
+        rows = _rows;
+        cols = _cols;
+        c = flags;
+
+        if (total() > 0)
+        {
+            // refcount address must be aligned, so we expand totalsize here
+            size_t totalsize = (total() + 3) >> 2 << 2;
+            data = (unsigned char*)ncnn::fastMalloc(totalsize + (int)sizeof(*refcount));
+            if (!data)
+                return;
+            refcount = (int*)(((unsigned char*)data) + totalsize);
+            *refcount = 1;
+        }
+    }
+
+    // give up this reference, free the buffer if it was the last one, reset to empty
+    void release()
+    {
+        if (refcount && NCNN_XADD(refcount, -1) == 1)
+            ncnn::fastFree(data);
+
+        data = 0;
+        refcount = 0;
+        rows = 0;
+        cols = 0;
+        c = 0;
+    }
+
+    // deep copy; the result is empty when this is empty or allocation fails
+    Mat clone() const
+    {
+        if (empty())
+            return Mat();
+
+        Mat m(rows, cols, c);
+        if (!m.empty())
+            memcpy(m.data, data, total());
+
+        return m;
+    }
+
+    bool empty() const { return data == 0 || total() == 0; }
+
+    int channels() const { return c; }
+
+    // byte size of the pixel data, 0 if any dimension is not positive; computed
+    // in size_t so that large images do not overflow int arithmetic
+    size_t total() const
+    {
+        if (rows <= 0 || cols <= 0 || c <= 0)
+            return 0;
+        return (size_t)cols * rows * c;
+    }
+
+    // start of row y, which must be in [0, rows)
+    const unsigned char* ptr(int y) const { return data + (size_t)y * cols * c; }
+    unsigned char* ptr(int y) { return data + (size_t)y * cols * c; }
+
+    // deep-copied region of interest; returns an empty Mat when this is empty or
+    // roi is degenerate or not fully inside the image, rather than letting
+    // memcpy read outside the pixel buffer
+    Mat operator()( const Rect& roi ) const
+    {
+        if (empty() || roi.width <= 0 || roi.height <= 0)
+            return Mat();
+        if (roi.x < 0 || roi.y < 0 || roi.x > cols - roi.width || roi.y > rows - roi.height)
+            return Mat();
+
+        Mat m(roi.height, roi.width, c);
+        if (m.empty())
+            return m;
+
+        const size_t rowbytes = (size_t)roi.width * c;
+        for (int y = 0; y < roi.height; y++)
+            memcpy(m.ptr(y), ptr(roi.y + y) + (size_t)roi.x * c, rowbytes);
+
+        return m;
+    }
+
+    unsigned char* data;
+
+    // pointer to the reference counter;
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    int rows;
+    int cols;
+    int c; // bytes per pixel (the type flag passed at construction)
+};
+
+#define CV_LOAD_IMAGE_GRAYSCALE 1
+#define CV_LOAD_IMAGE_COLOR 3
+Mat imread(const std::string& path, int flags); // accepts binary PGM (P5) / PPM (P6) headers
+void imwrite(const std::string& path, const Mat& m); // P5 header for 1 channel, P6 for 3 channels
+// bilinear resize; when a field of size is 0 the output is src scaled by sw / sh
+void resize(const Mat& src, Mat& dst, const Size& size, float sw = 0.f, float sh = 0.f, int flags = 0);
+
+} // namespace cv
+
+#endif // NCNN_OPENCV
+
+#endif // NCNN_OPENCV_H
diff --git a/src/platform.h.in b/src/platform.h.in
new file mode 100644
index 00000000000..8a6ac5a8347
--- /dev/null
+++ b/src/platform.h.in
@@ -0,0 +1,22 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PLATFORM_H
+#define NCNN_PLATFORM_H
+// Feature switches filled in by CMake (0 or 1) from the NCNN_* options of the top-level CMakeLists.txt.
+#cmakedefine01 NCNN_STDIO
+#cmakedefine01 NCNN_STRING
+#cmakedefine01 NCNN_OPENCV
+
+#endif // NCNN_PLATFORM_H
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
new file mode 100644
index 00000000000..0b710050b60
--- /dev/null
+++ b/tools/CMakeLists.txt
@@ -0,0 +1,18 @@
+
+# ncnn headers from the source tree and from the matching build-tree src directory
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src)
+include_directories(${CMAKE_CURRENT_BINARY_DIR}/../src)
+
+find_package(Protobuf REQUIRED)
+# protobuf headers may live outside the default search path (custom prefix)
+include_directories(${PROTOBUF_INCLUDE_DIRS})
+
+# the sources generated from caffe.proto land in the current binary directory
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+protobuf_generate_cpp(CAFFE_PROTO_SRCS CAFFE_PROTO_HDRS caffe.proto)
+
+add_executable(caffe2ncnn caffe2ncnn.cpp ${CAFFE_PROTO_SRCS} ${CAFFE_PROTO_HDRS})
+target_link_libraries(caffe2ncnn ${PROTOBUF_LIBRARIES})
+
+add_executable(ncnn2mem ncnn2mem.cpp)
+target_link_libraries(ncnn2mem ncnn)
diff --git a/tools/caffe.proto b/tools/caffe.proto
new file mode 100644
index 00000000000..336ffc2941d
--- /dev/null
+++ b/tools/caffe.proto
@@ -0,0 +1,1395 @@
+syntax = "proto2";
+
+package caffe;
+
+// Specifies the shape (dimensions) of a Blob.
+message BlobShape {
+  repeated int64 dim = 1 [packed = true];
+}
+
+message BlobProto {
+  optional BlobShape shape = 7;
+  repeated float data = 5 [packed = true];
+  repeated float diff = 6 [packed = true];
+  repeated double double_data = 8 [packed = true];
+  repeated double double_diff = 9 [packed = true];
+
+  // 4D dimensions -- deprecated.  Use "shape" instead.
+  optional int32 num = 1 [default = 0];
+  optional int32 channels = 2 [default = 0];
+  optional int32 height = 3 [default = 0];
+  optional int32 width = 4 [default = 0];
+}
+
+// The BlobProtoVector is simply a way to pass multiple blobproto instances
+// around.
+message BlobProtoVector {
+  repeated BlobProto blobs = 1;
+}
+
+message Datum {
+  optional int32 channels = 1;
+  optional int32 height = 2;
+  optional int32 width = 3;
+  // the actual image data, in bytes
+  optional bytes data = 4;
+  optional int32 label = 5;
+  // Optionally, the datum could also hold float data.
+  repeated float float_data = 6;
+  // If true, data contains an encoded image that needs to be decoded
+  optional bool encoded = 7 [default = false];
+}
+
+message FillerParameter {
+  // The filler type.
+  optional string type = 1 [default = 'constant'];
+  optional float value = 2 [default = 0]; // the value in constant filler
+  optional float min = 3 [default = 0]; // the min value in uniform filler
+  optional float max = 4 [default = 1]; // the max value in uniform filler
+  optional float mean = 5 [default = 0]; // the mean value in Gaussian filler
+  optional float std = 6 [default = 1]; // the std value in Gaussian filler
+  // The expected number of non-zero output weights for a given input in
+  // Gaussian filler -- the default -1 means don't perform sparsification.
+  optional int32 sparse = 7 [default = -1];
+  // Normalize the filler variance by fan_in, fan_out, or their average.
+  // Applies to 'xavier' and 'msra' fillers.
+  enum VarianceNorm {
+    FAN_IN = 0;
+    FAN_OUT = 1;
+    AVERAGE = 2;
+  }
+  optional VarianceNorm variance_norm = 8 [default = FAN_IN];
+}
+
+message NetParameter {
+  optional string name = 1; // consider giving the network a name
+  // DEPRECATED. See InputParameter. The input blobs to the network.
+  repeated string input = 3;
+  // DEPRECATED. See InputParameter. The shape of the input blobs.
+  repeated BlobShape input_shape = 8;
+
+  // 4D input dimensions -- deprecated.  Use "input_shape" instead.
+  // If specified, for each input blob there should be four
+  // values specifying the num, channels, height and width of the input blob.
+  // Thus, there should be a total of (4 * #input) numbers.
+  repeated int32 input_dim = 4;
+
+  // Whether the network will force every layer to carry out backward operation.
+  // If set False, then whether to carry out backward is determined
+  // automatically according to the net structure and learning rates.
+  optional bool force_backward = 5 [default = false];
+  // The current "state" of the network, including the phase, level, and stage.
+  // Some layers may be included/excluded depending on this state and the states
+  // specified in the layers' include and exclude fields.
+  optional NetState state = 6;
+
+  // Print debugging information about results while running Net::Forward,
+  // Net::Backward, and Net::Update.
+  optional bool debug_info = 7 [default = false];
+
+  // The layers that make up the net.  Each of their configurations, including
+  // connectivity and behavior, is specified as a LayerParameter.
+  repeated LayerParameter layer = 100;  // ID 100 so layers are printed last.
+
+  // DEPRECATED: use 'layer' instead.
+  repeated V1LayerParameter layers = 2;
+}
+
+// NOTE
+// Update the next available ID when you add a new SolverParameter field.
+//
+// SolverParameter next available ID: 41 (last added: type)
+message SolverParameter {
+  //////////////////////////////////////////////////////////////////////////////
+  // Specifying the train and test networks
+  //
+  // Exactly one train net must be specified using one of the following fields:
+  //     train_net_param, train_net, net_param, net
+  // One or more test nets may be specified using any of the following fields:
+  //     test_net_param, test_net, net_param, net
+  // If more than one test net field is specified (e.g., both net and
+  // test_net are specified), they will be evaluated in the field order given
+  // above: (1) test_net_param, (2) test_net, (3) net_param/net.
+  // A test_iter must be specified for each test_net.
+  // A test_level and/or a test_stage may also be specified for each test_net.
+  //////////////////////////////////////////////////////////////////////////////
+
+  // Proto filename for the train net, possibly combined with one or more
+  // test nets.
+  optional string net = 24;
+  // Inline train net param, possibly combined with one or more test nets.
+  optional NetParameter net_param = 25;
+
+  optional string train_net = 1; // Proto filename for the train net.
+  repeated string test_net = 2; // Proto filenames for the test nets.
+  optional NetParameter train_net_param = 21; // Inline train net params.
+  repeated NetParameter test_net_param = 22; // Inline test net params.
+
+  // The states for the train/test nets. Must be unspecified or
+  // specified once per net.
+  //
+  // By default, all states will have solver = true;
+  // train_state will have phase = TRAIN,
+  // and all test_state's will have phase = TEST.
+  // Other defaults are set according to the NetState defaults.
+  optional NetState train_state = 26;
+  repeated NetState test_state = 27;
+
+  // The number of iterations for each test net.
+  repeated int32 test_iter = 3;
+
+  // The number of iterations between two testing phases.
+  optional int32 test_interval = 4 [default = 0];
+  optional bool test_compute_loss = 19 [default = false];
+  // If true, run an initial test pass before the first iteration,
+  // ensuring memory availability and printing the starting value of the loss.
+  optional bool test_initialization = 32 [default = true];
+  optional float base_lr = 5; // The base learning rate
+  // the number of iterations between displaying info. If display = 0, no info
+  // will be displayed.
+  optional int32 display = 6;
+  // Display the loss averaged over the last average_loss iterations
+  optional int32 average_loss = 33 [default = 1];
+  optional int32 max_iter = 7; // the maximum number of iterations
+  // accumulate gradients over `iter_size` x `batch_size` instances
+  optional int32 iter_size = 36 [default = 1];
+
+  // The learning rate decay policy. The currently implemented learning rate
+  // policies are as follows:
+  //    - fixed: always return base_lr.
+  //    - step: return base_lr * gamma ^ (floor(iter / step))
+  //    - exp: return base_lr * gamma ^ iter
+  //    - inv: return base_lr * (1 + gamma * iter) ^ (- power)
+  //    - multistep: similar to step but it allows non uniform steps defined by
+  //      stepvalue
+  //    - poly: the effective learning rate follows a polynomial decay, to be
+  //      zero by the max_iter. return base_lr (1 - iter/max_iter) ^ (power)
+  //    - sigmoid: the effective learning rate follows a sigmoid decay
+  //      return base_lr ( 1/(1 + exp(-gamma * (iter - stepsize))))
+  //
+  // where base_lr, max_iter, gamma, step, stepvalue and power are defined
+  // in the solver parameter protocol buffer, and iter is the current iteration.
+  optional string lr_policy = 8;
+  optional float gamma = 9; // The parameter to compute the learning rate.
+  optional float power = 10; // The parameter to compute the learning rate.
+  optional float momentum = 11; // The momentum value.
+  optional float weight_decay = 12; // The weight decay.
+  // regularization types supported: L1 and L2
+  // controlled by weight_decay
+  optional string regularization_type = 29 [default = "L2"];
+  // the stepsize for learning rate policy "step"
+  optional int32 stepsize = 13;
+  // the stepsize for learning rate policy "multistep"
+  repeated int32 stepvalue = 34;
+
+  // Set clip_gradients to >= 0 to clip parameter gradients to that L2 norm,
+  // whenever their actual L2 norm is larger.
+  optional float clip_gradients = 35 [default = -1];
+
+  optional int32 snapshot = 14 [default = 0]; // The snapshot interval
+  optional string snapshot_prefix = 15; // The prefix for the snapshot.
+  // whether to snapshot diff in the results or not. Snapshotting diff will help
+  // debugging but the final protocol buffer size will be much larger.
+  optional bool snapshot_diff = 16 [default = false];
+  enum SnapshotFormat {
+    HDF5 = 0;
+    BINARYPROTO = 1;
+  }
+  optional SnapshotFormat snapshot_format = 37 [default = BINARYPROTO];
+  // the mode the solver will use: 0 for CPU and 1 for GPU. Uses GPU by default.
+  enum SolverMode {
+    CPU = 0;
+    GPU = 1;
+  }
+  optional SolverMode solver_mode = 17 [default = GPU];
+  // the device_id that will be used in GPU mode. Uses device_id = 0 by default.
+  optional int32 device_id = 18 [default = 0];
+  // If non-negative, the seed with which the Solver will initialize the Caffe
+  // random number generator -- useful for reproducible results. Otherwise,
+  // (and by default) initialize using a seed derived from the system clock.
+  optional int64 random_seed = 20 [default = -1];
+
+  // type of the solver
+  optional string type = 40 [default = "SGD"];
+
+  // numerical stability for RMSProp, AdaGrad and AdaDelta and Adam
+  optional float delta = 31 [default = 1e-8];
+  // parameters for the Adam solver
+  optional float momentum2 = 39 [default = 0.999];
+
+  // RMSProp decay value
+  // MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t)
+  optional float rms_decay = 38;
+
+  // If true, print information about the state of the net that may help with
+  // debugging learning problems.
+  optional bool debug_info = 23 [default = false];
+
+  // If false, don't save a snapshot after training finishes.
+  optional bool snapshot_after_train = 28 [default = true];
+
+  // DEPRECATED: old solver enum types, use string instead
+  enum SolverType {
+    SGD = 0;
+    NESTEROV = 1;
+    ADAGRAD = 2;
+    RMSPROP = 3;
+    ADADELTA = 4;
+    ADAM = 5;
+  }
+  // DEPRECATED: use type instead of solver_type
+  optional SolverType solver_type = 30 [default = SGD];
+}
+
+// A message that stores the solver snapshots
+message SolverState {
+  optional int32 iter = 1; // The current iteration
+  optional string learned_net = 2; // The file that stores the learned net.
+  repeated BlobProto history = 3; // The history for sgd solvers
+  optional int32 current_step = 4 [default = 0]; // The current step for learning rate
+}
+
+enum Phase {
+   TRAIN = 0;
+   TEST = 1;
+}
+
+message NetState {
+  optional Phase phase = 1 [default = TEST];
+  optional int32 level = 2 [default = 0];
+  repeated string stage = 3;
+}
+
+message NetStateRule {
+  // Set phase to require the NetState have a particular phase (TRAIN or TEST)
+  // to meet this rule.
+  optional Phase phase = 1;
+
+  // Set the minimum and/or maximum levels in which the layer should be used.
+  // Leave undefined to meet the rule regardless of level.
+  optional int32 min_level = 2;
+  optional int32 max_level = 3;
+
+  // Customizable sets of stages to include or exclude.
+  // The net must have ALL of the specified stages and NONE of the specified
+  // "not_stage"s to meet the rule.
+  // (Use multiple NetStateRules to specify conjunctions of stages.)
+  repeated string stage = 4;
+  repeated string not_stage = 5;
+}
+
+// Specifies training parameters (multipliers on global learning constants,
+// and the name and other settings used for weight sharing).
+message ParamSpec {
+  // The names of the parameter blobs -- useful for sharing parameters among
+  // layers, but never required otherwise.  To share a parameter between two
+  // layers, give it a (non-empty) name.
+  optional string name = 1;
+
+  // Whether to require shared weights to have the same shape, or just the same
+  // count -- defaults to STRICT if unspecified.
+  optional DimCheckMode share_mode = 2;
+  enum DimCheckMode {
+    // STRICT (default) requires that num, channels, height, width each match.
+    STRICT = 0;
+    // PERMISSIVE requires only the count (num*channels*height*width) to match.
+    PERMISSIVE = 1;
+  }
+
+  // The multiplier on the global learning rate for this parameter.
+  optional float lr_mult = 3 [default = 1.0];
+
+  // The multiplier on the global weight decay for this parameter.
+  optional float decay_mult = 4 [default = 1.0];
+}
+
+// NOTE
+// Update the next available ID when you add a new LayerParameter field.
+//
+// LayerParameter next available layer-specific ID: 145 (last added: crop_param)
+message LayerParameter {
+  optional string name = 1; // the layer name
+  optional string type = 2; // the layer type
+  repeated string bottom = 3; // the name of each bottom blob
+  repeated string top = 4; // the name of each top blob
+
+  // The train / test phase for computation.
+  optional Phase phase = 10;
+
+  // The amount of weight to assign each top blob in the objective.
+  // Each layer assigns a default value, usually of either 0 or 1,
+  // to each top blob.
+  repeated float loss_weight = 5;
+
+  // Specifies training parameters (multipliers on global learning constants,
+  // and the name and other settings used for weight sharing).
+  repeated ParamSpec param = 6;
+
+  // The blobs containing the numeric parameters of the layer.
+  repeated BlobProto blobs = 7;
+
+  // Specifies whether to backpropagate to each bottom. If unspecified,
+  // Caffe will automatically infer whether each input needs backpropagation
+  // to compute parameter gradients. If set to true for some inputs,
+  // backpropagation to those inputs is forced; if set false for some inputs,
+  // backpropagation to those inputs is skipped.
+  //
+  // The size must be either 0 or equal to the number of bottoms.
+  repeated bool propagate_down = 11;
+
+  // Rules controlling whether and when a layer is included in the network,
+  // based on the current NetState.  You may specify a non-zero number of rules
+  // to include OR exclude, but not both.  If no include or exclude rules are
+  // specified, the layer is always included.  If the current NetState meets
+  // ANY (i.e., one or more) of the specified rules, the layer is
+  // included/excluded.
+  repeated NetStateRule include = 8;
+  repeated NetStateRule exclude = 9;
+
+  // Parameters for data pre-processing.
+  optional TransformationParameter transform_param = 100;
+
+  // Parameters shared by loss layers.
+  optional LossParameter loss_param = 101;
+
+  // Layer type-specific parameters.
+  //
+  // Note: certain layers may have more than one computational engine
+  // for their implementation. These layers include an Engine type and
+  // engine parameter for selecting the implementation.
+  // The default for the engine is set by the ENGINE switch at compile-time.
+  optional AccuracyParameter accuracy_param = 102;
+  optional ArgMaxParameter argmax_param = 103;
+  optional BatchNormParameter batch_norm_param = 139;
+  optional BiasParameter bias_param = 141;
+  optional ConcatParameter concat_param = 104;
+  optional ContrastiveLossParameter contrastive_loss_param = 105;
+  optional ConvolutionParameter convolution_param = 106;
+  optional CropParameter crop_param = 144;
+  optional DataParameter data_param = 107;
+  optional DropoutParameter dropout_param = 108;
+  optional DummyDataParameter dummy_data_param = 109;
+  optional EltwiseParameter eltwise_param = 110;
+  optional ELUParameter elu_param = 140;
+  optional EmbedParameter embed_param = 137;
+  optional ExpParameter exp_param = 111;
+  optional FlattenParameter flatten_param = 135;
+  optional HDF5DataParameter hdf5_data_param = 112;
+  optional HDF5OutputParameter hdf5_output_param = 113;
+  optional HingeLossParameter hinge_loss_param = 114;
+  optional ImageDataParameter image_data_param = 115;
+  optional InfogainLossParameter infogain_loss_param = 116;
+  optional InnerProductParameter inner_product_param = 117;
+  optional InputParameter input_param = 143;
+  optional LogParameter log_param = 134;
+  optional LRNParameter lrn_param = 118;
+  optional MemoryDataParameter memory_data_param = 119;
+  optional MVNParameter mvn_param = 120;
+  optional PoolingParameter pooling_param = 121;
+  optional PowerParameter power_param = 122;
+  optional PReLUParameter prelu_param = 131;
+  optional PythonParameter python_param = 130;
+  optional ReductionParameter reduction_param = 136;
+  optional ReLUParameter relu_param = 123;
+  optional ReshapeParameter reshape_param = 133;
+  optional ROIPoolingParameter roi_pooling_param = 8266711;
+  optional ScaleParameter scale_param = 142;
+  optional SigmoidParameter sigmoid_param = 124;
+  optional SmoothL1LossParameter smooth_l1_loss_param = 8266712;
+  optional SoftmaxParameter softmax_param = 125;
+  optional SPPParameter spp_param = 132;
+  optional SliceParameter slice_param = 126;
+  optional TanHParameter tanh_param = 127;
+  optional ThresholdParameter threshold_param = 128;
+  optional TileParameter tile_param = 138;
+  optional WindowDataParameter window_data_param = 129;
+}
+
+// Message that stores parameters used to apply transformation
+// to the data layer's data
+message TransformationParameter {
+  // For data pre-processing, we can do simple scaling and subtracting the
+  // data mean, if provided. Note that the mean subtraction is always carried
+  // out before scaling.
+  optional float scale = 1 [default = 1];
+  // Specify if we want to randomly mirror data.
+  optional bool mirror = 2 [default = false];
+  // Specify if we would like to randomly crop an image.
+  optional uint32 crop_size = 3 [default = 0];
+  // mean_file and mean_value cannot be specified at the same time
+  optional string mean_file = 4;
+  // if specified can be repeated once (would subtract it from all the channels)
+  // or can be repeated the same number of times as channels
+  // (would subtract them from the corresponding channel)
+  repeated float mean_value = 5;
+  // Force the decoded image to have 3 color channels.
+  optional bool force_color = 6 [default = false];
+  // Force the decoded image to have 1 color channel.
+  optional bool force_gray = 7 [default = false];
+}
+
+// Message that stores parameters shared by loss layers
+message LossParameter {
+  // If specified, ignore instances with the given label.
+  optional int32 ignore_label = 1;
+  // How to normalize the loss for loss layers that aggregate across batches,
+  // spatial dimensions, or other dimensions.  Currently only implemented in
+  // SoftmaxWithLoss layer.
+  enum NormalizationMode {
+    // Divide by the number of examples in the batch times spatial dimensions.
+    // Outputs that receive the ignore label will NOT be ignored in computing
+    // the normalization factor.
+    FULL = 0;
+    // Divide by the total number of output locations that do not take the
+    // ignore_label.  If ignore_label is not set, this behaves like FULL.
+    VALID = 1;
+    // Divide by the batch size.
+    BATCH_SIZE = 2;
+    // Do not normalize the loss.
+    NONE = 3;
+  }
+  optional NormalizationMode normalization = 3 [default = VALID];
+  // Deprecated.  Ignored if normalization is specified.  If normalization
+  // is not specified, then setting this to false will be equivalent to
+  // normalization = BATCH_SIZE to be consistent with previous behavior.
+  optional bool normalize = 2;
+}
+
+// Messages that store parameters used by individual layer types follow, in
+// alphabetical order.
+
+message AccuracyParameter {
+  // When computing accuracy, count as correct by comparing the true label to
+  // the top k scoring classes.  By default, only compare to the top scoring
+  // class (i.e. argmax).
+  optional uint32 top_k = 1 [default = 1];
+
+  // The "label" axis of the prediction blob, whose argmax corresponds to the
+  // predicted label -- may be negative to index from the end (e.g., -1 for the
+  // last axis).  For example, if axis == 1 and the predictions are
+  // (N x C x H x W), the label blob is expected to contain N*H*W ground truth
+  // labels with integer values in {0, 1, ..., C-1}.
+  optional int32 axis = 2 [default = 1];
+
+  // If specified, ignore instances with the given label.
+  optional int32 ignore_label = 3;
+}
+
+message ArgMaxParameter {
+  // If true produce pairs (argmax, maxval)
+  optional bool out_max_val = 1 [default = false];
+  optional uint32 top_k = 2 [default = 1];
+  // The axis along which to maximise -- may be negative to index from the
+  // end (e.g., -1 for the last axis).
+  // By default ArgMaxLayer maximizes over the flattened trailing dimensions
+  // for each index of the first / num dimension.
+  optional int32 axis = 3;
+}
+
+message ConcatParameter {
+  // The axis along which to concatenate -- may be negative to index from the
+  // end (e.g., -1 for the last axis).  Other axes must have the
+  // same dimension for all the bottom blobs.
+  // By default, ConcatLayer concatenates blobs along the "channels" axis (1).
+  optional int32 axis = 2 [default = 1];
+
+  // DEPRECATED: alias for "axis" -- does not support negative indexing.
+  optional uint32 concat_dim = 1 [default = 1];
+}
+
+message BatchNormParameter {
+  // If false, accumulate global mean/variance values via a moving average. If
+  // true, use those accumulated values instead of computing mean/variance
+  // across the batch.
+  optional bool use_global_stats = 1;
+  // How much does the moving average decay each iteration?
+  optional float moving_average_fraction = 2 [default = .999];
+  // Small value to add to the variance estimate so that we don't divide by
+  // zero.
+  optional float eps = 3 [default = 1e-5];
+}
+
+message BiasParameter {
+  // The first axis of bottom[0] (the first input Blob) along which to apply
+  // bottom[1] (the second input Blob).  May be negative to index from the end
+  // (e.g., -1 for the last axis).
+  //
+  // For example, if bottom[0] is 4D with shape 100x3x40x60, the output
+  // top[0] will have the same shape, and bottom[1] may have any of the
+  // following shapes (for the given value of axis):
+  //    (axis == 0 == -4) 100; 100x3; 100x3x40; 100x3x40x60
+  //    (axis == 1 == -3)          3;     3x40;     3x40x60
+  //    (axis == 2 == -2)                   40;       40x60
+  //    (axis == 3 == -1)                                60
+  // Furthermore, bottom[1] may have the empty shape (regardless of the value of
+  // "axis") -- a scalar bias.
+  optional int32 axis = 1 [default = 1];
+
+  // (num_axes is ignored unless just one bottom is given and the bias is
+  // a learned parameter of the layer.  Otherwise, num_axes is determined by the
+  // number of axes by the second bottom.)
+  // The number of axes of the input (bottom[0]) covered by the bias
+  // parameter, or -1 to cover all axes of bottom[0] starting from `axis`.
+  // Set num_axes := 0, to add a zero-axis Blob: a scalar.
+  optional int32 num_axes = 2 [default = 1];
+
+  // (filler is ignored unless just one bottom is given and the bias is
+  // a learned parameter of the layer.)
+  // The initialization for the learned bias parameter.
+  // Default is the zero (0) initialization, resulting in the BiasLayer
+  // initially performing the identity operation.
+  optional FillerParameter filler = 3;
+}
+
+message ContrastiveLossParameter { // Options for the contrastive loss (Hadsell et al. 2006, cited below).
+  // Margin for dissimilar pairs.
+  optional float margin = 1 [default = 1.0];
+  // The first implementation of this cost did not exactly match the cost of
+  // Hadsell et al 2006 -- using (margin - d^2) instead of (margin - d)^2.
+  // legacy_version = false (the default) uses (margin - d)^2 as proposed in the
+  // Hadsell paper. New models should probably use this version.
+  // legacy_version = true uses (margin - d^2). This is kept to support /
+  // reproduce existing models and results.
+  optional bool legacy_version = 2 [default = false];
+}
+
+message ConvolutionParameter { // Options for a convolution layer: output count, geometry, grouping, fillers, engine.
+  optional uint32 num_output = 1; // The number of outputs for the layer
+  optional bool bias_term = 2 [default = true]; // whether to have bias terms
+
+  // Pad, kernel size, and stride are all given as a single value for equal
+  // dimensions in all spatial dimensions, or once per spatial dimension.
+  repeated uint32 pad = 3; // The padding size; defaults to 0
+  repeated uint32 kernel_size = 4; // The kernel size
+  repeated uint32 stride = 6; // The stride; defaults to 1
+  // Factor used to dilate the kernel, (implicitly) zero-filling the resulting
+  // holes. (Kernel dilation is sometimes referred to by its use in the
+  // algorithme à trous from Holschneider et al. 1987.)
+  repeated uint32 dilation = 18; // The dilation; defaults to 1
+
+  // For 2D convolution only, the *_h and *_w versions may also be used to
+  // specify both spatial dimensions.
+  optional uint32 pad_h = 9 [default = 0]; // The padding height (2D only)
+  optional uint32 pad_w = 10 [default = 0]; // The padding width (2D only)
+  optional uint32 kernel_h = 11; // The kernel height (2D only)
+  optional uint32 kernel_w = 12; // The kernel width (2D only)
+  optional uint32 stride_h = 13; // The stride height (2D only)
+  optional uint32 stride_w = 14; // The stride width (2D only)
+
+  optional uint32 group = 5 [default = 1]; // The group size for group conv
+
+  optional FillerParameter weight_filler = 7; // The filler for the weight
+  optional FillerParameter bias_filler = 8; // The filler for the bias
+  enum Engine { // implementation choices selectable through `engine`
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 15 [default = DEFAULT]; // DEFAULT unless set explicitly
+
+  // The axis to interpret as "channels" when performing convolution.
+  // Preceding dimensions are treated as independent inputs;
+  // succeeding dimensions are treated as "spatial".
+  // With (N, C, H, W) inputs, and axis == 1 (the default), we perform
+  // N independent 2D convolutions, sliding C-channel (or (C/g)-channels, for
+  // groups g>1) filters across the spatial axes (H, W) of the input.
+  // With (N, C, D, H, W) inputs, and axis == 1, we perform
+  // N independent 3D convolutions, sliding (C/g)-channels
+  // filters across the spatial axes (D, H, W) of the input.
+  optional int32 axis = 16 [default = 1];
+
+  // Whether to force use of the general ND convolution, even if a specific
+  // implementation for blobs of the appropriate number of spatial dimensions
+  // is available. (Currently, there is only a 2D-specific convolution
+  // implementation; for input blobs with num_axes != 2, this option is
+  // ignored and the ND implementation will be used.)
+  optional bool force_nd_im2col = 17 [default = false];
+}
+
+message CropParameter { // Options for cropping the first bottom to the size of a reference bottom.
+  // To crop, elements of the first bottom are selected to fit the dimensions
+  // of the second, reference bottom. The crop is configured by
+  // - the crop `axis` to pick the dimensions for cropping
+  // - the crop `offset` to set the shift for all/each dimension
+  // to align the cropped bottom with the reference bottom.
+  // All dimensions up to but excluding `axis` are preserved, while
+  // the dimensions including and trailing `axis` are cropped.
+  // If only one `offset` is set, then all dimensions are offset by this amount.
+  // Otherwise, the number of offsets must equal the number of cropped axes to
+  // shift the crop in each dimension accordingly.
+  // Note: standard dimensions are N,C,H,W so the default is a spatial crop,
+  // and `axis` may be negative to index from the end (e.g., -1 for the last
+  // axis).
+  optional int32 axis = 1 [default = 2]; // first cropped axis; default 2
+  repeated uint32 offset = 2; // one shared offset, or one per cropped axis
+}
+
+message DataParameter { // Options for a database-backed data layer.
+  enum DB { // database formats selectable through `backend`
+    LEVELDB = 0;
+    LMDB = 1;
+  }
+  // Specify the data source.
+  optional string source = 1;
+  // Specify the batch size.
+  optional uint32 batch_size = 4;
+  // The rand_skip variable is for the data layer to skip a few data points
+  // so that asynchronous sgd clients do not all start at the same point. The skip
+  // point would be set as rand_skip * rand(0,1). Note that rand_skip should not
+  // be larger than the number of keys in the database.
+  // DEPRECATED. Each solver accesses a different subset of the database.
+  optional uint32 rand_skip = 7 [default = 0];
+  optional DB backend = 8 [default = LEVELDB]; // database format; LEVELDB unless set
+  // DEPRECATED. See TransformationParameter. For data pre-processing, we can do
+  // simple scaling and subtracting the data mean, if provided. Note that the
+  // mean subtraction is always carried out before scaling.
+  optional float scale = 2 [default = 1];
+  optional string mean_file = 3; // NOTE(review): presumably the file holding the data mean mentioned above -- confirm
+  // DEPRECATED. See TransformationParameter. Specify if we would like to randomly
+  // crop an image.
+  optional uint32 crop_size = 5 [default = 0];
+  // DEPRECATED. See TransformationParameter. Specify if we want to randomly mirror
+  // data.
+  optional bool mirror = 6 [default = false];
+  // Force the encoded image to have 3 color channels
+  optional bool force_encoded_color = 9 [default = false];
+  // Prefetch queue (Number of batches to prefetch to host memory, increase if
+  // data access bandwidth varies).
+  optional uint32 prefetch = 10 [default = 4];
+}
+
+message DropoutParameter { // Options for a dropout layer.
+  optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio; default 0.5
+  optional bool scale_train = 2 [default = true];  // scale train or test phase (NOTE(review): presumably true = scale while training -- confirm)
+}
+
+// DummyDataLayer fills any number of arbitrarily shaped blobs with random
+// (or constant) data generated by "Fillers" (see "message FillerParameter").
+message DummyDataParameter {
+  // This layer produces N >= 1 top blobs.  DummyDataParameter must specify 1 or N
+  // shape fields, and 0, 1 or N data_fillers.
+  //
+  // If 0 data_fillers are specified, ConstantFiller with a value of 0 is used.
+  // If 1 data_filler is specified, it is applied to all top blobs.  If N are
+  // specified, the ith is applied to the ith top blob.
+  repeated FillerParameter data_filler = 1; // 0, 1 or N entries, per the rule above
+  repeated BlobShape shape = 6; // 1 or N entries, per the rule above
+
+  // 4D dimensions -- deprecated.  Use "shape" instead.
+  repeated uint32 num = 2;
+  repeated uint32 channels = 3;
+  repeated uint32 height = 4;
+  repeated uint32 width = 5;
+}
+
+message EltwiseParameter { // Options for an element-wise layer combining several blobs.
+  enum EltwiseOp { // operations selectable through `operation`
+    PROD = 0;
+    SUM = 1;
+    MAX = 2;
+  }
+  optional EltwiseOp operation = 1 [default = SUM]; // element-wise operation; SUM unless set
+  repeated float coeff = 2; // blob-wise coefficient for SUM operation
+
+  // Whether to use an asymptotically slower (for >2 inputs) but stabler method
+  // of computing the gradient for the PROD operation. (No effect for SUM op.)
+  optional bool stable_prod_grad = 3 [default = true];
+}
+
+// Message that stores parameters used by ELULayer
+message ELUParameter {
+  // Described in:
+  // Clevert, D.-A., Unterthiner, T., & Hochreiter, S. (2015). Fast and Accurate
+  // Deep Network Learning by Exponential Linear Units (ELUs). arXiv
+  optional float alpha = 1 [default = 1]; // the ELU `alpha` value (see the paper cited above); default 1
+}
+
+// Message that stores parameters used by EmbedLayer
+message EmbedParameter {
+  optional uint32 num_output = 1; // The number of outputs for the layer
+  // The input is given as integers to be interpreted as one-hot
+  // vector indices with dimension input_dim.  Hence input_dim should be
+  // 1 greater than the maximum possible input value.
+  optional uint32 input_dim = 2;
+
+  optional bool bias_term = 3 [default = true]; // Whether to use a bias term
+  optional FillerParameter weight_filler = 4; // The filler for the weight
+  optional FillerParameter bias_filler = 5; // The filler for the bias
+
+}
+
+// Message that stores parameters used by ExpLayer
+message ExpParameter {
+  // ExpLayer computes outputs y = base ^ (shift + scale * x), for base > 0.
+  // Or if base is set to the default (-1), base is set to e,
+  // so y = exp(shift + scale * x).
+  optional float base = 1 [default = -1.0]; // -1 selects base e
+  optional float scale = 2 [default = 1.0]; // multiplies x inside the exponent
+  optional float shift = 3 [default = 0.0]; // added inside the exponent
+}
+
+// Message that stores parameters used by FlattenLayer
+message FlattenParameter {
+  // The first axis to flatten: all preceding axes are retained in the output.
+  // May be negative to index from the end (e.g., -1 for the last axis).
+  optional int32 axis = 1 [default = 1];
+
+  // The last axis to flatten: all following axes are retained in the output.
+  // May be negative to index from the end (e.g., the default -1 for the last
+  // axis).
+  optional int32 end_axis = 2 [default = -1];
+}
+
+// Message that stores parameters used by HDF5DataLayer
+message HDF5DataParameter {
+  // Specify the data source.
+  optional string source = 1;
+  // Specify the batch size.
+  optional uint32 batch_size = 2;
+
+  // Specify whether to shuffle the data.
+  // If shuffle == true, the ordering of the HDF5 files is shuffled,
+  // and the ordering of data within any given HDF5 file is shuffled,
+  // but data between different files are not interleaved; all of a file's
+  // data are output (in a random order) before moving on to another file.
+  optional bool shuffle = 3 [default = false];
+}
+
+message HDF5OutputParameter { // Options for an HDF5 output layer.
+  optional string file_name = 1; // NOTE(review): presumably the path of the HDF5 file to write -- confirm
+}
+
+message HingeLossParameter { // Options for the hinge loss.
+  enum Norm { // note: values start at 1, there is no 0 entry
+    L1 = 1;
+    L2 = 2;
+  }
+  // Specify the norm to use: L1 or L2.
+  optional Norm norm = 1 [default = L1];
+}
+
+message ImageDataParameter { // Options for a data layer that reads image files.
+  // Specify the data source.
+  optional string source = 1;
+  // Specify the batch size.
+  optional uint32 batch_size = 4 [default = 1];
+  // The rand_skip variable is for the data layer to skip a few data points
+  // so that asynchronous sgd clients do not all start at the same point. The skip
+  // point would be set as rand_skip * rand(0,1). Note that rand_skip should not
+  // be larger than the number of keys in the database.
+  optional uint32 rand_skip = 7 [default = 0];
+  // Whether or not ImageLayer should shuffle the list of files at every epoch.
+  optional bool shuffle = 8 [default = false];
+  // It will also resize images if new_height or new_width are not zero.
+  optional uint32 new_height = 9 [default = 0];
+  optional uint32 new_width = 10 [default = 0]; // see the resize note above new_height
+  // Specify if the images are color or gray
+  optional bool is_color = 11 [default = true];
+  // DEPRECATED. See TransformationParameter. For data pre-processing, we can do
+  // simple scaling and subtracting the data mean, if provided. Note that the
+  // mean subtraction is always carried out before scaling.
+  optional float scale = 2 [default = 1];
+  optional string mean_file = 3; // NOTE(review): presumably the file holding the data mean mentioned above -- confirm
+  // DEPRECATED. See TransformationParameter. Specify if we would like to randomly
+  // crop an image.
+  optional uint32 crop_size = 5 [default = 0];
+  // DEPRECATED. See TransformationParameter. Specify if we want to randomly mirror
+  // data.
+  optional bool mirror = 6 [default = false];
+  optional string root_folder = 12 [default = ""]; // NOTE(review): presumably a prefix used to locate images, as in WindowDataParameter -- confirm
+}
+
+message InfogainLossParameter { // Options for the infogain loss.
+  // Specify the infogain matrix source.
+  optional string source = 1;
+}
+
+message InnerProductParameter { // Options for an inner-product layer.
+  optional uint32 num_output = 1; // The number of outputs for the layer
+  optional bool bias_term = 2 [default = true]; // whether to have bias terms
+  optional FillerParameter weight_filler = 3; // The filler for the weight
+  optional FillerParameter bias_filler = 4; // The filler for the bias
+
+  // The first axis to be lumped into a single inner product computation;
+  // all preceding axes are retained in the output.
+  // May be negative to index from the end (e.g., -1 for the last axis).
+  optional int32 axis = 5 [default = 1];
+  // Specify whether to transpose the weight matrix or not.
+  // If transpose == true, any operations will be performed on the transpose
+  // of the weight matrix. The weight matrix itself is not going to be transposed
+  // but rather the transfer flag of operations will be toggled accordingly.
+  optional bool transpose = 6 [default = false];
+}
+
+message InputParameter { // Options for an input layer whose top blobs are assigned manually.
+  // This layer produces N >= 1 top blob(s) to be assigned manually.
+  // Define N shapes to set a shape for each top.
+  // Define 1 shape to set the same shape for every top.
+  // Define no shape to defer to reshaping manually.
+  repeated BlobShape shape = 1;
+}
+
+// Message that stores parameters used by LogLayer
+message LogParameter {
+  // LogLayer computes outputs y = log_base(shift + scale * x), for base > 0.
+  // Or if base is set to the default (-1), base is set to e,
+  // so y = ln(shift + scale * x) = log_e(shift + scale * x)
+  optional float base = 1 [default = -1.0]; // -1 selects base e
+  optional float scale = 2 [default = 1.0]; // multiplies x inside the logarithm
+  optional float shift = 3 [default = 0.0]; // added inside the logarithm
+}
+
+// Message that stores parameters used by LRNLayer
+message LRNParameter {
+  optional uint32 local_size = 1 [default = 5]; // NOTE(review): presumably the size of the normalization window -- confirm
+  optional float alpha = 2 [default = 1.]; // default 1
+  optional float beta = 3 [default = 0.75]; // default 0.75
+  enum NormRegion { // regions selectable through `norm_region`
+    ACROSS_CHANNELS = 0;
+    WITHIN_CHANNEL = 1;
+  }
+  optional NormRegion norm_region = 4 [default = ACROSS_CHANNELS]; // ACROSS_CHANNELS unless set
+  optional float k = 5 [default = 1.]; // default 1
+  enum Engine { // implementation choices selectable through `engine`
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 6 [default = DEFAULT]; // DEFAULT unless set explicitly
+}
+
+message MemoryDataParameter { // NOTE(review): presumably the shape of the in-memory data fed to the layer -- confirm
+  optional uint32 batch_size = 1; // no default declared
+  optional uint32 channels = 2; // no default declared
+  optional uint32 height = 3; // no default declared
+  optional uint32 width = 4; // no default declared
+}
+
+message MVNParameter { // Options for mean (and optionally variance) normalization.
+  // This parameter can be set to false to normalize mean only
+  optional bool normalize_variance = 1 [default = true];
+
+  // This parameter can be set to true to perform DNN-like MVN
+  optional bool across_channels = 2 [default = false];
+
+  // Epsilon for not dividing by zero while normalizing variance
+  optional float eps = 3 [default = 1e-9];
+}
+
+message PoolingParameter { // Options for a 2D pooling layer.
+  enum PoolMethod { // pooling methods selectable through `pool`
+    MAX = 0;
+    AVE = 1;
+    STOCHASTIC = 2;
+  }
+  optional PoolMethod pool = 1 [default = MAX]; // The pooling method
+  // Pad, kernel size, and stride are all given as a single value for equal
+  // dimensions in height and width or as Y, X pairs.
+  optional uint32 pad = 4 [default = 0]; // The padding size (equal in Y, X)
+  optional uint32 pad_h = 9 [default = 0]; // The padding height
+  optional uint32 pad_w = 10 [default = 0]; // The padding width
+  optional uint32 kernel_size = 2; // The kernel size (square)
+  optional uint32 kernel_h = 5; // The kernel height
+  optional uint32 kernel_w = 6; // The kernel width
+  optional uint32 stride = 3 [default = 1]; // The stride (equal in Y, X)
+  optional uint32 stride_h = 7; // The stride height
+  optional uint32 stride_w = 8; // The stride width
+  enum Engine { // implementation choices selectable through `engine`
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 11 [default = DEFAULT]; // DEFAULT unless set explicitly
+  // If global_pooling then it will pool over the size of the bottom by doing
+  // kernel_h = bottom->height and kernel_w = bottom->width
+  optional bool global_pooling = 12 [default = false];
+}
+
+message PowerParameter { // Options for PowerLayer.
+  // PowerLayer computes outputs y = (shift + scale * x) ^ power.
+  optional float power = 1 [default = 1.0]; // exponent in the formula above
+  optional float scale = 2 [default = 1.0]; // multiplies x before the power is taken
+  optional float shift = 3 [default = 0.0]; // added before the power is taken
+}
+
+message PythonParameter { // Options for PythonLayer.
+  optional string module = 1; // NOTE(review): presumably the Python module that defines the layer -- confirm
+  optional string layer = 2; // NOTE(review): presumably the layer class name inside that module -- confirm
+  // This value is set to the attribute `param_str` of the `PythonLayer` object
+  // in Python before calling the `setup()` method. This could be a number,
+  // string, dictionary in Python dict format, JSON, etc. You may parse this
+  // string in `setup` method and use it in `forward` and `backward`.
+  optional string param_str = 3 [default = ''];
+  // Whether this PythonLayer is shared among worker solvers during data parallelism.
+  // If true, each worker solver sequentially runs forward from this layer.
+  // This value should be set true if you are using it as a data layer.
+  optional bool share_in_parallel = 4 [default = false];
+}
+
+// Message that stores parameters used by ReductionLayer
+message ReductionParameter {
+  enum ReductionOp { // note: values start at 1, there is no 0 entry
+    SUM = 1;
+    ASUM = 2; // NOTE(review): presumably sum of absolute values -- confirm
+    SUMSQ = 3; // NOTE(review): presumably sum of squares -- confirm
+    MEAN = 4;
+  }
+
+  optional ReductionOp operation = 1 [default = SUM]; // reduction operation
+
+  // The first axis to reduce to a scalar -- may be negative to index from the
+  // end (e.g., -1 for the last axis).
+  // (Currently, only reduction along ALL "tail" axes is supported; reduction
+  // of axis M through N, where N < num_axes - 1, is unsupported.)
+  // Suppose we have an n-axis bottom Blob with shape:
+  //     (d0, d1, d2, ..., d(m-1), dm, d(m+1), ..., d(n-1)).
+  // If axis == m, the output Blob will have shape
+  //     (d0, d1, d2, ..., d(m-1)),
+  // and the ReductionOp operation is performed (d0 * d1 * d2 * ... * d(m-1))
+  // times, each including (dm * d(m+1) * ... * d(n-1)) individual data.
+  // If axis == 0 (the default), the output Blob always has the empty shape
+  // (count 1), performing reduction across the entire input --
+  // often useful for creating new loss functions.
+  optional int32 axis = 2 [default = 0];
+
+  optional float coeff = 3 [default = 1.0]; // coefficient for output
+}
+
+// Message that stores parameters used by ReLULayer
+message ReLUParameter {
+  // Allow non-zero slope for negative inputs to speed up optimization
+  // Described in:
+  // Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). Rectifier nonlinearities
+  // improve neural network acoustic models. In ICML Workshop on Deep Learning
+  // for Audio, Speech, and Language Processing.
+  optional float negative_slope = 1 [default = 0];
+  enum Engine { // implementation choices selectable through `engine`
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 2 [default = DEFAULT]; // DEFAULT unless set explicitly
+}
+
+message ReshapeParameter { // Options for ReshapeLayer.
+  // Specify the output dimensions. If some of the dimensions are set to 0,
+  // the corresponding dimension from the bottom layer is used (unchanged).
+  // Exactly one dimension may be set to -1, in which case its value is
+  // inferred from the count of the bottom blob and the remaining dimensions.
+  // For example, suppose we want to reshape a 2D blob "input" with shape 2 x 8:
+  //
+  //   layer {
+  //     type: "Reshape" bottom: "input" top: "output"
+  //     reshape_param { ... }
+  //   }
+  //
+  // If "input" is 2D with shape 2 x 8, then the following reshape_param
+  // specifications are all equivalent, producing a 3D blob "output" with shape
+  // 2 x 2 x 4:
+  //
+  //   reshape_param { shape { dim:  2  dim: 2  dim:  4 } }
+  //   reshape_param { shape { dim:  0  dim: 2  dim:  4 } }
+  //   reshape_param { shape { dim:  0  dim: 2  dim: -1 } }
+  //   reshape_param { shape { dim: -1  dim: 0  dim:  2 } }
+  //
+  optional BlobShape shape = 1;
+
+  // axis and num_axes control the portion of the bottom blob's shape that is
+  // replaced by (included in) the reshape. By default (axis == 0 and
+  // num_axes == -1), the entire bottom blob shape is included in the reshape,
+  // and hence the shape field must specify the entire output shape.
+  //
+  // axis may be non-zero to retain some portion of the beginning of the input
+  // shape (and may be negative to index from the end; e.g., -1 to begin the
+  // reshape after the last axis, including nothing in the reshape,
+  // -2 to include only the last axis, etc.).
+  //
+  // For example, suppose "input" is a 2D blob with shape 2 x 8.
+  // Then the following ReshapeLayer specifications are all equivalent,
+  // producing a blob "output" with shape 2 x 2 x 4:
+  //
+  //   reshape_param { shape { dim: 2  dim: 2  dim: 4 } }
+  //   reshape_param { shape { dim: 2  dim: 4 } axis:  1 }
+  //   reshape_param { shape { dim: 2  dim: 4 } axis: -3 }
+  // NOTE(review): by the rule above, axis: 1 on a 2D blob matches axis: -2, so the -3 looks off by one -- confirm.
+  // num_axes specifies the extent of the reshape.
+  // If num_axes >= 0 (and axis >= 0), the reshape will be performed only on
+  // input axes in the half-open range [axis, axis+num_axes) (see examples below).
+  // num_axes may also be -1, the default, to include all remaining axes
+  // (starting from axis).
+  //
+  // For example, suppose "input" is a 2D blob with shape 2 x 8.
+  // Then the following ReshapeLayer specifications are equivalent,
+  // producing a blob "output" with shape 1 x 2 x 8.
+  //
+  //   reshape_param { shape { dim:  1  dim: 2  dim:  8 } }
+  //   reshape_param { shape { dim:  1  dim: 2  }  num_axes: 1 }
+  //   reshape_param { shape { dim:  1  }  num_axes: 0 }
+  //
+  // On the other hand, these would produce output blob shape 2 x 1 x 8:
+  //
+  //   reshape_param { shape { dim: 2  dim: 1  dim: 8  }  }
+  //   reshape_param { shape { dim: 1 }  axis: 1  num_axes: 0 }
+  //
+  optional int32 axis = 2 [default = 0];
+  optional int32 num_axes = 3 [default = -1];
+}
+
+// Message that stores parameters used by ROIPoolingLayer
+message ROIPoolingParameter {
+  // pooled_h and pooled_w give the height and width of the pooled output;
+  // both default to 0.
+  optional uint32 pooled_h = 1 [default = 0]; // The pooled output height
+  optional uint32 pooled_w = 2 [default = 0]; // The pooled output width
+  // Multiplicative spatial scale factor to translate ROI coords from their
+  // input scale to the scale used when pooling
+  optional float spatial_scale = 3 [default = 1];
+}
+
+message ScaleParameter { // Options for ScaleLayer (the layer named in the filler note below).
+  // The first axis of bottom[0] (the first input Blob) along which to apply
+  // bottom[1] (the second input Blob).  May be negative to index from the end
+  // (e.g., -1 for the last axis).
+  //
+  // For example, if bottom[0] is 4D with shape 100x3x40x60, the output
+  // top[0] will have the same shape, and bottom[1] may have any of the
+  // following shapes (for the given value of axis):
+  //    (axis == 0 == -4) 100; 100x3; 100x3x40; 100x3x40x60
+  //    (axis == 1 == -3)          3;     3x40;     3x40x60
+  //    (axis == 2 == -2)                   40;       40x60
+  //    (axis == 3 == -1)                                60
+  // Furthermore, bottom[1] may have the empty shape (regardless of the value of
+  // "axis") -- a scalar multiplier.
+  optional int32 axis = 1 [default = 1];
+
+  // (num_axes is ignored unless just one bottom is given and the scale is
+  // a learned parameter of the layer.  Otherwise, num_axes is determined by the
+  // number of axes of the second bottom.)
+  // The number of axes of the input (bottom[0]) covered by the scale
+  // parameter, or -1 to cover all axes of bottom[0] starting from `axis`.
+  // Set num_axes := 0, to multiply with a zero-axis Blob: a scalar.
+  optional int32 num_axes = 2 [default = 1];
+
+  // (filler is ignored unless just one bottom is given and the scale is
+  // a learned parameter of the layer.)
+  // The initialization for the learned scale parameter.
+  // Default is the unit (1) initialization, resulting in the ScaleLayer
+  // initially performing the identity operation.
+  optional FillerParameter filler = 3;
+
+  // Whether to also learn a bias (equivalent to a ScaleLayer+BiasLayer, but
+  // may be more efficient).  Initialized with bias_filler (defaults to 0).
+  optional bool bias_term = 4 [default = false];
+  optional FillerParameter bias_filler = 5; // used only when bias_term is set, per the note above
+}
+
+message SigmoidParameter { // Options for a sigmoid layer; only the engine is configurable.
+  enum Engine { // implementation choices selectable through `engine`
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 1 [default = DEFAULT]; // DEFAULT unless set explicitly
+}
+
+message SmoothL1LossParameter { // Options for the smooth-L1 loss defined below.
+  // SmoothL1Loss(x) =
+  //   0.5 * (sigma * x) ** 2    -- if x < 1.0 / sigma / sigma
+  //   |x| - 0.5 / sigma / sigma -- otherwise
+  optional float sigma = 1 [default = 1]; // sigma in the formula above (NOTE(review): the first condition presumably means |x| -- confirm)
+}
+
+message SliceParameter { // Options for SliceLayer.
+  // The axis along which to slice -- may be negative to index from the end
+  // (e.g., -1 for the last axis).
+  // By default, SliceLayer slices blobs along the "channels" axis (1).
+  optional int32 axis = 3 [default = 1];
+  repeated uint32 slice_point = 2; // NOTE(review): presumably the indices along `axis` at which to cut -- confirm
+
+  // DEPRECATED: alias for "axis" -- does not support negative indexing.
+  optional uint32 slice_dim = 1 [default = 1];
+}
+
+// Message that stores parameters used by SoftmaxLayer, SoftmaxWithLossLayer
+message SoftmaxParameter {
+  enum Engine { // implementation choices selectable through `engine`
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 1 [default = DEFAULT]; // DEFAULT unless set explicitly
+
+  // The axis along which to perform the softmax -- may be negative to index
+  // from the end (e.g., -1 for the last axis).
+  // Any other axes will be evaluated as independent softmaxes.
+  optional int32 axis = 2 [default = 1];
+}
+
+message TanHParameter { // Options for a tanh layer; only the engine is configurable.
+  enum Engine { // implementation choices selectable through `engine`
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 1 [default = DEFAULT]; // DEFAULT unless set explicitly
+}
+
+// Message that stores parameters used by TileLayer
+message TileParameter {
+  // The index of the axis to tile; defaults to 1.
+  optional int32 axis = 1 [default = 1];
+
+  // The number of copies (tiles) of the blob to output; no default is declared.
+  optional int32 tiles = 2;
+}
+
+// Message that stores parameters used by ThresholdLayer
+message ThresholdParameter {
+  optional float threshold = 1 [default = 0]; // Strictly positive values (NOTE(review): presumably describes what passes the default threshold of 0 -- confirm)
+}
+
+message WindowDataParameter { // Options for the window data layer (window_data_layer, named below).
+  // Specify the data source.
+  optional string source = 1;
+  // For data pre-processing, we can do simple scaling and subtracting the
+  // data mean, if provided. Note that the mean subtraction is always carried
+  // out before scaling.
+  optional float scale = 2 [default = 1];
+  optional string mean_file = 3; // NOTE(review): presumably the file holding the data mean mentioned above -- confirm
+  // Specify the batch size.
+  optional uint32 batch_size = 4;
+  // Specify if we would like to randomly crop an image.
+  optional uint32 crop_size = 5 [default = 0];
+  // Specify if we want to randomly mirror data.
+  optional bool mirror = 6 [default = false];
+  // Foreground (object) overlap threshold
+  optional float fg_threshold = 7 [default = 0.5];
+  // Background (non-object) overlap threshold
+  optional float bg_threshold = 8 [default = 0.5];
+  // Fraction of batch that should be foreground objects
+  optional float fg_fraction = 9 [default = 0.25];
+  // Amount of contextual padding to add around a window
+  // (used only by the window_data_layer)
+  optional uint32 context_pad = 10 [default = 0];
+  // Mode for cropping out a detection window
+  // warp: cropped window is warped to a fixed size and aspect ratio
+  // square: the tightest square around the window is cropped
+  optional string crop_mode = 11 [default = "warp"];
+  // cache_images: will load all images in memory for faster access
+  optional bool cache_images = 12 [default = false];
+  // append root_folder to locate images
+  optional string root_folder = 13 [default = ""];
+}
+
+message SPPParameter { // NOTE(review): presumably options for a spatial pyramid pooling layer -- confirm
+  enum PoolMethod { // pooling methods selectable through `pool`
+    MAX = 0;
+    AVE = 1;
+    STOCHASTIC = 2;
+  }
+  optional uint32 pyramid_height = 1; // no default declared
+  optional PoolMethod pool = 2 [default = MAX]; // The pooling method
+  enum Engine { // implementation choices selectable through `engine`
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 6 [default = DEFAULT]; // field number jumps from 2 to 6; DEFAULT unless set
+}
+
+// DEPRECATED: use LayerParameter.
+message V1LayerParameter {
+  repeated string bottom = 2; // NOTE(review): presumably the names of the input blobs -- confirm
+  repeated string top = 3; // NOTE(review): presumably the names of the output blobs -- confirm
+  optional string name = 4;
+  repeated NetStateRule include = 32;
+  repeated NetStateRule exclude = 33;
+  enum LayerType { // layer kinds selectable through `type`; numeric values are not in listing order
+    NONE = 0;
+    ABSVAL = 35;
+    ACCURACY = 1;
+    ARGMAX = 30;
+    BNLL = 2;
+    CONCAT = 3;
+    CONTRASTIVE_LOSS = 37;
+    CONVOLUTION = 4;
+    DATA = 5;
+    DECONVOLUTION = 39;
+    DROPOUT = 6;
+    DUMMY_DATA = 32;
+    EUCLIDEAN_LOSS = 7;
+    ELTWISE = 25;
+    EXP = 38;
+    FLATTEN = 8;
+    HDF5_DATA = 9;
+    HDF5_OUTPUT = 10;
+    HINGE_LOSS = 28;
+    IM2COL = 11;
+    IMAGE_DATA = 12;
+    INFOGAIN_LOSS = 13;
+    INNER_PRODUCT = 14;
+    LRN = 15;
+    MEMORY_DATA = 29;
+    MULTINOMIAL_LOGISTIC_LOSS = 16;
+    MVN = 34;
+    POOLING = 17;
+    POWER = 26;
+    RELU = 18;
+    SIGMOID = 19;
+    SIGMOID_CROSS_ENTROPY_LOSS = 27;
+    SILENCE = 36;
+    SOFTMAX = 20;
+    SOFTMAX_LOSS = 21;
+    SPLIT = 22;
+    SLICE = 33;
+    TANH = 23;
+    WINDOW_DATA = 24;
+    THRESHOLD = 31;
+  }
+  optional LayerType type = 5; // one of the LayerType values above; no default declared
+  repeated BlobProto blobs = 6;
+  repeated string param = 1001;
+  repeated DimCheckMode blob_share_mode = 1002; // DimCheckMode is declared just below
+  enum DimCheckMode {
+    STRICT = 0;
+    PERMISSIVE = 1;
+  }
+  repeated float blobs_lr = 7;
+  repeated float weight_decay = 8;
+  repeated float loss_weight = 35;
+  optional AccuracyParameter accuracy_param = 27; // per-layer-type option messages follow
+  optional ArgMaxParameter argmax_param = 23;
+  optional ConcatParameter concat_param = 9;
+  optional ContrastiveLossParameter contrastive_loss_param = 40;
+  optional ConvolutionParameter convolution_param = 10;
+  optional DataParameter data_param = 11;
+  optional DropoutParameter dropout_param = 12;
+  optional DummyDataParameter dummy_data_param = 26;
+  optional EltwiseParameter eltwise_param = 24;
+  optional ExpParameter exp_param = 41;
+  optional HDF5DataParameter hdf5_data_param = 13;
+  optional HDF5OutputParameter hdf5_output_param = 14;
+  optional HingeLossParameter hinge_loss_param = 29;
+  optional ImageDataParameter image_data_param = 15;
+  optional InfogainLossParameter infogain_loss_param = 16;
+  optional InnerProductParameter inner_product_param = 17;
+  optional LRNParameter lrn_param = 18;
+  optional MemoryDataParameter memory_data_param = 22;
+  optional MVNParameter mvn_param = 34;
+  optional PoolingParameter pooling_param = 19;
+  optional PowerParameter power_param = 21;
+  optional ReLUParameter relu_param = 30;
+  optional SigmoidParameter sigmoid_param = 38;
+  optional SoftmaxParameter softmax_param = 39;
+  optional SliceParameter slice_param = 31;
+  optional TanHParameter tanh_param = 37;
+  optional ThresholdParameter threshold_param = 25;
+  optional WindowDataParameter window_data_param = 20;
+  optional TransformationParameter transform_param = 36;
+  optional LossParameter loss_param = 42;
+  optional V0LayerParameter layer = 1; // embedded V0LayerParameter, the older deprecated format
+}
+
+// DEPRECATED: V0LayerParameter is the old way of specifying layer parameters
+// in Caffe.  We keep this message type around for legacy support.
+message V0LayerParameter {
+  optional string name = 1; // the layer name
+  optional string type = 2; // the string to specify the layer type
+
+  // Parameters to specify layers with inner products.
+  optional uint32 num_output = 3; // The number of outputs for the layer
+  optional bool biasterm = 4 [default = true]; // whether to have bias terms
+  optional FillerParameter weight_filler = 5; // The filler for the weight
+  optional FillerParameter bias_filler = 6; // The filler for the bias
+
+  optional uint32 pad = 7 [default = 0]; // The padding size
+  optional uint32 kernelsize = 8; // The kernel size
+  optional uint32 group = 9 [default = 1]; // The group size for group conv
+  optional uint32 stride = 10 [default = 1]; // The stride
+  enum PoolMethod { // pooling methods selectable through `pool`
+    MAX = 0;
+    AVE = 1;
+    STOCHASTIC = 2;
+  }
+  optional PoolMethod pool = 11 [default = MAX]; // The pooling method
+  optional float dropout_ratio = 12 [default = 0.5]; // dropout ratio
+
+  optional uint32 local_size = 13 [default = 5]; // for local response norm
+  optional float alpha = 14 [default = 1.]; // for local response norm
+  optional float beta = 15 [default = 0.75]; // for local response norm
+  optional float k = 22 [default = 1.]; // NOTE(review): presumably also for local response norm (LRNParameter has a matching k) -- confirm
+
+  // For data layers, specify the data source
+  optional string source = 16;
+  // For data pre-processing, we can do simple scaling and subtracting the
+  // data mean, if provided. Note that the mean subtraction is always carried
+  // out before scaling.
+  optional float scale = 17 [default = 1];
+  optional string meanfile = 18; // NOTE(review): presumably the file holding the data mean mentioned above -- confirm
+  // For data layers, specify the batch size.
+  optional uint32 batchsize = 19;
+  // For data layers, specify if we would like to randomly crop an image.
+  optional uint32 cropsize = 20 [default = 0];
+  // For data layers, specify if we want to randomly mirror data.
+  optional bool mirror = 21 [default = false];
+
+  // The blobs containing the numeric parameters of the layer
+  repeated BlobProto blobs = 50;
+  // The ratio that is multiplied on the global learning rate. If you want to
+  // set the learning ratio for one blob, you need to set it for all blobs.
+  repeated float blobs_lr = 51;
+  // The weight decay that is multiplied on the global weight decay.
+  repeated float weight_decay = 52;
+
+  // The rand_skip variable is for the data layer to skip a few data points
+  // so that asynchronous sgd clients do not all start at the same point. The skip
+  // point would be set as rand_skip * rand(0,1). Note that rand_skip should not
+  // be larger than the number of keys in the database.
+  optional uint32 rand_skip = 53 [default = 0];
+
+  // Fields related to detection (det_*)
+  // foreground (object) overlap threshold
+  optional float det_fg_threshold = 54 [default = 0.5];
+  // background (non-object) overlap threshold
+  optional float det_bg_threshold = 55 [default = 0.5];
+  // Fraction of batch that should be foreground objects
+  optional float det_fg_fraction = 56 [default = 0.25];
+
+  // optional bool OBSOLETE_can_clobber = 57 [default = true];
+
+  // Amount of contextual padding to add around a window
+  // (used only by the window_data_layer)
+  optional uint32 det_context_pad = 58 [default = 0];
+
+  // Mode for cropping out a detection window
+  // warp: cropped window is warped to a fixed size and aspect ratio
+  // square: the tightest square around the window is cropped
+  optional string det_crop_mode = 59 [default = "warp"];
+
+  // For ReshapeLayer, one needs to specify the new dimensions.
+  optional int32 new_num = 60 [default = 0];
+  optional int32 new_channels = 61 [default = 0];
+  optional int32 new_height = 62 [default = 0];
+  optional int32 new_width = 63 [default = 0];
+
+  // Whether or not ImageLayer should shuffle the list of files at every epoch.
+  // It will also resize images if new_height or new_width are not zero.
+  optional bool shuffle_images = 64 [default = false];
+
+  // For ConcatLayer, one needs to specify the dimension for concatenation, and
+  // the other dimensions must be the same for all the bottom blobs.
+  // By default it will concatenate blobs along the channels dimension.
+  optional uint32 concat_dim = 65 [default = 1];
+
+  optional HDF5OutputParameter hdf5_output_param = 1001; // nested options message; note the jump to field number 1001
+}
+
+message PReLUParameter {
+  // Parametric ReLU described in K. He et al, Delving Deep into Rectifiers:
+  // Surpassing Human-Level Performance on ImageNet Classification, 2015.
+
+  // Initial value of a_i. Default is a_i=0.25 for all i.
+  optional FillerParameter filler = 1;
+  // Whether or not slope parameters are shared across channels.
+  optional bool channel_shared = 2 [default = false];
+}
diff --git a/tools/caffe2ncnn.cpp b/tools/caffe2ncnn.cpp
new file mode 100644
index 00000000000..3b92dc45edb
--- /dev/null
+++ b/tools/caffe2ncnn.cpp
@@ -0,0 +1,782 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <fstream>
+#include <limits>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/io/zero_copy_stream_impl.h>
+#include <google/protobuf/text_format.h>
+#include <google/protobuf/message.h>
+
+#include "caffe.pb.h"
+
+// Round sz up to a multiple of n. The mask arithmetic only yields a correct
+// multiple when n is a power of two.
+static inline size_t alignSize(size_t sz, int n)
+{
+    // ~(n-1) is the same bit pattern as -n: it clears the low bits of the sum
+    const size_t mask = ~(size_t)(n - 1);
+    return (sz + n - 1) & mask;
+}
+
+// convert float to half precision floating point
+// Convert a 32-bit float to the bit pattern of a half-precision (16-bit) float.
+// Mantissa bits that do not fit are dropped (truncation, no rounding).
+static unsigned short float2half(float value)
+{
+    // view the float as raw bits, layout 1 : 8 : 23
+    union
+    {
+        unsigned int u;
+        float f;
+    } bits;
+
+    bits.f = value;
+
+    const unsigned short sign = (bits.u & 0x80000000) >> 31;
+    const unsigned short exponent = (bits.u & 0x7F800000) >> 23;
+    const unsigned int significand = bits.u & 0x7FFFFF;
+
+    // target layout 1 : 5 : 10
+    const unsigned short signbit = sign << 15;
+    const unsigned short exp_all_ones = 0x1F << 10;
+
+    if (exponent == 0)
+    {
+        // zero or single-precision denormal, always underflows to signed zero
+        return signbit;
+    }
+
+    if (exponent == 0xFF)
+    {
+        // infinity, or NaN when any mantissa bit is set
+        return signbit | exp_all_ones | (significand ? 0x200 : 0x00);
+    }
+
+    // normalized input: rebias the exponent from 127 to 15
+    const short newexp = exponent + (- 127 + 15);
+
+    if (newexp >= 31)
+    {
+        // too large for half precision, becomes infinity
+        return signbit | exp_all_ones;
+    }
+
+    if (newexp > 0)
+    {
+        // representable as a normalized half
+        return signbit | (newexp << 10) | (significand >> 13);
+    }
+
+    if (newexp >= -10)
+    {
+        // half-precision denormal: restore the implicit leading one, then shift it down
+        const unsigned short sig = (significand | 0x800000) >> (14 - newexp);
+        return signbit | sig;
+    }
+
+    // too small even for a denormal, becomes signed zero
+    return signbit;
+}
+
+// Convert data_length floats from data to half precision.
+// float16_weights is resized to data_length and overwritten with the result.
+// Returns the magic tag that marks half-precision weight storage.
+static int quantize_weight(float *data, size_t data_length, std::vector<unsigned short>& float16_weights)
+{
+    float16_weights.resize(data_length);
+
+    const float* src = data;
+    for (std::vector<unsigned short>::iterator dst = float16_weights.begin(); dst != float16_weights.end(); ++dst, ++src)
+    {
+        *dst = float2half(*src);
+    }
+
+    // magic tag for half-precision floating point
+    return 0x01306B47;
+}
+
+// Uniformly quantize data_length floats into a lookup table plus one index per value.
+//
+// quantize_table receives quantize_level entries: min + i * stride, where
+// stride = (max - min) / quantize_level. quantize_index receives, for every input
+// value, the index of the nearest table entry (clamped to the last entry). Both
+// vectors are appended to, so callers should pass them empty. Indices are stored as
+// unsigned char, so levels above 256 cannot be represented faithfully.
+//
+// Returns false (leaving the outputs untouched) when quantize_level is not positive
+// or there are fewer values than levels; returns true otherwise.
+static bool quantize_weight(float *data, size_t data_length, int quantize_level, std::vector<float> &quantize_table, std::vector<unsigned char> &quantize_index) {
+
+    assert(quantize_level != 0);
+    assert(data != NULL);
+    assert(data_length > 0);
+
+    if (quantize_level <= 0 || data_length < (size_t)quantize_level) {
+        fprintf(stderr, "No need quantize,because: data_length < quantize_level");
+        return false;
+    }
+
+    quantize_table.reserve(quantize_level);
+    quantize_index.reserve(data_length);
+
+    // 1. Find min and max value
+    // numeric_limits<float>::min() is the smallest POSITIVE float, so the running
+    // maximum must start from -max() to handle all-negative data.
+    float max_value = -std::numeric_limits<float>::max();
+    float min_value = std::numeric_limits<float>::max();
+
+    for (size_t i = 0; i < data_length; ++i)
+    {
+        if (max_value < data[i]) max_value = data[i];
+        if (min_value > data[i]) min_value = data[i];
+    }
+    const float strides = (max_value - min_value) / quantize_level;
+
+    // 2. Generate quantize table
+    for (int i = 0; i < quantize_level; ++i)
+    {
+        quantize_table.push_back(min_value + i * strides);
+    }
+
+    // 3. Align data to the quantized value
+    const size_t last_index = (size_t)quantize_level - 1;
+    for (size_t i = 0; i < data_length; ++i)
+    {
+        size_t table_index = 0;
+
+        // a zero stride means every value is identical: all map to entry 0
+        // (and dividing by the stride would be a division by zero)
+        if (strides > 0.f)
+        {
+            table_index = (size_t)((data[i] - min_value) / strides);
+            if (table_index > last_index) table_index = last_index;
+
+            const float low_value  = quantize_table[table_index];
+            const float high_value = low_value + strides;
+
+            // pick the nearer of the two neighbouring entries (ties go up); step the
+            // index directly instead of re-deriving it by division, which can
+            // truncate to the bucket below through rounding error
+            if (table_index < last_index && !(data[i] - low_value < high_value - data[i]))
+            {
+                table_index++;
+            }
+        }
+
+        quantize_index.push_back((unsigned char)table_index);
+    }
+
+    return true;
+}
+
+// Parse the protobuf text-format file at filepath into message.
+// Returns false when the file cannot be opened or the parser reports failure.
+static bool read_proto_from_text(const char* filepath, google::protobuf::Message* message)
+{
+    std::ifstream fs(filepath, std::ifstream::in);
+    if (fs.is_open())
+    {
+        google::protobuf::io::IstreamInputStream input(&fs);
+        const bool parsed = google::protobuf::TextFormat::Parse(&input, message);
+
+        fs.close();
+
+        return parsed;
+    }
+
+    fprintf(stderr, "open failed %s\n", filepath);
+    return false;
+}
+
+// Parse the binary protobuf file at filepath into message.
+// Returns false when the file cannot be opened or the parser reports failure.
+static bool read_proto_from_binary(const char* filepath, google::protobuf::Message* message)
+{
+    std::ifstream fs(filepath, std::ifstream::in | std::ifstream::binary);
+    if (fs.is_open())
+    {
+        google::protobuf::io::IstreamInputStream input(&fs);
+        google::protobuf::io::CodedInputStream codedstr(&input);
+
+        // raise the coded stream's byte limit (and its warning threshold) before parsing
+        codedstr.SetTotalBytesLimit(INT_MAX, INT_MAX / 2);
+
+        const bool parsed = message->ParseFromCodedStream(&codedstr);
+
+        fs.close();
+
+        return parsed;
+    }
+
+    fprintf(stderr, "open failed %s\n", filepath);
+    return false;
+}
+
+// Write every blob of binlayer to bp.
+// Blob 0 (the weights) is preceded by an int quantize tag and is stored quantized
+// when quantize_level is 256 (table + index) or 65536 (half precision); quantized
+// payloads are zero-padded to a multiple of 4 bytes. All other blobs, and blob 0
+// when it is not quantized, are written as raw float32.
+static void write_layer_blobs(const caffe::LayerParameter& binlayer, int quantize_level, FILE* bp)
+{
+    for (int j=0; j<binlayer.blobs_size(); j++)
+    {
+        int quantize_tag = 0;
+        const caffe::BlobProto& blob = binlayer.blobs(j);
+
+        std::vector<float> quantize_table;
+        std::vector<unsigned char> quantize_index;
+
+        std::vector<unsigned short> float16_weights;
+
+        // we will not quantize the bias values
+        if (j == 0 && quantize_level != 0)
+        {
+            if (quantize_level == 256)
+            {
+                quantize_tag = quantize_weight((float *)blob.data().data(), blob.data_size(), quantize_level, quantize_table, quantize_index);
+            }
+            else if (quantize_level == 65536)
+            {
+                quantize_tag = quantize_weight((float *)blob.data().data(), blob.data_size(), float16_weights);
+            }
+        }
+
+        // write quantize tag first
+        if (j == 0)
+            fwrite(&quantize_tag, sizeof(int), 1, bp);
+
+        if (quantize_tag)
+        {
+            long p0 = ftell(bp);
+            if (quantize_level == 256)
+            {
+                // write quantize table and index
+                fwrite(quantize_table.data(), sizeof(float), quantize_table.size(), bp);
+                fwrite(quantize_index.data(), sizeof(unsigned char), quantize_index.size(), bp);
+            }
+            else if (quantize_level == 65536)
+            {
+                fwrite(float16_weights.data(), sizeof(unsigned short), float16_weights.size(), bp);
+            }
+            // padding to 32bit align
+            long nwrite = ftell(bp) - p0;
+            long nalign = (long)alignSize(nwrite, 4);
+            unsigned char padding[4] = {0x00, 0x00, 0x00, 0x00};
+            fwrite(padding, sizeof(unsigned char), nalign - nwrite, bp);
+        }
+        else
+        {
+            // write original data
+            fwrite(blob.data().data(), sizeof(float), blob.data_size(), bp);
+        }
+    }
+}
+
+// Print dimensions 1..3 of bs to pp (as many as exist), then one -233 placeholder
+// for every index from bs.dim_size() up to 3.
+static void write_blob_shape(FILE* pp, const caffe::BlobShape& bs)
+{
+    for (int j=1; j<std::min((int)bs.dim_size(), 4); j++)
+    {
+        // dim is a 64-bit field; cast so it matches the %d conversion
+        fprintf(pp, " %d", (int)bs.dim(j));
+    }
+    for (int j=bs.dim_size(); j<4; j++)
+    {
+        fprintf(pp, " -233");
+    }
+}
+
+// If blob_name is still referenced by more than one bottom, emit a Split layer line
+// that fans it out to "<blob_name>_splitncnn_<k>" tops and advance internal_split.
+static void write_split_if_shared(FILE* pp, const std::map<std::string, int>& bottom_reference, const std::string& blob_name, int& internal_split)
+{
+    std::map<std::string, int>::const_iterator ref = bottom_reference.find(blob_name);
+    if (ref == bottom_reference.end() || ref->second <= 1)
+        return;
+
+    const int refcount = ref->second;
+
+    char splitname[256];
+    sprintf(splitname, "splitncnn_%d", internal_split);
+    fprintf(pp, "%-16s %-16s %d %d", "Split", splitname, 1, refcount);
+    fprintf(pp, " %s", blob_name.c_str());
+
+    for (int j=0; j<refcount; j++)
+    {
+        fprintf(pp, " %s_splitncnn_%d", blob_name.c_str(), j);
+    }
+    fprintf(pp, "\n");
+
+    internal_split++;
+}
+
+// Convert a caffe network (text prototxt + binary caffemodel) into an ncnn param
+// text file and an ncnn weight binary.
+//
+// argv: [caffeproto] [caffemodel] [ncnnproto] [ncnnbin] [quantizelevel]
+// The output paths default to "ncnn.proto" / "ncnn.bin"; quantizelevel defaults to 0
+// and must be 0, 256 or 65536. Returns 0 on success and -1 on any error.
+int main(int argc, char** argv)
+{
+    if (!(argc == 3 || argc == 5 || argc == 6))
+    {
+        fprintf(stderr, "Usage: %s [caffeproto] [caffemodel] [ncnnproto] [ncnnbin] [quantizelevel]\n", argv[0]);
+        return -1;
+    }
+
+    const char* caffeproto = argv[1];
+    const char* caffemodel = argv[2];
+    const char* ncnn_prototxt = argc >= 5 ? argv[3] : "ncnn.proto";
+    const char* ncnn_modelbin = argc >= 5 ? argv[4] : "ncnn.bin";
+    const char* quantize_param = argc == 6 ? argv[5] : "0";
+    int quantize_level = atoi(quantize_param);
+
+    if (quantize_level != 0 && quantize_level != 256 && quantize_level != 65536)
+    {
+        fprintf(stderr, "only support quantize level = 0, 256 or 65536\n");
+        return -1;
+    }
+
+    caffe::NetParameter proto;
+    caffe::NetParameter net;
+
+    // load
+    bool s0 = read_proto_from_text(caffeproto, &proto);
+    if (!s0)
+    {
+        fprintf(stderr, "read_proto_from_text failed\n");
+        return -1;
+    }
+
+    bool s1 = read_proto_from_binary(caffemodel, &net);
+    if (!s1)
+    {
+        fprintf(stderr, "read_proto_from_binary failed\n");
+        return -1;
+    }
+
+    FILE* pp = fopen(ncnn_prototxt, "wb");
+    if (!pp)
+    {
+        fprintf(stderr, "open failed %s\n", ncnn_prototxt);
+        return -1;
+    }
+
+    FILE* bp = fopen(ncnn_modelbin, "wb");
+    if (!bp)
+    {
+        fprintf(stderr, "open failed %s\n", ncnn_modelbin);
+        fclose(pp);
+        return -1;
+    }
+
+    // rename mapping for identical bottom top style
+    std::map<std::string, std::string> blob_name_decorated;
+
+    // bottom blob reference
+    std::map<std::string, int> bottom_reference;
+
+    // global definition line
+    // [layer count] [blob count]
+    int layer_count = proto.layer_size();
+    std::set<std::string> blob_names;
+    for (int i=0; i<layer_count; i++)
+    {
+        const caffe::LayerParameter& layer = proto.layer(i);
+
+        for (int j=0; j<layer.bottom_size(); j++)
+        {
+            std::string blob_name = layer.bottom(j);
+            if (blob_name_decorated.find(blob_name) != blob_name_decorated.end())
+            {
+                blob_name = blob_name_decorated[blob_name];
+            }
+
+            blob_names.insert(blob_name);
+
+            if (bottom_reference.find(blob_name) == bottom_reference.end())
+            {
+                bottom_reference[blob_name] = 1;
+            }
+            else
+            {
+                bottom_reference[blob_name] = bottom_reference[blob_name] + 1;
+            }
+        }
+
+        // an in-place layer (bottom == top) gets a fresh, decorated top name
+        if (layer.bottom_size() == 1 && layer.top_size() == 1 && layer.bottom(0) == layer.top(0))
+        {
+            std::string blob_name = layer.top(0) + "_" + layer.name();
+            blob_name_decorated[layer.top(0)] = blob_name;
+            blob_names.insert(blob_name);
+        }
+        else
+        {
+            for (int j=0; j<layer.top_size(); j++)
+            {
+                std::string blob_name = layer.top(j);
+                blob_names.insert(blob_name);
+            }
+        }
+    }
+    // remove bottom_reference entry with reference equals to one
+    int splitncnn_blob_count = 0;
+    std::map<std::string, int>::iterator it = bottom_reference.begin();
+    while (it != bottom_reference.end())
+    {
+        if (it->second == 1)
+        {
+            bottom_reference.erase(it++);
+        }
+        else
+        {
+            splitncnn_blob_count += it->second;
+//             fprintf(stderr, "%s %d\n", it->first.c_str(), it->second);
+            ++it;
+        }
+    }
+    // the container sizes are size_t; cast so the arguments match %d
+    fprintf(pp, "%d %d\n", (int)(layer_count + bottom_reference.size()), (int)(blob_names.size() + splitncnn_blob_count));
+
+    // populate
+    blob_name_decorated.clear();
+    int internal_split = 0;
+    for (int i=0; i<layer_count; i++)
+    {
+        const caffe::LayerParameter& layer = proto.layer(i);
+
+        // layer definition line, repeated
+        // [type] [name] [bottom blob count] [top blob count] [bottom blobs] [top blobs] [layer specific params]
+        fprintf(pp, "%-16s %-16s %d %d", layer.type().c_str(), layer.name().c_str(), layer.bottom_size(), layer.top_size());
+
+        for (int j=0; j<layer.bottom_size(); j++)
+        {
+            std::string blob_name = layer.bottom(j);
+            if (blob_name_decorated.find(layer.bottom(j)) != blob_name_decorated.end())
+            {
+                blob_name = blob_name_decorated[layer.bottom(j)];
+            }
+
+            // a shared blob is consumed through one of its split outputs
+            if (bottom_reference.find(blob_name) != bottom_reference.end())
+            {
+                int refidx = bottom_reference[blob_name] - 1;
+                bottom_reference[blob_name] = refidx;
+
+                char splitsuffix[256];
+                sprintf(splitsuffix, "_splitncnn_%d", refidx);
+                blob_name = blob_name + splitsuffix;
+            }
+
+            fprintf(pp, " %s", blob_name.c_str());
+        }
+
+        // decorated
+        if (layer.bottom_size() == 1 && layer.top_size() == 1 && layer.bottom(0) == layer.top(0))
+        {
+            std::string blob_name = layer.top(0) + "_" + layer.name();
+            blob_name_decorated[layer.top(0)] = blob_name;
+
+            fprintf(pp, " %s", blob_name.c_str());
+        }
+        else
+        {
+            for (int j=0; j<layer.top_size(); j++)
+            {
+                std::string blob_name = layer.top(j);
+                fprintf(pp, " %s", blob_name.c_str());
+            }
+        }
+
+        // find blob binary by layer name
+        int netidx;
+        for (netidx=0; netidx<net.layer_size(); netidx++)
+        {
+            if (net.layer(netidx).name() == layer.name())
+            {
+                break;
+            }
+        }
+
+        // layer types below read their weights from net.layer(netidx); bail out
+        // instead of indexing past the end when the caffemodel lacks the layer
+        const bool needs_weights = layer.type() == "BatchNorm" || layer.type() == "Convolution"
+                                   || layer.type() == "Deconvolution" || layer.type() == "InnerProduct"
+                                   || layer.type() == "PReLU" || layer.type() == "Scale";
+        if (needs_weights && netidx == net.layer_size())
+        {
+            fprintf(stderr, "layer %s not found in %s\n", layer.name().c_str(), caffemodel);
+            fclose(pp);
+            fclose(bp);
+            return -1;
+        }
+
+        // layer specific params
+        if (layer.type() == "BatchNorm")
+        {
+            const caffe::LayerParameter& binlayer = net.layer(netidx);
+
+            const caffe::BlobProto& mean_blob = binlayer.blobs(0);
+            const caffe::BlobProto& var_blob = binlayer.blobs(1);
+            fprintf(pp, " %d", (int)mean_blob.data_size());
+
+            const caffe::BatchNormParameter& batch_norm_param = layer.batch_norm_param();
+            float eps = batch_norm_param.eps();
+
+            std::vector<float> ones(mean_blob.data_size(), 1.f);
+            fwrite(ones.data(), sizeof(float), ones.size(), bp);// slope
+
+            if (binlayer.blobs_size() < 3)
+            {
+                fwrite(mean_blob.data().data(), sizeof(float), mean_blob.data_size(), bp);
+                float tmp;
+                for (int j=0; j<var_blob.data_size(); j++)
+                {
+                    tmp = var_blob.data().data()[j] + eps;
+                    fwrite(&tmp, sizeof(float), 1, bp);
+                }
+            }
+            else
+            {
+                float scale_factor = 1 / binlayer.blobs(2).data().data()[0];
+                // premultiply scale_factor to mean and variance
+                float tmp;
+                for (int j=0; j<mean_blob.data_size(); j++)
+                {
+                    tmp = mean_blob.data().data()[j] * scale_factor;
+                    fwrite(&tmp, sizeof(float), 1, bp);
+                }
+                for (int j=0; j<var_blob.data_size(); j++)
+                {
+                    tmp = var_blob.data().data()[j] * scale_factor + eps;
+                    fwrite(&tmp, sizeof(float), 1, bp);
+                }
+            }
+
+            std::vector<float> zeros(mean_blob.data_size(), 0.f);
+            fwrite(zeros.data(), sizeof(float), zeros.size(), bp);// bias
+        }
+        else if (layer.type() == "Convolution")
+        {
+            const caffe::LayerParameter& binlayer = net.layer(netidx);
+
+            const caffe::BlobProto& weight_blob = binlayer.blobs(0);
+            const caffe::ConvolutionParameter& convolution_param = layer.convolution_param();
+            fprintf(pp, " %d %d %d %d %d %d %d", convolution_param.num_output(), convolution_param.kernel_size(0),
+                    convolution_param.dilation_size() != 0 ? convolution_param.dilation(0) : 1,
+                    convolution_param.stride_size() != 0 ? convolution_param.stride(0) : 1,
+                    convolution_param.pad_size() != 0 ? convolution_param.pad(0) : 0,
+                    convolution_param.bias_term(),
+                    weight_blob.data_size());
+
+            write_layer_blobs(binlayer, quantize_level, bp);
+        }
+        else if (layer.type() == "Crop")
+        {
+            const caffe::CropParameter& crop_param = layer.crop_param();
+            int num_offset = crop_param.offset_size();
+            int woffset = (num_offset == 2) ? crop_param.offset(0) : 0;
+            int hoffset = (num_offset == 2) ? crop_param.offset(1) : 0;
+            fprintf(pp, " %d %d", woffset, hoffset);
+        }
+        else if (layer.type() == "Deconvolution")
+        {
+            const caffe::LayerParameter& binlayer = net.layer(netidx);
+
+            const caffe::BlobProto& weight_blob = binlayer.blobs(0);
+            const caffe::ConvolutionParameter& convolution_param = layer.convolution_param();
+            fprintf(pp, " %d %d %d %d %d %d %d", convolution_param.num_output(), convolution_param.kernel_size(0),
+                    convolution_param.dilation_size() != 0 ? convolution_param.dilation(0) : 1,
+                    convolution_param.stride_size() != 0 ? convolution_param.stride(0) : 1,
+                    convolution_param.pad_size() != 0 ? convolution_param.pad(0) : 0,
+                    convolution_param.bias_term(),
+                    weight_blob.data_size());
+
+            // deconvolution weights are always stored unquantized (tag 0)
+            int quantized_weight = 0;
+            fwrite(&quantized_weight, sizeof(int), 1, bp);
+
+            // reorder weight from inch-outch to outch-inch
+            int ksize = convolution_param.kernel_size(0);
+            int num_output = convolution_param.num_output();
+            int num_input = weight_blob.data_size() / (ksize * ksize) / num_output;
+            const float* weight_data_ptr = weight_blob.data().data();
+            for (int k=0; k<num_output; k++)
+            {
+                for (int j=0; j<num_input; j++)
+                {
+                    fwrite(weight_data_ptr + (j*num_output + k) * ksize * ksize, sizeof(float), ksize * ksize, bp);
+                }
+            }
+
+            for (int j=1; j<binlayer.blobs_size(); j++)
+            {
+                const caffe::BlobProto& blob = binlayer.blobs(j);
+                fwrite(blob.data().data(), sizeof(float), blob.data_size(), bp);
+            }
+        }
+        else if (layer.type() == "Eltwise")
+        {
+            const caffe::EltwiseParameter& eltwise_param = layer.eltwise_param();
+            int coeff_size = eltwise_param.coeff_size();
+            fprintf(pp, " %d %d", (int)eltwise_param.operation(), coeff_size);
+            for (int j=0; j<coeff_size; j++)
+            {
+                fprintf(pp, " %f", eltwise_param.coeff(j));
+            }
+        }
+        else if (layer.type() == "InnerProduct")
+        {
+            const caffe::LayerParameter& binlayer = net.layer(netidx);
+
+            const caffe::BlobProto& weight_blob = binlayer.blobs(0);
+            const caffe::InnerProductParameter& inner_product_param = layer.inner_product_param();
+            fprintf(pp, " %d %d %d", inner_product_param.num_output(), inner_product_param.bias_term(),
+                    weight_blob.data_size());
+
+            write_layer_blobs(binlayer, quantize_level, bp);
+        }
+        else if (layer.type() == "Input")
+        {
+            const caffe::InputParameter& input_param = layer.input_param();
+            write_blob_shape(pp, input_param.shape(0));
+        }
+        else if (layer.type() == "LRN")
+        {
+            const caffe::LRNParameter& lrn_param = layer.lrn_param();
+            fprintf(pp, " %d %d %.8f %.8f", lrn_param.norm_region(), lrn_param.local_size(), lrn_param.alpha(), lrn_param.beta());
+        }
+        else if (layer.type() == "MemoryData")
+        {
+            const caffe::MemoryDataParameter& memory_data_param = layer.memory_data_param();
+            fprintf(pp, " %d %d %d", memory_data_param.channels(), memory_data_param.width(), memory_data_param.height());
+        }
+        else if (layer.type() == "Pooling")
+        {
+            const caffe::PoolingParameter& pooling_param = layer.pooling_param();
+            fprintf(pp, " %d %d %d %d %d", pooling_param.pool(), pooling_param.kernel_size(), pooling_param.stride(), pooling_param.pad(),
+                    pooling_param.has_global_pooling() ? pooling_param.global_pooling() : 0);
+        }
+        else if (layer.type() == "Power")
+        {
+            const caffe::PowerParameter& power_param = layer.power_param();
+            fprintf(pp, " %f %f %f", power_param.power(), power_param.scale(), power_param.shift());
+        }
+        else if (layer.type() == "PReLU")
+        {
+            const caffe::LayerParameter& binlayer = net.layer(netidx);
+            const caffe::BlobProto& slope_blob = binlayer.blobs(0);
+            fprintf(pp, " %d", slope_blob.data_size());
+            fwrite(slope_blob.data().data(), sizeof(float), slope_blob.data_size(), bp);
+        }
+        else if (layer.type() == "Proposal")
+        {
+            const caffe::PythonParameter& python_param = layer.python_param();
+            // only feat_stride is read from the python param string; the rest are fixed
+            int feat_stride = 16;
+            sscanf(python_param.param_str().c_str(), "'feat_stride': %d", &feat_stride);
+            int base_size = 16;
+//             float ratio;
+//             float scale;
+            int pre_nms_topN = 6000;
+            int after_nms_topN = 5;
+            float nms_thresh = 0.7;
+            int min_size = 16;
+            fprintf(pp, " %d %d %d %d %f %d", feat_stride, base_size, pre_nms_topN, after_nms_topN, nms_thresh, min_size);
+        }
+        else if (layer.type() == "ReLU")
+        {
+            const caffe::ReLUParameter& relu_param = layer.relu_param();
+            fprintf(pp, " %f", relu_param.negative_slope());
+        }
+        else if (layer.type() == "Reshape")
+        {
+            const caffe::ReshapeParameter& reshape_param = layer.reshape_param();
+            write_blob_shape(pp, reshape_param.shape());
+        }
+        else if (layer.type() == "ROIPooling")
+        {
+            const caffe::ROIPoolingParameter& roi_pooling_param = layer.roi_pooling_param();
+            fprintf(pp, " %d %d %.8f", roi_pooling_param.pooled_w(), roi_pooling_param.pooled_h(), roi_pooling_param.spatial_scale());
+        }
+        else if (layer.type() == "Scale")
+        {
+            const caffe::LayerParameter& binlayer = net.layer(netidx);
+
+            const caffe::BlobProto& weight_blob = binlayer.blobs(0);
+            const caffe::ScaleParameter& scale_param = layer.scale_param();
+            fprintf(pp, " %d %d", (int)weight_blob.data_size(), scale_param.bias_term());
+
+            for (int j=0; j<binlayer.blobs_size(); j++)
+            {
+                const caffe::BlobProto& blob = binlayer.blobs(j);
+                fwrite(blob.data().data(), sizeof(float), blob.data_size(), bp);
+            }
+        }
+        else if (layer.type() == "Slice")
+        {
+            const caffe::SliceParameter& slice_param = layer.slice_param();
+            if (slice_param.slice_point_size() == 0)
+            {
+                // no explicit slice points: one -233 placeholder per top blob
+                int num_slice = layer.top_size();
+                fprintf(pp, " %d", num_slice);
+                for (int j=0; j<num_slice; j++)
+                {
+                    fprintf(pp, " -233");
+                }
+            }
+            else
+            {
+                // N slice points describe N+1 slices: N explicit sizes, then a
+                // -233 placeholder for the last slice
+                int num_slice = slice_param.slice_point_size() + 1;
+                fprintf(pp, " %d", num_slice);
+                int prev_offset = 0;
+                for (int j=0; j<slice_param.slice_point_size(); j++)
+                {
+                    int offset = slice_param.slice_point(j);
+                    fprintf(pp, " %d", offset - prev_offset);
+                    prev_offset = offset;
+                }
+                fprintf(pp, " -233");
+            }
+        }
+        else if (layer.type() == "Threshold")
+        {
+            const caffe::ThresholdParameter& threshold_param = layer.threshold_param();
+            fprintf(pp, " %f", threshold_param.threshold());
+        }
+
+        fprintf(pp, "\n");
+
+        // add split layer if top reference larger than one
+        if (layer.bottom_size() == 1 && layer.top_size() == 1 && layer.bottom(0) == layer.top(0))
+        {
+            std::string blob_name = blob_name_decorated[layer.top(0)];
+            write_split_if_shared(pp, bottom_reference, blob_name, internal_split);
+        }
+        else
+        {
+            for (int j=0; j<layer.top_size(); j++)
+            {
+                std::string blob_name = layer.top(j);
+                write_split_if_shared(pp, bottom_reference, blob_name, internal_split);
+            }
+        }
+
+    }
+
+    fclose(pp);
+    fclose(bp);
+
+    return 0;
+}
diff --git a/tools/ncnn2mem.cpp b/tools/ncnn2mem.cpp
new file mode 100644
index 00000000000..fbab34e76cb
--- /dev/null
+++ b/tools/ncnn2mem.cpp
@@ -0,0 +1,318 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+#include <string>
+#include <vector>
+#include "layer.h"
+
+static std::vector<std::string> layer_names;
+static std::vector<std::string> blob_names;
+
+// Look up name in the global blob_names table.
+// Returns the position of the first entry that equals name. When no entry
+// matches, a message is printed to stderr and -1 is returned.
+static int find_blob_index_by_name(const char* name)
+{
+    typedef std::vector<std::string>::const_iterator blob_iterator;
+
+    int position = 0;
+    for (blob_iterator it = blob_names.begin(); it != blob_names.end(); ++it, ++position)
+    {
+        if (it->compare(name) == 0)
+            return position;
+    }
+
+    fprintf(stderr, "find_blob_index_by_name %s failed\n", name);
+    return -1;
+}
+
+// Rewrite name in place so it can be embedded in a C/C++ identifier:
+// every character for which isalnum() is false is replaced by '_'.
+// name must point to a writable NUL-terminated string (NULL is ignored);
+// the length of the string never changes.
+static void sanitize_name(char* name)
+{
+    if (name == NULL)
+    {
+        return;
+    }
+
+    // walk up to the terminator instead of calling strlen() on every iteration
+    for (char* p = name; *p != '\0'; p++)
+    {
+        // isalnum() is only defined for unsigned char values and EOF, so a
+        // plain char holding a byte >= 0x80 has to be converted first
+        if (!isalnum((unsigned char)*p))
+        {
+            *p = '_';
+        }
+    }
+}
+
+// Derive an identifier-friendly name from a file path: take the part after
+// the last '/' (or the whole path when it contains none) and pass it through
+// sanitize_name().
+static std::string path_to_varname(const char* path)
+{
+    const char* lastslash = strrchr(path, '/');
+    const char* name = lastslash == NULL ? path : lastslash + 1;
+
+    // Sanitize a writable copy that includes the terminating NUL; writing
+    // through a const-cast std::string::c_str() pointer is undefined behaviour.
+    std::vector<char> buffer(name, name + strlen(name) + 1);
+    sanitize_name(&buffer[0]);
+
+    return std::string(&buffer[0]);
+}
+
+// Parse the whitespace separated numbers of one layer's parameter text and
+// append each of them to mp in binary form: a token that contains '.' is
+// written as a float, every other token as an int. Parsing stops at the first
+// token that cannot be converted (normally the end of the line).
+static void dump_layer_specific_params(const char* line, FILE* mp)
+{
+    int pos = 0;
+    while (1)
+    {
+        // skip whitespace; %n is left untouched when nothing matches
+        int nconsumed = 0;
+        sscanf(line + pos, "%*[ \t]%n", &nconsumed);
+        pos += nconsumed;
+
+        // look ahead inside the current token to determine isfloat
+        bool isfloat = false;
+        const char* bp = line + pos;
+        for (int j=0; j<20; j++)
+        {
+            // stop at the end of the token or of the line, so the scan never
+            // runs past the terminator into unrelated bytes
+            if (bp[j] == '\0' || isspace((unsigned char)bp[j]))
+            {
+                break;
+            }
+            if (bp[j] == '.')
+            {
+                isfloat = true;
+                break;
+            }
+        }
+
+        if (isfloat)
+        {
+            float vf;
+            nconsumed = 0;
+            int nscan = sscanf(line + pos, "%f%n", &vf, &nconsumed);
+
+            pos += nconsumed;
+
+            if (nscan != 1)
+            {
+                break;
+            }
+
+            fwrite(&vf, sizeof(float), 1, mp);
+        }
+        else
+        {
+            int v;
+            nconsumed = 0;
+            int nscan = sscanf(line + pos, "%d%n", &v, &nconsumed);
+
+            pos += nconsumed;
+
+            if (nscan != 1)
+            {
+                break;
+            }
+
+            fwrite(&v, sizeof(int), 1, mp);
+        }
+    }
+}
+
+// Convert a text param file into a binary param file plus a C++ header of
+// layer / blob index constants.
+//
+//   parampath     text param file to read
+//   parambinpath  binary param file to create
+//   idcpppath     header to create; it receives "const int LAYER_<name>" and
+//                 "const int BLOB_<name>" constants inside the namespace
+//                 "<sanitized param file name>_id", wrapped in an include guard
+//
+// Binary layout: layer count, blob count, then for every layer the type index
+// returned by ncnn::layer_to_index(), the bottom count, the top count, the
+// bottom blob indices, the top blob indices and the layer specific params.
+//
+// The global layer_names / blob_names tables are resized and filled with the
+// sanitized names.
+//
+// Returns 0 on success. Returns -1 when a file cannot be opened, when the
+// layer / blob counts cannot be read, or when the file holds more layers or
+// blobs than it declares.
+static int dump_param(const char* parampath, const char* parambinpath, const char* idcpppath)
+{
+    FILE* fp = fopen(parampath, "rb");
+    if (!fp)
+    {
+        fprintf(stderr, "fopen %s failed\n", parampath);
+        return -1;
+    }
+
+    FILE* mp = fopen(parambinpath, "wb");
+    if (!mp)
+    {
+        fprintf(stderr, "fopen %s failed\n", parambinpath);
+        fclose(fp);
+        return -1;
+    }
+
+    FILE* ip = fopen(idcpppath, "wb");
+    if (!ip)
+    {
+        fprintf(stderr, "fopen %s failed\n", idcpppath);
+        fclose(mp);
+        fclose(fp);
+        return -1;
+    }
+
+    int ret = 0;
+
+    std::string param_var = path_to_varname(parampath);
+
+    std::string include_guard_var = path_to_varname(idcpppath);
+
+    fprintf(ip, "#ifndef NCNN_INCLUDE_GUARD_%s\n", include_guard_var.c_str());
+    fprintf(ip, "#define NCNN_INCLUDE_GUARD_%s\n", include_guard_var.c_str());
+    fprintf(ip, "namespace %s_id {\n", param_var.c_str());
+
+    int layer_count = 0;
+    int blob_count = 0;
+    if (fscanf(fp, "%d %d", &layer_count, &blob_count) != 2 || layer_count < 0 || blob_count < 0)
+    {
+        // still emit well-formed (empty) outputs, but report the failure
+        fprintf(stderr, "read layer and blob count from %s failed\n", parampath);
+        layer_count = 0;
+        blob_count = 0;
+        ret = -1;
+    }
+    fwrite(&layer_count, sizeof(int), 1, mp);
+    fwrite(&blob_count, sizeof(int), 1, mp);
+
+    layer_names.resize(layer_count);
+    blob_names.resize(blob_count);
+
+    int layer_index = 0;
+    int blob_index = 0;
+    while (ret == 0 && !feof(fp))
+    {
+        int nscan = 0;
+
+        char layer_type[32];
+        char layer_name[256];
+        int bottom_count = 0;
+        int top_count = 0;
+        // the field widths leave room for the terminating NUL
+        nscan = fscanf(fp, "%31s %255s %d %d", layer_type, layer_name, &bottom_count, &top_count);
+        if (nscan == EOF)
+        {
+            // end of file or read error; feof() stays false on a read error,
+            // so leave the loop explicitly
+            break;
+        }
+        if (nscan != 4)
+        {
+            continue;
+        }
+
+        if (layer_index >= layer_count)
+        {
+            fprintf(stderr, "%s has more layers than the declared %d\n", parampath, layer_count);
+            ret = -1;
+            break;
+        }
+
+        sanitize_name(layer_name);
+
+        int typeindex = ncnn::layer_to_index(layer_type);
+        fwrite(&typeindex, sizeof(int), 1, mp);
+
+        fwrite(&bottom_count, sizeof(int), 1, mp);
+        fwrite(&top_count, sizeof(int), 1, mp);
+
+        fprintf(ip, "const int LAYER_%s = %d;\n", layer_name, layer_index);
+
+        // bottoms refer to blobs that an earlier layer produced as a top
+        for (int i=0; i<bottom_count; i++)
+        {
+            char bottom_name[256];
+            nscan = fscanf(fp, "%255s", bottom_name);
+            if (nscan != 1)
+            {
+                continue;
+            }
+
+            sanitize_name(bottom_name);
+
+            int bottom_blob_index = find_blob_index_by_name(bottom_name);
+
+            fwrite(&bottom_blob_index, sizeof(int), 1, mp);
+        }
+
+        // every top defines a new blob and takes the next blob index
+        for (int i=0; i<top_count; i++)
+        {
+            char blob_name[256];
+            nscan = fscanf(fp, "%255s", blob_name);
+            if (nscan != 1)
+            {
+                continue;
+            }
+
+            if (blob_index >= blob_count)
+            {
+                fprintf(stderr, "%s has more blobs than the declared %d\n", parampath, blob_count);
+                ret = -1;
+                break;
+            }
+
+            sanitize_name(blob_name);
+
+            blob_names[blob_index] = std::string(blob_name);
+
+            fprintf(ip, "const int BLOB_%s = %d;\n", blob_name, blob_index);
+
+            fwrite(&blob_index, sizeof(int), 1, mp);
+
+            blob_index++;
+        }
+
+        if (ret != 0)
+        {
+            break;
+        }
+
+        // dump layer specific params: the rest of the current line
+        // NOTE: fgets reads at most 1023 characters; a longer remainder stays
+        // in the stream and is not handled here
+        char buffer[1024];
+        if (fgets(buffer, 1024, fp) == NULL)
+        {
+            // nothing left to read: treat as a layer without specific params
+            buffer[0] = '\0';
+        }
+
+        dump_layer_specific_params(buffer, mp);
+
+        layer_names[layer_index] = std::string(layer_name);
+
+        layer_index++;
+    }
+
+    fprintf(ip, "} // namespace %s_id\n", param_var.c_str());
+    fprintf(ip, "#endif // NCNN_INCLUDE_GUARD_%s\n", include_guard_var.c_str());
+
+    fclose(fp);
+
+    fclose(mp);
+    fclose(ip);
+
+    return ret;
+}
+
+// Append everything that can still be read from fp to cppfp as
+// "static const unsigned char <varname>[] = { 0x.., ... };" with 16 bytes per
+// output line.
+static void write_file_as_byte_array(FILE* cppfp, FILE* fp, const char* varname)
+{
+    fprintf(cppfp, "static const unsigned char %s[] = {\n", varname);
+
+    size_t written = 0;
+    int c;
+    while ((c = fgetc(fp)) != EOF)
+    {
+        fprintf(cppfp, "0x%02x,", c);
+
+        written++;
+        if (written % 16 == 0)
+        {
+            fprintf(cppfp, "\n");
+        }
+    }
+
+    fprintf(cppfp, "};\n");
+}
+
+// Write a C++ header that embeds the binary param file and the model file as
+// two unsigned char arrays.
+//
+//   parambinpath  binary param file to embed
+//   modelpath     model file to embed
+//   memcpppath    header to create
+//
+// Each array is named after the sanitized file name of its source file and
+// the header is wrapped in an include guard derived from memcpppath.
+//
+// Returns 0 on success, -1 when one of the files cannot be opened. The inputs
+// are opened first, so no output is created when an input is missing.
+static int write_memcpp(const char* parambinpath, const char* modelpath, const char* memcpppath)
+{
+    FILE* mp = fopen(parambinpath, "rb");
+    if (!mp)
+    {
+        fprintf(stderr, "fopen %s failed\n", parambinpath);
+        return -1;
+    }
+
+    FILE* bp = fopen(modelpath, "rb");
+    if (!bp)
+    {
+        fprintf(stderr, "fopen %s failed\n", modelpath);
+        fclose(mp);
+        return -1;
+    }
+
+    FILE* cppfp = fopen(memcpppath, "wb");
+    if (!cppfp)
+    {
+        fprintf(stderr, "fopen %s failed\n", memcpppath);
+        fclose(bp);
+        fclose(mp);
+        return -1;
+    }
+
+    std::string param_var = path_to_varname(parambinpath);
+    std::string model_var = path_to_varname(modelpath);
+    std::string include_guard_var = path_to_varname(memcpppath);
+
+    fprintf(cppfp, "#ifndef NCNN_INCLUDE_GUARD_%s\n", include_guard_var.c_str());
+    fprintf(cppfp, "#define NCNN_INCLUDE_GUARD_%s\n", include_guard_var.c_str());
+
+    // dump param
+    write_file_as_byte_array(cppfp, mp, param_var.c_str());
+
+    fclose(mp);
+
+    // dump model
+    write_file_as_byte_array(cppfp, bp, model_var.c_str());
+
+    fprintf(cppfp, "#endif // NCNN_INCLUDE_GUARD_%s\n", include_guard_var.c_str());
+
+    fclose(bp);
+
+    fclose(cppfp);
+
+    return 0;
+}
+
+// Entry point: ncnn2mem [ncnnproto] [ncnnbin] [idcpppath] [memcpppath]
+//
+// Converts the text param file to "<param file name>.bin" in the current
+// working directory, writes the id header, then writes the header that embeds
+// the binary param file and the model.
+// Returns 0 on success and -1 on a usage error or when a step fails.
+int main(int argc, char** argv)
+{
+    if (argc != 5)
+    {
+        fprintf(stderr, "Usage: %s [ncnnproto] [ncnnbin] [idcpppath] [memcpppath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* parampath = argv[1];
+    const char* modelpath = argv[2];
+    const char* idcpppath = argv[3];
+    const char* memcpppath = argv[4];
+
+    // the binary param file is named after the param file, without its directory part
+    const char* lastslash = strrchr(parampath, '/');
+    const char* name = lastslash == NULL ? parampath : lastslash + 1;
+
+    std::string parambinpath = std::string(name) + ".bin";
+
+    // stop at the first failing step so the exit status reflects the failure
+    if (dump_param(parampath, parambinpath.c_str(), idcpppath) != 0)
+    {
+        fprintf(stderr, "dump_param %s failed\n", parampath);
+        return -1;
+    }
+
+    if (write_memcpp(parambinpath.c_str(), modelpath, memcpppath) != 0)
+    {
+        fprintf(stderr, "write_memcpp %s failed\n", memcpppath);
+        return -1;
+    }
+
+    return 0;
+}