diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 00000000000..0968e75f87c
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,62 @@
+
+# cmake_minimum_required() must run before project() (and before any other
+# command) so that policy defaults are in place for everything that follows.
+cmake_minimum_required(VERSION 2.8.10)
+
+if(CMAKE_TOOLCHAIN_FILE)
+  set(LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_BINARY_DIR} CACHE PATH "root for library output, set this to change where android libs are compiled to")
+  # Turn the toolchain file into an absolute path. get_filename_component(ABSOLUTE)
+  # only resolves against the source dir, so find_file() is used here instead.
+  get_filename_component(CMAKE_TOOLCHAIN_FILE_NAME ${CMAKE_TOOLCHAIN_FILE} NAME)
+  find_file(CMAKE_TOOLCHAIN_FILE ${CMAKE_TOOLCHAIN_FILE_NAME} PATHS ${CMAKE_SOURCE_DIR} NO_DEFAULT_PATH)
+  message(STATUS "CMAKE_TOOLCHAIN_FILE = ${CMAKE_TOOLCHAIN_FILE}")
+endif()
+
+# Install into <build>/install unless the user supplied a prefix. This has to
+# happen before project(), which would otherwise define the prefix itself.
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Installation Directory")
+endif()
+message(STATUS "CMAKE_INSTALL_PREFIX = ${CMAKE_INSTALL_PREFIX}")
+
+project(ncnn)
+
+# Default to an optimized build, but honour an explicit choice such as
+# -DCMAKE_BUILD_TYPE=debug or -DCMAKE_BUILD_TYPE=relwithdebinfo.
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE release)
+endif()
+
+# Feature switches exposed to the user (-DNCNN_<NAME>=ON|OFF).
+option(NCNN_OPENMP "openmp support" ON)
+option(NCNN_STDIO "load model from external file" ON)
+option(NCNN_STRING "plain and verbose string" ON)
+option(NCNN_OPENCV "minimal opencv structure emulation" OFF)
+
+# When OpenMP is requested, append whatever flags FindOpenMP reported to the
+# C and C++ compiler flags (the variables are simply empty if nothing was found).
+if(NCNN_OPENMP)
+  find_package(OpenMP)
+  foreach(_ncnn_lang C CXX)
+    set(CMAKE_${_ncnn_lang}_FLAGS "${CMAKE_${_ncnn_lang}_FLAGS} ${OpenMP_${_ncnn_lang}_FLAGS}")
+  endforeach()
+endif()
+
+# Directory-wide compiler flags: warnings, position-independent code and
+# optimisation settings, passed in the same order as before.
+add_definitions(-Wall -Wextra -fPIC -Ofast -ffast-math)
+# add_definitions(-march=native)
+
+# add_definitions(-flto)
+
+# Symbols (including inline functions) are hidden by default.
+add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
+
+if(ANDROID OR IOS)
+  # Shared libraries are disabled on Android and on Xcode iOS builds; both
+  # platforms also compile C++ without RTTI and exceptions.
+  set_property(GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS FALSE)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti -fno-exceptions")
+endif()
+
+##############################################
+
+# add_subdirectory(examples)
+add_subdirectory(src)
+
+# The tools directory is only added for non-mobile targets.
+if(NOT (ANDROID OR IOS))
+  add_subdirectory(tools)
+endif()
diff --git a/Info.plist b/Info.plist
new file mode 100644
index 00000000000..f90da17ba35
--- /dev/null
+++ b/Info.plist
@@ -0,0 +1,18 @@
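+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+ <key>CFBundleName</key>
+ <string>ncnn</string>
+ <key>CFBundleIdentifier</key>
+ <string>com.tencent.ncnn</string>
+ <key>CFBundleVersion</key>
+ <string>1.0</string>
+ <key>CFBundleShortVersionString</key>
+ <string>1.0</string>
+ <key>CFBundleSignature</key>
+ <string>????</string>
+ <key>CFBundlePackageType</key>
+ <string>FMWK</string>
+</dict>
+</plist>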
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 00000000000..2eb0363c72e
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,86 @@
+Tencent is pleased to support the open source community by making ncnn available.
+Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+If you have downloaded a copy of the ncnn binary from Tencent, please note that the ncnn binary is licensed under the BSD 3-Clause License.
+If you have downloaded a copy of the ncnn source code from Tencent, please note that ncnn source code is licensed under the BSD 3-Clause License, except for the third-party components listed below which are subject to different license terms. Your integration of ncnn into your own projects may require compliance with the BSD 3-Clause License, as well as the other licenses applicable to the third-party components included within ncnn.
+A copy of the BSD 3-Clause License is included in this file.
+
+Other dependencies and licenses:
+
+Open Source Software Licensed Under the zlib License:
+The below software in this distribution may have been modified by THL A29 Limited (“Tencent Modifications”). All Tencent Modifications are Copyright (C) 2017 THL A29 Limited.
+----------------------------------------------------------------------------------------
+1. neon_mathfun.h
+Copyright (C) 2011 Julien Pommier
+
+2. sse_mathfun.h
+Copyright (C) 2007 Julien Pommier
+
+3. avx_mathfun.h
+Copyright (C) 2012 Giovanni Garberoglio
+Interdisciplinary Laboratory for Computational Science (LISC)
+Fondazione Bruno Kessler and University of Trento
+via Sommarive, 18
+I-38123 Trento (Italy)
+
+
+Terms of the zlib License:
+---------------------------------------------------
+Copyright (c) <year> <copyright holders>
+
+This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+
+
+Open Source Software Licensed Under the BSD 2-Clause License:
+The below software in this distribution may have been modified by THL A29 Limited (“Tencent Modifications”). All Tencent Modifications are Copyright (C) 2017 THL A29 Limited.
+----------------------------------------------------------------------------------------
+1. squeezenet 1.1
+Copyright (c) 2016 Forrest N. Iandola and Matthew W. Moskewicz and Khalid Ashraf and Song Han and William J. Dally and Kurt Keutzer
+All rights reserved.
+
+2. caffe.proto master
+All contributions by the University of California:
+Copyright (c) 2014-2017 The Regents of the University of California (Regents)
+All rights reserved.
+
+All other contributions:
+Copyright (c) 2014-2017, the respective contributors
+All rights reserved.
+
+
+Terms of the BSD 2-Clause License:
+--------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+
+Open Source Software Licensed Under the BSD 3-Clause License:
+The below software in this distribution may have been modified by THL A29 Limited (“Tencent Modifications”). All Tencent Modifications are Copyright (C) 2017 THL A29 Limited.
+----------------------------------------------------------------------------------------
+1. android.toolchain.cmake master
+Copyright (c) 2010-2011, Ethan Rublee
+Copyright (c) 2011-2014, Andrey Kamaev
+All rights reserved.
+
+
+Terms of the BSD 3-Clause License:
+--------------------------------------------------------------------
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+Neither the name of [copyright holder] nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
new file mode 100644
index 00000000000..935f35c35db
--- /dev/null
+++ b/README.md
@@ -0,0 +1,44 @@
+# ncnn
+
+---
+
+ncnn 是一个为手机端极致优化的高性能神经网络前向计算框架。ncnn 从设计之初深刻考虑手机端的部署和使用。无第三方依赖,跨平台,手机端 cpu 的速度快于目前所有已知的开源框架。基于 ncnn,开发者能够将深度学习算法轻松移植到手机端高效执行,开发出人工智能 APP,将 AI 带到你的指尖。ncnn 目前已在腾讯多款应用中使用,如 QQ,Qzone,微信,天天P图等。
+
+ncnn is a high-performance neural network inference framework optimized for mobile platforms. It has been designed from the outset with deployment and use on mobile phones in mind. ncnn has no third-party dependencies, is cross-platform, and runs faster than all known open-source frameworks on mobile phone CPUs. With ncnn's efficient implementation, developers can easily deploy deep learning models to mobile platforms, create intelligent apps, and bring artificial intelligence to your fingertips. ncnn is currently used in many Tencent applications, such as QQ, Qzone, WeChat, and Pitu.
+
+---
+
+### 功能概述
+
+* 支持卷积神经网络,支持多输入和多分支结构,可计算部分分支
+* 无任何第三方库依赖,不依赖 BLAS/NNPACK 等计算框架
+* 纯 C++ 实现,跨平台,支持 android ios 等
+* ARM NEON 汇编级良心优化,计算速度极快
+* 精细的内存管理和数据结构设计,内存占用极低
+* 支持多核并行计算加速,ARM big.LITTLE cpu 调度优化
+* 整体库体积小于 500K,并可轻松精简到小于 300K
+* 可扩展的模型设计,支持 8bit 量化和半精度浮点存储,可导入 caffe 模型
+* 支持直接内存零拷贝引用加载网络模型
+* 可注册自定义层实现并扩展
+* 恩,很强就是了,不怕被塞卷 QvQ
+
+### Features
+
+* Supports convolutional neural networks, including multi-input and multi-branch structures; can compute only part of the branches
+* No third-party library dependencies; does not rely on BLAS/NNPACK or other computing frameworks
+* Pure C++ implementation, cross-platform, supports Android, iOS and so on
+* Careful ARM NEON assembly-level optimization; computation is extremely fast
+* Sophisticated memory management and data structure design, very low memory footprint
+* Supports multi-core parallel computing acceleration and ARM big.LITTLE CPU scheduling optimization
+* The overall library size is less than 500K, and can be easily reduced to less than 300K
+* Extensible model design; supports 8-bit quantization and half-precision floating-point storage; can import Caffe models
+* Supports loading network models by direct zero-copy memory reference
+* Custom layer implementations can be registered and extended
+* Well, it is strong, not afraid of being stuffed with 卷 QvQ
+
+---
+
+### License
+
+BSD 3 Clause
+
diff --git a/android.toolchain.cmake b/android.toolchain.cmake
new file mode 100644
index 00000000000..900ca8c91c3
--- /dev/null
+++ b/android.toolchain.cmake
@@ -0,0 +1,1735 @@
+# Copyright (c) 2010-2011, Ethan Rublee
+# Copyright (c) 2011-2014, Andrey Kamaev
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from this
+# software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+# ------------------------------------------------------------------------------
+# Android CMake toolchain file, for use with the Android NDK r5-r10d
+# Requires cmake 2.6.3 or newer (2.8.9 or newer is recommended).
+# See home page: https://github.com/taka-no-me/android-cmake
+#
+# Usage Linux:
+# $ export ANDROID_NDK=/absolute/path/to/the/android-ndk
+# $ mkdir build && cd build
+# $ cmake -DCMAKE_TOOLCHAIN_FILE=path/to/the/android.toolchain.cmake ..
+# $ make -j8
+#
+# Usage Windows:
+# You need native port of make to build your project.
+# Android NDK r7 (and newer) already has make.exe on board.
+# For older NDK you have to install it separately.
+# For example, this one: http://gnuwin32.sourceforge.net/packages/make.htm
+#
+# $ SET ANDROID_NDK=C:\absolute\path\to\the\android-ndk
+# $ mkdir build && cd build
+# $ cmake.exe -G"MinGW Makefiles"
+# -DCMAKE_TOOLCHAIN_FILE=path\to\the\android.toolchain.cmake
+# -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%\prebuilt\windows\bin\make.exe" ..
+# $ cmake.exe --build .
+#
+#
+# Options (can be set as cmake parameters: -D<option_name>=<value>):
+# ANDROID_NDK=/opt/android-ndk - path to the NDK root.
+# Can be set as environment variable. Can be set only at first cmake run.
+#
+# ANDROID_ABI=armeabi-v7a - specifies the target Application Binary
+# Interface (ABI). This option nearly matches to the APP_ABI variable
+# used by ndk-build tool from Android NDK.
+#
+# Possible targets are:
+# "armeabi" - ARMv5TE based CPU with software floating point operations
+# "armeabi-v7a" - ARMv7 based devices with hardware FPU instructions
+# this ABI target is used by default
+# "armeabi-v7a-hard with NEON" - ARMv7 based devices with hardware FPU instructions and hardfp
+# "armeabi-v7a with NEON" - same as armeabi-v7a, but
+# sets NEON as floating-point unit
+# "armeabi-v7a with VFPV3" - same as armeabi-v7a, but
+# sets VFPV3 as floating-point unit (has 32 registers instead of 16)
+# "armeabi-v6 with VFP" - tuned for ARMv6 processors having VFP
+# "x86" - IA-32 instruction set
+# "mips" - MIPS32 instruction set
+#
+# 64-bit ABIs for NDK r10 and newer:
+# "arm64-v8a" - ARMv8 AArch64 instruction set
+# "x86_64" - Intel64 instruction set (r1)
+# "mips64" - MIPS64 instruction set (r6)
+#
+# ANDROID_NATIVE_API_LEVEL=android-8 - level of Android API compile for.
+# Option is read-only when standalone toolchain is used.
+# Note: building for "android-L" requires explicit configuration.
+#
+# ANDROID_TOOLCHAIN_NAME=arm-linux-androideabi-4.9 - the name of compiler
+# toolchain to be used. The list of possible values depends on the NDK
+# version. For NDK r10c the possible values are:
+#
+# * aarch64-linux-android-4.9
+# * aarch64-linux-android-clang3.4
+# * aarch64-linux-android-clang3.5
+# * arm-linux-androideabi-4.6
+# * arm-linux-androideabi-4.8
+# * arm-linux-androideabi-4.9 (default)
+# * arm-linux-androideabi-clang3.4
+# * arm-linux-androideabi-clang3.5
+# * mips64el-linux-android-4.9
+# * mips64el-linux-android-clang3.4
+# * mips64el-linux-android-clang3.5
+# * mipsel-linux-android-4.6
+# * mipsel-linux-android-4.8
+# * mipsel-linux-android-4.9
+# * mipsel-linux-android-clang3.4
+# * mipsel-linux-android-clang3.5
+# * x86-4.6
+# * x86-4.8
+# * x86-4.9
+# * x86-clang3.4
+# * x86-clang3.5
+# * x86_64-4.9
+# * x86_64-clang3.4
+# * x86_64-clang3.5
+#
+# ANDROID_FORCE_ARM_BUILD=OFF - set ON to generate 32-bit ARM instructions
+# instead of Thumb. Is not available for "armeabi-v6 with VFP"
+# (is forced to be ON) ABI.
+#
+# ANDROID_NO_UNDEFINED=ON - set ON to show all undefined symbols as linker
+# errors even if they are not used.
+#
+# ANDROID_SO_UNDEFINED=OFF - set ON to allow undefined symbols in shared
+# libraries. Automatically turned for NDK r5x and r6x due to GLESv2
+# problems.
+#
+# ANDROID_STL=gnustl_static - specify the runtime to use.
+#
+# Possible values are:
+# none -> Do not configure the runtime.
+# system -> Use the default minimal system C++ runtime library.
+# Implies -fno-rtti -fno-exceptions.
+# Is not available for standalone toolchain.
+# system_re -> Use the default minimal system C++ runtime library.
+# Implies -frtti -fexceptions.
+# Is not available for standalone toolchain.
+# gabi++_static -> Use the GAbi++ runtime as a static library.
+# Implies -frtti -fno-exceptions.
+# Available for NDK r7 and newer.
+# Is not available for standalone toolchain.
+# gabi++_shared -> Use the GAbi++ runtime as a shared library.
+# Implies -frtti -fno-exceptions.
+# Available for NDK r7 and newer.
+# Is not available for standalone toolchain.
+# stlport_static -> Use the STLport runtime as a static library.
+# Implies -fno-rtti -fno-exceptions for NDK before r7.
+# Implies -frtti -fno-exceptions for NDK r7 and newer.
+# Is not available for standalone toolchain.
+# stlport_shared -> Use the STLport runtime as a shared library.
+# Implies -fno-rtti -fno-exceptions for NDK before r7.
+# Implies -frtti -fno-exceptions for NDK r7 and newer.
+# Is not available for standalone toolchain.
+# gnustl_static -> Use the GNU STL as a static library.
+# Implies -frtti -fexceptions.
+# gnustl_shared -> Use the GNU STL as a shared library.
+# Implies -frtti -fno-exceptions.
+# Available for NDK r7b and newer.
+# Silently degrades to gnustl_static if not available.
+# c++_static -> Use the LLVM libc++ runtime as a static library.
+# Implies -frtti -fexceptions.
+# c++_shared -> Use the LLVM libc++ runtime as a shared library.
+# Implies -frtti -fno-exceptions.
+#
+# ANDROID_STL_FORCE_FEATURES=ON - turn rtti and exceptions support based on
+# chosen runtime. If disabled, then the user is responsible for settings
+# these options.
+#
+# What?:
+# android-cmake toolchain searches for NDK/toolchain in the following order:
+# ANDROID_NDK - cmake parameter
+# ANDROID_NDK - environment variable
+# ANDROID_STANDALONE_TOOLCHAIN - cmake parameter
+# ANDROID_STANDALONE_TOOLCHAIN - environment variable
+# ANDROID_NDK - default locations
+# ANDROID_STANDALONE_TOOLCHAIN - default locations
+#
+# Make sure to do the following in your scripts:
+# SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${my_cxx_flags}" )
+# SET( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${my_cxx_flags}" )
+# The flags will be prepopulated with critical flags, so don't lose them.
+# Also be aware that toolchain also sets configuration-specific compiler
+# flags and linker flags.
+#
+# ANDROID and BUILD_ANDROID will be set to true, you may test any of these
+# variables to make necessary Android-specific configuration changes.
+#
+# Also ARMEABI or ARMEABI_V7A or ARMEABI_V7A_HARD or X86 or MIPS or ARM64_V8A or X86_64 or MIPS64
+# will be set true, mutually exclusive. NEON option will be set true
+# if VFP is set to NEON.
+#
+# ------------------------------------------------------------------------------
+
+cmake_minimum_required(VERSION 2.6.3)
+
+# Once CMAKE_CROSSCOMPILING is defined this file has already been processed;
+# loading it again is not needed.
+if(DEFINED CMAKE_CROSSCOMPILING)
+  return()
+endif()
+
+# Reference CMAKE_TOOLCHAIN_FILE so that no "unused variable" warning is raised.
+if(CMAKE_TOOLCHAIN_FILE)
+endif()
+
+# When running inside a try_compile project, inherit the settings saved by the
+# parent configuration (the config file is optional).
+get_property(_CMAKE_IN_TRY_COMPILE GLOBAL PROPERTY IN_TRY_COMPILE)
+if(_CMAKE_IN_TRY_COMPILE)
+  include("${CMAKE_CURRENT_SOURCE_DIR}/../android.toolchain.config.cmake" OPTIONAL)
+endif()
+
+# The system name matters: "Android" for CMake newer than 3.0.99, "Linux" otherwise.
+set(CMAKE_SYSTEM_NAME Linux)
+if(CMAKE_VERSION VERSION_GREATER "3.0.99")
+  set(CMAKE_SYSTEM_NAME Android)
+endif()
+
+# The system version is of little importance.
+set(CMAKE_SYSTEM_VERSION 1)
+
+# rpath makes little sense on Android, so it is switched off.
+set(CMAKE_SHARED_LIBRARY_RUNTIME_C_FLAG "")
+set(CMAKE_SKIP_RPATH TRUE CACHE BOOL "If set, runtime paths are not added when using shared libraries.")
+# Version suffixes tried when looking for "android-ndk<suffix>" directories.
+# User-supplied extras come first; the trailing empty entry matches a bare
+# "android-ndk" directory.
+set(ANDROID_SUPPORTED_NDK_VERSIONS
+  ${ANDROID_EXTRA_NDK_VERSIONS}
+  -r10d -r10c -r10b -r10
+  -r9d -r9c -r9b -r9
+  -r8e -r8d -r8c -r8b -r8
+  -r7c -r7b -r7
+  -r6b -r6
+  -r5c -r5b -r5
+  "")
+
+# Default directories searched for an NDK, unless the caller provided a list.
+if(NOT DEFINED ANDROID_NDK_SEARCH_PATHS)
+  if(CMAKE_HOST_WIN32)
+    file(TO_CMAKE_PATH "$ENV{PROGRAMFILES}" ANDROID_NDK_SEARCH_PATHS)
+    set(ANDROID_NDK_SEARCH_PATHS "${ANDROID_NDK_SEARCH_PATHS}" "$ENV{SystemDrive}/NVPACK")
+  else()
+    file(TO_CMAKE_PATH "$ENV{HOME}" ANDROID_NDK_SEARCH_PATHS)
+    set(ANDROID_NDK_SEARCH_PATHS /opt "${ANDROID_NDK_SEARCH_PATHS}/NVPACK")
+  endif()
+endif()
+
+# Default location of a standalone toolchain.
+if(NOT DEFINED ANDROID_STANDALONE_TOOLCHAIN_SEARCH_PATH)
+  set(ANDROID_STANDALONE_TOOLCHAIN_SEARCH_PATH /opt/android-toolchain)
+endif()
+
+# Known ABIs, grouped by architecture.
+set(ANDROID_SUPPORTED_ABIS_arm
+  "armeabi-v7a"
+  "armeabi"
+  "armeabi-v7a with NEON"
+  "armeabi-v7a-hard with NEON"
+  "armeabi-v7a with VFPV3"
+  "armeabi-v6 with VFP")
+set(ANDROID_SUPPORTED_ABIS_arm64 "arm64-v8a")
+set(ANDROID_SUPPORTED_ABIS_x86 "x86")
+set(ANDROID_SUPPORTED_ABIS_x86_64 "x86_64")
+set(ANDROID_SUPPORTED_ABIS_mips "mips")
+set(ANDROID_SUPPORTED_ABIS_mips64 "mips64")
+
+# Default native API level: a generic value plus per-architecture overrides.
+set(ANDROID_DEFAULT_NDK_API_LEVEL 8)
+set(ANDROID_DEFAULT_NDK_API_LEVEL_arm64 21)
+set(ANDROID_DEFAULT_NDK_API_LEVEL_x86 9)
+set(ANDROID_DEFAULT_NDK_API_LEVEL_x86_64 21)
+set(ANDROID_DEFAULT_NDK_API_LEVEL_mips 9)
+set(ANDROID_DEFAULT_NDK_API_LEVEL_mips64 21)
+
+
+# __LIST_FILTER( <listvar> <regex> )
+#   Removes from the list named <listvar> every element that matches <regex>.
+#   Nothing happens when the list variable evaluates to false (e.g. is empty).
+macro(__LIST_FILTER listvar regex)
+  if(${listvar})
+    # The foreach iterates over a snapshot of the list, so removing elements
+    # from the list inside the loop is safe.
+    foreach(__val ${${listvar}})
+      if(__val MATCHES "${regex}")
+        list(REMOVE_ITEM ${listvar} "${__val}")
+      endif()
+    endforeach()
+  endif()
+endmacro()
+
+# __INIT_VARIABLE( <var_name> [PATH] [<candidate>...] [VALUES <literal>...] )
+#   Gives <var_name> a value if it does not already have a non-empty one.
+#   Candidates are tried in order and the first usable one wins:
+#     ENV_<NAME>      - value of the environment variable <NAME>
+#     <variable name> - value of that variable, if it is defined
+#     after VALUES    - the argument itself, taken literally
+#   With PATH present, a candidate is usable only if it exists on disk, a
+#   cached value that does not exist is discarded first, and the final value is
+#   converted to a cmake-style path.
+macro(__INIT_VARIABLE var_name)
+  # Is the PATH keyword among the arguments?
+  set(__test_path 0)
+  foreach(__var ${ARGN})
+    if(__var STREQUAL "PATH")
+      set(__test_path 1)
+      break()
+    endif()
+  endforeach()
+
+  # Drop a cached path that does not exist.
+  if(__test_path AND NOT EXISTS "${${var_name}}")
+    unset(${var_name} CACHE)
+  endif()
+
+  # Only look for a value when the variable is currently empty.
+  if(" ${${var_name}}" STREQUAL " ")
+    set(__values 0)
+    foreach(__var ${ARGN})
+      if(__var STREQUAL "VALUES")
+        # Everything after this keyword may be used as a literal value.
+        set(__values 1)
+      elseif(NOT __var STREQUAL "PATH")
+        if(__var MATCHES "^ENV_.*$")
+          string(REPLACE "ENV_" "" __var "${__var}")
+          set(__value "$ENV{${__var}}")
+        elseif(DEFINED ${__var})
+          set(__value "${${__var}}")
+        elseif(__values)
+          set(__value "${__var}")
+        else()
+          set(__value "")
+        endif()
+
+        # Accept the first non-empty candidate (which must exist in PATH mode).
+        if(NOT " ${__value}" STREQUAL " " AND (NOT __test_path OR EXISTS "${__value}"))
+          set(${var_name} "${__value}")
+          break()
+        endif()
+      endif()
+    endforeach()
+    unset(__value)
+    unset(__values)
+  endif()
+
+  if(__test_path)
+    file(TO_CMAKE_PATH "${${var_name}}" ${var_name})
+  endif()
+  unset(__test_path)
+endmacro()
+
+# __DETECT_NATIVE_API_LEVEL( <_var> <_path> )
+#   Reads the header file <_path>, looks for the line
+#   "#define __ANDROID_API__ <number>" and stores <number> in <_var>.
+#   Reports a SEND_ERROR when no such line is found.
+macro(__DETECT_NATIVE_API_LEVEL _var _path)
+  set(__ndkApiLevelRegex "^[\t ]*#define[\t ]+__ANDROID_API__[\t ]+([0-9]+)[\t ]*.*$")
+  # Keep only the line(s) of the file that match the define.
+  file(STRINGS ${_path} __apiFileContent REGEX "${__ndkApiLevelRegex}")
+  if(NOT __apiFileContent)
+    message(SEND_ERROR "Could not get Android native API level. Probably you have specified invalid level value, or your copy of NDK/toolchain is broken.")
+  endif()
+  # Reduce the matching line to the captured number.
+  string(REGEX REPLACE "${__ndkApiLevelRegex}" "\\1" ${_var} "${__apiFileContent}")
+  unset(__apiFileContent)
+  unset(__ndkApiLevelRegex)
+endmacro()
+
+# __DETECT_TOOLCHAIN_MACHINE_NAME( <_var> <_root> )
+#   Derives the machine name of the toolchain rooted at <_root> from the name
+#   of its gcc executable: "<_root>/bin/<machine>-gcc" yields "<machine>".
+#   <_var> is set to "" when <_root> does not exist, or when the number of
+#   matching gcc executables is not exactly one (with a warning, outside of
+#   try_compile runs).
+macro(__DETECT_TOOLCHAIN_MACHINE_NAME _var _root)
+  set(${_var} "")
+  if(EXISTS "${_root}")
+    file(GLOB __gccExePath RELATIVE "${_root}/bin/" "${_root}/bin/*-gcc${TOOL_OS_SUFFIX}")
+    # Ignore entries whose name starts with a dot.
+    __LIST_FILTER(__gccExePath "^[.].*")
+    list(LENGTH __gccExePath __gccExePathsCount)
+    if(NOT __gccExePathsCount EQUAL 1 AND NOT _CMAKE_IN_TRY_COMPILE)
+      message(WARNING "Could not determine machine name for compiler from ${_root}")
+    else()
+      get_filename_component(__gccExeName "${__gccExePath}" NAME_WE)
+      string(REPLACE "-gcc" "" ${_var} "${__gccExeName}")
+    endif()
+    unset(__gccExePath)
+    unset(__gccExePathsCount)
+    unset(__gccExeName)
+  endif()
+endmacro()
+
+
+# Cygwin is refused by default; ANDROID_FORBID_SYGWIN=FALSE lifts the
+# restriction.
+set(ANDROID_FORBID_SYGWIN TRUE CACHE BOOL "Prevent cmake from working under cygwin and using cygwin tools")
+mark_as_advanced(ANDROID_FORBID_SYGWIN)
+
+if(ANDROID_FORBID_SYGWIN AND CYGWIN)
+  message(FATAL_ERROR "Android NDK and android-cmake toolchain are not welcome Cygwin. It is unlikely that this cmake toolchain will work under cygwin. But if you want to try then you can set cmake variable ANDROID_FORBID_SYGWIN to FALSE and rerun cmake.")
+endif()
+
+if(ANDROID_FORBID_SYGWIN AND CMAKE_HOST_WIN32)
+  # Strip every PATH entry that mentions cygwin.
+  set(__new_path "$ENV{PATH}")
+  __LIST_FILTER(__new_path "cygwin")
+  set(ENV{PATH} "${__new_path}")
+  unset(__new_path)
+endif()
+
+
+# Detect the host platform. A 64-bit host toolchain is preferred on x86_64
+# hosts and on Apple hosts, unless ANDROID_NDK_HOST_X64 is already defined.
+if(NOT DEFINED ANDROID_NDK_HOST_X64 AND (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "amd64|x86_64|AMD64" OR CMAKE_HOST_APPLE))
+  set(ANDROID_NDK_HOST_X64 1 CACHE BOOL "Try to use 64-bit compiler toolchain")
+  mark_as_advanced(ANDROID_NDK_HOST_X64)
+endif()
+
+# ANDROID_NDK_HOST_SYSTEM_NAME  - name of the 64-bit prebuilt directory
+# ANDROID_NDK_HOST_SYSTEM_NAME2 - name of the 32-bit prebuilt directory
+# TOOL_OS_SUFFIX                - executable suffix on the host
+set(TOOL_OS_SUFFIX "")
+if(CMAKE_HOST_APPLE)
+  set(ANDROID_NDK_HOST_SYSTEM_NAME "darwin-x86_64")
+  set(ANDROID_NDK_HOST_SYSTEM_NAME2 "darwin-x86")
+elseif(CMAKE_HOST_WIN32)
+  set(ANDROID_NDK_HOST_SYSTEM_NAME "windows-x86_64")
+  set(ANDROID_NDK_HOST_SYSTEM_NAME2 "windows")
+  set(TOOL_OS_SUFFIX ".exe")
+elseif(CMAKE_HOST_UNIX)
+  set(ANDROID_NDK_HOST_SYSTEM_NAME "linux-x86_64")
+  set(ANDROID_NDK_HOST_SYSTEM_NAME2 "linux-x86")
+else()
+  message(FATAL_ERROR "Cross-compilation on your platform is not supported by this cmake toolchain")
+endif()
+
+# Without 64-bit support, fall back to the 32-bit directory name.
+if(NOT ANDROID_NDK_HOST_X64)
+  set(ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2})
+endif()
+
+# Locate the Android NDK or a standalone toolchain. Explicit settings (cmake
+# or environment variables) are consulted first, default locations last.
+if(NOT ANDROID_NDK AND NOT ANDROID_STANDALONE_TOOLCHAIN)
+  __INIT_VARIABLE(ANDROID_NDK PATH ENV_ANDROID_NDK)
+endif()
+
+if(NOT ANDROID_NDK)
+  # No NDK yet: check for an explicitly configured standalone toolchain.
+  __INIT_VARIABLE(ANDROID_STANDALONE_TOOLCHAIN PATH ENV_ANDROID_STANDALONE_TOOLCHAIN)
+
+  if(NOT ANDROID_STANDALONE_TOOLCHAIN)
+    # Try to find the Android NDK in one of the default locations:
+    # every search path combined with every known version suffix.
+    set(__ndkSearchPaths)
+    foreach(__ndkSearchPath ${ANDROID_NDK_SEARCH_PATHS})
+      foreach(suffix ${ANDROID_SUPPORTED_NDK_VERSIONS})
+        list(APPEND __ndkSearchPaths "${__ndkSearchPath}/android-ndk${suffix}")
+      endforeach()
+    endforeach()
+    __INIT_VARIABLE(ANDROID_NDK PATH VALUES ${__ndkSearchPaths})
+    unset(__ndkSearchPaths)
+
+    if(ANDROID_NDK)
+      message(STATUS "Using default path for Android NDK: ${ANDROID_NDK}")
+      message(STATUS " If you prefer to use a different location, please define a cmake or environment variable: ANDROID_NDK")
+    else()
+      # Last resort: a standalone toolchain in its default location.
+      __INIT_VARIABLE(ANDROID_STANDALONE_TOOLCHAIN PATH ANDROID_STANDALONE_TOOLCHAIN_SEARCH_PATH)
+
+      if(ANDROID_STANDALONE_TOOLCHAIN)
+        message(STATUS "Using default path for standalone toolchain ${ANDROID_STANDALONE_TOOLCHAIN}")
+        message(STATUS " If you prefer to use a different location, please define the variable: ANDROID_STANDALONE_TOOLCHAIN")
+      endif()
+    endif()
+  endif()
+endif()
+
+# Remember the paths found above and derive the build mode from them:
+#   - NDK found: cache its absolute path, set BUILD_WITH_ANDROID_NDK and work
+#     out the NDK release (ANDROID_NDK_RELEASE, ANDROID_NDK_RELEASE_FULL and
+#     the numeric ANDROID_NDK_RELEASE_NUM).
+#   - standalone toolchain found: cache its absolute path and set
+#     BUILD_WITH_STANDALONE_TOOLCHAIN.
+#   - neither: stop with a fatal error explaining how to configure one.
+if( ANDROID_NDK )
+  get_filename_component( ANDROID_NDK "${ANDROID_NDK}" ABSOLUTE )
+  set( ANDROID_NDK "${ANDROID_NDK}" CACHE INTERNAL "Path of the Android NDK" FORCE )
+  set( BUILD_WITH_ANDROID_NDK True )
+  if( EXISTS "${ANDROID_NDK}/RELEASE.TXT" )
+    # The first matching line of RELEASE.TXT carries the release name, e.g. "r10d".
+    file( STRINGS "${ANDROID_NDK}/RELEASE.TXT" ANDROID_NDK_RELEASE_FULL LIMIT_COUNT 1 REGEX "r[0-9]+[a-z]?" )
+    string( REGEX MATCH "r([0-9]+)([a-z]?)" ANDROID_NDK_RELEASE "${ANDROID_NDK_RELEASE_FULL}" )
+  else()
+    set( ANDROID_NDK_RELEASE "r1x" )
+    set( ANDROID_NDK_RELEASE_FULL "unreleased" )
+  endif()
+  # Numeric release: <number>*1000 + position of the release letter in the
+  # alphabet (0 for no letter), e.g. r10d -> 10*1000+4.
+  string( REGEX REPLACE "r([0-9]+)([a-z]?)" "\\1*1000" ANDROID_NDK_RELEASE_NUM "${ANDROID_NDK_RELEASE}" )
+  # The lookup string must be the complete alphabet. It used to read "...pqast...",
+  # so release letter "r" was not found (-1) and corrupted the release number.
+  string( FIND " abcdefghijklmnopqrstuvwxyz" "${CMAKE_MATCH_2}" __ndkReleaseLetterNum )
+  math( EXPR ANDROID_NDK_RELEASE_NUM "${ANDROID_NDK_RELEASE_NUM}+${__ndkReleaseLetterNum}" )
+elseif( ANDROID_STANDALONE_TOOLCHAIN )
+  get_filename_component( ANDROID_STANDALONE_TOOLCHAIN "${ANDROID_STANDALONE_TOOLCHAIN}" ABSOLUTE )
+  # Detect a changed toolchain path: a CMAKE_AR from an earlier run must start
+  # with the toolchain path.
+  if( CMAKE_AR )
+    string( LENGTH "${ANDROID_STANDALONE_TOOLCHAIN}" __length )
+    string( SUBSTRING "${CMAKE_AR}" 0 ${__length} __androidStandaloneToolchainPreviousPath )
+    if( NOT __androidStandaloneToolchainPreviousPath STREQUAL ANDROID_STANDALONE_TOOLCHAIN )
+      message( FATAL_ERROR "It is not possible to change path to the Android standalone toolchain on subsequent run." )
+    endif()
+    unset( __androidStandaloneToolchainPreviousPath )
+    unset( __length )
+  endif()
+  set( ANDROID_STANDALONE_TOOLCHAIN "${ANDROID_STANDALONE_TOOLCHAIN}" CACHE INTERNAL "Path of the Android standalone toolchain" FORCE )
+  set( BUILD_WITH_STANDALONE_TOOLCHAIN True )
+else()
+  # The first search path is used in the hint printed below.
+  list(GET ANDROID_NDK_SEARCH_PATHS 0 ANDROID_NDK_SEARCH_PATH)
+  message( FATAL_ERROR "Could not find neither Android NDK nor Android standalone toolchain.
+ You should either set an environment variable:
+ export ANDROID_NDK=~/my-android-ndk
+ or
+ export ANDROID_STANDALONE_TOOLCHAIN=~/my-android-toolchain
+ or put the toolchain or NDK in the default path:
+ sudo ln -s ~/my-android-ndk ${ANDROID_NDK_SEARCH_PATH}/android-ndk
+ sudo ln -s ~/my-android-toolchain ${ANDROID_STANDALONE_TOOLCHAIN_SEARCH_PATH}" )
+endif()
+
+# Work out the directory layout of the NDK and, from it, where the compiler
+# toolchains live (ANDROID_NDK_TOOLCHAINS_PATH plus the prebuilt sub-paths).
+if(BUILD_WITH_ANDROID_NDK)
+  if(NOT DEFINED ANDROID_NDK_LAYOUT)
+    # Auto-detect the layout from marker files/directories.
+    if(EXISTS "${ANDROID_NDK}/RELEASE.TXT")
+      set(ANDROID_NDK_LAYOUT "RELEASE")
+    elseif(EXISTS "${ANDROID_NDK}/../../linux-x86/toolchain/")
+      set(ANDROID_NDK_LAYOUT "LINARO")
+    elseif(EXISTS "${ANDROID_NDK}/../../gcc/")
+      set(ANDROID_NDK_LAYOUT "ANDROID")
+    endif()
+  endif()
+  set(ANDROID_NDK_LAYOUT "${ANDROID_NDK_LAYOUT}" CACHE STRING "The inner layout of NDK")
+  mark_as_advanced(ANDROID_NDK_LAYOUT)
+
+  if(ANDROID_NDK_LAYOUT STREQUAL "LINARO")
+    # Only 32-bit host tools at the moment.
+    set(ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2})
+    set(ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK}/../../${ANDROID_NDK_HOST_SYSTEM_NAME}/toolchain")
+    set(ANDROID_NDK_TOOLCHAINS_SUBPATH "")
+    set(ANDROID_NDK_TOOLCHAINS_SUBPATH2 "")
+  elseif(ANDROID_NDK_LAYOUT STREQUAL "ANDROID")
+    # Only 32-bit host tools at the moment.
+    set(ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2})
+    set(ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK}/../../gcc/${ANDROID_NDK_HOST_SYSTEM_NAME}/arm")
+    set(ANDROID_NDK_TOOLCHAINS_SUBPATH "")
+    set(ANDROID_NDK_TOOLCHAINS_SUBPATH2 "")
+  else()
+    # ANDROID_NDK_LAYOUT STREQUAL "RELEASE" (also used for any other value).
+    set(ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK}/toolchains")
+    set(ANDROID_NDK_TOOLCHAINS_SUBPATH "/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}")
+    set(ANDROID_NDK_TOOLCHAINS_SUBPATH2 "/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME2}")
+  endif()
+  get_filename_component(ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK_TOOLCHAINS_PATH}" ABSOLUTE)
+
+  # Detect a changed NDK: a CMAKE_AR from an earlier run must start with the
+  # toolchains path.
+  if(CMAKE_AR)
+    string(LENGTH "${ANDROID_NDK_TOOLCHAINS_PATH}" __length)
+    string(SUBSTRING "${CMAKE_AR}" 0 ${__length} __androidNdkPreviousPath)
+    if(NOT __androidNdkPreviousPath STREQUAL ANDROID_NDK_TOOLCHAINS_PATH)
+      message(FATAL_ERROR "It is not possible to change the path to the NDK on subsequent CMake run. You must remove all generated files from your build folder first.
+ ")
+    endif()
+    unset(__androidNdkPreviousPath)
+    unset(__length)
+  endif()
+endif()
+
+
+# get all the details about standalone toolchain: API level, machine name, architecture and gcc version
+if( BUILD_WITH_STANDALONE_TOOLCHAIN )
+ __DETECT_NATIVE_API_LEVEL( ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot/usr/include/android/api-level.h" ) # helper defined outside this chunk; presumably reads the level from the header - TODO confirm
+ set( ANDROID_STANDALONE_TOOLCHAIN_API_LEVEL ${ANDROID_SUPPORTED_NATIVE_API_LEVELS} )
+ set( __availableToolchains "standalone" ) # first entry of the parallel __availableToolchain* lists
+ __DETECT_TOOLCHAIN_MACHINE_NAME( __availableToolchainMachines "${ANDROID_STANDALONE_TOOLCHAIN}" ) # helper defined outside this chunk
+ if( NOT __availableToolchainMachines )
+ message( FATAL_ERROR "Could not determine machine name of your toolchain. Probably your Android standalone toolchain is broken." )
+ endif()
+ if( __availableToolchainMachines MATCHES x86_64 ) # map the machine name onto an Android architecture name
+ set( __availableToolchainArchs "x86_64" )
+ elseif( __availableToolchainMachines MATCHES i686 )
+ set( __availableToolchainArchs "x86" )
+ elseif( __availableToolchainMachines MATCHES aarch64 )
+ set( __availableToolchainArchs "arm64" )
+ elseif( __availableToolchainMachines MATCHES arm )
+ set( __availableToolchainArchs "arm" )
+ elseif( __availableToolchainMachines MATCHES mips64el )
+ set( __availableToolchainArchs "mips64" )
+ elseif( __availableToolchainMachines MATCHES mipsel )
+ set( __availableToolchainArchs "mips" )
+ endif() # an unrecognised machine name leaves __availableToolchainArchs untouched
+ execute_process( COMMAND "${ANDROID_STANDALONE_TOOLCHAIN}/bin/${__availableToolchainMachines}-gcc${TOOL_OS_SUFFIX}" -dumpversion
+ OUTPUT_VARIABLE __availableToolchainCompilerVersions OUTPUT_STRIP_TRAILING_WHITESPACE ) # ask the toolchain's gcc for its version
+ string( REGEX MATCH "[0-9]+[.][0-9]+([.][0-9]+)?" __availableToolchainCompilerVersions "${__availableToolchainCompilerVersions}" ) # keep only the numeric X.Y[.Z] part
+ if( EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/bin/clang${TOOL_OS_SUFFIX}" ) # a bundled clang is offered as a second toolchain entry
+ list( APPEND __availableToolchains "standalone-clang" )
+ list( APPEND __availableToolchainMachines ${__availableToolchainMachines} ) # repeat the existing entry so the parallel lists keep the same length
+ list( APPEND __availableToolchainArchs ${__availableToolchainArchs} )
+ list( APPEND __availableToolchainCompilerVersions ${__availableToolchainCompilerVersions} )
+ endif()
+endif()
+# __GLOB_NDK_TOOLCHAINS( <out-list-var> <in-list-var> <subpath> ): for each toolchain name in <in-list-var> whose machine name and architecture can be determined, append the name to <out-list-var> and its machine/arch/version to the parallel __availableToolchain* lists
+macro( __GLOB_NDK_TOOLCHAINS __availableToolchainsVar __availableToolchainsLst __toolchain_subpath )
+ foreach( __toolchain ${${__availableToolchainsLst}} )
+ if( "${__toolchain}" MATCHES "-clang3[.][0-9]$" AND NOT EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/${__toolchain}${__toolchain_subpath}" ) # clang entry without its own subpath directory: find the gcc toolchain it pairs with
+ SET( __toolchainVersionRegex "^TOOLCHAIN_VERSION[\t ]+:=[\t ]+(.*)$" )
+ FILE( STRINGS "${ANDROID_NDK_TOOLCHAINS_PATH}/${__toolchain}/setup.mk" __toolchainVersionStr REGEX "${__toolchainVersionRegex}" ) # read the TOOLCHAIN_VERSION line from the toolchain's setup.mk
+ if( __toolchainVersionStr )
+ string( REGEX REPLACE "${__toolchainVersionRegex}" "\\1" __toolchainVersionStr "${__toolchainVersionStr}" )
+ string( REGEX REPLACE "-clang3[.][0-9]$" "-${__toolchainVersionStr}" __gcc_toolchain "${__toolchain}" )
+ else()
+ string( REGEX REPLACE "-clang3[.][0-9]$" "-4.6" __gcc_toolchain "${__toolchain}" ) # no TOOLCHAIN_VERSION found: fall back to the 4.6 gcc toolchain name
+ endif()
+ unset( __toolchainVersionStr )
+ unset( __toolchainVersionRegex )
+ else()
+ set( __gcc_toolchain "${__toolchain}" )
+ endif()
+ __DETECT_TOOLCHAIN_MACHINE_NAME( __machine "${ANDROID_NDK_TOOLCHAINS_PATH}/${__gcc_toolchain}${__toolchain_subpath}" ) # helper defined outside this chunk
+ if( __machine )
+ string( REGEX MATCH "[0-9]+[.][0-9]+([.][0-9x]+)?$" __version "${__gcc_toolchain}" ) # version is the numeric suffix of the gcc toolchain name (last component may contain "x")
+ if( __machine MATCHES x86_64 )
+ set( __arch "x86_64" )
+ elseif( __machine MATCHES i686 )
+ set( __arch "x86" )
+ elseif( __machine MATCHES aarch64 )
+ set( __arch "arm64" )
+ elseif( __machine MATCHES arm )
+ set( __arch "arm" )
+ elseif( __machine MATCHES mips64el )
+ set( __arch "mips64" )
+ elseif( __machine MATCHES mipsel )
+ set( __arch "mips" )
+ else()
+ set( __arch "" )
+ endif()
+ #message("machine: !${__machine}!\narch: !${__arch}!\nversion: !${__version}!\ntoolchain: !${__toolchain}!\n")
+ if (__arch) # toolchains with an unrecognised machine name are skipped
+ list( APPEND __availableToolchainMachines "${__machine}" )
+ list( APPEND __availableToolchainArchs "${__arch}" )
+ list( APPEND __availableToolchainCompilerVersions "${__version}" )
+ list( APPEND ${__availableToolchainsVar} "${__toolchain}" )
+ endif()
+ endif()
+ unset( __gcc_toolchain )
+ endforeach()
+endmacro()
+
+# get all the details about NDK: supported API levels and the usable toolchains
+if( BUILD_WITH_ANDROID_NDK )
+ file( GLOB ANDROID_SUPPORTED_NATIVE_API_LEVELS RELATIVE "${ANDROID_NDK}/platforms" "${ANDROID_NDK}/platforms/android-*" )
+ string( REPLACE "android-" "" ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_SUPPORTED_NATIVE_API_LEVELS}" ) # "android-N" directory names -> "N"
+ set( __availableToolchains "" )
+ set( __availableToolchainMachines "" )
+ set( __availableToolchainArchs "" )
+ set( __availableToolchainCompilerVersions "" )
+ if( ANDROID_TOOLCHAIN_NAME AND EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_TOOLCHAIN_NAME}/" )
+ # do not go through all toolchains if we know the name
+ set( __availableToolchainsLst "${ANDROID_TOOLCHAIN_NAME}" )
+ __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
+ if( NOT __availableToolchains AND NOT ANDROID_NDK_TOOLCHAINS_SUBPATH STREQUAL ANDROID_NDK_TOOLCHAINS_SUBPATH2 ) # nothing found: retry with the alternative host subpath
+ __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH2}" )
+ if( __availableToolchains )
+ set( ANDROID_NDK_TOOLCHAINS_SUBPATH ${ANDROID_NDK_TOOLCHAINS_SUBPATH2} ) # from now on use the subpath that worked
+ endif()
+ endif()
+ endif()
+ if( NOT __availableToolchains ) # no name given, or the named toolchain was unusable: scan every toolchain directory
+ file( GLOB __availableToolchainsLst RELATIVE "${ANDROID_NDK_TOOLCHAINS_PATH}" "${ANDROID_NDK_TOOLCHAINS_PATH}/*" )
+ if( __availableToolchainsLst )
+ list(SORT __availableToolchainsLst) # we need clang to go after gcc
+ endif()
+ __LIST_FILTER( __availableToolchainsLst "^[.]" ) # helper defined outside this chunk; presumably drops entries matching the regex - TODO confirm
+ __LIST_FILTER( __availableToolchainsLst "llvm" )
+ __LIST_FILTER( __availableToolchainsLst "renderscript" )
+ __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
+ if( NOT __availableToolchains AND NOT ANDROID_NDK_TOOLCHAINS_SUBPATH STREQUAL ANDROID_NDK_TOOLCHAINS_SUBPATH2 ) # same fallback to the alternative host subpath as above
+ __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH2}" )
+ if( __availableToolchains )
+ set( ANDROID_NDK_TOOLCHAINS_SUBPATH ${ANDROID_NDK_TOOLCHAINS_SUBPATH2} )
+ endif()
+ endif()
+ endif()
+ if( NOT __availableToolchains )
+ message( FATAL_ERROR "Could not find any working toolchain in the NDK. Probably your Android NDK is broken." )
+ endif()
+endif()
+
+# build list of available ABIs from the architectures of the detected toolchains
+set( ANDROID_SUPPORTED_ABIS "" )
+set( __uniqToolchainArchNames ${__availableToolchainArchs} )
+list( REMOVE_DUPLICATES __uniqToolchainArchNames )
+list( SORT __uniqToolchainArchNames )
+foreach( __arch ${__uniqToolchainArchNames} )
+ list( APPEND ANDROID_SUPPORTED_ABIS ${ANDROID_SUPPORTED_ABIS_${__arch}} ) # per-architecture ABI lists are defined outside this chunk
+endforeach()
+unset( __uniqToolchainArchNames )
+if( NOT ANDROID_SUPPORTED_ABIS )
+ message( FATAL_ERROR "No one of known Android ABIs is supported by this cmake toolchain." )
+endif()
+
+# choose target ABI; __INIT_VARIABLE is defined outside this chunk and presumably defaults ANDROID_ABI from the supported list when it has no value - TODO confirm
+__INIT_VARIABLE( ANDROID_ABI VALUES ${ANDROID_SUPPORTED_ABIS} )
+# verify that target ABI is supported
+list( FIND ANDROID_SUPPORTED_ABIS "${ANDROID_ABI}" __androidAbiIdx )
+if( __androidAbiIdx EQUAL -1 )
+ string( REPLACE ";" "\", \"" PRINTABLE_ANDROID_SUPPORTED_ABIS "${ANDROID_SUPPORTED_ABIS}" ) # turn the list into a quoted, comma-separated string for the message
+ message( FATAL_ERROR "Specified ANDROID_ABI = \"${ANDROID_ABI}\" is not supported by this cmake toolchain or your NDK/toolchain.
+ Supported values are: \"${PRINTABLE_ANDROID_SUPPORTED_ABIS}\"
+ " )
+endif()
+unset( __androidAbiIdx )
+
+# set target ABI options: per-ABI flag variables, NDK ABI name, architecture name, LLVM triple and CMAKE_SYSTEM_PROCESSOR
+if( ANDROID_ABI STREQUAL "x86" )
+ set( X86 true )
+ set( ANDROID_NDK_ABI_NAME "x86" )
+ set( ANDROID_ARCH_NAME "x86" )
+ set( ANDROID_LLVM_TRIPLE "i686-none-linux-android" )
+ set( CMAKE_SYSTEM_PROCESSOR "i686" )
+elseif( ANDROID_ABI STREQUAL "x86_64" )
+ set( X86 true ) # x86_64 also sets the generic X86 flag
+ set( X86_64 true )
+ set( ANDROID_NDK_ABI_NAME "x86_64" )
+ set( ANDROID_ARCH_NAME "x86_64" )
+ set( CMAKE_SYSTEM_PROCESSOR "x86_64" )
+ set( ANDROID_LLVM_TRIPLE "x86_64-none-linux-android" )
+elseif( ANDROID_ABI STREQUAL "mips64" )
+ set( MIPS64 true )
+ set( ANDROID_NDK_ABI_NAME "mips64" )
+ set( ANDROID_ARCH_NAME "mips64" )
+ set( ANDROID_LLVM_TRIPLE "mips64el-none-linux-android" )
+ set( CMAKE_SYSTEM_PROCESSOR "mips64" )
+elseif( ANDROID_ABI STREQUAL "mips" )
+ set( MIPS true )
+ set( ANDROID_NDK_ABI_NAME "mips" )
+ set( ANDROID_ARCH_NAME "mips" )
+ set( ANDROID_LLVM_TRIPLE "mipsel-none-linux-android" )
+ set( CMAKE_SYSTEM_PROCESSOR "mips" )
+elseif( ANDROID_ABI STREQUAL "arm64-v8a" )
+ set( ARM64_V8A true )
+ set( ANDROID_NDK_ABI_NAME "arm64-v8a" )
+ set( ANDROID_ARCH_NAME "arm64" )
+ set( ANDROID_LLVM_TRIPLE "aarch64-none-linux-android" )
+ set( CMAKE_SYSTEM_PROCESSOR "aarch64" )
+ set( VFPV3 true )
+ set( NEON true )
+elseif( ANDROID_ABI STREQUAL "armeabi" )
+ set( ARMEABI true )
+ set( ANDROID_NDK_ABI_NAME "armeabi" )
+ set( ANDROID_ARCH_NAME "arm" )
+ set( ANDROID_LLVM_TRIPLE "armv5te-none-linux-androideabi" )
+ set( CMAKE_SYSTEM_PROCESSOR "armv5te" )
+elseif( ANDROID_ABI STREQUAL "armeabi-v6 with VFP" )
+ set( ARMEABI_V6 true )
+ set( ANDROID_NDK_ABI_NAME "armeabi" ) # shares the plain armeabi NDK ABI name and armv5te triple
+ set( ANDROID_ARCH_NAME "arm" )
+ set( ANDROID_LLVM_TRIPLE "armv5te-none-linux-androideabi" )
+ set( CMAKE_SYSTEM_PROCESSOR "armv6" )
+ # always fall back to the older platform: also flag this target as ARMEABI
+ set( ARMEABI true )
+elseif( ANDROID_ABI STREQUAL "armeabi-v7a")
+ set( ARMEABI_V7A true )
+ set( ANDROID_NDK_ABI_NAME "armeabi-v7a" )
+ set( ANDROID_ARCH_NAME "arm" )
+ set( ANDROID_LLVM_TRIPLE "armv7-none-linux-androideabi" )
+ set( CMAKE_SYSTEM_PROCESSOR "armv7-a" )
+elseif( ANDROID_ABI STREQUAL "armeabi-v7a with VFPV3" )
+ set( ARMEABI_V7A true )
+ set( ANDROID_NDK_ABI_NAME "armeabi-v7a" )
+ set( ANDROID_ARCH_NAME "arm" )
+ set( ANDROID_LLVM_TRIPLE "armv7-none-linux-androideabi" )
+ set( CMAKE_SYSTEM_PROCESSOR "armv7-a" )
+ set( VFPV3 true )
+elseif( ANDROID_ABI STREQUAL "armeabi-v7a with NEON" )
+ set( ARMEABI_V7A true )
+ set( ANDROID_NDK_ABI_NAME "armeabi-v7a" )
+ set( ANDROID_ARCH_NAME "arm" )
+ set( ANDROID_LLVM_TRIPLE "armv7-none-linux-androideabi" )
+ set( CMAKE_SYSTEM_PROCESSOR "armv7-a" )
+ set( VFPV3 true )
+ set( NEON true )
+elseif( ANDROID_ABI STREQUAL "armeabi-v7a-hard with NEON" )
+ set( ARMEABI_V7A_HARD true ) # note: ARMEABI_V7A itself is not set for the hard-float ABI
+ set( ANDROID_NDK_ABI_NAME "armeabi-v7a-hard" )
+ set( ANDROID_ARCH_NAME "arm" )
+ set( ANDROID_LLVM_TRIPLE "armv7-none-linux-androideabi" )
+ set( CMAKE_SYSTEM_PROCESSOR "armv7-a" )
+ set( VFPV3 true )
+ set( NEON true )
+else()
+ message( SEND_ERROR "Unknown ANDROID_ABI=\"${ANDROID_ABI}\" is specified." ) # SEND_ERROR: processing continues but generation is skipped
+endif()
+
+if( CMAKE_BINARY_DIR AND EXISTS "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeSystem.cmake" ) # only when a previous run already generated CMakeSystem.cmake
+ # really dirty hack
+ # it is not possible to change CMAKE_SYSTEM_PROCESSOR after the first run...
+ file( APPEND "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeSystem.cmake" "SET(CMAKE_SYSTEM_PROCESSOR \"${CMAKE_SYSTEM_PROCESSOR}\")\n" ) # ...so a SET with the current value is appended to that file
+endif()
+
+if( ANDROID_ARCH_NAME STREQUAL "arm" AND NOT ARMEABI_V6 )
+ __INIT_VARIABLE( ANDROID_FORCE_ARM_BUILD VALUES OFF ) # helper defined outside this chunk; presumably defaults the option to OFF when it has no value - TODO confirm
+ set( ANDROID_FORCE_ARM_BUILD ${ANDROID_FORCE_ARM_BUILD} CACHE BOOL "Use 32-bit ARM instructions instead of Thumb-1" FORCE )
+ mark_as_advanced( ANDROID_FORCE_ARM_BUILD )
+else()
+ unset( ANDROID_FORCE_ARM_BUILD CACHE ) # the option is removed from the cache for non-ARM targets and for armeabi-v6
+endif()
+
+# choose toolchain: validate the requested one, or pick the newest toolchain that targets ANDROID_ARCH_NAME
+if( ANDROID_TOOLCHAIN_NAME )
+ list( FIND __availableToolchains "${ANDROID_TOOLCHAIN_NAME}" __toolchainIdx )
+ if( __toolchainIdx EQUAL -1 )
+ list( SORT __availableToolchains )
+ string( REPLACE ";" "\n * " toolchains_list "${__availableToolchains}" ) # format the list as one bullet per line for the message
+ set( toolchains_list " * ${toolchains_list}")
+ message( FATAL_ERROR "Specified toolchain \"${ANDROID_TOOLCHAIN_NAME}\" is missing in your NDK or broken. Please verify that your NDK is working or select another compiler toolchain.
+To configure the toolchain set CMake variable ANDROID_TOOLCHAIN_NAME to one of the following values:\n${toolchains_list}\n" )
+ endif()
+ list( GET __availableToolchainArchs ${__toolchainIdx} __toolchainArch ) # the parallel lists share indices with __availableToolchains
+ if( NOT __toolchainArch STREQUAL ANDROID_ARCH_NAME )
+ message( SEND_ERROR "Selected toolchain \"${ANDROID_TOOLCHAIN_NAME}\" is not able to compile binaries for the \"${ANDROID_ARCH_NAME}\" platform." )
+ endif()
+else()
+ set( __toolchainIdx -1 )
+ set( __applicableToolchains "" )
+ set( __toolchainMaxVersion "0.0.0" )
+ list( LENGTH __availableToolchains __availableToolchainsCount )
+ math( EXPR __availableToolchainsCount "${__availableToolchainsCount}-1" ) # turn the count into the last valid index
+ foreach( __idx RANGE ${__availableToolchainsCount} ) # RANGE iterates 0..last index inclusive
+ list( GET __availableToolchainArchs ${__idx} __toolchainArch )
+ if( __toolchainArch STREQUAL ANDROID_ARCH_NAME )
+ list( GET __availableToolchainCompilerVersions ${__idx} __toolchainVersion )
+ string( REPLACE "x" "99" __toolchainVersion "${__toolchainVersion}") # make a version component written as "x" comparable as a number
+ if( __toolchainVersion VERSION_GREATER __toolchainMaxVersion ) # strictly greater: on equal versions the earlier entry wins
+ set( __toolchainMaxVersion "${__toolchainVersion}" )
+ set( __toolchainIdx ${__idx} )
+ endif()
+ endif()
+ endforeach()
+ unset( __availableToolchainsCount )
+ unset( __toolchainMaxVersion )
+ unset( __toolchainVersion )
+endif()
+unset( __toolchainArch )
+if( __toolchainIdx EQUAL -1 )
+ message( FATAL_ERROR "No one of available compiler toolchains is able to compile for ${ANDROID_ARCH_NAME} platform." )
+endif()
+list( GET __availableToolchains ${__toolchainIdx} ANDROID_TOOLCHAIN_NAME ) # publish the selected toolchain's name, machine name and compiler version
+list( GET __availableToolchainMachines ${__toolchainIdx} ANDROID_TOOLCHAIN_MACHINE_NAME )
+list( GET __availableToolchainCompilerVersions ${__toolchainIdx} ANDROID_COMPILER_VERSION )
+
+unset( __toolchainIdx )
+unset( __availableToolchains )
+unset( __availableToolchainMachines )
+unset( __availableToolchainArchs )
+unset( __availableToolchainCompilerVersions )
+
+# choose native API level; __INIT_VARIABLE is defined outside this chunk and presumably takes the first listed candidate that has a value - TODO confirm
+__INIT_VARIABLE( ANDROID_NATIVE_API_LEVEL ENV_ANDROID_NATIVE_API_LEVEL ANDROID_API_LEVEL ENV_ANDROID_API_LEVEL ANDROID_STANDALONE_TOOLCHAIN_API_LEVEL ANDROID_DEFAULT_NDK_API_LEVEL_${ANDROID_ARCH_NAME} ANDROID_DEFAULT_NDK_API_LEVEL )
+string( REPLACE "android-" "" ANDROID_NATIVE_API_LEVEL "${ANDROID_NATIVE_API_LEVEL}" ) # accept both "android-N" and "N"
+string( STRIP "${ANDROID_NATIVE_API_LEVEL}" ANDROID_NATIVE_API_LEVEL )
+# adjust API level: use the highest supported level that is not above the requested one and not below the architecture default
+set( __real_api_level ${ANDROID_DEFAULT_NDK_API_LEVEL_${ANDROID_ARCH_NAME}} )
+foreach( __level ${ANDROID_SUPPORTED_NATIVE_API_LEVELS} )
+ if( (__level LESS ANDROID_NATIVE_API_LEVEL OR __level STREQUAL ANDROID_NATIVE_API_LEVEL) AND NOT __level LESS __real_api_level )
+ set( __real_api_level ${__level} )
+ endif()
+endforeach()
+if( __real_api_level AND NOT ANDROID_NATIVE_API_LEVEL STREQUAL __real_api_level )
+ message( STATUS "Adjusting Android API level 'android-${ANDROID_NATIVE_API_LEVEL}' to 'android-${__real_api_level}'")
+ set( ANDROID_NATIVE_API_LEVEL ${__real_api_level} )
+endif()
+unset(__real_api_level)
+# validate the (possibly adjusted) level against the supported list
+list( FIND ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_NATIVE_API_LEVEL}" __levelIdx )
+if( __levelIdx EQUAL -1 )
+ message( SEND_ERROR "Specified Android native API level 'android-${ANDROID_NATIVE_API_LEVEL}' is not supported by your NDK/toolchain." )
+else()
+ if( BUILD_WITH_ANDROID_NDK )
+ __DETECT_NATIVE_API_LEVEL( __realApiLevel "${ANDROID_NDK}/platforms/android-${ANDROID_NATIVE_API_LEVEL}/arch-${ANDROID_ARCH_NAME}/usr/include/android/api-level.h" ) # cross-check against the level recorded in the platform's own header
+ if( NOT __realApiLevel EQUAL ANDROID_NATIVE_API_LEVEL AND NOT __realApiLevel GREATER 9000 ) # NOTE(review): header levels above 9000 are accepted without matching - presumably placeholder values; confirm
+ message( SEND_ERROR "Specified Android API level (${ANDROID_NATIVE_API_LEVEL}) does not match to the level found (${__realApiLevel}). Probably your copy of NDK is broken." )
+ endif()
+ unset( __realApiLevel )
+ endif()
+ set( ANDROID_NATIVE_API_LEVEL "${ANDROID_NATIVE_API_LEVEL}" CACHE STRING "Android API level for native code" FORCE )
+ set( CMAKE_ANDROID_API ${ANDROID_NATIVE_API_LEVEL} )
+ if( CMAKE_VERSION VERSION_GREATER "2.8" )
+ list( SORT ANDROID_SUPPORTED_NATIVE_API_LEVELS )
+ set_property( CACHE ANDROID_NATIVE_API_LEVEL PROPERTY STRINGS ${ANDROID_SUPPORTED_NATIVE_API_LEVELS} ) # offer the supported levels as the selectable values of the cache entry
+ endif()
+endif()
+unset( __levelIdx )
+
+
+# remember target ABI in the cache (FORCE overwrites any previously cached value)
+set( ANDROID_ABI "${ANDROID_ABI}" CACHE STRING "The target ABI for Android. If arm, then armeabi-v7a is recommended for hardware floating point." FORCE )
+if( CMAKE_VERSION VERSION_GREATER "2.8" )
+ list( SORT ANDROID_SUPPORTED_ABIS_${ANDROID_ARCH_NAME} )
+ set_property( CACHE ANDROID_ABI PROPERTY STRINGS ${ANDROID_SUPPORTED_ABIS_${ANDROID_ARCH_NAME}} ) # selectable values: the ABIs of the chosen architecture only
+endif()
+
+
+# runtime choice (STL, rtti, exceptions): default the C++ runtime and reject unknown ANDROID_STL values
+if( NOT ANDROID_STL )
+ set( ANDROID_STL gnustl_static ) # default runtime when none was requested
+endif()
+set( ANDROID_STL "${ANDROID_STL}" CACHE STRING "C++ runtime" )
+set( ANDROID_STL_FORCE_FEATURES ON CACHE BOOL "automatically configure rtti and exceptions support based on C++ runtime" )
+mark_as_advanced( ANDROID_STL ANDROID_STL_FORCE_FEATURES )
+
+if( BUILD_WITH_ANDROID_NDK ) # the NDK accepts the full set of runtimes
+ if( NOT "${ANDROID_STL}" MATCHES "^(none|system|system_re|gabi\\+\\+_static|gabi\\+\\+_shared|stlport_static|stlport_shared|gnustl_static|gnustl_shared|c\\+\\+_static|c\\+\\+_shared)$")
+ message( FATAL_ERROR "ANDROID_STL is set to invalid value \"${ANDROID_STL}\".
+The possible values are:
+ none -> Do not configure the runtime.
+ system -> Use the default minimal system C++ runtime library.
+ system_re -> Same as system but with rtti and exceptions.
+ gabi++_static -> Use the GAbi++ runtime as a static library.
+ gabi++_shared -> Use the GAbi++ runtime as a shared library.
+ stlport_static -> Use the STLport runtime as a static library.
+ stlport_shared -> Use the STLport runtime as a shared library.
+ gnustl_static -> (default) Use the GNU STL as a static library.
+ gnustl_shared -> Use the GNU STL as a shared library.
+ c++_shared -> Use the LLVM libc++ runtime as a shared library.
+ c++_static -> Use the LLVM libc++ runtime as a static library.
+" )
+ endif()
+elseif( BUILD_WITH_STANDALONE_TOOLCHAIN ) # a standalone toolchain accepts only none, gnustl_* and c++_*
+ if( NOT "${ANDROID_STL}" MATCHES "^(none|gnustl_static|gnustl_shared|c\\+\\+_static|c\\+\\+_shared)$")
+ message( FATAL_ERROR "ANDROID_STL is set to invalid value \"${ANDROID_STL}\".
+The possible values are:
+ none -> Do not configure the runtime.
+ gnustl_static -> (default) Use the GNU STL as a static library.
+ gnustl_shared -> Use the GNU STL as a shared library.
+ c++_shared -> Use the LLVM libc++ runtime as a shared library.
+ c++_static -> Use the LLVM libc++ runtime as a static library.
+" )
+ endif()
+endif()
+
+unset( ANDROID_RTTI ) # clear the STL-related variables that the setup sections below fill in
+unset( ANDROID_EXCEPTIONS )
+unset( ANDROID_STL_INCLUDE_DIRS )
+unset( __libstl )
+unset( __libsupcxx )
+
+if( NOT _CMAKE_IN_TRY_COMPILE AND ANDROID_NDK_RELEASE STREQUAL "r7b" AND ARMEABI_V7A AND NOT VFPV3 AND ANDROID_STL MATCHES "gnustl" ) # warn once (not inside try_compile) about the NDK r7b gnustl binaries
+ message( WARNING "The GNU STL armeabi-v7a binaries from NDK r7b can crash non-NEON devices. The files provided with NDK r7b were not configured properly, resulting in crashes on Tegra2-based devices and others when trying to use certain floating-point functions (e.g., cosf, sinf, expf).
+You are strongly recommended to switch to another NDK release.
+" )
+endif()
+
+if( NOT _CMAKE_IN_TRY_COMPILE AND X86 AND ANDROID_STL MATCHES "gnustl" AND ANDROID_NDK_RELEASE STREQUAL "r6" ) # warn about the NDK r6 x86 ptrdiff_t header defect and show the patch
+ message( WARNING "The x86 system header file from NDK r6 has incorrect definition for ptrdiff_t. You are recommended to upgrade to a newer NDK release or manually patch the header:
+See https://android.googlesource.com/platform/development.git f907f4f9d4e56ccc8093df6fee54454b8bcab6c2
+ diff --git a/ndk/platforms/android-9/arch-x86/include/machine/_types.h b/ndk/platforms/android-9/arch-x86/include/machine/_types.h
+ index 5e28c64..65892a1 100644
+ --- a/ndk/platforms/android-9/arch-x86/include/machine/_types.h
+ +++ b/ndk/platforms/android-9/arch-x86/include/machine/_types.h
+ @@ -51,7 +51,11 @@ typedef long int ssize_t;
+ #endif
+ #ifndef _PTRDIFF_T
+ #define _PTRDIFF_T
+ -typedef long ptrdiff_t;
+ +# ifdef __ANDROID__
+ + typedef int ptrdiff_t;
+ +# else
+ + typedef long ptrdiff_t;
+ +# endif
+ #endif
+" )
+endif()
+
+
+# setup paths and STL for standalone toolchain: toolchain roots, sysroot, STL include dirs and the STL/libsupc++ libraries
+if( BUILD_WITH_STANDALONE_TOOLCHAIN )
+ set( ANDROID_TOOLCHAIN_ROOT "${ANDROID_STANDALONE_TOOLCHAIN}" )
+ set( ANDROID_CLANG_TOOLCHAIN_ROOT "${ANDROID_STANDALONE_TOOLCHAIN}" )
+ set( ANDROID_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot" )
+
+ if( NOT ANDROID_STL STREQUAL "none" )
+ set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_STANDALONE_TOOLCHAIN}/include/c++/${ANDROID_COMPILER_VERSION}" )
+ if( NOT EXISTS "${ANDROID_STL_INCLUDE_DIRS}" )
+ # old location ( pre r8c )
+ set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/include/c++/${ANDROID_COMPILER_VERSION}" )
+ endif()
+ if( (ARMEABI_V7A OR ARMEABI_V7A_HARD) AND EXISTS "${ANDROID_STL_INCLUDE_DIRS}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/${CMAKE_SYSTEM_PROCESSOR}/bits" ) # add the most specific machine-dependent header dir that exists
+ list( APPEND ANDROID_STL_INCLUDE_DIRS "${ANDROID_STL_INCLUDE_DIRS}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/${CMAKE_SYSTEM_PROCESSOR}" )
+ elseif( ARMEABI AND NOT ANDROID_FORCE_ARM_BUILD AND EXISTS "${ANDROID_STL_INCLUDE_DIRS}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/thumb/bits" )
+ list( APPEND ANDROID_STL_INCLUDE_DIRS "${ANDROID_STL_INCLUDE_DIRS}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/thumb" )
+ else()
+ list( APPEND ANDROID_STL_INCLUDE_DIRS "${ANDROID_STL_INCLUDE_DIRS}/${ANDROID_TOOLCHAIN_MACHINE_NAME}" )
+ endif()
+ # always search static GNU STL to get the location of libsupc++.a (most specific library dir first)
+ if( (ARMEABI_V7A OR ARMEABI_V7A_HARD) AND NOT ANDROID_FORCE_ARM_BUILD AND EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/thumb/libstdc++.a" )
+ set( __libstl "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/thumb" )
+ elseif( (ARMEABI_V7A OR ARMEABI_V7A_HARD) AND EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libstdc++.a" )
+ set( __libstl "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}" )
+ elseif( ARMEABI AND NOT ANDROID_FORCE_ARM_BUILD AND EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/thumb/libstdc++.a" )
+ set( __libstl "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/thumb" )
+ elseif( EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libstdc++.a" )
+ set( __libstl "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib" )
+ endif()
+ if( __libstl ) # __libstl holds a directory at this point; turn it into the two library paths
+ set( __libsupcxx "${__libstl}/libsupc++.a" )
+ set( __libstl "${__libstl}/libstdc++.a" )
+ endif()
+ if( NOT EXISTS "${__libsupcxx}" )
+ message( FATAL_ERROR "The required libsupc++.a is missing in your standalone toolchain.
+ Usually it happens because of bug in make-standalone-toolchain.sh script from NDK r7, r7b and r7c.
+ You need to either upgrade to newer NDK or manually copy
+ $ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a
+ to
+ ${__libsupcxx}
+ " )
+ endif()
+ if( ANDROID_STL STREQUAL "gnustl_shared" ) # shared runtime requested: point __libstl at libgnustl_shared.so when one is found
+ if( (ARMEABI_V7A OR ARMEABI_V7A_HARD) AND EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libgnustl_shared.so" )
+ set( __libstl "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libgnustl_shared.so" )
+ elseif( ARMEABI AND NOT ANDROID_FORCE_ARM_BUILD AND EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/thumb/libgnustl_shared.so" )
+ set( __libstl "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/thumb/libgnustl_shared.so" )
+ elseif( EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libgnustl_shared.so" )
+ set( __libstl "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libgnustl_shared.so" )
+ endif()
+ endif()
+ endif()
+endif()
+
+# clang: decide whether the selected toolchain is clang-based and derive the matching gcc toolchain name
+if( "${ANDROID_TOOLCHAIN_NAME}" STREQUAL "standalone-clang" )
+ set( ANDROID_COMPILER_IS_CLANG 1 )
+ execute_process( COMMAND "${ANDROID_CLANG_TOOLCHAIN_ROOT}/bin/clang${TOOL_OS_SUFFIX}" --version OUTPUT_VARIABLE ANDROID_CLANG_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE )
+ string( REGEX MATCH "[0-9]+[.][0-9]+" ANDROID_CLANG_VERSION "${ANDROID_CLANG_VERSION}") # keep only the first X.Y found in the --version output
+elseif( "${ANDROID_TOOLCHAIN_NAME}" MATCHES "-clang3[.][0-9]?$" ) # NOTE(review): the digit is optional here but required by the REGEX MATCH on the next line and in __GLOB_NDK_TOOLCHAINS - confirm this is intended
+ string( REGEX MATCH "3[.][0-9]$" ANDROID_CLANG_VERSION "${ANDROID_TOOLCHAIN_NAME}")
+ string( REGEX REPLACE "-clang${ANDROID_CLANG_VERSION}$" "-${ANDROID_COMPILER_VERSION}" ANDROID_GCC_TOOLCHAIN_NAME "${ANDROID_TOOLCHAIN_NAME}" ) # swap the "-clang<ver>" suffix for "-<compiler version>" to name the gcc toolchain
+ if( NOT EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/llvm-${ANDROID_CLANG_VERSION}${ANDROID_NDK_TOOLCHAINS_SUBPATH}/bin/clang${TOOL_OS_SUFFIX}" )
+ message( FATAL_ERROR "Could not find the Clang compiler driver" )
+ endif()
+ set( ANDROID_COMPILER_IS_CLANG 1 )
+ set( ANDROID_CLANG_TOOLCHAIN_ROOT "${ANDROID_NDK_TOOLCHAINS_PATH}/llvm-${ANDROID_CLANG_VERSION}${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
+else()
+ set( ANDROID_GCC_TOOLCHAIN_NAME "${ANDROID_TOOLCHAIN_NAME}" ) # plain gcc toolchain
+ unset( ANDROID_COMPILER_IS_CLANG CACHE )
+endif()
+
+string( REPLACE "." "" _clang_name "clang${ANDROID_CLANG_VERSION}" ) # versioned driver name with the dots removed, e.g. "clang3.4" -> "clang34"
+if( NOT EXISTS "${ANDROID_CLANG_TOOLCHAIN_ROOT}/bin/${_clang_name}${TOOL_OS_SUFFIX}" )
+ set( _clang_name "clang" ) # no versioned driver binary: use the plain "clang" name
+endif()
+
+
+# setup paths and STL for NDK: toolchain root, sysroot, and per-runtime rtti/exceptions flags, include dirs and library paths
+if( BUILD_WITH_ANDROID_NDK )
+ set( ANDROID_TOOLCHAIN_ROOT "${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
+ set( ANDROID_SYSROOT "${ANDROID_NDK}/platforms/android-${ANDROID_NATIVE_API_LEVEL}/arch-${ANDROID_ARCH_NAME}" )
+
+ if( ANDROID_STL STREQUAL "none" )
+ # do nothing
+ elseif( ANDROID_STL STREQUAL "system" )
+ set( ANDROID_RTTI OFF )
+ set( ANDROID_EXCEPTIONS OFF )
+ set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_NDK}/sources/cxx-stl/system/include" )
+ elseif( ANDROID_STL STREQUAL "system_re" )
+ set( ANDROID_RTTI ON )
+ set( ANDROID_EXCEPTIONS ON )
+ set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_NDK}/sources/cxx-stl/system/include" )
+ elseif( ANDROID_STL MATCHES "gabi" )
+ if( ANDROID_NDK_RELEASE_NUM LESS 7000 ) # before r7
+ message( FATAL_ERROR "gabi++ is not available in your NDK. You have to upgrade to NDK r7 or newer to use gabi++.")
+ endif()
+ set( ANDROID_RTTI ON )
+ set( ANDROID_EXCEPTIONS OFF )
+ set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_NDK}/sources/cxx-stl/gabi++/include" )
+ set( __libstl "${ANDROID_NDK}/sources/cxx-stl/gabi++/libs/${ANDROID_NDK_ABI_NAME}/libgabi++_static.a" )
+ elseif( ANDROID_STL MATCHES "stlport" )
+ if( NOT ANDROID_NDK_RELEASE_NUM LESS 8004 ) # r8d or newer
+ set( ANDROID_EXCEPTIONS ON )
+ else()
+ set( ANDROID_EXCEPTIONS OFF )
+ endif()
+ if( ANDROID_NDK_RELEASE_NUM LESS 7000 ) # before r7
+ set( ANDROID_RTTI OFF )
+ else()
+ set( ANDROID_RTTI ON )
+ endif()
+ set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_NDK}/sources/cxx-stl/stlport/stlport" )
+ set( __libstl "${ANDROID_NDK}/sources/cxx-stl/stlport/libs/${ANDROID_NDK_ABI_NAME}/libstlport_static.a" )
+ elseif( ANDROID_STL MATCHES "gnustl" )
+ set( ANDROID_EXCEPTIONS ON )
+ set( ANDROID_RTTI ON )
+ if( EXISTS "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}" ) # prefer the per-compiler-version gnustl directory when the NDK has one
+ if( ARMEABI_V7A AND ANDROID_COMPILER_VERSION VERSION_EQUAL "4.7" AND ANDROID_NDK_RELEASE STREQUAL "r8d" )
+ # gnustl binary for 4.7 compiler is buggy :(
+ # TODO: look for right fix
+ set( __libstl "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/4.6" )
+ else()
+ set( __libstl "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}" )
+ endif()
+ else()
+ set( __libstl "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++" )
+ endif()
+ set( ANDROID_STL_INCLUDE_DIRS "${__libstl}/include" "${__libstl}/libs/${ANDROID_NDK_ABI_NAME}/include" "${__libstl}/include/backward" ) # __libstl is still the gnustl root directory here
+ if( EXISTS "${__libstl}/libs/${ANDROID_NDK_ABI_NAME}/libgnustl_static.a" )
+ set( __libstl "${__libstl}/libs/${ANDROID_NDK_ABI_NAME}/libgnustl_static.a" )
+ else()
+ set( __libstl "${__libstl}/libs/${ANDROID_NDK_ABI_NAME}/libstdc++.a" )
+ endif()
+ elseif( ANDROID_STL MATCHES "c\\+\\+" )
+ set( ANDROID_EXCEPTIONS ON )
+ set( ANDROID_RTTI ON )
+ set( __libstl "${ANDROID_NDK}/sources/cxx-stl/llvm-libc++" )
+ set( __libstl "${__libstl}/libs/${ANDROID_NDK_ABI_NAME}/libc++_static.a" )
+ set( __libgnustl "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}" ) # include dirs for libc++ are taken from the gnustl tree
+ set( ANDROID_STL_INCLUDE_DIRS "${__libgnustl}/include" "${__libgnustl}/libs/${ANDROID_NDK_ABI_NAME}/include" "${__libgnustl}/include/backward" )
+ else()
+ message( FATAL_ERROR "Unknown runtime: ${ANDROID_STL}" )
+ endif()
+
+ # find libsupc++.a - rtti & exceptions; candidate locations are tried from the newest NDK layout to the oldest
+ if( ANDROID_STL STREQUAL "system_re" OR ANDROID_STL MATCHES "gnustl" )
+ set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" ) # r8b or newer
+ if( NOT EXISTS "${__libsupcxx}" )
+ set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" ) # r7-r8
+ endif()
+ if( NOT EXISTS "${__libsupcxx}" ) # before r7
+ if( ARMEABI_V7A )
+ if( ANDROID_FORCE_ARM_BUILD )
+ set( __libsupcxx "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libsupc++.a" )
+ else()
+ set( __libsupcxx "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/thumb/libsupc++.a" )
+ endif()
+ elseif( ARMEABI AND NOT ANDROID_FORCE_ARM_BUILD )
+ set( __libsupcxx "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/thumb/libsupc++.a" )
+ else()
+ set( __libsupcxx "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libsupc++.a" )
+ endif()
+ endif()
+ if( NOT EXISTS "${__libsupcxx}")
+ message( SEND_ERROR "Could not find libsupc++.a for a chosen platform. Either your NDK is not supported or is broken.") # SEND_ERROR is a real message() mode; a bare ERROR is printed as ordinary text and reports no error
+ endif()
+ endif()
+endif()
+
+
+# case of shared STL linkage: switch the static library path chosen above to its "_shared.so" counterpart
+if( ANDROID_STL MATCHES "shared" AND DEFINED __libstl )
+ string( REPLACE "_static.a" "_shared.so" __libstl "${__libstl}" ) # paths without "_static.a" are left unchanged
+ if( NOT EXISTS "${__libstl}" )
+ message( FATAL_ERROR "Unable to find shared library ${__libstl}" )
+ endif()
+endif()
+
+
+# ccache support: resolve NDK_CCACHE (CMake variable or environment; __INIT_VARIABLE is defined outside this chunk) to a program path
+__INIT_VARIABLE( _ndk_ccache NDK_CCACHE ENV_NDK_CCACHE )
+if( _ndk_ccache )
+ if( DEFINED NDK_CCACHE AND NOT EXISTS "${NDK_CCACHE}" ) # EXISTS takes a path, not a variable name, so the value must be expanded
+ unset( NDK_CCACHE CACHE ) # drop a stale or non-path cache entry so find_program searches again
+ endif()
+ find_program( NDK_CCACHE "${_ndk_ccache}" DOC "The path to ccache binary")
+else()
+ unset( NDK_CCACHE CACHE ) # ccache not requested: make sure no cached path is left behind
+endif()
+unset( _ndk_ccache )
+
+
+# setup the cross-compiler: cache the compiler and binutils paths, but only while CMAKE_C_COMPILER is not set yet
+if( NOT CMAKE_C_COMPILER )
+ if( NDK_CCACHE AND NOT ANDROID_SYSROOT MATCHES "[ ;\"]" ) # ccache is used only when the sysroot path has no space, semicolon or quote
+ set( CMAKE_C_COMPILER "${NDK_CCACHE}" CACHE PATH "ccache as C compiler" )
+ set( CMAKE_CXX_COMPILER "${NDK_CCACHE}" CACHE PATH "ccache as C++ compiler" )
+ if( ANDROID_COMPILER_IS_CLANG ) # the real compiler is handed to ccache through CMAKE_<LANG>_COMPILER_ARG1
+ set( CMAKE_C_COMPILER_ARG1 "${ANDROID_CLANG_TOOLCHAIN_ROOT}/bin/${_clang_name}${TOOL_OS_SUFFIX}" CACHE PATH "C compiler")
+ set( CMAKE_CXX_COMPILER_ARG1 "${ANDROID_CLANG_TOOLCHAIN_ROOT}/bin/${_clang_name}++${TOOL_OS_SUFFIX}" CACHE PATH "C++ compiler")
+ else()
+ set( CMAKE_C_COMPILER_ARG1 "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-gcc${TOOL_OS_SUFFIX}" CACHE PATH "C compiler")
+ set( CMAKE_CXX_COMPILER_ARG1 "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-g++${TOOL_OS_SUFFIX}" CACHE PATH "C++ compiler")
+ endif()
+ else() # no ccache: the compilers are invoked directly
+ if( ANDROID_COMPILER_IS_CLANG )
+ set( CMAKE_C_COMPILER "${ANDROID_CLANG_TOOLCHAIN_ROOT}/bin/${_clang_name}${TOOL_OS_SUFFIX}" CACHE PATH "C compiler")
+ set( CMAKE_CXX_COMPILER "${ANDROID_CLANG_TOOLCHAIN_ROOT}/bin/${_clang_name}++${TOOL_OS_SUFFIX}" CACHE PATH "C++ compiler")
+ else()
+ set( CMAKE_C_COMPILER "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-gcc${TOOL_OS_SUFFIX}" CACHE PATH "C compiler" )
+ set( CMAKE_CXX_COMPILER "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-g++${TOOL_OS_SUFFIX}" CACHE PATH "C++ compiler" )
+ endif()
+ endif()
+ set( CMAKE_ASM_COMPILER "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-gcc${TOOL_OS_SUFFIX}" CACHE PATH "assembler" ) # assembler and binutils always come from the gcc toolchain, even with clang
+ set( CMAKE_STRIP "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-strip${TOOL_OS_SUFFIX}" CACHE PATH "strip" )
+ if( EXISTS "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-gcc-ar${TOOL_OS_SUFFIX}" )
+ # Use gcc-ar if we have it for better LTO support.
+ set( CMAKE_AR "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-gcc-ar${TOOL_OS_SUFFIX}" CACHE PATH "archive" )
+ else()
+ set( CMAKE_AR "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-ar${TOOL_OS_SUFFIX}" CACHE PATH "archive" )
+ endif()
+ set( CMAKE_LINKER "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-ld${TOOL_OS_SUFFIX}" CACHE PATH "linker" )
+ set( CMAKE_NM "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-nm${TOOL_OS_SUFFIX}" CACHE PATH "nm" )
+ set( CMAKE_OBJCOPY "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-objcopy${TOOL_OS_SUFFIX}" CACHE PATH "objcopy" )
+ set( CMAKE_OBJDUMP "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-objdump${TOOL_OS_SUFFIX}" CACHE PATH "objdump" )
+ set( CMAKE_RANLIB "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-ranlib${TOOL_OS_SUFFIX}" CACHE PATH "ranlib" )
+endif()
+
+# Tool name prefix of this toolchain ("<machine-name>-"); presumably picked up
+# by CMake's own binutils lookup - confirm against the CMake version in use.
+set( _CMAKE_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_MACHINE_NAME}-" )
+# CMake older than 2.8.5 gets "-c" as the fixed first assembler argument
+if( CMAKE_VERSION VERSION_LESS 2.8.5 )
+ set( CMAKE_ASM_COMPILER_ARG1 "-c" )
+endif()
+# On Apple hosts install_name_tool is mandatory; configuration aborts without it
+if( APPLE )
+ find_program( CMAKE_INSTALL_NAME_TOOL NAMES install_name_tool )
+ if( NOT CMAKE_INSTALL_NAME_TOOL )
+ message( FATAL_ERROR "Could not find install_name_tool, please check your installation." )
+ endif()
+ mark_as_advanced( CMAKE_INSTALL_NAME_TOOL )
+endif()
+
+# Force set compilers because standard identification works badly for us
+include( CMakeForceCompiler )
+# C: declared as GNU first, then the ID is overridden when clang is in use
+CMAKE_FORCE_C_COMPILER( "${CMAKE_C_COMPILER}" GNU )
+if( ANDROID_COMPILER_IS_CLANG )
+ set( CMAKE_C_COMPILER_ID Clang )
+endif()
+set( CMAKE_C_PLATFORM_ID Linux )
+# pointer size: 8 bytes for the 64-bit ABIs, 4 bytes for everything else
+if( X86_64 OR MIPS64 OR ARM64_V8A )
+ set( CMAKE_C_SIZEOF_DATA_PTR 8 )
+else()
+ set( CMAKE_C_SIZEOF_DATA_PTR 4 )
+endif()
+set( CMAKE_C_HAS_ISYSROOT 1 )
+set( CMAKE_C_COMPILER_ABI ELF )
+# C++: same treatment, pointer size is taken over from the C settings
+CMAKE_FORCE_CXX_COMPILER( "${CMAKE_CXX_COMPILER}" GNU )
+if( ANDROID_COMPILER_IS_CLANG )
+ set( CMAKE_CXX_COMPILER_ID Clang)
+endif()
+set( CMAKE_CXX_PLATFORM_ID Linux )
+set( CMAKE_CXX_SIZEOF_DATA_PTR ${CMAKE_C_SIZEOF_DATA_PTR} )
+set( CMAKE_CXX_HAS_ISYSROOT 1 )
+set( CMAKE_CXX_COMPILER_ABI ELF )
+set( CMAKE_CXX_SOURCE_FILE_EXTENSIONS cc cp cxx cpp CPP c++ C )
+# force ASM compiler (required for CMake < 2.8.5)
+# the variables below mark the assembler as already identified (GNU) and working
+set( CMAKE_ASM_COMPILER_ID_RUN TRUE )
+set( CMAKE_ASM_COMPILER_ID GNU )
+set( CMAKE_ASM_COMPILER_WORKS TRUE )
+set( CMAKE_ASM_COMPILER_FORCED TRUE )
+set( CMAKE_COMPILER_IS_GNUASM 1)
+set( CMAKE_ASM_SOURCE_FILE_EXTENSIONS s S asm )
+
+# Publish one compiler version for C, C++ and ASM: the clang version when clang
+# is the compiler, the GCC version otherwise.
+if( ANDROID_COMPILER_IS_CLANG )
+ set( __android_compiler_version ${ANDROID_CLANG_VERSION} )
+else()
+ set( __android_compiler_version ${ANDROID_COMPILER_VERSION} )
+endif()
+foreach( lang C CXX ASM )
+ set( CMAKE_${lang}_COMPILER_VERSION ${__android_compiler_version} )
+endforeach()
+unset( __android_compiler_version )
+
+# flags and definitions
+# remove first, then add: presumably keeps -DANDROID from being listed twice
+# when this file is processed more than once
+remove_definitions( -DANDROID )
+add_definitions( -DANDROID )
+
+# Build the --sysroot flag. A sysroot path containing a space, semicolon or
+# double quote needs special treatment.
+if( ANDROID_SYSROOT MATCHES "[ ;\"]" )
+ if( CMAKE_HOST_WIN32 )
+ # try to convert path to 8.3 form
+ # (a one-line batch file echoes its first argument through the %~s1 modifier)
+ file( WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/cvt83.cmd" "@echo %~s1" )
+ execute_process( COMMAND "$ENV{ComSpec}" /c "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/cvt83.cmd" "${ANDROID_SYSROOT}"
+ OUTPUT_VARIABLE __path OUTPUT_STRIP_TRAILING_WHITESPACE
+ RESULT_VARIABLE __result ERROR_QUIET )
+ if( __result EQUAL 0 )
+ # conversion worked: ANDROID_SYSROOT itself is replaced by the converted path
+ file( TO_CMAKE_PATH "${__path}" ANDROID_SYSROOT )
+ set( ANDROID_CXX_FLAGS "--sysroot=${ANDROID_SYSROOT}" )
+ else()
+ # conversion failed: fall back to double-quoting the path
+ set( ANDROID_CXX_FLAGS "--sysroot=\"${ANDROID_SYSROOT}\"" )
+ endif()
+ else()
+ # non-Windows hosts: single-quote the whole option
+ set( ANDROID_CXX_FLAGS "'--sysroot=${ANDROID_SYSROOT}'" )
+ endif()
+ if( NOT _CMAKE_IN_TRY_COMPILE )
+ # quotes can break try_compile and compiler identification
+ message(WARNING "Path to your Android NDK (or toolchain) has non-alphanumeric symbols.\nThe build might be broken.\n")
+ endif()
+else()
+ set( ANDROID_CXX_FLAGS "--sysroot=${ANDROID_SYSROOT}" )
+endif()
+
+# NDK flags
+# Per-architecture base flags. ANDROID_CXX_FLAGS applies to every build type,
+# ANDROID_CXX_FLAGS_RELEASE / ANDROID_CXX_FLAGS_DEBUG only to the respective
+# configuration. The GCC-only options (-funswitch-loops, -finline-limit, ...)
+# are left out when clang is the compiler.
+if (ARM64_V8A )
+ set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -funwind-tables" )
+ set( ANDROID_CXX_FLAGS_RELEASE "-fomit-frame-pointer -fstrict-aliasing" )
+ set( ANDROID_CXX_FLAGS_DEBUG "-fno-omit-frame-pointer -fno-strict-aliasing" )
+ if( NOT ANDROID_COMPILER_IS_CLANG )
+ set( ANDROID_CXX_FLAGS_RELEASE "${ANDROID_CXX_FLAGS_RELEASE} -funswitch-loops -finline-limit=300" )
+ endif()
+elseif( ARMEABI OR ARMEABI_V7A OR ARMEABI_V7A_HARD)
+ set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -funwind-tables" )
+ if( NOT ANDROID_FORCE_ARM_BUILD AND NOT ARMEABI_V6 )
+ # default: thumb code for release builds, arm code for debug builds
+ set( ANDROID_CXX_FLAGS_RELEASE "-mthumb -fomit-frame-pointer -fno-strict-aliasing" )
+ set( ANDROID_CXX_FLAGS_DEBUG "-marm -fno-omit-frame-pointer -fno-strict-aliasing" )
+ if( NOT ANDROID_COMPILER_IS_CLANG )
+ set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -finline-limit=64" )
+ endif()
+ else()
+ # always compile ARMEABI_V6 in arm mode; otherwise there is no difference from ARMEABI
+ set( ANDROID_CXX_FLAGS_RELEASE "-marm -fomit-frame-pointer -fstrict-aliasing" )
+ set( ANDROID_CXX_FLAGS_DEBUG "-marm -fno-omit-frame-pointer -fno-strict-aliasing" )
+ if( NOT ANDROID_COMPILER_IS_CLANG )
+ set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -funswitch-loops -finline-limit=300" )
+ endif()
+ endif()
+elseif( X86 OR X86_64 )
+ set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -funwind-tables" )
+ if( NOT ANDROID_COMPILER_IS_CLANG )
+ set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -funswitch-loops -finline-limit=300" )
+ endif()
+ set( ANDROID_CXX_FLAGS_RELEASE "-fomit-frame-pointer -fstrict-aliasing" )
+ set( ANDROID_CXX_FLAGS_DEBUG "-fno-omit-frame-pointer -fno-strict-aliasing" )
+elseif( MIPS OR MIPS64 )
+ set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -fno-strict-aliasing -finline-functions -funwind-tables -fmessage-length=0" )
+ set( ANDROID_CXX_FLAGS_RELEASE "-fomit-frame-pointer" )
+ set( ANDROID_CXX_FLAGS_DEBUG "-fno-omit-frame-pointer" )
+ if( NOT ANDROID_COMPILER_IS_CLANG )
+ set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -fno-inline-functions-called-once -fgcse-after-reload -frerun-cse-after-loop -frename-registers" )
+ set( ANDROID_CXX_FLAGS_RELEASE "${ANDROID_CXX_FLAGS_RELEASE} -funswitch-loops -finline-limit=300" )
+ endif()
+else()
+ # unknown architecture: no build-type specific flags. This has to be else();
+ # an elseif() without a condition never runs its branch.
+ set( ANDROID_CXX_FLAGS_RELEASE "" )
+ set( ANDROID_CXX_FLAGS_DEBUG "" )
+endif()
+
+# Flags shared by all architectures
+set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -fsigned-char" ) # good/necessary when porting desktop libraries
+
+# -Wno-psabi is prepended for GCC builds on every target except X86
+if( NOT X86 AND NOT ANDROID_COMPILER_IS_CLANG )
+ set( ANDROID_CXX_FLAGS "-Wno-psabi ${ANDROID_CXX_FLAGS}" )
+endif()
+
+# only added when ANDROID_COMPILER_VERSION is 4.6 or newer
+if( NOT ANDROID_COMPILER_VERSION VERSION_LESS "4.6" )
+ set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -no-canonical-prefixes" ) # see https://android-review.googlesource.com/#/c/47564/
+endif()
+
+# ABI-specific flags
+# Both armv7-a variants share the FPU choice (NEON, else VFPv3, else
+# VFPv3-D16) and differ only in the float ABI options, so the FPU name is
+# resolved once and appended at the end of the variant's option string.
+if( ARMEABI_V7A_HARD OR ARMEABI_V7A )
+ if( NEON )
+ set( __android_fpu "neon" )
+ elseif( VFPV3 )
+ set( __android_fpu "vfpv3" )
+ else()
+ set( __android_fpu "vfpv3-d16" )
+ endif()
+ if( ARMEABI_V7A_HARD )
+ set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -march=armv7-a -mfloat-abi=hard -mhard-float -D_NDK_MATH_NO_SOFTFP=1 -mfpu=${__android_fpu}" )
+ else()
+ set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -march=armv7-a -mfloat-abi=softfp -mfpu=${__android_fpu}" )
+ endif()
+ unset( __android_fpu )
+elseif( ARMEABI_V6 )
+ # vfp == vfpv2
+ set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -march=armv6 -mfloat-abi=softfp -mfpu=vfp" )
+elseif( ARMEABI )
+ set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -march=armv5te -mtune=xscale -msoft-float" )
+endif()
+
+# C++ link rule templates.
+# The <...> tokens are CMake rule-variable placeholders that CMake substitutes
+# when it generates the link command; without them the rule is just " -o ".
+# With gnustl and an explicit STL / libsupc++ library the C compiler driver
+# performs the link (the C++ runtime is appended to the rules explicitly
+# further below); in every other case the C++ driver is used.
+if( ANDROID_STL MATCHES "gnustl" AND (EXISTS "${__libstl}" OR EXISTS "${__libsupcxx}") )
+ set( CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+ set( CMAKE_CXX_CREATE_SHARED_MODULE "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+ set( CMAKE_CXX_LINK_EXECUTABLE "<CMAKE_C_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
+else()
+ set( CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+ set( CMAKE_CXX_CREATE_SHARED_MODULE "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+ set( CMAKE_CXX_LINK_EXECUTABLE "<CMAKE_CXX_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
+endif()
+
+# STL
+# Append the STL / libsupc++ libraries (and libm for gnustl) to the end of the
+# link rules so that they are linked after the project's own libraries.
+if( EXISTS "${__libstl}" OR EXISTS "${__libsupcxx}" )
+ if( EXISTS "${__libstl}" )
+ set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} \"${__libstl}\"" )
+ set( CMAKE_CXX_CREATE_SHARED_MODULE "${CMAKE_CXX_CREATE_SHARED_MODULE} \"${__libstl}\"" )
+ set( CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} \"${__libstl}\"" )
+ endif()
+ if( EXISTS "${__libsupcxx}" )
+ set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} \"${__libsupcxx}\"" )
+ set( CMAKE_CXX_CREATE_SHARED_MODULE "${CMAKE_CXX_CREATE_SHARED_MODULE} \"${__libsupcxx}\"" )
+ set( CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} \"${__libsupcxx}\"" )
+ # C objects:
+ # the C link rules are first set to full templates (the <...> tokens are
+ # CMake rule-variable placeholders), then libsupc++ is appended to them too
+ set( CMAKE_C_CREATE_SHARED_LIBRARY "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_C_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_C_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+ set( CMAKE_C_CREATE_SHARED_MODULE "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_C_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_C_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+ set( CMAKE_C_LINK_EXECUTABLE "<CMAKE_C_COMPILER> <FLAGS> <CMAKE_C_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
+ set( CMAKE_C_CREATE_SHARED_LIBRARY "${CMAKE_C_CREATE_SHARED_LIBRARY} \"${__libsupcxx}\"" )
+ set( CMAKE_C_CREATE_SHARED_MODULE "${CMAKE_C_CREATE_SHARED_MODULE} \"${__libsupcxx}\"" )
+ set( CMAKE_C_LINK_EXECUTABLE "${CMAKE_C_LINK_EXECUTABLE} \"${__libsupcxx}\"" )
+ endif()
+ if( ANDROID_STL MATCHES "gnustl" )
+ # use the user-supplied libm when it exists, plain -lm otherwise
+ if( NOT EXISTS "${ANDROID_LIBM_PATH}" )
+ set( ANDROID_LIBM_PATH -lm )
+ endif()
+ set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} ${ANDROID_LIBM_PATH}" )
+ set( CMAKE_CXX_CREATE_SHARED_MODULE "${CMAKE_CXX_CREATE_SHARED_MODULE} ${ANDROID_LIBM_PATH}" )
+ set( CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} ${ANDROID_LIBM_PATH}" )
+ endif()
+endif()
+
+# variables controlling optional build flags
+# Each switch gets its default through __INIT_VARIABLE (defined earlier in this
+# file) and is then published as an advanced BOOL cache entry.
+if( ANDROID_NDK_RELEASE_NUM LESS 7000 ) # before r7
+ # libGLESv2.so in NDK's prior to r7 refers to missing external symbols.
+ # So this flag option is required for all projects using OpenGL from native.
+ __INIT_VARIABLE( ANDROID_SO_UNDEFINED VALUES ON )
+else()
+ __INIT_VARIABLE( ANDROID_SO_UNDEFINED VALUES OFF )
+endif()
+__INIT_VARIABLE( ANDROID_NO_UNDEFINED VALUES ON )
+__INIT_VARIABLE( ANDROID_FUNCTION_LEVEL_LINKING VALUES ON )
+__INIT_VARIABLE( ANDROID_GOLD_LINKER VALUES ON )
+__INIT_VARIABLE( ANDROID_NOEXECSTACK VALUES ON )
+__INIT_VARIABLE( ANDROID_RELRO VALUES ON )
+
+set( ANDROID_NO_UNDEFINED ${ANDROID_NO_UNDEFINED} CACHE BOOL "Show all undefined symbols as linker errors" )
+set( ANDROID_SO_UNDEFINED ${ANDROID_SO_UNDEFINED} CACHE BOOL "Allows or disallows undefined symbols in shared libraries" )
+set( ANDROID_FUNCTION_LEVEL_LINKING ${ANDROID_FUNCTION_LEVEL_LINKING} CACHE BOOL "Put each function in separate section and enable garbage collection of unused input sections at link time" )
+set( ANDROID_GOLD_LINKER ${ANDROID_GOLD_LINKER} CACHE BOOL "Enables gold linker" )
+# this description used to be a copy of the ANDROID_SO_UNDEFINED text
+set( ANDROID_NOEXECSTACK ${ANDROID_NOEXECSTACK} CACHE BOOL "Marks the stack as non-executable (noexecstack is passed to the assembler and the linker)" )
+set( ANDROID_RELRO ${ANDROID_RELRO} CACHE BOOL "Enables RELRO - a memory corruption mitigation technique" )
+mark_as_advanced( ANDROID_NO_UNDEFINED ANDROID_SO_UNDEFINED ANDROID_FUNCTION_LEVEL_LINKING ANDROID_GOLD_LINKER ANDROID_NOEXECSTACK ANDROID_RELRO )
+
+# linker flags
+# ANDROID_LINKER_FLAGS starts empty and is extended by each enabled option below
+set( ANDROID_LINKER_FLAGS "" )
+
+if( ARMEABI_V7A )
+ # this is *required* to use the following linker flags that routes around
+ # a CPU bug in some Cortex-A8 implementations:
+ set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--fix-cortex-a8" )
+endif()
+
+# hard-float ABI: link libm_hard and silence the linker's mismatch warnings
+if( ARMEABI_V7A_HARD )
+ set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--no-warn-mismatch -lm_hard" )
+endif()
+
+if( ANDROID_NO_UNDEFINED )
+ if( MIPS )
+ # there is some sysroot-related problem in mips linker...
+ # (nothing is added at all when the sysroot path contains a space,
+ # semicolon or double quote)
+ if( NOT ANDROID_SYSROOT MATCHES "[ ;\"]" )
+ set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--no-undefined -Wl,-rpath-link,${ANDROID_SYSROOT}/usr/lib" )
+ endif()
+ else()
+ set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--no-undefined" )
+ endif()
+endif()
+
+if( ANDROID_SO_UNDEFINED )
+ set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,-allow-shlib-undefined" )
+endif()
+
+# function-level linking needs both halves: separate sections at compile time
+# and section garbage collection at link time
+if( ANDROID_FUNCTION_LEVEL_LINKING )
+ set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -fdata-sections -ffunction-sections" )
+ set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--gc-sections" )
+endif()
+
+# Linker selection, applied only for GCC 4.6:
+#  - gold when requested, on a Unix host or an NDK newer than r8b, for the
+#    ARM (armeabi / armeabi-v7a / armeabi-v7a-hard) and x86 ABIs;
+#  - otherwise bfd is selected explicitly for NDKs newer than r8b;
+#  - otherwise, for armeabi on NDK r8b, only a warning is printed.
+if( ANDROID_COMPILER_VERSION VERSION_EQUAL "4.6" )
+ if( ANDROID_GOLD_LINKER AND (CMAKE_HOST_UNIX OR ANDROID_NDK_RELEASE_NUM GREATER 8002) AND (ARMEABI OR ARMEABI_V7A OR ARMEABI_V7A_HARD OR X86) )
+ set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -fuse-ld=gold" )
+ elseif( ANDROID_NDK_RELEASE_NUM GREATER 8002 ) # after r8b
+ set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -fuse-ld=bfd" )
+ elseif( ANDROID_NDK_RELEASE STREQUAL "r8b" AND ARMEABI AND NOT _CMAKE_IN_TRY_COMPILE )
+ message( WARNING "The default bfd linker from arm GCC 4.6 toolchain can fail with 'unresolvable R_ARM_THM_CALL relocation' error message. See https://code.google.com/p/android/issues/detail?id=35342
+ On Linux and OS X host platform you can workaround this problem using gold linker (default).
+ Rerun cmake with -DANDROID_GOLD_LINKER=ON option in case of problems.
+" )
+ endif()
+endif() # version 4.6
+
+# Non-executable stack: the compile-time option differs between clang and GCC,
+# the linker option is the same for both
+if( ANDROID_NOEXECSTACK )
+ if( ANDROID_COMPILER_IS_CLANG )
+ set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -Xclang -mnoexecstack" )
+ else()
+ set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -Wa,--noexecstack" )
+ endif()
+ set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,-z,noexecstack" )
+endif()
+
+# RELRO together with immediate binding (-z now)
+if( ANDROID_RELRO )
+ set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,-z,relro -Wl,-z,now" )
+endif()
+
+# clang-only options are prepended: the target triple, suppression of
+# unused-argument warnings and, for NDK builds, the GCC toolchain location
+if( ANDROID_COMPILER_IS_CLANG )
+ set( ANDROID_CXX_FLAGS "-target ${ANDROID_LLVM_TRIPLE} -Qunused-arguments ${ANDROID_CXX_FLAGS}" )
+ if( BUILD_WITH_ANDROID_NDK )
+ set( ANDROID_CXX_FLAGS "-gcc-toolchain ${ANDROID_TOOLCHAIN_ROOT} ${ANDROID_CXX_FLAGS}" )
+ endif()
+endif()
+
+# cache flags
+# These set() calls carry no FORCE, so they only provide defaults: a value
+# already present in the cache (for example given with -D) is kept.
+set( CMAKE_CXX_FLAGS "" CACHE STRING "c++ flags" )
+set( CMAKE_C_FLAGS "" CACHE STRING "c flags" )
+set( CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "c++ Release flags" )
+set( CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "c Release flags" )
+set( CMAKE_CXX_FLAGS_DEBUG "-O0 -g -DDEBUG -D_DEBUG" CACHE STRING "c++ Debug flags" )
+set( CMAKE_C_FLAGS_DEBUG "-O0 -g -DDEBUG -D_DEBUG" CACHE STRING "c Debug flags" )
+set( CMAKE_SHARED_LINKER_FLAGS "" CACHE STRING "shared linker flags" )
+set( CMAKE_MODULE_LINKER_FLAGS "" CACHE STRING "module linker flags" )
+set( CMAKE_EXE_LINKER_FLAGS "-Wl,-z,nocopyreloc" CACHE STRING "executable linker flags" )
+
+# put flags to cache (for debug purpose only)
+# INTERNAL entries are rewritten on every run and do not show up in the GUIs
+set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS}" CACHE INTERNAL "Android specific c/c++ flags" )
+set( ANDROID_CXX_FLAGS_RELEASE "${ANDROID_CXX_FLAGS_RELEASE}" CACHE INTERNAL "Android specific c/c++ Release flags" )
+set( ANDROID_CXX_FLAGS_DEBUG "${ANDROID_CXX_FLAGS_DEBUG}" CACHE INTERNAL "Android specific c/c++ Debug flags" )
+set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}" CACHE INTERNAL "Android specific c/c++ linker flags" )
+
+# finish flags
+# The Android-specific flags are put in front of the CMAKE_* values, and the
+# same ANDROID_CXX_FLAGS* strings are used for C and C++ alike. These are
+# plain (non-cache) assignments.
+set( CMAKE_CXX_FLAGS "${ANDROID_CXX_FLAGS} ${CMAKE_CXX_FLAGS}" )
+set( CMAKE_C_FLAGS "${ANDROID_CXX_FLAGS} ${CMAKE_C_FLAGS}" )
+set( CMAKE_CXX_FLAGS_RELEASE "${ANDROID_CXX_FLAGS_RELEASE} ${CMAKE_CXX_FLAGS_RELEASE}" )
+set( CMAKE_C_FLAGS_RELEASE "${ANDROID_CXX_FLAGS_RELEASE} ${CMAKE_C_FLAGS_RELEASE}" )
+set( CMAKE_CXX_FLAGS_DEBUG "${ANDROID_CXX_FLAGS_DEBUG} ${CMAKE_CXX_FLAGS_DEBUG}" )
+set( CMAKE_C_FLAGS_DEBUG "${ANDROID_CXX_FLAGS_DEBUG} ${CMAKE_C_FLAGS_DEBUG}" )
+set( CMAKE_SHARED_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}" )
+set( CMAKE_MODULE_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FLAGS}" )
+set( CMAKE_EXE_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}" )
+
+# MIPS on NDK r8 only: pass explicit linker scripts from the toolchain
+# directory (mipself.xsc for shared objects and modules, mipself.x for
+# executables)
+if( MIPS AND BUILD_WITH_ANDROID_NDK AND ANDROID_NDK_RELEASE STREQUAL "r8" )
+ set( CMAKE_SHARED_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_SHARED_LINKER_FLAGS}" )
+ set( CMAKE_MODULE_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_MODULE_LINKER_FLAGS}" )
+ set( CMAKE_EXE_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.x ${CMAKE_EXE_LINKER_FLAGS}" )
+endif()
+
+# pie/pic
+# Position-independent executables are enabled when all three hold: the API
+# level is not below 16, ANDROID_APP_PIE is unset or true, and CMake is newer
+# than 2.8.8. In every other case plain -fpic is prepended to the C and C++
+# flags instead.
+if( NOT (ANDROID_NATIVE_API_LEVEL LESS 16) AND (NOT DEFINED ANDROID_APP_PIE OR ANDROID_APP_PIE) AND (CMAKE_VERSION VERSION_GREATER 2.8.8) )
+ set( CMAKE_POSITION_INDEPENDENT_CODE TRUE )
+ set( CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fPIE -pie")
+else()
+ set( CMAKE_POSITION_INDEPENDENT_CODE FALSE )
+ set( CMAKE_CXX_FLAGS "-fpic ${CMAKE_CXX_FLAGS}" )
+ set( CMAKE_C_FLAGS "-fpic ${CMAKE_C_FLAGS}" )
+endif()
+
+# configure rtti
+# Applied only when ANDROID_STL_FORCE_FEATURES is on and ANDROID_RTTI is
+# defined; the chosen option is put in front of the existing C++ flags.
+if( ANDROID_STL_FORCE_FEATURES AND DEFINED ANDROID_RTTI )
+ if( ANDROID_RTTI )
+ set( __android_rtti_flag "-frtti" )
+ else()
+ set( __android_rtti_flag "-fno-rtti" )
+ endif()
+ set( CMAKE_CXX_FLAGS "${__android_rtti_flag} ${CMAKE_CXX_FLAGS}" )
+ unset( __android_rtti_flag )
+endif()
+
+# configure exceptions
+# Same gating as for rtti; here the option goes in front of both the C and
+# the C++ flags.
+if( ANDROID_STL_FORCE_FEATURES AND DEFINED ANDROID_EXCEPTIONS )
+ if( ANDROID_EXCEPTIONS )
+ set( __android_exceptions_flag "-fexceptions" )
+ else()
+ set( __android_exceptions_flag "-fno-exceptions" )
+ endif()
+ set( CMAKE_C_FLAGS "${__android_exceptions_flag} ${CMAKE_C_FLAGS}" )
+ set( CMAKE_CXX_FLAGS "${__android_exceptions_flag} ${CMAKE_CXX_FLAGS}" )
+ unset( __android_exceptions_flag )
+endif()
+
+# global includes and link directories
+# The sysroot headers and the STL headers are added as SYSTEM include
+# directories; the per-ABI libs directory under the install prefix is added to
+# the link search path.
+include_directories( SYSTEM "${ANDROID_SYSROOT}/usr/include" ${ANDROID_STL_INCLUDE_DIRS} )
+get_filename_component(__android_install_path "${CMAKE_INSTALL_PREFIX}/libs/${ANDROID_NDK_ABI_NAME}" ABSOLUTE) # avoid CMP0015 policy warning
+link_directories( "${__android_install_path}" )
+
+# detect if need link crtbegin_so.o explicitly
+# The shared-library link rule is expanded by hand into a real command line
+# that links nothing but crtbegin_so.o into a throw-away .so. When that
+# command succeeds, crtbegin_so.o is appended to the shared library and module
+# link rules below.
+if( NOT DEFINED ANDROID_EXPLICIT_CRT_LINK )
+ set( __cmd "${CMAKE_CXX_CREATE_SHARED_LIBRARY}" )
+ # substitute every CMake rule placeholder of the template with a concrete value
+ string( REPLACE "<CMAKE_CXX_COMPILER>" "${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1}" __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_C_COMPILER>" "${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_ARG1}" __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_SHARED_LIBRARY_CXX_FLAGS>" "${CMAKE_CXX_FLAGS}" __cmd "${__cmd}" )
+ string( REPLACE "<LANGUAGE_COMPILE_FLAGS>" "" __cmd "${__cmd}" )
+ string( REPLACE "<LINK_FLAGS>" "${CMAKE_SHARED_LINKER_FLAGS}" __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS>" "-shared" __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG>" "" __cmd "${__cmd}" )
+ string( REPLACE "<TARGET_SONAME>" "" __cmd "${__cmd}" )
+ string( REPLACE "<TARGET>" "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/toolchain_crtlink_test.so" __cmd "${__cmd}" )
+ string( REPLACE "<OBJECTS>" "\"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" __cmd "${__cmd}" )
+ string( REPLACE "<LINK_LIBRARIES>" "" __cmd "${__cmd}" )
+ # split the command string into an argument list ...
+ separate_arguments( __cmd )
+ # ... and put back together any NDK / toolchain path that the split broke
+ # apart (a path containing spaces)
+ foreach( __var ANDROID_NDK ANDROID_NDK_TOOLCHAINS_PATH ANDROID_STANDALONE_TOOLCHAIN )
+ if( ${__var} )
+ set( __tmp "${${__var}}" )
+ separate_arguments( __tmp )
+ string( REPLACE "${__tmp}" "${${__var}}" __cmd "${__cmd}")
+ endif()
+ endforeach()
+ # quotes were only needed inside the rule string; remove them before running
+ string( REPLACE "'" "" __cmd "${__cmd}" )
+ string( REPLACE "\"" "" __cmd "${__cmd}" )
+ execute_process( COMMAND ${__cmd} RESULT_VARIABLE __cmd_result OUTPUT_QUIET ERROR_QUIET )
+ if( __cmd_result EQUAL 0 )
+ set( ANDROID_EXPLICIT_CRT_LINK ON )
+ else()
+ set( ANDROID_EXPLICIT_CRT_LINK OFF )
+ endif()
+endif()
+
+if( ANDROID_EXPLICIT_CRT_LINK )
+ set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
+ set( CMAKE_CXX_CREATE_SHARED_MODULE "${CMAKE_CXX_CREATE_SHARED_MODULE} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
+endif()
+
+# setup output directories
+# default install prefix (no FORCE: a value already in the cache is kept)
+set( CMAKE_INSTALL_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/user" CACHE STRING "path for installing" )
+
+# The Android-style output layout is used when LIBRARY_OUTPUT_PATH_ROOT is
+# already defined, or when the source directory looks like an Android project
+# (AndroidManifest.xml in the source dir, or in its parent next to a jni/ dir).
+if( DEFINED LIBRARY_OUTPUT_PATH_ROOT
+ OR EXISTS "${CMAKE_SOURCE_DIR}/AndroidManifest.xml"
+ OR (EXISTS "${CMAKE_SOURCE_DIR}/../AndroidManifest.xml" AND EXISTS "${CMAKE_SOURCE_DIR}/../jni/") )
+ set( LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_SOURCE_DIR} CACHE PATH "Root for binaries output, set this to change where Android libs are installed to" )
+ if( NOT _CMAKE_IN_TRY_COMPILE )
+ # executables go to bin/<abi> when the project has jni/CMakeLists.txt, to
+ # plain bin otherwise; libraries always go to libs/<abi>
+ if( EXISTS "${CMAKE_SOURCE_DIR}/jni/CMakeLists.txt" )
+ set( EXECUTABLE_OUTPUT_PATH "${LIBRARY_OUTPUT_PATH_ROOT}/bin/${ANDROID_NDK_ABI_NAME}" CACHE PATH "Output directory for applications" )
+ else()
+ set( EXECUTABLE_OUTPUT_PATH "${LIBRARY_OUTPUT_PATH_ROOT}/bin" CACHE PATH "Output directory for applications" )
+ endif()
+ set( LIBRARY_OUTPUT_PATH "${LIBRARY_OUTPUT_PATH_ROOT}/libs/${ANDROID_NDK_ABI_NAME}" CACHE PATH "Output directory for Android libs" )
+ endif()
+endif()
+
+# copy shared stl library to build directory
+# Done at configure time, outside try_compile runs, and only when the STL
+# library is a .so and LIBRARY_OUTPUT_PATH is defined. A failed copy is
+# reported with SEND_ERROR, which lets the configure step continue and fails
+# generation.
+if( NOT _CMAKE_IN_TRY_COMPILE AND __libstl MATCHES "[.]so$" AND DEFINED LIBRARY_OUTPUT_PATH )
+ get_filename_component( __libstlname "${__libstl}" NAME )
+ execute_process( COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${__libstl}" "${LIBRARY_OUTPUT_PATH}/${__libstlname}" RESULT_VARIABLE __fileCopyProcess )
+ if( NOT __fileCopyProcess EQUAL 0 OR NOT EXISTS "${LIBRARY_OUTPUT_PATH}/${__libstlname}")
+ message( SEND_ERROR "Failed copying of ${__libstl} to the ${LIBRARY_OUTPUT_PATH}/${__libstlname}" )
+ endif()
+ unset( __fileCopyProcess )
+ unset( __libstlname )
+endif()
+
+
+# set these global flags for cmake client scripts to change behavior
+set( ANDROID True )
+set( BUILD_ANDROID True )
+
+# where is the target environment
+# (toolchain binaries, toolchain machine directory, sysroot, install prefix)
+set( CMAKE_FIND_ROOT_PATH "${ANDROID_TOOLCHAIN_ROOT}/bin" "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}" "${ANDROID_SYSROOT}" "${CMAKE_INSTALL_PREFIX}" "${CMAKE_INSTALL_PREFIX}/share" )
+
+# only search for libraries and includes in the ndk toolchain
+# (programs are restricted to the find root path in the same way)
+set( CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY )
+set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY )
+set( CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY )
+
+
+# find_host_package( <find_package arguments>... )
+# Runs find_package() against the host system instead of the Android target:
+# the find-root restrictions are lifted and WIN32 / APPLE / UNIX are switched
+# to describe the host for the duration of the call. Afterwards the target
+# view is restored (UNIX set, WIN32 and APPLE unset, root path modes ONLY).
+macro( find_host_package )
+ # look outside CMAKE_FIND_ROOT_PATH
+ set( CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER )
+ set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER )
+ set( CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER )
+ # pretend to be the host platform
+ if( CMAKE_HOST_WIN32 )
+ unset( UNIX )
+ set( WIN32 1 )
+ elseif( CMAKE_HOST_APPLE )
+ unset( UNIX )
+ set( APPLE 1 )
+ endif()
+ find_package( ${ARGN} )
+ # back to the target platform
+ unset( WIN32 )
+ unset( APPLE )
+ set( UNIX 1 )
+ set( CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY )
+ set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY )
+ set( CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY )
+endmacro()
+
+
+# find_host_program( <find_program arguments>... )
+# Runs find_program() against the host system instead of the Android target:
+# the find-root restrictions are lifted and WIN32 / APPLE / UNIX are switched
+# to describe the host for the duration of the call. Afterwards the target
+# view is restored (UNIX set, WIN32 and APPLE unset, root path modes ONLY).
+macro( find_host_program )
+ # look outside CMAKE_FIND_ROOT_PATH
+ set( CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER )
+ set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER )
+ set( CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER )
+ # pretend to be the host platform
+ if( CMAKE_HOST_WIN32 )
+ unset( UNIX )
+ set( WIN32 1 )
+ elseif( CMAKE_HOST_APPLE )
+ unset( UNIX )
+ set( APPLE 1 )
+ endif()
+ find_program( ${ARGN} )
+ # back to the target platform
+ unset( WIN32 )
+ unset( APPLE )
+ set( UNIX 1 )
+ set( CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY )
+ set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY )
+ set( CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY )
+endmacro()
+
+
+# export toolchain settings for the try_compile() command
+# Every listed variable that is defined is written as a
+# "set( <name> <value> CACHE INTERNAL "" )" line into
+# android.toolchain.config.cmake in the CMake files directory. Values that
+# contain a space are wrapped in double quotes.
+if( NOT _CMAKE_IN_TRY_COMPILE )
+ set( __toolchain_config "")
+ foreach( __var NDK_CCACHE LIBRARY_OUTPUT_PATH_ROOT ANDROID_FORBID_SYGWIN
+ ANDROID_NDK_HOST_X64
+ ANDROID_NDK
+ ANDROID_NDK_LAYOUT
+ ANDROID_STANDALONE_TOOLCHAIN
+ ANDROID_TOOLCHAIN_NAME
+ ANDROID_ABI
+ ANDROID_NATIVE_API_LEVEL
+ ANDROID_STL
+ ANDROID_STL_FORCE_FEATURES
+ ANDROID_FORCE_ARM_BUILD
+ ANDROID_NO_UNDEFINED
+ ANDROID_SO_UNDEFINED
+ ANDROID_FUNCTION_LEVEL_LINKING
+ ANDROID_GOLD_LINKER
+ ANDROID_NOEXECSTACK
+ ANDROID_RELRO
+ ANDROID_LIBM_PATH
+ ANDROID_EXPLICIT_CRT_LINK
+ ANDROID_APP_PIE
+ )
+ if( DEFINED ${__var} )
+ # ${__var} expands to the variable's name, which if() then dereferences,
+ # so the MATCHES test runs on the variable's value
+ if( ${__var} MATCHES " ")
+ set( __toolchain_config "${__toolchain_config}set( ${__var} \"${${__var}}\" CACHE INTERNAL \"\" )\n" )
+ else()
+ set( __toolchain_config "${__toolchain_config}set( ${__var} ${${__var}} CACHE INTERNAL \"\" )\n" )
+ endif()
+ endif()
+ endforeach()
+ file( WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/android.toolchain.config.cmake" "${__toolchain_config}" )
+ unset( __toolchain_config )
+endif()
+
+
+# force cmake to produce / instead of \ in build commands for Ninja generator
+# Applies only to the Ninja generator on a Windows host.
+if( CMAKE_GENERATOR MATCHES "Ninja" AND CMAKE_HOST_WIN32 )
+ # it is a bad hack after all
+ # CMake generates Ninja makefiles with UNIX paths only if it thinks that we are going to build with MinGW
+ set( CMAKE_COMPILER_IS_MINGW TRUE ) # tell CMake that we are MinGW
+ set( CMAKE_CROSSCOMPILING TRUE ) # stop recursion
+ # the languages are enabled while the MinGW flag is set
+ enable_language( C )
+ enable_language( CXX )
+ # unset( CMAKE_COMPILER_IS_MINGW ) # can't unset because CMake does not convert back-slashes in response files without it
+ unset( MINGW )
+endif()
+
+
+# Variables controlling behavior or set by cmake toolchain:
+# ANDROID_ABI : "armeabi-v7a" (default), "armeabi", "armeabi-v7a with NEON", "armeabi-v7a-hard with NEON", "armeabi-v7a with VFPV3", "armeabi-v6 with VFP", "x86", "mips", "arm64-v8a", "x86_64", "mips64"
+# ANDROID_NATIVE_API_LEVEL : 3,4,5,8,9,14,15,16,17,18,19,21 (depends on NDK version)
+# ANDROID_STL : gnustl_static/gnustl_shared/stlport_static/stlport_shared/gabi++_static/gabi++_shared/system_re/system/none
+# ANDROID_FORBID_SYGWIN : ON/OFF
+# ANDROID_NO_UNDEFINED : ON/OFF
+# ANDROID_SO_UNDEFINED : OFF/ON (default depends on NDK version)
+# ANDROID_FUNCTION_LEVEL_LINKING : ON/OFF
+# ANDROID_GOLD_LINKER : ON/OFF
+# ANDROID_NOEXECSTACK : ON/OFF
+# ANDROID_RELRO : ON/OFF
+# ANDROID_FORCE_ARM_BUILD : ON/OFF
+# ANDROID_STL_FORCE_FEATURES : ON/OFF
+# ANDROID_LIBM_PATH : path to libm.so (set to something like $(TOP)/out/target/product/<product_name>/obj/lib/libm.so) to work around unresolved `sincos`
+# Can be set only at the first run:
+# ANDROID_NDK : path to your NDK install
+# NDK_CCACHE : path to your ccache executable
+# ANDROID_TOOLCHAIN_NAME : the NDK name of compiler toolchain
+# ANDROID_NDK_HOST_X64 : try to use x86_64 toolchain (default for x64 host systems)
+# ANDROID_NDK_LAYOUT : the inner NDK structure (RELEASE, LINARO, ANDROID)
+# LIBRARY_OUTPUT_PATH_ROOT : root directory for build outputs (libs/<ANDROID_NDK_ABI_NAME> and bin are placed under it)
+# ANDROID_STANDALONE_TOOLCHAIN
+#
+# Primary read-only variables:
+# ANDROID : always TRUE
+# ARMEABI : TRUE for arm v6 and older devices
+# ARMEABI_V6 : TRUE for arm v6
+# ARMEABI_V7A : TRUE for arm v7a
+# ARMEABI_V7A_HARD : TRUE for arm v7a with hardfp
+# ARM64_V8A : TRUE for arm64-v8a
+# NEON : TRUE if NEON unit is enabled
+# VFPV3 : TRUE if VFP version 3 is enabled
+# X86 : TRUE if configured for x86
+# X86_64 : TRUE if configured for x86_64
+# MIPS : TRUE if configured for mips
+# MIPS64 : TRUE if configured for mips64
+# BUILD_WITH_ANDROID_NDK : TRUE if NDK is used
+# BUILD_WITH_STANDALONE_TOOLCHAIN : TRUE if standalone toolchain is used
+# ANDROID_NDK_HOST_SYSTEM_NAME : "windows", "linux-x86" or "darwin-x86" depending on host platform
+# ANDROID_NDK_ABI_NAME : "armeabi", "armeabi-v7a", "armeabi-v7a-hard", "x86", "mips", "arm64-v8a", "x86_64", "mips64" depending on ANDROID_ABI
+# ANDROID_NDK_RELEASE : from r5 to r10d; set only for NDK
+# ANDROID_NDK_RELEASE_NUM : numeric ANDROID_NDK_RELEASE version (1000*major+minor)
+# ANDROID_ARCH_NAME : "arm", "x86", "mips", "arm64", "x86_64", "mips64" depending on ANDROID_ABI
+# ANDROID_SYSROOT : path to the compiler sysroot
+# TOOL_OS_SUFFIX : "" or ".exe" depending on host platform
+# ANDROID_COMPILER_IS_CLANG : TRUE if clang compiler is used
+#
+# Secondary (less stable) read-only variables:
+# ANDROID_COMPILER_VERSION : GCC version used (not Clang version)
+# ANDROID_CLANG_VERSION : version of clang compiler if clang is used
+# ANDROID_CXX_FLAGS : C/C++ compiler flags required by Android platform
+# ANDROID_SUPPORTED_ABIS : list of currently allowed values for ANDROID_ABI
+# ANDROID_TOOLCHAIN_MACHINE_NAME : "arm-linux-androideabi", "arm-eabi" or "i686-android-linux"
+# ANDROID_TOOLCHAIN_ROOT : path to the top level of toolchain (standalone or placed inside NDK)
+# ANDROID_CLANG_TOOLCHAIN_ROOT : path to clang tools
+# ANDROID_SUPPORTED_NATIVE_API_LEVELS : list of native API levels found inside NDK
+# ANDROID_STL_INCLUDE_DIRS : stl include paths
+# ANDROID_RTTI : if rtti is enabled by the runtime
+# ANDROID_EXCEPTIONS : if exceptions are enabled by the runtime
+# ANDROID_GCC_TOOLCHAIN_NAME : read-only, differs from ANDROID_TOOLCHAIN_NAME only if clang is used
+#
+# Defaults:
+# ANDROID_DEFAULT_NDK_API_LEVEL
+# ANDROID_DEFAULT_NDK_API_LEVEL_${ARCH}
+# ANDROID_NDK_SEARCH_PATHS
+# ANDROID_SUPPORTED_ABIS_${ARCH}
+# ANDROID_SUPPORTED_NDK_VERSIONS
diff --git a/build.sh b/build.sh
new file mode 100644
index 00000000000..ac4bfa7a81b
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/bash
+
+##### android armv7
+mkdir -p build-android-armv7
+pushd build-android-armv7
+cmake -DCMAKE_TOOLCHAIN_FILE=../android.toolchain.cmake -DANDROID_ABI="armeabi-v7a with NEON" -DANDROID_NATIVE_API_LEVEL=android-9 -DANDROID_FORCE_ARM_BUILD=OFF -DANDROID_STL_FORCE_FEATURES=OFF ..
+make
+make install
+popd
+
+##### android aarch64
+mkdir -p build-android-aarch64
+pushd build-android-aarch64
+cmake -DCMAKE_TOOLCHAIN_FILE=../android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_NATIVE_API_LEVEL=android-21 -DANDROID_FORCE_ARM_BUILD=OFF -DANDROID_STL_FORCE_FEATURES=OFF ..
+make
+make install
+popd
+
+##### ios armv7 arm64
+mkdir -p build-ios
+pushd build-ios
+cmake -DCMAKE_TOOLCHAIN_FILE=../iosxc.toolchain.cmake ..
+make
+make install
+popd
+
+##### ios simulator i386 x86_64
+mkdir -p build-ios-sim
+pushd build-ios-sim
+cmake -DCMAKE_TOOLCHAIN_FILE=../iossimxc.toolchain.cmake ..
+make
+make install
+popd
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
new file mode 100644
index 00000000000..2373832bd09
--- /dev/null
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,9 @@
+
+# The example needs OpenCV's core, highgui and imgproc modules.
+find_package(OpenCV REQUIRED COMPONENTS core highgui imgproc)
+
+# ncnn headers: the source-tree ones plus whatever the build puts into the
+# matching directory of the build tree.
+include_directories(
+    ${CMAKE_CURRENT_SOURCE_DIR}/../src
+    ${CMAKE_CURRENT_BINARY_DIR}/../src
+)
+
+# squeezenet example executable, linked against ncnn and OpenCV
+add_executable(squeezenet squeezenet.cpp)
+target_link_libraries(squeezenet ncnn ${OpenCV_LIBS})
diff --git a/examples/squeezencnn/AndroidManifest.xml b/examples/squeezencnn/AndroidManifest.xml
new file mode 100644
index 00000000000..5624e012f60
--- /dev/null
+++ b/examples/squeezencnn/AndroidManifest.xml
@@ -0,0 +1,15 @@
+
+
+
+
+
+
+
+
+
+
+
diff --git a/examples/squeezencnn/ant.properties b/examples/squeezencnn/ant.properties
new file mode 100644
index 00000000000..9281e74f1c2
--- /dev/null
+++ b/examples/squeezencnn/ant.properties
@@ -0,0 +1,21 @@
+# This file is used to override default values used by the Ant build system.
+#
+# This file must be checked into Version Control Systems, as it is
+# integral to the build system of your project.
+
+# This file is only used by the Ant script.
+
+# You can use this to override default values such as
+# 'source.dir' for the location of your java source folder and
+# 'out.dir' for the location of your output folder.
+
+# You can also use it define how the release builds are signed by declaring
+# the following properties:
+# 'key.store' for the location of your keystore and
+# 'key.alias' for the name of the key to use.
+# The password will be asked during the build when you use the 'release' target.
+
+# NOTE(review): the values below are specific to one developer machine, and the
+# keystore and key passwords are stored in plain text in a file that is meant
+# to be committed. Consider removing the two password properties (the note
+# above says the password is then asked for during the build) and pointing
+# key.store at your own keystore.
+key.store=/home/nihui/osd/nihuini-release-key.keystore
+key.alias=nihuini
+key.store.password=nihuini
+key.alias.password=nihuini
diff --git a/examples/squeezencnn/assets/squeezenet_v1.1.bin b/examples/squeezencnn/assets/squeezenet_v1.1.bin
new file mode 120000
index 00000000000..655c56c35be
--- /dev/null
+++ b/examples/squeezencnn/assets/squeezenet_v1.1.bin
@@ -0,0 +1 @@
+../../squeezenet_v1.1.bin
\ No newline at end of file
diff --git a/examples/squeezencnn/assets/squeezenet_v1.1.param.bin b/examples/squeezencnn/assets/squeezenet_v1.1.param.bin
new file mode 100644
index 00000000000..c419dc9e1bd
Binary files /dev/null and b/examples/squeezencnn/assets/squeezenet_v1.1.param.bin differ
diff --git a/examples/squeezencnn/assets/synset_words.txt b/examples/squeezencnn/assets/synset_words.txt
new file mode 120000
index 00000000000..f84db6c2fce
--- /dev/null
+++ b/examples/squeezencnn/assets/synset_words.txt
@@ -0,0 +1 @@
+../../synset_words.txt
\ No newline at end of file
diff --git a/examples/squeezencnn/build.xml b/examples/squeezencnn/build.xml
new file mode 100644
index 00000000000..47b725e7431
--- /dev/null
+++ b/examples/squeezencnn/build.xml
@@ -0,0 +1,92 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/examples/squeezencnn/jni/Android.mk b/examples/squeezencnn/jni/Android.mk
new file mode 100644
index 00000000000..11f00a3b9a1
--- /dev/null
+++ b/examples/squeezencnn/jni/Android.mk
@@ -0,0 +1,30 @@
+LOCAL_PATH := $(call my-dir)
+
+# Install prefix of a prebuilt ncnn; lib/libncnn.a and include/ are taken from below it.
+# The default is the original author's build tree. It is assigned with ?= so a value that
+# is already defined (environment, command line, an earlier makefile) wins, e.g.
+#   ndk-build NCNN_INSTALL_PATH=/path/to/ncnn/install
+NCNN_INSTALL_PATH ?= /home/nihui/dev/qqfacecnn/ncnn/build-android-armv7/install
+
+# Module "ncnn": the prebuilt static library, imported as-is.
+include $(CLEAR_VARS)
+LOCAL_MODULE := ncnn
+LOCAL_SRC_FILES := $(NCNN_INSTALL_PATH)/lib/libncnn.a
+include $(PREBUILT_STATIC_LIBRARY)
+
+# Module "squeezencnn": the JNI wrapper, built as a shared library that links ncnn statically.
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := squeezencnn
+LOCAL_SRC_FILES := squeezencnn_jni.cpp
+
+LOCAL_C_INCLUDES := $(NCNN_INSTALL_PATH)/include
+
+LOCAL_STATIC_LIBRARIES := ncnn
+
+# -ffunction-sections/-fdata-sections put every function and object in its own section so
+# that the linker's --gc-sections can discard the unreferenced ones.
+LOCAL_CFLAGS := -O2 -fvisibility=hidden -fomit-frame-pointer -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math
+LOCAL_CPPFLAGS := -O2 -fvisibility=hidden -fvisibility-inlines-hidden -fomit-frame-pointer -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math
+LOCAL_LDFLAGS += -Wl,--gc-sections
+
+# OpenMP is enabled for C, C++ and the link step alike.
+LOCAL_CFLAGS += -fopenmp
+LOCAL_CPPFLAGS += -fopenmp
+LOCAL_LDFLAGS += -fopenmp
+
+# -llog serves __android_log_print and -ljnigraphics the AndroidBitmap_* calls made in
+# squeezencnn_jni.cpp. NOTE(review): nothing in that file visibly needs -lz; confirm ncnn does.
+LOCAL_LDLIBS := -lz -llog -ljnigraphics
+
+include $(BUILD_SHARED_LIBRARY)
diff --git a/examples/squeezencnn/jni/Application.mk b/examples/squeezencnn/jni/Application.mk
new file mode 100644
index 00000000000..a98c0484adc
--- /dev/null
+++ b/examples/squeezencnn/jni/Application.mk
@@ -0,0 +1,7 @@
+# ndk-build application-wide settings for the squeezencnn example.
+
+# C++ runtime used by the JNI code; the commented line keeps the alternative for reference.
+# APP_STL := stlport_static
+APP_STL := gnustl_static
+# Only armeabi-v7a is built; the commented line would add plain armeabi as well.
+# APP_ABI := armeabi armeabi-v7a
+APP_ABI := armeabi-v7a
+# Same API level as "target" in project.properties.
+APP_PLATFORM := android-9
+NDK_TOOLCHAIN_VERSION := 4.9
diff --git a/examples/squeezencnn/jni/squeezencnn_jni.cpp b/examples/squeezencnn/jni/squeezencnn_jni.cpp
new file mode 100644
index 00000000000..036f1435845
--- /dev/null
+++ b/examples/squeezencnn/jni/squeezencnn_jni.cpp
@@ -0,0 +1,181 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include
+#include
+
+#include
+
+#include
+#include
+
+// ncnn
+#include "net.h"
+
+#include "squeezenet_v1.1.id.h"
+
+#include
+#include
+
+// Timestamps and result of the most recent bench_start()/bench_end() pair.
+static struct timeval tv_begin;
+static struct timeval tv_end;
+static double elapsed; // milliseconds
+
+// Remember the current wall-clock time as the start of the measured interval.
+static void bench_start()
+{
+    gettimeofday(&tv_begin, NULL);
+}
+
+// Write to logcat (debug level, tag "SqueezeNcnn") the milliseconds elapsed since the
+// last bench_start(), followed by comment.
+static void bench_end(const char* comment)
+{
+    gettimeofday(&tv_end, NULL);
+    // Do the arithmetic in double: float literals would round the microsecond count to a
+    // 24-bit mantissa before the result reaches the double it is stored in.
+    elapsed = ((tv_end.tv_sec - tv_begin.tv_sec) * 1000000.0 + (tv_end.tv_usec - tv_begin.tv_usec)) / 1000.0;
+    __android_log_print(ANDROID_LOG_DEBUG, "SqueezeNcnn", "%.2fms %s", elapsed, comment);
+}
+
+static std::vector squeezenet_param;
+static std::vector squeezenet_bin;
+static std::vector squeezenet_words;
+static ncnn::Net squeezenet;
+
+// Split str at every occurrence of delimiter and return the pieces in order, without the
+// delimiters. The result always has at least one element: when the delimiter does not
+// occur (or is empty) it is str itself. Adjacent delimiters and a trailing delimiter
+// produce empty strings.
+static std::vector<std::string> split_string(const std::string& str, const std::string& delimiter)
+{
+    std::vector<std::string> strings;
+
+    // An empty needle matches at every offset, which would step past the end of str;
+    // treat it as "nothing to split on".
+    if (delimiter.empty())
+    {
+        strings.push_back(str);
+        return strings;
+    }
+
+    std::string::size_type prev = 0;
+    std::string::size_type pos = str.find(delimiter, prev);
+    while (pos != std::string::npos)
+    {
+        strings.push_back(str.substr(prev, pos - prev));
+        // resume after the whole delimiter, not just after its first character
+        prev = pos + delimiter.size();
+        pos = str.find(delimiter, prev);
+    }
+
+    // the last piece (or the only one, if the delimiter was never found)
+    strings.push_back(str.substr(prev));
+
+    return strings;
+}
+
+extern "C" {
+
+// public native boolean Init(byte[] param, byte[] bin, byte[] words);
+// Copies the three Java byte arrays into native memory, loads the network structure and
+// the weights from those copies, and splits the label text into one entry per line.
+// Always returns JNI_TRUE: the load_param()/load_model() results are only written to
+// logcat next to the array length.
+// The param and bin copies live in file-level statics, so they outlive this call.
+// NOTE(review): presumably ncnn::Net keeps referencing that memory — confirm before
+// turning them into locals.
+JNIEXPORT jboolean JNICALL Java_com_tencent_squeezencnn_SqueezeNcnn_Init(JNIEnv* env, jobject thiz, jbyteArray param, jbyteArray bin, jbyteArray words)
+{
+    // init param
+    {
+        int len = env->GetArrayLength(param);
+        squeezenet_param.resize(len);
+        env->GetByteArrayRegion(param, 0, len, (jbyte*)squeezenet_param.data());
+        int ret = squeezenet.load_param(squeezenet_param.data());
+        __android_log_print(ANDROID_LOG_DEBUG, "SqueezeNcnn", "load_param %d %d", ret, len);
+    }
+
+    // init bin
+    {
+        int len = env->GetArrayLength(bin);
+        squeezenet_bin.resize(len);
+        env->GetByteArrayRegion(bin, 0, len, (jbyte*)squeezenet_bin.data());
+        int ret = squeezenet.load_model(squeezenet_bin.data());
+        __android_log_print(ANDROID_LOG_DEBUG, "SqueezeNcnn", "load_model %d %d", ret, len);
+    }
+
+    // init words
+    {
+        int len = env->GetArrayLength(words);
+        std::string words_buffer;
+        words_buffer.resize(len);
+        // std::string::data() is read-only before C++17, so fill the buffer through
+        // operator[] instead of casting the const away; an empty array has nothing to copy.
+        if (len > 0)
+            env->GetByteArrayRegion(words, 0, len, (jbyte*)&words_buffer[0]);
+        squeezenet_words = split_string(words_buffer, "\n");
+    }
+
+    return JNI_TRUE;
+}
+
+// public native String Detect(Bitmap bitmap);
+// Runs the bitmap through the squeezenet network and returns a Java string of the form
+// "<winning synset line without its first 10 characters> = <score with 3 decimals>".
+// Returns NULL when the bitmap is not 227x227 or its format is not RGBA_8888.
+// The time between bench_start() and bench_end() is logged with the comment "detect".
+// NOTE(review): the results of AndroidBitmap_getInfo() and AndroidBitmap_lockPixels() are
+// not checked before info and indata are read — confirm these calls cannot fail here.
+JNIEXPORT jstring JNICALL Java_com_tencent_squeezencnn_SqueezeNcnn_Detect(JNIEnv* env, jobject thiz, jobject bitmap)
+{
+ bench_start();
+
+ // ncnn from bitmap
+ ncnn::Mat in;
+ {
+ AndroidBitmapInfo info;
+ AndroidBitmap_getInfo(env, bitmap, &info);
+ int width = info.width;
+ int height = info.height;
+ // only the exact input handled below is accepted: 227x227, RGBA_8888
+ if (width != 227 || height != 227)
+ return NULL;
+ if (info.format != ANDROID_BITMAP_FORMAT_RGBA_8888)
+ return NULL;
+
+ void* indata;
+ AndroidBitmap_lockPixels(env, bitmap, &indata);
+
+ // build the ncnn::Mat from the locked pixels (PIXEL_RGBA2BGR) before unlocking them
+ in = ncnn::Mat::from_pixels((const unsigned char*)indata, ncnn::Mat::PIXEL_RGBA2BGR, width, height);
+
+ AndroidBitmap_unlockPixels(env, bitmap);
+ }
+
+ // squeezenet
+ std::vector cls_scores;
+ {
+ // per-channel mean values; the second argument 0 presumably means "no norm values" —
+ // confirm against ncnn::Mat::substract_mean_normalize
+ const float mean_vals[3] = {104.f, 117.f, 123.f};
+ in.substract_mean_normalize(mean_vals, 0);
+
+ ncnn::Extractor ex = squeezenet.create_extractor();
+ ex.set_light_mode(true);
+ ex.set_num_threads(4);
+
+ // blobs are addressed by the integer ids declared in squeezenet_v1.1.id.h
+ ex.input(squeezenet_v1_1_param_id::BLOB_data, in);
+
+ ncnn::Mat out;
+ ex.extract(squeezenet_v1_1_param_id::BLOB_prob, out);
+
+ cls_scores.resize(out.c);
+ for (int j=0; j max_score)
+ {
+ top_class = i;
+ max_score = s;
+ }
+ }
+
+ // NOTE(review): top_class indexes squeezenet_words without a bounds check, and the +10
+ // below assumes the entry is at least 10 characters long; both hold only if Init() has
+ // loaded a label file of the expected form.
+ const std::string& word = squeezenet_words[top_class];
+ char tmp[32];
+ sprintf(tmp, "%.3f", max_score);
+ std::string result_str = std::string(word.c_str() + 10) + " = " + tmp;
+
+ // +10 to skip leading n03179701
+ jstring result = env->NewStringUTF(result_str.c_str());
+
+ bench_end("detect");
+
+ return result;
+}
+
+}
diff --git a/examples/squeezencnn/jni/squeezenet_v1.1.id.h b/examples/squeezencnn/jni/squeezenet_v1.1.id.h
new file mode 100644
index 00000000000..94ae7f5c5b3
--- /dev/null
+++ b/examples/squeezencnn/jni/squeezenet_v1.1.id.h
@@ -0,0 +1,163 @@
+#ifndef NCNN_INCLUDE_GUARD_squeezenet_v1_1_id_h
+#define NCNN_INCLUDE_GUARD_squeezenet_v1_1_id_h
+namespace squeezenet_v1_1_param_id {
+const int LAYER_data = 0;
+const int BLOB_data = 0;
+const int LAYER_conv1 = 1;
+const int BLOB_conv1 = 1;
+const int LAYER_relu_conv1 = 2;
+const int BLOB_conv1_relu_conv1 = 2;
+const int LAYER_pool1 = 3;
+const int BLOB_pool1 = 3;
+const int LAYER_fire2_squeeze1x1 = 4;
+const int BLOB_fire2_squeeze1x1 = 4;
+const int LAYER_fire2_relu_squeeze1x1 = 5;
+const int BLOB_fire2_squeeze1x1_fire2_relu_squeeze1x1 = 5;
+const int LAYER_splitncnn_0 = 6;
+const int BLOB_fire2_squeeze1x1_fire2_relu_squeeze1x1_splitncnn_0 = 6;
+const int BLOB_fire2_squeeze1x1_fire2_relu_squeeze1x1_splitncnn_1 = 7;
+const int LAYER_fire2_expand1x1 = 7;
+const int BLOB_fire2_expand1x1 = 8;
+const int LAYER_fire2_relu_expand1x1 = 8;
+const int BLOB_fire2_expand1x1_fire2_relu_expand1x1 = 9;
+const int LAYER_fire2_expand3x3 = 9;
+const int BLOB_fire2_expand3x3 = 10;
+const int LAYER_fire2_relu_expand3x3 = 10;
+const int BLOB_fire2_expand3x3_fire2_relu_expand3x3 = 11;
+const int LAYER_fire2_concat = 11;
+const int BLOB_fire2_concat = 12;
+const int LAYER_fire3_squeeze1x1 = 12;
+const int BLOB_fire3_squeeze1x1 = 13;
+const int LAYER_fire3_relu_squeeze1x1 = 13;
+const int BLOB_fire3_squeeze1x1_fire3_relu_squeeze1x1 = 14;
+const int LAYER_splitncnn_1 = 14;
+const int BLOB_fire3_squeeze1x1_fire3_relu_squeeze1x1_splitncnn_0 = 15;
+const int BLOB_fire3_squeeze1x1_fire3_relu_squeeze1x1_splitncnn_1 = 16;
+const int LAYER_fire3_expand1x1 = 15;
+const int BLOB_fire3_expand1x1 = 17;
+const int LAYER_fire3_relu_expand1x1 = 16;
+const int BLOB_fire3_expand1x1_fire3_relu_expand1x1 = 18;
+const int LAYER_fire3_expand3x3 = 17;
+const int BLOB_fire3_expand3x3 = 19;
+const int LAYER_fire3_relu_expand3x3 = 18;
+const int BLOB_fire3_expand3x3_fire3_relu_expand3x3 = 20;
+const int LAYER_fire3_concat = 19;
+const int BLOB_fire3_concat = 21;
+const int LAYER_pool3 = 20;
+const int BLOB_pool3 = 22;
+const int LAYER_fire4_squeeze1x1 = 21;
+const int BLOB_fire4_squeeze1x1 = 23;
+const int LAYER_fire4_relu_squeeze1x1 = 22;
+const int BLOB_fire4_squeeze1x1_fire4_relu_squeeze1x1 = 24;
+const int LAYER_splitncnn_2 = 23;
+const int BLOB_fire4_squeeze1x1_fire4_relu_squeeze1x1_splitncnn_0 = 25;
+const int BLOB_fire4_squeeze1x1_fire4_relu_squeeze1x1_splitncnn_1 = 26;
+const int LAYER_fire4_expand1x1 = 24;
+const int BLOB_fire4_expand1x1 = 27;
+const int LAYER_fire4_relu_expand1x1 = 25;
+const int BLOB_fire4_expand1x1_fire4_relu_expand1x1 = 28;
+const int LAYER_fire4_expand3x3 = 26;
+const int BLOB_fire4_expand3x3 = 29;
+const int LAYER_fire4_relu_expand3x3 = 27;
+const int BLOB_fire4_expand3x3_fire4_relu_expand3x3 = 30;
+const int LAYER_fire4_concat = 28;
+const int BLOB_fire4_concat = 31;
+const int LAYER_fire5_squeeze1x1 = 29;
+const int BLOB_fire5_squeeze1x1 = 32;
+const int LAYER_fire5_relu_squeeze1x1 = 30;
+const int BLOB_fire5_squeeze1x1_fire5_relu_squeeze1x1 = 33;
+const int LAYER_splitncnn_3 = 31;
+const int BLOB_fire5_squeeze1x1_fire5_relu_squeeze1x1_splitncnn_0 = 34;
+const int BLOB_fire5_squeeze1x1_fire5_relu_squeeze1x1_splitncnn_1 = 35;
+const int LAYER_fire5_expand1x1 = 32;
+const int BLOB_fire5_expand1x1 = 36;
+const int LAYER_fire5_relu_expand1x1 = 33;
+const int BLOB_fire5_expand1x1_fire5_relu_expand1x1 = 37;
+const int LAYER_fire5_expand3x3 = 34;
+const int BLOB_fire5_expand3x3 = 38;
+const int LAYER_fire5_relu_expand3x3 = 35;
+const int BLOB_fire5_expand3x3_fire5_relu_expand3x3 = 39;
+const int LAYER_fire5_concat = 36;
+const int BLOB_fire5_concat = 40;
+const int LAYER_pool5 = 37;
+const int BLOB_pool5 = 41;
+const int LAYER_fire6_squeeze1x1 = 38;
+const int BLOB_fire6_squeeze1x1 = 42;
+const int LAYER_fire6_relu_squeeze1x1 = 39;
+const int BLOB_fire6_squeeze1x1_fire6_relu_squeeze1x1 = 43;
+const int LAYER_splitncnn_4 = 40;
+const int BLOB_fire6_squeeze1x1_fire6_relu_squeeze1x1_splitncnn_0 = 44;
+const int BLOB_fire6_squeeze1x1_fire6_relu_squeeze1x1_splitncnn_1 = 45;
+const int LAYER_fire6_expand1x1 = 41;
+const int BLOB_fire6_expand1x1 = 46;
+const int LAYER_fire6_relu_expand1x1 = 42;
+const int BLOB_fire6_expand1x1_fire6_relu_expand1x1 = 47;
+const int LAYER_fire6_expand3x3 = 43;
+const int BLOB_fire6_expand3x3 = 48;
+const int LAYER_fire6_relu_expand3x3 = 44;
+const int BLOB_fire6_expand3x3_fire6_relu_expand3x3 = 49;
+const int LAYER_fire6_concat = 45;
+const int BLOB_fire6_concat = 50;
+const int LAYER_fire7_squeeze1x1 = 46;
+const int BLOB_fire7_squeeze1x1 = 51;
+const int LAYER_fire7_relu_squeeze1x1 = 47;
+const int BLOB_fire7_squeeze1x1_fire7_relu_squeeze1x1 = 52;
+const int LAYER_splitncnn_5 = 48;
+const int BLOB_fire7_squeeze1x1_fire7_relu_squeeze1x1_splitncnn_0 = 53;
+const int BLOB_fire7_squeeze1x1_fire7_relu_squeeze1x1_splitncnn_1 = 54;
+const int LAYER_fire7_expand1x1 = 49;
+const int BLOB_fire7_expand1x1 = 55;
+const int LAYER_fire7_relu_expand1x1 = 50;
+const int BLOB_fire7_expand1x1_fire7_relu_expand1x1 = 56;
+const int LAYER_fire7_expand3x3 = 51;
+const int BLOB_fire7_expand3x3 = 57;
+const int LAYER_fire7_relu_expand3x3 = 52;
+const int BLOB_fire7_expand3x3_fire7_relu_expand3x3 = 58;
+const int LAYER_fire7_concat = 53;
+const int BLOB_fire7_concat = 59;
+const int LAYER_fire8_squeeze1x1 = 54;
+const int BLOB_fire8_squeeze1x1 = 60;
+const int LAYER_fire8_relu_squeeze1x1 = 55;
+const int BLOB_fire8_squeeze1x1_fire8_relu_squeeze1x1 = 61;
+const int LAYER_splitncnn_6 = 56;
+const int BLOB_fire8_squeeze1x1_fire8_relu_squeeze1x1_splitncnn_0 = 62;
+const int BLOB_fire8_squeeze1x1_fire8_relu_squeeze1x1_splitncnn_1 = 63;
+const int LAYER_fire8_expand1x1 = 57;
+const int BLOB_fire8_expand1x1 = 64;
+const int LAYER_fire8_relu_expand1x1 = 58;
+const int BLOB_fire8_expand1x1_fire8_relu_expand1x1 = 65;
+const int LAYER_fire8_expand3x3 = 59;
+const int BLOB_fire8_expand3x3 = 66;
+const int LAYER_fire8_relu_expand3x3 = 60;
+const int BLOB_fire8_expand3x3_fire8_relu_expand3x3 = 67;
+const int LAYER_fire8_concat = 61;
+const int BLOB_fire8_concat = 68;
+const int LAYER_fire9_squeeze1x1 = 62;
+const int BLOB_fire9_squeeze1x1 = 69;
+const int LAYER_fire9_relu_squeeze1x1 = 63;
+const int BLOB_fire9_squeeze1x1_fire9_relu_squeeze1x1 = 70;
+const int LAYER_splitncnn_7 = 64;
+const int BLOB_fire9_squeeze1x1_fire9_relu_squeeze1x1_splitncnn_0 = 71;
+const int BLOB_fire9_squeeze1x1_fire9_relu_squeeze1x1_splitncnn_1 = 72;
+const int LAYER_fire9_expand1x1 = 65;
+const int BLOB_fire9_expand1x1 = 73;
+const int LAYER_fire9_relu_expand1x1 = 66;
+const int BLOB_fire9_expand1x1_fire9_relu_expand1x1 = 74;
+const int LAYER_fire9_expand3x3 = 67;
+const int BLOB_fire9_expand3x3 = 75;
+const int LAYER_fire9_relu_expand3x3 = 68;
+const int BLOB_fire9_expand3x3_fire9_relu_expand3x3 = 76;
+const int LAYER_fire9_concat = 69;
+const int BLOB_fire9_concat = 77;
+const int LAYER_drop9 = 70;
+const int BLOB_fire9_concat_drop9 = 78;
+const int LAYER_conv10 = 71;
+const int BLOB_conv10 = 79;
+const int LAYER_relu_conv10 = 72;
+const int BLOB_conv10_relu_conv10 = 80;
+const int LAYER_pool10 = 73;
+const int BLOB_pool10 = 81;
+const int LAYER_prob = 74;
+const int BLOB_prob = 82;
+} // namespace squeezenet_v1_1_param_id
+#endif // NCNN_INCLUDE_GUARD_squeezenet_v1_1_id_h
diff --git a/examples/squeezencnn/local.properties b/examples/squeezencnn/local.properties
new file mode 100644
index 00000000000..916b3624c19
--- /dev/null
+++ b/examples/squeezencnn/local.properties
@@ -0,0 +1,10 @@
+# This file is automatically generated by Android Tools.
+# Do not modify this file -- YOUR CHANGES WILL BE ERASED!
+#
+# This file must *NOT* be checked into Version Control Systems,
+# as it contains information specific to your local configuration.
+
+# location of the SDK. This is only used by Ant
+# For customization when using a Version Control System, please read the
+# header note.
+sdk.dir=/home/nihui/osd/android-sdk-linux
diff --git a/examples/squeezencnn/proguard-project.txt b/examples/squeezencnn/proguard-project.txt
new file mode 100644
index 00000000000..f2fe1559a21
--- /dev/null
+++ b/examples/squeezencnn/proguard-project.txt
@@ -0,0 +1,20 @@
+# To enable ProGuard in your project, edit project.properties
+# to define the proguard.config property as described in that file.
+#
+# Add project specific ProGuard rules here.
+# By default, the flags in this file are appended to flags specified
+# in ${sdk.dir}/tools/proguard/proguard-android.txt
+# You can edit the include path and order by changing the ProGuard
+# include property in project.properties.
+#
+# For more details, see
+# http://developer.android.com/guide/developing/tools/proguard.html
+
+# Add any project specific keep options here:
+
+# If your project uses WebView with JS, uncomment the following
+# and specify the fully qualified class name to the JavaScript interface
+# class:
+#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
+# public *;
+#}
diff --git a/examples/squeezencnn/project.properties b/examples/squeezencnn/project.properties
new file mode 100644
index 00000000000..c6998b3d101
--- /dev/null
+++ b/examples/squeezencnn/project.properties
@@ -0,0 +1,14 @@
+# This file is automatically generated by Android Tools.
+# Do not modify this file -- YOUR CHANGES WILL BE ERASED!
+#
+# This file must be checked in Version Control Systems.
+#
+# To customize properties used by the Ant build system edit
+# "ant.properties", and override values to adapt the script to your
+# project structure.
+#
+# To enable ProGuard to shrink and obfuscate your code, uncomment this (available properties: sdk.dir, user.home):
+#proguard.config=${sdk.dir}/tools/proguard/proguard-android.txt:proguard-project.txt
+
+# Project target.
+target=android-9
diff --git a/examples/squeezencnn/res/layout/main.xml b/examples/squeezencnn/res/layout/main.xml
new file mode 100644
index 00000000000..37cf35675ba
--- /dev/null
+++ b/examples/squeezencnn/res/layout/main.xml
@@ -0,0 +1,36 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/examples/squeezencnn/res/values/strings.xml b/examples/squeezencnn/res/values/strings.xml
new file mode 100644
index 00000000000..283e0263a07
--- /dev/null
+++ b/examples/squeezencnn/res/values/strings.xml
@@ -0,0 +1,4 @@
+
+
+ squeezencnn
+
diff --git a/examples/squeezencnn/src/com/tencent/squeezencnn/MainActivity.java b/examples/squeezencnn/src/com/tencent/squeezencnn/MainActivity.java
new file mode 100644
index 00000000000..666aaee840e
--- /dev/null
+++ b/examples/squeezencnn/src/com/tencent/squeezencnn/MainActivity.java
@@ -0,0 +1,189 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+package com.tencent.squeezencnn;
+
+import android.app.Activity;
+import android.os.Bundle;
+
+import android.content.Context;
+import android.content.Intent;
+import android.database.Cursor;
+import android.graphics.Bitmap;
+import android.graphics.BitmapFactory;
+import android.net.Uri;
+import android.provider.MediaStore;
+import android.util.Log;
+import android.view.View;
+import android.widget.Button;
+import android.widget.ImageView;
+import android.widget.TextView;
+
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import com.tencent.squeezencnn.SqueezeNcnn;
+
+public class MainActivity extends Activity
+{
+ private static final int SELECT_IMAGE = 1;
+
+ private TextView infoResult;
+ private ImageView imageView;
+ private Bitmap yourSelectedImage = null;
+
+ private SqueezeNcnn squeezencnn = new SqueezeNcnn();
+
+ /** Called when the activity is first created. */
+ @Override
+ public void onCreate(Bundle savedInstanceState)
+ {
+ super.onCreate(savedInstanceState);
+ setContentView(R.layout.main);
+
+ try
+ {
+ initSqueezeNcnn();
+ }
+ catch (IOException e)
+ {
+ Log.e("MainActivity", "initSqueezeNcnn error");
+ }
+
+ infoResult = (TextView) findViewById(R.id.infoResult);
+ imageView = (ImageView) findViewById(R.id.imageView);
+
+ Button buttonImage = (Button) findViewById(R.id.buttonImage);
+ buttonImage.setOnClickListener(new View.OnClickListener() {
+ @Override
+ public void onClick(View arg0) {
+ Intent i = new Intent(Intent.ACTION_PICK);
+ i.setType("image/*");
+ startActivityForResult(i, SELECT_IMAGE);
+ }
+ });
+
+ Button buttonDetect = (Button) findViewById(R.id.buttonDetect);
+ buttonDetect.setOnClickListener(new View.OnClickListener() {
+ @Override
+ public void onClick(View arg0) {
+ if (yourSelectedImage == null)
+ return;
+
+ String result = squeezencnn.Detect(yourSelectedImage);
+
+ if (result == null)
+ {
+ infoResult.setText("detect failed");
+ }
+ else
+ {
+ infoResult.setText(result);
+ }
+ }
+ });
+ }
+
+ private void initSqueezeNcnn() throws IOException
+ {
+ byte[] param = null;
+ byte[] bin = null;
+ byte[] words = null;
+
+ {
+ InputStream assetsInputStream = getAssets().open("squeezenet_v1.1.param.bin");
+ int available = assetsInputStream.available();
+ param = new byte[available];
+ int byteCode = assetsInputStream.read(param);
+ assetsInputStream.close();
+ }
+ {
+ InputStream assetsInputStream = getAssets().open("squeezenet_v1.1.bin");
+ int available = assetsInputStream.available();
+ bin = new byte[available];
+ int byteCode = assetsInputStream.read(bin);
+ assetsInputStream.close();
+ }
+ {
+ InputStream assetsInputStream = getAssets().open("synset_words.txt");
+ int available = assetsInputStream.available();
+ words = new byte[available];
+ int byteCode = assetsInputStream.read(words);
+ assetsInputStream.close();
+ }
+
+ squeezencnn.Init(param, bin, words);
+ }
+
+ @Override
+ protected void onActivityResult(int requestCode, int resultCode, Intent data)
+ {
+ super.onActivityResult(requestCode, resultCode, data);
+
+ if (resultCode == RESULT_OK && null != data) {
+ Uri selectedImage = data.getData();
+
+ try
+ {
+ if (requestCode == SELECT_IMAGE) {
+ Bitmap bitmap = decodeUri(selectedImage);
+
+ Bitmap rgba = bitmap.copy(Bitmap.Config.ARGB_8888, true);
+
+ // resize to 227x227
+ yourSelectedImage = Bitmap.createScaledBitmap(rgba, 227, 227, false);
+
+ imageView.setImageBitmap(yourSelectedImage);
+ }
+ }
+ catch (FileNotFoundException e)
+ {
+ Log.e("MainActivity", "FileNotFoundException");
+ return;
+ }
+ }
+ }
+
+ private Bitmap decodeUri(Uri selectedImage) throws FileNotFoundException
+ {
+ // Decode image size
+ BitmapFactory.Options o = new BitmapFactory.Options();
+ o.inJustDecodeBounds = true;
+ BitmapFactory.decodeStream(getContentResolver().openInputStream(selectedImage), null, o);
+
+ // The new size we want to scale to
+ final int REQUIRED_SIZE = 400;
+
+ // Find the correct scale value. It should be the power of 2.
+ int width_tmp = o.outWidth, height_tmp = o.outHeight;
+ int scale = 1;
+ while (true) {
+ if (width_tmp / 2 < REQUIRED_SIZE
+ || height_tmp / 2 < REQUIRED_SIZE) {
+ break;
+ }
+ width_tmp /= 2;
+ height_tmp /= 2;
+ scale *= 2;
+ }
+
+ // Decode with inSampleSize
+ BitmapFactory.Options o2 = new BitmapFactory.Options();
+ o2.inSampleSize = scale;
+ return BitmapFactory.decodeStream(getContentResolver().openInputStream(selectedImage), null, o2);
+ }
+
+}
diff --git a/examples/squeezencnn/src/com/tencent/squeezencnn/SqueezeNcnn.java b/examples/squeezencnn/src/com/tencent/squeezencnn/SqueezeNcnn.java
new file mode 100644
index 00000000000..ac0b5973229
--- /dev/null
+++ b/examples/squeezencnn/src/com/tencent/squeezencnn/SqueezeNcnn.java
@@ -0,0 +1,29 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+package com.tencent.squeezencnn;
+
+import android.graphics.Bitmap;
+import android.content.Context;
+
+/**
+ * Java-side declaration of the two entry points implemented by the native
+ * "squeezencnn" library.
+ */
+public class SqueezeNcnn
+{
+    // Load the native library when this class is initialised, before either method is used.
+    static {
+        System.loadLibrary("squeezencnn");
+    }
+
+    /** Native setup from three byte arrays: param, bin and words. */
+    public native boolean Init(byte[] param, byte[] bin, byte[] words);
+
+    /** Native classification of one bitmap; the result is returned as a String. */
+    public native String Detect(Bitmap bitmap);
+}
diff --git a/examples/squeezenet.cpp b/examples/squeezenet.cpp
new file mode 100644
index 00000000000..bab2a35ba94
--- /dev/null
+++ b/examples/squeezenet.cpp
@@ -0,0 +1,95 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include
+#include
+#include
+#include
+#include
+
+#include "net.h"
+
+static int detect_squeezenet(const cv::Mat& bgr, std::vector& cls_scores)
+{
+ ncnn::Net squeezenet;
+ squeezenet.load_param("squeezenet_v1.1.param");
+ squeezenet.load_model("squeezenet_v1.1.bin");
+
+ ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, 227, 227);
+
+ const float mean_vals[3] = {104.f, 117.f, 123.f};
+ in.substract_mean_normalize(mean_vals, 0);
+
+ ncnn::Extractor ex = squeezenet.create_extractor();
+ ex.set_light_mode(true);
+
+ ex.input("data", in);
+
+ ncnn::Mat out;
+ ex.extract("prob", out);
+
+ cls_scores.resize(out.c);
+ for (int j=0; j& cls_scores, int topk)
+{
+ // partial sort topk with index
+ int size = cls_scores.size();
+ std::vector< std::pair > vec;
+ vec.resize(size);
+ for (int i=0; i >());
+
+ // print topk and score
+ for (int i=0; i cls_scores;
+ detect_squeezenet(m, cls_scores);
+
+ print_topk(cls_scores, 3);
+
+ return 0;
+}
+
diff --git a/examples/squeezenet_v1.1.bin b/examples/squeezenet_v1.1.bin
new file mode 100644
index 00000000000..2b39bf8c42d
Binary files /dev/null and b/examples/squeezenet_v1.1.bin differ
diff --git a/examples/squeezenet_v1.1.caffemodel b/examples/squeezenet_v1.1.caffemodel
new file mode 100644
index 00000000000..9d2fc33abf6
Binary files /dev/null and b/examples/squeezenet_v1.1.caffemodel differ
diff --git a/examples/squeezenet_v1.1.param b/examples/squeezenet_v1.1.param
new file mode 100644
index 00000000000..6c1bd296e22
--- /dev/null
+++ b/examples/squeezenet_v1.1.param
@@ -0,0 +1,76 @@
+75 83
+Input data 0 1 data 3 227 227
+Convolution conv1 1 1 data conv1 64 3 1 2 0 1 1728
+ReLU relu_conv1 1 1 conv1 conv1_relu_conv1 0.000000
+Pooling pool1 1 1 conv1_relu_conv1 pool1 0 3 2 0 0
+Convolution fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1 16 1 1 1 0 1 1024
+ReLU fire2/relu_squeeze1x1 1 1 fire2/squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1 0.000000
+Split splitncnn_0 1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1
+Convolution fire2/expand1x1 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1 64 1 1 1 0 1 1024
+ReLU fire2/relu_expand1x1 1 1 fire2/expand1x1 fire2/expand1x1_fire2/relu_expand1x1 0.000000
+Convolution fire2/expand3x3 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3 64 3 1 1 1 1 9216
+ReLU fire2/relu_expand3x3 1 1 fire2/expand3x3 fire2/expand3x3_fire2/relu_expand3x3 0.000000
+Concat fire2/concat 2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat
+Convolution fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1 16 1 1 1 0 1 2048
+ReLU fire3/relu_squeeze1x1 1 1 fire3/squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1 0.000000
+Split splitncnn_1 1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1
+Convolution fire3/expand1x1 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1 64 1 1 1 0 1 1024
+ReLU fire3/relu_expand1x1 1 1 fire3/expand1x1 fire3/expand1x1_fire3/relu_expand1x1 0.000000
+Convolution fire3/expand3x3 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3 64 3 1 1 1 1 9216
+ReLU fire3/relu_expand3x3 1 1 fire3/expand3x3 fire3/expand3x3_fire3/relu_expand3x3 0.000000
+Concat fire3/concat 2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat
+Pooling pool3 1 1 fire3/concat pool3 0 3 2 0 0
+Convolution fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1 32 1 1 1 0 1 4096
+ReLU fire4/relu_squeeze1x1 1 1 fire4/squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1 0.000000
+Split splitncnn_2 1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1
+Convolution fire4/expand1x1 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1 128 1 1 1 0 1 4096
+ReLU fire4/relu_expand1x1 1 1 fire4/expand1x1 fire4/expand1x1_fire4/relu_expand1x1 0.000000
+Convolution fire4/expand3x3 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3 128 3 1 1 1 1 36864
+ReLU fire4/relu_expand3x3 1 1 fire4/expand3x3 fire4/expand3x3_fire4/relu_expand3x3 0.000000
+Concat fire4/concat 2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat
+Convolution fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1 32 1 1 1 0 1 8192
+ReLU fire5/relu_squeeze1x1 1 1 fire5/squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1 0.000000
+Split splitncnn_3 1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1
+Convolution fire5/expand1x1 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1 128 1 1 1 0 1 4096
+ReLU fire5/relu_expand1x1 1 1 fire5/expand1x1 fire5/expand1x1_fire5/relu_expand1x1 0.000000
+Convolution fire5/expand3x3 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3 128 3 1 1 1 1 36864
+ReLU fire5/relu_expand3x3 1 1 fire5/expand3x3 fire5/expand3x3_fire5/relu_expand3x3 0.000000
+Concat fire5/concat 2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat
+Pooling pool5 1 1 fire5/concat pool5 0 3 2 0 0
+Convolution fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1 48 1 1 1 0 1 12288
+ReLU fire6/relu_squeeze1x1 1 1 fire6/squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1 0.000000
+Split splitncnn_4 1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1
+Convolution fire6/expand1x1 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1 192 1 1 1 0 1 9216
+ReLU fire6/relu_expand1x1 1 1 fire6/expand1x1 fire6/expand1x1_fire6/relu_expand1x1 0.000000
+Convolution fire6/expand3x3 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3 192 3 1 1 1 1 82944
+ReLU fire6/relu_expand3x3 1 1 fire6/expand3x3 fire6/expand3x3_fire6/relu_expand3x3 0.000000
+Concat fire6/concat 2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat
+Convolution fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1 48 1 1 1 0 1 18432
+ReLU fire7/relu_squeeze1x1 1 1 fire7/squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1 0.000000
+Split splitncnn_5 1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1
+Convolution fire7/expand1x1 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1 192 1 1 1 0 1 9216
+ReLU fire7/relu_expand1x1 1 1 fire7/expand1x1 fire7/expand1x1_fire7/relu_expand1x1 0.000000
+Convolution fire7/expand3x3 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3 192 3 1 1 1 1 82944
+ReLU fire7/relu_expand3x3 1 1 fire7/expand3x3 fire7/expand3x3_fire7/relu_expand3x3 0.000000
+Concat fire7/concat 2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat
+Convolution fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1 64 1 1 1 0 1 24576
+ReLU fire8/relu_squeeze1x1 1 1 fire8/squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1 0.000000
+Split splitncnn_6 1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1
+Convolution fire8/expand1x1 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1 256 1 1 1 0 1 16384
+ReLU fire8/relu_expand1x1 1 1 fire8/expand1x1 fire8/expand1x1_fire8/relu_expand1x1 0.000000
+Convolution fire8/expand3x3 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3 256 3 1 1 1 1 147456
+ReLU fire8/relu_expand3x3 1 1 fire8/expand3x3 fire8/expand3x3_fire8/relu_expand3x3 0.000000
+Concat fire8/concat 2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat
+Convolution fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1 64 1 1 1 0 1 32768
+ReLU fire9/relu_squeeze1x1 1 1 fire9/squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1 0.000000
+Split splitncnn_7 1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1
+Convolution fire9/expand1x1 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1 256 1 1 1 0 1 16384
+ReLU fire9/relu_expand1x1 1 1 fire9/expand1x1 fire9/expand1x1_fire9/relu_expand1x1 0.000000
+Convolution fire9/expand3x3 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3 256 3 1 1 1 1 147456
+ReLU fire9/relu_expand3x3 1 1 fire9/expand3x3 fire9/expand3x3_fire9/relu_expand3x3 0.000000
+Concat fire9/concat 2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat
+Dropout drop9 1 1 fire9/concat fire9/concat_drop9
+Convolution conv10 1 1 fire9/concat_drop9 conv10 1000 1 1 1 1 1 512000
+ReLU relu_conv10 1 1 conv10 conv10_relu_conv10 0.000000
+Pooling pool10 1 1 conv10_relu_conv10 pool10 1 0 1 0 1
+Softmax prob 1 1 pool10 prob
diff --git a/examples/squeezenet_v1.1.prototxt b/examples/squeezenet_v1.1.prototxt
new file mode 100644
index 00000000000..7dc9853b4e5
--- /dev/null
+++ b/examples/squeezenet_v1.1.prototxt
@@ -0,0 +1,548 @@
+name: "squeezenet_v1.1_deploy"
+
+layer {
+ name: "data"
+ type: "Input"
+ top: "data"
+ input_param { shape: { dim: 1 dim: 3 dim: 227 dim: 227 } }
+}
+layer {
+ name: "conv1"
+ type: "Convolution"
+ bottom: "data"
+ top: "conv1"
+ convolution_param {
+ num_output: 64
+ kernel_size: 3
+ stride: 2
+ }
+}
+layer {
+ name: "relu_conv1"
+ type: "ReLU"
+ bottom: "conv1"
+ top: "conv1"
+}
+layer {
+ name: "pool1"
+ type: "Pooling"
+ bottom: "conv1"
+ top: "pool1"
+ pooling_param {
+ pool: MAX
+ kernel_size: 3
+ stride: 2
+ }
+}
+layer {
+ name: "fire2/squeeze1x1"
+ type: "Convolution"
+ bottom: "pool1"
+ top: "fire2/squeeze1x1"
+ convolution_param {
+ num_output: 16
+ kernel_size: 1
+ }
+}
+layer {
+ name: "fire2/relu_squeeze1x1"
+ type: "ReLU"
+ bottom: "fire2/squeeze1x1"
+ top: "fire2/squeeze1x1"
+}
+layer {
+ name: "fire2/expand1x1"
+ type: "Convolution"
+ bottom: "fire2/squeeze1x1"
+ top: "fire2/expand1x1"
+ convolution_param {
+ num_output: 64
+ kernel_size: 1
+ }
+}
+layer {
+ name: "fire2/relu_expand1x1"
+ type: "ReLU"
+ bottom: "fire2/expand1x1"
+ top: "fire2/expand1x1"
+}
+layer {
+ name: "fire2/expand3x3"
+ type: "Convolution"
+ bottom: "fire2/squeeze1x1"
+ top: "fire2/expand3x3"
+ convolution_param {
+ num_output: 64
+ pad: 1
+ kernel_size: 3
+ }
+}
+layer {
+ name: "fire2/relu_expand3x3"
+ type: "ReLU"
+ bottom: "fire2/expand3x3"
+ top: "fire2/expand3x3"
+}
+layer {
+ name: "fire2/concat"
+ type: "Concat"
+ bottom: "fire2/expand1x1"
+ bottom: "fire2/expand3x3"
+ top: "fire2/concat"
+}
+layer {
+ name: "fire3/squeeze1x1"
+ type: "Convolution"
+ bottom: "fire2/concat"
+ top: "fire3/squeeze1x1"
+ convolution_param {
+ num_output: 16
+ kernel_size: 1
+ }
+}
+layer {
+ name: "fire3/relu_squeeze1x1"
+ type: "ReLU"
+ bottom: "fire3/squeeze1x1"
+ top: "fire3/squeeze1x1"
+}
+layer {
+ name: "fire3/expand1x1"
+ type: "Convolution"
+ bottom: "fire3/squeeze1x1"
+ top: "fire3/expand1x1"
+ convolution_param {
+ num_output: 64
+ kernel_size: 1
+ }
+}
+layer {
+ name: "fire3/relu_expand1x1"
+ type: "ReLU"
+ bottom: "fire3/expand1x1"
+ top: "fire3/expand1x1"
+}
+layer {
+ name: "fire3/expand3x3"
+ type: "Convolution"
+ bottom: "fire3/squeeze1x1"
+ top: "fire3/expand3x3"
+ convolution_param {
+ num_output: 64
+ pad: 1
+ kernel_size: 3
+ }
+}
+layer {
+ name: "fire3/relu_expand3x3"
+ type: "ReLU"
+ bottom: "fire3/expand3x3"
+ top: "fire3/expand3x3"
+}
+layer {
+ name: "fire3/concat"
+ type: "Concat"
+ bottom: "fire3/expand1x1"
+ bottom: "fire3/expand3x3"
+ top: "fire3/concat"
+}
+layer {
+ name: "pool3"
+ type: "Pooling"
+ bottom: "fire3/concat"
+ top: "pool3"
+ pooling_param {
+ pool: MAX
+ kernel_size: 3
+ stride: 2
+ }
+}
+layer {
+ name: "fire4/squeeze1x1"
+ type: "Convolution"
+ bottom: "pool3"
+ top: "fire4/squeeze1x1"
+ convolution_param {
+ num_output: 32
+ kernel_size: 1
+ }
+}
+layer {
+ name: "fire4/relu_squeeze1x1"
+ type: "ReLU"
+ bottom: "fire4/squeeze1x1"
+ top: "fire4/squeeze1x1"
+}
+layer {
+ name: "fire4/expand1x1"
+ type: "Convolution"
+ bottom: "fire4/squeeze1x1"
+ top: "fire4/expand1x1"
+ convolution_param {
+ num_output: 128
+ kernel_size: 1
+ }
+}
+layer {
+ name: "fire4/relu_expand1x1"
+ type: "ReLU"
+ bottom: "fire4/expand1x1"
+ top: "fire4/expand1x1"
+}
+layer {
+ name: "fire4/expand3x3"
+ type: "Convolution"
+ bottom: "fire4/squeeze1x1"
+ top: "fire4/expand3x3"
+ convolution_param {
+ num_output: 128
+ pad: 1
+ kernel_size: 3
+ }
+}
+layer {
+ name: "fire4/relu_expand3x3"
+ type: "ReLU"
+ bottom: "fire4/expand3x3"
+ top: "fire4/expand3x3"
+}
+layer {
+ name: "fire4/concat"
+ type: "Concat"
+ bottom: "fire4/expand1x1"
+ bottom: "fire4/expand3x3"
+ top: "fire4/concat"
+}
+layer {
+ name: "fire5/squeeze1x1"
+ type: "Convolution"
+ bottom: "fire4/concat"
+ top: "fire5/squeeze1x1"
+ convolution_param {
+ num_output: 32
+ kernel_size: 1
+ }
+}
+layer {
+ name: "fire5/relu_squeeze1x1"
+ type: "ReLU"
+ bottom: "fire5/squeeze1x1"
+ top: "fire5/squeeze1x1"
+}
+layer {
+ name: "fire5/expand1x1"
+ type: "Convolution"
+ bottom: "fire5/squeeze1x1"
+ top: "fire5/expand1x1"
+ convolution_param {
+ num_output: 128
+ kernel_size: 1
+ }
+}
+layer {
+ name: "fire5/relu_expand1x1"
+ type: "ReLU"
+ bottom: "fire5/expand1x1"
+ top: "fire5/expand1x1"
+}
+layer {
+ name: "fire5/expand3x3"
+ type: "Convolution"
+ bottom: "fire5/squeeze1x1"
+ top: "fire5/expand3x3"
+ convolution_param {
+ num_output: 128
+ pad: 1
+ kernel_size: 3
+ }
+}
+layer {
+ name: "fire5/relu_expand3x3"
+ type: "ReLU"
+ bottom: "fire5/expand3x3"
+ top: "fire5/expand3x3"
+}
+layer {
+ name: "fire5/concat"
+ type: "Concat"
+ bottom: "fire5/expand1x1"
+ bottom: "fire5/expand3x3"
+ top: "fire5/concat"
+}
+layer {
+ name: "pool5"
+ type: "Pooling"
+ bottom: "fire5/concat"
+ top: "pool5"
+ pooling_param {
+ pool: MAX
+ kernel_size: 3
+ stride: 2
+ }
+}
+layer {
+ name: "fire6/squeeze1x1"
+ type: "Convolution"
+ bottom: "pool5"
+ top: "fire6/squeeze1x1"
+ convolution_param {
+ num_output: 48
+ kernel_size: 1
+ }
+}
+layer {
+ name: "fire6/relu_squeeze1x1"
+ type: "ReLU"
+ bottom: "fire6/squeeze1x1"
+ top: "fire6/squeeze1x1"
+}
+layer {
+ name: "fire6/expand1x1"
+ type: "Convolution"
+ bottom: "fire6/squeeze1x1"
+ top: "fire6/expand1x1"
+ convolution_param {
+ num_output: 192
+ kernel_size: 1
+ }
+}
+layer {
+ name: "fire6/relu_expand1x1"
+ type: "ReLU"
+ bottom: "fire6/expand1x1"
+ top: "fire6/expand1x1"
+}
+layer {
+ name: "fire6/expand3x3"
+ type: "Convolution"
+ bottom: "fire6/squeeze1x1"
+ top: "fire6/expand3x3"
+ convolution_param {
+ num_output: 192
+ pad: 1
+ kernel_size: 3
+ }
+}
+layer {
+ name: "fire6/relu_expand3x3"
+ type: "ReLU"
+ bottom: "fire6/expand3x3"
+ top: "fire6/expand3x3"
+}
+layer {
+ name: "fire6/concat"
+ type: "Concat"
+ bottom: "fire6/expand1x1"
+ bottom: "fire6/expand3x3"
+ top: "fire6/concat"
+}
+layer {
+ name: "fire7/squeeze1x1"
+ type: "Convolution"
+ bottom: "fire6/concat"
+ top: "fire7/squeeze1x1"
+ convolution_param {
+ num_output: 48
+ kernel_size: 1
+ }
+}
+layer {
+ name: "fire7/relu_squeeze1x1"
+ type: "ReLU"
+ bottom: "fire7/squeeze1x1"
+ top: "fire7/squeeze1x1"
+}
+layer {
+ name: "fire7/expand1x1"
+ type: "Convolution"
+ bottom: "fire7/squeeze1x1"
+ top: "fire7/expand1x1"
+ convolution_param {
+ num_output: 192
+ kernel_size: 1
+ }
+}
+layer {
+ name: "fire7/relu_expand1x1"
+ type: "ReLU"
+ bottom: "fire7/expand1x1"
+ top: "fire7/expand1x1"
+}
+layer {
+ name: "fire7/expand3x3"
+ type: "Convolution"
+ bottom: "fire7/squeeze1x1"
+ top: "fire7/expand3x3"
+ convolution_param {
+ num_output: 192
+ pad: 1
+ kernel_size: 3
+ }
+}
+layer {
+ name: "fire7/relu_expand3x3"
+ type: "ReLU"
+ bottom: "fire7/expand3x3"
+ top: "fire7/expand3x3"
+}
+layer {
+ name: "fire7/concat"
+ type: "Concat"
+ bottom: "fire7/expand1x1"
+ bottom: "fire7/expand3x3"
+ top: "fire7/concat"
+}
+layer {
+ name: "fire8/squeeze1x1"
+ type: "Convolution"
+ bottom: "fire7/concat"
+ top: "fire8/squeeze1x1"
+ convolution_param {
+ num_output: 64
+ kernel_size: 1
+ }
+}
+layer {
+ name: "fire8/relu_squeeze1x1"
+ type: "ReLU"
+ bottom: "fire8/squeeze1x1"
+ top: "fire8/squeeze1x1"
+}
+layer {
+ name: "fire8/expand1x1"
+ type: "Convolution"
+ bottom: "fire8/squeeze1x1"
+ top: "fire8/expand1x1"
+ convolution_param {
+ num_output: 256
+ kernel_size: 1
+ }
+}
+layer {
+ name: "fire8/relu_expand1x1"
+ type: "ReLU"
+ bottom: "fire8/expand1x1"
+ top: "fire8/expand1x1"
+}
+layer {
+ name: "fire8/expand3x3"
+ type: "Convolution"
+ bottom: "fire8/squeeze1x1"
+ top: "fire8/expand3x3"
+ convolution_param {
+ num_output: 256
+ pad: 1
+ kernel_size: 3
+ }
+}
+layer {
+ name: "fire8/relu_expand3x3"
+ type: "ReLU"
+ bottom: "fire8/expand3x3"
+ top: "fire8/expand3x3"
+}
+layer {
+ name: "fire8/concat"
+ type: "Concat"
+ bottom: "fire8/expand1x1"
+ bottom: "fire8/expand3x3"
+ top: "fire8/concat"
+}
+layer {
+ name: "fire9/squeeze1x1"
+ type: "Convolution"
+ bottom: "fire8/concat"
+ top: "fire9/squeeze1x1"
+ convolution_param {
+ num_output: 64
+ kernel_size: 1
+ }
+}
+layer {
+ name: "fire9/relu_squeeze1x1"
+ type: "ReLU"
+ bottom: "fire9/squeeze1x1"
+ top: "fire9/squeeze1x1"
+}
+layer {
+ name: "fire9/expand1x1"
+ type: "Convolution"
+ bottom: "fire9/squeeze1x1"
+ top: "fire9/expand1x1"
+ convolution_param {
+ num_output: 256
+ kernel_size: 1
+ }
+}
+layer {
+ name: "fire9/relu_expand1x1"
+ type: "ReLU"
+ bottom: "fire9/expand1x1"
+ top: "fire9/expand1x1"
+}
+layer {
+ name: "fire9/expand3x3"
+ type: "Convolution"
+ bottom: "fire9/squeeze1x1"
+ top: "fire9/expand3x3"
+ convolution_param {
+ num_output: 256
+ pad: 1
+ kernel_size: 3
+ }
+}
+layer {
+ name: "fire9/relu_expand3x3"
+ type: "ReLU"
+ bottom: "fire9/expand3x3"
+ top: "fire9/expand3x3"
+}
+layer {
+ name: "fire9/concat"
+ type: "Concat"
+ bottom: "fire9/expand1x1"
+ bottom: "fire9/expand3x3"
+ top: "fire9/concat"
+}
+layer {
+ name: "drop9"
+ type: "Dropout"
+ bottom: "fire9/concat"
+ top: "fire9/concat"
+ dropout_param {
+ dropout_ratio: 0.5
+ }
+}
+layer {
+ name: "conv10"
+ type: "Convolution"
+ bottom: "fire9/concat"
+ top: "conv10"
+ convolution_param {
+ num_output: 1000
+ pad: 1
+ kernel_size: 1
+ }
+}
+layer {
+ name: "relu_conv10"
+ type: "ReLU"
+ bottom: "conv10"
+ top: "conv10"
+}
+layer {
+ name: "pool10"
+ type: "Pooling"
+ bottom: "conv10"
+ top: "pool10"
+ pooling_param {
+ pool: AVE
+ global_pooling: true
+ }
+}
+layer {
+ name: "prob"
+ type: "Softmax"
+ bottom: "pool10"
+ top: "prob"
+}
diff --git a/examples/synset_words.txt b/examples/synset_words.txt
new file mode 100644
index 00000000000..a9e8c7f50d1
--- /dev/null
+++ b/examples/synset_words.txt
@@ -0,0 +1,1000 @@
+n01440764 tench, Tinca tinca
+n01443537 goldfish, Carassius auratus
+n01484850 great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias
+n01491361 tiger shark, Galeocerdo cuvieri
+n01494475 hammerhead, hammerhead shark
+n01496331 electric ray, crampfish, numbfish, torpedo
+n01498041 stingray
+n01514668 cock
+n01514859 hen
+n01518878 ostrich, Struthio camelus
+n01530575 brambling, Fringilla montifringilla
+n01531178 goldfinch, Carduelis carduelis
+n01532829 house finch, linnet, Carpodacus mexicanus
+n01534433 junco, snowbird
+n01537544 indigo bunting, indigo finch, indigo bird, Passerina cyanea
+n01558993 robin, American robin, Turdus migratorius
+n01560419 bulbul
+n01580077 jay
+n01582220 magpie
+n01592084 chickadee
+n01601694 water ouzel, dipper
+n01608432 kite
+n01614925 bald eagle, American eagle, Haliaeetus leucocephalus
+n01616318 vulture
+n01622779 great grey owl, great gray owl, Strix nebulosa
+n01629819 European fire salamander, Salamandra salamandra
+n01630670 common newt, Triturus vulgaris
+n01631663 eft
+n01632458 spotted salamander, Ambystoma maculatum
+n01632777 axolotl, mud puppy, Ambystoma mexicanum
+n01641577 bullfrog, Rana catesbeiana
+n01644373 tree frog, tree-frog
+n01644900 tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui
+n01664065 loggerhead, loggerhead turtle, Caretta caretta
+n01665541 leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea
+n01667114 mud turtle
+n01667778 terrapin
+n01669191 box turtle, box tortoise
+n01675722 banded gecko
+n01677366 common iguana, iguana, Iguana iguana
+n01682714 American chameleon, anole, Anolis carolinensis
+n01685808 whiptail, whiptail lizard
+n01687978 agama
+n01688243 frilled lizard, Chlamydosaurus kingi
+n01689811 alligator lizard
+n01692333 Gila monster, Heloderma suspectum
+n01693334 green lizard, Lacerta viridis
+n01694178 African chameleon, Chamaeleo chamaeleon
+n01695060 Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis
+n01697457 African crocodile, Nile crocodile, Crocodylus niloticus
+n01698640 American alligator, Alligator mississipiensis
+n01704323 triceratops
+n01728572 thunder snake, worm snake, Carphophis amoenus
+n01728920 ringneck snake, ring-necked snake, ring snake
+n01729322 hognose snake, puff adder, sand viper
+n01729977 green snake, grass snake
+n01734418 king snake, kingsnake
+n01735189 garter snake, grass snake
+n01737021 water snake
+n01739381 vine snake
+n01740131 night snake, Hypsiglena torquata
+n01742172 boa constrictor, Constrictor constrictor
+n01744401 rock python, rock snake, Python sebae
+n01748264 Indian cobra, Naja naja
+n01749939 green mamba
+n01751748 sea snake
+n01753488 horned viper, cerastes, sand viper, horned asp, Cerastes cornutus
+n01755581 diamondback, diamondback rattlesnake, Crotalus adamanteus
+n01756291 sidewinder, horned rattlesnake, Crotalus cerastes
+n01768244 trilobite
+n01770081 harvestman, daddy longlegs, Phalangium opilio
+n01770393 scorpion
+n01773157 black and gold garden spider, Argiope aurantia
+n01773549 barn spider, Araneus cavaticus
+n01773797 garden spider, Aranea diademata
+n01774384 black widow, Latrodectus mactans
+n01774750 tarantula
+n01775062 wolf spider, hunting spider
+n01776313 tick
+n01784675 centipede
+n01795545 black grouse
+n01796340 ptarmigan
+n01797886 ruffed grouse, partridge, Bonasa umbellus
+n01798484 prairie chicken, prairie grouse, prairie fowl
+n01806143 peacock
+n01806567 quail
+n01807496 partridge
+n01817953 African grey, African gray, Psittacus erithacus
+n01818515 macaw
+n01819313 sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita
+n01820546 lorikeet
+n01824575 coucal
+n01828970 bee eater
+n01829413 hornbill
+n01833805 hummingbird
+n01843065 jacamar
+n01843383 toucan
+n01847000 drake
+n01855032 red-breasted merganser, Mergus serrator
+n01855672 goose
+n01860187 black swan, Cygnus atratus
+n01871265 tusker
+n01872401 echidna, spiny anteater, anteater
+n01873310 platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus
+n01877812 wallaby, brush kangaroo
+n01882714 koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus
+n01883070 wombat
+n01910747 jellyfish
+n01914609 sea anemone, anemone
+n01917289 brain coral
+n01924916 flatworm, platyhelminth
+n01930112 nematode, nematode worm, roundworm
+n01943899 conch
+n01944390 snail
+n01945685 slug
+n01950731 sea slug, nudibranch
+n01955084 chiton, coat-of-mail shell, sea cradle, polyplacophore
+n01968897 chambered nautilus, pearly nautilus, nautilus
+n01978287 Dungeness crab, Cancer magister
+n01978455 rock crab, Cancer irroratus
+n01980166 fiddler crab
+n01981276 king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica
+n01983481 American lobster, Northern lobster, Maine lobster, Homarus americanus
+n01984695 spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish
+n01985128 crayfish, crawfish, crawdad, crawdaddy
+n01986214 hermit crab
+n01990800 isopod
+n02002556 white stork, Ciconia ciconia
+n02002724 black stork, Ciconia nigra
+n02006656 spoonbill
+n02007558 flamingo
+n02009229 little blue heron, Egretta caerulea
+n02009912 American egret, great white heron, Egretta albus
+n02011460 bittern
+n02012849 crane
+n02013706 limpkin, Aramus pictus
+n02017213 European gallinule, Porphyrio porphyrio
+n02018207 American coot, marsh hen, mud hen, water hen, Fulica americana
+n02018795 bustard
+n02025239 ruddy turnstone, Arenaria interpres
+n02027492 red-backed sandpiper, dunlin, Erolia alpina
+n02028035 redshank, Tringa totanus
+n02033041 dowitcher
+n02037110 oystercatcher, oyster catcher
+n02051845 pelican
+n02056570 king penguin, Aptenodytes patagonica
+n02058221 albatross, mollymawk
+n02066245 grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus
+n02071294 killer whale, killer, orca, grampus, sea wolf, Orcinus orca
+n02074367 dugong, Dugong dugon
+n02077923 sea lion
+n02085620 Chihuahua
+n02085782 Japanese spaniel
+n02085936 Maltese dog, Maltese terrier, Maltese
+n02086079 Pekinese, Pekingese, Peke
+n02086240 Shih-Tzu
+n02086646 Blenheim spaniel
+n02086910 papillon
+n02087046 toy terrier
+n02087394 Rhodesian ridgeback
+n02088094 Afghan hound, Afghan
+n02088238 basset, basset hound
+n02088364 beagle
+n02088466 bloodhound, sleuthhound
+n02088632 bluetick
+n02089078 black-and-tan coonhound
+n02089867 Walker hound, Walker foxhound
+n02089973 English foxhound
+n02090379 redbone
+n02090622 borzoi, Russian wolfhound
+n02090721 Irish wolfhound
+n02091032 Italian greyhound
+n02091134 whippet
+n02091244 Ibizan hound, Ibizan Podenco
+n02091467 Norwegian elkhound, elkhound
+n02091635 otterhound, otter hound
+n02091831 Saluki, gazelle hound
+n02092002 Scottish deerhound, deerhound
+n02092339 Weimaraner
+n02093256 Staffordshire bullterrier, Staffordshire bull terrier
+n02093428 American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier
+n02093647 Bedlington terrier
+n02093754 Border terrier
+n02093859 Kerry blue terrier
+n02093991 Irish terrier
+n02094114 Norfolk terrier
+n02094258 Norwich terrier
+n02094433 Yorkshire terrier
+n02095314 wire-haired fox terrier
+n02095570 Lakeland terrier
+n02095889 Sealyham terrier, Sealyham
+n02096051 Airedale, Airedale terrier
+n02096177 cairn, cairn terrier
+n02096294 Australian terrier
+n02096437 Dandie Dinmont, Dandie Dinmont terrier
+n02096585 Boston bull, Boston terrier
+n02097047 miniature schnauzer
+n02097130 giant schnauzer
+n02097209 standard schnauzer
+n02097298 Scotch terrier, Scottish terrier, Scottie
+n02097474 Tibetan terrier, chrysanthemum dog
+n02097658 silky terrier, Sydney silky
+n02098105 soft-coated wheaten terrier
+n02098286 West Highland white terrier
+n02098413 Lhasa, Lhasa apso
+n02099267 flat-coated retriever
+n02099429 curly-coated retriever
+n02099601 golden retriever
+n02099712 Labrador retriever
+n02099849 Chesapeake Bay retriever
+n02100236 German short-haired pointer
+n02100583 vizsla, Hungarian pointer
+n02100735 English setter
+n02100877 Irish setter, red setter
+n02101006 Gordon setter
+n02101388 Brittany spaniel
+n02101556 clumber, clumber spaniel
+n02102040 English springer, English springer spaniel
+n02102177 Welsh springer spaniel
+n02102318 cocker spaniel, English cocker spaniel, cocker
+n02102480 Sussex spaniel
+n02102973 Irish water spaniel
+n02104029 kuvasz
+n02104365 schipperke
+n02105056 groenendael
+n02105162 malinois
+n02105251 briard
+n02105412 kelpie
+n02105505 komondor
+n02105641 Old English sheepdog, bobtail
+n02105855 Shetland sheepdog, Shetland sheep dog, Shetland
+n02106030 collie
+n02106166 Border collie
+n02106382 Bouvier des Flandres, Bouviers des Flandres
+n02106550 Rottweiler
+n02106662 German shepherd, German shepherd dog, German police dog, alsatian
+n02107142 Doberman, Doberman pinscher
+n02107312 miniature pinscher
+n02107574 Greater Swiss Mountain dog
+n02107683 Bernese mountain dog
+n02107908 Appenzeller
+n02108000 EntleBucher
+n02108089 boxer
+n02108422 bull mastiff
+n02108551 Tibetan mastiff
+n02108915 French bulldog
+n02109047 Great Dane
+n02109525 Saint Bernard, St Bernard
+n02109961 Eskimo dog, husky
+n02110063 malamute, malemute, Alaskan malamute
+n02110185 Siberian husky
+n02110341 dalmatian, coach dog, carriage dog
+n02110627 affenpinscher, monkey pinscher, monkey dog
+n02110806 basenji
+n02110958 pug, pug-dog
+n02111129 Leonberg
+n02111277 Newfoundland, Newfoundland dog
+n02111500 Great Pyrenees
+n02111889 Samoyed, Samoyede
+n02112018 Pomeranian
+n02112137 chow, chow chow
+n02112350 keeshond
+n02112706 Brabancon griffon
+n02113023 Pembroke, Pembroke Welsh corgi
+n02113186 Cardigan, Cardigan Welsh corgi
+n02113624 toy poodle
+n02113712 miniature poodle
+n02113799 standard poodle
+n02113978 Mexican hairless
+n02114367 timber wolf, grey wolf, gray wolf, Canis lupus
+n02114548 white wolf, Arctic wolf, Canis lupus tundrarum
+n02114712 red wolf, maned wolf, Canis rufus, Canis niger
+n02114855 coyote, prairie wolf, brush wolf, Canis latrans
+n02115641 dingo, warrigal, warragal, Canis dingo
+n02115913 dhole, Cuon alpinus
+n02116738 African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus
+n02117135 hyena, hyaena
+n02119022 red fox, Vulpes vulpes
+n02119789 kit fox, Vulpes macrotis
+n02120079 Arctic fox, white fox, Alopex lagopus
+n02120505 grey fox, gray fox, Urocyon cinereoargenteus
+n02123045 tabby, tabby cat
+n02123159 tiger cat
+n02123394 Persian cat
+n02123597 Siamese cat, Siamese
+n02124075 Egyptian cat
+n02125311 cougar, puma, catamount, mountain lion, painter, panther, Felis concolor
+n02127052 lynx, catamount
+n02128385 leopard, Panthera pardus
+n02128757 snow leopard, ounce, Panthera uncia
+n02128925 jaguar, panther, Panthera onca, Felis onca
+n02129165 lion, king of beasts, Panthera leo
+n02129604 tiger, Panthera tigris
+n02130308 cheetah, chetah, Acinonyx jubatus
+n02132136 brown bear, bruin, Ursus arctos
+n02133161 American black bear, black bear, Ursus americanus, Euarctos americanus
+n02134084 ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus
+n02134418 sloth bear, Melursus ursinus, Ursus ursinus
+n02137549 mongoose
+n02138441 meerkat, mierkat
+n02165105 tiger beetle
+n02165456 ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle
+n02167151 ground beetle, carabid beetle
+n02168699 long-horned beetle, longicorn, longicorn beetle
+n02169497 leaf beetle, chrysomelid
+n02172182 dung beetle
+n02174001 rhinoceros beetle
+n02177972 weevil
+n02190166 fly
+n02206856 bee
+n02219486 ant, emmet, pismire
+n02226429 grasshopper, hopper
+n02229544 cricket
+n02231487 walking stick, walkingstick, stick insect
+n02233338 cockroach, roach
+n02236044 mantis, mantid
+n02256656 cicada, cicala
+n02259212 leafhopper
+n02264363 lacewing, lacewing fly
+n02268443 dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk
+n02268853 damselfly
+n02276258 admiral
+n02277742 ringlet, ringlet butterfly
+n02279972 monarch, monarch butterfly, milkweed butterfly, Danaus plexippus
+n02280649 cabbage butterfly
+n02281406 sulphur butterfly, sulfur butterfly
+n02281787 lycaenid, lycaenid butterfly
+n02317335 starfish, sea star
+n02319095 sea urchin
+n02321529 sea cucumber, holothurian
+n02325366 wood rabbit, cottontail, cottontail rabbit
+n02326432 hare
+n02328150 Angora, Angora rabbit
+n02342885 hamster
+n02346627 porcupine, hedgehog
+n02356798 fox squirrel, eastern fox squirrel, Sciurus niger
+n02361337 marmot
+n02363005 beaver
+n02364673 guinea pig, Cavia cobaya
+n02389026 sorrel
+n02391049 zebra
+n02395406 hog, pig, grunter, squealer, Sus scrofa
+n02396427 wild boar, boar, Sus scrofa
+n02397096 warthog
+n02398521 hippopotamus, hippo, river horse, Hippopotamus amphibius
+n02403003 ox
+n02408429 water buffalo, water ox, Asiatic buffalo, Bubalus bubalis
+n02410509 bison
+n02412080 ram, tup
+n02415577 bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis
+n02417914 ibex, Capra ibex
+n02422106 hartebeest
+n02422699 impala, Aepyceros melampus
+n02423022 gazelle
+n02437312 Arabian camel, dromedary, Camelus dromedarius
+n02437616 llama
+n02441942 weasel
+n02442845 mink
+n02443114 polecat, fitch, foulmart, foumart, Mustela putorius
+n02443484 black-footed ferret, ferret, Mustela nigripes
+n02444819 otter
+n02445715 skunk, polecat, wood pussy
+n02447366 badger
+n02454379 armadillo
+n02457408 three-toed sloth, ai, Bradypus tridactylus
+n02480495 orangutan, orang, orangutang, Pongo pygmaeus
+n02480855 gorilla, Gorilla gorilla
+n02481823 chimpanzee, chimp, Pan troglodytes
+n02483362 gibbon, Hylobates lar
+n02483708 siamang, Hylobates syndactylus, Symphalangus syndactylus
+n02484975 guenon, guenon monkey
+n02486261 patas, hussar monkey, Erythrocebus patas
+n02486410 baboon
+n02487347 macaque
+n02488291 langur
+n02488702 colobus, colobus monkey
+n02489166 proboscis monkey, Nasalis larvatus
+n02490219 marmoset
+n02492035 capuchin, ringtail, Cebus capucinus
+n02492660 howler monkey, howler
+n02493509 titi, titi monkey
+n02493793 spider monkey, Ateles geoffroyi
+n02494079 squirrel monkey, Saimiri sciureus
+n02497673 Madagascar cat, ring-tailed lemur, Lemur catta
+n02500267 indri, indris, Indri indri, Indri brevicaudatus
+n02504013 Indian elephant, Elephas maximus
+n02504458 African elephant, Loxodonta africana
+n02509815 lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens
+n02510455 giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca
+n02514041 barracouta, snoek
+n02526121 eel
+n02536864 coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch
+n02606052 rock beauty, Holocanthus tricolor
+n02607072 anemone fish
+n02640242 sturgeon
+n02641379 gar, garfish, garpike, billfish, Lepisosteus osseus
+n02643566 lionfish
+n02655020 puffer, pufferfish, blowfish, globefish
+n02666196 abacus
+n02667093 abaya
+n02669723 academic gown, academic robe, judge's robe
+n02672831 accordion, piano accordion, squeeze box
+n02676566 acoustic guitar
+n02687172 aircraft carrier, carrier, flattop, attack aircraft carrier
+n02690373 airliner
+n02692877 airship, dirigible
+n02699494 altar
+n02701002 ambulance
+n02704792 amphibian, amphibious vehicle
+n02708093 analog clock
+n02727426 apiary, bee house
+n02730930 apron
+n02747177 ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin
+n02749479 assault rifle, assault gun
+n02769748 backpack, back pack, knapsack, packsack, rucksack, haversack
+n02776631 bakery, bakeshop, bakehouse
+n02777292 balance beam, beam
+n02782093 balloon
+n02783161 ballpoint, ballpoint pen, ballpen, Biro
+n02786058 Band Aid
+n02787622 banjo
+n02788148 bannister, banister, balustrade, balusters, handrail
+n02790996 barbell
+n02791124 barber chair
+n02791270 barbershop
+n02793495 barn
+n02794156 barometer
+n02795169 barrel, cask
+n02797295 barrow, garden cart, lawn cart, wheelbarrow
+n02799071 baseball
+n02802426 basketball
+n02804414 bassinet
+n02804610 bassoon
+n02807133 bathing cap, swimming cap
+n02808304 bath towel
+n02808440 bathtub, bathing tub, bath, tub
+n02814533 beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon
+n02814860 beacon, lighthouse, beacon light, pharos
+n02815834 beaker
+n02817516 bearskin, busby, shako
+n02823428 beer bottle
+n02823750 beer glass
+n02825657 bell cote, bell cot
+n02834397 bib
+n02835271 bicycle-built-for-two, tandem bicycle, tandem
+n02837789 bikini, two-piece
+n02840245 binder, ring-binder
+n02841315 binoculars, field glasses, opera glasses
+n02843684 birdhouse
+n02859443 boathouse
+n02860847 bobsled, bobsleigh, bob
+n02865351 bolo tie, bolo, bola tie, bola
+n02869837 bonnet, poke bonnet
+n02870880 bookcase
+n02871525 bookshop, bookstore, bookstall
+n02877765 bottlecap
+n02879718 bow
+n02883205 bow tie, bow-tie, bowtie
+n02892201 brass, memorial tablet, plaque
+n02892767 brassiere, bra, bandeau
+n02894605 breakwater, groin, groyne, mole, bulwark, seawall, jetty
+n02895154 breastplate, aegis, egis
+n02906734 broom
+n02909870 bucket, pail
+n02910353 buckle
+n02916936 bulletproof vest
+n02917067 bullet train, bullet
+n02927161 butcher shop, meat market
+n02930766 cab, hack, taxi, taxicab
+n02939185 caldron, cauldron
+n02948072 candle, taper, wax light
+n02950826 cannon
+n02951358 canoe
+n02951585 can opener, tin opener
+n02963159 cardigan
+n02965783 car mirror
+n02966193 carousel, carrousel, merry-go-round, roundabout, whirligig
+n02966687 carpenter's kit, tool kit
+n02971356 carton
+n02974003 car wheel
+n02977058 cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM
+n02978881 cassette
+n02979186 cassette player
+n02980441 castle
+n02981792 catamaran
+n02988304 CD player
+n02992211 cello, violoncello
+n02992529 cellular telephone, cellular phone, cellphone, cell, mobile phone
+n02999410 chain
+n03000134 chainlink fence
+n03000247 chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour
+n03000684 chain saw, chainsaw
+n03014705 chest
+n03016953 chiffonier, commode
+n03017168 chime, bell, gong
+n03018349 china cabinet, china closet
+n03026506 Christmas stocking
+n03028079 church, church building
+n03032252 cinema, movie theater, movie theatre, movie house, picture palace
+n03041632 cleaver, meat cleaver, chopper
+n03042490 cliff dwelling
+n03045698 cloak
+n03047690 clog, geta, patten, sabot
+n03062245 cocktail shaker
+n03063599 coffee mug
+n03063689 coffeepot
+n03065424 coil, spiral, volute, whorl, helix
+n03075370 combination lock
+n03085013 computer keyboard, keypad
+n03089624 confectionery, confectionary, candy store
+n03095699 container ship, containership, container vessel
+n03100240 convertible
+n03109150 corkscrew, bottle screw
+n03110669 cornet, horn, trumpet, trump
+n03124043 cowboy boot
+n03124170 cowboy hat, ten-gallon hat
+n03125729 cradle
+n03126707 crane
+n03127747 crash helmet
+n03127925 crate
+n03131574 crib, cot
+n03133878 Crock Pot
+n03134739 croquet ball
+n03141823 crutch
+n03146219 cuirass
+n03160309 dam, dike, dyke
+n03179701 desk
+n03180011 desktop computer
+n03187595 dial telephone, dial phone
+n03188531 diaper, nappy, napkin
+n03196217 digital clock
+n03197337 digital watch
+n03201208 dining table, board
+n03207743 dishrag, dishcloth
+n03207941 dishwasher, dish washer, dishwashing machine
+n03208938 disk brake, disc brake
+n03216828 dock, dockage, docking facility
+n03218198 dogsled, dog sled, dog sleigh
+n03220513 dome
+n03223299 doormat, welcome mat
+n03240683 drilling platform, offshore rig
+n03249569 drum, membranophone, tympan
+n03250847 drumstick
+n03255030 dumbbell
+n03259280 Dutch oven
+n03271574 electric fan, blower
+n03272010 electric guitar
+n03272562 electric locomotive
+n03290653 entertainment center
+n03291819 envelope
+n03297495 espresso maker
+n03314780 face powder
+n03325584 feather boa, boa
+n03337140 file, file cabinet, filing cabinet
+n03344393 fireboat
+n03345487 fire engine, fire truck
+n03347037 fire screen, fireguard
+n03355925 flagpole, flagstaff
+n03372029 flute, transverse flute
+n03376595 folding chair
+n03379051 football helmet
+n03384352 forklift
+n03388043 fountain
+n03388183 fountain pen
+n03388549 four-poster
+n03393912 freight car
+n03394916 French horn, horn
+n03400231 frying pan, frypan, skillet
+n03404251 fur coat
+n03417042 garbage truck, dustcart
+n03424325 gasmask, respirator, gas helmet
+n03425413 gas pump, gasoline pump, petrol pump, island dispenser
+n03443371 goblet
+n03444034 go-kart
+n03445777 golf ball
+n03445924 golfcart, golf cart
+n03447447 gondola
+n03447721 gong, tam-tam
+n03450230 gown
+n03452741 grand piano, grand
+n03457902 greenhouse, nursery, glasshouse
+n03459775 grille, radiator grille
+n03461385 grocery store, grocery, food market, market
+n03467068 guillotine
+n03476684 hair slide
+n03476991 hair spray
+n03478589 half track
+n03481172 hammer
+n03482405 hamper
+n03483316 hand blower, blow dryer, blow drier, hair dryer, hair drier
+n03485407 hand-held computer, hand-held microcomputer
+n03485794 handkerchief, hankie, hanky, hankey
+n03492542 hard disc, hard disk, fixed disk
+n03494278 harmonica, mouth organ, harp, mouth harp
+n03495258 harp
+n03496892 harvester, reaper
+n03498962 hatchet
+n03527444 holster
+n03529860 home theater, home theatre
+n03530642 honeycomb
+n03532672 hook, claw
+n03534580 hoopskirt, crinoline
+n03535780 horizontal bar, high bar
+n03538406 horse cart, horse-cart
+n03544143 hourglass
+n03584254 iPod
+n03584829 iron, smoothing iron
+n03590841 jack-o'-lantern
+n03594734 jean, blue jean, denim
+n03594945 jeep, landrover
+n03595614 jersey, T-shirt, tee shirt
+n03598930 jigsaw puzzle
+n03599486 jinrikisha, ricksha, rickshaw
+n03602883 joystick
+n03617480 kimono
+n03623198 knee pad
+n03627232 knot
+n03630383 lab coat, laboratory coat
+n03633091 ladle
+n03637318 lampshade, lamp shade
+n03642806 laptop, laptop computer
+n03649909 lawn mower, mower
+n03657121 lens cap, lens cover
+n03658185 letter opener, paper knife, paperknife
+n03661043 library
+n03662601 lifeboat
+n03666591 lighter, light, igniter, ignitor
+n03670208 limousine, limo
+n03673027 liner, ocean liner
+n03676483 lipstick, lip rouge
+n03680355 Loafer
+n03690938 lotion
+n03691459 loudspeaker, speaker, speaker unit, loudspeaker system, speaker system
+n03692522 loupe, jeweler's loupe
+n03697007 lumbermill, sawmill
+n03706229 magnetic compass
+n03709823 mailbag, postbag
+n03710193 mailbox, letter box
+n03710637 maillot
+n03710721 maillot, tank suit
+n03717622 manhole cover
+n03720891 maraca
+n03721384 marimba, xylophone
+n03724870 mask
+n03729826 matchstick
+n03733131 maypole
+n03733281 maze, labyrinth
+n03733805 measuring cup
+n03742115 medicine chest, medicine cabinet
+n03743016 megalith, megalithic structure
+n03759954 microphone, mike
+n03761084 microwave, microwave oven
+n03763968 military uniform
+n03764736 milk can
+n03769881 minibus
+n03770439 miniskirt, mini
+n03770679 minivan
+n03773504 missile
+n03775071 mitten
+n03775546 mixing bowl
+n03776460 mobile home, manufactured home
+n03777568 Model T
+n03777754 modem
+n03781244 monastery
+n03782006 monitor
+n03785016 moped
+n03786901 mortar
+n03787032 mortarboard
+n03788195 mosque
+n03788365 mosquito net
+n03791053 motor scooter, scooter
+n03792782 mountain bike, all-terrain bike, off-roader
+n03792972 mountain tent
+n03793489 mouse, computer mouse
+n03794056 mousetrap
+n03796401 moving van
+n03803284 muzzle
+n03804744 nail
+n03814639 neck brace
+n03814906 necklace
+n03825788 nipple
+n03832673 notebook, notebook computer
+n03837869 obelisk
+n03838899 oboe, hautboy, hautbois
+n03840681 ocarina, sweet potato
+n03841143 odometer, hodometer, mileometer, milometer
+n03843555 oil filter
+n03854065 organ, pipe organ
+n03857828 oscilloscope, scope, cathode-ray oscilloscope, CRO
+n03866082 overskirt
+n03868242 oxcart
+n03868863 oxygen mask
+n03871628 packet
+n03873416 paddle, boat paddle
+n03874293 paddlewheel, paddle wheel
+n03874599 padlock
+n03876231 paintbrush
+n03877472 pajama, pyjama, pj's, jammies
+n03877845 palace
+n03884397 panpipe, pandean pipe, syrinx
+n03887697 paper towel
+n03888257 parachute, chute
+n03888605 parallel bars, bars
+n03891251 park bench
+n03891332 parking meter
+n03895866 passenger car, coach, carriage
+n03899768 patio, terrace
+n03902125 pay-phone, pay-station
+n03903868 pedestal, plinth, footstall
+n03908618 pencil box, pencil case
+n03908714 pencil sharpener
+n03916031 perfume, essence
+n03920288 Petri dish
+n03924679 photocopier
+n03929660 pick, plectrum, plectron
+n03929855 pickelhaube
+n03930313 picket fence, paling
+n03930630 pickup, pickup truck
+n03933933 pier
+n03935335 piggy bank, penny bank
+n03937543 pill bottle
+n03938244 pillow
+n03942813 ping-pong ball
+n03944341 pinwheel
+n03947888 pirate, pirate ship
+n03950228 pitcher, ewer
+n03954731 plane, carpenter's plane, woodworking plane
+n03956157 planetarium
+n03958227 plastic bag
+n03961711 plate rack
+n03967562 plow, plough
+n03970156 plunger, plumber's helper
+n03976467 Polaroid camera, Polaroid Land camera
+n03976657 pole
+n03977966 police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria
+n03980874 poncho
+n03982430 pool table, billiard table, snooker table
+n03983396 pop bottle, soda bottle
+n03991062 pot, flowerpot
+n03992509 potter's wheel
+n03995372 power drill
+n03998194 prayer rug, prayer mat
+n04004767 printer
+n04005630 prison, prison house
+n04008634 projectile, missile
+n04009552 projector
+n04019541 puck, hockey puck
+n04023962 punching bag, punch bag, punching ball, punchball
+n04026417 purse
+n04033901 quill, quill pen
+n04033995 quilt, comforter, comfort, puff
+n04037443 racer, race car, racing car
+n04039381 racket, racquet
+n04040759 radiator
+n04041544 radio, wireless
+n04044716 radio telescope, radio reflector
+n04049303 rain barrel
+n04065272 recreational vehicle, RV, R.V.
+n04067472 reel
+n04069434 reflex camera
+n04070727 refrigerator, icebox
+n04074963 remote control, remote
+n04081281 restaurant, eating house, eating place, eatery
+n04086273 revolver, six-gun, six-shooter
+n04090263 rifle
+n04099969 rocking chair, rocker
+n04111531 rotisserie
+n04116512 rubber eraser, rubber, pencil eraser
+n04118538 rugby ball
+n04118776 rule, ruler
+n04120489 running shoe
+n04125021 safe
+n04127249 safety pin
+n04131690 saltshaker, salt shaker
+n04133789 sandal
+n04136333 sarong
+n04141076 sax, saxophone
+n04141327 scabbard
+n04141975 scale, weighing machine
+n04146614 school bus
+n04147183 schooner
+n04149813 scoreboard
+n04152593 screen, CRT screen
+n04153751 screw
+n04154565 screwdriver
+n04162706 seat belt, seatbelt
+n04179913 sewing machine
+n04192698 shield, buckler
+n04200800 shoe shop, shoe-shop, shoe store
+n04201297 shoji
+n04204238 shopping basket
+n04204347 shopping cart
+n04208210 shovel
+n04209133 shower cap
+n04209239 shower curtain
+n04228054 ski
+n04229816 ski mask
+n04235860 sleeping bag
+n04238763 slide rule, slipstick
+n04239074 sliding door
+n04243546 slot, one-armed bandit
+n04251144 snorkel
+n04252077 snowmobile
+n04252225 snowplow, snowplough
+n04254120 soap dispenser
+n04254680 soccer ball
+n04254777 sock
+n04258138 solar dish, solar collector, solar furnace
+n04259630 sombrero
+n04263257 soup bowl
+n04264628 space bar
+n04265275 space heater
+n04266014 space shuttle
+n04270147 spatula
+n04273569 speedboat
+n04275548 spider web, spider's web
+n04277352 spindle
+n04285008 sports car, sport car
+n04286575 spotlight, spot
+n04296562 stage
+n04310018 steam locomotive
+n04311004 steel arch bridge
+n04311174 steel drum
+n04317175 stethoscope
+n04325704 stole
+n04326547 stone wall
+n04328186 stopwatch, stop watch
+n04330267 stove
+n04332243 strainer
+n04335435 streetcar, tram, tramcar, trolley, trolley car
+n04336792 stretcher
+n04344873 studio couch, day bed
+n04346328 stupa, tope
+n04347754 submarine, pigboat, sub, U-boat
+n04350905 suit, suit of clothes
+n04355338 sundial
+n04355933 sunglass
+n04356056 sunglasses, dark glasses, shades
+n04357314 sunscreen, sunblock, sun blocker
+n04366367 suspension bridge
+n04367480 swab, swob, mop
+n04370456 sweatshirt
+n04371430 swimming trunks, bathing trunks
+n04371774 swing
+n04372370 switch, electric switch, electrical switch
+n04376876 syringe
+n04380533 table lamp
+n04389033 tank, army tank, armored combat vehicle, armoured combat vehicle
+n04392985 tape player
+n04398044 teapot
+n04399382 teddy, teddy bear
+n04404412 television, television system
+n04409515 tennis ball
+n04417672 thatch, thatched roof
+n04418357 theater curtain, theatre curtain
+n04423845 thimble
+n04428191 thresher, thrasher, threshing machine
+n04429376 throne
+n04435653 tile roof
+n04442312 toaster
+n04443257 tobacco shop, tobacconist shop, tobacconist
+n04447861 toilet seat
+n04456115 torch
+n04458633 totem pole
+n04461696 tow truck, tow car, wrecker
+n04462240 toyshop
+n04465501 tractor
+n04467665 trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi
+n04476259 tray
+n04479046 trench coat
+n04482393 tricycle, trike, velocipede
+n04483307 trimaran
+n04485082 tripod
+n04486054 triumphal arch
+n04487081 trolleybus, trolley coach, trackless trolley
+n04487394 trombone
+n04493381 tub, vat
+n04501370 turnstile
+n04505470 typewriter keyboard
+n04507155 umbrella
+n04509417 unicycle, monocycle
+n04515003 upright, upright piano
+n04517823 vacuum, vacuum cleaner
+n04522168 vase
+n04523525 vault
+n04525038 velvet
+n04525305 vending machine
+n04532106 vestment
+n04532670 viaduct
+n04536866 violin, fiddle
+n04540053 volleyball
+n04542943 waffle iron
+n04548280 wall clock
+n04548362 wallet, billfold, notecase, pocketbook
+n04550184 wardrobe, closet, press
+n04552348 warplane, military plane
+n04553703 washbasin, handbasin, washbowl, lavabo, wash-hand basin
+n04554684 washer, automatic washer, washing machine
+n04557648 water bottle
+n04560804 water jug
+n04562935 water tower
+n04579145 whiskey jug
+n04579432 whistle
+n04584207 wig
+n04589890 window screen
+n04590129 window shade
+n04591157 Windsor tie
+n04591713 wine bottle
+n04592741 wing
+n04596742 wok
+n04597913 wooden spoon
+n04599235 wool, woolen, woollen
+n04604644 worm fence, snake fence, snake-rail fence, Virginia fence
+n04606251 wreck
+n04612504 yawl
+n04613696 yurt
+n06359193 web site, website, internet site, site
+n06596364 comic book
+n06785654 crossword puzzle, crossword
+n06794110 street sign
+n06874185 traffic light, traffic signal, stoplight
+n07248320 book jacket, dust cover, dust jacket, dust wrapper
+n07565083 menu
+n07579787 plate
+n07583066 guacamole
+n07584110 consomme
+n07590611 hot pot, hotpot
+n07613480 trifle
+n07614500 ice cream, icecream
+n07615774 ice lolly, lolly, lollipop, popsicle
+n07684084 French loaf
+n07693725 bagel, beigel
+n07695742 pretzel
+n07697313 cheeseburger
+n07697537 hotdog, hot dog, red hot
+n07711569 mashed potato
+n07714571 head cabbage
+n07714990 broccoli
+n07715103 cauliflower
+n07716358 zucchini, courgette
+n07716906 spaghetti squash
+n07717410 acorn squash
+n07717556 butternut squash
+n07718472 cucumber, cuke
+n07718747 artichoke, globe artichoke
+n07720875 bell pepper
+n07730033 cardoon
+n07734744 mushroom
+n07742313 Granny Smith
+n07745940 strawberry
+n07747607 orange
+n07749582 lemon
+n07753113 fig
+n07753275 pineapple, ananas
+n07753592 banana
+n07754684 jackfruit, jak, jack
+n07760859 custard apple
+n07768694 pomegranate
+n07802026 hay
+n07831146 carbonara
+n07836838 chocolate sauce, chocolate syrup
+n07860988 dough
+n07871810 meat loaf, meatloaf
+n07873807 pizza, pizza pie
+n07875152 potpie
+n07880968 burrito
+n07892512 red wine
+n07920052 espresso
+n07930864 cup
+n07932039 eggnog
+n09193705 alp
+n09229709 bubble
+n09246464 cliff, drop, drop-off
+n09256479 coral reef
+n09288635 geyser
+n09332890 lakeside, lakeshore
+n09399592 promontory, headland, head, foreland
+n09421951 sandbar, sand bar
+n09428293 seashore, coast, seacoast, sea-coast
+n09468604 valley, vale
+n09472597 volcano
+n09835506 ballplayer, baseball player
+n10148035 groom, bridegroom
+n10565667 scuba diver
+n11879895 rapeseed
+n11939491 daisy
+n12057211 yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum
+n12144580 corn
+n12267677 acorn
+n12620546 hip, rose hip, rosehip
+n12768682 buckeye, horse chestnut, conker
+n12985857 coral fungus
+n12998815 agaric
+n13037406 gyromitra
+n13040303 stinkhorn, carrion fungus
+n13044778 earthstar
+n13052670 hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa
+n13054560 bolete
+n13133613 ear, spike, capitulum
+n15075141 toilet tissue, toilet paper, bathroom tissue
diff --git a/ios.toolchain.cmake b/ios.toolchain.cmake
new file mode 100644
index 00000000000..05176def861
--- /dev/null
+++ b/ios.toolchain.cmake
@@ -0,0 +1,193 @@
+# This file is based off of the Platform/Darwin.cmake and Platform/UnixPaths.cmake
+# files which are included with CMake 2.8.4
+# It has been altered for iOS development
+
+# Options:
+#
+# IOS_PLATFORM = iPhoneOS (default) or iPhoneSimulator
+# This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders
+# iPhoneOS - the default, used to build for iPhone and iPad physical devices, which have an arm arch.
+# iPhoneSimulator - used to build for the Simulator platforms, which have an x86 arch.
+#
+# CMAKE_IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder
+# By default this location is automatically chosen based on the IOS_PLATFORM value above.
+# If set manually, it will override the default location and force the use of a particular Developer Platform
+#
+# CMAKE_IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder
+# By default this location is automatically chosen based on the CMAKE_IOS_DEVELOPER_ROOT value.
+# In this case it will always be the most up-to-date SDK found in the CMAKE_IOS_DEVELOPER_ROOT path.
+# If set manually, this will force the use of a specific SDK version
+
+# Macros:
+#
+# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE)
+# A convenience macro for setting xcode specific properties on targets
+# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1")
+#
+# find_host_package (PROGRAM ARGS)
+# A macro used to find executable programs on the host system, not within the iOS environment.
+# Thanks to the android-cmake project for providing the command
+
+# Standard settings: describe the target as a Darwin system and set the
+# UNIX / APPLE / IOS switches that platform-specific CMake logic tests for.
+set (CMAKE_SYSTEM_NAME Darwin)
+set (CMAKE_SYSTEM_VERSION 1)
+set (UNIX True)
+set (APPLE True)
+set (IOS True)
+
+# Required as of cmake 2.8.10
+set (CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE)
+
+# Determine the cmake host system version so we know where to find the iOS SDKs.
+# execute_process replaces the deprecated exec_program, and the uname binary
+# that find_program located is the one that actually gets run.
+find_program (CMAKE_UNAME uname /bin /usr/bin /usr/local/bin)
+if (CMAKE_UNAME)
+  execute_process (COMMAND "${CMAKE_UNAME}" -r
+    OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_VERSION
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+  # Keep only the leading numeric component of the release string ("15.6.0" -> "15")
+  string (REGEX REPLACE "^([0-9]+)\\.([0-9]+).*$" "\\1" DARWIN_MAJOR_VERSION "${CMAKE_HOST_SYSTEM_VERSION}")
+endif ()
+
+# Force the compilers to clang for iOS.
+# The CMakeForceCompiler macros take the compiler path followed by the
+# compiler id ("Apple" here); the archiver is forced to plain `ar` as well.
+include (CMakeForceCompiler)
+CMAKE_FORCE_C_COMPILER (/usr/bin/clang Apple)
+CMAKE_FORCE_CXX_COMPILER (/usr/bin/clang++ Apple)
+set(CMAKE_AR ar CACHE FILEPATH "" FORCE)
+
+# Skip the platform compiler checks for cross compiling
+set (CMAKE_CXX_COMPILER_WORKS TRUE)
+set (CMAKE_C_COMPILER_WORKS TRUE)
+
+# All iOS/Darwin specific settings - some may be redundant
+# Naming of shared libraries (.dylib) and loadable modules (.so)
+set (CMAKE_SHARED_LIBRARY_PREFIX "lib")
+set (CMAKE_SHARED_LIBRARY_SUFFIX ".dylib")
+set (CMAKE_SHARED_MODULE_PREFIX "lib")
+set (CMAKE_SHARED_MODULE_SUFFIX ".so")
+set (CMAKE_MODULE_EXISTS 1)
+set (CMAKE_DL_LIBS "")
+
+# Linker flags used to stamp dylib version numbers; the C++ flags mirror the C ones
+set (CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ")
+set (CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ")
+set (CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}")
+set (CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}")
+
+# Hidden visibility is required for cxx on iOS
+# NOTE(review): CMAKE_OSX_SYSROOT is only assigned further down in this file (as a
+# cache entry), so it may still be empty the first time these lines run - confirm.
+set (CMAKE_C_FLAGS_INIT "-isysroot ${CMAKE_OSX_SYSROOT} -miphoneos-version-min=6.0")
+set (CMAKE_CXX_FLAGS_INIT "-stdlib=libc++ -fvisibility=hidden -fvisibility-inlines-hidden -isysroot ${CMAKE_OSX_SYSROOT} -miphoneos-version-min=6.0")
+
+# Prepend -search_paths_first to whatever link flags are already set
+set (CMAKE_C_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}")
+set (CMAKE_CXX_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}")
+
+# Flags for creating dylibs and bundles, and the library suffixes find_library tries
+set (CMAKE_PLATFORM_HAS_INSTALLNAME 1)
+set (CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names")
+set (CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names")
+set (CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,")
+set (CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,")
+set (CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a")
+
+# Compatibility shim for old build trees: a newer cmake expects
+# CMAKE_INSTALL_NAME_TOOL, but a tree configured by an older cmake (which
+# hardcoded install_name_tool) has no such entry and CMakeFindBinUtils.cmake
+# is not rerun to create it. Look the tool up here when it is still undefined.
+if (NOT DEFINED CMAKE_INSTALL_NAME_TOOL)
+  find_program (CMAKE_INSTALL_NAME_TOOL install_name_tool)
+endif ()
+
+# Setup iOS platform unless specified manually with IOS_PLATFORM
+if (NOT DEFINED IOS_PLATFORM)
+  set (IOS_PLATFORM "iPhoneOS")
+endif ()
+set (IOS_PLATFORM "${IOS_PLATFORM}" CACHE STRING "Type of iOS Platform")
+
+# Check the platform selection and setup for developer root.
+# The expansions are quoted so that an empty IOS_PLATFORM reaches the
+# FATAL_ERROR branch below instead of producing a malformed if() expression.
+if ("${IOS_PLATFORM}" STREQUAL "iPhoneOS")
+  set (IOS_PLATFORM_LOCATION "iPhoneOS.platform")
+
+  # This causes the installers to properly locate the output libraries
+  set (CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos")
+elseif ("${IOS_PLATFORM}" STREQUAL "iPhoneSimulator")
+  set (IOS_PLATFORM_LOCATION "iPhoneSimulator.platform")
+
+  # This causes the installers to properly locate the output libraries
+  set (CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator")
+else ()
+  message (FATAL_ERROR "Unsupported IOS_PLATFORM value selected. Please choose iPhoneOS or iPhoneSimulator")
+endif ()
+
+# Pick the iOS developer directory unless CMAKE_IOS_DEVELOPER_ROOT was given.
+# Xcode 4.3 moved the install location, so the newer layout is tried first
+# and the older one only when the newer directory is absent.
+set (XCODE_POST_43_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
+set (XCODE_PRE_43_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
+if (NOT DEFINED CMAKE_IOS_DEVELOPER_ROOT AND EXISTS "${XCODE_POST_43_ROOT}")
+  set (CMAKE_IOS_DEVELOPER_ROOT "${XCODE_POST_43_ROOT}")
+elseif (NOT DEFINED CMAKE_IOS_DEVELOPER_ROOT AND EXISTS "${XCODE_PRE_43_ROOT}")
+  set (CMAKE_IOS_DEVELOPER_ROOT "${XCODE_PRE_43_ROOT}")
+endif ()
+# Publish the result (possibly empty) as a cache entry
+set (CMAKE_IOS_DEVELOPER_ROOT ${CMAKE_IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform")
+
+# Find and use the most recent iOS sdk unless specified manually with CMAKE_IOS_SDK_ROOT
+if (NOT DEFINED CMAKE_IOS_SDK_ROOT)
+  file (GLOB _CMAKE_IOS_SDKS "${CMAKE_IOS_DEVELOPER_ROOT}/SDKs/*")
+  if (NOT _CMAKE_IOS_SDKS)
+    message (FATAL_ERROR "No iOS SDK's found in default search path ${CMAKE_IOS_DEVELOPER_ROOT}. Manually set CMAKE_IOS_SDK_ROOT or install the iOS SDK.")
+  endif ()
+
+  # Fallback choice: last entry in lexicographic order. This is what gets used
+  # when no SDK directory name carries a version number.
+  list (SORT _CMAKE_IOS_SDKS)
+  list (REVERSE _CMAKE_IOS_SDKS)
+  list (GET _CMAKE_IOS_SDKS 0 CMAKE_IOS_SDK_ROOT)
+
+  # Lexicographic order ranks "iPhoneOS9.3.sdk" above "iPhoneOS10.0.sdk", so
+  # compare the version embedded in each directory name numerically instead.
+  set (_CMAKE_IOS_SDK_BEST_VERSION "")
+  foreach (_CMAKE_IOS_SDK ${_CMAKE_IOS_SDKS})
+    get_filename_component (_CMAKE_IOS_SDK_NAME "${_CMAKE_IOS_SDK}" NAME)
+    if ("${_CMAKE_IOS_SDK_NAME}" MATCHES "([0-9]+(\\.[0-9]+)*)\\.sdk$")
+      set (_CMAKE_IOS_SDK_VERSION "${CMAKE_MATCH_1}")
+      if ("${_CMAKE_IOS_SDK_BEST_VERSION}" STREQUAL ""
+          OR "${_CMAKE_IOS_SDK_VERSION}" VERSION_GREATER "${_CMAKE_IOS_SDK_BEST_VERSION}")
+        set (_CMAKE_IOS_SDK_BEST_VERSION "${_CMAKE_IOS_SDK_VERSION}")
+        set (CMAKE_IOS_SDK_ROOT "${_CMAKE_IOS_SDK}")
+      endif ()
+    endif ()
+  endforeach ()
+
+  message (STATUS "Toolchain using default iOS SDK: ${CMAKE_IOS_SDK_ROOT}")
+endif ()
+set (CMAKE_IOS_SDK_ROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK")
+
+# Set the sysroot default to the most recent SDK
+set (CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")
+
+# set the architecture for iOS
+# NOTE: Currently both ARCHS_STANDARD_32_BIT and ARCHS_UNIVERSAL_IPHONE_OS set armv7 only, so set both manually
+# Device builds target armv7; every other platform value (the simulator) targets i386.
+if ("${IOS_PLATFORM}" STREQUAL "iPhoneOS")
+  set (IOS_ARCH armv7)
+else ()
+  set (IOS_ARCH i386)
+endif ()
+
+# Cache entry types are upper-case keywords; a lower-case "string" is not a
+# recognised type and is only implicitly converted to STRING.
+set (CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS")
+
+# Set the find root to the iOS developer roots and to user defined paths
+set (CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS find search path root")
+
+# Frameworks take precedence over plain libraries and headers in find_* calls
+set (CMAKE_FIND_FRAMEWORK FIRST)
+
+# Default framework search directories, all located inside the selected SDK
+set (CMAKE_SYSTEM_FRAMEWORK_PATH
+  "${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks"
+  "${CMAKE_IOS_SDK_ROOT}/System/Library/PrivateFrameworks"
+  "${CMAKE_IOS_SDK_ROOT}/Developer/Library/Frameworks"
+)
+
+# Restrict program, library and include lookups to the find root paths
+# (the iOS SDKs) so nothing is picked up from the rest of the host filesystem
+set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
+set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+
+# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE)
+# Sets the XCODE_ATTRIBUTE_<XCODE_PROPERTY> property of TARGET to XCODE_VALUE.
+# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1")
+macro (set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE)
+  set_property (
+    TARGET ${TARGET}
+    PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE}
+  )
+endmacro ()
+
+
+# find_host_package (<find_package arguments>...)
+# Runs find_package against the host system rather than the iOS SDK: the
+# find-root restrictions and the IOS flag are switched off for the duration of
+# the call, then set back to the values this toolchain file uses.
+macro (find_host_package)
+  # Lift the "search the find root only" restriction
+  set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+  set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
+  set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
+  set (IOS FALSE)
+
+  find_package (${ARGN})
+
+  # Back to cross-compiling mode
+  set (IOS TRUE)
+  set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
+  set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+  set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+endmacro ()
+
diff --git a/iossimxc.toolchain.cmake b/iossimxc.toolchain.cmake
new file mode 100644
index 00000000000..27bda76a25b
--- /dev/null
+++ b/iossimxc.toolchain.cmake
@@ -0,0 +1,40 @@
+# Toolchain file for building iOS Simulator binaries with the
+# i386-apple-darwin11-clang cross compiler.
+
+# Standard settings
+# set(UNIX True)
+# set(Darwin True)
+# set(IOS True)
+set (CMAKE_SYSTEM_NAME Darwin)
+set (CMAKE_SYSTEM_VERSION 1)
+set (UNIX True)
+set (APPLE True)
+set (IOS True)
+
+# suppress -rdynamic
+# set(CMAKE_SYSTEM_NAME Generic)
+
+set(CMAKE_C_COMPILER i386-apple-darwin11-clang)
+set(CMAKE_CXX_COMPILER i386-apple-darwin11-clang++)
+
+set(_CMAKE_TOOLCHAIN_PREFIX i386-apple-darwin11-)
+
+# Location of the iOS Simulator SDK. The default below is specific to one
+# machine; override it with -DCMAKE_IOS_SDK_ROOT=/path/to/SDK. Being a cache
+# entry without FORCE, a value supplied by the user is left untouched.
+set(CMAKE_IOS_SDK_ROOT "/home/nihui/osd/cctools-port/usage_examples/ios_toolchain/target-sim/SDK/" CACHE PATH "Location of the iOS Simulator SDK")
+
+# Use the SDK root as the sysroot
+set(CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS Simulator support")
+
+# set the architecture for iOS
+# set(IOS_ARCH i386)
+# set(IOS_ARCH x86_64)
+set(IOS_ARCH i386;x86_64)
+
+# Cache entry types are upper-case keywords (STRING, not string)
+set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS Simulator")
+
+# Set the find root to the iOS developer roots and to user defined paths
+set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS Simulator find search path root")
+
+# search for frameworks first
+set(CMAKE_FIND_FRAMEWORK FIRST)
+
+# set up the default search directories for frameworks
+set(CMAKE_SYSTEM_FRAMEWORK_PATH
+  ${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks
+)
diff --git a/iosxc.toolchain.cmake b/iosxc.toolchain.cmake
new file mode 100644
index 00000000000..a4e9751b5a5
--- /dev/null
+++ b/iosxc.toolchain.cmake
@@ -0,0 +1,39 @@
+# Toolchain file for building iOS device binaries with the
+# arm-apple-darwin11-clang cross compiler.
+
+# Standard settings
+# set(UNIX True)
+# set(Darwin True)
+# set(IOS True)
+set (CMAKE_SYSTEM_NAME Darwin)
+set (CMAKE_SYSTEM_VERSION 1)
+set (UNIX True)
+set (APPLE True)
+set (IOS True)
+
+# suppress -rdynamic
+# set(CMAKE_SYSTEM_NAME Generic)
+
+set(CMAKE_C_COMPILER arm-apple-darwin11-clang)
+set(CMAKE_CXX_COMPILER arm-apple-darwin11-clang++)
+
+set(_CMAKE_TOOLCHAIN_PREFIX arm-apple-darwin11-)
+
+# Location of the iOS SDK. The default below is specific to one machine;
+# override it with -DCMAKE_IOS_SDK_ROOT=/path/to/SDK. Being a cache entry
+# without FORCE, a value supplied by the user is left untouched.
+set(CMAKE_IOS_SDK_ROOT "/home/nihui/osd/cctools-port/usage_examples/ios_toolchain/target/SDK/" CACHE PATH "Location of the iOS SDK")
+
+# Use the SDK root as the sysroot
+set(CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")
+
+# set the architecture for iOS
+# set(IOS_ARCH arm64)
+set(IOS_ARCH armv7;arm64)
+
+# Cache entry types are upper-case keywords (STRING, not string)
+set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS")
+
+# Set the find root to the iOS developer roots and to user defined paths
+set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS find search path root")
+
+# search for frameworks first
+set(CMAKE_FIND_FRAMEWORK FIRST)
+
+# set up the default search directories for frameworks
+set(CMAKE_SYSTEM_FRAMEWORK_PATH
+  ${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks
+)
diff --git a/package.sh b/package.sh
new file mode 100644
index 00000000000..ff743923b4c
--- /dev/null
+++ b/package.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/bash
+
+NAME=ncnn
+
+##### package android lib
+ANDROIDPKGNAME=${NAME}-android-lib
+rm -rf $ANDROIDPKGNAME
+mkdir -p $ANDROIDPKGNAME
+mkdir -p $ANDROIDPKGNAME/armeabi-v7a
+mkdir -p $ANDROIDPKGNAME/arm64-v8a
+mkdir -p $ANDROIDPKGNAME/include
+cp build-android-armv7/install/lib/lib${NAME}.a $ANDROIDPKGNAME/armeabi-v7a/
+cp build-android-aarch64/install/lib/lib${NAME}.a $ANDROIDPKGNAME/arm64-v8a/
+cp build-android-aarch64/install/include/* $ANDROIDPKGNAME/include/
+rm -f $ANDROIDPKGNAME.zip
+zip -9 -r $ANDROIDPKGNAME.zip $ANDROIDPKGNAME
+
+##### package ios framework
+IOSPKGNAME=${NAME}.framework
+rm -rf $IOSPKGNAME
+mkdir -p $IOSPKGNAME/Versions/A/Headers
+mkdir -p $IOSPKGNAME/Versions/A/Resources
+ln -s A $IOSPKGNAME/Versions/Current
+ln -s Versions/Current/Headers $IOSPKGNAME/Headers
+ln -s Versions/Current/Resources $IOSPKGNAME/Resources
+ln -s Versions/Current/${NAME} $IOSPKGNAME/${NAME}
+lipo -create \
+ build-ios/install/lib/lib${NAME}.a \
+ build-ios-sim/install/lib/lib${NAME}.a \
+ -o $IOSPKGNAME/Versions/A/${NAME}
+cp -r build-ios/install/include/* $IOSPKGNAME/Versions/A/Headers/
+cp Info.plist ${IOSPKGNAME}/Versions/A/Resources/
+rm -f $IOSPKGNAME.zip
+zip -9 -y -r $IOSPKGNAME.zip $IOSPKGNAME
+
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 00000000000..d9e491eb059
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,135 @@
+
+##############################################
+
+# Generate platform.h in the build tree from its template
+configure_file(platform.h.in "${CMAKE_CURRENT_BINARY_DIR}/platform.h")
+
+# Header search path: sources, generated headers, then the layer sources
+include_directories(
+    "${CMAKE_CURRENT_SOURCE_DIR}"
+    "${CMAKE_CURRENT_BINARY_DIR}"
+    "${CMAKE_CURRENT_SOURCE_DIR}/layer"
+)
+
+# Core sources; ncnn_add_layer() appends the per-layer sources to this list
+set(ncnn_SRCS blob.cpp cpu.cpp layer.cpp mat.cpp mat_pixel.cpp net.cpp opencv.cpp)
+
+# ncnn_add_layer(<Class> [<default>])
+#
+# Declares the WITH_LAYER_<class-in-lowercase> option (default ON, or <default>
+# when exactly one extra argument is passed). For an enabled layer the generic
+# source layer/<name>.cpp is appended to ncnn_SRCS, plus an arm or x86 specific
+# source when that file exists. Every call appends one entry to
+# layer_registry.h (a null creator for a disabled layer); enabled layers also
+# get a creator declaration in layer_declaration.h.
+macro(ncnn_add_layer class)
+    string(TOLOWER ${class} name)
+
+    # WITH_LAYER_xxx option
+    set(ncnn_layer_default ON)
+    if(${ARGC} EQUAL 2)
+        set(ncnn_layer_default ${ARGV1})
+    endif()
+    option(WITH_LAYER_${name} "build with layer ${name}" ${ncnn_layer_default})
+
+    message("WITH_LAYER_${name} = ${WITH_LAYER_${name}}")
+
+    if(WITH_LAYER_${name})
+        list(APPEND ncnn_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/layer/${name}.cpp")
+
+        # arch specific implementation: arm for android armv7-a / aarch64 and
+        # ios armv7 / arm64, x86 for everything else
+        if((ANDROID AND (("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7-a")
+                      OR ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")))
+            OR (IOS AND (("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "armv7")
+                      OR ("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "arm64")
+                      OR ("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "armv7;arm64"))))
+            if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/layer/arm/${name}_arm.cpp")
+                list(APPEND ncnn_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/layer/arm/${name}_arm.cpp")
+                set(WITH_LAYER_${name}_arm 1)
+            endif()
+        elseif(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/layer/x86/${name}_x86.cpp")
+            list(APPEND ncnn_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/layer/x86/${name}_x86.cpp")
+            set(WITH_LAYER_${name}_x86 1)
+        endif()
+
+        # creator symbol of the most specific implementation that was selected
+        if(WITH_LAYER_${name}_arm)
+            set(ncnn_layer_creator "${class}_arm_layer_creator")
+        elseif(WITH_LAYER_${name}_x86)
+            set(ncnn_layer_creator "${class}_x86_layer_creator")
+        else()
+            set(ncnn_layer_creator "${class}_layer_creator")
+        endif()
+
+        file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/layer_declaration.h
+            "extern Layer* ${ncnn_layer_creator}();\n")
+    else()
+        # disabled layers keep their registry slot, with a null creator
+        set(ncnn_layer_creator "0")
+    endif()
+
+    # one registry entry per call, in call order
+    file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/layer_registry.h
+        "#if NCNN_STRING\n{\"${class}\",${ncnn_layer_creator}},\n#else\n{${ncnn_layer_creator}},\n#endif\n")
+endmacro()
+
+# create new
+# ncnn_add_layer() only ever appends to these two generated headers, so any
+# copies left over from an earlier configure run are deleted first.
+file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/layer_declaration.h)
+file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/layer_registry.h)
+
+# layer implementation
+# One call per layer class; a trailing OFF makes the layer opt-in through its
+# WITH_LAYER_<name> option. Entries are appended to layer_registry.h in call
+# order, so the order of this list fixes the order of the generated entries.
+# NOTE(review): the registry position is presumably used as the layer type
+# index elsewhere - confirm before reordering or inserting entries.
+ncnn_add_layer(AbsVal)
+ncnn_add_layer(ArgMax OFF)
+ncnn_add_layer(BatchNorm)
+ncnn_add_layer(Bias)
+ncnn_add_layer(BNLL)
+ncnn_add_layer(Concat)
+ncnn_add_layer(Convolution)
+ncnn_add_layer(Crop)
+ncnn_add_layer(Deconvolution)
+ncnn_add_layer(Dropout)
+ncnn_add_layer(Eltwise)
+ncnn_add_layer(ELU)
+ncnn_add_layer(Embed OFF)
+ncnn_add_layer(Exp)
+ncnn_add_layer(Flatten)
+ncnn_add_layer(InnerProduct)
+ncnn_add_layer(Input)
+ncnn_add_layer(Log)
+ncnn_add_layer(LRN)
+ncnn_add_layer(MemoryData OFF)
+ncnn_add_layer(MVN)
+ncnn_add_layer(Pooling)
+ncnn_add_layer(Power)
+ncnn_add_layer(PReLU)
+ncnn_add_layer(Proposal OFF)
+ncnn_add_layer(Reduction OFF)
+ncnn_add_layer(ReLU)
+ncnn_add_layer(Reshape OFF)
+ncnn_add_layer(ROIPooling OFF)
+ncnn_add_layer(Scale)
+ncnn_add_layer(Sigmoid)
+ncnn_add_layer(Slice)
+ncnn_add_layer(Softmax)
+ncnn_add_layer(Split)
+ncnn_add_layer(SPP OFF)
+ncnn_add_layer(TanH)
+ncnn_add_layer(Threshold)
+ncnn_add_layer(Tile OFF)
+ncnn_add_layer(RNN OFF)
+ncnn_add_layer(LSTM OFF)
+
+# Static library built from the core sources plus every enabled layer source
+add_library(ncnn STATIC ${ncnn_SRCS})
+
+# Headers shipped with the library; platform.h is the generated one
+set(ncnn_INSTALL_HEADERS
+    blob.h
+    cpu.h
+    layer.h
+    mat.h
+    net.h
+    opencv.h
+    ${CMAKE_CURRENT_BINARY_DIR}/platform.h
+)
+
+install(TARGETS ncnn ARCHIVE DESTINATION lib)
+install(FILES ${ncnn_INSTALL_HEADERS} DESTINATION include)
diff --git a/src/blob.cpp b/src/blob.cpp
new file mode 100644
index 00000000000..8af899fb799
--- /dev/null
+++ b/src/blob.cpp
@@ -0,0 +1,24 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "blob.h"
+
+namespace ncnn {
+
+// A new blob has no producing layer yet; -1 marks that state.
+Blob::Blob()
+    : producer(-1)
+{
+}
+
+} // namespace ncnn
diff --git a/src/blob.h b/src/blob.h
new file mode 100644
index 00000000000..31f2c1d48d7
--- /dev/null
+++ b/src/blob.h
@@ -0,0 +1,43 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BLOB_H
+#define NCNN_BLOB_H
+
+#include
+#include
+#include "platform.h"
+
+namespace ncnn {
+
+// Records which layer writes a blob and which layers read it, together with
+// the blob's name when NCNN_STRING is enabled.
+class Blob
+{
+public:
+    // empty
+    Blob();
+
+public:
+#if NCNN_STRING
+    // blob name
+    std::string name;
+#endif // NCNN_STRING
+    // layer index which produce this blob as output
+    int producer;
+    // layer index which need this blob as input
+    // (int elements, the same index type as producer)
+    std::vector<int> consumers;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_BLOB_H
diff --git a/src/cpu.cpp b/src/cpu.cpp
new file mode 100644
index 00000000000..c43832a165c
--- /dev/null
+++ b/src/cpu.cpp
@@ -0,0 +1,471 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "cpu.h"
+
+#include
+#include
+
+#ifdef _OPENMP
+#include
+#endif
+
+#ifdef __ANDROID__
+#include
+#include
+#endif
+
+#if __APPLE__
+#include "TargetConditionals.h"
+#if TARGET_OS_IPHONE
+#include
+#include
+#include
+#define __IOS__ 1
+#endif
+#endif
+
+namespace ncnn {
+
+#ifdef __ANDROID__
+
+// Extract the ELF hardware-capability bitmap (the AT_HWCAP entry) from /proc/self/auxv.
+// Returns 0 when the file cannot be opened or no AT_HWCAP entry is found.
+static unsigned int get_elf_hwcap_from_proc_self_auxv()
+{
+    FILE* fp = fopen("/proc/self/auxv", "rb");
+    if (!fp)
+    {
+        return 0;
+    }
+
+#define AT_HWCAP 16
+#define AT_HWCAP2 26
+
+    // Every auxiliary-vector record is a pair of native machine words, i.e. two
+    // unsigned longs (4 bytes each on 32-bit ARM, 8 bytes each on aarch64).
+    // Fixed 32-bit fields would split each 64-bit record in two, so the AT_HWCAP
+    // tag would be paired with the upper half of the tag word (always 0).
+    struct { unsigned long tag; unsigned long value; } entry;
+
+    unsigned int result = 0;
+    // fread returns the number of complete records delivered; anything other
+    // than 1 means end of file or a read error
+    while (fread((char*)&entry, sizeof(entry), 1, fp) == 1)
+    {
+        // an all-zero record terminates the vector
+        if (entry.tag == 0 && entry.value == 0)
+            break;
+
+        if (entry.tag == AT_HWCAP)
+        {
+            // every HWCAP_* bit tested in this file lies in the low 32 bits
+            result = (unsigned int)entry.value;
+            break;
+        }
+    }
+
+    fclose(fp);
+
+    return result;
+}
+
+// hardware-capability bitmap, read once when this translation unit is initialized
+static unsigned int g_hwcaps = get_elf_hwcap_from_proc_self_auxv();
+
+// bit masks tested against g_hwcaps; the set differs between 64-bit and 32-bit ARM
+#if __aarch64__
+// from arch/arm64/include/uapi/asm/hwcap.h
+#define HWCAP_ASIMD (1 << 1)
+#define HWCAP_ASIMDHP (1 << 10)
+#else
+// from arch/arm/include/uapi/asm/hwcap.h
+#define HWCAP_NEON (1 << 12)
+#define HWCAP_VFPv4 (1 << 16)
+#endif
+
+#endif // __ANDROID__
+
+#if __IOS__
+// Query the "hw.cputype" sysctl; the result stays 0 if the call does not fill the buffer.
+static cpu_type_t get_hw_cputype()
+{
+    cpu_type_t cputype = 0;
+    size_t cputype_size = sizeof(cputype);
+
+    // the return code is not inspected; the zero default acts as the fallback
+    sysctlbyname("hw.cputype", &cputype, &cputype_size, NULL, 0);
+
+    return cputype;
+}
+
+// Query the "hw.cpusubtype" sysctl; the result stays 0 if the call does not fill the buffer.
+static cpu_subtype_t get_hw_cpusubtype()
+{
+    cpu_subtype_t cpusubtype = 0;
+    size_t cpusubtype_size = sizeof(cpusubtype);
+
+    // the return code is not inspected; the zero default acts as the fallback
+    sysctlbyname("hw.cpusubtype", &cpusubtype, &cpusubtype_size, NULL, 0);
+
+    return cpusubtype;
+}
+
+// cpu type and subtype, each queried once when this translation unit is initialized
+static cpu_type_t g_hw_cputype = get_hw_cputype();
+static cpu_subtype_t g_hw_cpusubtype = get_hw_cpusubtype();
+#endif // __IOS__
+
+// Report whether the running CPU offers NEON (the ASIMD capability bit on aarch64 Android).
+// Returns non-zero when supported and 0 otherwise; builds that target neither
+// Android nor iOS always return 0.
+int cpu_support_arm_neon()
+{
+#if defined(__ANDROID__) && __aarch64__
+    return g_hwcaps & HWCAP_ASIMD;
+#elif defined(__ANDROID__)
+    return g_hwcaps & HWCAP_NEON;
+#elif __IOS__ && __aarch64__
+    return g_hw_cputype == CPU_TYPE_ARM64;
+#elif __IOS__
+    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7;
+#else
+    return 0;
+#endif
+}
+
+// Report whether the running CPU offers VFPv4.
+// Returns non-zero when supported and 0 otherwise; builds that target neither
+// Android nor iOS always return 0.
+int cpu_support_arm_vfpv4()
+{
+#if defined(__ANDROID__) && __aarch64__
+    // on aarch64 the ASIMD bit is used as the indicator (fma and fp16 come with neon there)
+    return g_hwcaps & HWCAP_ASIMD;
+#elif defined(__ANDROID__)
+    return g_hwcaps & HWCAP_VFPv4;
+#elif __IOS__ && __aarch64__
+    return g_hw_cputype == CPU_TYPE_ARM64;
+#elif __IOS__
+    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S;
+#else
+    return 0;
+#endif
+}
+
+// Report whether the running CPU sets the ASIMDHP capability bit.
+// Only aarch64 Android consults the capability bitmap; every other
+// configuration (32-bit Android, iOS, anything else) returns 0.
+int cpu_support_arm_asimdhp()
+{
+#if defined(__ANDROID__) && __aarch64__
+    return g_hwcaps & HWCAP_ASIMDHP;
+#else
+    return 0;
+#endif
+}
+
+// Count the processors reported by the platform; the result is never less than 1.
+// Builds that target neither Android nor iOS always return 1.
+static int get_cpucount()
+{
+#ifdef __ANDROID__
+    // count the lines of /proc/cpuinfo that begin with "processor"
+    FILE* cpuinfo = fopen("/proc/cpuinfo", "rb");
+    if (!cpuinfo)
+        return 1;
+
+    int ncpu = 0;
+    char buf[1024];
+    while (fgets(buf, 1024, cpuinfo))
+    {
+        if (memcmp(buf, "processor", 9) == 0)
+            ncpu++;
+    }
+
+    fclose(cpuinfo);
+
+    return ncpu < 1 ? 1 : ncpu;
+#elif __IOS__
+    int ncpu = 0;
+    size_t ncpu_size = sizeof(ncpu);
+    sysctlbyname("hw.ncpu", &ncpu, &ncpu_size, NULL, 0);
+
+    return ncpu < 1 ? 1 : ncpu;
+#else
+    return 1;
+#endif
+}
+
+// processor count, computed once when this translation unit is initialized
+static int g_cpucount = get_cpucount();
+
+// Return the cached processor count (always at least 1).
+int get_cpu_count()
+{
+ return g_cpucount;
+}
+
+#ifdef __ANDROID__
+// Return the largest first-column value found in the cpufreq time_in_state table of
+// cpu `cpuid`, 0 when no row could be parsed, or -1 when the table cannot be opened.
+static int get_max_freq_khz(int cpuid)
+{
+    char stats_path[256];
+    sprintf(stats_path, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
+
+    FILE* stats = fopen(stats_path, "rb");
+    if (!stats)
+        return -1;
+
+    // each row is parsed as two integers; only the first one is kept
+    int best_khz = 0;
+    int row_khz = 0;
+    while (!feof(stats) && fscanf(stats, "%d %*d", &row_khz) == 1)
+    {
+        if (row_khz > best_khz)
+            best_khz = row_khz;
+    }
+
+    fclose(stats);
+
+    return best_khz;
+}
+
+// Restrict the calling thread to the cpus listed in `cpuids`.
+// Returns 0 on success, -1 when an id lies outside [0, CPU_SETSIZE) or when the
+// sched_setaffinity syscall reports an error.
+static int set_sched_affinity(const std::vector<int>& cpuids)
+{
+    // cpu_set_t definition
+    // ref http://stackoverflow.com/questions/16319725/android-set-thread-affinity
+#define CPU_SETSIZE 1024
+#define __NCPUBITS (8 * sizeof (unsigned long))
+typedef struct
+{
+    unsigned long __bits[CPU_SETSIZE / __NCPUBITS];
+} cpu_set_t;
+
+#define CPU_SET(cpu, cpusetp) \
+    ((cpusetp)->__bits[(cpu)/__NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS)))
+
+#define CPU_ZERO(cpusetp) \
+    memset((cpusetp), 0, sizeof(cpu_set_t))
+
+    // the affinity is applied to the calling thread only
+    pid_t pid = gettid();
+
+    cpu_set_t mask;
+    CPU_ZERO(&mask);
+    for (size_t i=0; i<cpuids.size(); i++)
+    {
+        const int cpu = cpuids[i];
+
+        // an id outside the mask would index past the end of mask.__bits
+        if (cpu < 0 || cpu >= CPU_SETSIZE)
+        {
+            fprintf(stderr, "cpu id %d out of range\n", cpu);
+            return -1;
+        }
+
+        CPU_SET(cpu, &mask);
+    }
+
+    int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
+    if (syscallret)
+    {
+        fprintf(stderr, "syscall error %d\n", syscallret);
+        return -1;
+    }
+
+    return 0;
+}
+
+static int sort_cpuid_by_max_frequency(std::vector& cpuids, int* little_cluster_offset)
+{
+ const int cpu_count = cpuids.size();
+
+ *little_cluster_offset = 0;
+
+ if (cpu_count == 0)
+ return 0;
+
+ std::vector cpu_max_freq_khz;
+ cpu_max_freq_khz.resize(cpu_count);
+
+ for (int i=0; i sorted_cpuids;
+ static int little_cluster_offset = 0;
+
+ if (sorted_cpuids.empty())
+ {
+ // 0 ~ g_cpucount
+ sorted_cpuids.resize(g_cpucount);
+ for (int i=0; i cpuids;
+ if (powersave == 0)
+ {
+ cpuids = sorted_cpuids;
+ }
+ else if (powersave == 1)
+ {
+ cpuids = std::vector(sorted_cpuids.begin() + little_cluster_offset, sorted_cpuids.end());
+ }
+ else if (powersave == 2)
+ {
+ cpuids = std::vector(sorted_cpuids.begin(), sorted_cpuids.begin() + + little_cluster_offset);
+ }
+ else
+ {
+ fprintf(stderr, "powersave %d not supported\n", powersave);
+ return -1;
+ }
+
+#ifdef _OPENMP
+ // set affinity for each thread
+ int num_threads = cpuids.size();
+ omp_set_num_threads(num_threads);
+ std::vector ssarets(num_threads, 0);
+ #pragma omp parallel for
+ for (int i=0; i
+#include
+
+namespace ncnn {
+
+// Base layers start out multi-blob and without in-place support; subclasses opt in.
+Layer::Layer()
+    : one_blob_only(false), support_inplace(false)
+{
+}
+
+// Nothing to release in the base class.
+Layer::~Layer()
+{
+}
+
+// Default loaders: the base layer reads nothing from its source and reports
+// success (0). Subclasses override the variants they need.
+#if NCNN_STDIO
+#if NCNN_STRING
+// plain-text parameter file variant
+int Layer::load_param(FILE* /*paramfp*/)
+{
+ return 0;
+}
+#endif // NCNN_STRING
+
+// binary parameter file variant
+int Layer::load_param_bin(FILE* /*paramfp*/)
+{
+ return 0;
+}
+
+// weight data from a model file
+int Layer::load_model(FILE* /*binfp*/)
+{
+ return 0;
+}
+#endif // NCNN_STDIO
+
+// parameters from memory; the pointer reference is left where it was
+int Layer::load_param(const unsigned char*& /*mem*/)
+{
+ return 0;
+}
+
+// weight data from memory; the pointer reference is left where it was
+int Layer::load_model(const unsigned char*& /*mem*/)
+{
+ return 0;
+}
+
+// Default multi-blob inference: not implemented in the base class, so it reports
+// failure (-1). Layers that consume or produce several blobs override this.
+int Layer::forward(const std::vector<Mat>& /*bottom_blobs*/, std::vector<Mat>& /*top_blobs*/) const
+{
+    return -1;
+}
+
+// Default single-blob inference: not implemented in the base class, so it reports
+// failure (-1). Layers with one input and one output override this.
+int Layer::forward(const Mat& /*bottom_blob*/, Mat& /*top_blob*/) const
+{
+    return -1;
+}
+
+// Default in-place inference for several blobs: runs the out-of-place forward()
+// into a temporary vector and then overwrites the caller's blobs with it.
+// The overwrite happens whatever forward() returned; that return code is passed on.
+int Layer::forward_inplace(std::vector<Mat>& bottom_top_blobs) const
+{
+    std::vector<Mat> top_blobs;
+    int ret = forward(bottom_top_blobs, top_blobs);
+    bottom_top_blobs = top_blobs;
+    return ret;
+}
+
+// Default in-place inference for a single blob: computes into a scratch Mat via the
+// out-of-place forward(), then replaces the caller's blob with the scratch result.
+// The replacement happens whatever forward() returned; that return code is passed on.
+int Layer::forward_inplace(Mat& bottom_top_blob) const
+{
+    Mat scratch;
+    const int status = forward(bottom_top_blob, scratch);
+
+    bottom_top_blob = scratch;
+
+    return status;
+}
+
+// pulled in before the table below; presumably declares the per-layer creator
+// functions the table refers to — TODO confirm
+#include "layer_declaration.h"
+
+// table of layer entries; its rows come from layer_registry.h and
+// create_layer() indexes it by layer type index
+static const layer_registry_entry layer_registry[] =
+{
+#include "layer_registry.h"
+};
+
+// number of rows in layer_registry
+static const int layer_registry_entry_count = sizeof(layer_registry) / sizeof(layer_registry_entry);
+
+#if NCNN_STRING
+int layer_to_index(const char* type)
+{
+ for (int i=0; i= layer_registry_entry_count)
+ {
+ fprintf(stderr, "layer index %d not exists\n", index);
+ return 0;
+ }
+
+ layer_creator_func layer_creator = layer_registry[index].creator;
+ if (!layer_creator)
+ {
+ fprintf(stderr, "layer index %d not enabled\n", index);
+ return 0;
+ }
+
+ return layer_creator();
+}
+
+} // namespace ncnn
diff --git a/src/layer.h b/src/layer.h
new file mode 100644
index 00000000000..13bb2e5520f
--- /dev/null
+++ b/src/layer.h
@@ -0,0 +1,163 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_H
+#define NCNN_LAYER_H
+
+#include
+#include
+#include
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+// Base class of every network layer: loading hooks, inference hooks and the
+// indices of the blobs the layer reads and writes.
+class Layer
+{
+public:
+    // construct with one_blob_only and support_inplace both false
+    Layer();
+    // virtual destructor
+    virtual ~Layer();
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // load layer specific parameter from plain param file
+    // return 0 if success
+    virtual int load_param(FILE* paramfp);
+#endif // NCNN_STRING
+    // load layer specific parameter from binary param file
+    // return 0 if success
+    virtual int load_param_bin(FILE* paramfp);
+
+    // load layer specific weight data from model file
+    // return 0 if success
+    virtual int load_model(FILE* binfp);
+#endif // NCNN_STDIO
+
+    // load layer specific parameter from memory
+    // memory pointer is 32-bit aligned
+    // return 0 if success
+    virtual int load_param(const unsigned char*& mem);
+
+    // load layer specific weight data from memory
+    // memory pointer is 32-bit aligned
+    // return 0 if success
+    virtual int load_model(const unsigned char*& mem);
+
+public:
+    // one input and one output blob
+    bool one_blob_only;
+
+    // support inplace inference
+    bool support_inplace;
+
+public:
+    // implement inference
+    // return 0 if success
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+    // implement inplace inference
+    // return 0 if success
+    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs) const;
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+
+public:
+#if NCNN_STRING
+    // layer type name
+    std::string type;
+    // layer name
+    std::string name;
+#endif // NCNN_STRING
+    // indices of the blobs this layer needs as input
+    std::vector<int> bottoms;
+    // indices of the blobs this layer produces as output
+    std::vector<int> tops;
+};
+
+// Integer ids of the built-in layer types, numbered explicitly from 0.
+// NOTE(review): create_layer() indexes layer_registry with such an id, so the
+// numbering presumably has to match the row order of layer_registry.h — confirm.
+namespace LayerType {
+enum
+{
+ AbsVal = 0,
+ ArgMax = 1,
+ BatchNorm = 2,
+ Bias = 3,
+ BNLL = 4,
+ Concat = 5,
+ Convolution = 6,
+ Crop = 7,
+ Deconvolution = 8,
+ Dropout = 9,
+ ELU = 10,
+ Eltwise = 11,
+ Embed = 12,
+ Exp = 13,
+ Flatten = 14,
+ InnerProduct = 15,
+ Input = 16,
+ Log = 17,
+ LRN = 18,
+ MemoryData = 19,
+ MVN = 20,
+ Pooling = 21,
+ Power = 22,
+ PReLU = 23,
+ Proposal = 24,
+ Reduction = 25,
+ ReLU = 26,
+ Reshape = 27,
+ ROIPooling = 28,
+ Scale = 29,
+ Sigmoid = 30,
+ Slice = 31,
+ Softmax = 32,
+ Split = 33,
+ SPP = 34,
+ TanH = 35,
+ Threshold = 36,
+ Tile = 37,
+ RNN = 38,
+ LSTM = 39,
+
+ // bit 8, above every built-in id; presumably flags custom layer types — TODO confirm
+ CustomBit = (1<<8),
+};
+} // namespace LayerType
+
+// layer factory function: takes no arguments and returns a Layer pointer
+typedef Layer* (*layer_creator_func)();
+
+// one row of the layer registry table
+struct layer_registry_entry
+{
+#if NCNN_STRING
+ // layer type name
+ const char* name;
+#endif // NCNN_STRING
+ // layer factory entry; create_layer() treats a null creator as "layer not enabled"
+ layer_creator_func creator;
+};
+
+#if NCNN_STRING
+// get layer type from type name
+int layer_to_index(const char* type);
+#endif // NCNN_STRING
+// create layer from layer type
+// returns 0 when the index is rejected or the registry row has no creator
+Layer* create_layer(int index);
+
+// Defines the factory function <name>_layer_creator(), which returns a
+// heap-allocated instance of class <name> created with new.
+#define DEFINE_LAYER_CREATOR(name) \
+ Layer* name##_layer_creator() { return new name; }
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_H
diff --git a/src/layer/absval.cpp b/src/layer/absval.cpp
new file mode 100644
index 00000000000..ed8f7e66a5f
--- /dev/null
+++ b/src/layer/absval.cpp
@@ -0,0 +1,76 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "absval.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(AbsVal)
+
+// AbsVal works on a single blob and can run in place.
+AbsVal::AbsVal()
+{
+    // both flags are inherited from Layer, so they are assigned rather than
+    // listed in a member-initializer list
+    support_inplace = true;
+    one_blob_only = true;
+}
+
+int AbsVal::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int channels = bottom_blob.c;
+ int size = w * h;
+
+ top_blob.create(w, h, channels);
+
+ #pragma omp parallel for
+ for (int q=0; q
+#include
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(ArgMax)
+
+// Empty constructor.
+// NOTE(review): out_max_val and topk are not assigned here; they appear to be
+// set only by the load_param* functions — confirm nothing reads them earlier.
+ArgMax::ArgMax()
+{
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+// Read out_max_val and topk as two whitespace-separated integers from the plain
+// param file. Returns 0 on success, -1 (after logging) when fewer than two
+// values could be parsed.
+int ArgMax::load_param(FILE* paramfp)
+{
+    const int nscan = fscanf(paramfp, "%d %d", &out_max_val, &topk);
+    if (nscan == 2)
+        return 0;
+
+    fprintf(stderr, "ArgMax load_param failed %d\n", nscan);
+    return -1;
+}
+#endif // NCNN_STRING
+// Read out_max_val and topk as two consecutive raw ints from the binary param file.
+// Returns 0 on success, -1 (after logging) when either value cannot be read in
+// full, so a truncated file is no longer reported as a successful load.
+int ArgMax::load_param_bin(FILE* paramfp)
+{
+    // fread returns the number of complete items read; 1 is the only success value here
+    if (fread(&out_max_val, sizeof(int), 1, paramfp) != 1
+        || fread(&topk, sizeof(int), 1, paramfp) != 1)
+    {
+        fprintf(stderr, "ArgMax load_param_bin failed\n");
+        return -1;
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+// Read out_max_val and topk from memory: two ints stored at byte offsets 0 and 4.
+// Advances `mem` by 8 bytes and always returns 0.
+int ArgMax::load_param(const unsigned char*& mem)
+{
+    out_max_val = *(const int*)(mem);
+    topk = *(const int*)(mem + 4);
+
+    // step past both fields
+    mem += 8;
+
+    return 0;
+}
+
+int ArgMax::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+ int size = bottom_blob.total();
+
+ if (out_max_val)
+ top_blob.create(topk, 2);
+ else
+ top_blob.create(topk, 1);
+ if (top_blob.empty())
+ return -100;
+
+ const float* ptr = bottom_blob;
+
+ // partial sort topk with index
+ // optional value
+ std::vector< std::pair > vec;
+ vec.resize(size);
+ for (int i=0; i >());
+
+ float* outptr = top_blob;
+ if (out_max_val)
+ {
+ float* valptr = outptr + topk;
+ for (int i=0; i
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(AbsVal_arm)
+
+int AbsVal_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int channels = bottom_blob.c;
+ int size = w * h;
+
+ top_blob.create(w, h, channels);
+ if (top_blob.empty())
+ return -100;
+
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ float32x4_t _outp = vabsq_f32(_p);
+ vst1q_f32(outptr, _outp);
+
+ ptr += 4;
+ outptr += 4;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "vld1.f32 {d0-d1}, [%1]! \n"
+ "vabs.f32 q0, q0 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d1}, [%2]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr), // %1
+ "=r"(outptr) // %2
+ : "0"(nn),
+ "1"(ptr),
+ "2"(outptr)
+ : "cc", "memory", "q0"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+ *outptr = *ptr > 0 ? *ptr : -*ptr;
+
+ ptr++;
+ outptr++;
+ }
+ }
+
+ return 0;
+}
+
+int AbsVal_arm::forward_inplace(Mat& bottom_top_blob) const
+{
+ int w = bottom_top_blob.w;
+ int h = bottom_top_blob.h;
+ int channels = bottom_top_blob.c;
+ int size = w * h;
+
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ _p = vabsq_f32(_p);
+ vst1q_f32(ptr, _p);
+
+ ptr += 4;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "vld1.f32 {d0-d1}, [%1] \n"
+ "vabs.f32 q0, q0 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d1}, [%1]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr) // %1
+ : "0"(nn),
+ "1"(ptr)
+ : "cc", "memory", "q0"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+ *ptr = *ptr > 0 ? *ptr : -*ptr;
+
+ ptr++;
+ }
+ }
+
+ return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/absval_arm.h b/src/layer/arm/absval_arm.h
new file mode 100644
index 00000000000..787da11af91
--- /dev/null
+++ b/src/layer/arm/absval_arm.h
@@ -0,0 +1,34 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_ABSVAL_ARM_H
+#define LAYER_ABSVAL_ARM_H
+
+#include "absval.h"
+
+namespace ncnn {
+
+// ARM variant of AbsVal: overrides both inference entry points and adds no data members.
+class AbsVal_arm : public AbsVal
+{
+public:
+    // out-of-place inference; return 0 if success
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+    // in-place inference; return 0 if success
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_ABSVAL_ARM_H
diff --git a/src/layer/arm/batchnorm_arm.cpp b/src/layer/arm/batchnorm_arm.cpp
new file mode 100644
index 00000000000..0469410a0a6
--- /dev/null
+++ b/src/layer/arm/batchnorm_arm.cpp
@@ -0,0 +1,186 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "batchnorm_arm.h"
+
+#if __ARM_NEON
+#include
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(BatchNorm_arm)
+
+int BatchNorm_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+ // a = bias - slope * mean / sqrt(var)
+ // b = slope / sqrt(var)
+ // value = b * value + a
+
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int size = w * h;
+
+ top_blob.create(w, h, channels);
+ if (top_blob.empty())
+ return -100;
+
+ const float* a_data_ptr = a_data;
+ const float* b_data_ptr = b_data;
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ float32x4_t _a = vdupq_n_f32(a);
+ float32x4_t _b = vdupq_n_f32(b);
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ float32x4_t _outp = _a;
+ _outp = vfmaq_f32(_outp, _p, _b);
+ vst1q_f32(outptr, _outp);
+
+ ptr += 4;
+ outptr += 4;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "vdup.f32 q1, %6 \n"
+ "vdup.f32 q2, %7 \n"
+ "0: \n"
+ "pld [%1, #128] \n"
+ "vld1.f32 {d0-d1}, [%1 :128]! \n"
+ "vorr.32 q3, q1, q1 \n"
+ "vmla.f32 q3, q0, q2 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d6-d7}, [%2 :128]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr), // %1
+ "=r"(outptr) // %2
+ : "0"(nn),
+ "1"(ptr),
+ "2"(outptr),
+ "r"(a), // %6
+ "r"(b) // %7
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+ *outptr = b * *ptr + a;
+
+ ptr++;
+ outptr++;
+ }
+ }
+
+ return 0;
+}
+
+int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob) const
+{
+ // a = bias - slope * mean / sqrt(var)
+ // b = slope / sqrt(var)
+ // value = b * value + a
+
+ int w = bottom_top_blob.w;
+ int h = bottom_top_blob.h;
+ int size = w * h;
+
+ const float* a_data_ptr = a_data;
+ const float* b_data_ptr = b_data;
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ float32x4_t _a = vdupq_n_f32(a);
+ float32x4_t _b = vdupq_n_f32(b);
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ float32x4_t _outp = _a;
+ _outp = vfmaq_f32(_outp, _p, _b);
+ vst1q_f32(ptr, _outp);
+
+ ptr += 4;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "vdup.f32 q1, %4 \n"
+ "vdup.f32 q2, %5 \n"
+ "0: \n"
+ "pld [%1, #128] \n"
+ "vld1.f32 {d0-d1}, [%1 :128] \n"
+ "vorr.32 q3, q1, q1 \n"
+ "vmla.f32 q3, q0, q2 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d6-d7}, [%1 :128]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr) // %1
+ : "0"(nn),
+ "1"(ptr),
+ "r"(a), // %4
+ "r"(b) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+ *ptr = b * *ptr + a;
+
+ ptr++;
+ }
+ }
+
+ return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/batchnorm_arm.h b/src/layer/arm/batchnorm_arm.h
new file mode 100644
index 00000000000..448b5a49834
--- /dev/null
+++ b/src/layer/arm/batchnorm_arm.h
@@ -0,0 +1,32 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_BATCHNORM_ARM_H
+#define LAYER_BATCHNORM_ARM_H
+
+#include "batchnorm.h"
+
+namespace ncnn {
+
+// ARM variant of BatchNorm: overrides both inference entry points and adds no data members.
+class BatchNorm_arm : public BatchNorm
+{
+public:
+    // out-of-place inference; return 0 if success
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+    // in-place inference; return 0 if success
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_BATCHNORM_ARM_H
diff --git a/src/layer/arm/bias_arm.cpp b/src/layer/arm/bias_arm.cpp
new file mode 100644
index 00000000000..e32e8f39652
--- /dev/null
+++ b/src/layer/arm/bias_arm.cpp
@@ -0,0 +1,122 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "bias_arm.h"
+
+#if __ARM_NEON
+#include
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Bias_arm)
+
+int Bias_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int channels = bottom_blob.c;
+ int size = w * h;
+
+ top_blob.create(w, h, channels);
+ if (top_blob.empty())
+ return -100;
+
+ const float* bias_ptr = bias_data;
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ float32x4_t _bias = vdupq_n_f32(bias);
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ float32x4_t _outp = vaddq_f32(_p, _bias);
+ vst1q_f32(outptr, _outp);
+
+ ptr += 4;
+ outptr += 4;
+ }
+#endif // __ARM_NEON
+
+ for (; remain>0; remain--)
+ {
+ *outptr = *ptr + bias;
+
+ ptr++;
+ outptr++;
+ }
+ }
+
+ return 0;
+}
+
+int Bias_arm::forward_inplace(Mat& bottom_top_blob) const
+{
+ int w = bottom_top_blob.w;
+ int h = bottom_top_blob.h;
+ int channels = bottom_top_blob.c;
+ int size = w * h;
+
+ const float* bias_ptr = bias_data;
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ float32x4_t _bias = vdupq_n_f32(bias);
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ float32x4_t _outp = vaddq_f32(_p, _bias);
+ vst1q_f32(ptr, _outp);
+
+ ptr += 4;
+ }
+#endif // __ARM_NEON
+
+ for (; remain>0; remain--)
+ {
+ *ptr = *ptr + bias;
+
+ ptr++;
+ }
+ }
+
+ return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/bias_arm.h b/src/layer/arm/bias_arm.h
new file mode 100644
index 00000000000..27f13f8ea2e
--- /dev/null
+++ b/src/layer/arm/bias_arm.h
@@ -0,0 +1,32 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_BIAS_ARM_H
+#define LAYER_BIAS_ARM_H
+
+#include "bias.h"
+
+namespace ncnn {
+
+// ARM variant of Bias: overrides both inference entry points and adds no data members.
+class Bias_arm : public Bias
+{
+public:
+    // out-of-place inference; return 0 if success
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+    // in-place inference; return 0 if success
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_BIAS_ARM_H
diff --git a/src/layer/arm/convolution_1x1.h b/src/layer/arm/convolution_1x1.h
new file mode 100644
index 00000000000..32778526bad
--- /dev/null
+++ b/src/layer/arm/convolution_1x1.h
@@ -0,0 +1,543 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#if __ARM_NEON
+#include <arm_neon.h> // NEON vector types and intrinsics (float32x4_t, vld1q_f32, vfmaq_f32, ...) used below
+#endif // __ARM_NEON
+
+static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int inch = bottom_blob.c;
+
+ int outw = top_blob.w;
+ int outh = top_blob.h;
+ int outch = top_blob.c;
+
+ const float* kernel = _kernel;
+ const float* bias = _bias;
+
+ #pragma omp parallel for
+ for (int p=0; p> 3;
+ int remain = size & 7;
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ float32x4_t _k0 = vdupq_n_f32(k0);
+ float32x4_t _k1 = vdupq_n_f32(k1);
+ float32x4_t _k2 = vdupq_n_f32(k2);
+ float32x4_t _k3 = vdupq_n_f32(k3);
+#if __aarch64__
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(r0);
+ float32x4_t _pn = vld1q_f32(r0+4);
+
+ float32x4_t _outp = vld1q_f32(outptr);
+ float32x4_t _outpn = vld1q_f32(outptr+4);
+
+ _outp = vfmaq_f32(_outp, _p, _k0);
+ _outpn = vfmaq_f32(_outpn, _pn, _k0);
+
+ float32x4_t _p1 = vld1q_f32(r1);
+ float32x4_t _p1n = vld1q_f32(r1+4);
+
+ _outp = vfmaq_f32(_outp, _p1, _k1);
+ _outpn = vfmaq_f32(_outpn, _p1n, _k1);
+
+ float32x4_t _p2 = vld1q_f32(r2);
+ float32x4_t _p2n = vld1q_f32(r2+4);
+
+ _outp = vfmaq_f32(_outp, _p2, _k2);
+ _outpn = vfmaq_f32(_outpn, _p2n, _k2);
+
+ float32x4_t _p3 = vld1q_f32(r3);
+ float32x4_t _p3n = vld1q_f32(r3+4);
+
+ _outp = vfmaq_f32(_outp, _p3, _k3);
+ _outpn = vfmaq_f32(_outpn, _p3n, _k3);
+
+ vst1q_f32(outptr, _outp);
+ vst1q_f32(outptr+4, _outpn);
+
+ r0 += 8;
+ r1 += 8;
+ r2 += 8;
+ r3 += 8;
+ outptr += 8;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "pld [%2, #256] \n"
+ "vld1.f32 {d4-d7}, [%2 :128]! \n"
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.f32 {d0-d3}, [%1 :128] \n"
+ "vmla.f32 q0, q2, %q12 \n"
+ "vmla.f32 q1, q3, %q12 \n"
+ "pld [%3, #256] \n"
+ "vld1.f32 {d4-d7}, [%3 :128]! \n"
+ "vmla.f32 q0, q2, %q13 \n"
+ "vmla.f32 q1, q3, %q13 \n"
+ "pld [%4, #256] \n"
+ "vld1.f32 {d4-d7}, [%4 :128]! \n"
+ "vmla.f32 q0, q2, %q14 \n"
+ "vmla.f32 q1, q3, %q14 \n"
+ "pld [%5, #256] \n"
+ "vld1.f32 {d4-d7}, [%5 :128]! \n"
+ "vmla.f32 q0, q2, %q15 \n"
+ "vmla.f32 q1, q3, %q15 \n"
+ "pld [%2, #256] \n"
+ "vld1.f32 {d4-d7}, [%2 :128]! \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d3}, [%1 :128]! \n"
+ "bne 0b \n"
+ "sub %2, #32 \n"
+ : "=r"(nn), // %0
+ "=r"(outptr), // %1
+ "=r"(r0), // %2
+ "=r"(r1), // %3
+ "=r"(r2), // %4
+ "=r"(r3) // %5
+ : "0"(nn),
+ "1"(outptr),
+ "2"(r0),
+ "3"(r1),
+ "4"(r2),
+ "5"(r3),
+ "w"(_k0), // %12
+ "w"(_k1), // %13
+ "w"(_k2), // %14
+ "w"(_k3) // %15
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+ float sum = *r0 * k0;
+ float sum1 = *r1 * k1;
+ float sum2 = *r2 * k2;
+ float sum3 = *r3 * k3;
+
+ *outptr += sum + sum1 + sum2 + sum3;
+
+ r0++;
+ r1++;
+ r2++;
+ r3++;
+ outptr++;
+ }
+
+ }
+
+ for (; q> 3;
+ int remain = size & 7;
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ float32x4_t _k0 = vdupq_n_f32(k0);
+#if __aarch64__
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(r0);
+ float32x4_t _outp = vld1q_f32(outptr);
+
+ float32x4_t _pn = vld1q_f32(r0+4);
+ float32x4_t _outpn = vld1q_f32(outptr+4);
+
+ _outp = vfmaq_f32(_outp, _p, _k0);
+ _outpn = vfmaq_f32(_outpn, _pn, _k0);
+
+ vst1q_f32(outptr, _outp);
+ vst1q_f32(outptr+4, _outpn);
+
+ r0 += 8;
+ outptr += 8;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "pld [%2, #256] \n"
+ "vld1.f32 {d4-d7}, [%2 :128]! \n"
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.f32 {d0-d3}, [%1 :128] \n"
+ "vmla.f32 q0, q2, %q6 \n"
+ "vmla.f32 q1, q3, %q6 \n"
+ "pld [%2, #256] \n"
+ "vld1.f32 {d4-d7}, [%2 :128]! \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d3}, [%1 :128]! \n"
+ "bne 0b \n"
+ "sub %2, #32 \n"
+ : "=r"(nn), // %0
+ "=r"(outptr), // %1
+ "=r"(r0) // %2
+ : "0"(nn),
+ "1"(outptr),
+ "2"(r0),
+ "w"(_k0) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+ float sum = *r0 * k0;
+
+ *outptr += sum;
+
+ r0++;
+ outptr++;
+ }
+
+ }
+ }
+
+}
+
+static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int inch = bottom_blob.c;
+
+ int outw = top_blob.w;
+ int outh = top_blob.h;
+ int outch = top_blob.c;
+
+ const int tailstep = w - 2*outw + w;
+
+ const float* kernel = _kernel;
+ const float* bias = _bias;
+
+ #pragma omp parallel for
+ for (int p=0; p> 3;
+ int remain = outw & 7;
+#else
+ int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ float32x4_t _k0 = vdupq_n_f32(k0);
+ float32x4_t _k1 = vdupq_n_f32(k1);
+ float32x4_t _k2 = vdupq_n_f32(k2);
+ float32x4_t _k3 = vdupq_n_f32(k3);
+#if __aarch64__
+ for (; nn>0; nn--) // each pass accumulates 8 outputs, consuming 16 stride-2 input floats per channel
+ {
+ float32x4x2_t _px2 = vld2q_f32(r0); // de-interleaving load: val[0] holds elements 0,2,4,6 (the stride-2 samples)
+ float32x4_t _p = _px2.val[0];
+ float32x4_t _outp = vld1q_f32(outptr); // running sums for outputs 0..3
+
+ float32x4x2_t _pnx2 = vld2q_f32(r0+8);
+ float32x4_t _pn = _pnx2.val[0];
+ float32x4_t _outpn = vld1q_f32(outptr+4); // running sums for outputs 4..7
+
+ _outp = vmlaq_f32(_outp, _p, _k0); // channel 0 contribution
+ _outpn = vmlaq_f32(_outpn, _pn, _k0);
+
+ float32x4x2_t _p1x2 = vld2q_f32(r1);
+ float32x4_t _p1 = _p1x2.val[0];
+ float32x4x2_t _p1nx2 = vld2q_f32(r1+8);
+ float32x4_t _p1n = _p1nx2.val[0];
+
+ _outp = vmlaq_f32(_outp, _p1, _k1); // channel 1 contribution
+ _outpn = vmlaq_f32(_outpn, _p1n, _k1);
+
+ float32x4x2_t _p2x2 = vld2q_f32(r2);
+ float32x4_t _p2 = _p2x2.val[0];
+ float32x4x2_t _p2nx2 = vld2q_f32(r2+8);
+ float32x4_t _p2n = _p2nx2.val[0];
+
+ _outp = vmlaq_f32(_outp, _p2, _k2); // channel 2 contribution
+ _outpn = vmlaq_f32(_outpn, _p2n, _k2);
+
+ float32x4x2_t _p3x2 = vld2q_f32(r3);
+ float32x4_t _p3 = _p3x2.val[0];
+ float32x4x2_t _p3nx2 = vld2q_f32(r3+8);
+ float32x4_t _p3n = _p3nx2.val[0];
+
+ _outp = vmlaq_f32(_outp, _p3, _k3); // channel 3 contribution
+ _outpn = vmlaq_f32(_outpn, _p3n, _k3);
+
+ vst1q_f32(outptr, _outp);
+ vst1q_f32(outptr+4, _outpn); // store back to the slot _outpn was loaded from (was outptr+8, which skipped outputs 4..7 and clobbered the next group)
+
+ r0 += 16;
+ r1 += 16;
+ r2 += 16;
+ r3 += 16;
+ outptr += 8;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "pld [%2, #512] \n"
+ "vld2.f32 {d4-d7}, [%2]! \n"
+ "vld2.f32 {d16-d19}, [%2]! \n"
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.f32 {d0-d3}, [%1] \n"
+ "vmla.f32 q0, q2, %q12 \n"
+ "vmla.f32 q1, q8, %q12 \n"
+ "pld [%3, #512] \n"
+ "vld2.f32 {d4-d7}, [%3]! \n"
+ "vld2.f32 {d16-d19}, [%3]! \n"
+ "vmla.f32 q0, q2, %q13 \n"
+ "vmla.f32 q1, q8, %q13 \n"
+ "pld [%4, #512] \n"
+ "vld2.f32 {d4-d7}, [%4]! \n"
+ "vld2.f32 {d16-d19}, [%4]! \n"
+ "vmla.f32 q0, q2, %q14 \n"
+ "vmla.f32 q1, q8, %q14 \n"
+ "pld [%5, #512] \n"
+ "vld2.f32 {d4-d7}, [%5]! \n"
+ "vld2.f32 {d16-d19}, [%5]! \n"
+ "vmla.f32 q0, q2, %q15 \n"
+ "vmla.f32 q1, q8, %q15 \n"
+ "pld [%2, #512] \n"
+ "vld2.f32 {d4-d7}, [%2]! \n"
+ "vld2.f32 {d16-d19}, [%2]! \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d3}, [%1]! \n"
+ "bne 0b \n"
+ "sub %2, #64 \n"
+ : "=r"(nn), // %0
+ "=r"(outptr), // %1
+ "=r"(r0), // %2
+ "=r"(r1), // %3
+ "=r"(r2), // %4
+ "=r"(r3) // %5
+ : "0"(nn),
+ "1"(outptr),
+ "2"(r0),
+ "3"(r1),
+ "4"(r2),
+ "5"(r3),
+ "w"(_k0), // %12
+ "w"(_k1), // %13
+ "w"(_k2), // %14
+ "w"(_k3) // %15
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+ float sum = *r0 * k0;
+ float sum1 = *r1 * k1;
+ float sum2 = *r2 * k2;
+ float sum3 = *r3 * k3;
+
+ *outptr += sum + sum1 + sum2 + sum3;
+
+ r0 += 2;
+ r1 += 2;
+ r2 += 2;
+ r3 += 2;
+ outptr++;
+ }
+
+ r0 += tailstep;
+ r1 += tailstep;
+ r2 += tailstep;
+ r3 += tailstep;
+ }
+
+ }
+
+ for (; q> 3;
+ int remain = outw & 7;
+#else
+ int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ float32x4_t _k0 = vdupq_n_f32(k0);
+#if __aarch64__
+ for (; nn>0; nn--)
+ {
+ float32x4x2_t _px2 = vld2q_f32(r0);
+ float32x4_t _p = _px2.val[0];
+ float32x4_t _outp = vld1q_f32(outptr);
+
+ float32x4x2_t _pnx2 = vld2q_f32(r0+8);
+ float32x4_t _pn = _pnx2.val[0];
+ float32x4_t _outpn = vld1q_f32(outptr+4);
+
+ _outp = vmlaq_f32(_outp, _p, _k0);
+ _outpn = vmlaq_f32(_outpn, _pn, _k0);
+
+ vst1q_f32(outptr, _outp);
+ vst1q_f32(outptr+4, _outpn);
+
+ r0 += 16;
+ outptr += 8;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "pld [%2, #512] \n"
+ "vld2.f32 {d4-d7}, [%2]! \n"
+ "vld2.f32 {d16-d19}, [%2]! \n"
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.f32 {d0-d3}, [%1] \n"
+ "vmla.f32 q0, q2, %q6 \n"
+ "vmla.f32 q1, q8, %q6 \n"
+ "pld [%2, #512] \n"
+ "vld2.f32 {d4-d7}, [%2]! \n"
+ "vld2.f32 {d16-d19}, [%2]! \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d3}, [%1]! \n"
+ "bne 0b \n"
+ "sub %2, #64 \n"
+ : "=r"(nn), // %0
+ "=r"(outptr), // %1
+ "=r"(r0) // %2
+ : "0"(nn),
+ "1"(outptr),
+ "2"(r0),
+ "w"(_k0) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+ float sum = *r0 * k0;
+
+ *outptr += sum;
+
+ r0 += 2;
+ outptr++;
+ }
+
+ r0 += tailstep;
+ }
+
+ }
+ }
+
+}
diff --git a/src/layer/arm/convolution_2x2.h b/src/layer/arm/convolution_2x2.h
new file mode 100644
index 00000000000..fc4ed6672a8
--- /dev/null
+++ b/src/layer/arm/convolution_2x2.h
@@ -0,0 +1,381 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#if __ARM_NEON
+#include <arm_neon.h> // NEON vector types and intrinsics (float32x4_t, vld1q_f32, vmlaq_lane_f32, ...) used below
+#endif // __ARM_NEON
+
+static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int inch = bottom_blob.c;
+
+ int outw = top_blob.w;
+ int outh = top_blob.h;
+ int outch = top_blob.c;
+
+ const float* kernel = _kernel;
+ const float* bias = _bias;
+
+ #pragma omp parallel for
+ for (int p=0; p> 2;
+ int remain = outw & 3;
+#else
+ int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ for (; nn>0; nn--)
+ {
+ float32x4_t _r000 = vld1q_f32(r00);
+ float32x4_t _r010 = vld1q_f32(r01);
+ float32x4_t _r001 = vld1q_f32(r00 + 1);
+ float32x4_t _r011 = vld1q_f32(r01 + 1);
+
+ float32x4_t _r100 = vld1q_f32(r10);
+ float32x4_t _r110 = vld1q_f32(r11);
+ float32x4_t _r101 = vld1q_f32(r10 + 1);
+ float32x4_t _r111 = vld1q_f32(r11 + 1);
+
+ float32x4_t _sum = vld1q_f32(outptr);
+
+ _sum = vmlaq_lane_f32(_sum, _r000, vget_low_f32(_k0), 0);
+ _sum = vmlaq_lane_f32(_sum, _r001, vget_low_f32(_k0), 1);
+ _sum = vmlaq_lane_f32(_sum, _r010, vget_high_f32(_k0), 0);
+ _sum = vmlaq_lane_f32(_sum, _r011, vget_high_f32(_k0), 1);
+
+ _sum = vmlaq_lane_f32(_sum, _r100, vget_low_f32(_k1), 0);
+ _sum = vmlaq_lane_f32(_sum, _r101, vget_low_f32(_k1), 1);
+ _sum = vmlaq_lane_f32(_sum, _r110, vget_high_f32(_k1), 0);
+ _sum = vmlaq_lane_f32(_sum, _r111, vget_high_f32(_k1), 1);
+
+ vst1q_f32(outptr, _sum);
+
+ r00 += 4;
+ r01 += 4;
+ r10 += 4;
+ r11 += 4;
+ outptr += 4;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "pld [%1, #128] \n"
+ "vld1.f32 {d0-d1}, [%1]! \n"
+ "pld [%2, #128] \n"
+ "vld1.f32 {d4-d5}, [%2]! \n"
+
+ "pld [%3, #128] \n"
+ "vld1.f32 {d24-d25}, [%3]! \n"
+ "pld [%4, #128] \n"
+ "vld1.f32 {d28-d29}, [%4]! \n"
+
+ "0: \n"
+ "pld [%5, #128] \n"
+ "vld1.f32 {d18-d19}, [%5] \n"// q9 = sum
+
+ "vmul.f32 q8, q0, %e12[0] \n"
+ "vmla.f32 q9, q2, %f12[0] \n"
+
+ "pld [%1, #128] \n"
+ "vld1.f32 {d2-d3}, [%1]! \n"
+
+ "pld [%2, #128] \n"
+ "vld1.f32 {d6-d7}, [%2]! \n"
+
+ "vext.f32 q10, q0, q1, #1 \n"
+ "vext.f32 q11, q2, q3, #1 \n"
+
+ "vmla.f32 q8, q12, %e13[0] \n"
+ "vmla.f32 q9, q14, %f13[0] \n"
+
+ "pld [%3, #128] \n"
+ "vld1.f32 {d26-d27}, [%3]! \n"
+
+ "pld [%4, #128] \n"
+ "vld1.f32 {d30-d31}, [%4]! \n"
+
+ "vmla.f32 q8, q10, %e12[1] \n"
+ "vmla.f32 q9, q11, %f12[1] \n"
+
+ "vext.f32 q10, q12, q13, #1 \n"
+ "vext.f32 q11, q14, q15, #1 \n"
+
+ "vmla.f32 q8, q10, %e13[1] \n"
+ "vmla.f32 q9, q11, %f13[1] \n"
+
+ "vorr q0, q1, q1 \n"
+ "vorr q2, q3, q3 \n"
+
+ "vadd.f32 q8, q8, q9 \n"
+
+ "vorr q12, q13, q13 \n"
+ "vorr q14, q15, q15 \n"
+
+ "subs %0, #1 \n"
+
+ "vst1.f32 {d16-d17}, [%5]! \n"
+
+ "bne 0b \n"
+ "sub %1, #16 \n"
+ "sub %2, #16 \n"
+ "sub %3, #16 \n"
+ "sub %4, #16 \n"
+ : "=r"(nn), // %0
+ "=r"(r00), // %1
+ "=r"(r01), // %2
+ "=r"(r10), // %3
+ "=r"(r11), // %4
+ "=r"(outptr) // %5
+ : "0"(nn),
+ "1"(r00),
+ "2"(r01),
+ "3"(r10),
+ "4"(r11),
+ "5"(outptr),
+ "w"(_k0), // %12
+ "w"(_k1) // %13
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+
+ for (; remain>0; remain--)
+ {
+#if __ARM_NEON
+ float32x2_t _r00 = vld1_f32(r00);
+ float32x2_t _r01 = vld1_f32(r01);
+ float32x4_t _r00r1 = vcombine_f32(_r00, _r01);
+ float32x4_t _s0s1 = vmulq_f32(_r00r1, _k0);
+
+ float32x2_t _r10 = vld1_f32(r10);
+ float32x2_t _r11 = vld1_f32(r11);
+ float32x4_t _r10r1 = vcombine_f32(_r10, _r11);
+ _s0s1 = vmlaq_f32(_s0s1, _r10r1, _k1);
+
+ float32x2_t _s = vadd_f32(vget_low_f32(_s0s1), vget_high_f32(_s0s1));
+ _s = vpadd_f32(_s, _s);
+ *outptr += vget_lane_f32(_s, 0);
+#else
+ float sum = 0.f;
+
+ sum += r00[0] * kernel0[0];
+ sum += r00[1] * kernel0[1];
+ sum += r01[0] * kernel0[2];
+ sum += r01[1] * kernel0[3];
+
+ sum += r10[0] * kernel1[0];
+ sum += r10[1] * kernel1[1];
+ sum += r11[0] * kernel1[2];
+ sum += r11[1] * kernel1[3];
+
+ *outptr += sum;
+#endif // __ARM_NEON
+
+ r00 += 1;
+ r01 += 1;
+ r10 += 1;
+ r11 += 1;
+ outptr++;
+ }
+
+ r00 += 1;
+ r01 += 1;
+ r10 += 1;
+ r11 += 1;
+ }
+ }
+
+ for (; q> 2;
+ int remain = outw & 3;
+#else
+ int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ for (; nn>0; nn--)
+ {
+ float32x4_t _r00 = vld1q_f32(r0);
+ float32x4_t _r10 = vld1q_f32(r1);
+ float32x4_t _r01 = vld1q_f32(r0 + 1);
+ float32x4_t _r11 = vld1q_f32(r1 + 1);
+
+ float32x4_t _sum = vld1q_f32(outptr);
+ float32x4_t _sum2;
+
+ _sum = vmlaq_f32(_sum, _r00, _k0);
+ _sum2 = vmulq_f32(_r01, _k1);
+ _sum = vmlaq_f32(_sum, _r10, _k2);
+ _sum2 = vmlaq_f32(_sum2, _r11, _k3);
+
+ _sum = vaddq_f32(_sum, _sum2);
+
+ vst1q_f32(outptr, _sum);
+
+ r0 += 4;
+ r1 += 4;
+ outptr += 4;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "pld [%1, #128] \n"
+ "vld1.f32 {d0-d1}, [%1]! \n"
+ "pld [%2, #128] \n"
+ "vld1.f32 {d4-d5}, [%2]! \n"
+
+ "0: \n"
+ "pld [%3, #128] \n"
+ "vld1.f32 {d18-d19}, [%3] \n"// q9 = sum
+
+ "vmul.f32 q8, q0, %q8 \n"
+ "vmla.f32 q9, q2, %q10 \n"
+
+ "pld [%1, #128] \n"
+ "vld1.f32 {d2-d3}, [%1]! \n"
+ "vext.f32 q10, q0, q1, #1 \n"
+
+ "vmla.f32 q8, q10, %q9 \n"
+
+ "pld [%2, #128] \n"
+ "vld1.f32 {d6-d7}, [%2]! \n"
+ "vext.f32 q11, q2, q3, #1 \n"
+
+ "vmla.f32 q9, q11, %q11 \n"
+
+ "vorr q0, q1, q1 \n"
+ "vadd.f32 q8, q8, q9 \n"
+ "vorr q2, q3, q3 \n"
+
+ "subs %0, #1 \n"
+ "vst1.f32 {d16-d17}, [%3]! \n"
+ "bne 0b \n"
+ "sub %1, #16 \n"
+ "sub %2, #16 \n"
+ : "=r"(nn), // %0
+ "=r"(r0), // %1
+ "=r"(r1), // %2
+ "=r"(outptr) // %3
+ : "0"(nn),
+ "1"(r0),
+ "2"(r1),
+ "3"(outptr),
+ "w"(_k0), // %8
+ "w"(_k1), // %9
+ "w"(_k2), // %10
+ "w"(_k3) // %11
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ float32x4_t _k0123 = vld1q_f32(kernel0);
+#endif
+
+ for (; remain>0; remain--)
+ {
+#if __ARM_NEON
+ float32x2_t _r0 = vld1_f32(r0);
+ float32x2_t _r1 = vld1_f32(r1);
+ float32x4_t _r0r1 = vcombine_f32(_r0, _r1);
+ float32x4_t _s0s1 = vmulq_f32(_r0r1, _k0123);
+ float32x2_t _s = vadd_f32(vget_low_f32(_s0s1), vget_high_f32(_s0s1));
+ _s = vpadd_f32(_s, _s);
+ *outptr += vget_lane_f32(_s, 0);
+#else
+ float sum = 0.f;
+ sum += r0[0] * kernel0[0];
+ sum += r0[1] * kernel0[1];
+ sum += r1[0] * kernel0[2];
+ sum += r1[1] * kernel0[3];
+ *outptr += sum;
+#endif
+
+ r0 += 1;
+ r1 += 1;
+ outptr++;
+ }
+
+ r0 += 1;
+ r1 += 1;
+
+ }
+
+ }
+ }
+
+}
diff --git a/src/layer/arm/convolution_3x3.h b/src/layer/arm/convolution_3x3.h
new file mode 100644
index 00000000000..c998c514a1c
--- /dev/null
+++ b/src/layer/arm/convolution_3x3.h
@@ -0,0 +1,753 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#if __ARM_NEON
+#include <arm_neon.h> // NEON vector types and intrinsics (float32x4_t, vextq_f32, vfmaq_laneq_f32, ...) used below
+#endif // __ARM_NEON
+
+static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int inch = bottom_blob.c;
+
+ int outw = top_blob.w;
+ int outh = top_blob.h;
+ int outch = top_blob.c;
+
+ const float* kernel = _kernel;
+ const float* bias = _bias;
+
+ #pragma omp parallel for
+ for (int p=0; p> 2;
+ int remain = outw & 3;
+#else
+ int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ for (; nn>0; nn--)
+ {
+ float32x4_t _sum1 = vld1q_f32(outptr);
+ float32x4_t _sum2 = vdupq_n_f32(0.f);
+ float32x4_t _sum3 = vld1q_f32(outptr2);
+ float32x4_t _sum4 = vdupq_n_f32(0.f);
+
+ float32x4_t _r00 = vld1q_f32(r0);
+ float32x4_t _r00n = vld1q_f32(r0 + 4);
+ float32x4_t _r01 = vextq_f32(_r00, _r00n, 1);
+ float32x4_t _r02 = vextq_f32(_r00, _r00n, 2);
+
+ float32x4_t _r10 = vld1q_f32(r1);
+ float32x4_t _r10n = vld1q_f32(r1 + 4);
+ float32x4_t _r11 = vextq_f32(_r10, _r10n, 1);
+ float32x4_t _r12 = vextq_f32(_r10, _r10n, 2);
+
+ float32x4_t _r20 = vld1q_f32(r2);
+ float32x4_t _r20n = vld1q_f32(r2 + 4);
+ float32x4_t _r21 = vextq_f32(_r20, _r20n, 1);
+ float32x4_t _r22 = vextq_f32(_r20, _r20n, 2);
+
+ float32x4_t _r30 = vld1q_f32(r3);
+ float32x4_t _r30n = vld1q_f32(r3 + 4);
+ float32x4_t _r31 = vextq_f32(_r30, _r30n, 1);
+ float32x4_t _r32 = vextq_f32(_r30, _r30n, 2);
+
+ _sum1 = vfmaq_laneq_f32(_sum1, _r00, _k0123, 0);
+ _sum2 = vfmaq_laneq_f32(_sum2, _r01, _k0123, 1);
+ _sum1 = vfmaq_laneq_f32(_sum1, _r02, _k0123, 2);
+ _sum2 = vfmaq_laneq_f32(_sum2, _r10, _k3456, 0);
+ _sum1 = vfmaq_laneq_f32(_sum1, _r11, _k3456, 1);
+ _sum2 = vfmaq_laneq_f32(_sum2, _r12, _k3456, 2);
+ _sum1 = vfmaq_laneq_f32(_sum1, _r20, _k6789, 0);
+ _sum2 = vfmaq_laneq_f32(_sum2, _r21, _k6789, 1);
+ _sum1 = vfmaq_laneq_f32(_sum1, _r22, _k6789, 2);
+
+ _sum3 = vfmaq_laneq_f32(_sum3, _r10, _k0123, 0);
+ _sum4 = vfmaq_laneq_f32(_sum4, _r11, _k0123, 1);
+ _sum3 = vfmaq_laneq_f32(_sum3, _r12, _k0123, 2);
+ _sum4 = vfmaq_laneq_f32(_sum4, _r20, _k3456, 0);
+ _sum3 = vfmaq_laneq_f32(_sum3, _r21, _k3456, 1);
+ _sum4 = vfmaq_laneq_f32(_sum4, _r22, _k3456, 2);
+ _sum3 = vfmaq_laneq_f32(_sum3, _r30, _k6789, 0);
+ _sum4 = vfmaq_laneq_f32(_sum4, _r31, _k6789, 1);
+ _sum3 = vfmaq_laneq_f32(_sum3, _r32, _k6789, 2);
+
+ _sum1 = vaddq_f32(_sum1, _sum2);
+ _sum3 = vaddq_f32(_sum3, _sum4);
+
+ vst1q_f32(outptr, _sum1);
+ vst1q_f32(outptr2, _sum3);
+
+ r0 += 4;
+ r1 += 4;
+ r2 += 4;
+ r3 += 4;
+ outptr += 4;
+ outptr2 += 4;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "veor q6, q6 \n"
+ "veor q15, q15 \n"
+
+ "pld [%3, #192] \n"
+ "vld1.f32 {d18-d20}, [%3 :64] \n"// r0
+ "add %3, #16 \n"
+
+ "veor q13, q13 \n"
+ "veor q14, q14 \n"
+
+ "vext.32 q11, q9, q10, #1 \n"
+ "vext.32 q12, q9, q10, #2 \n"
+
+ "0: \n"
+
+ "pld [%1, #128] \n"
+ "vld1.f32 {d14-d15}, [%1 :64] \n"// _sum
+
+ "vmla.f32 q7, q9, %e14[0] \n"
+ "vmla.f32 q6, q11, %e14[1] \n"
+ "vmla.f32 q13, q12, %f14[0] \n"
+
+ "pld [%4, #192] \n"
+ "vld1.f32 {d18-d20}, [%4] \n"// r1
+ "add %4, #16 \n"
+
+ "vmla.f32 q7, q9, %e15[0] \n"
+
+ "vext.32 q11, q9, q10, #1 \n"
+ "vext.32 q12, q9, q10, #2 \n"
+
+ "vmla.f32 q6, q11, %e15[1] \n"
+ "vmla.f32 q13, q12, %f15[0] \n"
+
+ "pld [%2, #128] \n"
+ "vld1.f32 {d16-d17}, [%2] \n"// _sum2
+
+ "vmla.f32 q8, q9, %e14[0] \n"
+ "vmla.f32 q14, q11, %e14[1] \n"
+ "vmla.f32 q15, q12, %f14[0] \n"
+
+ "pld [%5, #192] \n"
+ "vld1.f32 {d18-d20}, [%5 :64] \n"// r2
+ "add %5, #16 \n"
+
+ "vmla.f32 q7, q9, %e16[0] \n"
+
+ "vext.32 q11, q9, q10, #1 \n"
+ "vext.32 q12, q9, q10, #2 \n"
+
+ "vmla.f32 q6, q11, %e16[1] \n"
+ "vmla.f32 q13, q12, %f16[0] \n"
+
+ "vmla.f32 q8, q9, %e15[0] \n"
+ "vmla.f32 q14, q11, %e15[1] \n"
+ "vmla.f32 q15, q12, %f15[0] \n"
+
+ "pld [%6, #192] \n"
+ "vld1.f32 {d18-d20}, [%6] \n"// r3
+ "add %6, #16 \n"
+
+ "vmla.f32 q8, q9, %e16[0] \n"
+
+ "vext.32 q11, q9, q10, #1 \n"
+ "vext.32 q12, q9, q10, #2 \n"
+
+ "vmla.f32 q14, q11, %e16[1] \n"
+ "vmla.f32 q15, q12, %f16[0] \n"
+
+ "vadd.f32 q7, q7, q6 \n"
+ "veor q6, q6 \n"
+
+ "pld [%3, #192] \n"
+ "vld1.f32 {d18-d20}, [%3 :64] \n"// r0
+
+ "vadd.f32 q8, q8, q14 \n"
+ "veor q14, q14 \n"
+ "vadd.f32 q7, q7, q13 \n"
+ "veor q13, q13 \n"
+ "vadd.f32 q8, q8, q15 \n"
+ "veor q15, q15 \n"
+
+ "vext.32 q11, q9, q10, #1 \n"
+ "vext.32 q12, q9, q10, #2 \n"
+
+ "add %3, #16 \n"
+
+ "vst1.f32 {d14-d15}, [%1]! \n"
+ "vst1.f32 {d16-d17}, [%2]! \n"
+
+ "subs %0, #1 \n"
+ "bne 0b \n"
+
+ "sub %3, #16 \n"
+ : "=r"(nn), // %0
+ "=r"(outptr), // %1
+ "=r"(outptr2), // %2
+ "=r"(r0), // %3
+ "=r"(r1), // %4
+ "=r"(r2), // %5
+ "=r"(r3) // %6
+ : "0"(nn),
+ "1"(outptr),
+ "2"(outptr2),
+ "3"(r0),
+ "4"(r1),
+ "5"(r2),
+ "6"(r3),
+ "w"(_k0123), // %14
+ "w"(_k3456), // %15
+ "w"(_k6789) // %16
+ : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+#if __ARM_NEON
+ float32x4_t _r00 = vld1q_f32(r0);
+ float32x4_t _r10 = vld1q_f32(r1);
+ float32x4_t _r20 = vld1q_f32(r2);
+ float32x4_t _r30 = vld1q_f32(r3);
+
+ float32x4_t _sum = vmulq_f32(_r00, _k0123);
+ _sum = vmlaq_f32(_sum, _r10, _k3456);
+ _sum = vmlaq_f32(_sum, _r20, _k6789);
+
+ float32x4_t _sum2 = vmulq_f32(_r10, _k0123);
+ _sum2 = vmlaq_f32(_sum2, _r20, _k3456);
+ _sum2 = vmlaq_f32(_sum2, _r30, _k6789);
+
+ _sum = vsetq_lane_f32(*outptr, _sum, 3);
+ _sum2 = vsetq_lane_f32(*outptr2, _sum2, 3);
+
+#if __aarch64__
+ *outptr = vaddvq_f32(_sum);
+ *outptr2 = vaddvq_f32(_sum2);
+#else
+ float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
+ float32x2_t _ss2 = vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));
+
+ float32x2_t _sss2 = vpadd_f32(_ss, _ss2);
+
+ *outptr = vget_lane_f32(_sss2, 0);
+ *outptr2 = vget_lane_f32(_sss2, 1);
+#endif // __aarch64__
+#else
+ float sum = 0;
+ float sum2 = 0;
+
+ sum += r0[0] * k0[0];
+ sum += r0[1] * k0[1];
+ sum += r0[2] * k0[2];
+ sum += r1[0] * k1[0];
+ sum += r1[1] * k1[1];
+ sum += r1[2] * k1[2];
+ sum += r2[0] * k2[0];
+ sum += r2[1] * k2[1];
+ sum += r2[2] * k2[2];
+
+ sum2 += r1[0] * k0[0];
+ sum2 += r1[1] * k0[1];
+ sum2 += r1[2] * k0[2];
+ sum2 += r2[0] * k1[0];
+ sum2 += r2[1] * k1[1];
+ sum2 += r2[2] * k1[2];
+ sum2 += r3[0] * k2[0];
+ sum2 += r3[1] * k2[1];
+ sum2 += r3[2] * k2[2];
+
+ *outptr += sum;
+ *outptr2 += sum2;
+#endif
+ r0++;
+ r1++;
+ r2++;
+ r3++;
+ outptr++;
+ outptr2++;
+ }
+
+ r0 += 2 + w;
+ r1 += 2 + w;
+ r2 += 2 + w;
+ r3 += 2 + w;
+
+ outptr += outw;
+ outptr2 += outw;
+ }
+
+ for (; i < outh; i++)
+ {
+
+#if __ARM_NEON
+ int nn = outw >> 2;
+ int remain = outw & 3;
+#else
+ int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ for (; nn>0; nn--)
+ {
+ float32x4_t _sum1 = vld1q_f32(outptr);
+ float32x4_t _sum2 = vdupq_n_f32(0.f);
+
+ float32x4_t _r00 = vld1q_f32(r0);
+ float32x4_t _r00n = vld1q_f32(r0 + 4);
+ float32x4_t _r01 = vextq_f32(_r00, _r00n, 1);
+ float32x4_t _r02 = vextq_f32(_r00, _r00n, 2);
+
+ float32x4_t _r10 = vld1q_f32(r1);
+ float32x4_t _r10n = vld1q_f32(r1 + 4);
+ float32x4_t _r11 = vextq_f32(_r10, _r10n, 1);
+ float32x4_t _r12 = vextq_f32(_r10, _r10n, 2);
+
+ float32x4_t _r20 = vld1q_f32(r2);
+ float32x4_t _r20n = vld1q_f32(r2 + 4);
+ float32x4_t _r21 = vextq_f32(_r20, _r20n, 1);
+ float32x4_t _r22 = vextq_f32(_r20, _r20n, 2);
+
+ _sum1 = vfmaq_laneq_f32(_sum1, _r00, _k0123, 0);
+ _sum2 = vfmaq_laneq_f32(_sum2, _r01, _k0123, 1);
+ _sum1 = vfmaq_laneq_f32(_sum1, _r02, _k0123, 2);
+ _sum2 = vfmaq_laneq_f32(_sum2, _r10, _k3456, 0);
+ _sum1 = vfmaq_laneq_f32(_sum1, _r11, _k3456, 1);
+ _sum2 = vfmaq_laneq_f32(_sum2, _r12, _k3456, 2);
+ _sum1 = vfmaq_laneq_f32(_sum1, _r20, _k6789, 0);
+ _sum2 = vfmaq_laneq_f32(_sum2, _r21, _k6789, 1);
+ _sum1 = vfmaq_laneq_f32(_sum1, _r22, _k6789, 2);
+
+ _sum1 = vaddq_f32(_sum1, _sum2);
+
+ vst1q_f32(outptr, _sum1);
+
+ r0 += 4;
+ r1 += 4;
+ r2 += 4;
+ outptr += 4;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "pld [%2, #192] \n"
+ "vld1.f32 {d16-d18}, [%2] \n"// r0
+ "add %2, #16 \n"
+
+ "veor q13, q13 \n"
+ "veor q14, q14 \n"
+
+ "vext.32 q10, q8, q9, #1 \n"
+ "vext.32 q11, q8, q9, #2 \n"
+
+ "0: \n"
+
+ "pld [%1, #128] \n"
+ "vld1.f32 {d14-d15}, [%1] \n"// _sum
+
+ "vmla.f32 q7, q8, %e10[0] \n"
+ "vmla.f32 q13, q10, %e10[1] \n"
+ "vmla.f32 q14, q11, %f10[0] \n"
+
+ "pld [%3, #192] \n"
+ "vld1.f32 {d16-d18}, [%3] \n"// r1
+ "add %3, #16 \n"
+
+ "vmla.f32 q7, q8, %e11[0] \n"
+
+ "vext.32 q10, q8, q9, #1 \n"
+ "vext.32 q11, q8, q9, #2 \n"
+
+ "vmla.f32 q13, q10, %e11[1] \n"
+ "vmla.f32 q14, q11, %f11[0] \n"
+
+ "pld [%4, #192] \n"
+ "vld1.f32 {d16-d18}, [%4] \n"// r2
+ "add %4, #16 \n"
+
+ "vmla.f32 q7, q8, %e12[0] \n"
+
+ "vext.32 q10, q8, q9, #1 \n"
+ "vext.32 q11, q8, q9, #2 \n"
+
+ "vmla.f32 q13, q10, %e12[1] \n"
+ "vmla.f32 q14, q11, %f12[0] \n"
+
+ "pld [%2, #192] \n"
+ "vld1.f32 {d16-d18}, [%2] \n"// r0
+ "add %2, #16 \n"
+
+ "vadd.f32 q7, q7, q13 \n"
+ "veor q13, q13 \n"
+ "vadd.f32 q7, q7, q14 \n"
+ "veor q14, q14 \n"
+
+ "vext.32 q10, q8, q9, #1 \n"
+ "vext.32 q11, q8, q9, #2 \n"
+
+ "vst1.f32 {d14-d15}, [%1]! \n"
+
+ "subs %0, #1 \n"
+ "bne 0b \n"
+
+ "sub %2, #16 \n"
+ : "=r"(nn), // %0
+ "=r"(outptr), // %1
+ "=r"(r0), // %2
+ "=r"(r1), // %3
+ "=r"(r2) // %4
+ : "0"(nn),
+ "1"(outptr),
+ "2"(r0),
+ "3"(r1),
+ "4"(r2),
+ "w"(_k0123), // %10
+ "w"(_k3456), // %11
+ "w"(_k6789) // %12
+ : "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+#if __ARM_NEON
+ float32x4_t _r00 = vld1q_f32(r0);
+ float32x4_t _r10 = vld1q_f32(r1);
+ float32x4_t _r20 = vld1q_f32(r2);
+
+ float32x4_t _sum = vmulq_f32(_r00, _k0123);
+ _sum = vmlaq_f32(_sum, _r10, _k3456);
+ _sum = vmlaq_f32(_sum, _r20, _k6789);
+
+ _sum = vsetq_lane_f32(*outptr, _sum, 3);
+
+#if __aarch64__
+ *outptr = vaddvq_f32(_sum);
+#else
+ float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
+ _ss = vpadd_f32(_ss, _ss);
+
+ *outptr = vget_lane_f32(_ss, 0);
+#endif // __aarch64__
+#else
+ float sum = 0;
+
+ sum += r0[0] * k0[0];
+ sum += r0[1] * k0[1];
+ sum += r0[2] * k0[2];
+ sum += r1[0] * k1[0];
+ sum += r1[1] * k1[1];
+ sum += r1[2] * k1[2];
+ sum += r2[0] * k2[0];
+ sum += r2[1] * k2[1];
+ sum += r2[2] * k2[2];
+
+ *outptr += sum;
+#endif
+ r0++;
+ r1++;
+ r2++;
+ outptr++;
+ }
+
+ r0 += 2;
+ r1 += 2;
+ r2 += 2;
+ }
+
+ kernel0 += 9;
+ }
+ }
+
+}
+
+static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int inch = bottom_blob.c;
+
+ int outw = top_blob.w;
+ int outh = top_blob.h;
+ int outch = top_blob.c;
+
+ const int tailstep = w - 2*outw + w;
+
+ const float* kernel = _kernel;
+ const float* bias = _bias;
+
+ #pragma omp parallel for
+ for (int p=0; p> 2;
+ int remain = outw & 3;
+#else
+ int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ for (; nn>0; nn--)
+ {
+ float32x4_t _outp = vld1q_f32(outptr);
+
+ float32x4x2_t _r0 = vld2q_f32(r0);
+ float32x4x2_t _r0n = vld2q_f32(r0+8);
+
+ float32x4_t _r00 = _r0.val[0];// 0 2 4 6
+ float32x4_t _r01 = _r0.val[1];// 1 3 5 7
+ float32x4_t _r02 = vextq_f32(_r00, _r0n.val[0], 1);// 2 4 6 8
+
+ _outp = vfmaq_laneq_f32(_outp, _r00, _k0123, 0);
+ _outp = vfmaq_laneq_f32(_outp, _r01, _k0123, 1);
+ _outp = vfmaq_laneq_f32(_outp, _r02, _k0123, 2);
+
+ float32x4x2_t _r1 = vld2q_f32(r1);
+ float32x4x2_t _r1n = vld2q_f32(r1+8);
+
+ float32x4_t _r10 = _r1.val[0];
+ float32x4_t _r11 = _r1.val[1];
+ float32x4_t _r12 = vextq_f32(_r10, _r1n.val[0], 1);
+
+ _outp = vfmaq_laneq_f32(_outp, _r10, _k3456, 0);
+ _outp = vfmaq_laneq_f32(_outp, _r11, _k3456, 1);
+ _outp = vfmaq_laneq_f32(_outp, _r12, _k3456, 2);
+
+ float32x4x2_t _r2 = vld2q_f32(r2);
+ float32x4x2_t _r2n = vld2q_f32(r2+8);
+
+ float32x4_t _r20 = _r2.val[0];
+ float32x4_t _r21 = _r2.val[1];
+ float32x4_t _r22 = vextq_f32(_r20, _r2n.val[0], 1);
+
+ _outp = vfmaq_laneq_f32(_outp, _r20, _k6789, 0);
+ _outp = vfmaq_laneq_f32(_outp, _r21, _k6789, 1);
+ _outp = vfmaq_laneq_f32(_outp, _r22, _k6789, 2);
+
+ vst1q_f32(outptr, _outp);
+
+ r0 += 8;
+ r1 += 8;
+ r2 += 8;
+ outptr += 4;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "pld [%2, #256] \n"
+ "vld2.f32 {d4-d7}, [%2]! \n"
+
+ "veor q10, q10 \n"
+ "veor q11, q11 \n"
+
+ "0: \n"
+ "pld [%1, #128] \n"
+ "vld1.f32 {d0-d1}, [%1] \n"
+
+ "vmla.f32 q0, q2, %e10[0] \n"
+ "vmla.f32 q10, q3, %e10[1] \n"
+
+ "pld [%2, #256] \n"
+ "vld2.f32 {d16-d19}, [%2] \n"
+ "vext.32 q1, q2, q8, #1 \n"
+
+ "vmla.f32 q11, q1, %f10[0] \n"
+
+ "pld [%3, #256] \n"
+ "vld2.f32 {d4-d7}, [%3]! \n"
+
+ "vmla.f32 q0, q2, %e11[0] \n"
+ "vmla.f32 q10, q3, %e11[1] \n"
+
+ "pld [%3, #256] \n"
+ "vld2.f32 {d16-d19}, [%3] \n"
+ "vext.32 q1, q2, q8, #1 \n"
+
+ "vmla.f32 q11, q1, %f11[0] \n"
+
+ "pld [%4, #256] \n"
+ "vld2.f32 {d4-d7}, [%4]! \n"
+
+ "vmla.f32 q0, q2, %e12[0] \n"
+ "vmla.f32 q10, q3, %e12[1] \n"
+
+ "pld [%4, #256] \n"
+ "vld2.f32 {d16-d19}, [%4] \n"
+ "vext.32 q1, q2, q8, #1 \n"
+
+ "vmla.f32 q11, q1, %f12[0] \n"
+
+ "pld [%2, #256] \n"
+ "vld2.f32 {d4-d7}, [%2]! \n"
+
+ "vadd.f32 q0, q0, q10 \n"
+ "veor q10, q10 \n"
+ "vadd.f32 q0, q0, q11 \n"
+ "veor q11, q11 \n"
+
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d1}, [%1]! \n"
+ "bne 0b \n"
+ "sub %2, #32 \n"
+ : "=r"(nn), // %0
+ "=r"(outptr), // %1
+ "=r"(r0), // %2
+ "=r"(r1),
+ "=r"(r2)
+ : "0"(nn),
+ "1"(outptr),
+ "2"(r0),
+ "3"(r1),
+ "4"(r2),
+ "w"(_k0123), // %10
+ "w"(_k3456), // %11
+ "w"(_k6789) // %12
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+#if __ARM_NEON
+ float32x4_t _r00 = vld1q_f32(r0);
+ float32x4_t _r10 = vld1q_f32(r1);
+ float32x4_t _r20 = vld1q_f32(r2);
+
+ float32x4_t _sum = vmulq_f32(_r00, _k0123);
+ _sum = vmlaq_f32(_sum, _r10, _k3456);
+ _sum = vmlaq_f32(_sum, _r20, _k6789);
+
+ _sum = vsetq_lane_f32(*outptr, _sum, 3);
+
+#if __aarch64__
+ *outptr = vaddvq_f32(_sum);
+#else
+ float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
+ _ss = vpadd_f32(_ss, _ss);
+
+ *outptr = vget_lane_f32(_ss, 0);
+#endif // __aarch64__
+#else
+ float sum = 0;
+
+ sum += r0[0] * k0[0];
+ sum += r0[1] * k0[1];
+ sum += r0[2] * k0[2];
+ sum += r1[0] * k1[0];
+ sum += r1[1] * k1[1];
+ sum += r1[2] * k1[2];
+ sum += r2[0] * k2[0];
+ sum += r2[1] * k2[1];
+ sum += r2[2] * k2[2];
+
+ *outptr += sum;
+#endif // __ARM_NEON
+
+ r0 += 2;
+ r1 += 2;
+ r2 += 2;
+ outptr++;
+ }
+
+ r0 += tailstep;
+ r1 += tailstep;
+ r2 += tailstep;
+ }
+
+ kernel0 += 9;
+ }
+ }
+}
diff --git a/src/layer/arm/convolution_4x4.h b/src/layer/arm/convolution_4x4.h
new file mode 100644
index 00000000000..18ef572f094
--- /dev/null
+++ b/src/layer/arm/convolution_4x4.h
@@ -0,0 +1,340 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#if __ARM_NEON
+#include
+#endif // __ARM_NEON
+
+static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int inch = bottom_blob.c;
+
+ int outw = top_blob.w;
+ int outh = top_blob.h;
+ int outch = top_blob.c;
+
+ const float* kernel = _kernel;
+ const float* bias = _bias;
+
+ #pragma omp parallel for
+ for (int p=0; p> 2;
+ int remain = outw - (nn << 2);
+#else
+ int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ // aarch64 fast path: four neighbouring outputs per iteration. Every output
+ // consumes one 4-float group from each of the rows r0..r3, so the row
+ // pointers advance by 16 floats per iteration.
+ for (; nn>0; nn--)
+ {
+     float32x4_t _r00 = vld1q_f32(r0);
+     float32x4_t _r10 = vld1q_f32(r1);
+     float32x4_t _r20 = vld1q_f32(r2);
+     float32x4_t _r30 = vld1q_f32(r3);
+
+     float32x4_t _r01 = vld1q_f32(r0 + 4);
+     float32x4_t _r11 = vld1q_f32(r1 + 4);
+     float32x4_t _r21 = vld1q_f32(r2 + 4);
+     float32x4_t _r31 = vld1q_f32(r3 + 4);
+
+     float32x4_t _r02 = vld1q_f32(r0 + 8);
+     float32x4_t _r12 = vld1q_f32(r1 + 8);
+     float32x4_t _r22 = vld1q_f32(r2 + 8);
+     float32x4_t _r32 = vld1q_f32(r3 + 8);
+
+     float32x4_t _r03 = vld1q_f32(r0 + 12);
+     float32x4_t _r13 = vld1q_f32(r1 + 12);
+     float32x4_t _r23 = vld1q_f32(r2 + 12);
+     float32x4_t _r33 = vld1q_f32(r3 + 12);
+
+     // _sumN holds the four per-column products of output N, summed over rows
+     float32x4_t _sum0 = vmulq_f32(_r00, _k0123);
+     float32x4_t _sum1 = vmulq_f32(_r01, _k0123);
+     float32x4_t _sum2 = vmulq_f32(_r02, _k0123);
+     float32x4_t _sum3 = vmulq_f32(_r03, _k0123);
+
+     _sum0 = vfmaq_f32(_sum0, _r10, _k4567);
+     _sum1 = vfmaq_f32(_sum1, _r11, _k4567);
+     _sum2 = vfmaq_f32(_sum2, _r12, _k4567);
+     _sum3 = vfmaq_f32(_sum3, _r13, _k4567);
+
+     _sum0 = vfmaq_f32(_sum0, _r20, _k891011);
+     _sum1 = vfmaq_f32(_sum1, _r21, _k891011);
+     _sum2 = vfmaq_f32(_sum2, _r22, _k891011);
+     _sum3 = vfmaq_f32(_sum3, _r23, _k891011);
+
+     _sum0 = vfmaq_f32(_sum0, _r30, _k12131415);
+     _sum1 = vfmaq_f32(_sum1, _r31, _k12131415);
+     _sum2 = vfmaq_f32(_sum2, _r32, _k12131415);
+     _sum3 = vfmaq_f32(_sum3, _r33, _k12131415);
+
+     // two rounds of pairwise adds collapse each _sumN into one lane:
+     // _sum = { hsum(_sum0), hsum(_sum1), hsum(_sum2), hsum(_sum3) }
+     float32x4_t _s01 = vpaddq_f32(_sum0, _sum1);
+     float32x4_t _s23 = vpaddq_f32(_sum2, _sum3);
+     float32x4_t _sum = vpaddq_f32(_s01, _s23);
+
+     float32x4_t _outp = vld1q_f32(outptr);
+
+     _outp = vaddq_f32(_outp, _sum);
+
+     // store the accumulated value; storing _sum here would throw away what
+     // was already in the output row, unlike the ARMv7 asm path and the
+     // scalar tail loop which both add onto *outptr
+     vst1q_f32(outptr, _outp);
+
+     r0 += 16;
+     r1 += 16;
+     r2 += 16;
+     r3 += 16;
+     outptr += 4;
+ }
+#else
+ // ARMv7 NEON version of the four-outputs-per-iteration loop.
+ // Operands: %0 = nn (loop counter), %1 = outptr, %2..%5 = r0..r3,
+ // %12..%15 = _k0123, _k4567, _k891011, _k12131415.
+ // Registers: q7 = current output values, q8..q11 = one 4-float group from
+ // each row, q12/q13 = scratch products, q5/q6/q14/q15 = per-output sums.
+ // Note that "%q14"/"%q15" are operand references, not the q14/q15 registers.
+ if (nn > 0)
+ {
+ asm volatile(
+
+ "pld [%1, #128] \n"
+
+ "0: \n"
+
+ "pld [%2, #512] \n"
+ "pld [%3, #512] \n"
+
+ "vld1.f32 {d14-d15}, [%1] \n"// q7 = outptr
+
+ // ---- output 0 -> q5
+ "vld1.f32 {d16-d17}, [%2]! \n"// q8 = r0
+ "vld1.f32 {d18-d19}, [%3]! \n"// q9 = r1
+
+ "pld [%4, #512] \n"
+ "pld [%5, #512] \n"
+
+ "vmul.f32 q12, q8, %q12 \n"
+ "vmul.f32 q13, q9, %q13 \n"
+
+ "vld1.f32 {d20-d21}, [%4]! \n"// q10 = r2
+ "vld1.f32 {d22-d23}, [%5]! \n"// q11 = r3
+
+ "vmla.f32 q12, q10, %q14 \n"
+ "vmla.f32 q13, q11, %q15 \n"
+
+ "vadd.f32 q5, q12, q13 \n"
+
+ // ---- output 1 -> q6
+ "vld1.f32 {d16-d17}, [%2]! \n"// q8 = r0
+ "vld1.f32 {d18-d19}, [%3]! \n"// q9 = r1
+
+ "vmul.f32 q12, q8, %q12 \n"
+ "vmul.f32 q13, q9, %q13 \n"
+
+ "vld1.f32 {d20-d21}, [%4]! \n"// q10 = r2
+ "vld1.f32 {d22-d23}, [%5]! \n"// q11 = r3
+
+ "vmla.f32 q12, q10, %q14 \n"
+ "vmla.f32 q13, q11, %q15 \n"
+
+ "vadd.f32 q6, q12, q13 \n"
+
+ // ---- output 2 -> q14
+ "vld1.f32 {d16-d17}, [%2]! \n"// q8 = r0
+ "vld1.f32 {d18-d19}, [%3]! \n"// q9 = r1
+
+ "vmul.f32 q12, q8, %q12 \n"
+ "vmul.f32 q13, q9, %q13 \n"
+
+ "vld1.f32 {d20-d21}, [%4]! \n"// q10 = r2
+ "vld1.f32 {d22-d23}, [%5]! \n"// q11 = r3
+
+ "vmla.f32 q12, q10, %q14 \n"
+ "vmla.f32 q13, q11, %q15 \n"
+
+ "vadd.f32 q14, q12, q13 \n"
+
+ // ---- output 3 -> q15
+ "vld1.f32 {d16-d17}, [%2]! \n"// q8 = r0
+ "vld1.f32 {d18-d19}, [%3]! \n"// q9 = r1
+
+ "vmul.f32 q12, q8, %q12 \n"
+ "vmul.f32 q13, q9, %q13 \n"
+
+ "vld1.f32 {d20-d21}, [%4]! \n"// q10 = r2
+ "vld1.f32 {d22-d23}, [%5]! \n"// q11 = r3
+
+ "vmla.f32 q12, q10, %q14 \n"
+ "vmla.f32 q13, q11, %q15 \n"
+
+ "vadd.f32 q15, q12, q13 \n"
+
+ // horizontal reduction: fold each 4-lane sum to 2 lanes, then pairwise
+ // add so that q5 ends up as { out0, out1, out2, out3 }
+ "vadd.f32 d10, d10, d11 \n"
+ "vadd.f32 d28, d28, d29 \n"
+ "vadd.f32 d11, d12, d13 \n"
+ "vadd.f32 d29, d30, d31 \n"
+
+ "vpadd.f32 d10, d10, d11 \n"
+ "vpadd.f32 d11, d28, d29 \n"
+
+ // accumulate onto the values loaded from outptr and write them back
+ "vadd.f32 q7, q7, q5 \n"
+
+ "vst1.f32 {d14-d15}, [%1]! \n"
+
+ "pld [%1, #128] \n"
+
+ "subs %0, #1 \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(outptr), // %1
+ "=r"(r0), // %2
+ "=r"(r1), // %3
+ "=r"(r2), // %4
+ "=r"(r3) // %5
+ : "0"(nn),
+ "1"(outptr),
+ "2"(r0),
+ "3"(r1),
+ "4"(r2),
+ "5"(r3),
+ "w"(_k0123), // %12
+ "w"(_k4567), // %13
+ "w"(_k891011), // %14
+ "w"(_k12131415) // %15
+ : "cc", "memory", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ // Tail loop: one output per iteration for the columns the vector loop did
+ // not cover. All three variants (aarch64 intrinsics, ARMv7 asm, scalar)
+ // add the 4x4 dot product onto *outptr and rely on the shared pointer
+ // advance at the bottom of the loop.
+ for (; remain>0; remain--)
+ {
+#if __ARM_NEON
+#if __aarch64__
+     float32x4_t _r0 = vld1q_f32(r0);
+     float32x4_t _r1 = vld1q_f32(r1);
+     float32x4_t _r2 = vld1q_f32(r2);
+     float32x4_t _r3 = vld1q_f32(r3);
+
+     float32x4_t _sum = vmulq_f32(_r0, _k0123);
+     _sum = vmlaq_f32(_sum, _r1, _k4567);
+     _sum = vmlaq_f32(_sum, _r2, _k891011);
+     _sum = vmlaq_f32(_sum, _r3, _k12131415);
+
+     *outptr += vaddvq_f32(_sum);
+#else
+     float sum = 0.f;
+
+     // The loads deliberately have no post-increment ("!"): the row pointers
+     // are advanced once, by the common "r0 += 4" block below. With
+     // write-back here the pointers moved 8 floats per output on ARMv7
+     // instead of 4.
+     asm volatile(
+         "vld1.f32 {d16-d17}, [%0] \n"// q8 = r0
+         "vld1.f32 {d18-d19}, [%1] \n"// q9 = r1
+
+         "vmul.f32 q12, q8, %q9 \n"
+         "vmul.f32 q13, q9, %q10 \n"
+
+         "vld1.f32 {d20-d21}, [%2] \n"// q10 = r2
+         "vld1.f32 {d22-d23}, [%3] \n"// q11 = r3
+
+         "vmla.f32 q12, q10, %q11 \n"
+         "vmla.f32 q13, q11, %q12 \n"
+
+         // horizontal sum of all four lanes into d10[0]
+         "vadd.f32 q5, q12, q13 \n"
+         "vadd.f32 d10, d10, d11 \n"
+         "vpadd.f32 d10, d10, d10 \n"
+         "vmov.f32 %4, d10[0] \n"
+         : "=r"(r0), // %0
+           "=r"(r1), // %1
+           "=r"(r2), // %2
+           "=r"(r3), // %3
+           "=r"(sum) // %4
+         : "0"(r0),
+           "1"(r1),
+           "2"(r2),
+           "3"(r3),
+           "w"(_k0123), // %9
+           "w"(_k4567), // %10
+           "w"(_k891011), // %11
+           "w"(_k12131415) // %12
+         : "cc", "memory", "q5", "q6", "q8", "q9", "q10", "q11", "q12", "q13"
+     );
+
+     *outptr += sum;
+#endif // __aarch64__
+#else
+     float sum = 0;
+
+     sum += r0[0] * k0[0];
+     sum += r0[1] * k0[1];
+     sum += r0[2] * k0[2];
+     sum += r0[3] * k0[3];
+
+     sum += r1[0] * k1[0];
+     sum += r1[1] * k1[1];
+     sum += r1[2] * k1[2];
+     sum += r1[3] * k1[3];
+
+     sum += r2[0] * k2[0];
+     sum += r2[1] * k2[1];
+     sum += r2[2] * k2[2];
+     sum += r2[3] * k2[3];
+
+     sum += r3[0] * k3[0];
+     sum += r3[1] * k3[1];
+     sum += r3[2] * k3[2];
+     sum += r3[3] * k3[3];
+
+     *outptr += sum;
+#endif // __ARM_NEON
+     // step to the next window: 4 input floats per output
+     r0 += 4;
+     r1 += 4;
+     r2 += 4;
+     r3 += 4;
+     outptr++;
+ }
+
+ r0 += w * 3;
+ r1 += w * 3;
+ r2 += w * 3;
+ r3 += w * 3;
+ }
+
+ }
+ }
+
+}
+
diff --git a/src/layer/arm/convolution_5x5.h b/src/layer/arm/convolution_5x5.h
new file mode 100644
index 00000000000..2c44a3b4cff
--- /dev/null
+++ b/src/layer/arm/convolution_5x5.h
@@ -0,0 +1,1251 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#if __ARM_NEON
+#include
+#endif // __ARM_NEON
+
+static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int inch = bottom_blob.c;
+
+ int outw = top_blob.w;
+ int outh = top_blob.h;
+ int outch = top_blob.c;
+
+ const float* kernel = _kernel;
+ const float* bias = _bias;
+
+ #pragma omp parallel for
+ for (int p=0; p> 2;
+ int remain = outw - (nn << 2);
+#else
+ int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ // aarch64 fast path, two output rows at once: four outputs of the row at
+ // outptr (input rows r0..r4) and four of the row at outptr2 (input rows
+ // r1..r5) per iteration.
+ for (; nn>0; nn--)
+ {
+     // Start from the values already stored in the output rows so this
+     // pass adds onto them, matching the ARMv7 asm path (which loads q7/q8
+     // from outptr/outptr2) and the tail loop (which uses "+="). Starting
+     // from zero would overwrite the earlier contents.
+     float32x4_t _sum = vld1q_f32(outptr);
+     float32x4_t _sum2 = vld1q_f32(outptr2);
+
+     // _rXn = row X shifted left by n columns, built from two adjacent loads
+     float32x4_t _r00 = vld1q_f32(r0);
+     float32x4_t _r04 = vld1q_f32(r0 + 4);
+     float32x4_t _r01 = vextq_f32(_r00, _r04, 1);
+     float32x4_t _r02 = vextq_f32(_r00, _r04, 2);
+     float32x4_t _r03 = vextq_f32(_r00, _r04, 3);
+
+     float32x4_t _r10 = vld1q_f32(r1);
+     float32x4_t _r14 = vld1q_f32(r1 + 4);
+     float32x4_t _r11 = vextq_f32(_r10, _r14, 1);
+     float32x4_t _r12 = vextq_f32(_r10, _r14, 2);
+     float32x4_t _r13 = vextq_f32(_r10, _r14, 3);
+
+     float32x4_t _r20 = vld1q_f32(r2);
+     float32x4_t _r24 = vld1q_f32(r2 + 4);
+     float32x4_t _r21 = vextq_f32(_r20, _r24, 1);
+     float32x4_t _r22 = vextq_f32(_r20, _r24, 2);
+     float32x4_t _r23 = vextq_f32(_r20, _r24, 3);
+
+     float32x4_t _r30 = vld1q_f32(r3);
+     float32x4_t _r34 = vld1q_f32(r3 + 4);
+     float32x4_t _r31 = vextq_f32(_r30, _r34, 1);
+     float32x4_t _r32 = vextq_f32(_r30, _r34, 2);
+     float32x4_t _r33 = vextq_f32(_r30, _r34, 3);
+
+     float32x4_t _r40 = vld1q_f32(r4);
+     float32x4_t _r44 = vld1q_f32(r4 + 4);
+     float32x4_t _r41 = vextq_f32(_r40, _r44, 1);
+     float32x4_t _r42 = vextq_f32(_r40, _r44, 2);
+     float32x4_t _r43 = vextq_f32(_r40, _r44, 3);
+
+     float32x4_t _r50 = vld1q_f32(r5);
+     float32x4_t _r54 = vld1q_f32(r5 + 4);
+     float32x4_t _r51 = vextq_f32(_r50, _r54, 1);
+     float32x4_t _r52 = vextq_f32(_r50, _r54, 2);
+     float32x4_t _r53 = vextq_f32(_r50, _r54, 3);
+
+     // first output row: rows r0..r4 against the 25 kernel taps in order
+     _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0);
+     _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1);
+     _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2);
+     _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3);
+     _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0);
+
+     _sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1);
+     _sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2);
+     _sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3);
+     _sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0);
+     _sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1);
+
+     _sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2);
+     _sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3);
+     _sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0);
+     _sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1);
+     _sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2);
+
+     _sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3);
+     _sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0);
+     _sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1);
+     _sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2);
+     _sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3);
+
+     _sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0);
+     _sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1);
+     _sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2);
+     _sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3);
+     _sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0);
+
+     // second output row: same taps, input rows shifted down by one (r1..r5)
+     _sum2 = vfmaq_laneq_f32(_sum2, _r10, _k0123, 0);
+     _sum2 = vfmaq_laneq_f32(_sum2, _r11, _k0123, 1);
+     _sum2 = vfmaq_laneq_f32(_sum2, _r12, _k0123, 2);
+     _sum2 = vfmaq_laneq_f32(_sum2, _r13, _k0123, 3);
+     _sum2 = vfmaq_laneq_f32(_sum2, _r14, _k4567, 0);
+
+     _sum2 = vfmaq_laneq_f32(_sum2, _r20, _k4567, 1);
+     _sum2 = vfmaq_laneq_f32(_sum2, _r21, _k4567, 2);
+     _sum2 = vfmaq_laneq_f32(_sum2, _r22, _k4567, 3);
+     _sum2 = vfmaq_laneq_f32(_sum2, _r23, _k891011, 0);
+     _sum2 = vfmaq_laneq_f32(_sum2, _r24, _k891011, 1);
+
+     _sum2 = vfmaq_laneq_f32(_sum2, _r30, _k891011, 2);
+     _sum2 = vfmaq_laneq_f32(_sum2, _r31, _k891011, 3);
+     _sum2 = vfmaq_laneq_f32(_sum2, _r32, _k12131415, 0);
+     _sum2 = vfmaq_laneq_f32(_sum2, _r33, _k12131415, 1);
+     _sum2 = vfmaq_laneq_f32(_sum2, _r34, _k12131415, 2);
+
+     _sum2 = vfmaq_laneq_f32(_sum2, _r40, _k12131415, 3);
+     _sum2 = vfmaq_laneq_f32(_sum2, _r41, _k16171819, 0);
+     _sum2 = vfmaq_laneq_f32(_sum2, _r42, _k16171819, 1);
+     _sum2 = vfmaq_laneq_f32(_sum2, _r43, _k16171819, 2);
+     _sum2 = vfmaq_laneq_f32(_sum2, _r44, _k16171819, 3);
+
+     _sum2 = vfmaq_laneq_f32(_sum2, _r50, _k20212223, 0);
+     _sum2 = vfmaq_laneq_f32(_sum2, _r51, _k20212223, 1);
+     _sum2 = vfmaq_laneq_f32(_sum2, _r52, _k20212223, 2);
+     _sum2 = vfmaq_laneq_f32(_sum2, _r53, _k20212223, 3);
+     _sum2 = vfmaq_laneq_f32(_sum2, _r54, _k24242424, 0);
+
+     vst1q_f32(outptr, _sum);
+     vst1q_f32(outptr2, _sum2);
+
+     r0 += 4;
+     r1 += 4;
+     r2 += 4;
+     r3 += 4;
+     r4 += 4;
+     r5 += 4;
+     outptr += 4;
+     outptr2 += 4;
+ }
+#else
+ // ARMv7 NEON version of the two-output-row loop (four outputs per row per
+ // iteration).
+ // Operands: %0 = nn, %1 = outptr, %2 = outptr2, %3..%8 = r0..r5,
+ // %18..%24 = _k0123 .. _k24242424 (%eN / %fN select the low / high half).
+ // Registers: q7 and q13 accumulate the row stored at outptr, q8 and q14 the
+ // row stored at outptr2; each pair is added together just before its store.
+ // q9/q10 hold the two adjacent loads of the current input row, q11/q12 the
+ // shifted views produced by vext.
+ if (nn > 0)
+ {
+ asm volatile(
+// "veor q13, q13 \n"
+// "veor q14, q14 \n"
+
+ "pld [%1, #128] \n"
+
+ // q7 is loaded before the loop and re-loaded at the bottom of every
+ // iteration for the next group of outputs
+ "vld1.f32 {d14-d15}, [%1] \n"// q7 = out
+
+ "0: \n"
+
+ // q11 = rx1 / rx3
+ // q12 = rx2
+
+ // q13 q14 = intermediate sum register
+
+ "pld [%2, #128] \n"
+
+ "vld1.f32 {d16-d17}, [%2] \n"// q8 = out2
+
+
+ "pld [%4, #256] \n"
+
+ // r1
+ "vld1.f32 {d18-d21}, [%4] \n"// q9 q10 = r10 r14
+ "add %4, #16 \n"
+
+ "vext.32 q11, q9, q10, #1 \n"// r11
+ // vmul (not vmla) initialises q13 and q14 for this iteration
+ "vmul.f32 q13, q9, %e19[1] \n"
+ "vmla.f32 q8, q9, %e18[0] \n"
+
+ "vext.32 q12, q9, q10, #2 \n"// r12
+ "vmla.f32 q7, q11, %f19[0] \n"
+ "vmul.f32 q14, q11, %e18[1] \n"
+
+ "vext.32 q11, q9, q10, #3 \n"// r13
+ "vmla.f32 q13, q12, %f19[1] \n"
+ "vmla.f32 q8, q12, %f18[0] \n"
+
+ "vmla.f32 q7, q11, %e20[0] \n"
+ "vmla.f32 q14, q11, %f18[1] \n"
+
+ "pld [%5, #256] \n"
+
+ "vmla.f32 q13, q10, %e20[1] \n"
+ "vmla.f32 q8, q10, %e19[0] \n"
+
+ // r2
+ "vld1.f32 {d18-d21}, [%5] \n"// q9 q10 = r20 r24
+ "add %5, #16 \n"
+
+ "vext.32 q11, q9, q10, #1 \n"// r21
+ "vmla.f32 q7, q9, %f20[0] \n"
+ "vmla.f32 q14, q9, %e19[1] \n"
+
+ "vext.32 q12, q9, q10, #2 \n"// r22
+ "vmla.f32 q13, q11, %f20[1] \n"
+ "vmla.f32 q8, q11, %f19[0] \n"
+
+ "vext.32 q11, q9, q10, #3 \n"// r23
+ "vmla.f32 q7, q12, %e21[0] \n"
+ "vmla.f32 q14, q12, %f19[1] \n"
+
+ "vmla.f32 q13, q11, %e21[1] \n"
+ "vmla.f32 q8, q11, %e20[0] \n"
+
+ "pld [%6, #256] \n"
+
+ "vmla.f32 q7, q10, %f21[0] \n"
+ "vmla.f32 q14, q10, %e20[1] \n"
+
+ // r3
+ "vld1.f32 {d18-d21}, [%6] \n"// q9 q10 = r30 r34
+ "add %6, #16 \n"
+
+ "vext.32 q11, q9, q10, #1 \n"// r31
+ "vmla.f32 q13, q9, %f21[1] \n"
+ "vmla.f32 q8, q9, %f20[0] \n"
+
+ "vext.32 q12, q9, q10, #2 \n"// r32
+ "vmla.f32 q7, q11, %e22[0] \n"
+ "vmla.f32 q14, q11, %f20[1] \n"
+
+ "vext.32 q11, q9, q10, #3 \n"// r33
+ "vmla.f32 q13, q12, %e22[1] \n"
+ "vmla.f32 q8, q12, %e21[0] \n"
+
+ "vmla.f32 q7, q11, %f22[0] \n"
+ "vmla.f32 q14, q11, %e21[1] \n"
+
+ "pld [%7, #256] \n"
+
+ "vmla.f32 q13, q10, %f22[1] \n"
+ "vmla.f32 q8, q10, %f21[0] \n"
+
+ // r4
+ "vld1.f32 {d18-d21}, [%7] \n"// q9 q10 = r40 r44
+ "add %7, #16 \n"
+
+ "vext.32 q11, q9, q10, #1 \n"// r41
+ "vmla.f32 q7, q9, %e23[0] \n"
+ "vmla.f32 q14, q9, %f21[1] \n"
+
+ "vext.32 q12, q9, q10, #2 \n"// r42
+ "vmla.f32 q13, q11, %e23[1] \n"
+ "vmla.f32 q8, q11, %e22[0] \n"
+
+ "vext.32 q11, q9, q10, #3 \n"// r43
+ "vmla.f32 q7, q12, %f23[0] \n"
+ "vmla.f32 q14, q12, %e22[1] \n"
+
+ "vmla.f32 q13, q11, %f23[1] \n"
+ "vmla.f32 q8, q11, %f22[0] \n"
+
+ "pld [%3, #256] \n"
+
+ "vmla.f32 q7, q10, %e24[0] \n"
+ "vmla.f32 q14, q10, %f22[1] \n"
+
+ // r0 and r5
+ // r0 only feeds the first output row (q7/q13), r5 only the second (q8/q14)
+ "vld1.f32 {d18-d21}, [%3] \n"// q9 q10 = r00 r04
+ "add %3, #16 \n"
+
+ "vext.32 q11, q9, q10, #1 \n"// r01
+ "vmla.f32 q13, q11, %e18[1] \n"
+
+ "vext.32 q12, q9, q10, #2 \n"// r02
+ "vmla.f32 q7, q12, %f18[0] \n"
+
+ "vext.32 q11, q9, q10, #3 \n"// r03
+
+ "pld [%8, #256] \n"
+
+ "vmla.f32 q13, q11, %f18[1] \n"
+
+ // r5
+ "vld1.f32 {d22-d25}, [%8] \n"// q11 q12 = r50 r54
+ "add %8, #16 \n"
+
+ "vmla.f32 q8, q11, %e23[0] \n"
+ "vmla.f32 q14, q12, %e24[0] \n"
+
+ "vmla.f32 q7, q9, %e18[0] \n"
+ "vmla.f32 q13, q10, %e19[0] \n"
+
+ "vext.32 q9, q11, q12, #1 \n"// r51
+ "vext.32 q10, q11, q12, #2 \n"// r52
+
+ "vmla.f32 q14, q9, %e23[1] \n"
+
+ "vext.32 q9, q11, q12, #3 \n"// r53
+ "vmla.f32 q8, q10, %f23[0] \n"
+
+ "vmla.f32 q14, q9, %f23[1] \n"
+
+ // merge the two partial accumulators of the first row and store it
+ "vadd.f32 q7, q7, q13 \n"
+
+// "veor q13, q13 \n"
+
+ "vst1.f32 {d14-d15}, [%1]! \n"
+
+ // merge the two partial accumulators of the second row
+ "vadd.f32 q8, q8, q14 \n"
+
+ "pld [%1, #128] \n"
+
+ "vld1.f32 {d14-d15}, [%1] \n"// q7 = out
+
+// "veor q14, q14 \n"
+
+ "vst1.f32 {d16-d17}, [%2]! \n"
+
+ "subs %0, #1 \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(outptr), // %1
+ "=r"(outptr2), // %2
+ "=r"(r0), // %3
+ "=r"(r1), // %4
+ "=r"(r2), // %5
+ "=r"(r3), // %6
+ "=r"(r4), // %7
+ "=r"(r5) // %8
+ : "0"(nn),
+ "1"(outptr),
+ "2"(outptr2),
+ "3"(r0),
+ "4"(r1),
+ "5"(r2),
+ "6"(r3),
+ "7"(r4),
+ "8"(r5),
+ "w"(_k0123), // %18
+ "w"(_k4567), // %19
+ "w"(_k891011), // %20
+ "w"(_k12131415), // %21
+ "w"(_k16171819), // %22
+ "w"(_k20212223), // %23
+ "w"(_k24242424) // %24
+ : "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ // Tail loop for the two-output-row pass: one column per iteration.
+ // sum  = 5x5 window over rows r0..r4 (added to *outptr)
+ // sum2 = 5x5 window over rows r1..r5 (added to *outptr2)
+ // The scalar branch below is the reference for which kernel row pairs with
+ // which input row.
+ for (; remain>0; remain--)
+ {
+     float sum = 0;
+     float sum2 = 0;
+#if __ARM_NEON
+     // Columns 0..3 of every row are handled as vectors; column 4 is
+     // handled separately further down.
+     float32x4_t _r1 = vld1q_f32(r1);
+     float32x4_t _k1 = vld1q_f32(k1);
+     float32x4_t _sum = vmulq_f32(_r1, _k1);
+     float32x4_t _sum2 = vmulq_f32(_r1, _k0123);
+
+     float32x4_t _r2 = vld1q_f32(r2);
+     float32x4_t _k2 = vld1q_f32(k2);
+     _sum = vmlaq_f32(_sum, _r2, _k2);
+     _sum2 = vmlaq_f32(_sum2, _r2, _k1);
+
+     // r3 pairs with kernel row 3 for sum (scalar branch: r3[i] * k3[i]).
+     // Load k3 directly: _k20212223 is used with r4x in the vector loop,
+     // i.e. it carries kernel row 4, so it must not be used here.
+     float32x4_t _r3 = vld1q_f32(r3);
+     float32x4_t _k3 = vld1q_f32(k3);
+     _sum = vmlaq_f32(_sum, _r3, _k3);
+     _sum2 = vmlaq_f32(_sum2, _r3, _k2);
+
+     // r4 pairs with k4 for sum and with k3 for sum2 (scalar: r4[i] * k3[i])
+     float32x4_t _r4 = vld1q_f32(r4);
+     float32x4_t _k4 = vld1q_f32(k4);
+     _sum = vmlaq_f32(_sum, _r4, _k4);
+     _sum2 = vmlaq_f32(_sum2, _r4, _k3);
+
+     float32x4_t _r0 = vld1q_f32(r0);
+     _sum = vmlaq_f32(_sum, _r0, _k0123);
+     float32x4_t _r5 = vld1q_f32(r5);
+     _sum2 = vmlaq_f32(_sum2, _r5, _k4);
+
+     // Column 4 of kernel rows 0..3, packed into one vector. Initialised
+     // so the first vsetq_lane_f32 does not read an indeterminate value.
+     float32x4_t _k_t4 = vdupq_n_f32(0.f);
+     _k_t4 = vsetq_lane_f32(k0[4], _k_t4, 0);
+     _k_t4 = vsetq_lane_f32(k1[4], _k_t4, 1);
+     _k_t4 = vsetq_lane_f32(k2[4], _k_t4, 2);
+     _k_t4 = vsetq_lane_f32(k3[4], _k_t4, 3);
+
+     // Column 4 of input rows 0..3
+     float32x4_t _r_t4 = vdupq_n_f32(0.f);
+
+     _r_t4 = vsetq_lane_f32(r0[4], _r_t4, 0);
+     _r_t4 = vsetq_lane_f32(r1[4], _r_t4, 1);
+     _r_t4 = vsetq_lane_f32(r2[4], _r_t4, 2);
+     _r_t4 = vsetq_lane_f32(r3[4], _r_t4, 3);
+     _sum = vmlaq_f32(_sum, _r_t4, _k_t4);
+
+     // the 25th tap of the first window
+     sum = r4[4] * k4[4];
+
+     // rotate to rows 1..4 for the second window
+     _r_t4 = vextq_f32(_r_t4, _r_t4, 1);
+     _r_t4 = vsetq_lane_f32(r4[4], _r_t4, 3);
+     _sum2 = vmlaq_f32(_sum2, _r_t4, _k_t4);
+
+     sum2 = r5[4] * k4[4];
+
+     // horizontal sums: lane 0 -> sum, lane 1 -> sum2
+     float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
+     float32x2_t _ss2 = vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));
+     float32x2_t _ss_ss2 = vpadd_f32(_ss, _ss2);
+
+     sum += vget_lane_f32(_ss_ss2, 0);
+     sum2 += vget_lane_f32(_ss_ss2, 1);
+#else
+     sum += r0[0] * k0[0];
+     sum += r0[1] * k0[1];
+     sum += r0[2] * k0[2];
+     sum += r0[3] * k0[3];
+     sum += r0[4] * k0[4];
+
+     sum += r1[0] * k1[0];
+     sum += r1[1] * k1[1];
+     sum += r1[2] * k1[2];
+     sum += r1[3] * k1[3];
+     sum += r1[4] * k1[4];
+
+     sum += r2[0] * k2[0];
+     sum += r2[1] * k2[1];
+     sum += r2[2] * k2[2];
+     sum += r2[3] * k2[3];
+     sum += r2[4] * k2[4];
+
+     sum += r3[0] * k3[0];
+     sum += r3[1] * k3[1];
+     sum += r3[2] * k3[2];
+     sum += r3[3] * k3[3];
+     sum += r3[4] * k3[4];
+
+     sum += r4[0] * k4[0];
+     sum += r4[1] * k4[1];
+     sum += r4[2] * k4[2];
+     sum += r4[3] * k4[3];
+     sum += r4[4] * k4[4];
+
+     sum2 += r1[0] * k0[0];
+     sum2 += r1[1] * k0[1];
+     sum2 += r1[2] * k0[2];
+     sum2 += r1[3] * k0[3];
+     sum2 += r1[4] * k0[4];
+
+     sum2 += r2[0] * k1[0];
+     sum2 += r2[1] * k1[1];
+     sum2 += r2[2] * k1[2];
+     sum2 += r2[3] * k1[3];
+     sum2 += r2[4] * k1[4];
+
+     sum2 += r3[0] * k2[0];
+     sum2 += r3[1] * k2[1];
+     sum2 += r3[2] * k2[2];
+     sum2 += r3[3] * k2[3];
+     sum2 += r3[4] * k2[4];
+
+     sum2 += r4[0] * k3[0];
+     sum2 += r4[1] * k3[1];
+     sum2 += r4[2] * k3[2];
+     sum2 += r4[3] * k3[3];
+     sum2 += r4[4] * k3[4];
+
+     sum2 += r5[0] * k4[0];
+     sum2 += r5[1] * k4[1];
+     sum2 += r5[2] * k4[2];
+     sum2 += r5[3] * k4[3];
+     sum2 += r5[4] * k4[4];
+#endif // __ARM_NEON
+     *outptr += sum;
+     *outptr2 += sum2;
+
+     r0++;
+     r1++;
+     r2++;
+     r3++;
+     r4++;
+     r5++;
+     outptr++;
+     outptr2++;
+ }
+
+ r0 += 4 + w;
+ r1 += 4 + w;
+ r2 += 4 + w;
+ r3 += 4 + w;
+ r4 += 4 + w;
+ r5 += 4 + w;
+
+ outptr += outw;
+ outptr2 += outw;
+ }
+
+ for (; i < outh; i++)
+ {
+
+#if __ARM_NEON
+ int nn = outw >> 2;
+ int remain = outw - (nn << 2);
+#else
+ int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ // aarch64 fast path for a single output row: four outputs per iteration
+ // from input rows r0..r4.
+ for (; nn>0; nn--)
+ {
+     // Start from the values already stored in the output row so this pass
+     // adds onto them, as the ARMv7 asm path (loads q7 from outptr) and the
+     // tail loop ("*outptr += sum") do. Starting from zero would overwrite
+     // the earlier contents.
+     float32x4_t _sum = vld1q_f32(outptr);
+
+     // _rXn = row X shifted left by n columns, built from two adjacent loads
+     float32x4_t _r00 = vld1q_f32(r0);
+     float32x4_t _r04 = vld1q_f32(r0 + 4);
+     float32x4_t _r01 = vextq_f32(_r00, _r04, 1);
+     float32x4_t _r02 = vextq_f32(_r00, _r04, 2);
+     float32x4_t _r03 = vextq_f32(_r00, _r04, 3);
+
+     float32x4_t _r10 = vld1q_f32(r1);
+     float32x4_t _r14 = vld1q_f32(r1 + 4);
+     float32x4_t _r11 = vextq_f32(_r10, _r14, 1);
+     float32x4_t _r12 = vextq_f32(_r10, _r14, 2);
+     float32x4_t _r13 = vextq_f32(_r10, _r14, 3);
+
+     float32x4_t _r20 = vld1q_f32(r2);
+     float32x4_t _r24 = vld1q_f32(r2 + 4);
+     float32x4_t _r21 = vextq_f32(_r20, _r24, 1);
+     float32x4_t _r22 = vextq_f32(_r20, _r24, 2);
+     float32x4_t _r23 = vextq_f32(_r20, _r24, 3);
+
+     float32x4_t _r30 = vld1q_f32(r3);
+     float32x4_t _r34 = vld1q_f32(r3 + 4);
+     float32x4_t _r31 = vextq_f32(_r30, _r34, 1);
+     float32x4_t _r32 = vextq_f32(_r30, _r34, 2);
+     float32x4_t _r33 = vextq_f32(_r30, _r34, 3);
+
+     float32x4_t _r40 = vld1q_f32(r4);
+     float32x4_t _r44 = vld1q_f32(r4 + 4);
+     float32x4_t _r41 = vextq_f32(_r40, _r44, 1);
+     float32x4_t _r42 = vextq_f32(_r40, _r44, 2);
+     float32x4_t _r43 = vextq_f32(_r40, _r44, 3);
+
+     // 25 multiply-accumulates, one per kernel tap, in row-major tap order
+     _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0);
+     _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1);
+     _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2);
+     _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3);
+     _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0);
+
+     _sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1);
+     _sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2);
+     _sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3);
+     _sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0);
+     _sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1);
+
+     _sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2);
+     _sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3);
+     _sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0);
+     _sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1);
+     _sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2);
+
+     _sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3);
+     _sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0);
+     _sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1);
+     _sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2);
+     _sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3);
+
+     _sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0);
+     _sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1);
+     _sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2);
+     _sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3);
+     _sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0);
+
+     vst1q_f32(outptr, _sum);
+
+     r0 += 4;
+     r1 += 4;
+     r2 += 4;
+     r3 += 4;
+     r4 += 4;
+     outptr += 4;
+ }
+#else
+ // ARMv7 NEON version of the single-output-row loop (four outputs per
+ // iteration).
+ // Operands: %0 = nn, %1 = outptr, %2..%6 = r0..r4,
+ // %14..%20 = _k0123 .. _k24242424 (%eN / %fN select the low / high half).
+ // Registers: q7 = output values loaded from outptr plus part of the sum,
+ // q13/q14/q15 = further partial sums, all added into q7 before the store.
+ // q8/q9 hold two adjacent loads of the current row, q10..q12 its shifted
+ // views. Row r0 is loaded one iteration ahead (before the loop and at the
+ // bottom of each iteration); the final "sub %2, #16" undoes the pointer
+ // advance of the last, unused preload.
+ if (nn > 0)
+ {
+ asm volatile(
+// "veor q15, q15 \n"// _sum3 = 0;
+
+ "pld [%1, #128] \n"
+
+ "pld [%2, #256] \n"
+
+ "vld1.f32 {d16-d19}, [%2] \n"// _r00 = vld1q_f32(r0+j);
+ "add %2, #16 \n"
+
+ "0: \n"
+
+ "vld1.f32 {d14-d15}, [%1] \n"// _sum = vld1q_f32(outptr+j);
+ "veor q13, q13 \n"// _sum2 = 0;
+ "veor q14, q14 \n"// _sum3 = 0;
+
+ // row r0
+ "vext.32 q10, q8, q9, #1 \n"// _r01
+ "vext.32 q11, q8, q9, #2 \n"// _r02
+ "vext.32 q12, q8, q9, #3 \n"// _r03
+
+ "vmla.f32 q7, q8, %e14[0] \n"
+ "vmla.f32 q13, q10, %e14[1] \n"
+
+ "pld [%3, #256] \n"
+
+ "vmla.f32 q14, q11, %f14[0] \n"
+ // vmul (not vmla) initialises q15 for this iteration
+ "vmul.f32 q15, q12, %f14[1] \n"
+ "vmla.f32 q7, q9, %e15[0] \n"
+
+ // row r1
+ "vld1.f32 {d16-d19}, [%3] \n"
+ "add %3, #16 \n"
+ "vext.32 q10, q8, q9, #1 \n"
+ "vext.32 q11, q8, q9, #2 \n"
+ "vext.32 q12, q8, q9, #3 \n"
+
+ "vmla.f32 q7, q8, %e15[1] \n"
+ "vmla.f32 q13, q10, %f15[0] \n"
+
+ "pld [%4, #256] \n"
+
+ "vmla.f32 q14, q11, %f15[1] \n"
+ "vmla.f32 q15, q12, %e16[0] \n"
+ "vmla.f32 q7, q9, %e16[1] \n"
+
+ // row r2
+ "vld1.f32 {d16-d19}, [%4] \n"
+ "add %4, #16 \n"
+ "vext.32 q10, q8, q9, #1 \n"
+ "vext.32 q11, q8, q9, #2 \n"
+ "vext.32 q12, q8, q9, #3 \n"
+
+ "vmla.f32 q7, q8, %f16[0] \n"
+ "vmla.f32 q13, q10, %f16[1] \n"
+
+ "pld [%5, #256] \n"
+
+ "vmla.f32 q14, q11, %e17[0] \n"
+ "vmla.f32 q15, q12, %e17[1] \n"
+ "vmla.f32 q7, q9, %f17[0] \n"
+
+ // row r3
+ "vld1.f32 {d16-d19}, [%5] \n"
+ "add %5, #16 \n"
+ "vext.32 q10, q8, q9, #1 \n"
+ "vext.32 q11, q8, q9, #2 \n"
+ "vext.32 q12, q8, q9, #3 \n"
+
+ "vmla.f32 q7, q8, %f17[1] \n"
+ "vmla.f32 q13, q10, %e18[0] \n"
+
+ "pld [%6, #256] \n"
+
+ "vmla.f32 q14, q11, %e18[1] \n"
+ "vmla.f32 q15, q12, %f18[0] \n"
+ "vmla.f32 q7, q9, %f18[1] \n"
+
+ // row r4
+ "vld1.f32 {d16-d19}, [%6] \n"
+ "add %6, #16 \n"
+ "vext.32 q10, q8, q9, #1 \n"
+ "vext.32 q11, q8, q9, #2 \n"
+ "vext.32 q12, q8, q9, #3 \n"
+
+ "vmla.f32 q7, q8, %e19[0] \n"
+ "vmla.f32 q13, q10, %e19[1] \n"
+ "vmla.f32 q14, q11, %f19[0] \n"
+ "vmla.f32 q15, q12, %f19[1] \n"
+ "vmla.f32 q7, q9, %e20[0] \n"
+
+ // fold the four partial accumulators into q7
+ "vadd.f32 q14, q14, q15 \n"
+ "vadd.f32 q7, q7, q13 \n"
+// "veor q15, q15 \n"// _sum3 = 0;
+
+ "pld [%2, #256] \n"
+
+ "vadd.f32 q7, q7, q14 \n"
+
+ // preload row r0 for the next iteration
+ "vld1.f32 {d16-d19}, [%2] \n"// _r00 = vld1q_f32(r0+j);
+ "add %2, #16 \n"
+
+ "vst1.f32 {d14-d15}, [%1]! \n"
+
+ "pld [%1, #128] \n"
+
+ "subs %0, #1 \n"
+ "bne 0b \n"
+
+ "sub %2, #16 \n"
+ : "=r"(nn), // %0
+ "=r"(outptr), // %1
+ "=r"(r0), // %2
+ "=r"(r1), // %3
+ "=r"(r2), // %4
+ "=r"(r3), // %5
+ "=r"(r4) // %6
+ : "0"(nn),
+ "1"(outptr),
+ "2"(r0),
+ "3"(r1),
+ "4"(r2),
+ "5"(r3),
+ "6"(r4),
+ "w"(_k0123), // %14
+ "w"(_k4567), // %15
+ "w"(_k891011), // %16
+ "w"(_k12131415), // %17
+ "w"(_k16171819), // %18
+ "w"(_k20212223), // %19
+ "w"(_k24242424) // %20
+ : "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ // Tail loop for the single-output-row pass: one column per iteration,
+ // adding the 5x5 window over rows r0..r4 onto *outptr. The scalar branch
+ // is the reference for which kernel row pairs with which input row.
+ for (; remain>0; remain--)
+ {
+     float sum = 0;
+#if __ARM_NEON
+     // Columns 0..3 of every row are handled as vectors; column 4 is
+     // handled separately further down.
+     float32x4_t _r0 = vld1q_f32(r0);
+     float32x4_t _sum = vmulq_f32(_r0, _k0123);
+
+     float32x4_t _r1 = vld1q_f32(r1);
+     _sum = vmlaq_f32(_sum, _r1, vld1q_f32(k1));
+
+     float32x4_t _r2 = vld1q_f32(r2);
+     _sum = vmlaq_f32(_sum, _r2, vld1q_f32(k2));
+
+     // r3 pairs with kernel row 3 (scalar branch: r3[i] * k3[i]). Load k3
+     // directly: _k20212223 is used with r4x in the vector loop, i.e. it
+     // carries kernel row 4, so it must not be used here.
+     float32x4_t _r3 = vld1q_f32(r3);
+     _sum = vmlaq_f32(_sum, _r3, vld1q_f32(k3));
+
+     float32x4_t _r4 = vld1q_f32(r4);
+     _sum = vmlaq_f32(_sum, _r4, vld1q_f32(k4));
+
+     // Column 4 of kernel rows 0..3, packed into one vector. Initialised
+     // so the first vsetq_lane_f32 does not read an indeterminate value.
+     float32x4_t _k_t4 = vdupq_n_f32(0.f);
+     _k_t4 = vsetq_lane_f32(k0[4], _k_t4, 0);
+     _k_t4 = vsetq_lane_f32(k1[4], _k_t4, 1);
+     _k_t4 = vsetq_lane_f32(k2[4], _k_t4, 2);
+     _k_t4 = vsetq_lane_f32(k3[4], _k_t4, 3);
+
+     // Column 4 of input rows 0..3
+     float32x4_t _r_t4 = vdupq_n_f32(0.f);
+
+     _r_t4 = vsetq_lane_f32(r0[4], _r_t4, 0);
+     _r_t4 = vsetq_lane_f32(r1[4], _r_t4, 1);
+     _r_t4 = vsetq_lane_f32(r2[4], _r_t4, 2);
+     _r_t4 = vsetq_lane_f32(r3[4], _r_t4, 3);
+     _sum = vmlaq_f32(_sum, _r_t4, _k_t4);
+
+     // the 25th tap
+     sum = r4[4] * k4[4];
+
+     // horizontal sum of the four lanes
+     float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
+     _ss = vpadd_f32(_ss, _ss);
+
+     sum += vget_lane_f32(_ss, 0);
+#else
+     sum += r0[0] * k0[0];
+     sum += r0[1] * k0[1];
+     sum += r0[2] * k0[2];
+     sum += r0[3] * k0[3];
+     sum += r0[4] * k0[4];
+
+     sum += r1[0] * k1[0];
+     sum += r1[1] * k1[1];
+     sum += r1[2] * k1[2];
+     sum += r1[3] * k1[3];
+     sum += r1[4] * k1[4];
+
+     sum += r2[0] * k2[0];
+     sum += r2[1] * k2[1];
+     sum += r2[2] * k2[2];
+     sum += r2[3] * k2[3];
+     sum += r2[4] * k2[4];
+
+     sum += r3[0] * k3[0];
+     sum += r3[1] * k3[1];
+     sum += r3[2] * k3[2];
+     sum += r3[3] * k3[3];
+     sum += r3[4] * k3[4];
+
+     sum += r4[0] * k4[0];
+     sum += r4[1] * k4[1];
+     sum += r4[2] * k4[2];
+     sum += r4[3] * k4[3];
+     sum += r4[4] * k4[4];
+#endif
+     *outptr += sum;
+
+     r0++;
+     r1++;
+     r2++;
+     r3++;
+     r4++;
+     outptr++;
+ }
+
+ r0 += 4;
+ r1 += 4;
+ r2 += 4;
+ r3 += 4;
+ r4 += 4;
+
+ }
+
+ }
+ }
+
+}
+
+static void conv5x5s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int inch = bottom_blob.c;
+
+ int outw = top_blob.w;
+ int outh = top_blob.h;
+ int outch = top_blob.c;
+
+ const int tailstep = w - 2*outw + w;
+
+ const float* kernel = _kernel;
+ const float* bias = _bias;
+
+ #pragma omp parallel for
+ for (int p=0; p> 2;
+ int remain = outw - (nn << 2);
+#else
+ int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ for (; nn>0; nn--)
+ {
+ float32x4_t _sum = vdupq_n_f32(0.f);
+
+ float32x4x2_t _r00_02461357 = vld2q_f32(r0);
+ float32x4x2_t _r00nx2 = vld2q_f32(r0 + 8);
+ float32x4_t _r0_8101214 = _r00nx2.val[0];// 8 10 12 14
+ float32x4_t _r0_9111315 = _r00nx2.val[1];// 9 11 13 15
+ float32x4_t _r00 = _r00_02461357.val[0];// 0 2 4 6
+ float32x4_t _r01 = _r00_02461357.val[1];// 1 3 5 7
+ float32x4_t _r02 = vextq_f32(_r00, _r0_8101214, 1);// 2 4 6 8
+ float32x4_t _r03 = vextq_f32(_r01, _r0_9111315, 1);// 3 5 7 9
+ float32x4_t _r04 = vextq_f32(_r00, _r0_8101214, 2);// 4 6 8 10
+
+ float32x4x2_t _r10_02461357 = vld2q_f32(r1);
+ float32x4x2_t _r10nx2 = vld2q_f32(r1 + 8);
+ float32x4_t _r1_8101214 = _r10nx2.val[0];
+ float32x4_t _r1_9111315 = _r10nx2.val[1];
+ float32x4_t _r10 = _r10_02461357.val[0];
+ float32x4_t _r11 = _r10_02461357.val[1];
+ float32x4_t _r12 = vextq_f32(_r10, _r1_8101214, 1);
+ float32x4_t _r13 = vextq_f32(_r11, _r1_9111315, 1);
+ float32x4_t _r14 = vextq_f32(_r10, _r1_8101214, 2);
+
+ float32x4x2_t _r20_02461357 = vld2q_f32(r2);
+ float32x4x2_t _r20nx2 = vld2q_f32(r2 + 8);
+ float32x4_t _r2_8101214 = _r20nx2.val[0];
+ float32x4_t _r2_9111315 = _r20nx2.val[1];
+ float32x4_t _r20 = _r20_02461357.val[0];
+ float32x4_t _r21 = _r20_02461357.val[1];
+ float32x4_t _r22 = vextq_f32(_r20, _r2_8101214, 1);
+ float32x4_t _r23 = vextq_f32(_r21, _r2_9111315, 1);
+ float32x4_t _r24 = vextq_f32(_r20, _r2_8101214, 2);
+
+ float32x4x2_t _r30_02461357 = vld2q_f32(r3);
+ float32x4x2_t _r30nx2 = vld2q_f32(r3 + 8);
+ float32x4_t _r3_8101214 = _r30nx2.val[0];
+ float32x4_t _r3_9111315 = _r30nx2.val[1];
+ float32x4_t _r30 = _r30_02461357.val[0];
+ float32x4_t _r31 = _r30_02461357.val[1];
+ float32x4_t _r32 = vextq_f32(_r30, _r3_8101214, 1);
+ float32x4_t _r33 = vextq_f32(_r31, _r3_9111315, 1);
+ float32x4_t _r34 = vextq_f32(_r30, _r3_8101214, 2);
+
+ float32x4x2_t _r40_02461357 = vld2q_f32(r4);
+ float32x4x2_t _r40nx2 = vld2q_f32(r4 + 8);
+ float32x4_t _r4_8101214 = _r40nx2.val[0];
+ float32x4_t _r4_9111315 = _r40nx2.val[1];
+ float32x4_t _r40 = _r40_02461357.val[0];
+ float32x4_t _r41 = _r40_02461357.val[1];
+ float32x4_t _r42 = vextq_f32(_r40, _r4_8101214, 1);
+ float32x4_t _r43 = vextq_f32(_r41, _r4_9111315, 1);
+ float32x4_t _r44 = vextq_f32(_r40, _r4_8101214, 2);
+
+ _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2);
+ _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3);
+ _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0);
+
+ _sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2);
+ _sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3);
+ _sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1);
+
+ _sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2);
+ _sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3);
+ _sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2);
+
+ _sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3);
+ _sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2);
+ _sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3);
+
+ _sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2);
+ _sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3);
+ _sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0);
+
+ vst1q_f32(outptr, _sum);
+
+ r0 += 8;
+ r1 += 8;
+ r2 += 8;
+ r3 += 8;
+ r4 += 8;
+ outptr += 4;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "veor q15, q15 \n"// _sump3 = 0;
+ "pld [%1, #128] \n"
+ "veor q13, q13 \n"// _sump2 = 0;
+ "pld [%2, #256] \n"
+ "veor q14, q14 \n"// _sump3 = 0;
+
+ "vld2.f32 {d16-d19}, [%2]! \n"// q8 = 0 2 4 6 q9 = 1 3 5 7
+
+ "pld [%2, #256] \n"
+
+ "vld2.f32 {d20-d23}, [%2] \n"// q10 = 8 10 12 14 q11 = 9 11 13 15
+
+ "0: \n"
+
+ "vld1.f32 {d14-d15}, [%1] \n"// q7 = outptr
+
+ "vext.32 q12, q8, q10, #1 \n"// q12 = 2 4 6 8
+ "vext.32 q11, q9, q11, #1 \n"// q11 = 3 5 7 9
+ "vext.32 q10, q8, q10, #2 \n"// q10 = 4 6 8 10
+
+ "vmla.f32 q7, q8, %e14[0] \n"
+ "vmla.f32 q13, q9, %e14[1] \n"
+
+ "pld [%3, #256] \n"
+
+ "vmla.f32 q14, q12, %f14[0] \n"
+ "vmla.f32 q15, q11, %f14[1] \n"
+ "vmla.f32 q7, q10, %e15[0] \n"
+
+ "vld2.f32 {d16-d19}, [%3]! \n"
+
+ "pld [%3, #256] \n"
+
+ "vld2.f32 {d20-d23}, [%3] \n"
+ "vext.32 q12, q8, q10, #1 \n"
+ "vext.32 q11, q9, q11, #1 \n"
+ "vext.32 q10, q8, q10, #2 \n"
+
+ "vmla.f32 q7, q8, %e15[1] \n"
+ "vmla.f32 q13, q9, %f15[0] \n"
+
+ "pld [%4, #256] \n"
+
+ "vmla.f32 q14, q12, %f15[1] \n"
+ "vmla.f32 q15, q11, %e16[0] \n"
+ "vmla.f32 q7, q10, %e16[1] \n"
+
+ "vld2.f32 {d16-d19}, [%4]! \n"
+
+ "pld [%4, #256] \n"
+
+ "vld2.f32 {d20-d23}, [%4] \n"
+ "vext.32 q12, q8, q10, #1 \n"
+ "vext.32 q11, q9, q11, #1 \n"
+ "vext.32 q10, q8, q10, #2 \n"
+
+ "vmla.f32 q7, q8, %f16[0] \n"
+ "vmla.f32 q13, q9, %f16[1] \n"
+
+ "pld [%5, #256] \n"
+
+ "vmla.f32 q14, q12, %e17[0] \n"
+ "vmla.f32 q15, q11, %e17[1] \n"
+ "vmla.f32 q7, q10, %f17[0] \n"
+
+ "vld2.f32 {d16-d19}, [%5]! \n"
+
+ "pld [%5, #256] \n"
+
+ "vld2.f32 {d20-d23}, [%5] \n"
+ "vext.32 q12, q8, q10, #1 \n"
+ "vext.32 q11, q9, q11, #1 \n"
+ "vext.32 q10, q8, q10, #2 \n"
+
+ "vmla.f32 q7, q8, %f17[1] \n"
+ "vmla.f32 q13, q9, %e18[0] \n"
+
+ "pld [%6, #256] \n"
+
+ "vmla.f32 q14, q12, %e18[1] \n"
+ "vmla.f32 q15, q11, %f18[0] \n"
+ "vmla.f32 q7, q10, %f18[1] \n"
+
+ "vld2.f32 {d16-d19}, [%6]! \n"
+
+ "pld [%6, #256] \n"
+
+ "vld2.f32 {d20-d23}, [%6] \n"
+ "vext.32 q12, q8, q10, #1 \n"
+ "vext.32 q11, q9, q11, #1 \n"
+ "vext.32 q10, q8, q10, #2 \n"
+
+ "vmla.f32 q7, q8, %e19[0] \n"
+ "vmla.f32 q13, q9, %e19[1] \n"
+ "vmla.f32 q14, q12, %f19[0] \n"
+ "vmla.f32 q15, q11, %f19[1] \n"
+ "vmla.f32 q7, q10, %e20[0] \n"
+
+ "pld [%2, #256] \n"
+
+ "vld2.f32 {d16-d19}, [%2]! \n"// q8 = 0 2 4 6 q9 = 1 3 5 7
+
+ "vadd.f32 q14, q14, q15 \n"
+ "vadd.f32 q7, q7, q13 \n"
+ "veor q15, q15 \n"// _sump3 = 0;
+ "veor q13, q13 \n"// _sump2 = 0;
+
+ "pld [%2, #256] \n"
+
+ "vadd.f32 q7, q7, q14 \n"
+
+ "vld2.f32 {d20-d23}, [%2] \n"// q10 = 8 10 12 14 q11 = 9 11 13 15
+
+ "veor q14, q14 \n"// _sump3 = 0;
+
+ "vst1.f32 {d14-d15}, [%1]! \n"
+
+ "pld [%1, #128] \n"
+
+ "subs %0, #1 \n"
+ "bne 0b \n"
+
+ "sub %2, #32 \n"
+ : "=r"(nn), // %0
+ "=r"(outptr), // %1
+ "=r"(r0), // %2
+ "=r"(r1), // %3
+ "=r"(r2), // %4
+ "=r"(r3), // %5
+ "=r"(r4) // %6
+ : "0"(nn),
+ "1"(outptr),
+ "2"(r0),
+ "3"(r1),
+ "4"(r2),
+ "5"(r3),
+ "6"(r4),
+ "w"(_k0123), // %14
+ "w"(_k4567), // %15
+ "w"(_k891011), // %16
+ "w"(_k12131415), // %17
+ "w"(_k16171819), // %18
+ "w"(_k20212223), // %19
+ "w"(_k24242424) // %20
+ : "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+ float sum = 0;
+#if __ARM_NEON
+ float32x4_t _r0 = vld1q_f32(r0);
+ float32x4_t _sum = vmulq_f32(_r0, _k0123);
+
+ float32x4_t _r1 = vld1q_f32(r1);
+ _sum = vmlaq_f32(_sum, _r1, vld1q_f32(k1));
+
+ float32x4_t _r2 = vld1q_f32(r2);
+ _sum = vmlaq_f32(_sum, _r2, vld1q_f32(k2));
+
+ float32x4_t _r3 = vld1q_f32(r3);
+ _sum = vmlaq_f32(_sum, _r3, vld1q_f32(k3));
+
+ float32x4_t _r4 = vld1q_f32(r4);
+ _sum = vmlaq_f32(_sum, _r4, _k20212223);
+
+ sum += r0[4] * k0[4];
+ sum += r1[4] * k1[4];
+ sum += r2[4] * k2[4];
+ sum += r3[4] * k3[4];
+ sum += r4[4] * k4[4];
+
+ float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
+ _ss = vpadd_f32(_ss, _ss);
+
+ sum += vget_lane_f32(_ss, 0);
+#else
+ sum += r0[0] * k0[0];
+ sum += r0[1] * k0[1];
+ sum += r0[2] * k0[2];
+ sum += r0[3] * k0[3];
+ sum += r0[4] * k0[4];
+
+ sum += r1[0] * k1[0];
+ sum += r1[1] * k1[1];
+ sum += r1[2] * k1[2];
+ sum += r1[3] * k1[3];
+ sum += r1[4] * k1[4];
+
+ sum += r2[0] * k2[0];
+ sum += r2[1] * k2[1];
+ sum += r2[2] * k2[2];
+ sum += r2[3] * k2[3];
+ sum += r2[4] * k2[4];
+
+ sum += r3[0] * k3[0];
+ sum += r3[1] * k3[1];
+ sum += r3[2] * k3[2];
+ sum += r3[3] * k3[3];
+ sum += r3[4] * k3[4];
+
+ sum += r4[0] * k4[0];
+ sum += r4[1] * k4[1];
+ sum += r4[2] * k4[2];
+ sum += r4[3] * k4[3];
+ sum += r4[4] * k4[4];
+#endif
+ *outptr += sum;
+
+ r0 += 2;
+ r1 += 2;
+ r2 += 2;
+ r3 += 2;
+ r4 += 2;
+ outptr++;
+ }
+
+ r0 += tailstep;
+ r1 += tailstep;
+ r2 += tailstep;
+ r3 += tailstep;
+ r4 += tailstep;
+ }
+
+ }
+ }
+
+}
diff --git a/src/layer/arm/convolution_7x7.h b/src/layer/arm/convolution_7x7.h
new file mode 100644
index 00000000000..7c018b1b39d
--- /dev/null
+++ b/src/layer/arm/convolution_7x7.h
@@ -0,0 +1,1073 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{ // Stride-1 7x7 convolution kernel: adds a 7-row x 7-tap weighted sum to each output in place, 4 outputs per NEON iteration plus a scalar tail.
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int inch = bottom_blob.c;
+
+ int outw = top_blob.w;
+ int outh = top_blob.h;
+ int outch = top_blob.c;
+
+ const float* kernel = _kernel;
+ const float* bias = _bias;
+ // NOTE(review): the loop header below is truncated in this copy. outptr, r0..r6, k0..k6 and nn are used further down, but their declarations and the enclosing loop headers are not visible here.
+ #pragma omp parallel for
+ for (int p=0; p> 2; // <- truncated line (see note above); restore the missing text before building
+ int remain = outw - (nn << 2); // outputs left over after nn groups of 4
+#else
+ int remain = outw; // no NEON: every output goes through the scalar loop
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ for (; nn>0; nn--) // AArch64 intrinsics path: 4 outputs per iteration
+ {
+ float32x4_t _sum = vld1q_f32(outptr);
+ // Row 0: 7 taps taken from k0 (lane 3 of _k4567 is loaded but never used).
+ float32x4_t _k0123 = vld1q_f32(k0);
+ float32x4_t _k4567 = vld1q_f32(k0 + 4);
+ // 12 consecutive inputs are loaded; vextq_f32 builds the windows shifted by 1, 2, 3, 5 and 6 elements.
+ float32x4_t _r00 = vld1q_f32(r0);// 0 1 2 3
+ float32x4_t _r04 = vld1q_f32(r0 + 4);// 4 5 6 7
+ float32x4_t _r00n = vld1q_f32(r0 + 8);// 8 9 10 11
+ float32x4_t _r01 = vextq_f32(_r00, _r04, 1);// 1 2 3 4
+ float32x4_t _r02 = vextq_f32(_r00, _r04, 2);// 2 3 4 5
+ float32x4_t _r03 = vextq_f32(_r00, _r04, 3);// 3 4 5 6
+ float32x4_t _r05 = vextq_f32(_r04, _r00n, 1);// 5 6 7 8
+ float32x4_t _r06 = vextq_f32(_r04, _r00n, 2);// 6 7 8 9
+
+ _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2);
+ _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3);
+ _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r05, _k4567, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r06, _k4567, 2);
+ // Rows 1..6 repeat the same load / shift / fused-multiply-add pattern with (k1, r1) .. (k6, r6).
+ float32x4_t _k78910 = vld1q_f32(k1);
+ float32x4_t _k11121314 = vld1q_f32(k1 + 4);
+
+ float32x4_t _r10 = vld1q_f32(r1);
+ float32x4_t _r14 = vld1q_f32(r1 + 4);
+ float32x4_t _r10n = vld1q_f32(r1 + 8);
+ float32x4_t _r11 = vextq_f32(_r10, _r14, 1);
+ float32x4_t _r12 = vextq_f32(_r10, _r14, 2);
+ float32x4_t _r13 = vextq_f32(_r10, _r14, 3);
+ float32x4_t _r15 = vextq_f32(_r14, _r10n, 1);
+ float32x4_t _r16 = vextq_f32(_r14, _r10n, 2);
+
+ _sum = vfmaq_laneq_f32(_sum, _r10, _k78910, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r11, _k78910, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r12, _k78910, 2);
+ _sum = vfmaq_laneq_f32(_sum, _r13, _k78910, 3);
+ _sum = vfmaq_laneq_f32(_sum, _r14, _k11121314, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r15, _k11121314, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r16, _k11121314, 2);
+
+ float32x4_t _k14151617 = vld1q_f32(k2);
+ float32x4_t _k18192021 = vld1q_f32(k2 + 4);
+
+ float32x4_t _r20 = vld1q_f32(r2);
+ float32x4_t _r24 = vld1q_f32(r2 + 4);
+ float32x4_t _r20n = vld1q_f32(r2 + 8);
+ float32x4_t _r21 = vextq_f32(_r20, _r24, 1);
+ float32x4_t _r22 = vextq_f32(_r20, _r24, 2);
+ float32x4_t _r23 = vextq_f32(_r20, _r24, 3);
+ float32x4_t _r25 = vextq_f32(_r24, _r20n, 1);
+ float32x4_t _r26 = vextq_f32(_r24, _r20n, 2);
+
+ _sum = vfmaq_laneq_f32(_sum, _r20, _k14151617, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r21, _k14151617, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r22, _k14151617, 2);
+ _sum = vfmaq_laneq_f32(_sum, _r23, _k14151617, 3);
+ _sum = vfmaq_laneq_f32(_sum, _r24, _k18192021, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r25, _k18192021, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r26, _k18192021, 2);
+
+ float32x4_t _k21222324 = vld1q_f32(k3);
+ float32x4_t _k25262728 = vld1q_f32(k3 + 4);
+
+ float32x4_t _r30 = vld1q_f32(r3);
+ float32x4_t _r34 = vld1q_f32(r3 + 4);
+ float32x4_t _r30n = vld1q_f32(r3 + 8);
+ float32x4_t _r31 = vextq_f32(_r30, _r34, 1);
+ float32x4_t _r32 = vextq_f32(_r30, _r34, 2);
+ float32x4_t _r33 = vextq_f32(_r30, _r34, 3);
+ float32x4_t _r35 = vextq_f32(_r34, _r30n, 1);
+ float32x4_t _r36 = vextq_f32(_r34, _r30n, 2);
+
+ _sum = vfmaq_laneq_f32(_sum, _r30, _k21222324, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r31, _k21222324, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r32, _k21222324, 2);
+ _sum = vfmaq_laneq_f32(_sum, _r33, _k21222324, 3);
+ _sum = vfmaq_laneq_f32(_sum, _r34, _k25262728, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r35, _k25262728, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r36, _k25262728, 2);
+
+ float32x4_t _k28293031 = vld1q_f32(k4);
+ float32x4_t _k32333435 = vld1q_f32(k4 + 4);
+
+ float32x4_t _r40 = vld1q_f32(r4);
+ float32x4_t _r44 = vld1q_f32(r4 + 4);
+ float32x4_t _r40n = vld1q_f32(r4 + 8);
+ float32x4_t _r41 = vextq_f32(_r40, _r44, 1);
+ float32x4_t _r42 = vextq_f32(_r40, _r44, 2);
+ float32x4_t _r43 = vextq_f32(_r40, _r44, 3);
+ float32x4_t _r45 = vextq_f32(_r44, _r40n, 1);
+ float32x4_t _r46 = vextq_f32(_r44, _r40n, 2);
+
+ _sum = vfmaq_laneq_f32(_sum, _r40, _k28293031, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r41, _k28293031, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r42, _k28293031, 2);
+ _sum = vfmaq_laneq_f32(_sum, _r43, _k28293031, 3);
+ _sum = vfmaq_laneq_f32(_sum, _r44, _k32333435, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r45, _k32333435, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r46, _k32333435, 2);
+
+ float32x4_t _k35363738 = vld1q_f32(k5);
+ float32x4_t _k39404142 = vld1q_f32(k5 + 4);
+
+ float32x4_t _r50 = vld1q_f32(r5);
+ float32x4_t _r54 = vld1q_f32(r5 + 4);
+ float32x4_t _r50n = vld1q_f32(r5 + 8);
+ float32x4_t _r51 = vextq_f32(_r50, _r54, 1);
+ float32x4_t _r52 = vextq_f32(_r50, _r54, 2);
+ float32x4_t _r53 = vextq_f32(_r50, _r54, 3);
+ float32x4_t _r55 = vextq_f32(_r54, _r50n, 1);
+ float32x4_t _r56 = vextq_f32(_r54, _r50n, 2);
+
+ _sum = vfmaq_laneq_f32(_sum, _r50, _k35363738, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r51, _k35363738, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r52, _k35363738, 2);
+ _sum = vfmaq_laneq_f32(_sum, _r53, _k35363738, 3);
+ _sum = vfmaq_laneq_f32(_sum, _r54, _k39404142, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r55, _k39404142, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r56, _k39404142, 2);
+ // NOTE(review): vld1q_f32(k6 + 4) reads k6[4..7] although only k6[4..6] are used; if k6 is the final kernel row this touches one float past the taps - confirm the buffer is padded.
+ float32x4_t _k42434445 = vld1q_f32(k6);
+ float32x4_t _k46474849 = vld1q_f32(k6 + 4);
+
+ float32x4_t _r60 = vld1q_f32(r6);
+ float32x4_t _r64 = vld1q_f32(r6 + 4);
+ float32x4_t _r60n = vld1q_f32(r6 + 8);
+ float32x4_t _r61 = vextq_f32(_r60, _r64, 1);
+ float32x4_t _r62 = vextq_f32(_r60, _r64, 2);
+ float32x4_t _r63 = vextq_f32(_r60, _r64, 3);
+ float32x4_t _r65 = vextq_f32(_r64, _r60n, 1);
+ float32x4_t _r66 = vextq_f32(_r64, _r60n, 2);
+
+ _sum = vfmaq_laneq_f32(_sum, _r60, _k42434445, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r61, _k42434445, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r62, _k42434445, 2);
+ _sum = vfmaq_laneq_f32(_sum, _r63, _k42434445, 3);
+ _sum = vfmaq_laneq_f32(_sum, _r64, _k46474849, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r65, _k46474849, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r66, _k46474849, 2);
+
+ vst1q_f32(outptr, _sum);
+ // 4 outputs were produced at stride 1, so every pointer moves by 4 floats.
+ r0 += 4;
+ r1 += 4;
+ r2 += 4;
+ r3 += 4;
+ r4 += 4;
+ r5 += 4;
+ r6 += 4;
+ outptr += 4;
+ }
+#else
+ if (nn > 0) // ARMv7 inline-asm path; guarded because the asm loop runs its body before testing the counter
+ {
+ asm volatile(
+ "0: \n"
+ // Operands: %0 = nn, %1 = outptr, %2..%8 = r0..r6, %9 = k0. q12 holds the loaded outputs; q13..q15 are zeroed partial sums.
+ "pld [%1, #256] \n"
+ "vld1.f32 {d24-d25}, [%1] \n"// _sum
+ "veor q13, q13 \n"// _sum2 = 0;
+ "veor q14, q14 \n"// _sum3 = 0;
+ "veor q15, q15 \n"// _sum4 = 0;
+ // Row 0. %9 is advanced by 28 bytes (7 floats) after each 8-float kernel-row load.
+ "pld [%9, #256] \n"
+ "vld1.f32 {d8-d11}, [%9] \n"// q4 q5 = k0123 k4567
+ "add %9, #28 \n"
+ // The "!" load post-increments the row pointer by the 16 bytes it read; the following 32-byte load has no writeback, so each row pointer gains 4 floats per iteration.
+ "pld [%2, #128] \n"
+ "vld1.f32 {d0-d1}, [%2]! \n"// q0 = 0 1 2 3
+ "vmla.f32 q12, q0, d8[0] \n"
+
+ "pld [%2, #256] \n"
+ "vld1.f32 {d4-d7}, [%2] \n"// q2 = 4 5 6 7 q3 = 8 9 10 11
+ "vmla.f32 q13, q2, d10[0] \n"
+
+ "vext.32 q1, q0, q2, #1 \n"// q1 = 1 2 3 4
+ "vext.32 q10, q2, q3, #1 \n"// q10= 5 6 7 8
+ "vmla.f32 q14, q1, d8[1] \n"
+ "vmla.f32 q15, q10, d10[1] \n"
+
+ "vext.32 q8, q0, q2, #2 \n"// q8 = 2 3 4 5
+ "vext.32 q11, q2, q3, #2 \n"// q11= 6 7 8 9
+ "vmla.f32 q12, q8, d9[0] \n"
+ "vmla.f32 q13, q11, d11[0] \n"
+
+ "vext.32 q9, q0, q2, #3 \n"// q9 = 3 4 5 6
+ "vmla.f32 q14, q9, d9[1] \n"
+ // Row 1 (kernel taps in q6/q7, inputs from %3).
+ "pld [%9, #256] \n"
+ "vld1.f32 {d12-d15}, [%9] \n"// q6 q7 = k78910 k11121314
+ "add %9, #28 \n"
+
+ "pld [%3, #128] \n"
+ "vld1.f32 {d0-d1}, [%3]! \n"
+ "vmla.f32 q15, q0, d12[0] \n"
+
+ "pld [%3, #256] \n"
+ "vld1.f32 {d4-d7}, [%3] \n"
+ "vmla.f32 q12, q2, d14[0] \n"
+
+ "vext.32 q1, q0, q2, #1 \n"
+ "vext.32 q10, q2, q3, #1 \n"
+ "vmla.f32 q13, q1, d12[1] \n"
+ "vmla.f32 q14, q10, d14[1] \n"
+
+ "vext.32 q8, q0, q2, #2 \n"
+ "vext.32 q11, q2, q3, #2 \n"
+ "vmla.f32 q15, q8, d13[0] \n"
+ "vmla.f32 q12, q11, d15[0] \n"
+
+ "vext.32 q9, q0, q2, #3 \n"
+ "vmla.f32 q13, q9, d13[1] \n"
+ // Row 2 (kernel taps in q4/q5, inputs from %4).
+ "pld [%9, #256] \n"
+ "vld1.f32 {d8-d11}, [%9] \n"// q4 q5 = k14151617 k18192021
+ "add %9, #28 \n"
+
+ "pld [%4, #128] \n"
+ "vld1.f32 {d0-d1}, [%4]! \n"
+ "vmla.f32 q14, q0, d8[0] \n"
+
+ "pld [%4, #256] \n"
+ "vld1.f32 {d4-d7}, [%4] \n"
+ "vmla.f32 q15, q2, d10[0] \n"
+
+ "vext.32 q1, q0, q2, #1 \n"
+ "vext.32 q10, q2, q3, #1 \n"
+ "vmla.f32 q12, q1, d8[1] \n"
+ "vmla.f32 q13, q10, d10[1] \n"
+
+ "vext.32 q8, q0, q2, #2 \n"
+ "vext.32 q11, q2, q3, #2 \n"
+ "vmla.f32 q14, q8, d9[0] \n"
+ "vmla.f32 q15, q11, d11[0] \n"
+
+ "vext.32 q9, q0, q2, #3 \n"
+ "vmla.f32 q12, q9, d9[1] \n"
+ // Row 3 (kernel taps in q6/q7, inputs from %5).
+ "pld [%9, #256] \n"
+ "vld1.f32 {d12-d15}, [%9] \n"// q6 q7 = k21222324 k25262728
+ "add %9, #28 \n"
+
+ "pld [%5, #128] \n"
+ "vld1.f32 {d0-d1}, [%5]! \n"
+ "vmla.f32 q13, q0, d12[0] \n"
+
+ "pld [%5, #256] \n"
+ "vld1.f32 {d4-d7}, [%5] \n"
+ "vmla.f32 q14, q2, d14[0] \n"
+
+ "vext.32 q1, q0, q2, #1 \n"
+ "vext.32 q10, q2, q3, #1 \n"
+ "vmla.f32 q15, q1, d12[1] \n"
+ "vmla.f32 q12, q10, d14[1] \n"
+
+ "vext.32 q8, q0, q2, #2 \n"
+ "vext.32 q11, q2, q3, #2 \n"
+ "vmla.f32 q13, q8, d13[0] \n"
+ "vmla.f32 q14, q11, d15[0] \n"
+
+ "vext.32 q9, q0, q2, #3 \n"
+ "vmla.f32 q15, q9, d13[1] \n"
+ // Row 4 (kernel taps in q4/q5, inputs from %6).
+ "pld [%9, #256] \n"
+ "vld1.f32 {d8-d11}, [%9] \n"// q4 q5 = k28293031 k32333435
+ "add %9, #28 \n"
+
+ "pld [%6, #128] \n"
+ "vld1.f32 {d0-d1}, [%6]! \n"
+ "vmla.f32 q12, q0, d8[0] \n"
+
+ "pld [%6, #256] \n"
+ "vld1.f32 {d4-d7}, [%6] \n"
+ "vmla.f32 q13, q2, d10[0] \n"
+
+ "vext.32 q1, q0, q2, #1 \n"
+ "vext.32 q10, q2, q3, #1 \n"
+ "vmla.f32 q14, q1, d8[1] \n"
+ "vmla.f32 q15, q10, d10[1] \n"
+
+ "vext.32 q8, q0, q2, #2 \n"
+ "vext.32 q11, q2, q3, #2 \n"
+ "vmla.f32 q12, q8, d9[0] \n"
+ "vmla.f32 q13, q11, d11[0] \n"
+
+ "vext.32 q9, q0, q2, #3 \n"
+ "vmla.f32 q14, q9, d9[1] \n"
+ // Row 5 (kernel taps in q6/q7, inputs from %7).
+ "pld [%9, #256] \n"
+ "vld1.f32 {d12-d15}, [%9] \n"// q6 q7 = k35363738 k39404142
+ "add %9, #28 \n"
+
+ "pld [%7, #128] \n"
+ "vld1.f32 {d0-d1}, [%7]! \n"
+ "vmla.f32 q15, q0, d12[0] \n"
+
+ "pld [%7, #256] \n"
+ "vld1.f32 {d4-d7}, [%7] \n"
+ "vmla.f32 q12, q2, d14[0] \n"
+
+ "vext.32 q1, q0, q2, #1 \n"
+ "vext.32 q10, q2, q3, #1 \n"
+ "vmla.f32 q13, q1, d12[1] \n"
+ "vmla.f32 q14, q10, d14[1] \n"
+
+ "vext.32 q8, q0, q2, #2 \n"
+ "vext.32 q11, q2, q3, #2 \n"
+ "vmla.f32 q15, q8, d13[0] \n"
+ "vmla.f32 q12, q11, d15[0] \n"
+
+ "vext.32 q9, q0, q2, #3 \n"
+ "vmla.f32 q13, q9, d13[1] \n"
+ // Row 6 (last): %9 is rewound by 168 bytes = 6 * 28. NOTE(review): this 8-float load covers floats 42..49 from the original %9, one past a 49-tap kernel - confirm padding.
+ "pld [%9, #256] \n"
+ "vld1.f32 {d8-d11}, [%9] \n"// q4 q5 = k42434445 k46474849
+ "sub %9, #168 \n"// restore k0
+
+ "pld [%8, #128] \n"
+ "vld1.f32 {d0-d1}, [%8]! \n"
+ "vmla.f32 q14, q0, d8[0] \n"
+
+ "pld [%8, #256] \n"
+ "vld1.f32 {d4-d7}, [%8] \n"
+ "vmla.f32 q15, q2, d10[0] \n"
+
+ "vext.32 q1, q0, q2, #1 \n"
+ "vext.32 q10, q2, q3, #1 \n"
+ "vmla.f32 q12, q1, d8[1] \n"
+ "vmla.f32 q13, q10, d10[1] \n"
+
+ "vext.32 q8, q0, q2, #2 \n"
+ "vext.32 q11, q2, q3, #2 \n"
+ "vmla.f32 q14, q8, d9[0] \n"
+ "vmla.f32 q15, q11, d11[0] \n"
+
+ "vext.32 q9, q0, q2, #3 \n"
+ "vmla.f32 q12, q9, d9[1] \n"
+ // Fold the three partial sums into q12, then store 4 outputs and post-increment outptr.
+ "vadd.f32 q13, q13, q14 \n"
+ "vadd.f32 q13, q13, q15 \n"
+ "vadd.f32 q12, q12, q13 \n"
+
+ "vst1.f32 {d24-d25}, [%1]! \n"
+
+ "subs %0, #1 \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(outptr), // %1
+ "=r"(r0), // %2
+ "=r"(r1), // %3
+ "=r"(r2), // %4
+ "=r"(r3), // %5
+ "=r"(r4), // %6
+ "=r"(r5), // %7
+ "=r"(r6), // %8
+ "=r"(k0) // %9
+ : "0"(nn),
+ "1"(outptr),
+ "2"(r0),
+ "3"(r1),
+ "4"(r2),
+ "5"(r3),
+ "6"(r4),
+ "7"(r5),
+ "8"(r6),
+ "9"(k0)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+
+ for (; remain>0; remain--) // scalar tail: one output per iteration
+ {
+ float sum = 0;
+ // 49 multiply-adds: 7 taps from each of the 7 input rows.
+ sum += r0[0] * k0[0];
+ sum += r0[1] * k0[1];
+ sum += r0[2] * k0[2];
+ sum += r0[3] * k0[3];
+ sum += r0[4] * k0[4];
+ sum += r0[5] * k0[5];
+ sum += r0[6] * k0[6];
+
+ sum += r1[0] * k1[0];
+ sum += r1[1] * k1[1];
+ sum += r1[2] * k1[2];
+ sum += r1[3] * k1[3];
+ sum += r1[4] * k1[4];
+ sum += r1[5] * k1[5];
+ sum += r1[6] * k1[6];
+
+ sum += r2[0] * k2[0];
+ sum += r2[1] * k2[1];
+ sum += r2[2] * k2[2];
+ sum += r2[3] * k2[3];
+ sum += r2[4] * k2[4];
+ sum += r2[5] * k2[5];
+ sum += r2[6] * k2[6];
+
+ sum += r3[0] * k3[0];
+ sum += r3[1] * k3[1];
+ sum += r3[2] * k3[2];
+ sum += r3[3] * k3[3];
+ sum += r3[4] * k3[4];
+ sum += r3[5] * k3[5];
+ sum += r3[6] * k3[6];
+
+ sum += r4[0] * k4[0];
+ sum += r4[1] * k4[1];
+ sum += r4[2] * k4[2];
+ sum += r4[3] * k4[3];
+ sum += r4[4] * k4[4];
+ sum += r4[5] * k4[5];
+ sum += r4[6] * k4[6];
+
+ sum += r5[0] * k5[0];
+ sum += r5[1] * k5[1];
+ sum += r5[2] * k5[2];
+ sum += r5[3] * k5[3];
+ sum += r5[4] * k5[4];
+ sum += r5[5] * k5[5];
+ sum += r5[6] * k5[6];
+
+ sum += r6[0] * k6[0];
+ sum += r6[1] * k6[1];
+ sum += r6[2] * k6[2];
+ sum += r6[3] * k6[3];
+ sum += r6[4] * k6[4];
+ sum += r6[5] * k6[5];
+ sum += r6[6] * k6[6];
+
+ *outptr += sum;
+
+ r0++;
+ r1++;
+ r2++;
+ r3++;
+ r4++;
+ r5++;
+ r6++;
+ outptr++;
+ }
+ // The row pointers have advanced outw floats; skip 6 more. NOTE(review): this lands on the next input row only if w == outw + 6 - confirm with the caller.
+ r0 += 6;
+ r1 += 6;
+ r2 += 6;
+ r3 += 6;
+ r4 += 6;
+ r5 += 6;
+ r6 += 6;
+
+ }
+
+ }
+ }
+
+}
+
+static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{ // Stride-2 7x7 convolution kernel: adds a 7-row x 7-tap weighted sum to each output in place, 4 outputs (8 input columns) per NEON iteration plus a scalar tail.
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int inch = bottom_blob.c;
+
+ int outw = top_blob.w;
+ int outh = top_blob.h;
+ int outch = top_blob.c;
+
+ const int tailstep = w - 2*outw + w; // = 2*w - 2*outw: added after a row (during which each row pointer moved 2*outw floats), so the total step per output row is 2*w floats
+
+ const float* kernel = _kernel;
+ const float* bias = _bias;
+ // NOTE(review): the loop header below is truncated in this copy. outptr, r0..r6, k0..k6 and nn are used further down, but their declarations and the enclosing loop headers are not visible here.
+ #pragma omp parallel for
+ for (int p=0; p> 2; // <- truncated line (see note above); restore the missing text before building
+ int remain = outw - (nn << 2); // outputs left over after nn groups of 4
+#else
+ int remain = outw; // no NEON: every output goes through the scalar loop
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ for (; nn>0; nn--) // AArch64 intrinsics path: 4 outputs per iteration
+ {
+ float32x4_t _sum = vld1q_f32(outptr);
+ // Row 0: 7 taps taken from k0 (lane 3 of _k4567 is loaded but never used).
+ float32x4_t _k0123 = vld1q_f32(k0);
+ float32x4_t _k4567 = vld1q_f32(k0 + 4);
+ // vld2q_f32 de-interleaves 16 inputs into even- and odd-indexed lanes; vextq_f32 then forms the stride-2 windows for taps 2..6.
+ float32x4x2_t _r00_02461357 = vld2q_f32(r0);
+ float32x4x2_t _r00nx2 = vld2q_f32(r0 + 8);
+ float32x4_t _r0_8101214 = _r00nx2.val[0];// 8 10 12 14
+ float32x4_t _r0_9111315 = _r00nx2.val[1];// 9 11 13 15
+ float32x4_t _r00 = _r00_02461357.val[0];// 0 2 4 6
+ float32x4_t _r01 = _r00_02461357.val[1];// 1 3 5 7
+ float32x4_t _r02 = vextq_f32(_r00, _r0_8101214, 1);// 2 4 6 8
+ float32x4_t _r03 = vextq_f32(_r01, _r0_9111315, 1);// 3 5 7 9
+ float32x4_t _r04 = vextq_f32(_r00, _r0_8101214, 2);// 4 6 8 10
+ float32x4_t _r05 = vextq_f32(_r01, _r0_9111315, 2);// 5 7 9 11
+ float32x4_t _r06 = vextq_f32(_r00, _r0_8101214, 3);// 6 8 10 12
+
+ _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2);
+ _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3);
+ _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r05, _k4567, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r06, _k4567, 2);
+ // Rows 1..6 repeat the same load / shift / fused-multiply-add pattern with (k1, r1) .. (k6, r6).
+ float32x4_t _k78910 = vld1q_f32(k1);
+ float32x4_t _k11121314 = vld1q_f32(k1 + 4);
+
+ float32x4x2_t _r10_02461357 = vld2q_f32(r1);
+ float32x4x2_t _r10nx2 = vld2q_f32(r1 + 8);
+ float32x4_t _r1_8101214 = _r10nx2.val[0];
+ float32x4_t _r1_9111315 = _r10nx2.val[1];
+ float32x4_t _r10 = _r10_02461357.val[0];
+ float32x4_t _r11 = _r10_02461357.val[1];
+ float32x4_t _r12 = vextq_f32(_r10, _r1_8101214, 1);
+ float32x4_t _r13 = vextq_f32(_r11, _r1_9111315, 1);
+ float32x4_t _r14 = vextq_f32(_r10, _r1_8101214, 2);
+ float32x4_t _r15 = vextq_f32(_r11, _r1_9111315, 2);
+ float32x4_t _r16 = vextq_f32(_r10, _r1_8101214, 3);
+
+ _sum = vfmaq_laneq_f32(_sum, _r10, _k78910, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r11, _k78910, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r12, _k78910, 2);
+ _sum = vfmaq_laneq_f32(_sum, _r13, _k78910, 3);
+ _sum = vfmaq_laneq_f32(_sum, _r14, _k11121314, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r15, _k11121314, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r16, _k11121314, 2);
+
+ float32x4_t _k14151617 = vld1q_f32(k2);
+ float32x4_t _k18192021 = vld1q_f32(k2 + 4);
+
+ float32x4x2_t _r20_02461357 = vld2q_f32(r2);
+ float32x4x2_t _r20nx2 = vld2q_f32(r2 + 8);
+ float32x4_t _r2_8101214 = _r20nx2.val[0];
+ float32x4_t _r2_9111315 = _r20nx2.val[1];
+ float32x4_t _r20 = _r20_02461357.val[0];
+ float32x4_t _r21 = _r20_02461357.val[1];
+ float32x4_t _r22 = vextq_f32(_r20, _r2_8101214, 1);
+ float32x4_t _r23 = vextq_f32(_r21, _r2_9111315, 1);
+ float32x4_t _r24 = vextq_f32(_r20, _r2_8101214, 2);
+ float32x4_t _r25 = vextq_f32(_r21, _r2_9111315, 2);
+ float32x4_t _r26 = vextq_f32(_r20, _r2_8101214, 3);
+
+ _sum = vfmaq_laneq_f32(_sum, _r20, _k14151617, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r21, _k14151617, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r22, _k14151617, 2);
+ _sum = vfmaq_laneq_f32(_sum, _r23, _k14151617, 3);
+ _sum = vfmaq_laneq_f32(_sum, _r24, _k18192021, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r25, _k18192021, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r26, _k18192021, 2);
+
+ float32x4_t _k21222324 = vld1q_f32(k3);
+ float32x4_t _k25262728 = vld1q_f32(k3 + 4);
+
+ float32x4x2_t _r30_02461357 = vld2q_f32(r3);
+ float32x4x2_t _r30nx2 = vld2q_f32(r3 + 8);
+ float32x4_t _r3_8101214 = _r30nx2.val[0];
+ float32x4_t _r3_9111315 = _r30nx2.val[1];
+ float32x4_t _r30 = _r30_02461357.val[0];
+ float32x4_t _r31 = _r30_02461357.val[1];
+ float32x4_t _r32 = vextq_f32(_r30, _r3_8101214, 1);
+ float32x4_t _r33 = vextq_f32(_r31, _r3_9111315, 1);
+ float32x4_t _r34 = vextq_f32(_r30, _r3_8101214, 2);
+ float32x4_t _r35 = vextq_f32(_r31, _r3_9111315, 2);
+ float32x4_t _r36 = vextq_f32(_r30, _r3_8101214, 3);
+
+ _sum = vfmaq_laneq_f32(_sum, _r30, _k21222324, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r31, _k21222324, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r32, _k21222324, 2);
+ _sum = vfmaq_laneq_f32(_sum, _r33, _k21222324, 3);
+ _sum = vfmaq_laneq_f32(_sum, _r34, _k25262728, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r35, _k25262728, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r36, _k25262728, 2);
+
+ float32x4_t _k28293031 = vld1q_f32(k4);
+ float32x4_t _k32333435 = vld1q_f32(k4 + 4);
+
+ float32x4x2_t _r40_02461357 = vld2q_f32(r4);
+ float32x4x2_t _r40nx2 = vld2q_f32(r4 + 8);
+ float32x4_t _r4_8101214 = _r40nx2.val[0];
+ float32x4_t _r4_9111315 = _r40nx2.val[1];
+ float32x4_t _r40 = _r40_02461357.val[0];
+ float32x4_t _r41 = _r40_02461357.val[1];
+ float32x4_t _r42 = vextq_f32(_r40, _r4_8101214, 1);
+ float32x4_t _r43 = vextq_f32(_r41, _r4_9111315, 1);
+ float32x4_t _r44 = vextq_f32(_r40, _r4_8101214, 2);
+ float32x4_t _r45 = vextq_f32(_r41, _r4_9111315, 2);
+ float32x4_t _r46 = vextq_f32(_r40, _r4_8101214, 3);
+
+ _sum = vfmaq_laneq_f32(_sum, _r40, _k28293031, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r41, _k28293031, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r42, _k28293031, 2);
+ _sum = vfmaq_laneq_f32(_sum, _r43, _k28293031, 3);
+ _sum = vfmaq_laneq_f32(_sum, _r44, _k32333435, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r45, _k32333435, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r46, _k32333435, 2);
+
+ float32x4_t _k35363738 = vld1q_f32(k5);
+ float32x4_t _k39404142 = vld1q_f32(k5 + 4);
+
+ float32x4x2_t _r50_02461357 = vld2q_f32(r5);
+ float32x4x2_t _r50nx2 = vld2q_f32(r5 + 8);
+ float32x4_t _r5_8101214 = _r50nx2.val[0];
+ float32x4_t _r5_9111315 = _r50nx2.val[1];
+ float32x4_t _r50 = _r50_02461357.val[0];
+ float32x4_t _r51 = _r50_02461357.val[1];
+ float32x4_t _r52 = vextq_f32(_r50, _r5_8101214, 1);
+ float32x4_t _r53 = vextq_f32(_r51, _r5_9111315, 1);
+ float32x4_t _r54 = vextq_f32(_r50, _r5_8101214, 2);
+ float32x4_t _r55 = vextq_f32(_r51, _r5_9111315, 2);
+ float32x4_t _r56 = vextq_f32(_r50, _r5_8101214, 3);
+
+ _sum = vfmaq_laneq_f32(_sum, _r50, _k35363738, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r51, _k35363738, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r52, _k35363738, 2);
+ _sum = vfmaq_laneq_f32(_sum, _r53, _k35363738, 3);
+ _sum = vfmaq_laneq_f32(_sum, _r54, _k39404142, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r55, _k39404142, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r56, _k39404142, 2);
+ // NOTE(review): vld1q_f32(k6 + 4) reads k6[4..7] although only k6[4..6] are used; if k6 is the final kernel row this touches one float past the taps - confirm the buffer is padded.
+ float32x4_t _k42434445 = vld1q_f32(k6);
+ float32x4_t _k46474849 = vld1q_f32(k6 + 4);
+
+ float32x4x2_t _r60_02461357 = vld2q_f32(r6);
+ float32x4x2_t _r60nx2 = vld2q_f32(r6 + 8);
+ float32x4_t _r6_8101214 = _r60nx2.val[0];
+ float32x4_t _r6_9111315 = _r60nx2.val[1];
+ float32x4_t _r60 = _r60_02461357.val[0];
+ float32x4_t _r61 = _r60_02461357.val[1];
+ float32x4_t _r62 = vextq_f32(_r60, _r6_8101214, 1);
+ float32x4_t _r63 = vextq_f32(_r61, _r6_9111315, 1);
+ float32x4_t _r64 = vextq_f32(_r60, _r6_8101214, 2);
+ float32x4_t _r65 = vextq_f32(_r61, _r6_9111315, 2);
+ float32x4_t _r66 = vextq_f32(_r60, _r6_8101214, 3);
+
+ _sum = vfmaq_laneq_f32(_sum, _r60, _k42434445, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r61, _k42434445, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r62, _k42434445, 2);
+ _sum = vfmaq_laneq_f32(_sum, _r63, _k42434445, 3);
+ _sum = vfmaq_laneq_f32(_sum, _r64, _k46474849, 0);
+ _sum = vfmaq_laneq_f32(_sum, _r65, _k46474849, 1);
+ _sum = vfmaq_laneq_f32(_sum, _r66, _k46474849, 2);
+
+ vst1q_f32(outptr, _sum);
+ // 4 outputs at stride 2 consume 8 input floats per row.
+ r0 += 8;
+ r1 += 8;
+ r2 += 8;
+ r3 += 8;
+ r4 += 8;
+ r5 += 8;
+ r6 += 8;
+ outptr += 4;
+ }
+#else
+ if (nn > 0) // ARMv7 inline-asm path; guarded because the asm loop runs its body before testing the counter
+ {
+ asm volatile(
+ "0: \n"
+ // Operands: %0 = nn, %1 = outptr, %2..%8 = r0..r6, %9 = k0. q13 holds the loaded outputs; q14 and q15 are zeroed partial sums.
+ "pld [%1, #256] \n"
+ "vld1.f32 {d26-d27}, [%1] \n"// _sum
+ "veor q14, q14 \n"// _sum2 = 0;
+ "veor q15, q15 \n"// _sum3 = 0;
+ // Row 0. %9 is advanced by 28 bytes (7 floats) after each 8-float kernel-row load.
+ "pld [%9, #256] \n"
+ "vld1.f32 {d8-d11}, [%9] \n"// q4 q5 = k0123 k4567
+ "add %9, #28 \n"
+ // vld2 with "!" advances the row pointer by the 32 bytes (8 floats) it read; the second vld2 has no writeback.
+ "pld [%2, #512] \n"
+ "vld2.f32 {d0-d3}, [%2]! \n"// q0 = 0 2 4 6 q1 = 1 3 5 7
+ "vmla.f32 q13, q0, d8[0] \n"
+ "vmla.f32 q14, q1, d8[1] \n"
+
+ "vld2.f32 {d4-d7}, [%2] \n"// q2 = 8 10 12 14 q3 = 9 11 13 15
+ "vext.32 q8, q0, q2, #1 \n"// q8 = 2 4 6 8
+ "vext.32 q9, q1, q3, #1 \n"// q9 = 3 5 7 9
+ "vmla.f32 q15, q8, d9[0] \n"
+ "vmla.f32 q13, q9, d9[1] \n"
+
+ "vext.32 q10, q0, q2, #2 \n"// q10= 4 6 8 10
+ "vext.32 q11, q1, q3, #2 \n"// q11= 5 7 9 11
+ "vmla.f32 q14, q10, d10[0] \n"
+ "vmla.f32 q15, q11, d10[1] \n"
+
+ "vext.32 q12, q0, q2, #3 \n"// q12= 6 8 10 12
+ "vmla.f32 q13, q12, d11[0] \n"
+ // Row 1 (kernel taps in q6/q7, inputs from %3).
+ "pld [%9, #256] \n"
+ "vld1.f32 {d12-d15}, [%9] \n"// q6 q7 = k78910 k11121314
+ "add %9, #28 \n"
+
+ "pld [%3, #512] \n"
+ "vld2.f32 {d0-d3}, [%3]! \n"
+ "vmla.f32 q14, q0, d12[0] \n"
+ "vmla.f32 q15, q1, d12[1] \n"
+
+ "vld2.f32 {d4-d7}, [%3] \n"
+ "vext.32 q8, q0, q2, #1 \n"
+ "vext.32 q9, q1, q3, #1 \n"
+ "vmla.f32 q13, q8, d13[0] \n"
+ "vmla.f32 q14, q9, d13[1] \n"
+
+ "vext.32 q10, q0, q2, #2 \n"
+ "vext.32 q11, q1, q3, #2 \n"
+ "vmla.f32 q15, q10, d14[0] \n"
+ "vmla.f32 q13, q11, d14[1] \n"
+
+ "vext.32 q12, q0, q2, #3 \n"
+ "vmla.f32 q14, q12, d15[0] \n"
+ // Row 2 (kernel taps in q4/q5, inputs from %4).
+ "pld [%9, #256] \n"
+ "vld1.f32 {d8-d11}, [%9] \n"// q4 q5 = k14151617 k18192021
+ "add %9, #28 \n"
+
+ "pld [%4, #512] \n"
+ "vld2.f32 {d0-d3}, [%4]! \n"
+ "vmla.f32 q15, q0, d8[0] \n"
+ "vmla.f32 q13, q1, d8[1] \n"
+
+ "vld2.f32 {d4-d7}, [%4] \n"
+ "vext.32 q8, q0, q2, #1 \n"
+ "vext.32 q9, q1, q3, #1 \n"
+ "vmla.f32 q14, q8, d9[0] \n"
+ "vmla.f32 q15, q9, d9[1] \n"
+
+ "vext.32 q10, q0, q2, #2 \n"
+ "vext.32 q11, q1, q3, #2 \n"
+ "vmla.f32 q13, q10, d10[0] \n"
+ "vmla.f32 q14, q11, d10[1] \n"
+
+ "vext.32 q12, q0, q2, #3 \n"
+ "vmla.f32 q15, q12, d11[0] \n"
+ // Row 3 (kernel taps in q6/q7, inputs from %5).
+ "pld [%9, #256] \n"
+ "vld1.f32 {d12-d15}, [%9] \n"// q6 q7 = k21222324 k25262728
+ "add %9, #28 \n"
+
+ "pld [%5, #512] \n"
+ "vld2.f32 {d0-d3}, [%5]! \n"
+ "vmla.f32 q13, q0, d12[0] \n"
+ "vmla.f32 q14, q1, d12[1] \n"
+
+ "vld2.f32 {d4-d7}, [%5] \n"
+ "vext.32 q8, q0, q2, #1 \n"
+ "vext.32 q9, q1, q3, #1 \n"
+ "vmla.f32 q15, q8, d13[0] \n"
+ "vmla.f32 q13, q9, d13[1] \n"
+
+ "vext.32 q10, q0, q2, #2 \n"
+ "vext.32 q11, q1, q3, #2 \n"
+ "vmla.f32 q14, q10, d14[0] \n"
+ "vmla.f32 q15, q11, d14[1] \n"
+
+ "vext.32 q12, q0, q2, #3 \n"
+ "vmla.f32 q13, q12, d15[0] \n"
+ // Row 4 (kernel taps in q4/q5, inputs from %6).
+ "pld [%9, #256] \n"
+ "vld1.f32 {d8-d11}, [%9] \n"// q4 q5 = k28293031 k32333435
+ "add %9, #28 \n"
+
+ "pld [%6, #512] \n"
+ "vld2.f32 {d0-d3}, [%6]! \n"
+ "vmla.f32 q14, q0, d8[0] \n"
+ "vmla.f32 q15, q1, d8[1] \n"
+
+ "vld2.f32 {d4-d7}, [%6] \n"
+ "vext.32 q8, q0, q2, #1 \n"
+ "vext.32 q9, q1, q3, #1 \n"
+ "vmla.f32 q13, q8, d9[0] \n"
+ "vmla.f32 q14, q9, d9[1] \n"
+
+ "vext.32 q10, q0, q2, #2 \n"
+ "vext.32 q11, q1, q3, #2 \n"
+ "vmla.f32 q15, q10, d10[0] \n"
+ "vmla.f32 q13, q11, d10[1] \n"
+
+ "vext.32 q12, q0, q2, #3 \n"
+ "vmla.f32 q14, q12, d11[0] \n"
+ // Row 5 (kernel taps in q6/q7, inputs from %7).
+ "pld [%9, #256] \n"
+ "vld1.f32 {d12-d15}, [%9] \n"// q6 q7 = k35363738 k39404142
+ "add %9, #28 \n"
+
+ "pld [%7, #512] \n"
+ "vld2.f32 {d0-d3}, [%7]! \n"
+ "vmla.f32 q15, q0, d12[0] \n"
+ "vmla.f32 q13, q1, d12[1] \n"
+
+ "vld2.f32 {d4-d7}, [%7] \n"
+ "vext.32 q8, q0, q2, #1 \n"
+ "vext.32 q9, q1, q3, #1 \n"
+ "vmla.f32 q14, q8, d13[0] \n"
+ "vmla.f32 q15, q9, d13[1] \n"
+
+ "vext.32 q10, q0, q2, #2 \n"
+ "vext.32 q11, q1, q3, #2 \n"
+ "vmla.f32 q13, q10, d14[0] \n"
+ "vmla.f32 q14, q11, d14[1] \n"
+
+ "vext.32 q12, q0, q2, #3 \n"
+ "vmla.f32 q15, q12, d15[0] \n"
+ // Row 6 (last): %9 is rewound by 168 bytes = 6 * 28. NOTE(review): this 8-float load covers floats 42..49 from the original %9, one past a 49-tap kernel - confirm padding.
+ "pld [%9, #256] \n"
+ "vld1.f32 {d8-d11}, [%9] \n"// q4 q5 = k42434445 k46474849
+ "sub %9, #168 \n"// restore k0
+
+ "pld [%8, #512] \n"
+ "vld2.f32 {d0-d3}, [%8]! \n"
+ "vmla.f32 q13, q0, d8[0] \n"
+ "vmla.f32 q14, q1, d8[1] \n"
+
+ "vld2.f32 {d4-d7}, [%8] \n"
+ "vext.32 q8, q0, q2, #1 \n"
+ "vext.32 q9, q1, q3, #1 \n"
+ "vmla.f32 q15, q8, d9[0] \n"
+ "vmla.f32 q13, q9, d9[1] \n"
+
+ "vext.32 q10, q0, q2, #2 \n"
+ "vext.32 q11, q1, q3, #2 \n"
+ "vmla.f32 q14, q10, d10[0] \n"
+ "vmla.f32 q15, q11, d10[1] \n"
+
+ "vext.32 q12, q0, q2, #3 \n"
+ "vmla.f32 q13, q12, d11[0] \n"
+ // Fold the two partial sums into q13, then store 4 outputs and post-increment outptr.
+ "vadd.f32 q14, q14, q15 \n"
+ "vadd.f32 q13, q13, q14 \n"
+
+ "vst1.f32 {d26-d27}, [%1]! \n"
+
+ "subs %0, #1 \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(outptr), // %1
+ "=r"(r0), // %2
+ "=r"(r1), // %3
+ "=r"(r2), // %4
+ "=r"(r3), // %5
+ "=r"(r4), // %6
+ "=r"(r5), // %7
+ "=r"(r6), // %8
+ "=r"(k0) // %9
+ : "0"(nn),
+ "1"(outptr),
+ "2"(r0),
+ "3"(r1),
+ "4"(r2),
+ "5"(r3),
+ "6"(r4),
+ "7"(r5),
+ "8"(r6),
+ "9"(k0)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+
+ for (; remain>0; remain--) // scalar tail: one output per iteration
+ {
+ float sum = 0;
+ // 49 multiply-adds: 7 taps from each of the 7 input rows.
+ sum += r0[0] * k0[0];
+ sum += r0[1] * k0[1];
+ sum += r0[2] * k0[2];
+ sum += r0[3] * k0[3];
+ sum += r0[4] * k0[4];
+ sum += r0[5] * k0[5];
+ sum += r0[6] * k0[6];
+
+ sum += r1[0] * k1[0];
+ sum += r1[1] * k1[1];
+ sum += r1[2] * k1[2];
+ sum += r1[3] * k1[3];
+ sum += r1[4] * k1[4];
+ sum += r1[5] * k1[5];
+ sum += r1[6] * k1[6];
+
+ sum += r2[0] * k2[0];
+ sum += r2[1] * k2[1];
+ sum += r2[2] * k2[2];
+ sum += r2[3] * k2[3];
+ sum += r2[4] * k2[4];
+ sum += r2[5] * k2[5];
+ sum += r2[6] * k2[6];
+
+ sum += r3[0] * k3[0];
+ sum += r3[1] * k3[1];
+ sum += r3[2] * k3[2];
+ sum += r3[3] * k3[3];
+ sum += r3[4] * k3[4];
+ sum += r3[5] * k3[5];
+ sum += r3[6] * k3[6];
+
+ sum += r4[0] * k4[0];
+ sum += r4[1] * k4[1];
+ sum += r4[2] * k4[2];
+ sum += r4[3] * k4[3];
+ sum += r4[4] * k4[4];
+ sum += r4[5] * k4[5];
+ sum += r4[6] * k4[6];
+
+ sum += r5[0] * k5[0];
+ sum += r5[1] * k5[1];
+ sum += r5[2] * k5[2];
+ sum += r5[3] * k5[3];
+ sum += r5[4] * k5[4];
+ sum += r5[5] * k5[5];
+ sum += r5[6] * k5[6];
+
+ sum += r6[0] * k6[0];
+ sum += r6[1] * k6[1];
+ sum += r6[2] * k6[2];
+ sum += r6[3] * k6[3];
+ sum += r6[4] * k6[4];
+ sum += r6[5] * k6[5];
+ sum += r6[6] * k6[6];
+
+ *outptr += sum;
+ // One output at stride 2: step 2 input floats.
+ r0 += 2;
+ r1 += 2;
+ r2 += 2;
+ r3 += 2;
+ r4 += 2;
+ r5 += 2;
+ r6 += 2;
+ outptr++;
+ }
+ // The row pointers have advanced 2*outw floats; adding tailstep brings the total to 2*w floats (two input rows down).
+ r0 += tailstep;
+ r1 += tailstep;
+ r2 += tailstep;
+ r3 += tailstep;
+ r4 += tailstep;
+ r5 += tailstep;
+ r6 += tailstep;
+
+ }
+
+ }
+ }
+
+}
diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp
new file mode 100644
index 00000000000..61d00e783f0
--- /dev/null
+++ b/src/layer/arm/convolution_arm.cpp
@@ -0,0 +1,120 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "convolution_arm.h"
+
+namespace ncnn {
+
+#include "convolution_1x1.h"
+#include "convolution_2x2.h"
+#include "convolution_3x3.h"
+#include "convolution_4x4.h"
+#include "convolution_5x5.h"
+#include "convolution_7x7.h"
+
+DEFINE_LAYER_CREATOR(Convolution_arm)
+
+// Run the convolution with a NEON-specialised routine when one is registered
+// for this (kernel_size, stride) pair; every other configuration is delegated
+// to the generic Convolution::forward.
+//
+// bottom_blob  input blob
+// top_blob     output blob, created here as outw x outh x num_output
+// returns      0 on success, -100 when the bordered input or the output blob
+//              is empty after allocation, otherwise whatever the generic
+//              Convolution::forward returns
+int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    // The dispatch table below is indexed by [kernel_size-1][stride-1], so both
+    // values have to be range-checked on the low side as well as the high side;
+    // a kernel_size or stride of 0 (or less) would otherwise index out of bounds.
+    if (kernel_size < 1 || kernel_size > 7 || stride < 1 || stride > 4 || dilation != 1)
+    {
+        return Convolution::forward(bottom_blob, top_blob);
+    }
+
+    typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&);
+
+    // kernel_size x stride; a null entry means "no specialised kernel".
+    // static const: the table never changes, so build it once instead of on every call.
+    static const conv_func conv_func_table[7][4] =
+    {
+        { conv1x1s1_neon, conv1x1s2_neon, 0, 0              }, // kernel_size = 1
+        { conv2x2s1_neon, 0,              0, 0              }, // kernel_size = 2
+        { conv3x3s1_neon, conv3x3s2_neon, 0, 0              }, // kernel_size = 3
+        { 0,              0,              0, conv4x4s4_neon }, // kernel_size = 4
+        { conv5x5s1_neon, conv5x5s2_neon, 0, 0              }, // kernel_size = 5
+        { 0,              0,              0, 0              }, // kernel_size = 6
+        { conv7x7s1_neon, conv7x7s2_neon, 0, 0              }  // kernel_size = 7
+    };
+
+    const conv_func conv = conv_func_table[kernel_size-1][stride-1];
+    if (!conv)
+    {
+        return Convolution::forward(bottom_blob, top_blob);
+    }
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+
+    // With padding requested, convolve a bordered copy (constant 0.f border of
+    // `pad` on every side) instead of the input itself.
+    Mat bottom_blob_bordered = bottom_blob;
+    if (pad > 0)
+    {
+        copy_make_border(bottom_blob, bottom_blob_bordered, pad, pad, pad, pad, BORDER_CONSTANT, 0.f);
+        if (bottom_blob_bordered.empty())
+            return -100;
+
+        w = bottom_blob_bordered.w;
+        h = bottom_blob_bordered.h;
+    }
+
+    const int outw = (w - kernel_size) / stride + 1;
+    const int outh = (h - kernel_size) / stride + 1;
+
+    top_blob.create(outw, outh, num_output);
+    if (top_blob.empty())
+        return -100;
+
+    conv(bottom_blob_bordered, top_blob, weight_data, bias_data);
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/convolution_arm.h b/src/layer/arm/convolution_arm.h
new file mode 100644
index 00000000000..6f2bf05de3d
--- /dev/null
+++ b/src/layer/arm/convolution_arm.h
@@ -0,0 +1,30 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_CONVOLUTION_ARM_H
+#define LAYER_CONVOLUTION_ARM_H
+
+#include "convolution.h"
+
+namespace ncnn {
+
+// ARM-specific convolution layer. Only forward() is overridden; all other
+// members are inherited from Convolution.
+class Convolution_arm : public Convolution
+{
+public:
+    // Takes one input blob and one output blob. The parameter names match the
+    // definition in convolution_arm.cpp (single blobs, not collections).
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_CONVOLUTION_ARM_H
diff --git a/src/layer/arm/eltwise_arm.cpp b/src/layer/arm/eltwise_arm.cpp
new file mode 100644
index 00000000000..eb5f81947d2
--- /dev/null
+++ b/src/layer/arm/eltwise_arm.cpp
@@ -0,0 +1,574 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "eltwise_arm.h"
+
+#if __ARM_NEON
+#include
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Eltwise_arm)
+
+// Element-wise combination of the bottom blobs into top_blobs[0], selected by
+// op_type: Operation_PROD (product), Operation_SUM (plain sum when
+// num_coeff == 0, otherwise a sum weighted by coeffs) and Operation_MAX.
+// In every mode bottom_blobs[0] and bottom_blobs[1] are combined first and the
+// blobs from index 2 onwards are then folded into the output in place.
+// Each mode has three code paths: aarch64 intrinsics, ARMv7 inline assembly
+// and a scalar loop that handles the remaining elements.
+// Returns 0 on success, -100 when top_blob is empty after create().
+// NOTE(review): several loop headers in this copy of the patch look truncated
+// (e.g. "for (int q=0; q> 2;"); they are left untouched here - confirm against
+// the upstream file.
+int Eltwise_arm::forward(const std::vector& bottom_blobs, std::vector& top_blobs) const
+{
+ const Mat& bottom_blob = bottom_blobs[0];
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int channels = bottom_blob.c;
+ int size = w * h;
+
+ Mat& top_blob = top_blobs[0];
+ top_blob.create(w, h, channels);
+ if (top_blob.empty())
+ return -100;
+
+ if (op_type == Operation_PROD)
+ {
+ // first blob
+ const Mat& bottom_blob1 = bottom_blobs[1];
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ for (; nn>0; nn--)
+ {
+ float32x4_t _ptr = vld1q_f32(ptr);
+ float32x4_t _ptr1 = vld1q_f32(ptr1);
+ float32x4_t _p = vmulq_f32(_ptr, _ptr1);
+ vst1q_f32(outptr, _p);
+
+ ptr += 4;
+ ptr1 += 4;
+ outptr += 4;
+ }
+#else
+ if (nn > 0)
+ {
+ // out = ptr * ptr1, four floats per iteration
+ asm volatile(
+ "0: \n"
+ "pld [%1, #128] \n"
+ "pld [%2, #128] \n"
+ "vld1.f32 {d0-d1}, [%1 :128]! \n"
+ "vld1.f32 {d2-d3}, [%2 :128]! \n"
+ "vmul.f32 q0, q0, q1 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d1}, [%3 :128]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr), // %1
+ "=r"(ptr1), // %2
+ "=r"(outptr) // %3
+ : "0"(nn),
+ "1"(ptr),
+ "2"(ptr1),
+ "3"(outptr)
+ : "cc", "memory", "q0", "q1"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+ *outptr = *ptr * *ptr1;
+
+ ptr++;
+ ptr1++;
+ outptr++;
+ }
+ }
+
+ for (size_t b=2; b> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ for (; nn>0; nn--)
+ {
+ float32x4_t _ptr = vld1q_f32(ptr);
+ float32x4_t _p = vld1q_f32(outptr);
+ _p = vmulq_f32(_ptr, _p);
+ vst1q_f32(outptr, _p);
+
+ ptr += 4;
+ outptr += 4;
+ }
+#else
+ if (nn > 0)
+ {
+ // out *= ptr, in place: outptr is read without write-back, then stored with it
+ asm volatile(
+ "0: \n"
+ "pld [%1, #128] \n"
+ "pld [%2, #128] \n"
+ "vld1.f32 {d0-d1}, [%1 :128]! \n"
+ "vld1.f32 {d2-d3}, [%2 :128] \n"
+ "vmul.f32 q0, q0, q1 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d1}, [%2 :128]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr), // %1
+ "=r"(outptr) // %2
+ : "0"(nn),
+ "1"(ptr),
+ "2"(outptr)
+ : "cc", "memory", "q0", "q1"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+ *outptr *= *ptr;
+
+ ptr++;
+ outptr++;
+ }
+ }
+ }
+ }
+ else if (op_type == Operation_SUM)
+ {
+ if (num_coeff == 0)
+ {
+ // first blob
+ const Mat& bottom_blob1 = bottom_blobs[1];
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ for (; nn>0; nn--)
+ {
+ float32x4_t _ptr = vld1q_f32(ptr);
+ float32x4_t _ptr1 = vld1q_f32(ptr1);
+ float32x4_t _p = vaddq_f32(_ptr, _ptr1);
+ vst1q_f32(outptr, _p);
+
+ ptr += 4;
+ ptr1 += 4;
+ outptr += 4;
+ }
+#else
+ if (nn > 0)
+ {
+ // out = ptr + ptr1, four floats per iteration
+ asm volatile(
+ "0: \n"
+ "pld [%1, #128] \n"
+ "pld [%2, #128] \n"
+ "vld1.f32 {d0-d1}, [%1 :128]! \n"
+ "vld1.f32 {d2-d3}, [%2 :128]! \n"
+ "vadd.f32 q0, q0, q1 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d1}, [%3 :128]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr), // %1
+ "=r"(ptr1), // %2
+ "=r"(outptr) // %3
+ : "0"(nn),
+ "1"(ptr),
+ "2"(ptr1),
+ "3"(outptr)
+ : "cc", "memory", "q0", "q1"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+ *outptr = *ptr + *ptr1;
+
+ ptr++;
+ ptr1++;
+ outptr++;
+ }
+ }
+
+ for (size_t b=2; b> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ for (; nn>0; nn--)
+ {
+ float32x4_t _ptr = vld1q_f32(ptr);
+ float32x4_t _p = vld1q_f32(outptr);
+ _p = vaddq_f32(_ptr, _p);
+ vst1q_f32(outptr, _p);
+
+ ptr += 4;
+ outptr += 4;
+ }
+#else
+ if (nn > 0)
+ {
+ // out += ptr, in place
+ asm volatile(
+ "0: \n"
+ "pld [%1, #128] \n"
+ "pld [%2, #128] \n"
+ "vld1.f32 {d0-d1}, [%1 :128]! \n"
+ "vld1.f32 {d2-d3}, [%2 :128] \n"
+ "vadd.f32 q0, q0, q1 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d1}, [%2 :128]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr), // %1
+ "=r"(outptr) // %2
+ : "0"(nn),
+ "1"(ptr),
+ "2"(outptr)
+ : "cc", "memory", "q0", "q1"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+ *outptr += *ptr;
+
+ ptr++;
+ outptr++;
+ }
+ }
+ }
+ }
+ else
+ {
+ const float* coeffs_ptr = coeffs;
+
+ // first blob
+ const Mat& bottom_blob1 = bottom_blobs[1];
+ float coeff0 = coeffs_ptr[0];
+ float coeff1 = coeffs_ptr[1];
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ float32x4_t _coeff0 = vdupq_n_f32(coeff0);
+ float32x4_t _coeff1 = vdupq_n_f32(coeff1);
+#if __aarch64__
+ for (; nn>0; nn--)
+ {
+ float32x4_t _ptr = vld1q_f32(ptr);
+ float32x4_t _ptr1 = vld1q_f32(ptr1);
+ float32x4_t _p = vmulq_f32(_ptr, _coeff0);
+ _p = vmlaq_f32(_p, _ptr1, _coeff1);
+ vst1q_f32(outptr, _p);
+
+ ptr += 4;
+ ptr1 += 4;
+ outptr += 4;
+ }
+#else
+ if (nn > 0)
+ {
+ // out = ptr * coeff0 + ptr1 * coeff1
+ // operands %4..%7 are the matching inputs, so the two coefficient
+ // vectors are %8 and %9
+ asm volatile(
+ "0: \n"
+ "pld [%1, #128] \n"
+ "pld [%2, #128] \n"
+ "vld1.f32 {d0-d1}, [%1 :128]! \n"
+ "vld1.f32 {d2-d3}, [%2 :128]! \n"
+ "vmul.f32 q0, q0, %q8 \n"
+ "vmla.f32 q0, q1, %q9 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d1}, [%3 :128]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr), // %1
+ "=r"(ptr1), // %2
+ "=r"(outptr) // %3
+ : "0"(nn),
+ "1"(ptr),
+ "2"(ptr1),
+ "3"(outptr),
+ "w"(_coeff0), // %8
+ "w"(_coeff1) // %9
+ : "cc", "memory", "q0", "q1"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+ *outptr = *ptr * coeff0 + *ptr1 * coeff1;
+
+ ptr++;
+ ptr1++;
+ outptr++;
+ }
+ }
+
+ for (size_t b=2; b> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ float32x4_t _coeff = vdupq_n_f32(coeff);
+#if __aarch64__
+ for (; nn>0; nn--)
+ {
+ float32x4_t _ptr = vld1q_f32(ptr);
+ float32x4_t _p = vld1q_f32(outptr);
+ _p = vmlaq_f32(_p, _ptr, _coeff);
+ vst1q_f32(outptr, _p);
+
+ ptr += 4;
+ outptr += 4;
+ }
+#else
+ if (nn > 0)
+ {
+ // out += ptr * coeff, in place.
+ // q0 (d0-d1) holds the input, q1 (d2-d3) holds the current output and
+ // receives the multiply-accumulate, so q1 is the register that has to
+ // be stored back. Storing q0 would overwrite the accumulated output
+ // with the raw input and disagree with the aarch64 and scalar paths.
+ asm volatile(
+ "0: \n"
+ "pld [%1, #128] \n"
+ "pld [%2, #128] \n"
+ "vld1.f32 {d0-d1}, [%1 :128]! \n"
+ "vld1.f32 {d2-d3}, [%2 :128] \n"
+ "vmla.f32 q1, q0, %q6 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d2-d3}, [%2 :128]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr), // %1
+ "=r"(outptr) // %2
+ : "0"(nn),
+ "1"(ptr),
+ "2"(outptr),
+ "w"(_coeff) // %6
+ : "cc", "memory", "q0", "q1"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+ *outptr += *ptr * coeff;
+
+ ptr++;
+ outptr++;
+ }
+ }
+ }
+ }
+ }
+ else if (op_type == Operation_MAX)
+ {
+ // first blob
+ const Mat& bottom_blob1 = bottom_blobs[1];
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ for (; nn>0; nn--)
+ {
+ float32x4_t _ptr = vld1q_f32(ptr);
+ float32x4_t _ptr1 = vld1q_f32(ptr1);
+ float32x4_t _p = vmaxq_f32(_ptr, _ptr1);
+ vst1q_f32(outptr, _p);
+
+ ptr += 4;
+ ptr1 += 4;
+ outptr += 4;
+ }
+#else
+ if (nn > 0)
+ {
+ // out = max(ptr, ptr1), four floats per iteration
+ asm volatile(
+ "0: \n"
+ "pld [%1, #128] \n"
+ "pld [%2, #128] \n"
+ "vld1.f32 {d0-d1}, [%1 :128]! \n"
+ "vld1.f32 {d2-d3}, [%2 :128]! \n"
+ "vmax.f32 q0, q0, q1 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d1}, [%3 :128]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr), // %1
+ "=r"(ptr1), // %2
+ "=r"(outptr) // %3
+ : "0"(nn),
+ "1"(ptr),
+ "2"(ptr1),
+ "3"(outptr)
+ : "cc", "memory", "q0", "q1"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+ *outptr = std::max(*ptr, *ptr1);
+
+ ptr++;
+ ptr1++;
+ outptr++;
+ }
+ }
+
+ for (size_t b=2; b> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ for (; nn>0; nn--)
+ {
+ float32x4_t _ptr = vld1q_f32(ptr);
+ float32x4_t _p = vld1q_f32(outptr);
+ _p = vmaxq_f32(_ptr, _p);
+ vst1q_f32(outptr, _p);
+
+ ptr += 4;
+ outptr += 4;
+ }
+#else
+ if (nn > 0)
+ {
+ // out = max(ptr, out), in place
+ asm volatile(
+ "0: \n"
+ "pld [%1, #128] \n"
+ "pld [%2, #128] \n"
+ "vld1.f32 {d0-d1}, [%1 :128]! \n"
+ "vld1.f32 {d2-d3}, [%2 :128] \n"
+ "vmax.f32 q0, q0, q1 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d1}, [%2 :128]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr), // %1
+ "=r"(outptr) // %2
+ : "0"(nn),
+ "1"(ptr),
+ "2"(outptr)
+ : "cc", "memory", "q0", "q1"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+ *outptr = std::max(*ptr, *outptr);
+
+ ptr++;
+ outptr++;
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/eltwise_arm.h b/src/layer/arm/eltwise_arm.h
new file mode 100644
index 00000000000..060fac695fc
--- /dev/null
+++ b/src/layer/arm/eltwise_arm.h
@@ -0,0 +1,30 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_ELTWISE_ARM_H
+#define LAYER_ELTWISE_ARM_H
+
+#include "eltwise.h"
+
+namespace ncnn {
+
+// ARM-specific element-wise layer. Only forward() is overridden; all other
+// members are inherited from Eltwise.
+class Eltwise_arm : public Eltwise
+{
+public:
+ // Takes a collection of input blobs and a collection of output blobs.
+ // NOTE(review): the std::vector template arguments appear to have been
+ // stripped from this copy of the patch - confirm against the upstream file.
+ virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_ELTWISE_ARM_H
diff --git a/src/layer/arm/innerproduct_arm.cpp b/src/layer/arm/innerproduct_arm.cpp
new file mode 100644
index 00000000000..50e6cdaf5c9
--- /dev/null
+++ b/src/layer/arm/innerproduct_arm.cpp
@@ -0,0 +1,136 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "innerproduct_arm.h"
+
+#if __ARM_NEON
+#include
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(InnerProduct_arm)
+
+// Inner-product forward pass: writes num_output values into a 1 x 1 x num_output
+// top_blob. Returns 0 on success, -100 when top_blob is empty after create().
+// NOTE(review): the header of the per-output loop and the declarations of
+// sum, _sum, _sum2, m, w and outptr appear to be truncated in this copy of the
+// patch ("for (int p=0; p> 3;") - confirm against the upstream file.
+int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int channels = bottom_blob.c;
+ int size = w * h;
+
+ top_blob.create(1, 1, num_output);
+ if (top_blob.empty())
+ return -100;
+
+ // num_output
+ const float* weight_data_ptr = weight_data;
+ #pragma omp parallel for
+ for (int p=0; p> 3;
+ // the NEON paths consume 8 floats per iteration; the low 3 bits of size
+ // are left for the scalar loop
+ int remain = size & 7;
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ for (; nn>0; nn--)
+ {
+ // two independent accumulators, 4 lanes each
+ float32x4_t _m = vld1q_f32(m);
+ float32x4_t _w = vld1q_f32(w);
+ _sum = vfmaq_f32(_sum, _m, _w);
+
+ _m = vld1q_f32(m + 4);
+ _w = vld1q_f32(w + 4);
+ _sum2 = vfmaq_f32(_sum2, _m, _w);
+
+ m += 8;
+ w += 8;
+ }
+#else
+ if (nn > 0)
+ {
+ // same computation as the intrinsic loop above: q0/q1 hold 8 floats
+ // from m, q2/q3 hold 8 floats from w, accumulated into _sum (%q3)
+ // and _sum2 (%q4)
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.f32 {d0-d3}, [%1 :128]! \n"
+ "pld [%2, #256] \n"
+ "vld1.f32 {d4-d7}, [%2]! \n"
+ "vmla.f32 %q3, q0, q2 \n"
+ "subs %0, #1 \n"
+ "vmla.f32 %q4, q1, q3 \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(m), // %1
+ "=r"(w), // %2
+ "=w"(_sum), // %3
+ "=w"(_sum2) // %4
+ : "0"(nn),
+ "1"(m),
+ "2"(w),
+ "3"(_sum),
+ "4"(_sum2)
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ // scalar tail (or the whole range when NEON is not available)
+ for (; remain>0; remain--)
+ {
+ sum += *m * *w;
+
+ m++;
+ w++;
+ }
+ }
+
+#if __ARM_NEON
+ // fold the two vector accumulators together, then reduce the 4 lanes
+ // horizontally into the scalar sum
+ _sum = vaddq_f32(_sum, _sum2);
+#if __aarch64__
+ sum += vaddvq_f32(_sum);
+#else
+ float32x2_t _sumss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
+ _sumss = vpadd_f32(_sumss, _sumss);
+ sum += vget_lane_f32(_sumss, 0);
+#endif // __aarch64__
+#endif // __ARM_NEON
+
+ outptr[0] = sum;
+ }
+
+ return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/innerproduct_arm.h b/src/layer/arm/innerproduct_arm.h
new file mode 100644
index 00000000000..5fdf3fe20f8
--- /dev/null
+++ b/src/layer/arm/innerproduct_arm.h
@@ -0,0 +1,30 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_INNERPRODUCT_ARM_H
+#define LAYER_INNERPRODUCT_ARM_H
+
+#include "innerproduct.h"
+
+namespace ncnn {
+
+// ARM-specific inner-product layer. Only forward() is overridden; all other
+// members are inherited from InnerProduct.
+class InnerProduct_arm : public InnerProduct
+{
+public:
+ // Takes one input blob and one output blob; returns an int status code.
+ virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_INNERPRODUCT_ARM_H
diff --git a/src/layer/arm/lrn_arm.cpp b/src/layer/arm/lrn_arm.cpp
new file mode 100644
index 00000000000..901bc6e0243
--- /dev/null
+++ b/src/layer/arm/lrn_arm.cpp
@@ -0,0 +1,227 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "lrn_arm.h"
+#include
+
+#if __ARM_NEON
+#include
+#include "neon_mathfun.h"
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(LRN_arm)
+
+int LRN_arm::forward_inplace(Mat& bottom_top_blob) const
+{
+ int w = bottom_top_blob.w;
+ int h = bottom_top_blob.h;
+ int channels = bottom_top_blob.c;
+ int size = w * h;
+
+ // squared values with local_size padding
+ Mat square_blob;
+ square_blob.create(w, h, channels);
+ if (square_blob.empty())
+ return -100;
+
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ float32x4_t _outp = vmulq_f32(_p, _p);
+ vst1q_f32(outptr, _outp);
+
+ ptr += 4;
+ outptr += 4;
+ }
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+ *outptr = *ptr * *ptr;
+
+ ptr++;
+ outptr++;
+ }
+ }
+
+ float alpha_div_size = alpha / local_size;
+
+ if (region_type == NormRegion_ACROSS_CHANNELS)
+ {
+ Mat square_sum;
+ square_sum.create(w, h, channels);
+ if (square_sum.empty())
+ return -100;
+ square_sum.fill(0.f);
+
+ #pragma omp parallel for
+ for (int q=0; q= channels)
+ continue;
+
+ const float* sptr = square_blob.channel(p);
+ float* ssptr = square_sum.channel(q);
+
+#if __ARM_NEON
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ for (; nn>0; nn--)
+ {
+ float32x4_t _sp = vld1q_f32(sptr);
+ float32x4_t _ssp = vld1q_f32(ssptr);
+ _ssp = vaddq_f32(_ssp, _sp);
+ vst1q_f32(ssptr, _ssp);
+
+ sptr += 4;
+ ssptr += 4;
+ }
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+ *ssptr += *sptr;
+ sptr++;
+ ssptr++;
+ }
+ }
+
+ float* ptr = bottom_top_blob.channel(q);
+ float* ssptr = square_sum.channel(q);
+
+#if __ARM_NEON
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ float32x4_t _v1 = vdupq_n_f32(1.f);
+ float32x4_t _ads = vdupq_n_f32(alpha_div_size);
+ float32x4_t _mb = vdupq_n_f32(-beta);
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ float32x4_t _ssp = vld1q_f32(ssptr);
+ _ssp = vmulq_f32(_ssp, _ads);
+ _ssp = vaddq_f32(_ssp, _v1);
+ _ssp = pow_ps(_ssp, _mb);
+ _p = vmulq_f32(_p, _ssp);
+ vst1q_f32(ptr, _p);
+
+ ssptr += 4;
+ ptr += 4;
+ }
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+ *ptr = *ptr * pow(1.f + alpha_div_size * *ssptr, -beta);
+
+ ssptr++;
+ ptr++;
+ }
+ }
+ }
+ else if (region_type == NormRegion_WITHIN_CHANNEL)
+ {
+ int outw = w;
+ int outh = h;
+
+ Mat square_blob_bordered = square_blob;
+ int pad = local_size / 2;
+ if (pad > 0)
+ {
+ copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f);
+ if (square_blob_bordered.empty())
+ return -100;
+
+ w = square_blob_bordered.w;
+ h = square_blob_bordered.h;
+ }
+
+ const int maxk = local_size * local_size;
+
+ // norm window offsets
+ std::vector _space_ofs(maxk);
+ int* space_ofs = &_space_ofs[0];
+ {
+ int p1 = 0;
+ int p2 = 0;
+ int gap = w - local_size;
+ for (int i = 0; i < local_size; i++)
+ {
+ for (int j = 0; j < local_size; j++)
+ {
+ space_ofs[p1] = p2;
+ p1++;
+ p2++;
+ }
+ p2 += gap;
+ }
+ }
+
+ #pragma omp parallel for
+ for (int q=0; q
+
+// Constants used by log_ps().
+// c_inv_mant_mask clears bits 23..30 of a 32-bit word (the bits set in 0x7f800000).
+#define c_inv_mant_mask ~0x7f800000u
+// threshold the reduced argument is compared against (numerically sqrt(0.5))
+#define c_cephes_SQRTHF 0.707106781186547524
+// polynomial coefficients p0..p8, consumed highest-order first by log_ps()
+#define c_cephes_log_p0 7.0376836292E-2
+#define c_cephes_log_p1 - 1.1514610310E-1
+#define c_cephes_log_p2 1.1676998740E-1
+#define c_cephes_log_p3 - 1.2420140846E-1
+#define c_cephes_log_p4 + 1.4249322787E-1
+#define c_cephes_log_p5 - 1.6668057665E-1
+#define c_cephes_log_p6 + 2.0000714765E-1
+#define c_cephes_log_p7 - 2.4999993993E-1
+#define c_cephes_log_p8 + 3.3333331174E-1
+// both are multiplied by the exponent in log_ps(); q1 + q2 is numerically ln(2)
+#define c_cephes_log_q1 -2.12194440e-4
+#define c_cephes_log_q2 0.693359375
+
+/* Vectorised natural logarithm: evaluates log() on each of the 4 float lanes.
+ * Lanes whose input is <= 0 come back with every bit set (a NaN pattern).
+ */
+static inline float32x4_t log_ps(float32x4_t x)
+{
+    const float32x4_t ones = vdupq_n_f32(1);
+    const float32x4_t zeros = vdupq_n_f32(0);
+
+    /* clamp negative lanes to zero, then remember which lanes are not strictly positive */
+    x = vmaxq_f32(x, zeros);
+    const uint32x4_t nan_lanes = vcleq_f32(x, zeros);
+
+    /* raw bit pattern; shifting right by 23 leaves the biased exponent */
+    int32x4_t bits = vreinterpretq_s32_f32(x);
+    int32x4_t expo = vshrq_n_s32(bits, 23);
+
+    /* keep only the fractional part: drop the exponent bits and put the
+     * exponent of 0.5f in their place */
+    bits = vandq_s32(bits, vdupq_n_s32(c_inv_mant_mask));
+    bits = vorrq_s32(bits, vreinterpretq_s32_f32(vdupq_n_f32(0.5f)));
+    x = vreinterpretq_f32_s32(bits);
+
+    /* remove the exponent bias, convert to float and add one */
+    expo = vsubq_s32(expo, vdupq_n_s32(0x7f));
+    float32x4_t e = vaddq_f32(vcvtq_f32_s32(expo), ones);
+
+    /* per lane:
+     *   if (x < SQRTHF) { e -= 1; x = x + x - 1.0; }
+     *   else            { x = x - 1.0; }
+     * done without branches by masking the conditional operands */
+    const uint32x4_t below = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF));
+    const float32x4_t x_if_below = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), below));
+    const float32x4_t one_if_below = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(ones), below));
+    x = vsubq_f32(x, ones);
+    e = vsubq_f32(e, one_if_below);
+    x = vaddq_f32(x, x_if_below);
+
+    const float32x4_t x2 = vmulq_f32(x, x);
+
+    /* Horner evaluation of the p0..p8 polynomial; multiply and add are kept
+     * as two separate steps per coefficient */
+    static const float log_poly[9] = {
+        c_cephes_log_p0, c_cephes_log_p1, c_cephes_log_p2,
+        c_cephes_log_p3, c_cephes_log_p4, c_cephes_log_p5,
+        c_cephes_log_p6, c_cephes_log_p7, c_cephes_log_p8
+    };
+    float32x4_t y = vdupq_n_f32(log_poly[0]);
+    for (int i = 1; i < 9; i++)
+    {
+        y = vmulq_f32(y, x);
+        y = vaddq_f32(y, vdupq_n_f32(log_poly[i]));
+    }
+    y = vmulq_f32(y, x);
+    y = vmulq_f32(y, x2);
+
+    /* y += e * q1, then y -= 0.5 * x^2 */
+    y = vaddq_f32(y, vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1)));
+    y = vsubq_f32(y, vmulq_f32(x2, vdupq_n_f32(0.5f)));
+
+    /* result = (x + y) + e * q2 */
+    const float32x4_t e_q2 = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2));
+    float32x4_t result = vaddq_f32(x, y);
+    result = vaddq_f32(result, e_q2);
+
+    /* OR-ing in the all-ones mask turns every non-positive input lane into NaN */
+    return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(result), nan_lanes));
+}
+
+// Constants used by exp_ps().
+// the input of exp_ps() is clamped to [c_exp_lo, c_exp_hi]
+#define c_exp_hi 88.3762626647949f
+#define c_exp_lo -88.3762626647949f
+
+// numerically log2(e); scales x before the rounding step in exp_ps()
+#define c_cephes_LOG2EF 1.44269504088896341
+// C1 + C2 is numerically ln(2); exp_ps() subtracts n*C1 and n*C2 in two steps
+#define c_cephes_exp_C1 0.693359375
+#define c_cephes_exp_C2 -2.12194440e-4
+
+// polynomial coefficients p0..p5, consumed highest-order first by exp_ps()
+#define c_cephes_exp_p0 1.9875691500E-4
+#define c_cephes_exp_p1 1.3981999507E-3
+#define c_cephes_exp_p2 8.3334519073E-3
+#define c_cephes_exp_p3 4.1665795894E-2
+#define c_cephes_exp_p4 1.6666665459E-1
+#define c_cephes_exp_p5 5.0000001201E-1
+
+/* Vectorised exp(): evaluates exp() on each of the 4 float lanes.
+ * The input is clamped to [c_exp_lo, c_exp_hi] before anything else.
+ */
+static inline float32x4_t exp_ps(float32x4_t x)
+{
+    const float32x4_t ones = vdupq_n_f32(1);
+
+    x = vminq_f32(x, vdupq_n_f32(c_exp_hi));
+    x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo));
+
+    /* express exp(x) as exp(g + n*log(2)); n starts out as x * LOG2EF + 0.5 */
+    float32x4_t n = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF));
+
+    /* perform a floorf: round-trip through int32, then subtract 1 in the
+     * lanes where the round-tripped value ended up greater than n */
+    const float32x4_t roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(n));
+    const uint32x4_t is_greater = vcgtq_f32(roundtrip, n);
+    const uint32x4_t one_if_greater = vandq_u32(is_greater, vreinterpretq_u32_f32(ones));
+    n = vsubq_f32(roundtrip, vreinterpretq_f32_u32(one_if_greater));
+
+    /* g = (x - n*C1) - n*C2 */
+    const float32x4_t n_c1 = vmulq_f32(n, vdupq_n_f32(c_cephes_exp_C1));
+    const float32x4_t n_c2 = vmulq_f32(n, vdupq_n_f32(c_cephes_exp_C2));
+    x = vsubq_f32(x, n_c1);
+    x = vsubq_f32(x, n_c2);
+
+    /* Horner evaluation of the p0..p5 polynomial; multiply and add are kept
+     * as two separate steps per coefficient */
+    static const float exp_poly[6] = {
+        c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2,
+        c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5
+    };
+    float32x4_t y = vld1q_dup_f32(exp_poly);
+    for (int i = 1; i < 6; i++)
+    {
+        y = vmulq_f32(y, x);
+        y = vaddq_f32(y, vld1q_dup_f32(exp_poly + i));
+    }
+
+    /* y = y * x^2 + x + 1 */
+    const float32x4_t x2 = vmulq_f32(x, x);
+    y = vmulq_f32(y, x2);
+    y = vaddq_f32(y, x);
+    y = vaddq_f32(y, ones);
+
+    /* build 2^n: add the bias 0x7f to n and shift it into bit 23 and up */
+    int32x4_t pow2_bits = vcvtq_s32_f32(n);
+    pow2_bits = vaddq_s32(pow2_bits, vdupq_n_s32(0x7f));
+    pow2_bits = vshlq_n_s32(pow2_bits, 23);
+
+    return vmulq_f32(y, vreinterpretq_f32_s32(pow2_bits));
+}
+
+// Constants for sincos_ps().
+// NOTE(review): most of sincos_ps() is missing from this copy of the patch, so
+// the roles below are inferred from the names and values - confirm upstream.
+// DP1 + DP2 + DP3 is numerically -pi/4, split into three parts.
+#define c_minus_cephes_DP1 -0.78515625
+#define c_minus_cephes_DP2 -2.4187564849853515625e-4
+#define c_minus_cephes_DP3 -3.77489497744594108e-8
+// presumably the sine polynomial coefficients
+#define c_sincof_p0 -1.9515295891E-4
+#define c_sincof_p1 8.3321608736E-3
+#define c_sincof_p2 -1.6666654611E-1
+// presumably the cosine polynomial coefficients
+#define c_coscof_p0 2.443315711809948E-005
+#define c_coscof_p1 -1.388731625493765E-003
+#define c_coscof_p2 4.166664568298827E-002
+// sincos_ps() multiplies |x| by this value ("scale by 4/Pi")
+#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI
+
+/* evaluation of 4 sines & cosines at once.
+ *
+ * The code is the exact rewriting of the cephes sinf function.
+ * Precision is excellent as long as x < 8192 (I did not bother to
+ * take into account the special handling they have for greater values
+ * -- it does not return garbage for arguments over 8192, though, but
+ * the extra precision is missing).
+ *
+ * Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+ * surprising but correct result.
+ *
+ * Note also that when you compute sin(x), cos(x) is available at
+ * almost no extra price so both sin_ps and cos_ps make use of
+ * sincos_ps..
+ */
+static inline void sincos_ps(float32x4_t x, float32x4_t *ysin, float32x4_t *ycos)
+{
+ // any x
+ float32x4_t xmm1, xmm2, xmm3, y;
+
+ uint32x4_t emm2;
+
+ uint32x4_t sign_mask_sin, sign_mask_cos;
+ sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0));
+ x = vabsq_f32(x);
+
+ /* scale by 4/Pi */
+ y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI));
+
+ /* store the integer part of y in mm0 */
+ emm2 = vcvtq_u32_f32(y);
+ /* j=(j+1) & (~1) (see the cephes sources) */
+ emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
+ emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
+ y = vcvtq_f32_u32(emm2);
+
+ /* get the polynom selection mask
+ * there is one polynom for 0 <= x <= Pi/4
+ * and another one for Pi/4
+#endif // __ARM_NEON
+
+// 2x2 max pooling with stride 2: every output element is the maximum of a
+// 2x2 input window read from the two row pointers r0 and r1.
+// top_blob must already have its final dimensions; its w/h/c are read here.
+// The NEON paths produce 4 outputs (8 input columns) per iteration and the
+// scalar loop handles the remaining columns.
+// NOTE(review): the channel/row loop headers and the declarations of r0, r1
+// and outptr appear to be truncated in this copy of the patch
+// ("for (int q=0; q> 2;") - confirm against the upstream file.
+static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob)
+{
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int inch = bottom_blob.c;
+
+ int outw = top_blob.w;
+ int outh = top_blob.h;
+ int outch = top_blob.c;
+
+ // After one output row the row pointers have advanced by 2*outw. Skip the
+ // unread rest of the current row (w - 2*outw) plus the whole next row (w),
+ // which the other row pointer already covered. This equals w whenever
+ // w == 2*outw and stays correct when the input has an unused trailing
+ // column. Same formula as pooling3x3s2_max_neon.
+ // NOTE(review): assumes r1 starts one row below r0 - confirm upstream.
+ const int tailstep = w - 2*outw + w;
+
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = outw - (nn << 2);
+#else
+ int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ for (; nn>0; nn--)
+ {
+ float32x4_t _r00 = vld1q_f32(r0);
+ float32x4_t _r10 = vld1q_f32(r1);
+ float32x4_t _r01 = vld1q_f32(r0 + 4);
+ float32x4_t _r11 = vld1q_f32(r1 + 4);
+
+ // vertical max of the two rows first, then pairwise max of neighbours
+ float32x4_t _max0 = vmaxq_f32(_r00, _r10);
+ float32x4_t _max1 = vmaxq_f32(_r01, _r11);
+
+ float32x4_t _max = vpmaxq_f32(_max0, _max1);
+
+ vst1q_f32(outptr, _max);
+
+ r0 += 8;
+ r1 += 8;
+ outptr += 4;
+ }
+#else
+ if (nn > 0)
+ {
+ // same computation as the intrinsic loop above: q0/q1 = 8 floats of
+ // r0, q2/q3 = 8 floats of r1, vertical max, then pairwise max into q2
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "pld [%2, #256] \n"
+ "vld1.f32 {d0-d3}, [%1]! \n"
+ "vld1.f32 {d4-d7}, [%2]! \n"
+ "vmax.f32 q0, q0, q2 \n"
+ "vmax.f32 q1, q1, q3 \n"
+ "vpmax.f32 d4, d0, d1 \n"
+ "vpmax.f32 d5, d2, d3 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d4-d5}, [%3]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(r0), // %1
+ "=r"(r1), // %2
+ "=r"(outptr) // %3
+ : "0"(nn),
+ "1"(r0),
+ "2"(r1),
+ "3"(outptr)
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ for (; remain>0; remain--)
+ {
+ float max0 = std::max(r0[0], r0[1]);
+ float max1 = std::max(r1[0], r1[1]);
+
+ *outptr = std::max(max0, max1);
+
+ r0 += 2;
+ r1 += 2;
+ outptr++;
+ }
+
+ r0 += tailstep;
+ r1 += tailstep;
+ }
+ }
+}
diff --git a/src/layer/arm/pooling_3x3.h b/src/layer/arm/pooling_3x3.h
new file mode 100644
index 00000000000..47dad16d22a
--- /dev/null
+++ b/src/layer/arm/pooling_3x3.h
@@ -0,0 +1,170 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#if __ARM_NEON
+#include
+#endif // __ARM_NEON
+
+// Max pooling with a 3x3 window and stride 2, as shown by the visible kernels: every output
+// value is the maximum of three adjacent floats taken from each of three input rows
+// (r0, r1, r2), and the input pointers advance by 2 floats per output.
+// NOTE(review): the line after the OpenMP pragma is truncated in this view, so the channel/row
+// loop headers and the setup of r0, r1, r2, outptr and nn are not visible — confirm against the
+// full source before relying on the hedged comments below.
+static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob)
+{
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int inch = bottom_blob.c;
+
+ int outw = top_blob.w;
+ int outh = top_blob.h;
+ int outch = top_blob.c;
+
+ // Added to every row pointer at the end of an output row (see the bottom of the loop).
+ // Each output advances the pointers by 2, so (w - 2*outw) is what is left of the current
+ // input row and the extra w skips one more full row.
+ const int tailstep = w - 2*outw + w;
+
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ // remain = outputs of this row left over for the scalar loop after nn blocks of 4
+ int remain = outw - (nn << 2);
+#else
+ int remain = outw;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ // Intrinsics path. vld2q de-interleaves 8 floats: val[0] = even columns (0 2 4 6),
+ // val[1] = odd columns (1 3 5 7). The block for the next iteration is loaded one step
+ // ahead because the third window column needs the first even element of the next block.
+ float32x4x2_t _r0 = vld2q_f32(r0);
+ float32x4x2_t _r1 = vld2q_f32(r1);
+ float32x4x2_t _r2 = vld2q_f32(r2);
+ for (; nn>0; nn--)
+ {
+ float32x4x2_t _r0n = vld2q_f32(r0+8);
+ float32x4x2_t _r1n = vld2q_f32(r1+8);
+ float32x4x2_t _r2n = vld2q_f32(r2+8);
+
+ // max of window columns 0 and 1 for each of the three rows
+ float32x4_t _max0 = vmaxq_f32(_r0.val[0], _r0.val[1]);
+ float32x4_t _max1 = vmaxq_f32(_r1.val[0], _r1.val[1]);
+ float32x4_t _max2 = vmaxq_f32(_r2.val[0], _r2.val[1]);
+
+ // window column 2: even columns shifted by one lane (2 4 6 8), pulling lane 0 of the next block
+ float32x4_t _r02 = vextq_f32(_r0.val[0], _r0n.val[0], 1);
+ float32x4_t _r12 = vextq_f32(_r1.val[0], _r1n.val[0], 1);
+ float32x4_t _r22 = vextq_f32(_r2.val[0], _r2n.val[0], 1);
+
+ _max0 = vmaxq_f32(_max0, _r02);
+ _max1 = vmaxq_f32(_max1, _r12);
+ _max2 = vmaxq_f32(_max2, _r22);
+
+ // combine the three rows into the final 4 outputs
+ float32x4_t _max = vmaxq_f32(vmaxq_f32(_max0, _max1), _max2);
+
+ vst1q_f32(outptr, _max);
+
+ // the look-ahead block becomes the current block
+ _r0 = _r0n;
+ _r1 = _r1n;
+ _r2 = _r2n;
+
+ r0 += 8;
+ r1 += 8;
+ r2 += 8;
+ outptr += 4;
+ }
+#else
+ // ARMv7 path: same algorithm in inline assembly.
+ // q0/q1, q2/q3, q4/q5 = current de-interleaved block of rows r0, r1, r2;
+ // q6/q7, q8/q9, q10/q11 = look-ahead block; q12..q14 = per-row maxima, result in q12.
+ // The vorr instructions copy the look-ahead block into the current-block registers.
+ // Because the loads post-increment and run one block ahead, the three trailing "sub #32"
+ // instructions rewind each row pointer by the 32 bytes (8 floats) that were pre-loaded.
+ if (nn > 0)
+ {
+ asm volatile(
+ "pld [%1, #256] \n"
+ "vld2.f32 {d0-d3}, [%1]! \n"// q0 = 0 2 4 6 q1 = 1 3 5 7
+ "pld [%2, #256] \n"
+ "vld2.f32 {d4-d7}, [%2]! \n"
+ "pld [%3, #256] \n"
+ "vld2.f32 {d8-d11}, [%3]! \n"
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld2.f32 {d12-d15}, [%1]! \n"// q6 = 8 10 12 14 q7 = 9 11 13 15
+
+ "vmax.f32 q12, q0, q1 \n"
+ "vmax.f32 q13, q2, q3 \n"
+
+ "pld [%2, #256] \n"
+ "vld2.f32 {d16-d19}, [%2]! \n"
+
+ "vmax.f32 q14, q4, q5 \n"
+ "vext.32 q0, q0, q6, #1 \n"
+
+ "pld [%3, #256] \n"
+ "vld2.f32 {d20-d23}, [%3]! \n"
+
+ "vext.32 q2, q2, q8, #1 \n"
+
+ "vmax.f32 q12, q12, q0 \n"
+ "vext.32 q4, q4, q10, #1 \n"
+
+ "vmax.f32 q13, q13, q2 \n"
+ "vmax.f32 q14, q14, q4 \n"
+ "vmax.f32 q12, q12, q13 \n"
+
+ "vorr q0, q6, q6 \n"
+ "vorr q1, q7, q7 \n"
+ "vmax.f32 q12, q12, q14 \n"
+
+ "vorr q2, q8, q8 \n"
+ "vorr q3, q9, q9 \n"
+ "vorr q4, q10, q10 \n"
+ "vorr q5, q11, q11 \n"
+
+ "subs %0, #1 \n"
+ "vst1.f32 {d24-d25}, [%4]! \n"
+ "bne 0b \n"
+ "sub %1, #32 \n"
+ "sub %2, #32 \n"
+ "sub %3, #32 \n"
+ : "=r"(nn), // %0
+ "=r"(r0), // %1
+ "=r"(r1), // %2
+ "=r"(r2), // %3
+ "=r"(outptr) // %4
+ : "0"(nn),
+ "1"(r0),
+ "2"(r1),
+ "3"(r2),
+ "4"(outptr)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ // Scalar tail (and the whole row when NEON is unavailable): one output per iteration.
+ for (; remain>0; remain--)
+ {
+ float max0 = std::max(std::max(r0[0], r0[1]), r0[2]);
+ float max1 = std::max(std::max(r1[0], r1[1]), r1[2]);
+ float max2 = std::max(std::max(r2[0], r2[1]), r2[2]);
+
+ *outptr = std::max(std::max(max0, max1), max2);
+
+ r0 += 2;
+ r1 += 2;
+ r2 += 2;
+ outptr++;
+ }
+
+ // Move all three row pointers to the start of the rows used by the next output row.
+ r0 += tailstep;//1 + w;
+ r1 += tailstep;//1 + w;
+ r2 += tailstep;//1 + w;
+ }
+ }
+}
diff --git a/src/layer/arm/pooling_arm.cpp b/src/layer/arm/pooling_arm.cpp
new file mode 100644
index 00000000000..59c1c997f9e
--- /dev/null
+++ b/src/layer/arm/pooling_arm.cpp
@@ -0,0 +1,96 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "pooling_arm.h"
+
+namespace ncnn {
+
+#include "pooling_2x2.h"
+#include "pooling_3x3.h"
+
+DEFINE_LAYER_CREATOR(Pooling_arm)
+
+// ARM forward pass for pooling.
+// Handles non-global max pooling with stride 2 and a 2x2 or 3x3 window through the NEON
+// kernels; every other configuration is delegated to Pooling::forward.
+// Returns 0 on success, -100 when an intermediate or output blob comes back empty, or
+// whatever Pooling::forward returns on the delegated path.
+int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    const bool has_neon_kernel = pooling_type == PoolMethod_MAX
+                                 && stride == 2
+                                 && global_pooling != 1
+                                 && (kernel_size == 2 || kernel_size == 3);
+    if (!has_neon_kernel)
+        return Pooling::forward(bottom_blob, top_blob);
+
+    const int channels = bottom_blob.c;
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+
+    // Step 1: explicit padding — a constant border of value 0.f, `pad` wide on all four sides.
+    Mat bottom_blob_bordered = bottom_blob;
+    if (pad > 0)
+    {
+        copy_make_border(bottom_blob, bottom_blob_bordered, pad, pad, pad, pad, BORDER_CONSTANT, 0.f);
+        if (bottom_blob_bordered.empty())
+            return -100;
+
+        w = bottom_blob_bordered.w;
+        h = bottom_blob_bordered.h;
+    }
+
+    // Step 2: output size for the windows that fit completely, plus the leftover columns/rows
+    // that do not fill a whole stride step.
+    const int wtail = (w - kernel_size) % stride;
+    const int htail = (h - kernel_size) % stride;
+
+    int outw = (w - kernel_size) / stride + 1;
+    int outh = (h - kernel_size) / stride + 1;
+
+    // Step 3: when there is a leftover, extend the input with replicated border values so
+    // one extra window fits, and count that window in the output size.
+    if (wtail != 0 || htail != 0)
+    {
+        const int wtailpad = (wtail != 0) ? kernel_size - wtail : 0;
+        const int htailpad = (htail != 0) ? kernel_size - htail : 0;
+
+        // The tail pads are passed as the 2nd and 4th border amounts (presumably bottom and
+        // right — confirm against copy_make_border's signature).
+        Mat tail_padded;
+        copy_make_border(bottom_blob_bordered, tail_padded, 0, htailpad, 0, wtailpad, BORDER_REPLICATE, 0.f);
+        if (tail_padded.empty())
+            return -100;
+
+        bottom_blob_bordered = tail_padded;
+
+        outw += (wtail != 0) ? 1 : 0;
+        outh += (htail != 0) ? 1 : 0;
+    }
+
+    top_blob.create(outw, outh, channels);
+    if (top_blob.empty())
+        return -100;
+
+    // Step 4: run the kernel matching the window size (only 2 or 3 can reach this point).
+    if (kernel_size == 2)
+        pooling2x2s2_max_neon(bottom_blob_bordered, top_blob);
+    else
+        pooling3x3s2_max_neon(bottom_blob_bordered, top_blob);
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/pooling_arm.h b/src/layer/arm/pooling_arm.h
new file mode 100644
index 00000000000..b7d774fa273
--- /dev/null
+++ b/src/layer/arm/pooling_arm.h
@@ -0,0 +1,30 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_POOLING_ARM_H
+#define LAYER_POOLING_ARM_H
+
+#include "pooling.h"
+
+namespace ncnn {
+
+// ARM-specific variant of the Pooling layer: inherits from Pooling and overrides forward().
+class Pooling_arm : public Pooling
+{
+public:
+ // Out-of-place forward pass reading bottom_blob and writing top_blob; returns an int status.
+ virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_POOLING_ARM_H
diff --git a/src/layer/arm/prelu_arm.cpp b/src/layer/arm/prelu_arm.cpp
new file mode 100644
index 00000000000..72d9ae0fba6
--- /dev/null
+++ b/src/layer/arm/prelu_arm.cpp
@@ -0,0 +1,182 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "prelu_arm.h"
+
+#if __ARM_NEON
+#include
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(PReLU_arm)
+
+// Out-of-place PReLU: every element x is written to top_blob as x when positive and as
+// x * slope otherwise (the scalar tail tests x < 0, the NEON paths x <= 0; both give the same
+// value for x == 0 up to the sign of zero).
+// Returns 0 on success, -100 when top_blob could not be created.
+// NOTE(review): the per-channel loop header and the setup of ptr/outptr/slope are truncated in
+// this view; the visible remainder selects slope_data_ptr[q] or slope_data_ptr[0].
+int PReLU_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int channels = bottom_blob.c;
+ int size = w * h;
+
+ top_blob.create(w, h, channels);
+ if (top_blob.empty())
+ return -100;
+
+ const float* slope_data_ptr = slope_data;
+
+ #pragma omp parallel for
+ for (int q=0; q 1 ? slope_data_ptr[q] : slope_data_ptr[0];
+
+#if __ARM_NEON
+ // nn = number of 4-float vector iterations, remain = scalar leftovers
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ float32x4_t _zero = vdupq_n_f32(0.f);
+ float32x4_t _slope = vdupq_n_f32(slope);
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ // lanes <= 0 take the scaled value, the others keep the input
+ uint32x4_t _lemask = vcleq_f32(_p, _zero);
+ float32x4_t _ps = vmulq_f32(_p, _slope);
+ float32x4_t _outp = vbslq_f32(_lemask, _ps, _p);
+ vst1q_f32(outptr, _outp);
+
+ ptr += 4;
+ outptr += 4;
+ }
+#else
+ // ARMv7 inline assembly: q1 = 0, q2 = slope broadcast, q3 = (x <= 0) mask, q4 = x * slope.
+ if (nn > 0)
+ {
+ asm volatile(
+ "veor q1, q0, q0 \n"
+ "vdup.f32 q2, %6 \n"
+ "0: \n"
+ "pld [%1, #128] \n"
+ // The load must post-increment the input pointer: input (%1) and output (%2) are
+ // separate here, so without the writeback every iteration would re-read the first
+ // four floats and the scalar tail below would start from the wrong input address.
+ "vld1.f32 {d0-d1}, [%1 :128]! \n"
+ "vcle.f32 q3, q0, q1 \n"
+ "vmul.f32 q4, q0, q2 \n"
+ "vbit.32 q0, q4, q3 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d1}, [%2 :128]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr), // %1
+ "=r"(outptr) // %2
+ : "0"(nn),
+ "1"(ptr),
+ "2"(outptr),
+ "r"(slope) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ // scalar tail (and the whole channel when NEON is unavailable)
+ for (; remain>0; remain--)
+ {
+ if (*ptr < 0)
+ *outptr = *ptr * slope;
+ else
+ *outptr = *ptr;
+
+ ptr++;
+ outptr++;
+ }
+ }
+
+ return 0;
+}
+
+// In-place PReLU: non-positive elements of bottom_top_blob are multiplied by slope, positive
+// elements are left unchanged (the scalar tail tests x < 0, the NEON paths x <= 0).
+// Always returns 0.
+// NOTE(review): the per-channel loop header and the setup of ptr/slope are truncated in this
+// view; the visible remainder selects slope_data_ptr[q] or slope_data_ptr[0].
+int PReLU_arm::forward_inplace(Mat& bottom_top_blob) const
+{
+ int w = bottom_top_blob.w;
+ int h = bottom_top_blob.h;
+ int channels = bottom_top_blob.c;
+ int size = w * h;
+
+ const float* slope_data_ptr = slope_data;
+
+ #pragma omp parallel for
+ for (int q=0; q 1 ? slope_data_ptr[q] : slope_data_ptr[0];
+
+#if __ARM_NEON
+ // nn = number of 4-float vector iterations, remain = scalar leftovers
+ int nn = size >> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ float32x4_t _zero = vdupq_n_f32(0.f);
+ float32x4_t _slope = vdupq_n_f32(slope);
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ // lanes <= 0 take the scaled value, the others keep the input
+ uint32x4_t _lemask = vcleq_f32(_p, _zero);
+ float32x4_t _ps = vmulq_f32(_p, _slope);
+ _p = vbslq_f32(_lemask, _ps, _p);
+ vst1q_f32(ptr, _p);
+
+ ptr += 4;
+ }
+#else
+ // ARMv7 inline assembly: q1 = 0, q2 = slope broadcast, q3 = (x <= 0) mask, q4 = x * slope.
+ // The load deliberately has no writeback: the store to the same address post-increments
+ // the pointer, so each iteration reads and then overwrites the same four floats.
+ if (nn > 0)
+ {
+ asm volatile(
+ "veor q1, q0, q0 \n"
+ "vdup.f32 q2, %4 \n"
+ "0: \n"
+ "pld [%1, #128] \n"
+ "vld1.f32 {d0-d1}, [%1 :128] \n"
+ "vcle.f32 q3, q0, q1 \n"
+ "vmul.f32 q4, q0, q2 \n"
+ "vbit.32 q0, q4, q3 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d1}, [%1 :128]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr) // %1
+ : "0"(nn),
+ "1"(ptr),
+ "r"(slope) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ // scalar tail (and the whole channel when NEON is unavailable)
+ for (; remain>0; remain--)
+ {
+ if (*ptr < 0)
+ *ptr *= slope;
+
+ ptr++;
+ }
+ }
+
+ return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/prelu_arm.h b/src/layer/arm/prelu_arm.h
new file mode 100644
index 00000000000..fbd32f7fe6c
--- /dev/null
+++ b/src/layer/arm/prelu_arm.h
@@ -0,0 +1,32 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_PRELU_ARM_H
+#define LAYER_PRELU_ARM_H
+
+#include "prelu.h"
+
+namespace ncnn {
+
+// ARM-specific variant of the PReLU layer: inherits from PReLU and overrides both forward paths.
+class PReLU_arm : public PReLU
+{
+public:
+ // Out-of-place forward pass reading bottom_blob and writing top_blob; returns an int status.
+ virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+ // In-place forward pass that overwrites bottom_top_blob; returns an int status.
+ virtual int forward_inplace(Mat& bottom_top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_PRELU_ARM_H
diff --git a/src/layer/arm/relu_arm.cpp b/src/layer/arm/relu_arm.cpp
new file mode 100644
index 00000000000..5477c37afe1
--- /dev/null
+++ b/src/layer/arm/relu_arm.cpp
@@ -0,0 +1,295 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "relu_arm.h"
+
+#if __ARM_NEON
+#include
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(ReLU_arm)
+
+// Out-of-place ReLU / leaky ReLU.
+// With slope == 0 every element x is written as max(x, 0); otherwise x is written unchanged
+// when positive and as x * slope when not (the scalar tail tests x < 0, the NEON paths x <= 0).
+// Returns 0 on success, -100 when top_blob could not be created.
+// NOTE(review): both per-channel loop headers and the setup of ptr/outptr are truncated in
+// this view of the file.
+int ReLU_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int channels = bottom_blob.c;
+ int size = w * h;
+
+ top_blob.create(w, h, channels);
+ if (top_blob.empty())
+ return -100;
+
+ if (slope == 0.f)
+ {
+ // plain ReLU
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ float32x4_t _zero = vdupq_n_f32(0.f);
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ float32x4_t _outp = vmaxq_f32(_p, _zero);
+ vst1q_f32(outptr, _outp);
+
+ ptr += 4;
+ outptr += 4;
+ }
+#else
+ // ARMv7 inline assembly: q1 = 0, q0 = max(x, 0); both pointers post-increment.
+ if (nn > 0)
+ {
+ asm volatile(
+ "veor q1, q0, q0 \n"
+ "0: \n"
+ "pld [%1, #128] \n"
+ "vld1.f32 {d0-d1}, [%1 :128]! \n"
+ "vmax.f32 q0, q0, q1 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d1}, [%2 :128]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr), // %1
+ "=r"(outptr) // %2
+ : "0"(nn),
+ "1"(ptr),
+ "2"(outptr)
+ : "cc", "memory", "q0", "q1"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ // scalar tail (and the whole channel when NEON is unavailable)
+ for (; remain>0; remain--)
+ {
+ *outptr = std::max(*ptr, 0.f);
+
+ ptr++;
+ outptr++;
+ }
+ }
+ }
+ else
+ {
+ // leaky ReLU
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ float32x4_t _zero = vdupq_n_f32(0.f);
+ float32x4_t _slope = vdupq_n_f32(slope);
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ // lanes <= 0 take the scaled value, the others keep the input
+ uint32x4_t _lemask = vcleq_f32(_p, _zero);
+ float32x4_t _ps = vmulq_f32(_p, _slope);
+ float32x4_t _outp = vbslq_f32(_lemask, _ps, _p);
+ vst1q_f32(outptr, _outp);
+
+ ptr += 4;
+ outptr += 4;
+ }
+#else
+ // ARMv7 inline assembly: q1 = 0, q2 = slope broadcast, q3 = (x <= 0) mask, q4 = x * slope.
+ if (nn > 0)
+ {
+ asm volatile(
+ "veor q1, q0, q0 \n"
+ "vdup.f32 q2, %6 \n"
+ "0: \n"
+ "pld [%1, #128] \n"
+ // The load must post-increment the input pointer (as the slope == 0 branch does):
+ // input (%1) and output (%2) are separate, so without the writeback every iteration
+ // would re-read the first four floats and the scalar tail would start from the
+ // wrong input address.
+ "vld1.f32 {d0-d1}, [%1 :128]! \n"
+ "vcle.f32 q3, q0, q1 \n"
+ "vmul.f32 q4, q0, q2 \n"
+ "vbit.32 q0, q4, q3 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d1}, [%2 :128]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr), // %1
+ "=r"(outptr) // %2
+ : "0"(nn),
+ "1"(ptr),
+ "2"(outptr),
+ "r"(slope) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ // scalar tail (and the whole channel when NEON is unavailable)
+ for (; remain>0; remain--)
+ {
+ if (*ptr < 0)
+ *outptr = *ptr * slope;
+ else
+ *outptr = *ptr;
+
+ ptr++;
+ outptr++;
+ }
+ }
+ }
+
+ return 0;
+}
+
+// In-place ReLU / leaky ReLU on bottom_top_blob.
+// With slope == 0 every element x becomes max(x, 0); otherwise non-positive elements are
+// multiplied by slope (the scalar tail tests x < 0, the NEON paths x <= 0).
+// Always returns 0.
+// NOTE(review): both per-channel loop headers and the setup of ptr are truncated in this view
+// of the file.
+int ReLU_arm::forward_inplace(Mat& bottom_top_blob) const
+{
+ int w = bottom_top_blob.w;
+ int h = bottom_top_blob.h;
+ int channels = bottom_top_blob.c;
+ int size = w * h;
+
+ if (slope == 0.f)
+ {
+ // plain ReLU
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ float32x4_t _zero = vdupq_n_f32(0.f);
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ _p = vmaxq_f32(_p, _zero);
+ vst1q_f32(ptr, _p);
+
+ ptr += 4;
+ }
+#else
+ // ARMv7 inline assembly: q1 = 0, q0 = max(x, 0). The load has no writeback on purpose;
+ // the store to the same address post-increments the pointer.
+ if (nn > 0)
+ {
+ asm volatile(
+ "veor q1, q0, q0 \n"
+ "0: \n"
+ "pld [%1, #128] \n"
+ "vld1.f32 {d0-d1}, [%1 :128] \n"
+ "vmax.f32 q0, q0, q1 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d1}, [%1 :128]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr) // %1
+ : "0"(nn),
+ "1"(ptr)
+ : "cc", "memory", "q0", "q1"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ // scalar tail (and the whole channel when NEON is unavailable)
+ for (; remain>0; remain--)
+ {
+ *ptr = std::max(*ptr, 0.f);
+
+ ptr++;
+ }
+ }
+ }
+ else
+ {
+ // leaky ReLU
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ float32x4_t _zero = vdupq_n_f32(0.f);
+ float32x4_t _slope = vdupq_n_f32(slope);
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ // lanes <= 0 take the scaled value, the others keep the input
+ uint32x4_t _lemask = vcleq_f32(_p, _zero);
+ float32x4_t _ps = vmulq_f32(_p, _slope);
+ _p = vbslq_f32(_lemask, _ps, _p);
+ vst1q_f32(ptr, _p);
+
+ ptr += 4;
+ }
+#else
+ // ARMv7 inline assembly: q1 = 0, q2 = slope broadcast, q3 = (x <= 0) mask, q4 = x * slope.
+ // The load has no writeback on purpose; the store to the same address post-increments.
+ if (nn > 0)
+ {
+ asm volatile(
+ "veor q1, q0, q0 \n"
+ "vdup.f32 q2, %4 \n"
+ "0: \n"
+ "pld [%1, #128] \n"
+ "vld1.f32 {d0-d1}, [%1 :128] \n"
+ "vcle.f32 q3, q0, q1 \n"
+ "vmul.f32 q4, q0, q2 \n"
+ "vbit.32 q0, q4, q3 \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d1}, [%1 :128]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr) // %1
+ : "0"(nn),
+ "1"(ptr),
+ "r"(slope) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ // scalar tail (and the whole channel when NEON is unavailable)
+ for (; remain>0; remain--)
+ {
+ if (*ptr < 0)
+ *ptr *= slope;
+
+ ptr++;
+ }
+ }
+ }
+
+ return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/relu_arm.h b/src/layer/arm/relu_arm.h
new file mode 100644
index 00000000000..294a28b8fd8
--- /dev/null
+++ b/src/layer/arm/relu_arm.h
@@ -0,0 +1,32 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_RELU_ARM_H
+#define LAYER_RELU_ARM_H
+
+#include "relu.h"
+
+namespace ncnn {
+
+// ARM-specific variant of the ReLU layer: inherits from ReLU and overrides both forward paths.
+class ReLU_arm : public ReLU
+{
+public:
+ // Out-of-place forward pass reading bottom_blob and writing top_blob; returns an int status.
+ virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+ // In-place forward pass that overwrites bottom_top_blob; returns an int status.
+ virtual int forward_inplace(Mat& bottom_top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_RELU_ARM_H
diff --git a/src/layer/arm/scale_arm.cpp b/src/layer/arm/scale_arm.cpp
new file mode 100644
index 00000000000..754001d301f
--- /dev/null
+++ b/src/layer/arm/scale_arm.cpp
@@ -0,0 +1,211 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "scale_arm.h"
+
+#if __ARM_NEON
+#include
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Scale_arm)
+
+// Out-of-place scale: writes x * s (+ bias when bias_term is set) to top_blob for every
+// element x of bottom_blob.
+// Returns 0 on success, -100 when top_blob could not be created.
+// NOTE(review): both per-channel loop headers and the setup of ptr/outptr/s/bias are truncated
+// in this view; s and bias presumably come from scale_ptr[q] and bias_ptr[q] — confirm.
+int Scale_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int channels = bottom_blob.c;
+ int size = w * h;
+
+ top_blob.create(w, h, channels);
+ if (top_blob.empty())
+ return -100;
+
+ if (bias_term)
+ {
+ // scale and bias
+ const float* scale_ptr = scale_data;
+ const float* bias_ptr = bias_data;
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ float32x4_t _s = vdupq_n_f32(s);
+ float32x4_t _bias = vdupq_n_f32(bias);
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ // multiply-accumulate: bias + x * s
+ _p = vmlaq_f32(_bias, _p, _s);
+ vst1q_f32(outptr, _p);
+
+ ptr += 4;
+ outptr += 4;
+ }
+#endif // __ARM_NEON
+
+ // scalar tail (and the whole channel when NEON is unavailable)
+ for (; remain>0; remain--)
+ {
+ *outptr = *ptr * s + bias;
+
+ ptr++;
+ outptr++;
+ }
+ }
+ }
+ else
+ {
+ // scale only
+ const float* scale_ptr = scale_data;
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ float32x4_t _s = vdupq_n_f32(s);
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ _p = vmulq_f32(_p, _s);
+ vst1q_f32(outptr, _p);
+
+ ptr += 4;
+ outptr += 4;
+ }
+#endif // __ARM_NEON
+
+ // scalar tail (and the whole channel when NEON is unavailable)
+ for (; remain>0; remain--)
+ {
+ *outptr = *ptr * s;
+
+ ptr++;
+ outptr++;
+ }
+ }
+ }
+
+ return 0;
+}
+
+// In-place scale: replaces every element x of bottom_top_blob with x * s (+ bias when
+// bias_term is set). Always returns 0.
+// NOTE(review): both per-channel loop headers and the setup of ptr/s/bias are truncated in
+// this view; s and bias presumably come from scale_ptr[q] and bias_ptr[q] — confirm.
+int Scale_arm::forward_inplace(Mat& bottom_top_blob) const
+{
+ int w = bottom_top_blob.w;
+ int h = bottom_top_blob.h;
+ int channels = bottom_top_blob.c;
+ int size = w * h;
+
+ if (bias_term)
+ {
+ // scale and bias
+ const float* scale_ptr = scale_data;
+ const float* bias_ptr = bias_data;
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ float32x4_t _s = vdupq_n_f32(s);
+ float32x4_t _bias = vdupq_n_f32(bias);
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ // multiply-accumulate: bias + x * s
+ _p = vmlaq_f32(_bias, _p, _s);
+ vst1q_f32(ptr, _p);
+
+ ptr += 4;
+ }
+#endif // __ARM_NEON
+
+ // scalar tail (and the whole channel when NEON is unavailable)
+ for (; remain>0; remain--)
+ {
+ *ptr = *ptr * s + bias;
+
+ ptr++;
+ }
+ }
+ }
+ else
+ {
+ // scale only
+ const float* scale_ptr = scale_data;
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ float32x4_t _s = vdupq_n_f32(s);
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ _p = vmulq_f32(_p, _s);
+ vst1q_f32(ptr, _p);
+
+ ptr += 4;
+ }
+#endif // __ARM_NEON
+
+ // scalar tail (and the whole channel when NEON is unavailable)
+ for (; remain>0; remain--)
+ {
+ *ptr *= s;
+
+ ptr++;
+ }
+ }
+ }
+
+ return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/scale_arm.h b/src/layer/arm/scale_arm.h
new file mode 100644
index 00000000000..11a739b90d5
--- /dev/null
+++ b/src/layer/arm/scale_arm.h
@@ -0,0 +1,32 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SCALE_ARM_H
+#define LAYER_SCALE_ARM_H
+
+#include "scale.h"
+
+namespace ncnn {
+
+// ARM-specific variant of the Scale layer: inherits from Scale and overrides both forward paths.
+class Scale_arm : public Scale
+{
+public:
+ // Out-of-place forward pass reading bottom_blob and writing top_blob; returns an int status.
+ virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+ // In-place forward pass that overwrites bottom_top_blob; returns an int status.
+ virtual int forward_inplace(Mat& bottom_top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SCALE_ARM_H
diff --git a/src/layer/arm/sigmoid_arm.cpp b/src/layer/arm/sigmoid_arm.cpp
new file mode 100644
index 00000000000..754f9cf5bc5
--- /dev/null
+++ b/src/layer/arm/sigmoid_arm.cpp
@@ -0,0 +1,127 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "sigmoid_arm.h"
+
+#if __ARM_NEON
+#include
+#include "neon_mathfun.h"
+#endif // __ARM_NEON
+
+#include
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Sigmoid_arm)
+
+// Out-of-place sigmoid: writes 1 / (1 + exp(-x)) to top_blob for every element x of bottom_blob.
+// Returns 0 on success, -100 when top_blob could not be created.
+// NOTE(review): the per-channel loop header and the setup of ptr/outptr are truncated in this
+// view of the file.
+int Sigmoid_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int channels = bottom_blob.c;
+ int size = w * h;
+
+ top_blob.create(w, h, channels);
+ if (top_blob.empty())
+ return -100;
+
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ float32x4_t _one = vdupq_n_f32(1.f);
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ // _p = 1 + exp(-x)
+ _p = vnegq_f32(_p);
+ _p = exp_ps(_p);
+ _p = vaddq_f32(_p, _one);
+ // approximate reciprocal: hardware estimate followed by one Newton-Raphson step
+ // (vrecpsq_f32(d, e) yields 2 - d*e); a second step is left commented out below
+ float32x4_t _outp = vrecpeq_f32(_p);
+ _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp);
+// _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp);
+ vst1q_f32(outptr, _outp);
+
+ ptr += 4;
+ outptr += 4;
+ }
+#endif // __ARM_NEON
+ // scalar tail (and the whole channel when NEON is unavailable)
+ for (; remain>0; remain--)
+ {
+ *outptr = 1.f / (1.f + exp(-*ptr));
+
+ ptr++;
+ outptr++;
+ }
+ }
+
+ return 0;
+}
+
+// In-place sigmoid: replaces every element x of bottom_top_blob with 1 / (1 + exp(-x)).
+// Always returns 0.
+// NOTE(review): the per-channel loop header and the setup of ptr are truncated in this view
+// of the file.
+int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob) const
+{
+ int w = bottom_top_blob.w;
+ int h = bottom_top_blob.h;
+ int channels = bottom_top_blob.c;
+ int size = w * h;
+
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ float32x4_t _one = vdupq_n_f32(1.f);
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ // _p = 1 + exp(-x)
+ _p = vnegq_f32(_p);
+ _p = exp_ps(_p);
+ _p = vaddq_f32(_p, _one);
+ // Approximate reciprocal: hardware estimate followed by one Newton-Raphson step.
+ // The estimate must live in its own register: vrecpsq_f32(d, e) computes 2 - d*e and
+ // needs the original denominator d (_p) together with the estimate e (_outp).
+ // Overwriting _p with the estimate would refine against the wrong value.
+ float32x4_t _outp = vrecpeq_f32(_p);
+ _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp);
+// _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp);
+ vst1q_f32(ptr, _outp);
+
+ ptr += 4;
+ }
+#endif // __ARM_NEON
+ // scalar tail (and the whole channel when NEON is unavailable)
+ for (; remain>0; remain--)
+ {
+ *ptr = 1.f / (1.f + exp(-*ptr));
+
+ ptr++;
+ }
+ }
+
+ return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/sigmoid_arm.h b/src/layer/arm/sigmoid_arm.h
new file mode 100644
index 00000000000..7fe558db561
--- /dev/null
+++ b/src/layer/arm/sigmoid_arm.h
@@ -0,0 +1,32 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SIGMOID_ARM_H
+#define LAYER_SIGMOID_ARM_H
+
+#include "sigmoid.h"
+
+namespace ncnn {
+
+// ARM-specific variant of the Sigmoid layer: inherits from Sigmoid and overrides both forward paths.
+class Sigmoid_arm : public Sigmoid
+{
+public:
+ // Out-of-place forward pass reading bottom_blob and writing top_blob; returns an int status.
+ virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+ // In-place forward pass that overwrites bottom_top_blob; returns an int status.
+ virtual int forward_inplace(Mat& bottom_top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SIGMOID_ARM_H
diff --git a/src/layer/arm/slice_arm.cpp b/src/layer/arm/slice_arm.cpp
new file mode 100644
index 00000000000..b4a7801b8f0
--- /dev/null
+++ b/src/layer/arm/slice_arm.cpp
@@ -0,0 +1,102 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "slice_arm.h"
+#if __ARM_NEON
+#include
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Slice_arm)
+
+// Splits bottom_blobs[0] into the blobs of top_blobs by copying floats; q tracks the running
+// offset and is advanced by `slice` after each output. Always returns 0 in the visible code.
+// NOTE(review): the loop header over the slices and the setup of slice/size/ptr/outptr are
+// truncated in this view (presumably each output receives `slice` consecutive channels
+// starting at channel q — confirm against the full source).
+int Slice_arm::forward(const std::vector& bottom_blobs, std::vector& top_blobs) const
+{
+ const Mat& bottom_blob = bottom_blobs[0];
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int channels = bottom_blob.c;
+
+ int q = 0;
+ const int* slices_ptr = (const int*)slices.data;
+ for (size_t i=0; i> 3;
+ // remain = floats left over for the scalar loop after nn blocks of 8
+ int remain = size - (nn << 3);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+ // copy 8 floats per iteration
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ float32x4_t _p2 = vld1q_f32(ptr+4);
+ vst1q_f32(outptr, _p);
+ vst1q_f32(outptr+4, _p2);
+
+ ptr += 8;
+ outptr += 8;
+ }
+#else
+ // ARMv7 inline assembly: copy 8 floats (d0-d3) per iteration with post-incremented pointers.
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.f32 {d0-d3}, [%1 :128]! \n"
+ "subs %0, #1 \n"
+ "vst1.f32 {d0-d3}, [%2 :128]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(ptr), // %1
+ "=r"(outptr) // %2
+ : "0"(nn),
+ "1"(ptr),
+ "2"(outptr)
+ // d0-d3 alias q0 AND q1, so both must be declared clobbered; otherwise the compiler
+ // may keep a live value in q1 across the asm statement.
+ : "cc", "memory", "q0", "q1"
+ );
+ }
+#endif // __aarch64__
+#endif // __ARM_NEON
+ // scalar tail (and the whole copy when NEON is unavailable)
+ for (; remain>0; remain--)
+ {
+ *outptr++ = *ptr++;
+ }
+
+ q += slice;
+ }
+
+ return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/slice_arm.h b/src/layer/arm/slice_arm.h
new file mode 100644
index 00000000000..16e97dc8226
--- /dev/null
+++ b/src/layer/arm/slice_arm.h
@@ -0,0 +1,30 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SLICE_ARM_H
+#define LAYER_SLICE_ARM_H
+
+#include "slice.h"
+
+namespace ncnn {
+
+// ARM-specific variant of the Slice layer: inherits from Slice and overrides forward().
+class Slice_arm : public Slice
+{
+public:
+ // Multi-blob forward pass: reads from bottom_blobs and fills top_blobs; returns an int status.
+ virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SLICE_ARM_H
diff --git a/src/layer/arm/softmax_arm.cpp b/src/layer/arm/softmax_arm.cpp
new file mode 100644
index 00000000000..09ed21a7824
--- /dev/null
+++ b/src/layer/arm/softmax_arm.cpp
@@ -0,0 +1,302 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "softmax_arm.h"
+#include
+#include
+
+#if __ARM_NEON
+#include
+#include "neon_mathfun.h"
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Softmax_arm)
+
+int Softmax_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+ // value = exp( value - global max value )
+ // sum all value
+ // value = value / sum
+
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int channels = bottom_blob.c;
+ int size = w * h;
+
+ top_blob.create(w, h, channels);
+ if (top_blob.empty())
+ return -100;
+
+ Mat max;
+ max.create(w, h);
+ if (max.empty())
+ return -100;
+ max.fill(-FLT_MAX);
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ float32x4_t _max = vld1q_f32(maxptr);
+
+ _p = exp_ps(vsubq_f32(_p, _max));
+
+ vst1q_f32(outptr, _p);
+
+ ptr += 4;
+ maxptr += 4;
+ outptr += 4;
+ }
+#endif // __ARM_NEON
+
+ for (; remain>0; remain--)
+ {
+ *outptr = exp(*ptr - *maxptr);
+
+ ptr++;
+ maxptr++;
+ outptr++;
+ }
+ }
+
+ Mat sum;
+ sum.create(w, h);
+ if (sum.empty())
+ return -100;
+ sum.fill(0.f);
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(outptr);
+ float32x4_t _sum = vld1q_f32(sumptr);
+#if __aarch64__
+ _p = vdivq_f32(_p, _sum);
+#else
+ _p = div_ps(_p, _sum);
+#endif // __aarch64__
+ vst1q_f32(outptr, _p);
+
+ outptr += 4;
+ sumptr += 4;
+ }
+#endif // __ARM_NEON
+
+ for (; remain>0; remain--)
+ {
+ *outptr /= *sumptr;
+
+ outptr++;
+ sumptr++;
+ }
+ }
+
+ return 0;
+}
+
+int Softmax_arm::forward_inplace(Mat& bottom_top_blob) const
+{
+ // value = exp( value - global max value )
+ // sum all value
+ // value = value / sum
+
+ int w = bottom_top_blob.w;
+ int h = bottom_top_blob.h;
+ int channels = bottom_top_blob.c;
+ int size = w * h;
+
+ Mat max;
+ max.create(w, h);
+ if (max.empty())
+ return -100;
+ max.fill(-FLT_MAX);
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ float32x4_t _max = vld1q_f32(maxptr);
+
+ _p = exp_ps(vsubq_f32(_p, _max));
+
+ vst1q_f32(ptr, _p);
+
+ ptr += 4;
+ maxptr += 4;
+ }
+#endif // __ARM_NEON
+
+ for (; remain>0; remain--)
+ {
+ *ptr = exp(*ptr - *maxptr);
+
+ ptr++;
+ maxptr++;
+ }
+ }
+
+ Mat sum;
+ sum.create(w, h);
+ if (sum.empty())
+ return -100;
+ sum.fill(0.f);
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ float32x4_t _sum = vld1q_f32(sumptr);
+ _sum = vaddq_f32(_sum, _p);
+ vst1q_f32(sumptr, _sum);
+
+ ptr += 4;
+ sumptr += 4;
+ }
+#endif // __ARM_NEON
+
+ for (; remain>0; remain--)
+ {
+ *sumptr += *ptr;
+
+ ptr++;
+ sumptr++;
+ }
+ }
+
+ #pragma omp parallel for
+ for (int q=0; q> 2;
+ int remain = size - (nn << 2);
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ for (; nn>0; nn--)
+ {
+ float32x4_t _p = vld1q_f32(ptr);
+ float32x4_t _sum = vld1q_f32(sumptr);
+#if __aarch64__
+ _p = vdivq_f32(_p, _sum);
+#else
+ _p = div_ps(_p, _sum);
+#endif // __aarch64__
+ vst1q_f32(ptr, _p);
+
+ ptr += 4;
+ sumptr += 4;
+ }
+#endif // __ARM_NEON
+
+ for (; remain>0; remain--)
+ {
+ *ptr /= *sumptr;
+
+ ptr++;
+ sumptr++;
+ }
+ }
+
+ return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/softmax_arm.h b/src/layer/arm/softmax_arm.h
new file mode 100644
index 00000000000..3eea580ebeb
--- /dev/null
+++ b/src/layer/arm/softmax_arm.h
@@ -0,0 +1,32 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SOFTMAX_ARM_H
+#define LAYER_SOFTMAX_ARM_H
+
+#include "softmax.h"
+
+namespace ncnn {
+
+// ARM variant of the Softmax layer; uses NEON intrinsics when __ARM_NEON is set
+// and falls back to scalar loops otherwise.
+class Softmax_arm : public Softmax
+{
+public:
+ // Out-of-place softmax: creates top_blob with bottom_blob's w/h/c and writes
+ // the result there. Returns 0 on success, -100 when an allocation fails.
+ virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+ // In-place softmax on bottom_top_blob.
+ // Returns 0 on success, -100 when a scratch allocation fails.
+ virtual int forward_inplace(Mat& bottom_top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SOFTMAX_ARM_H
diff --git a/src/layer/batchnorm.cpp b/src/layer/batchnorm.cpp
new file mode 100644
index 00000000000..ab6c3e25c58
--- /dev/null
+++ b/src/layer/batchnorm.cpp
@@ -0,0 +1,227 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "batchnorm.h"
+#include
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(BatchNorm)
+
+// Mark BatchNorm as a single-blob layer that supports in-place execution.
+BatchNorm::BatchNorm()
+{
+    support_inplace = true;
+    one_blob_only = true;
+}
+
+// Nothing is released explicitly here.
+BatchNorm::~BatchNorm()
+{
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+// Parse the text-format parameters: a single integer stored into channels.
+// Returns 0 on success; logs to stderr and returns -1 when the value cannot be read.
+int BatchNorm::load_param(FILE* paramfp)
+{
+    const int nscan = fscanf(paramfp, "%d", &channels);
+    if (nscan == 1)
+        return 0;
+
+    fprintf(stderr, "BatchNorm load_param failed %d\n", nscan);
+    return -1;
+}
+#endif // NCNN_STRING
+// Read the binary-format parameters: one native int stored into channels.
+// Returns 0 on success; logs to stderr and returns -1 on a short read, so a
+// truncated parameter file is reported instead of silently accepted.
+int BatchNorm::load_param_bin(FILE* paramfp)
+{
+    int nread = fread(&channels, sizeof(int), 1, paramfp);
+    if (nread != 1)
+    {
+        fprintf(stderr, "BatchNorm load_param_bin failed %d\n", nread);
+        return -1;
+    }
+
+    return 0;
+}
+
+int BatchNorm::load_model(FILE* binfp)
+{
+ int nread;
+
+ slope_data.create(channels);
+ if (slope_data.empty())
+ return -100;
+ nread = fread(slope_data, channels * sizeof(float), 1, binfp);
+ if (nread != 1)
+ {
+ fprintf(stderr, "BatchNorm read slope_data failed %d\n", nread);
+ return -1;
+ }
+
+ mean_data.create(channels);
+ if (mean_data.empty())
+ return -100;
+ nread = fread(mean_data, channels * sizeof(float), 1, binfp);
+ if (nread != 1)
+ {
+ fprintf(stderr, "BatchNorm read mean_data failed %d\n", nread);
+ return -1;
+ }
+
+ var_data.create(channels);
+ if (var_data.empty())
+ return -100;
+ nread = fread(var_data, channels * sizeof(float), 1, binfp);
+ if (nread != 1)
+ {
+ fprintf(stderr, "BatchNorm read var_data failed %d\n", nread);
+ return -1;
+ }
+
+ bias_data.create(channels);
+ if (bias_data.empty())
+ return -100;
+ nread = fread(bias_data, channels * sizeof(float), 1, binfp);
+ if (nread != 1)
+ {
+ fprintf(stderr, "BatchNorm read bias_data failed %d\n", nread);
+ return -1;
+ }
+
+ a_data.create(channels);
+ if (a_data.empty())
+ return -100;
+ b_data.create(channels);
+ if (b_data.empty())
+ return -100;
+ const float* slope_data_ptr = slope_data;
+ const float* mean_data_ptr = mean_data;
+ const float* var_data_ptr = var_data;
+ const float* bias_data_ptr = bias_data;
+ float* a_data_ptr = a_data;
+ float* b_data_ptr = b_data;
+ for (int i=0; i
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(BNLL)
+
+// Mark BNLL as a single-blob layer that supports in-place execution.
+BNLL::BNLL()
+{
+    support_inplace = true;
+    one_blob_only = true;
+}
+
+// Out-of-place BNLL: writes log(1 + exp(x)) for every element of bottom_blob
+// into a freshly created top_blob of the same w/h/c.
+// Returns 0 on success, -100 when top_blob cannot be allocated.
+int BNLL::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    int size = w * h;
+
+    top_blob.create(w, h, channels);
+    if (top_blob.empty())
+        return -100;
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        const float* ptr = bottom_blob.channel(q);
+        float* outptr = top_blob.channel(q);
+
+        for (int i=0; i<size; i++)
+        {
+            // For positive x use x + log(1 + exp(-x)) so exp() is only ever
+            // called with a non-positive argument.
+            if (ptr[i] > 0)
+                outptr[i] = ptr[i] + log(1.f + exp(-ptr[i]));
+            else
+                outptr[i] = log(1.f + exp(ptr[i]));
+        }
+    }
+
+    return 0;
+}
+
+// In-place BNLL: replaces every element x of bottom_top_blob with log(1 + exp(x)).
+// Always returns 0.
+int BNLL::forward_inplace(Mat& bottom_top_blob) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int channels = bottom_top_blob.c;
+    int size = w * h;
+
+    #pragma omp parallel for
+    for (int q=0; q<channels; q++)
+    {
+        float* ptr = bottom_top_blob.channel(q);
+
+        for (int i=0; i<size; i++)
+        {
+            // For positive x use x + log(1 + exp(-x)) so exp() is only ever
+            // called with a non-positive argument.
+            if (ptr[i] > 0)
+                ptr[i] = ptr[i] + log(1.f + exp(-ptr[i]));
+            else
+                ptr[i] = log(1.f + exp(ptr[i]));
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/bnll.h b/src/layer/bnll.h
new file mode 100644
index 00000000000..490dbdedf5e
--- /dev/null
+++ b/src/layer/bnll.h
@@ -0,0 +1,36 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_BNLL_H
+#define LAYER_BNLL_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// BNLL activation layer: computes log(1 + exp(x)) per element.
+class BNLL : public Layer
+{
+public:
+    // Sets the single-blob and in-place flags.
+    BNLL();
+
+    // Out-of-place evaluation into a newly created top_blob.
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob) const;
+
+    // In-place evaluation on bottom_top_blob.
+    virtual int forward_inplace(Mat& bottom_top_blob) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_BNLL_H
diff --git a/src/layer/concat.cpp b/src/layer/concat.cpp
new file mode 100644
index 00000000000..6c1b9dc89d2
--- /dev/null
+++ b/src/layer/concat.cpp
@@ -0,0 +1,64 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "concat.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Concat)
+
+// Default constructor: the body is empty, so no layer flags are changed here.
+Concat::Concat()
+{
+}
+
+int Concat::forward(const std::vector& bottom_blobs, std::vector& top_blobs) const
+{
+ int w = bottom_blobs[0].w;
+ int h = bottom_blobs[0].h;
+
+ // total channels
+ int top_channels = 0;
+ for (size_t b=0; b& bottom_blobs, std::vector& top_blobs) const;
+
+public:
+};
+
+} // namespace ncnn
+
+#endif // LAYER_CONCAT_H
diff --git a/src/layer/convolution.cpp b/src/layer/convolution.cpp
new file mode 100644
index 00000000000..f638b7f6fdb
--- /dev/null
+++ b/src/layer/convolution.cpp
@@ -0,0 +1,350 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "convolution.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Convolution)
+
+// Mark Convolution as a single-blob layer that does not run in place.
+Convolution::Convolution()
+{
+    support_inplace = false;
+    one_blob_only = true;
+}
+
+// Nothing is released explicitly here.
+Convolution::~Convolution()
+{
+}
+
+#if NCNN_STDIO
+#if NCNN_STRING
+// Parse the text-format parameters, seven integers in this order:
+// num_output, kernel_size, dilation, stride, pad, bias_term, weight_data_size.
+// Returns 0 on success; logs to stderr and returns -1 unless all seven were read.
+int Convolution::load_param(FILE* paramfp)
+{
+    const int nscan = fscanf(paramfp, "%d %d %d %d %d %d %d",
+                             &num_output, &kernel_size, &dilation, &stride, &pad, &bias_term,
+                             &weight_data_size);
+    if (nscan == 7)
+        return 0;
+
+    fprintf(stderr, "Convolution load_param failed %d\n", nscan);
+    return -1;
+}
+#endif // NCNN_STRING
+// Read the binary-format parameters, seven native ints in this order:
+// num_output, kernel_size, dilation, stride, pad, bias_term, weight_data_size.
+// Returns 0 on success; logs to stderr and returns -1 on the first short read,
+// so a truncated parameter file is reported instead of silently accepted.
+int Convolution::load_param_bin(FILE* paramfp)
+{
+    int* const fields[7] = {
+        &num_output, &kernel_size, &dilation, &stride, &pad, &bias_term,
+        &weight_data_size
+    };
+
+    for (int i = 0; i < 7; i++)
+    {
+        int nread = fread(fields[i], sizeof(int), 1, paramfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Convolution load_param_bin failed %d\n", nread);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+// Load the convolution weights (and optional bias) from a binary model stream.
+//
+// The stream begins with a 4-byte flag word that selects the weight encoding:
+//   - tag == 0x01306B47  : half-precision weights, converted via Mat::from_float16
+//   - any non-zero byte  : a 256-entry float table followed by one byte index per weight
+//   - all bytes zero     : raw float weights
+// The half-precision and index payloads are read with a byte count of
+// alignSize(..., 4). When bias_term is set, num_output floats of bias follow.
+//
+// Returns 0 on success, -1 on a short read (logged to stderr), and -100 when
+// an allocation fails.
+int Convolution::load_model(FILE* binfp)
+{
+    int nread;
+
+    union
+    {
+        struct
+        {
+            unsigned char f0;
+            unsigned char f1;
+            unsigned char f2;
+            unsigned char f3;
+        };
+        unsigned int tag;
+    } flag_struct;
+
+    nread = fread(&flag_struct, sizeof(flag_struct), 1, binfp);
+    if (nread != 1)
+    {
+        fprintf(stderr, "Convolution read flag_struct failed %d\n", nread);
+        return -1;
+    }
+
+    // non-zero as soon as any of the four flag bytes is set
+    unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;
+
+    weight_data.create(weight_data_size);
+    if (weight_data.empty())
+        return -100;
+
+    if (flag_struct.tag == 0x01306B47)
+    {
+        // half-precision weight data
+        int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned short), 4);
+        // align_weight_data_size is a byte count; size the buffer in elements
+        std::vector<unsigned short> float16_weights;
+        float16_weights.resize(align_weight_data_size / sizeof(unsigned short));
+        nread = fread(float16_weights.data(), align_weight_data_size, 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Convolution read float16_weights failed %d\n", nread);
+            return -1;
+        }
+
+        weight_data = Mat::from_float16(float16_weights.data(), weight_data_size);
+        if (weight_data.empty())
+            return -100;
+    }
+    else if (flag != 0)
+    {
+        // quantized weight data
+        float quantization_value[256];
+        nread = fread(quantization_value, 256 * sizeof(float), 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Convolution read quantization_value failed %d\n", nread);
+            return -1;
+        }
+
+        int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned char), 4);
+        std::vector<unsigned char> index_array;
+        index_array.resize(align_weight_data_size);
+        nread = fread(index_array.data(), align_weight_data_size, 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Convolution read index_array failed %d\n", nread);
+            return -1;
+        }
+
+        // expand each byte index through the lookup table
+        float* weight_data_ptr = weight_data;
+        for (int i = 0; i < weight_data_size; i++)
+        {
+            weight_data_ptr[i] = quantization_value[ index_array[i] ];
+        }
+    }
+    else if (flag_struct.f0 == 0)
+    {
+        // raw weight data
+        nread = fread(weight_data, weight_data_size * sizeof(float), 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Convolution read weight_data failed %d\n", nread);
+            return -1;
+        }
+    }
+
+    if (bias_term)
+    {
+        bias_data.create(num_output);
+        if (bias_data.empty())
+            return -100;
+        nread = fread(bias_data, num_output * sizeof(float), 1, binfp);
+        if (nread != 1)
+        {
+            fprintf(stderr, "Convolution read bias_data failed %d\n", nread);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+#endif // NCNN_STDIO
+
+// Read the seven integer parameters from an in-memory parameter buffer, in
+// this order: num_output, kernel_size, dilation, stride, pad, bias_term,
+// weight_data_size. mem is advanced by 4 bytes per field. Always returns 0.
+//
+// The values are copied out with memcpy rather than by dereferencing a cast
+// pointer, so the buffer does not have to be int-aligned and const is not
+// cast away.
+int Convolution::load_param(const unsigned char*& mem)
+{
+    int* const fields[7] = {
+        &num_output, &kernel_size, &dilation, &stride, &pad, &bias_term,
+        &weight_data_size
+    };
+
+    for (int i = 0; i < 7; i++)
+    {
+        memcpy(fields[i], mem, sizeof(int));
+        mem += 4;
+    }
+
+    return 0;
+}
+
+// Load the convolution weights (and optional bias) from an in-memory model
+// buffer, advancing mem past everything consumed.
+//
+// A leading 4-byte flag word selects the weight encoding: the 0x01306B47 tag
+// means half-precision data, any other non-zero byte means a 256-entry float
+// table plus byte indices, and all-zero means raw floats. Raw weights and the
+// bias are wrapped as Mat views over mem rather than copied.
+//
+// Returns 0 on success, -100 when an allocation or conversion yields an empty Mat.
+int Convolution::load_model(const unsigned char*& mem)
+{
+    union
+    {
+        struct
+        {
+            unsigned char f0;
+            unsigned char f1;
+            unsigned char f2;
+            unsigned char f3;
+        };
+        unsigned int tag;
+    } header;
+
+    memcpy(&header, mem, sizeof(header));
+    mem += sizeof(header);
+
+    // non-zero as soon as any of the four flag bytes is set
+    const unsigned int byte_sum = header.f0 + header.f1 + header.f2 + header.f3;
+    const bool is_half_precision = (header.tag == 0x01306B47);
+
+    if (is_half_precision)
+    {
+        // half-precision payload: convert, then step over the aligned byte count
+        weight_data = Mat::from_float16((unsigned short*)mem, weight_data_size);
+        mem += alignSize(weight_data_size * sizeof(unsigned short), 4);
+        if (weight_data.empty())
+            return -100;
+    }
+    else if (byte_sum != 0)
+    {
+        // quantized payload: lookup table first, then the per-weight indices
+        const float* table = (const float*)mem;
+        mem += 256 * sizeof(float);
+
+        const unsigned char* indices = (const unsigned char*)mem;
+        mem += alignSize(weight_data_size * sizeof(unsigned char), 4);
+
+        weight_data.create(weight_data_size);
+        if (weight_data.empty())
+            return -100;
+
+        float* dst = weight_data;
+        for (int i = 0; i < weight_data_size; i++)
+        {
+            dst[i] = table[ indices[i] ];
+        }
+    }
+    else if (header.f0 == 0)
+    {
+        // raw float payload, referenced in place
+        weight_data = Mat(weight_data_size, (float*)mem);
+        mem += weight_data_size * sizeof(float);
+    }
+
+    if (bias_term)
+    {
+        bias_data = Mat(num_output, (float*)mem);
+        mem += num_output * sizeof(float);
+    }
+
+    return 0;
+}
+
+int Convolution::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+ // convolv with NxN kernel
+ // value = value + bias
+
+ int w = bottom_blob.w;
+ int h = bottom_blob.h;
+ int channels = bottom_blob.c;
+
+// fprintf(stderr, "Convolution input %d x %d pad = %d ksize=%d stride=%d\n", w, h, pad, kernel_size, stride);
+
+ Mat bottom_blob_bordered = bottom_blob;
+ if (pad > 0)
+ {
+ copy_make_border(bottom_blob, bottom_blob_bordered, pad, pad, pad, pad, BORDER_CONSTANT, 0.f);
+ if (bottom_blob_bordered.empty())
+ return -100;
+
+ w = bottom_blob_bordered.w;
+ h = bottom_blob_bordered.h;
+ }
+
+ const int kernel_extent = dilation * (kernel_size - 1) + 1;
+
+ int outw = (w - kernel_extent) / stride + 1;
+ int outh = (h - kernel_extent) / stride + 1;
+
+ top_blob.create(outw, outh, num_output);
+ if (top_blob.empty())
+ return -100;
+
+ const int maxk = kernel_size * kernel_size;
+
+ // kernel offsets
+ std::vector