diff --git a/.gitignore b/.gitignore index 941067c..4ea59ea 100644 --- a/.gitignore +++ b/.gitignore @@ -48,3 +48,6 @@ sign/ sign.sh compat/curl-for-windows/ + +bin/ +*.log diff --git a/Android.mk b/Android.mk deleted file mode 100644 index d7d90ba..0000000 --- a/Android.mk +++ /dev/null @@ -1,77 +0,0 @@ -################################################################ -# Sample Android repo Makefile, used to test arm on the Tegra K1 -################################################################ - -cpuminer-src := $(call my-dir) - -LOCAL_PATH := $(cpuminer-src) -include $(CLEAR_VARS) - -LOCAL_MODULE=cpuminer-jansson -LOCAL_MODULE_TAGS=optional - -define all-c-files-under -$(patsubst ./%,%, \ - $(shell cd $(LOCAL_PATH) ; \ - find -L $(1) -name "*.c" -and -not -name ".*") \ - ) -endef - -LOCAL_SRC_FILES := $(call all-c-files-under,compat/jansson) -LOCAL_C_INCLUDES := $(cpuminer-src)/compat/jansson - -include $(BUILD_STATIC_LIBRARY) - -################################################################ - - -LOCAL_PATH := $(cpuminer-src) -include $(CLEAR_VARS) - -LOCAL_MODULE=cpuminer -LOCAL_MODULE_TAGS=optional -LOCAL_MODULE_CLASS := UTILITY_EXECUTABLES -LOCAL_MODULE_PATH := $(PRODUCT_OUT)/utilities -LOCAL_MODULE_STEM := $(LOCAL_MODULE) - -LOCAL_C_INCLUDES := $(cpuminer-src)/compat/bionic \ - $(cpuminer-src)/compat/jansson \ - $(TARGET_OUT_INTERMEDIATES)/include/libcurl \ - external/openssl/include \ - -LOCAL_CFLAGS := -std=c99 -Wno-pointer-sign -Wno-missing-field-initializers \ - -Wno-unused-parameter #-DNOASM -LOCAL_CFLAGS += -DVERSION=\"1.2\" - -sph_files:=$(call all-c-files-under,sha3) - -LOCAL_SRC_FILES=\ - cpu-miner.c util.c \ - api.c sysinfos.c \ - $(call all-c-files-under,algo) \ - $(filter-out sha3/md_helper.c,$(sph_files)) \ - $(call all-c-files-under,crypto) \ - $(call all-c-files-under,lyra2) \ - asm/sha2-$(TARGET_ARCH).S \ - asm/scrypt-$(TARGET_ARCH).S \ - asm/neoscrypt_asm.S - -LOCAL_STATIC_LIBRARIES := libm cpuminer-jansson -LOCAL_STATIC_LIBRARIES += 
libz libcrypto_static -LOCAL_STATIC_LIBRARIES += libssl_static - -# Require curl config changes and an addional -# module definition in external/curl(_static?) -#LOCAL_FORCE_STATIC_EXECUTABLE := true - -ifeq ($(LOCAL_FORCE_STATIC_EXECUTABLE),true) -LOCAL_CFLAGS += -DCURL_STATICLIB # -DHTTP_ONLY -LOCAL_STATIC_LIBRARIES += libcurl_static libc -else -LOCAL_SHARED_LIBRARIES := libssl libcrypto -LOCAL_SHARED_LIBRARIES += libcurl -#LOCAL_STATIC_LIBRARIES += libcurl_static -endif - -include $(BUILD_EXECUTABLE) - diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index c191bd3..0000000 --- a/Dockerfile +++ /dev/null @@ -1,35 +0,0 @@ -# -# Dockerfile for cpuminer-opt -# usage: docker build -t cpuminer-opt:latest . -# run: docker run -it --rm cpuminer-opt:latest [ARGS] -# ex: docker run -it --rm cpuminer-opt:latest -a cryptonight -o cryptonight.eu.nicehash.com:3355 -u 1MiningDW2GKzf4VQfmp4q2XoUvR6iy6PD.worker1 -p x -t 3 -# - -# Build -FROM ubuntu:16.04 as builder - -RUN apt-get update \ - && apt-get install -y \ - build-essential \ - libssl-dev \ - libgmp-dev \ - libcurl4-openssl-dev \ - libjansson-dev \ - automake \ - && rm -rf /var/lib/apt/lists/* - -COPY . /app/ -RUN cd /app/ && ./build.sh - -# App -FROM ubuntu:16.04 - -RUN apt-get update \ - && apt-get install -y \ - libcurl3 \ - libjansson4 \ - && rm -rf /var/lib/apt/lists/* - -COPY --from=builder /app/cpuminer . 
-ENTRYPOINT ["./cpuminer"] -CMD ["-h"] diff --git a/Makefile.am b/Makefile.am index 03e207e..9393b08 100644 --- a/Makefile.am +++ b/Makefile.am @@ -15,276 +15,61 @@ bin_PROGRAMS = cpuminer dist_man_MANS = cpuminer.1 + + + cpuminer_SOURCES = \ cpu-miner.c \ util.c \ - uint256.cpp \ api.c \ sysinfos.c \ - algo-gate-api.c\ + algo-gate-api.c \ crypto/oaes_lib.c \ crypto/c_keccak.c \ crypto/c_groestl.c \ crypto/c_blake256.c \ crypto/c_jh.c \ crypto/magimath.cpp \ - algo/argon2/argon2a/argon2a.c \ - algo/argon2/argon2a/ar2/argon2.c \ - algo/argon2/argon2a/ar2/opt.c \ - algo/argon2/argon2a/ar2/cores.c \ - algo/argon2/argon2a/ar2/ar2-scrypt-jane.c \ - algo/argon2/argon2a/ar2/blake2b.c \ - algo/argon2/argon2d/argon2d-gate.c \ - algo/argon2/argon2d/blake2/blake2b.c \ - algo/argon2/argon2d/argon2d/argon2.c \ - algo/argon2/argon2d/argon2d/core.c \ - algo/argon2/argon2d/argon2d/opt.c \ - algo/argon2/argon2d/argon2d/argon2d_thread.c \ - algo/argon2/argon2d/argon2d/encoding.c \ algo/blake/sph_blake.c \ - algo/blake/blake256-hash-4way.c \ - algo/blake/blake512-hash-4way.c \ - algo/blake/blake-gate.c \ - algo/blake/blake.c \ - algo/blake/blake-4way.c \ - algo/blake/sph_blake2b.c \ - algo/blake/blake2b.c \ - algo/blake/sph-blake2s.c \ - algo/blake/blake2s-hash-4way.c \ - algo/blake/blake2s.c \ - algo/blake/blake2s-gate.c \ - algo/blake/blake2s-4way.c \ - algo/blake/blakecoin-gate.c \ - algo/blake/mod_blakecoin.c \ - algo/blake/blakecoin.c \ - algo/blake/blakecoin-4way.c \ - algo/blake/decred-gate.c \ - algo/blake/decred.c \ - algo/blake/decred-4way.c \ - algo/blake/pentablake-gate.c \ - algo/blake/pentablake-4way.c \ - algo/blake/pentablake.c \ algo/bmw/sph_bmw.c \ - algo/bmw/bmw256-hash-4way.c \ - algo/bmw/bmw512-hash-4way.c \ - algo/bmw/bmw256.c \ - algo/bmw/bmw512-gate.c \ - algo/bmw/bmw512.c \ - algo/bmw/bmw512-4way.c \ algo/cubehash/sph_cubehash.c \ - algo/cubehash/cubehash_sse2.c\ - algo/cubehash/cube-hash-2way.c \ algo/echo/sph_echo.c \ algo/echo/aes_ni/hash.c\ - 
algo/gost/sph_gost.c \ algo/groestl/sph_groestl.c \ - algo/groestl/groestl.c \ - algo/groestl/myrgr-gate.c \ - algo/groestl/myrgr-4way.c \ - algo/groestl/myr-groestl.c \ - algo/groestl/aes_ni/hash-groestl.c \ - algo/groestl/aes_ni/hash-groestl256.c \ algo/fugue/sph_fugue.c \ algo/hamsi/sph_hamsi.c \ - algo/hamsi/hamsi-hash-4way.c \ - algo/haval/haval.c \ - algo/haval/haval-hash-4way.c \ - algo/heavy/sph_hefty1.c \ - algo/heavy/heavy.c \ - algo/heavy/bastion.c \ - algo/hodl/aes.c \ - algo/hodl/hodl-gate.c \ - algo/hodl/hodl-wolf.c \ - algo/hodl/sha512_avx.c \ - algo/hodl/sha512_avx2.c \ algo/jh/sph_jh.c \ - algo/jh/jh-hash-4way.c \ - algo/jh/jha-gate.c \ - algo/jh/jha-4way.c \ - algo/jh/jha.c \ algo/keccak/sph_keccak.c \ - algo/keccak/keccak.c\ - algo/keccak/keccak-hash-4way.c \ - algo/keccak/keccak-4way.c\ + algo/keccak/keccak.c \ algo/keccak/keccak-gate.c \ - algo/keccak/sse2/keccak.c \ + algo/keccak/keccak-4way.c \ + algo/keccak/keccak-hash-4way.c \ + algo/keccak/sha3d.c \ + algo/keccak/sha3d-4way.c \ algo/luffa/sph_luffa.c \ - algo/luffa/luffa.c \ - algo/luffa/luffa_for_sse2.c \ - algo/luffa/luffa-hash-2way.c \ algo/lyra2/lyra2.c \ algo/lyra2/sponge.c \ - algo/lyra2/lyra2-gate.c \ - algo/lyra2/lyra2rev2.c \ - algo/lyra2/lyra2rev2-4way.c \ - algo/lyra2/lyra2rev3.c \ - algo/lyra2/lyra2rev3-4way.c \ - algo/lyra2/lyra2re.c \ - algo/lyra2/lyra2z.c \ - algo/lyra2/lyra2z-4way.c \ - algo/lyra2/lyra2z330.c \ - algo/lyra2/lyra2h.c \ - algo/lyra2/lyra2h-4way.c \ - algo/lyra2/allium-4way.c \ - algo/lyra2/allium.c \ - algo/lyra2/phi2-4way.c \ - algo/lyra2/phi2.c \ - algo/m7m.c \ - algo/nist5/nist5-gate.c \ - algo/nist5/nist5-4way.c \ - algo/nist5/nist5.c \ - algo/nist5/zr5.c \ - algo/panama/sph_panama.c \ - algo/radiogatun/sph_radiogatun.c \ - algo/quark/quark-gate.c \ - algo/quark/quark.c \ - algo/quark/quark-4way.c \ - algo/quark/anime-gate.c \ - algo/quark/anime.c \ - algo/quark/anime-4way.c \ - algo/quark/hmq1725-gate.c \ - algo/quark/hmq1725-4way.c \ - 
algo/quark/hmq1725.c \ - algo/qubit/qubit-gate.c \ - algo/qubit/qubit.c \ - algo/qubit/qubit-2way.c \ - algo/qubit/deep-gate.c \ - algo/qubit/deep-2way.c \ - algo/qubit/deep.c \ - algo/ripemd/sph_ripemd.c \ - algo/ripemd/ripemd-hash-4way.c \ - algo/ripemd/lbry-gate.c \ - algo/ripemd/lbry.c \ - algo/ripemd/lbry-4way.c \ - algo/scrypt/scrypt.c \ - algo/scrypt/neoscrypt.c \ - algo/scrypt/pluck.c \ - algo/scryptjane/scrypt-jane.c \ algo/sha/sph_sha2.c \ algo/sha/sph_sha2big.c \ - algo/sha/sha2-hash-4way.c \ - algo/sha/sha256_hash_11way.c \ algo/sha/sha2.c \ - algo/sha/sha256t-gate.c \ - algo/sha/sha256t-4way.c \ - algo/sha/sha256t.c \ - algo/sha/sha256q-4way.c \ - algo/sha/sha256q.c \ + algo/sha/sha256-hash-opt.c \ + algo/sha/aes_helper.c \ algo/shabal/sph_shabal.c \ - algo/shabal/shabal-hash-4way.c \ algo/shavite/sph_shavite.c \ algo/shavite/sph-shavite-aesni.c \ - algo/shavite/shavite-hash-2way.c \ - algo/shavite/shavite.c \ algo/gr/cryptonote/crypto/aesb.c \ algo/gr/cryptonote/crypto/hash.c \ algo/gr/cryptonote/crypto/c_skein.c \ - algo/gr/cryptonote/cryptonight.c \ algo/gr/cryptonote/cryptonight_dark.c \ algo/gr/cryptonote/cryptonight_dark_lite.c \ algo/gr/cryptonote/cryptonight_fast.c \ algo/gr/cryptonote/cryptonight_lite.c \ - algo/gr/cryptonote/cryptonight_soft_shell.c \ algo/gr/cryptonote/cryptonight_turtle.c \ algo/gr/cryptonote/cryptonight_turtle_lite.c \ algo/gr/gr-gate.c \ algo/simd/sph_simd.c \ - algo/simd/nist.c \ - algo/simd/vector.c \ - algo/simd/simd-hash-2way.c \ algo/skein/sph_skein.c \ - algo/skein/skein-hash-4way.c \ - algo/skein/skein.c \ - algo/skein/skein-4way.c \ - algo/skein/skein-gate.c \ - algo/skein/skein2.c \ - algo/skein/skein2-4way.c \ - algo/skein/skein2-gate.c \ - algo/sm3/sm3.c \ - algo/sm3/sm3-hash-4way.c \ - algo/tiger/sph_tiger.c \ - algo/whirlpool/sph_whirlpool.c \ - algo/whirlpool/whirlpool-hash-4way.c \ - algo/whirlpool/whirlpool-gate.c \ - algo/whirlpool/whirlpool.c \ - algo/whirlpool/whirlpoolx.c \ - algo/x11/x11-gate.c \ - 
algo/x11/x11.c \ - algo/x11/x11-4way.c \ - algo/x11/x11gost-gate.c \ - algo/x11/x11gost.c \ - algo/x11/x11gost-4way.c \ - algo/x11/c11-gate.c \ - algo/x11/c11.c \ - algo/x11/c11-4way.c \ - algo/x11/tribus-gate.c \ - algo/x11/tribus.c \ - algo/x11/tribus-4way.c \ - algo/x11/timetravel-gate.c \ - algo/x11/timetravel.c \ - algo/x11/timetravel-4way.c \ - algo/x11/timetravel10-gate.c \ - algo/x11/timetravel10.c \ - algo/x11/timetravel10-4way.c \ - algo/x11/fresh.c \ - algo/x11/x11evo.c \ - algo/x11/x11evo-4way.c \ - algo/x11/x11evo-gate.c \ - algo/x12/x12-gate.c \ - algo/x12/x12.c \ - algo/x12/x12-4way.c \ - algo/x13/x13-gate.c \ - algo/x13/x13.c \ - algo/x13/x13-4way.c \ - algo/x13/x13sm3-gate.c \ - algo/x13/x13sm3.c \ - algo/x13/x13sm3-4way.c \ - algo/x13/phi1612-gate.c \ - algo/x13/phi1612.c \ - algo/x13/phi1612-4way.c \ - algo/x13/skunk-gate.c \ - algo/x13/skunk-4way.c \ - algo/x13/skunk.c \ - algo/x13/drop.c \ - algo/x13/x13bcd-4way.c \ - algo/x13/x13bcd.c \ - algo/x14/x14-gate.c \ - algo/x14/x14.c \ - algo/x14/x14-4way.c \ - algo/x14/veltor-gate.c \ - algo/x14/veltor.c \ - algo/x14/veltor-4way.c \ - algo/x14/polytimos-gate.c \ - algo/x14/polytimos.c \ - algo/x14/polytimos-4way.c \ - algo/x14/axiom.c \ - algo/x15/x15-gate.c \ - algo/x15/x15.c \ - algo/x15/x15-4way.c \ - algo/x16/x16r-gate.c \ - algo/x16/x16r.c \ - algo/x16/x16r-4way.c \ - algo/x16/x16rt.c \ - algo/x16/x16rt-4way.c \ - algo/x16/hex.c \ - algo/x16/x21s-4way.c \ - algo/x16/x21s.c \ - algo/x17/x17-gate.c \ - algo/x17/x17.c \ - algo/x17/x17-4way.c \ - algo/x17/xevan-gate.c \ - algo/x17/xevan.c \ - algo/x17/xevan-4way.c \ - algo/x17/sonoa-gate.c \ - algo/x17/sonoa-4way.c \ - algo/x17/sonoa.c \ - algo/x20/x20r.c \ - algo/yescrypt/yescrypt.c \ - algo/yescrypt/sha256_Y.c \ - algo/yescrypt/yescrypt-best.c \ - algo/yespower/yespower.c \ - algo/yespower/sha256_p.c \ - algo/yespower/yespower-opt.c + algo/whirlpool/sph_whirlpool.c disable_flags = diff --git a/README.md b/README.md index eba3b86..460a675 100644 
--- a/README.md +++ b/README.md @@ -1,156 +1,107 @@ -cpuminer-opt is a fork of cpuminer-multi by TPruvot with optimizations -imported from other miners developped by lucas Jones, djm34, Wolf0, pooler, -Jeff garzik, ig0tik3d, elmad, palmd, and Optiminer, with additional -optimizations by Jay D Dee. +This version was created to support ARMv7 (ARM) and ARMv8 (Aarch64). +Code was stripped from any unnecessary algorithms and currently only +supports Ghost Rider (gr, Raptoreum) algorithm. +Algorithm removal was done to minimize size and reduce compilation time +as it **should** be compiled locally to achieve the best performance possible. +It also supports compilation with x86_64 architecture processors. -All of the code is believed to be open and free. If anyone has a -claim to any of it post your case in the cpuminer-opt Bitcoin Talk forum -or by email. -Miner programs are often flagged as malware by antivirus programs. This is -a false positive, they are flagged simply because they are cryptocurrency -miners. The source code is open for anyone to inspect. If you don't trust -the software, don't use it. +Requirements +------------ -https://bitcointalk.org/index.php?topic=1326803.0 +1. 64 or 32 bit Linux OS. Raspbian (Debian) is known to work and have all dependencies in their repositories. Others may work but may require more effort. -mailto://jayddee246@gmail.com +2. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or RPC getwork using http:// or https://. GBT is YMMV. -See file RELEASE_NOTES for change log and INSTALL_LINUX or INSTALL_WINDOWS -for compile instructions. +Supported Algorithms +-------------------- -Requirements ------------- + gr Gr Hash (RTM) + +Changes +-------------------- -1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes -Intel Core2 and newer and AMD equivalents. In order to take advantage of AES_NI -optimizations a CPU with AES_NI is required. 
This includes Intel Westbridge -and newer and AMD equivalents. Further optimizations are available on some -algoritms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively. +Due to missing instructions such as SSE2 on ARM architecture processors some +code had to be modified (mostly includes). -Older CPUs are supported by cpuminer-multi by TPruvot but at reduced -performance. +sse2neon (https://github.com/DLTcollab/sse2neon) was used as an alternative +and easy solution to port required functionality and make it work on ARM. -ARM CPUs are not supported. +Main modifications compared to the original release: -2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and -Centos, are known to work and have all dependencies in their repositories. -Others may work but may require more effort. Older versions such as Centos 6 -don't work due to missing features. -64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries. +simd-utils.h - use sse2neon. Disable most of the includes. +simd-utils/simd-int.h - Exclude \_\_int128 on ARM. -MacOS, OSx and Android are not supported. +algo/lyra2/lyra2.c -3. Stratum pool. Some algos may work wallet mining using getwork or GBT. YMMV. +algo/lyra2/sponge.c - use sse2neon -Supported Algorithms +util.c - Remove mentions and variables used by X16, PHI2 and LBRY + +miner.h - Remove mention of other algorithms. + +algo-gate-api.c - Remove mention of other algorithms. + +cpu-miner.c - Remove requirement for SSE2 check. + +Makefile.am - Remove source files for unused algorithms. 
+ +Install -------------------- - allium Garlicoin - anime Animecoin - argon2 Argon2 coin (AR2) - argon2d250 argon2d-crds, Credits (CRDS) - argon2d500 argon2d-dyn, Dynamic (DYN) - argon2d4096 argon2d-uis, Unitus, (UIS) - axiom Shabal-256 MemoHash - bastion - blake Blake-256 (SFR) - blakecoin blake256r8 - blake2s Blake-2 S - bmw BMW 256 - bmw512 BMW 512 - c11 Chaincoin - decred - deep Deepcoin (DCN) - dmd-gr Diamond-Groestl - drop Dropcoin - fresh Fresh - groestl Groestl coin - heavy Heavy - hex x16r-hex - hmq1725 Espers - hodl Hodlcoin - jha Jackpotcoin - keccak Maxcoin - keccakc Creative coin - lbry LBC, LBRY Credits - luffa Luffa - lyra2h Hppcoin - lyra2re lyra2 - lyra2rev2 lyra2v2 - lyra2rev3 lyrav2v3, Vertcoin - lyra2z - lyra2z330 Lyra2 330 rows, Zoin (ZOI) - m7m Magi (XMG) - myr-gr Myriad-Groestl - neoscrypt NeoScrypt(128, 2, 1) - nist5 Nist5 - pentablake Pentablake - phi1612 phi, LUX coin (original algo) - phi2 LUX coin (new algo) - pluck Pluck:128 (Supcoin) - polytimos Ninja - quark Quark - qubit Qubit - scrypt scrypt(1024, 1, 1) (default) - scrypt:N scrypt(N, 1, 1) - scryptjane:nf - sha256d Double SHA-256 - sha256q Quad SHA-256, Pyrite (PYE) - sha256t Triple SHA-256, Onecoin (OC) - shavite3 Shavite3 - skein Skein+Sha (Skeincoin) - skein2 Double Skein (Woodcoin) - skunk Signatum (SIGT) - sonoa Sono - timetravel Machinecoin (MAC) - timetravel10 Bitcore - tribus Denarius (DNR) - vanilla blake256r8vnl (VCash) - veltor (VLT) - whirlpool - whirlpoolx - x11 Dash - x11evo Revolvercoin - x11gost sib (SibCoin) - x12 Galaxie Cash (GCH) - x13 X13 - x13bcd bcd - x13sm3 hsr (Hshare) - x14 X14 - x15 X15 - x16r Ravencoin (RVN) - x16rt Gincoin (GIN) - x16rt_veil Veil (VEIL) - x16s Pigeoncoin (PGN) - x17 - x21s - xevan Bitsend (BSD) - yescrypt Globalboost-Y (BSTY) - yescryptr8 BitZeny (ZNY) - yescryptr16 Eli - yescryptr32 WAVI - yespower Cryply - yespowerr16 Yenten (YTN) - zr5 Ziftr - -Errata ------- - -Cryptonight and variants are no longer supported, use another miner. 
- -Neoscrypt crashes on Windows, use legacy version. - -AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not -supported by cpuminer-opt due to an incompatible implementation of SSE2 on -these CPUs. Some algos may crash the miner with an invalid instruction. -Users are recommended to use an unoptimized miner such as cpuminer-multi. - -cpuminer-opt does not work mining Decred algo at Nicehash and produces -only "invalid extranonce2 size" rejects. - -Benchmark testing does not work for x11evo. +It is HIGHLY recommended to compile the code on the local machine. +The most important information can be found in **INSTALL_LINUX** file. + +Example for Raspbian: +1. Install depenencies: +`sudo apt-get update && sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake zlib1g-dev texinfo git` +2. Get a repository. Either zipped file or `git clone https://github.com/michal-zurkowski/cpuminer-gr` +3. Build: The basic process is inside `build.sh` file and should work by itself as it contains simple logic about selecting proper configuration depending on the system architecture. +``` +./build.sh +``` + +Tested Systems +------------ +``` +Hardware System Notes +Raspberry Pi 3 Raspbian 32bit system. +Raspberry Pi 4 Raspbian See Troubleshooting section. Compiled as ARMv7. +``` + +Troubleshooting +------------ +Raspberry Pi 4 Raspbian +Problems with alignments can occur and give `Bus error`. Posible solutions: +1. To fix it run followinf command: `sudo echo "0" > /proc/cpu/alignment` +2. Switch from SD card to USB drive. + + +Note from Jay D Dee. repository +------------ +https://github.com/JayDDee/cpuminer-opt +cpuminer-opt is a fork of cpuminer-multi by TPruvot with optimizations imported from other miners developped by lucas Jones, djm34, Wolf0, pooler, Jeff garzik, ig0tik3d, elmad, palmd, and Optiminer, with additional optimizations by Jay D Dee. + +All of the code is believed to be open and free. 
If anyone has a claim to any of it post your case in the cpuminer-opt Bitcoin Talk forum or by email. + +Miner programs are often flagged as malware by antivirus programs. This is a false positive, they are flagged simply because they are cryptocurrency miners. The source code is open for anyone to inspect. If you don't trust the software, don't use it. + +New thread: + +https://bitcointalk.org/index.php?topic=5226770.msg53865575#msg53865575 + +Old thread: + +https://bitcointalk.org/index.php?topic=1326803.0 + +mailto://jayddee246@gmail.com + +This note is to confirm that bitcointalk users JayDDee and joblo are the same person. + +I created a new BCT user JayDDee to match my github user id. The old thread has been locked but still contains useful information for reading. +See file RELEASE_NOTES for change log and INSTALL_LINUX or INSTALL_WINDOWS for compile instructions. + Bugs ---- @@ -168,11 +119,7 @@ Donations cpuminer-opt has no fees of any kind but donations are accepted. - BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT - ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0 - LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8 - BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ - BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ +BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT Happy mining! diff --git a/aclocal.m4 b/aclocal.m4 index 0e40733..516b455 100644 --- a/aclocal.m4 +++ b/aclocal.m4 @@ -1,6 +1,6 @@ -# generated automatically by aclocal 1.15.1 -*- Autoconf -*- +# generated automatically by aclocal 1.16.3 -*- Autoconf -*- -# Copyright (C) 1996-2017 Free Software Foundation, Inc. +# Copyright (C) 1996-2020 Free Software Foundation, Inc. # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -20,7 +20,7 @@ You have another version of autoconf. It may work, but is not guaranteed to. If you have problems, you may need to regenerate the build system entirely. 
To do so, use the procedure documented by the package, typically 'autoreconf'.])]) -# Copyright (C) 2002-2017 Free Software Foundation, Inc. +# Copyright (C) 2002-2020 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -32,10 +32,10 @@ To do so, use the procedure documented by the package, typically 'autoreconf'.]) # generated from the m4 files accompanying Automake X.Y. # (This private macro should not be called outside this file.) AC_DEFUN([AM_AUTOMAKE_VERSION], -[am__api_version='1.15' +[am__api_version='1.16' dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to dnl require some minimum version. Point them to the right macro. -m4_if([$1], [1.15.1], [], +m4_if([$1], [1.16.3], [], [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl ]) @@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], []) # Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced. # This function is AC_REQUIREd by AM_INIT_AUTOMAKE. AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION], -[AM_AUTOMAKE_VERSION([1.15.1])dnl +[AM_AUTOMAKE_VERSION([1.16.3])dnl m4_ifndef([AC_AUTOCONF_VERSION], [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))]) # Figure out how to run the assembler. -*- Autoconf -*- -# Copyright (C) 2001-2017 Free Software Foundation, Inc. +# Copyright (C) 2001-2020 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -78,7 +78,7 @@ _AM_IF_OPTION([no-dependencies],, [_AM_DEPENDENCIES([CCAS])])dnl # AM_AUX_DIR_EXPAND -*- Autoconf -*- -# Copyright (C) 2001-2017 Free Software Foundation, Inc. +# Copyright (C) 2001-2020 Free Software Foundation, Inc. 
# # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -130,7 +130,7 @@ am_aux_dir=`cd "$ac_aux_dir" && pwd` # AM_CONDITIONAL -*- Autoconf -*- -# Copyright (C) 1997-2017 Free Software Foundation, Inc. +# Copyright (C) 1997-2020 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -161,7 +161,7 @@ AC_CONFIG_COMMANDS_PRE( Usually this means the macro was only invoked conditionally.]]) fi])]) -# Copyright (C) 1999-2017 Free Software Foundation, Inc. +# Copyright (C) 1999-2020 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -352,13 +352,12 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl # Generate code to set up dependency tracking. -*- Autoconf -*- -# Copyright (C) 1999-2017 Free Software Foundation, Inc. +# Copyright (C) 1999-2020 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. - # _AM_OUTPUT_DEPENDENCY_COMMANDS # ------------------------------ AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS], @@ -366,49 +365,43 @@ AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS], # Older Autoconf quotes --file arguments for eval, but not when files # are listed without --file. Let's play safe and only enable the eval # if we detect the quoting. - case $CONFIG_FILES in - *\'*) eval set x "$CONFIG_FILES" ;; - *) set x $CONFIG_FILES ;; - esac + # TODO: see whether this extra hack can be removed once we start + # requiring Autoconf 2.70 or later. + AS_CASE([$CONFIG_FILES], + [*\'*], [eval set x "$CONFIG_FILES"], + [*], [set x $CONFIG_FILES]) shift - for mf + # Used to flag and report bootstrapping failures. 
+ am_rc=0 + for am_mf do # Strip MF so we end up with the name of the file. - mf=`echo "$mf" | sed -e 's/:.*$//'` - # Check whether this is an Automake generated Makefile or not. - # We used to match only the files named 'Makefile.in', but - # some people rename them; so instead we look at the file content. - # Grep'ing the first line is not enough: some people post-process - # each Makefile.in and add a new line on top of each file to say so. - # Grep'ing the whole file is not good either: AIX grep has a line + am_mf=`AS_ECHO(["$am_mf"]) | sed -e 's/:.*$//'` + # Check whether this is an Automake generated Makefile which includes + # dependency-tracking related rules and includes. + # Grep'ing the whole file directly is not great: AIX grep has a line # limit of 2048, but all sed's we know have understand at least 4000. - if sed -n 's,^#.*generated by automake.*,X,p' "$mf" | grep X >/dev/null 2>&1; then - dirpart=`AS_DIRNAME("$mf")` - else - continue - fi - # Extract the definition of DEPDIR, am__include, and am__quote - # from the Makefile without running 'make'. - DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"` - test -z "$DEPDIR" && continue - am__include=`sed -n 's/^am__include = //p' < "$mf"` - test -z "$am__include" && continue - am__quote=`sed -n 's/^am__quote = //p' < "$mf"` - # Find all dependency output files, they are included files with - # $(DEPDIR) in their names. We invoke sed twice because it is the - # simplest approach to changing $(DEPDIR) to its actual value in the - # expansion. - for file in `sed -n " - s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \ - sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g'`; do - # Make sure the directory exists. 
- test -f "$dirpart/$file" && continue - fdir=`AS_DIRNAME(["$file"])` - AS_MKDIR_P([$dirpart/$fdir]) - # echo "creating $dirpart/$file" - echo '# dummy' > "$dirpart/$file" - done + sed -n 's,^am--depfiles:.*,X,p' "$am_mf" | grep X >/dev/null 2>&1 \ + || continue + am_dirpart=`AS_DIRNAME(["$am_mf"])` + am_filepart=`AS_BASENAME(["$am_mf"])` + AM_RUN_LOG([cd "$am_dirpart" \ + && sed -e '/# am--include-marker/d' "$am_filepart" \ + | $MAKE -f - am--depfiles]) || am_rc=$? done + if test $am_rc -ne 0; then + AC_MSG_FAILURE([Something went wrong bootstrapping makefile fragments + for automatic dependency tracking. If GNU make was not used, consider + re-running the configure script with MAKE="gmake" (or whatever is + necessary). You can also try re-running configure with the + '--disable-dependency-tracking' option to at least be able to build + the package (albeit without support for automatic dependency tracking).]) + fi + AS_UNSET([am_dirpart]) + AS_UNSET([am_filepart]) + AS_UNSET([am_mf]) + AS_UNSET([am_rc]) + rm -f conftest-deps.mk } ])# _AM_OUTPUT_DEPENDENCY_COMMANDS @@ -417,18 +410,17 @@ AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS], # ----------------------------- # This macro should only be invoked once -- use via AC_REQUIRE. # -# This code is only required when automatic dependency tracking -# is enabled. FIXME. This creates each '.P' file that we will -# need in order to bootstrap the dependency handling code. +# This code is only required when automatic dependency tracking is enabled. +# This creates each '.Po' and '.Plo' makefile fragment that we'll need in +# order to bootstrap the dependency handling code. AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS], [AC_CONFIG_COMMANDS([depfiles], [test x"$AMDEP_TRUE" != x"" || _AM_OUTPUT_DEPENDENCY_COMMANDS], - [AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir"]) -]) + [AMDEP_TRUE="$AMDEP_TRUE" MAKE="${MAKE-make}"])]) # Do all the work for Automake. -*- Autoconf -*- -# Copyright (C) 1996-2017 Free Software Foundation, Inc. 
+# Copyright (C) 1996-2020 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -515,8 +507,8 @@ AC_REQUIRE([AM_PROG_INSTALL_STRIP])dnl AC_REQUIRE([AC_PROG_MKDIR_P])dnl # For better backward compatibility. To be removed once Automake 1.9.x # dies out for good. For more background, see: -# -# +# +# AC_SUBST([mkdir_p], ['$(MKDIR_P)']) # We need awk for the "check" target (and possibly the TAP driver). The # system "awk" is bad on some platforms. @@ -583,7 +575,7 @@ END Aborting the configuration process, to ensure you take notice of the issue. You can download and install GNU coreutils to get an 'rm' implementation -that behaves properly: . +that behaves properly: . If you want to complete the configuration process using your problematic 'rm' anyway, export the environment variable ACCEPT_INFERIOR_RM_PROGRAM @@ -625,7 +617,7 @@ for _am_header in $config_headers :; do done echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count]) -# Copyright (C) 2001-2017 Free Software Foundation, Inc. +# Copyright (C) 2001-2020 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -646,7 +638,7 @@ if test x"${install_sh+set}" != xset; then fi AC_SUBST([install_sh])]) -# Copyright (C) 2003-2017 Free Software Foundation, Inc. +# Copyright (C) 2003-2020 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -668,7 +660,7 @@ AC_SUBST([am__leading_dot])]) # Add --enable-maintainer-mode option to configure. -*- Autoconf -*- # From Jim Meyering -# Copyright (C) 1996-2017 Free Software Foundation, Inc. +# Copyright (C) 1996-2020 Free Software Foundation, Inc. 
# # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -703,7 +695,7 @@ AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles]) # Check to see how 'make' treats includes. -*- Autoconf -*- -# Copyright (C) 2001-2017 Free Software Foundation, Inc. +# Copyright (C) 2001-2020 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -711,49 +703,42 @@ AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles]) # AM_MAKE_INCLUDE() # ----------------- -# Check to see how make treats includes. +# Check whether make has an 'include' directive that can support all +# the idioms we need for our automatic dependency tracking code. AC_DEFUN([AM_MAKE_INCLUDE], -[am_make=${MAKE-make} -cat > confinc << 'END' +[AC_MSG_CHECKING([whether ${MAKE-make} supports the include directive]) +cat > confinc.mk << 'END' am__doit: - @echo this is the am__doit target + @echo this is the am__doit target >confinc.out .PHONY: am__doit END -# If we don't find an include directive, just comment out the code. -AC_MSG_CHECKING([for style of include used by $am_make]) am__include="#" am__quote= -_am_result=none -# First try GNU make style include. -echo "include confinc" > confmf -# Ignore all kinds of additional output from 'make'. -case `$am_make -s -f confmf 2> /dev/null` in #( -*the\ am__doit\ target*) - am__include=include - am__quote= - _am_result=GNU - ;; -esac -# Now try BSD make style include. -if test "$am__include" = "#"; then - echo '.include "confinc"' > confmf - case `$am_make -s -f confmf 2> /dev/null` in #( - *the\ am__doit\ target*) - am__include=.include - am__quote="\"" - _am_result=BSD - ;; - esac -fi -AC_SUBST([am__include]) -AC_SUBST([am__quote]) -AC_MSG_RESULT([$_am_result]) -rm -f confinc confmf -]) +# BSD make does it like this. 
+echo '.include "confinc.mk" # ignored' > confmf.BSD +# Other make implementations (GNU, Solaris 10, AIX) do it like this. +echo 'include confinc.mk # ignored' > confmf.GNU +_am_result=no +for s in GNU BSD; do + AM_RUN_LOG([${MAKE-make} -f confmf.$s && cat confinc.out]) + AS_CASE([$?:`cat confinc.out 2>/dev/null`], + ['0:this is the am__doit target'], + [AS_CASE([$s], + [BSD], [am__include='.include' am__quote='"'], + [am__include='include' am__quote=''])]) + if test "$am__include" != "#"; then + _am_result="yes ($s style)" + break + fi +done +rm -f confinc.* confmf.* +AC_MSG_RESULT([${_am_result}]) +AC_SUBST([am__include])]) +AC_SUBST([am__quote])]) # Fake the existence of programs that GNU maintainers use. -*- Autoconf -*- -# Copyright (C) 1997-2017 Free Software Foundation, Inc. +# Copyright (C) 1997-2020 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -774,12 +759,7 @@ AC_DEFUN([AM_MISSING_HAS_RUN], [AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl AC_REQUIRE_AUX_FILE([missing])dnl if test x"${MISSING+set}" != xset; then - case $am_aux_dir in - *\ * | *\ *) - MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;; - *) - MISSING="\${SHELL} $am_aux_dir/missing" ;; - esac + MISSING="\${SHELL} '$am_aux_dir/missing'" fi # Use eval to expand $SHELL if eval "$MISSING --is-lightweight"; then @@ -792,7 +772,7 @@ fi # Helper functions for option handling. -*- Autoconf -*- -# Copyright (C) 2001-2017 Free Software Foundation, Inc. +# Copyright (C) 2001-2020 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -821,7 +801,7 @@ AC_DEFUN([_AM_SET_OPTIONS], AC_DEFUN([_AM_IF_OPTION], [m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])]) -# Copyright (C) 1999-2017 Free Software Foundation, Inc. +# Copyright (C) 1999-2020 Free Software Foundation, Inc. 
# # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -868,7 +848,7 @@ AC_LANG_POP([C])]) # For backward compatibility. AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])]) -# Copyright (C) 2001-2017 Free Software Foundation, Inc. +# Copyright (C) 2001-2020 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -887,7 +867,7 @@ AC_DEFUN([AM_RUN_LOG], # Check to make sure that the build environment is sane. -*- Autoconf -*- -# Copyright (C) 1996-2017 Free Software Foundation, Inc. +# Copyright (C) 1996-2020 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -968,7 +948,7 @@ AC_CONFIG_COMMANDS_PRE( rm -f conftest.file ]) -# Copyright (C) 2009-2017 Free Software Foundation, Inc. +# Copyright (C) 2009-2020 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -1028,7 +1008,7 @@ AC_SUBST([AM_BACKSLASH])dnl _AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl ]) -# Copyright (C) 2001-2017 Free Software Foundation, Inc. +# Copyright (C) 2001-2020 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -1056,7 +1036,7 @@ fi INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s" AC_SUBST([INSTALL_STRIP_PROGRAM])]) -# Copyright (C) 2006-2017 Free Software Foundation, Inc. +# Copyright (C) 2006-2020 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -1075,7 +1055,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)]) # Check how to create a tarball. -*- Autoconf -*- -# Copyright (C) 2004-2017 Free Software Foundation, Inc. 
+# Copyright (C) 2004-2020 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, diff --git a/algo-gate-api.c b/algo-gate-api.c index 0b65391..d9db0ac 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -9,15 +9,13 @@ // Functions in this file are used simultaneously by myultiple // threads and must therefore be re-entrant. -#include +#include "algo-gate-api.h" +#include +#include #include +#include #include -#include -#include #include -#include -//#include "miner.h" -#include "algo-gate-api.h" // Define null and standard functions. // @@ -36,7 +34,7 @@ // names. // // custom functions are algo spefic and are defined and registered in the -// algo's source file and are usually named [algo]_[function]. +// algo's source file and are usually named [algo]_[function]. // // In most cases the default is a null or std function. However in some // cases, for convenience when the null function is not the most popular, @@ -48,11 +46,11 @@ // // gate functions may call other gate functions under the following // restrictions. Any gate function defined here or used by more than one -// algo must call other functions using the gate: algo_gate.[function]. +// algo must call other functions using the gate: algo_gate.[function]. // custom functions may call other custom functions directly using // [algo]_[function], howver it is recommended to alway use the gate. // -// If, under rare circumstances, an algo with a custom gate function +// If, under rare circumstances, an algo with a custom gate function // needs to call a function of another algo it must define and register // a private gate from its rgistration function and use it to call // forein functions: [private_gate].[function]. If the algo needs to call @@ -63,202 +61,227 @@ // other global or local (to the algo) variables. 
// A set of predefined generic null functions that can be used as any null -// gate function with the same signature. - -void do_nothing () {} -bool return_true () { return true; } -bool return_false () { return false; } -void *return_null () { return NULL; } -void call_error () { printf("ERR: Uninitialized function pointer\n"); } - -void algo_not_tested() -{ - applog( LOG_WARNING,"Algo %s has not been tested live. It may not work", - algo_names[opt_algo] ); - applog(LOG_WARNING,"and bad things may happen. Use at your own risk."); +// gate function with the same signature. + +void do_nothing() {} +bool return_true() { return true; } +bool return_false() { return false; } +void *return_null() { return NULL; } +void call_error() { printf("ERR: Uninitialized function pointer\n"); } + +void algo_not_tested() { + applog(LOG_WARNING, "Algo %s has not been tested live. It may not work", + algo_names[opt_algo]); + applog(LOG_WARNING, "and bad things may happen. Use at your own risk."); } -void four_way_not_tested() -{ - applog( LOG_WARNING,"Algo %s has not been tested using 4way. It may not", algo_names[opt_algo] ); - applog( LOG_WARNING,"work or may be slower. Please report your results."); +void four_way_not_tested() { + applog(LOG_WARNING, "Algo %s has not been tested using 4way. It may not", + algo_names[opt_algo]); + applog(LOG_WARNING, "work or may be slower. Please report your results."); } -void algo_not_implemented() -{ - applog(LOG_ERR,"Algo %s has not been Implemented.",algo_names[opt_algo]); +void algo_not_implemented() { + applog(LOG_ERR, "Algo %s has not been Implemented.", algo_names[opt_algo]); } // default null functions +// deprecated, use generic as default +int null_scanhash() { + applog(LOG_WARNING, "SWERR: undefined scanhash function in algo_gate"); + return 0; +} + +// Default generic scanhash can be used in many cases. 
+int scanhash_generic(struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr) { + uint32_t edata[20] __attribute__((aligned(64))); + uint32_t hash[8] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 1; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; -int null_scanhash() -{ - applog(LOG_WARNING,"SWERR: undefined scanhash function in algo_gate"); - return 0; + mm128_bswap32_80(edata, pdata); + do { + edata[19] = n; + if (likely(algo_gate.hash(hash, edata, thr_id))) + if (unlikely(valid_hash(hash, ptarget) && !bench)) { + pdata[19] = bswap_32(n); + submit_solution(work, hash, mythr); + } + n++; + } while (n < last_nonce && !work_restart[thr_id].restart); + *hashes_done = n - first_nonce; + pdata[19] = n; + return 0; } -void null_hash() -{ - applog(LOG_WARNING,"SWERR: null_hash unsafe null function"); -}; -void null_hash_suw() -{ - applog(LOG_WARNING,"SWERR: null_hash_suw unsafe null function"); +#if defined(__AVX2__) + +// int scanhash_4way_64_64( struct work *work, uint32_t max_nonce, +// uint64_t *hashes_done, struct thr_info *mythr ) + +// int scanhash_4way_64_640( struct work *work, uint32_t max_nonce, +// uint64_t *hashes_done, struct thr_info *mythr ) + +int scanhash_4way_64in_32out(struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr) { + uint32_t hash32[8 * 4] __attribute__((aligned(64))); + uint32_t vdata[20 * 4] __attribute__((aligned(64))); + uint32_t lane_hash[8] __attribute__((aligned(64))); + uint32_t *hash32_d7 = &(hash32[7 * 4]); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; + __m256i *noncev = (__m256i *)vdata + 9; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const uint32_t 
targ32_d7 = ptarget[7]; + const bool bench = opt_benchmark; + + mm256_bswap32_intrlv80_4x64(vdata, pdata); + *noncev = mm256_intrlv_blend_32( + _mm256_set_epi32(n + 3, 0, n + 2, 0, n + 1, 0, n, 0), *noncev); + do { + if (likely(algo_gate.hash(hash32, vdata, thr_id))) + for (int lane = 0; lane < 4; lane++) + if (unlikely(hash32_d7[lane] <= targ32_d7 && !bench)) { + extr_lane_4x32(lane_hash, hash32, lane, 256); + if (valid_hash(lane_hash, ptarget)) { + pdata[19] = bswap_32(n + lane); + submit_solution(work, lane_hash, mythr); + } + } + *noncev = _mm256_add_epi32(*noncev, m256_const1_64(0x0000000400000000)); + n += 4; + } while (likely((n <= last_nonce) && !work_restart[thr_id].restart)); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +// int scanhash_8way_32_32( struct work *work, uint32_t max_nonce, +// uint64_t *hashes_done, struct thr_info *mythr ) + +#endif + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && \ + defined(__AVX512BW__) + +// int scanhash_8way_64_64( struct work *work, uint32_t max_nonce, +// uint64_t *hashes_done, struct thr_info *mythr ) + +// int scanhash_8way_64_640( struct work *work, uint32_t max_nonce, +// uint64_t *hashes_done, struct thr_info *mythr ) + +int scanhash_8way_64in_32out(struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr) { + uint32_t hash32[8 * 8] __attribute__((aligned(128))); + uint32_t vdata[20 * 8] __attribute__((aligned(64))); + uint32_t lane_hash[8] __attribute__((aligned(64))); + uint32_t *hash32_d7 = &(hash32[7 * 8]); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i *)vdata + 9; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const uint32_t targ32_d7 = ptarget[7]; + const bool bench = opt_benchmark; + + mm512_bswap32_intrlv80_8x64(vdata, pdata); + *noncev = 
mm512_intrlv_blend_32(_mm512_set_epi32(n + 7, 0, n + 6, 0, n + 5, 0, + n + 4, 0, n + 3, 0, n + 2, 0, + n + 1, 0, n, 0), + *noncev); + do { + if (likely(algo_gate.hash(hash32, vdata, thr_id))) + for (int lane = 0; lane < 8; lane++) + if (unlikely((hash32_d7[lane] <= targ32_d7) && !bench)) { + extr_lane_8x32(lane_hash, hash32, lane, 256); + if (likely(valid_hash(lane_hash, ptarget))) { + pdata[19] = bswap_32(n + lane); + submit_solution(work, lane_hash, mythr); + } + } + *noncev = _mm512_add_epi32(*noncev, m512_const1_64(0x0000000800000000)); + n += 8; + } while (likely((n < last_nonce) && !work_restart[thr_id].restart)); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +// int scanhash_16way_32_32( struct work *work, uint32_t max_nonce, +// uint64_t *hashes_done, struct thr_info *mythr ) + +#endif + +int null_hash() { + applog(LOG_WARNING, "SWERR: null_hash unsafe null function"); + return 0; }; -void init_algo_gate( algo_gate_t* gate ) -{ - gate->miner_thread_init = (void*)&return_true; - gate->scanhash = (void*)&null_scanhash; - gate->hash = (void*)&null_hash; - gate->hash_suw = (void*)&null_hash_suw; - gate->get_new_work = (void*)&std_get_new_work; - gate->get_nonceptr = (void*)&std_get_nonceptr; - gate->work_decode = (void*)&std_le_work_decode; - gate->decode_extra_data = (void*)&do_nothing; - gate->wait_for_diff = (void*)&std_wait_for_diff; - gate->get_max64 = (void*)&get_max64_0x1fffffLL; - gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root; - gate->stratum_gen_work = (void*)&std_stratum_gen_work; - gate->build_stratum_request = (void*)&std_le_build_stratum_request; - gate->malloc_txs_request = (void*)&std_malloc_txs_request; - gate->set_target = (void*)&std_set_target; - gate->submit_getwork_result = (void*)&std_le_submit_getwork_result; - gate->build_block_header = (void*)&std_build_block_header; - gate->build_extraheader = (void*)&std_build_extraheader; - gate->set_work_data_endian = (void*)&do_nothing; - gate->calc_network_diff = 
(void*)&std_calc_network_diff; - gate->ready_to_mine = (void*)&std_ready_to_mine; - gate->resync_threads = (void*)&do_nothing; - gate->do_this_thread = (void*)&return_true; - gate->longpoll_rpc_call = (void*)&std_longpoll_rpc_call; - gate->stratum_handle_response = (void*)&std_stratum_handle_response; - gate->get_work_data_size = (void*)&std_get_work_data_size; - gate->optimizations = EMPTY_SET; - gate->ntime_index = STD_NTIME_INDEX; - gate->nbits_index = STD_NBITS_INDEX; - gate->nonce_index = STD_NONCE_INDEX; - gate->work_cmp_size = STD_WORK_CMP_SIZE; +void init_algo_gate(algo_gate_t *gate) { + gate->miner_thread_init = (void *)&return_true; + gate->scanhash = (void *)&scanhash_generic; + gate->hash = (void *)&null_hash; + gate->get_new_work = (void *)&std_get_new_work; + gate->work_decode = (void *)&std_le_work_decode; + gate->decode_extra_data = (void *)&do_nothing; + gate->gen_merkle_root = (void *)&sha256d_gen_merkle_root; + gate->build_stratum_request = (void *)&std_le_build_stratum_request; + gate->malloc_txs_request = (void *)&std_malloc_txs_request; + gate->submit_getwork_result = (void *)&std_le_submit_getwork_result; + gate->build_block_header = (void *)&std_build_block_header; + gate->build_extraheader = (void *)&std_build_extraheader; + gate->set_work_data_endian = (void *)&do_nothing; + gate->calc_network_diff = (void *)&std_calc_network_diff; + gate->ready_to_mine = (void *)&std_ready_to_mine; + gate->resync_threads = (void *)&do_nothing; + gate->do_this_thread = (void *)&return_true; + gate->longpoll_rpc_call = (void *)&std_longpoll_rpc_call; + gate->get_work_data_size = (void *)&std_get_work_data_size; + gate->optimizations = EMPTY_SET; + gate->ntime_index = STD_NTIME_INDEX; + gate->nbits_index = STD_NBITS_INDEX; + gate->nonce_index = STD_NONCE_INDEX; + gate->work_cmp_size = STD_WORK_CMP_SIZE; } // Ignore warnings for not yet defined register functions #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wimplicit-function-declaration" -// 
called by each thread that uses the gate -bool register_algo_gate( int algo, algo_gate_t *gate ) -{ - if ( NULL == gate ) - { - applog(LOG_ERR,"FAIL: algo_gate registration failed, NULL gate\n"); +// Called once by main +bool register_algo_gate(int algo, algo_gate_t *gate) { + bool rc = false; + + if (NULL == gate) { + applog(LOG_ERR, "FAIL: algo_gate registration failed, NULL gate\n"); return false; } - init_algo_gate( gate ); - - switch (algo) - { - case ALGO_ALLIUM: register_allium_algo ( gate ); break; - case ALGO_ANIME: register_anime_algo ( gate ); break; - case ALGO_ARGON2: register_argon2_algo ( gate ); break; - case ALGO_ARGON2D250: register_argon2d_crds_algo ( gate ); break; - case ALGO_ARGON2D500: register_argon2d_dyn_algo ( gate ); break; - case ALGO_ARGON2D4096: register_argon2d4096_algo ( gate ); break; - case ALGO_AXIOM: register_axiom_algo ( gate ); break; - case ALGO_BASTION: register_bastion_algo ( gate ); break; - case ALGO_BLAKE: register_blake_algo ( gate ); break; - case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break; -// case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break; - case ALGO_BLAKE2S: register_blake2s_algo ( gate ); break; - case ALGO_BMW512: register_bmw512_algo ( gate ); break; - case ALGO_C11: register_c11_algo ( gate ); break; - case ALGO_DECRED: register_decred_algo ( gate ); break; - case ALGO_DEEP: register_deep_algo ( gate ); break; - case ALGO_DMD_GR: register_dmd_gr_algo ( gate ); break; - case ALGO_DROP: register_drop_algo ( gate ); break; - case ALGO_FRESH: register_fresh_algo ( gate ); break; - case ALGO_GROESTL: register_groestl_algo ( gate ); break; - case ALGO_GR: register_gr_algo ( gate ); break; - case ALGO_HEAVY: register_heavy_algo ( gate ); break; - case ALGO_HEX: register_hex_algo ( gate ); break; - case ALGO_HMQ1725: register_hmq1725_algo ( gate ); break; - case ALGO_HODL: register_hodl_algo ( gate ); break; - case ALGO_JHA: register_jha_algo ( gate ); break; - case ALGO_KECCAK: 
register_keccak_algo ( gate ); break; - case ALGO_KECCAKC: register_keccakc_algo ( gate ); break; - case ALGO_LBRY: register_lbry_algo ( gate ); break; - case ALGO_LUFFA: register_luffa_algo ( gate ); break; - case ALGO_LYRA2H: register_lyra2h_algo ( gate ); break; - case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break; - case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break; - case ALGO_LYRA2REV3: register_lyra2rev3_algo ( gate ); break; - case ALGO_LYRA2Z: register_lyra2z_algo ( gate ); break; - case ALGO_LYRA2Z330: register_lyra2z330_algo ( gate ); break; - case ALGO_M7M: register_m7m_algo ( gate ); break; - case ALGO_MYR_GR: register_myriad_algo ( gate ); break; - case ALGO_NEOSCRYPT: register_neoscrypt_algo ( gate ); break; - case ALGO_NIST5: register_nist5_algo ( gate ); break; - case ALGO_PENTABLAKE: register_pentablake_algo ( gate ); break; - case ALGO_PHI1612: register_phi1612_algo ( gate ); break; - case ALGO_PHI2: register_phi2_algo ( gate ); break; - case ALGO_PLUCK: register_pluck_algo ( gate ); break; - case ALGO_POLYTIMOS: register_polytimos_algo ( gate ); break; - case ALGO_QUARK: register_quark_algo ( gate ); break; - case ALGO_QUBIT: register_qubit_algo ( gate ); break; - case ALGO_SCRYPT: register_scrypt_algo ( gate ); break; - case ALGO_SCRYPTJANE: register_scryptjane_algo ( gate ); break; - case ALGO_SHA256D: register_sha256d_algo ( gate ); break; - case ALGO_SHA256Q: register_sha256q_algo ( gate ); break; - case ALGO_SHA256T: register_sha256t_algo ( gate ); break; - case ALGO_SHAVITE3: register_shavite_algo ( gate ); break; - case ALGO_SKEIN: register_skein_algo ( gate ); break; - case ALGO_SKEIN2: register_skein2_algo ( gate ); break; - case ALGO_SKUNK: register_skunk_algo ( gate ); break; - case ALGO_SONOA: register_sonoa_algo ( gate ); break; - case ALGO_TIMETRAVEL: register_timetravel_algo ( gate ); break; - case ALGO_TIMETRAVEL10: register_timetravel10_algo ( gate ); break; - case ALGO_TRIBUS: register_tribus_algo ( gate ); 
break; - case ALGO_VANILLA: register_vanilla_algo ( gate ); break; - case ALGO_VELTOR: register_veltor_algo ( gate ); break; - case ALGO_WHIRLPOOL: register_whirlpool_algo ( gate ); break; - case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break; - case ALGO_X11: register_x11_algo ( gate ); break; - case ALGO_X11EVO: register_x11evo_algo ( gate ); break; - case ALGO_X11GOST: register_x11gost_algo ( gate ); break; - case ALGO_X12: register_x12_algo ( gate ); break; - case ALGO_X13: register_x13_algo ( gate ); break; - case ALGO_X13BCD: register_x13bcd_algo ( gate ); break; - case ALGO_X13SM3: register_x13sm3_algo ( gate ); break; - case ALGO_X14: register_x14_algo ( gate ); break; - case ALGO_X15: register_x15_algo ( gate ); break; - case ALGO_X16R: register_x16r_algo ( gate ); break; - case ALGO_X16RT: register_x16rt_algo ( gate ); break; - case ALGO_X16RT_VEIL: register_x16rt_veil_algo ( gate ); break; - case ALGO_X16S: register_x16s_algo ( gate ); break; - case ALGO_X17: register_x17_algo ( gate ); break; - case ALGO_X21S: register_x21s_algo ( gate ); break; - case ALGO_XEVAN: register_xevan_algo ( gate ); break; -/* case ALGO_YESCRYPT: register_yescrypt_05_algo ( gate ); break; - case ALGO_YESCRYPTR8: register_yescryptr8_05_algo ( gate ); break; - case ALGO_YESCRYPTR16: register_yescryptr16_05_algo ( gate ); break; - case ALGO_YESCRYPTR32: register_yescryptr32_05_algo ( gate ); break; -*/ - case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break; - case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break; - case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break; - case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break; - case ALGO_YESPOWER: register_yespower_algo ( gate ); break; - case ALGO_YESPOWERR16: register_yespowerr16_algo ( gate ); break; - case ALGO_ZR5: register_zr5_algo ( gate ); break; - default: - applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] ); - return false; + 
init_algo_gate(gate); + + switch (algo) { + case ALGO_GR: + rc = register_gr_algo(gate); + break; + default: + applog(LOG_ERR, "BUG: unregistered algorithm %s.\n", algo_names[opt_algo]); + return false; } // switch - // ensure required functions were defined. - if ( gate->scanhash == (void*)&null_scanhash ) - { - applog(LOG_ERR, "FAIL: Required algo_gate functions undefined\n"); + if (!rc) { + applog(LOG_ERR, "FAIL: %s algorithm failed to initialize\n", + algo_names[opt_algo]); return false; } return true; @@ -267,135 +290,38 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) // restore warnings #pragma GCC diagnostic pop -// override std defaults with jr2 defaults -bool register_json_rpc2( algo_gate_t *gate ) -{ - applog(LOG_WARNING,"\nCryptonight algorithm and variants are no longer"); - applog(LOG_WARNING,"supported by cpuminer-opt. Shares submitted will"); - applog(LOG_WARNING,"likely be rejected. Proceed at your own risk.\n"); - - gate->wait_for_diff = (void*)&do_nothing; - gate->get_new_work = (void*)&jr2_get_new_work; - gate->get_nonceptr = (void*)&jr2_get_nonceptr; - gate->stratum_gen_work = (void*)&jr2_stratum_gen_work; - gate->build_stratum_request = (void*)&jr2_build_stratum_request; - gate->submit_getwork_result = (void*)&jr2_submit_getwork_result; - gate->longpoll_rpc_call = (void*)&jr2_longpoll_rpc_call; - gate->work_decode = (void*)&jr2_work_decode; - gate->stratum_handle_response = (void*)&jr2_stratum_handle_response; - gate->nonce_index = JR2_NONCE_INDEX; - jsonrpc_2 = true; // still needed - opt_extranonce = false; -// have_gbt = false; - return true; - } - -// run the alternate hash function for a specific algo -void exec_hash_function( int algo, void *output, const void *pdata ) -{ - algo_gate_t gate; - gate.hash = (void*)&null_hash; - register_algo_gate( algo, &gate ); - gate.hash( output, pdata, 0 ); +void exec_hash_function(int algo, void *output, const void *pdata) { + algo_gate_t gate; + gate.hash = (void *)&null_hash; + 
register_algo_gate(algo, &gate); + gate.hash(output, pdata, 0); } #define PROPER (1) -#define ALIAS (0) +#define ALIAS (0) // The only difference between the alias and the proper algo name is the // proper name is the one that is defined in ALGO_NAMES. There may be // multiple aliases that map to the same proper name. // New aliases can be added anywhere in the array as long as NULL is last. // Alphabetic order of alias is recommended. -const char* const algo_alias_map[][2] = -{ -// alias proper - { "argon2d-crds", "argon2d250" }, - { "argon2d-dyn", "argon2d500" }, - { "argon2d-uis", "argon2d4096" }, - { "bitcore", "timetravel10" }, - { "bitzeny", "yescryptr8" }, - { "blake256r8", "blakecoin" }, - { "blake256r8vnl", "vanilla" }, - { "blake256r14", "blake" }, - { "blake256r14dcr", "decred" }, - { "diamond", "dmd-gr" }, - { "droplp", "drop" }, - { "espers", "hmq1725" }, - { "flax", "c11" }, - { "grhash", "gr" }, - { "hsr", "x13sm3" }, - { "jackpot", "jha" }, - { "jane", "scryptjane" }, - { "lyra2", "lyra2re" }, - { "lyra2v2", "lyra2rev2" }, - { "lyra2v3", "lyra2rev3" }, - { "myrgr", "myr-gr" }, - { "myriad", "myr-gr" }, - { "neo", "neoscrypt" }, - { "phi", "phi1612" }, -// { "sia", "blake2b" }, - { "sib", "x11gost" }, - { "timetravel8", "timetravel" }, - { "veil", "x16rt-veil" }, - { "x16r-hex", "hex" }, - { "yenten", "yescryptr16" }, - { "ziftr", "zr5" }, - { NULL, NULL } -}; +const char *const algo_alias_map[][2] = { + // alias proper + {"grhash", "gr"}, + {NULL, NULL}}; // if arg is a valid alias for a known algo it is updated with the proper // name. No validation of the algo or alias is done, It is the responsinility // of the calling function to validate the algo after return. 
-void get_algo_alias( char** algo_or_alias ) -{ +void get_algo_alias(char **algo_or_alias) { int i; - for ( i=0; algo_alias_map[i][ALIAS]; i++ ) - if ( !strcasecmp( *algo_or_alias, algo_alias_map[i][ ALIAS ] ) ) - { + for (i = 0; algo_alias_map[i][ALIAS]; i++) + if (!strcasecmp(*algo_or_alias, algo_alias_map[i][ALIAS])) { // found valid alias, return proper name - *algo_or_alias = (char* const)( algo_alias_map[i][ PROPER ] ); + *algo_or_alias = (char *)(algo_alias_map[i][PROPER]); return; } } #undef ALIAS #undef PROPER - -bool submit_solution( struct work *work, void *hash, - struct thr_info *thr ) -{ - work_set_target_ratio( work, hash ); - if ( submit_work( thr, work ) ) - { - if ( !opt_quiet ) - applog( LOG_BLUE, "Share %d submitted by thread %d, job %s.", - accepted_share_count + rejected_share_count + 1, - thr->id, work->job_id ); - return true; - } - else - applog( LOG_WARNING, "Failed to submit share." ); - return false; -} - -bool submit_lane_solution( struct work *work, void *hash, - struct thr_info *thr, int lane ) -{ - work_set_target_ratio( work, hash ); - if ( submit_work( thr, work ) ) - { - if ( !opt_quiet ) -// applog( LOG_BLUE, "Share %d submitted by thread %d, lane %d.", -// accepted_share_count + rejected_share_count + 1, -// thr->id, lane ); - applog( LOG_BLUE, "Share %d submitted by thread %d, lane %d, job %s.", - accepted_share_count + rejected_share_count + 1, thr->id, - lane, work->job_id ); - return true; - } - else - applog( LOG_WARNING, "Failed to submit share." ); - return false; -} - diff --git a/algo-gate-api.h b/algo-gate-api.h index e2a26b3..f6c46a4 100644 --- a/algo-gate-api.h +++ b/algo-gate-api.h @@ -35,7 +35,7 @@ // 6. Determine if other non existant functions are required. // That is determined by the need to add code in cpu-miner.c // that applies only to the new algo. That is forbidden. All -// algo specific code must be in theh algo's file. +// algo specific code must be in the algo's file. // // 7. 
If new functions need to be added to the gate add the type // to the structure, declare a null instance in this file and define @@ -48,10 +48,10 @@ // instances as they are defined by default, or unsafe functions that // are not needed by the algo. // -// 9. Add an case entry to the switch/case in function register_gate +// 9. Add a case entry to the switch/case in function register_gate // in file algo-gate-api.c for the new algo. // -// 10 If a new function type was defined add an entry to ini talgo_gate +// 10 If a new function type was defined add an entry to init algo_gate // to initialize the new function to its null instance described in step 7. // // 11. If the new algo has aliases add them to the alias array in @@ -75,7 +75,7 @@ // my hack at creating a set data type using bit masks. Set inclusion, // exclusion union and intersection operations are provided for convenience. In // some cases it may be desireable to use boolean algebra directly on the -// data to perfomr set operations. Sets can be represented as single +// data to perform set operations. Sets can be represented as single // elements, a bitwise OR of multiple elements, a bitwise OR of multiple // set variables or constants, or combinations of the above. 
// Examples: @@ -85,14 +85,17 @@ typedef uint32_t set_t; -#define EMPTY_SET 0 -#define SSE2_OPT 1 -#define AES_OPT 2 -#define SSE42_OPT 4 -#define AVX_OPT 8 -#define AVX2_OPT 0x10 -#define SHA_OPT 0x20 -#define AVX512_OPT 0x40 +#define EMPTY_SET 0 +#define SSE2_OPT 1 +#define AES_OPT 2 +#define SSE42_OPT 4 +#define AVX_OPT 8 // Sandybridge +#define AVX2_OPT 0x10 // Haswell, Zen1 +#define SHA_OPT 0x20 // Zen1, Icelake (sha256) +#define AVX512_OPT 0x40 // Skylake-X (AVX512[F,VL,DQ,BW]) +#define VAES_OPT 0x80 // Icelake (VAES & AVX512) +#define VAES256_OPT 0x100 // Zen3 (VAES without AVX512) + // return set containing all elements from sets a & b inline set_t set_union ( set_t a, set_t b ) { return a | b; } @@ -108,45 +111,65 @@ inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; } typedef struct { -// special case, only one target, provides a callback for scanhash to -// submit work with less overhead. -// bool (*submit_work ) ( struct thr_info*, const struct work* ); - -// mandatory functions, must be overwritten -// Added a 5th arg for the thread_info structure to replace the int thr id -// in the first arg. Both will co-exist during the trasition. -//int ( *scanhash ) ( int, struct work*, uint32_t, uint64_t* ); +// Mandatory functions, one of these is mandatory. If a generic scanhash +// is used a custom target hash function must be registered, with a custom +// scanhash the target hash function can be called directly and doesn't need +// to be registered with the gate. 
int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* ); -// optional unsafe, must be overwritten if algo uses function -void ( *hash ) ( void*, const void*, uint32_t ) ; -void ( *hash_suw ) ( void*, const void* ); +int ( *hash ) ( void*, const void*, int ); //optional, safe to use default in most cases -bool ( *miner_thread_init ) ( int ); -void ( *stratum_gen_work ) ( struct stratum_ctx*, struct work* ); -void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t*, - bool ); -uint32_t *( *get_nonceptr ) ( uint32_t* ); -void ( *decode_extra_data ) ( struct work*, uint64_t* ); -void ( *wait_for_diff ) ( struct stratum_ctx* ); -int64_t ( *get_max64 ) (); -bool ( *work_decode ) ( const json_t*, struct work* ); -void ( *set_target) ( struct work*, double ); -bool ( *submit_getwork_result ) ( CURL*, struct work* ); -void ( *gen_merkle_root ) ( char*, struct stratum_ctx* ); -void ( *build_extraheader ) ( struct work*, struct stratum_ctx* ); -void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*, - uint32_t*, uint32_t, uint32_t ); -void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* ); -char* ( *malloc_txs_request ) ( struct work* ); -void ( *set_work_data_endian ) ( struct work* ); -double ( *calc_network_diff ) ( struct work* ); -bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int ); -void ( *resync_threads ) ( struct work* ); -bool ( *do_this_thread ) ( int ); -json_t* (*longpoll_rpc_call) ( CURL*, int*, char* ); -bool ( *stratum_handle_response )( json_t* ); + +// Called once by each miner thread to allocate thread local buffers and +// other initialization specific to miner threads. +bool ( *miner_thread_init ) ( int ); + +// Get thread local copy of blockheader with unique nonce. 
+void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t* ); + +// Decode getwork blockheader +bool ( *work_decode ) ( struct work* ); + +// Extra getwork data +void ( *decode_extra_data ) ( struct work*, uint64_t* ); + +bool ( *submit_getwork_result ) ( CURL*, struct work* ); + +void ( *gen_merkle_root ) ( char*, struct stratum_ctx* ); + +// Increment extranonce +void ( *build_extraheader ) ( struct work*, struct stratum_ctx* ); + +void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*, + uint32_t*, uint32_t, uint32_t, + unsigned char* ); + +// Build mining.submit message +void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* ); + +char* ( *malloc_txs_request ) ( struct work* ); + +// Big endian or little endian +void ( *set_work_data_endian ) ( struct work* ); + +double ( *calc_network_diff ) ( struct work* ); + +// Wait for first work +bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int ); + +// Diverge mining threads +bool ( *do_this_thread ) ( int ); + +// After do_this_thread +void ( *resync_threads ) ( int, struct work* ); + +// No longer needed +json_t* (*longpoll_rpc_call) ( CURL*, int*, char* ); + +// Ghost Rider functionality +int64_t ( *get_max64 ) (); + set_t optimizations; int ( *get_work_data_size ) (); int ntime_index; @@ -184,88 +207,97 @@ void four_way_not_tested(); #define STD_WORK_DATA_SIZE 128 #define STD_WORK_CMP_SIZE 76 -#define JR2_NONCE_INDEX 39 // 8 bit offset +//#define JR2_NONCE_INDEX 39 // 8 bit offset // These indexes are only used with JSON RPC2 and are not gated. -#define JR2_WORK_CMP_INDEX_2 43 -#define JR2_WORK_CMP_SIZE_2 33 +//#define JR2_WORK_CMP_INDEX_2 43 +//#define JR2_WORK_CMP_SIZE_2 33 -// allways returns failure +// deprecated, use generic instead int null_scanhash(); -// Allow algos to submit from scanhash loop. 
-bool submit_solution( struct work *work, void *hash, - struct thr_info *thr ); -bool submit_lane_solution( struct work *work, void *hash, - struct thr_info *thr, int lane ); +// Default generic, may be used in many cases. +// N-way is more complicated, requires many different implementations +// depending on architecture, input format, and output format. +// Naming convention is scanhash_[N]way_[input format]in_[output format]out +// N = number of lanes +// input/output format: +// 32: 32 bit interleaved parallel lanes +// 64: 64 bit interleaved parallel lanes +// 640: input only, not interleaved, contiguous serial 640 bit lanes. +// 256: output only, not interleaved, contiguous serial 256 bit lanes. - -bool submit_work( struct thr_info *thr, const struct work *work_in ); +int scanhash_generic( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); -// displays warning -void null_hash (); -void null_hash_suw(); +#if defined(__AVX2__) -// optional safe targets, default listed first unless noted. 
+//int scanhash_4way_64in_64out( struct work *work, uint32_t max_nonce, +// uint64_t *hashes_done, struct thr_info *mythr ); + +//int scanhash_4way_64in_256out( struct work *work, uint32_t max_nonce, +// uint64_t *hashes_done, struct thr_info *mythr ); + +int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +//int scanhash_8way_32in_32out( struct work *work, uint32_t max_nonce, +// uint64_t *hashes_done, struct thr_info *mythr ); + +#endif + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +//int scanhash_8way_64in_64out( struct work *work, uint32_t max_nonce, +// uint64_t *hashes_done, struct thr_info *mythr ); + +//int scanhash_8way_64in_256out( struct work *work, uint32_t max_nonce, +// uint64_t *hashes_done, struct thr_info *mythr ); + +int scanhash_8way_64in_32out( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); -void std_wait_for_diff(); +//int scanhash_16way_32in_32out( struct work *work, uint32_t max_nonce, +// uint64_t *hashes_done, struct thr_info *mythr ); -uint32_t *std_get_nonceptr( uint32_t *work_data ); -uint32_t *jr2_get_nonceptr( uint32_t *work_data ); +#endif + +// displays warning +int null_hash(); + +// optional safe targets, default listed first unless noted. 
void std_get_new_work( struct work *work, struct work *g_work, int thr_id, - uint32_t* end_nonce_ptr, bool clean_job ); -void jr2_get_new_work( struct work *work, struct work *g_work, int thr_id, uint32_t* end_nonce_ptr ); -void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *work ); -void jr2_stratum_gen_work( struct stratum_ctx *sctx, struct work *work ); - void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx ); void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx ); -// pick your favorite or define your own -int64_t get_max64_0x1fffffLL(); // default -int64_t get_max64_0x40LL(); -int64_t get_max64_0x3ffff(); -int64_t get_max64_0x3fffffLL(); -int64_t get_max64_0x1ffff(); -int64_t get_max64_0xffffLL(); - -void std_set_target( struct work *work, double job_diff ); -void alt_set_target( struct work* work, double job_diff ); -void scrypt_set_target( struct work *work, double job_diff ); - -bool std_le_work_decode( const json_t *val, struct work *work ); -bool std_be_work_decode( const json_t *val, struct work *work ); -bool jr2_work_decode( const json_t *val, struct work *work ); +// Required by Ghost Rider +bool std_le_work_decode( struct work *work ); +bool std_be_work_decode( struct work *work ); bool std_le_submit_getwork_result( CURL *curl, struct work *work ); bool std_be_submit_getwork_result( CURL *curl, struct work *work ); -bool jr2_submit_getwork_result( CURL *curl, struct work *work ); void std_le_build_stratum_request( char *req, struct work *work ); void std_be_build_stratum_request( char *req, struct work *work ); -void jr2_build_stratum_request ( char *req, struct work *work ); char* std_malloc_txs_request( struct work *work ); -// Default is do_nothing (assumed LE) +// Default is do_nothing, little endian is assumed void set_work_data_big_endian( struct work *work ); double std_calc_network_diff( struct work *work ); void std_build_block_header( struct work* g_work, uint32_t version, - uint32_t 
*prevhash, uint32_t *merkle_root, - uint32_t ntime, uint32_t nbits ); + uint32_t *prevhash, uint32_t *merkle_root, + uint32_t ntime, uint32_t nbits, + unsigned char *final_sapling_hash ); void std_build_extraheader( struct work *work, struct stratum_ctx *sctx ); json_t* std_longpoll_rpc_call( CURL *curl, int *err, char *lp_url ); -json_t* jr2_longpoll_rpc_call( CURL *curl, int *err ); - -bool std_stratum_handle_response( json_t *val ); -bool jr2_stratum_handle_response( json_t *val ); bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum, int thr_id ); @@ -278,19 +310,16 @@ int std_get_work_data_size(); // by calling the algo's register function. bool register_algo_gate( int algo, algo_gate_t *gate ); -// Override any default gate functions that are applicable and do any other -// algo-specific initialization. +// Called by algos to verride any default gate functions that are applicable +// and do any other algo-specific initialization. // The register functions for all the algos can be declared here to reduce // compiler warnings but that's just more work for devs adding new algos. bool register_algo( algo_gate_t *gate ); -// Overrides a common set of functions used by RPC2 and other RPC2-specific -// init. Called by algo's register function before initializing algo-specific -// functions and data. -bool register_json_rpc2( algo_gate_t *gate ); - // use this to call the hash function of an algo directly, ie util.c test. void exec_hash_function( int algo, void *output, const void *pdata ); -void get_algo_alias( char** algo_or_alias ); +// Validate a string as a known algo and alias, updates arg to proper +// algo name if valid alias, NULL if invalid alias or algo. 
+void get_algo_alias( char **algo_or_alias ); diff --git a/algo/argon2/argon2a/ar2/ar2-scrypt-jane.c b/algo/argon2/argon2a/ar2/ar2-scrypt-jane.c deleted file mode 100644 index e75b73b..0000000 --- a/algo/argon2/argon2a/ar2/ar2-scrypt-jane.c +++ /dev/null @@ -1,249 +0,0 @@ -/* - scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane - - Public Domain or MIT License, whichever is easier -*/ - -#include - -#if defined( _WINDOWS ) -#if !defined( QT_GUI ) -extern "C" { -#endif -#endif - -#include "ar2-scrypt-jane.h" - -#include "sj/scrypt-jane-portable.h" -#include "sj/scrypt-jane-hash.h" -#include "sj/scrypt-jane-romix.h" -#include "sj/scrypt-jane-test-vectors.h" - -#define scrypt_maxNfactor 30 /* (1 << (30 + 1)) = ~2 billion */ -#if (SCRYPT_BLOCK_BYTES == 64) -#define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */ -#elif (SCRYPT_BLOCK_BYTES == 128) -#define scrypt_r_32kb 7 /* (1 << 7) = 128 * 2 blocks in a chunk * 128 bytes = Max of 32kb in a chunk */ -#elif (SCRYPT_BLOCK_BYTES == 256) -#define scrypt_r_32kb 6 /* (1 << 6) = 64 * 2 blocks in a chunk * 256 bytes = Max of 32kb in a chunk */ -#elif (SCRYPT_BLOCK_BYTES == 512) -#define scrypt_r_32kb 5 /* (1 << 5) = 32 * 2 blocks in a chunk * 512 bytes = Max of 32kb in a chunk */ -#endif -#define scrypt_maxrfactor scrypt_r_32kb /* 32kb */ -#define scrypt_maxpfactor 25 /* (1 << 25) = ~33 million */ - -#include -//#include - -static void NORETURN -scrypt_fatal_error_default(const char *msg) { - fprintf(stderr, "%s\n", msg); - exit(1); -} - -static scrypt_fatal_errorfn scrypt_fatal_error = scrypt_fatal_error_default; - -void scrypt_set_fatal_error(scrypt_fatal_errorfn fn) { - scrypt_fatal_error = fn; -} - -static int scrypt_power_on_self_test(void) -{ - const scrypt_test_setting *t; - uint8_t test_digest[64]; - uint32_t i; - int res = 7, scrypt_valid; - - if (!scrypt_test_mix()) { -#if !defined(SCRYPT_TEST) - scrypt_fatal_error("scrypt: mix function 
power-on-self-test failed"); -#endif - res &= ~1; - } - - if (!scrypt_test_hash()) { -#if !defined(SCRYPT_TEST) - scrypt_fatal_error("scrypt: hash function power-on-self-test failed"); -#endif - res &= ~2; - } - - for (i = 0, scrypt_valid = 1; post_settings[i].pw; i++) { - t = post_settings + i; - scrypt((uint8_t *)t->pw, strlen(t->pw), (uint8_t *)t->salt, strlen(t->salt), t->Nfactor, t->rfactor, t->pfactor, test_digest, sizeof(test_digest)); - scrypt_valid &= scrypt_verify(post_vectors[i], test_digest, sizeof(test_digest)); - } - - if (!scrypt_valid) { -#if !defined(SCRYPT_TEST) - scrypt_fatal_error("scrypt: scrypt power-on-self-test failed"); -#endif - res &= ~4; - } - - return res; -} - -typedef struct scrypt_aligned_alloc_t { - uint8_t *mem, *ptr; -} scrypt_aligned_alloc; - -#ifdef SCRYPT_TEST_SPEED - -static uint8_t *mem_base = (uint8_t *)0; -static size_t mem_bump = 0; - -/* allocations are assumed to be multiples of 64 bytes and total allocations not to exceed ~1.01gb */ -static scrypt_aligned_alloc scrypt_alloc(uint64_t size) -{ - scrypt_aligned_alloc aa; - if (!mem_base) { - mem_base = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1)); - if (!mem_base) - scrypt_fatal_error("scrypt: out of memory"); - mem_base = (uint8_t *)(((size_t)mem_base + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); - } - aa.mem = mem_base + mem_bump; - aa.ptr = aa.mem; - mem_bump += (size_t)size; - return aa; -} - -static void scrypt_free(scrypt_aligned_alloc *aa) { - mem_bump = 0; -} - -#else - -static scrypt_aligned_alloc scrypt_alloc(uint64_t size) -{ - static const size_t max_alloc = (size_t)-1; - scrypt_aligned_alloc aa; - size += (SCRYPT_BLOCK_BYTES - 1); - if (size > max_alloc) - scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory"); - aa.mem = (uint8_t *)malloc((size_t)size); - aa.ptr = (uint8_t *)(((size_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); - if (!aa.mem) - 
scrypt_fatal_error("scrypt: out of memory"); - return aa; -} - -static void scrypt_free(scrypt_aligned_alloc *aa) -{ - free(aa->mem); -} - -#endif /* SCRYPT_TEST_SPEED */ - - -void scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, - uint8_t Nfactor, uint8_t rfactor, uint8_t pfactor, uint8_t *out, size_t bytes) -{ - scrypt_aligned_alloc YX, V; - uint8_t *X, *Y; - uint32_t N, r, p, chunk_bytes, i; - -#if !defined(SCRYPT_CHOOSE_COMPILETIME) - scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix(); -#endif - -#if !defined(SCRYPT_TEST) - static int power_on_self_test = 0; - if (!power_on_self_test) { - power_on_self_test = 1; - if (!scrypt_power_on_self_test()) - scrypt_fatal_error("scrypt: power on self test failed"); - } -#endif - - if (Nfactor > scrypt_maxNfactor) - scrypt_fatal_error("scrypt: N out of range"); - if (rfactor > scrypt_maxrfactor) - scrypt_fatal_error("scrypt: r out of range"); - if (pfactor > scrypt_maxpfactor) - scrypt_fatal_error("scrypt: p out of range"); - - N = (1 << (Nfactor + 1)); - r = (1 << rfactor); - p = (1 << pfactor); - - chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2; - V = scrypt_alloc((uint64_t)N * chunk_bytes); - YX = scrypt_alloc((p + 1) * chunk_bytes); - - /* 1: X = PBKDF2(password, salt) */ - Y = YX.ptr; - X = Y + chunk_bytes; - scrypt_pbkdf2(password, password_len, salt, salt_len, 1, X, chunk_bytes * p); - - /* 2: X = ROMix(X) */ - for (i = 0; i < p; i++) - scrypt_ROMix((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V.ptr, N, r); - - /* 3: Out = PBKDF2(password, X) */ - scrypt_pbkdf2(password, password_len, X, chunk_bytes * p, 1, out, bytes); - - scrypt_ensure_zero(YX.ptr, (p + 1) * chunk_bytes); - - scrypt_free(&V); - scrypt_free(&YX); -} - -#define Nfactor 8 -#define rfactor 0 -#define pfactor 0 -#if (SCRYPT_BLOCK_BYTES == 64) -#define chunk_bytes 128 -#elif (SCRYPT_BLOCK_BYTES == 128) -#define chunk_bytes 256 -#elif (SCRYPT_BLOCK_BYTES == 256) -#define 
chunk_bytes 512 -#elif (SCRYPT_BLOCK_BYTES == 512) -#define chunk_bytes 1024 -#endif - -void my_scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out) -{ - scrypt_aligned_alloc YX, V; - uint8_t *X, *Y; - -#if !defined(SCRYPT_CHOOSE_COMPILETIME) - scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix(); -#endif - -/* -#if !defined(SCRYPT_TEST) - static int power_on_self_test = 0; - if (!power_on_self_test) { - power_on_self_test = 1; - if (!scrypt_power_on_self_test()) - scrypt_fatal_error("scrypt: power on self test failed"); - } -#endif -*/ - V = scrypt_alloc((uint64_t)512 * chunk_bytes); - YX = scrypt_alloc(2 * chunk_bytes); - - /* 1: X = PBKDF2(password, salt) */ - Y = YX.ptr; - X = Y + chunk_bytes; - scrypt_pbkdf2(password, password_len, salt, salt_len, 1, X, chunk_bytes); - - /* 2: X = ROMix(X) */ - scrypt_ROMix((scrypt_mix_word_t *)X, (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V.ptr, 512, 1); - - /* 3: Out = PBKDF2(password, X) */ - scrypt_pbkdf2(password, password_len, X, chunk_bytes, 1, out, 32); - - scrypt_ensure_zero(YX.ptr, 2 * chunk_bytes); - - scrypt_free(&V); - scrypt_free(&YX); -} - -#if defined( _WINDOWS ) -#if !defined( QT_GUI ) -} /* extern "C" */ -#endif -#endif diff --git a/algo/argon2/argon2a/ar2/ar2-scrypt-jane.h b/algo/argon2/argon2a/ar2/ar2-scrypt-jane.h deleted file mode 100644 index 78006e5..0000000 --- a/algo/argon2/argon2a/ar2/ar2-scrypt-jane.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef AR2_SCRYPT_JANE_H -#define AR2_SCRYPT_JANE_H - -#ifdef _MSC_VER -#undef SCRYPT_CHOOSE_COMPILETIME -#endif -//#define SCRYPT_TEST -#define SCRYPT_SKEIN512 -#define SCRYPT_SALSA64 - -/* - Nfactor: Increases CPU & Memory Hardness - N = (1 << (Nfactor + 1)): How many times to mix a chunk and how many temporary chunks are used - - rfactor: Increases Memory Hardness - r = (1 << rfactor): How large a chunk is - - pfactor: Increases CPU Hardness - p = (1 << pfactor): Number of times to mix the main chunk - - A block is the 
basic mixing unit (salsa/chacha block = 64 bytes) - A chunk is (2 * r) blocks - - ~Memory used = (N + 2) * ((2 * r) * block size) -*/ - -#include -#include - -typedef void (*scrypt_fatal_errorfn)(const char *msg); -void scrypt_set_fatal_error(scrypt_fatal_errorfn fn); - -void scrypt(const unsigned char *password, size_t password_len, const unsigned char *salt, size_t salt_len, unsigned char Nfactor, unsigned char rfactor, unsigned char pfactor, unsigned char *out, size_t bytes); -void my_scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out); -#endif /* AR2_SCRYPT_JANE_H */ diff --git a/algo/argon2/argon2a/ar2/argon2.c b/algo/argon2/argon2a/ar2/argon2.c deleted file mode 100644 index c238598..0000000 --- a/algo/argon2/argon2a/ar2/argon2.c +++ /dev/null @@ -1,284 +0,0 @@ -/* - * Argon2 source code package - * - * Written by Daniel Dinu and Dmitry Khovratovich, 2015 - * - * This work is licensed under a Creative Commons CC0 1.0 License/Waiver. - * - * You should have received a copy of the CC0 Public Domain Dedication along - * with - * this software. If not, see - * . 
- */ - -#include -#include -#include -#include - -#include "argon2.h" -#include "cores.h" - -/* Error messages */ -static const char *Argon2_ErrorMessage[] = { - /*{ARGON2_OK, */ "OK", - /*}, - - {ARGON2_OUTPUT_PTR_NULL, */ "Output pointer is NULL", - /*}, - -{ARGON2_OUTPUT_TOO_SHORT, */ "Output is too short", - /*}, -{ARGON2_OUTPUT_TOO_LONG, */ "Output is too long", - /*}, - -{ARGON2_PWD_TOO_SHORT, */ "Password is too short", - /*}, -{ARGON2_PWD_TOO_LONG, */ "Password is too long", - /*}, - -{ARGON2_SALT_TOO_SHORT, */ "Salt is too short", - /*}, -{ARGON2_SALT_TOO_LONG, */ "Salt is too long", - /*}, - -{ARGON2_AD_TOO_SHORT, */ "Associated data is too short", - /*}, -{ARGON2_AD_TOO_LONG, */ "Associated date is too long", - /*}, - -{ARGON2_SECRET_TOO_SHORT, */ "Secret is too short", - /*}, -{ARGON2_SECRET_TOO_LONG, */ "Secret is too long", - /*}, - -{ARGON2_TIME_TOO_SMALL, */ "Time cost is too small", - /*}, -{ARGON2_TIME_TOO_LARGE, */ "Time cost is too large", - /*}, - -{ARGON2_MEMORY_TOO_LITTLE, */ "Memory cost is too small", - /*}, -{ARGON2_MEMORY_TOO_MUCH, */ "Memory cost is too large", - /*}, - -{ARGON2_LANES_TOO_FEW, */ "Too few lanes", - /*}, -{ARGON2_LANES_TOO_MANY, */ "Too many lanes", - /*}, - -{ARGON2_PWD_PTR_MISMATCH, */ "Password pointer is NULL, but password length is not 0", - /*}, -{ARGON2_SALT_PTR_MISMATCH, */ "Salt pointer is NULL, but salt length is not 0", - /*}, -{ARGON2_SECRET_PTR_MISMATCH, */ "Secret pointer is NULL, but secret length is not 0", - /*}, -{ARGON2_AD_PTR_MISMATCH, */ "Associated data pointer is NULL, but ad length is not 0", - /*}, - -{ARGON2_MEMORY_ALLOCATION_ERROR, */ "Memory allocation error", - /*}, - -{ARGON2_FREE_MEMORY_CBK_NULL, */ "The free memory callback is NULL", - /*}, -{ARGON2_ALLOCATE_MEMORY_CBK_NULL, */ "The allocate memory callback is NULL", - /*}, - -{ARGON2_INCORRECT_PARAMETER, */ "Argon2_Context context is NULL", - /*}, -{ARGON2_INCORRECT_TYPE, */ "There is no such version of Argon2", - /*}, - 
-{ARGON2_OUT_PTR_MISMATCH, */ "Output pointer mismatch", - /*}, - -{ARGON2_THREADS_TOO_FEW, */ "Not enough threads", - /*}, -{ARGON2_THREADS_TOO_MANY, */ "Too many threads", - /*}, -{ARGON2_MISSING_ARGS, */ "Missing arguments", /*},*/ -}; - -int argon2d(argon2_context *context) { return ar2_argon2_core(context, Argon2_d); } - -int argon2i(argon2_context *context) { return ar2_argon2_core(context, Argon2_i); } - -int ar2_verify_d(argon2_context *context, const char *hash) -{ - int result; - /*if (0 == context->outlen || NULL == hash) { - return ARGON2_OUT_PTR_MISMATCH; - }*/ - - result = ar2_argon2_core(context, Argon2_d); - - if (ARGON2_OK != result) { - return result; - } - - return 0 == memcmp(hash, context->out, 32); -} - -const char *error_message(int error_code) -{ - enum { - /* Make sure---at compile time---that the enum size matches the array - size */ - ERROR_STRING_CHECK = - 1 / - !!((sizeof(Argon2_ErrorMessage) / sizeof(Argon2_ErrorMessage[0])) == - ARGON2_ERROR_CODES_LENGTH) - }; - if (error_code < ARGON2_ERROR_CODES_LENGTH) { - return Argon2_ErrorMessage[(argon2_error_codes)error_code]; - } - return "Unknown error code."; -} - -/* encoding/decoding helpers */ - -/* - * Some macros for constant-time comparisons. These work over values in - * the 0..255 range. Returned value is 0x00 on "false", 0xFF on "true". - */ -#define EQ(x, y) ((((0U - ((unsigned)(x) ^ (unsigned)(y))) >> 8) & 0xFF) ^ 0xFF) -#define GT(x, y) ((((unsigned)(y) - (unsigned)(x)) >> 8) & 0xFF) -#define GE(x, y) (GT(y, x) ^ 0xFF) -#define LT(x, y) GT(y, x) -#define LE(x, y) GE(y, x) - -/* - * Convert value x (0..63) to corresponding Base64 character. - */ -static int b64_byte_to_char(unsigned x) { -//static inline int b64_byte_to_char(unsigned x) { - return (LT(x, 26) & (x + 'A')) | - (GE(x, 26) & LT(x, 52) & (x + ('a' - 26))) | - (GE(x, 52) & LT(x, 62) & (x + ('0' - 52))) | (EQ(x, 62) & '+') | - (EQ(x, 63) & '/'); -} - -/* - * Convert some bytes to Base64. 
'dst_len' is the length (in characters) - * of the output buffer 'dst'; if that buffer is not large enough to - * receive the result (including the terminating 0), then (size_t)-1 - * is returned. Otherwise, the zero-terminated Base64 string is written - * in the buffer, and the output length (counted WITHOUT the terminating - * zero) is returned. - */ -static size_t to_base64(char *dst, size_t dst_len, const void *src) -{ - size_t olen; - const unsigned char *buf; - unsigned acc, acc_len; - - olen = 43; - /*switch (32 % 3) { - case 2: - olen++;*/ - /* fall through */ - /*case 1: - olen += 2; - break; - }*/ - if (dst_len <= olen) { - return (size_t)-1; - } - acc = 0; - acc_len = 0; - buf = (const unsigned char *)src; - size_t src_len = 32; - while (src_len-- > 0) { - acc = (acc << 8) + (*buf++); - acc_len += 8; - while (acc_len >= 6) { - acc_len -= 6; - *dst++ = b64_byte_to_char((acc >> acc_len) & 0x3F); - } - } - if (acc_len > 0) { - *dst++ = b64_byte_to_char((acc << (6 - acc_len)) & 0x3F); - } - *dst++ = 0; - return olen; -} - -/* ==================================================================== */ -/* - * Code specific to Argon2i. - * - * The code below applies the following format: - * - * $argon2i$m=,t=,p=[,keyid=][,data=][$[$]] - * - * where is a decimal integer (positive, fits in an 'unsigned long') - * and is Base64-encoded data (no '=' padding characters, no newline - * or whitespace). The "keyid" is a binary identifier for a key (up to 8 - * bytes); "data" is associated data (up to 32 bytes). When the 'keyid' - * (resp. the 'data') is empty, then it is ommitted from the output. - * - * The last two binary chunks (encoded in Base64) are, in that order, - * the salt and the output. Both are optional, but you cannot have an - * output without a salt. The binary salt length is between 8 and 48 bytes. - * The output length is always exactly 32 bytes. 
- */ - -int ar2_encode_string(char *dst, size_t dst_len, argon2_context *ctx) -{ -#define SS(str) \ - do { \ - size_t pp_len = strlen(str); \ - if (pp_len >= dst_len) { \ - return 0; \ - } \ - memcpy(dst, str, pp_len + 1); \ - dst += pp_len; \ - dst_len -= pp_len; \ - } while (0) - -#define SX(x) \ - do { \ - char tmp[30]; \ - sprintf(tmp, "%lu", (unsigned long)(x)); \ - SS(tmp); \ - } while (0); - -#define SB(buf) \ - do { \ - size_t sb_len = to_base64(dst, dst_len, buf); \ - if (sb_len == (size_t)-1) { \ - return 0; \ - } \ - dst += sb_len; \ - dst_len -= sb_len; \ - } while (0); - - SS("$argon2i$m="); - SX(16); - SS(",t="); - SX(2); - SS(",p="); - SX(1); - - /*if (ctx->adlen > 0) { - SS(",data="); - SB(ctx->ad, ctx->adlen); - }*/ - - /*if (ctx->saltlen == 0) - return 1;*/ - - SS("$"); - SB(ctx->salt); - - /*if (ctx->outlen32 == 0) - return 1;*/ - - SS("$"); - SB(ctx->out); - return 1; - -#undef SS -#undef SX -#undef SB -} diff --git a/algo/argon2/argon2a/ar2/argon2.h b/algo/argon2/argon2a/ar2/argon2.h deleted file mode 100644 index 09fa983..0000000 --- a/algo/argon2/argon2a/ar2/argon2.h +++ /dev/null @@ -1,292 +0,0 @@ -/* - * Argon2 source code package - * - * Written by Daniel Dinu and Dmitry Khovratovich, 2015 - * - * This work is licensed under a Creative Commons CC0 1.0 License/Waiver. - * - * You should have received a copy of the CC0 Public Domain Dedication along - * with - * this software. If not, see - * . 
- */ -#ifndef ARGON2_H -#define ARGON2_H - -#include -#include -#include - -#if defined(__cplusplus) -extern "C" { -#endif - -/*************************Argon2 input parameter - * restrictions**************************************************/ - -/* Minimum and maximum number of lanes (degree of parallelism) */ -#define ARGON2_MIN_LANES UINT32_C(1) -#define ARGON2_MAX_LANES UINT32_C(0xFFFFFF) - -/* Minimum and maximum number of threads */ -#define ARGON2_MIN_THREADS UINT32_C(1) -#define ARGON2_MAX_THREADS UINT32_C(0xFFFFFF) - -/* Number of synchronization points between lanes per pass */ -#define ARGON2_SYNC_POINTS UINT32_C(4) - -/* Minimum and maximum digest size in bytes */ -#define ARGON2_MIN_OUTLEN UINT32_C(4) -#define ARGON2_MAX_OUTLEN UINT32_C(0xFFFFFFFF) - -/* Minimum and maximum number of memory blocks (each of BLOCK_SIZE bytes) */ -#define ARGON2_MIN_MEMORY (2 * ARGON2_SYNC_POINTS) /* 2 blocks per slice */ - -#define ARGON2_MIN(a, b) ((a) < (b) ? (a) : (b)) -/* Max memory size is half the addressing space, topping at 2^32 blocks (4 TB) - */ -#define ARGON2_MAX_MEMORY_BITS \ - ARGON2_MIN(UINT32_C(32), (sizeof(void *) * CHAR_BIT - 10 - 1)) -#define ARGON2_MAX_MEMORY \ - ARGON2_MIN(UINT32_C(0xFFFFFFFF), UINT64_C(1) << ARGON2_MAX_MEMORY_BITS) - -/* Minimum and maximum number of passes */ -#define ARGON2_MIN_TIME UINT32_C(1) -#define ARGON2_MAX_TIME UINT32_C(0xFFFFFFFF) - -/* Minimum and maximum password length in bytes */ -#define ARGON2_MIN_PWD_LENGTH UINT32_C(0) -#define ARGON2_MAX_PWD_LENGTH UINT32_C(0xFFFFFFFF) - -/* Minimum and maximum associated data length in bytes */ -#define ARGON2_MIN_AD_LENGTH UINT32_C(0) -#define ARGON2_MAX_AD_LENGTH UINT32_C(0xFFFFFFFF) - -/* Minimum and maximum salt length in bytes */ -#define ARGON2_MIN_SALT_LENGTH UINT32_C(8) -#define ARGON2_MAX_SALT_LENGTH UINT32_C(0xFFFFFFFF) - -/* Minimum and maximum key length in bytes */ -#define ARGON2_MIN_SECRET UINT32_C(0) -#define ARGON2_MAX_SECRET UINT32_C(0xFFFFFFFF) - -#define 
ARGON2_FLAG_CLEAR_PASSWORD (UINT32_C(1) << 0) -#define ARGON2_FLAG_CLEAR_SECRET (UINT32_C(1) << 1) -#define ARGON2_FLAG_CLEAR_MEMORY (UINT32_C(1) << 2) -#define ARGON2_DEFAULT_FLAGS \ - (ARGON2_FLAG_CLEAR_PASSWORD | ARGON2_FLAG_CLEAR_MEMORY) - -/* Error codes */ -typedef enum Argon2_ErrorCodes { - ARGON2_OK = 0, - - ARGON2_OUTPUT_PTR_NULL = 1, - - ARGON2_OUTPUT_TOO_SHORT = 2, - ARGON2_OUTPUT_TOO_LONG = 3, - - ARGON2_PWD_TOO_SHORT = 4, - ARGON2_PWD_TOO_LONG = 5, - - ARGON2_SALT_TOO_SHORT = 6, - ARGON2_SALT_TOO_LONG = 7, - - ARGON2_AD_TOO_SHORT = 8, - ARGON2_AD_TOO_LONG = 9, - - ARGON2_SECRET_TOO_SHORT = 10, - ARGON2_SECRET_TOO_LONG = 11, - - ARGON2_TIME_TOO_SMALL = 12, - ARGON2_TIME_TOO_LARGE = 13, - - ARGON2_MEMORY_TOO_LITTLE = 14, - ARGON2_MEMORY_TOO_MUCH = 15, - - ARGON2_LANES_TOO_FEW = 16, - ARGON2_LANES_TOO_MANY = 17, - - ARGON2_PWD_PTR_MISMATCH = 18, /* NULL ptr with non-zero length */ - ARGON2_SALT_PTR_MISMATCH = 19, /* NULL ptr with non-zero length */ - ARGON2_SECRET_PTR_MISMATCH = 20, /* NULL ptr with non-zero length */ - ARGON2_AD_PTR_MISMATCH = 21, /* NULL ptr with non-zero length */ - - ARGON2_MEMORY_ALLOCATION_ERROR = 22, - - ARGON2_FREE_MEMORY_CBK_NULL = 23, - ARGON2_ALLOCATE_MEMORY_CBK_NULL = 24, - - ARGON2_INCORRECT_PARAMETER = 25, - ARGON2_INCORRECT_TYPE = 26, - - ARGON2_OUT_PTR_MISMATCH = 27, - - ARGON2_THREADS_TOO_FEW = 28, - ARGON2_THREADS_TOO_MANY = 29, - - ARGON2_MISSING_ARGS = 30, - - ARGON2_ERROR_CODES_LENGTH /* Do NOT remove; Do NOT add error codes after - this - error code */ -} argon2_error_codes; - -/* Memory allocator types --- for external allocation */ -typedef int (*allocate_fptr)(uint8_t **memory, size_t bytes_to_allocate); -typedef void (*deallocate_fptr)(uint8_t *memory, size_t bytes_to_allocate); - -/* Argon2 external data structures */ - -/* - *****Context: structure to hold Argon2 inputs: - * output array and its length, - * password and its length, - * salt and its length, - * secret and its length, - * associated data and its 
length, - * number of passes, amount of used memory (in KBytes, can be rounded up a bit) - * number of parallel threads that will be run. - * All the parameters above affect the output hash value. - * Additionally, two function pointers can be provided to allocate and - deallocate the memory (if NULL, memory will be allocated internally). - * Also, three flags indicate whether to erase password, secret as soon as they - are pre-hashed (and thus not needed anymore), and the entire memory - **************************** - Simplest situation: you have output array out[8], password is stored in - pwd[32], salt is stored in salt[16], you do not have keys nor associated data. - You need to spend 1 GB of RAM and you run 5 passes of Argon2d with 4 parallel - lanes. - You want to erase the password, but you're OK with last pass not being erased. - You want to use the default memory allocator. - */ -typedef struct Argon2_Context { - uint8_t *out; /* output array */ - uint8_t *pwd; /* password array */ - uint8_t *salt; /* salt array */ - /*uint8_t *secret;*/ /* key array */ - /*uint8_t *ad;*/ /* associated data array */ - - allocate_fptr allocate_cbk; /* pointer to memory allocator */ - deallocate_fptr free_cbk; /* pointer to memory deallocator */ - - /*uint32_t outlen;*/ /* digest length */ - uint32_t pwdlen; /* password length */ - /*uint32_t saltlen;*/ /* salt length */ - /*uint32_t secretlen;*/ /* key length */ - /*uint32_t adlen;*/ /* associated data length */ - /*uint32_t t_cost;*/ /* number of passes */ - /*uint32_t m_cost;*/ /* amount of memory requested (KB) */ - /*uint32_t lanes;*/ /* number of lanes */ - /*uint32_t threads;*/ /* maximum number of threads */ - /*uint32_t flags;*/ /* array of bool options */ - -} argon2_context; - -/** - * Function to hash the inputs in the memory-hard fashion (uses Argon2i) - * @param out Pointer to the memory where the hash digest will be written - * @param outlen Digest length in bytes - * @param in Pointer to the input (password) 
- * @param inlen Input length in bytes - * @param salt Pointer to the salt - * @param saltlen Salt length in bytes - * @pre @a out must have at least @a outlen bytes allocated - * @pre @a in must be at least @inlen bytes long - * @pre @a saltlen must be at least @saltlen bytes long - * @return Zero if successful, 1 otherwise. - */ -/*int hash_argon2i(void *out, size_t outlen, const void *in, size_t inlen, - const void *salt, size_t saltlen, unsigned int t_cost, - unsigned int m_cost);*/ - -/* same for argon2d */ -/*int hash_argon2d(void *out, size_t outlen, const void *in, size_t inlen, - const void *salt, size_t saltlen, unsigned int t_cost, - unsigned int m_cost);*/ - -/* - * **************Argon2d: Version of Argon2 that picks memory blocks depending - * on the password and salt. Only for side-channel-free - * environment!!*************** - * @param context Pointer to current Argon2 context - * @return Zero if successful, a non zero error code otherwise - */ -int argon2d(argon2_context *context); - -/* - * * **************Argon2i: Version of Argon2 that picks memory blocks - *independent on the password and salt. Good for side-channels, - ******************* but worse w.r.t. 
tradeoff attacks if - *******************only one pass is used*************** - * @param context Pointer to current Argon2 context - * @return Zero if successful, a non zero error code otherwise - */ -int argon2i(argon2_context *context); - -/* - * * **************Argon2di: Reserved name*************** - * @param context Pointer to current Argon2 context - * @return Zero if successful, a non zero error code otherwise - */ -int argon2di(argon2_context *context); - -/* - * * **************Argon2ds: Argon2d hardened against GPU attacks, 20% - * slower*************** - * @param context Pointer to current Argon2 context - * @return Zero if successful, a non zero error code otherwise - */ -int argon2ds(argon2_context *context); - -/* - * * **************Argon2id: First half-pass over memory is - *password-independent, the rest are password-dependent - ********************OK against side channels: they reduce to 1/2-pass - *Argon2i*************** - * @param context Pointer to current Argon2 context - * @return Zero if successful, a non zero error code otherwise - */ -int argon2id(argon2_context *context); - -/* - * Verify if a given password is correct for Argon2d hashing - * @param context Pointer to current Argon2 context - * @param hash The password hash to verify. The length of the hash is - * specified by the context outlen member - * @return Zero if successful, a non zero error code otherwise - */ -int ar2_verify_d(argon2_context *context, const char *hash); - -/* - * Get the associated error message for given error code - * @return The error message associated with the given error code - */ -const char *error_message(int error_code); - -/* ==================================================================== */ -/* - * Code specific to Argon2i. 
- * - * The code below applies the following format: - * - * $argon2i$m=,t=,p=[,keyid=][,data=][$[$]] - * - * where is a decimal integer (positive, fits in an 'unsigned long') - * and is Base64-encoded data (no '=' padding characters, no newline - * or whitespace). The "keyid" is a binary identifier for a key (up to 8 - * bytes); "data" is associated data (up to 32 bytes). When the 'keyid' - * (resp. the 'data') is empty, then it is ommitted from the output. - * - * The last two binary chunks (encoded in Base64) are, in that order, - * the salt and the output. Both are optional, but you cannot have an - * output without a salt. The binary salt length is between 8 and 48 bytes. - * The output length is always exactly 32 bytes. - */ - -int ar2_encode_string(char *dst, size_t dst_len, argon2_context *ctx); - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/algo/argon2/argon2a/ar2/bench.c b/algo/argon2/argon2a/ar2/bench.c deleted file mode 100644 index be61c44..0000000 --- a/algo/argon2/argon2a/ar2/bench.c +++ /dev/null @@ -1,114 +0,0 @@ -#include -#include -#include -#include -#include -#ifdef _MSC_VER -#include -#endif - -#include "argon2.h" - -static uint64_t rdtsc(void) -{ -#ifdef _MSC_VER - return __rdtsc(); -#else -#if defined(__amd64__) || defined(__x86_64__) - uint64_t rax, rdx; - __asm__ __volatile__("rdtsc" : "=a"(rax), "=d"(rdx) : :); - return (rdx << 32) | rax; -#elif defined(__i386__) || defined(__i386) || defined(__X86__) - uint64_t rax; - __asm__ __volatile__("rdtsc" : "=A"(rax) : :); - return rax; -#else -#error "Not implemented!" 
-#endif -#endif -} - -/* - * Benchmarks Argon2 with salt length 16, password length 16, t_cost 1, - and different m_cost and threads - */ -static void benchmark() -{ -#define BENCH_OUTLEN 16 -#define BENCH_INLEN 16 - const uint32_t inlen = BENCH_INLEN; - const unsigned outlen = BENCH_OUTLEN; - unsigned char out[BENCH_OUTLEN]; - unsigned char pwd_array[BENCH_INLEN]; - unsigned char salt_array[BENCH_INLEN]; -#undef BENCH_INLEN -#undef BENCH_OUTLEN - - uint32_t t_cost = 1; - uint32_t m_cost; - uint32_t thread_test[6] = {1, 2, 4, 6, 8, 16}; - - memset(pwd_array, 0, inlen); - memset(salt_array, 1, inlen); - - for (m_cost = (uint32_t)1 << 10; m_cost <= (uint32_t)1 << 22; m_cost *= 2) { - unsigned i; - for (i = 0; i < 6; ++i) { - argon2_context context; - uint32_t thread_n = thread_test[i]; - uint64_t stop_cycles, stop_cycles_i; - clock_t stop_time; - uint64_t delta_d, delta_i; - double mcycles_d, mcycles_i, run_time; - - clock_t start_time = clock(); - uint64_t start_cycles = rdtsc(); - - context.out = out; - context.outlen = outlen; - context.pwd = pwd_array; - context.pwdlen = inlen; - context.salt = salt_array; - context.saltlen = inlen; - context.secret = NULL; - context.secretlen = 0; - context.ad = NULL; - context.adlen = 0; - context.t_cost = t_cost; - context.m_cost = m_cost; - context.lanes = thread_n; - context.threads = thread_n; - context.allocate_cbk = NULL; - context.free_cbk = NULL; - context.flags = 0; - - argon2d(&context); - stop_cycles = rdtsc(); - argon2i(&context); - stop_cycles_i = rdtsc(); - stop_time = clock(); - - delta_d = (stop_cycles - start_cycles) / (m_cost); - delta_i = (stop_cycles_i - stop_cycles) / (m_cost); - mcycles_d = (double)(stop_cycles - start_cycles) / (1UL << 20); - mcycles_i = (double)(stop_cycles_i - stop_cycles) / (1UL << 20); - printf("Argon2d %d iterations %d MiB %d threads: %2.2f cpb %2.2f " - "Mcycles \n", - t_cost, m_cost >> 10, thread_n, (float)delta_d / 1024, - mcycles_d); - printf("Argon2i %d iterations %d MiB %d 
threads: %2.2f cpb %2.2f " - "Mcycles \n", - t_cost, m_cost >> 10, thread_n, (float)delta_i / 1024, - mcycles_i); - - run_time = ((double)stop_time - start_time) / (CLOCKS_PER_SEC); - printf("%2.4f seconds\n\n", run_time); - } - } -} - -int main() -{ - benchmark(); - return ARGON2_OK; -} diff --git a/algo/argon2/argon2a/ar2/blake2/blake2-impl.h b/algo/argon2/argon2a/ar2/blake2/blake2-impl.h deleted file mode 100644 index 9bab8e2..0000000 --- a/algo/argon2/argon2a/ar2/blake2/blake2-impl.h +++ /dev/null @@ -1,143 +0,0 @@ -#ifndef PORTABLE_BLAKE2_IMPL_H -#define PORTABLE_BLAKE2_IMPL_H - -#include -#include - -#if defined(_MSC_VER) -#define BLAKE2_INLINE __inline -#elif defined(__GNUC__) || defined(__clang__) -#define BLAKE2_INLINE __inline__ -#else -#define BLAKE2_INLINE -#endif - -/* Argon2 Team - Begin Code */ -/* - Not an exhaustive list, but should cover the majority of modern platforms - Additionally, the code will always be correct---this is only a performance - tweak. -*/ -#if (defined(__BYTE_ORDER__) && \ - (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \ - defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || defined(__MIPSEL__) || \ - defined(__AARCH64EL__) || defined(__amd64__) || defined(__i386__) || \ - defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) || \ - defined(_M_ARM) -#define NATIVE_LITTLE_ENDIAN -#endif -/* Argon2 Team - End Code */ - -static BLAKE2_INLINE uint32_t load32(const void *src) { -#if defined(NATIVE_LITTLE_ENDIAN) - uint32_t w; - memcpy(&w, src, sizeof w); - return w; -#else - const uint8_t *p = (const uint8_t *)src; - uint32_t w = *p++; - w |= (uint32_t)(*p++) << 8; - w |= (uint32_t)(*p++) << 16; - w |= (uint32_t)(*p++) << 24; - return w; -#endif -} - -static BLAKE2_INLINE uint64_t load64(const void *src) { -#if defined(NATIVE_LITTLE_ENDIAN) - uint64_t w; - memcpy(&w, src, sizeof w); - return w; -#else - const uint8_t *p = (const uint8_t *)src; - uint64_t w = *p++; - w |= (uint64_t)(*p++) << 8; - w |= (uint64_t)(*p++) << 16; - w 
|= (uint64_t)(*p++) << 24; - w |= (uint64_t)(*p++) << 32; - w |= (uint64_t)(*p++) << 40; - w |= (uint64_t)(*p++) << 48; - w |= (uint64_t)(*p++) << 56; - return w; -#endif -} - -static BLAKE2_INLINE void store32(void *dst, uint32_t w) { -#if defined(NATIVE_LITTLE_ENDIAN) - memcpy(dst, &w, sizeof w); -#else - uint8_t *p = (uint8_t *)dst; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; -#endif -} - -static BLAKE2_INLINE void store64(void *dst, uint64_t w) { -#if defined(NATIVE_LITTLE_ENDIAN) - memcpy(dst, &w, sizeof w); -#else - uint8_t *p = (uint8_t *)dst; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; -#endif -} - -static BLAKE2_INLINE uint64_t load48(const void *src) { - const uint8_t *p = (const uint8_t *)src; - uint64_t w = *p++; - w |= (uint64_t)(*p++) << 8; - w |= (uint64_t)(*p++) << 16; - w |= (uint64_t)(*p++) << 24; - w |= (uint64_t)(*p++) << 32; - w |= (uint64_t)(*p++) << 40; - return w; -} - -static BLAKE2_INLINE void store48(void *dst, uint64_t w) { - uint8_t *p = (uint8_t *)dst; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; -} - -static BLAKE2_INLINE uint32_t rotr32(const uint32_t w, const unsigned c) { - return (w >> c) | (w << (32 - c)); -} - -static BLAKE2_INLINE uint64_t rotr64(const uint64_t w, const unsigned c) { - return (w >> c) | (w << (64 - c)); -} - -/* prevents compiler optimizing out memset() */ -static BLAKE2_INLINE void burn(void *v, size_t n) { - static void *(*const volatile memset_v)(void *, int, size_t) = &memset; - memset_v(v, 0, n); -} - -#endif diff --git a/algo/argon2/argon2a/ar2/blake2/blake2.h b/algo/argon2/argon2a/ar2/blake2/blake2.h 
deleted file mode 100644 index 90f2a50..0000000 --- a/algo/argon2/argon2a/ar2/blake2/blake2.h +++ /dev/null @@ -1,76 +0,0 @@ -#ifndef PORTABLE_BLAKE2_H -#define PORTABLE_BLAKE2_H - -#include -#include -#include - -#if defined(__cplusplus) -extern "C" { -#endif - -enum blake2b_constant { - BLAKE2B_BLOCKBYTES = 128, - BLAKE2B_OUTBYTES = 64, - BLAKE2B_KEYBYTES = 64, - BLAKE2B_SALTBYTES = 16, - BLAKE2B_PERSONALBYTES = 16 -}; - -#pragma pack(push, 1) -typedef struct __blake2b_param { - uint8_t digest_length; /* 1 */ - uint8_t key_length; /* 2 */ - uint8_t fanout; /* 3 */ - uint8_t depth; /* 4 */ - uint32_t leaf_length; /* 8 */ - uint64_t node_offset; /* 16 */ - uint8_t node_depth; /* 17 */ - uint8_t inner_length; /* 18 */ - uint8_t reserved[14]; /* 32 */ - uint8_t salt[BLAKE2B_SALTBYTES]; /* 48 */ - uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */ -} blake2b_param; -#pragma pack(pop) - -typedef struct __blake2b_state { - uint64_t h[8]; - uint64_t t[2]; - uint64_t f[2]; - unsigned buflen; - unsigned outlen; - uint8_t last_node; - uint8_t buf[BLAKE2B_BLOCKBYTES]; -} blake2b_state; - -/* Ensure param structs have not been wrongly padded */ -/* Poor man's static_assert */ -enum { - blake2_size_check_0 = 1 / !!(CHAR_BIT == 8), - blake2_size_check_2 = - 1 / !!(sizeof(blake2b_param) == sizeof(uint64_t) * CHAR_BIT) -}; - -/* Streaming API */ -int ar2_blake2b_init(blake2b_state *S, size_t outlen); -int ar2_blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, - size_t keylen); -int ar2_blake2b_init_param(blake2b_state *S, const blake2b_param *P); -int ar2_blake2b_update(blake2b_state *S, const void *in, size_t inlen); -void my_blake2b_update(blake2b_state *S, const void *in, size_t inlen); -int ar2_blake2b_final(blake2b_state *S, void *out, size_t outlen); - -/* Simple API */ -int ar2_blake2b(void *out, const void *in, const void *key, size_t keylen); - -/* Argon2 Team - Begin Code */ -int ar2_blake2b_long(void *out, const void *in); -/* Argon2 Team - End Code */ 
-/* Miouyouyou */ -void ar2_blake2b_too(void *out, const void *in); - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/algo/argon2/argon2a/ar2/blake2/blamka-round-opt.h b/algo/argon2/argon2a/ar2/blake2/blamka-round-opt.h deleted file mode 100644 index 690686d..0000000 --- a/algo/argon2/argon2a/ar2/blake2/blamka-round-opt.h +++ /dev/null @@ -1,162 +0,0 @@ -#ifndef BLAKE_ROUND_MKA_OPT_H -#define BLAKE_ROUND_MKA_OPT_H - -#include "blake2-impl.h" - -#if defined(_MSC_VER) -#include -#endif - -#include -#if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__)) -#include -#endif - -#if !defined(__XOP__) -#if defined(__SSSE3__) -#define r16 \ - (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)) -#define r24 \ - (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)) -#define _mm_roti_epi64(x, c) \ - (-(c) == 32) \ - ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \ - : (-(c) == 24) \ - ? _mm_shuffle_epi8((x), r24) \ - : (-(c) == 16) \ - ? _mm_shuffle_epi8((x), r16) \ - : (-(c) == 63) \ - ? 
_mm_xor_si128(_mm_srli_epi64((x), -(c)), \ - _mm_add_epi64((x), (x))) \ - : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \ - _mm_slli_epi64((x), 64 - (-(c)))) -#else /* defined(__SSE2__) */ -#define _mm_roti_epi64(r, c) \ - _mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c)))) -#endif -#else -#endif - -static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { - const __m128i z = _mm_mul_epu32(x, y); - return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z)); -} - -#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - A0 = fBlaMka(A0, B0); \ - A1 = fBlaMka(A1, B1); \ - \ - D0 = _mm_xor_si128(D0, A0); \ - D1 = _mm_xor_si128(D1, A1); \ - \ - D0 = _mm_roti_epi64(D0, -32); \ - D1 = _mm_roti_epi64(D1, -32); \ - \ - C0 = fBlaMka(C0, D0); \ - C1 = fBlaMka(C1, D1); \ - \ - B0 = _mm_xor_si128(B0, C0); \ - B1 = _mm_xor_si128(B1, C1); \ - \ - B0 = _mm_roti_epi64(B0, -24); \ - B1 = _mm_roti_epi64(B1, -24); \ - } while ((void)0, 0) - -#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - A0 = fBlaMka(A0, B0); \ - A1 = fBlaMka(A1, B1); \ - \ - D0 = _mm_xor_si128(D0, A0); \ - D1 = _mm_xor_si128(D1, A1); \ - \ - D0 = _mm_roti_epi64(D0, -16); \ - D1 = _mm_roti_epi64(D1, -16); \ - \ - C0 = fBlaMka(C0, D0); \ - C1 = fBlaMka(C1, D1); \ - \ - B0 = _mm_xor_si128(B0, C0); \ - B1 = _mm_xor_si128(B1, C1); \ - \ - B0 = _mm_roti_epi64(B0, -63); \ - B1 = _mm_roti_epi64(B1, -63); \ - } while ((void)0, 0) - -#if defined(__SSSE3__) -#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - __m128i t0 = _mm_alignr_epi8(B1, B0, 8); \ - __m128i t1 = _mm_alignr_epi8(B0, B1, 8); \ - B0 = t0; \ - B1 = t1; \ - \ - t0 = C0; \ - C0 = C1; \ - C1 = t0; \ - \ - t0 = _mm_alignr_epi8(D1, D0, 8); \ - t1 = _mm_alignr_epi8(D0, D1, 8); \ - D0 = t1; \ - D1 = t0; \ - } while ((void)0, 0) - -#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - __m128i t0 = _mm_alignr_epi8(B0, B1, 8); \ - __m128i t1 = _mm_alignr_epi8(B1, B0, 8); \ - B0 = t0; \ - B1 = t1; \ - \ - t0 = 
C0; \ - C0 = C1; \ - C1 = t0; \ - \ - t0 = _mm_alignr_epi8(D0, D1, 8); \ - t1 = _mm_alignr_epi8(D1, D0, 8); \ - D0 = t1; \ - D1 = t0; \ - } while ((void)0, 0) -#else /* SSE2 */ -#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - __m128i t0 = D0; \ - __m128i t1 = B0; \ - D0 = C0; \ - C0 = C1; \ - C1 = D0; \ - D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0)); \ - D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1)); \ - B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1)); \ - B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1)); \ - } while ((void)0, 0) - -#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - __m128i t0 = C0; \ - C0 = C1; \ - C1 = t0; \ - t0 = B0; \ - __m128i t1 = D0; \ - B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0)); \ - B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1)); \ - D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1)); \ - D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1)); \ - } while ((void)0, 0) -#endif - -#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \ - do { \ - G1(A0, B0, C0, D0, A1, B1, C1, D1); \ - G2(A0, B0, C0, D0, A1, B1, C1, D1); \ - \ - DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ - \ - G1(A0, B0, C0, D0, A1, B1, C1, D1); \ - G2(A0, B0, C0, D0, A1, B1, C1, D1); \ - \ - UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ - } while ((void)0, 0) - -#endif diff --git a/algo/argon2/argon2a/ar2/blake2/blamka-round-ref.h b/algo/argon2/argon2a/ar2/blake2/blamka-round-ref.h deleted file mode 100644 index f497e10..0000000 --- a/algo/argon2/argon2a/ar2/blake2/blamka-round-ref.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef BLAKE_ROUND_MKA_H -#define BLAKE_ROUND_MKA_H - -#include "blake2.h" -#include "blake2-impl.h" - -/*designed by the Lyra PHC team */ -static BLAKE2_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) { - const uint64_t m = UINT64_C(0xFFFFFFFF); - const uint64_t xy = (x & m) * (y & m); - return x + y + 2 * xy; -} - -#define G(a, b, c, d) \ - do { \ - a = 
fBlaMka(a, b); \ - d = rotr64(d ^ a, 32); \ - c = fBlaMka(c, d); \ - b = rotr64(b ^ c, 24); \ - a = fBlaMka(a, b); \ - d = rotr64(d ^ a, 16); \ - c = fBlaMka(c, d); \ - b = rotr64(b ^ c, 63); \ - } while ((void)0, 0) - -#define BLAKE2_ROUND_NOMSG(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, \ - v12, v13, v14, v15) \ - do { \ - G(v0, v4, v8, v12); \ - G(v1, v5, v9, v13); \ - G(v2, v6, v10, v14); \ - G(v3, v7, v11, v15); \ - G(v0, v5, v10, v15); \ - G(v1, v6, v11, v12); \ - G(v2, v7, v8, v13); \ - G(v3, v4, v9, v14); \ - } while ((void)0, 0) - -#endif diff --git a/algo/argon2/argon2a/ar2/blake2b.c b/algo/argon2/argon2a/ar2/blake2b.c deleted file mode 100644 index 90f2e0e..0000000 --- a/algo/argon2/argon2a/ar2/blake2b.c +++ /dev/null @@ -1,316 +0,0 @@ -#include -#include -#include -#include - -#include "blake2/blake2.h" -#include "blake2/blake2-impl.h" - -#if defined(_MSC_VER) -// i know there is a trick but nvm :p -#define PRIu64 "%llu" -#define PRIx64 "%llx" -#endif - -static const uint64_t blake2b_IV[8] = { - UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b), - UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1), - UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f), - UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179) -}; - -static const unsigned int blake2b_sigma[12][16] = { - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, - {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, - {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, - {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, - {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, - {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, - {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, - {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {14, 10, 4, 8, 9, 15, 13, 
6, 1, 12, 0, 2, 11, 7, 5, 3}, -}; - -static BLAKE2_INLINE void blake2b_set_lastnode(blake2b_state *S) { - S->f[1] = (uint64_t)-1; -} - -static BLAKE2_INLINE void blake2b_set_lastblock(blake2b_state *S) { - if (S->last_node) { - blake2b_set_lastnode(S); - } - S->f[0] = (uint64_t)-1; -} - -static BLAKE2_INLINE void blake2b_increment_counter(blake2b_state *S, uint64_t inc) { - S->t[0] += inc; - S->t[1] += (S->t[0] < inc); -} - -static BLAKE2_INLINE void blake2b_invalidate_state(blake2b_state *S) { - burn(S, sizeof(*S)); /* wipe */ - blake2b_set_lastblock(S); /* invalidate for further use */ -} - -static BLAKE2_INLINE void blake2b_init0(blake2b_state *S) { - memset(S, 0, sizeof(*S)); - memcpy(S->h, blake2b_IV, sizeof(S->h)); -} - -/* -void print_state(blake2b_state BlakeHash) -{ - printf(".h = {UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 "),\n" - "UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 "),\n" - "UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 "),\n" - "UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 ")},\n" - ".t = {UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 ")},\n" - ".f = {UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 ")}\n", - BlakeHash.h[0], BlakeHash.h[1], BlakeHash.h[2], BlakeHash.h[3], - BlakeHash.h[4], BlakeHash.h[5], BlakeHash.h[6], BlakeHash.h[7], - BlakeHash.t[0], BlakeHash.t[1], - BlakeHash.f[0], BlakeHash.f[1]); - printf(".buf = {"); - for (register uint8_t i = 0; i < BLAKE2B_BLOCKBYTES; i++) - printf("%" PRIu8 ", ", BlakeHash.buf[i]); - puts("\n"); - printf("}\n.buflen = %d\n.outlen = %d\n", - BlakeHash.buflen, BlakeHash.outlen); - printf(".last_node = %" PRIu8 "\n", BlakeHash.last_node); - fflush(stdout); -} -*/ - -static const blake2b_state miou = { - .h = { - UINT64_C(7640891576939301128), UINT64_C(13503953896175478587), - UINT64_C(4354685564936845355), UINT64_C(11912009170470909681), - UINT64_C(5840696475078001361), UINT64_C(11170449401992604703), - UINT64_C(2270897969802886507), UINT64_C(6620516959819538809) - }, - .t = {UINT64_C(0), UINT64_C(0)}, - .f = 
{UINT64_C(0), UINT64_C(0)}, - .buf = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - }, - .buflen = 0, - .outlen = 64, - .last_node = 0 -}; - - -int ar2_blake2b_init_param(blake2b_state *S, const blake2b_param *P) -{ - const unsigned char *p = (const unsigned char *)P; - unsigned int i; - - if (NULL == P || NULL == S) { - return -1; - } - - blake2b_init0(S); - /* IV XOR Parameter Block */ - for (i = 0; i < 8; ++i) { - S->h[i] ^= load64(&p[i * sizeof(S->h[i])]); - } - S->outlen = P->digest_length; - return 0; -} - -void compare_buffs(uint64_t *h, size_t outlen) -{ - // printf("CMP : %d", memcmp(h, miou.h, 8*(sizeof(uint64_t)))); - printf("miou : %" PRIu64 " - h : %" PRIu64 " - outlen : %ld\n", miou.h[0], h[0], outlen); - fflush(stdout); -} - -/* Sequential blake2b initialization */ -int ar2_blake2b_init(blake2b_state *S, size_t outlen) -{ - memcpy(S, &miou, sizeof(*S)); - S->h[0] += outlen; - return 0; -} - -void print64(const char *name, const uint64_t *array, uint16_t size) -{ - printf("%s = {", name); - for (uint8_t i = 0; i < size; i++) printf("UINT64_C(%" PRIu64 "), ", array[i]); - printf("};\n"); -} - -int ar2_blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, size_t keylen) -{ - return 0; -} - -static void blake2b_compress(blake2b_state *S, const uint8_t *block) -{ - uint64_t m[16]; - uint64_t v[16]; - unsigned int i, r; - - for (i = 0; i < 16; ++i) { - m[i] = load64(block + i * 8); - } - - for (i = 0; i < 8; ++i) { - v[i] = S->h[i]; - } - - v[8] = blake2b_IV[0]; - v[9] = blake2b_IV[1]; - v[10] = blake2b_IV[2]; - v[11] = blake2b_IV[3]; - v[12] = blake2b_IV[4] ^ S->t[0]; - v[13] = blake2b_IV[5]/* ^ 
S->t[1]*/; - v[14] = blake2b_IV[6] ^ S->f[0]; - v[15] = blake2b_IV[7]/* ^ S->f[1]*/; - -#define G(r, i, a, b, c, d) \ - do { \ - a = a + b + m[blake2b_sigma[r][2 * i + 0]]; \ - d = rotr64(d ^ a, 32); \ - c = c + d; \ - b = rotr64(b ^ c, 24); \ - a = a + b + m[blake2b_sigma[r][2 * i + 1]]; \ - d = rotr64(d ^ a, 16); \ - c = c + d; \ - b = rotr64(b ^ c, 63); \ - } while ((void)0, 0) - -#define ROUND(r) \ - do { \ - G(r, 0, v[0], v[4], v[8], v[12]); \ - G(r, 1, v[1], v[5], v[9], v[13]); \ - G(r, 2, v[2], v[6], v[10], v[14]); \ - G(r, 3, v[3], v[7], v[11], v[15]); \ - G(r, 4, v[0], v[5], v[10], v[15]); \ - G(r, 5, v[1], v[6], v[11], v[12]); \ - G(r, 6, v[2], v[7], v[8], v[13]); \ - G(r, 7, v[3], v[4], v[9], v[14]); \ - } while ((void)0, 0) - - for (r = 0; r < 12; ++r) ROUND(r); - - for (i = 0; i < 8; ++i) S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; - -#undef G -#undef ROUND -} - -int ar2_blake2b_update(blake2b_state *S, const void *in, size_t inlen) -{ - const uint8_t *pin = (const uint8_t *)in; - /* Complete current block */ - memcpy(&S->buf[4], pin, 124); - blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES); - blake2b_compress(S, S->buf); - S->buflen = 0; - pin += 124; - - register int8_t i = 7; - /* Avoid buffer copies when possible */ - while (i--) { - blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES); - blake2b_compress(S, pin); - pin += BLAKE2B_BLOCKBYTES; - } - memcpy(&S->buf[S->buflen], pin, 4); - S->buflen += 4; - return 0; -} - -void my_blake2b_update(blake2b_state *S, const void *in, size_t inlen) -{ - memcpy(&S->buf[S->buflen], in, inlen); - S->buflen += (unsigned int)inlen; -} - -int ar2_blake2b_final(blake2b_state *S, void *out, size_t outlen) -{ - uint8_t buffer[BLAKE2B_OUTBYTES] = {0}; - unsigned int i; - - blake2b_increment_counter(S, S->buflen); - blake2b_set_lastblock(S); - memset(&S->buf[S->buflen], 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */ - blake2b_compress(S, S->buf); - - for (i = 0; i < 8; ++i) { /* Output full hash to temp buffer */ - 
store64(buffer + sizeof(S->h[i]) * i, S->h[i]); - } - - memcpy(out, buffer, S->outlen); - - burn(buffer, sizeof(buffer)); - burn(S->buf, sizeof(S->buf)); - burn(S->h, sizeof(S->h)); - return 0; -} - -int ar2_blake2b(void *out, const void *in, const void *key, size_t keylen) -{ - blake2b_state S; - - ar2_blake2b_init(&S, 64); - my_blake2b_update(&S, in, 64); - ar2_blake2b_final(&S, out, 64); - burn(&S, sizeof(S)); - return 0; -} - -void ar2_blake2b_too(void *pout, const void *in) -{ - uint8_t *out = (uint8_t *)pout; - uint8_t out_buffer[64]; - uint8_t in_buffer[64]; - - blake2b_state blake_state; - ar2_blake2b_init(&blake_state, 64); - blake_state.buflen = blake_state.buf[1] = 4; - my_blake2b_update(&blake_state, in, 72); - ar2_blake2b_final(&blake_state, out_buffer, 64); - memcpy(out, out_buffer, 32); - out += 32; - - register uint8_t i = 29; - while (i--) { - memcpy(in_buffer, out_buffer, 64); - ar2_blake2b(out_buffer, in_buffer, NULL, 0); - memcpy(out, out_buffer, 32); - out += 32; - } - - memcpy(in_buffer, out_buffer, 64); - ar2_blake2b(out_buffer, in_buffer, NULL, 0); - memcpy(out, out_buffer, 64); - - burn(&blake_state, sizeof(blake_state)); -} - -/* Argon2 Team - Begin Code */ -int ar2_blake2b_long(void *pout, const void *in) -{ - uint8_t *out = (uint8_t *)pout; - blake2b_state blake_state; - uint8_t outlen_bytes[sizeof(uint32_t)] = {0}; - - store32(outlen_bytes, 32); - - ar2_blake2b_init(&blake_state, 32); - my_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)); - ar2_blake2b_update(&blake_state, in, 1024); - ar2_blake2b_final(&blake_state, out, 32); - burn(&blake_state, sizeof(blake_state)); - return 0; -} -/* Argon2 Team - End Code */ diff --git a/algo/argon2/argon2a/ar2/cores.c b/algo/argon2/argon2a/ar2/cores.c deleted file mode 100644 index 152a0ba..0000000 --- a/algo/argon2/argon2a/ar2/cores.c +++ /dev/null @@ -1,349 +0,0 @@ -/* - * Argon2 source code package - * - * Written by Daniel Dinu and Dmitry Khovratovich, 2015 - * - * This work is 
licensed under a Creative Commons CC0 1.0 License/Waiver. - * - * You should have received a copy of the CC0 Public Domain Dedication along - * with - * this software. If not, see - * . - */ - -/*For memory wiping*/ -#ifdef _MSC_VER -#include -#include /* For SecureZeroMemory */ -#endif -#if defined __STDC_LIB_EXT1__ -#define __STDC_WANT_LIB_EXT1__ 1 -#endif -#define VC_GE_2005(version) (version >= 1400) - -#include -#include -#include -#include - -#include "argon2.h" -#include "cores.h" -#include "blake2/blake2.h" -#include "blake2/blake2-impl.h" - -#ifdef GENKAT -#include "genkat.h" -#endif - -#if defined(__clang__) -#if __has_attribute(optnone) -#define NOT_OPTIMIZED __attribute__((optnone)) -#endif -#elif defined(__GNUC__) -#define GCC_VERSION \ - (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) -#if GCC_VERSION >= 40400 -#define NOT_OPTIMIZED __attribute__((optimize("O0"))) -#endif -#endif -#ifndef NOT_OPTIMIZED -#define NOT_OPTIMIZED -#endif - -/***************Instance and Position constructors**********/ -void ar2_init_block_value(block *b, uint8_t in) { memset(b->v, in, sizeof(b->v)); } -//inline void init_block_value(block *b, uint8_t in) { memset(b->v, in, sizeof(b->v)); } - -void ar2_copy_block(block *dst, const block *src) { -//inline void copy_block(block *dst, const block *src) { - memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_WORDS_IN_BLOCK); -} - -void ar2_xor_block(block *dst, const block *src) { -//inline void xor_block(block *dst, const block *src) { - int i; - for (i = 0; i < ARGON2_WORDS_IN_BLOCK; ++i) { - dst->v[i] ^= src->v[i]; - } -} - -static void ar2_load_block(block *dst, const void *input) { -//static inline void load_block(block *dst, const void *input) { - unsigned i; - for (i = 0; i < ARGON2_WORDS_IN_BLOCK; ++i) { - dst->v[i] = load64((const uint8_t *)input + i * sizeof(dst->v[i])); - } -} - -static void ar2_store_block(void *output, const block *src) { -//static inline void store_block(void *output, const block 
*src) { - unsigned i; - for (i = 0; i < ARGON2_WORDS_IN_BLOCK; ++i) { - store64((uint8_t *)output + i * sizeof(src->v[i]), src->v[i]); - } -} - -/***************Memory allocators*****************/ -int ar2_allocate_memory(block **memory, uint32_t m_cost) { - if (memory != NULL) { - size_t memory_size = sizeof(block) * m_cost; - if (m_cost != 0 && - memory_size / m_cost != - sizeof(block)) { /*1. Check for multiplication overflow*/ - return ARGON2_MEMORY_ALLOCATION_ERROR; - } - - *memory = (block *)malloc(memory_size); /*2. Try to allocate*/ - - if (!*memory) { - return ARGON2_MEMORY_ALLOCATION_ERROR; - } - - return ARGON2_OK; - } else { - return ARGON2_MEMORY_ALLOCATION_ERROR; - } -} - -void ar2_secure_wipe_memory(void *v, size_t n) { memset(v, 0, n); } -//inline void secure_wipe_memory(void *v, size_t n) { memset(v, 0, n); } - -/*********Memory functions*/ - -void ar2_clear_memory(argon2_instance_t *instance, int clear) { -//inline void clear_memory(argon2_instance_t *instance, int clear) { - if (instance->memory != NULL && clear) { - ar2_secure_wipe_memory(instance->memory, - sizeof(block) * /*instance->memory_blocks*/16); - } -} - -void ar2_free_memory(block *memory) { free(memory); } -//inline void free_memory(block *memory) { free(memory); } - -void ar2_finalize(const argon2_context *context, argon2_instance_t *instance) { - if (context != NULL && instance != NULL) { - block blockhash; - ar2_copy_block(&blockhash, instance->memory + 15); - - /* Hash the result */ - { - uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE]; - ar2_store_block(blockhash_bytes, &blockhash); - ar2_blake2b_long(context->out, blockhash_bytes); - ar2_secure_wipe_memory(blockhash.v, ARGON2_BLOCK_SIZE); - ar2_secure_wipe_memory(blockhash_bytes, ARGON2_BLOCK_SIZE); /* clear blockhash_bytes */ - } - -#ifdef GENKAT - print_tag(context->out, context->outlen); -#endif - - /* Clear memory */ - // clear_memory(instance, 1); - - ar2_free_memory(instance->memory); - } -} - -uint32_t ar2_index_alpha(const 
argon2_instance_t *instance, - const argon2_position_t *position, uint32_t pseudo_rand, - int same_lane) { - /* - * Pass 0: - * This lane : all already finished segments plus already constructed - * blocks in this segment - * Other lanes : all already finished segments - * Pass 1+: - * This lane : (SYNC_POINTS - 1) last segments plus already constructed - * blocks in this segment - * Other lanes : (SYNC_POINTS - 1) last segments - */ - uint32_t reference_area_size; - uint64_t relative_position; - uint32_t start_position, absolute_position; - - if (0 == position->pass) { - /* First pass */ - if (0 == position->slice) { - /* First slice */ - reference_area_size = - position->index - 1; /* all but the previous */ - } else { - if (same_lane) { - /* The same lane => add current segment */ - reference_area_size = - position->slice * 4 + - position->index - 1; - } else { - reference_area_size = - position->slice * 4 + - ((position->index == 0) ? (-1) : 0); - } - } - } else { - /* Second pass */ - if (same_lane) {reference_area_size = 11 + position->index;} - else {reference_area_size = 12 - (position->index == 0);} - } - - /* 1.2.4. Mapping pseudo_rand to 0.. and produce - * relative position */ - relative_position = pseudo_rand; - relative_position = relative_position * relative_position >> 32; - relative_position = reference_area_size - 1 - - (reference_area_size * relative_position >> 32); - - /* 1.2.5 Computing starting position */ - start_position = 0; - - if (0 != position->pass) { - start_position = (position->slice == ARGON2_SYNC_POINTS - 1) - ? 0 : (position->slice + 1) * 4; - } - - /* 1.2.6. 
Computing absolute position */ - absolute_position = (start_position + relative_position) % 16; - return absolute_position; -} - -void ar2_fill_memory_blocks(argon2_instance_t *instance) { - uint32_t r, s; - - for (r = 0; r < 2; ++r) { - for (s = 0; s < ARGON2_SYNC_POINTS; ++s) { - - argon2_position_t position; - position.pass = r; - position.lane = 0; - position.slice = (uint8_t)s; - position.index = 0; - ar2_fill_segment(instance, position); - } - -#ifdef GENKAT - internal_kat(instance, r); /* Print all memory blocks */ -#endif - } -} - -void ar2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance) { - /* Make the first and second block in each lane as G(H0||i||0) or - G(H0||i||1) */ - uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE]; - store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 0); - store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH + 4, 0); - ar2_blake2b_too(blockhash_bytes, blockhash); - ar2_load_block(&instance->memory[0], blockhash_bytes); - - store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 1); - ar2_blake2b_too(blockhash_bytes, blockhash); - ar2_load_block(&instance->memory[1], blockhash_bytes); - ar2_secure_wipe_memory(blockhash_bytes, ARGON2_BLOCK_SIZE); -} - - -static const blake2b_state base_hash = { - .h = { - UINT64_C(7640891576939301192), UINT64_C(13503953896175478587), - UINT64_C(4354685564936845355), UINT64_C(11912009170470909681), - UINT64_C(5840696475078001361), UINT64_C(11170449401992604703), - UINT64_C(2270897969802886507), UINT64_C(6620516959819538809) - }, - .t = {UINT64_C(0),UINT64_C(0)}, - .f = {UINT64_C(0),UINT64_C(0)}, - .buf = { - 1, 0, 0, 0, 32, 0, 0, 0, 16, 0, 0, 0, 2, 0, 0, 0, 16, 0, 0, 0, 1, 0, - 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0}, - .buflen = 28, - .outlen = 64, - .last_node = 0 -}; - -#define PWDLEN 32 -#define SALTLEN 32 -#define SECRETLEN 0 -#define ADLEN 0 -void ar2_initial_hash(uint8_t *blockhash, argon2_context *context, - argon2_type type) { - - uint8_t value[sizeof(uint32_t)]; - - /* Is it generating cache invalidation between cores ? */ - blake2b_state BlakeHash = base_hash; - BlakeHash.buf[20] = (uint8_t) type; - my_blake2b_update(&BlakeHash, (const uint8_t *)context->pwd, - PWDLEN); - - - ar2_secure_wipe_memory(context->pwd, PWDLEN); - context->pwdlen = 0; - - store32(&value, SALTLEN); - my_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); - - my_blake2b_update(&BlakeHash, (const uint8_t *)context->salt, - SALTLEN); - - store32(&value, SECRETLEN); - my_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); - - store32(&value, ADLEN); - my_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); - - ar2_blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH); -} - -int ar2_initialize(argon2_instance_t *instance, argon2_context *context) { - /* 1. Memory allocation */ - - - ar2_allocate_memory(&(instance->memory), 16); - - /* 2. Initial hashing */ - /* H_0 + 8 extra bytes to produce the first blocks */ - /* Hashing all inputs */ - uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH]; - ar2_initial_hash(blockhash, context, instance->type); - /* Zeroing 8 extra bytes */ - ar2_secure_wipe_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, - ARGON2_PREHASH_SEED_LENGTH - - ARGON2_PREHASH_DIGEST_LENGTH); - -#ifdef GENKAT - initial_kat(blockhash, context, instance->type); -#endif - - /* 3. 
Creating first blocks, we always have at least two blocks in a slice - */ - ar2_fill_first_blocks(blockhash, instance); - /* Clearing the hash */ - ar2_secure_wipe_memory(blockhash, ARGON2_PREHASH_SEED_LENGTH); - - return ARGON2_OK; -} - -int ar2_argon2_core(argon2_context *context, argon2_type type) { - argon2_instance_t instance; - instance.memory = NULL; - instance.type = type; - - /* 3. Initialization: Hashing inputs, allocating memory, filling first - * blocks - */ - - int result = ar2_initialize(&instance, context); - if (ARGON2_OK != result) return result; - - /* 4. Filling memory */ - ar2_fill_memory_blocks(&instance); - - /* 5. Finalization */ - ar2_finalize(context, &instance); - - return ARGON2_OK; -} diff --git a/algo/argon2/argon2a/ar2/cores.h b/algo/argon2/argon2a/ar2/cores.h deleted file mode 100644 index e3f183f..0000000 --- a/algo/argon2/argon2a/ar2/cores.h +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Argon2 source code package - * - * Written by Daniel Dinu and Dmitry Khovratovich, 2015 - * - * This work is licensed under a Creative Commons CC0 1.0 License/Waiver. - * - * You should have received a copy of the CC0 Public Domain Dedication along - * with - * this software. If not, see - * . 
- */ - -#ifndef ARGON2_CORES_H -#define ARGON2_CORES_H - -#if defined(_MSC_VER) -#include -#include -#define ALIGN(n) __declspec(align(n)) -#elif defined(__GNUC__) || defined(__clang) -#define ALIGN(x) __attribute__((__aligned__(x))) -#else -#define ALIGN(x) -#endif - -/*************************Argon2 internal - * constants**************************************************/ - -enum argon2_core_constants { - /* Version of the algorithm */ - ARGON2_VERSION_NUMBER = 0x10, - - /* Memory block size in bytes */ - ARGON2_BLOCK_SIZE = 1024, - ARGON2_WORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 8, - ARGON2_QWORDS_IN_BLOCK = 64, - - /* Number of pseudo-random values generated by one call to Blake in Argon2i - to - generate reference block positions */ - ARGON2_ADDRESSES_IN_BLOCK = 128, - - /* Pre-hashing digest length and its extension*/ - ARGON2_PREHASH_DIGEST_LENGTH = 64, - ARGON2_PREHASH_SEED_LENGTH = 72 -}; - -/* Argon2 primitive type */ -typedef enum Argon2_type { Argon2_d = 0, Argon2_i = 1 } argon2_type; - -/*************************Argon2 internal data - * types**************************************************/ - -/* - * Structure for the (1KB) memory block implemented as 128 64-bit words. - * Memory blocks can be copied, XORed. Internal words can be accessed by [] (no - * bounds checking). - */ -typedef struct _block { uint64_t v[ARGON2_WORDS_IN_BLOCK]; } ALIGN(16) block; - -/*****************Functions that work with the block******************/ - -/* Initialize each byte of the block with @in */ -void ar2_init_block_value(block *b, uint8_t in); - -/* Copy block @src to block @dst */ -void ar2_copy_block(block *dst, const block *src); - -/* XOR @src onto @dst bytewise */ -void ar2_xor_block(block *dst, const block *src); - -/* - * Argon2 instance: memory pointer, number of passes, amount of memory, type, - * and derived values. 
- * Used to evaluate the number and location of blocks to construct in each - * thread - */ -typedef struct Argon2_instance_t { - block *memory; /* Memory pointer */ - argon2_type type; - int print_internals; /* whether to print the memory blocks */ -} argon2_instance_t; - -/* - * Argon2 position: where we construct the block right now. Used to distribute - * work between threads. - */ -typedef struct Argon2_position_t { - uint32_t pass; - uint32_t lane; - uint8_t slice; - uint32_t index; -} argon2_position_t; - -/*************************Argon2 core - * functions**************************************************/ - -/* Allocates memory to the given pointer - * @param memory pointer to the pointer to the memory - * @param m_cost number of blocks to allocate in the memory - * @return ARGON2_OK if @memory is a valid pointer and memory is allocated - */ -int ar2_allocate_memory(block **memory, uint32_t m_cost); - -/* Function that securely cleans the memory - * @param mem Pointer to the memory - * @param s Memory size in bytes - */ -void ar2_secure_wipe_memory(void *v, size_t n); - -/* Clears memory - * @param instance pointer to the current instance - * @param clear_memory indicates if we clear the memory with zeros. - */ -void ar2_clear_memory(argon2_instance_t *instance, int clear); - -/* Deallocates memory - * @param memory pointer to the blocks - */ -void ar2_free_memory(block *memory); - -/* - * Computes absolute position of reference block in the lane following a skewed - * distribution and using a pseudo-random value as input - * @param instance Pointer to the current instance - * @param position Pointer to the current position - * @param pseudo_rand 32-bit pseudo-random value used to determine the position - * @param same_lane Indicates if the block will be taken from the current lane. 
- * If so we can reference the current segment - * @pre All pointers must be valid - */ -uint32_t ar2_index_alpha(const argon2_instance_t *instance, - const argon2_position_t *position, uint32_t pseudo_rand, - int same_lane); - -/* - * Function that validates all inputs against predefined restrictions and return - * an error code - * @param context Pointer to current Argon2 context - * @return ARGON2_OK if everything is all right, otherwise one of error codes - * (all defined in - */ -int ar2_validate_inputs(const argon2_context *context); - -/* - * Hashes all the inputs into @a blockhash[PREHASH_DIGEST_LENGTH], clears - * password and secret if needed - * @param context Pointer to the Argon2 internal structure containing memory - * pointer, and parameters for time and space requirements. - * @param blockhash Buffer for pre-hashing digest - * @param type Argon2 type - * @pre @a blockhash must have at least @a PREHASH_DIGEST_LENGTH bytes - * allocated - */ -void ar2_initial_hash(uint8_t *blockhash, argon2_context *context, - argon2_type type); - -/* - * Function creates first 2 blocks per lane - * @param instance Pointer to the current instance - * @param blockhash Pointer to the pre-hashing digest - * @pre blockhash must point to @a PREHASH_SEED_LENGTH allocated values - */ -void ar2_fill_firsts_blocks(uint8_t *blockhash, const argon2_instance_t *instance); - -/* - * Function allocates memory, hashes the inputs with Blake, and creates first - * two blocks. Returns the pointer to the main memory with 2 blocks per lane - * initialized - * @param context Pointer to the Argon2 internal structure containing memory - * pointer, and parameters for time and space requirements. - * @param instance Current Argon2 instance - * @return Zero if successful, -1 if memory failed to allocate. @context->state - * will be modified if successful. 
- */ -int ar2_initialize(argon2_instance_t *instance, argon2_context *context); - -/* - * XORing the last block of each lane, hashing it, making the tag. Deallocates - * the memory. - * @param context Pointer to current Argon2 context (use only the out parameters - * from it) - * @param instance Pointer to current instance of Argon2 - * @pre instance->state must point to necessary amount of memory - * @pre context->out must point to outlen bytes of memory - * @pre if context->free_cbk is not NULL, it should point to a function that - * deallocates memory - */ -void ar2_finalize(const argon2_context *context, argon2_instance_t *instance); - -/* - * Function that fills the segment using previous segments also from other - * threads - * @param instance Pointer to the current instance - * @param position Current position - * @pre all block pointers must be valid - */ -void ar2_fill_segment(const argon2_instance_t *instance, - argon2_position_t position); - -/* - * Function that fills the entire memory t_cost times based on the first two - * blocks in each lane - * @param instance Pointer to the current instance - */ -void ar2_fill_memory_blocks(argon2_instance_t *instance); - -/* - * Function that performs memory-hard hashing with certain degree of parallelism - * @param context Pointer to the Argon2 internal structure - * @return Error code if smth is wrong, ARGON2_OK otherwise - */ -int ar2_argon2_core(argon2_context *context, argon2_type type); - -#endif diff --git a/algo/argon2/argon2a/ar2/genkat.c.hide b/algo/argon2/argon2a/ar2/genkat.c.hide deleted file mode 100644 index 07042b2..0000000 --- a/algo/argon2/argon2a/ar2/genkat.c.hide +++ /dev/null @@ -1,186 +0,0 @@ -#include -#include -#include -#include - -#include "argon2.h" -#include "cores.h" - -void initial_kat(const uint8_t *blockhash, const argon2_context *context, - argon2_type type) -{ - unsigned i; - - if (blockhash != NULL && context != NULL) { - printf("======================================="); - - 
switch (type) { - case Argon2_d: - printf("Argon2d\n"); - break; - - case Argon2_i: - printf("Argon2i\n"); - break; - - default: - break; - } - - printf("Memory: %u KiB, Iterations: %u, Parallelism: %u lanes, Tag " - "length: %u bytes\n", - context->m_cost, context->t_cost, context->lanes, - context->outlen); - - printf("Password[%u]: ", context->pwdlen); - - if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) { - printf("CLEARED\n"); - } else { - for (i = 0; i < context->pwdlen; ++i) { - printf("%2.2x ", ((unsigned char *)context->pwd)[i]); - } - - printf("\n"); - } - - printf("Salt[%u]: ", context->saltlen); - - for (i = 0; i < context->saltlen; ++i) { - printf("%2.2x ", ((unsigned char *)context->salt)[i]); - } - - printf("\n"); - - printf("Secret[%u]: ", context->secretlen); - - if (context->flags & ARGON2_FLAG_CLEAR_SECRET) { - printf("CLEARED\n"); - } else { - for (i = 0; i < context->secretlen; ++i) { - printf("%2.2x ", ((unsigned char *)context->secret)[i]); - } - - printf("\n"); - } - - printf("Associated data[%u]: ", context->adlen); - - for (i = 0; i < context->adlen; ++i) { - printf("%2.2x ", ((unsigned char *)context->ad)[i]); - } - - printf("\n"); - - printf("Pre-hashing digest: "); - - for (i = 0; i < ARGON2_PREHASH_DIGEST_LENGTH; ++i) { - printf("%2.2x ", ((unsigned char *)blockhash)[i]); - } - - printf("\n"); - } -} - -void print_tag(const void *out, uint32_t outlen) -{ - unsigned i; - if (out != NULL) { - printf("Tag: "); - - for (i = 0; i < outlen; ++i) { - printf("%2.2x ", ((uint8_t *)out)[i]); - } - - printf("\n"); - } -} - -void internal_kat(const argon2_instance_t *instance, uint32_t pass) -{ - if (instance != NULL) { - uint32_t i, j; - printf("\n After pass %u:\n", pass); - - for (i = 0; i < instance->memory_blocks; ++i) { - uint32_t how_many_words = - (instance->memory_blocks > ARGON2_WORDS_IN_BLOCK) - ? 
1 - : ARGON2_WORDS_IN_BLOCK; - - for (j = 0; j < how_many_words; ++j) - printf("Block %.4u [%3u]: %016" PRIx64 "\n", i, j, - instance->memory[i].v[j]); - } - } -} - -static void fatal(const char *error) { - fprintf(stderr, "Error: %s\n", error); - exit(1); -} - -static void generate_testvectors(const char *type) -{ -#define TEST_OUTLEN 32 -#define TEST_PWDLEN 32 -#define TEST_SALTLEN 16 -#define TEST_SECRETLEN 8 -#define TEST_ADLEN 12 - argon2_context context; - - unsigned char out[TEST_OUTLEN]; - unsigned char pwd[TEST_PWDLEN]; - unsigned char salt[TEST_SALTLEN]; - unsigned char secret[TEST_SECRETLEN]; - unsigned char ad[TEST_ADLEN]; - const allocate_fptr myown_allocator = NULL; - const deallocate_fptr myown_deallocator = NULL; - - unsigned t_cost = 3; - unsigned m_cost = 16; - unsigned lanes = 4; - - memset(pwd, 1, TEST_OUTLEN); - memset(salt, 2, TEST_SALTLEN); - memset(secret, 3, TEST_SECRETLEN); - memset(ad, 4, TEST_ADLEN); - - context.out = out; - context.outlen = TEST_OUTLEN; - context.pwd = pwd; - context.pwdlen = TEST_PWDLEN; - context.salt = salt; - context.saltlen = TEST_SALTLEN; - context.secret = secret; - context.secretlen = TEST_SECRETLEN; - context.ad = ad; - context.adlen = TEST_ADLEN; - context.t_cost = t_cost; - context.m_cost = m_cost; - context.lanes = lanes; - context.threads = lanes; - context.allocate_cbk = myown_allocator; - context.free_cbk = myown_deallocator; - context.flags = 0; - -#undef TEST_OUTLEN -#undef TEST_PWDLEN -#undef TEST_SALTLEN -#undef TEST_SECRETLEN -#undef TEST_ADLEN - - if (!strcmp(type, "d")) { - argon2d(&context); - } else if (!strcmp(type, "i")) { - argon2i(&context); - } else - fatal("wrong Argon2 type"); -} - -int main(int argc, char *argv[]) -{ - const char *type = (argc > 1) ? 
argv[1] : "i"; - generate_testvectors(type); - return ARGON2_OK; -} diff --git a/algo/argon2/argon2a/ar2/genkat.h.hide b/algo/argon2/argon2a/ar2/genkat.h.hide deleted file mode 100644 index 9c776bf..0000000 --- a/algo/argon2/argon2a/ar2/genkat.h.hide +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Argon2 source code package - * - * Written by Daniel Dinu and Dmitry Khovratovich, 2015 - * - * This work is licensed under a Creative Commons CC0 1.0 License/Waiver. - * - * You should have received a copy of the CC0 Public Domain Dedication along - * with - * this software. If not, see - * . - */ - -#ifndef ARGON2_KAT_H -#define ARGON2_KAT_H - -/* - * Initial KAT function that prints the inputs to the file - * @param blockhash Array that contains pre-hashing digest - * @param context Holds inputs - * @param type Argon2 type - * @pre blockhash must point to INPUT_INITIAL_HASH_LENGTH bytes - * @pre context member pointers must point to allocated memory of size according - * to the length values - */ -void initial_kat(const uint8_t *blockhash, const argon2_context *context, - argon2_type type); - -/* - * Function that prints the output tag - * @param out output array pointer - * @param outlen digest length - * @pre out must point to @a outlen bytes - **/ -void print_tag(const void *out, uint32_t outlen); - -/* - * Function that prints the internal state at given moment - * @param instance pointer to the current instance - * @param pass current pass number - * @pre instance must have necessary memory allocated - **/ -void internal_kat(const argon2_instance_t *instance, uint32_t pass); - -#endif diff --git a/algo/argon2/argon2a/ar2/opt.c b/algo/argon2/argon2a/ar2/opt.c deleted file mode 100644 index feda867..0000000 --- a/algo/argon2/argon2a/ar2/opt.c +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Argon2 source code package - * - * Written by Daniel Dinu and Dmitry Khovratovich, 2015 - * - * This work is licensed under a Creative Commons CC0 1.0 License/Waiver. 
- * - * You should have received a copy of the CC0 Public Domain Dedication along - * with - * this software. If not, see - * . - */ - -#include -#include -#include -#include -#include - -#include - -#include "argon2.h" -#include "cores.h" -#include "opt.h" - -#include "blake2/blake2.h" -#include "blake2/blamka-round-opt.h" - -void ar2_fill_block(__m128i *state, __m128i const *ref_block, __m128i *next_block) -{ - __m128i ALIGN(16) block_XY[ARGON2_QWORDS_IN_BLOCK]; - uint32_t i; - for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; i++) { - block_XY[i] = state[i] = _mm_xor_si128( - state[i], _mm_load_si128(&ref_block[i])); - } - - BLAKE2_ROUND(state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]); - BLAKE2_ROUND(state[8], state[9], state[10], state[11], state[12], state[13], state[14], state[15]); - BLAKE2_ROUND(state[16], state[17], state[18], state[19], state[20], state[21], state[22], state[23]); - BLAKE2_ROUND(state[24], state[25], state[26], state[27], state[28], state[29], state[30], state[31]); - BLAKE2_ROUND(state[32], state[33], state[34], state[35], state[36], state[37], state[38], state[39]); - BLAKE2_ROUND(state[40], state[41], state[42], state[43], state[44], state[45], state[46], state[47]); - BLAKE2_ROUND(state[48], state[49], state[50], state[51], state[52], state[53], state[54], state[55]); - BLAKE2_ROUND(state[56], state[57], state[58], state[59], state[60], state[61], state[62], state[63]); - /*for (i = 0; i < 8; ++i) { - BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], - state[8 * i + 3], state[8 * i + 4], state[8 * i + 5], - state[8 * i + 6], state[8 * i + 7]); - }*/ - - BLAKE2_ROUND(state[0], state[8], state[16], state[24], state[32], state[40], state[48], state[56]); - BLAKE2_ROUND(state[1], state[9], state[17], state[25], state[33], state[41], state[49], state[57]); - BLAKE2_ROUND(state[2], state[10], state[18], state[26], state[34], state[42], state[50], state[58]); - BLAKE2_ROUND(state[3], state[11], 
state[19], state[27], state[35], state[43], state[51], state[59]); - BLAKE2_ROUND(state[4], state[12], state[20], state[28], state[36], state[44], state[52], state[60]); - BLAKE2_ROUND(state[5], state[13], state[21], state[29], state[37], state[45], state[53], state[61]); - BLAKE2_ROUND(state[6], state[14], state[22], state[30], state[38], state[46], state[54], state[62]); - BLAKE2_ROUND(state[7], state[15], state[23], state[31], state[39], state[47], state[55], state[63]); - /*for (i = 0; i < 8; ++i) { - BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], - state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i], - state[8 * 6 + i], state[8 * 7 + i]); - }*/ - - for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; i++) { - state[i] = _mm_xor_si128(state[i], block_XY[i]); - _mm_storeu_si128(&next_block[i], state[i]); - } -} - -static const uint64_t bad_rands[32] = { - UINT64_C(17023632018251376180), UINT64_C(4911461131397773491), - UINT64_C(15927076453364631751), UINT64_C(7860239898779391109), - - UINT64_C(11820267568857244377), UINT64_C(12188179869468676617), - UINT64_C(3732913385414474778), UINT64_C(7651458777762572084), - - UINT64_C(3062274162574341415), UINT64_C(17922653540258786897), - UINT64_C(17393848266100524980), UINT64_C(8539695715554563839), - - UINT64_C(13824538050656654359), UINT64_C(12078939433126460936), - UINT64_C(15331979418564540430), UINT64_C(12058346794217174273), - - UINT64_C(13593922096015221049), UINT64_C(18356682276374416500), - UINT64_C(4968040514092703824), UINT64_C(11202790346130235567), - - UINT64_C(2276229735041314644), UINT64_C(220837743321691382), - UINT64_C(4861211596230784273), UINT64_C(6330592584132590331), - - UINT64_C(3515580430960296763), UINT64_C(9869356316971855173), - UINT64_C(485533243489193056), UINT64_C(14596447761048148032), - - UINT64_C(16531790085730132900), UINT64_C(17328824500878824371), - UINT64_C(8548260058287621283), UINT64_C(8641748798041936364) -}; - -void ar2_generate_addresses(const argon2_instance_t *instance, - 
const argon2_position_t *position, - uint64_t *pseudo_rands) -{ - uint8_t offset = position->pass * 16 + position->slice * 4; - pseudo_rands[0] = bad_rands[offset++]; - pseudo_rands[1] = bad_rands[offset++]; - pseudo_rands[2] = bad_rands[offset++]; - pseudo_rands[3] = bad_rands[offset++]; - - /*if ((position->pass == 1 && position->slice == 3)) - print64("pseudo_rands", pseudo_rands, 4);*/ -} - -#define SEGMENT_LENGTH 4 -#define LANE_LENGTH 16 -#define POS_LANE 0 - -void ar2_fill_segment(const argon2_instance_t *instance, - argon2_position_t position) -{ - block *ref_block = NULL, *curr_block = NULL; - uint64_t pseudo_rand, ref_index; - uint32_t prev_offset, curr_offset; - uint8_t i; - __m128i state[64]; - int data_independent_addressing = (instance->type == Argon2_i); - - /* Pseudo-random values that determine the reference block position */ - uint64_t *pseudo_rands = NULL; - - pseudo_rands = (uint64_t *)malloc(/*sizeof(uint64_t) * 4*/32); - - if (data_independent_addressing) { - ar2_generate_addresses(instance, &position, pseudo_rands); - } - - i = 0; - - if ((0 == position.pass) && (0 == position.slice)) { - i = 2; /* we have already generated the first two blocks */ - } - - /*printf("Position.lane = %d\nPosition.slice = %d\nStarting index : %d\n", position.lane, position.slice, starting_index);*/ - /* Offset of the current block */ - curr_offset = position.slice * 4 + i; - - if (0 == curr_offset % 16) { - /* Last block in this lane */ - prev_offset = curr_offset + /*instance->lane_length - 1*/15; - } else { - /* Previous block */ - prev_offset = curr_offset - 1; - } - - memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE); - - for (; i < SEGMENT_LENGTH; - ++i, ++curr_offset, ++prev_offset) { - /*1.1 Rotating prev_offset if needed */ - if (curr_offset % LANE_LENGTH == 1) { - prev_offset = curr_offset - 1; - } - - /* 1.2 Computing the index of the reference block */ - /* 1.2.1 Taking pseudo-random value from the previous block */ - if 
(data_independent_addressing) { - pseudo_rand = pseudo_rands[i]; - } else { - pseudo_rand = instance->memory[prev_offset].v[0]; - } - - /* 1.2.2 Computing the lane of the reference block */ - - /* 1.2.3 Computing the number of possible reference block within the - * lane. - */ - position.index = i; - ref_index = ar2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,1); - - /* 2 Creating a new block */ - ref_block = instance->memory + ref_index; - curr_block = instance->memory + curr_offset; - ar2_fill_block(state, (__m128i const *)ref_block->v, (__m128i *)curr_block->v); - } - - free(pseudo_rands); -} diff --git a/algo/argon2/argon2a/ar2/opt.h b/algo/argon2/argon2a/ar2/opt.h deleted file mode 100644 index 8e3b5aa..0000000 --- a/algo/argon2/argon2a/ar2/opt.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Argon2 source code package - * - * Written by Daniel Dinu and Dmitry Khovratovich, 2015 - * - * This work is licensed under a Creative Commons CC0 1.0 License/Waiver. - * - * You should have received a copy of the CC0 Public Domain Dedication along - * with - * this software. If not, see - * . - */ - -#ifndef ARGON2_OPT_H -#define ARGON2_OPT_H - -/* - * Function fills a new memory block. Differs from the - * @param state Pointer to the just produced block. Content will be updated(!) 
- * @param ref_block Pointer to the reference block - * @param next_block Pointer to the block to be constructed - * @pre all block pointers must be valid - */ -void ar2_fill_block(__m128i *state, __m128i const *ref_block, __m128i *next_block); - -/* - * Generate pseudo-random values to reference blocks in the segment and puts - * them into the array - * @param instance Pointer to the current instance - * @param position Pointer to the current position - * @param pseudo_rands Pointer to the array of 64-bit values - * @pre pseudo_rands must point to @a instance->segment_length allocated values - */ -void ar2_generate_addresses(const argon2_instance_t *instance, - const argon2_position_t *position, - uint64_t *pseudo_rands); - -/* - * Function that fills the segment using previous segments also from other - * threads. - * Identical to the reference code except that it calls optimized FillBlock() - * @param instance Pointer to the current instance - * @param position Current position - * @pre all block pointers must be valid - */ -void ar2_fill_segment(const argon2_instance_t *instance, - argon2_position_t position); - -#endif /* ARGON2_OPT_H */ diff --git a/algo/argon2/argon2a/ar2/ref.c.hide b/algo/argon2/argon2a/ar2/ref.c.hide deleted file mode 100644 index 98ae07c..0000000 --- a/algo/argon2/argon2a/ar2/ref.c.hide +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Argon2 source code package - * - * Written by Daniel Dinu and Dmitry Khovratovich, 2015 - * - * This work is licensed under a Creative Commons CC0 1.0 License/Waiver. - * - * You should have received a copy of the CC0 Public Domain Dedication along - * with - * this software. If not, see - * . 
- */ - -#include -#include -#include - -#include "argon2.h" -#include "cores.h" -#include "ref.h" - -#include "blake2/blamka-round-ref.h" -#include "blake2/blake2-impl.h" -#include "blake2/blake2.h" - -void fill_block(const block *prev_block, const block *ref_block, - block *next_block) { - block blockR, block_tmp; - unsigned i; - - copy_block(&blockR, ref_block); - xor_block(&blockR, prev_block); - copy_block(&block_tmp, &blockR); - - /* Apply Blake2 on columns of 64-bit words: (0,1,...,15) , then - (16,17,..31)... finally (112,113,...127) */ - for (i = 0; i < 8; ++i) { - BLAKE2_ROUND_NOMSG( - blockR.v[16 * i], blockR.v[16 * i + 1], blockR.v[16 * i + 2], - blockR.v[16 * i + 3], blockR.v[16 * i + 4], blockR.v[16 * i + 5], - blockR.v[16 * i + 6], blockR.v[16 * i + 7], blockR.v[16 * i + 8], - blockR.v[16 * i + 9], blockR.v[16 * i + 10], blockR.v[16 * i + 11], - blockR.v[16 * i + 12], blockR.v[16 * i + 13], blockR.v[16 * i + 14], - blockR.v[16 * i + 15]); - } - - /* Apply Blake2 on rows of 64-bit words: (0,1,16,17,...112,113), then - (2,3,18,19,...,114,115).. 
finally (14,15,30,31,...,126,127) */ - for (i = 0; i < 8; i++) { - BLAKE2_ROUND_NOMSG( - blockR.v[2 * i], blockR.v[2 * i + 1], blockR.v[2 * i + 16], - blockR.v[2 * i + 17], blockR.v[2 * i + 32], blockR.v[2 * i + 33], - blockR.v[2 * i + 48], blockR.v[2 * i + 49], blockR.v[2 * i + 64], - blockR.v[2 * i + 65], blockR.v[2 * i + 80], blockR.v[2 * i + 81], - blockR.v[2 * i + 96], blockR.v[2 * i + 97], blockR.v[2 * i + 112], - blockR.v[2 * i + 113]); - } - - copy_block(next_block, &block_tmp); - xor_block(next_block, &blockR); -} - -void generate_addresses(const argon2_instance_t *instance, - const argon2_position_t *position, - uint64_t *pseudo_rands) { - block zero_block, input_block, address_block; - uint32_t i; - - init_block_value(&zero_block, 0); - init_block_value(&input_block, 0); - init_block_value(&address_block, 0); - - if (instance != NULL && position != NULL) { - input_block.v[0] = position->pass; - input_block.v[1] = position->lane; - input_block.v[2] = position->slice; - input_block.v[3] = 16; - input_block.v[4] = 2; - input_block.v[5] = instance->type; - - for (i = 0; i < 4; ++i) { - if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) { - input_block.v[6]++; - fill_block(&zero_block, &input_block, &address_block); - fill_block(&zero_block, &address_block, &address_block); - } - - pseudo_rands[i] = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK]; - } - } -} - -void fill_segment(const argon2_instance_t *instance, - argon2_position_t position) { - block *ref_block = NULL, *curr_block = NULL; - uint64_t pseudo_rand, ref_index, ref_lane; - uint32_t prev_offset, curr_offset; - uint32_t starting_index; - uint32_t i; - int data_independent_addressing = (instance->type == Argon2_i); - /* Pseudo-random values that determine the reference block position */ - uint64_t *pseudo_rands = NULL; - - if (instance == NULL) { - return; - } - - pseudo_rands = - (uint64_t *)malloc(sizeof(uint64_t) * 4); - - if (pseudo_rands == NULL) { - return; - } - - if (data_independent_addressing) { - 
generate_addresses(instance, &position, pseudo_rands); - } - - starting_index = 0; - - if ((0 == position.pass) && (0 == position.slice)) { - starting_index = 2; /* we have already generated the first two blocks */ - } - - /* Offset of the current block */ - curr_offset = position.lane * 16 + - position.slice * 4 + starting_index; - - if (0 == curr_offset % 16) { - /* Last block in this lane */ - prev_offset = curr_offset + 16 - 1; - } else { - /* Previous block */ - prev_offset = curr_offset - 1; - } - - for (i = starting_index; i < 4; ++i, ++curr_offset, ++prev_offset) { - /*1.1 Rotating prev_offset if needed */ - if (curr_offset % 16 == 1) { - prev_offset = curr_offset - 1; - } - - /* 1.2 Computing the index of the reference block */ - /* 1.2.1 Taking pseudo-random value from the previous block */ - if (data_independent_addressing) { - pseudo_rand = pseudo_rands[i]; - } else { - pseudo_rand = instance->memory[prev_offset].v[0]; - } - - /* 1.2.2 Computing the lane of the reference block */ - ref_lane = ((pseudo_rand >> 32)) % 1; - - if ((position.pass == 0) && (position.slice == 0)) { - /* Can not reference other lanes yet */ - ref_lane = position.lane; - } - - /* 1.2.3 Computing the number of possible reference block within the - * lane. 
- */ - position.index = i; - ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF, - ref_lane == position.lane); - - /* 2 Creating a new block */ - ref_block = - instance->memory + 16 * ref_lane + ref_index; - curr_block = instance->memory + curr_offset; - fill_block(instance->memory + prev_offset, ref_block, curr_block); - } - - free(pseudo_rands); -} diff --git a/algo/argon2/argon2a/ar2/ref.h.hide b/algo/argon2/argon2a/ar2/ref.h.hide deleted file mode 100644 index 7ee22ee..0000000 --- a/algo/argon2/argon2a/ar2/ref.h.hide +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Argon2 source code package - * - * Written by Daniel Dinu and Dmitry Khovratovich, 2015 - * - * This work is licensed under a Creative Commons CC0 1.0 License/Waiver. - * - * You should have received a copy of the CC0 Public Domain Dedication along - * with - * this software. If not, see - * . - */ - -#ifndef ARGON2_REF_H -#define ARGON2_REF_H - -/* - * Function fills a new memory block - * @param prev_block Pointer to the previous block - * @param ref_block Pointer to the reference block - * @param next_block Pointer to the block to be constructed - * @pre all block pointers must be valid - */ -void fill_block(const block *prev_block, const block *ref_block, - block *next_block); - -/* - * Generate pseudo-random values to reference blocks in the segment and puts - * them into the array - * @param instance Pointer to the current instance - * @param position Pointer to the current position - * @param pseudo_rands Pointer to the array of 64-bit values - * @pre pseudo_rands must point to @a instance->segment_length allocated values - */ -void generate_addresses(const argon2_instance_t *instance, - const argon2_position_t *position, - uint64_t *pseudo_rands); - -/* - * Function that fills the segment using previous segments also from other - * threads - * @param instance Pointer to the current instance - * @param position Current position - * @pre all block pointers must be valid - */ -void 
fill_segment(const argon2_instance_t *instance, - argon2_position_t position); - -#endif /* ARGON2_REF_H */ diff --git a/algo/argon2/argon2a/ar2/run.c.hide b/algo/argon2/argon2a/ar2/run.c.hide deleted file mode 100644 index 2b1b30a..0000000 --- a/algo/argon2/argon2a/ar2/run.c.hide +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Argon2 source code package - * - * Written by Daniel Dinu and Dmitry Khovratovich, 2015 - * - * This work is licensed under a Creative Commons CC0 1.0 License/Waiver. - * - * You should have received a copy of the CC0 Public Domain Dedication along - * with - * this software. If not, see - * . - */ - -#include -#include -#include -#include -#include -#include - -#include "argon2.h" -#include "cores.h" - -#define T_COST_DEF 3 -#define LOG_M_COST_DEF 12 /* 2^12 = 4 MiB */ -#define LANES_DEF 1 -#define THREADS_DEF 1 -#define OUT_LEN 32 -#define SALT_LEN 16 - -#define UNUSED_PARAMETER(x) (void)(x) - -static void usage(const char *cmd) { - printf("Usage: %s pwd salt [-y version] [-t iterations] [-m memory] [-p " - "parallelism]\n", - cmd); - - printf("Parameters:\n"); - printf("\tpwd\t\tThe password to hash\n"); - printf("\tsalt\t\tThe salt to use, at most 16 characters\n"); - printf("\t-d\t\tUse Argon2d instead of Argon2i (which is the default)\n"); - printf("\t-t N\t\tSets the number of iterations to N (default = %d)\n", - T_COST_DEF); - printf("\t-m N\t\tSets the memory usage of 2^N KiB (default %d)\n", - LOG_M_COST_DEF); - printf("\t-p N\t\tSets parallelism to N threads (default %d)\n", - THREADS_DEF); -} - -static void fatal(const char *error) { - fprintf(stderr, "Error: %s\n", error); - exit(1); -} - -/* -Runs Argon2 with certain inputs and parameters, inputs not cleared. 
Prints the -Base64-encoded hash string -@out output array with at least 32 bytes allocated -@pwd NULL-terminated string, presumably from argv[] -@salt salt array with at least SALTLEN_DEF bytes allocated -@t_cost number of iterations -@m_cost amount of requested memory in KB -@lanes amount of requested parallelism -@threads actual parallelism -@type String, only "d" and "i" are accepted -*/ -static void run(uint8_t *out, char *pwd, uint8_t *salt, uint32_t t_cost, - uint32_t m_cost, uint32_t lanes, uint32_t threads, - const char *type) { - clock_t start_time, stop_time; - unsigned pwd_length; - argon2_context context; - int i; - - start_time = clock(); - - if (!pwd) { - fatal("password missing"); - } - - if (!salt) { - secure_wipe_memory(pwd, strlen(pwd)); - fatal("salt missing"); - } - - pwd_length = strlen(pwd); - - UNUSED_PARAMETER(threads); - - context.out = out; - context.outlen = OUT_LEN; - context.pwd = (uint8_t *)pwd; - context.pwdlen = pwd_length; - context.salt = salt; - context.saltlen = SALT_LEN; - context.secret = NULL; - context.secretlen = 0; - context.ad = NULL; - context.adlen = 0; - context.t_cost = t_cost; - context.m_cost = m_cost; - context.lanes = lanes; - context.threads = lanes; - context.allocate_cbk = NULL; - context.free_cbk = NULL; - context.flags = ARGON2_FLAG_CLEAR_PASSWORD; - - if (!strcmp(type, "d")) { - int result = argon2d(&context); - if (result != ARGON2_OK) - fatal(error_message(result)); - } else if (!strcmp(type, "i")) { - int result = argon2i(&context); - if (result != ARGON2_OK) - fatal(error_message(result)); - } else { - secure_wipe_memory(pwd, strlen(pwd)); - fatal("wrong Argon2 type"); - } - - stop_time = clock(); - - /* add back when proper decoding */ - /* - char encoded[300]; - encode_string(encoded, sizeof encoded, &context); - printf("%s\n", encoded); - */ - printf("Hash:\t\t"); - for (i = 0; i < context.outlen; ++i) { - printf("%02x", context.out[i]); - } - printf("\n"); - - printf("%2.3f seconds\n", - 
((double)stop_time - start_time) / (CLOCKS_PER_SEC)); -} - -int main(int argc, char *argv[]) { - unsigned char out[OUT_LEN]; - uint32_t m_cost = 1 << LOG_M_COST_DEF; - uint32_t t_cost = T_COST_DEF; - uint32_t lanes = LANES_DEF; - uint32_t threads = THREADS_DEF; - char *pwd = NULL; - uint8_t salt[SALT_LEN]; - const char *type = "i"; - int i; - - if (argc < 3) { - usage(argv[0]); - return ARGON2_MISSING_ARGS; - } - - /* get password and salt from command line */ - pwd = argv[1]; - if (strlen(argv[2]) > SALT_LEN) { - fatal("salt too long"); - } - memset(salt, 0x00, SALT_LEN); /* pad with null bytes */ - memcpy(salt, argv[2], strlen(argv[2])); - - /* parse options */ - for (i = 3; i < argc; i++) { - const char *a = argv[i]; - unsigned long input = 0; - if (!strcmp(a, "-m")) { - if (i < argc - 1) { - i++; - input = strtoul(argv[i], NULL, 10); - if (input == 0 || input == ULONG_MAX || - input > ARGON2_MAX_MEMORY_BITS) { - fatal("bad numeric input for -m"); - } - m_cost = ARGON2_MIN(UINT64_C(1) << input, UINT32_C(0xFFFFFFFF)); - if (m_cost > ARGON2_MAX_MEMORY) { - fatal("m_cost overflow"); - } - continue; - } else { - fatal("missing -m argument"); - } - } else if (!strcmp(a, "-t")) { - if (i < argc - 1) { - i++; - input = strtoul(argv[i], NULL, 10); - if (input == 0 || input == ULONG_MAX || - input > ARGON2_MAX_TIME) { - fatal("bad numeric input for -t"); - } - t_cost = input; - continue; - } else { - fatal("missing -t argument"); - } - } else if (!strcmp(a, "-p")) { - if (i < argc - 1) { - i++; - input = strtoul(argv[i], NULL, 10); - if (input == 0 || input == ULONG_MAX || - input > ARGON2_MAX_THREADS || input > ARGON2_MAX_LANES) { - fatal("bad numeric input for -p"); - } - threads = input; - lanes = threads; - continue; - } else { - fatal("missing -p argument"); - } - } else if (!strcmp(a, "-d")) { - type = "d"; - } else { - fatal("unknown argument"); - } - } - printf("Type:\t\tArgon2%c\n", type[0]); - printf("Iterations:\t%" PRIu32 " \n", t_cost); - 
printf("Memory:\t\t%" PRIu32 " KiB\n", m_cost); - printf("Parallelism:\t%" PRIu32 " \n", lanes); - run(out, pwd, salt, t_cost, m_cost, lanes, threads, type); - - return ARGON2_OK; -} diff --git a/algo/argon2/argon2a/ar2/sj/scrypt-jane-hash.h b/algo/argon2/argon2a/ar2/sj/scrypt-jane-hash.h deleted file mode 100644 index 3a48bf5..0000000 --- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-hash.h +++ /dev/null @@ -1,38 +0,0 @@ -#if defined(SCRYPT_SKEIN512) -#include "scrypt-jane-hash_skein512.h" -#else - #define SCRYPT_HASH "ERROR" - #define SCRYPT_HASH_BLOCK_SIZE 64 - #define SCRYPT_HASH_DIGEST_SIZE 64 - typedef struct scrypt_hash_state_t { size_t dummy; } scrypt_hash_state; - typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; - static void scrypt_hash_init(scrypt_hash_state *S) {} - static void scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {} - static void scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {} - static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {0}; - #error must define a hash function! 
-#endif - -#include "scrypt-jane-pbkdf2.h" - -#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */ - -static int -scrypt_test_hash(void) { - scrypt_hash_state st; - scrypt_hash_digest hash, final; - uint8_t msg[SCRYPT_TEST_HASH_LEN]; - size_t i; - - for (i = 0; i < SCRYPT_TEST_HASH_LEN; i++) - msg[i] = (uint8_t)i; - - scrypt_hash_init(&st); - for (i = 0; i < SCRYPT_TEST_HASH_LEN + 1; i++) { - scrypt_hash(hash, msg, i); - scrypt_hash_update(&st, hash, sizeof(hash)); - } - scrypt_hash_finish(&st, final); - return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE); -} - diff --git a/algo/argon2/argon2a/ar2/sj/scrypt-jane-hash_skein512.h b/algo/argon2/argon2a/ar2/sj/scrypt-jane-hash_skein512.h deleted file mode 100644 index 838df2c..0000000 --- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-hash_skein512.h +++ /dev/null @@ -1,188 +0,0 @@ -#define SCRYPT_HASH "Skein-512" -#define SCRYPT_HASH_BLOCK_SIZE 64 -#define SCRYPT_HASH_DIGEST_SIZE 64 - -typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; - -typedef struct scrypt_hash_state_t { - uint64_t X[8], T[2]; - uint32_t leftover; - uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; -} scrypt_hash_state; - -#include - -static void -skein512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks, size_t add) { - uint64_t X[8], key[8], Xt[9+18], T[3+1]; - size_t r; - - while (blocks--) { - T[0] = S->T[0] + add; - T[1] = S->T[1]; - T[2] = T[0] ^ T[1]; - key[0] = U8TO64_LE(in + 0); Xt[0] = S->X[0]; X[0] = key[0] + Xt[0]; - key[1] = U8TO64_LE(in + 8); Xt[1] = S->X[1]; X[1] = key[1] + Xt[1]; - key[2] = U8TO64_LE(in + 16); Xt[2] = S->X[2]; X[2] = key[2] + Xt[2]; - key[3] = U8TO64_LE(in + 24); Xt[3] = S->X[3]; X[3] = key[3] + Xt[3]; - key[4] = U8TO64_LE(in + 32); Xt[4] = S->X[4]; X[4] = key[4] + Xt[4]; - key[5] = U8TO64_LE(in + 40); Xt[5] = S->X[5]; X[5] = key[5] + Xt[5] + T[0]; - key[6] = U8TO64_LE(in + 48); Xt[6] = S->X[6]; X[6] = key[6] + Xt[6] + T[1]; - key[7] = U8TO64_LE(in + 56); Xt[7] = 
S->X[7]; X[7] = key[7] + Xt[7]; - Xt[8] = 0x1BD11BDAA9FC1A22ull ^ Xt[0] ^ Xt[1] ^ Xt[2] ^ Xt[3] ^ Xt[4] ^ Xt[5] ^ Xt[6] ^ Xt[7]; - in += SCRYPT_HASH_BLOCK_SIZE; - - for (r = 0; r < 18; r++) - Xt[r + 9] = Xt[r + 0]; - - for (r = 0; r < 18; r += 2) { - X[0] += X[1]; X[1] = ROTL64(X[1], 46) ^ X[0]; - X[2] += X[3]; X[3] = ROTL64(X[3], 36) ^ X[2]; - X[4] += X[5]; X[5] = ROTL64(X[5], 19) ^ X[4]; - X[6] += X[7]; X[7] = ROTL64(X[7], 37) ^ X[6]; - X[2] += X[1]; X[1] = ROTL64(X[1], 33) ^ X[2]; - X[0] += X[3]; X[3] = ROTL64(X[3], 42) ^ X[0]; - X[6] += X[5]; X[5] = ROTL64(X[5], 14) ^ X[6]; - X[4] += X[7]; X[7] = ROTL64(X[7], 27) ^ X[4]; - X[4] += X[1]; X[1] = ROTL64(X[1], 17) ^ X[4]; - X[6] += X[3]; X[3] = ROTL64(X[3], 49) ^ X[6]; - X[0] += X[5]; X[5] = ROTL64(X[5], 36) ^ X[0]; - X[2] += X[7]; X[7] = ROTL64(X[7], 39) ^ X[2]; - X[6] += X[1]; X[1] = ROTL64(X[1], 44) ^ X[6]; - X[4] += X[3]; X[3] = ROTL64(X[3], 56) ^ X[4]; - X[2] += X[5]; X[5] = ROTL64(X[5], 54) ^ X[2]; - X[0] += X[7]; X[7] = ROTL64(X[7], 9) ^ X[0]; - - X[0] += Xt[r + 1]; - X[1] += Xt[r + 2]; - X[2] += Xt[r + 3]; - X[3] += Xt[r + 4]; - X[4] += Xt[r + 5]; - X[5] += Xt[r + 6] + T[1]; - X[6] += Xt[r + 7] + T[2]; - X[7] += Xt[r + 8] + r + 1; - - T[3] = T[0]; - T[0] = T[1]; - T[1] = T[2]; - T[2] = T[3]; - - X[0] += X[1]; X[1] = ROTL64(X[1], 39) ^ X[0]; - X[2] += X[3]; X[3] = ROTL64(X[3], 30) ^ X[2]; - X[4] += X[5]; X[5] = ROTL64(X[5], 34) ^ X[4]; - X[6] += X[7]; X[7] = ROTL64(X[7], 24) ^ X[6]; - X[2] += X[1]; X[1] = ROTL64(X[1], 13) ^ X[2]; - X[0] += X[3]; X[3] = ROTL64(X[3], 17) ^ X[0]; - X[6] += X[5]; X[5] = ROTL64(X[5], 10) ^ X[6]; - X[4] += X[7]; X[7] = ROTL64(X[7], 50) ^ X[4]; - X[4] += X[1]; X[1] = ROTL64(X[1], 25) ^ X[4]; - X[6] += X[3]; X[3] = ROTL64(X[3], 29) ^ X[6]; - X[0] += X[5]; X[5] = ROTL64(X[5], 39) ^ X[0]; - X[2] += X[7]; X[7] = ROTL64(X[7], 43) ^ X[2]; - X[6] += X[1]; X[1] = ROTL64(X[1], 8) ^ X[6]; - X[4] += X[3]; X[3] = ROTL64(X[3], 22) ^ X[4]; - X[2] += X[5]; X[5] = ROTL64(X[5], 56) ^ X[2]; - X[0] 
+= X[7]; X[7] = ROTL64(X[7], 35) ^ X[0]; - - X[0] += Xt[r + 2]; - X[1] += Xt[r + 3]; - X[2] += Xt[r + 4]; - X[3] += Xt[r + 5]; - X[4] += Xt[r + 6]; - X[5] += Xt[r + 7] + T[1]; - X[6] += Xt[r + 8] + T[2]; - X[7] += Xt[r + 9] + r + 2; - - T[3] = T[0]; - T[0] = T[1]; - T[1] = T[2]; - T[2] = T[3]; - } - - S->X[0] = key[0] ^ X[0]; - S->X[1] = key[1] ^ X[1]; - S->X[2] = key[2] ^ X[2]; - S->X[3] = key[3] ^ X[3]; - S->X[4] = key[4] ^ X[4]; - S->X[5] = key[5] ^ X[5]; - S->X[6] = key[6] ^ X[6]; - S->X[7] = key[7] ^ X[7]; - - S->T[0] = T[0]; - S->T[1] = T[1] & ~0x4000000000000000ull; - } -} - -static void -scrypt_hash_init(scrypt_hash_state *S) { - S->X[0] = 0x4903ADFF749C51CEull; - S->X[1] = 0x0D95DE399746DF03ull; - S->X[2] = 0x8FD1934127C79BCEull; - S->X[3] = 0x9A255629FF352CB1ull; - S->X[4] = 0x5DB62599DF6CA7B0ull; - S->X[5] = 0xEABE394CA9D5C3F4ull; - S->X[6] = 0x991112C71A75B523ull; - S->X[7] = 0xAE18A40B660FCC33ull; - S->T[0] = 0x0000000000000000ull; - S->T[1] = 0x7000000000000000ull; - S->leftover = 0; -} - -static void -scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { - size_t blocks, want; - - /* skein processes the final <=64 bytes raw, so we can only update if there are at least 64+1 bytes available */ - if ((S->leftover + inlen) > SCRYPT_HASH_BLOCK_SIZE) { - /* handle the previous data, we know there is enough for at least one block */ - if (S->leftover) { - want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); - memcpy(S->buffer + S->leftover, in, want); - in += want; - inlen -= want; - S->leftover = 0; - skein512_blocks(S, S->buffer, 1, SCRYPT_HASH_BLOCK_SIZE); - } - - /* handle the current data if there's more than one block */ - if (inlen > SCRYPT_HASH_BLOCK_SIZE) { - blocks = ((inlen - 1) & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); - skein512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE, SCRYPT_HASH_BLOCK_SIZE); - inlen -= blocks; - in += blocks; - } - } - - /* handle leftover data */ - memcpy(S->buffer + S->leftover, in, inlen); - S->leftover += 
(int) inlen; -} - -static void -scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { - memset(S->buffer + S->leftover, 0, SCRYPT_HASH_BLOCK_SIZE - S->leftover); - S->T[1] |= 0x8000000000000000ull; - skein512_blocks(S, S->buffer, 1, S->leftover); - - memset(S->buffer, 0, SCRYPT_HASH_BLOCK_SIZE); - S->T[0] = 0; - S->T[1] = 0xff00000000000000ull; - skein512_blocks(S, S->buffer, 1, 8); - - U64TO8_LE(&hash[ 0], S->X[0]); - U64TO8_LE(&hash[ 8], S->X[1]); - U64TO8_LE(&hash[16], S->X[2]); - U64TO8_LE(&hash[24], S->X[3]); - U64TO8_LE(&hash[32], S->X[4]); - U64TO8_LE(&hash[40], S->X[5]); - U64TO8_LE(&hash[48], S->X[6]); - U64TO8_LE(&hash[56], S->X[7]); -} - - -static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { - 0x4d,0x52,0x29,0xff,0x10,0xbc,0xd2,0x62,0xd1,0x61,0x83,0xc8,0xe6,0xf0,0x83,0xc4, - 0x9f,0xf5,0x6a,0x42,0x75,0x2a,0x26,0x4e,0xf0,0x28,0x72,0x28,0x47,0xe8,0x23,0xdf, - 0x1e,0x64,0xf1,0x51,0x38,0x35,0x9d,0xc2,0x83,0xfc,0x35,0x4e,0xc0,0x52,0x5f,0x41, - 0x6a,0x0b,0x7d,0xf5,0xce,0x98,0xde,0x6f,0x36,0xd8,0x51,0x15,0x78,0x78,0x93,0x67, -}; diff --git a/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-avx.h b/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-avx.h deleted file mode 100644 index 663d833..0000000 --- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-avx.h +++ /dev/null @@ -1,367 +0,0 @@ -/* x64 */ -#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) - -#define SCRYPT_SALSA64_AVX - -asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_avx) - a1(push rbp) - a2(mov rbp, rsp) - a2(and rsp, ~63) - a2(sub rsp, 128) - a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ - a2(shl rcx,7) - a2(lea r9,[rcx-128]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, 
rdx) - a2(vmovdqa xmm0,[rax+0]) - a2(vmovdqa xmm1,[rax+16]) - a2(vmovdqa xmm2,[rax+32]) - a2(vmovdqa xmm3,[rax+48]) - a2(vmovdqa xmm4,[rax+64]) - a2(vmovdqa xmm5,[rax+80]) - a2(vmovdqa xmm6,[rax+96]) - a2(vmovdqa xmm7,[rax+112]) - aj(jz scrypt_ChunkMix_avx_no_xor1) - a3(vpxor xmm0,xmm0,[r9+0]) - a3(vpxor xmm1,xmm1,[r9+16]) - a3(vpxor xmm2,xmm2,[r9+32]) - a3(vpxor xmm3,xmm3,[r9+48]) - a3(vpxor xmm4,xmm4,[r9+64]) - a3(vpxor xmm5,xmm5,[r9+80]) - a3(vpxor xmm6,xmm6,[r9+96]) - a3(vpxor xmm7,xmm7,[r9+112]) - a1(scrypt_ChunkMix_avx_no_xor1:) - a2(xor r9,r9) - a2(xor r8,r8) - a1(scrypt_ChunkMix_avx_loop:) - a2(and rdx, rdx) - a3(vpxor xmm0,xmm0,[rsi+r9+0]) - a3(vpxor xmm1,xmm1,[rsi+r9+16]) - a3(vpxor xmm2,xmm2,[rsi+r9+32]) - a3(vpxor xmm3,xmm3,[rsi+r9+48]) - a3(vpxor xmm4,xmm4,[rsi+r9+64]) - a3(vpxor xmm5,xmm5,[rsi+r9+80]) - a3(vpxor xmm6,xmm6,[rsi+r9+96]) - a3(vpxor xmm7,xmm7,[rsi+r9+112]) - aj(jz scrypt_ChunkMix_avx_no_xor2) - a3(vpxor xmm0,xmm0,[rdx+r9+0]) - a3(vpxor xmm1,xmm1,[rdx+r9+16]) - a3(vpxor xmm2,xmm2,[rdx+r9+32]) - a3(vpxor xmm3,xmm3,[rdx+r9+48]) - a3(vpxor xmm4,xmm4,[rdx+r9+64]) - a3(vpxor xmm5,xmm5,[rdx+r9+80]) - a3(vpxor xmm6,xmm6,[rdx+r9+96]) - a3(vpxor xmm7,xmm7,[rdx+r9+112]) - a1(scrypt_ChunkMix_avx_no_xor2:) - a2(vmovdqa [rsp+0],xmm0) - a2(vmovdqa [rsp+16],xmm1) - a2(vmovdqa [rsp+32],xmm2) - a2(vmovdqa [rsp+48],xmm3) - a2(vmovdqa [rsp+64],xmm4) - a2(vmovdqa [rsp+80],xmm5) - a2(vmovdqa [rsp+96],xmm6) - a2(vmovdqa [rsp+112],xmm7) - a2(mov rax,8) - a1(scrypt_salsa64_avx_loop: ) - a3(vpaddq xmm8, xmm0, xmm2) - a3(vpaddq xmm9, xmm1, xmm3) - a3(vpshufd xmm8, xmm8, 0xb1) - a3(vpshufd xmm9, xmm9, 0xb1) - a3(vpxor xmm6, xmm6, xmm8) - a3(vpxor xmm7, xmm7, xmm9) - a3(vpaddq xmm10, xmm0, xmm6) - a3(vpaddq xmm11, xmm1, xmm7) - a3(vpsrlq xmm8, xmm10, 51) - a3(vpsrlq xmm9, xmm11, 51) - a3(vpsllq xmm10, xmm10, 13) - a3(vpsllq xmm11, xmm11, 13) - a3(vpxor xmm4, xmm4, xmm8) - a3(vpxor xmm5, xmm5, xmm9) - a3(vpxor xmm4, xmm4, xmm10) - a3(vpxor xmm5, xmm5, xmm11) - 
a3(vpaddq xmm8, xmm6, xmm4) - a3(vpaddq xmm9, xmm7, xmm5) - a3(vpsrlq xmm10, xmm8, 25) - a3(vpsrlq xmm11, xmm9, 25) - a3(vpsllq xmm8, xmm8, 39) - a3(vpsllq xmm9, xmm9, 39) - a3(vpxor xmm2, xmm2, xmm10) - a3(vpxor xmm3, xmm3, xmm11) - a3(vpxor xmm2, xmm2, xmm8) - a3(vpxor xmm3, xmm3, xmm9) - a3(vpaddq xmm10, xmm4, xmm2) - a3(vpaddq xmm11, xmm5, xmm3) - a3(vpshufd xmm10, xmm10, 0xb1) - a3(vpshufd xmm11, xmm11, 0xb1) - a3(vpxor xmm0, xmm0, xmm10) - a3(vpxor xmm1, xmm1, xmm11) - a2(vmovdqa xmm8, xmm2) - a2(vmovdqa xmm9, xmm3) - a4(vpalignr xmm2, xmm6, xmm7, 8) - a4(vpalignr xmm3, xmm7, xmm6, 8) - a4(vpalignr xmm6, xmm9, xmm8, 8) - a4(vpalignr xmm7, xmm8, xmm9, 8) - a3(vpaddq xmm10, xmm0, xmm2) - a3(vpaddq xmm11, xmm1, xmm3) - a3(vpshufd xmm10, xmm10, 0xb1) - a3(vpshufd xmm11, xmm11, 0xb1) - a3(vpxor xmm6, xmm6, xmm10) - a3(vpxor xmm7, xmm7, xmm11) - a3(vpaddq xmm8, xmm0, xmm6) - a3(vpaddq xmm9, xmm1, xmm7) - a3(vpsrlq xmm10, xmm8, 51) - a3(vpsrlq xmm11, xmm9, 51) - a3(vpsllq xmm8, xmm8, 13) - a3(vpsllq xmm9, xmm9, 13) - a3(vpxor xmm5, xmm5, xmm10) - a3(vpxor xmm4, xmm4, xmm11) - a3(vpxor xmm5, xmm5, xmm8) - a3(vpxor xmm4, xmm4, xmm9) - a3(vpaddq xmm10, xmm6, xmm5) - a3(vpaddq xmm11, xmm7, xmm4) - a3(vpsrlq xmm8, xmm10, 25) - a3(vpsrlq xmm9, xmm11, 25) - a3(vpsllq xmm10, xmm10, 39) - a3(vpsllq xmm11, xmm11, 39) - a3(vpxor xmm2, xmm2, xmm8) - a3(vpxor xmm3, xmm3, xmm9) - a3(vpxor xmm2, xmm2, xmm10) - a3(vpxor xmm3, xmm3, xmm11) - a3(vpaddq xmm8, xmm5, xmm2) - a3(vpaddq xmm9, xmm4, xmm3) - a3(vpshufd xmm8, xmm8, 0xb1) - a3(vpshufd xmm9, xmm9, 0xb1) - a3(vpxor xmm0, xmm0, xmm8) - a3(vpxor xmm1, xmm1, xmm9) - a2(vmovdqa xmm10, xmm2) - a2(vmovdqa xmm11, xmm3) - a4(vpalignr xmm2, xmm6, xmm7, 8) - a4(vpalignr xmm3, xmm7, xmm6, 8) - a4(vpalignr xmm6, xmm11, xmm10, 8) - a4(vpalignr xmm7, xmm10, xmm11, 8) - a2(sub rax, 2) - aj(ja scrypt_salsa64_avx_loop) - a3(vpaddq xmm0,xmm0,[rsp+0]) - a3(vpaddq xmm1,xmm1,[rsp+16]) - a3(vpaddq xmm2,xmm2,[rsp+32]) - a3(vpaddq xmm3,xmm3,[rsp+48]) 
- a3(vpaddq xmm4,xmm4,[rsp+64]) - a3(vpaddq xmm5,xmm5,[rsp+80]) - a3(vpaddq xmm6,xmm6,[rsp+96]) - a3(vpaddq xmm7,xmm7,[rsp+112]) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0xff) - a2(add r9,128) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(vmovdqa [rax+0],xmm0) - a2(vmovdqa [rax+16],xmm1) - a2(vmovdqa [rax+32],xmm2) - a2(vmovdqa [rax+48],xmm3) - a2(vmovdqa [rax+64],xmm4) - a2(vmovdqa [rax+80],xmm5) - a2(vmovdqa [rax+96],xmm6) - a2(vmovdqa [rax+112],xmm7) - aj(jne scrypt_ChunkMix_avx_loop) - a2(mov rsp, rbp) - a1(pop rbp) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_avx) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) - -#define SCRYPT_SALSA64_AVX - -static void asm_calling_convention -scrypt_ChunkMix_avx(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - x4 = xmmp[4]; - x5 = xmmp[5]; - x6 = xmmp[6]; - x7 = xmmp[7]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, 
xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - t4 = x4; - t5 = x5; - t6 = x6; - t7 = x7; - - for (rounds = 8; rounds; rounds -= 2) { - z0 = _mm_add_epi64(x0, x2); - z1 = _mm_add_epi64(x1, x3); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x6 = _mm_xor_si128(x6, z0); - x7 = _mm_xor_si128(x7, z1); - - z0 = _mm_add_epi64(x6, x0); - z1 = _mm_add_epi64(x7, x1); - z2 = _mm_srli_epi64(z0, 64-13); - z3 = _mm_srli_epi64(z1, 64-13); - z0 = _mm_slli_epi64(z0, 13); - z1 = _mm_slli_epi64(z1, 13); - x4 = _mm_xor_si128(x4, z2); - x5 = _mm_xor_si128(x5, z3); - x4 = _mm_xor_si128(x4, z0); - x5 = _mm_xor_si128(x5, z1); - - z0 = _mm_add_epi64(x4, x6); - z1 = _mm_add_epi64(x5, x7); - z2 = _mm_srli_epi64(z0, 64-39); - z3 = _mm_srli_epi64(z1, 64-39); - z0 = _mm_slli_epi64(z0, 39); - z1 = _mm_slli_epi64(z1, 39); - x2 = _mm_xor_si128(x2, z2); - x3 = _mm_xor_si128(x3, z3); - x2 = _mm_xor_si128(x2, z0); - x3 = _mm_xor_si128(x3, z1); - - z0 = _mm_add_epi64(x2, x4); - z1 = _mm_add_epi64(x3, x5); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x0 = _mm_xor_si128(x0, z0); - x1 = _mm_xor_si128(x1, z1); - - z0 = x2; - z1 = x3; - x2 = _mm_alignr_epi8(x6, x7, 8); - x3 = _mm_alignr_epi8(x7, x6, 8); - x6 = _mm_alignr_epi8(z1, z0, 8); - x7 = _mm_alignr_epi8(z0, z1, 8); - - z0 = _mm_add_epi64(x0, x2); - z1 = _mm_add_epi64(x1, x3); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x6 = _mm_xor_si128(x6, z0); - x7 = 
_mm_xor_si128(x7, z1); - - z0 = _mm_add_epi64(x6, x0); - z1 = _mm_add_epi64(x7, x1); - z2 = _mm_srli_epi64(z0, 64-13); - z3 = _mm_srli_epi64(z1, 64-13); - z0 = _mm_slli_epi64(z0, 13); - z1 = _mm_slli_epi64(z1, 13); - x5 = _mm_xor_si128(x5, z2); - x4 = _mm_xor_si128(x4, z3); - x5 = _mm_xor_si128(x5, z0); - x4 = _mm_xor_si128(x4, z1); - - z0 = _mm_add_epi64(x5, x6); - z1 = _mm_add_epi64(x4, x7); - z2 = _mm_srli_epi64(z0, 64-39); - z3 = _mm_srli_epi64(z1, 64-39); - z0 = _mm_slli_epi64(z0, 39); - z1 = _mm_slli_epi64(z1, 39); - x2 = _mm_xor_si128(x2, z2); - x3 = _mm_xor_si128(x3, z3); - x2 = _mm_xor_si128(x2, z0); - x3 = _mm_xor_si128(x3, z1); - - z0 = _mm_add_epi64(x2, x5); - z1 = _mm_add_epi64(x3, x4); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x0 = _mm_xor_si128(x0, z0); - x1 = _mm_xor_si128(x1, z1); - - z0 = x2; - z1 = x3; - x2 = _mm_alignr_epi8(x6, x7, 8); - x3 = _mm_alignr_epi8(x7, x6, 8); - x6 = _mm_alignr_epi8(z1, z0, 8); - x7 = _mm_alignr_epi8(z0, z1, 8); - } - - x0 = _mm_add_epi64(x0, t0); - x1 = _mm_add_epi64(x1, t1); - x2 = _mm_add_epi64(x2, t2); - x3 = _mm_add_epi64(x3, t3); - x4 = _mm_add_epi64(x4, t4); - x5 = _mm_add_epi64(x5, t5); - x6 = _mm_add_epi64(x6, t6); - x7 = _mm_add_epi64(x7, t7); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - xmmp[4] = x4; - xmmp[5] = x5; - xmmp[6] = x6; - xmmp[7] = x7; - } -} - -#endif - -#if defined(SCRYPT_SALSA64_AVX) - /* uses salsa64_core_tangle_sse2 */ - - #undef SCRYPT_MIX - #define SCRYPT_MIX "Salsa64/8-AVX" - #undef SCRYPT_SALSA64_INCLUDED - #define SCRYPT_SALSA64_INCLUDED -#endif diff --git a/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-avx2.h b/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-avx2.h deleted file mode 100644 index 8181302..0000000 --- 
a/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-avx2.h +++ /dev/null @@ -1,221 +0,0 @@ -/* x64 */ -#if defined(X86_64ASM_AVX2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) - -#define SCRYPT_SALSA64_AVX2 - -asm_naked_fn_proto(void, scrypt_ChunkMix_avx2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_avx2) - a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ - a2(shl rcx,7) - a2(lea r9,[rcx-128]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(vmovdqa ymm0,[rax+0]) - a2(vmovdqa ymm1,[rax+32]) - a2(vmovdqa ymm2,[rax+64]) - a2(vmovdqa ymm3,[rax+96]) - aj(jz scrypt_ChunkMix_avx2_no_xor1) - a3(vpxor ymm0,ymm0,[r9+0]) - a3(vpxor ymm1,ymm1,[r9+32]) - a3(vpxor ymm2,ymm2,[r9+64]) - a3(vpxor ymm3,ymm3,[r9+96]) - a1(scrypt_ChunkMix_avx2_no_xor1:) - a2(xor r9,r9) - a2(xor r8,r8) - a1(scrypt_ChunkMix_avx2_loop:) - a2(and rdx, rdx) - a3(vpxor ymm0,ymm0,[rsi+r9+0]) - a3(vpxor ymm1,ymm1,[rsi+r9+32]) - a3(vpxor ymm2,ymm2,[rsi+r9+64]) - a3(vpxor ymm3,ymm3,[rsi+r9+96]) - aj(jz scrypt_ChunkMix_avx2_no_xor2) - a3(vpxor ymm0,ymm0,[rdx+r9+0]) - a3(vpxor ymm1,ymm1,[rdx+r9+32]) - a3(vpxor ymm2,ymm2,[rdx+r9+64]) - a3(vpxor ymm3,ymm3,[rdx+r9+96]) - a1(scrypt_ChunkMix_avx2_no_xor2:) - a2(vmovdqa ymm6,ymm0) - a2(vmovdqa ymm7,ymm1) - a2(vmovdqa ymm8,ymm2) - a2(vmovdqa ymm9,ymm3) - a2(mov rax,4) - a1(scrypt_salsa64_avx2_loop: ) - a3(vpaddq ymm4, ymm1, ymm0) - a3(vpshufd ymm4, ymm4, 0xb1) - a3(vpxor ymm3, ymm3, ymm4) - a3(vpaddq ymm4, ymm0, ymm3) - a3(vpsrlq ymm5, ymm4, 51) - a3(vpxor ymm2, ymm2, ymm5) - a3(vpsllq ymm4, ymm4, 13) - a3(vpxor ymm2, ymm2, ymm4) - a3(vpaddq ymm4, ymm3, ymm2) - a3(vpsrlq ymm5, ymm4, 25) - a3(vpxor ymm1, ymm1, ymm5) - a3(vpsllq ymm4, ymm4, 39) - a3(vpxor ymm1, ymm1, ymm4) - a3(vpaddq ymm4, ymm2, ymm1) - a3(vpshufd ymm4, ymm4, 0xb1) - 
a3(vpermq ymm1, ymm1, 0x39) - a3(vpermq ymm10, ymm2, 0x4e) - a3(vpxor ymm0, ymm0, ymm4) - a3(vpermq ymm3, ymm3, 0x93) - a3(vpaddq ymm4, ymm3, ymm0) - a3(vpshufd ymm4, ymm4, 0xb1) - a3(vpxor ymm1, ymm1, ymm4) - a3(vpaddq ymm4, ymm0, ymm1) - a3(vpsrlq ymm5, ymm4, 51) - a3(vpxor ymm10, ymm10, ymm5) - a3(vpsllq ymm4, ymm4, 13) - a3(vpxor ymm10, ymm10, ymm4) - a3(vpaddq ymm4, ymm1, ymm10) - a3(vpsrlq ymm5, ymm4, 25) - a3(vpxor ymm3, ymm3, ymm5) - a3(vpsllq ymm4, ymm4, 39) - a3(vpermq ymm1, ymm1, 0x93) - a3(vpxor ymm3, ymm3, ymm4) - a3(vpermq ymm2, ymm10, 0x4e) - a3(vpaddq ymm4, ymm10, ymm3) - a3(vpshufd ymm4, ymm4, 0xb1) - a3(vpermq ymm3, ymm3, 0x39) - a3(vpxor ymm0, ymm0, ymm4) - a1(dec rax) - aj(jnz scrypt_salsa64_avx2_loop) - a3(vpaddq ymm0,ymm0,ymm6) - a3(vpaddq ymm1,ymm1,ymm7) - a3(vpaddq ymm2,ymm2,ymm8) - a3(vpaddq ymm3,ymm3,ymm9) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0xff) - a2(add r9,128) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(vmovdqa [rax+0],ymm0) - a2(vmovdqa [rax+32],ymm1) - a2(vmovdqa [rax+64],ymm2) - a2(vmovdqa [rax+96],ymm3) - aj(jne scrypt_ChunkMix_avx2_loop) - a1(vzeroupper) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_avx2) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_AVX2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) - -#define SCRYPT_SALSA64_AVX2 - -static void asm_calling_convention -scrypt_ChunkMix_avx2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - ymmi *ymmp,y0,y1,y2,y3,t0,t1,t2,t3,z0,z1; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - ymmp = (ymmi *)scrypt_block(Bin, blocksPerChunk - 1); - y0 = ymmp[0]; - y1 = ymmp[1]; - y2 = ymmp[2]; - y3 = ymmp[3]; - - if (Bxor) { - ymmp = (ymmi *)scrypt_block(Bxor, blocksPerChunk - 1); - y0 = _mm256_xor_si256(y0, ymmp[0]); - y1 = _mm256_xor_si256(y1, ymmp[1]); - y2 = _mm256_xor_si256(y2, ymmp[2]); - y3 = _mm256_xor_si256(y3, 
ymmp[3]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - ymmp = (ymmi *)scrypt_block(Bin, i); - y0 = _mm256_xor_si256(y0, ymmp[0]); - y1 = _mm256_xor_si256(y1, ymmp[1]); - y2 = _mm256_xor_si256(y2, ymmp[2]); - y3 = _mm256_xor_si256(y3, ymmp[3]); - - if (Bxor) { - ymmp = (ymmi *)scrypt_block(Bxor, i); - y0 = _mm256_xor_si256(y0, ymmp[0]); - y1 = _mm256_xor_si256(y1, ymmp[1]); - y2 = _mm256_xor_si256(y2, ymmp[2]); - y3 = _mm256_xor_si256(y3, ymmp[3]); - } - - t0 = y0; - t1 = y1; - t2 = y2; - t3 = y3; - - for (rounds = 8; rounds; rounds -= 2) { - z0 = _mm256_add_epi64(y0, y1); - z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - y3 = _mm256_xor_si256(y3, z0); - z0 = _mm256_add_epi64(y3, y0); - z1 = _mm256_srli_epi64(z0, 64-13); - y2 = _mm256_xor_si256(y2, z1); - z0 = _mm256_slli_epi64(z0, 13); - y2 = _mm256_xor_si256(y2, z0); - z0 = _mm256_add_epi64(y2, y3); - z1 = _mm256_srli_epi64(z0, 64-39); - y1 = _mm256_xor_si256(y1, z1); - z0 = _mm256_slli_epi64(z0, 39); - y1 = _mm256_xor_si256(y1, z0); - y1 = _mm256_permute4x64_epi64(y1, _MM_SHUFFLE(0,3,2,1)); - y2 = _mm256_permute4x64_epi64(y2, _MM_SHUFFLE(1,0,3,2)); - y3 = _mm256_permute4x64_epi64(y3, _MM_SHUFFLE(2,1,0,3)); - z0 = _mm256_add_epi64(y1, y2); - z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - y0 = _mm256_xor_si256(y0, z0); - z0 = _mm256_add_epi64(y0, y3); - z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - y1 = _mm256_xor_si256(y1, z0); - z0 = _mm256_add_epi64(y1, y0); - z1 = _mm256_srli_epi64(z0, 64-13); - y2 = _mm256_xor_si256(y2, z1); - z0 = _mm256_slli_epi64(z0, 13); - y2 = _mm256_xor_si256(y2, z0); - z0 = _mm256_add_epi64(y2, y1); - z1 = _mm256_srli_epi64(z0, 64-39); - y3 = _mm256_xor_si256(y3, z1); - z0 = _mm256_slli_epi64(z0, 39); - y3 = _mm256_xor_si256(y3, z0); - z0 = _mm256_add_epi64(y3, y2); - z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - y0 = _mm256_xor_si256(y0, z0); - y1 = _mm256_permute4x64_epi64(y1, 
_MM_SHUFFLE(2,1,0,3)); - y2 = _mm256_permute4x64_epi64(y2, _MM_SHUFFLE(1,0,3,2)); - y3 = _mm256_permute4x64_epi64(y3, _MM_SHUFFLE(0,3,2,1)); - } - - y0 = _mm256_add_epi64(y0, t0); - y1 = _mm256_add_epi64(y1, t1); - y2 = _mm256_add_epi64(y2, t2); - y3 = _mm256_add_epi64(y3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - ymmp = (ymmi *)scrypt_block(Bout, (i / 2) + half); - ymmp[0] = y0; - ymmp[1] = y1; - ymmp[2] = y2; - ymmp[3] = y3; - } -} - -#endif - -#if defined(SCRYPT_SALSA64_AVX2) - /* uses salsa64_core_tangle_sse2 */ - - #undef SCRYPT_MIX - #define SCRYPT_MIX "Salsa64/8-AVX2" - #undef SCRYPT_SALSA64_INCLUDED - #define SCRYPT_SALSA64_INCLUDED -#endif diff --git a/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-sse2.h b/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-sse2.h deleted file mode 100644 index 971d98a..0000000 --- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-sse2.h +++ /dev/null @@ -1,449 +0,0 @@ -/* x64 */ -#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) - -#define SCRYPT_SALSA64_SSE2 - -asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_sse2) - a1(push rbp) - a2(mov rbp, rsp) - a2(and rsp, ~63) - a2(sub rsp, 128) - a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ - a2(shl rcx,7) - a2(lea r9,[rcx-128]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(movdqa xmm0,[rax+0]) - a2(movdqa xmm1,[rax+16]) - a2(movdqa xmm2,[rax+32]) - a2(movdqa xmm3,[rax+48]) - a2(movdqa xmm4,[rax+64]) - a2(movdqa xmm5,[rax+80]) - a2(movdqa xmm6,[rax+96]) - a2(movdqa xmm7,[rax+112]) - aj(jz scrypt_ChunkMix_sse2_no_xor1) - a2(pxor xmm0,[r9+0]) - a2(pxor xmm1,[r9+16]) - a2(pxor xmm2,[r9+32]) - a2(pxor xmm3,[r9+48]) - a2(pxor 
xmm4,[r9+64]) - a2(pxor xmm5,[r9+80]) - a2(pxor xmm6,[r9+96]) - a2(pxor xmm7,[r9+112]) - a1(scrypt_ChunkMix_sse2_no_xor1:) - a2(xor r9,r9) - a2(xor r8,r8) - a1(scrypt_ChunkMix_sse2_loop:) - a2(and rdx, rdx) - a2(pxor xmm0,[rsi+r9+0]) - a2(pxor xmm1,[rsi+r9+16]) - a2(pxor xmm2,[rsi+r9+32]) - a2(pxor xmm3,[rsi+r9+48]) - a2(pxor xmm4,[rsi+r9+64]) - a2(pxor xmm5,[rsi+r9+80]) - a2(pxor xmm6,[rsi+r9+96]) - a2(pxor xmm7,[rsi+r9+112]) - aj(jz scrypt_ChunkMix_sse2_no_xor2) - a2(pxor xmm0,[rdx+r9+0]) - a2(pxor xmm1,[rdx+r9+16]) - a2(pxor xmm2,[rdx+r9+32]) - a2(pxor xmm3,[rdx+r9+48]) - a2(pxor xmm4,[rdx+r9+64]) - a2(pxor xmm5,[rdx+r9+80]) - a2(pxor xmm6,[rdx+r9+96]) - a2(pxor xmm7,[rdx+r9+112]) - a1(scrypt_ChunkMix_sse2_no_xor2:) - a2(movdqa [rsp+0],xmm0) - a2(movdqa [rsp+16],xmm1) - a2(movdqa [rsp+32],xmm2) - a2(movdqa [rsp+48],xmm3) - a2(movdqa [rsp+64],xmm4) - a2(movdqa [rsp+80],xmm5) - a2(movdqa [rsp+96],xmm6) - a2(movdqa [rsp+112],xmm7) - a2(mov rax,8) - a1(scrypt_salsa64_sse2_loop: ) - a2(movdqa xmm8, xmm0) - a2(movdqa xmm9, xmm1) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm6, xmm8) - a2(pxor xmm7, xmm9) - a2(movdqa xmm10, xmm0) - a2(movdqa xmm11, xmm1) - a2(paddq xmm10, xmm6) - a2(paddq xmm11, xmm7) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 51) - a2(psrlq xmm11, 51) - a2(psllq xmm8, 13) - a2(psllq xmm9, 13) - a2(pxor xmm4, xmm10) - a2(pxor xmm5, xmm11) - a2(pxor xmm4, xmm8) - a2(pxor xmm5, xmm9) - a2(movdqa xmm10, xmm6) - a2(movdqa xmm11, xmm7) - a2(paddq xmm10, xmm4) - a2(paddq xmm11, xmm5) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 25) - a2(psrlq xmm11, 25) - a2(psllq xmm8, 39) - a2(psllq xmm9, 39) - a2(pxor xmm2, xmm10) - a2(pxor xmm3, xmm11) - a2(pxor xmm2, xmm8) - a2(pxor xmm3, xmm9) - a2(movdqa xmm8, xmm4) - a2(movdqa xmm9, xmm5) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - 
a2(pxor xmm0, xmm8) - a2(pxor xmm1, xmm9) - a2(movdqa xmm8, xmm2) - a2(movdqa xmm9, xmm3) - a2(movdqa xmm10, xmm6) - a2(movdqa xmm11, xmm7) - a2(movdqa xmm2, xmm7) - a2(movdqa xmm3, xmm6) - a2(punpcklqdq xmm10, xmm6) - a2(punpcklqdq xmm11, xmm7) - a2(movdqa xmm6, xmm8) - a2(movdqa xmm7, xmm9) - a2(punpcklqdq xmm9, xmm9) - a2(punpcklqdq xmm8, xmm8) - a2(punpckhqdq xmm2, xmm10) - a2(punpckhqdq xmm3, xmm11) - a2(punpckhqdq xmm6, xmm9) - a2(punpckhqdq xmm7, xmm8) - a2(sub rax, 2) - a2(movdqa xmm8, xmm0) - a2(movdqa xmm9, xmm1) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm6, xmm8) - a2(pxor xmm7, xmm9) - a2(movdqa xmm10, xmm0) - a2(movdqa xmm11, xmm1) - a2(paddq xmm10, xmm6) - a2(paddq xmm11, xmm7) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 51) - a2(psrlq xmm11, 51) - a2(psllq xmm8, 13) - a2(psllq xmm9, 13) - a2(pxor xmm5, xmm10) - a2(pxor xmm4, xmm11) - a2(pxor xmm5, xmm8) - a2(pxor xmm4, xmm9) - a2(movdqa xmm10, xmm6) - a2(movdqa xmm11, xmm7) - a2(paddq xmm10, xmm5) - a2(paddq xmm11, xmm4) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 25) - a2(psrlq xmm11, 25) - a2(psllq xmm8, 39) - a2(psllq xmm9, 39) - a2(pxor xmm2, xmm10) - a2(pxor xmm3, xmm11) - a2(pxor xmm2, xmm8) - a2(pxor xmm3, xmm9) - a2(movdqa xmm8, xmm5) - a2(movdqa xmm9, xmm4) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm0, xmm8) - a2(pxor xmm1, xmm9) - a2(movdqa xmm8, xmm2) - a2(movdqa xmm9, xmm3) - a2(movdqa xmm10, xmm6) - a2(movdqa xmm11, xmm7) - a2(movdqa xmm2, xmm7) - a2(movdqa xmm3, xmm6) - a2(punpcklqdq xmm10, xmm6) - a2(punpcklqdq xmm11, xmm7) - a2(movdqa xmm6, xmm8) - a2(movdqa xmm7, xmm9) - a2(punpcklqdq xmm9, xmm9) - a2(punpcklqdq xmm8, xmm8) - a2(punpckhqdq xmm2, xmm10) - a2(punpckhqdq xmm3, xmm11) - a2(punpckhqdq xmm6, xmm9) - a2(punpckhqdq xmm7, xmm8) - aj(ja scrypt_salsa64_sse2_loop) - a2(paddq 
xmm0,[rsp+0]) - a2(paddq xmm1,[rsp+16]) - a2(paddq xmm2,[rsp+32]) - a2(paddq xmm3,[rsp+48]) - a2(paddq xmm4,[rsp+64]) - a2(paddq xmm5,[rsp+80]) - a2(paddq xmm6,[rsp+96]) - a2(paddq xmm7,[rsp+112]) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0xff) - a2(add r9,128) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(movdqa [rax+0],xmm0) - a2(movdqa [rax+16],xmm1) - a2(movdqa [rax+32],xmm2) - a2(movdqa [rax+48],xmm3) - a2(movdqa [rax+64],xmm4) - a2(movdqa [rax+80],xmm5) - a2(movdqa [rax+96],xmm6) - a2(movdqa [rax+112],xmm7) - aj(jne scrypt_ChunkMix_sse2_loop) - a2(mov rsp, rbp) - a1(pop rbp) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_sse2) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) - -#define SCRYPT_SALSA64_SSE2 - -static void asm_calling_convention -scrypt_ChunkMix_sse2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - x4 = xmmp[4]; - x5 = xmmp[5]; - x6 = xmmp[6]; - x7 = xmmp[7]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, 
xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - t4 = x4; - t5 = x5; - t6 = x6; - t7 = x7; - - for (rounds = 8; rounds; rounds -= 2) { - z0 = _mm_add_epi64(x0, x2); - z1 = _mm_add_epi64(x1, x3); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x6 = _mm_xor_si128(x6, z0); - x7 = _mm_xor_si128(x7, z1); - - z0 = _mm_add_epi64(x6, x0); - z1 = _mm_add_epi64(x7, x1); - z2 = _mm_srli_epi64(z0, 64-13); - z3 = _mm_srli_epi64(z1, 64-13); - z0 = _mm_slli_epi64(z0, 13); - z1 = _mm_slli_epi64(z1, 13); - x4 = _mm_xor_si128(x4, z2); - x5 = _mm_xor_si128(x5, z3); - x4 = _mm_xor_si128(x4, z0); - x5 = _mm_xor_si128(x5, z1); - - z0 = _mm_add_epi64(x4, x6); - z1 = _mm_add_epi64(x5, x7); - z2 = _mm_srli_epi64(z0, 64-39); - z3 = _mm_srli_epi64(z1, 64-39); - z0 = _mm_slli_epi64(z0, 39); - z1 = _mm_slli_epi64(z1, 39); - x2 = _mm_xor_si128(x2, z2); - x3 = _mm_xor_si128(x3, z3); - x2 = _mm_xor_si128(x2, z0); - x3 = _mm_xor_si128(x3, z1); - - z0 = _mm_add_epi64(x2, x4); - z1 = _mm_add_epi64(x3, x5); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x0 = _mm_xor_si128(x0, z0); - x1 = _mm_xor_si128(x1, z1); - - z0 = x4; - z1 = x5; - z2 = x2; - z3 = x3; - x4 = z1; - x5 = z0; - x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6)); - x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7)); - x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3)); - x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, 
z2)); - - z0 = _mm_add_epi64(x0, x2); - z1 = _mm_add_epi64(x1, x3); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x6 = _mm_xor_si128(x6, z0); - x7 = _mm_xor_si128(x7, z1); - - z0 = _mm_add_epi64(x6, x0); - z1 = _mm_add_epi64(x7, x1); - z2 = _mm_srli_epi64(z0, 64-13); - z3 = _mm_srli_epi64(z1, 64-13); - z0 = _mm_slli_epi64(z0, 13); - z1 = _mm_slli_epi64(z1, 13); - x4 = _mm_xor_si128(x4, z2); - x5 = _mm_xor_si128(x5, z3); - x4 = _mm_xor_si128(x4, z0); - x5 = _mm_xor_si128(x5, z1); - - z0 = _mm_add_epi64(x4, x6); - z1 = _mm_add_epi64(x5, x7); - z2 = _mm_srli_epi64(z0, 64-39); - z3 = _mm_srli_epi64(z1, 64-39); - z0 = _mm_slli_epi64(z0, 39); - z1 = _mm_slli_epi64(z1, 39); - x2 = _mm_xor_si128(x2, z2); - x3 = _mm_xor_si128(x3, z3); - x2 = _mm_xor_si128(x2, z0); - x3 = _mm_xor_si128(x3, z1); - - z0 = _mm_add_epi64(x2, x4); - z1 = _mm_add_epi64(x3, x5); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x0 = _mm_xor_si128(x0, z0); - x1 = _mm_xor_si128(x1, z1); - - z0 = x4; - z1 = x5; - z2 = x2; - z3 = x3; - x4 = z1; - x5 = z0; - x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6)); - x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7)); - x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3)); - x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2)); - } - - x0 = _mm_add_epi64(x0, t0); - x1 = _mm_add_epi64(x1, t1); - x2 = _mm_add_epi64(x2, t2); - x3 = _mm_add_epi64(x3, t3); - x4 = _mm_add_epi64(x4, t4); - x5 = _mm_add_epi64(x5, t5); - x6 = _mm_add_epi64(x6, t6); - x7 = _mm_add_epi64(x7, t7); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - xmmp[4] = x4; - xmmp[5] = x5; - xmmp[6] = x6; - xmmp[7] = x7; - } -} - -#endif - -#if defined(SCRYPT_SALSA64_SSE2) - #undef SCRYPT_MIX - #define SCRYPT_MIX 
"Salsa64/8-SSE2" - #undef SCRYPT_SALSA64_INCLUDED - #define SCRYPT_SALSA64_INCLUDED -#endif - -/* sse3/avx use this as well */ -#if defined(SCRYPT_SALSA64_INCLUDED) - /* - Default layout: - 0 1 2 3 - 4 5 6 7 - 8 9 10 11 - 12 13 14 15 - - SSE2 layout: - 0 5 10 15 - 12 1 6 11 - 8 13 2 7 - 4 9 14 3 - */ - - - static void asm_calling_convention - salsa64_core_tangle_sse2(uint64_t *blocks, size_t count) { - uint64_t t; - while (count--) { - t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t; - t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t; - t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t; - t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t; - t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t; - t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t; - blocks += 16; - } - } -#endif \ No newline at end of file diff --git a/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-ssse3.h b/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-ssse3.h deleted file mode 100644 index 21e94c9..0000000 --- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-ssse3.h +++ /dev/null @@ -1,399 +0,0 @@ -/* x64 */ -#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) - -#define SCRYPT_SALSA64_SSSE3 - -asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_ssse3) - a1(push rbp) - a2(mov rbp, rsp) - a2(and rsp, ~63) - a2(sub rsp, 128) - a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ - a2(shl rcx,7) - a2(lea r9,[rcx-128]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(movdqa xmm0,[rax+0]) - a2(movdqa xmm1,[rax+16]) - a2(movdqa xmm2,[rax+32]) - a2(movdqa xmm3,[rax+48]) - a2(movdqa xmm4,[rax+64]) - a2(movdqa xmm5,[rax+80]) - a2(movdqa xmm6,[rax+96]) - a2(movdqa xmm7,[rax+112]) - 
aj(jz scrypt_ChunkMix_ssse3_no_xor1) - a2(pxor xmm0,[r9+0]) - a2(pxor xmm1,[r9+16]) - a2(pxor xmm2,[r9+32]) - a2(pxor xmm3,[r9+48]) - a2(pxor xmm4,[r9+64]) - a2(pxor xmm5,[r9+80]) - a2(pxor xmm6,[r9+96]) - a2(pxor xmm7,[r9+112]) - a1(scrypt_ChunkMix_ssse3_no_xor1:) - a2(xor r9,r9) - a2(xor r8,r8) - a1(scrypt_ChunkMix_ssse3_loop:) - a2(and rdx, rdx) - a2(pxor xmm0,[rsi+r9+0]) - a2(pxor xmm1,[rsi+r9+16]) - a2(pxor xmm2,[rsi+r9+32]) - a2(pxor xmm3,[rsi+r9+48]) - a2(pxor xmm4,[rsi+r9+64]) - a2(pxor xmm5,[rsi+r9+80]) - a2(pxor xmm6,[rsi+r9+96]) - a2(pxor xmm7,[rsi+r9+112]) - aj(jz scrypt_ChunkMix_ssse3_no_xor2) - a2(pxor xmm0,[rdx+r9+0]) - a2(pxor xmm1,[rdx+r9+16]) - a2(pxor xmm2,[rdx+r9+32]) - a2(pxor xmm3,[rdx+r9+48]) - a2(pxor xmm4,[rdx+r9+64]) - a2(pxor xmm5,[rdx+r9+80]) - a2(pxor xmm6,[rdx+r9+96]) - a2(pxor xmm7,[rdx+r9+112]) - a1(scrypt_ChunkMix_ssse3_no_xor2:) - a2(movdqa [rsp+0],xmm0) - a2(movdqa [rsp+16],xmm1) - a2(movdqa [rsp+32],xmm2) - a2(movdqa [rsp+48],xmm3) - a2(movdqa [rsp+64],xmm4) - a2(movdqa [rsp+80],xmm5) - a2(movdqa [rsp+96],xmm6) - a2(movdqa [rsp+112],xmm7) - a2(mov rax,8) - a1(scrypt_salsa64_ssse3_loop: ) - a2(movdqa xmm8, xmm0) - a2(movdqa xmm9, xmm1) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm6, xmm8) - a2(pxor xmm7, xmm9) - a2(movdqa xmm10, xmm0) - a2(movdqa xmm11, xmm1) - a2(paddq xmm10, xmm6) - a2(paddq xmm11, xmm7) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 51) - a2(psrlq xmm11, 51) - a2(psllq xmm8, 13) - a2(psllq xmm9, 13) - a2(pxor xmm4, xmm10) - a2(pxor xmm5, xmm11) - a2(pxor xmm4, xmm8) - a2(pxor xmm5, xmm9) - a2(movdqa xmm10, xmm6) - a2(movdqa xmm11, xmm7) - a2(paddq xmm10, xmm4) - a2(paddq xmm11, xmm5) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 25) - a2(psrlq xmm11, 25) - a2(psllq xmm8, 39) - a2(psllq xmm9, 39) - a2(pxor xmm2, xmm10) - a2(pxor xmm3, xmm11) - a2(pxor xmm2, xmm8) - a2(pxor xmm3, xmm9) - a2(movdqa 
xmm8, xmm4) - a2(movdqa xmm9, xmm5) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm0, xmm8) - a2(pxor xmm1, xmm9) - a2(movdqa xmm10, xmm2) - a2(movdqa xmm11, xmm3) - a2(movdqa xmm2, xmm6) - a2(movdqa xmm3, xmm7) - a3(palignr xmm2, xmm7, 8) - a3(palignr xmm3, xmm6, 8) - a2(movdqa xmm6, xmm11) - a2(movdqa xmm7, xmm10) - a3(palignr xmm6, xmm10, 8) - a3(palignr xmm7, xmm11, 8) - a2(sub rax, 2) - a2(movdqa xmm8, xmm0) - a2(movdqa xmm9, xmm1) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm6, xmm8) - a2(pxor xmm7, xmm9) - a2(movdqa xmm10, xmm0) - a2(movdqa xmm11, xmm1) - a2(paddq xmm10, xmm6) - a2(paddq xmm11, xmm7) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 51) - a2(psrlq xmm11, 51) - a2(psllq xmm8, 13) - a2(psllq xmm9, 13) - a2(pxor xmm5, xmm10) - a2(pxor xmm4, xmm11) - a2(pxor xmm5, xmm8) - a2(pxor xmm4, xmm9) - a2(movdqa xmm10, xmm6) - a2(movdqa xmm11, xmm7) - a2(paddq xmm10, xmm5) - a2(paddq xmm11, xmm4) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 25) - a2(psrlq xmm11, 25) - a2(psllq xmm8, 39) - a2(psllq xmm9, 39) - a2(pxor xmm2, xmm10) - a2(pxor xmm3, xmm11) - a2(pxor xmm2, xmm8) - a2(pxor xmm3, xmm9) - a2(movdqa xmm8, xmm5) - a2(movdqa xmm9, xmm4) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm0, xmm8) - a2(pxor xmm1, xmm9) - a2(movdqa xmm10, xmm2) - a2(movdqa xmm11, xmm3) - a2(movdqa xmm2, xmm6) - a2(movdqa xmm3, xmm7) - a3(palignr xmm2, xmm7, 8) - a3(palignr xmm3, xmm6, 8) - a2(movdqa xmm6, xmm11) - a2(movdqa xmm7, xmm10) - a3(palignr xmm6, xmm10, 8) - a3(palignr xmm7, xmm11, 8) - aj(ja scrypt_salsa64_ssse3_loop) - a2(paddq xmm0,[rsp+0]) - a2(paddq xmm1,[rsp+16]) - a2(paddq xmm2,[rsp+32]) - a2(paddq xmm3,[rsp+48]) - a2(paddq xmm4,[rsp+64]) - a2(paddq xmm5,[rsp+80]) - a2(paddq xmm6,[rsp+96]) - a2(paddq 
xmm7,[rsp+112]) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0xff) - a2(add r9,128) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(movdqa [rax+0],xmm0) - a2(movdqa [rax+16],xmm1) - a2(movdqa [rax+32],xmm2) - a2(movdqa [rax+48],xmm3) - a2(movdqa [rax+64],xmm4) - a2(movdqa [rax+80],xmm5) - a2(movdqa [rax+96],xmm6) - a2(movdqa [rax+112],xmm7) - aj(jne scrypt_ChunkMix_ssse3_loop) - a2(mov rsp, rbp) - a1(pop rbp) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_ssse3) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) - -#define SCRYPT_SALSA64_SSSE3 - -static void asm_calling_convention -scrypt_ChunkMix_ssse3(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - x4 = xmmp[4]; - x5 = xmmp[5]; - x6 = xmmp[6]; - x7 = xmmp[7]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - - if (Bxor) { - xmmp = (xmmi 
*)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - t4 = x4; - t5 = x5; - t6 = x6; - t7 = x7; - - for (rounds = 8; rounds; rounds -= 2) { - z0 = _mm_add_epi64(x0, x2); - z1 = _mm_add_epi64(x1, x3); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x6 = _mm_xor_si128(x6, z0); - x7 = _mm_xor_si128(x7, z1); - - z0 = _mm_add_epi64(x6, x0); - z1 = _mm_add_epi64(x7, x1); - z2 = _mm_srli_epi64(z0, 64-13); - z3 = _mm_srli_epi64(z1, 64-13); - z0 = _mm_slli_epi64(z0, 13); - z1 = _mm_slli_epi64(z1, 13); - x4 = _mm_xor_si128(x4, z2); - x5 = _mm_xor_si128(x5, z3); - x4 = _mm_xor_si128(x4, z0); - x5 = _mm_xor_si128(x5, z1); - - z0 = _mm_add_epi64(x4, x6); - z1 = _mm_add_epi64(x5, x7); - z2 = _mm_srli_epi64(z0, 64-39); - z3 = _mm_srli_epi64(z1, 64-39); - z0 = _mm_slli_epi64(z0, 39); - z1 = _mm_slli_epi64(z1, 39); - x2 = _mm_xor_si128(x2, z2); - x3 = _mm_xor_si128(x3, z3); - x2 = _mm_xor_si128(x2, z0); - x3 = _mm_xor_si128(x3, z1); - - z0 = _mm_add_epi64(x2, x4); - z1 = _mm_add_epi64(x3, x5); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x0 = _mm_xor_si128(x0, z0); - x1 = _mm_xor_si128(x1, z1); - - z0 = x2; - z1 = x3; - x2 = _mm_alignr_epi8(x6, x7, 8); - x3 = _mm_alignr_epi8(x7, x6, 8); - x6 = _mm_alignr_epi8(z1, z0, 8); - x7 = _mm_alignr_epi8(z0, z1, 8); - - z0 = _mm_add_epi64(x0, x2); - z1 = _mm_add_epi64(x1, x3); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x6 = _mm_xor_si128(x6, z0); - x7 = _mm_xor_si128(x7, z1); - - z0 = _mm_add_epi64(x6, x0); - z1 = _mm_add_epi64(x7, x1); - z2 = _mm_srli_epi64(z0, 
64-13); - z3 = _mm_srli_epi64(z1, 64-13); - z0 = _mm_slli_epi64(z0, 13); - z1 = _mm_slli_epi64(z1, 13); - x5 = _mm_xor_si128(x5, z2); - x4 = _mm_xor_si128(x4, z3); - x5 = _mm_xor_si128(x5, z0); - x4 = _mm_xor_si128(x4, z1); - - z0 = _mm_add_epi64(x5, x6); - z1 = _mm_add_epi64(x4, x7); - z2 = _mm_srli_epi64(z0, 64-39); - z3 = _mm_srli_epi64(z1, 64-39); - z0 = _mm_slli_epi64(z0, 39); - z1 = _mm_slli_epi64(z1, 39); - x2 = _mm_xor_si128(x2, z2); - x3 = _mm_xor_si128(x3, z3); - x2 = _mm_xor_si128(x2, z0); - x3 = _mm_xor_si128(x3, z1); - - z0 = _mm_add_epi64(x2, x5); - z1 = _mm_add_epi64(x3, x4); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x0 = _mm_xor_si128(x0, z0); - x1 = _mm_xor_si128(x1, z1); - - z0 = x2; - z1 = x3; - x2 = _mm_alignr_epi8(x6, x7, 8); - x3 = _mm_alignr_epi8(x7, x6, 8); - x6 = _mm_alignr_epi8(z1, z0, 8); - x7 = _mm_alignr_epi8(z0, z1, 8); - } - - x0 = _mm_add_epi64(x0, t0); - x1 = _mm_add_epi64(x1, t1); - x2 = _mm_add_epi64(x2, t2); - x3 = _mm_add_epi64(x3, t3); - x4 = _mm_add_epi64(x4, t4); - x5 = _mm_add_epi64(x5, t5); - x6 = _mm_add_epi64(x6, t6); - x7 = _mm_add_epi64(x7, t7); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - xmmp[4] = x4; - xmmp[5] = x5; - xmmp[6] = x6; - xmmp[7] = x7; - } -} - -#endif - -#if defined(SCRYPT_SALSA64_SSSE3) - /* uses salsa64_core_tangle_sse2 */ - - #undef SCRYPT_MIX - #define SCRYPT_MIX "Salsa64/8-SSSE3" - #undef SCRYPT_SALSA64_INCLUDED - #define SCRYPT_SALSA64_INCLUDED -#endif diff --git a/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-xop.h b/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-xop.h deleted file mode 100644 index 9485247..0000000 --- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-xop.h +++ /dev/null @@ -1,335 +0,0 @@ -/* x64 */ -#if defined(X86_64ASM_XOP) && 
(!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) - -#define SCRYPT_SALSA64_XOP - -asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_xop) - a1(push rbp) - a2(mov rbp, rsp) - a2(and rsp, ~63) - a2(sub rsp, 128) - a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ - a2(shl rcx,7) - a2(lea r9,[rcx-128]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(vmovdqa xmm0,[rax+0]) - a2(vmovdqa xmm1,[rax+16]) - a2(vmovdqa xmm2,[rax+32]) - a2(vmovdqa xmm3,[rax+48]) - a2(vmovdqa xmm4,[rax+64]) - a2(vmovdqa xmm5,[rax+80]) - a2(vmovdqa xmm6,[rax+96]) - a2(vmovdqa xmm7,[rax+112]) - aj(jz scrypt_ChunkMix_xop_no_xor1) - a3(vpxor xmm0,xmm0,[r9+0]) - a3(vpxor xmm1,xmm1,[r9+16]) - a3(vpxor xmm2,xmm2,[r9+32]) - a3(vpxor xmm3,xmm3,[r9+48]) - a3(vpxor xmm4,xmm4,[r9+64]) - a3(vpxor xmm5,xmm5,[r9+80]) - a3(vpxor xmm6,xmm6,[r9+96]) - a3(vpxor xmm7,xmm7,[r9+112]) - a1(scrypt_ChunkMix_xop_no_xor1:) - a2(xor r9,r9) - a2(xor r8,r8) - a1(scrypt_ChunkMix_xop_loop:) - a2(and rdx, rdx) - a3(vpxor xmm0,xmm0,[rsi+r9+0]) - a3(vpxor xmm1,xmm1,[rsi+r9+16]) - a3(vpxor xmm2,xmm2,[rsi+r9+32]) - a3(vpxor xmm3,xmm3,[rsi+r9+48]) - a3(vpxor xmm4,xmm4,[rsi+r9+64]) - a3(vpxor xmm5,xmm5,[rsi+r9+80]) - a3(vpxor xmm6,xmm6,[rsi+r9+96]) - a3(vpxor xmm7,xmm7,[rsi+r9+112]) - aj(jz scrypt_ChunkMix_xop_no_xor2) - a3(vpxor xmm0,xmm0,[rdx+r9+0]) - a3(vpxor xmm1,xmm1,[rdx+r9+16]) - a3(vpxor xmm2,xmm2,[rdx+r9+32]) - a3(vpxor xmm3,xmm3,[rdx+r9+48]) - a3(vpxor xmm4,xmm4,[rdx+r9+64]) - a3(vpxor xmm5,xmm5,[rdx+r9+80]) - a3(vpxor xmm6,xmm6,[rdx+r9+96]) - a3(vpxor xmm7,xmm7,[rdx+r9+112]) - a1(scrypt_ChunkMix_xop_no_xor2:) - a2(vmovdqa [rsp+0],xmm0) - a2(vmovdqa [rsp+16],xmm1) - a2(vmovdqa [rsp+32],xmm2) - a2(vmovdqa [rsp+48],xmm3) - a2(vmovdqa [rsp+64],xmm4) - a2(vmovdqa 
[rsp+80],xmm5) - a2(vmovdqa [rsp+96],xmm6) - a2(vmovdqa [rsp+112],xmm7) - a2(mov rax,8) - a1(scrypt_salsa64_xop_loop: ) - a3(vpaddq xmm8, xmm0, xmm2) - a3(vpaddq xmm9, xmm1, xmm3) - a3(vpshufd xmm8, xmm8, 0xb1) - a3(vpshufd xmm9, xmm9, 0xb1) - a3(vpxor xmm6, xmm6, xmm8) - a3(vpxor xmm7, xmm7, xmm9) - a3(vpaddq xmm10, xmm0, xmm6) - a3(vpaddq xmm11, xmm1, xmm7) - a3(vprotq xmm10, xmm10, 13) - a3(vprotq xmm11, xmm11, 13) - a3(vpxor xmm4, xmm4, xmm10) - a3(vpxor xmm5, xmm5, xmm11) - a3(vpaddq xmm8, xmm6, xmm4) - a3(vpaddq xmm9, xmm7, xmm5) - a3(vprotq xmm8, xmm8, 39) - a3(vprotq xmm9, xmm9, 39) - a3(vpxor xmm2, xmm2, xmm8) - a3(vpxor xmm3, xmm3, xmm9) - a3(vpaddq xmm10, xmm4, xmm2) - a3(vpaddq xmm11, xmm5, xmm3) - a3(vpshufd xmm10, xmm10, 0xb1) - a3(vpshufd xmm11, xmm11, 0xb1) - a3(vpxor xmm0, xmm0, xmm10) - a3(vpxor xmm1, xmm1, xmm11) - a2(vmovdqa xmm8, xmm2) - a2(vmovdqa xmm9, xmm3) - a4(vpalignr xmm2, xmm6, xmm7, 8) - a4(vpalignr xmm3, xmm7, xmm6, 8) - a4(vpalignr xmm6, xmm9, xmm8, 8) - a4(vpalignr xmm7, xmm8, xmm9, 8) - a3(vpaddq xmm10, xmm0, xmm2) - a3(vpaddq xmm11, xmm1, xmm3) - a3(vpshufd xmm10, xmm10, 0xb1) - a3(vpshufd xmm11, xmm11, 0xb1) - a3(vpxor xmm6, xmm6, xmm10) - a3(vpxor xmm7, xmm7, xmm11) - a3(vpaddq xmm8, xmm0, xmm6) - a3(vpaddq xmm9, xmm1, xmm7) - a3(vprotq xmm8, xmm8, 13) - a3(vprotq xmm9, xmm9, 13) - a3(vpxor xmm5, xmm5, xmm8) - a3(vpxor xmm4, xmm4, xmm9) - a3(vpaddq xmm10, xmm6, xmm5) - a3(vpaddq xmm11, xmm7, xmm4) - a3(vprotq xmm10, xmm10, 39) - a3(vprotq xmm11, xmm11, 39) - a3(vpxor xmm2, xmm2, xmm10) - a3(vpxor xmm3, xmm3, xmm11) - a3(vpaddq xmm8, xmm5, xmm2) - a3(vpaddq xmm9, xmm4, xmm3) - a3(vpshufd xmm8, xmm8, 0xb1) - a3(vpshufd xmm9, xmm9, 0xb1) - a3(vpxor xmm0, xmm0, xmm8) - a3(vpxor xmm1, xmm1, xmm9) - a2(vmovdqa xmm10, xmm2) - a2(vmovdqa xmm11, xmm3) - a4(vpalignr xmm2, xmm6, xmm7, 8) - a4(vpalignr xmm3, xmm7, xmm6, 8) - a4(vpalignr xmm6, xmm11, xmm10, 8) - a4(vpalignr xmm7, xmm10, xmm11, 8) - a2(sub rax, 2) - aj(ja 
scrypt_salsa64_xop_loop) - a3(vpaddq xmm0,xmm0,[rsp+0]) - a3(vpaddq xmm1,xmm1,[rsp+16]) - a3(vpaddq xmm2,xmm2,[rsp+32]) - a3(vpaddq xmm3,xmm3,[rsp+48]) - a3(vpaddq xmm4,xmm4,[rsp+64]) - a3(vpaddq xmm5,xmm5,[rsp+80]) - a3(vpaddq xmm6,xmm6,[rsp+96]) - a3(vpaddq xmm7,xmm7,[rsp+112]) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0xff) - a2(add r9,128) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(vmovdqa [rax+0],xmm0) - a2(vmovdqa [rax+16],xmm1) - a2(vmovdqa [rax+32],xmm2) - a2(vmovdqa [rax+48],xmm3) - a2(vmovdqa [rax+64],xmm4) - a2(vmovdqa [rax+80],xmm5) - a2(vmovdqa [rax+96],xmm6) - a2(vmovdqa [rax+112],xmm7) - aj(jne scrypt_ChunkMix_xop_loop) - a2(mov rsp, rbp) - a1(pop rbp) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_xop) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) - -#define SCRYPT_SALSA64_XOP - -static void asm_calling_convention -scrypt_ChunkMix_xop(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - x4 = xmmp[4]; - x5 = xmmp[5]; - x6 = xmmp[6]; - x7 = xmmp[7]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = 
_mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - t4 = x4; - t5 = x5; - t6 = x6; - t7 = x7; - - for (rounds = 8; rounds; rounds -= 2) { - z0 = _mm_add_epi64(x0, x2); - z1 = _mm_add_epi64(x1, x3); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x6 = _mm_xor_si128(x6, z0); - x7 = _mm_xor_si128(x7, z1); - - z0 = _mm_add_epi64(x6, x0); - z1 = _mm_add_epi64(x7, x1); - z0 = _mm_roti_epi64(z0, 13); - z1 = _mm_roti_epi64(z1, 13); - x4 = _mm_xor_si128(x4, z0); - x5 = _mm_xor_si128(x5, z1); - - z0 = _mm_add_epi64(x4, x6); - z1 = _mm_add_epi64(x5, x7); - z0 = _mm_roti_epi64(z0, 39); - z1 = _mm_roti_epi64(z1, 39); - x2 = _mm_xor_si128(x2, z0); - x3 = _mm_xor_si128(x3, z1); - - z0 = _mm_add_epi64(x2, x4); - z1 = _mm_add_epi64(x3, x5); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x0 = _mm_xor_si128(x0, z0); - x1 = _mm_xor_si128(x1, z1); - - z0 = x2; - z1 = x3; - x2 = _mm_alignr_epi8(x6, x7, 8); - x3 = _mm_alignr_epi8(x7, x6, 8); - x6 = _mm_alignr_epi8(z1, z0, 8); - x7 = _mm_alignr_epi8(z0, z1, 8); - - z0 = _mm_add_epi64(x0, x2); - z1 = _mm_add_epi64(x1, x3); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x6 = _mm_xor_si128(x6, z0); - x7 = _mm_xor_si128(x7, z1); - - z0 = _mm_add_epi64(x6, x0); - z1 = _mm_add_epi64(x7, x1); - z0 = 
_mm_roti_epi64(z0, 13); - z1 = _mm_roti_epi64(z1, 13); - x5 = _mm_xor_si128(x5, z0); - x4 = _mm_xor_si128(x4, z1); - - z0 = _mm_add_epi64(x5, x6); - z1 = _mm_add_epi64(x4, x7); - z0 = _mm_roti_epi64(z0, 39); - z1 = _mm_roti_epi64(z1, 39); - x2 = _mm_xor_si128(x2, z0); - x3 = _mm_xor_si128(x3, z1); - - z0 = _mm_add_epi64(x2, x5); - z1 = _mm_add_epi64(x3, x4); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x0 = _mm_xor_si128(x0, z0); - x1 = _mm_xor_si128(x1, z1); - - z0 = x2; - z1 = x3; - x2 = _mm_alignr_epi8(x6, x7, 8); - x3 = _mm_alignr_epi8(x7, x6, 8); - x6 = _mm_alignr_epi8(z1, z0, 8); - x7 = _mm_alignr_epi8(z0, z1, 8); - } - - x0 = _mm_add_epi64(x0, t0); - x1 = _mm_add_epi64(x1, t1); - x2 = _mm_add_epi64(x2, t2); - x3 = _mm_add_epi64(x3, t3); - x4 = _mm_add_epi64(x4, t4); - x5 = _mm_add_epi64(x5, t5); - x6 = _mm_add_epi64(x6, t6); - x7 = _mm_add_epi64(x7, t7); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - xmmp[4] = x4; - xmmp[5] = x5; - xmmp[6] = x6; - xmmp[7] = x7; - } -} - -#endif - -#if defined(SCRYPT_SALSA64_XOP) - /* uses salsa64_core_tangle_sse2 */ - - #undef SCRYPT_MIX - #define SCRYPT_MIX "Salsa64/8-XOP" - #undef SCRYPT_SALSA64_INCLUDED - #define SCRYPT_SALSA64_INCLUDED -#endif diff --git a/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64.h b/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64.h deleted file mode 100644 index 2aec04f..0000000 --- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64.h +++ /dev/null @@ -1,41 +0,0 @@ -#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED) - -#undef SCRYPT_MIX -#define SCRYPT_MIX "Salsa64/8 Ref" - -#undef SCRYPT_SALSA64_INCLUDED -#define SCRYPT_SALSA64_INCLUDED -#define SCRYPT_SALSA64_BASIC - -static void -salsa64_core_basic(uint64_t state[16]) { - const size_t rounds = 8; - 
uint64_t v[16], t; - size_t i; - - for (i = 0; i < 16; i++) v[i] = state[i]; - - #define G(a,b,c,d) \ - t = v[a]+v[d]; t = ROTL64(t, 32); v[b] ^= t; \ - t = v[b]+v[a]; t = ROTL64(t, 13); v[c] ^= t; \ - t = v[c]+v[b]; t = ROTL64(t, 39); v[d] ^= t; \ - t = v[d]+v[c]; t = ROTL64(t, 32); v[a] ^= t; \ - - for (i = 0; i < rounds; i += 2) { - G( 0, 4, 8,12); - G( 5, 9,13, 1); - G(10,14, 2, 6); - G(15, 3, 7,11); - G( 0, 1, 2, 3); - G( 5, 6, 7, 4); - G(10,11, 8, 9); - G(15,12,13,14); - } - - for (i = 0; i < 16; i++) state[i] += v[i]; - - #undef G -} - -#endif - diff --git a/algo/argon2/argon2a/ar2/sj/scrypt-jane-pbkdf2.h b/algo/argon2/argon2a/ar2/sj/scrypt-jane-pbkdf2.h deleted file mode 100644 index ddd8742..0000000 --- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-pbkdf2.h +++ /dev/null @@ -1,112 +0,0 @@ -typedef struct scrypt_hmac_state_t { - scrypt_hash_state inner, outer; -} scrypt_hmac_state; - - -static void -scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) { - scrypt_hash_state st; - scrypt_hash_init(&st); - scrypt_hash_update(&st, m, mlen); - scrypt_hash_finish(&st, hash); -} - -/* hmac */ -static void -scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) { - uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0}; - size_t i; - - scrypt_hash_init(&st->inner); - scrypt_hash_init(&st->outer); - - if (keylen <= SCRYPT_HASH_BLOCK_SIZE) { - /* use the key directly if it's <= blocksize bytes */ - memcpy(pad, key, keylen); - } else { - /* if it's > blocksize bytes, hash it */ - scrypt_hash(pad, key, keylen); - } - - /* inner = (key ^ 0x36) */ - /* h(inner || ...) */ - for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) - pad[i] ^= 0x36; - scrypt_hash_update(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE); - - /* outer = (key ^ 0x5c) */ - /* h(outer || ...) 
*/ - for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) - pad[i] ^= (0x5c ^ 0x36); - scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE); - - scrypt_ensure_zero(pad, sizeof(pad)); -} - -static void -scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) { - /* h(inner || m...) */ - scrypt_hash_update(&st->inner, m, mlen); -} - -static void -scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) { - /* h(inner || m) */ - scrypt_hash_digest innerhash; - scrypt_hash_finish(&st->inner, innerhash); - - /* h(outer || h(inner || m)) */ - scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash)); - scrypt_hash_finish(&st->outer, mac); - - scrypt_ensure_zero(st, sizeof(*st)); -} - -static void -scrypt_pbkdf2(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *out, size_t bytes) { - scrypt_hmac_state hmac_pw, hmac_pw_salt, work; - scrypt_hash_digest ti, u; - uint8_t be[4]; - uint32_t i, j, blocks; - uint64_t c; - - /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ - - /* hmac(password, ...) */ - scrypt_hmac_init(&hmac_pw, password, password_len); - - /* hmac(password, salt...) */ - hmac_pw_salt = hmac_pw; - scrypt_hmac_update(&hmac_pw_salt, salt, salt_len); - - blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; - for (i = 1; i <= blocks; i++) { - /* U1 = hmac(password, salt || be(i)) */ - U32TO8_BE(be, i); - work = hmac_pw_salt; - scrypt_hmac_update(&work, be, 4); - scrypt_hmac_finish(&work, ti); - memcpy(u, ti, sizeof(u)); - - /* T[i] = U1 ^ U2 ^ U3... */ - for (c = 0; c < N - 1; c++) { - /* UX = hmac(password, U{X-1}) */ - work = hmac_pw; - scrypt_hmac_update(&work, u, SCRYPT_HASH_DIGEST_SIZE); - scrypt_hmac_finish(&work, u); - - /* T[i] ^= UX */ - for (j = 0; j < sizeof(u); j++) - ti[j] ^= u[j]; - } - - memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? 
SCRYPT_HASH_DIGEST_SIZE : bytes); - out += SCRYPT_HASH_DIGEST_SIZE; - bytes -= SCRYPT_HASH_DIGEST_SIZE; - } - - scrypt_ensure_zero(ti, sizeof(ti)); - scrypt_ensure_zero(u, sizeof(u)); - scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw)); - scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt)); -} diff --git a/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable-x86.h b/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable-x86.h deleted file mode 100644 index fb45794..0000000 --- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable-x86.h +++ /dev/null @@ -1,463 +0,0 @@ -#if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC)) - #define X86ASM - - /* gcc 2.95 royally screws up stack alignments on variables */ - #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS6PP)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000))) - #define X86ASM_SSE - #define X86ASM_SSE2 - #endif - #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2005)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102))) - #define X86ASM_SSSE3 - #endif - #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40400))) - #define X86ASM_AVX - #define X86ASM_XOP - #endif - #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2012)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40700))) - #define X86ASM_AVX2 - #endif -#endif - -#if defined(CPU_X86_64) && defined(COMPILER_GCC) - #define X86_64ASM - #define X86_64ASM_SSE2 - #if (COMPILER_GCC >= 40102) - #define X86_64ASM_SSSE3 - #endif - #if (COMPILER_GCC >= 40400) - #define X86_64ASM_AVX - #define X86_64ASM_XOP - #endif - #if (COMPILER_GCC >= 40700) - #define X86_64ASM_AVX2 - #endif -#endif - -#if defined(COMPILER_MSVC) && (defined(CPU_X86_FORCE_INTRINSICS) || defined(CPU_X86_64)) - #define X86_INTRINSIC - #if defined(CPU_X86_64) || defined(X86ASM_SSE) - #define X86_INTRINSIC_SSE - #endif - #if defined(CPU_X86_64) || defined(X86ASM_SSE2) - #define 
X86_INTRINSIC_SSE2 - #endif - #if (COMPILER_MSVC >= COMPILER_MSVC_VS2005) - #define X86_INTRINSIC_SSSE3 - #endif - #if (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1) - #define X86_INTRINSIC_AVX - #define X86_INTRINSIC_XOP - #endif - #if (COMPILER_MSVC >= COMPILER_MSVC_VS2012) - #define X86_INTRINSIC_AVX2 - #endif -#endif - -#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS) - #define X86_INTRINSIC - #if defined(__SSE__) - #define X86_INTRINSIC_SSE - #endif - #if defined(__SSE2__) - #define X86_INTRINSIC_SSE2 - #endif - #if defined(__SSSE3__) - #define X86_INTRINSIC_SSSE3 - #endif - #if defined(__AVX__) - #define X86_INTRINSIC_AVX - #endif - #if defined(__XOP__) - #define X86_INTRINSIC_XOP - #endif - #if defined(__AVX2__) - #define X86_INTRINSIC_AVX2 - #endif -#endif - -/* only use simd on windows (or SSE2 on gcc)! */ -#if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC) - #if defined(X86_INTRINSIC_SSE) - #include - #include - typedef __m64 qmm; - typedef __m128 xmm; - typedef __m128d xmmd; - #endif - #if defined(X86_INTRINSIC_SSE2) - #include - typedef __m128i xmmi; - #endif - #if defined(X86_INTRINSIC_SSSE3) - #include - #endif - #if defined(X86_INTRINSIC_AVX) - #include - #endif - #if defined(X86_INTRINSIC_XOP) - #if defined(COMPILER_MSVC) - #include - #else - #include - #endif - #endif - #if defined(X86_INTRINSIC_AVX2) - typedef __m256i ymmi; - #endif -#endif - -#if defined(X86_INTRINSIC_SSE2) - typedef union packedelem8_t { - uint8_t u[16]; - xmmi v; - } packedelem8; - - typedef union packedelem32_t { - uint32_t u[4]; - xmmi v; - } packedelem32; - - typedef union packedelem64_t { - uint64_t u[2]; - xmmi v; - } packedelem64; -#else - typedef union packedelem8_t { - uint8_t u[16]; - uint32_t dw[4]; - } packedelem8; - - typedef union packedelem32_t { - uint32_t u[4]; - uint8_t b[16]; - } packedelem32; - - typedef union packedelem64_t { - uint64_t u[2]; - uint8_t b[16]; - } packedelem64; -#endif - -#if defined(X86_INTRINSIC_SSSE3) - static const 
packedelem8 ALIGN(16) ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}}; - static const packedelem8 ALIGN(16) ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}}; -#endif - -/* - x86 inline asm for gcc/msvc. usage: - - asm_naked_fn_proto(return_type, name) (type parm1, type parm2..) - asm_naked_fn(name) - a1(..) - a2(.., ..) - a3(.., .., ..) - 64bit OR 0 paramters: a1(ret) - 32bit AND n parameters: aret(4n), eg aret(16) for 4 parameters - asm_naked_fn_end(name) -*/ - -#if defined(X86ASM) || defined(X86_64ASM) - -#if defined(COMPILER_MSVC) - #pragma warning(disable : 4731) /* frame pointer modified by inline assembly */ - #define a1(x) __asm {x} - #define a2(x, y) __asm {x, y} - #define a3(x, y, z) __asm {x, y, z} - #define a4(x, y, z, w) __asm {x, y, z, w} - #define aj(x) __asm {x} - #define asm_align8 a1(ALIGN 8) - #define asm_align16 a1(ALIGN 16) - - #define asm_calling_convention STDCALL - #define aret(n) a1(ret n) - #define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn - #define asm_naked_fn(fn) { - #define asm_naked_fn_end(fn) } -#elif defined(COMPILER_GCC) - #define GNU_AS1(x) #x ";\n" - #define GNU_AS2(x, y) #x ", " #y ";\n" - #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n" - #define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n" - #define GNU_ASFN(x) "\n_" #x ":\n" #x ":\n" - #define GNU_ASJ(x) ".att_syntax prefix\n" #x "\n.intel_syntax noprefix\n" - - #define a1(x) GNU_AS1(x) - #define a2(x, y) GNU_AS2(x, y) - #define a3(x, y, z) GNU_AS3(x, y, z) - #define a4(x, y, z, w) GNU_AS4(x, y, z, w) - #define aj(x) GNU_ASJ(x) - #define asm_align8 ".p2align 3,,7" - #define asm_align16 ".p2align 4,,15" - - #if defined(OS_WINDOWS) - #define asm_calling_convention CDECL - #define aret(n) a1(ret) - - #if defined(X86_64ASM) - #define asm_naked_fn(fn) ; __asm__ ( \ - ".text\n" \ - asm_align16 GNU_ASFN(fn) \ - "subq $136, %rsp;" \ - "movdqa %xmm6, 0(%rsp);" \ - "movdqa %xmm7, 16(%rsp);" \ - "movdqa %xmm8, 32(%rsp);" 
\ - "movdqa %xmm9, 48(%rsp);" \ - "movdqa %xmm10, 64(%rsp);" \ - "movdqa %xmm11, 80(%rsp);" \ - "movdqa %xmm12, 96(%rsp);" \ - "movq %rdi, 112(%rsp);" \ - "movq %rsi, 120(%rsp);" \ - "movq %rcx, %rdi;" \ - "movq %rdx, %rsi;" \ - "movq %r8, %rdx;" \ - "movq %r9, %rcx;" \ - "call 1f;" \ - "movdqa 0(%rsp), %xmm6;" \ - "movdqa 16(%rsp), %xmm7;" \ - "movdqa 32(%rsp), %xmm8;" \ - "movdqa 48(%rsp), %xmm9;" \ - "movdqa 64(%rsp), %xmm10;" \ - "movdqa 80(%rsp), %xmm11;" \ - "movdqa 96(%rsp), %xmm12;" \ - "movq 112(%rsp), %rdi;" \ - "movq 120(%rsp), %rsi;" \ - "addq $136, %rsp;" \ - "ret;" \ - ".intel_syntax noprefix;" \ - ".p2align 4,,15;" \ - "1:;" - #else - #define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn) - #endif - #else - #define asm_calling_convention STDCALL - #define aret(n) a1(ret n) - #define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn) - #endif - - #define asm_naked_fn_proto(type, fn) extern type asm_calling_convention fn - #define asm_naked_fn_end(fn) ".att_syntax prefix;\n" ); - - #define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n" - #define asm_gcc_parms() ".att_syntax prefix;" - #define asm_gcc_trashed() __asm__ __volatile__("" ::: - #define asm_gcc_end() ); -#else - need x86 asm -#endif - -#endif /* X86ASM || X86_64ASM */ - - -#if defined(CPU_X86) || defined(CPU_X86_64) - -typedef enum cpu_flags_x86_t { - cpu_mmx = 1 << 0, - cpu_sse = 1 << 1, - cpu_sse2 = 1 << 2, - cpu_sse3 = 1 << 3, - cpu_ssse3 = 1 << 4, - cpu_sse4_1 = 1 << 5, - cpu_sse4_2 = 1 << 6, - cpu_avx = 1 << 7, - cpu_xop = 1 << 8, - cpu_avx2 = 1 << 9 -} cpu_flags_x86; - -typedef enum cpu_vendors_x86_t { - cpu_nobody, - cpu_intel, - cpu_amd -} cpu_vendors_x86; - -typedef struct x86_regs_t { - uint32_t eax, ebx, ecx, edx; -} x86_regs; - -#if defined(X86ASM) -asm_naked_fn_proto(int, has_cpuid)(void) -asm_naked_fn(has_cpuid) - a1(pushfd) - a1(pop eax) - a2(mov ecx, eax) - a2(xor eax, 0x200000) - 
a1(push eax) - a1(popfd) - a1(pushfd) - a1(pop eax) - a2(xor eax, ecx) - a2(shr eax, 21) - a2(and eax, 1) - a1(push ecx) - a1(popfd) - a1(ret) -asm_naked_fn_end(has_cpuid) -#endif /* X86ASM */ - - -static void NOINLINE -get_cpuid(x86_regs *regs, uint32_t flags) { -#if defined(COMPILER_MSVC) - __cpuid((int *)regs, (int)flags); -#else - #if defined(CPU_X86_64) - #define cpuid_bx rbx - #else - #define cpuid_bx ebx - #endif - - asm_gcc() - a1(push cpuid_bx) - a2(xor ecx, ecx) - a1(cpuid) - a2(mov [%1 + 0], eax) - a2(mov [%1 + 4], ebx) - a2(mov [%1 + 8], ecx) - a2(mov [%1 + 12], edx) - a1(pop cpuid_bx) - asm_gcc_parms() : "+a"(flags) : "S"(regs) : "%ecx", "%edx", "cc" - asm_gcc_end() -#endif -} - -#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) -static uint64_t NOINLINE -get_xgetbv(uint32_t flags) { -#if defined(COMPILER_MSVC) - return _xgetbv(flags); -#else - uint32_t lo, hi; - asm_gcc() - a1(xgetbv) - asm_gcc_parms() : "+c"(flags), "=a" (lo), "=d" (hi) - asm_gcc_end() - return ((uint64_t)lo | ((uint64_t)hi << 32)); -#endif -} -#endif // AVX support - -#if defined(SCRYPT_TEST_SPEED) -size_t cpu_detect_mask = (size_t)-1; -#endif - -static size_t -detect_cpu(void) { - //union { uint8_t s[12]; uint32_t i[3]; } vendor_string; - //cpu_vendors_x86 vendor = cpu_nobody; - x86_regs regs; - uint32_t max_level, max_ext_level; - size_t cpu_flags = 0; -#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) - uint64_t xgetbv_flags; -#endif - -#if defined(CPU_X86) - if (!has_cpuid()) - return cpu_flags; -#endif - - get_cpuid(®s, 0); - max_level = regs.eax; -#if 0 - vendor_string.i[0] = regs.ebx; - vendor_string.i[1] = regs.edx; - vendor_string.i[2] = regs.ecx; - - if (scrypt_verify(vendor_string.s, (const uint8_t *)"GenuineIntel", 12)) - vendor = cpu_intel; - else if (scrypt_verify(vendor_string.s, (const uint8_t *)"AuthenticAMD", 12)) - vendor = cpu_amd; -#endif - if (max_level & 0x00000500) { - /* "Intel P5 pre-B0" */ - cpu_flags |= cpu_mmx; - return cpu_flags; - } - - if (max_level < 
1) - return cpu_flags; - - get_cpuid(®s, 1); -#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) - /* xsave/xrestore */ - if (regs.ecx & (1 << 27)) { - xgetbv_flags = get_xgetbv(0); - if ((regs.ecx & (1 << 28)) && (xgetbv_flags & 0x6)) cpu_flags |= cpu_avx; - } -#endif - if (regs.ecx & (1 << 20)) cpu_flags |= cpu_sse4_2; - if (regs.ecx & (1 << 19)) cpu_flags |= cpu_sse4_2; - if (regs.ecx & (1 << 9)) cpu_flags |= cpu_ssse3; - if (regs.ecx & (1 )) cpu_flags |= cpu_sse3; - if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2; - if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse; - if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx; - - if (cpu_flags & cpu_avx) { - if (max_level >= 7) { - get_cpuid(®s, 7); - if (regs.ebx & (1 << 5)) cpu_flags |= cpu_avx2; - } - - get_cpuid(®s, 0x80000000); - max_ext_level = regs.eax; - if (max_ext_level >= 0x80000001) { - get_cpuid(®s, 0x80000001); - if (regs.ecx & (1 << 11)) cpu_flags |= cpu_xop; - } - } - - -#if defined(SCRYPT_TEST_SPEED) - cpu_flags &= cpu_detect_mask; -#endif - - return cpu_flags; -} - -#if defined(SCRYPT_TEST_SPEED) -static const char * -get_top_cpuflag_desc(size_t flag) { - if (flag & cpu_avx2) return "AVX2"; - else if (flag & cpu_xop) return "XOP"; - else if (flag & cpu_avx) return "AVX"; - else if (flag & cpu_sse4_2) return "SSE4.2"; - else if (flag & cpu_sse4_1) return "SSE4.1"; - else if (flag & cpu_ssse3) return "SSSE3"; - else if (flag & cpu_sse2) return "SSE2"; - else if (flag & cpu_sse) return "SSE"; - else if (flag & cpu_mmx) return "MMX"; - else return "Basic"; -} -#endif - -/* enable the highest system-wide option */ -#if defined(SCRYPT_CHOOSE_COMPILETIME) - #if !defined(__AVX2__) - #undef X86_64ASM_AVX2 - #undef X86ASM_AVX2 - #undef X86_INTRINSIC_AVX2 - #endif - #if !defined(__XOP__) - #undef X86_64ASM_XOP - #undef X86ASM_XOP - #undef X86_INTRINSIC_XOP - #endif - #if !defined(__AVX__) - #undef X86_64ASM_AVX - #undef X86ASM_AVX - #undef X86_INTRINSIC_AVX - #endif - #if !defined(__SSSE3__) - #undef X86_64ASM_SSSE3 - 
#undef X86ASM_SSSE3 - #undef X86_INTRINSIC_SSSE3 - #endif - #if !defined(__SSE2__) - #undef X86_64ASM_SSE2 - #undef X86ASM_SSE2 - #undef X86_INTRINSIC_SSE2 - #endif -#endif - -#endif /* defined(CPU_X86) || defined(CPU_X86_64) */ \ No newline at end of file diff --git a/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable.h b/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable.h deleted file mode 100644 index 9baa55e..0000000 --- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable.h +++ /dev/null @@ -1,310 +0,0 @@ -/* determine os */ -#if defined(_WIN32) || defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__) - #include - #include - #define OS_WINDOWS -#elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__) - #include - #include - #include - - #define OS_SOLARIS -#else - #include - #include - #include /* need this to define BSD */ - #include - #include - - #define OS_NIX - #if defined(__linux__) - #include - #define OS_LINUX - #elif defined(BSD) - #define OS_BSD - - #if defined(MACOS_X) || (defined(__APPLE__) & defined(__MACH__)) - #define OS_OSX - #elif defined(macintosh) || defined(Macintosh) - #define OS_MAC - #elif defined(__OpenBSD__) - #define OS_OPENBSD - #endif - #endif -#endif - - -/* determine compiler */ -#if defined(_MSC_VER) - #define COMPILER_MSVC_VS6 120000000 - #define COMPILER_MSVC_VS6PP 121000000 - #define COMPILER_MSVC_VS2002 130000000 - #define COMPILER_MSVC_VS2003 131000000 - #define COMPILER_MSVC_VS2005 140050727 - #define COMPILER_MSVC_VS2008 150000000 - #define COMPILER_MSVC_VS2008SP1 150030729 - #define COMPILER_MSVC_VS2010 160000000 - #define COMPILER_MSVC_VS2010SP1 160040219 - #define COMPILER_MSVC_VS2012RC 170000000 - #define COMPILER_MSVC_VS2012 170050727 - - #if _MSC_FULL_VER > 100000000 - #define COMPILER_MSVC (_MSC_FULL_VER) - #else - #define COMPILER_MSVC (_MSC_FULL_VER * 10) - #endif - - #if ((_MSC_VER == 1200) && defined(_mm_free)) - #undef COMPILER_MSVC - #define COMPILER_MSVC COMPILER_MSVC_VS6PP - #endif - - 
#pragma warning(disable : 4127) /* conditional expression is constant */ - #pragma warning(disable : 4100) /* unreferenced formal parameter */ - - #ifndef _CRT_SECURE_NO_WARNINGS - #define _CRT_SECURE_NO_WARNINGS - #endif - - #include - #include /* _rotl */ - #include - - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; - typedef signed int int32_t; - typedef unsigned __int64 uint64_t; - typedef signed __int64 int64_t; - - #define ROTL32(a,b) _rotl(a,b) - #define ROTR32(a,b) _rotr(a,b) - #define ROTL64(a,b) _rotl64(a,b) - #define ROTR64(a,b) _rotr64(a,b) - #undef NOINLINE - #define NOINLINE __declspec(noinline) - #undef NORETURN - #define NORETURN - #undef INLINE - #define INLINE __forceinline - #undef FASTCALL - #define FASTCALL __fastcall - #undef CDECL - #define CDECL __cdecl - #undef STDCALL - #define STDCALL __stdcall - #undef NAKED - #define NAKED __declspec(naked) - #define ALIGN(n) __declspec(align(n)) -#endif -#if defined(__ICC) - #define COMPILER_INTEL -#endif -#if defined(__GNUC__) - #if (__GNUC__ >= 3) - #define COMPILER_GCC_PATCHLEVEL __GNUC_PATCHLEVEL__ - #else - #define COMPILER_GCC_PATCHLEVEL 0 - #endif - #define COMPILER_GCC (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + COMPILER_GCC_PATCHLEVEL) - #define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) - #define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) - #define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b))) - #define ROTR64(a,b) (((a) >> (b)) | ((a) << (64 - b))) - #undef NOINLINE - #if (COMPILER_GCC >= 30000) - #define NOINLINE __attribute__((noinline)) - #else - #define NOINLINE - #endif - #undef NORETURN - #if (COMPILER_GCC >= 30000) - #define NORETURN __attribute__((noreturn)) - #else - #define NORETURN - #endif - #undef INLINE - #if (COMPILER_GCC >= 30000) - #define INLINE __attribute__((always_inline)) - #else - #define INLINE inline - #endif - #undef FASTCALL - #if (COMPILER_GCC >= 30400) - #define FASTCALL __attribute__((fastcall)) - #else 
- #define FASTCALL - #endif - #undef CDECL - #define CDECL __attribute__((cdecl)) - #undef STDCALL - #define STDCALL __attribute__((stdcall)) - #define ALIGN(n) __attribute__((aligned(n))) - #include -#endif -#if defined(__MINGW32__) || defined(__MINGW64__) - #define COMPILER_MINGW -#endif -#if defined(__PATHCC__) - #define COMPILER_PATHCC -#endif - -#define OPTIONAL_INLINE -#if defined(OPTIONAL_INLINE) - #undef OPTIONAL_INLINE - #define OPTIONAL_INLINE INLINE -#else - #define OPTIONAL_INLINE -#endif - -#define CRYPTO_FN NOINLINE STDCALL - -/* determine cpu */ -#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64) - #define CPU_X86_64 -#elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500)) - #define CPU_X86 500 -#elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400)) - #define CPU_X86 400 -#elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__) - #define CPU_X86 300 -#elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64) - #define CPU_IA64 -#endif - -#if defined(__sparc__) || defined(__sparc) || defined(__sparcv9) - #define CPU_SPARC - #if defined(__sparcv9) - #define CPU_SPARC64 - #endif -#endif - -#if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64)) - #define CPU_64BITS - #undef FASTCALL - #define FASTCALL - #undef CDECL - #define CDECL - #undef STDCALL - #define STDCALL -#endif - -#if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC) - #define CPU_PPC - #if defined(_ARCH_PWR7) - #define CPU_POWER7 - #elif defined(__64BIT__) - #define CPU_PPC64 - #else - #define CPU_PPC32 - #endif -#endif - -#if defined(__hppa__) || defined(__hppa) - 
#define CPU_HPPA -#endif - -#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA) - #define CPU_ALPHA -#endif - -/* endian */ - -#if ((defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \ - (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \ - (defined(CPU_X86) || defined(CPU_X86_64)) || \ - (defined(vax) || defined(MIPSEL) || defined(_MIPSEL))) -#define CPU_LE -#elif ((defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN)) || \ - (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \ - (defined(CPU_SPARC) || defined(CPU_PPC) || defined(mc68000) || defined(sel)) || defined(_MIPSEB)) -#define CPU_BE -#else - /* unknown endian! */ -#endif - - -#define U8TO32_BE(p) \ - (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ - ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) )) - -#define U8TO32_LE(p) \ - (((uint32_t)((p)[0]) ) | ((uint32_t)((p)[1]) << 8) | \ - ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24)) - -#define U32TO8_BE(p, v) \ - (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ - (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); - -#define U32TO8_LE(p, v) \ - (p)[0] = (uint8_t)((v) ); (p)[1] = (uint8_t)((v) >> 8); \ - (p)[2] = (uint8_t)((v) >> 16); (p)[3] = (uint8_t)((v) >> 24); - -#define U8TO64_BE(p) \ - (((uint64_t)U8TO32_BE(p) << 32) | (uint64_t)U8TO32_BE((p) + 4)) - -#define U8TO64_LE(p) \ - (((uint64_t)U8TO32_LE(p)) | ((uint64_t)U8TO32_LE((p) + 4) << 32)) - -#define U64TO8_BE(p, v) \ - U32TO8_BE((p), (uint32_t)((v) >> 32)); \ - U32TO8_BE((p) + 4, (uint32_t)((v) )); - -#define U64TO8_LE(p, v) \ - U32TO8_LE((p), (uint32_t)((v) )); \ - U32TO8_LE((p) + 4, (uint32_t)((v) >> 32)); - -#define U32_SWAP(v) { \ - (v) = (((v) << 8) & 0xFF00FF00 ) | (((v) >> 8) & 0xFF00FF ); \ - (v) = ((v) << 16) | ((v) >> 16); \ -} - -#define U64_SWAP(v) { \ - (v) = (((v) << 8) & 0xFF00FF00FF00FF00ull ) | (((v) >> 
8) & 0x00FF00FF00FF00FFull ); \ - (v) = (((v) << 16) & 0xFFFF0000FFFF0000ull ) | (((v) >> 16) & 0x0000FFFF0000FFFFull ); \ - (v) = ((v) << 32) | ((v) >> 32); \ -} - -static int -scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) { - uint32_t differentbits = 0; - while (len--) - differentbits |= (*x++ ^ *y++); - return (1 & ((differentbits - 1) >> 8)); -} - -static void -scrypt_ensure_zero(void *p, size_t len) { -#if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC)) - __stosb((unsigned char *)p, 0, len); -#elif (defined(CPU_X86) && defined(COMPILER_GCC)) - __asm__ __volatile__( - "pushl %%edi;\n" - "pushl %%ecx;\n" - "rep stosb;\n" - "popl %%ecx;\n" - "popl %%edi;\n" - :: "a"(0), "D"(p), "c"(len) : "cc", "memory" - ); -#elif (defined(CPU_X86_64) && defined(COMPILER_GCC)) - __asm__ __volatile__( - "pushq %%rdi;\n" - "pushq %%rcx;\n" - "rep stosb;\n" - "popq %%rcx;\n" - "popq %%rdi;\n" - :: "a"(0), "D"(p), "c"(len) : "cc", "memory" - ); -#else - volatile uint8_t *b = (volatile uint8_t *)p; - size_t i; - for (i = 0; i < len; i++) - b[i] = 0; -#endif -} - -#include "scrypt-jane-portable-x86.h" - -#if !defined(asm_calling_convention) -#define asm_calling_convention -#endif diff --git a/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-basic.h b/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-basic.h deleted file mode 100644 index 57ba649..0000000 --- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-basic.h +++ /dev/null @@ -1,74 +0,0 @@ -#if !defined(SCRYPT_CHOOSE_COMPILETIME) -/* function type returned by scrypt_getROMix, used with cpu detection */ -typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r); -#endif - -/* romix pre/post nop function */ -static void asm_calling_convention -scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) { - (void)blocks; (void)nblocks; -} - -/* romix pre/post endian conversion 
function */ -static void asm_calling_convention -scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) { -#if !defined(CPU_LE) - static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}}; - size_t i; - if (endian_test.w == 0x100) { - nblocks *= SCRYPT_BLOCK_WORDS; - for (i = 0; i < nblocks; i++) { - SCRYPT_WORD_ENDIAN_SWAP(blocks[i]); - } - } -#else - (void)blocks; (void)nblocks; -#endif -} - -/* chunkmix test function */ -typedef void (asm_calling_convention *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r); -typedef void (asm_calling_convention *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks); - -static int -scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) { - /* r = 2, (2 * r) = 4 blocks in a chunk, 4 * SCRYPT_BLOCK_WORDS total */ - const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS; -#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2)) - scrypt_mix_word_t ALIGN(32) chunk[2][4 * SCRYPT_BLOCK_WORDS], v; -#else - scrypt_mix_word_t ALIGN(16) chunk[2][4 * SCRYPT_BLOCK_WORDS], v; -#endif - uint8_t final[16]; - size_t i; - - for (i = 0; i < words; i++) { - v = (scrypt_mix_word_t)i; - v = (v << 8) | v; - v = (v << 16) | v; - chunk[0][i] = v; - } - - prefn(chunk[0], blocks); - mixfn(chunk[1], chunk[0], NULL, r); - postfn(chunk[1], blocks); - - /* grab the last 16 bytes of the final block */ - for (i = 0; i < 16; i += sizeof(scrypt_mix_word_t)) { - SCRYPT_WORDTO8_LE(final + i, chunk[1][words - (16 / sizeof(scrypt_mix_word_t)) + (i / sizeof(scrypt_mix_word_t))]); - } - - return scrypt_verify(expected, final, 16); -} - -/* returns a pointer to item i, where item is len scrypt_mix_word_t's long */ -static scrypt_mix_word_t * -scrypt_item(scrypt_mix_word_t *base, scrypt_mix_word_t i, scrypt_mix_word_t len) { - return base + (i * 
len); -} - -/* returns a pointer to block i */ -static scrypt_mix_word_t * -scrypt_block(scrypt_mix_word_t *base, scrypt_mix_word_t i) { - return base + (i * SCRYPT_BLOCK_WORDS); -} diff --git a/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-template.h b/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-template.h deleted file mode 100644 index 373ae60..0000000 --- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-template.h +++ /dev/null @@ -1,122 +0,0 @@ -#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) - -#if defined(SCRYPT_CHOOSE_COMPILETIME) -#undef SCRYPT_ROMIX_FN -#define SCRYPT_ROMIX_FN scrypt_ROMix -#endif - -#undef SCRYPT_HAVE_ROMIX -#define SCRYPT_HAVE_ROMIX - -#if !defined(SCRYPT_CHUNKMIX_FN) - -#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_basic - -/* - Bout = ChunkMix(Bin) - - 2*r: number of blocks in the chunk -*/ -static void asm_calling_convention -SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) { -#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2)) - scrypt_mix_word_t ALIGN(32) X[SCRYPT_BLOCK_WORDS], *block; -#else - scrypt_mix_word_t ALIGN(16) X[SCRYPT_BLOCK_WORDS], *block; -#endif - uint32_t i, j, blocksPerChunk = /*r * 2*/2, half = 0; - - /* 1: X = B_{2r - 1} */ - block = scrypt_block(Bin, blocksPerChunk - 1); - for (i = 0; i < SCRYPT_BLOCK_WORDS; i++) - X[i] = block[i]; - - if (Bxor) { - block = scrypt_block(Bxor, blocksPerChunk - 1); - for (i = 0; i < SCRYPT_BLOCK_WORDS; i++) - X[i] ^= block[i]; - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= /*r*/1) { - /* 3: X = H(X ^ B_i) */ - block = scrypt_block(Bin, i); - for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) - X[j] ^= block[j]; - - if (Bxor) { - block = scrypt_block(Bxor, i); - for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) - X[j] ^= block[j]; - } - SCRYPT_MIX_FN(X); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 
6: B'[r..2r-1] = Y_odd */ - block = scrypt_block(Bout, (i / 2) + half); - for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) - block[j] = X[j]; - } -} -#endif - -/* - X = ROMix(X) - - X: chunk to mix - Y: scratch chunk - N: number of rounds - V[N]: array of chunks to randomly index in to - 2*r: number of blocks in a chunk -*/ - -static void NOINLINE FASTCALL -SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N, uint32_t r) { - uint32_t i, j, chunkWords = (uint32_t)(SCRYPT_BLOCK_WORDS * 2); - scrypt_mix_word_t *block = V; - - SCRYPT_ROMIX_TANGLE_FN(X, 2); - - /* 1: X = B */ - /* implicit */ - - /* 2: for i = 0 to N - 1 do */ - memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t)); - for (i = 0; i < /*N - 1*/511; i++, block += chunkWords) { - /* 3: V_i = X */ - /* 4: X = H(X) */ - SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, /*r*/1); - } - SCRYPT_CHUNKMIX_FN(X, block, NULL, 1); - - /* 6: for i = 0 to N - 1 do */ - for (i = 0; i < /*N*/512; i += 2) { - /* 7: j = Integerify(X) % N */ - j = X[chunkWords - SCRYPT_BLOCK_WORDS] & /*(N - 1)*/511; - - /* 8: X = H(Y ^ V_j) */ - SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), 1); - - /* 7: j = Integerify(Y) % N */ - j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & /*(N - 1)*/511; - - /* 8: X = H(Y ^ V_j) */ - SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), 1); - } - - /* 10: B' = X */ - /* implicit */ - - SCRYPT_ROMIX_UNTANGLE_FN(X, 2); -} - -#endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */ - - -#undef SCRYPT_CHUNKMIX_FN -#undef SCRYPT_ROMIX_FN -#undef SCRYPT_MIX_FN -#undef SCRYPT_ROMIX_TANGLE_FN -#undef SCRYPT_ROMIX_UNTANGLE_FN - diff --git a/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix.h b/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix.h deleted file mode 100644 index 02de357..0000000 --- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix.h +++ /dev/null @@ -1,23 +0,0 @@ -#if defined(SCRYPT_SALSA64) 
-#include "scrypt-jane-salsa64.h" -#else - #define SCRYPT_MIX_BASE "ERROR" - typedef uint32_t scrypt_mix_word_t; - #define SCRYPT_WORDTO8_LE U32TO8_LE - #define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP - #define SCRYPT_BLOCK_BYTES 64 - #define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) - #if !defined(SCRYPT_CHOOSE_COMPILETIME) - static void FASTCALL scrypt_ROMix_error(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r) {} - static scrypt_ROMixfn scrypt_getROMix(void) { return scrypt_ROMix_error; } - #else - static void FASTCALL scrypt_ROMix(scrypt_mix_word_t *X, scrypt_mix_word_t *Y, scrypt_mix_word_t *V, uint32_t N, uint32_t r) {} - #endif - static int scrypt_test_mix(void) { return 0; } - #error must define a mix function! -#endif - -#if !defined(SCRYPT_CHOOSE_COMPILETIME) -#undef SCRYPT_MIX -#define SCRYPT_MIX SCRYPT_MIX_BASE -#endif diff --git a/algo/argon2/argon2a/ar2/sj/scrypt-jane-salsa64.h b/algo/argon2/argon2a/ar2/sj/scrypt-jane-salsa64.h deleted file mode 100644 index 96b7813..0000000 --- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-salsa64.h +++ /dev/null @@ -1,183 +0,0 @@ -#define SCRYPT_MIX_BASE "Salsa64/8" - -typedef uint64_t scrypt_mix_word_t; - -#define SCRYPT_WORDTO8_LE U64TO8_LE -#define SCRYPT_WORD_ENDIAN_SWAP U64_SWAP - -#define SCRYPT_BLOCK_BYTES 128 -#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) - -/* must have these here in case block bytes is ever != 64 */ -#include "scrypt-jane-romix-basic.h" - -#include "scrypt-jane-mix_salsa64-avx2.h" -#include "scrypt-jane-mix_salsa64-xop.h" -#include "scrypt-jane-mix_salsa64-avx.h" -#include "scrypt-jane-mix_salsa64-ssse3.h" -#include "scrypt-jane-mix_salsa64-sse2.h" -#include "scrypt-jane-mix_salsa64.h" - -#if defined(SCRYPT_SALSA64_AVX2) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx2 - #define SCRYPT_ROMIX_FN scrypt_ROMix_avx2 - #define SCRYPT_ROMIX_TANGLE_FN 
salsa64_core_tangle_sse2 - #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 - #include "scrypt-jane-romix-template.h" -#endif - -#if defined(SCRYPT_SALSA64_XOP) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_xop - #define SCRYPT_ROMIX_FN scrypt_ROMix_xop - #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 - #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 - #include "scrypt-jane-romix-template.h" -#endif - -#if defined(SCRYPT_SALSA64_AVX) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx - #define SCRYPT_ROMIX_FN scrypt_ROMix_avx - #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 - #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 - #include "scrypt-jane-romix-template.h" -#endif - -#if defined(SCRYPT_SALSA64_SSSE3) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3 - #define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3 - #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 - #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 - #include "scrypt-jane-romix-template.h" -#endif - -#if defined(SCRYPT_SALSA64_SSE2) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2 - #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2 - #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 - #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 - #include "scrypt-jane-romix-template.h" -#endif - -/* cpu agnostic */ -#define SCRYPT_ROMIX_FN scrypt_ROMix_basic -#define SCRYPT_MIX_FN salsa64_core_basic -#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian -#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian -#include "scrypt-jane-romix-template.h" - -#if !defined(SCRYPT_CHOOSE_COMPILETIME) -static scrypt_ROMixfn -scrypt_getROMix(void) { - size_t cpuflags = detect_cpu(); - -#if defined(SCRYPT_SALSA64_AVX2) - if (cpuflags & cpu_avx2) - return scrypt_ROMix_avx2; - else -#endif - -#if defined(SCRYPT_SALSA64_XOP) - if (cpuflags & cpu_xop) - return scrypt_ROMix_xop; - else -#endif - -#if defined(SCRYPT_SALSA64_AVX) - if (cpuflags & cpu_avx) - return 
scrypt_ROMix_avx; - else -#endif - -#if defined(SCRYPT_SALSA64_SSSE3) - if (cpuflags & cpu_ssse3) - return scrypt_ROMix_ssse3; - else -#endif - -#if defined(SCRYPT_SALSA64_SSE2) - if (cpuflags & cpu_sse2) - return scrypt_ROMix_sse2; - else -#endif - - return scrypt_ROMix_basic; -} -#endif - - -#if defined(SCRYPT_TEST_SPEED) -static size_t -available_implementations(void) { - size_t cpuflags = detect_cpu(); - size_t flags = 0; - -#if defined(SCRYPT_SALSA64_AVX2) - if (cpuflags & cpu_avx2) - flags |= cpu_avx2; -#endif - -#if defined(SCRYPT_SALSA64_XOP) - if (cpuflags & cpu_xop) - flags |= cpu_xop; -#endif - -#if defined(SCRYPT_SALSA64_AVX) - if (cpuflags & cpu_avx) - flags |= cpu_avx; -#endif - -#if defined(SCRYPT_SALSA64_SSSE3) - if (cpuflags & cpu_ssse3) - flags |= cpu_ssse3; -#endif - -#if defined(SCRYPT_SALSA64_SSE2) - if (cpuflags & cpu_sse2) - flags |= cpu_sse2; -#endif - - return flags; -} -#endif - -static int -scrypt_test_mix(void) { - static const uint8_t expected[16] = { - 0xf8,0x92,0x9b,0xf8,0xcc,0x1d,0xce,0x2e,0x13,0x82,0xac,0x96,0xb2,0x6c,0xee,0x2c, - }; - - int ret = 1; - size_t cpuflags = detect_cpu(); - -#if defined(SCRYPT_SALSA64_AVX2) - if (cpuflags & cpu_avx2) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); -#endif - -#if defined(SCRYPT_SALSA64_XOP) - if (cpuflags & cpu_xop) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_xop, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); -#endif - -#if defined(SCRYPT_SALSA64_AVX) - if (cpuflags & cpu_avx) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); -#endif - -#if defined(SCRYPT_SALSA64_SSSE3) - if (cpuflags & cpu_ssse3) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); -#endif - -#if defined(SCRYPT_SALSA64_SSE2) - if (cpuflags & cpu_sse2) - ret &= 
scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); -#endif - -#if defined(SCRYPT_SALSA64_BASIC) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); -#endif - - return ret; -} - diff --git a/algo/argon2/argon2a/ar2/sj/scrypt-jane-test-vectors.h b/algo/argon2/argon2a/ar2/sj/scrypt-jane-test-vectors.h deleted file mode 100644 index 2d0b596..0000000 --- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-test-vectors.h +++ /dev/null @@ -1,28 +0,0 @@ -typedef struct scrypt_test_setting_t { - const char *pw, *salt; - uint8_t Nfactor, rfactor, pfactor; -} scrypt_test_setting; - -static const scrypt_test_setting post_settings[] = { - {"", "", 3, 0, 0}, - {"password", "NaCl", 9, 3, 4}, - {0, 0, 0, 0, 0} -}; - -#if defined(SCRYPT_SKEIN512) - #if defined(SCRYPT_SALSA64) - static const uint8_t post_vectors[][64] = { - {0xd2,0xad,0x32,0x05,0xee,0x80,0xe3,0x44,0x70,0xc6,0x34,0xde,0x05,0xb6,0xcf,0x60, - 0x89,0x98,0x70,0xc0,0xb8,0xf5,0x54,0xf1,0xa6,0xb2,0xc8,0x76,0x34,0xec,0xc4,0x59, - 0x8e,0x64,0x42,0xd0,0xa9,0xed,0xe7,0x19,0xb2,0x8a,0x11,0xc6,0xa6,0xbf,0xa7,0xa9, - 0x4e,0x44,0x32,0x7e,0x12,0x91,0x9d,0xfe,0x52,0x48,0xa8,0x27,0xb3,0xfc,0xb1,0x89}, - {0xd6,0x67,0xd2,0x3e,0x30,0x1e,0x9d,0xe2,0x55,0x68,0x17,0x3d,0x2b,0x75,0x5a,0xe5, - 0x04,0xfb,0x3d,0x0e,0x86,0xe0,0xaa,0x1d,0xd4,0x72,0xda,0xb0,0x79,0x41,0xb7,0x99, - 0x68,0xe5,0xd9,0x55,0x79,0x7d,0xc3,0xd1,0xa6,0x56,0xc1,0xbe,0x0b,0x6c,0x62,0x23, - 0x66,0x67,0x91,0x47,0x99,0x13,0x6b,0xe3,0xda,0x59,0x55,0x18,0x67,0x8f,0x2e,0x3b} - }; - #endif -#else - static const uint8_t post_vectors[][64] = {{0}}; -#endif - diff --git a/algo/argon2/argon2a/argon2a.c b/algo/argon2/argon2a/argon2a.c deleted file mode 100644 index 21bb2a0..0000000 --- a/algo/argon2/argon2a/argon2a.c +++ /dev/null @@ -1,92 +0,0 @@ -#include -#include -#include -#include -#include -#include "ar2/argon2.h" -#include "ar2/cores.h" -#include 
"ar2/ar2-scrypt-jane.h" -#include "algo-gate-api.h" - -#define T_COSTS 2 -#define M_COSTS 16 -#define MASK 8 -#define ZERO 0 - -inline void argon_call(void *out, void *in, void *salt, int type) -{ - argon2_context context; - - context.out = (uint8_t *)out; - context.pwd = (uint8_t *)in; - context.salt = (uint8_t*)salt; - context.pwdlen = 0; - context.allocate_cbk = NULL; - context.free_cbk = NULL; - - ar2_argon2_core(&context, type); -} - -void argon2hash(void *output, const void *input) -{ - uint32_t _ALIGN(64) hashA[8], hashB[8]; - - my_scrypt((const unsigned char *)input, 80, - (const unsigned char *)input, 80, - (unsigned char *)hashA); - - argon_call(hashB, hashA, hashA, (hashA[0] & MASK) == ZERO); - - my_scrypt((const unsigned char *)hashB, 32, - (const unsigned char *)hashB, 32, - (unsigned char *)output); -} - -int scanhash_argon2( struct work* work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(64) endiandata[20]; - uint32_t _ALIGN(64) hash[8]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - int thr_id = mythr->id; // thr_id arg is deprecated - - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - uint32_t nonce = first_nonce; - - swab32_array( endiandata, pdata, 20 ); - - do { - be32enc(&endiandata[19], nonce); - argon2hash(hash, endiandata); - if (hash[7] <= Htarg && fulltest(hash, ptarget)) { - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce; - work_set_target_ratio(work, hash); - return 1; - } - nonce++; - } while (nonce < max_nonce && !work_restart[thr_id].restart); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} - -int64_t argon2_get_max64 () -{ - return 0x1ffLL; -} - -bool register_argon2_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT; - gate->scanhash = (void*)&scanhash_argon2; - gate->hash = (void*)&argon2hash; - gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; - 
gate->set_target = (void*)&scrypt_set_target; - gate->get_max64 = (void*)&argon2_get_max64; - return true; -}; - diff --git a/algo/argon2/argon2d/argon2d-gate.c b/algo/argon2/argon2d/argon2d-gate.c deleted file mode 100644 index 733c70c..0000000 --- a/algo/argon2/argon2d/argon2d-gate.c +++ /dev/null @@ -1,192 +0,0 @@ -#include "argon2d-gate.h" -#include "argon2d/argon2.h" - -static const size_t INPUT_BYTES = 80; // Lenth of a block header in bytes. Input Length = Salt Length (salt = input) -static const size_t OUTPUT_BYTES = 32; // Length of output needed for a 256-bit hash -static const unsigned int DEFAULT_ARGON2_FLAG = 2; //Same as ARGON2_DEFAULT_FLAGS - -// Credits - -void argon2d_crds_hash( void *output, const void *input ) -{ - argon2_context context; - context.out = (uint8_t *)output; - context.outlen = (uint32_t)OUTPUT_BYTES; - context.pwd = (uint8_t *)input; - context.pwdlen = (uint32_t)INPUT_BYTES; - context.salt = (uint8_t *)input; //salt = input - context.saltlen = (uint32_t)INPUT_BYTES; - context.secret = NULL; - context.secretlen = 0; - context.ad = NULL; - context.adlen = 0; - context.allocate_cbk = NULL; - context.free_cbk = NULL; - context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS - // main configurable Argon2 hash parameters - context.m_cost = 250; // Memory in KiB (~256KB) - context.lanes = 4; // Degree of Parallelism - context.threads = 1; // Threads - context.t_cost = 1; // Iterations - context.version = ARGON2_VERSION_10; - - argon2_ctx( &context, Argon2_d ); -} - -int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(64) endiandata[20]; - uint32_t _ALIGN(64) hash[8]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - uint32_t nonce = first_nonce; - - swab32_array( endiandata, pdata, 20 ); - - do { - 
be32enc(&endiandata[19], nonce); - argon2d_crds_hash( hash, endiandata ); - if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = nonce; - submit_solution( work, hash, mythr ); - } - nonce++; - } while (nonce < max_nonce && !work_restart[thr_id].restart); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} - -bool register_argon2d_crds_algo( algo_gate_t* gate ) -{ - gate->scanhash = (void*)&scanhash_argon2d_crds; - gate->hash = (void*)&argon2d_crds_hash; - gate->set_target = (void*)&scrypt_set_target; - gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; - return true; -} - -// Dynamic - -void argon2d_dyn_hash( void *output, const void *input ) -{ - argon2_context context; - context.out = (uint8_t *)output; - context.outlen = (uint32_t)OUTPUT_BYTES; - context.pwd = (uint8_t *)input; - context.pwdlen = (uint32_t)INPUT_BYTES; - context.salt = (uint8_t *)input; //salt = input - context.saltlen = (uint32_t)INPUT_BYTES; - context.secret = NULL; - context.secretlen = 0; - context.ad = NULL; - context.adlen = 0; - context.allocate_cbk = NULL; - context.free_cbk = NULL; - context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS - // main configurable Argon2 hash parameters - context.m_cost = 500; // Memory in KiB (512KB) - context.lanes = 8; // Degree of Parallelism - context.threads = 1; // Threads - context.t_cost = 2; // Iterations - context.version = ARGON2_VERSION_10; - - argon2_ctx( &context, Argon2_d ); -} - -int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(64) endiandata[20]; - uint32_t _ALIGN(64) hash[8]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - uint32_t nonce = first_nonce; - - swab32_array( endiandata, pdata, 20 ); - - do - { - 
be32enc(&endiandata[19], nonce); - argon2d_dyn_hash( hash, endiandata ); - if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = nonce; - submit_solution( work, hash, mythr ); - } - nonce++; - } while (nonce < max_nonce && !work_restart[thr_id].restart); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} - -bool register_argon2d_dyn_algo( algo_gate_t* gate ) -{ - gate->scanhash = (void*)&scanhash_argon2d_dyn; - gate->hash = (void*)&argon2d_dyn_hash; - gate->set_target = (void*)&scrypt_set_target; - gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; - return true; -} - -// Unitus - -int scanhash_argon2d4096( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(64) vhash[8]; - uint32_t _ALIGN(64) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - uint32_t t_cost = 1; // 1 iteration - uint32_t m_cost = 4096; // use 4MB - uint32_t parallelism = 1; // 1 thread, 2 lanes - - for ( int i = 0; i < 19; i++ ) - be32enc( &endiandata[i], pdata[i] ); - - do { - be32enc( &endiandata[19], n ); - argon2d_hash_raw( t_cost, m_cost, parallelism, (char*) endiandata, 80, - (char*) endiandata, 80, (char*) vhash, 32, ARGON2_VERSION_13 ); - if ( vhash[7] < Htarg && fulltest( vhash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n; - submit_solution( work, vhash, mythr ); - } - n++; - - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - - return 0; -} - -int64_t get_max64_0x1ff() { return 0x1ff; } - -bool register_argon2d4096_algo( algo_gate_t* gate ) -{ - gate->scanhash = (void*)&scanhash_argon2d4096; - gate->set_target = (void*)&scrypt_set_target; - gate->get_max64 = (void*)&get_max64_0x1ff; - 
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; - return true; -} - diff --git a/algo/argon2/argon2d/argon2d-gate.h b/algo/argon2/argon2d/argon2d-gate.h deleted file mode 100644 index dbb2b4d..0000000 --- a/algo/argon2/argon2d/argon2d-gate.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef ARGON2D_GATE_H__ -#define ARGON2D_GATE_H__ - -#include "algo-gate-api.h" -#include - -// Credits: version = 0x10, m_cost = 250. -bool register_argon2d_crds_algo( algo_gate_t* gate ); - -void argon2d_crds_hash( void *state, const void *input ); - -int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -// Dynamic: version = 0x10, m_cost = 500. -bool register_argon2d_dyn_algo( algo_gate_t* gate ); - -void argon2d_dyn_hash( void *state, const void *input ); - -int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - - -// Unitus: version = 0x13, m_cost = 4096. -bool register_argon2d4096_algo( algo_gate_t* gate ); - -int scanhash_argon2d4096( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -#endif - diff --git a/algo/argon2/argon2d/argon2d/argon2.c b/algo/argon2/argon2d/argon2d/argon2.c deleted file mode 100644 index 5eabe35..0000000 --- a/algo/argon2/argon2d/argon2d/argon2.c +++ /dev/null @@ -1,458 +0,0 @@ -/* - * Argon2 reference source code package - reference C implementations - * - * Copyright 2015 - * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves - * - * You may use this work under the terms of a Creative Commons CC0 1.0 - * License/Waiver or the Apache Public License 2.0, at your option. The terms of - * these licenses can be found at: - * - * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 - * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 - * - * You should have received a copy of both of these licenses along with this - * software. 
If not, they may be obtained at the above URLs. - */ - -#include -#include -#include - -#include "argon2.h" -#include "encoding.h" -#include "core.h" - -const char *argon2_type2string(argon2_type type, int uppercase) { - switch (type) { - case Argon2_d: - return uppercase ? "Argon2d" : "argon2d"; - case Argon2_i: - return uppercase ? "Argon2i" : "argon2i"; - case Argon2_id: - return uppercase ? "Argon2id" : "argon2id"; - } - - return NULL; -} - -int argon2_ctx(argon2_context *context, argon2_type type) { - /* 1. Validate all inputs */ - int result = validate_inputs(context); - uint32_t memory_blocks, segment_length; - argon2_instance_t instance; - - if (ARGON2_OK != result) { - return result; - } - - if (Argon2_d != type && Argon2_i != type && Argon2_id != type) { - return ARGON2_INCORRECT_TYPE; - } - - /* 2. Align memory size */ - /* Minimum memory_blocks = 8L blocks, where L is the number of lanes */ - memory_blocks = context->m_cost; - - if (memory_blocks < 2 * ARGON2_SYNC_POINTS * context->lanes) { - memory_blocks = 2 * ARGON2_SYNC_POINTS * context->lanes; - } - - segment_length = memory_blocks / (context->lanes * ARGON2_SYNC_POINTS); - /* Ensure that all segments have equal length */ - memory_blocks = segment_length * (context->lanes * ARGON2_SYNC_POINTS); - - instance.version = context->version; - instance.memory = NULL; - instance.passes = context->t_cost; - instance.memory_blocks = memory_blocks; - instance.segment_length = segment_length; - instance.lane_length = segment_length * ARGON2_SYNC_POINTS; - instance.lanes = context->lanes; - instance.threads = context->threads; - instance.type = type; - - if (instance.threads > instance.lanes) { - instance.threads = instance.lanes; - } - - /* 3. Initialization: Hashing inputs, allocating memory, filling first - * blocks - */ - result = initialize(&instance, context); - - if (ARGON2_OK != result) { - return result; - } - - /* 4. 
Filling memory */ - result = fill_memory_blocks(&instance); - - if (ARGON2_OK != result) { - return result; - } - /* 5. Finalization */ - finalize(context, &instance); - - return ARGON2_OK; -} - -int argon2_hash(const uint32_t t_cost, const uint32_t m_cost, - const uint32_t parallelism, const void *pwd, - const size_t pwdlen, const void *salt, const size_t saltlen, - void *hash, const size_t hashlen, char *encoded, - const size_t encodedlen, argon2_type type, - const uint32_t version){ - - argon2_context context; - int result; - uint8_t *out; - - if (pwdlen > ARGON2_MAX_PWD_LENGTH) { - return ARGON2_PWD_TOO_LONG; - } - - if (saltlen > ARGON2_MAX_SALT_LENGTH) { - return ARGON2_SALT_TOO_LONG; - } - - if (hashlen > ARGON2_MAX_OUTLEN) { - return ARGON2_OUTPUT_TOO_LONG; - } - - if (hashlen < ARGON2_MIN_OUTLEN) { - return ARGON2_OUTPUT_TOO_SHORT; - } - - out = malloc(hashlen); - if (!out) { - return ARGON2_MEMORY_ALLOCATION_ERROR; - } - - context.out = (uint8_t *)out; - context.outlen = (uint32_t)hashlen; - context.pwd = CONST_CAST(uint8_t *)pwd; - context.pwdlen = (uint32_t)pwdlen; - context.salt = CONST_CAST(uint8_t *)salt; - context.saltlen = (uint32_t)saltlen; - context.secret = NULL; - context.secretlen = 0; - context.ad = NULL; - context.adlen = 0; - context.t_cost = t_cost; - context.m_cost = m_cost; - context.lanes = parallelism; - context.threads = parallelism; - context.allocate_cbk = NULL; - context.free_cbk = NULL; - context.flags = ARGON2_DEFAULT_FLAGS; - context.version = version; - - result = argon2_ctx(&context, type); - - if (result != ARGON2_OK) { - clear_internal_memory(out, hashlen); - free(out); - return result; - } - - /* if raw hash requested, write it */ - if (hash) { - memcpy(hash, out, hashlen); - } - - /* if encoding requested, write it */ - if (encoded && encodedlen) { - if (encode_string(encoded, encodedlen, &context, type) != ARGON2_OK) { - clear_internal_memory(out, hashlen); /* wipe buffers if error */ - clear_internal_memory(encoded, 
encodedlen); - free(out); - return ARGON2_ENCODING_FAIL; - } - } - clear_internal_memory(out, hashlen); - free(out); - - return ARGON2_OK; -} - -int argon2i_hash_encoded(const uint32_t t_cost, const uint32_t m_cost, - const uint32_t parallelism, const void *pwd, - const size_t pwdlen, const void *salt, - const size_t saltlen, const size_t hashlen, - char *encoded, const size_t encodedlen, - const uint32_t version) { - - return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen, - NULL, hashlen, encoded, encodedlen, Argon2_i, - version ); -} - -int argon2i_hash_raw(const uint32_t t_cost, const uint32_t m_cost, - const uint32_t parallelism, const void *pwd, - const size_t pwdlen, const void *salt, - const size_t saltlen, void *hash, const size_t hashlen, - const uint32_t version ) { - - return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen, - hash, hashlen, NULL, 0, Argon2_i, version ); -} - -int argon2d_hash_encoded(const uint32_t t_cost, const uint32_t m_cost, - const uint32_t parallelism, const void *pwd, - const size_t pwdlen, const void *salt, - const size_t saltlen, const size_t hashlen, - char *encoded, const size_t encodedlen, - const uint32_t version ) { - - return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen, - NULL, hashlen, encoded, encodedlen, Argon2_d, - version ); -} - -int argon2d_hash_raw(const uint32_t t_cost, const uint32_t m_cost, - const uint32_t parallelism, const void *pwd, - const size_t pwdlen, const void *salt, - const size_t saltlen, void *hash, const size_t hashlen, - const uint32_t version ) { - - return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen, - hash, hashlen, NULL, 0, Argon2_d, version ); -} - -int argon2id_hash_encoded(const uint32_t t_cost, const uint32_t m_cost, - const uint32_t parallelism, const void *pwd, - const size_t pwdlen, const void *salt, - const size_t saltlen, const size_t hashlen, - char *encoded, const size_t encodedlen, - const 
uint32_t version ) { - - return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen, - NULL, hashlen, encoded, encodedlen, Argon2_id, - version); -} - -int argon2id_hash_raw(const uint32_t t_cost, const uint32_t m_cost, - const uint32_t parallelism, const void *pwd, - const size_t pwdlen, const void *salt, - const size_t saltlen, void *hash, const size_t hashlen, - const uint32_t version ) { - return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen, - hash, hashlen, NULL, 0, Argon2_id, version ); -} - -static int argon2_compare(const uint8_t *b1, const uint8_t *b2, size_t len) { - size_t i; - uint8_t d = 0U; - - for (i = 0U; i < len; i++) { - d |= b1[i] ^ b2[i]; - } - return (int)((1 & ((d - 1) >> 8)) - 1); -} - -int argon2_verify(const char *encoded, const void *pwd, const size_t pwdlen, - argon2_type type) { - - argon2_context ctx; - uint8_t *desired_result = NULL; - - int ret = ARGON2_OK; - - size_t encoded_len; - uint32_t max_field_len; - - if (pwdlen > ARGON2_MAX_PWD_LENGTH) { - return ARGON2_PWD_TOO_LONG; - } - - if (encoded == NULL) { - return ARGON2_DECODING_FAIL; - } - - encoded_len = strlen(encoded); - if (encoded_len > UINT32_MAX) { - return ARGON2_DECODING_FAIL; - } - - /* No field can be longer than the encoded length */ - max_field_len = (uint32_t)encoded_len; - - ctx.saltlen = max_field_len; - ctx.outlen = max_field_len; - - ctx.salt = malloc(ctx.saltlen); - ctx.out = malloc(ctx.outlen); - if (!ctx.salt || !ctx.out) { - ret = ARGON2_MEMORY_ALLOCATION_ERROR; - goto fail; - } - - ctx.pwd = (uint8_t *)pwd; - ctx.pwdlen = (uint32_t)pwdlen; - - ret = decode_string(&ctx, encoded, type); - if (ret != ARGON2_OK) { - goto fail; - } - - /* Set aside the desired result, and get a new buffer. 
*/ - desired_result = ctx.out; - ctx.out = malloc(ctx.outlen); - if (!ctx.out) { - ret = ARGON2_MEMORY_ALLOCATION_ERROR; - goto fail; - } - - ret = argon2_verify_ctx(&ctx, (char *)desired_result, type); - if (ret != ARGON2_OK) { - goto fail; - } - -fail: - free(ctx.salt); - free(ctx.out); - free(desired_result); - - return ret; -} - -int argon2i_verify(const char *encoded, const void *pwd, const size_t pwdlen) { - - return argon2_verify(encoded, pwd, pwdlen, Argon2_i); -} - -int argon2d_verify(const char *encoded, const void *pwd, const size_t pwdlen) { - - return argon2_verify(encoded, pwd, pwdlen, Argon2_d); -} - -int argon2id_verify(const char *encoded, const void *pwd, const size_t pwdlen) { - - return argon2_verify(encoded, pwd, pwdlen, Argon2_id); -} - -int argon2d_ctx(argon2_context *context) { - return argon2_ctx(context, Argon2_d); -} - -int argon2i_ctx(argon2_context *context) { - return argon2_ctx(context, Argon2_i); -} - -int argon2id_ctx(argon2_context *context) { - return argon2_ctx(context, Argon2_id); -} - -int argon2_verify_ctx(argon2_context *context, const char *hash, - argon2_type type) { - int ret = argon2_ctx(context, type); - if (ret != ARGON2_OK) { - return ret; - } - - if (argon2_compare((uint8_t *)hash, context->out, context->outlen)) { - return ARGON2_VERIFY_MISMATCH; - } - - return ARGON2_OK; -} - -int argon2d_verify_ctx(argon2_context *context, const char *hash) { - return argon2_verify_ctx(context, hash, Argon2_d); -} - -int argon2i_verify_ctx(argon2_context *context, const char *hash) { - return argon2_verify_ctx(context, hash, Argon2_i); -} - -int argon2id_verify_ctx(argon2_context *context, const char *hash) { - return argon2_verify_ctx(context, hash, Argon2_id); -} - -const char *argon2_error_message(int error_code) { - switch (error_code) { - case ARGON2_OK: - return "OK"; - case ARGON2_OUTPUT_PTR_NULL: - return "Output pointer is NULL"; - case ARGON2_OUTPUT_TOO_SHORT: - return "Output is too short"; - case ARGON2_OUTPUT_TOO_LONG: 
- return "Output is too long"; - case ARGON2_PWD_TOO_SHORT: - return "Password is too short"; - case ARGON2_PWD_TOO_LONG: - return "Password is too long"; - case ARGON2_SALT_TOO_SHORT: - return "Salt is too short"; - case ARGON2_SALT_TOO_LONG: - return "Salt is too long"; - case ARGON2_AD_TOO_SHORT: - return "Associated data is too short"; - case ARGON2_AD_TOO_LONG: - return "Associated data is too long"; - case ARGON2_SECRET_TOO_SHORT: - return "Secret is too short"; - case ARGON2_SECRET_TOO_LONG: - return "Secret is too long"; - case ARGON2_TIME_TOO_SMALL: - return "Time cost is too small"; - case ARGON2_TIME_TOO_LARGE: - return "Time cost is too large"; - case ARGON2_MEMORY_TOO_LITTLE: - return "Memory cost is too small"; - case ARGON2_MEMORY_TOO_MUCH: - return "Memory cost is too large"; - case ARGON2_LANES_TOO_FEW: - return "Too few lanes"; - case ARGON2_LANES_TOO_MANY: - return "Too many lanes"; - case ARGON2_PWD_PTR_MISMATCH: - return "Password pointer is NULL, but password length is not 0"; - case ARGON2_SALT_PTR_MISMATCH: - return "Salt pointer is NULL, but salt length is not 0"; - case ARGON2_SECRET_PTR_MISMATCH: - return "Secret pointer is NULL, but secret length is not 0"; - case ARGON2_AD_PTR_MISMATCH: - return "Associated data pointer is NULL, but ad length is not 0"; - case ARGON2_MEMORY_ALLOCATION_ERROR: - return "Memory allocation error"; - case ARGON2_FREE_MEMORY_CBK_NULL: - return "The free memory callback is NULL"; - case ARGON2_ALLOCATE_MEMORY_CBK_NULL: - return "The allocate memory callback is NULL"; - case ARGON2_INCORRECT_PARAMETER: - return "Argon2_Context context is NULL"; - case ARGON2_INCORRECT_TYPE: - return "There is no such version of Argon2"; - case ARGON2_OUT_PTR_MISMATCH: - return "Output pointer mismatch"; - case ARGON2_THREADS_TOO_FEW: - return "Not enough threads"; - case ARGON2_THREADS_TOO_MANY: - return "Too many threads"; - case ARGON2_MISSING_ARGS: - return "Missing arguments"; - case ARGON2_ENCODING_FAIL: - return "Encoding 
failed"; - case ARGON2_DECODING_FAIL: - return "Decoding failed"; - case ARGON2_THREAD_FAIL: - return "Threading failure"; - case ARGON2_DECODING_LENGTH_FAIL: - return "Some of encoded parameters are too long or too short"; - case ARGON2_VERIFY_MISMATCH: - return "The password does not match the supplied hash"; - default: - return "Unknown error code"; - } -} -/* -size_t argon2_encodedlen(uint32_t t_cost, uint32_t m_cost, uint32_t parallelism, - uint32_t saltlen, uint32_t hashlen, argon2_type type) { - return strlen("$$v=$m=,t=,p=$$") + strlen(argon2_type2string(type, 0)) + - numlen(t_cost) + numlen(m_cost) + numlen(parallelism) + - b64len(saltlen) + b64len(hashlen) + numlen(ARGON2_VERSION_NUMBER) + 1; -} -*/ diff --git a/algo/argon2/argon2d/argon2d/argon2.h b/algo/argon2/argon2d/argon2d/argon2.h deleted file mode 100644 index d546d37..0000000 --- a/algo/argon2/argon2d/argon2d/argon2.h +++ /dev/null @@ -1,440 +0,0 @@ -/* - * Argon2 reference source code package - reference C implementations - * - * Copyright 2015 - * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves - * - * You may use this work under the terms of a Creative Commons CC0 1.0 - * License/Waiver or the Apache Public License 2.0, at your option. The terms of - * these licenses can be found at: - * - * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 - * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 - * - * You should have received a copy of both of these licenses along with this - * software. If not, they may be obtained at the above URLs. 
- */ - -#ifndef ARGON2_H -#define ARGON2_H - -#include -#include -#include - -#if defined(__cplusplus) -extern "C" { -#endif - -/* Symbols visibility control */ -#ifdef A2_VISCTL -#define ARGON2_PUBLIC __attribute__((visibility("default"))) -#define ARGON2_LOCAL __attribute__ ((visibility ("hidden"))) -#elif _MSC_VER -#define ARGON2_PUBLIC __declspec(dllexport) -#define ARGON2_LOCAL -#else -#define ARGON2_PUBLIC -#define ARGON2_LOCAL -#endif - -/* - * Argon2 input parameter restrictions - */ - -/* Minimum and maximum number of lanes (degree of parallelism) */ -#define ARGON2_MIN_LANES UINT32_C(1) -#define ARGON2_MAX_LANES UINT32_C(0xFFFFFF) - -/* Minimum and maximum number of threads */ -#define ARGON2_MIN_THREADS UINT32_C(1) -#define ARGON2_MAX_THREADS UINT32_C(0xFFFFFF) - -/* Number of synchronization points between lanes per pass */ -#define ARGON2_SYNC_POINTS UINT32_C(4) - -/* Minimum and maximum digest size in bytes */ -#define ARGON2_MIN_OUTLEN UINT32_C(4) -#define ARGON2_MAX_OUTLEN UINT32_C(0xFFFFFFFF) - -/* Minimum and maximum number of memory blocks (each of BLOCK_SIZE bytes) */ -#define ARGON2_MIN_MEMORY (2 * ARGON2_SYNC_POINTS) /* 2 blocks per slice */ - -#define ARGON2_MIN(a, b) ((a) < (b) ? 
(a) : (b)) -/* Max memory size is addressing-space/2, topping at 2^32 blocks (4 TB) */ -#define ARGON2_MAX_MEMORY_BITS \ - ARGON2_MIN(UINT32_C(32), (sizeof(void *) * CHAR_BIT - 10 - 1)) -#define ARGON2_MAX_MEMORY \ - ARGON2_MIN(UINT32_C(0xFFFFFFFF), UINT64_C(1) << ARGON2_MAX_MEMORY_BITS) - -/* Minimum and maximum number of passes */ -#define ARGON2_MIN_TIME UINT32_C(1) -#define ARGON2_MAX_TIME UINT32_C(0xFFFFFFFF) - -/* Minimum and maximum password length in bytes */ -#define ARGON2_MIN_PWD_LENGTH UINT32_C(0) -#define ARGON2_MAX_PWD_LENGTH UINT32_C(0xFFFFFFFF) - -/* Minimum and maximum associated data length in bytes */ -#define ARGON2_MIN_AD_LENGTH UINT32_C(0) -#define ARGON2_MAX_AD_LENGTH UINT32_C(0xFFFFFFFF) - -/* Minimum and maximum salt length in bytes */ -#define ARGON2_MIN_SALT_LENGTH UINT32_C(8) -#define ARGON2_MAX_SALT_LENGTH UINT32_C(0xFFFFFFFF) - -/* Minimum and maximum key length in bytes */ -#define ARGON2_MIN_SECRET UINT32_C(0) -#define ARGON2_MAX_SECRET UINT32_C(0xFFFFFFFF) - -/* Flags to determine which fields are securely wiped (default = no wipe). */ -#define ARGON2_DEFAULT_FLAGS UINT32_C(0) -#define ARGON2_FLAG_CLEAR_PASSWORD (UINT32_C(1) << 0) -#define ARGON2_FLAG_CLEAR_SECRET (UINT32_C(1) << 1) - -/* Global flag to determine if we are wiping internal memory buffers. This flag - * is defined in core.c and deafults to 1 (wipe internal memory). 
*/ -extern int FLAG_clear_internal_memory; - -/* Error codes */ -typedef enum Argon2_ErrorCodes { - ARGON2_OK = 0, - - ARGON2_OUTPUT_PTR_NULL = -1, - - ARGON2_OUTPUT_TOO_SHORT = -2, - ARGON2_OUTPUT_TOO_LONG = -3, - - ARGON2_PWD_TOO_SHORT = -4, - ARGON2_PWD_TOO_LONG = -5, - - ARGON2_SALT_TOO_SHORT = -6, - ARGON2_SALT_TOO_LONG = -7, - - ARGON2_AD_TOO_SHORT = -8, - ARGON2_AD_TOO_LONG = -9, - - ARGON2_SECRET_TOO_SHORT = -10, - ARGON2_SECRET_TOO_LONG = -11, - - ARGON2_TIME_TOO_SMALL = -12, - ARGON2_TIME_TOO_LARGE = -13, - - ARGON2_MEMORY_TOO_LITTLE = -14, - ARGON2_MEMORY_TOO_MUCH = -15, - - ARGON2_LANES_TOO_FEW = -16, - ARGON2_LANES_TOO_MANY = -17, - - ARGON2_PWD_PTR_MISMATCH = -18, /* NULL ptr with non-zero length */ - ARGON2_SALT_PTR_MISMATCH = -19, /* NULL ptr with non-zero length */ - ARGON2_SECRET_PTR_MISMATCH = -20, /* NULL ptr with non-zero length */ - ARGON2_AD_PTR_MISMATCH = -21, /* NULL ptr with non-zero length */ - - ARGON2_MEMORY_ALLOCATION_ERROR = -22, - - ARGON2_FREE_MEMORY_CBK_NULL = -23, - ARGON2_ALLOCATE_MEMORY_CBK_NULL = -24, - - ARGON2_INCORRECT_PARAMETER = -25, - ARGON2_INCORRECT_TYPE = -26, - - ARGON2_OUT_PTR_MISMATCH = -27, - - ARGON2_THREADS_TOO_FEW = -28, - ARGON2_THREADS_TOO_MANY = -29, - - ARGON2_MISSING_ARGS = -30, - - ARGON2_ENCODING_FAIL = -31, - - ARGON2_DECODING_FAIL = -32, - - ARGON2_THREAD_FAIL = -33, - - ARGON2_DECODING_LENGTH_FAIL = -34, - - ARGON2_VERIFY_MISMATCH = -35 -} argon2_error_codes; - -/* Memory allocator types --- for external allocation */ -typedef int (*allocate_fptr)(uint8_t **memory, size_t bytes_to_allocate); -typedef void (*deallocate_fptr)(uint8_t *memory, size_t bytes_to_allocate); - -/* Argon2 external data structures */ - -/* - ***** - * Context: structure to hold Argon2 inputs: - * output array and its length, - * password and its length, - * salt and its length, - * secret and its length, - * associated data and its length, - * number of passes, amount of used memory (in KBytes, can be rounded up a bit) - * 
number of parallel threads that will be run. - * All the parameters above affect the output hash value. - * Additionally, two function pointers can be provided to allocate and - * deallocate the memory (if NULL, memory will be allocated internally). - * Also, three flags indicate whether to erase password, secret as soon as they - * are pre-hashed (and thus not needed anymore), and the entire memory - ***** - * Simplest situation: you have output array out[8], password is stored in - * pwd[32], salt is stored in salt[16], you do not have keys nor associated - * data. You need to spend 1 GB of RAM and you run 5 passes of Argon2d with - * 4 parallel lanes. - * You want to erase the password, but you're OK with last pass not being - * erased. You want to use the default memory allocator. - * Then you initialize: - Argon2_Context(out,8,pwd,32,salt,16,NULL,0,NULL,0,5,1<<20,4,4,NULL,NULL,true,false,false,false) - */ -typedef struct Argon2_Context { - uint8_t *out; /* output array */ - uint32_t outlen; /* digest length */ - - uint8_t *pwd; /* password array */ - uint32_t pwdlen; /* password length */ - - uint8_t *salt; /* salt array */ - uint32_t saltlen; /* salt length */ - - uint8_t *secret; /* key array */ - uint32_t secretlen; /* key length */ - - uint8_t *ad; /* associated data array */ - uint32_t adlen; /* associated data length */ - - uint32_t t_cost; /* number of passes */ - uint32_t m_cost; /* amount of memory requested (KB) */ - uint32_t lanes; /* number of lanes */ - uint32_t threads; /* maximum number of threads */ - - uint32_t version; /* version number */ - - allocate_fptr allocate_cbk; /* pointer to memory allocator */ - deallocate_fptr free_cbk; /* pointer to memory deallocator */ - - uint32_t flags; /* array of bool options */ -} argon2_context; - -/* Argon2 primitive type */ -typedef enum Argon2_type { - Argon2_d = 0, - Argon2_i = 1, - Argon2_id = 2 -} argon2_type; - -/* Version of the algorithm */ -#define ARGON2_VERSION_10 0x10 -#define 
ARGON2_VERSION_13 0x13 - -/* - * Function that gives the string representation of an argon2_type. - * @param type The argon2_type that we want the string for - * @param uppercase Whether the string should have the first letter uppercase - * @return NULL if invalid type, otherwise the string representation. - */ -ARGON2_PUBLIC const char *argon2_type2string(argon2_type type, int uppercase); - -/* - * Function that performs memory-hard hashing with certain degree of parallelism - * @param context Pointer to the Argon2 internal structure - * @return Error code if smth is wrong, ARGON2_OK otherwise - */ -ARGON2_PUBLIC int argon2_ctx(argon2_context *context, argon2_type type); - -/** - * Hashes a password with Argon2i, producing an encoded hash - * @param t_cost Number of iterations - * @param m_cost Sets memory usage to m_cost kibibytes - * @param parallelism Number of threads and compute lanes - * @param pwd Pointer to password - * @param pwdlen Password size in bytes - * @param salt Pointer to salt - * @param saltlen Salt size in bytes - * @param hashlen Desired length of the hash in bytes - * @param encoded Buffer where to write the encoded hash - * @param encodedlen Size of the buffer (thus max size of the encoded hash) - * @pre Different parallelism levels will give different results - * @pre Returns ARGON2_OK if successful - */ -ARGON2_PUBLIC int argon2i_hash_encoded(const uint32_t t_cost, - const uint32_t m_cost, - const uint32_t parallelism, - const void *pwd, const size_t pwdlen, - const void *salt, const size_t saltlen, - const size_t hashlen, char *encoded, - const size_t encodedlen, - const uint32_t version ); - -/** - * Hashes a password with Argon2i, producing a raw hash at @hash - * @param t_cost Number of iterations - * @param m_cost Sets memory usage to m_cost kibibytes - * @param parallelism Number of threads and compute lanes - * @param pwd Pointer to password - * @param pwdlen Password size in bytes - * @param salt Pointer to salt - * @param saltlen 
Salt size in bytes - * @param hash Buffer where to write the raw hash - updated by the function - * @param hashlen Desired length of the hash in bytes - * @pre Different parallelism levels will give different results - * @pre Returns ARGON2_OK if successful - */ -ARGON2_PUBLIC int argon2i_hash_raw(const uint32_t t_cost, const uint32_t m_cost, - const uint32_t parallelism, const void *pwd, - const size_t pwdlen, const void *salt, - const size_t saltlen, void *hash, - const size_t hashlen, - const uint32_t version ); - -ARGON2_PUBLIC int argon2d_hash_encoded(const uint32_t t_cost, - const uint32_t m_cost, - const uint32_t parallelism, - const void *pwd, const size_t pwdlen, - const void *salt, const size_t saltlen, - const size_t hashlen, char *encoded, - const size_t encodedlen, - const uint32_t version ); - -ARGON2_PUBLIC int argon2d_hash_raw(const uint32_t t_cost, const uint32_t m_cost, - const uint32_t parallelism, const void *pwd, - const size_t pwdlen, const void *salt, - const size_t saltlen, void *hash, - const size_t hashlen, - const uint32_t version ); - -ARGON2_PUBLIC int argon2id_hash_encoded(const uint32_t t_cost, - const uint32_t m_cost, - const uint32_t parallelism, - const void *pwd, const size_t pwdlen, - const void *salt, const size_t saltlen, - const size_t hashlen, char *encoded, - const size_t encodedlen, - const uint32_t version ); - -ARGON2_PUBLIC int argon2id_hash_raw(const uint32_t t_cost, - const uint32_t m_cost, - const uint32_t parallelism, const void *pwd, - const size_t pwdlen, const void *salt, - const size_t saltlen, void *hash, - const size_t hashlen, - const uint32_t version ); - -/* generic function underlying the above ones */ -ARGON2_PUBLIC int argon2_hash(const uint32_t t_cost, const uint32_t m_cost, - const uint32_t parallelism, const void *pwd, - const size_t pwdlen, const void *salt, - const size_t saltlen, void *hash, - const size_t hashlen, char *encoded, - const size_t encodedlen, argon2_type type, - const uint32_t version 
); - -/** - * Verifies a password against an encoded string - * Encoded string is restricted as in validate_inputs() - * @param encoded String encoding parameters, salt, hash - * @param pwd Pointer to password - * @pre Returns ARGON2_OK if successful - */ -ARGON2_PUBLIC int argon2i_verify(const char *encoded, const void *pwd, - const size_t pwdlen); - -ARGON2_PUBLIC int argon2d_verify(const char *encoded, const void *pwd, - const size_t pwdlen); - -ARGON2_PUBLIC int argon2id_verify(const char *encoded, const void *pwd, - const size_t pwdlen); - -/* generic function underlying the above ones */ -ARGON2_PUBLIC int argon2_verify(const char *encoded, const void *pwd, - const size_t pwdlen, argon2_type type); - -/** - * Argon2d: Version of Argon2 that picks memory blocks depending - * on the password and salt. Only for side-channel-free - * environment!! - ***** - * @param context Pointer to current Argon2 context - * @return Zero if successful, a non zero error code otherwise - */ -ARGON2_PUBLIC int argon2d_ctx(argon2_context *context); - -/** - * Argon2i: Version of Argon2 that picks memory blocks - * independent on the password and salt. Good for side-channels, - * but worse w.r.t. tradeoff attacks if only one pass is used. - ***** - * @param context Pointer to current Argon2 context - * @return Zero if successful, a non zero error code otherwise - */ -ARGON2_PUBLIC int argon2i_ctx(argon2_context *context); - -/** - * Argon2id: Version of Argon2 where the first half-pass over memory is - * password-independent, the rest are password-dependent (on the password and - * salt). OK against side channels (they reduce to 1/2-pass Argon2i), and - * better with w.r.t. tradeoff attacks (similar to Argon2d). 
- ***** - * @param context Pointer to current Argon2 context - * @return Zero if successful, a non zero error code otherwise - */ -ARGON2_PUBLIC int argon2id_ctx(argon2_context *context); - -/** - * Verify if a given password is correct for Argon2d hashing - * @param context Pointer to current Argon2 context - * @param hash The password hash to verify. The length of the hash is - * specified by the context outlen member - * @return Zero if successful, a non zero error code otherwise - */ -ARGON2_PUBLIC int argon2d_verify_ctx(argon2_context *context, const char *hash); - -/** - * Verify if a given password is correct for Argon2i hashing - * @param context Pointer to current Argon2 context - * @param hash The password hash to verify. The length of the hash is - * specified by the context outlen member - * @return Zero if successful, a non zero error code otherwise - */ -ARGON2_PUBLIC int argon2i_verify_ctx(argon2_context *context, const char *hash); - -/** - * Verify if a given password is correct for Argon2id hashing - * @param context Pointer to current Argon2 context - * @param hash The password hash to verify. 
The length of the hash is - * specified by the context outlen member - * @return Zero if successful, a non zero error code otherwise - */ -ARGON2_PUBLIC int argon2id_verify_ctx(argon2_context *context, - const char *hash); - -/* generic function underlying the above ones */ -ARGON2_PUBLIC int argon2_verify_ctx(argon2_context *context, const char *hash, - argon2_type type); - -/** - * Get the associated error message for given error code - * @return The error message associated with the given error code - */ -ARGON2_PUBLIC const char *argon2_error_message(int error_code); - -/** - * Returns the encoded hash length for the given input parameters - * @param t_cost Number of iterations - * @param m_cost Memory usage in kibibytes - * @param parallelism Number of threads; used to compute lanes - * @param saltlen Salt size in bytes - * @param hashlen Hash size in bytes - * @param type The argon2_type that we want the encoded length for - * @return The encoded hash length in bytes - */ -ARGON2_PUBLIC size_t argon2_encodedlen(uint32_t t_cost, uint32_t m_cost, - uint32_t parallelism, uint32_t saltlen, - uint32_t hashlen, argon2_type type); - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/algo/argon2/argon2d/argon2d/argon2d_thread.c b/algo/argon2/argon2d/argon2d/argon2d_thread.c deleted file mode 100644 index 41eca42..0000000 --- a/algo/argon2/argon2d/argon2d/argon2d_thread.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Argon2 reference source code package - reference C implementations - * - * Copyright 2015 - * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves - * - * You may use this work under the terms of a Creative Commons CC0 1.0 - * License/Waiver or the Apache Public License 2.0, at your option. 
The terms of - * these licenses can be found at: - * - * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 - * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 - * - * You should have received a copy of both of these licenses along with this - * software. If not, they may be obtained at the above URLs. - */ - -#if !defined(ARGON2_NO_THREADS) - -#include "argon2d_thread.h" -#if defined(_WIN32) -#include -#endif - -int argon2_thread_create(argon2_thread_handle_t *handle, - argon2_thread_func_t func, void *args) { - if (NULL == handle || func == NULL) { - return -1; - } -#if defined(_WIN32) - *handle = _beginthreadex(NULL, 0, func, args, 0, NULL); - return *handle != 0 ? 0 : -1; -#else - return pthread_create(handle, NULL, func, args); -#endif -} - -int argon2_thread_join(argon2_thread_handle_t handle) { -#if defined(_WIN32) - if (WaitForSingleObject((HANDLE)handle, INFINITE) == WAIT_OBJECT_0) { - return CloseHandle((HANDLE)handle) != 0 ? 0 : -1; - } - return -1; -#else - return pthread_join(handle, NULL); -#endif -} - -void argon2_thread_exit(void) { -#if defined(_WIN32) - _endthreadex(0); -#else - pthread_exit(NULL); -#endif -} - -#endif /* ARGON2_NO_THREADS */ diff --git a/algo/argon2/argon2d/argon2d/argon2d_thread.h b/algo/argon2/argon2d/argon2d/argon2d_thread.h deleted file mode 100644 index 49d8836..0000000 --- a/algo/argon2/argon2d/argon2d/argon2d_thread.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Argon2 reference source code package - reference C implementations - * - * Copyright 2015 - * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves - * - * You may use this work under the terms of a Creative Commons CC0 1.0 - * License/Waiver or the Apache Public License 2.0, at your option. 
The terms of - * these licenses can be found at: - * - * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 - * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 - * - * You should have received a copy of both of these licenses along with this - * software. If not, they may be obtained at the above URLs. - */ - -#ifndef ARGON2_THREAD_H -#define ARGON2_THREAD_H - -#if !defined(ARGON2_NO_THREADS) - -/* - Here we implement an abstraction layer for the simpĺe requirements - of the Argon2 code. We only require 3 primitives---thread creation, - joining, and termination---so full emulation of the pthreads API - is unwarranted. Currently we wrap pthreads and Win32 threads. - - The API defines 2 types: the function pointer type, - argon2_thread_func_t, - and the type of the thread handle---argon2_thread_handle_t. -*/ -#if defined(_WIN32) -#include -typedef unsigned(__stdcall *argon2_thread_func_t)(void *); -typedef uintptr_t argon2_thread_handle_t; -#else -#include -typedef void *(*argon2_thread_func_t)(void *); -typedef pthread_t argon2_thread_handle_t; -#endif - -/* Creates a thread - * @param handle pointer to a thread handle, which is the output of this - * function. Must not be NULL. - * @param func A function pointer for the thread's entry point. Must not be - * NULL. - * @param args Pointer that is passed as an argument to @func. May be NULL. - * @return 0 if @handle and @func are valid pointers and a thread is successfully - * created. - */ -int argon2_thread_create(argon2_thread_handle_t *handle, - argon2_thread_func_t func, void *args); - -/* Waits for a thread to terminate - * @param handle Handle to a thread created with argon2_thread_create. - * @return 0 if @handle is a valid handle, and joining completed successfully. -*/ -int argon2_thread_join(argon2_thread_handle_t handle); - -/* Terminate the current thread. Must be run inside a thread created by - * argon2_thread_create. 
-*/ -void argon2_thread_exit(void); - -#endif /* ARGON2_NO_THREADS */ -#endif diff --git a/algo/argon2/argon2d/argon2d/core.c b/algo/argon2/argon2d/argon2d/core.c deleted file mode 100644 index e222648..0000000 --- a/algo/argon2/argon2d/argon2d/core.c +++ /dev/null @@ -1,635 +0,0 @@ -/* - * Argon2 reference source code package - reference C implementations - * - * Copyright 2015 - * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves - * - * You may use this work under the terms of a Creative Commons CC0 1.0 - * License/Waiver or the Apache Public License 2.0, at your option. The terms of - * these licenses can be found at: - * - * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 - * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 - * - * You should have received a copy of both of these licenses along with this - * software. If not, they may be obtained at the above URLs. - */ - -/*For memory wiping*/ -#ifdef _MSC_VER -#include -#include /* For SecureZeroMemory */ -#endif -#if defined __STDC_LIB_EXT1__ -#define __STDC_WANT_LIB_EXT1__ 1 -#endif -#define VC_GE_2005(version) (version >= 1400) - -#include -#include -#include - -#include "core.h" -#include "argon2d_thread.h" -#include "../blake2/blake2.h" -#include "../blake2/blake2-impl.h" - -#ifdef GENKAT -#include "genkat.h" -#endif - -#if defined(__clang__) -#if __has_attribute(optnone) -#define NOT_OPTIMIZED __attribute__((optnone)) -#endif -#elif defined(__GNUC__) -#define GCC_VERSION \ - (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) -#if GCC_VERSION >= 40400 -#define NOT_OPTIMIZED __attribute__((optimize("O0"))) -#endif -#endif -#ifndef NOT_OPTIMIZED -#define NOT_OPTIMIZED -#endif - -/***************Instance and Position constructors**********/ -void init_block_value(block *b, uint8_t in) { memset(b->v, in, sizeof(b->v)); } - -void copy_block(block *dst, const block *src) { - memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_QWORDS_IN_BLOCK); 
-} - -void xor_block(block *dst, const block *src) { - int i; - for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) { - dst->v[i] ^= src->v[i]; - } -} - -static void load_block(block *dst, const void *input) { - unsigned i; - for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) { - dst->v[i] = load64((const uint8_t *)input + i * sizeof(dst->v[i])); - } -} - -static void store_block(void *output, const block *src) { - unsigned i; - for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) { - store64((uint8_t *)output + i * sizeof(src->v[i]), src->v[i]); - } -} - -/***************Memory functions*****************/ - -int allocate_memory(const argon2_context *context, uint8_t **memory, - size_t num, size_t size) { - size_t memory_size = num*size; - if (memory == NULL) { - return ARGON2_MEMORY_ALLOCATION_ERROR; - } - - /* 1. Check for multiplication overflow */ - if (size != 0 && memory_size / size != num) { - return ARGON2_MEMORY_ALLOCATION_ERROR; - } - - /* 2. Try to allocate with appropriate allocator */ - if (context->allocate_cbk) { - (context->allocate_cbk)(memory, memory_size); - } else { - *memory = malloc(memory_size); - } - - if (*memory == NULL) { - return ARGON2_MEMORY_ALLOCATION_ERROR; - } - - return ARGON2_OK; -} - -void free_memory(const argon2_context *context, uint8_t *memory, - size_t num, size_t size) { - size_t memory_size = num*size; -// clear_internal_memory(memory, memory_size); - if (context->free_cbk) { - (context->free_cbk)(memory, memory_size); - } else { - free(memory); - } -} - -void NOT_OPTIMIZED secure_wipe_memory(void *v, size_t n) { -#if defined(_MSC_VER) && VC_GE_2005(_MSC_VER) - SecureZeroMemory(v, n); -#elif defined memset_s - memset_s(v, n, 0, n); -#elif defined(__OpenBSD__) - explicit_bzero(v, n); -#else - static void *(*const volatile memset_sec)(void *, int, size_t) = &memset; - memset_sec(v, 0, n); -#endif -} - -/* Memory clear flag defaults to true. 
*/ -int FLAG_clear_internal_memory = 0; -void clear_internal_memory(void *v, size_t n) { - if (FLAG_clear_internal_memory && v) { -// secure_wipe_memory(v, n); - } -} - -void finalize(const argon2_context *context, argon2_instance_t *instance) { - if (context != NULL && instance != NULL) { - block blockhash; - uint32_t l; - - copy_block(&blockhash, instance->memory + instance->lane_length - 1); - - /* XOR the last blocks */ - for (l = 1; l < instance->lanes; ++l) { - uint32_t last_block_in_lane = - l * instance->lane_length + (instance->lane_length - 1); - xor_block(&blockhash, instance->memory + last_block_in_lane); - } - - /* Hash the result */ - { - uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE]; - store_block(blockhash_bytes, &blockhash); - blake2b_long(context->out, context->outlen, blockhash_bytes, - ARGON2_BLOCK_SIZE); - /* clear blockhash and blockhash_bytes */ - clear_internal_memory(blockhash.v, ARGON2_BLOCK_SIZE); - clear_internal_memory(blockhash_bytes, ARGON2_BLOCK_SIZE); - } - -#ifdef GENKAT - print_tag(context->out, context->outlen); -#endif - - free_memory(context, (uint8_t *)instance->memory, - instance->memory_blocks, sizeof(block)); - } -} - -uint32_t index_alpha(const argon2_instance_t *instance, - const argon2_position_t *position, uint32_t pseudo_rand, - int same_lane) { - /* - * Pass 0: - * This lane : all already finished segments plus already constructed - * blocks in this segment - * Other lanes : all already finished segments - * Pass 1+: - * This lane : (SYNC_POINTS - 1) last segments plus already constructed - * blocks in this segment - * Other lanes : (SYNC_POINTS - 1) last segments - */ - uint32_t reference_area_size; - uint64_t relative_position; - uint32_t start_position, absolute_position; - - if (0 == position->pass) { - /* First pass */ - if (0 == position->slice) { - /* First slice */ - reference_area_size = - position->index - 1; /* all but the previous */ - } else { - if (same_lane) { - /* The same lane => add current segment */ - 
reference_area_size = - position->slice * instance->segment_length + - position->index - 1; - } else { - reference_area_size = - position->slice * instance->segment_length + - ((position->index == 0) ? (-1) : 0); - } - } - } else { - /* Second pass */ - if (same_lane) { - reference_area_size = instance->lane_length - - instance->segment_length + position->index - - 1; - } else { - reference_area_size = instance->lane_length - - instance->segment_length + - ((position->index == 0) ? (-1) : 0); - } - } - - /* 1.2.4. Mapping pseudo_rand to 0.. and produce - * relative position */ - relative_position = pseudo_rand; - relative_position = relative_position * relative_position >> 32; - relative_position = reference_area_size - 1 - - (reference_area_size * relative_position >> 32); - - /* 1.2.5 Computing starting position */ - start_position = 0; - - if (0 != position->pass) { - start_position = (position->slice == ARGON2_SYNC_POINTS - 1) - ? 0 - : (position->slice + 1) * instance->segment_length; - } - - /* 1.2.6. 
Computing absolute position */ - absolute_position = (start_position + relative_position) % - instance->lane_length; /* absolute position */ - return absolute_position; -} - -/* Single-threaded version for p=1 case */ -static int fill_memory_blocks_st(argon2_instance_t *instance) { - uint32_t r, s, l; - - for (r = 0; r < instance->passes; ++r) { - for (s = 0; s < ARGON2_SYNC_POINTS; ++s) { - for (l = 0; l < instance->lanes; ++l) { - argon2_position_t position = {r, l, (uint8_t)s, 0}; - fill_segment(instance, position); - } - } -#ifdef GENKAT - internal_kat(instance, r); /* Print all memory blocks */ -#endif - } - return ARGON2_OK; -} - -#if !defined(ARGON2_NO_THREADS) - -#ifdef _WIN32 -static unsigned __stdcall fill_segment_thr(void *thread_data) -#else -static void *fill_segment_thr(void *thread_data) -#endif -{ - argon2_thread_data *my_data = thread_data; - fill_segment(my_data->instance_ptr, my_data->pos); - argon2_thread_exit(); - return 0; -} - -/* Multi-threaded version for p > 1 case */ -static int fill_memory_blocks_mt(argon2_instance_t *instance) { - uint32_t r, s; - argon2_thread_handle_t *thread = NULL; - argon2_thread_data *thr_data = NULL; - int rc = ARGON2_OK; - - /* 1. Allocating space for threads */ - thread = calloc(instance->lanes, sizeof(argon2_thread_handle_t)); - if (thread == NULL) { - rc = ARGON2_MEMORY_ALLOCATION_ERROR; - goto fail; - } - - thr_data = calloc(instance->lanes, sizeof(argon2_thread_data)); - if (thr_data == NULL) { - rc = ARGON2_MEMORY_ALLOCATION_ERROR; - goto fail; - } - - for (r = 0; r < instance->passes; ++r) { - for (s = 0; s < ARGON2_SYNC_POINTS; ++s) { - uint32_t l; - - /* 2. 
Calling threads */ - for (l = 0; l < instance->lanes; ++l) { - argon2_position_t position; - - /* 2.1 Join a thread if limit is exceeded */ - if (l >= instance->threads) { - if (argon2_thread_join(thread[l - instance->threads])) { - rc = ARGON2_THREAD_FAIL; - goto fail; - } - } - - /* 2.2 Create thread */ - position.pass = r; - position.lane = l; - position.slice = (uint8_t)s; - position.index = 0; - thr_data[l].instance_ptr = - instance; /* preparing the thread input */ - memcpy(&(thr_data[l].pos), &position, - sizeof(argon2_position_t)); - if (argon2_thread_create(&thread[l], &fill_segment_thr, - (void *)&thr_data[l])) { - rc = ARGON2_THREAD_FAIL; - goto fail; - } - - /* fill_segment(instance, position); */ - /*Non-thread equivalent of the lines above */ - } - - /* 3. Joining remaining threads */ - for (l = instance->lanes - instance->threads; l < instance->lanes; - ++l) { - if (argon2_thread_join(thread[l])) { - rc = ARGON2_THREAD_FAIL; - goto fail; - } - } - } - -#ifdef GENKAT - internal_kat(instance, r); /* Print all memory blocks */ -#endif - } - -fail: - if (thread != NULL) { - free(thread); - } - if (thr_data != NULL) { - free(thr_data); - } - return rc; -} - -#endif /* ARGON2_NO_THREADS */ - -int fill_memory_blocks(argon2_instance_t *instance) { - if (instance == NULL || instance->lanes == 0) { - return ARGON2_INCORRECT_PARAMETER; - } -#if defined(ARGON2_NO_THREADS) - return fill_memory_blocks_st(instance); -#else - return instance->threads == 1 ? 
- fill_memory_blocks_st(instance) : fill_memory_blocks_mt(instance); -#endif -} - -int validate_inputs(const argon2_context *context) { - if (NULL == context) { - return ARGON2_INCORRECT_PARAMETER; - } - - if (NULL == context->out) { - return ARGON2_OUTPUT_PTR_NULL; - } - - /* Validate output length */ - if (ARGON2_MIN_OUTLEN > context->outlen) { - return ARGON2_OUTPUT_TOO_SHORT; - } - - if (ARGON2_MAX_OUTLEN < context->outlen) { - return ARGON2_OUTPUT_TOO_LONG; - } - - /* Validate password (required param) */ - if (NULL == context->pwd) { - if (0 != context->pwdlen) { - return ARGON2_PWD_PTR_MISMATCH; - } - } - - if (ARGON2_MIN_PWD_LENGTH > context->pwdlen) { - return ARGON2_PWD_TOO_SHORT; - } - - if (ARGON2_MAX_PWD_LENGTH < context->pwdlen) { - return ARGON2_PWD_TOO_LONG; - } - - /* Validate salt (required param) */ - if (NULL == context->salt) { - if (0 != context->saltlen) { - return ARGON2_SALT_PTR_MISMATCH; - } - } - - if (ARGON2_MIN_SALT_LENGTH > context->saltlen) { - return ARGON2_SALT_TOO_SHORT; - } - - if (ARGON2_MAX_SALT_LENGTH < context->saltlen) { - return ARGON2_SALT_TOO_LONG; - } - - /* Validate secret (optional param) */ - if (NULL == context->secret) { - if (0 != context->secretlen) { - return ARGON2_SECRET_PTR_MISMATCH; - } - } else { - if (ARGON2_MIN_SECRET > context->secretlen) { - return ARGON2_SECRET_TOO_SHORT; - } - if (ARGON2_MAX_SECRET < context->secretlen) { - return ARGON2_SECRET_TOO_LONG; - } - } - - /* Validate associated data (optional param) */ - if (NULL == context->ad) { - if (0 != context->adlen) { - return ARGON2_AD_PTR_MISMATCH; - } - } else { - if (ARGON2_MIN_AD_LENGTH > context->adlen) { - return ARGON2_AD_TOO_SHORT; - } - if (ARGON2_MAX_AD_LENGTH < context->adlen) { - return ARGON2_AD_TOO_LONG; - } - } - - /* Validate memory cost */ - if (ARGON2_MIN_MEMORY > context->m_cost) { - return ARGON2_MEMORY_TOO_LITTLE; - } - - if (ARGON2_MAX_MEMORY < context->m_cost) { - return ARGON2_MEMORY_TOO_MUCH; - } - - if (context->m_cost < 8 * 
context->lanes) { - return ARGON2_MEMORY_TOO_LITTLE; - } - - /* Validate time cost */ - if (ARGON2_MIN_TIME > context->t_cost) { - return ARGON2_TIME_TOO_SMALL; - } - - if (ARGON2_MAX_TIME < context->t_cost) { - return ARGON2_TIME_TOO_LARGE; - } - - /* Validate lanes */ - if (ARGON2_MIN_LANES > context->lanes) { - return ARGON2_LANES_TOO_FEW; - } - - if (ARGON2_MAX_LANES < context->lanes) { - return ARGON2_LANES_TOO_MANY; - } - - /* Validate threads */ - if (ARGON2_MIN_THREADS > context->threads) { - return ARGON2_THREADS_TOO_FEW; - } - - if (ARGON2_MAX_THREADS < context->threads) { - return ARGON2_THREADS_TOO_MANY; - } - - if (NULL != context->allocate_cbk && NULL == context->free_cbk) { - return ARGON2_FREE_MEMORY_CBK_NULL; - } - - if (NULL == context->allocate_cbk && NULL != context->free_cbk) { - return ARGON2_ALLOCATE_MEMORY_CBK_NULL; - } - - return ARGON2_OK; -} - -void fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance) { - uint32_t l; - /* Make the first and second block in each lane as G(H0||0||i) or - G(H0||1||i) */ - uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE]; - for (l = 0; l < instance->lanes; ++l) { - - store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 0); - store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH + 4, l); - blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, blockhash, - ARGON2_PREHASH_SEED_LENGTH); - load_block(&instance->memory[l * instance->lane_length + 0], - blockhash_bytes); - - store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 1); - blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, blockhash, - ARGON2_PREHASH_SEED_LENGTH); - load_block(&instance->memory[l * instance->lane_length + 1], - blockhash_bytes); - } - clear_internal_memory(blockhash_bytes, ARGON2_BLOCK_SIZE); -} - -void initial_hash(uint8_t *blockhash, argon2_context *context, - argon2_type type) { - blake2b_state BlakeHash; - uint8_t value[sizeof(uint32_t)]; - - if (NULL == context || NULL == blockhash) { - return; - } - - blake2b_init(&BlakeHash, 
ARGON2_PREHASH_DIGEST_LENGTH); - - store32(&value, context->lanes); - blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); - - store32(&value, context->outlen); - blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); - - store32(&value, context->m_cost); - blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); - - store32(&value, context->t_cost); - blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); - -// store32(&value, ARGON2_VERSION_NUMBER); - store32(&value, context->version); - blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); - - store32(&value, (uint32_t)type); - blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); - - store32(&value, context->pwdlen); - blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); - - if (context->pwd != NULL) { - blake2b_update(&BlakeHash, (const uint8_t *)context->pwd, - context->pwdlen); - - if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) { -// secure_wipe_memory(context->pwd, context->pwdlen); - context->pwdlen = 0; - } - } - - store32(&value, context->saltlen); - blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); - - if (context->salt != NULL) { - blake2b_update(&BlakeHash, (const uint8_t *)context->salt, - context->saltlen); - } - - store32(&value, context->secretlen); - blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); - - if (context->secret != NULL) { - blake2b_update(&BlakeHash, (const uint8_t *)context->secret, - context->secretlen); - - if (context->flags & ARGON2_FLAG_CLEAR_SECRET) { -// secure_wipe_memory(context->secret, context->secretlen); - context->secretlen = 0; - } - } - - store32(&value, context->adlen); - blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); - - if (context->ad != NULL) { - blake2b_update(&BlakeHash, (const uint8_t *)context->ad, - context->adlen); - } - - blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH); -} - -int 
initialize(argon2_instance_t *instance, argon2_context *context) { - uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH]; - int result = ARGON2_OK; - - if (instance == NULL || context == NULL) - return ARGON2_INCORRECT_PARAMETER; - instance->context_ptr = context; - - /* 1. Memory allocation */ - result = allocate_memory(context, (uint8_t **)&(instance->memory), - instance->memory_blocks, sizeof(block)); - if (result != ARGON2_OK) { - return result; - } - - /* 2. Initial hashing */ - /* H_0 + 8 extra bytes to produce the first blocks */ - /* uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH]; */ - /* Hashing all inputs */ - initial_hash(blockhash, context, instance->type); - /* Zeroing 8 extra bytes */ - clear_internal_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, - ARGON2_PREHASH_SEED_LENGTH - - ARGON2_PREHASH_DIGEST_LENGTH); - -#ifdef GENKAT - initial_kat(blockhash, context, instance->type); -#endif - - /* 3. Creating first blocks, we always have at least two blocks in a slice - */ - fill_first_blocks(blockhash, instance); - /* Clearing the hash */ - clear_internal_memory(blockhash, ARGON2_PREHASH_SEED_LENGTH); - - return ARGON2_OK; -} diff --git a/algo/argon2/argon2d/argon2d/core.h b/algo/argon2/argon2d/argon2d/core.h deleted file mode 100644 index 78000ba..0000000 --- a/algo/argon2/argon2d/argon2d/core.h +++ /dev/null @@ -1,228 +0,0 @@ -/* - * Argon2 reference source code package - reference C implementations - * - * Copyright 2015 - * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves - * - * You may use this work under the terms of a Creative Commons CC0 1.0 - * License/Waiver or the Apache Public License 2.0, at your option. The terms of - * these licenses can be found at: - * - * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 - * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 - * - * You should have received a copy of both of these licenses along with this - * software. 
If not, they may be obtained at the above URLs. - */ - -#ifndef ARGON2_CORE_H -#define ARGON2_CORE_H - -#include "argon2.h" - -#define CONST_CAST(x) (x)(uintptr_t) - -/**********************Argon2 internal constants*******************************/ - -enum argon2_core_constants { - /* Memory block size in bytes */ - ARGON2_BLOCK_SIZE = 1024, - ARGON2_QWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 8, - ARGON2_OWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 16, - ARGON2_HWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 32, - ARGON2_512BIT_WORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 64, - - /* Number of pseudo-random values generated by one call to Blake in Argon2i - to - generate reference block positions */ - ARGON2_ADDRESSES_IN_BLOCK = 128, - - /* Pre-hashing digest length and its extension*/ - ARGON2_PREHASH_DIGEST_LENGTH = 64, - ARGON2_PREHASH_SEED_LENGTH = 72 -}; - -/*************************Argon2 internal data types***********************/ - -/* - * Structure for the (1KB) memory block implemented as 128 64-bit words. - * Memory blocks can be copied, XORed. Internal words can be accessed by [] (no - * bounds checking). - */ -typedef struct block_ { uint64_t v[ARGON2_QWORDS_IN_BLOCK]; } block; - -/*****************Functions that work with the block******************/ - -/* Initialize each byte of the block with @in */ -void init_block_value(block *b, uint8_t in); - -/* Copy block @src to block @dst */ -void copy_block(block *dst, const block *src); - -/* XOR @src onto @dst bytewise */ -void xor_block(block *dst, const block *src); - -/* - * Argon2 instance: memory pointer, number of passes, amount of memory, type, - * and derived values. 
- * Used to evaluate the number and location of blocks to construct in each - * thread - */ -typedef struct Argon2_instance_t { - block *memory; /* Memory pointer */ - uint32_t version; - uint32_t passes; /* Number of passes */ - uint32_t memory_blocks; /* Number of blocks in memory */ - uint32_t segment_length; - uint32_t lane_length; - uint32_t lanes; - uint32_t threads; - argon2_type type; - int print_internals; /* whether to print the memory blocks */ - argon2_context *context_ptr; /* points back to original context */ -} argon2_instance_t; - -/* - * Argon2 position: where we construct the block right now. Used to distribute - * work between threads. - */ -typedef struct Argon2_position_t { - uint32_t pass; - uint32_t lane; - uint8_t slice; - uint32_t index; -} argon2_position_t; - -/*Struct that holds the inputs for thread handling FillSegment*/ -typedef struct Argon2_thread_data { - argon2_instance_t *instance_ptr; - argon2_position_t pos; -} argon2_thread_data; - -/*************************Argon2 core functions********************************/ - -/* Allocates memory to the given pointer, uses the appropriate allocator as - * specified in the context. Total allocated memory is num*size. - * @param context argon2_context which specifies the allocator - * @param memory pointer to the pointer to the memory - * @param size the size in bytes for each element to be allocated - * @param num the number of elements to be allocated - * @return ARGON2_OK if @memory is a valid pointer and memory is allocated - */ -int allocate_memory(const argon2_context *context, uint8_t **memory, - size_t num, size_t size); - -/* - * Frees memory at the given pointer, uses the appropriate deallocator as - * specified in the context. Also cleans the memory using clear_internal_memory. 
- * @param context argon2_context which specifies the deallocator - * @param memory pointer to buffer to be freed - * @param size the size in bytes for each element to be deallocated - * @param num the number of elements to be deallocated - */ -void free_memory(const argon2_context *context, uint8_t *memory, - size_t num, size_t size); - -/* Function that securely cleans the memory. This ignores any flags set - * regarding clearing memory. Usually one just calls clear_internal_memory. - * @param mem Pointer to the memory - * @param s Memory size in bytes - */ -void secure_wipe_memory(void *v, size_t n); - -/* Function that securely clears the memory if FLAG_clear_internal_memory is - * set. If the flag isn't set, this function does nothing. - * @param mem Pointer to the memory - * @param s Memory size in bytes - */ -void clear_internal_memory(void *v, size_t n); - -/* - * Computes absolute position of reference block in the lane following a skewed - * distribution and using a pseudo-random value as input - * @param instance Pointer to the current instance - * @param position Pointer to the current position - * @param pseudo_rand 32-bit pseudo-random value used to determine the position - * @param same_lane Indicates if the block will be taken from the current lane. 
- * If so we can reference the current segment - * @pre All pointers must be valid - */ -uint32_t index_alpha(const argon2_instance_t *instance, - const argon2_position_t *position, uint32_t pseudo_rand, - int same_lane); - -/* - * Function that validates all inputs against predefined restrictions and return - * an error code - * @param context Pointer to current Argon2 context - * @return ARGON2_OK if everything is all right, otherwise one of error codes - * (all defined in - */ -int validate_inputs(const argon2_context *context); - -/* - * Hashes all the inputs into @a blockhash[PREHASH_DIGEST_LENGTH], clears - * password and secret if needed - * @param context Pointer to the Argon2 internal structure containing memory - * pointer, and parameters for time and space requirements. - * @param blockhash Buffer for pre-hashing digest - * @param type Argon2 type - * @pre @a blockhash must have at least @a PREHASH_DIGEST_LENGTH bytes - * allocated - */ -void initial_hash(uint8_t *blockhash, argon2_context *context, - argon2_type type); - -/* - * Function creates first 2 blocks per lane - * @param instance Pointer to the current instance - * @param blockhash Pointer to the pre-hashing digest - * @pre blockhash must point to @a PREHASH_SEED_LENGTH allocated values - */ -void fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance); - -/* - * Function allocates memory, hashes the inputs with Blake, and creates first - * two blocks. Returns the pointer to the main memory with 2 blocks per lane - * initialized - * @param context Pointer to the Argon2 internal structure containing memory - * pointer, and parameters for time and space requirements. - * @param instance Current Argon2 instance - * @return Zero if successful, -1 if memory failed to allocate. @context->state - * will be modified if successful. - */ -int initialize(argon2_instance_t *instance, argon2_context *context); - -/* - * XORing the last block of each lane, hashing it, making the tag. 
Deallocates - * the memory. - * @param context Pointer to current Argon2 context (use only the out parameters - * from it) - * @param instance Pointer to current instance of Argon2 - * @pre instance->state must point to necessary amount of memory - * @pre context->out must point to outlen bytes of memory - * @pre if context->free_cbk is not NULL, it should point to a function that - * deallocates memory - */ -void finalize(const argon2_context *context, argon2_instance_t *instance); - -/* - * Function that fills the segment using previous segments also from other - * threads - * @param context current context - * @param instance Pointer to the current instance - * @param position Current position - * @pre all block pointers must be valid - */ -void fill_segment(const argon2_instance_t *instance, - argon2_position_t position); - -/* - * Function that fills the entire memory t_cost times based on the first two - * blocks in each lane - * @param instance Pointer to the current instance - * @return ARGON2_OK if successful, @context->state - */ -int fill_memory_blocks(argon2_instance_t *instance); - -#endif diff --git a/algo/argon2/argon2d/argon2d/encoding.c b/algo/argon2/argon2d/argon2d/encoding.c deleted file mode 100644 index 12cfda4..0000000 --- a/algo/argon2/argon2d/argon2d/encoding.c +++ /dev/null @@ -1,463 +0,0 @@ -/* - * Argon2 reference source code package - reference C implementations - * - * Copyright 2015 - * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves - * - * You may use this work under the terms of a Creative Commons CC0 1.0 - * License/Waiver or the Apache Public License 2.0, at your option. The terms of - * these licenses can be found at: - * - * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 - * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 - * - * You should have received a copy of both of these licenses along with this - * software. If not, they may be obtained at the above URLs. 
- */ - -#include -#include -#include -#include -#include "encoding.h" -#include "core.h" - -/* - * Example code for a decoder and encoder of "hash strings", with Argon2 - * parameters. - * - * This code comprises three sections: - * - * -- The first section contains generic Base64 encoding and decoding - * functions. It is conceptually applicable to any hash function - * implementation that uses Base64 to encode and decode parameters, - * salts and outputs. It could be made into a library, provided that - * the relevant functions are made public (non-static) and be given - * reasonable names to avoid collisions with other functions. - * - * -- The second section is specific to Argon2. It encodes and decodes - * the parameters, salts and outputs. It does not compute the hash - * itself. - * - * The code was originally written by Thomas Pornin , - * to whom comments and remarks may be sent. It is released under what - * should amount to Public Domain or its closest equivalent; the - * following mantra is supposed to incarnate that fact with all the - * proper legal rituals: - * - * --------------------------------------------------------------------- - * This file is provided under the terms of Creative Commons CC0 1.0 - * Public Domain Dedication. To the extent possible under law, the - * author (Thomas Pornin) has waived all copyright and related or - * neighboring rights to this file. This work is published from: Canada. - * --------------------------------------------------------------------- - * - * Copyright (c) 2015 Thomas Pornin - */ - -/* ==================================================================== */ -/* - * Common code; could be shared between different hash functions. - * - * Note: the Base64 functions below assume that uppercase letters (resp. - * lowercase letters) have consecutive numerical codes, that fit on 8 - * bits. All modern systems use ASCII-compatible charsets, where these - * properties are true. 
If you are stuck with a dinosaur of a system - * that still defaults to EBCDIC then you already have much bigger - * interoperability issues to deal with. - */ - -/* - * Some macros for constant-time comparisons. These work over values in - * the 0..255 range. Returned value is 0x00 on "false", 0xFF on "true". - */ -#define EQ(x, y) ((((0U - ((unsigned)(x) ^ (unsigned)(y))) >> 8) & 0xFF) ^ 0xFF) -#define GT(x, y) ((((unsigned)(y) - (unsigned)(x)) >> 8) & 0xFF) -#define GE(x, y) (GT(y, x) ^ 0xFF) -#define LT(x, y) GT(y, x) -#define LE(x, y) GE(y, x) - -/* - * Convert value x (0..63) to corresponding Base64 character. - */ -static int b64_byte_to_char(unsigned x) { - return (LT(x, 26) & (x + 'A')) | - (GE(x, 26) & LT(x, 52) & (x + ('a' - 26))) | - (GE(x, 52) & LT(x, 62) & (x + ('0' - 52))) | (EQ(x, 62) & '+') | - (EQ(x, 63) & '/'); -} - -/* - * Convert character c to the corresponding 6-bit value. If character c - * is not a Base64 character, then 0xFF (255) is returned. - */ -static unsigned b64_char_to_byte(int c) { - unsigned x; - - x = (GE(c, 'A') & LE(c, 'Z') & (c - 'A')) | - (GE(c, 'a') & LE(c, 'z') & (c - ('a' - 26))) | - (GE(c, '0') & LE(c, '9') & (c - ('0' - 52))) | (EQ(c, '+') & 62) | - (EQ(c, '/') & 63); - return x | (EQ(x, 0) & (EQ(c, 'A') ^ 0xFF)); -} - -/* - * Convert some bytes to Base64. 'dst_len' is the length (in characters) - * of the output buffer 'dst'; if that buffer is not large enough to - * receive the result (including the terminating 0), then (size_t)-1 - * is returned. Otherwise, the zero-terminated Base64 string is written - * in the buffer, and the output length (counted WITHOUT the terminating - * zero) is returned. 
- */ -static size_t to_base64(char *dst, size_t dst_len, const void *src, - size_t src_len) { - size_t olen; - const unsigned char *buf; - unsigned acc, acc_len; - - olen = (src_len / 3) << 2; - switch (src_len % 3) { - case 2: - olen++; - /* fall through */ - case 1: - olen += 2; - break; - } - if (dst_len <= olen) { - return (size_t)-1; - } - acc = 0; - acc_len = 0; - buf = (const unsigned char *)src; - while (src_len-- > 0) { - acc = (acc << 8) + (*buf++); - acc_len += 8; - while (acc_len >= 6) { - acc_len -= 6; - *dst++ = (char)b64_byte_to_char((acc >> acc_len) & 0x3F); - } - } - if (acc_len > 0) { - *dst++ = (char)b64_byte_to_char((acc << (6 - acc_len)) & 0x3F); - } - *dst++ = 0; - return olen; -} - -/* - * Decode Base64 chars into bytes. The '*dst_len' value must initially - * contain the length of the output buffer '*dst'; when the decoding - * ends, the actual number of decoded bytes is written back in - * '*dst_len'. - * - * Decoding stops when a non-Base64 character is encountered, or when - * the output buffer capacity is exceeded. If an error occurred (output - * buffer is too small, invalid last characters leading to unprocessed - * buffered bits), then NULL is returned; otherwise, the returned value - * points to the first non-Base64 character in the source stream, which - * may be the terminating zero. - */ -static const char *from_base64(void *dst, size_t *dst_len, const char *src) { - size_t len; - unsigned char *buf; - unsigned acc, acc_len; - - buf = (unsigned char *)dst; - len = 0; - acc = 0; - acc_len = 0; - for (;;) { - unsigned d; - - d = b64_char_to_byte(*src); - if (d == 0xFF) { - break; - } - src++; - acc = (acc << 6) + d; - acc_len += 6; - if (acc_len >= 8) { - acc_len -= 8; - if ((len++) >= *dst_len) { - return NULL; - } - *buf++ = (acc >> acc_len) & 0xFF; - } - } - - /* - * If the input length is equal to 1 modulo 4 (which is - * invalid), then there will remain 6 unprocessed bits; - * otherwise, only 0, 2 or 4 bits are buffered. 
The buffered - * bits must also all be zero. - */ - if (acc_len > 4 || (acc & (((unsigned)1 << acc_len) - 1)) != 0) { - return NULL; - } - *dst_len = len; - return src; -} - -/* - * Decode decimal integer from 'str'; the value is written in '*v'. - * Returned value is a pointer to the next non-decimal character in the - * string. If there is no digit at all, or the value encoding is not - * minimal (extra leading zeros), or the value does not fit in an - * 'unsigned long', then NULL is returned. - */ -static const char *decode_decimal(const char *str, unsigned long *v) { - const char *orig; - unsigned long acc; - - acc = 0; - for (orig = str;; str++) { - int c; - - c = *str; - if (c < '0' || c > '9') { - break; - } - c -= '0'; - if (acc > (ULONG_MAX / 10)) { - return NULL; - } - acc *= 10; - if ((unsigned long)c > (ULONG_MAX - acc)) { - return NULL; - } - acc += (unsigned long)c; - } - if (str == orig || (*orig == '0' && str != (orig + 1))) { - return NULL; - } - *v = acc; - return str; -} - -/* ==================================================================== */ -/* - * Code specific to Argon2. - * - * The code below applies the following format: - * - * $argon2[$v=]$m=,t=,p=$$ - * - * where is either 'd', 'id', or 'i', is a decimal integer (positive, - * fits in an 'unsigned long'), and is Base64-encoded data (no '=' padding - * characters, no newline or whitespace). - * - * The last two binary chunks (encoded in Base64) are, in that order, - * the salt and the output. Both are required. The binary salt length and the - * output length must be in the allowed ranges defined in argon2.h. - * - * The ctx struct must contain buffers large enough to hold the salt and pwd - * when it is fed into decode_string. 
- */ - -int decode_string(argon2_context *ctx, const char *str, argon2_type type) { - -/* check for prefix */ -#define CC(prefix) \ - do { \ - size_t cc_len = strlen(prefix); \ - if (strncmp(str, prefix, cc_len) != 0) { \ - return ARGON2_DECODING_FAIL; \ - } \ - str += cc_len; \ - } while ((void)0, 0) - -/* optional prefix checking with supplied code */ -#define CC_opt(prefix, code) \ - do { \ - size_t cc_len = strlen(prefix); \ - if (strncmp(str, prefix, cc_len) == 0) { \ - str += cc_len; \ - { code; } \ - } \ - } while ((void)0, 0) - -/* Decoding prefix into decimal */ -#define DECIMAL(x) \ - do { \ - unsigned long dec_x; \ - str = decode_decimal(str, &dec_x); \ - if (str == NULL) { \ - return ARGON2_DECODING_FAIL; \ - } \ - (x) = dec_x; \ - } while ((void)0, 0) - - -/* Decoding prefix into uint32_t decimal */ -#define DECIMAL_U32(x) \ - do { \ - unsigned long dec_x; \ - str = decode_decimal(str, &dec_x); \ - if (str == NULL || dec_x > UINT32_MAX) { \ - return ARGON2_DECODING_FAIL; \ - } \ - (x) = (uint32_t)dec_x; \ - } while ((void)0, 0) - - -/* Decoding base64 into a binary buffer */ -#define BIN(buf, max_len, len) \ - do { \ - size_t bin_len = (max_len); \ - str = from_base64(buf, &bin_len, str); \ - if (str == NULL || bin_len > UINT32_MAX) { \ - return ARGON2_DECODING_FAIL; \ - } \ - (len) = (uint32_t)bin_len; \ - } while ((void)0, 0) - - size_t maxsaltlen = ctx->saltlen; - size_t maxoutlen = ctx->outlen; - int validation_result; - const char* type_string; - - /* We should start with the argon2_type we are using */ - type_string = argon2_type2string(type, 0); - if (!type_string) { - return ARGON2_INCORRECT_TYPE; - } - - CC("$"); - CC(type_string); - - /* Reading the version number if the default is suppressed */ - ctx->version = ARGON2_VERSION_10; - CC_opt("$v=", DECIMAL_U32(ctx->version)); - - CC("$m="); - DECIMAL_U32(ctx->m_cost); - CC(",t="); - DECIMAL_U32(ctx->t_cost); - CC(",p="); - DECIMAL_U32(ctx->lanes); - ctx->threads = ctx->lanes; - - CC("$"); - 
BIN(ctx->salt, maxsaltlen, ctx->saltlen); - CC("$"); - BIN(ctx->out, maxoutlen, ctx->outlen); - - /* The rest of the fields get the default values */ - ctx->secret = NULL; - ctx->secretlen = 0; - ctx->ad = NULL; - ctx->adlen = 0; - ctx->allocate_cbk = NULL; - ctx->free_cbk = NULL; - ctx->flags = ARGON2_DEFAULT_FLAGS; - - /* On return, must have valid context */ - validation_result = validate_inputs(ctx); - if (validation_result != ARGON2_OK) { - return validation_result; - } - - /* Can't have any additional characters */ - if (*str == 0) { - return ARGON2_OK; - } else { - return ARGON2_DECODING_FAIL; - } -#undef CC -#undef CC_opt -#undef DECIMAL -#undef BIN -} - -int encode_string(char *dst, size_t dst_len, argon2_context *ctx, - argon2_type type) { -#define SS(str) \ - do { \ - size_t pp_len = strlen(str); \ - if (pp_len >= dst_len) { \ - return ARGON2_ENCODING_FAIL; \ - } \ - memcpy(dst, str, pp_len + 1); \ - dst += pp_len; \ - dst_len -= pp_len; \ - } while ((void)0, 0) - -#define SX(x) \ - do { \ - char tmp[30]; \ - sprintf(tmp, "%lu", (unsigned long)(x)); \ - SS(tmp); \ - } while ((void)0, 0) - -#define SB(buf, len) \ - do { \ - size_t sb_len = to_base64(dst, dst_len, buf, len); \ - if (sb_len == (size_t)-1) { \ - return ARGON2_ENCODING_FAIL; \ - } \ - dst += sb_len; \ - dst_len -= sb_len; \ - } while ((void)0, 0) - - const char* type_string = argon2_type2string(type, 0); - int validation_result = validate_inputs(ctx); - - if (!type_string) { - return ARGON2_ENCODING_FAIL; - } - - if (validation_result != ARGON2_OK) { - return validation_result; - } - - - SS("$"); - SS(type_string); - - SS("$v="); - SX(ctx->version); - - SS("$m="); - SX(ctx->m_cost); - SS(",t="); - SX(ctx->t_cost); - SS(",p="); - SX(ctx->lanes); - - SS("$"); - SB(ctx->salt, ctx->saltlen); - - SS("$"); - SB(ctx->out, ctx->outlen); - return ARGON2_OK; - -#undef SS -#undef SX -#undef SB -} - -size_t b64len(uint32_t len) { - size_t olen = ((size_t)len / 3) << 2; - - switch (len % 3) { - case 2: - 
olen++; - /* fall through */ - case 1: - olen += 2; - break; - } - - return olen; -} - -size_t numlen(uint32_t num) { - size_t len = 1; - while (num >= 10) { - ++len; - num = num / 10; - } - return len; -} - diff --git a/algo/argon2/argon2d/argon2d/encoding.h b/algo/argon2/argon2d/argon2d/encoding.h deleted file mode 100644 index 7e83ec9..0000000 --- a/algo/argon2/argon2d/argon2d/encoding.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Argon2 reference source code package - reference C implementations - * - * Copyright 2015 - * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves - * - * You may use this work under the terms of a Creative Commons CC0 1.0 - * License/Waiver or the Apache Public License 2.0, at your option. The terms of - * these licenses can be found at: - * - * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 - * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 - * - * You should have received a copy of both of these licenses along with this - * software. If not, they may be obtained at the above URLs. - */ - -#ifndef ENCODING_H -#define ENCODING_H -#include "argon2.h" - -#define ARGON2_MAX_DECODED_LANES UINT32_C(255) -#define ARGON2_MIN_DECODED_SALT_LEN UINT32_C(8) -#define ARGON2_MIN_DECODED_OUT_LEN UINT32_C(12) - -/* -* encode an Argon2 hash string into the provided buffer. 'dst_len' -* contains the size, in characters, of the 'dst' buffer; if 'dst_len' -* is less than the number of required characters (including the -* terminating 0), then this function returns ARGON2_ENCODING_ERROR. -* -* on success, ARGON2_OK is returned. -*/ -int encode_string(char *dst, size_t dst_len, argon2_context *ctx, - argon2_type type); - -/* -* Decodes an Argon2 hash string into the provided structure 'ctx'. 
-* The only fields that must be set prior to this call are ctx.saltlen and -* ctx.outlen (which must be the maximal salt and out length values that are -* allowed), ctx.salt and ctx.out (which must be buffers of the specified -* length), and ctx.pwd and ctx.pwdlen which must hold a valid password. -* -* Invalid input string causes an error. On success, the ctx is valid and all -* fields have been initialized. -* -* Returned value is ARGON2_OK on success, other ARGON2_ codes on error. -*/ -int decode_string(argon2_context *ctx, const char *str, argon2_type type); - -/* Returns the length of the encoded byte stream with length len */ -size_t b64len(uint32_t len); - -/* Returns the length of the encoded number num */ -size_t numlen(uint32_t num); - -#endif diff --git a/algo/argon2/argon2d/argon2d/opt.c b/algo/argon2/argon2d/argon2d/opt.c deleted file mode 100644 index 87ff4cc..0000000 --- a/algo/argon2/argon2d/argon2d/opt.c +++ /dev/null @@ -1,359 +0,0 @@ -/* - * Argon2 reference source code package - reference C implementations - * - * Copyright 2015 - * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves - * - * You may use this work under the terms of a Creative Commons CC0 1.0 - * License/Waiver or the Apache Public License 2.0, at your option. The terms of - * these licenses can be found at: - * - * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 - * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 - * - * You should have received a copy of both of these licenses along with this - * software. If not, they may be obtained at the above URLs. - */ - -#include -#include -#include - -#include "argon2.h" -#include "core.h" - -#include "../blake2/blake2.h" -#include "../blake2/blamka-round-opt.h" - -/* - * Function fills a new memory block and optionally XORs the old block over the new one. - * Memory must be initialized. - * @param state Pointer to the just produced block. Content will be updated(!) 
- * @param ref_block Pointer to the reference block - * @param next_block Pointer to the block to be XORed over. May coincide with @ref_block - * @param with_xor Whether to XOR into the new block (1) or just overwrite (0) - * @pre all block pointers must be valid - */ - -#if defined(__AVX512F__) - -static void fill_block(__m512i *state, const block *ref_block, - block *next_block, int with_xor) { - __m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK]; - unsigned int i; - - if (with_xor) { - for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) { - state[i] = _mm512_xor_si512( - state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i)); - block_XY[i] = _mm512_xor_si512( - state[i], _mm512_loadu_si512((const __m512i *)next_block->v + i)); - } - } else { - for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) { - block_XY[i] = state[i] = _mm512_xor_si512( - state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i)); - } - } - - BLAKE2_ROUND_1( state[ 0], state[ 1], state[ 2], state[ 3], - state[ 4], state[ 5], state[ 6], state[ 7] ); - BLAKE2_ROUND_1( state[ 8], state[ 9], state[10], state[11], - state[12], state[13], state[14], state[15] ); - - BLAKE2_ROUND_2( state[ 0], state[ 2], state[ 4], state[ 6], - state[ 8], state[10], state[12], state[14] ); - BLAKE2_ROUND_2( state[ 1], state[ 3], state[ 5], state[ 7], - state[ 9], state[11], state[13], state[15] ); - -/* - for (i = 0; i < 2; ++i) { - BLAKE2_ROUND_1( - state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3], - state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]); - } - - for (i = 0; i < 2; ++i) { - BLAKE2_ROUND_2( - state[2 * 0 + i], state[2 * 1 + i], state[2 * 2 + i], state[2 * 3 + i], - state[2 * 4 + i], state[2 * 5 + i], state[2 * 6 + i], state[2 * 7 + i]); - } -*/ - - for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) { - state[i] = _mm512_xor_si512(state[i], block_XY[i]); - _mm512_storeu_si512((__m512i *)next_block->v + i, state[i]); - } -} - -#elif defined(__AVX2__) - 
-static void fill_block(__m256i *state, const block *ref_block, - block *next_block, int with_xor) { - __m256i block_XY[ARGON2_HWORDS_IN_BLOCK]; - unsigned int i; - - if (with_xor) { - for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) { - state[i] = _mm256_xor_si256( - state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i)); - block_XY[i] = _mm256_xor_si256( - state[i], _mm256_loadu_si256((const __m256i *)next_block->v + i)); - } - } else { - for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) { - block_XY[i] = state[i] = _mm256_xor_si256( - state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i)); - } - } - - BLAKE2_ROUND_1( state[ 0], state[ 4], state[ 1], state[ 5], - state[ 2], state[ 6], state[ 3], state[ 7] ); - BLAKE2_ROUND_1( state[ 8], state[12], state[ 9], state[13], - state[10], state[14], state[11], state[15] ); - BLAKE2_ROUND_1( state[16], state[20], state[17], state[21], - state[18], state[22], state[19], state[23] ); - BLAKE2_ROUND_1( state[24], state[28], state[25], state[29], - state[26], state[30], state[27], state[31] ); - - BLAKE2_ROUND_2( state[ 0], state[ 4], state[ 8], state[12], - state[16], state[20], state[24], state[28] ); - BLAKE2_ROUND_2( state[ 1], state[ 5], state[ 9], state[13], - state[17], state[21], state[25], state[29] ); - BLAKE2_ROUND_2( state[ 2], state[ 6], state[10], state[14], - state[18], state[22], state[26], state[30] ); - BLAKE2_ROUND_2( state[ 3], state[ 7], state[11], state[15], - state[19], state[23], state[27], state[31] ); - -/* - for (i = 0; i < 4; ++i) { - BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5], - state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]); - } - - for (i = 0; i < 4; ++i) { - BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i], - state[16 + i], state[20 + i], state[24 + i], state[28 + i]); - } -*/ - - for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) { - state[i] = _mm256_xor_si256(state[i], block_XY[i]); - 
_mm256_storeu_si256((__m256i *)next_block->v + i, state[i]); - } -} - -#else // SSE2 - -static void fill_block(__m128i *state, const block *ref_block, - block *next_block, int with_xor) { - __m128i block_XY[ARGON2_OWORDS_IN_BLOCK]; - unsigned int i; - - if (with_xor) { - for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { - state[i] = _mm_xor_si128( - state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i)); - block_XY[i] = _mm_xor_si128( - state[i], _mm_loadu_si128((const __m128i *)next_block->v + i)); - } - } else { - for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { - block_XY[i] = state[i] = _mm_xor_si128( - state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i)); - } - } - - BLAKE2_ROUND( state[ 0], state[ 1], state[ 2], state[ 3], - state[ 4], state[ 5], state[ 6], state[ 7] ); - BLAKE2_ROUND( state[ 8], state[ 9], state[10], state[11], - state[12], state[13], state[14], state[15] ); - BLAKE2_ROUND( state[16], state[17], state[18], state[19], - state[20], state[21], state[22], state[23] ); - BLAKE2_ROUND( state[24], state[25], state[26], state[27], - state[28], state[29], state[30], state[31] ); - BLAKE2_ROUND( state[32], state[33], state[34], state[35], - state[36], state[37], state[38], state[39] ); - BLAKE2_ROUND( state[40], state[41], state[42], state[43], - state[44], state[45], state[46], state[47] ); - BLAKE2_ROUND( state[48], state[49], state[50], state[51], - state[52], state[53], state[54], state[55] ); - BLAKE2_ROUND( state[56], state[57], state[58], state[59], - state[60], state[61], state[62], state[63] ); - - BLAKE2_ROUND( state[ 0], state[ 8], state[16], state[24], - state[32], state[40], state[48], state[56] ); - BLAKE2_ROUND( state[ 1], state[ 9], state[17], state[25], - state[33], state[41], state[49], state[57] ); - BLAKE2_ROUND( state[ 2], state[10], state[18], state[26], - state[34], state[42], state[50], state[58] ); - BLAKE2_ROUND( state[ 3], state[11], state[19], state[27], - state[35], state[43], state[51], state[59] ); - 
BLAKE2_ROUND( state[ 4], state[12], state[20], state[28], - state[36], state[44], state[52], state[60] ); - BLAKE2_ROUND( state[ 5], state[13], state[21], state[29], - state[37], state[45], state[53], state[61] ); - BLAKE2_ROUND( state[ 6], state[14], state[22], state[30], - state[38], state[46], state[54], state[62] ); - BLAKE2_ROUND( state[ 7], state[15], state[23], state[31], - state[39], state[47], state[55], state[63] ); - -/* - for (i = 0; i < 8; ++i) { - BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], - state[8 * i + 3], state[8 * i + 4], state[8 * i + 5], - state[8 * i + 6], state[8 * i + 7]); - } - - for (i = 0; i < 8; ++i) { - BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], - state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i], - state[8 * 6 + i], state[8 * 7 + i]); - } -*/ - for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { - state[i] = _mm_xor_si128(state[i], block_XY[i]); - _mm_storeu_si128((__m128i *)next_block->v + i, state[i]); - } -} - -#endif - -#if 0 -static void next_addresses(block *address_block, block *input_block) { - /*Temporary zero-initialized blocks*/ -#if defined(__AVX512F__) - __m512i zero_block[ARGON2_512BIT_WORDS_IN_BLOCK]; - __m512i zero2_block[ARGON2_512BIT_WORDS_IN_BLOCK]; -#elif defined(__AVX2__) - __m256i zero_block[ARGON2_HWORDS_IN_BLOCK]; - __m256i zero2_block[ARGON2_HWORDS_IN_BLOCK]; -#else - __m128i zero_block[ARGON2_OWORDS_IN_BLOCK]; - __m128i zero2_block[ARGON2_OWORDS_IN_BLOCK]; -#endif - - memset(zero_block, 0, sizeof(zero_block)); - memset(zero2_block, 0, sizeof(zero2_block)); - - /*Increasing index counter*/ - input_block->v[6]++; - - /*First iteration of G*/ - fill_block(zero_block, input_block, address_block, 0); - - /*Second iteration of G*/ - fill_block(zero2_block, address_block, address_block, 0); -} -#endif - -void fill_segment(const argon2_instance_t *instance, - argon2_position_t position) { - block *ref_block = NULL, *curr_block = NULL; -// block address_block, input_block; 
- uint64_t pseudo_rand, ref_index, ref_lane; - uint32_t prev_offset, curr_offset; - uint32_t starting_index, i; -#if defined(__AVX512F__) - __m512i state[ARGON2_512BIT_WORDS_IN_BLOCK]; -#elif defined(__AVX2__) - __m256i state[ARGON2_HWORDS_IN_BLOCK]; -#else - __m128i state[ARGON2_OWORDS_IN_BLOCK]; -#endif -// int data_independent_addressing; - - if (instance == NULL) { - return; - } - - // data_independent_addressing = - // (instance->type == Argon2_i) || - // (instance->type == Argon2_id && (position.pass == 0) && - // (position.slice < ARGON2_SYNC_POINTS / 2)); - - // if (data_independent_addressing) { - // init_block_value(&input_block, 0); - - // input_block.v[0] = position.pass; - // input_block.v[1] = position.lane; - // input_block.v[2] = position.slice; - // input_block.v[3] = instance->memory_blocks; - // input_block.v[4] = instance->passes; - // input_block.v[5] = instance->type; - // } - - starting_index = 0; - - if ((0 == position.pass) && (0 == position.slice)) { - starting_index = 2; /* we have already generated the first two blocks */ - - /* Don't forget to generate the first block of addresses: */ -// if (data_independent_addressing) { -// next_addresses(&address_block, &input_block); -// } - } - - /* Offset of the current block */ - curr_offset = position.lane * instance->lane_length + - position.slice * instance->segment_length + starting_index; - - if (0 == curr_offset % instance->lane_length) { - /* Last block in this lane */ - prev_offset = curr_offset + instance->lane_length - 1; - } else { - /* Previous block */ - prev_offset = curr_offset - 1; - } - - memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE); - - for (i = starting_index; i < instance->segment_length; - ++i, ++curr_offset, ++prev_offset) { - /*1.1 Rotating prev_offset if needed */ - if (curr_offset % instance->lane_length == 1) { - prev_offset = curr_offset - 1; - } - - /* 1.2 Computing the index of the reference block */ - /* 1.2.1 Taking pseudo-random value 
from the previous block */ -// if (data_independent_addressing) { -// if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) { -// next_addresses(&address_block, &input_block); -// } -// pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK]; -// } else { - pseudo_rand = instance->memory[prev_offset].v[0]; -// } - - /* 1.2.2 Computing the lane of the reference block */ - ref_lane = ((pseudo_rand >> 32)) % instance->lanes; - - if ((position.pass == 0) && (position.slice == 0)) { - /* Can not reference other lanes yet */ - ref_lane = position.lane; - } - - /* 1.2.3 Computing the number of possible reference block within the - * lane. - */ - position.index = i; - ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF, - ref_lane == position.lane); - - /* 2 Creating a new block */ - ref_block = - instance->memory + instance->lane_length * ref_lane + ref_index; - curr_block = instance->memory + curr_offset; - if (ARGON2_VERSION_10 == instance->version) { - /* version 1.2.1 and earlier: overwrite, not XOR */ - fill_block(state, ref_block, curr_block, 0); - } else { - if(0 == position.pass) { - fill_block(state, ref_block, curr_block, 0); - } else { - fill_block(state, ref_block, curr_block, 1); - } - } - } -} diff --git a/algo/argon2/argon2d/blake2/blake2-impl.h b/algo/argon2/argon2d/blake2/blake2-impl.h deleted file mode 100644 index 241f0be..0000000 --- a/algo/argon2/argon2d/blake2/blake2-impl.h +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Argon2 reference source code package - reference C implementations - * - * Copyright 2015 - * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves - * - * You may use this work under the terms of a Creative Commons CC0 1.0 - * License/Waiver or the Apache Public License 2.0, at your option. 
The terms of - * these licenses can be found at: - * - * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 - * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 - * - * You should have received a copy of both of these licenses along with this - * software. If not, they may be obtained at the above URLs. - */ - -#ifndef PORTABLE_BLAKE2_IMPL_H -#define PORTABLE_BLAKE2_IMPL_H - -#include -#include - -#if defined(_MSC_VER) -#define BLAKE2_INLINE __inline -#elif defined(__GNUC__) || defined(__clang__) -#define BLAKE2_INLINE __inline__ -#else -#define BLAKE2_INLINE -#endif - -/* Argon2 Team - Begin Code */ -/* - Not an exhaustive list, but should cover the majority of modern platforms - Additionally, the code will always be correct---this is only a performance - tweak. -*/ -#if (defined(__BYTE_ORDER__) && \ - (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \ - defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || defined(__MIPSEL__) || \ - defined(__AARCH64EL__) || defined(__amd64__) || defined(__i386__) || \ - defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) || \ - defined(_M_ARM) -#define NATIVE_LITTLE_ENDIAN -#endif -/* Argon2 Team - End Code */ - -static BLAKE2_INLINE uint32_t load32(const void *src) { -#if defined(NATIVE_LITTLE_ENDIAN) - uint32_t w; - memcpy(&w, src, sizeof w); - return w; -#else - const uint8_t *p = (const uint8_t *)src; - uint32_t w = *p++; - w |= (uint32_t)(*p++) << 8; - w |= (uint32_t)(*p++) << 16; - w |= (uint32_t)(*p++) << 24; - return w; -#endif -} - -static BLAKE2_INLINE uint64_t load64(const void *src) { -#if defined(NATIVE_LITTLE_ENDIAN) - uint64_t w; - memcpy(&w, src, sizeof w); - return w; -#else - const uint8_t *p = (const uint8_t *)src; - uint64_t w = *p++; - w |= (uint64_t)(*p++) << 8; - w |= (uint64_t)(*p++) << 16; - w |= (uint64_t)(*p++) << 24; - w |= (uint64_t)(*p++) << 32; - w |= (uint64_t)(*p++) << 40; - w |= (uint64_t)(*p++) << 48; - w |= (uint64_t)(*p++) << 56; - return w; -#endif -} - -static 
BLAKE2_INLINE void store32(void *dst, uint32_t w) { -#if defined(NATIVE_LITTLE_ENDIAN) - memcpy(dst, &w, sizeof w); -#else - uint8_t *p = (uint8_t *)dst; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; -#endif -} - -static BLAKE2_INLINE void store64(void *dst, uint64_t w) { -#if defined(NATIVE_LITTLE_ENDIAN) - memcpy(dst, &w, sizeof w); -#else - uint8_t *p = (uint8_t *)dst; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; -#endif -} - -static BLAKE2_INLINE uint64_t load48(const void *src) { - const uint8_t *p = (const uint8_t *)src; - uint64_t w = *p++; - w |= (uint64_t)(*p++) << 8; - w |= (uint64_t)(*p++) << 16; - w |= (uint64_t)(*p++) << 24; - w |= (uint64_t)(*p++) << 32; - w |= (uint64_t)(*p++) << 40; - return w; -} - -static BLAKE2_INLINE void store48(void *dst, uint64_t w) { - uint8_t *p = (uint8_t *)dst; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; -} - -static BLAKE2_INLINE uint32_t rotr32(const uint32_t w, const unsigned c) { - return (w >> c) | (w << (32 - c)); -} - -static BLAKE2_INLINE uint64_t rotr64(const uint64_t w, const unsigned c) { - return (w >> c) | (w << (64 - c)); -} - -void clear_internal_memory(void *v, size_t n); - -#endif diff --git a/algo/argon2/argon2d/blake2/blake2.h b/algo/argon2/argon2d/blake2/blake2.h deleted file mode 100644 index a452f33..0000000 --- a/algo/argon2/argon2d/blake2/blake2.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Argon2 reference source code package - reference C implementations - * - * Copyright 2015 - * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves - * - * You may use this work under the terms 
of a Creative Commons CC0 1.0 - * License/Waiver or the Apache Public License 2.0, at your option. The terms of - * these licenses can be found at: - * - * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 - * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 - * - * You should have received a copy of both of these licenses along with this - * software. If not, they may be obtained at the above URLs. - */ - -#ifndef PORTABLE_BLAKE2_H -#define PORTABLE_BLAKE2_H - -#include -#include -#include - -#if defined(__cplusplus) -extern "C" { -#endif - -enum blake2b_constant { - BLAKE2B_BLOCKBYTES = 128, - BLAKE2B_OUTBYTES = 64, - BLAKE2B_KEYBYTES = 64, - BLAKE2B_SALTBYTES = 16, - BLAKE2B_PERSONALBYTES = 16 -}; - -#pragma pack(push, 1) -typedef struct __blake2b_param { - uint8_t digest_length; /* 1 */ - uint8_t key_length; /* 2 */ - uint8_t fanout; /* 3 */ - uint8_t depth; /* 4 */ - uint32_t leaf_length; /* 8 */ - uint64_t node_offset; /* 16 */ - uint8_t node_depth; /* 17 */ - uint8_t inner_length; /* 18 */ - uint8_t reserved[14]; /* 32 */ - uint8_t salt[BLAKE2B_SALTBYTES]; /* 48 */ - uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */ -} blake2b_param; -#pragma pack(pop) - -typedef struct __blake2b_state { - uint64_t h[8]; - uint64_t t[2]; - uint64_t f[2]; - uint8_t buf[BLAKE2B_BLOCKBYTES]; - unsigned buflen; - unsigned outlen; - uint8_t last_node; -} blake2b_state; - -/* Ensure param structs have not been wrongly padded */ -/* Poor man's static_assert */ -enum { - blake2_size_check_0 = 1 / !!(CHAR_BIT == 8), - blake2_size_check_2 = - 1 / !!(sizeof(blake2b_param) == sizeof(uint64_t) * CHAR_BIT) -}; - -/* Streaming API */ -int blake2b_init(blake2b_state *S, size_t outlen); -int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, - size_t keylen); -int blake2b_init_param(blake2b_state *S, const blake2b_param *P); -int blake2b_update(blake2b_state *S, const void *in, size_t inlen); -int blake2b_final(blake2b_state *S, void *out, size_t 
outlen); - -/* Simple API */ -int blake2b(void *out, size_t outlen, const void *in, size_t inlen, - const void *key, size_t keylen); - -/* Argon2 Team - Begin Code */ -int blake2b_long(void *out, size_t outlen, const void *in, size_t inlen); -/* Argon2 Team - End Code */ - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/algo/argon2/argon2d/blake2/blake2b.c b/algo/argon2/argon2d/blake2/blake2b.c deleted file mode 100644 index ca05df5..0000000 --- a/algo/argon2/argon2d/blake2/blake2b.c +++ /dev/null @@ -1,390 +0,0 @@ -/* - * Argon2 reference source code package - reference C implementations - * - * Copyright 2015 - * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves - * - * You may use this work under the terms of a Creative Commons CC0 1.0 - * License/Waiver or the Apache Public License 2.0, at your option. The terms of - * these licenses can be found at: - * - * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 - * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 - * - * You should have received a copy of both of these licenses along with this - * software. If not, they may be obtained at the above URLs. 
- */ - -#include -#include -#include - -#include "blake2.h" -#include "blake2-impl.h" - -static const uint64_t blake2b_IV[8] = { - UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b), - UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1), - UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f), - UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179)}; - -static const unsigned int blake2b_sigma[12][16] = { - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, - {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, - {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, - {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, - {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, - {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, - {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, - {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, -}; - -static BLAKE2_INLINE void blake2b_set_lastnode(blake2b_state *S) { - S->f[1] = (uint64_t)-1; -} - -static BLAKE2_INLINE void blake2b_set_lastblock(blake2b_state *S) { - if (S->last_node) { - blake2b_set_lastnode(S); - } - S->f[0] = (uint64_t)-1; -} - -static BLAKE2_INLINE void blake2b_increment_counter(blake2b_state *S, - uint64_t inc) { - S->t[0] += inc; - S->t[1] += (S->t[0] < inc); -} - -static BLAKE2_INLINE void blake2b_invalidate_state(blake2b_state *S) { - clear_internal_memory(S, sizeof(*S)); /* wipe */ - blake2b_set_lastblock(S); /* invalidate for further use */ -} - -static BLAKE2_INLINE void blake2b_init0(blake2b_state *S) { - memset(S, 0, sizeof(*S)); - memcpy(S->h, blake2b_IV, sizeof(S->h)); -} - -int blake2b_init_param(blake2b_state *S, const blake2b_param *P) { - const unsigned char *p = (const unsigned char *)P; - unsigned int i; - - if 
(NULL == P || NULL == S) { - return -1; - } - - blake2b_init0(S); - /* IV XOR Parameter Block */ - for (i = 0; i < 8; ++i) { - S->h[i] ^= load64(&p[i * sizeof(S->h[i])]); - } - S->outlen = P->digest_length; - return 0; -} - -/* Sequential blake2b initialization */ -int blake2b_init(blake2b_state *S, size_t outlen) { - blake2b_param P; - - if (S == NULL) { - return -1; - } - - if ((outlen == 0) || (outlen > BLAKE2B_OUTBYTES)) { - blake2b_invalidate_state(S); - return -1; - } - - /* Setup Parameter Block for unkeyed BLAKE2 */ - P.digest_length = (uint8_t)outlen; - P.key_length = 0; - P.fanout = 1; - P.depth = 1; - P.leaf_length = 0; - P.node_offset = 0; - P.node_depth = 0; - P.inner_length = 0; - memset(P.reserved, 0, sizeof(P.reserved)); - memset(P.salt, 0, sizeof(P.salt)); - memset(P.personal, 0, sizeof(P.personal)); - - return blake2b_init_param(S, &P); -} - -int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, - size_t keylen) { - blake2b_param P; - - if (S == NULL) { - return -1; - } - - if ((outlen == 0) || (outlen > BLAKE2B_OUTBYTES)) { - blake2b_invalidate_state(S); - return -1; - } - - if ((key == 0) || (keylen == 0) || (keylen > BLAKE2B_KEYBYTES)) { - blake2b_invalidate_state(S); - return -1; - } - - /* Setup Parameter Block for keyed BLAKE2 */ - P.digest_length = (uint8_t)outlen; - P.key_length = (uint8_t)keylen; - P.fanout = 1; - P.depth = 1; - P.leaf_length = 0; - P.node_offset = 0; - P.node_depth = 0; - P.inner_length = 0; - memset(P.reserved, 0, sizeof(P.reserved)); - memset(P.salt, 0, sizeof(P.salt)); - memset(P.personal, 0, sizeof(P.personal)); - - if (blake2b_init_param(S, &P) < 0) { - blake2b_invalidate_state(S); - return -1; - } - - { - uint8_t block[BLAKE2B_BLOCKBYTES]; - memset(block, 0, BLAKE2B_BLOCKBYTES); - memcpy(block, key, keylen); - blake2b_update(S, block, BLAKE2B_BLOCKBYTES); - /* Burn the key from stack */ - clear_internal_memory(block, BLAKE2B_BLOCKBYTES); - } - return 0; -} - -static void 
blake2b_compress(blake2b_state *S, const uint8_t *block) { - uint64_t m[16]; - uint64_t v[16]; - unsigned int i, r; - - for (i = 0; i < 16; ++i) { - m[i] = load64(block + i * sizeof(m[i])); - } - - for (i = 0; i < 8; ++i) { - v[i] = S->h[i]; - } - - v[8] = blake2b_IV[0]; - v[9] = blake2b_IV[1]; - v[10] = blake2b_IV[2]; - v[11] = blake2b_IV[3]; - v[12] = blake2b_IV[4] ^ S->t[0]; - v[13] = blake2b_IV[5] ^ S->t[1]; - v[14] = blake2b_IV[6] ^ S->f[0]; - v[15] = blake2b_IV[7] ^ S->f[1]; - -#define G(r, i, a, b, c, d) \ - do { \ - a = a + b + m[blake2b_sigma[r][2 * i + 0]]; \ - d = rotr64(d ^ a, 32); \ - c = c + d; \ - b = rotr64(b ^ c, 24); \ - a = a + b + m[blake2b_sigma[r][2 * i + 1]]; \ - d = rotr64(d ^ a, 16); \ - c = c + d; \ - b = rotr64(b ^ c, 63); \ - } while ((void)0, 0) - -#define ROUND(r) \ - do { \ - G(r, 0, v[0], v[4], v[8], v[12]); \ - G(r, 1, v[1], v[5], v[9], v[13]); \ - G(r, 2, v[2], v[6], v[10], v[14]); \ - G(r, 3, v[3], v[7], v[11], v[15]); \ - G(r, 4, v[0], v[5], v[10], v[15]); \ - G(r, 5, v[1], v[6], v[11], v[12]); \ - G(r, 6, v[2], v[7], v[8], v[13]); \ - G(r, 7, v[3], v[4], v[9], v[14]); \ - } while ((void)0, 0) - - for (r = 0; r < 12; ++r) { - ROUND(r); - } - - for (i = 0; i < 8; ++i) { - S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; - } - -#undef G -#undef ROUND -} - -int blake2b_update(blake2b_state *S, const void *in, size_t inlen) { - const uint8_t *pin = (const uint8_t *)in; - - if (inlen == 0) { - return 0; - } - - /* Sanity check */ - if (S == NULL || in == NULL) { - return -1; - } - - /* Is this a reused state? 
*/ - if (S->f[0] != 0) { - return -1; - } - - if (S->buflen + inlen > BLAKE2B_BLOCKBYTES) { - /* Complete current block */ - size_t left = S->buflen; - size_t fill = BLAKE2B_BLOCKBYTES - left; - memcpy(&S->buf[left], pin, fill); - blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES); - blake2b_compress(S, S->buf); - S->buflen = 0; - inlen -= fill; - pin += fill; - /* Avoid buffer copies when possible */ - while (inlen > BLAKE2B_BLOCKBYTES) { - blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES); - blake2b_compress(S, pin); - inlen -= BLAKE2B_BLOCKBYTES; - pin += BLAKE2B_BLOCKBYTES; - } - } - memcpy(&S->buf[S->buflen], pin, inlen); - S->buflen += (unsigned int)inlen; - return 0; -} - -int blake2b_final(blake2b_state *S, void *out, size_t outlen) { - uint8_t buffer[BLAKE2B_OUTBYTES] = {0}; - unsigned int i; - - /* Sanity checks */ - if (S == NULL || out == NULL || outlen < S->outlen) { - return -1; - } - - /* Is this a reused state? */ - if (S->f[0] != 0) { - return -1; - } - - blake2b_increment_counter(S, S->buflen); - blake2b_set_lastblock(S); - memset(&S->buf[S->buflen], 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */ - blake2b_compress(S, S->buf); - - for (i = 0; i < 8; ++i) { /* Output full hash to temp buffer */ - store64(buffer + sizeof(S->h[i]) * i, S->h[i]); - } - - memcpy(out, buffer, S->outlen); - clear_internal_memory(buffer, sizeof(buffer)); - clear_internal_memory(S->buf, sizeof(S->buf)); - clear_internal_memory(S->h, sizeof(S->h)); - return 0; -} - -int blake2b(void *out, size_t outlen, const void *in, size_t inlen, - const void *key, size_t keylen) { - blake2b_state S; - int ret = -1; - - /* Verify parameters */ - if (NULL == in && inlen > 0) { - goto fail; - } - - if (NULL == out || outlen == 0 || outlen > BLAKE2B_OUTBYTES) { - goto fail; - } - - if ((NULL == key && keylen > 0) || keylen > BLAKE2B_KEYBYTES) { - goto fail; - } - - if (keylen > 0) { - if (blake2b_init_key(&S, outlen, key, keylen) < 0) { - goto fail; - } - } else { - if (blake2b_init(&S, 
outlen) < 0) { - goto fail; - } - } - - if (blake2b_update(&S, in, inlen) < 0) { - goto fail; - } - ret = blake2b_final(&S, out, outlen); - -fail: - clear_internal_memory(&S, sizeof(S)); - return ret; -} - -/* Argon2 Team - Begin Code */ -int blake2b_long(void *pout, size_t outlen, const void *in, size_t inlen) { - uint8_t *out = (uint8_t *)pout; - blake2b_state blake_state; - uint8_t outlen_bytes[sizeof(uint32_t)] = {0}; - int ret = -1; - - if (outlen > UINT32_MAX) { - goto fail; - } - - /* Ensure little-endian byte order! */ - store32(outlen_bytes, (uint32_t)outlen); - -#define TRY(statement) \ - do { \ - ret = statement; \ - if (ret < 0) { \ - goto fail; \ - } \ - } while ((void)0, 0) - - if (outlen <= BLAKE2B_OUTBYTES) { - TRY(blake2b_init(&blake_state, outlen)); - TRY(blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes))); - TRY(blake2b_update(&blake_state, in, inlen)); - TRY(blake2b_final(&blake_state, out, outlen)); - } else { - uint32_t toproduce; - uint8_t out_buffer[BLAKE2B_OUTBYTES]; - uint8_t in_buffer[BLAKE2B_OUTBYTES]; - TRY(blake2b_init(&blake_state, BLAKE2B_OUTBYTES)); - TRY(blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes))); - TRY(blake2b_update(&blake_state, in, inlen)); - TRY(blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES)); - memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2); - out += BLAKE2B_OUTBYTES / 2; - toproduce = (uint32_t)outlen - BLAKE2B_OUTBYTES / 2; - - while (toproduce > BLAKE2B_OUTBYTES) { - memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES); - TRY(blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer, - BLAKE2B_OUTBYTES, NULL, 0)); - memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2); - out += BLAKE2B_OUTBYTES / 2; - toproduce -= BLAKE2B_OUTBYTES / 2; - } - - memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES); - TRY(blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES, NULL, - 0)); - memcpy(out, out_buffer, toproduce); - } -fail: - clear_internal_memory(&blake_state, sizeof(blake_state)); - return ret; -#undef 
TRY -} -/* Argon2 Team - End Code */ diff --git a/algo/argon2/argon2d/blake2/blamka-round-opt.h b/algo/argon2/argon2d/blake2/blamka-round-opt.h deleted file mode 100644 index 2c8942e..0000000 --- a/algo/argon2/argon2d/blake2/blamka-round-opt.h +++ /dev/null @@ -1,471 +0,0 @@ -/* - * Argon2 reference source code package - reference C implementations - * - * Copyright 2015 - * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves - * - * You may use this work under the terms of a Creative Commons CC0 1.0 - * License/Waiver or the Apache Public License 2.0, at your option. The terms of - * these licenses can be found at: - * - * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 - * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 - * - * You should have received a copy of both of these licenses along with this - * software. If not, they may be obtained at the above URLs. - */ - -#ifndef BLAKE_ROUND_MKA_OPT_H -#define BLAKE_ROUND_MKA_OPT_H - -#include "blake2-impl.h" - -#include -#if defined(__SSSE3__) -#include /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */ -#endif - -#if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__)) -#include -#endif - -#if !defined(__AVX512F__) -#if !defined(__AVX2__) -#if !defined(__XOP__) -#if defined(__SSSE3__) -#define r16 \ - (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)) -#define r24 \ - (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)) -#define _mm_roti_epi64(x, c) \ - (-(c) == 32) \ - ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \ - : (-(c) == 24) \ - ? _mm_shuffle_epi8((x), r24) \ - : (-(c) == 16) \ - ? _mm_shuffle_epi8((x), r16) \ - : (-(c) == 63) \ - ? 
_mm_xor_si128(_mm_srli_epi64((x), -(c)), \ - _mm_add_epi64((x), (x))) \ - : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \ - _mm_slli_epi64((x), 64 - (-(c)))) -#else /* defined(__SSE2__) */ -#define _mm_roti_epi64(r, c) \ - _mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c)))) -#endif -#else -#endif - -static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { - const __m128i z = _mm_mul_epu32(x, y); - return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z)); -} - -#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - A0 = fBlaMka(A0, B0); \ - A1 = fBlaMka(A1, B1); \ - \ - D0 = _mm_xor_si128(D0, A0); \ - D1 = _mm_xor_si128(D1, A1); \ - \ - D0 = _mm_roti_epi64(D0, -32); \ - D1 = _mm_roti_epi64(D1, -32); \ - \ - C0 = fBlaMka(C0, D0); \ - C1 = fBlaMka(C1, D1); \ - \ - B0 = _mm_xor_si128(B0, C0); \ - B1 = _mm_xor_si128(B1, C1); \ - \ - B0 = _mm_roti_epi64(B0, -24); \ - B1 = _mm_roti_epi64(B1, -24); \ - } while ((void)0, 0) - -#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - A0 = fBlaMka(A0, B0); \ - A1 = fBlaMka(A1, B1); \ - \ - D0 = _mm_xor_si128(D0, A0); \ - D1 = _mm_xor_si128(D1, A1); \ - \ - D0 = _mm_roti_epi64(D0, -16); \ - D1 = _mm_roti_epi64(D1, -16); \ - \ - C0 = fBlaMka(C0, D0); \ - C1 = fBlaMka(C1, D1); \ - \ - B0 = _mm_xor_si128(B0, C0); \ - B1 = _mm_xor_si128(B1, C1); \ - \ - B0 = _mm_roti_epi64(B0, -63); \ - B1 = _mm_roti_epi64(B1, -63); \ - } while ((void)0, 0) - -#if defined(__SSSE3__) -#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - __m128i t0 = _mm_alignr_epi8(B1, B0, 8); \ - __m128i t1 = _mm_alignr_epi8(B0, B1, 8); \ - B0 = t0; \ - B1 = t1; \ - \ - t0 = C0; \ - C0 = C1; \ - C1 = t0; \ - \ - t0 = _mm_alignr_epi8(D1, D0, 8); \ - t1 = _mm_alignr_epi8(D0, D1, 8); \ - D0 = t1; \ - D1 = t0; \ - } while ((void)0, 0) - -#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - __m128i t0 = _mm_alignr_epi8(B0, B1, 8); \ - __m128i t1 = _mm_alignr_epi8(B1, B0, 8); \ - B0 = t0; \ - B1 = t1; \ - \ - t0 = 
C0; \ - C0 = C1; \ - C1 = t0; \ - \ - t0 = _mm_alignr_epi8(D0, D1, 8); \ - t1 = _mm_alignr_epi8(D1, D0, 8); \ - D0 = t1; \ - D1 = t0; \ - } while ((void)0, 0) -#else /* SSE2 */ -#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - __m128i t0 = D0; \ - __m128i t1 = B0; \ - D0 = C0; \ - C0 = C1; \ - C1 = D0; \ - D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0)); \ - D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1)); \ - B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1)); \ - B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1)); \ - } while ((void)0, 0) - -#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - __m128i t0, t1; \ - t0 = C0; \ - C0 = C1; \ - C1 = t0; \ - t0 = B0; \ - t1 = D0; \ - B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0)); \ - B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1)); \ - D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1)); \ - D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1)); \ - } while ((void)0, 0) -#endif - -#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \ - do { \ - G1(A0, B0, C0, D0, A1, B1, C1, D1); \ - G2(A0, B0, C0, D0, A1, B1, C1, D1); \ - \ - DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ - \ - G1(A0, B0, C0, D0, A1, B1, C1, D1); \ - G2(A0, B0, C0, D0, A1, B1, C1, D1); \ - \ - UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ - } while ((void)0, 0) -#else /* __AVX2__ */ - -#include - -#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)) -#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)) -#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)) -#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x))) - -#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - do { \ - __m256i ml = 
_mm256_mul_epu32(A0, B0); \ - ml = _mm256_add_epi64(ml, ml); \ - A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \ - D0 = _mm256_xor_si256(D0, A0); \ - D0 = rotr32(D0); \ - \ - ml = _mm256_mul_epu32(C0, D0); \ - ml = _mm256_add_epi64(ml, ml); \ - C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \ - \ - B0 = _mm256_xor_si256(B0, C0); \ - B0 = rotr24(B0); \ - \ - ml = _mm256_mul_epu32(A1, B1); \ - ml = _mm256_add_epi64(ml, ml); \ - A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \ - D1 = _mm256_xor_si256(D1, A1); \ - D1 = rotr32(D1); \ - \ - ml = _mm256_mul_epu32(C1, D1); \ - ml = _mm256_add_epi64(ml, ml); \ - C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \ - \ - B1 = _mm256_xor_si256(B1, C1); \ - B1 = rotr24(B1); \ - } while((void)0, 0); - -#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - do { \ - __m256i ml = _mm256_mul_epu32(A0, B0); \ - ml = _mm256_add_epi64(ml, ml); \ - A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \ - D0 = _mm256_xor_si256(D0, A0); \ - D0 = rotr16(D0); \ - \ - ml = _mm256_mul_epu32(C0, D0); \ - ml = _mm256_add_epi64(ml, ml); \ - C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \ - B0 = _mm256_xor_si256(B0, C0); \ - B0 = rotr63(B0); \ - \ - ml = _mm256_mul_epu32(A1, B1); \ - ml = _mm256_add_epi64(ml, ml); \ - A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \ - D1 = _mm256_xor_si256(D1, A1); \ - D1 = rotr16(D1); \ - \ - ml = _mm256_mul_epu32(C1, D1); \ - ml = _mm256_add_epi64(ml, ml); \ - C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \ - B1 = _mm256_xor_si256(B1, C1); \ - B1 = rotr63(B1); \ - } while((void)0, 0); - -#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \ - C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ - D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \ - \ - B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \ - C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ - 
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \ - } while((void)0, 0); - -#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ - do { \ - __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \ - __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \ - B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ - B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ - \ - tmp1 = C0; \ - C0 = C1; \ - C1 = tmp1; \ - \ - tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \ - tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \ - D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ - D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ - } while(0); - -#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \ - C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ - D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \ - \ - B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \ - C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ - D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \ - } while((void)0, 0); - -#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ - do { \ - __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \ - __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \ - B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ - B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ - \ - tmp1 = C0; \ - C0 = C1; \ - C1 = tmp1; \ - \ - tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \ - tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \ - D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ - D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ - } while((void)0, 0); - -#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \ - do{ \ - G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - \ - DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ - \ - G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - 
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - \ - UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ - } while((void)0, 0); - -#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \ - do{ \ - G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - \ - DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ - \ - G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - \ - UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ - } while((void)0, 0); - -#endif /* __AVX2__ */ - -#else /* __AVX512F__ */ - -#include - -#define ror64(x, n) _mm512_ror_epi64((x), (n)) - -static __m512i muladd(__m512i x, __m512i y) -{ - __m512i z = _mm512_mul_epu32(x, y); - return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z)); -} - -#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - A0 = muladd(A0, B0); \ - A1 = muladd(A1, B1); \ -\ - D0 = _mm512_xor_si512(D0, A0); \ - D1 = _mm512_xor_si512(D1, A1); \ -\ - D0 = ror64(D0, 32); \ - D1 = ror64(D1, 32); \ -\ - C0 = muladd(C0, D0); \ - C1 = muladd(C1, D1); \ -\ - B0 = _mm512_xor_si512(B0, C0); \ - B1 = _mm512_xor_si512(B1, C1); \ -\ - B0 = ror64(B0, 24); \ - B1 = ror64(B1, 24); \ - } while ((void)0, 0) - -#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - A0 = muladd(A0, B0); \ - A1 = muladd(A1, B1); \ -\ - D0 = _mm512_xor_si512(D0, A0); \ - D1 = _mm512_xor_si512(D1, A1); \ -\ - D0 = ror64(D0, 16); \ - D1 = ror64(D1, 16); \ -\ - C0 = muladd(C0, D0); \ - C1 = muladd(C1, D1); \ -\ - B0 = _mm512_xor_si512(B0, C0); \ - B1 = _mm512_xor_si512(B1, C1); \ -\ - B0 = ror64(B0, 63); \ - B1 = ror64(B1, 63); \ - } while ((void)0, 0) - -#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \ - B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \ -\ - C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ - C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ -\ - D0 = _mm512_permutex_epi64(D0, 
_MM_SHUFFLE(2, 1, 0, 3)); \ - D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \ - } while ((void)0, 0) - -#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \ - B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \ -\ - C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ - C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ -\ - D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \ - D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \ - } while ((void)0, 0) - -#define BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - G1(A0, B0, C0, D0, A1, B1, C1, D1); \ - G2(A0, B0, C0, D0, A1, B1, C1, D1); \ -\ - DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ -\ - G1(A0, B0, C0, D0, A1, B1, C1, D1); \ - G2(A0, B0, C0, D0, A1, B1, C1, D1); \ -\ - UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ - } while ((void)0, 0) - -#define SWAP_HALVES(A0, A1) \ - do { \ - __m512i t0, t1; \ - t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \ - t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \ - A0 = t0; \ - A1 = t1; \ - } while((void)0, 0) - -#define SWAP_QUARTERS(A0, A1) \ - do { \ - SWAP_HALVES(A0, A1); \ - A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \ - A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \ - } while((void)0, 0) - -#define UNSWAP_QUARTERS(A0, A1) \ - do { \ - A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \ - A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \ - SWAP_HALVES(A0, A1); \ - } while((void)0, 0) - -#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \ - do { \ - SWAP_HALVES(A0, B0); \ - SWAP_HALVES(C0, D0); \ - SWAP_HALVES(A1, B1); \ - SWAP_HALVES(C1, D1); \ - BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \ - SWAP_HALVES(A0, B0); \ - SWAP_HALVES(C0, D0); \ - SWAP_HALVES(A1, B1); \ - SWAP_HALVES(C1, 
D1); \ - } while ((void)0, 0) - -#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \ - do { \ - SWAP_QUARTERS(A0, A1); \ - SWAP_QUARTERS(B0, B1); \ - SWAP_QUARTERS(C0, C1); \ - SWAP_QUARTERS(D0, D1); \ - BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \ - UNSWAP_QUARTERS(A0, A1); \ - UNSWAP_QUARTERS(B0, B1); \ - UNSWAP_QUARTERS(C0, C1); \ - UNSWAP_QUARTERS(D0, D1); \ - } while ((void)0, 0) - -#endif /* __AVX512F__ */ -#endif /* BLAKE_ROUND_MKA_OPT_H */ diff --git a/algo/argon2/argon2d/blake2/blamka-round-ref.h b/algo/argon2/argon2d/blake2/blamka-round-ref.h deleted file mode 100644 index b8f2cf4..0000000 --- a/algo/argon2/argon2d/blake2/blamka-round-ref.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Argon2 reference source code package - reference C implementations - * - * Copyright 2015 - * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves - * - * You may use this work under the terms of a Creative Commons CC0 1.0 - * License/Waiver or the Apache Public License 2.0, at your option. The terms of - * these licenses can be found at: - * - * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 - * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 - * - * You should have received a copy of both of these licenses along with this - * software. If not, they may be obtained at the above URLs. 
- */ - -#ifndef BLAKE_ROUND_MKA_H -#define BLAKE_ROUND_MKA_H - -#include "blake2.h" -#include "blake2-impl.h" - -/* designed by the Lyra PHC team */ -static BLAKE2_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) { - const uint64_t m = UINT64_C(0xFFFFFFFF); - const uint64_t xy = (x & m) * (y & m); - return x + y + 2 * xy; -} - -#define G(a, b, c, d) \ - do { \ - a = fBlaMka(a, b); \ - d = rotr64(d ^ a, 32); \ - c = fBlaMka(c, d); \ - b = rotr64(b ^ c, 24); \ - a = fBlaMka(a, b); \ - d = rotr64(d ^ a, 16); \ - c = fBlaMka(c, d); \ - b = rotr64(b ^ c, 63); \ - } while ((void)0, 0) - -#define BLAKE2_ROUND_NOMSG(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, \ - v12, v13, v14, v15) \ - do { \ - G(v0, v4, v8, v12); \ - G(v1, v5, v9, v13); \ - G(v2, v6, v10, v14); \ - G(v3, v7, v11, v15); \ - G(v0, v5, v10, v15); \ - G(v1, v6, v11, v12); \ - G(v2, v7, v8, v13); \ - G(v3, v4, v9, v14); \ - } while ((void)0, 0) - -#endif diff --git a/algo/blake/blake-4way.c b/algo/blake/blake-4way.c deleted file mode 100644 index 8d1372f..0000000 --- a/algo/blake/blake-4way.c +++ /dev/null @@ -1,120 +0,0 @@ -#include "blake-gate.h" -#include "blake-hash-4way.h" -#include -#include -#include - -#if defined (BLAKE_4WAY) - -blake256r14_4way_context blake_4w_ctx; - -void blakehash_4way(void *state, const void *input) -{ - uint32_t vhash[8*4] __attribute__ ((aligned (64))); - blake256r14_4way_context ctx; - memcpy( &ctx, &blake_4w_ctx, sizeof ctx ); - blake256r14_4way( &ctx, input + (64<<2), 16 ); - blake256r14_4way_close( &ctx, vhash ); - dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 ); -} - -int scanhash_blake_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t hash[8*4] __attribute__ ((aligned (32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - uint32_t HTarget = ptarget[7]; - __m128i *noncev = 
(__m128i*)vdata + 19; // aligned - uint32_t n = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - - if (opt_benchmark) - HTarget = 0x7f; - - mm128_bswap32_intrlv80_4x32( vdata, pdata ); - blake256r14_4way_init( &blake_4w_ctx ); - blake256r14_4way( &blake_4w_ctx, vdata, 64 ); - - do { - *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) ); - - blakehash_4way( hash, vdata ); - - for ( int i = 0; i < 4; i++ ) - if ( (hash+(i<<3))[7] <= HTarget ) - if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - - } while ( (n < max_nonce) && !work_restart[thr_id].restart ); - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif - -#if defined(BLAKE_8WAY) - -blake256r14_8way_context blake_8w_ctx; - -void blakehash_8way( void *state, const void *input ) -{ - uint32_t vhash[8*8] __attribute__ ((aligned (64))); - blake256r14_8way_context ctx; - memcpy( &ctx, &blake_8w_ctx, sizeof ctx ); - blake256r14_8way( &ctx, input + (64<<3), 16 ); - blake256r14_8way_close( &ctx, vhash ); - _dintrlv_8x32( state, state+ 32, state+ 64, state+ 96, - state+128, state+160, state+192, state+224, - vhash, 256 ); -} - -int scanhash_blake_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t vdata[20*8] __attribute__ ((aligned (64))); - uint32_t hash[8*8] __attribute__ ((aligned (32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - uint32_t HTarget = ptarget[7]; - uint32_t n = first_nonce; - __m256i *noncev = (__m256i*)vdata + 19; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - - if (opt_benchmark) - HTarget = 0x7f; - - mm256_bswap32_intrlv80_8x32( vdata, pdata ); - - blake256r14_8way_init( &blake_8w_ctx ); - blake256r14_8way( &blake_8w_ctx, vdata, 64 ); - - do { - *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4, - n+3, 
n+2, n+1, n ) ); - pdata[19] = n; - - blakehash_8way( hash, vdata ); - - for ( int i = 0; i < 8; i++ ) - if ( (hash+i)[7] <= HTarget && fulltest( hash+i, ptarget ) ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 8; - - } while ( (n < max_nonce) !work_restart[thr_id].restart ); - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/blake/blake-gate.c b/algo/blake/blake-gate.c deleted file mode 100644 index 7fc6480..0000000 --- a/algo/blake/blake-gate.c +++ /dev/null @@ -1,26 +0,0 @@ -#include "blake-gate.h" - -int64_t blake_get_max64 () -{ - return 0x7ffffLL; -} - -bool register_blake_algo( algo_gate_t* gate ) -{ - gate->optimizations = AVX2_OPT; - gate->get_max64 = (void*)&blake_get_max64; -//#if defined (__AVX2__) && defined (FOUR_WAY) -// gate->optimizations = SSE2_OPT | AVX2_OPT; -// gate->scanhash = (void*)&scanhash_blake_8way; -// gate->hash = (void*)&blakehash_8way; -#if defined(BLAKE_4WAY) - four_way_not_tested(); - gate->scanhash = (void*)&scanhash_blake_4way; - gate->hash = (void*)&blakehash_4way; -#else - gate->scanhash = (void*)&scanhash_blake; - gate->hash = (void*)&blakehash; -#endif - return true; -} - diff --git a/algo/blake/blake-gate.h b/algo/blake/blake-gate.h deleted file mode 100644 index b8a400b..0000000 --- a/algo/blake/blake-gate.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef __BLAKE_GATE_H__ -#define __BLAKE_GATE_H__ - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) - #define BLAKE_4WAY -#endif - -#if defined (BLAKE_4WAY) -void blakehash_4way(void *state, const void *input); -int scanhash_blake_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -#endif - -void blakehash( void *state, const void *input ); -int scanhash_blake( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -#endif diff --git a/algo/blake/blake-hash-4way.h b/algo/blake/blake-hash-4way.h deleted file mode 
100644 index 03ebe9d..0000000 --- a/algo/blake/blake-hash-4way.h +++ /dev/null @@ -1,144 +0,0 @@ -/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */ -/** - * BLAKE interface. BLAKE is a family of functions which differ by their - * output size; this implementation defines BLAKE for output sizes 224, - * 256, 384 and 512 bits. This implementation conforms to the "third - * round" specification. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @file sph_blake.h - * @author Thomas Pornin - */ - -#ifndef __BLAKE_HASH_4WAY__ -#define __BLAKE_HASH_4WAY__ 1 - -//#ifdef __SSE4_2__ - -#ifdef __cplusplus -extern "C"{ -#endif - -#include -#include "algo/sha/sph_types.h" -#include "simd-utils.h" - -#define SPH_SIZE_blake256 256 - -#define SPH_SIZE_blake512 512 - -// With SSE4.2 only Blake-256 4 way is available. -// With AVX2 Blake-256 8way & Blake-512 4 way are also available. - -// Blake-256 4 way - -typedef struct { - unsigned char buf[64<<2]; - uint32_t H[8<<2]; - uint32_t S[4<<2]; -// __m128i buf[16] __attribute__ ((aligned (64))); -// __m128i H[8]; -// __m128i S[4]; - size_t ptr; - uint32_t T0, T1; - int rounds; // 14 for blake, 8 for blakecoin & vanilla -} blake_4way_small_context __attribute__ ((aligned (64))); - -// Default 14 rounds -typedef blake_4way_small_context blake256_4way_context; -void blake256_4way_init(void *ctx); -void blake256_4way(void *ctx, const void *data, size_t len); -void blake256_4way_close(void *ctx, void *dst); - -// 14 rounds, blake, decred -typedef blake_4way_small_context blake256r14_4way_context; -void blake256r14_4way_init(void *cc); -void blake256r14_4way(void *cc, const void *data, size_t len); -void blake256r14_4way_close(void *cc, void *dst); - -// 8 rounds, blakecoin, vanilla -typedef blake_4way_small_context blake256r8_4way_context; -void blake256r8_4way_init(void *cc); -void blake256r8_4way(void *cc, const void *data, size_t len); -void blake256r8_4way_close(void *cc, void *dst); - -#ifdef __AVX2__ - -// Blake-256 8 way - -typedef struct { - __m256i buf[16] __attribute__ ((aligned (64))); - __m256i H[8]; - __m256i S[4]; - size_t ptr; - sph_u32 T0, T1; - int rounds; // 14 for blake, 8 for blakecoin & vanilla -} blake_8way_small_context; - -// Default 14 rounds -typedef blake_8way_small_context blake256_8way_context; -void blake256_8way_init(void *cc); -void blake256_8way(void *cc, const 
void *data, size_t len); -void blake256_8way_close(void *cc, void *dst); - -// 14 rounds, blake, decred -typedef blake_8way_small_context blake256r14_8way_context; -void blake256r14_8way_init(void *cc); -void blake256r14_8way(void *cc, const void *data, size_t len); -void blake256r14_8way_close(void *cc, void *dst); - -// 8 rounds, blakecoin, vanilla -typedef blake_8way_small_context blake256r8_8way_context; -void blake256r8_8way_init(void *cc); -void blake256r8_8way(void *cc, const void *data, size_t len); -void blake256r8_8way_close(void *cc, void *dst); - -// Blake-512 4 way - -typedef struct { - __m256i buf[16] __attribute__ ((aligned (64))); - __m256i H[8]; - __m256i S[4]; - size_t ptr; - sph_u64 T0, T1; -} blake_4way_big_context; - -typedef blake_4way_big_context blake512_4way_context; - -void blake512_4way_init(void *cc); -void blake512_4way(void *cc, const void *data, size_t len); -void blake512_4way_close(void *cc, void *dst); -void blake512_4way_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - -#endif // AVX2 - -#ifdef __cplusplus -} -#endif - -#endif // BLAKE_HASH_4WAY_H__ diff --git a/algo/blake/blake.c b/algo/blake/blake.c deleted file mode 100644 index 385822e..0000000 --- a/algo/blake/blake.c +++ /dev/null @@ -1,92 +0,0 @@ -#include "algo-gate-api.h" -#include "sph_blake.h" - -#include -#include -#include - -/* Move init out of loop, so init once externally, - * and then use one single memcpy */ -static __thread sph_blake256_context blake_mid; -static __thread bool ctx_midstate_done = false; - -static void init_blake_hash(void) -{ - sph_blake256_init(&blake_mid); - ctx_midstate_done = true; -} - -void blakehash(void *state, const void *input) -{ - sph_blake256_context ctx; - - uint8_t hash[64] __attribute__ ((aligned (32))); - uint8_t *ending = (uint8_t*) input; - ending += 64; - - // do one memcopy to get a fresh context - if (!ctx_midstate_done) { - init_blake_hash(); - sph_blake256(&blake_mid, input, 64); - } - - memcpy(&ctx, 
&blake_mid, sizeof(blake_mid)); - - sph_blake256(&ctx, ending, 16); - sph_blake256_close(&ctx, hash); - - memcpy(state, hash, 32); - -} - -int scanhash_blake( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - uint32_t HTarget = ptarget[7]; - uint32_t _ALIGN(32) hash64[8]; - uint32_t _ALIGN(32) endiandata[20]; - uint32_t n = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - - ctx_midstate_done = false; - - if (opt_benchmark) - HTarget = 0x7f; - - // we need big endian data... - swab32_array( endiandata, pdata, 20 ); - -#ifdef DEBUG_ALGO - applog(LOG_DEBUG,"[%d] Target=%08x %08x", thr_id, ptarget[6], ptarget[7]); -#endif - - do { - be32enc(&endiandata[19], n); - blakehash(hash64, endiandata); -#ifndef DEBUG_ALGO - if (hash64[7] <= HTarget && fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } -#else - if (!(n % 0x1000) && !thr_id) printf("."); - if (hash64[7] == 0) { - printf("[%d]",thr_id); - if (fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } -#endif - n++; pdata[19] = n; - - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - diff --git a/algo/blake/blake256-hash-4way.c b/algo/blake/blake256-hash-4way.c deleted file mode 100644 index b228e07..0000000 --- a/algo/blake/blake256-hash-4way.c +++ /dev/null @@ -1,1062 +0,0 @@ -/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */ -/* - * BLAKE implementation. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -//#if defined (__SSE4_2__) - -#include -#include -#include -#include - -#include "blake-hash-4way.h" - -#ifdef __cplusplus -extern "C"{ -#endif - -#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE -#define SPH_SMALL_FOOTPRINT_BLAKE 1 -#endif - -#if SPH_SMALL_FOOTPRINT_BLAKE -#define SPH_COMPACT_BLAKE_32 1 -#endif - -#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE) -#define SPH_COMPACT_BLAKE_64 1 -#endif - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -// Blake-256 - -static const uint32_t IV256[8] = -{ - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, - 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 -}; - -#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64 - -// Blake-256 4 & 8 way, Blake-512 4 way - -static const unsigned sigma[16][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } -}; - -#endif - -#define Z00 0 -#define Z01 1 -#define Z02 2 -#define Z03 3 -#define Z04 4 -#define Z05 5 -#define Z06 6 -#define Z07 7 
-#define Z08 8 -#define Z09 9 -#define Z0A A -#define Z0B B -#define Z0C C -#define Z0D D -#define Z0E E -#define Z0F F - -#define Z10 E -#define Z11 A -#define Z12 4 -#define Z13 8 -#define Z14 9 -#define Z15 F -#define Z16 D -#define Z17 6 -#define Z18 1 -#define Z19 C -#define Z1A 0 -#define Z1B 2 -#define Z1C B -#define Z1D 7 -#define Z1E 5 -#define Z1F 3 - -#define Z20 B -#define Z21 8 -#define Z22 C -#define Z23 0 -#define Z24 5 -#define Z25 2 -#define Z26 F -#define Z27 D -#define Z28 A -#define Z29 E -#define Z2A 3 -#define Z2B 6 -#define Z2C 7 -#define Z2D 1 -#define Z2E 9 -#define Z2F 4 - -#define Z30 7 -#define Z31 9 -#define Z32 3 -#define Z33 1 -#define Z34 D -#define Z35 C -#define Z36 B -#define Z37 E -#define Z38 2 -#define Z39 6 -#define Z3A 5 -#define Z3B A -#define Z3C 4 -#define Z3D 0 -#define Z3E F -#define Z3F 8 - -#define Z40 9 -#define Z41 0 -#define Z42 5 -#define Z43 7 -#define Z44 2 -#define Z45 4 -#define Z46 A -#define Z47 F -#define Z48 E -#define Z49 1 -#define Z4A B -#define Z4B C -#define Z4C 6 -#define Z4D 8 -#define Z4E 3 -#define Z4F D - -#define Z50 2 -#define Z51 C -#define Z52 6 -#define Z53 A -#define Z54 0 -#define Z55 B -#define Z56 8 -#define Z57 3 -#define Z58 4 -#define Z59 D -#define Z5A 7 -#define Z5B 5 -#define Z5C F -#define Z5D E -#define Z5E 1 -#define Z5F 9 - -#define Z60 C -#define Z61 5 -#define Z62 1 -#define Z63 F -#define Z64 E -#define Z65 D -#define Z66 4 -#define Z67 A -#define Z68 0 -#define Z69 7 -#define Z6A 6 -#define Z6B 3 -#define Z6C 9 -#define Z6D 2 -#define Z6E 8 -#define Z6F B - -#define Z70 D -#define Z71 B -#define Z72 7 -#define Z73 E -#define Z74 C -#define Z75 1 -#define Z76 3 -#define Z77 9 -#define Z78 5 -#define Z79 0 -#define Z7A F -#define Z7B 4 -#define Z7C 8 -#define Z7D 6 -#define Z7E 2 -#define Z7F A - -#define Z80 6 -#define Z81 F -#define Z82 E -#define Z83 9 -#define Z84 B -#define Z85 3 -#define Z86 0 -#define Z87 8 -#define Z88 C -#define Z89 2 -#define Z8A D -#define Z8B 7 
-#define Z8C 1 -#define Z8D 4 -#define Z8E A -#define Z8F 5 - -#define Z90 A -#define Z91 2 -#define Z92 8 -#define Z93 4 -#define Z94 7 -#define Z95 6 -#define Z96 1 -#define Z97 5 -#define Z98 F -#define Z99 B -#define Z9A 9 -#define Z9B E -#define Z9C 3 -#define Z9D C -#define Z9E D -#define Z9F 0 - -#define Mx(r, i) Mx_(Z ## r ## i) -#define Mx_(n) Mx__(n) -#define Mx__(n) M ## n - -// Blake-256 4 & 8 way - -#define CSx(r, i) CSx_(Z ## r ## i) -#define CSx_(n) CSx__(n) -#define CSx__(n) CS ## n - -#define CS0 SPH_C32(0x243F6A88) -#define CS1 SPH_C32(0x85A308D3) -#define CS2 SPH_C32(0x13198A2E) -#define CS3 SPH_C32(0x03707344) -#define CS4 SPH_C32(0xA4093822) -#define CS5 SPH_C32(0x299F31D0) -#define CS6 SPH_C32(0x082EFA98) -#define CS7 SPH_C32(0xEC4E6C89) -#define CS8 SPH_C32(0x452821E6) -#define CS9 SPH_C32(0x38D01377) -#define CSA SPH_C32(0xBE5466CF) -#define CSB SPH_C32(0x34E90C6C) -#define CSC SPH_C32(0xC0AC29B7) -#define CSD SPH_C32(0xC97C50DD) -#define CSE SPH_C32(0x3F84D5B5) -#define CSF SPH_C32(0xB5470917) - -#if SPH_COMPACT_BLAKE_32 - -static const sph_u32 CS[16] = { - SPH_C32(0x243F6A88), SPH_C32(0x85A308D3), - SPH_C32(0x13198A2E), SPH_C32(0x03707344), - SPH_C32(0xA4093822), SPH_C32(0x299F31D0), - SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89), - SPH_C32(0x452821E6), SPH_C32(0x38D01377), - SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C), - SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD), - SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917) -}; - -#endif - - -#define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \ -do { \ - a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \ - _mm_set_epi32( c1, c1, c1, c1 ), m0 ), b ), a ); \ - d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \ - c = _mm_add_epi32( c, d ); \ - b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \ - a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \ - _mm_set_epi32( c0, c0, c0, c0 ), m1 ), b ), a ); \ - d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \ - c = _mm_add_epi32( c, d ); \ - b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \ 
-} while (0) - -#if SPH_COMPACT_BLAKE_32 - -// Blake-256 4 way - -#define ROUND_S_4WAY(r) do { \ - GS_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \ - CS[sigma[r][0x0]], CS[sigma[r][0x1]], V0, V4, V8, VC); \ - GS_4WAY(M[sigma[r][0x2]], M[sigma[r][0x3]], \ - CS[sigma[r][0x2]], CS[sigma[r][0x3]], V1, V5, V9, VD); \ - GS_4WAY(M[sigma[r][0x4]], M[sigma[r][0x5]], \ - CS[sigma[r][0x4]], CS[sigma[r][0x5]], V2, V6, VA, VE); \ - GS_4WAY(M[sigma[r][0x6]], M[sigma[r][0x7]], \ - CS[sigma[r][0x6]], CS[sigma[r][0x7]], V3, V7, VB, VF); \ - GS_4WAY(M[sigma[r][0x8]], M[sigma[r][0x9]], \ - CS[sigma[r][0x8]], CS[sigma[r][0x9]], V0, V5, VA, VF); \ - GS_4WAY(M[sigma[r][0xA]], M[sigma[r][0xB]], \ - CS[sigma[r][0xA]], CS[sigma[r][0xB]], V1, V6, VB, VC); \ - GS_4WAY(M[sigma[r][0xC]], M[sigma[r][0xD]], \ - CS[sigma[r][0xC]], CS[sigma[r][0xD]], V2, V7, V8, VD); \ - GS_4WAY(M[sigma[r][0xE]], M[sigma[r][0xF]], \ - CS[sigma[r][0xE]], CS[sigma[r][0xF]], V3, V4, V9, VE); \ -} while (0) - -#else - -#define ROUND_S_4WAY(r) do { \ - GS_4WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ - GS_4WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ - GS_4WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ - GS_4WAY(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \ - GS_4WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \ - GS_4WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ - GS_4WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ - GS_4WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ -} while (0) - -#endif - -#define DECL_STATE32_4WAY \ - __m128i H0, H1, H2, H3, H4, H5, H6, H7; \ - __m128i S0, S1, S2, S3; \ - uint32_t T0, T1; - -#define READ_STATE32_4WAY(state) do { \ - H0 = casti_m128i( state->H, 0 ); \ - H1 = casti_m128i( state->H, 1 ); \ - H2 = casti_m128i( state->H, 2 ); \ - H3 = casti_m128i( state->H, 3 ); \ - H4 = casti_m128i( state->H, 4 ); \ - H5 = casti_m128i( state->H, 5 ); \ - H6 = 
casti_m128i( state->H, 6 ); \ - H7 = casti_m128i( state->H, 7 ); \ - S0 = casti_m128i( state->S, 0 ); \ - S1 = casti_m128i( state->S, 1 ); \ - S2 = casti_m128i( state->S, 2 ); \ - S3 = casti_m128i( state->S, 3 ); \ - T0 = (state)->T0; \ - T1 = (state)->T1; \ - } while (0) - -#define WRITE_STATE32_4WAY(state) do { \ - casti_m128i( state->H, 0 ) = H0; \ - casti_m128i( state->H, 1 ) = H1; \ - casti_m128i( state->H, 2 ) = H2; \ - casti_m128i( state->H, 3 ) = H3; \ - casti_m128i( state->H, 4 ) = H4; \ - casti_m128i( state->H, 5 ) = H5; \ - casti_m128i( state->H, 6 ) = H6; \ - casti_m128i( state->H, 7 ) = H7; \ - casti_m128i( state->S, 0 ) = S0; \ - casti_m128i( state->S, 1 ) = S1; \ - casti_m128i( state->S, 2 ) = S2; \ - casti_m128i( state->S, 3 ) = S3; \ - (state)->T0 = T0; \ - (state)->T1 = T1; \ - } while (0) - -#if SPH_COMPACT_BLAKE_32 -// not used - -#define COMPRESS32_4WAY( rounds ) do { \ - __m128i M[16]; \ - __m128i V0, V1, V2, V3, V4, V5, V6, V7; \ - __m128i V8, V9, VA, VB, VC, VD, VE, VF; \ - unsigned r; \ - V0 = H0; \ - V1 = H1; \ - V2 = H2; \ - V3 = H3; \ - V4 = H4; \ - V5 = H5; \ - V6 = H6; \ - V7 = H7; \ - V8 = _mm_xor_si128( S0, _mm_set1_epi32( CS0 ) ); \ - V9 = _mm_xor_si128( S1, _mm_set1_epi32( CS1 ) ); \ - VA = _mm_xor_si128( S2, _mm_set1_epi32( CS2 ) ); \ - VB = _mm_xor_si128( S3, _mm_set1_epi32( CS3 ) ); \ - VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \ - VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \ - VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \ - VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \ - mm128_block_bswap_32( M, buf ); \ - mm128_block_bswap_32( M+8, buf+8 ); \ - for (r = 0; r < rounds; r ++) \ - ROUND_S_4WAY(r); \ - H0 = _mm_xor_si128( _mm_xor_si128( \ - _mm_xor_si128( S0, V0 ), V8 ), H0 ); \ - H1 = _mm_xor_si128( _mm_xor_si128( \ - _mm_xor_si128( S1, V1 ), V9 ), H1 ); \ - H2 = _mm_xor_si128( _mm_xor_si128( \ - _mm_xor_si128( S2, V2 ), VA ), H2 ); \ - 
H3 = _mm_xor_si128( _mm_xor_si128( \ - _mm_xor_si128( S3, V3 ), VB ), H3 ); \ - H4 = _mm_xor_si128( _mm_xor_si128( \ - _mm_xor_si128( S0, V4 ), VC ), H4 ); \ - H5 = _mm_xor_si128( _mm_xor_si128( \ - _mm_xor_si128( S1, V5 ), VD ), H5 ); \ - H6 = _mm_xor_si128( _mm_xor_si128( \ - _mm_xor_si128( S2, V6 ), VE ), H6 ); \ - H7 = _mm_xor_si128( _mm_xor_si128( \ - _mm_xor_si128( S3, V7 ), VF ), H7 ); \ - } while (0) - -#else - -// current impl - -#if defined(__SSSE3__) - -#define BLAKE256_4WAY_BLOCK_BSWAP32 do \ -{ \ - __m128i shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \ - 0x0405060700010203 ); \ - M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \ - M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \ - M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \ - M3 = _mm_shuffle_epi8( buf[ 3], shuf_bswap32 ); \ - M4 = _mm_shuffle_epi8( buf[ 4], shuf_bswap32 ); \ - M5 = _mm_shuffle_epi8( buf[ 5], shuf_bswap32 ); \ - M6 = _mm_shuffle_epi8( buf[ 6], shuf_bswap32 ); \ - M7 = _mm_shuffle_epi8( buf[ 7], shuf_bswap32 ); \ - M8 = _mm_shuffle_epi8( buf[ 8], shuf_bswap32 ); \ - M9 = _mm_shuffle_epi8( buf[ 9], shuf_bswap32 ); \ - MA = _mm_shuffle_epi8( buf[10], shuf_bswap32 ); \ - MB = _mm_shuffle_epi8( buf[11], shuf_bswap32 ); \ - MC = _mm_shuffle_epi8( buf[12], shuf_bswap32 ); \ - MD = _mm_shuffle_epi8( buf[13], shuf_bswap32 ); \ - ME = _mm_shuffle_epi8( buf[14], shuf_bswap32 ); \ - MF = _mm_shuffle_epi8( buf[15], shuf_bswap32 ); \ -} while(0) - -#else // SSE2 - -#define BLAKE256_4WAY_BLOCK_BSWAP32 do \ -{ \ - M0 = mm128_bswap_32( buf[0] ); \ - M1 = mm128_bswap_32( buf[1] ); \ - M2 = mm128_bswap_32( buf[2] ); \ - M3 = mm128_bswap_32( buf[3] ); \ - M4 = mm128_bswap_32( buf[4] ); \ - M5 = mm128_bswap_32( buf[5] ); \ - M6 = mm128_bswap_32( buf[6] ); \ - M7 = mm128_bswap_32( buf[7] ); \ - M8 = mm128_bswap_32( buf[8] ); \ - M9 = mm128_bswap_32( buf[9] ); \ - MA = mm128_bswap_32( buf[10] ); \ - MB = mm128_bswap_32( buf[11] ); \ - MC = mm128_bswap_32( buf[12] ); \ - MD = mm128_bswap_32( 
buf[13] ); \ - ME = mm128_bswap_32( buf[14] ); \ - MF = mm128_bswap_32( buf[15] ); \ -} while(0) - -#endif // SSSE3 else SSE2 - -#define COMPRESS32_4WAY( rounds ) \ -do { \ - __m128i M0, M1, M2, M3, M4, M5, M6, M7; \ - __m128i M8, M9, MA, MB, MC, MD, ME, MF; \ - __m128i V0, V1, V2, V3, V4, V5, V6, V7; \ - __m128i V8, V9, VA, VB, VC, VD, VE, VF; \ - V0 = H0; \ - V1 = H1; \ - V2 = H2; \ - V3 = H3; \ - V4 = H4; \ - V5 = H5; \ - V6 = H6; \ - V7 = H7; \ - V8 = _mm_xor_si128( S0, _mm_set1_epi32( CS0 ) ); \ - V9 = _mm_xor_si128( S1, _mm_set1_epi32( CS1 ) ); \ - VA = _mm_xor_si128( S2, _mm_set1_epi32( CS2 ) ); \ - VB = _mm_xor_si128( S3, _mm_set1_epi32( CS3 ) ); \ - VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \ - VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \ - VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \ - VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \ - BLAKE256_4WAY_BLOCK_BSWAP32; \ - ROUND_S_4WAY(0); \ - ROUND_S_4WAY(1); \ - ROUND_S_4WAY(2); \ - ROUND_S_4WAY(3); \ - ROUND_S_4WAY(4); \ - ROUND_S_4WAY(5); \ - ROUND_S_4WAY(6); \ - ROUND_S_4WAY(7); \ - if (rounds == 14) \ - { \ - ROUND_S_4WAY(8); \ - ROUND_S_4WAY(9); \ - ROUND_S_4WAY(0); \ - ROUND_S_4WAY(1); \ - ROUND_S_4WAY(2); \ - ROUND_S_4WAY(3); \ - } \ - H0 = mm128_xor4( V8, V0, S0, H0 ); \ - H1 = mm128_xor4( V9, V1, S1, H1 ); \ - H2 = mm128_xor4( VA, V2, S2, H2 ); \ - H3 = mm128_xor4( VB, V3, S3, H3 ); \ - H4 = mm128_xor4( VC, V4, S0, H4 ); \ - H5 = mm128_xor4( VD, V5, S1, H5 ); \ - H6 = mm128_xor4( VE, V6, S2, H6 ); \ - H7 = mm128_xor4( VF, V7, S3, H7 ); \ -} while (0) - -#endif - -#if defined (__AVX2__) - -// Blake-256 8 way - -#define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \ -do { \ - a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \ - _mm256_set1_epi32( c1 ), m0 ), b ), a ); \ - d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \ - c = _mm256_add_epi32( c, d ); \ - b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); 
\ - a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \ - _mm256_set1_epi32( c0 ), m1 ), b ), a ); \ - d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \ - c = _mm256_add_epi32( c, d ); \ - b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \ -} while (0) - -#define ROUND_S_8WAY(r) do { \ - GS_8WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ - GS_8WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ - GS_8WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ - GS_8WAY(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \ - GS_8WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \ - GS_8WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ - GS_8WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ - GS_8WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ -} while (0) - -#define DECL_STATE32_8WAY \ - __m256i H0, H1, H2, H3, H4, H5, H6, H7; \ - __m256i S0, S1, S2, S3; \ - sph_u32 T0, T1; - -#define READ_STATE32_8WAY(state) \ -do { \ - H0 = (state)->H[0]; \ - H1 = (state)->H[1]; \ - H2 = (state)->H[2]; \ - H3 = (state)->H[3]; \ - H4 = (state)->H[4]; \ - H5 = (state)->H[5]; \ - H6 = (state)->H[6]; \ - H7 = (state)->H[7]; \ - S0 = (state)->S[0]; \ - S1 = (state)->S[1]; \ - S2 = (state)->S[2]; \ - S3 = (state)->S[3]; \ - T0 = (state)->T0; \ - T1 = (state)->T1; \ -} while (0) - -#define WRITE_STATE32_8WAY(state) \ -do { \ - (state)->H[0] = H0; \ - (state)->H[1] = H1; \ - (state)->H[2] = H2; \ - (state)->H[3] = H3; \ - (state)->H[4] = H4; \ - (state)->H[5] = H5; \ - (state)->H[6] = H6; \ - (state)->H[7] = H7; \ - (state)->S[0] = S0; \ - (state)->S[1] = S1; \ - (state)->S[2] = S2; \ - (state)->S[3] = S3; \ - (state)->T0 = T0; \ - (state)->T1 = T1; \ -} while (0) - -#define COMPRESS32_8WAY( rounds ) \ -do { \ - __m256i M0, M1, M2, M3, M4, M5, M6, M7; \ - __m256i M8, M9, MA, MB, MC, MD, ME, MF; \ - __m256i V0, V1, V2, V3, V4, V5, V6, V7; \ - __m256i V8, V9, VA, VB, VC, VD, VE, 
VF; \ - __m256i shuf_bswap32; \ - V0 = H0; \ - V1 = H1; \ - V2 = H2; \ - V3 = H3; \ - V4 = H4; \ - V5 = H5; \ - V6 = H6; \ - V7 = H7; \ - V8 = _mm256_xor_si256( S0, _mm256_set1_epi32( CS0 ) ); \ - V9 = _mm256_xor_si256( S1, _mm256_set1_epi32( CS1 ) ); \ - VA = _mm256_xor_si256( S2, _mm256_set1_epi32( CS2 ) ); \ - VB = _mm256_xor_si256( S3, _mm256_set1_epi32( CS3 ) ); \ - VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS4 ) ); \ - VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \ - VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \ - VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \ - shuf_bswap32 = _mm256_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203, \ - 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ - M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \ - M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \ - M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \ - M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \ - M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \ - M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \ - M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \ - M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \ - M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \ - M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \ - MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap32 ); \ - MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap32 ); \ - MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap32 ); \ - MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap32 ); \ - ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap32 ); \ - MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap32 ); \ - ROUND_S_8WAY(0); \ - ROUND_S_8WAY(1); \ - ROUND_S_8WAY(2); \ - ROUND_S_8WAY(3); \ - ROUND_S_8WAY(4); \ - ROUND_S_8WAY(5); \ - ROUND_S_8WAY(6); \ - ROUND_S_8WAY(7); \ - if (rounds == 14) \ - { \ - ROUND_S_8WAY(8); \ - ROUND_S_8WAY(9); \ - ROUND_S_8WAY(0); \ - 
ROUND_S_8WAY(1); \ - ROUND_S_8WAY(2); \ - ROUND_S_8WAY(3); \ - } \ - H0 = mm256_xor4( V8, V0, S0, H0 ); \ - H1 = mm256_xor4( V9, V1, S1, H1 ); \ - H2 = mm256_xor4( VA, V2, S2, H2 ); \ - H3 = mm256_xor4( VB, V3, S3, H3 ); \ - H4 = mm256_xor4( VC, V4, S0, H4 ); \ - H5 = mm256_xor4( VD, V5, S1, H5 ); \ - H6 = mm256_xor4( VE, V6, S2, H6 ); \ - H7 = mm256_xor4( VF, V7, S3, H7 ); \ -} while (0) - - -#endif - -// Blake-256 4 way - -static const uint32_t salt_zero_4way_small[4] = { 0, 0, 0, 0 }; - -static void -blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv, - const uint32_t *salt, int rounds ) -{ - __m128i zero = m128_zero; - casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] ); - casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] ); - casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] ); - casti_m128i( ctx->H, 3 ) = _mm_set1_epi32( iv[3] ); - casti_m128i( ctx->H, 4 ) = _mm_set1_epi32( iv[4] ); - casti_m128i( ctx->H, 5 ) = _mm_set1_epi32( iv[5] ); - casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] ); - casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] ); - - casti_m128i( ctx->S, 0 ) = zero; - casti_m128i( ctx->S, 1 ) = zero; - casti_m128i( ctx->S, 2 ) = zero; - casti_m128i( ctx->S, 3 ) = zero; - ctx->T0 = ctx->T1 = 0; - ctx->ptr = 0; - ctx->rounds = rounds; -} - -static void -blake32_4way( blake_4way_small_context *ctx, const void *data, size_t len ) -{ - __m128i *buf = (__m128i*)ctx->buf; - size_t bptr = ctx->ptr<<2; - size_t vptr = ctx->ptr >> 2; - size_t blen = len << 2; - DECL_STATE32_4WAY - - if ( blen < (sizeof ctx->buf) - bptr ) - { - memcpy( buf + vptr, data, (sizeof ctx->buf) - bptr ); - bptr += blen; - ctx->ptr = bptr>>2; - return; - } - - READ_STATE32_4WAY( ctx ); - while ( blen > 0 ) - { - size_t clen = ( sizeof ctx->buf ) - bptr; - - if ( clen > blen ) - clen = blen; - memcpy( buf + vptr, data, clen ); - bptr += clen; - data = (const unsigned char *)data + clen; - blen -= clen; - if ( bptr == ( sizeof ctx->buf ) ) - { - if ( ( T0 = T0 + 512 ) < 
512 ) - T1 = T1 + 1; - COMPRESS32_4WAY( ctx->rounds ); - bptr = 0; - } - } - WRITE_STATE32_4WAY( ctx ); - ctx->ptr = bptr>>2; -} - -static void -blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n, - void *dst, size_t out_size_w32 ) -{ - __m128i buf[16] __attribute__ ((aligned (64))); - size_t ptr = ctx->ptr; - size_t vptr = ctx->ptr>>2; - unsigned bit_len = ( (unsigned)ptr << 3 ); - uint32_t tl = ctx->T0 + bit_len; - uint32_t th = ctx->T1; - - if ( ptr == 0 ) - { - ctx->T0 = 0xFFFFFE00UL; - ctx->T1 = 0xFFFFFFFFUL; - } - else if ( ctx->T0 == 0 ) - { - ctx->T0 = 0xFFFFFE00UL + bit_len; - ctx->T1 = ctx->T1 - 1; - } - else - ctx->T0 -= 512 - bit_len; - - buf[vptr] = _mm_set1_epi32( 0x80 ); - - if ( vptr < 12 ) - { - memset_zero_128( buf + vptr + 1, 13 - vptr ); - buf[ 13 ] = _mm_or_si128( buf[ 13 ], _mm_set1_epi32( 0x01000000UL ) ); - buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) ); - buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) ); - blake32_4way( ctx, buf + vptr, 64 - ptr ); - } - else - { - memset_zero_128( buf + vptr + 1, (60-ptr) >> 2 ); - blake32_4way( ctx, buf + vptr, 64 - ptr ); - ctx->T0 = 0xFFFFFE00UL; - ctx->T1 = 0xFFFFFFFFUL; - memset_zero_128( buf, 56>>2 ); - buf[ 13 ] = _mm_or_si128( buf[ 13 ], _mm_set1_epi32( 0x01000000UL ) ); - buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) ); - buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) ); - blake32_4way( ctx, buf, 64 ); - } - - mm128_block_bswap_32( (__m128i*)dst, (__m128i*)ctx->H ); -} - -#if defined (__AVX2__) - -// Blake-256 8 way - -static const sph_u32 salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; - -static void -blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv, - const sph_u32 *salt, int rounds ) -{ - __m256i zero = m256_zero; - casti_m256i( sc->H, 0 ) = _mm256_set1_epi32( iv[0] ); - casti_m256i( sc->H, 1 ) = _mm256_set1_epi32( iv[1] ); - casti_m256i( sc->H, 2 ) = _mm256_set1_epi32( iv[2] ); - casti_m256i( sc->H, 3 ) = _mm256_set1_epi32( iv[3] ); - 
casti_m256i( sc->H, 4 ) = _mm256_set1_epi32( iv[4] ); - casti_m256i( sc->H, 5 ) = _mm256_set1_epi32( iv[5] ); - casti_m256i( sc->H, 6 ) = _mm256_set1_epi32( iv[6] ); - casti_m256i( sc->H, 7 ) = _mm256_set1_epi32( iv[7] ); - - casti_m256i( sc->S, 0 ) = zero; - casti_m256i( sc->S, 1 ) = zero; - casti_m256i( sc->S, 2 ) = zero; - casti_m256i( sc->S, 3 ) = zero; - - sc->T0 = sc->T1 = 0; - sc->ptr = 0; - sc->rounds = rounds; -} - -static void -blake32_8way( blake_8way_small_context *sc, const void *data, size_t len ) -{ - __m256i *vdata = (__m256i*)data; - __m256i *buf; - size_t ptr; - const int buf_size = 64; // number of elements, sizeof/4 - DECL_STATE32_8WAY - buf = sc->buf; - ptr = sc->ptr; - if ( len < buf_size - ptr ) - { - memcpy_256( buf + (ptr>>2), vdata, len>>2 ); - ptr += len; - sc->ptr = ptr; - return; - } - - READ_STATE32_8WAY(sc); - while ( len > 0 ) - { - size_t clen; - - clen = buf_size - ptr; - if (clen > len) - clen = len; - memcpy_256( buf + (ptr>>2), vdata, clen>>2 ); - ptr += clen; - vdata += (clen>>2); - len -= clen; - if ( ptr == buf_size ) - { - if ( ( T0 = SPH_T32(T0 + 512) ) < 512 ) - T1 = SPH_T32(T1 + 1); - COMPRESS32_8WAY( sc->rounds ); - ptr = 0; - } - } - WRITE_STATE32_8WAY(sc); - sc->ptr = ptr; -} - -static void -blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n, - void *dst, size_t out_size_w32 ) -{ - __m256i buf[16]; - size_t ptr; - unsigned bit_len; - sph_u32 th, tl; - - ptr = sc->ptr; - bit_len = ((unsigned)ptr << 3); - buf[ptr>>2] = _mm256_set1_epi32( 0x80 ); - tl = sc->T0 + bit_len; - th = sc->T1; - - if ( ptr == 0 ) - { - sc->T0 = SPH_C32(0xFFFFFE00UL); - sc->T1 = SPH_C32(0xFFFFFFFFUL); - } - else if ( sc->T0 == 0 ) - { - sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len; - sc->T1 = SPH_T32(sc->T1 - 1); - } - else - sc->T0 -= 512 - bit_len; - - if ( ptr <= 52 ) - { - memset_zero_256( buf + (ptr>>2) + 1, (52 - ptr) >> 2 ); - if ( out_size_w32 == 8 ) - buf[52>>2] = _mm256_or_si256( buf[52>>2], - _mm256_set1_epi32( 
0x01000000UL ) ); - *(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) ); - *(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) ); - blake32_8way( sc, buf + (ptr>>2), 64 - ptr ); - } - else - { - memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 ); - blake32_8way( sc, buf + (ptr>>2), 64 - ptr ); - sc->T0 = SPH_C32(0xFFFFFE00UL); - sc->T1 = SPH_C32(0xFFFFFFFFUL); - memset_zero_256( buf, 56>>2 ); - if ( out_size_w32 == 8 ) - buf[52>>2] = _mm256_set1_epi32( 0x01000000UL ); - *(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) ); - *(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) ); - blake32_8way( sc, buf, 64 ); - } - mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H ); -} - -#endif - -// Blake-256 4 way - -// default 14 rounds, backward copatibility -void -blake256_4way_init(void *ctx) -{ - blake32_4way_init( ctx, IV256, salt_zero_4way_small, 14 ); -} - -void -blake256_4way(void *ctx, const void *data, size_t len) -{ - blake32_4way(ctx, data, len); -} - -void -blake256_4way_close(void *ctx, void *dst) -{ - blake32_4way_close(ctx, 0, 0, dst, 8); -} - -#if defined(__AVX2__) - -// Blake-256 8 way - -void -blake256_8way_init(void *cc) -{ - blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 ); -} - -void -blake256_8way(void *cc, const void *data, size_t len) -{ - blake32_8way(cc, data, len); -} - -void -blake256_8way_close(void *cc, void *dst) -{ - blake32_8way_close(cc, 0, 0, dst, 8); -} - -#endif - -// 14 rounds Blake, Decred -void blake256r14_4way_init(void *cc) -{ - blake32_4way_init( cc, IV256, salt_zero_4way_small, 14 ); -} - -void -blake256r14_4way(void *cc, const void *data, size_t len) -{ - blake32_4way(cc, data, len); -} - -void -blake256r14_4way_close(void *cc, void *dst) -{ - blake32_4way_close(cc, 0, 0, dst, 8); -} - -#if defined(__AVX2__) - -void blake256r14_8way_init(void *cc) -{ - blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 ); -} - -void -blake256r14_8way(void *cc, const void *data, size_t len) -{ - 
blake32_8way(cc, data, len); -} - -void -blake256r14_8way_close(void *cc, void *dst) -{ - blake32_8way_close(cc, 0, 0, dst, 8); -} - -#endif - -// 8 rounds Blakecoin, Vanilla -void blake256r8_4way_init(void *cc) -{ - blake32_4way_init( cc, IV256, salt_zero_4way_small, 8 ); -} - -void -blake256r8_4way(void *cc, const void *data, size_t len) -{ - blake32_4way(cc, data, len); -} - -void -blake256r8_4way_close(void *cc, void *dst) -{ - blake32_4way_close(cc, 0, 0, dst, 8); -} - -#if defined (__AVX2__) - -void blake256r8_8way_init(void *cc) -{ - blake32_8way_init( cc, IV256, salt_zero_8way_small, 8 ); -} - -void -blake256r8_8way(void *cc, const void *data, size_t len) -{ - blake32_8way(cc, data, len); -} - -void -blake256r8_8way_close(void *cc, void *dst) -{ - blake32_8way_close(cc, 0, 0, dst, 8); -} - -#endif - -#ifdef __cplusplus -} -#endif - -//#endif diff --git a/algo/blake/blake256-hash-4way.c.new b/algo/blake/blake256-hash-4way.c.new deleted file mode 100644 index 683c84c..0000000 --- a/algo/blake/blake256-hash-4way.c.new +++ /dev/null @@ -1,322 +0,0 @@ -// convert blake256 32 bit to use 64 bit with serial vectoring -// -// cut calls to GS in half -// -// combine V -// v0 = {V0,V1} -// v1 = {V2,V3} -// v2 = {V4,V5} -// v3 = {V6,V7} -// v4 = {V8,V9} -// v5 = {VA,VB} -// v6 = {VC,VD} -// v7 = {CE,VF} -// -// v6x = {VD,VC} swap(VC,VD) swap(v6) -// v7x = {VF,VE} swap(VE,VF) swap(v7) -// -// V0 = v1v0 -// V1 = v3v2 -// V2 = v5v4 -// V3 = v7v6 -// V4 = v9v8 -// V5 = vbva -// V6 = vdvc -// V7 = vfve -// -// The rotate in ROUND is to effect straddle and unstraddle for the third -// and 4th iteration of GS. -// It concatenates 2 contiguous 256 bit vectors and extracts the middle -// 256 bits. After the transform they must be restored with only the -// chosen bits modified in the original 2 vectors. -// ror1x128 achieves this by putting the chosen bits in arg1, the "low" -// 256 bit vector and saves the untouched bits temporailly in arg0, the -// "high" 256 bit vector. 
Simply reverse the process to restore data back -// to original positions. - -// Use standard 4way when AVX2 is not available use x2 mode with AVX2. -// -// Data is organised the same as 32 bit 4 way, in effect serial vectoring -// on top of parallel vectoring. Same data in the same place just taking -// two chunks at a time. -// -// Transparent to user, x2 mode used when AVX2 detected. -// Use existing 4way context but revert to scalar types. -// Same interleave function (128 bit) or x2 with 256 bit? -// User trsnaparency would have to apply to interleave as well. -// -// Use common 4way update and close - -/* -typedef struct { - unsigned char buf[64<<2]; - uint32_t H[8<<2]; - uint32_t S[4<<2]; - size_t ptr; - uint32_t T0, T1; - int rounds; // 14 for blake, 8 for blakecoin & vanilla -} blakex2_4way_small_context __attribute__ ((aligned (64))); -*/ - -static void -blake32x2_4way_init( blake_4way_small_context *ctx, const uint32_t *iv, - const uint32_t *salt, int rounds ) -{ - casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] ); - casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] ); - casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] ); - casti_m128i( ctx->H, 3 ) = _mm_set1_epi32( iv[3] ); - casti_m128i( ctx->H, 4 ) = _mm_set1_epi32( iv[4] ); - casti_m128i( ctx->H, 5 ) = _mm_set1_epi32( iv[5] ); - casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] ); - casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] ); - - casti_m128i( ctx->S, 0 ) = m128_zero; - casti_m128i( ctx->S, 1 ) = m128_zero; - casti_m128i( ctx->S, 2 ) = m128_zero; - casti_m128i( ctx->S, 3 ) = m128_zero; -/* - sc->S[0] = _mm_set1_epi32( salt[0] ); - sc->S[1] = _mm_set1_epi32( salt[1] ); - sc->S[2] = _mm_set1_epi32( salt[2] ); - sc->S[3] = _mm_set1_epi32( salt[3] ); -*/ - ctx->T0 = ctx->T1 = 0; - ctx->ptr = 0; - ctx->rounds = rounds; -} - -static void -blake32x2( blake_4way_small_context *ctx, const void *data, size_t len ) -{ - __m128i *buf = (__m256i*)ctx->buf; - size_t bptr = ctx->ptr << 2; - size_t vptr = 
ctx->ptr >> 3; - size_t blen = len << 2; -// unsigned char *buf = ctx->buf; -// size_t ptr = ctx->ptr<<4; // repurposed - DECL_STATE32x2 - -// buf = sc->buf; -// ptr = sc->ptr; - -// adjust len for use with ptr, clen, all absolute bytes. -// int blen = len<<2; - - if ( blen < (sizeof ctx->buf) - bptr ) - { - memcpy( buf + vptr, data, blen ); - ptr += blen; - ctx->ptr = bptr >> 2;; - return; - } - - READ_STATE32( ctx ); - while ( blen > 0 ) - { - size_t clen; - - clen = ( sizeof sc->buf ) - ptr; - if ( clen > blen ) - clen = blen; - memcpy( buf + vptr, data, clen ); - bptr += clen; - vptr = bptr >> 5; - data = (const unsigned char *)data + clen; - blen -= clen; - if ( bptr == sizeof ctx->buf ) - { - if ( ( T0 = T0 + 512 ) < 512 ) // not needed, will never rollover - T1 += 1; - COMPRESS32x2_4WAY( ctx->rounds ); - ptr = 0; - } - } - WRITE_STATE32x2( ctx ); - ctx->ptr = bptr >> 2; -} - -static void -blake32x2_4way_close( blake_4way_small_context *ctx, void *dst ) -{ - __m256i buf[8] __attribute__ ((aligned (64))); - size_t ptr = ctx->ptr; - size_t vptr = ctx->ptr>>2; - unsigned bit_len = ( (unsigned)ptr << 3 ); // one lane - uint32_t th = ctx->T1; - uint32_t tl = ctx->T0 + bit_len; - - if ( ptr == 0 ) - { - ctx->T0 = 0xFFFFFE00UL; - ctx->T1 = 0xFFFFFFFFUL; - } - else if ( ctx->T0 == 0 ) - { - ctx->T0 = 0xFFFFFE00UL + bit_len; - ctx->T1 -= 1; - } - else - ctx->T0 -= 512 - bit_len; - - // memset doesn't do ints - buf[ vptr ] = _mm256_set_epi32( 0,0,0,0, 0x80, 0x80, 0x80, 0x80 ); - - if ( vptr < 5 ) - { - memset_zero_256( buf + vptr + 1, 6 - vptr ); - buf[ 6 ] = _mm256_or_si256( vbuf[ 6 ], _mm256_set_epi32( - 0x01000000UL,0x01000000UL,0x01000000UL,0x01000000UL, 0,0,0,0 ) ); - buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl,tl,tl,tl, - th,th,th,th ) ); - blake32x2_4way( ctx, buf + vptr, 64 - ptr ); - } - else - { - memset_zero_256( vbuf + vptr + 1, 7 - vptr ); - blake32x2_4way( ctx, vbuf + ptr, 64 - ptr ); - ctx->T0 = 0xFFFFFE00UL; - ctx->T1 = 0xFFFFFFFFUL; - buf[ 6 ] = 
mm256_zero; - buf[ 6 ] = _mm256_set_epi32( 0,0,0,0, - 0x01000000UL,0x01000000UL,0x01000000UL,0x01000000UL ); - buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl, tl, tl, tl, - th, th, th, th ); - blake32x2_4way( ctx, buf, 64 ); - } - - casti_m256i( dst, 0 ) = mm256_bswap_32( casti_m256i( ctx->H, 0 ) ); - casti_m256i( dst, 1 ) = mm256_bswap_32( casti_m256i( ctx->H, 1 ) ); - casti_m256i( dst, 2 ) = mm256_bswap_32( casti_m256i( ctx->H, 2 ) ); - casti_m256i( dst, 3 ) = mm256_bswap_32( casti_m256i( ctx->H, 3 ) ); -} - - - - -#define DECL_STATE32x2_4WAY \ - __m256i H0, H1, H2, H3; \ - __m256i S0, S1; \ - uint32_t T0, T1; - -#define READ_STATE32x2_4WAY(state) do \ -{ \ - H0 = casti_m256i( state->H, 0 ); \ - H1 = casti_m256i( state->H, 1 ); \ - H2 = casti_m256i( state->H, 2 ); \ - H3 = casti_m256i( state->H, 3 ); \ - S0 = casti_m256i( state->S, 0 ); \ - S1 = casti_m256i( state->S, 1 ); \ - T0 = state->T0; \ - T1 = state->T1; \ - -#define WRITE_STATE32x2_4WAY(state) do { \ - casti_m256i( state->H, 0 ) = H0; \ - casti_m256i( state->H, 1 ) = H1; \ - casti_m256i( state->H, 2 ) = H2; \ - casti_m256i( state->H, 3 ) = H3; \ - casti_m256i( state->S, 0 ) = S0; \ - casti_m256i( state->S, 1 ) = S1; \ - state->T0 = T0; \ - state->T1 = T1; \ -} while (0) - - -#define GSx2_4WAY( m0m2, m1m3, c0c2, c1c3, a, b, c, d ) do \ -{ \ - a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \ - _mm256_set_epi32( c1,c3, c1,c3, c1,c3, c1,c3 ), \ - _mm256_set_epi32( m0,m2, m0,m2, m0,m2, m0,m2 ) ), b ), a ); \ - d = mm256_ror_32( _mm_xor_si128( d, a ), 16 ); \ - c = _mm256_add_epi32( c, d ); \ - b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \ - a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \ - _mm256_set_epi32( c0,c2, c0,c2, c0,c2, c0,c2 ), \ - _mm256_set_epi32( m1,m3, m1,m3, m1,m3, m1,m3 ) ), b ), a ); \ - d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \ - c = _mm256_add_epi32( c, d ); \ - b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \ -} while (0) - -#define ROUND_Sx2_4WAY(r) 
do \ -{ \ - GS2_4WAY( Mx(r, 0), Mx(r, 1), Mx(r, 2), Mx(r, 3), \ - CSx(r, 0), CSx(r, 1), CSx(r, 2), CSx(r, 3), V0, V2, V4, V6 ); \ - GS2_4WAY( Mx(r, 4), Mx(r, 5), Mx(r, 6), Mx(r, 7), \ - CSx(r, 4), CSx(r, 5), CSx(r, 6), CSx(r, 7), V1, V3, V5, V7 ); \ - mm256_ror1x128_512( V3, V2 ); \ - mm256_ror1x128_512( V6, V7 ); \ - GS2_4WAY( Mx(r, 8), Mx(r, 9), Mx(r, A), Mx(r, B), \ - CSx(r, 8), CSx(r, 9), CSx(r, A), CSx(r, B), V0, V2, V5, V7 ); \ - GS2_4WAY( Mx(r, C), Mx(r, D), Mx(r, C), Mx(r, D), \ - CSx(r, C), CSx(r, D), CSx(r, C), CSx(r, D), V1, V3, V4, V6 ); \ - mm256_rol1x128_512( V2, V3 ); \ - mm256_rol1x128_512( V7, V6 ); - -#define COMPRESS32x2_4WAY( rounds ) do \ -{ \ - __m256i M0, M1, M2, M3, M4, M5, M6, M7; \ - __m256i V0, V1, V2, V3, V4, V5, V6, V7; \ - unsigned r; \ - V0 = H0; \ - V1 = H1; \ - V2 = H2; \ - V3 = H3; \ - V4 = _mm256_xor_si256( S0, _mm256_set_epi32( CS1, CS1, CS1, CS1, \ - CS0, CS0, CS0, CS0 ) ); \ - V5 = _mm256_xor_si256( S1, _mm256_set_epi32( CS3, CS3, CS3, CS3, \ - CS2, CS2, CS2, CS2 ) ); \ - V6 = _mm256_xor_si256( _mm256_set1_epi32( T0 ), \ - _mm256_set_epi32( CS5, CS5, CS5, CS5, \ - CS4, CS4, CS4, CS4 ) ); \ - V7 = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \ - _mm256_set_epi32( CS7, CS7, CS7, CS7, \ - CS6, CS6, CS6, CS6 ) ); \ - M0 = mm256_bswap_32( buf[ 0] ); \ - M1 = mm256_bswap_32( buf[ 1] ); \ - M2 = mm256_bswap_32( buf[ 2] ); \ - M3 = mm256_bswap_32( buf[ 3] ); \ - M4 = mm256_bswap_32( buf[ 4] ); \ - M5 = mm256_bswap_32( buf[ 5] ); \ - M6 = mm256_bswap_32( buf[ 6] ); \ - M7 = mm256_bswap_32( buf[ 7] ); \ - ROUND_Sx2_4WAY(0); \ - ROUND_Sx2_4WAY(1); \ - ROUND_Sx2_4WAY(2); \ - ROUND_Sx2_4WAY(3); \ - ROUND_Sx2_4WAY(4); \ - ROUND_Sx2_4WAY(5); \ - ROUND_Sx2_4WAY(6); \ - ROUND_Sx2_4WAY(7); \ - if (rounds == 14) \ - { \ - ROUND_Sx2_4WAY(8); \ - ROUND_Sx2_4WAY(9); \ - ROUND_Sx2_4WAY(0); \ - ROUND_Sx2_4WAY(1); \ - ROUND_Sx2_4WAY(2); \ - ROUND_Sx2_4WAY(3); \ - } \ - H0 = _mm256_xor_si256( _mm256_xor_si256( \ - _mm256_xor_si256( V8, V0 ), S0 ), H0 ); \ 
- H1 = _mm256_xor_si256( _mm256_xor_si256( \ - _mm256_xor_si256( V9, V1 ), S1 ), H1 ); \ - H2 = _mm256_xor_si256( _mm256_xor_si256( \ - _mm256_xor_si256( VA, V2 ), S2 ), H2 ); \ - H3 = _mm256_xor_si256( _mm256_xor_si256( \ - _mm256_xor_si256( VB, V3 ), S3 ), H3 ); \ -} while (0) - - - - - - - - - - - - - - diff --git a/algo/blake/blake2b.c b/algo/blake/blake2b.c deleted file mode 100644 index 6799116..0000000 --- a/algo/blake/blake2b.c +++ /dev/null @@ -1,231 +0,0 @@ -/** - * Blake2-B Implementation - * tpruvot@github 2015-2016 - */ - -#include "algo-gate-api.h" -#include -#include -#include "algo/blake/sph_blake2b.h" - -//static __thread sph_blake2b_ctx s_midstate; -//static __thread sph_blake2b_ctx s_ctx; -#define MIDLEN 76 -#define A 64 - -void blake2b_hash(void *output, const void *input) -{ - uint8_t _ALIGN(A) hash[32]; - sph_blake2b_ctx ctx __attribute__ ((aligned (64))); - - sph_blake2b_init(&ctx, 32, NULL, 0); - sph_blake2b_update(&ctx, input, 80); - sph_blake2b_final(&ctx, hash); - - memcpy(output, hash, 32); -} - -/* -static void blake2b_hash_end(uint32_t *output, const uint32_t *input) -{ - s_ctx.outlen = MIDLEN; - memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN); - sph_blake2b_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN); - sph_blake2b_final(&s_ctx, (uint8_t*) output); -} -*/ - -int scanhash_blake2b( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(A) vhashcpu[8]; - uint32_t _ALIGN(A) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - int thr_id = mythr->id; // thr_id arg is deprecated - - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[8]; - - uint32_t n = first_nonce; - - for (int i=0; i < 19; i++) { - be32enc(&endiandata[i], pdata[i]); - } - - // midstate (untested yet) - //blake2b_init(&s_midstate, 32, NULL, 0); - //blake2b_update(&s_midstate, (uint8_t*) endiandata, MIDLEN); - //memcpy(&s_ctx, &s_midstate, 
sizeof(blake2b_ctx)); - - do { - be32enc(&endiandata[8], n); - //blake2b_hash_end(vhashcpu, endiandata); - blake2b_hash(vhashcpu, endiandata); - - if (vhashcpu[7] < Htarg && fulltest(vhashcpu, ptarget)) { - work_set_target_ratio(work, vhashcpu); - *hashes_done = n - first_nonce + 1; - pdata[8] = n; - return 1; - } - n++; - - } while (n < max_nonce && !work_restart[thr_id].restart); - *hashes_done = n - first_nonce + 1; - pdata[8] = n; - - return 0; -} - -static inline void swab256(void *dest_p, const void *src_p) -{ - uint32_t *dest = (uint32_t *)dest_p; - const uint32_t *src = (uint32_t *)src_p; - - dest[0] = swab32(src[7]); - dest[1] = swab32(src[6]); - dest[2] = swab32(src[5]); - dest[3] = swab32(src[4]); - dest[4] = swab32(src[3]); - dest[5] = swab32(src[2]); - dest[6] = swab32(src[1]); - dest[7] = swab32(src[0]); -} - -/* compute nbits to get the network diff */ -void blake2b_calc_network_diff(struct work *work) -{ - // sample for diff 43.281 : 1c05ea29 - uint32_t nbits = work->data[11]; // unsure if correct - uint32_t bits = (nbits & 0xffffff); - int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28 - - double d = (double)0x0000ffff / (double)bits; - for (int m=shift; m < 29; m++) d *= 256.0; - for (int m=29; m < shift; m++) d /= 256.0; - if (opt_debug_diff) - applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits); - net_diff = d; -} - -void blake2b_be_build_stratum_request( char *req, struct work *work ) -{ - unsigned char *xnonce2str; - uint32_t ntime, nonce; - char ntimestr[9], noncestr[9]; - be32enc( &ntime, work->data[ algo_gate.ntime_index ] ); - be32enc( &nonce, work->data[ algo_gate.nonce_index ] ); - bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) ); - bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) ); - uint16_t high_nonce = swab32(work->data[9]) >> 16; - xnonce2str = abin2hex((unsigned char*)(&high_nonce), 2); - snprintf( req, JSON_BUF_LEN, - "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", 
\"%s\"], \"id\":4}", - rpc_user, work->job_id, xnonce2str, ntimestr, noncestr ); - free( xnonce2str ); -} - -#define min(a,b) (a>b ? (b) :(a)) - -// merkle root handled here, no need for gen_merkle_root gate target -void blake2b_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) -{ - uchar merkle_root[64] = { 0 }; - uint32_t extraheader[32] = { 0 }; - int headersize = 0; - size_t t; - int i; - - // merkle root - memcpy( merkle_root, sctx->job.coinbase, 32 ); - headersize = min( (int)sctx->job.coinbase_size - 32, sizeof(extraheader) ); - memcpy( extraheader, &sctx->job.coinbase[32], headersize ); - // Increment extranonce2 - for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); - // Assemble block header - memset( g_work->data, 0, sizeof(g_work->data) ); -// g_work->data[0] = le32dec( sctx->job.version ); -// for ( i = 0; i < 8; i++ ) -// g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i ); - for ( i = 0; i < 8; i++ ) - g_work->data[i] = ((uint32_t*)sctx->job.prevhash)[7-i]; -// for ( i = 0; i < 8; i++ ) -// g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i ); - g_work->data[8] = 0; // nonce - g_work->data[9] = swab32( extraheader[0] ) | ( rand() & 0xf0 ); - g_work->data[10] = be32dec( sctx->job.ntime ); - g_work->data[11] = be32dec( sctx->job.nbits ); - for ( i = 0; i < 8; i++ ) - g_work->data[12+i] = ( (uint32_t*)merkle_root )[i]; -} - -#undef min - -void blake2b_get_new_work( struct work* work, struct work* g_work, int thr_id, - uint32_t* end_nonce_ptr, bool clean_job ) -{ - const int wkcmp_sz = 32; // bytes - const int wkcmp_off = 32 + 16; - uint32_t *nonceptr = algo_gate.get_nonceptr( work->data ); - - if ( memcmp( &work->data[ wkcmp_off ], &g_work->data[ wkcmp_off ], wkcmp_sz ) - && ( clean_job || ( *nonceptr >= *end_nonce_ptr ) - || strcmp( work->job_id, g_work->job_id ) ) ) - { - work_free( work ); - work_copy( work, g_work ); - *nonceptr = ( 0xffffffffU / opt_n_threads ) * thr_id; - if ( 
opt_randomize ) - *nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads; - *end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20; - } - else - ++(*nonceptr); - - // suprnova job_id check without data/target/height change... - // we just may have copied new g_wwork to work so why this test here? -// if ( have_stratum && strcmp( work->job_id, g_work->job_id ) ) - // exit thread loop -// continue; -// else -// { -// nonceptr[1] += 0x10; -// nonceptr[1] |= thr_id; -// } -} - -bool blake2b_ready_to_mine( struct work* work, struct stratum_ctx* stratum, - int thr_id ) -{ - if ( have_stratum && strcmp( stratum->job.job_id, work->job_id ) ) - // need to regen g_work.. - return false; - // extradata: prevent duplicates - work->data[ 8 ] += 0x10; - work->data[ 8 + 1 ] |= thr_id; - return true; -} - -double blake2b_get_max64() { return 0x1fffffLL; } - -bool register_blake2b_algo( algo_gate_t* gate ) -{ - algo_not_tested(); - gate->ntime_index = 10; - gate->nbits_index = 11; - gate->nonce_index = 8; - gate->work_cmp_size = 32; - gate->scanhash = (void*)&scanhash_blake2b; - gate->hash = (void*)&blake2b_hash; - gate->calc_network_diff = (void*)&blake2b_calc_network_diff; - gate->build_stratum_request = (void*)&blake2b_be_build_stratum_request; - gate->work_decode = (void*)&std_be_work_decode; - gate->submit_getwork_result = (void*)&std_be_submit_getwork_result; - gate->build_extraheader = (void*)&blake2b_build_extraheader; - gate->get_new_work = (void*)&blake2b_get_new_work; - gate->get_max64 = (void*)&blake2b_get_max64; - gate->ready_to_mine = (void*)&blake2b_ready_to_mine; - have_gbt = false; - return true; -} diff --git a/algo/blake/blake2s-4way.c b/algo/blake/blake2s-4way.c deleted file mode 100644 index 9048566..0000000 --- a/algo/blake/blake2s-4way.c +++ /dev/null @@ -1,119 +0,0 @@ -#include "blake2s-gate.h" -#include "blake2s-hash-4way.h" -#include -#include - -#if defined(BLAKE2S_8WAY) - -static __thread blake2s_8way_state blake2s_8w_ctx; - -void 
blake2s_8way_hash( void *output, const void *input ) -{ - uint32_t vhash[8*8] __attribute__ ((aligned (64))); - blake2s_8way_state ctx; - memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx ); - - blake2s_8way_update( &ctx, input + (64<<3), 16 ); - blake2s_8way_final( &ctx, vhash, BLAKE2S_OUTBYTES ); - - dintrlv_8x32( output, output+ 32, output+ 64, output+ 96, - output+128, output+160, output+192, output+224, - vhash, 256 ); -} - -int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t vdata[20*8] __attribute__ ((aligned (64))); - uint32_t hash[8*8] __attribute__ ((aligned (32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - __m256i *noncev = (__m256i*)vdata + 19; // aligned - uint32_t n = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - - mm256_bswap32_intrlv80_8x32( vdata, pdata ); - blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES ); - blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 ); - - do { - *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4, - n+3, n+2, n+1, n ) ); - pdata[19] = n; - - blake2s_8way_hash( hash, vdata ); - - - for ( int i = 0; i < 8; i++ ) - if ( (hash+(i<<3))[7] <= Htarg ) - if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 8; - - } while ( (n < max_nonce) && !work_restart[thr_id].restart ); - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#elif defined(BLAKE2S_4WAY) - -static __thread blake2s_4way_state blake2s_4w_ctx; - -void blake2s_4way_hash( void *output, const void *input ) -{ - uint32_t vhash[8*4] __attribute__ ((aligned (64))); - blake2s_4way_state ctx; - memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx ); - - blake2s_4way_update( &ctx, input + (64<<2), 16 ); - blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES ); - - dintrlv_4x32( 
output, output+32, output+64, output+96, - vhash, 256 ); -} - -int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t hash[8*4] __attribute__ ((aligned (32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - __m128i *noncev = (__m128i*)vdata + 19; // aligned - uint32_t n = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - - mm128_bswap32_intrlv80_4x32( vdata, pdata ); - blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES ); - blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 ); - - do { - *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) ); - pdata[19] = n; - - blake2s_4way_hash( hash, vdata ); - - for ( int i = 0; i < 4; i++ ) - if ( (hash+(i<<3))[7] <= Htarg ) - if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - - } while ( (n < max_nonce) && !work_restart[thr_id].restart ); - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/blake/blake2s-gate.c b/algo/blake/blake2s-gate.c deleted file mode 100644 index 2af35d5..0000000 --- a/algo/blake/blake2s-gate.c +++ /dev/null @@ -1,27 +0,0 @@ -#include "blake2s-gate.h" - - -// changed to get_max64_0x3fffffLL in cpuminer-multi-decred -int64_t blake2s_get_max64 () -{ - return 0x7ffffLL; -} - -bool register_blake2s_algo( algo_gate_t* gate ) -{ -#if defined(BLAKE2S_8WAY) - gate->scanhash = (void*)&scanhash_blake2s_8way; - gate->hash = (void*)&blake2s_8way_hash; -#elif defined(BLAKE2S_4WAY) - gate->scanhash = (void*)&scanhash_blake2s_4way; - gate->hash = (void*)&blake2s_4way_hash; -#else - gate->scanhash = (void*)&scanhash_blake2s; - gate->hash = (void*)&blake2s_hash; -#endif - gate->get_max64 = (void*)&blake2s_get_max64; - gate->optimizations 
= SSE42_OPT | AVX2_OPT; - return true; -}; - - diff --git a/algo/blake/blake2s-gate.h b/algo/blake/blake2s-gate.h deleted file mode 100644 index ee1a243..0000000 --- a/algo/blake/blake2s-gate.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef __BLAKE2S_GATE_H__ -#define __BLAKE2S_GATE_H__ 1 - -#include -#include "algo-gate-api.h" - -#if defined(__SSE4_2__) - #define BLAKE2S_4WAY -#endif -#if defined(__AVX2__) - #define BLAKE2S_8WAY -#endif - -bool register_blake2s_algo( algo_gate_t* gate ); - -#if defined(BLAKE2S_8WAY) - -void blake2s_8way_hash( void *state, const void *input ); -int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -#elif defined (BLAKE2S_4WAY) - -void blake2s_4way_hash( void *state, const void *input ); -int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -#else - -void blake2s_hash( void *state, const void *input ); -int scanhash_blake2s( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -#endif - -#endif diff --git a/algo/blake/blake2s-hash-4way.c b/algo/blake/blake2s-hash-4way.c deleted file mode 100644 index f3bbe35..0000000 --- a/algo/blake/blake2s-hash-4way.c +++ /dev/null @@ -1,362 +0,0 @@ -/** - * BLAKE2 reference source code package - reference C implementations - * - * Written in 2012 by Samuel Neves - * - * To the extent possible under law, the author(s) have dedicated all copyright - * and related and neighboring rights to this software to the public domain - * worldwide. This software is distributed without any warranty. - * - * You should have received a copy of the CC0 Public Domain Dedication along with - * this software. If not, see . 
- */ - -#include "blake2s-hash-4way.h" - -#include -#include -#include - -#if defined(__SSE4_2__) - -static const uint32_t blake2s_IV[8] = -{ - 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, - 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL -}; - -static const uint8_t blake2s_sigma[10][16] = -{ - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , -}; - -// define a constant for initial param. - -int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen ) -{ - blake2s_nway_param P[1]; - - P->digest_length = outlen; - P->key_length = 0; - P->fanout = 1; - P->depth = 1; - P->leaf_length = 0; - *((uint64_t*)(P->node_offset)) = 0; - P->node_depth = 0; - P->inner_length = 0; - memset( P->salt, 0, sizeof( P->salt ) ); - memset( P->personal, 0, sizeof( P->personal ) ); - - memset( S, 0, sizeof( blake2s_4way_state ) ); - for( int i = 0; i < 8; ++i ) - S->h[i] = _mm_set1_epi32( blake2s_IV[i] ); - - uint32_t *p = ( uint32_t * )( P ); - - /* IV XOR ParamBlock */ - for ( size_t i = 0; i < 8; ++i ) - S->h[i] = _mm_xor_si128( S->h[i], _mm_set1_epi32( p[i] ) ); - return 0; -} - -int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block ) -{ - __m128i m[16]; - __m128i v[16]; - - memcpy_128( m, block, 16 ); - memcpy_128( v, S->h, 8 ); - - v[ 8] = _mm_set1_epi32( blake2s_IV[0] ); - v[ 9] = _mm_set1_epi32( blake2s_IV[1] ); - v[10] = _mm_set1_epi32( blake2s_IV[2] ); - v[11] = _mm_set1_epi32( 
blake2s_IV[3] ); - v[12] = _mm_xor_si128( _mm_set1_epi32( S->t[0] ), - _mm_set1_epi32( blake2s_IV[4] ) ); - v[13] = _mm_xor_si128( _mm_set1_epi32( S->t[1] ), - _mm_set1_epi32( blake2s_IV[5] ) ); - v[14] = _mm_xor_si128( _mm_set1_epi32( S->f[0] ), - _mm_set1_epi32( blake2s_IV[6] ) ); - v[15] = _mm_xor_si128( _mm_set1_epi32( S->f[1] ), - _mm_set1_epi32( blake2s_IV[7] ) ); - -#define G4W(r,i,a,b,c,d) \ -do { \ - a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+0] ] ); \ - d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \ - c = _mm_add_epi32( c, d ); \ - b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \ - a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+1] ] ); \ - d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \ - c = _mm_add_epi32( c, d ); \ - b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \ -} while(0) - -#define ROUND4W(r) \ -do { \ - G4W( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \ - G4W( r, 1, v[ 1], v[ 5], v[ 9], v[13] ); \ - G4W( r, 2, v[ 2], v[ 6], v[10], v[14] ); \ - G4W( r, 3, v[ 3], v[ 7], v[11], v[15] ); \ - G4W( r, 4, v[ 0], v[ 5], v[10], v[15] ); \ - G4W( r, 5, v[ 1], v[ 6], v[11], v[12] ); \ - G4W( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \ - G4W( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); \ -} while(0) - - ROUND4W( 0 ); - ROUND4W( 1 ); - ROUND4W( 2 ); - ROUND4W( 3 ); - ROUND4W( 4 ); - ROUND4W( 5 ); - ROUND4W( 6 ); - ROUND4W( 7 ); - ROUND4W( 8 ); - ROUND4W( 9 ); - - for( size_t i = 0; i < 8; ++i ) - S->h[i] = _mm_xor_si128( _mm_xor_si128( S->h[i], v[i] ), v[i + 8] ); - -#undef G4W -#undef ROUND4W - return 0; -} - -int blake2s_4way_update( blake2s_4way_state *S, const void *in, - uint64_t inlen ) -{ - __m128i *input = (__m128i*)in; - __m128i *buf = (__m128i*)S->buf; - const int bsize = BLAKE2S_BLOCKBYTES; - - while( inlen > 0 ) - { - size_t left = S->buflen; - if( inlen >= bsize - left ) - { - memcpy_128( buf + (left>>2), input, (bsize - left) >> 2 ); - S->buflen += bsize - left; - S->t[0] += BLAKE2S_BLOCKBYTES; - S->t[1] += ( S->t[0] < 
BLAKE2S_BLOCKBYTES ); - blake2s_4way_compress( S, buf ); - S->buflen = 0; - input += ( bsize >> 2 ); - inlen -= bsize; - } - else - { - memcpy_128( buf + ( left>>2 ), input, inlen>>2 ); - S->buflen += (size_t) inlen; - input += ( inlen>>2 ); - inlen -= inlen; - } - } - return 0; -} - -int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen ) -{ - __m128i *buf = (__m128i*)S->buf; - - S->t[0] += S->buflen; - S->t[1] += ( S->t[0] < S->buflen ); - if ( S->last_node ) - S->f[1] = ~0U; - S->f[0] = ~0U; - - memset_zero_128( buf + ( S->buflen>>2 ), - ( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 ); - blake2s_4way_compress( S, buf ); - - for ( int i = 0; i < 8; ++i ) - casti_m128i( out, i ) = S->h[ i ]; - return 0; -} - -#if defined(__AVX2__) - -int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block ) -{ - __m256i m[16]; - __m256i v[16]; - - memcpy_256( m, block, 16 ); - memcpy_256( v, S->h, 8 ); - - v[ 8] = _mm256_set1_epi32( blake2s_IV[0] ); - v[ 9] = _mm256_set1_epi32( blake2s_IV[1] ); - v[10] = _mm256_set1_epi32( blake2s_IV[2] ); - v[11] = _mm256_set1_epi32( blake2s_IV[3] ); - v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ), - _mm256_set1_epi32( blake2s_IV[4] ) ); - v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ), - _mm256_set1_epi32( blake2s_IV[5] ) ); - v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ), - _mm256_set1_epi32( blake2s_IV[6] ) ); - v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ), - _mm256_set1_epi32( blake2s_IV[7] ) ); - -#define G8W(r,i,a,b,c,d) \ -do { \ - a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \ - m[ blake2s_sigma[r][2*i+0] ] ); \ - d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \ - c = _mm256_add_epi32( c, d ); \ - b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \ - a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \ - m[ blake2s_sigma[r][2*i+1] ] ); \ - d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \ - c = _mm256_add_epi32( c, d ); \ - b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); 
\ -} while(0) - -#define ROUND8W(r) \ -do { \ - G8W( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \ - G8W( r, 1, v[ 1], v[ 5], v[ 9], v[13] ); \ - G8W( r, 2, v[ 2], v[ 6], v[10], v[14] ); \ - G8W( r, 3, v[ 3], v[ 7], v[11], v[15] ); \ - G8W( r, 4, v[ 0], v[ 5], v[10], v[15] ); \ - G8W( r, 5, v[ 1], v[ 6], v[11], v[12] ); \ - G8W( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \ - G8W( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); \ -} while(0) - - ROUND8W( 0 ); - ROUND8W( 1 ); - ROUND8W( 2 ); - ROUND8W( 3 ); - ROUND8W( 4 ); - ROUND8W( 5 ); - ROUND8W( 6 ); - ROUND8W( 7 ); - ROUND8W( 8 ); - ROUND8W( 9 ); - - for( size_t i = 0; i < 8; ++i ) - S->h[i] = _mm256_xor_si256( _mm256_xor_si256( S->h[i], v[i] ), v[i + 8] ); - -#undef G8W -#undef ROUND8W - return 0; -} - -int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen ) -{ - blake2s_nway_param P[1]; - - P->digest_length = outlen; - P->key_length = 0; - P->fanout = 1; - P->depth = 1; - P->leaf_length = 0; - *((uint64_t*)(P->node_offset)) = 0; - P->node_depth = 0; - P->inner_length = 0; - memset( P->salt, 0, sizeof( P->salt ) ); - memset( P->personal, 0, sizeof( P->personal ) ); - - memset( S, 0, sizeof( blake2s_8way_state ) ); - for( int i = 0; i < 8; ++i ) - S->h[i] = _mm256_set1_epi32( blake2s_IV[i] ); - - uint32_t *p = ( uint32_t * )( P ); - - /* IV XOR ParamBlock */ - for ( size_t i = 0; i < 8; ++i ) - S->h[i] = _mm256_xor_si256( S->h[i], _mm256_set1_epi32( p[i] ) ); - return 0; -} - -int blake2s_8way_update( blake2s_8way_state *S, const void *in, - uint64_t inlen ) -{ - __m256i *input = (__m256i*)in; - __m256i *buf = (__m256i*)S->buf; - const int bsize = BLAKE2S_BLOCKBYTES; - - while( inlen > 0 ) - { - size_t left = S->buflen; - if( inlen >= bsize - left ) - { - memcpy_256( buf + (left>>2), input, (bsize - left) >> 2 ); - S->buflen += bsize - left; - S->t[0] += BLAKE2S_BLOCKBYTES; - S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES ); - blake2s_8way_compress( S, buf ); - S->buflen = 0; - input += ( bsize >> 2 ); - inlen -= bsize; - } - else 
- { - memcpy_256( buf + ( left>>2 ), input, inlen>>2 ); - S->buflen += (size_t) inlen; - input += ( inlen>>2 ); - inlen -= inlen; - } - } - return 0; -} - -int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen ) -{ - __m256i *buf = (__m256i*)S->buf; - - S->t[0] += S->buflen; - S->t[1] += ( S->t[0] < S->buflen ); - if ( S->last_node ) - S->f[1] = ~0U; - S->f[0] = ~0U; - - memset_zero_256( buf + ( S->buflen>>2 ), - ( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 ); - blake2s_8way_compress( S, buf ); - - for ( int i = 0; i < 8; ++i ) - casti_m256i( out, i ) = S->h[ i ]; - return 0; -} - - -#endif // __AVX2__ - -#if 0 -int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ) -{ - blake2s_state S[1]; - - /* Verify parameters */ - if ( NULL == in ) return -1; - - if ( NULL == out ) return -1; - - if ( NULL == key ) keylen = 0; /* Fail here instead if keylen != 0 and key == NULL? */ - - if( keylen > 0 ) - { - if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1; - } - else - { - if( blake2s_init( S, outlen ) < 0 ) return -1; - } - - blake2s_update( S, ( uint8_t * )in, inlen ); - blake2s_final( S, out, outlen ); - return 0; -} -#endif - -#endif // __SSE4_2__ diff --git a/algo/blake/blake2s-hash-4way.h b/algo/blake/blake2s-hash-4way.h deleted file mode 100644 index 3457829..0000000 --- a/algo/blake/blake2s-hash-4way.h +++ /dev/null @@ -1,112 +0,0 @@ -/** - * BLAKE2 reference source code package - reference C implementations - * - * Written in 2012 by Samuel Neves - * - * To the extent possible under law, the author(s) have dedicated all copyright - * and related and neighboring rights to this software to the public domain - * worldwide. This software is distributed without any warranty. - * - * You should have received a copy of the CC0 Public Domain Dedication along with - * this software. If not, see . 
- */ -//#pragma once -#ifndef __BLAKE2S_HASH_4WAY_H__ -#define __BLAKE2S_HASH_4WAY_H__ 1 - -#if defined(__SSE4_2__) - -#include "simd-utils.h" - -#include -#include - -#if defined(_MSC_VER) -#include -#define inline __inline -#define ALIGN(x) __declspec(align(x)) -#else -#define ALIGN(x) __attribute__((aligned(x))) -#endif - - -#if defined(__cplusplus) -extern "C" { -#endif - -enum blake2s_constant -{ - BLAKE2S_BLOCKBYTES = 64, - BLAKE2S_OUTBYTES = 32, - BLAKE2S_KEYBYTES = 32, - BLAKE2S_SALTBYTES = 8, - BLAKE2S_PERSONALBYTES = 8 -}; - -#pragma pack(push, 1) -typedef struct __blake2s_nway_param -{ - uint8_t digest_length; // 1 - uint8_t key_length; // 2 - uint8_t fanout; // 3 - uint8_t depth; // 4 - uint32_t leaf_length; // 8 - uint8_t node_offset[6];// 14 - uint8_t node_depth; // 15 - uint8_t inner_length; // 16 - // uint8_t reserved[0]; - uint8_t salt[BLAKE2S_SALTBYTES]; // 24 - uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32 -} blake2s_nway_param; -#pragma pack(pop) - -ALIGN( 64 ) typedef struct __blake2s_4way_state -{ - __m128i h[8]; - uint8_t buf[ BLAKE2S_BLOCKBYTES * 4 ]; - uint32_t t[2]; - uint32_t f[2]; - size_t buflen; - uint8_t last_node; -} blake2s_4way_state ; - -int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen ); -int blake2s_4way_update( blake2s_4way_state *S, const void *in, - uint64_t inlen ); -int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen ); - -#if defined(__AVX2__) - -ALIGN( 64 ) typedef struct __blake2s_8way_state -{ - __m256i h[8]; - uint8_t buf[ BLAKE2S_BLOCKBYTES * 8 ]; - uint32_t t[2]; - uint32_t f[2]; - size_t buflen; - uint8_t last_node; -} blake2s_8way_state ; - -int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen ); -int blake2s_8way_update( blake2s_8way_state *S, const void *in, - uint64_t inlen ); -int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen ); - -#endif - -#if 0 - // Simple API -// int blake2s( uint8_t *out, const void *in, const void *key, const 
uint8_t outlen, const uint64_t inlen, uint8_t keylen ); - - // Direct Hash Mining Helpers - #define blake2s_salt32(out, in, inlen, key32) blake2s(out, in, key32, 32, inlen, 32) /* neoscrypt */ - #define blake2s_simple(out, in, inlen) blake2s(out, in, NULL, 32, inlen, 0) -#endif - -#if defined(__cplusplus) -} -#endif - -#endif // __SSE4_2__ - -#endif diff --git a/algo/blake/blake2s.c b/algo/blake/blake2s.c deleted file mode 100644 index aee4ce5..0000000 --- a/algo/blake/blake2s.c +++ /dev/null @@ -1,87 +0,0 @@ -#include "blake2s-gate.h" - -#include -#include - -#include "sph-blake2s.h" - -static __thread blake2s_state blake2s_ctx; -//static __thread blake2s_state s_ctx; -#define MIDLEN 76 - -void blake2s_hash( void *output, const void *input ) -{ - unsigned char _ALIGN(64) hash[BLAKE2S_OUTBYTES]; - blake2s_state ctx __attribute__ ((aligned (64))); - - memcpy( &ctx, &blake2s_ctx, sizeof ctx ); - blake2s_update( &ctx, input+64, 16 ); - -// blake2s_init(&ctx, BLAKE2S_OUTBYTES); -// blake2s_update(&ctx, input, 80); - blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES ); - - memcpy(output, hash, 32); -} -/* -static void blake2s_hash_end(uint32_t *output, const uint32_t *input) -{ - s_ctx.buflen = MIDLEN; - memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN); - blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN); - blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES); -} -*/ -int scanhash_blake2s( struct work *work, - uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - - uint32_t _ALIGN(64) hash64[8]; - uint32_t _ALIGN(64) endiandata[20]; - int thr_id = mythr->id; // thr_id arg is deprecated - - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - - uint32_t n = first_nonce; - - swab32_array( endiandata, pdata, 20 ); - - // midstate - blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES ); - blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 ); - - do { - 
be32enc(&endiandata[19], n); - blake2s_hash( hash64, endiandata ); - if (hash64[7] < Htarg && fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return true; - } - n++; - - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - - return 0; -} -/* -// changed to get_max64_0x3fffffLL in cpuminer-multi-decred -int64_t blake2s_get_max64 () -{ - return 0x7ffffLL; -} - -bool register_blake2s_algo( algo_gate_t* gate ) -{ - gate->scanhash = (void*)&scanhash_blake2s; - gate->hash = (void*)&blake2s_hash; - gate->get_max64 = (void*)&blake2s_get_max64; - return true; -}; -*/ diff --git a/algo/blake/blake512-hash-4way.c b/algo/blake/blake512-hash-4way.c deleted file mode 100644 index b57f712..0000000 --- a/algo/blake/blake512-hash-4way.c +++ /dev/null @@ -1,701 +0,0 @@ -/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */ -/* - * BLAKE implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#if defined (__AVX2__) - -#include -#include -#include - -#include "blake-hash-4way.h" - -#ifdef __cplusplus -extern "C"{ -#endif - -#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE -#define SPH_SMALL_FOOTPRINT_BLAKE 1 -#endif - -#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE) -#define SPH_COMPACT_BLAKE_64 1 -#endif - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - - -// Blake-512 - -static const sph_u64 IV512[8] = { - SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B), - SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1), - SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F), - SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179) -}; - - -#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64 - -// Blake-256 4 & 8 way, Blake-512 4 way - -static const unsigned sigma[16][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 
1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } -}; - -#endif - -#define Z00 0 -#define Z01 1 -#define Z02 2 -#define Z03 3 -#define Z04 4 -#define Z05 5 -#define Z06 6 -#define Z07 7 -#define Z08 8 -#define Z09 9 -#define Z0A A -#define Z0B B -#define Z0C C -#define Z0D D -#define Z0E E -#define Z0F F - -#define Z10 E -#define Z11 A -#define Z12 4 -#define Z13 8 -#define Z14 9 -#define Z15 F -#define Z16 D -#define Z17 6 -#define Z18 1 -#define Z19 C -#define Z1A 0 -#define Z1B 2 -#define Z1C B -#define Z1D 7 -#define Z1E 5 -#define Z1F 3 - -#define Z20 B -#define Z21 8 -#define Z22 C -#define Z23 0 -#define Z24 5 -#define Z25 2 -#define Z26 F -#define Z27 D -#define Z28 A -#define Z29 E -#define Z2A 3 -#define Z2B 6 -#define Z2C 7 -#define Z2D 1 -#define Z2E 9 -#define Z2F 4 - -#define Z30 7 -#define Z31 9 -#define Z32 3 -#define Z33 1 -#define Z34 D -#define Z35 C -#define Z36 B -#define Z37 E -#define Z38 2 -#define Z39 6 -#define Z3A 5 -#define Z3B A -#define Z3C 4 -#define Z3D 0 -#define Z3E F -#define Z3F 8 - -#define Z40 9 -#define Z41 0 -#define Z42 5 -#define Z43 7 -#define Z44 2 -#define Z45 4 -#define Z46 A -#define Z47 F -#define Z48 E -#define Z49 1 -#define Z4A B -#define Z4B C -#define Z4C 6 -#define Z4D 8 -#define Z4E 3 -#define Z4F D - -#define Z50 2 -#define Z51 C -#define Z52 6 -#define Z53 A -#define Z54 0 -#define Z55 B -#define Z56 8 -#define Z57 3 -#define Z58 4 -#define Z59 D -#define Z5A 7 -#define Z5B 5 -#define Z5C F -#define Z5D E -#define Z5E 1 -#define Z5F 9 - -#define Z60 C -#define Z61 5 -#define Z62 1 -#define Z63 F -#define Z64 E -#define Z65 D -#define Z66 4 -#define Z67 A -#define Z68 0 -#define Z69 7 -#define Z6A 6 -#define Z6B 3 -#define Z6C 9 -#define Z6D 2 -#define Z6E 8 -#define Z6F B - -#define Z70 D -#define Z71 B -#define Z72 7 -#define Z73 E -#define Z74 C -#define Z75 1 -#define 
Z76 3 -#define Z77 9 -#define Z78 5 -#define Z79 0 -#define Z7A F -#define Z7B 4 -#define Z7C 8 -#define Z7D 6 -#define Z7E 2 -#define Z7F A - -#define Z80 6 -#define Z81 F -#define Z82 E -#define Z83 9 -#define Z84 B -#define Z85 3 -#define Z86 0 -#define Z87 8 -#define Z88 C -#define Z89 2 -#define Z8A D -#define Z8B 7 -#define Z8C 1 -#define Z8D 4 -#define Z8E A -#define Z8F 5 - -#define Z90 A -#define Z91 2 -#define Z92 8 -#define Z93 4 -#define Z94 7 -#define Z95 6 -#define Z96 1 -#define Z97 5 -#define Z98 F -#define Z99 B -#define Z9A 9 -#define Z9B E -#define Z9C 3 -#define Z9D C -#define Z9E D -#define Z9F 0 - -#define Mx(r, i) Mx_(Z ## r ## i) -#define Mx_(n) Mx__(n) -#define Mx__(n) M ## n - -// Blake-512 4 way - -#define CBx(r, i) CBx_(Z ## r ## i) -#define CBx_(n) CBx__(n) -#define CBx__(n) CB ## n - -#define CB0 SPH_C64(0x243F6A8885A308D3) -#define CB1 SPH_C64(0x13198A2E03707344) -#define CB2 SPH_C64(0xA4093822299F31D0) -#define CB3 SPH_C64(0x082EFA98EC4E6C89) -#define CB4 SPH_C64(0x452821E638D01377) -#define CB5 SPH_C64(0xBE5466CF34E90C6C) -#define CB6 SPH_C64(0xC0AC29B7C97C50DD) -#define CB7 SPH_C64(0x3F84D5B5B5470917) -#define CB8 SPH_C64(0x9216D5D98979FB1B) -#define CB9 SPH_C64(0xD1310BA698DFB5AC) -#define CBA SPH_C64(0x2FFD72DBD01ADFB7) -#define CBB SPH_C64(0xB8E1AFED6A267E96) -#define CBC SPH_C64(0xBA7C9045F12C7F99) -#define CBD SPH_C64(0x24A19947B3916CF7) -#define CBE SPH_C64(0x0801F2E2858EFC16) -#define CBF SPH_C64(0x636920D871574E69) - -#if SPH_COMPACT_BLAKE_64 -// not used -static const sph_u64 CB[16] = { - SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344), - SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89), - SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C), - SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917), - SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC), - SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96), - SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7), - 
SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69) -}; - -#endif - - -// Blake-512 4 way - -#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) do { \ - a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \ - _mm256_set_epi64x( c1, c1, c1, c1 ), m0 ), b ), a ); \ - d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \ - c = _mm256_add_epi64( c, d ); \ - b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \ - a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \ - _mm256_set_epi64x( c0, c0, c0, c0 ), m1 ), b ), a ); \ - d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \ - c = _mm256_add_epi64( c, d ); \ - b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \ -} while (0) - -#if SPH_COMPACT_BLAKE_64 -// not used -#define ROUND_B_4WAY(r) do { \ - GB_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \ - CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \ - GB_4WAY(M[sigma[r][0x2]], M[sigma[r][0x3]], \ - CB[sigma[r][0x2]], CB[sigma[r][0x3]], V1, V5, V9, VD); \ - GB_4WAY(M[sigma[r][0x4]], M[sigma[r][0x5]], \ - CB[sigma[r][0x4]], CB[sigma[r][0x5]], V2, V6, VA, VE); \ - GB_4WAY(M[sigma[r][0x6]], M[sigma[r][0x7]], \ - CB[sigma[r][0x6]], CB[sigma[r][0x7]], V3, V7, VB, VF); \ - GB_4WAY(M[sigma[r][0x8]], M[sigma[r][0x9]], \ - CB[sigma[r][0x8]], CB[sigma[r][0x9]], V0, V5, VA, VF); \ - GB_4WAY(M[sigma[r][0xA]], M[sigma[r][0xB]], \ - CB[sigma[r][0xA]], CB[sigma[r][0xB]], V1, V6, VB, VC); \ - GB_4WAY(M[sigma[r][0xC]], M[sigma[r][0xD]], \ - CB[sigma[r][0xC]], CB[sigma[r][0xD]], V2, V7, V8, VD); \ - GB_4WAY(M[sigma[r][0xE]], M[sigma[r][0xF]], \ - CB[sigma[r][0xE]], CB[sigma[r][0xF]], V3, V4, V9, VE); \ -} while (0) - -#else -//current_impl -#define ROUND_B_4WAY(r) do { \ - GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \ - GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \ - GB_4WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \ - GB_4WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \ - GB_4WAY(Mx(r, 8), Mx(r, 9), 
CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \ - GB_4WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \ - GB_4WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \ - GB_4WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \ - } while (0) - -#endif - - -// Blake-512 4 way - -#define DECL_STATE64_4WAY \ - __m256i H0, H1, H2, H3, H4, H5, H6, H7; \ - __m256i S0, S1, S2, S3; \ - sph_u64 T0, T1; - -#define READ_STATE64_4WAY(state) do { \ - H0 = (state)->H[0]; \ - H1 = (state)->H[1]; \ - H2 = (state)->H[2]; \ - H3 = (state)->H[3]; \ - H4 = (state)->H[4]; \ - H5 = (state)->H[5]; \ - H6 = (state)->H[6]; \ - H7 = (state)->H[7]; \ - S0 = (state)->S[0]; \ - S1 = (state)->S[1]; \ - S2 = (state)->S[2]; \ - S3 = (state)->S[3]; \ - T0 = (state)->T0; \ - T1 = (state)->T1; \ - } while (0) - -#define WRITE_STATE64_4WAY(state) do { \ - (state)->H[0] = H0; \ - (state)->H[1] = H1; \ - (state)->H[2] = H2; \ - (state)->H[3] = H3; \ - (state)->H[4] = H4; \ - (state)->H[5] = H5; \ - (state)->H[6] = H6; \ - (state)->H[7] = H7; \ - (state)->S[0] = S0; \ - (state)->S[1] = S1; \ - (state)->S[2] = S2; \ - (state)->S[3] = S3; \ - (state)->T0 = T0; \ - (state)->T1 = T1; \ - } while (0) - -#if SPH_COMPACT_BLAKE_64 - -// not used -#define COMPRESS64_4WAY do { \ - __m256i M[16]; \ - __m256i V0, V1, V2, V3, V4, V5, V6, V7; \ - __m256i V8, V9, VA, VB, VC, VD, VE, VF; \ - unsigned r; \ - V0 = H0; \ - V1 = H1; \ - V2 = H2; \ - V3 = H3; \ - V4 = H4; \ - V5 = H5; \ - V6 = H6; \ - V7 = H7; \ - V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \ - V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \ - VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \ - VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \ - VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \ - _mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \ - VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \ - _mm256_set_epi64x( 
CB5, CB5, CB5, CB5 ) ); \ - VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \ - _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \ - VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \ - _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \ - M[0x0] = mm256_bswap_64( *(buf+0) ); \ - M[0x1] = mm256_bswap_64( *(buf+1) ); \ - M[0x2] = mm256_bswap_64( *(buf+2) ); \ - M[0x3] = mm256_bswap_64( *(buf+3) ); \ - M[0x4] = mm256_bswap_64( *(buf+4) ); \ - M[0x5] = mm256_bswap_64( *(buf+5) ); \ - M[0x6] = mm256_bswap_64( *(buf+6) ); \ - M[0x7] = mm256_bswap_64( *(buf+7) ); \ - M[0x8] = mm256_bswap_64( *(buf+8) ); \ - M[0x9] = mm256_bswap_64( *(buf+9) ); \ - M[0xA] = mm256_bswap_64( *(buf+10) ); \ - M[0xB] = mm256_bswap_64( *(buf+11) ); \ - M[0xC] = mm256_bswap_64( *(buf+12) ); \ - M[0xD] = mm256_bswap_64( *(buf+13) ); \ - M[0xE] = mm256_bswap_64( *(buf+14) ); \ - M[0xF] = mm256_bswap_64( *(buf+15) ); \ - for (r = 0; r < 16; r ++) \ - ROUND_B_4WAY(r); \ - H0 = _mm256_xor_si256( _mm256_xor_si256( \ - _mm256_xor_si256( S0, V0 ), V8 ), H0 ); \ - H1 = _mm256_xor_si256( _mm256_xor_si256( \ - _mm256_xor_si256( S1, V1 ), V9 ), H1 ); \ - H2 = _mm256_xor_si256( _mm256_xor_si256( \ - _mm256_xor_si256( S2, V2 ), VA ), H2 ); \ - H3 = _mm256_xor_si256( _mm256_xor_si256( \ - _mm256_xor_si256( S3, V3 ), VB ), H3 ); \ - H4 = _mm256_xor_si256( _mm256_xor_si256( \ - _mm256_xor_si256( S0, V4 ), VC ), H4 ); \ - H5 = _mm256_xor_si256( _mm256_xor_si256( \ - _mm256_xor_si256( S1, V5 ), VD ), H5 ); \ - H6 = _mm256_xor_si256( _mm256_xor_si256( \ - _mm256_xor_si256( S2, V6 ), VE ), H6 ); \ - H7 = _mm256_xor_si256( _mm256_xor_si256( \ - _mm256_xor_si256( S3, V7 ), VF ), H7 ); \ - } while (0) - -#else - -//current impl - -#define COMPRESS64_4WAY do \ -{ \ - __m256i M0, M1, M2, M3, M4, M5, M6, M7; \ - __m256i M8, M9, MA, MB, MC, MD, ME, MF; \ - __m256i V0, V1, V2, V3, V4, V5, V6, V7; \ - __m256i V8, V9, VA, VB, VC, VD, VE, VF; \ - __m256i shuf_bswap64; \ - V0 = H0; \ - V1 = H1; \ - V2 = H2; \ - V3 = 
H3; \ - V4 = H4; \ - V5 = H5; \ - V6 = H6; \ - V7 = H7; \ - V8 = _mm256_xor_si256( S0, _mm256_set1_epi64x( CB0 ) ); \ - V9 = _mm256_xor_si256( S1, _mm256_set1_epi64x( CB1 ) ); \ - VA = _mm256_xor_si256( S2, _mm256_set1_epi64x( CB2 ) ); \ - VB = _mm256_xor_si256( S3, _mm256_set1_epi64x( CB3 ) ); \ - VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \ - _mm256_set1_epi64x( CB4 ) ); \ - VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \ - _mm256_set1_epi64x( CB5 ) ); \ - VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \ - _mm256_set1_epi64x( CB6 ) ); \ - VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \ - _mm256_set1_epi64x( CB7 ) ); \ - shuf_bswap64 = _mm256_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607, \ - 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \ - M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \ - M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \ - M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \ - M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \ - M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \ - M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \ - M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \ - M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \ - M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \ - M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \ - MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap64 ); \ - MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap64 ); \ - MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap64 ); \ - MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap64 ); \ - ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap64 ); \ - MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap64 ); \ - ROUND_B_4WAY(0); \ - ROUND_B_4WAY(1); \ - ROUND_B_4WAY(2); \ - ROUND_B_4WAY(3); \ - ROUND_B_4WAY(4); \ - ROUND_B_4WAY(5); \ - ROUND_B_4WAY(6); \ - ROUND_B_4WAY(7); \ - ROUND_B_4WAY(8); \ - ROUND_B_4WAY(9); \ - ROUND_B_4WAY(0); \ - ROUND_B_4WAY(1); \ - ROUND_B_4WAY(2); \ - ROUND_B_4WAY(3); \ - ROUND_B_4WAY(4); \ - 
ROUND_B_4WAY(5); \ - H0 = mm256_xor4( V8, V0, S0, H0 ); \ - H1 = mm256_xor4( V9, V1, S1, H1 ); \ - H2 = mm256_xor4( VA, V2, S2, H2 ); \ - H3 = mm256_xor4( VB, V3, S3, H3 ); \ - H4 = mm256_xor4( VC, V4, S0, H4 ); \ - H5 = mm256_xor4( VD, V5, S1, H5 ); \ - H6 = mm256_xor4( VE, V6, S2, H6 ); \ - H7 = mm256_xor4( VF, V7, S3, H7 ); \ -} while (0) - -#endif - -static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 }; - -static void -blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv, - const sph_u64 *salt ) -{ - __m256i zero = m256_zero; - casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( iv[0] ); - casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( iv[1] ); - casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( iv[2] ); - casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( iv[3] ); - casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( iv[4] ); - casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( iv[5] ); - casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( iv[6] ); - casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( iv[7] ); - - casti_m256i( sc->S, 0 ) = zero; - casti_m256i( sc->S, 1 ) = zero; - casti_m256i( sc->S, 2 ) = zero; - casti_m256i( sc->S, 3 ) = zero; - - sc->T0 = sc->T1 = 0; - sc->ptr = 0; -} - -static void -blake64_4way( blake_4way_big_context *sc, const void *data, size_t len) -{ - __m256i *vdata = (__m256i*)data; - __m256i *buf; - size_t ptr; - DECL_STATE64_4WAY - - const int buf_size = 128; // sizeof/8 - - buf = sc->buf; - ptr = sc->ptr; - if ( len < (buf_size - ptr) ) - { - memcpy_256( buf + (ptr>>3), vdata, len>>3 ); - ptr += len; - sc->ptr = ptr; - return; - } - - READ_STATE64_4WAY(sc); - while ( len > 0 ) - { - size_t clen; - - clen = buf_size - ptr; - if ( clen > len ) - clen = len; - memcpy_256( buf + (ptr>>3), vdata, clen>>3 ); - ptr += clen; - vdata = vdata + (clen>>3); - len -= clen; - if (ptr == buf_size ) - { - if ((T0 = SPH_T64(T0 + 1024)) < 1024) - T1 = SPH_T64(T1 + 1); - COMPRESS64_4WAY; - ptr = 0; - } - } - WRITE_STATE64_4WAY(sc); - sc->ptr = ptr; -} - -static void 
-blake64_4way_close( blake_4way_big_context *sc, - unsigned ub, unsigned n, void *dst, size_t out_size_w64) -{ - __m256i buf[16]; - size_t ptr; - unsigned bit_len; - uint64_t z, zz; - sph_u64 th, tl; - - ptr = sc->ptr; - bit_len = ((unsigned)ptr << 3); - z = 0x80 >> n; - zz = ((ub & -z) | z) & 0xFF; - buf[ptr>>3] = _mm256_set_epi64x( zz, zz, zz, zz ); - tl = sc->T0 + bit_len; - th = sc->T1; - if (ptr == 0 ) - { - sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL); - sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL); - } - else if ( sc->T0 == 0 ) - { - sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len; - sc->T1 = SPH_T64(sc->T1 - 1); - } - else - { - sc->T0 -= 1024 - bit_len; - } - if ( ptr <= 104 ) - { - memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 ); - if ( out_size_w64 == 8 ) - buf[(104>>3)] = _mm256_or_si256( buf[(104>>3)], - _mm256_set1_epi64x( 0x0100000000000000ULL ) ); - *(buf+(112>>3)) = mm256_bswap_64( - _mm256_set_epi64x( th, th, th, th ) ); - *(buf+(120>>3)) = mm256_bswap_64( - _mm256_set_epi64x( tl, tl, tl, tl ) ); - - blake64_4way( sc, buf + (ptr>>3), 128 - ptr ); - } - else - { - memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 ); - - blake64_4way( sc, buf + (ptr>>3), 128 - ptr ); - sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL); - sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL); - memset_zero_256( buf, 112>>3 ); - if ( out_size_w64 == 8 ) - buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL ); - *(buf+(112>>3)) = mm256_bswap_64( - _mm256_set_epi64x( th, th, th, th ) ); - *(buf+(120>>3)) = mm256_bswap_64( - _mm256_set_epi64x( tl, tl, tl, tl ) ); - - blake64_4way( sc, buf, 128 ); - } - mm256_block_bswap_64( (__m256i*)dst, sc->H ); -} - -void -blake512_4way_init(void *cc) -{ - blake64_4way_init(cc, IV512, salt_zero_big); -} - -void -blake512_4way(void *cc, const void *data, size_t len) -{ - blake64_4way(cc, data, len); -} - -void -blake512_4way_close(void *cc, void *dst) -{ - blake512_4way_addbits_and_close(cc, 0, 0, dst); -} - -void -blake512_4way_addbits_and_close(void 
*cc, unsigned ub, unsigned n, void *dst) -{ - blake64_4way_close(cc, ub, n, dst, 8); -} - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/algo/blake/blakecoin-4way.c b/algo/blake/blakecoin-4way.c deleted file mode 100644 index 898cbe3..0000000 --- a/algo/blake/blakecoin-4way.c +++ /dev/null @@ -1,121 +0,0 @@ -#include "blakecoin-gate.h" -#include "blake-hash-4way.h" -#include -#include -#include - -#if defined (BLAKECOIN_4WAY) - -blake256r8_4way_context blakecoin_4w_ctx; - -void blakecoin_4way_hash(void *state, const void *input) -{ - uint32_t vhash[8*4] __attribute__ ((aligned (64))); - blake256r8_4way_context ctx; - - memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx ); - blake256r8_4way( &ctx, input + (64<<2), 16 ); - blake256r8_4way_close( &ctx, vhash ); - - dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 ); -} - -int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t hash[8*4] __attribute__ ((aligned (32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - uint32_t HTarget = ptarget[7]; - uint32_t n = first_nonce; - __m128i *noncev = (__m128i*)vdata + 19; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - if ( opt_benchmark ) - HTarget = 0x7f; - - mm128_bswap32_intrlv80_4x32( vdata, pdata ); - blake256r8_4way_init( &blakecoin_4w_ctx ); - blake256r8_4way( &blakecoin_4w_ctx, vdata, 64 ); - - do { - *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) ); - pdata[19] = n; - blakecoin_4way_hash( hash, vdata ); - - for ( int i = 0; i < 4; i++ ) - if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) - && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - - } while ( (n < max_nonce) && !work_restart[thr_id].restart ); - - *hashes_done = n - first_nonce + 1; - return 
0; -} - -#endif - -#if defined(BLAKECOIN_8WAY) - -blake256r8_8way_context blakecoin_8w_ctx; - -void blakecoin_8way_hash( void *state, const void *input ) -{ - uint32_t vhash[8*8] __attribute__ ((aligned (64))); - blake256r8_8way_context ctx; - - memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx ); - blake256r8_8way( &ctx, input + (64<<3), 16 ); - blake256r8_8way_close( &ctx, vhash ); - - dintrlv_8x32( state, state+ 32, state+ 64, state+ 96, state+128, - state+160, state+192, state+224, vhash, 256 ); -} - -int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t vdata[20*8] __attribute__ ((aligned (64))); - uint32_t hash[8*8] __attribute__ ((aligned (32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - uint32_t HTarget = ptarget[7]; - uint32_t n = first_nonce; - __m256i *noncev = (__m256i*)vdata + 19; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - if ( opt_benchmark ) - HTarget = 0x7f; - - mm256_bswap32_intrlv80_8x32( vdata, pdata ); - blake256r8_8way_init( &blakecoin_8w_ctx ); - blake256r8_8way( &blakecoin_8w_ctx, vdata, 64 ); - - do { - *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4, - n+3, n+2, n+1, n ) ); - pdata[19] = n; - blakecoin_8way_hash( hash, vdata ); - - for ( int i = 0; i < 8; i++ ) - if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) - && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 8; - } while ( (n < max_nonce) && !work_restart[thr_id].restart ); - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif - diff --git a/algo/blake/blakecoin-gate.c b/algo/blake/blakecoin-gate.c deleted file mode 100644 index 0429063..0000000 --- a/algo/blake/blakecoin-gate.c +++ /dev/null @@ -1,36 +0,0 @@ -#include "blakecoin-gate.h" -#include - -// changed to get_max64_0x3fffffLL in cpuminer-multi-decred -int64_t 
blakecoin_get_max64 () -{ - return 0x7ffffLL; -// return 0x3fffffLL; -} - -// vanilla uses default gen merkle root, otherwise identical to blakecoin -bool register_vanilla_algo( algo_gate_t* gate ) -{ -#if defined(BLAKECOIN_8WAY) - gate->scanhash = (void*)&scanhash_blakecoin_8way; - gate->hash = (void*)&blakecoin_8way_hash; - -#elif defined(BLAKECOIN_4WAY) - gate->scanhash = (void*)&scanhash_blakecoin_4way; - gate->hash = (void*)&blakecoin_4way_hash; -#else - gate->scanhash = (void*)&scanhash_blakecoin; - gate->hash = (void*)&blakecoinhash; -#endif - gate->optimizations = SSE42_OPT | AVX2_OPT; - gate->get_max64 = (void*)&blakecoin_get_max64; - return true; -} - -bool register_blakecoin_algo( algo_gate_t* gate ) -{ - register_vanilla_algo( gate ); - gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; - return true; -} - diff --git a/algo/blake/blakecoin-gate.h b/algo/blake/blakecoin-gate.h deleted file mode 100644 index 456aa90..0000000 --- a/algo/blake/blakecoin-gate.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef __BLAKECOIN_GATE_H__ -#define __BLAKECOIN_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__SSE4_2__) - #define BLAKECOIN_4WAY -#endif -#if defined(__AVX2__) - #define BLAKECOIN_8WAY -#endif - -#if defined (BLAKECOIN_8WAY) -void blakecoin_8way_hash(void *state, const void *input); -int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -#endif - -#if defined (BLAKECOIN_4WAY) -void blakecoin_4way_hash(void *state, const void *input); -int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -#endif - -void blakecoinhash( void *state, const void *input ); -int scanhash_blakecoin( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -#endif diff --git a/algo/blake/blakecoin.c b/algo/blake/blakecoin.c deleted file mode 100644 index f733c2c..0000000 --- a/algo/blake/blakecoin.c +++ 
/dev/null @@ -1,125 +0,0 @@ -#include "blakecoin-gate.h" -#define BLAKE32_ROUNDS 8 -#include "sph_blake.h" - -void blakecoin_init(void *cc); -void blakecoin(void *cc, const void *data, size_t len); -void blakecoin_close(void *cc, void *dst); - -#include -#include -#include -#include - -// context management is staged for efficiency. -// 1. global initial ctx cached on startup -// 2. per-thread midstate ctx cache refreshed every scan -// 3. local ctx for final hash calculation - -static sph_blake256_context blake_init_ctx; -static __thread sph_blake256_context blake_mid_ctx; - -static void blake_midstate_init( const void* input ) -{ - // copy cached initial state - memcpy( &blake_mid_ctx, &blake_init_ctx, sizeof blake_mid_ctx ); - blakecoin( &blake_mid_ctx, input, 64 ); -} - -void blakecoinhash( void *state, const void *input ) -{ - sph_blake256_context ctx; - uint8_t hash[64] __attribute__ ((aligned (32))); - uint8_t *ending = (uint8_t*) input + 64; - - // copy cached midstate - memcpy( &ctx, &blake_mid_ctx, sizeof ctx ); - blakecoin( &ctx, ending, 16 ); - blakecoin_close( &ctx, hash ); - memcpy( state, hash, 32 ); -} - -int scanhash_blakecoin( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - uint32_t HTarget = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - - uint32_t _ALIGN(32) hash64[8]; - uint32_t _ALIGN(32) endiandata[20]; - - uint32_t n = first_nonce; - - if (opt_benchmark) - HTarget = 0x7f; - - // we need big endian data... 
- for (int kk=0; kk < 19; kk++) - be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]); - - blake_midstate_init( endiandata ); - -#ifdef DEBUG_ALGO - applog(LOG_DEBUG,"[%d] Target=%08x %08x", thr_id, ptarget[6], ptarget[7]); -#endif - - do { - be32enc(&endiandata[19], n); - blakecoinhash(hash64, endiandata); -#ifndef DEBUG_ALGO - if (hash64[7] <= HTarget && fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } -#else - if (!(n % 0x1000) && !thr_id) printf("."); - if (hash64[7] == 0) { - printf("[%d]",thr_id); - if (fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } -#endif - n++; pdata[19] = n; - - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - -/* -void blakecoin_gen_merkle_root ( char* merkle_root, struct stratum_ctx* sctx ) -{ - SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root ); -} -*/ -/* -// changed to get_max64_0x3fffffLL in cpuminer-multi-decred -int64_t blakecoin_get_max64 () -{ - return 0x7ffffLL; -} - -// vanilla uses default gen merkle root, otherwise identical to blakecoin -bool register_vanilla_algo( algo_gate_t* gate ) -{ - gate->scanhash = (void*)&scanhash_blakecoin; - gate->hash = (void*)&blakecoinhash; - gate->get_max64 = (void*)&blakecoin_get_max64; - blakecoin_init( &blake_init_ctx ); - return true; -} - -bool register_blakecoin_algo( algo_gate_t* gate ) -{ - register_vanilla_algo( gate ); - gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; - return true; -} -*/ diff --git a/algo/blake/decred-4way.c b/algo/blake/decred-4way.c deleted file mode 100644 index 1cbf157..0000000 --- a/algo/blake/decred-4way.c +++ /dev/null @@ -1,74 +0,0 @@ -#include "decred-gate.h" -#include "blake-hash-4way.h" -#include -#include -#include -#include - -#if defined (DECRED_4WAY) - -static __thread blake256_4way_context blake_mid; - -void decred_hash_4way( void *state, const void *input ) -{ - 
uint32_t vhash[8*4] __attribute__ ((aligned (64))); -// uint32_t hash0[8] __attribute__ ((aligned (32))); -// uint32_t hash1[8] __attribute__ ((aligned (32))); -// uint32_t hash2[8] __attribute__ ((aligned (32))); -// uint32_t hash3[8] __attribute__ ((aligned (32))); - const void *tail = input + ( DECRED_MIDSTATE_LEN << 2 ); - int tail_len = 180 - DECRED_MIDSTATE_LEN; - blake256_4way_context ctx __attribute__ ((aligned (64))); - - memcpy( &ctx, &blake_mid, sizeof(blake_mid) ); - blake256_4way( &ctx, tail, tail_len ); - blake256_4way_close( &ctx, vhash ); - dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 ); -} - -int scanhash_decred_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t vdata[48*4] __attribute__ ((aligned (64))); - uint32_t hash[8*4] __attribute__ ((aligned (32))); - uint32_t _ALIGN(64) edata[48]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX]; - uint32_t n = first_nonce; - const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - - // copy to buffer guaranteed to be aligned. - memcpy( edata, pdata, 180 ); - - // use the old way until new way updated for size. 
- mm128_intrlv_4x32x( vdata, edata, edata, edata, edata, 180*8 ); - - blake256_4way_init( &blake_mid ); - blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN ); - - uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4; - do { - * noncep = n; - *(noncep+1) = n+1; - *(noncep+2) = n+2; - *(noncep+3) = n+3; - - decred_hash_4way( hash, vdata ); - - for ( int i = 0; i < 4; i++ ) - if ( (hash+(i<<3))[7] <= HTarget ) - if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[DECRED_NONCE_INDEX] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( (n < max_nonce) && !work_restart[thr_id].restart ); - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/blake/decred-gate.c b/algo/blake/decred-gate.c deleted file mode 100644 index e9c2091..0000000 --- a/algo/blake/decred-gate.c +++ /dev/null @@ -1,173 +0,0 @@ -#include "decred-gate.h" -#include -#include -#include - -uint32_t *decred_get_nonceptr( uint32_t *work_data ) -{ - return &work_data[ DECRED_NONCE_INDEX ]; -} - -double decred_calc_network_diff( struct work* work ) -{ - // sample for diff 43.281 : 1c05ea29 - // todo: endian reversed on longpoll could be zr5 specific... 
- uint32_t nbits = work->data[ DECRED_NBITS_INDEX ]; - uint32_t bits = ( nbits & 0xffffff ); - int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28 - int m; - double d = (double)0x0000ffff / (double)bits; - - for ( m = shift; m < 29; m++ ) - d *= 256.0; - for ( m = 29; m < shift; m++ ) - d /= 256.0; - if ( shift == 28 ) - d *= 256.0; // testnet - if ( opt_debug_diff ) - applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, - shift, bits ); - return net_diff; -} - -void decred_decode_extradata( struct work* work, uint64_t* net_blocks ) -{ - // some random extradata to make the work unique - work->data[ DECRED_XNONCE_INDEX ] = (rand()*4); - work->height = work->data[32]; - if (!have_longpoll && work->height > *net_blocks + 1) - { - char netinfo[64] = { 0 }; - if (opt_showdiff && net_diff > 0.) - { - if (net_diff != work->targetdiff) - sprintf(netinfo, ", diff %.3f, target %.1f", net_diff, - work->targetdiff); - else - sprintf(netinfo, ", diff %.3f", net_diff); - } - applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height, - netinfo); - *net_blocks = work->height - 1; - } -} - -void decred_be_build_stratum_request( char *req, struct work *work, - struct stratum_ctx *sctx ) -{ - unsigned char *xnonce2str; - uint32_t ntime, nonce; - char ntimestr[9], noncestr[9]; - - be32enc( &ntime, work->data[ DECRED_NTIME_INDEX ] ); - be32enc( &nonce, work->data[ DECRED_NONCE_INDEX ] ); - bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) ); - bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) ); - xnonce2str = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ), - sctx->xnonce1_size ); - snprintf( req, JSON_BUF_LEN, - "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", - rpc_user, work->job_id, xnonce2str, ntimestr, noncestr ); - free(xnonce2str); -} -#define min(a,b) (a>b ? 
(b) :(a)) - -void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) -{ - uchar merkle_root[64] = { 0 }; - uint32_t extraheader[32] = { 0 }; - int headersize = 0; - uint32_t* extradata = (uint32_t*) sctx->xnonce1; - size_t t; - int i; - - // getwork over stratum, getwork merkle + header passed in coinb1 - memcpy(merkle_root, sctx->job.coinbase, 32); - headersize = min((int)sctx->job.coinbase_size - 32, - sizeof(extraheader) ); - memcpy( extraheader, &sctx->job.coinbase[32], headersize ); - - // Increment extranonce2 - for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); - - // Assemble block header - memset( g_work->data, 0, sizeof(g_work->data) ); - g_work->data[0] = le32dec( sctx->job.version ); - for ( i = 0; i < 8; i++ ) - g_work->data[1 + i] = swab32( - le32dec( (uint32_t *) sctx->job.prevhash + i ) ); - for ( i = 0; i < 8; i++ ) - g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) ); - -// for ( i = 0; i < 8; i++ ) // prevhash -// g_work->data[1 + i] = swab32( g_work->data[1 + i] ); -// for ( i = 0; i < 8; i++ ) // merkle -// g_work->data[9 + i] = swab32( g_work->data[9 + i] ); - - for ( i = 0; i < headersize/4; i++ ) // header - g_work->data[17 + i] = extraheader[i]; - // extradata - - for ( i = 0; i < sctx->xnonce1_size/4; i++ ) - g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i]; - for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ ) - g_work->data[i] = 0; - g_work->data[37] = (rand()*4) << 8; - // block header suffix from coinb2 (stake version) - memcpy( &g_work->data[44], - &sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 ); - sctx->bloc_height = g_work->data[32]; - //applog_hex(work->data, 180); - //applog_hex(&work->data[36], 36); -} - -#undef min - -bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum, - int thr_id ) -{ - if ( have_stratum && strcmp(stratum->job.job_id, work->job_id) ) - // need to regen g_work.. 
- return false; - if ( have_stratum && !work->data[0] && !opt_benchmark ) - { - sleep(1); - return false; - } - // extradata: prevent duplicates - work->data[ DECRED_XNONCE_INDEX ] += 1; - work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id; - return true; -} - -int decred_get_work_data_size() { return DECRED_DATA_SIZE; } - -bool register_decred_algo( algo_gate_t* gate ) -{ -#if defined(DECRED_4WAY) - four_way_not_tested(); - gate->scanhash = (void*)&scanhash_decred_4way; - gate->hash = (void*)&decred_hash_4way; -#else - gate->scanhash = (void*)&scanhash_decred; - gate->hash = (void*)&decred_hash; -#endif - gate->optimizations = AVX2_OPT; - gate->get_nonceptr = (void*)&decred_get_nonceptr; - gate->get_max64 = (void*)&get_max64_0x3fffffLL; - gate->decode_extra_data = (void*)&decred_decode_extradata; - gate->build_stratum_request = (void*)&decred_be_build_stratum_request; - gate->work_decode = (void*)&std_be_work_decode; - gate->submit_getwork_result = (void*)&std_be_submit_getwork_result; - gate->build_extraheader = (void*)&decred_build_extraheader; - gate->ready_to_mine = (void*)&decred_ready_to_mine; - gate->nbits_index = DECRED_NBITS_INDEX; - gate->ntime_index = DECRED_NTIME_INDEX; - gate->nonce_index = DECRED_NONCE_INDEX; - gate->get_work_data_size = (void*)&decred_get_work_data_size; - gate->work_cmp_size = DECRED_WORK_COMPARE_SIZE; - allow_mininginfo = false; - have_gbt = false; - return true; -} - diff --git a/algo/blake/decred-gate.h b/algo/blake/decred-gate.h deleted file mode 100644 index 3910b50..0000000 --- a/algo/blake/decred-gate.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef __DECRED_GATE_H__ -#define __DECRED_GATE_H__ - -#include "algo-gate-api.h" -#include - -#define DECRED_NBITS_INDEX 29 -#define DECRED_NTIME_INDEX 34 -#define DECRED_NONCE_INDEX 35 -#define DECRED_XNONCE_INDEX 36 -#define DECRED_DATA_SIZE 192 -#define DECRED_WORK_COMPARE_SIZE 140 -#define DECRED_MIDSTATE_LEN 128 - -#if defined (__AVX2__) -//void blakehash_84way(void *state, const void 
*input); -//int scanhash_blake_8way( struct work *work, uint32_t max_nonce, -// uint64_t *hashes_done ); -#endif - -#if defined(__SSE4_2__) - #define DECRED_4WAY -#endif - -#if defined (DECRED_4WAY) -void decred_hash_4way(void *state, const void *input); -int scanhash_decred_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -#endif - -void decred_hash( void *state, const void *input ); -int scanhash_decred( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -#endif - diff --git a/algo/blake/decred.c b/algo/blake/decred.c deleted file mode 100644 index 8645d2a..0000000 --- a/algo/blake/decred.c +++ /dev/null @@ -1,288 +0,0 @@ -#include "decred-gate.h" -#include "sph_blake.h" - -#include -#include -#include -#include - -/* -#ifndef min -#define min(a,b) (a>b ? b : a) -#endif -#ifndef max -#define max(a,b) (adata; - uint32_t *ptarget = work->target; - int thr_id = mythr->id; // thr_id arg is deprecated - -// #define DCR_NONCE_OFT32 35 - - const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX]; - const uint32_t HTarget = opt_benchmark ? 
0x7f : ptarget[7]; - - uint32_t n = first_nonce; - - ctx_midstate_done = false; - -#if 1 - memcpy(endiandata, pdata, 180); -#else - for (int k=0; k < (180/4); k++) - be32enc(&endiandata[k], pdata[k]); -#endif - -#ifdef DEBUG_ALGO - if (!thr_id) applog(LOG_DEBUG,"[%d] Target=%08x %08x", thr_id, ptarget[6], ptarget[7]); -#endif - - do { - //be32enc(&endiandata[DCR_NONCE_OFT32], n); - endiandata[DECRED_NONCE_INDEX] = n; - decred_hash(hash32, endiandata); - - if (hash32[7] <= HTarget && fulltest(hash32, ptarget)) { - work_set_target_ratio(work, hash32); - *hashes_done = n - first_nonce + 1; -#ifdef DEBUG_ALGO - applog(LOG_BLUE, "Nonce : %08x %08x", n, swab32(n)); - applog_hash(ptarget); - applog_compare_hash(hash32, ptarget); -#endif - pdata[DECRED_NONCE_INDEX] = n; - return 1; - } - - n++; - - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[DECRED_NONCE_INDEX] = n; - return 0; -} - -/* -uint32_t *decred_get_nonceptr( uint32_t *work_data ) -{ - return &work_data[ DECRED_NONCE_INDEX ]; -} - -double decred_calc_network_diff( struct work* work ) -{ - // sample for diff 43.281 : 1c05ea29 - // todo: endian reversed on longpoll could be zr5 specific... 
- uint32_t nbits = work->data[ DECRED_NBITS_INDEX ]; - uint32_t bits = ( nbits & 0xffffff ); - int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28 - int m; - double d = (double)0x0000ffff / (double)bits; - - for ( m = shift; m < 29; m++ ) - d *= 256.0; - for ( m = 29; m < shift; m++ ) - d /= 256.0; - if ( shift == 28 ) - d *= 256.0; // testnet - if ( opt_debug_diff ) - applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, - shift, bits ); - return net_diff; -} - -void decred_decode_extradata( struct work* work, uint64_t* net_blocks ) -{ - // some random extradata to make the work unique - work->data[ DECRED_XNONCE_INDEX ] = (rand()*4); - work->height = work->data[32]; - if (!have_longpoll && work->height > *net_blocks + 1) - { - char netinfo[64] = { 0 }; - if (opt_showdiff && net_diff > 0.) - { - if (net_diff != work->targetdiff) - sprintf(netinfo, ", diff %.3f, target %.1f", net_diff, - work->targetdiff); - else - sprintf(netinfo, ", diff %.3f", net_diff); - } - applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height, - netinfo); - *net_blocks = work->height - 1; - } -} - -void decred_be_build_stratum_request( char *req, struct work *work, - struct stratum_ctx *sctx ) -{ - unsigned char *xnonce2str; - uint32_t ntime, nonce; - char ntimestr[9], noncestr[9]; - - be32enc( &ntime, work->data[ DECRED_NTIME_INDEX ] ); - be32enc( &nonce, work->data[ DECRED_NONCE_INDEX ] ); - bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) ); - bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) ); - xnonce2str = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ), - sctx->xnonce1_size ); - snprintf( req, JSON_BUF_LEN, - "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", - rpc_user, work->job_id, xnonce2str, ntimestr, noncestr ); - free(xnonce2str); -} -*/ -/* -// data shared between gen_merkle_root and build_extraheader. 
-__thread uint32_t decred_extraheader[32] = { 0 }; -__thread int decred_headersize = 0; - -void decred_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx ) -{ - // getwork over stratum, getwork merkle + header passed in coinb1 - memcpy(merkle_root, sctx->job.coinbase, 32); - decred_headersize = min((int)sctx->job.coinbase_size - 32, - sizeof(decred_extraheader) ); - memcpy( decred_extraheader, &sctx->job.coinbase[32], decred_headersize); -} -*/ - -/* -#define min(a,b) (a>b ? (b) :(a)) - -void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) -{ - uchar merkle_root[64] = { 0 }; - uint32_t extraheader[32] = { 0 }; - int headersize = 0; - uint32_t* extradata = (uint32_t*) sctx->xnonce1; - size_t t; - int i; - - // getwork over stratum, getwork merkle + header passed in coinb1 - memcpy(merkle_root, sctx->job.coinbase, 32); - headersize = min((int)sctx->job.coinbase_size - 32, - sizeof(extraheader) ); - memcpy( extraheader, &sctx->job.coinbase[32], headersize ); - - // Increment extranonce2 - for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); - - // Assemble block header - memset( g_work->data, 0, sizeof(g_work->data) ); - g_work->data[0] = le32dec( sctx->job.version ); - for ( i = 0; i < 8; i++ ) - g_work->data[1 + i] = swab32( - le32dec( (uint32_t *) sctx->job.prevhash + i ) ); - for ( i = 0; i < 8; i++ ) - g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) ); - -// for ( i = 0; i < 8; i++ ) // prevhash -// g_work->data[1 + i] = swab32( g_work->data[1 + i] ); -// for ( i = 0; i < 8; i++ ) // merkle -// g_work->data[9 + i] = swab32( g_work->data[9 + i] ); - - for ( i = 0; i < headersize/4; i++ ) // header - g_work->data[17 + i] = extraheader[i]; - // extradata - - for ( i = 0; i < sctx->xnonce1_size/4; i++ ) - g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i]; - for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ ) - g_work->data[i] = 0; - g_work->data[37] = (rand()*4) << 8; 
- // block header suffix from coinb2 (stake version) - memcpy( &g_work->data[44], - &sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 ); - sctx->bloc_height = g_work->data[32]; - //applog_hex(work->data, 180); - //applog_hex(&work->data[36], 36); -} - -#undef min - -bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum, - int thr_id ) -{ - if ( have_stratum && strcmp(stratum->job.job_id, work->job_id) ) - // need to regen g_work.. - return false; - if ( have_stratum && !work->data[0] && !opt_benchmark ) - { - sleep(1); - return false; - } - // extradata: prevent duplicates - work->data[ DECRED_XNONCE_INDEX ] += 1; - work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id; - return true; -} - - -bool register_decred_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT; - gate->scanhash = (void*)&scanhash_decred; - gate->hash = (void*)&decred_hash; - gate->get_nonceptr = (void*)&decred_get_nonceptr; - gate->get_max64 = (void*)&get_max64_0x3fffffLL; - gate->decode_extra_data = (void*)&decred_decode_extradata; - gate->build_stratum_request = (void*)&decred_be_build_stratum_request; - gate->work_decode = (void*)&std_be_work_decode; - gate->submit_getwork_result = (void*)&std_be_submit_getwork_result; - gate->build_extraheader = (void*)&decred_build_extraheader; - gate->ready_to_mine = (void*)&decred_ready_to_mine; - gate->nbits_index = DECRED_NBITS_INDEX; - gate->ntime_index = DECRED_NTIME_INDEX; - gate->nonce_index = DECRED_NONCE_INDEX; - gate->work_data_size = DECRED_DATA_SIZE; - gate->work_cmp_size = DECRED_WORK_COMPARE_SIZE; - allow_mininginfo = false; - have_gbt = false; - return true; -} -*/ diff --git a/algo/blake/mod_blakecoin.c b/algo/blake/mod_blakecoin.c deleted file mode 100644 index 7b597f9..0000000 --- a/algo/blake/mod_blakecoin.c +++ /dev/null @@ -1,531 +0,0 @@ -/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */ -/* - * BLAKECOIN implementation. 
(Stripped to 256 bits only) - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - * @author Tanguy Pruvot (cpuminer implementation) - */ - -#include -#include -#include - -#include "sph_blake.h" - -#ifdef __cplusplus -extern "C"{ -#endif - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -static const sph_u32 IV256[8] = { - SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85), - SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A), - SPH_C32(0x510E527F), SPH_C32(0x9B05688C), - SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19) -}; - -#define Z00 0 -#define Z01 1 -#define Z02 2 -#define Z03 3 -#define Z04 4 -#define Z05 5 -#define Z06 6 -#define Z07 7 -#define Z08 8 -#define Z09 9 -#define Z0A A -#define Z0B B -#define Z0C C -#define Z0D D -#define Z0E E -#define Z0F F - -#define Z10 E -#define Z11 A -#define Z12 4 -#define Z13 8 -#define Z14 9 -#define Z15 F -#define Z16 D -#define Z17 6 -#define Z18 1 -#define Z19 C -#define Z1A 0 -#define Z1B 2 -#define Z1C B -#define Z1D 7 -#define Z1E 5 -#define Z1F 3 - -#define Z20 B -#define Z21 8 -#define Z22 C -#define Z23 0 -#define Z24 5 -#define Z25 2 -#define Z26 F -#define Z27 D -#define Z28 A -#define Z29 E -#define Z2A 3 -#define Z2B 6 -#define Z2C 7 -#define Z2D 1 -#define Z2E 9 -#define Z2F 4 - -#define Z30 7 -#define Z31 9 -#define Z32 3 -#define Z33 1 -#define Z34 D -#define Z35 C -#define Z36 B -#define Z37 E -#define Z38 2 -#define Z39 6 -#define Z3A 5 -#define Z3B A -#define Z3C 4 -#define Z3D 0 -#define Z3E F -#define Z3F 8 - -#define Z40 9 -#define Z41 0 -#define Z42 5 -#define Z43 7 -#define Z44 2 -#define Z45 4 -#define Z46 A -#define Z47 F -#define Z48 E -#define Z49 1 -#define Z4A B -#define Z4B C -#define Z4C 6 -#define Z4D 8 -#define Z4E 3 -#define Z4F D - -#define Z50 2 -#define Z51 C -#define Z52 6 -#define Z53 A -#define Z54 0 -#define Z55 B -#define Z56 8 -#define Z57 3 -#define Z58 4 -#define Z59 D -#define Z5A 7 -#define Z5B 5 -#define Z5C F -#define Z5D E -#define Z5E 1 -#define Z5F 9 - 
-#define Z60 C -#define Z61 5 -#define Z62 1 -#define Z63 F -#define Z64 E -#define Z65 D -#define Z66 4 -#define Z67 A -#define Z68 0 -#define Z69 7 -#define Z6A 6 -#define Z6B 3 -#define Z6C 9 -#define Z6D 2 -#define Z6E 8 -#define Z6F B - -#define Z70 D -#define Z71 B -#define Z72 7 -#define Z73 E -#define Z74 C -#define Z75 1 -#define Z76 3 -#define Z77 9 -#define Z78 5 -#define Z79 0 -#define Z7A F -#define Z7B 4 -#define Z7C 8 -#define Z7D 6 -#define Z7E 2 -#define Z7F A - -#define Z80 6 -#define Z81 F -#define Z82 E -#define Z83 9 -#define Z84 B -#define Z85 3 -#define Z86 0 -#define Z87 8 -#define Z88 C -#define Z89 2 -#define Z8A D -#define Z8B 7 -#define Z8C 1 -#define Z8D 4 -#define Z8E A -#define Z8F 5 - -#define Z90 A -#define Z91 2 -#define Z92 8 -#define Z93 4 -#define Z94 7 -#define Z95 6 -#define Z96 1 -#define Z97 5 -#define Z98 F -#define Z99 B -#define Z9A 9 -#define Z9B E -#define Z9C 3 -#define Z9D C -#define Z9E D -#define Z9F 0 - -#define Mx(r, i) Mx_(Z ## r ## i) -#define Mx_(n) Mx__(n) -#define Mx__(n) M ## n - -#define CSx(r, i) CSx_(Z ## r ## i) -#define CSx_(n) CSx__(n) -#define CSx__(n) CS ## n - -#define CS0 SPH_C32(0x243F6A88) -#define CS1 SPH_C32(0x85A308D3) -#define CS2 SPH_C32(0x13198A2E) -#define CS3 SPH_C32(0x03707344) -#define CS4 SPH_C32(0xA4093822) -#define CS5 SPH_C32(0x299F31D0) -#define CS6 SPH_C32(0x082EFA98) -#define CS7 SPH_C32(0xEC4E6C89) -#define CS8 SPH_C32(0x452821E6) -#define CS9 SPH_C32(0x38D01377) -#define CSA SPH_C32(0xBE5466CF) -#define CSB SPH_C32(0x34E90C6C) -#define CSC SPH_C32(0xC0AC29B7) -#define CSD SPH_C32(0xC97C50DD) -#define CSE SPH_C32(0x3F84D5B5) -#define CSF SPH_C32(0xB5470917) - -#if SPH_64 - -#define CBx(r, i) CBx_(Z ## r ## i) -#define CBx_(n) CBx__(n) -#define CBx__(n) CB ## n - -#define CB0 SPH_C64(0x243F6A8885A308D3) -#define CB1 SPH_C64(0x13198A2E03707344) -#define CB2 SPH_C64(0xA4093822299F31D0) -#define CB3 SPH_C64(0x082EFA98EC4E6C89) -#define CB4 SPH_C64(0x452821E638D01377) -#define CB5 
SPH_C64(0xBE5466CF34E90C6C) -#define CB6 SPH_C64(0xC0AC29B7C97C50DD) -#define CB7 SPH_C64(0x3F84D5B5B5470917) -#define CB8 SPH_C64(0x9216D5D98979FB1B) -#define CB9 SPH_C64(0xD1310BA698DFB5AC) -#define CBA SPH_C64(0x2FFD72DBD01ADFB7) -#define CBB SPH_C64(0xB8E1AFED6A267E96) -#define CBC SPH_C64(0xBA7C9045F12C7F99) -#define CBD SPH_C64(0x24A19947B3916CF7) -#define CBE SPH_C64(0x0801F2E2858EFC16) -#define CBF SPH_C64(0x636920D871574E69) - -#endif - -#define GS(m0, m1, c0, c1, a, b, c, d) do { \ - a = SPH_T32(a + b + (m0 ^ c1)); \ - d = SPH_ROTR32(d ^ a, 16); \ - c = SPH_T32(c + d); \ - b = SPH_ROTR32(b ^ c, 12); \ - a = SPH_T32(a + b + (m1 ^ c0)); \ - d = SPH_ROTR32(d ^ a, 8); \ - c = SPH_T32(c + d); \ - b = SPH_ROTR32(b ^ c, 7); \ - } while (0) - -#define ROUND_S(r) do { \ - GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ - GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ - GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ - GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \ - GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \ - GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ - GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ - GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ - } while (0) - -#define DECL_STATE32 \ - sph_u32 H0, H1, H2, H3, H4, H5, H6, H7; \ - sph_u32 S0, S1, S2, S3, T0, T1; - -#define READ_STATE32(state) do { \ - H0 = (state)->H[0]; \ - H1 = (state)->H[1]; \ - H2 = (state)->H[2]; \ - H3 = (state)->H[3]; \ - H4 = (state)->H[4]; \ - H5 = (state)->H[5]; \ - H6 = (state)->H[6]; \ - H7 = (state)->H[7]; \ - S0 = (state)->S[0]; \ - S1 = (state)->S[1]; \ - S2 = (state)->S[2]; \ - S3 = (state)->S[3]; \ - T0 = (state)->T0; \ - T1 = (state)->T1; \ - } while (0) - -#define WRITE_STATE32(state) do { \ - (state)->H[0] = H0; \ - (state)->H[1] = H1; \ - (state)->H[2] = H2; \ - (state)->H[3] = H3; \ - (state)->H[4] = H4; \ - (state)->H[5] = H5; \ - 
(state)->H[6] = H6; \ - (state)->H[7] = H7; \ - (state)->S[0] = S0; \ - (state)->S[1] = S1; \ - (state)->S[2] = S2; \ - (state)->S[3] = S3; \ - (state)->T0 = T0; \ - (state)->T1 = T1; \ - } while (0) - -#define BLAKE32_ROUNDS 8 - -#define COMPRESS32 do { \ - sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \ - sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \ - sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ - sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \ - V0 = H0; \ - V1 = H1; \ - V2 = H2; \ - V3 = H3; \ - V4 = H4; \ - V5 = H5; \ - V6 = H6; \ - V7 = H7; \ - V8 = S0 ^ CS0; \ - V9 = S1 ^ CS1; \ - VA = S2 ^ CS2; \ - VB = S3 ^ CS3; \ - VC = T0 ^ CS4; \ - VD = T0 ^ CS5; \ - VE = T1 ^ CS6; \ - VF = T1 ^ CS7; \ - M0 = sph_dec32be_aligned(buf + 0); \ - M1 = sph_dec32be_aligned(buf + 4); \ - M2 = sph_dec32be_aligned(buf + 8); \ - M3 = sph_dec32be_aligned(buf + 12); \ - M4 = sph_dec32be_aligned(buf + 16); \ - M5 = sph_dec32be_aligned(buf + 20); \ - M6 = sph_dec32be_aligned(buf + 24); \ - M7 = sph_dec32be_aligned(buf + 28); \ - M8 = sph_dec32be_aligned(buf + 32); \ - M9 = sph_dec32be_aligned(buf + 36); \ - MA = sph_dec32be_aligned(buf + 40); \ - MB = sph_dec32be_aligned(buf + 44); \ - MC = sph_dec32be_aligned(buf + 48); \ - MD = sph_dec32be_aligned(buf + 52); \ - ME = sph_dec32be_aligned(buf + 56); \ - MF = sph_dec32be_aligned(buf + 60); \ - ROUND_S(0); \ - ROUND_S(1); \ - ROUND_S(2); \ - ROUND_S(3); \ - ROUND_S(4); \ - ROUND_S(5); \ - ROUND_S(6); \ - ROUND_S(7); \ - if (BLAKE32_ROUNDS == 14) { \ - ROUND_S(8); \ - ROUND_S(9); \ - ROUND_S(0); \ - ROUND_S(1); \ - ROUND_S(2); \ - ROUND_S(3); \ - } \ - H0 ^= S0 ^ V0 ^ V8; \ - H1 ^= S1 ^ V1 ^ V9; \ - H2 ^= S2 ^ V2 ^ VA; \ - H3 ^= S3 ^ V3 ^ VB; \ - H4 ^= S0 ^ V4 ^ VC; \ - H5 ^= S1 ^ V5 ^ VD; \ - H6 ^= S2 ^ V6 ^ VE; \ - H7 ^= S3 ^ V7 ^ VF; \ - } while (0) - - -static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 }; - -static void -blake32_init(sph_blake_small_context *sc, - const sph_u32 *iv, const sph_u32 *salt) -{ - memcpy(sc->H, iv, 8 * 
sizeof(sph_u32)); - memcpy(sc->S, salt, 4 * sizeof(sph_u32)); - sc->T0 = sc->T1 = 0; - sc->ptr = 0; -} - -static void -blake32(sph_blake_small_context *sc, const void *data, size_t len) -{ - unsigned char *buf; - size_t ptr; - DECL_STATE32 - - buf = sc->buf; - ptr = sc->ptr; - if (len < (sizeof sc->buf) - ptr) { - memcpy(buf + ptr, data, len); - ptr += len; - sc->ptr = ptr; - return; - } - - READ_STATE32(sc); - while (len > 0) { - size_t clen; - - clen = (sizeof sc->buf) - ptr; - if (clen > len) - clen = len; - memcpy(buf + ptr, data, clen); - ptr += clen; - data = (const unsigned char *)data + clen; - len -= clen; - if (ptr == sizeof sc->buf) { - if ((T0 = SPH_T32(T0 + 512)) < 512) - T1 = SPH_T32(T1 + 1); - COMPRESS32; - ptr = 0; - } - } - WRITE_STATE32(sc); - sc->ptr = ptr; -} - -static void -blake32_close(sph_blake_small_context *sc, - unsigned ub, unsigned n, void *dst, size_t out_size_w32) -{ - union { - unsigned char buf[64]; - sph_u32 dummy; - } u; - size_t ptr, k; - unsigned bit_len; - unsigned z; - sph_u32 th, tl; - unsigned char *out; - - ptr = sc->ptr; - bit_len = ((unsigned)ptr << 3) + n; - z = 0x80 >> n; - u.buf[ptr] = ((ub & -z) | z) & 0xFF; - tl = sc->T0 + bit_len; - th = sc->T1; - if (ptr == 0 && n == 0) { - sc->T0 = SPH_C32(0xFFFFFE00); - sc->T1 = SPH_C32(0xFFFFFFFF); - } else if (sc->T0 == 0) { - sc->T0 = SPH_C32(0xFFFFFE00) + bit_len; - sc->T1 = SPH_T32(sc->T1 - 1); - } else { - sc->T0 -= 512 - bit_len; - } - if (bit_len <= 446) { - memset(u.buf + ptr + 1, 0, 55 - ptr); - if (out_size_w32 == 8) - u.buf[55] |= 1; - sph_enc32be_aligned(u.buf + 56, th); - sph_enc32be_aligned(u.buf + 60, tl); - blake32(sc, u.buf + ptr, 64 - ptr); - } else { - memset(u.buf + ptr + 1, 0, 63 - ptr); - blake32(sc, u.buf + ptr, 64 - ptr); - sc->T0 = SPH_C32(0xFFFFFE00); - sc->T1 = SPH_C32(0xFFFFFFFF); - memset(u.buf, 0, 56); - if (out_size_w32 == 8) - u.buf[55] = 1; - sph_enc32be_aligned(u.buf + 56, th); - sph_enc32be_aligned(u.buf + 60, tl); - blake32(sc, u.buf, 64); - } 
- out = dst; - for (k = 0; k < out_size_w32; k ++) - sph_enc32be(out + (k << 2), sc->H[k]); -} - -void -blakecoin_init(void *cc) -{ - blake32_init(cc, IV256, salt_zero_small); -} - -void -blakecoin(void *cc, const void *data, size_t len) -{ - blake32(cc, data, len); -} - -static void -blakecoin_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - blake32_close(cc, ub, n, dst, 8); - blakecoin_init(cc); -} - -void -blakecoin_close(void *cc, void *dst) -{ - blakecoin_addbits_and_close(cc, 0, 0, dst); -} - -#ifdef __cplusplus -} -#endif diff --git a/algo/blake/pentablake-4way.c b/algo/blake/pentablake-4way.c deleted file mode 100644 index 1b45afa..0000000 --- a/algo/blake/pentablake-4way.c +++ /dev/null @@ -1,122 +0,0 @@ -#include "pentablake-gate.h" - -#if defined (__AVX2__) - -#include -#include -#include -#include - -#include "blake-hash-4way.h" -#include "sph_blake.h" - -extern void pentablakehash_4way( void *output, const void *input ) -{ - - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - blake512_4way_context ctx; - - - blake512_4way_init( &ctx ); - blake512_4way( &ctx, input, 80 ); - blake512_4way_close( &ctx, vhash ); - - blake512_4way_init( &ctx ); - blake512_4way( &ctx, vhash, 64 ); - blake512_4way_close( &ctx, vhash ); - - blake512_4way_init( &ctx ); - blake512_4way( &ctx, vhash, 64 ); - blake512_4way_close( &ctx, vhash ); - - blake512_4way_init( &ctx ); - blake512_4way( &ctx, vhash, 64 ); - blake512_4way_close( &ctx, vhash ); - - blake512_4way_init( &ctx ); - blake512_4way( &ctx, vhash, 64 ); - blake512_4way_close( &ctx, vhash ); - - memcpy( output, hash0, 32 ); - memcpy( output+32, hash1, 32 ); - memcpy( output+64, hash2, 32 ); - memcpy( output+96, hash3, 32 ); -} - -int scanhash_pentablake_4way( struct work *work, - uint32_t 
max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t endiandata[32] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - uint32_t *noncep = vdata + 73; // 9*8 + 1 - int thr_id = mythr->id; // thr_id arg is deprecated - -// uint32_t _ALIGN(32) hash64[8]; -// uint32_t _ALIGN(32) endiandata[32]; - - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - // we need bigendian data... - swab32_array( endiandata, pdata, 20 ); - - uint64_t *edata = (uint64_t*)endiandata; - intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); - - for ( int m=0; m < 6; m++ ) - { - if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do { - be32enc( noncep, n ); - be32enc( noncep+2, n+1 ); - be32enc( noncep+4, n+2 ); - be32enc( noncep+6, n+3 ); - - pentablakehash_4way( hash, vdata ); - - for ( int i = 0; i < 4; i++ ) - if ( !( (hash+(i<<3))[7] & mask ) - && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n + i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - - } while (n < max_nonce && !work_restart[thr_id].restart); - break; - } - } - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - -#endif diff --git a/algo/blake/pentablake-gate.c b/algo/blake/pentablake-gate.c deleted file mode 100644 index b194206..0000000 --- a/algo/blake/pentablake-gate.c +++ /dev/null @@ -1,16 +0,0 @@ -#include "pentablake-gate.h" - -bool register_pentablake_algo( algo_gate_t* gate ) -{ -#if defined (PENTABLAKE_4WAY) - gate->scanhash = (void*)&scanhash_pentablake_4way; - gate->hash = (void*)&pentablakehash_4way; -#else - 
gate->scanhash = (void*)&scanhash_pentablake; - gate->hash = (void*)&pentablakehash; -#endif - gate->optimizations = AVX2_OPT; - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -}; - diff --git a/algo/blake/pentablake-gate.h b/algo/blake/pentablake-gate.h deleted file mode 100644 index 0d2d995..0000000 --- a/algo/blake/pentablake-gate.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef __PENTABLAKE_GATE_H__ -#define __PENTABLAKE_GATE_H__ - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) - #define PENTABLAKE_4WAY -#endif - -#if defined(PENTABLAKE_4WAY) -void pentablakehash_4way( void *state, const void *input ); -int scanhash_pentablake_4way( struct work *work, - uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -#endif - -void pentablakehash( void *state, const void *input ); -int scanhash_pentablake( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -#endif - diff --git a/algo/blake/pentablake.c b/algo/blake/pentablake.c deleted file mode 100644 index 55c874c..0000000 --- a/algo/blake/pentablake.c +++ /dev/null @@ -1,113 +0,0 @@ -#include "pentablake-gate.h" -#include -#include -#include -#include - -#include "sph_blake.h" - -//#define DEBUG_ALGO - -extern void pentablakehash(void *output, const void *input) -{ - unsigned char _ALIGN(32) hash[128]; - // same as uint32_t hashA[16], hashB[16]; - #define hashB hash+64 - - sph_blake512_context ctx_blake; - - sph_blake512_init(&ctx_blake); - sph_blake512(&ctx_blake, input, 80); - sph_blake512_close(&ctx_blake, hash); - - sph_blake512_init(&ctx_blake); - sph_blake512(&ctx_blake, hash, 64); - sph_blake512_close(&ctx_blake, hashB); - - sph_blake512_init(&ctx_blake); - sph_blake512(&ctx_blake, hashB, 64); - sph_blake512_close(&ctx_blake, hash); - - sph_blake512_init(&ctx_blake); - sph_blake512(&ctx_blake, hash, 64); - sph_blake512_close(&ctx_blake, hashB); - - sph_blake512_init(&ctx_blake); - sph_blake512(&ctx_blake, hashB, 64); - 
sph_blake512_close(&ctx_blake, hash); - - memcpy(output, hash, 32); - -} - -int scanhash_pentablake( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - - uint32_t _ALIGN(32) hash64[8]; - uint32_t _ALIGN(32) endiandata[32]; - - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - // we need bigendian data... - swab32_array( endiandata, pdata, 20 ); - -#ifdef DEBUG_ALGO - if (Htarg != 0) - printf("[%d] Htarg=%X\n", thr_id, Htarg); -#endif - for (int m=0; m < 6; m++) { - if (Htarg <= htmax[m]) { - uint32_t mask = masks[m]; - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - pentablakehash(hash64, endiandata); -#ifndef DEBUG_ALGO - if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } -#else - if (!(n % 0x1000) && !thr_id) printf("."); - if (!(hash64[7] & mask)) { - printf("[%d]",thr_id); - if (fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } -#endif - } while (n < max_nonce && !work_restart[thr_id].restart); - // see blake.c if else to understand the loop on htmax => mask - break; - } - } - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - diff --git a/algo/blake/sph-blake2s.c b/algo/blake/sph-blake2s.c deleted file mode 100644 index a732910..0000000 --- a/algo/blake/sph-blake2s.c +++ /dev/null @@ -1,378 +0,0 @@ -/** - * BLAKE2 reference source code package - reference C implementations - * - * Written in 2012 by Samuel Neves - * - * To the extent possible under law, the author(s) have dedicated all copyright - * and related and neighboring 
rights to this software to the public domain - * worldwide. This software is distributed without any warranty. - * - * You should have received a copy of the CC0 Public Domain Dedication along with - * this software. If not, see . - */ - -#include -#include -#include - -#include "algo/sha/sph_types.h" -#include "sph-blake2s.h" - -static const uint32_t blake2s_IV[8] = -{ - 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, - 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL -}; - -static const uint8_t blake2s_sigma[10][16] = -{ - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , -}; - -static inline int blake2s_set_lastnode( blake2s_state *S ) -{ - S->f[1] = ~0U; - return 0; -} - -static inline int blake2s_clear_lastnode( blake2s_state *S ) -{ - S->f[1] = 0U; - return 0; -} - -/* Some helper functions, not necessarily useful */ -static inline int blake2s_set_lastblock( blake2s_state *S ) -{ - if( S->last_node ) blake2s_set_lastnode( S ); - - S->f[0] = ~0U; - return 0; -} - -static inline int blake2s_clear_lastblock( blake2s_state *S ) -{ - if( S->last_node ) blake2s_clear_lastnode( S ); - - S->f[0] = 0U; - return 0; -} - -static inline int blake2s_increment_counter( blake2s_state *S, const uint32_t inc ) -{ - S->t[0] += inc; - S->t[1] += ( S->t[0] < inc ); - return 0; -} - -// Parameter-related functions -static inline int blake2s_param_set_digest_length( blake2s_param *P, const uint8_t digest_length ) -{ - 
P->digest_length = digest_length; - return 0; -} - -static inline int blake2s_param_set_fanout( blake2s_param *P, const uint8_t fanout ) -{ - P->fanout = fanout; - return 0; -} - -static inline int blake2s_param_set_max_depth( blake2s_param *P, const uint8_t depth ) -{ - P->depth = depth; - return 0; -} - -static inline int blake2s_param_set_leaf_length( blake2s_param *P, const uint32_t leaf_length ) -{ - store32( &P->leaf_length, leaf_length ); - return 0; -} - -static inline int blake2s_param_set_node_offset( blake2s_param *P, const uint64_t node_offset ) -{ - store48( P->node_offset, node_offset ); - return 0; -} - -static inline int blake2s_param_set_node_depth( blake2s_param *P, const uint8_t node_depth ) -{ - P->node_depth = node_depth; - return 0; -} - -static inline int blake2s_param_set_inner_length( blake2s_param *P, const uint8_t inner_length ) -{ - P->inner_length = inner_length; - return 0; -} - -static inline int blake2s_param_set_salt( blake2s_param *P, const uint8_t salt[BLAKE2S_SALTBYTES] ) -{ - memcpy( P->salt, salt, BLAKE2S_SALTBYTES ); - return 0; -} - -static inline int blake2s_param_set_personal( blake2s_param *P, const uint8_t personal[BLAKE2S_PERSONALBYTES] ) -{ - memcpy( P->personal, personal, BLAKE2S_PERSONALBYTES ); - return 0; -} - -static inline int blake2s_init0( blake2s_state *S ) -{ - memset( S, 0, sizeof( blake2s_state ) ); - - for( int i = 0; i < 8; ++i ) S->h[i] = blake2s_IV[i]; - - return 0; -} - -/* init2 xors IV with input parameter block */ -int blake2s_init_param( blake2s_state *S, const blake2s_param *P ) -{ - blake2s_init0( S ); - uint32_t *p = ( uint32_t * )( P ); - - /* IV XOR ParamBlock */ - for( size_t i = 0; i < 8; ++i ) - S->h[i] ^= load32( &p[i] ); - - return 0; -} - - -// Sequential blake2s initialization -int blake2s_init( blake2s_state *S, const uint8_t outlen ) -{ - blake2s_param P[1]; - - /* Move interval verification here? 
*/ - if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1; - - P->digest_length = outlen; - P->key_length = 0; - P->fanout = 1; - P->depth = 1; - store32( &P->leaf_length, 0 ); - store48( &P->node_offset, 0 ); - P->node_depth = 0; - P->inner_length = 0; - // memset(P->reserved, 0, sizeof(P->reserved) ); - memset( P->salt, 0, sizeof( P->salt ) ); - memset( P->personal, 0, sizeof( P->personal ) ); - return blake2s_init_param( S, P ); -} - -int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ) -{ - blake2s_param P[1]; - - if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1; - - if ( !key || !keylen || keylen > BLAKE2S_KEYBYTES ) return -1; - - P->digest_length = outlen; - P->key_length = keylen; - P->fanout = 1; - P->depth = 1; - store32( &P->leaf_length, 0 ); - store48( &P->node_offset, 0 ); - P->node_depth = 0; - P->inner_length = 0; - // memset(P->reserved, 0, sizeof(P->reserved) ); - memset( P->salt, 0, sizeof( P->salt ) ); - memset( P->personal, 0, sizeof( P->personal ) ); - - if( blake2s_init_param( S, P ) < 0 ) return -1; - - { - uint8_t block[BLAKE2S_BLOCKBYTES]; - memset( block, 0, BLAKE2S_BLOCKBYTES ); - memcpy( block, key, keylen ); - blake2s_update( S, block, BLAKE2S_BLOCKBYTES ); - secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */ - } - return 0; -} - -int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] ) -{ - uint32_t m[16]; - uint32_t v[16]; - - for( size_t i = 0; i < 16; ++i ) - m[i] = load32( block + i * sizeof( m[i] ) ); - - for( size_t i = 0; i < 8; ++i ) - v[i] = S->h[i]; - - v[ 8] = blake2s_IV[0]; - v[ 9] = blake2s_IV[1]; - v[10] = blake2s_IV[2]; - v[11] = blake2s_IV[3]; - v[12] = S->t[0] ^ blake2s_IV[4]; - v[13] = S->t[1] ^ blake2s_IV[5]; - v[14] = S->f[0] ^ blake2s_IV[6]; - v[15] = S->f[1] ^ blake2s_IV[7]; -#define G(r,i,a,b,c,d) \ - do { \ - a = a + b + m[blake2s_sigma[r][2*i+0]]; \ - d = SPH_ROTR32(d ^ a, 16); \ - c = c + 
d; \ - b = SPH_ROTR32(b ^ c, 12); \ - a = a + b + m[blake2s_sigma[r][2*i+1]]; \ - d = SPH_ROTR32(d ^ a, 8); \ - c = c + d; \ - b = SPH_ROTR32(b ^ c, 7); \ - } while(0) -#define ROUND(r) \ - do { \ - G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ - G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ - G(r,2,v[ 2],v[ 6],v[10],v[14]); \ - G(r,3,v[ 3],v[ 7],v[11],v[15]); \ - G(r,4,v[ 0],v[ 5],v[10],v[15]); \ - G(r,5,v[ 1],v[ 6],v[11],v[12]); \ - G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ - G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ - } while(0) - ROUND( 0 ); - ROUND( 1 ); - ROUND( 2 ); - ROUND( 3 ); - ROUND( 4 ); - ROUND( 5 ); - ROUND( 6 ); - ROUND( 7 ); - ROUND( 8 ); - ROUND( 9 ); - - for( size_t i = 0; i < 8; ++i ) - S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; - -#undef G -#undef ROUND - return 0; -} - - -int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen ) -{ - while( inlen > 0 ) - { - size_t left = S->buflen; - size_t fill = 2 * BLAKE2S_BLOCKBYTES - left; - - if( inlen > fill ) - { - memcpy( S->buf + left, in, fill ); // Fill buffer - S->buflen += fill; - blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES ); - blake2s_compress( S, S->buf ); // Compress - memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, BLAKE2S_BLOCKBYTES ); // Shift buffer left - S->buflen -= BLAKE2S_BLOCKBYTES; - in += fill; - inlen -= fill; - } - else // inlen <= fill - { - memcpy(S->buf + left, in, (size_t) inlen); - S->buflen += (size_t) inlen; // Be lazy, do not compress - in += inlen; - inlen -= inlen; - } - } - - return 0; -} - -int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen ) -{ - uint8_t buffer[BLAKE2S_OUTBYTES]; - - if( S->buflen > BLAKE2S_BLOCKBYTES ) - { - blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES ); - blake2s_compress( S, S->buf ); - S->buflen -= BLAKE2S_BLOCKBYTES; - memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, S->buflen ); - } - - blake2s_increment_counter( S, ( uint32_t )S->buflen ); - blake2s_set_lastblock( S ); - memset( S->buf + S->buflen, 0, 2 * BLAKE2S_BLOCKBYTES - S->buflen ); /* Padding */ 
- blake2s_compress( S, S->buf ); - - for( int i = 0; i < 8; ++i ) /* Output full hash to temp buffer */ - store32( buffer + sizeof( S->h[i] ) * i, S->h[i] ); - - memcpy( out, buffer, outlen ); - return 0; -} - -int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ) -{ - blake2s_state S[1]; - - /* Verify parameters */ - if ( NULL == in ) return -1; - - if ( NULL == out ) return -1; - - if ( NULL == key ) keylen = 0; /* Fail here instead if keylen != 0 and key == NULL? */ - - if( keylen > 0 ) - { - if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1; - } - else - { - if( blake2s_init( S, outlen ) < 0 ) return -1; - } - - blake2s_update( S, ( uint8_t * )in, inlen ); - blake2s_final( S, out, outlen ); - return 0; -} - -#if defined(BLAKE2S_SELFTEST) -#include -#include "blake2-kat.h" /* test data not included */ -int main( int argc, char **argv ) -{ - uint8_t key[BLAKE2S_KEYBYTES]; - uint8_t buf[KAT_LENGTH]; - - for( size_t i = 0; i < BLAKE2S_KEYBYTES; ++i ) - key[i] = ( uint8_t )i; - - for( size_t i = 0; i < KAT_LENGTH; ++i ) - buf[i] = ( uint8_t )i; - - for( size_t i = 0; i < KAT_LENGTH; ++i ) - { - uint8_t hash[BLAKE2S_OUTBYTES]; - blake2s( hash, buf, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES ); - - if( 0 != memcmp( hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES ) ) - { - puts( "error" ); - return -1; - } - } - - puts( "ok" ); - return 0; -} -#endif diff --git a/algo/blake/sph-blake2s.h b/algo/blake/sph-blake2s.h deleted file mode 100644 index 2949fa6..0000000 --- a/algo/blake/sph-blake2s.h +++ /dev/null @@ -1,150 +0,0 @@ -/** - * BLAKE2 reference source code package - reference C implementations - * - * Written in 2012 by Samuel Neves - * - * To the extent possible under law, the author(s) have dedicated all copyright - * and related and neighboring rights to this software to the public domain - * worldwide. This software is distributed without any warranty. 
- * - * You should have received a copy of the CC0 Public Domain Dedication along with - * this software. If not, see . - */ -#pragma once -#ifndef __BLAKE2_H__ -#define __BLAKE2_H__ - -#include -#include - -#if defined(_MSC_VER) -#include -#define inline __inline -#define ALIGN(x) __declspec(align(x)) -#else -#define ALIGN(x) __attribute__((aligned(x))) -#endif - -/* blake2-impl.h */ - -static inline uint32_t load32(const void *src) -{ -#if defined(NATIVE_LITTLE_ENDIAN) - return *(uint32_t *)(src); -#else - const uint8_t *p = (uint8_t *)src; - uint32_t w = *p++; - w |= (uint32_t)(*p++) << 8; - w |= (uint32_t)(*p++) << 16; - w |= (uint32_t)(*p++) << 24; - return w; -#endif -} - -static inline void store32(void *dst, uint32_t w) -{ -#if defined(NATIVE_LITTLE_ENDIAN) - *(uint32_t *)(dst) = w; -#else - uint8_t *p = (uint8_t *)dst; - *p++ = (uint8_t)w; w >>= 8; - *p++ = (uint8_t)w; w >>= 8; - *p++ = (uint8_t)w; w >>= 8; - *p++ = (uint8_t)w; -#endif -} - -static inline uint64_t load48(const void *src) -{ - const uint8_t *p = (const uint8_t *)src; - uint64_t w = *p++; - w |= (uint64_t)(*p++) << 8; - w |= (uint64_t)(*p++) << 16; - w |= (uint64_t)(*p++) << 24; - w |= (uint64_t)(*p++) << 32; - w |= (uint64_t)(*p++) << 40; - return w; -} - -static inline void store48(void *dst, uint64_t w) -{ - uint8_t *p = (uint8_t *)dst; - *p++ = (uint8_t)w; w >>= 8; - *p++ = (uint8_t)w; w >>= 8; - *p++ = (uint8_t)w; w >>= 8; - *p++ = (uint8_t)w; w >>= 8; - *p++ = (uint8_t)w; w >>= 8; - *p++ = (uint8_t)w; -} - -/* prevents compiler optimizing out memset() */ -static inline void secure_zero_memory(void *v, size_t n) -{ - volatile uint8_t *p = ( volatile uint8_t * )v; - - while( n-- ) *p++ = 0; -} - -/* blake2.h */ - -#if defined(__cplusplus) -extern "C" { -#endif - - enum blake2s_constant - { - BLAKE2S_BLOCKBYTES = 64, - BLAKE2S_OUTBYTES = 32, - BLAKE2S_KEYBYTES = 32, - BLAKE2S_SALTBYTES = 8, - BLAKE2S_PERSONALBYTES = 8 - }; - -#pragma pack(push, 1) - typedef struct __blake2s_param - { - 
uint8_t digest_length; // 1 - uint8_t key_length; // 2 - uint8_t fanout; // 3 - uint8_t depth; // 4 - uint32_t leaf_length; // 8 - uint8_t node_offset[6];// 14 - uint8_t node_depth; // 15 - uint8_t inner_length; // 16 - // uint8_t reserved[0]; - uint8_t salt[BLAKE2S_SALTBYTES]; // 24 - uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32 - } blake2s_param; - - ALIGN( 64 ) typedef struct __blake2s_state - { - uint32_t h[8]; - uint32_t t[2]; - uint32_t f[2]; - uint8_t buf[2 * BLAKE2S_BLOCKBYTES]; - size_t buflen; - uint8_t last_node; - } blake2s_state ; -#pragma pack(pop) - - int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] ); - - // Streaming API - int blake2s_init( blake2s_state *S, const uint8_t outlen ); - int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); - int blake2s_init_param( blake2s_state *S, const blake2s_param *P ); - int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen ); - int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen ); - - // Simple API - int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); - - // Direct Hash Mining Helpers - #define blake2s_salt32(out, in, inlen, key32) blake2s(out, in, key32, 32, inlen, 32) /* neoscrypt */ - #define blake2s_simple(out, in, inlen) blake2s(out, in, NULL, 32, inlen, 0) - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/algo/blake/sph_blake2b.c b/algo/blake/sph_blake2b.c deleted file mode 100644 index ca898dc..0000000 --- a/algo/blake/sph_blake2b.c +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright 2009 Colin Percival, 2014 savale - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - */ - -#include -#include -#include - -#include "algo/sha/sph_types.h" -#include "sph_blake2b.h" - -// Cyclic right rotation. - -#ifndef ROTR64 -#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y)))) -#endif - -// Little-endian byte access. - -#define B2B_GET64(p) \ - (((uint64_t) ((uint8_t *) (p))[0]) ^ \ - (((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \ - (((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \ - (((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \ - (((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \ - (((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \ - (((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \ - (((uint64_t) ((uint8_t *) (p))[7]) << 56)) - -// G Mixing function. 
- -#define B2B_G(a, b, c, d, x, y) { \ - v[a] = v[a] + v[b] + x; \ - v[d] = ROTR64(v[d] ^ v[a], 32); \ - v[c] = v[c] + v[d]; \ - v[b] = ROTR64(v[b] ^ v[c], 24); \ - v[a] = v[a] + v[b] + y; \ - v[d] = ROTR64(v[d] ^ v[a], 16); \ - v[c] = v[c] + v[d]; \ - v[b] = ROTR64(v[b] ^ v[c], 63); } - -// Initialization Vector. - -static const uint64_t blake2b_iv[8] = { - 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, - 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, - 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, - 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179 -}; - -// Compression function. "last" flag indicates last block. - -static void blake2b_compress( sph_blake2b_ctx *ctx, int last ) -{ - const uint8_t sigma[12][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } - }; - int i; - uint64_t v[16], m[16]; - - for (i = 0; i < 8; i++) { // init work variables - v[i] = ctx->h[i]; - v[i + 8] = blake2b_iv[i]; - } - - v[12] ^= ctx->t[0]; // low 64 bits of offset - v[13] ^= ctx->t[1]; // high 64 bits - if (last) // last block flag set ? 
- v[14] = ~v[14]; - - for (i = 0; i < 16; i++) // get little-endian words - m[i] = B2B_GET64(&ctx->b[8 * i]); - - for (i = 0; i < 12; i++) { // twelve rounds - B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]); - B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]); - B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]); - B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]); - B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]); - B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]); - B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]); - B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]); - } - - for( i = 0; i < 8; ++i ) - ctx->h[i] ^= v[i] ^ v[i + 8]; -} - -// Initialize the hashing context "ctx" with optional key "key". -// 1 <= outlen <= 64 gives the digest size in bytes. -// Secret key (also <= 64 bytes) is optional (keylen = 0). - -int sph_blake2b_init( sph_blake2b_ctx *ctx, size_t outlen, const void *key, - size_t keylen ) // (keylen=0: no key) -{ - size_t i; - - if (outlen == 0 || outlen > 64 || keylen > 64) - return -1; // illegal parameters - - for (i = 0; i < 8; i++) // state, "param block" - ctx->h[i] = blake2b_iv[i]; - ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen; - - ctx->t[0] = 0; // input count low word - ctx->t[1] = 0; // input count high word - ctx->c = 0; // pointer within buffer - ctx->outlen = outlen; - - for (i = keylen; i < 128; i++) // zero input block - ctx->b[i] = 0; - if (keylen > 0) { - sph_blake2b_update(ctx, key, keylen); - ctx->c = 128; // at the end - } - - return 0; -} - -// Add "inlen" bytes from "in" into the hash. - -void sph_blake2b_update( sph_blake2b_ctx *ctx, const void *in, size_t inlen ) -{ - size_t i; - - for (i = 0; i < inlen; i++) { - if (ctx->c == 128) { // buffer full ? - ctx->t[0] += ctx->c; // add counters - if (ctx->t[0] < ctx->c) // carry overflow ? 
- ctx->t[1]++; // high word - blake2b_compress(ctx, 0); // compress (not last) - ctx->c = 0; // counter to zero - } - ctx->b[ctx->c++] = ((const uint8_t *) in)[i]; - } -} - -// Generate the message digest (size given in init). -// Result placed in "out". - -void sph_blake2b_final( sph_blake2b_ctx *ctx, void *out ) -{ - size_t i; - - ctx->t[0] += ctx->c; // mark last block offset - if (ctx->t[0] < ctx->c) // carry overflow - ctx->t[1]++; // high word - - while (ctx->c < 128) // fill up with zeros - ctx->b[ctx->c++] = 0; - blake2b_compress(ctx, 1); // final block flag = 1 - - // little endian convert and store - for (i = 0; i < ctx->outlen; i++) { - ((uint8_t *) out)[i] = - (ctx->h[i >> 3] >> (8 * (i & 7))) & 0xFF; - } -} - diff --git a/algo/blake/sph_blake2b.h b/algo/blake/sph_blake2b.h deleted file mode 100644 index eaae071..0000000 --- a/algo/blake/sph_blake2b.h +++ /dev/null @@ -1,41 +0,0 @@ -#pragma once -#ifndef __BLAKE2B_H__ -#define __BLAKE2B_H__ - -#include -#include - -#if defined(_MSC_VER) -#include -#define inline __inline -#define ALIGN(x) __declspec(align(x)) -#else -#define ALIGN(x) __attribute__((aligned(x))) -#endif - -#if defined(_MSC_VER) || defined(__x86_64__) || defined(__x86__) -#define NATIVE_LITTLE_ENDIAN -#endif - -// state context -ALIGN(64) typedef struct { - uint8_t b[128]; // input buffer - uint64_t h[8]; // chained state - uint64_t t[2]; // total number of bytes - size_t c; // pointer for b[] - size_t outlen; // digest size -} sph_blake2b_ctx; - -#if defined(__cplusplus) -extern "C" { -#endif - -int sph_blake2b_init( sph_blake2b_ctx *ctx, size_t outlen, const void *key, size_t keylen); -void sph_blake2b_update( sph_blake2b_ctx *ctx, const void *in, size_t inlen); -void sph_blake2b_final( sph_blake2b_ctx *ctx, void *out); - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/algo/blake/sse2/blake.c b/algo/blake/sse2/blake.c deleted file mode 100644 index 61529f3..0000000 --- a/algo/blake/sse2/blake.c +++ /dev/null @@ -1,476 +0,0 
@@ -/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */ -/* - * BLAKE implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ -#include -#include -#include - -#include "../sph_blake.h" - -#ifdef __cplusplus -extern "C"{ -#endif - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -static const sph_u64 blkIV512[8] = { - SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B), - SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1), - SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F), - SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179) -}; - -#define Z00 0 -#define Z01 1 -#define Z02 2 -#define Z03 3 -#define Z04 4 -#define Z05 5 -#define Z06 6 -#define Z07 7 -#define Z08 8 -#define Z09 9 -#define Z0A A -#define Z0B B -#define Z0C C -#define Z0D D -#define Z0E E -#define Z0F F - -#define Z10 E -#define Z11 A -#define Z12 4 -#define Z13 8 -#define Z14 9 -#define Z15 F -#define Z16 D -#define Z17 6 -#define Z18 1 -#define Z19 C -#define Z1A 0 -#define Z1B 2 -#define Z1C B -#define Z1D 7 -#define Z1E 5 -#define Z1F 3 - -#define Z20 B -#define Z21 8 -#define Z22 C -#define Z23 0 -#define Z24 5 -#define Z25 2 -#define Z26 F -#define Z27 D -#define Z28 A -#define Z29 E -#define Z2A 3 -#define Z2B 6 -#define Z2C 7 -#define Z2D 1 -#define Z2E 9 -#define Z2F 4 - -#define Z30 7 -#define Z31 9 -#define Z32 3 -#define Z33 1 -#define Z34 D -#define Z35 C -#define Z36 B -#define Z37 E -#define Z38 2 -#define Z39 6 -#define Z3A 5 -#define Z3B A -#define Z3C 4 -#define Z3D 0 -#define Z3E F -#define Z3F 8 - -#define Z40 9 -#define Z41 0 -#define Z42 5 -#define Z43 7 -#define Z44 2 -#define Z45 4 -#define Z46 A -#define Z47 F -#define Z48 E -#define Z49 1 -#define Z4A B -#define Z4B C -#define Z4C 6 -#define Z4D 8 -#define Z4E 3 -#define Z4F D - -#define Z50 2 -#define Z51 C -#define Z52 6 -#define Z53 A -#define Z54 0 -#define Z55 B -#define Z56 8 -#define Z57 3 -#define Z58 4 -#define Z59 D -#define Z5A 7 -#define Z5B 5 -#define Z5C F -#define Z5D E -#define Z5E 1 
-#define Z5F 9 - -#define Z60 C -#define Z61 5 -#define Z62 1 -#define Z63 F -#define Z64 E -#define Z65 D -#define Z66 4 -#define Z67 A -#define Z68 0 -#define Z69 7 -#define Z6A 6 -#define Z6B 3 -#define Z6C 9 -#define Z6D 2 -#define Z6E 8 -#define Z6F B - -#define Z70 D -#define Z71 B -#define Z72 7 -#define Z73 E -#define Z74 C -#define Z75 1 -#define Z76 3 -#define Z77 9 -#define Z78 5 -#define Z79 0 -#define Z7A F -#define Z7B 4 -#define Z7C 8 -#define Z7D 6 -#define Z7E 2 -#define Z7F A - -#define Z80 6 -#define Z81 F -#define Z82 E -#define Z83 9 -#define Z84 B -#define Z85 3 -#define Z86 0 -#define Z87 8 -#define Z88 C -#define Z89 2 -#define Z8A D -#define Z8B 7 -#define Z8C 1 -#define Z8D 4 -#define Z8E A -#define Z8F 5 - -#define Z90 A -#define Z91 2 -#define Z92 8 -#define Z93 4 -#define Z94 7 -#define Z95 6 -#define Z96 1 -#define Z97 5 -#define Z98 F -#define Z99 B -#define Z9A 9 -#define Z9B E -#define Z9C 3 -#define Z9D C -#define Z9E D -#define Z9F 0 - -#define Mx(r, i) Mx_(Z ## r ## i) -#define Mx_(n) Mx__(n) -#define Mx__(n) M ## n - -#define CSx(r, i) CSx_(Z ## r ## i) -#define CSx_(n) CSx__(n) -#define CSx__(n) CS ## n - -#define CS0 SPH_C32(0x243F6A88) -#define CS1 SPH_C32(0x85A308D3) -#define CS2 SPH_C32(0x13198A2E) -#define CS3 SPH_C32(0x03707344) -#define CS4 SPH_C32(0xA4093822) -#define CS5 SPH_C32(0x299F31D0) -#define CS6 SPH_C32(0x082EFA98) -#define CS7 SPH_C32(0xEC4E6C89) -#define CS8 SPH_C32(0x452821E6) -#define CS9 SPH_C32(0x38D01377) -#define CSA SPH_C32(0xBE5466CF) -#define CSB SPH_C32(0x34E90C6C) -#define CSC SPH_C32(0xC0AC29B7) -#define CSD SPH_C32(0xC97C50DD) -#define CSE SPH_C32(0x3F84D5B5) -#define CSF SPH_C32(0xB5470917) - - - -#define CBx(r, i) CBx_(Z ## r ## i) -#define CBx_(n) CBx__(n) -#define CBx__(n) CB ## n - -#define CB0 SPH_C64(0x243F6A8885A308D3) -#define CB1 SPH_C64(0x13198A2E03707344) -#define CB2 SPH_C64(0xA4093822299F31D0) -#define CB3 SPH_C64(0x082EFA98EC4E6C89) -#define CB4 SPH_C64(0x452821E638D01377) -#define 
CB5 SPH_C64(0xBE5466CF34E90C6C) -#define CB6 SPH_C64(0xC0AC29B7C97C50DD) -#define CB7 SPH_C64(0x3F84D5B5B5470917) -#define CB8 SPH_C64(0x9216D5D98979FB1B) -#define CB9 SPH_C64(0xD1310BA698DFB5AC) -#define CBA SPH_C64(0x2FFD72DBD01ADFB7) -#define CBB SPH_C64(0xB8E1AFED6A267E96) -#define CBC SPH_C64(0xBA7C9045F12C7F99) -#define CBD SPH_C64(0x24A19947B3916CF7) -#define CBE SPH_C64(0x0801F2E2858EFC16) -#define CBF SPH_C64(0x636920D871574E69) - - -#define GS(m0, m1, c0, c1, a, b, c, d) do { \ - a = SPH_T32(a + b + (m0 ^ c1)); \ - d = SPH_ROTR32(d ^ a, 16); \ - c = SPH_T32(c + d); \ - b = SPH_ROTR32(b ^ c, 12); \ - a = SPH_T32(a + b + (m1 ^ c0)); \ - d = SPH_ROTR32(d ^ a, 8); \ - c = SPH_T32(c + d); \ - b = SPH_ROTR32(b ^ c, 7); \ - } while (0) - -#define ROUND_S(r) do { \ - GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ - GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ - GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ - GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \ - GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \ - GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ - GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ - GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ - } while (0) - - - -#define GB(m0, m1, c0, c1, a, b, c, d) do { \ - a = SPH_T64(a + b + (m0 ^ c1)); \ - d = SPH_ROTR64(d ^ a, 32); \ - c = SPH_T64(c + d); \ - b = SPH_ROTR64(b ^ c, 25); \ - a = SPH_T64(a + b + (m1 ^ c0)); \ - d = SPH_ROTR64(d ^ a, 16); \ - c = SPH_T64(c + d); \ - b = SPH_ROTR64(b ^ c, 11); \ - } while (0) - -#define ROUND_B(r) do { \ - GB(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \ - GB(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \ - GB(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \ - GB(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \ - GB(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \ - 
GB(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \ - GB(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \ - GB(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \ - } while (0) - - -#define COMPRESS64 do { \ - int b=0; \ - sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; \ - sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; \ - sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \ - sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \ - V0 = blkH0, \ - V1 = blkH1, \ - V2 = blkH2, \ - V3 = blkH3, \ - V4 = blkH4, \ - V5 = blkH5, \ - V6 = blkH6, \ - V7 = blkH7; \ - V8 = blkS0 ^ CB0, \ - V9 = blkS1 ^ CB1, \ - VA = blkS2 ^ CB2, \ - VB = blkS3 ^ CB3, \ - VC = hashctA ^ CB4, \ - VD = hashctA ^ CB5, \ - VE = hashctB ^ CB6, \ - VF = hashctB ^ CB7; \ - M0 = sph_dec64be_aligned(buf + 0), \ - M1 = sph_dec64be_aligned(buf + 8), \ - M2 = sph_dec64be_aligned(buf + 16), \ - M3 = sph_dec64be_aligned(buf + 24), \ - M4 = sph_dec64be_aligned(buf + 32), \ - M5 = sph_dec64be_aligned(buf + 40), \ - M6 = sph_dec64be_aligned(buf + 48), \ - M7 = sph_dec64be_aligned(buf + 56), \ - M8 = sph_dec64be_aligned(buf + 64), \ - M9 = sph_dec64be_aligned(buf + 72), \ - MA = sph_dec64be_aligned(buf + 80), \ - MB = sph_dec64be_aligned(buf + 88), \ - MC = sph_dec64be_aligned(buf + 96), \ - MD = sph_dec64be_aligned(buf + 104), \ - ME = sph_dec64be_aligned(buf + 112), \ - MF = sph_dec64be_aligned(buf + 120); \ - /* loop once and a half */ \ - /* save some space */ \ - for (;;) { \ - ROUND_B(0); \ - ROUND_B(1); \ - ROUND_B(2); \ - ROUND_B(3); \ - ROUND_B(4); \ - ROUND_B(5); \ - if (b) break; \ - b = 1; \ - ROUND_B(6); \ - ROUND_B(7); \ - ROUND_B(8); \ - ROUND_B(9); \ - }; \ - blkH0 ^= blkS0 ^ V0 ^ V8, \ - blkH1 ^= blkS1 ^ V1 ^ V9, \ - blkH2 ^= blkS2 ^ V2 ^ VA, \ - blkH3 ^= blkS3 ^ V3 ^ VB, \ - blkH4 ^= blkS0 ^ V4 ^ VC, \ - blkH5 ^= blkS1 ^ V5 ^ VD, \ - blkH6 ^= blkS2 ^ V6 ^ VE, \ - blkH7 ^= blkS3 ^ V7 ^ VF; \ - } while (0) -/* -*/ -#define DECL_BLK \ - sph_u64 blkH0; \ - sph_u64 blkH1; \ - sph_u64 blkH2; \ - sph_u64 
blkH3; \ - sph_u64 blkH4; \ - sph_u64 blkH5; \ - sph_u64 blkH6; \ - sph_u64 blkH7; \ - sph_u64 blkS0; \ - sph_u64 blkS1; \ - sph_u64 blkS2; \ - sph_u64 blkS3; \ - -/* load initial constants */ -#define BLK_I \ -do { \ - blkH0 = SPH_C64(0x6A09E667F3BCC908); \ - blkH1 = SPH_C64(0xBB67AE8584CAA73B); \ - blkH2 = SPH_C64(0x3C6EF372FE94F82B); \ - blkH3 = SPH_C64(0xA54FF53A5F1D36F1); \ - blkH4 = SPH_C64(0x510E527FADE682D1); \ - blkH5 = SPH_C64(0x9B05688C2B3E6C1F); \ - blkH6 = SPH_C64(0x1F83D9ABFB41BD6B); \ - blkH7 = SPH_C64(0x5BE0CD19137E2179); \ - blkS0 = 0; \ - blkS1 = 0; \ - blkS2 = 0; \ - blkS3 = 0; \ - hashctB = SPH_T64(0- 1); \ -} while (0) - -/* copy in 80 for initial hash */ -#define BLK_W \ -do { \ - memcpy(hashbuf, input, 80); \ - hashctA = SPH_C64(0xFFFFFFFFFFFFFC00) + 80*8; \ - hashptr = 80; \ -} while (0) - -/* copy in 64 for looped hash */ -#define BLK_U \ -do { \ - memcpy(hashbuf, hash , 64); \ - hashctA = SPH_C64(0xFFFFFFFFFFFFFC00) + 64*8; \ - hashptr = 64; \ -} while (0) - -/* blake compress function */ -/* hash = blake512(loaded) */ -#define BLK_C \ -do { \ - \ - union { \ - unsigned char buf[128]; \ - sph_u64 dummy; \ - } u; \ - size_t ptr; \ - unsigned bit_len; \ - \ - ptr = hashptr; \ - bit_len = ((unsigned)ptr << 3) + 0; \ - u.buf[ptr] = ((0 & -(0x80)) | (0x80)) & 0xFF; \ - memset(u.buf + ptr + 1, 0, 111 - ptr); \ - u.buf[111] |= 1; \ - sph_enc64be_aligned(u.buf + 112, 0); \ - sph_enc64be_aligned(u.buf + 120, bit_len); \ - do { \ - const void *data = u.buf + ptr; \ - unsigned char *buf; \ - buf = hashbuf; \ - size_t clen; \ - clen = (sizeof(char)*128) - hashptr; \ - memcpy(buf + hashptr, data, clen); \ - hashctA = SPH_T64(hashctA + 1024); \ - hashctB = SPH_T64(hashctB + 1); \ - COMPRESS64; \ - } while (0); \ - /* end blake64(sc, u.buf + ptr, 128 - ptr); */ \ - sph_enc64be((unsigned char*)(hash) + (0 << 3), blkH0), \ - sph_enc64be((unsigned char*)(hash) + (1 << 3), blkH1); \ - sph_enc64be((unsigned char*)(hash) + (2 << 3), blkH2), \ - 
sph_enc64be((unsigned char*)(hash) + (3 << 3), blkH3); \ - sph_enc64be((unsigned char*)(hash) + (4 << 3), blkH4), \ - sph_enc64be((unsigned char*)(hash) + (5 << 3), blkH5); \ - sph_enc64be((unsigned char*)(hash) + (6 << 3), blkH6), \ - sph_enc64be((unsigned char*)(hash) + (7 << 3), blkH7); \ -} while (0) - - -#ifdef __cplusplus -} -#endif diff --git a/algo/blake/sse2/blake/sse41/api.h b/algo/blake/sse2/blake/sse41/api.h deleted file mode 100644 index 99fe592..0000000 --- a/algo/blake/sse2/blake/sse41/api.h +++ /dev/null @@ -1,2 +0,0 @@ -#define CRYPTO_BYTES 64 - diff --git a/algo/blake/sse2/blake/sse41/architectures b/algo/blake/sse2/blake/sse41/architectures deleted file mode 100644 index 331c040..0000000 --- a/algo/blake/sse2/blake/sse41/architectures +++ /dev/null @@ -1,2 +0,0 @@ -amd64 -x86 \ No newline at end of file diff --git a/algo/blake/sse2/blake/sse41/config.h b/algo/blake/sse2/blake/sse41/config.h deleted file mode 100644 index bde2040..0000000 --- a/algo/blake/sse2/blake/sse41/config.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __BLAKE512_CONFIG_H__ -#define __BLAKE512_CONFIG_H__ - -#define AVOID_BRANCHING 1 -//#define HAVE_XOP 1 - -#endif - diff --git a/algo/blake/sse2/blake/sse41/hash.c b/algo/blake/sse2/blake/sse41/hash.c deleted file mode 100644 index e5648fe..0000000 --- a/algo/blake/sse2/blake/sse41/hash.c +++ /dev/null @@ -1,287 +0,0 @@ - -#include "hash.h" -/* -#ifndef NOT_SUPERCOP - -#include "crypto_hash.h" -#include "crypto_uint64.h" -#include "crypto_uint32.h" -#include "crypto_uint8.h" - -typedef crypto_uint64 u64; -typedef crypto_uint32 u32; -typedef crypto_uint8 u8; - -#else - -typedef unsigned long long u64; -typedef unsigned int u32; -typedef unsigned char u8; - -#endif -*/ -#define U8TO32(p) \ - (((u32)((p)[0]) << 24) | ((u32)((p)[1]) << 16) | \ - ((u32)((p)[2]) << 8) | ((u32)((p)[3]) )) -#define U8TO64(p) \ - (((u64)U8TO32(p) << 32) | (u64)U8TO32((p) + 4)) -#define U32TO8(p, v) \ - (p)[0] = (u8)((v) >> 24); (p)[1] = (u8)((v) >> 16); \ - 
(p)[2] = (u8)((v) >> 8); (p)[3] = (u8)((v) ); -#define U64TO8(p, v) \ - U32TO8((p), (u32)((v) >> 32)); \ - U32TO8((p) + 4, (u32)((v) )); -/* -typedef struct -{ - __m128i h[4]; - u64 s[4], t[2]; - u32 buflen, nullt; - u8 buf[128]; -} state __attribute__ ((aligned (64))); -*/ -static const u8 padding[129] = -{ - 0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -}; - -static inline int blake512_compress( hashState_blake * state, const u8 * datablock ) -{ - - __m128i row1l,row1h; - __m128i row2l,row2h; - __m128i row3l,row3h; - __m128i row4l,row4h; - - const __m128i r16 = _mm_setr_epi8(2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9); - const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); - - __m128i m0, m1, m2, m3, m4, m5, m6, m7; - __m128i t0, t1, t2, t3, t4, t5, t6, t7; - __m128i b0, b1, b2, b3; - - m0 = _mm_loadu_si128((__m128i*)(datablock + 0)); - m1 = _mm_loadu_si128((__m128i*)(datablock + 16)); - m2 = _mm_loadu_si128((__m128i*)(datablock + 32)); - m3 = _mm_loadu_si128((__m128i*)(datablock + 48)); - m4 = _mm_loadu_si128((__m128i*)(datablock + 64)); - m5 = _mm_loadu_si128((__m128i*)(datablock + 80)); - m6 = _mm_loadu_si128((__m128i*)(datablock + 96)); - m7 = _mm_loadu_si128((__m128i*)(datablock + 112)); - - m0 = BSWAP64(m0); - m1 = BSWAP64(m1); - m2 = BSWAP64(m2); - m3 = BSWAP64(m3); - m4 = BSWAP64(m4); - m5 = BSWAP64(m5); - m6 = BSWAP64(m6); - m7 = BSWAP64(m7); - - row1l = state->h[0]; - row1h = state->h[1]; - row2l = state->h[2]; - row2h = state->h[3]; - row3l = _mm_set_epi64x(0x13198A2E03707344ULL, 0x243F6A8885A308D3ULL); - row3h = _mm_set_epi64x(0x082EFA98EC4E6C89ULL, 0xA4093822299F31D0ULL); - - row4l = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL); - row4h = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 
0xC0AC29B7C97C50DDULL); - -#ifdef AVOID_BRANCHING - do - { - const __m128i mask = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_set1_epi32(state->nullt)); - const __m128i xor1 = _mm_and_si128(_mm_set1_epi64x(state->t[0]), mask); - const __m128i xor2 = _mm_and_si128(_mm_set1_epi64x(state->t[1]), mask); - row4l = _mm_xor_si128(row4l, xor1); - row4h = _mm_xor_si128(row4h, xor2); - } while(0); -#else - if(!state->nullt) - { - row4l = _mm_xor_si128(row4l, _mm_set1_epi64x(state->t[0])); - row4h = _mm_xor_si128(row4h, _mm_set1_epi64x(state->t[1])); - } -#endif - - ROUND( 0); - ROUND( 1); - ROUND( 2); - ROUND( 3); - ROUND( 4); - ROUND( 5); - ROUND( 6); - ROUND( 7); - ROUND( 8); - ROUND( 9); - ROUND(10); - ROUND(11); - ROUND(12); - ROUND(13); - ROUND(14); - ROUND(15); - - row1l = _mm_xor_si128(row3l,row1l); - row1h = _mm_xor_si128(row3h,row1h); - - state->h[0] = _mm_xor_si128(row1l, state->h[0]); - state->h[1] = _mm_xor_si128(row1h, state->h[1]); - - row2l = _mm_xor_si128(row4l,row2l); - row2h = _mm_xor_si128(row4h,row2h); - - state->h[2] = _mm_xor_si128(row2l, state->h[2]); - state->h[3] = _mm_xor_si128(row2h, state->h[3]); - - return 0; -} - -static inline void blake512_init( hashState_blake * S, u64 databitlen ) -{ - memset(S, 0, sizeof(hashState_blake)); - S->h[0] = _mm_set_epi64x(0xBB67AE8584CAA73BULL, 0x6A09E667F3BCC908ULL); - S->h[1] = _mm_set_epi64x(0xA54FF53A5F1D36F1ULL, 0x3C6EF372FE94F82BULL); - S->h[2] = _mm_set_epi64x(0x9B05688C2B3E6C1FULL, 0x510E527FADE682D1ULL); - S->h[3] = _mm_set_epi64x(0x5BE0CD19137E2179ULL, 0x1F83D9ABFB41BD6BULL); - S->buflen = databitlen; -} - - -static void blake512_update( hashState_blake * S, const u8 * data, u64 datalen ) -{ - - - int left = (S->buflen >> 3); - int fill = 128 - left; - - if( left && ( ((datalen >> 3) & 0x7F) >= fill ) ) { - memcpy( (void *) (S->buf + left), (void *) data, fill ); - S->t[0] += 1024; - blake512_compress( S, S->buf ); - data += fill; - datalen -= (fill << 3); - left = 0; - } - - while( datalen >= 1024 ) { - 
S->t[0] += 1024; - blake512_compress( S, data ); - data += 128; - datalen -= 1024; - } - - if( datalen > 0 ) { - memcpy( (void *) (S->buf + left), (void *) data, ( datalen>>3 ) & 0x7F ); - S->buflen = (left<<3) + datalen; - } - else S->buflen=0; -} - -static inline void blake512_final( hashState_blake * S, u8 * digest ) -{ - - u8 msglen[16], zo=0x01,oo=0x81; - u64 lo=S->t[0] + S->buflen, hi = S->t[1]; - if ( lo < S->buflen ) hi++; - U64TO8( msglen + 0, hi ); - U64TO8( msglen + 8, lo ); - - if ( S->buflen == 888 ) /* one padding byte */ - { - S->t[0] -= 8; - blake512_update( S, &oo, 8 ); - } - else - { - if ( S->buflen < 888 ) /* enough space to fill the block */ - { - if ( S->buflen == 0 ) S->nullt=1; - S->t[0] -= 888 - S->buflen; - blake512_update( S, padding, 888 - S->buflen ); - } - else /* NOT enough space, need 2 compressions */ - { - S->t[0] -= 1024 - S->buflen; - blake512_update( S, padding, 1024 - S->buflen ); - S->t[0] -= 888; - blake512_update( S, padding+1, 888 ); - S->nullt = 1; - } - blake512_update( S, &zo, 8 ); - S->t[0] -= 8; - } - S->t[0] -= 128; - blake512_update( S, msglen, 128 ); - - do - { - const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); - _mm_storeu_si128((__m128i*)(digest + 0), BSWAP64(S->h[0])); - _mm_storeu_si128((__m128i*)(digest + 16), BSWAP64(S->h[1])); - _mm_storeu_si128((__m128i*)(digest + 32), BSWAP64(S->h[2])); - _mm_storeu_si128((__m128i*)(digest + 48), BSWAP64(S->h[3])); - } while(0); -} - -/* -int crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen ) -{ - - hashState_blake S; - blake512_init( &S ); - blake512_update( &S, in, inlen*8 ); - blake512_final( &S, out ); - return 0; -} -*/ -/* -#ifdef NOT_SUPERCOP - -int main() -{ - - int i, v; - u8 data[144], digest[64]; - u8 test1[]= {0x97, 0x96, 0x15, 0x87, 0xF6, 0xD9, 0x70, 0xFA, 0xBA, 0x6D, 0x24, 0x78, 0x04, 0x5D, 0xE6, 0xD1, - 0xFA, 0xBD, 0x09, 0xB6, 0x1A, 0xE5, 0x09, 0x32, 0x05, 0x4D, 0x52, 0xBC, 0x29, 
0xD3, 0x1B, 0xE4, - 0xFF, 0x91, 0x02, 0xB9, 0xF6, 0x9E, 0x2B, 0xBD, 0xB8, 0x3B, 0xE1, 0x3D, 0x4B, 0x9C, 0x06, 0x09, - 0x1E, 0x5F, 0xA0, 0xB4, 0x8B, 0xD0, 0x81, 0xB6, 0x34, 0x05, 0x8B, 0xE0, 0xEC, 0x49, 0xBE, 0xB3}; - u8 test2[]= {0x31, 0x37, 0x17, 0xD6, 0x08, 0xE9, 0xCF, 0x75, 0x8D, 0xCB, 0x1E, 0xB0, 0xF0, 0xC3, 0xCF, 0x9F, - 0xC1, 0x50, 0xB2, 0xD5, 0x00, 0xFB, 0x33, 0xF5, 0x1C, 0x52, 0xAF, 0xC9, 0x9D, 0x35, 0x8A, 0x2F, - 0x13, 0x74, 0xB8, 0xA3, 0x8B, 0xBA, 0x79, 0x74, 0xE7, 0xF6, 0xEF, 0x79, 0xCA, 0xB1, 0x6F, 0x22, - 0xCE, 0x1E, 0x64, 0x9D, 0x6E, 0x01, 0xAD, 0x95, 0x89, 0xC2, 0x13, 0x04, 0x5D, 0x54, 0x5D, 0xDE}; - - for(i=0; i<144; ++i) data[i]=0; - - crypto_hash( digest, data, 1 ); - v=0; - for(i=0; i<64; ++i) { - printf("%02X", digest[i]); - if ( digest[i] != test1[i]) v=1; - } - if (v) printf("\nerror\n"); - else printf("\nok\n"); - - for(i=0; i<144; ++i) data[i]=0; - - crypto_hash( digest, data, 144 ); - v=0; - for(i=0; i<64; ++i) { - printf("%02X", digest[i]); - if ( digest[i] != test2[i]) v=1; - } - if (v) printf("\nerror\n"); - else printf("\nok\n"); - - return 0; -} - -#endif - -*/ - - diff --git a/algo/blake/sse2/blake/sse41/hash.h b/algo/blake/sse2/blake/sse41/hash.h deleted file mode 100644 index 29758b4..0000000 --- a/algo/blake/sse2/blake/sse41/hash.h +++ /dev/null @@ -1,74 +0,0 @@ - -#include -#include -#include -#include - -#include "config.h" -#include "rounds.h" -/* -#ifndef NOT_SUPERCOP - -#include "crypto_hash.h" -#include "crypto_uint64.h" -#include "crypto_uint32.h" -#include "crypto_uint8.h" - -typedef crypto_uint64 u64; -typedef crypto_uint32 u32; -typedef crypto_uint8 u8; - -#else -*/ -typedef unsigned long long u64; -typedef unsigned int u32; -typedef unsigned char u8; - -typedef struct -{ - __m128i h[4]; - u64 s[4], t[2]; - u32 buflen, nullt; - u8 buf[128]; -} hashState_blake __attribute__ ((aligned (64))); -/* -#endif - -#define U8TO32(p) \ - (((u32)((p)[0]) << 24) | ((u32)((p)[1]) << 16) | \ - ((u32)((p)[2]) << 8) | ((u32)((p)[3]) )) 
-#define U8TO64(p) \ - (((u64)U8TO32(p) << 32) | (u64)U8TO32((p) + 4)) -#define U32TO8(p, v) \ - (p)[0] = (u8)((v) >> 24); (p)[1] = (u8)((v) >> 16); \ - (p)[2] = (u8)((v) >> 8); (p)[3] = (u8)((v) ); -#define U64TO8(p, v) \ - U32TO8((p), (u32)((v) >> 32)); \ - U32TO8((p) + 4, (u32)((v) )); -*/ - -/* -static const u8 padding[129] = -{ - 0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -}; - -*/ -static inline void blake512_init( hashState_blake * S, u64 datalen ); - - -static void blake512_update( hashState_blake * S, const u8 * data, u64 datalen ) ; - -static inline void blake512_final( hashState_blake * S, u8 * digest ) ; - - -int crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen ) ; - - - - - - diff --git a/algo/blake/sse2/blake/sse41/implementors b/algo/blake/sse2/blake/sse41/implementors deleted file mode 100644 index 2fbd178..0000000 --- a/algo/blake/sse2/blake/sse41/implementors +++ /dev/null @@ -1,2 +0,0 @@ -Jean-Philippe Aumasson -Samuel Neves diff --git a/algo/blake/sse2/blake/sse41/rounds.h b/algo/blake/sse2/blake/sse41/rounds.h deleted file mode 100644 index 303bd11..0000000 --- a/algo/blake/sse2/blake/sse41/rounds.h +++ /dev/null @@ -1,871 +0,0 @@ - -#ifndef __BLAKE512_ROUNDS_H__ -#define __BLAKE512_ROUNDS_H__ - -#ifndef HAVE_XOP - #define BSWAP64(x) _mm_shuffle_epi8((x), u8to64) - - #define _mm_roti_epi64(x, c) \ - (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ - : (-(c) == 16) ? 
_mm_shuffle_epi8((x), r16) \ - : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-c))) -#else - #define BSWAP64(x) _mm_perm_epi8((x),(x),u8to64) -#endif - - -#define LOAD_MSG_0_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m0, m1); \ -t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x13198A2E03707344ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m2, m3); \ -t3 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xBE5466CF34E90C6CULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_0_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m0, m1); \ -t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0x243F6A8885A308D3ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m2, m3); \ -t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x452821E638D01377ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_0_3(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m4, m5); \ -t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xD1310BA698DFB5ACULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m6, m7); \ -t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x24A19947B3916CF7ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_0_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m4, m5); \ -t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x9216D5D98979FB1BULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m6, m7); \ -t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_1_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m7, m2); \ -t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m4, m6); \ -t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x636920D871574E69ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_1_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m5, m4); \ -t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \ -b0 = 
_mm_xor_si128(t0, t1); \ -t2 = _mm_alignr_epi8(m3, m7, 8); \ -t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xD1310BA698DFB5ACULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_1_3(b0, b1) \ -do \ -{ \ -t0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ -t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0xBA7C9045F12C7F99ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m5, m2); \ -t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_1_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m6, m1); \ -t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x13198A2E03707344ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m3, m1); \ -t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xB8E1AFED6A267E96ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_2_1(b0, b1) \ -do \ -{ \ -t0 = _mm_alignr_epi8(m6, m5, 8); \ -t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x9216D5D98979FB1BULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m2, m7); \ -t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xA4093822299F31D0ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_2_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m4, m0); \ -t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0xB8E1AFED6A267E96ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m1, m6, 0xF0); \ -t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_2_3(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m5, m1, 0xF0); \ -t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x801F2E2858EFC16ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m3, m4); \ -t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x13198A2E03707344ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_2_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m7, m3); \ -t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 
0x2FFD72DBD01ADFB7ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_alignr_epi8(m2, m0, 8); \ -t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x3F84D5B5B5470917ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_3_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m3, m1); \ -t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xD1310BA698DFB5ACULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m6, m5); \ -t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_3_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m4, m0); \ -t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m6, m7); \ -t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x24A19947B3916CF7ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_3_3(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m1, m2, 0xF0); \ -t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m2, m7, 0xF0); \ -t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_3_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m3, m5); \ -t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xA4093822299F31D0ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m0, m4); \ -t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_4_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m4, m2); \ -t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x243F6A8885A308D3ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m1, m5); \ -t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_4_2(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m0, m3, 0xF0); \ -t1 = 
_mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xD1310BA698DFB5ACULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m2, m7, 0xF0); \ -t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xA4093822299F31D0ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_4_3(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m7, m5, 0xF0); \ -t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0x13198A2E03707344ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m3, m1, 0xF0); \ -t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x9216D5D98979FB1BULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_4_4(b0, b1) \ -do \ -{ \ -t0 = _mm_alignr_epi8(m6, m0, 8); \ -t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x801F2E2858EFC16ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m4, m6, 0xF0); \ -t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xC0AC29B7C97C50DDULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_5_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m1, m3); \ -t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xBA7C9045F12C7F99ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m0, m4); \ -t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xB8E1AFED6A267E96ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_5_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m6, m5); \ -t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0xA4093822299F31D0ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m5, m1); \ -t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_5_3(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m2, m3, 0xF0); \ -t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x24A19947B3916CF7ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m7, m0); \ -t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x801F2E2858EFC16ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_5_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m6, 
m2); \ -t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x452821E638D01377ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m7, m4, 0xF0); \ -t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x636920D871574E69ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_6_1(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m6, m0, 0xF0); \ -t1 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m7, m2); \ -t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x24A19947B3916CF7ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_6_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m2, m7); \ -t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xBA7C9045F12C7F99ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_alignr_epi8(m5, m6, 8); \ -t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_6_3(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m0, m3); \ -t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \ -t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xA4093822299F31D0ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_6_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m3, m1); \ -t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x243F6A8885A308D3ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m1, m5, 0xF0); \ -t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0xD1310BA698DFB5ACULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_7_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m6, m3); \ -t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xB8E1AFED6A267E96ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m6, m1, 0xF0); \ -t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x13198A2E03707344ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_7_2(b0, b1) \ -do \ -{ \ -t0 
= _mm_alignr_epi8(m7, m5, 8); \ -t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x24A19947B3916CF7ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m0, m4); \ -t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xBA7C9045F12C7F99ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_7_3(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m2, m7); \ -t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x243F6A8885A308D3ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m4, m1); \ -t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_7_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m0, m2); \ -t1 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m3, m5); \ -t3 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0x9216D5D98979FB1BULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_8_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m3, m7); \ -t1 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x636920D871574E69ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_alignr_epi8(m0, m5, 8); \ -t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x82EFA98EC4E6C89ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_8_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m7, m4); \ -t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xC0AC29B7C97C50DDULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_alignr_epi8(m4, m1, 8); \ -t3 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0xB8E1AFED6A267E96ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_8_3(b0, b1) \ -do \ -{ \ -t0 = m6; \ -t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xA4093822299F31D0ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_alignr_epi8(m5, m0, 8); \ -t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_8_4(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m1, m3, 
0xF0); \ -t1 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xBA7C9045F12C7F99ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = m2; \ -t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x13198A2E03707344ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_9_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m5, m4); \ -t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0xA4093822299F31D0ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m3, m0); \ -t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xC0AC29B7C97C50DDULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_9_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m1, m2); \ -t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m3, m2, 0xF0); \ -t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x3F84D5B5B5470917ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_9_3(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m7, m4); \ -t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xB8E1AFED6A267E96ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m1, m6); \ -t3 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0xBA7C9045F12C7F99ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_9_4(b0, b1) \ -do \ -{ \ -t0 = _mm_alignr_epi8(m7, m5, 8); \ -t1 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x636920D871574E69ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m6, m0); \ -t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x82EFA98EC4E6C89ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_10_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m0, m1); \ -t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x13198A2E03707344ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m2, m3); \ -t3 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xBE5466CF34E90C6CULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_10_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m0, m1); \ -t1 = 
_mm_set_epi64x(0xA4093822299F31D0ULL, 0x243F6A8885A308D3ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m2, m3); \ -t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x452821E638D01377ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_10_3(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m4, m5); \ -t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xD1310BA698DFB5ACULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m6, m7); \ -t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x24A19947B3916CF7ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_10_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m4, m5); \ -t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x9216D5D98979FB1BULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m6, m7); \ -t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_11_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m7, m2); \ -t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m4, m6); \ -t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x636920D871574E69ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_11_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m5, m4); \ -t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_alignr_epi8(m3, m7, 8); \ -t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xD1310BA698DFB5ACULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_11_3(b0, b1) \ -do \ -{ \ -t0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ -t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0xBA7C9045F12C7F99ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m5, m2); \ -t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_11_4(b0, b1) \ -do \ -{ \ -t0 = 
_mm_unpacklo_epi64(m6, m1); \ -t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x13198A2E03707344ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m3, m1); \ -t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xB8E1AFED6A267E96ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_12_1(b0, b1) \ -do \ -{ \ -t0 = _mm_alignr_epi8(m6, m5, 8); \ -t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x9216D5D98979FB1BULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m2, m7); \ -t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xA4093822299F31D0ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_12_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m4, m0); \ -t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0xB8E1AFED6A267E96ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m1, m6, 0xF0); \ -t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_12_3(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m5, m1, 0xF0); \ -t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x801F2E2858EFC16ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m3, m4); \ -t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x13198A2E03707344ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_12_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m7, m3); \ -t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x2FFD72DBD01ADFB7ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_alignr_epi8(m2, m0, 8); \ -t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x3F84D5B5B5470917ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_13_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m3, m1); \ -t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xD1310BA698DFB5ACULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m6, m5); \ -t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_13_2(b0, b1) \ -do \ -{ 
\ -t0 = _mm_unpackhi_epi64(m4, m0); \ -t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m6, m7); \ -t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x24A19947B3916CF7ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_13_3(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m1, m2, 0xF0); \ -t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m2, m7, 0xF0); \ -t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_13_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m3, m5); \ -t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xA4093822299F31D0ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m0, m4); \ -t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_14_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m4, m2); \ -t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x243F6A8885A308D3ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m1, m5); \ -t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_14_2(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m0, m3, 0xF0); \ -t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xD1310BA698DFB5ACULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m2, m7, 0xF0); \ -t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xA4093822299F31D0ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_14_3(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m7, m5, 0xF0); \ -t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0x13198A2E03707344ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m3, m1, 0xF0); \ -t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x9216D5D98979FB1BULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define 
LOAD_MSG_14_4(b0, b1) \ -do \ -{ \ -t0 = _mm_alignr_epi8(m6, m0, 8); \ -t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x801F2E2858EFC16ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m4, m6, 0xF0); \ -t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xC0AC29B7C97C50DDULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_15_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m1, m3); \ -t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xBA7C9045F12C7F99ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m0, m4); \ -t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xB8E1AFED6A267E96ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_15_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m6, m5); \ -t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0xA4093822299F31D0ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m5, m1); \ -t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_15_3(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m2, m3, 0xF0); \ -t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x24A19947B3916CF7ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m7, m0); \ -t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x801F2E2858EFC16ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_15_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m6, m2); \ -t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x452821E638D01377ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m7, m4, 0xF0); \ -t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x636920D871574E69ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - - - - - -#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ - \ - row4l = _mm_xor_si128(row4l, row1l); \ - row4h = _mm_xor_si128(row4h, row1h); \ - \ - row4l = _mm_roti_epi64(row4l, -32); \ - 
row4h = _mm_roti_epi64(row4h, -32); \ - \ - row3l = _mm_add_epi64(row3l, row4l); \ - row3h = _mm_add_epi64(row3h, row4h); \ - \ - row2l = _mm_xor_si128(row2l, row3l); \ - row2h = _mm_xor_si128(row2h, row3h); \ - \ - row2l = _mm_roti_epi64(row2l, -25); \ - row2h = _mm_roti_epi64(row2h, -25); \ - -#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ - \ - row4l = _mm_xor_si128(row4l, row1l); \ - row4h = _mm_xor_si128(row4h, row1h); \ - \ - row4l = _mm_roti_epi64(row4l, -16); \ - row4h = _mm_roti_epi64(row4h, -16); \ - \ - row3l = _mm_add_epi64(row3l, row4l); \ - row3h = _mm_add_epi64(row3h, row4h); \ - \ - row2l = _mm_xor_si128(row2l, row3l); \ - row2h = _mm_xor_si128(row2h, row3h); \ - \ - row2l = _mm_roti_epi64(row2l, -11); \ - row2h = _mm_roti_epi64(row2h, -11); \ - - -#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ - t0 = _mm_alignr_epi8(row2h, row2l, 8); \ - t1 = _mm_alignr_epi8(row2l, row2h, 8); \ - row2l = t0; \ - row2h = t1; \ - \ - t0 = row3l; \ - row3l = row3h; \ - row3h = t0; \ - \ - t0 = _mm_alignr_epi8(row4h, row4l, 8); \ - t1 = _mm_alignr_epi8(row4l, row4h, 8); \ - row4l = t1; \ - row4h = t0; - -#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ - t0 = _mm_alignr_epi8(row2l, row2h, 8); \ - t1 = _mm_alignr_epi8(row2h, row2l, 8); \ - row2l = t0; \ - row2h = t1; \ - \ - t0 = row3l; \ - row3l = row3h; \ - row3h = t0; \ - \ - t0 = _mm_alignr_epi8(row4l, row4h, 8); \ - t1 = _mm_alignr_epi8(row4h, row4l, 8); \ - row4l = t1; \ - row4h = t0; - -#define ROUND(r) \ - LOAD_MSG_ ##r ##_1(b0, b1); \ - G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ - LOAD_MSG_ ##r ##_2(b0, b1); \ - G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ - DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ - LOAD_MSG_ ##r ##_3(b0, b1); \ - 
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ - LOAD_MSG_ ##r ##_4(b0, b1); \ - G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ - UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); - -#endif - diff --git a/algo/bmw/bmw-hash-4way.h b/algo/bmw/bmw-hash-4way.h deleted file mode 100644 index dcdb41d..0000000 --- a/algo/bmw/bmw-hash-4way.h +++ /dev/null @@ -1,144 +0,0 @@ -/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */ -/** - * BMW interface. BMW (aka "Blue Midnight Wish") is a family of - * functions which differ by their output size; this implementation - * defines BMW for output sizes 224, 256, 384 and 512 bits. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @file sph_bmw.h - * @author Thomas Pornin - */ - -#ifndef BMW_HASH_H__ -#define BMW_HASH_H__ - -#ifdef __cplusplus -extern "C"{ -#endif - -#include - -#include "algo/sha/sph_types.h" -#include "simd-utils.h" - -#define SPH_SIZE_bmw256 256 - -#define SPH_SIZE_bmw512 512 - -#if defined(__SSE2__) - -// BMW-256 4 way 32 - -typedef struct { - __m128i buf[64]; - __m128i H[16]; - size_t ptr; - sph_u32 bit_count; // assume bit_count fits in 32 bits -} bmw_4way_small_context; - -typedef bmw_4way_small_context bmw256_4way_context; - -void bmw256_4way_init(void *cc); - -void bmw256_4way(void *cc, const void *data, size_t len); - -void bmw256_4way_close(void *cc, void *dst); - -void bmw256_4way_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - -#endif // __SSE2__ - -#if defined(__AVX2__) - -// BMW-256 8 way 32 - -typedef struct { - __m256i buf[64]; - __m256i H[16]; - size_t ptr; - uint32_t bit_count; // assume bit_count fits in 32 bits -} bmw_8way_small_context __attribute__ ((aligned (64))); - -typedef bmw_8way_small_context bmw256_8way_context; - -void bmw256_8way_init( bmw256_8way_context *ctx ); -void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len ); -void bmw256_8way_close( bmw256_8way_context *ctx, void *dst ); - -#endif - - -#if defined(__SSE2__) - -// BMW-512 2 way 64 - -typedef struct { - __m128i buf[16]; - __m128i H[16]; - size_t ptr; - uint64_t bit_count; -} bmw_2way_big_context __attribute__ ((aligned (64))); - -typedef bmw_2way_big_context bmw512_2way_context; - -void bmw512_2way_init( bmw512_2way_context *ctx ); -void bmw512_2way( bmw512_2way_context *ctx, const void *data, size_t len ); -void bmw512_2way_close( bmw512_2way_context *ctx, void *dst ); - -#endif // __SSE2__ - -#if defined(__AVX2__) - -// BMW-512 4 way 64 - -typedef struct { - __m256i buf[16]; - __m256i H[16]; - size_t ptr; - sph_u64 bit_count; -} bmw_4way_big_context; - 
-typedef bmw_4way_big_context bmw512_4way_context; - - -void bmw512_4way_init(void *cc); - -void bmw512_4way(void *cc, const void *data, size_t len); - -void bmw512_4way_close(void *cc, void *dst); - -void bmw512_4way_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - -#endif // __AVX2__ - -#ifdef __cplusplus -} -#endif - -#endif // BMW_HASH_H__ diff --git a/algo/bmw/bmw256-hash-4way.c b/algo/bmw/bmw256-hash-4way.c deleted file mode 100644 index 8f785e3..0000000 --- a/algo/bmw/bmw256-hash-4way.c +++ /dev/null @@ -1,1109 +0,0 @@ -/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */ -/* - * BMW implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include -#include -#include -#include "bmw-hash-4way.h" - -#ifdef __cplusplus -extern "C"{ -#endif - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -#define LPAR ( - -#if defined(__SSE2__) - -// BMW-256 4 way 32 - -static const uint32_t IV256[] = { - 0x40414243, 0x44454647, - 0x48494A4B, 0x4C4D4E4F, - 0x50515253, 0x54555657, - 0x58595A5B, 0x5C5D5E5F, - 0x60616263, 0x64656667, - 0x68696A6B, 0x6C6D6E6F, - 0x70717273, 0x74757677, - 0x78797A7B, 0x7C7D7E7F -}; - -#define ss0(x) \ - _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \ - _mm_slli_epi32( (x), 3) ), \ - _mm_xor_si128( mm128_rol_32( (x), 4), \ - mm128_rol_32( (x), 19) ) ) - -#define ss1(x) \ - _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \ - _mm_slli_epi32( (x), 2) ), \ - _mm_xor_si128( mm128_rol_32( (x), 8), \ - mm128_rol_32( (x), 23) ) ) - -#define ss2(x) \ - _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \ - _mm_slli_epi32( (x), 1) ), \ - _mm_xor_si128( mm128_rol_32( (x), 12), \ - mm128_rol_32( (x), 25) ) ) - -#define ss3(x) \ - _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \ - _mm_slli_epi32( (x), 2) ), \ - _mm_xor_si128( mm128_rol_32( (x), 15), \ - mm128_rol_32( (x), 29) ) ) - -#define ss4(x) \ - _mm_xor_si128( (x), _mm_srli_epi32( (x), 1 ) ) - -#define ss5(x) \ - _mm_xor_si128( (x), _mm_srli_epi32( (x), 2 ) ) - -#define rs1(x) mm128_rol_32( x, 3 ) -#define rs2(x) mm128_rol_32( x, 7 ) -#define rs3(x) mm128_rol_32( x, 13 ) -#define rs4(x) mm128_rol_32( x, 16 ) -#define rs5(x) mm128_rol_32( x, 19 ) -#define rs6(x) mm128_rol_32( x, 23 ) -#define rs7(x) mm128_rol_32( x, 27 ) - -#define rol_off_32( M, j, off ) \ - mm128_rol_32( M[ ( (j) + (off) ) & 0xF ] , \ - ( ( (j) + (off) ) & 0xF ) + 1 ) - -#define add_elt_s( M, H, j ) \ - _mm_xor_si128( \ - _mm_add_epi32( \ - _mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \ - rol_off_32( M, j, 3 ) ), \ - 
rol_off_32( M, j, 10 ) ), \ - _mm_set1_epi32( ( (j)+16 ) * SPH_C32(0x05555555UL) ) ), \ - H[ ( (j)+7 ) & 0xF ] ) - - -#define expand1s( qt, M, H, i ) \ - _mm_add_epi32( mm128_add4_32( \ - mm128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \ - ss3( qt[ (i)-14 ] ), ss0( qt[ (i)-13 ] ) ), \ - mm128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \ - ss3( qt[ (i)-10 ] ), ss0( qt[ (i)- 9 ] ) ), \ - mm128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \ - ss3( qt[ (i)- 6 ] ), ss0( qt[ (i)- 5 ] ) ), \ - mm128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \ - ss3( qt[ (i)- 2 ] ), ss0( qt[ (i)- 1 ] ) ) ), \ - add_elt_s( M, H, (i)-16 ) ) - -#define expand2s( qt, M, H, i) \ - _mm_add_epi32( mm128_add4_32( \ - mm128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \ - qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ), \ - mm128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \ - qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ), \ - mm128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \ - qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ), \ - mm128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \ - ss4( qt[ (i)- 2 ] ), ss5( qt[ (i)- 1 ] ) ) ), \ - add_elt_s( M, H, (i)-16 ) ) - -#define Ws0 \ - _mm_add_epi32( \ - _mm_add_epi32( \ - _mm_add_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \ - _mm_xor_si128( M[ 7], H[ 7] ) ), \ - _mm_xor_si128( M[10], H[10] ) ), \ - _mm_xor_si128( M[13], H[13] ) ), \ - _mm_xor_si128( M[14], H[14] ) ) - -#define Ws1 \ - _mm_sub_epi32( \ - _mm_add_epi32( \ - _mm_add_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \ - _mm_xor_si128( M[ 8], H[ 8] ) ), \ - _mm_xor_si128( M[11], H[11] ) ), \ - _mm_xor_si128( M[14], H[14] ) ), \ - _mm_xor_si128( M[15], H[15] ) ) - -#define Ws2 \ - _mm_add_epi32( \ - _mm_sub_epi32( \ - _mm_add_epi32( \ - _mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \ - _mm_xor_si128( M[ 7], H[ 7] ) ), \ - _mm_xor_si128( M[ 9], H[ 9] ) ), \ - _mm_xor_si128( M[12], H[12] ) ), \ - _mm_xor_si128( M[15], H[15] ) ) - -#define Ws3 \ - _mm_add_epi32( \ - 
_mm_sub_epi32( \ - _mm_add_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \ - _mm_xor_si128( M[ 1], H[ 1] ) ), \ - _mm_xor_si128( M[ 8], H[ 8] ) ), \ - _mm_xor_si128( M[10], H[10] ) ), \ - _mm_xor_si128( M[13], H[13] ) ) - -#define Ws4 \ - _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_add_epi32( \ - _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \ - _mm_xor_si128( M[ 2], H[ 2] ) ), \ - _mm_xor_si128( M[ 9], H[ 9] ) ), \ - _mm_xor_si128( M[11], H[11] ) ), \ - _mm_xor_si128( M[14], H[14] ) ) - -#define Ws5 \ - _mm_add_epi32( \ - _mm_sub_epi32( \ - _mm_add_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \ - _mm_xor_si128( M[ 2], H[ 2] ) ), \ - _mm_xor_si128( M[10], H[10] ) ), \ - _mm_xor_si128( M[12], H[12] ) ), \ - _mm_xor_si128( M[15], H[15] ) ) - -#define Ws6 \ - _mm_add_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \ - _mm_xor_si128( M[ 0], H[ 0] ) ), \ - _mm_xor_si128( M[ 3], H[ 3] ) ), \ - _mm_xor_si128( M[11], H[11] ) ), \ - _mm_xor_si128( M[13], H[13] ) ) - -#define Ws7 \ - _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \ - _mm_xor_si128( M[ 4], H[ 4] ) ), \ - _mm_xor_si128( M[ 5], H[ 5] ) ), \ - _mm_xor_si128( M[12], H[12] ) ), \ - _mm_xor_si128( M[14], H[14] ) ) - -#define Ws8 \ - _mm_sub_epi32( \ - _mm_add_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \ - _mm_xor_si128( M[ 5], H[ 5] ) ), \ - _mm_xor_si128( M[ 6], H[ 6] ) ), \ - _mm_xor_si128( M[13], H[13] ) ), \ - _mm_xor_si128( M[15], H[15] ) ) - -#define Ws9 \ - _mm_add_epi32( \ - _mm_sub_epi32( \ - _mm_add_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \ - _mm_xor_si128( M[ 3], H[ 3] ) ), \ - _mm_xor_si128( M[ 6], H[ 6] ) ), \ - _mm_xor_si128( M[ 7], H[ 7] ) ), \ - _mm_xor_si128( M[14], H[14] ) ) - -#define Ws10 \ - _mm_add_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \ - _mm_xor_si128( M[ 1], H[ 1] 
) ), \ - _mm_xor_si128( M[ 4], H[ 4] ) ), \ - _mm_xor_si128( M[ 7], H[ 7] ) ), \ - _mm_xor_si128( M[15], H[15] ) ) - -#define Ws11 \ - _mm_add_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \ - _mm_xor_si128( M[ 0], H[ 0] ) ), \ - _mm_xor_si128( M[ 2], H[ 2] ) ), \ - _mm_xor_si128( M[ 5], H[ 5] ) ), \ - _mm_xor_si128( M[ 9], H[ 9] ) ) - -#define Ws12 \ - _mm_add_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \ - _mm_xor_si128( M[ 3], H[ 3] ) ), \ - _mm_xor_si128( M[ 6], H[ 6] ) ), \ - _mm_xor_si128( M[ 9], H[ 9] ) ), \ - _mm_xor_si128( M[10], H[10] ) ) - -#define Ws13 \ - _mm_add_epi32( \ - _mm_add_epi32( \ - _mm_add_epi32( \ - _mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \ - _mm_xor_si128( M[ 4], H[ 4] ) ), \ - _mm_xor_si128( M[ 7], H[ 7] ) ), \ - _mm_xor_si128( M[10], H[10] ) ), \ - _mm_xor_si128( M[11], H[11] ) ) - -#define Ws14 \ - _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_add_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \ - _mm_xor_si128( M[ 5], H[ 5] ) ), \ - _mm_xor_si128( M[ 8], H[ 8] ) ), \ - _mm_xor_si128( M[11], H[11] ) ), \ - _mm_xor_si128( M[12], H[12] ) ) - -#define Ws15 \ - _mm_add_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \ - _mm_xor_si128( M[ 4], H[ 4] ) ), \ - _mm_xor_si128( M[ 6], H[ 6] ) ), \ - _mm_xor_si128( M[ 9], H[ 9] ) ), \ - _mm_xor_si128( M[13], H[13] ) ) - - -void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] ) -{ - __m128i qt[32], xl, xh; \ - - qt[ 0] = _mm_add_epi32( ss0( Ws0 ), H[ 1] ); - qt[ 1] = _mm_add_epi32( ss1( Ws1 ), H[ 2] ); - qt[ 2] = _mm_add_epi32( ss2( Ws2 ), H[ 3] ); - qt[ 3] = _mm_add_epi32( ss3( Ws3 ), H[ 4] ); - qt[ 4] = _mm_add_epi32( ss4( Ws4 ), H[ 5] ); - qt[ 5] = _mm_add_epi32( ss0( Ws5 ), H[ 6] ); - qt[ 6] = _mm_add_epi32( ss1( Ws6 ), H[ 7] ); - qt[ 7] = _mm_add_epi32( ss2( Ws7 ), H[ 8] ); - qt[ 8] = _mm_add_epi32( ss3( Ws8 ), H[ 9] 
); - qt[ 9] = _mm_add_epi32( ss4( Ws9 ), H[10] ); - qt[10] = _mm_add_epi32( ss0( Ws10), H[11] ); - qt[11] = _mm_add_epi32( ss1( Ws11), H[12] ); - qt[12] = _mm_add_epi32( ss2( Ws12), H[13] ); - qt[13] = _mm_add_epi32( ss3( Ws13), H[14] ); - qt[14] = _mm_add_epi32( ss4( Ws14), H[15] ); - qt[15] = _mm_add_epi32( ss0( Ws15), H[ 0] ); - qt[16] = expand1s( qt, M, H, 16 ); - qt[17] = expand1s( qt, M, H, 17 ); - qt[18] = expand2s( qt, M, H, 18 ); - qt[19] = expand2s( qt, M, H, 19 ); - qt[20] = expand2s( qt, M, H, 20 ); - qt[21] = expand2s( qt, M, H, 21 ); - qt[22] = expand2s( qt, M, H, 22 ); - qt[23] = expand2s( qt, M, H, 23 ); - qt[24] = expand2s( qt, M, H, 24 ); - qt[25] = expand2s( qt, M, H, 25 ); - qt[26] = expand2s( qt, M, H, 26 ); - qt[27] = expand2s( qt, M, H, 27 ); - qt[28] = expand2s( qt, M, H, 28 ); - qt[29] = expand2s( qt, M, H, 29 ); - qt[30] = expand2s( qt, M, H, 30 ); - qt[31] = expand2s( qt, M, H, 31 ); - - xl = _mm_xor_si128( mm128_xor4( qt[16], qt[17], qt[18], qt[19] ), - mm128_xor4( qt[20], qt[21], qt[22], qt[23] ) ); - xh = _mm_xor_si128( xl, _mm_xor_si128( - mm128_xor4( qt[24], qt[25], qt[26], qt[27] ), - mm128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); - - dH[ 0] = _mm_add_epi32( - _mm_xor_si128( M[0], - _mm_xor_si128( _mm_slli_epi32( xh, 5 ), - _mm_srli_epi32( qt[16], 5 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] )); - dH[ 1] = _mm_add_epi32( - _mm_xor_si128( M[1], - _mm_xor_si128( _mm_srli_epi32( xh, 7 ), - _mm_slli_epi32( qt[17], 8 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] )); - dH[ 2] = _mm_add_epi32( - _mm_xor_si128( M[2], - _mm_xor_si128( _mm_srli_epi32( xh, 5 ), - _mm_slli_epi32( qt[18], 5 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] )); - dH[ 3] = _mm_add_epi32( - _mm_xor_si128( M[3], - _mm_xor_si128( _mm_srli_epi32( xh, 1 ), - _mm_slli_epi32( qt[19], 5 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] )); - dH[ 4] = _mm_add_epi32( - _mm_xor_si128( M[4], - _mm_xor_si128( 
_mm_srli_epi32( xh, 3 ), - _mm_slli_epi32( qt[20], 0 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] )); - dH[ 5] = _mm_add_epi32( - _mm_xor_si128( M[5], - _mm_xor_si128( _mm_slli_epi32( xh, 6 ), - _mm_srli_epi32( qt[21], 6 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] )); - dH[ 6] = _mm_add_epi32( - _mm_xor_si128( M[6], - _mm_xor_si128( _mm_srli_epi32( xh, 4 ), - _mm_slli_epi32( qt[22], 6 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] )); - dH[ 7] = _mm_add_epi32( - _mm_xor_si128( M[7], - _mm_xor_si128( _mm_srli_epi32( xh, 11 ), - _mm_slli_epi32( qt[23], 2 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] )); - dH[ 8] = _mm_add_epi32( _mm_add_epi32( - mm128_rol_32( dH[4], 9 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] )), - _mm_xor_si128( _mm_slli_epi32( xl, 8 ), - _mm_xor_si128( qt[23], qt[ 8] ) ) ); - dH[ 9] = _mm_add_epi32( _mm_add_epi32( - mm128_rol_32( dH[5], 10 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] )), - _mm_xor_si128( _mm_srli_epi32( xl, 6 ), - _mm_xor_si128( qt[16], qt[ 9] ) ) ); - dH[10] = _mm_add_epi32( _mm_add_epi32( - mm128_rol_32( dH[6], 11 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] )), - _mm_xor_si128( _mm_slli_epi32( xl, 6 ), - _mm_xor_si128( qt[17], qt[10] ) ) ); - dH[11] = _mm_add_epi32( _mm_add_epi32( - mm128_rol_32( dH[7], 12 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )), - _mm_xor_si128( _mm_slli_epi32( xl, 4 ), - _mm_xor_si128( qt[18], qt[11] ) ) ); - dH[12] = _mm_add_epi32( _mm_add_epi32( - mm128_rol_32( dH[0], 13 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] )), - _mm_xor_si128( _mm_srli_epi32( xl, 3 ), - _mm_xor_si128( qt[19], qt[12] ) ) ); - dH[13] = _mm_add_epi32( _mm_add_epi32( - mm128_rol_32( dH[1], 14 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] )), - _mm_xor_si128( _mm_srli_epi32( xl, 4 ), - _mm_xor_si128( qt[20], qt[13] ) ) ); - dH[14] = _mm_add_epi32( _mm_add_epi32( - mm128_rol_32( dH[2], 15 ), - _mm_xor_si128( 
_mm_xor_si128( xh, qt[30] ), M[14] )), - _mm_xor_si128( _mm_srli_epi32( xl, 7 ), - _mm_xor_si128( qt[21], qt[14] ) ) ); - dH[15] = _mm_add_epi32( _mm_add_epi32( - mm128_rol_32( dH[3], 16 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] )), - _mm_xor_si128( _mm_srli_epi32( xl, 2 ), - _mm_xor_si128( qt[22], qt[15] ) ) ); -} - -static const uint32_t final_s[16][4] = -{ - { 0xaaaaaaa0, 0xaaaaaaa0, 0xaaaaaaa0, 0xaaaaaaa0 }, - { 0xaaaaaaa1, 0xaaaaaaa1, 0xaaaaaaa1, 0xaaaaaaa1 }, - { 0xaaaaaaa2, 0xaaaaaaa2, 0xaaaaaaa2, 0xaaaaaaa2 }, - { 0xaaaaaaa3, 0xaaaaaaa3, 0xaaaaaaa3, 0xaaaaaaa3 }, - { 0xaaaaaaa4, 0xaaaaaaa4, 0xaaaaaaa4, 0xaaaaaaa4 }, - { 0xaaaaaaa5, 0xaaaaaaa5, 0xaaaaaaa5, 0xaaaaaaa5 }, - { 0xaaaaaaa6, 0xaaaaaaa6, 0xaaaaaaa6, 0xaaaaaaa6 }, - { 0xaaaaaaa7, 0xaaaaaaa7, 0xaaaaaaa7, 0xaaaaaaa7 }, - { 0xaaaaaaa8, 0xaaaaaaa8, 0xaaaaaaa8, 0xaaaaaaa8 }, - { 0xaaaaaaa9, 0xaaaaaaa9, 0xaaaaaaa9, 0xaaaaaaa9 }, - { 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa }, - { 0xaaaaaaab, 0xaaaaaaab, 0xaaaaaaab, 0xaaaaaaab }, - { 0xaaaaaaac, 0xaaaaaaac, 0xaaaaaaac, 0xaaaaaaac }, - { 0xaaaaaaad, 0xaaaaaaad, 0xaaaaaaad, 0xaaaaaaad }, - { 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae }, - { 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf } -}; -/* -static const __m128i final_s[16] = -{ - { 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 }, - { 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 }, - { 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2 }, - { 0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3 }, - { 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4 }, - { 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5 }, - { 0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6 }, - { 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7 }, - { 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8 }, - { 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9 }, - { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa }, - { 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab }, - { 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac }, - { 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad }, - { 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae }, - { 0xaaaaaaafaaaaaaaf, 
0xaaaaaaafaaaaaaaf } -}; -*/ -static void -bmw32_4way_init(bmw_4way_small_context *sc, const sph_u32 *iv) -{ - for ( int i = 0; i < 16; i++ ) - sc->H[i] = _mm_set1_epi32( iv[i] ); - sc->ptr = 0; - sc->bit_count = 0; -} - -static void -bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len) -{ - __m128i *vdata = (__m128i*)data; - __m128i *buf; - __m128i htmp[16]; - __m128i *h1, *h2; - size_t ptr; - const int buf_size = 64; // bytes of one lane, compatible with len - - sc->bit_count += (sph_u32)len << 3; - buf = sc->buf; - ptr = sc->ptr; - h1 = sc->H; - h2 = htmp; - - while ( len > 0 ) - { - size_t clen; - clen = buf_size - ptr; - if ( clen > len ) - clen = len; - memcpy_128( buf + (ptr>>2), vdata, clen >> 2 ); - vdata += ( clen >> 2 ); - len -= clen; - ptr += clen; - if ( ptr == buf_size ) - { - __m128i *ht; - compress_small( buf, h1, h2 ); - ht = h1; - h1 = h2; - h2 = ht; - ptr = 0; - } - } - sc->ptr = ptr; - - - if ( h1 != sc->H ) - memcpy_128( sc->H, h1, 16 ); -} - -static void -bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n, - void *dst, size_t out_size_w32) -{ - __m128i *buf; - __m128i h1[16], h2[16], *h; - size_t ptr, u, v; - const int buf_size = 64; // bytes of one lane, compatible with len - - buf = sc->buf; - ptr = sc->ptr; - buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 ); - ptr += 4; - h = sc->H; - - // assume bit_count fits in 32 bits - if ( ptr > buf_size - 4 ) - { - memset_zero_128( buf + (ptr>>2), (buf_size - ptr) >> 2 ); - compress_small( buf, h, h1 ); - ptr = 0; - h = h1; - } - memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 ); - buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n ); - buf[ (buf_size - 4) >> 2 ] = m128_zero; - compress_small( buf, h, h2 ); - - for ( u = 0; u < 16; u ++ ) - buf[u] = h2[u]; - - compress_small( buf, (__m128i*)final_s, h1 ); - - for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++) - casti_m128i( dst, u ) = h1[v]; -} - -void -bmw256_4way_init(void *cc) -{ - 
bmw32_4way_init(cc, IV256); -} - -void -bmw256_4way(void *cc, const void *data, size_t len) -{ - bmw32_4way(cc, data, len); -} - -void -bmw256_4way_close(void *cc, void *dst) -{ - bmw256_4way_addbits_and_close(cc, 0, 0, dst); -} - -void -bmw256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - bmw32_4way_close(cc, ub, n, dst, 8); -} - -#endif // __SSE2__ - -#if defined(__AVX2__) - -// BMW-256 8 way 32 - -// copied from bmw512 4 way. -// change sizes to 32, macro names from b to s, shift constants. -// all the XORs ae good. - - -#define s8s0(x) \ - _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi32( (x), 1), \ - _mm256_slli_epi32( (x), 3) ), \ - _mm256_xor_si256( mm256_rol_32( (x), 4), \ - mm256_rol_32( (x), 19) ) ) - -#define s8s1(x) \ - _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi32( (x), 1), \ - _mm256_slli_epi32( (x), 2) ), \ - _mm256_xor_si256( mm256_rol_32( (x), 8), \ - mm256_rol_32( (x), 23) ) ) - -#define s8s2(x) \ - _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi32( (x), 2), \ - _mm256_slli_epi32( (x), 1) ), \ - _mm256_xor_si256( mm256_rol_32( (x), 12), \ - mm256_rol_32( (x), 25) ) ) - -#define s8s3(x) \ - _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi32( (x), 2), \ - _mm256_slli_epi32( (x), 2) ), \ - _mm256_xor_si256( mm256_rol_32( (x), 15), \ - mm256_rol_32( (x), 29) ) ) - -#define s8s4(x) \ - _mm256_xor_si256( (x), _mm256_srli_epi32( (x), 1 ) ) - -#define s8s5(x) \ - _mm256_xor_si256( (x), _mm256_srli_epi32( (x), 2 ) ) - -#define r8s1(x) mm256_rol_32( x, 3 ) -#define r8s2(x) mm256_rol_32( x, 7 ) -#define r8s3(x) mm256_rol_32( x, 13 ) -#define r8s4(x) mm256_rol_32( x, 16 ) -#define r8s5(x) mm256_rol_32( x, 19 ) -#define r8s6(x) mm256_rol_32( x, 23 ) -#define r8s7(x) mm256_rol_32( x, 27 ) - -#define mm256_rol_off_32( M, j, off ) \ - mm256_rol_32( M[ ( (j) + (off) ) & 0xF ] , \ - ( ( (j) + (off) ) & 0xF ) + 1 ) - -#define add_elt_s8( M, H, j ) \ - _mm256_xor_si256( \ - _mm256_add_epi32( \ - _mm256_sub_epi32( 
_mm256_add_epi32( mm256_rol_off_32( M, j, 0 ), \ - mm256_rol_off_32( M, j, 3 ) ), \ - mm256_rol_off_32( M, j, 10 ) ), \ - _mm256_set1_epi32( ( (j) + 16 ) * 0x05555555UL ) ), \ - H[ ( (j)+7 ) & 0xF ] ) - -#define expand1s8( qt, M, H, i ) \ - _mm256_add_epi32( \ - _mm256_add_epi32( \ - _mm256_add_epi32( \ - _mm256_add_epi32( \ - _mm256_add_epi32( s8s1( qt[ (i)-16 ] ), \ - s8s2( qt[ (i)-15 ] ) ), \ - _mm256_add_epi32( s8s3( qt[ (i)-14 ] ), \ - s8s0( qt[ (i)-13 ] ) ) ), \ - _mm256_add_epi32( \ - _mm256_add_epi32( s8s1( qt[ (i)-12 ] ), \ - s8s2( qt[ (i)-11 ] ) ), \ - _mm256_add_epi32( s8s3( qt[ (i)-10 ] ), \ - s8s0( qt[ (i)- 9 ] ) ) ) ), \ - _mm256_add_epi32( \ - _mm256_add_epi32( \ - _mm256_add_epi32( s8s1( qt[ (i)- 8 ] ), \ - s8s2( qt[ (i)- 7 ] ) ), \ - _mm256_add_epi32( s8s3( qt[ (i)- 6 ] ), \ - s8s0( qt[ (i)- 5 ] ) ) ), \ - _mm256_add_epi32( \ - _mm256_add_epi32( s8s1( qt[ (i)- 4 ] ), \ - s8s2( qt[ (i)- 3 ] ) ), \ - _mm256_add_epi32( s8s3( qt[ (i)- 2 ] ), \ - s8s0( qt[ (i)- 1 ] ) ) ) ) ), \ - add_elt_s8( M, H, (i)-16 ) ) - -#define expand2s8( qt, M, H, i) \ - _mm256_add_epi32( \ - mm256_add4_32( \ - mm256_add4_32( qt[ (i)-16 ], r8s1( qt[ (i)-15 ] ), \ - qt[ (i)-14 ], r8s2( qt[ (i)-13 ] ) ), \ - mm256_add4_32( qt[ (i)-12 ], r8s3( qt[ (i)-11 ] ), \ - qt[ (i)-10 ], r8s4( qt[ (i)- 9 ] ) ), \ - mm256_add4_32( qt[ (i)- 8 ], r8s5( qt[ (i)- 7 ] ), \ - qt[ (i)- 6 ], r8s6( qt[ (i)- 5 ] ) ), \ - mm256_add4_32( qt[ (i)- 4 ], r8s7( qt[ (i)- 3 ] ), \ - s8s4( qt[ (i)- 2 ] ), s8s5( qt[ (i)- 1 ] ) ) ), \ - add_elt_s8( M, H, (i)-16 ) ) - - -#define W8s0 \ - _mm256_add_epi32( \ - _mm256_add_epi32( \ - _mm256_add_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 5], H[ 5] ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ) - -#define W8s1 \ - _mm256_sub_epi32( \ - _mm256_add_epi32( \ - _mm256_add_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 6], H[ 6] ), \ - 
_mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) - -#define W8s2 \ - _mm256_add_epi32( \ - _mm256_sub_epi32( \ - _mm256_add_epi32( \ - _mm256_add_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_xor_si256( M[12], H[12] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) - -#define W8s3 \ - _mm256_add_epi32( \ - _mm256_sub_epi32( \ - _mm256_add_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 1], H[ 1] ) ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ) - -#define W8s4 \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( \ - _mm256_add_epi32( \ - _mm256_add_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ) - -#define W8s5 \ - _mm256_add_epi32( \ - _mm256_sub_epi32( \ - _mm256_add_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 3], H[ 3] ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_xor_si256( M[12], H[12] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) - -#define W8s6 \ - _mm256_add_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 4], H[ 4] ), \ - _mm256_xor_si256( M[ 0], H[ 0] ) ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ) - -#define W8s7 \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[12], H[12] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ) - -#define W8s8 \ - _mm256_sub_epi32( \ - _mm256_add_epi32( \ 
- _mm256_sub_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 2], H[ 2] ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) - -#define W8s9 \ - _mm256_add_epi32( \ - _mm256_sub_epi32( \ - _mm256_add_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ) - -#define W8s10 \ - _mm256_add_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 8], H[ 8] ), \ - _mm256_xor_si256( M[ 1], H[ 1] ) ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) - -#define W8s11 \ - _mm256_add_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 8], H[ 8] ), \ - _mm256_xor_si256( M[ 0], H[ 0] ) ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ) - -#define W8s12 \ - _mm256_add_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( \ - _mm256_add_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ) - -#define W8s13 \ - _mm256_add_epi32( \ - _mm256_add_epi32( \ - _mm256_add_epi32( \ - _mm256_add_epi32( _mm256_xor_si256( M[ 2], H[ 2] ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ) - -#define W8s14 \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( \ - _mm256_add_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 3], H[ 3] ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - 
_mm256_xor_si256( M[12], H[12] ) ) - -#define W8s15 \ - _mm256_add_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[12], H[12] ), \ - _mm256_xor_si256( M[ 4], H[4] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ) - -void compress_small_8way( const __m256i *M, const __m256i H[16], - __m256i dH[16] ) -{ - __m256i qt[32], xl, xh; - - qt[ 0] = _mm256_add_epi32( s8s0( W8s0 ), H[ 1] ); - qt[ 1] = _mm256_add_epi32( s8s1( W8s1 ), H[ 2] ); - qt[ 2] = _mm256_add_epi32( s8s2( W8s2 ), H[ 3] ); - qt[ 3] = _mm256_add_epi32( s8s3( W8s3 ), H[ 4] ); - qt[ 4] = _mm256_add_epi32( s8s4( W8s4 ), H[ 5] ); - qt[ 5] = _mm256_add_epi32( s8s0( W8s5 ), H[ 6] ); - qt[ 6] = _mm256_add_epi32( s8s1( W8s6 ), H[ 7] ); - qt[ 7] = _mm256_add_epi32( s8s2( W8s7 ), H[ 8] ); - qt[ 8] = _mm256_add_epi32( s8s3( W8s8 ), H[ 9] ); - qt[ 9] = _mm256_add_epi32( s8s4( W8s9 ), H[10] ); - qt[10] = _mm256_add_epi32( s8s0( W8s10), H[11] ); - qt[11] = _mm256_add_epi32( s8s1( W8s11), H[12] ); - qt[12] = _mm256_add_epi32( s8s2( W8s12), H[13] ); - qt[13] = _mm256_add_epi32( s8s3( W8s13), H[14] ); - qt[14] = _mm256_add_epi32( s8s4( W8s14), H[15] ); - qt[15] = _mm256_add_epi32( s8s0( W8s15), H[ 0] ); - qt[16] = expand1s8( qt, M, H, 16 ); - qt[17] = expand1s8( qt, M, H, 17 ); - qt[18] = expand2s8( qt, M, H, 18 ); - qt[19] = expand2s8( qt, M, H, 19 ); - qt[20] = expand2s8( qt, M, H, 20 ); - qt[21] = expand2s8( qt, M, H, 21 ); - qt[22] = expand2s8( qt, M, H, 22 ); - qt[23] = expand2s8( qt, M, H, 23 ); - qt[24] = expand2s8( qt, M, H, 24 ); - qt[25] = expand2s8( qt, M, H, 25 ); - qt[26] = expand2s8( qt, M, H, 26 ); - qt[27] = expand2s8( qt, M, H, 27 ); - qt[28] = expand2s8( qt, M, H, 28 ); - qt[29] = expand2s8( qt, M, H, 29 ); - qt[30] = expand2s8( qt, M, H, 30 ); - qt[31] = expand2s8( qt, M, H, 31 ); - - xl = _mm256_xor_si256( - mm256_xor4( qt[16], qt[17], qt[18], qt[19] ), - mm256_xor4( qt[20], qt[21], 
qt[22], qt[23] ) ); - xh = _mm256_xor_si256( xl, _mm256_xor_si256( - mm256_xor4( qt[24], qt[25], qt[26], qt[27] ), - mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); - - dH[ 0] = _mm256_add_epi32( - _mm256_xor_si256( M[0], - _mm256_xor_si256( _mm256_slli_epi32( xh, 5 ), - _mm256_srli_epi32( qt[16], 5 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] )); - dH[ 1] = _mm256_add_epi32( - _mm256_xor_si256( M[1], - _mm256_xor_si256( _mm256_srli_epi32( xh, 7 ), - _mm256_slli_epi32( qt[17], 8 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] )); - dH[ 2] = _mm256_add_epi32( - _mm256_xor_si256( M[2], - _mm256_xor_si256( _mm256_srli_epi32( xh, 5 ), - _mm256_slli_epi32( qt[18], 5 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] )); - dH[ 3] = _mm256_add_epi32( - _mm256_xor_si256( M[3], - _mm256_xor_si256( _mm256_srli_epi32( xh, 1 ), - _mm256_slli_epi32( qt[19], 5 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] )); - dH[ 4] = _mm256_add_epi32( - _mm256_xor_si256( M[4], - _mm256_xor_si256( _mm256_srli_epi32( xh, 3 ), - _mm256_slli_epi32( qt[20], 0 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] )); - dH[ 5] = _mm256_add_epi32( - _mm256_xor_si256( M[5], - _mm256_xor_si256( _mm256_slli_epi32( xh, 6 ), - _mm256_srli_epi32( qt[21], 6 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] )); - dH[ 6] = _mm256_add_epi32( - _mm256_xor_si256( M[6], - _mm256_xor_si256( _mm256_srli_epi32( xh, 4 ), - _mm256_slli_epi32( qt[22], 6 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] )); - dH[ 7] = _mm256_add_epi32( - _mm256_xor_si256( M[7], - _mm256_xor_si256( _mm256_srli_epi32( xh, 11 ), - _mm256_slli_epi32( qt[23], 2 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] )); - dH[ 8] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[4], 9 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )), - _mm256_xor_si256( _mm256_slli_epi32( xl, 8 ), - 
_mm256_xor_si256( qt[23], qt[ 8] ) ) ); - dH[ 9] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[5], 10 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )), - _mm256_xor_si256( _mm256_srli_epi32( xl, 6 ), - _mm256_xor_si256( qt[16], qt[ 9] ) ) ); - dH[10] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[6], 11 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )), - _mm256_xor_si256( _mm256_slli_epi32( xl, 6 ), - _mm256_xor_si256( qt[17], qt[10] ) ) ); - dH[11] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[7], 12 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )), - _mm256_xor_si256( _mm256_slli_epi32( xl, 4 ), - _mm256_xor_si256( qt[18], qt[11] ) ) ); - dH[12] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[0], 13 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )), - _mm256_xor_si256( _mm256_srli_epi32( xl, 3 ), - _mm256_xor_si256( qt[19], qt[12] ) ) ); - dH[13] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[1], 14 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )), - _mm256_xor_si256( _mm256_srli_epi32( xl, 4 ), - _mm256_xor_si256( qt[20], qt[13] ) ) ); - dH[14] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[2], 15 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )), - _mm256_xor_si256( _mm256_srli_epi32( xl, 7 ), - _mm256_xor_si256( qt[21], qt[14] ) ) ); - dH[15] = _mm256_add_epi32( _mm256_add_epi32( - mm256_rol_32( dH[3], 16 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )), - _mm256_xor_si256( _mm256_srli_epi32( xl, 2 ), - _mm256_xor_si256( qt[22], qt[15] ) ) ); -} - -static const __m256i final_s8[16] = -{ - { 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0, - 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 }, - { 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1, - 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 }, - { 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2, - 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2 }, - { 0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3, - 
0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3 }, - { 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4, - 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4 }, - { 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5, - 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5 }, - { 0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6, - 0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6 }, - { 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7, - 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7 }, - { 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8, - 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8 }, - { 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9, - 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9 }, - { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa, - 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa }, - { 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab, - 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab }, - { 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac, - 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac }, - { 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad, - 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad }, - { 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae, - 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae }, - { 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf, - 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf } -}; - -void bmw256_8way_init( bmw256_8way_context *ctx ) -{ - ctx->H[ 0] = _mm256_set1_epi32( IV256[ 0] ); - ctx->H[ 1] = _mm256_set1_epi32( IV256[ 1] ); - ctx->H[ 2] = _mm256_set1_epi32( IV256[ 2] ); - ctx->H[ 3] = _mm256_set1_epi32( IV256[ 3] ); - ctx->H[ 4] = _mm256_set1_epi32( IV256[ 4] ); - ctx->H[ 5] = _mm256_set1_epi32( IV256[ 5] ); - ctx->H[ 6] = _mm256_set1_epi32( IV256[ 6] ); - ctx->H[ 7] = _mm256_set1_epi32( IV256[ 7] ); - ctx->H[ 8] = _mm256_set1_epi32( IV256[ 8] ); - ctx->H[ 9] = _mm256_set1_epi32( IV256[ 9] ); - ctx->H[10] = _mm256_set1_epi32( IV256[10] ); - ctx->H[11] = _mm256_set1_epi32( IV256[11] ); - ctx->H[12] = _mm256_set1_epi32( IV256[12] ); - ctx->H[13] = _mm256_set1_epi32( IV256[13] ); - ctx->H[14] = _mm256_set1_epi32( IV256[14] ); - ctx->H[15] = _mm256_set1_epi32( IV256[15] ); - ctx->ptr = 0; - ctx->bit_count = 0; - -} - -void bmw256_8way( bmw256_8way_context *ctx, 
const void *data, size_t len ) -{ - __m256i *vdata = (__m256i*)data; - __m256i *buf; - __m256i htmp[16]; - __m256i *h1, *h2; - size_t ptr; - const int buf_size = 64; // bytes of one lane, compatible with len - - ctx->bit_count += len << 3; - buf = ctx->buf; - ptr = ctx->ptr; - h1 = ctx->H; - h2 = htmp; - - while ( len > 0 ) - { - size_t clen; - clen = buf_size - ptr; - if ( clen > len ) - clen = len; - memcpy_256( buf + (ptr>>2), vdata, clen >> 2 ); - vdata = vdata + (clen>>2); - len -= clen; - ptr += clen; - if ( ptr == buf_size ) - { - __m256i *ht; - compress_small_8way( buf, h1, h2 ); - ht = h1; - h1 = h2; - h2 = ht; - ptr = 0; - } - } - ctx->ptr = ptr; - - if ( h1 != ctx->H ) - memcpy_256( ctx->H, h1, 16 ); -} - -void bmw256_8way_close( bmw256_8way_context *ctx, void *dst ) -{ - __m256i *buf; - __m256i h1[16], h2[16], *h; - size_t ptr, u, v; - const int buf_size = 64; // bytes of one lane, compatible with len - - buf = ctx->buf; - ptr = ctx->ptr; - buf[ ptr>>2 ] = _mm256_set1_epi32( 0x80 ); - ptr += 4; - h = ctx->H; - - if ( ptr > (buf_size - 4) ) - { - memset_zero_256( buf + (ptr>>2), (buf_size - ptr) >> 2 ); - compress_small_8way( buf, h, h1 ); - ptr = 0; - h = h1; - } - memset_zero_256( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 ); - buf[ (buf_size - 8) >> 2 ] = _mm256_set1_epi32( ctx->bit_count ); - buf[ (buf_size - 4) >> 2 ] = m256_zero; - - - compress_small_8way( buf, h, h2 ); - - for ( u = 0; u < 16; u ++ ) - buf[u] = h2[u]; - - compress_small_8way( buf, final_s8, h1 ); - for (u = 0, v = 16 - 8; u < 8; u ++, v ++) - casti_m256i(dst,u) = h1[v]; -} - - -#endif // __AVX2__ - -#ifdef __cplusplus -} -#endif - diff --git a/algo/bmw/bmw256.c b/algo/bmw/bmw256.c deleted file mode 100644 index 39352a7..0000000 --- a/algo/bmw/bmw256.c +++ /dev/null @@ -1,66 +0,0 @@ -#include "algo-gate-api.h" - -#include -#include - -#include "sph_bmw.h" - -void bmwhash(void *output, const void *input) -{ -/* - uint32_t hash[16]; - sph_bmw256_context ctx; - - sph_bmw256_init(&ctx); 
- sph_bmw256(&ctx, input, 80); - sph_bmw256_close(&ctx, hash); - - memcpy(output, hash, 32); -*/ -} - -int scanhash_bmw( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - - uint32_t _ALIGN(64) hash64[8]; - uint32_t _ALIGN(64) endiandata[20]; - int thr_id = mythr->id; - - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - - uint32_t n = first_nonce; - - for (int k = 0; k < 19; k++) - be32enc(&endiandata[k], pdata[k]); - - do { - be32enc(&endiandata[19], n); - bmwhash(hash64, endiandata); - if (hash64[7] < Htarg && fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return true; - } - n++; - - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - - return 0; -} - -bool register_bmw256_algo( algo_gate_t* gate ) -{ - algo_not_implemented(); - return false; -// gate->scanhash = (void*)&scanhash_bmw; -// gate->hash = (void*)&bmwhash; - return true; -}; - diff --git a/algo/bmw/bmw512-4way.c b/algo/bmw/bmw512-4way.c deleted file mode 100644 index 9142e72..0000000 --- a/algo/bmw/bmw512-4way.c +++ /dev/null @@ -1,59 +0,0 @@ -#include "bmw512-gate.h" - -#ifdef BMW512_4WAY - -#include -#include -#include -//#include "sph_keccak.h" -#include "bmw-hash-4way.h" - -void bmw512hash_4way(void *state, const void *input) -{ - bmw512_4way_context ctx; - bmw512_4way_init( &ctx ); - bmw512_4way( &ctx, input, 80 ); - bmw512_4way_close( &ctx, state ); -} - -int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t hash[16*4] __attribute__ ((aligned (32))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *hash7 = &(hash[25]); // 3*8+1 - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = 
pdata[19]; - const uint32_t first_nonce = pdata[19]; - __m256i *noncev = (__m256i*)vdata + 9; // aligned -// const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - bmw512hash_4way( hash, vdata ); - - for ( int lane = 0; lane < 4; lane++ ) - if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) ) - { - extr_lane_4x64( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) ) - { - pdata[19] = n + lane; - submit_lane_solution( work, lane_hash, mythr, lane ); - } - } - n += 4; - - } while ( (n < max_nonce-4) && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/bmw/bmw512-gate.c b/algo/bmw/bmw512-gate.c deleted file mode 100644 index b8d1de0..0000000 --- a/algo/bmw/bmw512-gate.c +++ /dev/null @@ -1,20 +0,0 @@ -#include "bmw512-gate.h" - -int64_t bmw512_get_max64() { return 0x7ffffLL; } - -bool register_bmw512_algo( algo_gate_t* gate ) -{ - gate->optimizations = AVX2_OPT; - gate->set_target = (void*)&alt_set_target; - gate->get_max64 = (void*)&bmw512_get_max64; -#if defined (BMW512_4WAY) - gate->scanhash = (void*)&scanhash_bmw512_4way; - gate->hash = (void*)&bmw512hash_4way; -#else - gate->scanhash = (void*)&scanhash_bmw512; - gate->hash = (void*)&bmw512hash; -#endif - return true; -}; - - diff --git a/algo/bmw/bmw512-gate.h b/algo/bmw/bmw512-gate.h deleted file mode 100644 index 9aeb519..0000000 --- a/algo/bmw/bmw512-gate.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef BMW512_GATE_H__ -#define BMW512_GATE_H__ - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) - #define BMW512_4WAY 1 -#endif - -#if defined(BMW512_4WAY) - -void bmw512hash_4way( void *state, const void *input ); -int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info 
*mythr ); - -#endif - -void bmw512hash( void *state, const void *input ); -int scanhash_bmw512( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -#endif diff --git a/algo/bmw/bmw512-hash-4way.c b/algo/bmw/bmw512-hash-4way.c deleted file mode 100644 index 7c58003..0000000 --- a/algo/bmw/bmw512-hash-4way.c +++ /dev/null @@ -1,1073 +0,0 @@ -/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */ -/* - * BMW implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include -#include -#include -#include "bmw-hash-4way.h" - -#ifdef __cplusplus -extern "C"{ -#endif - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -#define LPAR ( - -static const sph_u64 IV512[] = { - SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F), - SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F), - SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF), - SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF), - SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF), - SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF), - SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF), - SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF) -}; - -#if defined(__SSE2__) - -// BMW-512 2 way 64 - - -#define s2b0(x) \ - _mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 1), \ - _mm_slli_epi64( (x), 3) ), \ - _mm_xor_si128( mm128_rol_64( (x), 4), \ - mm128_rol_64( (x), 37) ) ) - -#define s2b1(x) \ - _mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 1), \ - _mm_slli_epi64( (x), 2) ), \ - _mm_xor_si128( mm128_rol_64( (x), 13), \ - mm128_rol_64( (x), 43) ) ) - -#define s2b2(x) \ - _mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 2), \ - _mm_slli_epi64( (x), 1) ), \ - _mm_xor_si128( mm128_rol_64( (x), 19), \ - mm128_rol_64( (x), 53) ) ) - -#define s2b3(x) \ - _mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 2), \ - _mm_slli_epi64( (x), 2) ), \ - _mm_xor_si128( mm128_rol_64( (x), 28), \ - mm128_rol_64( (x), 59) ) ) - -#define s2b4(x) \ - _mm_xor_si128( (x), _mm_srli_epi64( (x), 1 ) ) - -#define s2b5(x) \ - _mm_xor_si128( (x), _mm_srli_epi64( (x), 2 ) ) - - -#define r2b1(x) mm128_rol_64( x, 5 ) -#define r2b2(x) mm128_rol_64( x, 11 ) -#define r2b3(x) mm128_rol_64( x, 27 ) -#define r2b4(x) mm128_rol_64( x, 32 ) -#define r2b5(x) mm128_rol_64( x, 37 ) -#define r2b6(x) mm128_rol_64( x, 43 ) -#define r2b7(x) 
mm128_rol_64( x, 53 ) - -#define mm128_rol_off_64( M, j, off ) \ - mm128_rol_64( M[ ( (j) + (off) ) & 0xF ] , \ - ( ( (j) + (off) ) & 0xF ) + 1 ) - -#define add_elt_2b( M, H, j ) \ - _mm_xor_si128( \ - _mm_add_epi64( \ - _mm_sub_epi64( _mm_add_epi64( mm128_rol_off_64( M, j, 0 ), \ - mm128_rol_off_64( M, j, 3 ) ), \ - mm128_rol_off_64( M, j, 10 ) ), \ - _mm_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \ - H[ ( (j)+7 ) & 0xF ] ) - - -#define expand1_2b( qt, M, H, i ) \ - _mm_add_epi64( \ - _mm_add_epi64( \ - _mm_add_epi64( \ - _mm_add_epi64( \ - _mm_add_epi64( s2b1( qt[ (i)-16 ] ), \ - s2b2( qt[ (i)-15 ] ) ), \ - _mm_add_epi64( s2b3( qt[ (i)-14 ] ), \ - s2b0( qt[ (i)-13 ] ) ) ), \ - _mm_add_epi64( \ - _mm_add_epi64( s2b1( qt[ (i)-12 ] ), \ - s2b2( qt[ (i)-11 ] ) ), \ - _mm_add_epi64( s2b3( qt[ (i)-10 ] ), \ - s2b0( qt[ (i)- 9 ] ) ) ) ), \ - _mm_add_epi64( \ - _mm_add_epi64( \ - _mm_add_epi64( s2b1( qt[ (i)- 8 ] ), \ - s2b2( qt[ (i)- 7 ] ) ), \ - _mm_add_epi64( s2b3( qt[ (i)- 6 ] ), \ - s2b0( qt[ (i)- 5 ] ) ) ), \ - _mm_add_epi64( \ - _mm_add_epi64( s2b1( qt[ (i)- 4 ] ), \ - s2b2( qt[ (i)- 3 ] ) ), \ - _mm_add_epi64( s2b3( qt[ (i)- 2 ] ), \ - s2b0( qt[ (i)- 1 ] ) ) ) ) ), \ - add_elt_2b( M, H, (i)-16 ) ) - -#define expand2_2b( qt, M, H, i) \ - _mm_add_epi64( \ - _mm_add_epi64( \ - _mm_add_epi64( \ - _mm_add_epi64( \ - _mm_add_epi64( qt[ (i)-16 ], r2b1( qt[ (i)-15 ] ) ), \ - _mm_add_epi64( qt[ (i)-14 ], r2b2( qt[ (i)-13 ] ) ) ), \ - _mm_add_epi64( \ - _mm_add_epi64( qt[ (i)-12 ], r2b3( qt[ (i)-11 ] ) ), \ - _mm_add_epi64( qt[ (i)-10 ], r2b4( qt[ (i)- 9 ] ) ) ) ), \ - _mm_add_epi64( \ - _mm_add_epi64( \ - _mm_add_epi64( qt[ (i)- 8 ], r2b5( qt[ (i)- 7 ] ) ), \ - _mm_add_epi64( qt[ (i)- 6 ], r2b6( qt[ (i)- 5 ] ) ) ), \ - _mm_add_epi64( \ - _mm_add_epi64( qt[ (i)- 4 ], r2b7( qt[ (i)- 3 ] ) ), \ - _mm_add_epi64( s2b4( qt[ (i)- 2 ] ), \ - s2b5( qt[ (i)- 1 ] ) ) ) ) ), \ - add_elt_2b( M, H, (i)-16 ) ) - - -#define W2b0 \ - _mm_add_epi64( \ - _mm_add_epi64( \ - 
_mm_add_epi64( \ - _mm_sub_epi64( _mm_xor_si128( M[ 5], H[ 5] ), \ - _mm_xor_si128( M[ 7], H[ 7] ) ), \ - _mm_xor_si128( M[10], H[10] ) ), \ - _mm_xor_si128( M[13], H[13] ) ), \ - _mm_xor_si128( M[14], H[14] ) ) - -#define W2b1 \ - _mm_sub_epi64( \ - _mm_add_epi64( \ - _mm_add_epi64( \ - _mm_sub_epi64( _mm_xor_si128( M[ 6], H[ 6] ), \ - _mm_xor_si128( M[ 8], H[ 8] ) ), \ - _mm_xor_si128( M[11], H[11] ) ), \ - _mm_xor_si128( M[14], H[14] ) ), \ - _mm_xor_si128( M[15], H[15] ) ) - -#define W2b2 \ - _mm_add_epi64( \ - _mm_sub_epi64( \ - _mm_add_epi64( \ - _mm_add_epi64( _mm_xor_si128( M[ 0], H[ 0] ), \ - _mm_xor_si128( M[ 7], H[ 7] ) ), \ - _mm_xor_si128( M[ 9], H[ 9] ) ), \ - _mm_xor_si128( M[12], H[12] ) ), \ - _mm_xor_si128( M[15], H[15] ) ) - -#define W2b3 \ - _mm_add_epi64( \ - _mm_sub_epi64( \ - _mm_add_epi64( \ - _mm_sub_epi64( _mm_xor_si128( M[ 0], H[ 0] ), \ - _mm_xor_si128( M[ 1], H[ 1] ) ), \ - _mm_xor_si128( M[ 8], H[ 8] ) ), \ - _mm_xor_si128( M[10], H[10] ) ), \ - _mm_xor_si128( M[13], H[13] ) ) - -#define W2b4 \ - _mm_sub_epi64( \ - _mm_sub_epi64( \ - _mm_add_epi64( \ - _mm_add_epi64( _mm_xor_si128( M[ 1], H[ 1] ), \ - _mm_xor_si128( M[ 2], H[ 2] ) ), \ - _mm_xor_si128( M[ 9], H[ 9] ) ), \ - _mm_xor_si128( M[11], H[11] ) ), \ - _mm_xor_si128( M[14], H[14] ) ) - -#define W2b5 \ - _mm_add_epi64( \ - _mm_sub_epi64( \ - _mm_add_epi64( \ - _mm_sub_epi64( _mm_xor_si128( M[ 3], H[ 3] ), \ - _mm_xor_si128( M[ 2], H[ 2] ) ), \ - _mm_xor_si128( M[10], H[10] ) ), \ - _mm_xor_si128( M[12], H[12] ) ), \ - _mm_xor_si128( M[15], H[15] ) ) - -#define W2b6 \ - _mm_add_epi64( \ - _mm_sub_epi64( \ - _mm_sub_epi64( \ - _mm_sub_epi64( _mm_xor_si128( M[ 4], H[ 4] ), \ - _mm_xor_si128( M[ 0], H[ 0] ) ), \ - _mm_xor_si128( M[ 3], H[ 3] ) ), \ - _mm_xor_si128( M[11], H[11] ) ), \ - _mm_xor_si128( M[13], H[13] ) ) - -#define W2b7 \ - _mm_sub_epi64( \ - _mm_sub_epi64( \ - _mm_sub_epi64( \ - _mm_sub_epi64( _mm_xor_si128( M[ 1], H[ 1] ), \ - _mm_xor_si128( M[ 4], H[ 4] ) ), \ - 
_mm_xor_si128( M[ 5], H[ 5] ) ), \ - _mm_xor_si128( M[12], H[12] ) ), \ - _mm_xor_si128( M[14], H[14] ) ) - -#define W2b8 \ - _mm_sub_epi64( \ - _mm_add_epi64( \ - _mm_sub_epi64( \ - _mm_sub_epi64( _mm_xor_si128( M[ 2], H[ 2] ), \ - _mm_xor_si128( M[ 5], H[ 5] ) ), \ - _mm_xor_si128( M[ 6], H[ 6] ) ), \ - _mm_xor_si128( M[13], H[13] ) ), \ - _mm_xor_si128( M[15], H[15] ) ) - -#define W2b9 \ - _mm_add_epi64( \ - _mm_sub_epi64( \ - _mm_add_epi64( \ - _mm_sub_epi64( _mm_xor_si128( M[ 0], H[ 0] ), \ - _mm_xor_si128( M[ 3], H[ 3] ) ), \ - _mm_xor_si128( M[ 6], H[ 6] ) ), \ - _mm_xor_si128( M[ 7], H[ 7] ) ), \ - _mm_xor_si128( M[14], H[14] ) ) - -#define W2b10 \ - _mm_add_epi64( \ - _mm_sub_epi64( \ - _mm_sub_epi64( \ - _mm_sub_epi64( _mm_xor_si128( M[ 8], H[ 8] ), \ - _mm_xor_si128( M[ 1], H[ 1] ) ), \ - _mm_xor_si128( M[ 4], H[ 4] ) ), \ - _mm_xor_si128( M[ 7], H[ 7] ) ), \ - _mm_xor_si128( M[15], H[15] ) ) - -#define W2b11 \ - _mm_add_epi64( \ - _mm_sub_epi64( \ - _mm_sub_epi64( \ - _mm_sub_epi64( _mm_xor_si128( M[ 8], H[ 8] ), \ - _mm_xor_si128( M[ 0], H[ 0] ) ), \ - _mm_xor_si128( M[ 2], H[ 2] ) ), \ - _mm_xor_si128( M[ 5], H[ 5] ) ), \ - _mm_xor_si128( M[ 9], H[ 9] ) ) - -#define W2b12 \ - _mm_add_epi64( \ - _mm_sub_epi64( \ - _mm_sub_epi64( \ - _mm_add_epi64( _mm_xor_si128( M[ 1], H[ 1] ), \ - _mm_xor_si128( M[ 3], H[ 3] ) ), \ - _mm_xor_si128( M[ 6], H[ 6] ) ), \ - _mm_xor_si128( M[ 9], H[ 9] ) ), \ - _mm_xor_si128( M[10], H[10] ) ) - -#define W2b13 \ - _mm_add_epi64( \ - _mm_add_epi64( \ - _mm_add_epi64( \ - _mm_add_epi64( _mm_xor_si128( M[ 2], H[ 2] ), \ - _mm_xor_si128( M[ 4], H[ 4] ) ), \ - _mm_xor_si128( M[ 7], H[ 7] ) ), \ - _mm_xor_si128( M[10], H[10] ) ), \ - _mm_xor_si128( M[11], H[11] ) ) - -#define W2b14 \ - _mm_sub_epi64( \ - _mm_sub_epi64( \ - _mm_add_epi64( \ - _mm_sub_epi64( _mm_xor_si128( M[ 3], H[ 3] ), \ - _mm_xor_si128( M[ 5], H[ 5] ) ), \ - _mm_xor_si128( M[ 8], H[ 8] ) ), \ - _mm_xor_si128( M[11], H[11] ) ), \ - _mm_xor_si128( M[12], H[12] ) 
) - -#define W2b15 \ - _mm_add_epi64( \ - _mm_sub_epi64( \ - _mm_sub_epi64( \ - _mm_sub_epi64( _mm_xor_si128( M[12], H[12] ), \ - _mm_xor_si128( M[ 4], H[4] ) ), \ - _mm_xor_si128( M[ 6], H[ 6] ) ), \ - _mm_xor_si128( M[ 9], H[ 9] ) ), \ - _mm_xor_si128( M[13], H[13] ) ) - - -void compress_big_2way( const __m128i *M, const __m128i H[16], - __m128i dH[16] ) -{ - __m128i qt[32], xl, xh; - - qt[ 0] = _mm_add_epi64( s2b0( W2b0 ), H[ 1] ); - qt[ 1] = _mm_add_epi64( s2b1( W2b1 ), H[ 2] ); - qt[ 2] = _mm_add_epi64( s2b2( W2b2 ), H[ 3] ); - qt[ 3] = _mm_add_epi64( s2b3( W2b3 ), H[ 4] ); - qt[ 4] = _mm_add_epi64( s2b4( W2b4 ), H[ 5] ); - qt[ 5] = _mm_add_epi64( s2b0( W2b5 ), H[ 6] ); - qt[ 6] = _mm_add_epi64( s2b1( W2b6 ), H[ 7] ); - qt[ 7] = _mm_add_epi64( s2b2( W2b7 ), H[ 8] ); - qt[ 8] = _mm_add_epi64( s2b3( W2b8 ), H[ 9] ); - qt[ 9] = _mm_add_epi64( s2b4( W2b9 ), H[10] ); - qt[10] = _mm_add_epi64( s2b0( W2b10), H[11] ); - qt[11] = _mm_add_epi64( s2b1( W2b11), H[12] ); - qt[12] = _mm_add_epi64( s2b2( W2b12), H[13] ); - qt[13] = _mm_add_epi64( s2b3( W2b13), H[14] ); - qt[14] = _mm_add_epi64( s2b4( W2b14), H[15] ); - qt[15] = _mm_add_epi64( s2b0( W2b15), H[ 0] ); - qt[16] = expand1_2b( qt, M, H, 16 ); - qt[17] = expand1_2b( qt, M, H, 17 ); - qt[18] = expand2_2b( qt, M, H, 18 ); - qt[19] = expand2_2b( qt, M, H, 19 ); - qt[20] = expand2_2b( qt, M, H, 20 ); - qt[21] = expand2_2b( qt, M, H, 21 ); - qt[22] = expand2_2b( qt, M, H, 22 ); - qt[23] = expand2_2b( qt, M, H, 23 ); - qt[24] = expand2_2b( qt, M, H, 24 ); - qt[25] = expand2_2b( qt, M, H, 25 ); - qt[26] = expand2_2b( qt, M, H, 26 ); - qt[27] = expand2_2b( qt, M, H, 27 ); - qt[28] = expand2_2b( qt, M, H, 28 ); - qt[29] = expand2_2b( qt, M, H, 29 ); - qt[30] = expand2_2b( qt, M, H, 30 ); - qt[31] = expand2_2b( qt, M, H, 31 ); - - xl = _mm_xor_si128( - _mm_xor_si128( _mm_xor_si128( qt[16], qt[17] ), - _mm_xor_si128( qt[18], qt[19] ) ), - _mm_xor_si128( _mm_xor_si128( qt[20], qt[21] ), - _mm_xor_si128( qt[22], qt[23] ) ) ); - 
xh = _mm_xor_si128( xl, - _mm_xor_si128( - _mm_xor_si128( _mm_xor_si128( qt[24], qt[25] ), - _mm_xor_si128( qt[26], qt[27] ) ), - _mm_xor_si128( _mm_xor_si128( qt[28], qt[29] ), - _mm_xor_si128( qt[30], qt[31] ) ) ) ); - - dH[ 0] = _mm_add_epi64( - _mm_xor_si128( M[0], - _mm_xor_si128( _mm_slli_epi64( xh, 5 ), - _mm_srli_epi64( qt[16], 5 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] ) ); - dH[ 1] = _mm_add_epi64( - _mm_xor_si128( M[1], - _mm_xor_si128( _mm_srli_epi64( xh, 7 ), - _mm_slli_epi64( qt[17], 8 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] ) ); - dH[ 2] = _mm_add_epi64( - _mm_xor_si128( M[2], - _mm_xor_si128( _mm_srli_epi64( xh, 5 ), - _mm_slli_epi64( qt[18], 5 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] ) ); - dH[ 3] = _mm_add_epi64( - _mm_xor_si128( M[3], - _mm_xor_si128( _mm_srli_epi64( xh, 1 ), - _mm_slli_epi64( qt[19], 5 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] ) ); - dH[ 4] = _mm_add_epi64( - _mm_xor_si128( M[4], - _mm_xor_si128( _mm_srli_epi64( xh, 3 ), - _mm_slli_epi64( qt[20], 0 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] ) ); - dH[ 5] = _mm_add_epi64( - _mm_xor_si128( M[5], - _mm_xor_si128( _mm_slli_epi64( xh, 6 ), - _mm_srli_epi64( qt[21], 6 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] ) ); - dH[ 6] = _mm_add_epi64( - _mm_xor_si128( M[6], - _mm_xor_si128( _mm_srli_epi64( xh, 4 ), - _mm_slli_epi64( qt[22], 6 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] ) ); - dH[ 7] = _mm_add_epi64( - _mm_xor_si128( M[7], - _mm_xor_si128( _mm_srli_epi64( xh, 11 ), - _mm_slli_epi64( qt[23], 2 ) ) ), - _mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] ) ); - dH[ 8] = _mm_add_epi64( _mm_add_epi64( - mm128_rol_64( dH[4], 9 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] ) ), - _mm_xor_si128( _mm_slli_epi64( xl, 8 ), - _mm_xor_si128( qt[23], qt[ 8] ) ) ); - dH[ 9] = _mm_add_epi64( _mm_add_epi64( - mm128_rol_64( dH[5], 10 ), - _mm_xor_si128( 
_mm_xor_si128( xh, qt[25] ), M[ 9] ) ), - _mm_xor_si128( _mm_srli_epi64( xl, 6 ), - _mm_xor_si128( qt[16], qt[ 9] ) ) ); - dH[10] = _mm_add_epi64( _mm_add_epi64( - mm128_rol_64( dH[6], 11 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] ) ), - _mm_xor_si128( _mm_slli_epi64( xl, 6 ), - _mm_xor_si128( qt[17], qt[10] ) ) ); - dH[11] = _mm_add_epi64( _mm_add_epi64( - mm128_rol_64( dH[7], 12 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )), - _mm_xor_si128( _mm_slli_epi64( xl, 4 ), - _mm_xor_si128( qt[18], qt[11] ) ) ); - dH[12] = _mm_add_epi64( _mm_add_epi64( - mm128_rol_64( dH[0], 13 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] ) ), - _mm_xor_si128( _mm_srli_epi64( xl, 3 ), - _mm_xor_si128( qt[19], qt[12] ) ) ); - dH[13] = _mm_add_epi64( _mm_add_epi64( - mm128_rol_64( dH[1], 14 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] ) ), - _mm_xor_si128( _mm_srli_epi64( xl, 4 ), - _mm_xor_si128( qt[20], qt[13] ) ) ); - dH[14] = _mm_add_epi64( _mm_add_epi64( - mm128_rol_64( dH[2], 15 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] ) ), - _mm_xor_si128( _mm_srli_epi64( xl, 7 ), - _mm_xor_si128( qt[21], qt[14] ) ) ); - dH[15] = _mm_add_epi64( _mm_add_epi64( - mm128_rol_64( dH[3], 16 ), - _mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] ) ), - _mm_xor_si128( _mm_srli_epi64( xl, 2 ), - _mm_xor_si128( qt[22], qt[15] ) ) ); -} - -static const __m128i final_b2[16] = -{ - { 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0 }, - { 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0 }, - { 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1 }, - { 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1 }, - { 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2 }, - { 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2 }, - { 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3 }, - { 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3 }, - { 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4 }, - { 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4 }, - { 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5 }, - { 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5 }, - { 0xaaaaaaaaaaaaaaa6, 
0xaaaaaaaaaaaaaaa6 }, - { 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6 }, - { 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7 }, - { 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf } -}; - -void bmw512_2way_init( bmw_2way_big_context *ctx ) -{ - ctx->H[ 0] = _mm_set1_epi64x( IV512[ 0] ); - ctx->H[ 1] = _mm_set1_epi64x( IV512[ 1] ); - ctx->H[ 2] = _mm_set1_epi64x( IV512[ 2] ); - ctx->H[ 3] = _mm_set1_epi64x( IV512[ 3] ); - ctx->H[ 4] = _mm_set1_epi64x( IV512[ 4] ); - ctx->H[ 5] = _mm_set1_epi64x( IV512[ 5] ); - ctx->H[ 6] = _mm_set1_epi64x( IV512[ 6] ); - ctx->H[ 7] = _mm_set1_epi64x( IV512[ 7] ); - ctx->H[ 8] = _mm_set1_epi64x( IV512[ 8] ); - ctx->H[ 9] = _mm_set1_epi64x( IV512[ 9] ); - ctx->H[10] = _mm_set1_epi64x( IV512[10] ); - ctx->H[11] = _mm_set1_epi64x( IV512[11] ); - ctx->H[12] = _mm_set1_epi64x( IV512[12] ); - ctx->H[13] = _mm_set1_epi64x( IV512[13] ); - ctx->H[14] = _mm_set1_epi64x( IV512[14] ); - ctx->H[15] = _mm_set1_epi64x( IV512[15] ); - ctx->ptr = 0; - ctx->bit_count = 0; -} - -void bmw512_2way( bmw_2way_big_context *ctx, const void *data, size_t len ) -{ - __m128i *buf = (__m128i*)ctx->buf; - __m128i htmp[16]; - __m128i *h1 = ctx->H; - __m128i *h2 = htmp; - size_t blen = len << 1; - size_t ptr = ctx->ptr; - size_t bptr = ctx->ptr << 1; - size_t vptr = ctx->ptr >> 3; -// const int buf_size = 128; // bytes of one lane, compatible with len - - ctx->bit_count += len << 3; - while ( blen > 0 ) - { - size_t clen = (sizeof ctx->buf ) - bptr; - if ( clen > blen ) - clen = blen; - memcpy( buf + vptr, data, clen ); - bptr += clen; - vptr = bptr >> 4; - data = (const unsigned char *)data + clen; - blen -= clen; - if ( ptr == (sizeof ctx->buf ) ) - { - __m128i *ht; - compress_big_2way( buf, h1, h2 ); - ht = h1; - h1 = h2; - h2 = ht; - ptr = 0; - } - } - ctx->ptr = ptr; - if ( h1 != ctx->H ) - memcpy_128( ctx->H, h1, 16 ); -} - -void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst ) -{ - __m128i h1[16], h2[16], *h; - __m128i *buf = (__m128i*)ctx->buf; - size_t vptr = ctx->ptr 
>> 3; -// unsigned bit_len = ( (unsigned)(ctx->ptr) << 1 ); - - buf[ vptr++ ] = _mm_set1_epi64x( 0x80 ); - h = ctx->H; - - if ( vptr == 16 ) - { - compress_big_2way( buf, h, h1 ); - vptr = 0; - h = h1; - } - memset_zero_128( buf + vptr, 16 - vptr - 1 ); - buf[ 15 ] = _mm_set1_epi64x( ctx->bit_count ); - compress_big_2way( buf, h, h2 ); - memcpy_128( buf, h2, 16 ); - compress_big_2way( buf, final_b2, h1 ); - memcpy( (__m128i*)dst, h1+16, 8 ); -} - -#endif // __SSE2__ - - - -#if defined(__AVX2__) - -// BMW-512 4 way 64 - - -#define sb0(x) \ - mm256_xor4( _mm256_srli_epi64( (x), 1), _mm256_slli_epi64( (x), 3), \ - mm256_rol_64( (x), 4), mm256_rol_64( (x),37) ) - -#define sb1(x) \ - mm256_xor4( _mm256_srli_epi64( (x), 1), _mm256_slli_epi64( (x), 2), \ - mm256_rol_64( (x),13), mm256_rol_64( (x),43) ) - -#define sb2(x) \ - mm256_xor4( _mm256_srli_epi64( (x), 2), _mm256_slli_epi64( (x), 1), \ - mm256_rol_64( (x),19), mm256_rol_64( (x),53) ) - -#define sb3(x) \ - mm256_xor4( _mm256_srli_epi64( (x), 2), _mm256_slli_epi64( (x), 2), \ - mm256_rol_64( (x),28), mm256_rol_64( (x),59) ) - -#define sb4(x) \ - _mm256_xor_si256( (x), _mm256_srli_epi64( (x), 1 ) ) - -#define sb5(x) \ - _mm256_xor_si256( (x), _mm256_srli_epi64( (x), 2 ) ) - -#define rb1(x) mm256_rol_64( x, 5 ) -#define rb2(x) mm256_rol_64( x, 11 ) -#define rb3(x) mm256_rol_64( x, 27 ) -#define rb4(x) mm256_rol_64( x, 32 ) -#define rb5(x) mm256_rol_64( x, 37 ) -#define rb6(x) mm256_rol_64( x, 43 ) -#define rb7(x) mm256_rol_64( x, 53 ) - -#define rol_off_64( M, j, off ) \ - mm256_rol_64( M[ ( (j) + (off) ) & 0xF ] , \ - ( ( (j) + (off) ) & 0xF ) + 1 ) - -#define add_elt_b( M, H, j ) \ - _mm256_xor_si256( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_add_epi64( rol_off_64( M, j, 0 ), \ - rol_off_64( M, j, 3 ) ), \ - rol_off_64( M, j, 10 ) ), \ - _mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \ - H[ ( (j)+7 ) & 0xF ] ) - - -#define expand1b( qt, M, H, i ) \ - _mm256_add_epi64( mm256_add4_64( \ - 
mm256_add4_64( sb1( qt[ (i)-16 ] ), sb2( qt[ (i)-15 ] ), \ - sb3( qt[ (i)-14 ] ), sb0( qt[ (i)-13 ] )), \ - mm256_add4_64( sb1( qt[ (i)-12 ] ), sb2( qt[ (i)-11 ] ), \ - sb3( qt[ (i)-10 ] ), sb0( qt[ (i)- 9 ] )), \ - mm256_add4_64( sb1( qt[ (i)- 8 ] ), sb2( qt[ (i)- 7 ] ), \ - sb3( qt[ (i)- 6 ] ), sb0( qt[ (i)- 5 ] )), \ - mm256_add4_64( sb1( qt[ (i)- 4 ] ), sb2( qt[ (i)- 3 ] ), \ - sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) ), \ - add_elt_b( M, H, (i)-16 ) ) - -#define expand2b( qt, M, H, i) \ - _mm256_add_epi64( mm256_add4_64( \ - mm256_add4_64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ), \ - qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ), \ - mm256_add4_64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ), \ - qt[ (i)-10 ], rb4( qt[ (i)- 9 ] ) ), \ - mm256_add4_64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ), \ - qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ), \ - mm256_add4_64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ), \ - sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ), \ - add_elt_b( M, H, (i)-16 ) ) - -#define Wb0 \ - _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ) - -#define Wb1 \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) - -#define Wb2 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_xor_si256( M[12], H[12] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) - -#define Wb3 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ - 
_mm256_xor_si256( M[ 1], H[ 1] ) ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ) - -#define Wb4 \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ) - -#define Wb5 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_xor_si256( M[12], H[12] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) - -#define Wb6 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \ - _mm256_xor_si256( M[ 0], H[ 0] ) ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ) - -#define Wb7 \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[12], H[12] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ) - -#define Wb8 \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) - -#define Wb9 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ) - -#define Wb10 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - 
_mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \ - _mm256_xor_si256( M[ 1], H[ 1] ) ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) - -#define Wb11 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \ - _mm256_xor_si256( M[ 0], H[ 0] ) ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ) - -#define Wb12 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ) - -#define Wb13 \ - _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ) - -#define Wb14 \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_xor_si256( M[12], H[12] ) ) - -#define Wb15 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \ - _mm256_xor_si256( M[ 4], H[4] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ) - -void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) -{ - __m256i qt[32], xl, xh; - - qt[ 0] = _mm256_add_epi64( sb0( Wb0 ), H[ 1] ); - qt[ 1] = _mm256_add_epi64( sb1( Wb1 ), H[ 2] ); - qt[ 2] = _mm256_add_epi64( sb2( Wb2 ), H[ 3] ); - qt[ 3] = 
_mm256_add_epi64( sb3( Wb3 ), H[ 4] ); - qt[ 4] = _mm256_add_epi64( sb4( Wb4 ), H[ 5] ); - qt[ 5] = _mm256_add_epi64( sb0( Wb5 ), H[ 6] ); - qt[ 6] = _mm256_add_epi64( sb1( Wb6 ), H[ 7] ); - qt[ 7] = _mm256_add_epi64( sb2( Wb7 ), H[ 8] ); - qt[ 8] = _mm256_add_epi64( sb3( Wb8 ), H[ 9] ); - qt[ 9] = _mm256_add_epi64( sb4( Wb9 ), H[10] ); - qt[10] = _mm256_add_epi64( sb0( Wb10), H[11] ); - qt[11] = _mm256_add_epi64( sb1( Wb11), H[12] ); - qt[12] = _mm256_add_epi64( sb2( Wb12), H[13] ); - qt[13] = _mm256_add_epi64( sb3( Wb13), H[14] ); - qt[14] = _mm256_add_epi64( sb4( Wb14), H[15] ); - qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] ); - qt[16] = expand1b( qt, M, H, 16 ); - qt[17] = expand1b( qt, M, H, 17 ); - qt[18] = expand2b( qt, M, H, 18 ); - qt[19] = expand2b( qt, M, H, 19 ); - qt[20] = expand2b( qt, M, H, 20 ); - qt[21] = expand2b( qt, M, H, 21 ); - qt[22] = expand2b( qt, M, H, 22 ); - qt[23] = expand2b( qt, M, H, 23 ); - qt[24] = expand2b( qt, M, H, 24 ); - qt[25] = expand2b( qt, M, H, 25 ); - qt[26] = expand2b( qt, M, H, 26 ); - qt[27] = expand2b( qt, M, H, 27 ); - qt[28] = expand2b( qt, M, H, 28 ); - qt[29] = expand2b( qt, M, H, 29 ); - qt[30] = expand2b( qt, M, H, 30 ); - qt[31] = expand2b( qt, M, H, 31 ); - - xl = _mm256_xor_si256( - mm256_xor4( qt[16], qt[17], qt[18], qt[19] ), - mm256_xor4( qt[20], qt[21], qt[22], qt[23] ) ); - xh = _mm256_xor_si256( xl, _mm256_xor_si256( - mm256_xor4( qt[24], qt[25], qt[26], qt[27] ), - mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); - - dH[ 0] = _mm256_add_epi64( - _mm256_xor_si256( M[0], - _mm256_xor_si256( _mm256_slli_epi64( xh, 5 ), - _mm256_srli_epi64( qt[16], 5 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ) ); - dH[ 1] = _mm256_add_epi64( - _mm256_xor_si256( M[1], - _mm256_xor_si256( _mm256_srli_epi64( xh, 7 ), - _mm256_slli_epi64( qt[17], 8 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ) ); - dH[ 2] = _mm256_add_epi64( - _mm256_xor_si256( M[2], - _mm256_xor_si256( 
_mm256_srli_epi64( xh, 5 ), - _mm256_slli_epi64( qt[18], 5 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ) ); - dH[ 3] = _mm256_add_epi64( - _mm256_xor_si256( M[3], - _mm256_xor_si256( _mm256_srli_epi64( xh, 1 ), - _mm256_slli_epi64( qt[19], 5 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ) ); - dH[ 4] = _mm256_add_epi64( - _mm256_xor_si256( M[4], - _mm256_xor_si256( _mm256_srli_epi64( xh, 3 ), - _mm256_slli_epi64( qt[20], 0 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ) ); - dH[ 5] = _mm256_add_epi64( - _mm256_xor_si256( M[5], - _mm256_xor_si256( _mm256_slli_epi64( xh, 6 ), - _mm256_srli_epi64( qt[21], 6 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ) ); - dH[ 6] = _mm256_add_epi64( - _mm256_xor_si256( M[6], - _mm256_xor_si256( _mm256_srli_epi64( xh, 4 ), - _mm256_slli_epi64( qt[22], 6 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ) ); - dH[ 7] = _mm256_add_epi64( - _mm256_xor_si256( M[7], - _mm256_xor_si256( _mm256_srli_epi64( xh, 11 ), - _mm256_slli_epi64( qt[23], 2 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ) ); - dH[ 8] = _mm256_add_epi64( _mm256_add_epi64( - mm256_rol_64( dH[4], 9 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )), - _mm256_xor_si256( _mm256_slli_epi64( xl, 8 ), - _mm256_xor_si256( qt[23], qt[ 8] ) ) ); - dH[ 9] = _mm256_add_epi64( _mm256_add_epi64( - mm256_rol_64( dH[5], 10 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )), - _mm256_xor_si256( _mm256_srli_epi64( xl, 6 ), - _mm256_xor_si256( qt[16], qt[ 9] ) ) ); - dH[10] = _mm256_add_epi64( _mm256_add_epi64( - mm256_rol_64( dH[6], 11 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )), - _mm256_xor_si256( _mm256_slli_epi64( xl, 6 ), - _mm256_xor_si256( qt[17], qt[10] ) ) ); - dH[11] = _mm256_add_epi64( _mm256_add_epi64( - mm256_rol_64( dH[7], 12 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )), - _mm256_xor_si256( 
_mm256_slli_epi64( xl, 4 ), - _mm256_xor_si256( qt[18], qt[11] ) ) ); - dH[12] = _mm256_add_epi64( _mm256_add_epi64( - mm256_rol_64( dH[0], 13 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )), - _mm256_xor_si256( _mm256_srli_epi64( xl, 3 ), - _mm256_xor_si256( qt[19], qt[12] ) ) ); - dH[13] = _mm256_add_epi64( _mm256_add_epi64( - mm256_rol_64( dH[1], 14 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )), - _mm256_xor_si256( _mm256_srli_epi64( xl, 4 ), - _mm256_xor_si256( qt[20], qt[13] ) ) ); - dH[14] = _mm256_add_epi64( _mm256_add_epi64( - mm256_rol_64( dH[2], 15 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )), - _mm256_xor_si256( _mm256_srli_epi64( xl, 7 ), - _mm256_xor_si256( qt[21], qt[14] ) ) ); - dH[15] = _mm256_add_epi64( _mm256_add_epi64( - mm256_rol_64( dH[3], 16 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )), - _mm256_xor_si256( _mm256_srli_epi64( xl, 2 ), - _mm256_xor_si256( qt[22], qt[15] ) ) ); -} - -static const __m256i final_b[16] = -{ - { 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0, - 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0 }, - { 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1, - 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1 }, - { 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2, - 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2 }, - { 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3, - 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3 }, - { 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4, - 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4 }, - { 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5, - 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5 }, - { 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6, - 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6 }, - { 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7, - 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7 }, - { 0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8, - 0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8 }, - { 0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9, - 0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9 }, - { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa, - 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa }, - { 
0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab, - 0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab }, - { 0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac, - 0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac }, - { 0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad, - 0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad }, - { 0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae, - 0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae }, - { 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf, - 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf } -}; - -static void -bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv ) -{ - for ( int i = 0; i < 16; i++ ) - sc->H[i] = _mm256_set1_epi64x( iv[i] ); - sc->ptr = 0; - sc->bit_count = 0; -} - -static void -bmw64_4way( bmw_4way_big_context *sc, const void *data, size_t len ) -{ - __m256i *vdata = (__m256i*)data; - __m256i *buf; - __m256i htmp[16]; - __m256i *h1, *h2; - size_t ptr; - const int buf_size = 128; // bytes of one lane, compatible with len - - sc->bit_count += (sph_u64)len << 3; - buf = sc->buf; - ptr = sc->ptr; - h1 = sc->H; - h2 = htmp; - while ( len > 0 ) - { - size_t clen; - clen = buf_size - ptr; - if ( clen > len ) - clen = len; - memcpy_256( buf + (ptr>>3), vdata, clen >> 3 ); - vdata = vdata + (clen>>3); - len -= clen; - ptr += clen; - if ( ptr == buf_size ) - { - __m256i *ht; - compress_big( buf, h1, h2 ); - ht = h1; - h1 = h2; - h2 = ht; - ptr = 0; - } - } - sc->ptr = ptr; - if ( h1 != sc->H ) - memcpy_256( sc->H, h1, 16 ); -} - -static void -bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n, - void *dst, size_t out_size_w64) -{ - __m256i *buf; - __m256i h1[16], h2[16], *h; - size_t ptr, u, v; - unsigned z; - const int buf_size = 128; // bytes of one lane, compatible with len - - buf = sc->buf; - ptr = sc->ptr; - z = 0x80 >> n; - buf[ ptr>>3 ] = _mm256_set1_epi64x( z ); - ptr += 8; - h = sc->H; - - if ( ptr > (buf_size - 8) ) - { - memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 ); - compress_big( buf, h, h1 ); - ptr = 0; - h = h1; - } - memset_zero_256( buf + (ptr>>3), (buf_size - 8 - 
ptr) >> 3 ); - buf[ (buf_size - 8) >> 3 ] = _mm256_set1_epi64x( sc->bit_count + n ); - compress_big( buf, h, h2 ); - for ( u = 0; u < 16; u ++ ) - buf[u] = h2[u]; - compress_big( buf, final_b, h1 ); - for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++) - casti_m256i(dst,u) = h1[v]; -} - -void -bmw512_4way_init(void *cc) -{ - bmw64_4way_init(cc, IV512); -} - -void -bmw512_4way(void *cc, const void *data, size_t len) -{ - bmw64_4way(cc, data, len); -} - -void -bmw512_4way_close(void *cc, void *dst) -{ - bmw512_4way_addbits_and_close(cc, 0, 0, dst); -} - -void -bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - bmw64_4way_close(cc, ub, n, dst, 8); -} - -#endif // __AVX2__ - -#ifdef __cplusplus -} -#endif - diff --git a/algo/bmw/bmw512.c b/algo/bmw/bmw512.c deleted file mode 100644 index 16620e1..0000000 --- a/algo/bmw/bmw512.c +++ /dev/null @@ -1,53 +0,0 @@ -#include "algo-gate-api.h" - -#include -#include -#include - -#include "sph_bmw.h" - -void bmw512hash(void *state, const void *input) -{ - sph_bmw512_context ctx; - uint32_t hash[32]; - - sph_bmw512_init( &ctx ); - sph_bmw512( &ctx,input, 80 ); - sph_bmw512_close( &ctx, hash ); - - memcpy( state, hash, 32 ); -} - -int scanhash_bmw512( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - //const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - - uint32_t _ALIGN(32) hash64[8]; - uint32_t endiandata[32]; - - for (int i=0; i < 19; i++) - be32enc(&endiandata[i], pdata[i]); - - do { - - pdata[19] = ++n; - be32enc(&endiandata[19], n); - bmw512hash(hash64, endiandata); - if (((hash64[7]&0xFFFFFF00)==0) && - fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done 
= n - first_nonce + 1; - pdata[19] = n; - return 0; -} - diff --git a/algo/bmw/sph_bmw.c b/algo/bmw/sph_bmw.c index a61ac65..6c5a6df 100644 --- a/algo/bmw/sph_bmw.c +++ b/algo/bmw/sph_bmw.c @@ -48,6 +48,8 @@ extern "C"{ #pragma warning (disable: 4146) #endif +#if !defined(__AVX2__) + static const sph_u32 IV224[] = { SPH_C32(0x00010203), SPH_C32(0x04050607), SPH_C32(0x08090A0B), SPH_C32(0x0C0D0E0F), @@ -70,6 +72,8 @@ static const sph_u32 IV256[] = { SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F) }; +#endif // !AVX2 + #if SPH_64 static const sph_u64 IV384[] = { @@ -135,6 +139,8 @@ static const sph_u64 IV512[] = { #define M16_30 14, 15, 1, 2, 5, 8, 9 #define M16_31 15, 16, 2, 3, 6, 9, 10 +#if !defined(__AVX2__) + #define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \ ^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19)) #define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \ @@ -189,6 +195,8 @@ static const sph_u64 IV512[] = { #define expand2s_(qf, mf, hf, i16, ix, iy) \ expand2s_inner LPAR qf, mf, hf, i16, ix, iy) +#endif // !AVX2 + #if SPH_64 #define sb0(x) (((x) >> 1) ^ SPH_T64((x) << 3) \ @@ -291,6 +299,8 @@ static const sph_u64 Kb_tab[] = { tt((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \ op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4))) +#if !defined(__AVX2__) + #define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14) #define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15) #define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15) @@ -407,6 +417,8 @@ static const sph_u64 Kb_tab[] = { #define Qs(j) (qt[j]) +#endif // !AVX2 + #if SPH_64 #define Wb0 MAKE_W(SPH_T64, 5, -, 7, +, 10, +, 13, +, 14) @@ -557,7 +569,6 @@ static const sph_u64 Kb_tab[] = { + ((xl >> 2) ^ qf(22) ^ qf(15))); \ } while (0) -#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH) #if SPH_64 @@ -565,6 +576,10 @@ static const sph_u64 Kb_tab[] = { #endif +#if !defined(__AVX2__) + +#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH) + static void compress_small(const unsigned char *data, const 
sph_u32 h[16], sph_u32 dh[16]) { @@ -711,6 +726,8 @@ bmw32_close(sph_bmw_small_context *sc, unsigned ub, unsigned n, sph_enc32le(out + 4 * u, h1[v]); } +#endif // !AVX2 + #if SPH_64 static void @@ -840,6 +857,8 @@ bmw64_close(sph_bmw_big_context *sc, unsigned ub, unsigned n, #endif +#if !defined(__AVX2__) + /* see sph_bmw.h */ void sph_bmw224_init(void *cc) @@ -898,6 +917,8 @@ sph_bmw256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) // sph_bmw256_init(cc); } +#endif // !AVX2 + #if SPH_64 /* see sph_bmw.h */ diff --git a/algo/bmw/sph_bmw.h b/algo/bmw/sph_bmw.h index b10071a..f53dd27 100644 --- a/algo/bmw/sph_bmw.h +++ b/algo/bmw/sph_bmw.h @@ -77,6 +77,9 @@ extern "C"{ * computation can be cloned by copying the context (e.g. with a simple * memcpy()). */ + +#if !defined(__AVX2__) + typedef struct { #ifndef DOXYGEN_IGNORE unsigned char buf[64]; /* first field, for alignment */ @@ -102,6 +105,8 @@ typedef sph_bmw_small_context sph_bmw224_context; */ typedef sph_bmw_small_context sph_bmw256_context; +#endif // !AVX2 + #if SPH_64 /** @@ -137,6 +142,8 @@ typedef sph_bmw_big_context sph_bmw512_context; #endif +#if !defined(__AVX2__) + /** * Initialize a BMW-224 context. This process performs no memory allocation. * @@ -227,6 +234,8 @@ void sph_bmw256_close(void *cc, void *dst); void sph_bmw256_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); +#endif // !AVX2 + #if SPH_64 /** diff --git a/algo/bmw/sse2/bmw.c b/algo/bmw/sse2/bmw.c deleted file mode 100644 index 51f21cc..0000000 --- a/algo/bmw/sse2/bmw.c +++ /dev/null @@ -1,519 +0,0 @@ -/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */ -/* - * BMW implementation. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include -#include -#include - -#ifdef __cplusplus -extern "C"{ -#endif - -#include "../sph_bmw.h" - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -static const sph_u64 bmwIV512[] = { - SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F), - SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F), - SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF), - SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF), - SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF), - SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF), - SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF), - SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF) -}; - -#define XCAT(x, y) XCAT_(x, y) -#define XCAT_(x, y) x ## y - -#define LPAR ( - -#define I16_16 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -#define I16_17 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 -#define I16_18 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 -#define I16_19 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18 -#define I16_20 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 -#define I16_21 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 -#define I16_22 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 -#define I16_23 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22 -#define I16_24 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 -#define I16_25 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 -#define I16_26 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 -#define I16_27 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 -#define I16_28 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 -#define I16_29 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28 -#define I16_30 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 
27, 28, 29 -#define I16_31 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 - -#define M16_16 0, 1, 3, 4, 7, 10, 11 -#define M16_17 1, 2, 4, 5, 8, 11, 12 -#define M16_18 2, 3, 5, 6, 9, 12, 13 -#define M16_19 3, 4, 6, 7, 10, 13, 14 -#define M16_20 4, 5, 7, 8, 11, 14, 15 -#define M16_21 5, 6, 8, 9, 12, 15, 16 -#define M16_22 6, 7, 9, 10, 13, 0, 1 -#define M16_23 7, 8, 10, 11, 14, 1, 2 -#define M16_24 8, 9, 11, 12, 15, 2, 3 -#define M16_25 9, 10, 12, 13, 0, 3, 4 -#define M16_26 10, 11, 13, 14, 1, 4, 5 -#define M16_27 11, 12, 14, 15, 2, 5, 6 -#define M16_28 12, 13, 15, 16, 3, 6, 7 -#define M16_29 13, 14, 0, 1, 4, 7, 8 -#define M16_30 14, 15, 1, 2, 5, 8, 9 -#define M16_31 15, 16, 2, 3, 6, 9, 10 - -#define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \ - ^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19)) -#define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \ - ^ SPH_ROTL32(x, 8) ^ SPH_ROTL32(x, 23)) -#define ss2(x) (((x) >> 2) ^ SPH_T32((x) << 1) \ - ^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25)) -#define ss3(x) (((x) >> 2) ^ SPH_T32((x) << 2) \ - ^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29)) -#define ss4(x) (((x) >> 1) ^ (x)) -#define ss5(x) (((x) >> 2) ^ (x)) -#define rs1(x) SPH_ROTL32(x, 3) -#define rs2(x) SPH_ROTL32(x, 7) -#define rs3(x) SPH_ROTL32(x, 13) -#define rs4(x) SPH_ROTL32(x, 16) -#define rs5(x) SPH_ROTL32(x, 19) -#define rs6(x) SPH_ROTL32(x, 23) -#define rs7(x) SPH_ROTL32(x, 27) - -#define Ks(j) SPH_T32((sph_u32)(j) * SPH_C32(0x05555555)) - -#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \ - (SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \ - - SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m)) - -#define expand1s_inner(qf, mf, hf, i16, \ - i0, i1, i2, i3, i4, i5, i6, i7, i8, \ - i9, i10, i11, i12, i13, i14, i15, \ - i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ - SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \ - + ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \ - + ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \ - + 
ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \ - + add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) - -#define expand1s(qf, mf, hf, i16) \ - expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) -#define expand1s_(qf, mf, hf, i16, ix, iy) \ - expand1s_inner LPAR qf, mf, hf, i16, ix, iy) - -#define expand2s_inner(qf, mf, hf, i16, \ - i0, i1, i2, i3, i4, i5, i6, i7, i8, \ - i9, i10, i11, i12, i13, i14, i15, \ - i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ - SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \ - + qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \ - + qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \ - + qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \ - + add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) - -#define expand2s(qf, mf, hf, i16) \ - expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) -#define expand2s_(qf, mf, hf, i16, ix, iy) \ - expand2s_inner LPAR qf, mf, hf, i16, ix, iy) - -#if SPH_64 - -#define sb0(x) (((x) >> 1) ^ SPH_T64((x) << 3) \ - ^ SPH_ROTL64(x, 4) ^ SPH_ROTL64(x, 37)) -#define sb1(x) (((x) >> 1) ^ SPH_T64((x) << 2) \ - ^ SPH_ROTL64(x, 13) ^ SPH_ROTL64(x, 43)) -#define sb2(x) (((x) >> 2) ^ SPH_T64((x) << 1) \ - ^ SPH_ROTL64(x, 19) ^ SPH_ROTL64(x, 53)) -#define sb3(x) (((x) >> 2) ^ SPH_T64((x) << 2) \ - ^ SPH_ROTL64(x, 28) ^ SPH_ROTL64(x, 59)) -#define sb4(x) (((x) >> 1) ^ (x)) -#define sb5(x) (((x) >> 2) ^ (x)) -#define rb1(x) SPH_ROTL64(x, 5) -#define rb2(x) SPH_ROTL64(x, 11) -#define rb3(x) SPH_ROTL64(x, 27) -#define rb4(x) SPH_ROTL64(x, 32) -#define rb5(x) SPH_ROTL64(x, 37) -#define rb6(x) SPH_ROTL64(x, 43) -#define rb7(x) SPH_ROTL64(x, 53) - -#define Kb(j) SPH_T64((sph_u64)(j) * SPH_C64(0x0555555555555555)) - -#if 0 - -static const sph_u64 Kb_tab[] = { - Kb(16), Kb(17), Kb(18), Kb(19), Kb(20), Kb(21), Kb(22), Kb(23), - Kb(24), Kb(25), Kb(26), Kb(27), Kb(28), Kb(29), Kb(30), Kb(31) -}; - -#define rol_off(mf, j, off) \ - SPH_ROTL64(mf(((j) + (off)) & 15), (((j) + (off)) & 15) + 1) - -#define 
add_elt_b(mf, hf, j) \ - (SPH_T64(rol_off(mf, j, 0) + rol_off(mf, j, 3) \ - - rol_off(mf, j, 10) + Kb_tab[j]) ^ hf(((j) + 7) & 15)) - -#define expand1b(qf, mf, hf, i) \ - SPH_T64(sb1(qf((i) - 16)) + sb2(qf((i) - 15)) \ - + sb3(qf((i) - 14)) + sb0(qf((i) - 13)) \ - + sb1(qf((i) - 12)) + sb2(qf((i) - 11)) \ - + sb3(qf((i) - 10)) + sb0(qf((i) - 9)) \ - + sb1(qf((i) - 8)) + sb2(qf((i) - 7)) \ - + sb3(qf((i) - 6)) + sb0(qf((i) - 5)) \ - + sb1(qf((i) - 4)) + sb2(qf((i) - 3)) \ - + sb3(qf((i) - 2)) + sb0(qf((i) - 1)) \ - + add_elt_b(mf, hf, (i) - 16)) - -#define expand2b(qf, mf, hf, i) \ - SPH_T64(qf((i) - 16) + rb1(qf((i) - 15)) \ - + qf((i) - 14) + rb2(qf((i) - 13)) \ - + qf((i) - 12) + rb3(qf((i) - 11)) \ - + qf((i) - 10) + rb4(qf((i) - 9)) \ - + qf((i) - 8) + rb5(qf((i) - 7)) \ - + qf((i) - 6) + rb6(qf((i) - 5)) \ - + qf((i) - 4) + rb7(qf((i) - 3)) \ - + sb4(qf((i) - 2)) + sb5(qf((i) - 1)) \ - + add_elt_b(mf, hf, (i) - 16)) - -#else - -#define add_elt_b(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \ - (SPH_T64(SPH_ROTL64(mf(j0m), j1m) + SPH_ROTL64(mf(j3m), j4m) \ - - SPH_ROTL64(mf(j10m), j11m) + Kb(j16)) ^ hf(j7m)) - -#define expand1b_inner(qf, mf, hf, i16, \ - i0, i1, i2, i3, i4, i5, i6, i7, i8, \ - i9, i10, i11, i12, i13, i14, i15, \ - i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ - SPH_T64(sb1(qf(i0)) + sb2(qf(i1)) + sb3(qf(i2)) + sb0(qf(i3)) \ - + sb1(qf(i4)) + sb2(qf(i5)) + sb3(qf(i6)) + sb0(qf(i7)) \ - + sb1(qf(i8)) + sb2(qf(i9)) + sb3(qf(i10)) + sb0(qf(i11)) \ - + sb1(qf(i12)) + sb2(qf(i13)) + sb3(qf(i14)) + sb0(qf(i15)) \ - + add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) - -#define expand1b(qf, mf, hf, i16) \ - expand1b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) -#define expand1b_(qf, mf, hf, i16, ix, iy) \ - expand1b_inner LPAR qf, mf, hf, i16, ix, iy) - -#define expand2b_inner(qf, mf, hf, i16, \ - i0, i1, i2, i3, i4, i5, i6, i7, i8, \ - i9, i10, i11, i12, i13, i14, i15, \ - i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ - SPH_T64(qf(i0) + rb1(qf(i1)) + 
qf(i2) + rb2(qf(i3)) \ - + qf(i4) + rb3(qf(i5)) + qf(i6) + rb4(qf(i7)) \ - + qf(i8) + rb5(qf(i9)) + qf(i10) + rb6(qf(i11)) \ - + qf(i12) + rb7(qf(i13)) + sb4(qf(i14)) + sb5(qf(i15)) \ - + add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) - -#define expand2b(qf, mf, hf, i16) \ - expand2b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) -#define expand2b_(qf, mf, hf, i16, ix, iy) \ - expand2b_inner LPAR qf, mf, hf, i16, ix, iy) - -#endif - -#endif - -#define MAKE_W(tt, i0, op01, i1, op12, i2, op23, i3, op34, i4) \ - tt((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \ - op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4))) - -#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14) -#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15) -#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15) -#define Ws3 MAKE_W(SPH_T32, 0, -, 1, +, 8, -, 10, +, 13) -#define Ws4 MAKE_W(SPH_T32, 1, +, 2, +, 9, -, 11, -, 14) -#define Ws5 MAKE_W(SPH_T32, 3, -, 2, +, 10, -, 12, +, 15) -#define Ws6 MAKE_W(SPH_T32, 4, -, 0, -, 3, -, 11, +, 13) -#define Ws7 MAKE_W(SPH_T32, 1, -, 4, -, 5, -, 12, -, 14) -#define Ws8 MAKE_W(SPH_T32, 2, -, 5, -, 6, +, 13, -, 15) -#define Ws9 MAKE_W(SPH_T32, 0, -, 3, +, 6, -, 7, +, 14) -#define Ws10 MAKE_W(SPH_T32, 8, -, 1, -, 4, -, 7, +, 15) -#define Ws11 MAKE_W(SPH_T32, 8, -, 0, -, 2, -, 5, +, 9) -#define Ws12 MAKE_W(SPH_T32, 1, +, 3, -, 6, -, 9, +, 10) -#define Ws13 MAKE_W(SPH_T32, 2, +, 4, +, 7, +, 10, +, 11) -#define Ws14 MAKE_W(SPH_T32, 3, -, 5, +, 8, -, 11, -, 12) -#define Ws15 MAKE_W(SPH_T32, 12, -, 4, -, 6, -, 9, +, 13) - -#define MAKE_Qas do { \ - qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \ - qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \ - qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \ - qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \ - qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \ - qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \ - qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \ - qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \ - qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \ - qt[ 9] = SPH_T32(ss4(Ws9 ) + 
H(10)); \ - qt[10] = SPH_T32(ss0(Ws10) + H(11)); \ - qt[11] = SPH_T32(ss1(Ws11) + H(12)); \ - qt[12] = SPH_T32(ss2(Ws12) + H(13)); \ - qt[13] = SPH_T32(ss3(Ws13) + H(14)); \ - qt[14] = SPH_T32(ss4(Ws14) + H(15)); \ - qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \ - } while (0) - -#define MAKE_Qbs do { \ - qt[16] = expand1s(Qs, M, H, 16); \ - qt[17] = expand1s(Qs, M, H, 17); \ - qt[18] = expand2s(Qs, M, H, 18); \ - qt[19] = expand2s(Qs, M, H, 19); \ - qt[20] = expand2s(Qs, M, H, 20); \ - qt[21] = expand2s(Qs, M, H, 21); \ - qt[22] = expand2s(Qs, M, H, 22); \ - qt[23] = expand2s(Qs, M, H, 23); \ - qt[24] = expand2s(Qs, M, H, 24); \ - qt[25] = expand2s(Qs, M, H, 25); \ - qt[26] = expand2s(Qs, M, H, 26); \ - qt[27] = expand2s(Qs, M, H, 27); \ - qt[28] = expand2s(Qs, M, H, 28); \ - qt[29] = expand2s(Qs, M, H, 29); \ - qt[30] = expand2s(Qs, M, H, 30); \ - qt[31] = expand2s(Qs, M, H, 31); \ - } while (0) - -#define MAKE_Qs do { \ - MAKE_Qas; \ - MAKE_Qbs; \ - } while (0) - -#define Qs(j) (qt[j]) - -#define Wb0 MAKE_W(SPH_T64, 5, -, 7, +, 10, +, 13, +, 14) -#define Wb1 MAKE_W(SPH_T64, 6, -, 8, +, 11, +, 14, -, 15) -#define Wb2 MAKE_W(SPH_T64, 0, +, 7, +, 9, -, 12, +, 15) -#define Wb3 MAKE_W(SPH_T64, 0, -, 1, +, 8, -, 10, +, 13) -#define Wb4 MAKE_W(SPH_T64, 1, +, 2, +, 9, -, 11, -, 14) -#define Wb5 MAKE_W(SPH_T64, 3, -, 2, +, 10, -, 12, +, 15) -#define Wb6 MAKE_W(SPH_T64, 4, -, 0, -, 3, -, 11, +, 13) -#define Wb7 MAKE_W(SPH_T64, 1, -, 4, -, 5, -, 12, -, 14) -#define Wb8 MAKE_W(SPH_T64, 2, -, 5, -, 6, +, 13, -, 15) -#define Wb9 MAKE_W(SPH_T64, 0, -, 3, +, 6, -, 7, +, 14) -#define Wb10 MAKE_W(SPH_T64, 8, -, 1, -, 4, -, 7, +, 15) -#define Wb11 MAKE_W(SPH_T64, 8, -, 0, -, 2, -, 5, +, 9) -#define Wb12 MAKE_W(SPH_T64, 1, +, 3, -, 6, -, 9, +, 10) -#define Wb13 MAKE_W(SPH_T64, 2, +, 4, +, 7, +, 10, +, 11) -#define Wb14 MAKE_W(SPH_T64, 3, -, 5, +, 8, -, 11, -, 12) -#define Wb15 MAKE_W(SPH_T64, 12, -, 4, -, 6, -, 9, +, 13) - -#define MAKE_Qab do { \ - qt[ 0] = SPH_T64(sb0(Wb0 ) + H( 1)); \ - 
qt[ 1] = SPH_T64(sb1(Wb1 ) + H( 2)); \ - qt[ 2] = SPH_T64(sb2(Wb2 ) + H( 3)); \ - qt[ 3] = SPH_T64(sb3(Wb3 ) + H( 4)); \ - qt[ 4] = SPH_T64(sb4(Wb4 ) + H( 5)); \ - qt[ 5] = SPH_T64(sb0(Wb5 ) + H( 6)); \ - qt[ 6] = SPH_T64(sb1(Wb6 ) + H( 7)); \ - qt[ 7] = SPH_T64(sb2(Wb7 ) + H( 8)); \ - qt[ 8] = SPH_T64(sb3(Wb8 ) + H( 9)); \ - qt[ 9] = SPH_T64(sb4(Wb9 ) + H(10)); \ - qt[10] = SPH_T64(sb0(Wb10) + H(11)); \ - qt[11] = SPH_T64(sb1(Wb11) + H(12)); \ - qt[12] = SPH_T64(sb2(Wb12) + H(13)); \ - qt[13] = SPH_T64(sb3(Wb13) + H(14)); \ - qt[14] = SPH_T64(sb4(Wb14) + H(15)); \ - qt[15] = SPH_T64(sb0(Wb15) + H( 0)); \ - } while (0) - -#define MAKE_Qbb do { \ - qt[16] = expand1b(Qb, M, H, 16); \ - qt[17] = expand1b(Qb, M, H, 17); \ - qt[18] = expand2b(Qb, M, H, 18); \ - qt[19] = expand2b(Qb, M, H, 19); \ - qt[20] = expand2b(Qb, M, H, 20); \ - qt[21] = expand2b(Qb, M, H, 21); \ - qt[22] = expand2b(Qb, M, H, 22); \ - qt[23] = expand2b(Qb, M, H, 23); \ - qt[24] = expand2b(Qb, M, H, 24); \ - qt[25] = expand2b(Qb, M, H, 25); \ - qt[26] = expand2b(Qb, M, H, 26); \ - qt[27] = expand2b(Qb, M, H, 27); \ - qt[28] = expand2b(Qb, M, H, 28); \ - qt[29] = expand2b(Qb, M, H, 29); \ - qt[30] = expand2b(Qb, M, H, 30); \ - qt[31] = expand2b(Qb, M, H, 31); \ - } while (0) - -#define MAKE_Qb do { \ - MAKE_Qab; \ - MAKE_Qbb; \ - } while (0) - -#define Qb(j) (qt[j]) - -#define FOLD(type, mkQ, tt, rol, mf, qf, dhf) do { \ - type qt[32], xl, xh; \ - mkQ; \ - xl = qf(16) ^ qf(17) ^ qf(18) ^ qf(19) \ - ^ qf(20) ^ qf(21) ^ qf(22) ^ qf(23); \ - xh = xl ^ qf(24) ^ qf(25) ^ qf(26) ^ qf(27) \ - ^ qf(28) ^ qf(29) ^ qf(30) ^ qf(31); \ - dhf( 0) = tt(((xh << 5) ^ (qf(16) >> 5) ^ mf( 0)) \ - + (xl ^ qf(24) ^ qf( 0))); \ - dhf( 1) = tt(((xh >> 7) ^ (qf(17) << 8) ^ mf( 1)) \ - + (xl ^ qf(25) ^ qf( 1))); \ - dhf( 2) = tt(((xh >> 5) ^ (qf(18) << 5) ^ mf( 2)) \ - + (xl ^ qf(26) ^ qf( 2))); \ - dhf( 3) = tt(((xh >> 1) ^ (qf(19) << 5) ^ mf( 3)) \ - + (xl ^ qf(27) ^ qf( 3))); \ - dhf( 4) = tt(((xh >> 3) ^ (qf(20) << 0) ^ 
mf( 4)) \ - + (xl ^ qf(28) ^ qf( 4))); \ - dhf( 5) = tt(((xh << 6) ^ (qf(21) >> 6) ^ mf( 5)) \ - + (xl ^ qf(29) ^ qf( 5))); \ - dhf( 6) = tt(((xh >> 4) ^ (qf(22) << 6) ^ mf( 6)) \ - + (xl ^ qf(30) ^ qf( 6))); \ - dhf( 7) = tt(((xh >> 11) ^ (qf(23) << 2) ^ mf( 7)) \ - + (xl ^ qf(31) ^ qf( 7))); \ - dhf( 8) = tt(rol(dhf(4), 9) + (xh ^ qf(24) ^ mf( 8)) \ - + ((xl << 8) ^ qf(23) ^ qf( 8))); \ - dhf( 9) = tt(rol(dhf(5), 10) + (xh ^ qf(25) ^ mf( 9)) \ - + ((xl >> 6) ^ qf(16) ^ qf( 9))); \ - dhf(10) = tt(rol(dhf(6), 11) + (xh ^ qf(26) ^ mf(10)) \ - + ((xl << 6) ^ qf(17) ^ qf(10))); \ - dhf(11) = tt(rol(dhf(7), 12) + (xh ^ qf(27) ^ mf(11)) \ - + ((xl << 4) ^ qf(18) ^ qf(11))); \ - dhf(12) = tt(rol(dhf(0), 13) + (xh ^ qf(28) ^ mf(12)) \ - + ((xl >> 3) ^ qf(19) ^ qf(12))); \ - dhf(13) = tt(rol(dhf(1), 14) + (xh ^ qf(29) ^ mf(13)) \ - + ((xl >> 4) ^ qf(20) ^ qf(13))); \ - dhf(14) = tt(rol(dhf(2), 15) + (xh ^ qf(30) ^ mf(14)) \ - + ((xl >> 7) ^ qf(21) ^ qf(14))); \ - dhf(15) = tt(rol(dhf(3), 16) + (xh ^ qf(31) ^ mf(15)) \ - + ((xl >> 2) ^ qf(22) ^ qf(15))); \ - } while (0) - -#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH) - -#define FOLDb FOLD(sph_u64, MAKE_Qb, SPH_T64, SPH_ROTL64, M, Qb, dH) - -#define DECL_BMW \ - sph_u64 bmwH[16]; \ - -/* load initial constants */ -#define BMW_I \ -do { \ - memcpy(bmwH, bmwIV512, sizeof bmwH); \ - hashptr = 0; \ - hashctA = 0; \ -} while (0) - -/* load hash for loop */ -#define BMW_U \ -do { \ - const void *data = hash; \ - size_t len = 64; \ - unsigned char *buf; \ - \ - hashctA += (sph_u64)len << 3; \ - buf = hashbuf; \ - memcpy(buf, data, 64); \ - hashptr = 64; \ -} while (0) - - -/* bmw512 hash loaded */ -/* hash = blake512(loaded) */ -#define BMW_C \ -do { \ - void *dst = hash; \ - size_t out_size_w64 = 8; \ - unsigned char *data; \ - sph_u64 *dh; \ - unsigned char *out; \ - size_t ptr, u, v; \ - unsigned z; \ - sph_u64 h1[16], h2[16], *h; \ - data = hashbuf; \ - ptr = hashptr; \ - z = 0x80 >> 0; \ - data[ptr ++] 
= ((0 & -z) | z) & 0xFF; \ - memset(data + ptr, 0, (sizeof(char)*128) - 8 - ptr); \ - sph_enc64le_aligned(data + (sizeof(char)*128) - 8, \ - SPH_T64(hashctA + 0)); \ - /* for break loop */ \ - /* one copy of inline FOLD */ \ - /* FOLD uses, */ \ - /* uint64 *h, data */ \ - /* uint64 dh, state */ \ - h = bmwH; \ - dh = h2; \ - for (;;) { \ - FOLDb; \ - /* dh gets changed for 2nd run */ \ - if (dh == h1) break; \ - for (u = 0; u < 16; u ++) \ - sph_enc64le_aligned(data + 8 * u, h2[u]); \ - dh = h1; \ - h = (sph_u64*)final_b; \ - } \ - /* end wrapped for break loop */ \ - out = dst; \ - for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++) \ - sph_enc64le(out + 8 * u, h1[v]); \ -} while (0) - -/* -static void -compress_big(const unsigned char *data, const sph_u64 h[16], sph_u64 dh[16]) -{ - -#define M(x) sph_dec64le_aligned(data + 8 * (x)) -#define H(x) (h[x]) -#define dH(x) (dh[x]) - - FOLDb; - -#undef M -#undef H -#undef dH -} -*/ - -static const sph_u64 final_b[16] = { - SPH_C64(0xaaaaaaaaaaaaaaa0), SPH_C64(0xaaaaaaaaaaaaaaa1), - SPH_C64(0xaaaaaaaaaaaaaaa2), SPH_C64(0xaaaaaaaaaaaaaaa3), - SPH_C64(0xaaaaaaaaaaaaaaa4), SPH_C64(0xaaaaaaaaaaaaaaa5), - SPH_C64(0xaaaaaaaaaaaaaaa6), SPH_C64(0xaaaaaaaaaaaaaaa7), - SPH_C64(0xaaaaaaaaaaaaaaa8), SPH_C64(0xaaaaaaaaaaaaaaa9), - SPH_C64(0xaaaaaaaaaaaaaaaa), SPH_C64(0xaaaaaaaaaaaaaaab), - SPH_C64(0xaaaaaaaaaaaaaaac), SPH_C64(0xaaaaaaaaaaaaaaad), - SPH_C64(0xaaaaaaaaaaaaaaae), SPH_C64(0xaaaaaaaaaaaaaaaf) -}; - - -#ifdef __cplusplus -} -#endif diff --git a/algo/bmw/sse2/sph_bmw.h b/algo/bmw/sse2/sph_bmw.h deleted file mode 100644 index e63961b..0000000 --- a/algo/bmw/sse2/sph_bmw.h +++ /dev/null @@ -1,61 +0,0 @@ -/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */ -/** - * BMW interface. BMW (aka "Blue Midnight Wish") is a family of - * functions which differ by their output size; this implementation - * defines BMW for output sizes 224, 256, 384 and 512 bits. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @file sph_bmw.h - * @author Thomas Pornin - */ - -#ifndef SPH_BMW_H__ -#define SPH_BMW_H__ - -#ifdef __cplusplus -extern "C"{ -#endif - -#include -#include "sph_types.h" - -#define SPH_SIZE_bmw512 512 - -typedef struct { -#ifndef DOXYGEN_IGNORE - sph_u64 bmwH[16]; -#endif -} sph_bmw_big_context; - -typedef sph_bmw_big_context sph_bmw512_context; - - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/algo/cubehash/cube-hash-2way.c b/algo/cubehash/cube-hash-2way.c deleted file mode 100644 index 37df7ee..0000000 --- a/algo/cubehash/cube-hash-2way.c +++ /dev/null @@ -1,212 +0,0 @@ -#if defined(__AVX2__) - -#include -#include -#include -#include "cube-hash-2way.h" - -// 2x128 - -/* -// The result of hashing 10 rounds of initial data which consists of params -// zero padded. -static const uint64_t IV256[] = -{ -0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131, -0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00, -0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD, -0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF -}; - -static const uint64_t IV512[] = -{ -0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E, -0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33, -0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934, -0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246 -}; -*/ - -static void transform_2way( cube_2way_context *sp ) -{ - int r; - const int rounds = sp->rounds; - - __m256i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1; - - x0 = _mm256_load_si256( (__m256i*)sp->h ); - x1 = _mm256_load_si256( (__m256i*)sp->h + 1 ); - x2 = _mm256_load_si256( (__m256i*)sp->h + 2 ); - x3 = _mm256_load_si256( (__m256i*)sp->h + 3 ); - x4 = _mm256_load_si256( (__m256i*)sp->h + 4 ); - 
x5 = _mm256_load_si256( (__m256i*)sp->h + 5 ); - x6 = _mm256_load_si256( (__m256i*)sp->h + 6 ); - x7 = _mm256_load_si256( (__m256i*)sp->h + 7 ); - - for ( r = 0; r < rounds; ++r ) - { - x4 = _mm256_add_epi32( x0, x4 ); - x5 = _mm256_add_epi32( x1, x5 ); - x6 = _mm256_add_epi32( x2, x6 ); - x7 = _mm256_add_epi32( x3, x7 ); - y0 = x0; - y1 = x1; - x0 = mm256_rol_32( x2, 7 ); - x1 = mm256_rol_32( x3, 7 ); - x2 = mm256_rol_32( y0, 7 ); - x3 = mm256_rol_32( y1, 7 ); - x0 = _mm256_xor_si256( x0, x4 ); - x1 = _mm256_xor_si256( x1, x5 ); - x2 = _mm256_xor_si256( x2, x6 ); - x3 = _mm256_xor_si256( x3, x7 ); - x4 = mm256_swap64_128( x4 ); - x5 = mm256_swap64_128( x5 ); - x6 = mm256_swap64_128( x6 ); - x7 = mm256_swap64_128( x7 ); - x4 = _mm256_add_epi32( x0, x4 ); - x5 = _mm256_add_epi32( x1, x5 ); - x6 = _mm256_add_epi32( x2, x6 ); - x7 = _mm256_add_epi32( x3, x7 ); - y0 = x0; - y1 = x2; - x0 = mm256_rol_32( x1, 11 ); - x1 = mm256_rol_32( y0, 11 ); - x2 = mm256_rol_32( x3, 11 ); - x3 = mm256_rol_32( y1, 11 ); - x0 = _mm256_xor_si256( x0, x4 ); - x1 = _mm256_xor_si256( x1, x5 ); - x2 = _mm256_xor_si256( x2, x6 ); - x3 = _mm256_xor_si256( x3, x7 ); - x4 = mm256_swap32_64( x4 ); - x5 = mm256_swap32_64( x5 ); - x6 = mm256_swap32_64( x6 ); - x7 = mm256_swap32_64( x7 ); - } - - _mm256_store_si256( (__m256i*)sp->h, x0 ); - _mm256_store_si256( (__m256i*)sp->h + 1, x1 ); - _mm256_store_si256( (__m256i*)sp->h + 2, x2 ); - _mm256_store_si256( (__m256i*)sp->h + 3, x3 ); - _mm256_store_si256( (__m256i*)sp->h + 4, x4 ); - _mm256_store_si256( (__m256i*)sp->h + 5, x5 ); - _mm256_store_si256( (__m256i*)sp->h + 6, x6 ); - _mm256_store_si256( (__m256i*)sp->h + 7, x7 ); - -} - -int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds, - int blockbytes ) -{ - __m128i* h = (__m128i*)sp->h; - sp->hashlen = hashbitlen/128; - sp->blocksize = blockbytes/16; - sp->rounds = rounds; - sp->pos = 0; - - if ( hashbitlen == 512 ) - { - - h[ 0] = m128_const_64( 0x4167D83E2D538B8B, 
0x50F494D42AEA2A61 ); - h[ 2] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 ); - h[ 4] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 ); - h[ 6] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 ); - h[ 8] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 ); - h[10] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 ); - h[12] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B ); - h[14] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 ); - h[1] = h[ 0]; h[ 3] = h[ 2]; h[ 5] = h[ 4]; h[ 7] = h[ 6]; - h[9] = h[ 8]; h[11] = h[10]; h[13] = h[12]; h[15] = h[14]; - } - else - { - h[ 0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 ); - h[ 2] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B ); - h[ 4] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 ); - h[ 6] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 ); - h[ 8] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 ); - h[10] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 ); - h[12] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE ); - h[14] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB ); - h[1] = h[ 0]; h[ 3] = h[ 2]; h[ 5] = h[ 4]; h[ 7] = h[ 6]; - h[9] = h[ 8]; h[11] = h[10]; h[13] = h[12]; h[15] = h[14]; - } - - return 0; -} - - -int cube_2way_update( cube_2way_context *sp, const void *data, size_t size ) -{ - const int len = size >> 4; - const __m256i *in = (__m256i*)data; - int i; - - // It is assumed data is aligned to 256 bits and is a multiple of 128 bits. - // Current usage sata is either 64 or 80 bytes. - - for ( i = 0; i < len; i++ ) - { - sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] ); - sp->pos++; - if ( sp->pos == sp->blocksize ) - { - transform_2way( sp ); - sp->pos = 0; - } - } - return 0; -} - -int cube_2way_close( cube_2way_context *sp, void *output ) -{ - __m256i *hash = (__m256i*)output; - int i; - - // pos is zero for 64 byte data, 1 for 80 byte data. 
- sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], - _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ) ); - transform_2way( sp ); - - sp->h[7] = _mm256_xor_si256( sp->h[7], - _mm256_set_epi32( 1,0,0,0, 1,0,0,0 ) ); - - for ( i = 0; i < 10; ++i ) transform_2way( sp ); - - memcpy( hash, sp->h, sp->hashlen<<5 ); - return 0; -} - -int cube_2way_update_close( cube_2way_context *sp, void *output, - const void *data, size_t size ) -{ - const int len = size >> 4; - const __m256i *in = (__m256i*)data; - __m256i *hash = (__m256i*)output; - int i; - - for ( i = 0; i < len; i++ ) - { - sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] ); - sp->pos++; - if ( sp->pos == sp->blocksize ) - { - transform_2way( sp ); - sp->pos = 0; - } - } - - // pos is zero for 64 byte data, 1 for 80 byte data. - sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], - _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ) ); - transform_2way( sp ); - - sp->h[7] = _mm256_xor_si256( sp->h[7], _mm256_set_epi32( 1,0,0,0, - 1,0,0,0 ) ); - - for ( i = 0; i < 10; ++i ) transform_2way( sp ); - - memcpy( hash, sp->h, sp->hashlen<<5 ); - return 0; -} - -#endif diff --git a/algo/cubehash/cube-hash-2way.h b/algo/cubehash/cube-hash-2way.h deleted file mode 100644 index 1da565f..0000000 --- a/algo/cubehash/cube-hash-2way.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef CUBE_HASH_2WAY_H__ -#define CUBE_HASH_2WAY_H__ - -#if defined(__AVX2__) - -#include -#include "simd-utils.h" - -// 2x128, 2 way parallel SSE2 - -struct _cube_2way_context -{ - __m256i h[8]; - int hashlen; // __m128i - int rounds; - int blocksize; // __m128i - int pos; // number of __m128i read into x from current block -} __attribute__ ((aligned (64))); - -typedef struct _cube_2way_context cube_2way_context; - -int cube_2way_init( cube_2way_context* sp, int hashbitlen, int rounds, - int blockbytes ); -// reinitialize context with same parameters, much faster. 
-int cube_2way_reinit( cube_2way_context *sp ); - -int cube_2way_update( cube_2way_context *sp, const void *data, size_t size ); - -int cube_2way_close( cube_2way_context *sp, void *output ); - -int cube_2way_update_close( cube_2way_context *sp, void *output, - const void *data, size_t size ); - - -#endif -#endif diff --git a/algo/cubehash/cubehash_sse2.c b/algo/cubehash/cubehash_sse2.c deleted file mode 100644 index 7f6591f..0000000 --- a/algo/cubehash/cubehash_sse2.c +++ /dev/null @@ -1,281 +0,0 @@ -/* CubeHash 16/32 is recommended for SHA-3 "normal", 16/1 for "formal" */ -#define CUBEHASH_ROUNDS 16 -#define CUBEHASH_BLOCKBYTES 32 -#define OPTIMIZE_SSE2 -#if defined(OPTIMIZE_SSE2) -#include -#endif -#ifdef __AVX2__ -#include -#endif -#include "cubehash_sse2.h" -#include "algo/sha/sha3-defs.h" -#include -#include -#include -#include "simd-utils.h" -#include - -static void transform( cubehashParam *sp ) -{ - int r; - const int rounds = sp->rounds; - -#ifdef __AVX2__ - - register __m256i x0, x1, x2, x3, y0, y1; - - x0 = _mm256_load_si256( (__m256i*)sp->x ); - x1 = _mm256_load_si256( (__m256i*)sp->x + 1 ); - x2 = _mm256_load_si256( (__m256i*)sp->x + 2 ); - x3 = _mm256_load_si256( (__m256i*)sp->x + 3 ); - - for ( r = 0; r < rounds; ++r ) - { - x2 = _mm256_add_epi32( x0, x2 ); - x3 = _mm256_add_epi32( x1, x3 ); - y0 = x0; - x0 = mm256_rol_32( x1, 7 ); - x1 = mm256_rol_32( y0, 7 ); - x0 = _mm256_xor_si256( x0, x2 ); - x1 = _mm256_xor_si256( x1, x3 ); - x2 = mm256_swap64_128( x2 ); - x3 = mm256_swap64_128( x3 ); - x2 = _mm256_add_epi32( x0, x2 ); - x3 = _mm256_add_epi32( x1, x3 ); - y0 = mm256_swap_128( x0 ); - y1 = mm256_swap_128( x1 ); - x0 = mm256_rol_32( y0, 11 ); - x1 = mm256_rol_32( y1, 11 ); - x0 = _mm256_xor_si256( x0, x2 ); - x1 = _mm256_xor_si256( x1, x3 ); - x2 = mm256_swap32_64( x2 ); - x3 = mm256_swap32_64( x3 ); - } - - _mm256_store_si256( (__m256i*)sp->x, x0 ); - _mm256_store_si256( (__m256i*)sp->x + 1, x1 ); - _mm256_store_si256( (__m256i*)sp->x + 2, x2 
); - _mm256_store_si256( (__m256i*)sp->x + 3, x3 ); - -#else - __m128i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3; - - x0 = _mm_load_si128( (__m128i*)sp->x ); - x1 = _mm_load_si128( (__m128i*)sp->x + 1 ); - x2 = _mm_load_si128( (__m128i*)sp->x + 2 ); - x3 = _mm_load_si128( (__m128i*)sp->x + 3 ); - x4 = _mm_load_si128( (__m128i*)sp->x + 4 ); - x5 = _mm_load_si128( (__m128i*)sp->x + 5 ); - x6 = _mm_load_si128( (__m128i*)sp->x + 6 ); - x7 = _mm_load_si128( (__m128i*)sp->x + 7 ); - - for (r = 0; r < rounds; ++r) { - x4 = _mm_add_epi32(x0, x4); - x5 = _mm_add_epi32(x1, x5); - x6 = _mm_add_epi32(x2, x6); - x7 = _mm_add_epi32(x3, x7); - y0 = x2; - y1 = x3; - y2 = x0; - y3 = x1; - x0 = _mm_xor_si128(_mm_slli_epi32(y0, 7), _mm_srli_epi32(y0, 25)); - x1 = _mm_xor_si128(_mm_slli_epi32(y1, 7), _mm_srli_epi32(y1, 25)); - x2 = _mm_xor_si128(_mm_slli_epi32(y2, 7), _mm_srli_epi32(y2, 25)); - x3 = _mm_xor_si128(_mm_slli_epi32(y3, 7), _mm_srli_epi32(y3, 25)); - x0 = _mm_xor_si128(x0, x4); - x1 = _mm_xor_si128(x1, x5); - x2 = _mm_xor_si128(x2, x6); - x3 = _mm_xor_si128(x3, x7); - x4 = _mm_shuffle_epi32(x4, 0x4e); - x5 = _mm_shuffle_epi32(x5, 0x4e); - x6 = _mm_shuffle_epi32(x6, 0x4e); - x7 = _mm_shuffle_epi32(x7, 0x4e); - x4 = _mm_add_epi32(x0, x4); - x5 = _mm_add_epi32(x1, x5); - x6 = _mm_add_epi32(x2, x6); - x7 = _mm_add_epi32(x3, x7); - y0 = x1; - y1 = x0; - y2 = x3; - y3 = x2; - x0 = _mm_xor_si128(_mm_slli_epi32(y0, 11), _mm_srli_epi32(y0, 21)); - x1 = _mm_xor_si128(_mm_slli_epi32(y1, 11), _mm_srli_epi32(y1, 21)); - x2 = _mm_xor_si128(_mm_slli_epi32(y2, 11), _mm_srli_epi32(y2, 21)); - x3 = _mm_xor_si128(_mm_slli_epi32(y3, 11), _mm_srli_epi32(y3, 21)); - x0 = _mm_xor_si128(x0, x4); - x1 = _mm_xor_si128(x1, x5); - x2 = _mm_xor_si128(x2, x6); - x3 = _mm_xor_si128(x3, x7); - x4 = _mm_shuffle_epi32(x4, 0xb1); - x5 = _mm_shuffle_epi32(x5, 0xb1); - x6 = _mm_shuffle_epi32(x6, 0xb1); - x7 = _mm_shuffle_epi32(x7, 0xb1); - } - - _mm_store_si128( (__m128i*)sp->x, x0 ); - _mm_store_si128( 
(__m128i*)sp->x + 1, x1 ); - _mm_store_si128( (__m128i*)sp->x + 2, x2 ); - _mm_store_si128( (__m128i*)sp->x + 3, x3 ); - _mm_store_si128( (__m128i*)sp->x + 4, x4 ); - _mm_store_si128( (__m128i*)sp->x + 5, x5 ); - _mm_store_si128( (__m128i*)sp->x + 6, x6 ); - _mm_store_si128( (__m128i*)sp->x + 7, x7 ); - -#endif -} // transform - -/* -// The result of hashing 10 rounds of initial data which is params and -// mostly zeros. -static const uint64_t IV256[] = -{ -0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131, -0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00, -0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD, -0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF -}; - -static const uint64_t IV512[] = -{ -0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E, -0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33, -0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934, -0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246 -}; -*/ - -int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes) -{ - __m128i *x = (__m128i*)sp->x; - sp->hashlen = hashbitlen/128; - sp->blocksize = blockbytes/16; - sp->rounds = rounds; - sp->pos = 0; - - if ( hashbitlen == 512 ) - { - - x[0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 ); - x[1] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 ); - x[2] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 ); - x[3] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 ); - x[4] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 ); - x[5] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 ); - x[6] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B ); - x[7] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 ); - } - else - { - x[0] = m128_const_64( 0x35481EAE63117E71, 
0xCCD6F29FEA2BD4B4 ); - x[1] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B ); - x[2] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 ); - x[3] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 ); - x[4] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 ); - x[5] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 ); - x[6] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE ); - x[7] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB ); - } - - return SUCCESS; -} - -int cubehashUpdate( cubehashParam *sp, const byte *data, size_t size ) -{ - const int len = size / 16; - const __m128i* in = (__m128i*)data; - int i; - - // It is assumed data is aligned to 256 bits and is a multiple of 128 bits. - // Current usage sata is either 64 or 80 bytes. - - for ( i = 0; i < len; i++ ) - { - sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] ); - sp->pos++; - if ( sp->pos == sp->blocksize ) - { - transform( sp ); - sp->pos = 0; - } - } - - return SUCCESS; -} - -int cubehashDigest( cubehashParam *sp, byte *digest ) -{ - __m128i* hash = (__m128i*)digest; - int i; - - // pos is zero for 64 byte data, 1 for 80 byte data. - sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], - _mm_set_epi8( 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0x80 ) ); - transform( sp ); - - sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) ); - transform( sp ); - transform( sp ); - transform( sp ); - transform( sp ); - transform( sp ); - transform( sp ); - transform( sp ); - transform( sp ); - transform( sp ); - transform( sp ); - - for ( i = 0; i < sp->hashlen; i++ ) - hash[i] = sp->x[i]; - - return SUCCESS; -} - -int cubehashUpdateDigest( cubehashParam *sp, byte *digest, - const byte *data, size_t size ) -{ - const int len = size / 16; - const __m128i* in = (__m128i*)data; - __m128i* hash = (__m128i*)digest; - int i; - - // It is assumed data is aligned to 256 bits and is a multiple of 128 bits. - // Current usage sata is either 64 or 80 bytes. 
- - for ( i = 0; i < len; i++ ) - { - sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] ); - sp->pos++; - if ( sp->pos == sp->blocksize ) - { - transform( sp ); - sp->pos = 0; - } - } - - // pos is zero for 64 byte data, 1 for 80 byte data. - sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], - _mm_set_epi8( 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0x80 ) ); - transform( sp ); - - sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) ); - - transform( sp ); - transform( sp ); - transform( sp ); - transform( sp ); - transform( sp ); - transform( sp ); - transform( sp ); - transform( sp ); - transform( sp ); - transform( sp ); - - for ( i = 0; i < sp->hashlen; i++ ) - hash[i] = sp->x[i]; - - return SUCCESS; -} - diff --git a/algo/cubehash/cubehash_sse2.h b/algo/cubehash/cubehash_sse2.h deleted file mode 100644 index 4e1eaa3..0000000 --- a/algo/cubehash/cubehash_sse2.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef CUBEHASH_SSE2_H__ -#define CUBEHASH_SSE2_H__ - -#include "compat.h" -#include -#include "algo/sha/sha3-defs.h" - -#define OPTIMIZE_SSE2 - -#include - -/*!\brief Holds all the parameters necessary for the CUBEHASH algorithm. - * \ingroup HASH_cubehash_m - */ - -struct _cubehashParam -{ - int hashlen; // __m128i - int rounds; - int blocksize; // __m128i - int pos; // number of __m128i read into x from current block - __m128i _ALIGN(256) x[8]; // aligned for __m256i -}; - -typedef struct _cubehashParam cubehashParam; - -#ifdef __cplusplus -extern "C" { -#endif - -int cubehashInit(cubehashParam* sp, int hashbitlen, int rounds, int blockbytes); -// reinitialize context with same parameters, much faster. 
-int cubehashReinit( cubehashParam* sp ); - -int cubehashUpdate(cubehashParam* sp, const byte *data, size_t size); - -int cubehashDigest(cubehashParam* sp, byte *digest); - -int cubehashUpdateDigest( cubehashParam *sp, byte *digest, const byte *data, - size_t size ); - -#ifdef __cplusplus -} -#endif - -#endif /* H_CUBEHASH */ diff --git a/algo/echo/aes_ni/hash.c b/algo/echo/aes_ni/hash.c index 7dd48e4..a4e3958 100644 --- a/algo/echo/aes_ni/hash.c +++ b/algo/echo/aes_ni/hash.c @@ -7,7 +7,6 @@ * - implements NIST hash api * - assumes that message lenght is multiple of 8-bits * - _ECHO_VPERM_ must be defined if compiling with ../main.c - * - define NO_AES_NI for aes_ni version * * Cagdas Calik * ccalik@metu.edu.tr @@ -21,13 +20,7 @@ #include "hash_api.h" //#include "vperm.h" #include -/* -#ifndef NO_AES_NI -#include -#else -#include -#endif -*/ +#include "simd-utils.h" MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F}; MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC}; @@ -62,8 +55,8 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 #define ECHO_SUBBYTES(state, i, j) \ state[i][j] = _mm_aesenc_si128(state[i][j], k1);\ - state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero));\ - k1 = _mm_add_epi32(k1, M128(const1)) + k1 = _mm_add_epi32(k1, M128(const1));\ + state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero)) #define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \ s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\ @@ -179,53 +172,53 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc for(b = 0; b < uBlockCount; b++) { - ctx->k = _mm_add_epi64(ctx->k, ctx->const1536); + ctx->k = _mm_add_epi64(ctx->k, ctx->const1536); - // load message - for(j = ctx->uHashSize / 256; j < 4; j++) - { - for(i = 0; i < 4; i++) + // load message + for(j = ctx->uHashSize / 256; j < 4; j++) { - _state[i][j] = 
_mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i); + for(i = 0; i < 4; i++) + { + _state[i][j] = _mm_load_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i); + } } - } - // save state - SAVESTATE(_statebackup, _state); + // save state + SAVESTATE(_statebackup, _state); - k1 = ctx->k; + k1 = ctx->k; - for(r = 0; r < ctx->uRounds / 2; r++) - { - ECHO_ROUND_UNROLL2; - } + for(r = 0; r < ctx->uRounds / 2; r++) + { + ECHO_ROUND_UNROLL2; + } - if(ctx->uHashSize == 256) - { - for(i = 0; i < 4; i++) + if(ctx->uHashSize == 256) { - _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]); - _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]); - _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]); - _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]); - _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]); - _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]); - _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]); + for(i = 0; i < 4; i++) + { + _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]); + _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]); + _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]); + } } - } - else - { - for(i = 0; i < 4; i++) - { - _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]); - _state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]); - _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]); - _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]); - _state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]); - _state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]); - } - } - pmsg += ctx->uBlockLength; + else + { + for(i = 0; i < 4; i++) + { + 
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]); + _state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]); + _state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]); + _state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]); + } + } + pmsg += ctx->uBlockLength; } SAVESTATE(ctx->state, _state); @@ -390,13 +383,13 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval) } // Store the hash value - _mm_storeu_si128((__m128i*)hashval + 0, state->state[0][0]); - _mm_storeu_si128((__m128i*)hashval + 1, state->state[1][0]); + _mm_store_si128((__m128i*)hashval + 0, state->state[0][0]); + _mm_store_si128((__m128i*)hashval + 1, state->state[1][0]); if(state->uHashSize == 512) { - _mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]); - _mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]); + _mm_store_si128((__m128i*)hashval + 2, state->state[2][0]); + _mm_store_si128((__m128i*)hashval + 3, state->state[3][0]); } return SUCCESS; @@ -513,19 +506,178 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval, } // Store the hash value - _mm_storeu_si128( (__m128i*)hashval + 0, state->state[0][0] ); - _mm_storeu_si128( (__m128i*)hashval + 1, state->state[1][0] ); + _mm_store_si128( (__m128i*)hashval + 0, state->state[0][0] ); + _mm_store_si128( (__m128i*)hashval + 1, state->state[1][0] ); + + if( state->uHashSize == 512 ) + { + _mm_store_si128( (__m128i*)hashval + 2, state->state[2][0] ); + _mm_store_si128( (__m128i*)hashval + 3, state->state[3][0] ); + + } + return SUCCESS; +} + +HashReturn echo_full( hashState_echo *state, BitSequence *hashval, + int nHashSize, const BitSequence *data, DataLength datalen ) +{ + int i, j; + + state->k = m128_zero; + state->processed_bits = 0; + state->uBufferBytes = 0; + + switch( nHashSize ) + { + case 256: + state->uHashSize = 256; + 
state->uBlockLength = 192; + state->uRounds = 8; + state->hashsize = m128_const_64( 0, 0x100 ); + state->const1536 = m128_const_64( 0, 0x600 ); + break; + + case 512: + state->uHashSize = 512; + state->uBlockLength = 128; + state->uRounds = 10; + state->hashsize = m128_const_64( 0, 0x200 ); + state->const1536 = m128_const_64( 0, 0x400 ); + break; + + default: + return BAD_HASHBITLEN; + } + + for(i = 0; i < 4; i++) + for(j = 0; j < nHashSize / 256; j++) + state->state[i][j] = state->hashsize; + + for(i = 0; i < 4; i++) + for(j = nHashSize / 256; j < 4; j++) + state->state[i][j] = m128_zero; + + + unsigned int uBlockCount, uRemainingBytes; + + if( (state->uBufferBytes + datalen) >= state->uBlockLength ) + { + if( state->uBufferBytes != 0 ) + { + // Fill the buffer + memcpy( state->buffer + state->uBufferBytes, + (void*)data, state->uBlockLength - state->uBufferBytes ); + + // Process buffer + Compress( state, state->buffer, 1 ); + state->processed_bits += state->uBlockLength * 8; + + data += state->uBlockLength - state->uBufferBytes; + datalen -= state->uBlockLength - state->uBufferBytes; + } + + // buffer now does not contain any unprocessed bytes + + uBlockCount = datalen / state->uBlockLength; + uRemainingBytes = datalen % state->uBlockLength; + + if( uBlockCount > 0 ) + { + Compress( state, data, uBlockCount ); + state->processed_bits += uBlockCount * state->uBlockLength * 8; + data += uBlockCount * state->uBlockLength; + } + + if( uRemainingBytes > 0 ) + memcpy(state->buffer, (void*)data, uRemainingBytes); + + state->uBufferBytes = uRemainingBytes; + } + else + { + memcpy( state->buffer + state->uBufferBytes, (void*)data, datalen ); + state->uBufferBytes += datalen; + } + + __m128i remainingbits; + + // Add remaining bytes in the buffer + state->processed_bits += state->uBufferBytes * 8; + + remainingbits = _mm_set_epi32( 0, 0, 0, state->uBufferBytes * 8 ); + + // Pad with 0x80 + state->buffer[state->uBufferBytes++] = 0x80; + // Enough buffer space for padding 
in this block? + if( (state->uBlockLength - state->uBufferBytes) >= 18 ) + { + // Pad with zeros + memset( state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18) ); + + // Hash size + *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = state->uHashSize; + + // Processed bits + *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) = + state->processed_bits; + *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0; + + // Last block contains message bits? + if( state->uBufferBytes == 1 ) + { + state->k = _mm_xor_si128( state->k, state->k ); + state->k = _mm_sub_epi64( state->k, state->const1536 ); + } + else + { + state->k = _mm_add_epi64( state->k, remainingbits ); + state->k = _mm_sub_epi64( state->k, state->const1536 ); + } + + // Compress + Compress( state, state->buffer, 1 ); + } + else + { + // Fill with zero and compress + memset( state->buffer + state->uBufferBytes, 0, + state->uBlockLength - state->uBufferBytes ); + state->k = _mm_add_epi64( state->k, remainingbits ); + state->k = _mm_sub_epi64( state->k, state->const1536 ); + Compress( state, state->buffer, 1 ); + + // Last block + memset( state->buffer, 0, state->uBlockLength - 18 ); + + // Hash size + *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = + state->uHashSize; + + // Processed bits + *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) = + state->processed_bits; + *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0; + // Compress the last block + state->k = _mm_xor_si128( state->k, state->k ); + state->k = _mm_sub_epi64( state->k, state->const1536 ); + Compress( state, state->buffer, 1) ; + } + + // Store the hash value + _mm_store_si128( (__m128i*)hashval + 0, state->state[0][0] ); + _mm_store_si128( (__m128i*)hashval + 1, state->state[1][0] ); if( state->uHashSize == 512 ) { - _mm_storeu_si128( (__m128i*)hashval + 2, state->state[2][0] ); - _mm_storeu_si128( (__m128i*)hashval + 3, 
state->state[3][0] ); + _mm_store_si128( (__m128i*)hashval + 2, state->state[2][0] ); + _mm_store_si128( (__m128i*)hashval + 3, state->state[3][0] ); } return SUCCESS; } + HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval) { HashReturn hRet; diff --git a/algo/echo/aes_ni/hash_api.h b/algo/echo/aes_ni/hash_api.h index 01e5598..a550088 100644 --- a/algo/echo/aes_ni/hash_api.h +++ b/algo/echo/aes_ni/hash_api.h @@ -15,7 +15,7 @@ #ifndef HASH_API_H #define HASH_API_H -#ifndef NO_AES_NI +#ifdef __AES__ #define HASH_IMPL_STR "ECHO-aesni" #else #define HASH_IMPL_STR "ECHO-vperm" @@ -55,6 +55,8 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval, const BitSequence *data, DataLength databitlen ); +HashReturn echo_full( hashState_echo *state, BitSequence *hashval, + int nHashSize, const BitSequence *data, DataLength databitlen ); #endif // HASH_API_H diff --git a/algo/echo/sph_echo.ch b/algo/echo/sph_echo.ch new file mode 100644 index 0000000..ad5441e --- /dev/null +++ b/algo/echo/sph_echo.ch @@ -0,0 +1,320 @@ +/* $Id: sph_echo.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * ECHO interface. ECHO is a family of functions which differ by + * their output size; this implementation defines ECHO for output + * sizes 224, 256, 384 and 512 bits. 
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_echo.h + * @author Thomas Pornin + */ + +#ifndef SPH_ECHO_H__ +#define SPH_ECHO_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "algo/sha/sph_types.h" + +/** + * Output size (in bits) for ECHO-224. + */ +#define SPH_SIZE_echo224 224 + +/** + * Output size (in bits) for ECHO-256. + */ +#define SPH_SIZE_echo256 256 + +/** + * Output size (in bits) for ECHO-384. + */ +#define SPH_SIZE_echo384 384 + +/** + * Output size (in bits) for ECHO-512. + */ +#define SPH_SIZE_echo512 512 + +/** + * This structure is a context for ECHO computations: it contains the + * intermediate values and some data from the last entered block. 
Once + * an ECHO computation has been performed, the context can be reused for + * another computation. This specific structure is used for ECHO-224 + * and ECHO-256. + * + * The contents of this structure are private. A running ECHO computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[192]; /* first field, for alignment */ + size_t ptr; + union { + sph_u32 Vs[4][4]; +#if SPH_64 + sph_u64 Vb[4][2]; +#endif + } u; + sph_u32 C0, C1, C2, C3; +#endif +} sph_echo_small_context; + +/** + * This structure is a context for ECHO computations: it contains the + * intermediate values and some data from the last entered block. Once + * an ECHO computation has been performed, the context can be reused for + * another computation. This specific structure is used for ECHO-384 + * and ECHO-512. + * + * The contents of this structure are private. A running ECHO computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + union { + sph_u32 Vs[8][4]; +#if SPH_64 + sph_u64 Vb[8][2]; +#endif + } u; + sph_u32 C0, C1, C2, C3; +#endif +} sph_echo_big_context; + +/** + * Type for a ECHO-224 context (identical to the common "small" context). + */ +typedef sph_echo_small_context sph_echo224_context; + +/** + * Type for a ECHO-256 context (identical to the common "small" context). + */ +typedef sph_echo_small_context sph_echo256_context; + +/** + * Type for a ECHO-384 context (identical to the common "big" context). + */ +typedef sph_echo_big_context sph_echo384_context; + +/** + * Type for a ECHO-512 context (identical to the common "big" context). + */ +typedef sph_echo_big_context sph_echo512_context; + +/** + * Initialize an ECHO-224 context. This process performs no memory allocation. 
+ * + * @param cc the ECHO-224 context (pointer to a + * sph_echo224_context) + */ +void sph_echo224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the ECHO-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_echo224(void *cc, const void *data, size_t len); + +/** + * Terminate the current ECHO-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the ECHO-224 context + * @param dst the destination buffer + */ +void sph_echo224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the ECHO-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_echo224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an ECHO-256 context. This process performs no memory allocation. + * + * @param cc the ECHO-256 context (pointer to a + * sph_echo256_context) + */ +void sph_echo256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). 
+ * + * @param cc the ECHO-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_echo256(void *cc, const void *data, size_t len); + +/** + * Terminate the current ECHO-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the ECHO-256 context + * @param dst the destination buffer + */ +void sph_echo256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the ECHO-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_echo256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an ECHO-384 context. This process performs no memory allocation. + * + * @param cc the ECHO-384 context (pointer to a + * sph_echo384_context) + */ +void sph_echo384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the ECHO-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_echo384(void *cc, const void *data, size_t len); + +/** + * Terminate the current ECHO-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. 
+ * + * @param cc the ECHO-384 context + * @param dst the destination buffer + */ +void sph_echo384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the ECHO-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_echo384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an ECHO-512 context. This process performs no memory allocation. + * + * @param cc the ECHO-512 context (pointer to a + * sph_echo512_context) + */ +void sph_echo512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the ECHO-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_echo512(void *cc, const void *data, size_t len); + +/** + * Terminate the current ECHO-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the ECHO-512 context + * @param dst the destination buffer + */ +void sph_echo512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). 
If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the ECHO-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_echo512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/algo/echo/sph_echo.h b/algo/echo/sph_echo.h index ad5441e..1247655 100644 --- a/algo/echo/sph_echo.h +++ b/algo/echo/sph_echo.h @@ -7,7 +7,7 @@ * ==========================(LICENSE BEGIN)============================ * * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * + * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including @@ -15,10 +15,10 @@ * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: - * + * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. @@ -37,31 +37,31 @@ #define SPH_ECHO_H__ #ifdef __cplusplus -extern "C"{ +extern "C" { #endif -#include #include "algo/sha/sph_types.h" +#include /** * Output size (in bits) for ECHO-224. */ -#define SPH_SIZE_echo224 224 +#define SPH_SIZE_echo224 224 /** * Output size (in bits) for ECHO-256. */ -#define SPH_SIZE_echo256 256 +#define SPH_SIZE_echo256 256 /** * Output size (in bits) for ECHO-384. 
*/ -#define SPH_SIZE_echo384 384 +#define SPH_SIZE_echo384 384 /** * Output size (in bits) for ECHO-512. */ -#define SPH_SIZE_echo512 512 +#define SPH_SIZE_echo512 512 /** * This structure is a context for ECHO computations: it contains the @@ -76,15 +76,15 @@ extern "C"{ */ typedef struct { #ifndef DOXYGEN_IGNORE - unsigned char buf[192]; /* first field, for alignment */ - size_t ptr; - union { - sph_u32 Vs[4][4]; + unsigned char buf[192]; /* first field, for alignment */ + size_t ptr; + union { + sph_u32 Vs[4][4]; #if SPH_64 - sph_u64 Vb[4][2]; + sph_u64 Vb[4][2]; #endif - } u; - sph_u32 C0, C1, C2, C3; + } u; + sph_u32 C0, C1, C2, C3; #endif } sph_echo_small_context; @@ -101,15 +101,15 @@ typedef struct { */ typedef struct { #ifndef DOXYGEN_IGNORE - unsigned char buf[128]; /* first field, for alignment */ - size_t ptr; - union { - sph_u32 Vs[8][4]; + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + union { + sph_u32 Vs[8][4]; #if SPH_64 - sph_u64 Vb[8][2]; + sph_u64 Vb[8][2]; #endif - } u; - sph_u32 C0, C1, C2, C3; + } u; + sph_u32 C0, C1, C2, C3; #endif } sph_echo_big_context; @@ -175,8 +175,8 @@ void sph_echo224_close(void *cc, void *dst); * @param n the number of extra bits (0 to 7) * @param dst the destination buffer */ -void sph_echo224_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); +void sph_echo224_addbits_and_close(void *cc, unsigned ub, unsigned n, + void *dst); /** * Initialize an ECHO-256 context. This process performs no memory allocation. @@ -220,8 +220,8 @@ void sph_echo256_close(void *cc, void *dst); * @param n the number of extra bits (0 to 7) * @param dst the destination buffer */ -void sph_echo256_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); +void sph_echo256_addbits_and_close(void *cc, unsigned ub, unsigned n, + void *dst); /** * Initialize an ECHO-384 context. This process performs no memory allocation. 
@@ -265,8 +265,8 @@ void sph_echo384_close(void *cc, void *dst); * @param n the number of extra bits (0 to 7) * @param dst the destination buffer */ -void sph_echo384_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); +void sph_echo384_addbits_and_close(void *cc, unsigned ub, unsigned n, + void *dst); /** * Initialize an ECHO-512 context. This process performs no memory allocation. @@ -310,11 +310,10 @@ void sph_echo512_close(void *cc, void *dst); * @param n the number of extra bits (0 to 7) * @param dst the destination buffer */ -void sph_echo512_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - +void sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, + void *dst); + #ifdef __cplusplus } #endif - #endif diff --git a/algo/fugue/sph_fugue.h b/algo/fugue/sph_fugue.h index d8d0ea0..08d4dde 100644 --- a/algo/fugue/sph_fugue.h +++ b/algo/fugue/sph_fugue.h @@ -74,6 +74,14 @@ void sph_fugue512_close(void *cc, void *dst); void sph_fugue512_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); +#define sph_fugue512_full( cc, dst, data, len ) \ +do{ \ + sph_fugue512_init( cc ); \ + sph_fugue512( cc, data, len ); \ + sph_fugue512_close( cc, dst ); \ +}while(0) + + #ifdef __cplusplus } #endif diff --git a/algo/gost/sph_gost.c b/algo/gost/sph_gost.c deleted file mode 100644 index 629a79c..0000000 --- a/algo/gost/sph_gost.c +++ /dev/null @@ -1,1045 +0,0 @@ -/* GOST hash function for sib algo SibCoin */ - -#include -#include -#include -#include - -#include "sph_gost.h" - -#ifdef __cplusplus -extern "C"{ -#endif - - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -//-------------------------------------------------------------------------------------------- -// -// stribog implementation -// -//-------------------------------------------------------------------------------------------- - - -// Tables for function F -static const sph_u64 TG[8][256] = {{ - 
0xE6F87E5C5B711FD0,0x258377800924FA16,0xC849E07E852EA4A8,0x5B4686A18F06C16A, - 0x0B32E9A2D77B416E,0xABDA37A467815C66,0xF61796A81A686676,0xF5DC0B706391954B, - 0x4862F38DB7E64BF1,0xFF5C629A68BD85C5,0xCB827DA6FCD75795,0x66D36DAF69B9F089, - 0x356C9F74483D83B0,0x7CBCECB1238C99A1,0x36A702AC31C4708D,0x9EB6A8D02FBCDFD6, - 0x8B19FA51E5B3AE37,0x9CCFB5408A127D0B,0xBC0C78B508208F5A,0xE533E3842288ECED, - 0xCEC2C7D377C15FD2,0xEC7817B6505D0F5E,0xB94CC2C08336871D,0x8C205DB4CB0B04AD, - 0x763C855B28A0892F,0x588D1B79F6FF3257,0x3FECF69E4311933E,0x0FC0D39F803A18C9, - 0xEE010A26F5F3AD83,0x10EFE8F4411979A6,0x5DCDA10C7DE93A10,0x4A1BEE1D1248E92C, - 0x53BFF2DB21847339,0xB4F50CCFA6A23D09,0x5FB4BC9CD84798CD,0xE88A2D8B071C56F9, - 0x7F7771695A756A9C,0xC5F02E71A0BA1EBC,0xA663F9AB4215E672,0x2EB19E22DE5FBB78, - 0x0DB9CE0F2594BA14,0x82520E6397664D84,0x2F031E6A0208EA98,0x5C7F2144A1BE6BF0, - 0x7A37CB1CD16362DB,0x83E08E2B4B311C64,0xCF70479BAB960E32,0x856BA986B9DEE71E, - 0xB5478C877AF56CE9,0xB8FE42885F61D6FD,0x1BDD0156966238C8,0x622157923EF8A92E, - 0xFC97FF42114476F8,0x9D7D350856452CEB,0x4C90C9B0E0A71256,0x2308502DFBCB016C, - 0x2D7A03FAA7A64845,0xF46E8B38BFC6C4AB,0xBDBEF8FDD477DEBA,0x3AAC4CEBC8079B79, - 0xF09CB105E8879D0C,0x27FA6A10AC8A58CB,0x8960E7C1401D0CEA,0x1A6F811E4A356928, - 0x90C4FB0773D196FF,0x43501A2F609D0A9F,0xF7A516E0C63F3796,0x1CE4A6B3B8DA9252, - 0x1324752C38E08A9B,0xA5A864733BEC154F,0x2BF124575549B33F,0xD766DB15440DC5C7, - 0xA7D179E39E42B792,0xDADF151A61997FD3,0x86A0345EC0271423,0x38D5517B6DA939A4, - 0x6518F077104003B4,0x02791D90A5AEA2DD,0x88D267899C4A5D0A,0x930F66DF0A2865C2, - 0x4EE9D4204509B08B,0x325538916685292A,0x412907BFC533A842,0xB27E2B62544DC673, - 0x6C5304456295E007,0x5AF406E95351908A,0x1F2F3B6BC123616F,0xC37B09DC5255E5C6, - 0x3967D133B1FE6844,0x298839C7F0E711E2,0x409B87F71964F9A2,0xE938ADC3DB4B0719, - 0x0C0B4E47F9C3EBF4,0x5534D576D36B8843,0x4610A05AEB8B02D8,0x20C3CDF58232F251, - 0x6DE1840DBEC2B1E7,0xA0E8DE06B0FA1D08,0x7B854B540D34333B,0x42E29A67BCCA5B7F, - 
0xD8A6088AC437DD0E,0xC63BB3A9D943ED81,0x21714DBD5E65A3B1,0x6761EDE7B5EEA169, - 0x2431F7C8D573ABF6,0xD51FC685E1A3671A,0x5E063CD40410C92D,0x283AB98F2CB04002, - 0x8FEBC06CB2F2F790,0x17D64F116FA1D33C,0xE07359F1A99EE4AA,0x784ED68C74CDC006, - 0x6E2A19D5C73B42DA,0x8712B4161C7045C3,0x371582E4ED93216D,0xACE390414939F6FC, - 0x7EC5F12186223B7C,0xC0B094042BAC16FB,0xF9D745379A527EBF,0x737C3F2EA3B68168, - 0x33E7B8D9BAD278CA,0xA9A32A34C22FFEBB,0xE48163CCFEDFBD0D,0x8E5940246EA5A670, - 0x51C6EF4B842AD1E4,0x22BAD065279C508C,0xD91488C218608CEE,0x319EA5491F7CDA17, - 0xD394E128134C9C60,0x094BF43272D5E3B3,0x9BF612A5A4AAD791,0xCCBBDA43D26FFD0F, - 0x34DE1F3C946AD250,0x4F5B5468995EE16B,0xDF9FAF6FEA8F7794,0x2648EA5870DD092B, - 0xBFC7E56D71D97C67,0xDDE6B2FF4F21D549,0x3C276B463AE86003,0x91767B4FAF86C71F, - 0x68A13E7835D4B9A0,0xB68C115F030C9FD4,0x141DD2C916582001,0x983D8F7DDD5324AC, - 0x64AA703FCC175254,0xC2C989948E02B426,0x3E5E76D69F46C2DE,0x50746F03587D8004, - 0x45DB3D829272F1E5,0x60584A029B560BF3,0xFBAE58A73FFCDC62,0xA15A5E4E6CAD4CE8, - 0x4BA96E55CE1FB8CC,0x08F9747AAE82B253,0xC102144CF7FB471B,0x9F042898F3EB8E36, - 0x068B27ADF2EFFB7A,0xEDCA97FE8C0A5EBE,0x778E0513F4F7D8CF,0x302C2501C32B8BF7, - 0x8D92DDFC175C554D,0xF865C57F46052F5F,0xEAF3301BA2B2F424,0xAA68B7ECBBD60D86, - 0x998F0F350104754C,0x0000000000000000,0xF12E314D34D0CCEC,0x710522BE061823B5, - 0xAF280D9930C005C1,0x97FD5CE25D693C65,0x19A41CC633CC9A15,0x95844172F8C79EB8, - 0xDC5432B7937684A9,0x9436C13A2490CF58,0x802B13F332C8EF59,0xC442AE397CED4F5C, - 0xFA1CD8EFE3AB8D82,0xF2E5AC954D293FD1,0x6AD823E8907A1B7D,0x4D2249F83CF043B6, - 0x03CB9DD879F9F33D,0xDE2D2F2736D82674,0x2A43A41F891EE2DF,0x6F98999D1B6C133A, - 0xD4AD46CD3DF436FA,0xBB35DF50269825C0,0x964FDCAA813E6D85,0xEB41B0537EE5A5C4, - 0x0540BA758B160847,0xA41AE43BE7BB44AF,0xE3B8C429D0671797,0x819993BBEE9FBEB9, - 0xAE9A8DD1EC975421,0xF3572CDD917E6E31,0x6393D7DAE2AFF8CE,0x47A2201237DC5338, - 0xA32343DEC903EE35,0x79FC56C4A89A91E6,0x01B28048DC5751E0,0x1296F564E4B7DB7B, - 
0x75F7188351597A12,0xDB6D9552BDCE2E33,0x1E9DBB231D74308F,0x520D7293FDD322D9, - 0xE20A44610C304677,0xFEEEE2D2B4EAD425,0xCA30FDEE20800675,0x61EACA4A47015A13, - 0xE74AFE1487264E30,0x2CC883B27BF119A5,0x1664CF59B3F682DC,0xA811AA7C1E78AF5B, - 0x1D5626FB648DC3B2,0xB73E9117DF5BCE34,0xD05F7CF06AB56F5D,0xFD257F0ACD132718, - 0x574DC8E676C52A9E,0x0739A7E52EB8AA9A,0x5486553E0F3CD9A3,0x56FF48AEAA927B7E, - 0xBE756525AD8E2D87,0x7D0E6CF9FFDBC841,0x3B1ECCA31450CA99,0x6913BE30E983E840, - 0xAD511009956EA71C,0xB1B5B6BA2DB4354E,0x4469BDCA4E25A005,0x15AF5281CA0F71E1, - 0x744598CB8D0E2BF2,0x593F9B312AA863B7,0xEFB38A6E29A4FC63,0x6B6AA3A04C2D4A9D, - 0x3D95EB0EE6BF31E3,0xA291C3961554BFD5,0x18169C8EEF9BCBF5,0x115D68BC9D4E2846, - 0xBA875F18FACF7420,0xD1EDFCB8B6E23EBD,0xB00736F2F1E364AE,0x84D929CE6589B6FE, - 0x70B7A2F6DA4F7255,0x0E7253D75C6D4929,0x04F23A3D574159A7,0x0A8069EA0B2C108E, - 0x49D073C56BB11A11,0x8AAB7A1939E4FFD7,0xCD095A0B0E38ACEF,0xC9FB60365979F548, - 0x92BDE697D67F3422,0xC78933E10514BC61,0xE1C1D9B975C9B54A,0xD2266160CF1BCD80, - 0x9A4492ED78FD8671,0xB3CCAB2A881A9793,0x72CEBF667FE1D088,0xD6D45B5D985A9427 -},{ - 0xC811A8058C3F55DE,0x65F5B43196B50619,0xF74F96B1D6706E43,0x859D1E8BCB43D336, - 0x5AAB8A85CCFA3D84,0xF9C7BF99C295FCFD,0xA21FD5A1DE4B630F,0xCDB3EF763B8B456D, - 0x803F59F87CF7C385,0xB27C73BE5F31913C,0x98E3AC6633B04821,0xBF61674C26B8F818, - 0x0FFBC995C4C130C8,0xAAA0862010761A98,0x6057F342210116AA,0xF63C760C0654CC35, - 0x2DDB45CC667D9042,0xBCF45A964BD40382,0x68E8A0C3EF3C6F3D,0xA7BD92D269FF73BC, - 0x290AE20201ED2287,0xB7DE34CDE885818F,0xD901EEA7DD61059B,0xD6FA273219A03553, - 0xD56F1AE874CCCEC9,0xEA31245C2E83F554,0x7034555DA07BE499,0xCE26D2AC56E7BEF7, - 0xFD161857A5054E38,0x6A0E7DA4527436D1,0x5BD86A381CDE9FF2,0xCAF7756231770C32, - 0xB09AAED9E279C8D0,0x5DEF1091C60674DB,0x111046A2515E5045,0x23536CE4729802FC, - 0xC50CBCF7F5B63CFA,0x73A16887CD171F03,0x7D2941AFD9F28DBD,0x3F5E3EB45A4F3B9D, - 0x84EEFE361B677140,0x3DB8E3D3E7076271,0x1A3A28F9F20FD248,0x7EBC7C75B49E7627, - 
0x74E5F293C7EB565C,0x18DCF59E4F478BA4,0x0C6EF44FA9ADCB52,0xC699812D98DAC760, - 0x788B06DC6E469D0E,0xFC65F8EA7521EC4E,0x30A5F7219E8E0B55,0x2BEC3F65BCA57B6B, - 0xDDD04969BAF1B75E,0x99904CDBE394EA57,0x14B201D1E6EA40F6,0xBBB0C08241284ADD, - 0x50F20463BF8F1DFF,0xE8D7F93B93CBACB8,0x4D8CB68E477C86E8,0xC1DD1B3992268E3F, - 0x7C5AA11209D62FCB,0x2F3D98ABDB35C9AE,0x671369562BFD5FF5,0x15C1E16C36CEE280, - 0x1D7EB2EDF8F39B17,0xDA94D37DB00DFE01,0x877BC3EC760B8ADA,0xCB8495DFE153AE44, - 0x05A24773B7B410B3,0x12857B783C32ABDF,0x8EB770D06812513B,0x536739B9D2E3E665, - 0x584D57E271B26468,0xD789C78FC9849725,0xA935BBFA7D1AE102,0x8B1537A3DFA64188, - 0xD0CD5D9BC378DE7A,0x4AC82C9A4D80CFB7,0x42777F1B83BDB620,0x72D2883A1D33BD75, - 0x5E7A2D4BAB6A8F41,0xF4DAAB6BBB1C95D9,0x905CFFE7FD8D31B6,0x83AA6422119B381F, - 0xC0AEFB8442022C49,0xA0F908C663033AE3,0xA428AF0804938826,0xADE41C341A8A53C7, - 0xAE7121EE77E6A85D,0xC47F5C4A25929E8C,0xB538E9AA55CDD863,0x06377AA9DAD8EB29, - 0xA18AE87BB3279895,0x6EDFDA6A35E48414,0x6B7D9D19825094A7,0xD41CFA55A4E86CBF, - 0xE5CAEDC9EA42C59C,0xA36C351C0E6FC179,0x5181E4DE6FABBF89,0xFFF0C530184D17D4, - 0x9D41EB1584045892,0x1C0D525028D73961,0xF178EC180CA8856A,0x9A0571018EF811CD, - 0x4091A27C3EF5EFCC,0x19AF15239F6329D2,0x347450EFF91EB990,0xE11B4A078DD27759, - 0xB9561DE5FC601331,0x912F1F5A2DA993C0,0x1654DCB65BA2191A,0x3E2DDE098A6B99EB, - 0x8A66D71E0F82E3FE,0x8C51ADB7D55A08D7,0x4533E50F8941FF7F,0x02E6DD67BD4859EC, - 0xE068AABA5DF6D52F,0xC24826E3FF4A75A5,0x6C39070D88ACDDF8,0x6486548C4691A46F, - 0xD1BEBD26135C7C0C,0xB30F93038F15334A,0x82D9849FC1BF9A69,0x9C320BA85420FAE4, - 0xFA528243AFF90767,0x9ED4D6CFE968A308,0xB825FD582C44B147,0x9B7691BC5EDCB3BB, - 0xC7EA619048FE6516,0x1063A61F817AF233,0x47D538683409A693,0x63C2CE984C6DED30, - 0x2A9FDFD86C81D91D,0x7B1E3B06032A6694,0x666089EBFBD9FD83,0x0A598EE67375207B, - 0x07449A140AFC495F,0x2CA8A571B6593234,0x1F986F8A45BBC2FB,0x381AA4A050B372C2, - 0x5423A3ADD81FAF3A,0x17273C0B8B86BB6C,0xFE83258DC869B5A2,0x287902BFD1C980F1, - 
0xF5A94BD66B3837AF,0x88800A79B2CABA12,0x55504310083B0D4C,0xDF36940E07B9EEB2, - 0x04D1A7CE6790B2C5,0x612413FFF125B4DC,0x26F12B97C52C124F,0x86082351A62F28AC, - 0xEF93632F9937E5E7,0x3507B052293A1BE6,0xE72C30AE570A9C70,0xD3586041AE1425E0, - 0xDE4574B3D79D4CC4,0x92BA228040C5685A,0xF00B0CA5DC8C271C,0xBE1287F1F69C5A6E, - 0xF39E317FB1E0DC86,0x495D114020EC342D,0x699B407E3F18CD4B,0xDCA3A9D46AD51528, - 0x0D1D14F279896924,0x0000000000000000,0x593EB75FA196C61E,0x2E4E78160B116BD8, - 0x6D4AE7B058887F8E,0xE65FD013872E3E06,0x7A6DDBBBD30EC4E2,0xAC97FC89CAAEF1B1, - 0x09CCB33C1E19DBE1,0x89F3EAC462EE1864,0x7770CF49AA87ADC6,0x56C57ECA6557F6D6, - 0x03953DDA6D6CFB9A,0x36928D884456E07C,0x1EEB8F37959F608D,0x31D6179C4EAAA923, - 0x6FAC3AD7E5C02662,0x43049FA653991456,0xABD3669DC052B8EE,0xAF02C153A7C20A2B, - 0x3CCB036E3723C007,0x93C9C23D90E1CA2C,0xC33BC65E2F6ED7D3,0x4CFF56339758249E, - 0xB1E94E64325D6AA6,0x37E16D359472420A,0x79F8E661BE623F78,0x5214D90402C74413, - 0x482EF1FDF0C8965B,0x13F69BC5EC1609A9,0x0E88292814E592BE,0x4E198B542A107D72, - 0xCCC00FCBEBAFE71B,0x1B49C844222B703E,0x2564164DA840E9D5,0x20C6513E1FF4F966, - 0xBAC3203F910CE8AB,0xF2EDD1C261C47EF0,0x814CB945ACD361F3,0x95FEB8944A392105, - 0x5C9CF02C1622D6AD,0x971865F3F77178E9,0xBD87BA2B9BF0A1F4,0x444005B259655D09, - 0xED75BE48247FBC0B,0x7596122E17CFF42A,0xB44B091785E97A15,0x966B854E2755DA9F, - 0xEEE0839249134791,0x32432A4623C652B9,0xA8465B47AD3E4374,0xF8B45F2412B15E8B, - 0x2417F6F078644BA3,0xFB2162FE7FDDA511,0x4BBBCC279DA46DC1,0x0173E0BDD024A276, - 0x22208C59A2BCA08A,0x8FC4906DB836F34D,0xE4B90D743A6667EA,0x7147B5E0705F46EF, - 0x2782CB2A1508B039,0xEC065EF5F45B1E7D,0x21B5B183CFD05B10,0xDBE733C060295C77, - 0x9FA73672394C017E,0xCF55321186C31C81,0xD8720E1A0D45A7ED,0x3B8F997A3DDF8958, - 0x3AFC79C7EDFB2B2E,0xE9A4198643EF0ECE,0x5F09CDF67B4E2D37,0x4F6A6BE9FA34DF04, - 0xB6ADD47038A123F9,0x8D224D0A057EAAA1,0xC96248B85C1BF7A8,0xE3FD9760309A2EB5, - 0x0B2A6E5BA351820D,0xEB42C4E1FEA75722,0x948D58299A1D8373,0x7FCF9CC864BAD451, - 
0xA55B4FB5D4B72A50,0x08BF5381CE3D7997,0x46A6D8D5E42D04E5,0xD22B80FC7E308796, - 0x57B69E77B57354A0,0x3969441D8097D0B4,0x3330CAFBF3E2F0CF,0xE28E77DDE0BE8CC3, - 0x62B12E259C494F46,0xA6CE726FB9DBD1CA,0x41E242C1EED14DBA,0x76032FF47AA30FB0 -},{ - 0x45B268A93ACDE4CC,0xAF7F0BE884549D08,0x048354B3C1468263,0x925435C2C80EFED2, - 0xEE4E37F27FDFFBA7,0x167A33920C60F14D,0xFB123B52EA03E584,0x4A0CAB53FDBB9007, - 0x9DEAF6380F788A19,0xCB48EC558F0CB32A,0xB59DC4B2D6FEF7E0,0xDCDBCA22F4F3ECB6, - 0x11DF5813549A9C40,0xE33FDEDF568ACED3,0xA0C1C8124322E9C3,0x07A56B8158FA6D0D, - 0x77279579B1E1F3DD,0xD9B18B74422AC004,0xB8EC2D9FFFABC294,0xF4ACF8A82D75914F, - 0x7BBF69B1EF2B6878,0xC4F62FAF487AC7E1,0x76CE809CC67E5D0C,0x6711D88F92E4C14C, - 0x627B99D9243DEDFE,0x234AA5C3DFB68B51,0x909B1F15262DBF6D,0x4F66EA054B62BCB5, - 0x1AE2CF5A52AA6AE8,0xBEA053FBD0CE0148,0xED6808C0E66314C9,0x43FE16CD15A82710, - 0xCD049231A06970F6,0xE7BC8A6C97CC4CB0,0x337CE835FCB3B9C0,0x65DEF2587CC780F3, - 0x52214EDE4132BB50,0x95F15E4390F493DF,0x870839625DD2E0F1,0x41313C1AFB8B66AF, - 0x91720AF051B211BC,0x477D427ED4EEA573,0x2E3B4CEEF6E3BE25,0x82627834EB0BCC43, - 0x9C03E3DD78E724C8,0x2877328AD9867DF9,0x14B51945E243B0F2,0x574B0F88F7EB97E2, - 0x88B6FA989AA4943A,0x19C4F068CB168586,0x50EE6409AF11FAEF,0x7DF317D5C04EABA4, - 0x7A567C5498B4C6A9,0xB6BBFB804F42188E,0x3CC22BCF3BC5CD0B,0xD04336EAAA397713, - 0xF02FAC1BEC33132C,0x2506DBA7F0D3488D,0xD7E65D6BF2C31A1E,0x5EB9B2161FF820F5, - 0x842E0650C46E0F9F,0x716BEB1D9E843001,0xA933758CAB315ED4,0x3FE414FDA2792265, - 0x27C9F1701EF00932,0x73A4C1CA70A771BE,0x94184BA6E76B3D0E,0x40D829FF8C14C87E, - 0x0FBEC3FAC77674CB,0x3616A9634A6A9572,0x8F139119C25EF937,0xF545ED4D5AEA3F9E, - 0xE802499650BA387B,0x6437E7BD0B582E22,0xE6559F89E053E261,0x80AD52E305288DFC, - 0x6DC55A23E34B9935,0xDE14E0F51AD0AD09,0xC6390578A659865E,0x96D7617109487CB1, - 0xE2D6CB3A21156002,0x01E915E5779FAED1,0xADB0213F6A77DCB7,0x9880B76EB9A1A6AB, - 0x5D9F8D248644CF9B,0xFD5E4536C5662658,0xF1C6B9FE9BACBDFD,0xEACD6341BE9979C4, - 
0xEFA7221708405576,0x510771ECD88E543E,0xC2BA51CB671F043D,0x0AD482AC71AF5879, - 0xFE787A045CDAC936,0xB238AF338E049AED,0xBD866CC94972EE26,0x615DA6EBBD810290, - 0x3295FDD08B2C1711,0xF834046073BF0AEA,0xF3099329758FFC42,0x1CAEB13E7DCFA934, - 0xBA2307481188832B,0x24EFCE42874CE65C,0x0E57D61FB0E9DA1A,0xB3D1BAD6F99B343C, - 0xC0757B1C893C4582,0x2B510DB8403A9297,0x5C7698C1F1DB614A,0x3E0D0118D5E68CB4, - 0xD60F488E855CB4CF,0xAE961E0DF3CB33D9,0x3A8E55AB14A00ED7,0x42170328623789C1, - 0x838B6DD19C946292,0x895FEF7DED3B3AEB,0xCFCBB8E64E4A3149,0x064C7E642F65C3DC, - 0x3D2B3E2A4C5A63DA,0x5BD3F340A9210C47,0xB474D157A1615931,0xAC5934DA1DE87266, - 0x6EE365117AF7765B,0xC86ED36716B05C44,0x9BA6885C201D49C5,0xB905387A88346C45, - 0x131072C4BAB9DDFF,0xBF49461EA751AF99,0xD52977BC1CE05BA1,0xB0F785E46027DB52, - 0x546D30BA6E57788C,0x305AD707650F56AE,0xC987C682612FF295,0xA5AB8944F5FBC571, - 0x7ED528E759F244CA,0x8DDCBBCE2C7DB888,0xAA154ABE328DB1BA,0x1E619BE993ECE88B, - 0x09F2BD9EE813B717,0x7401AA4B285D1CB3,0x21858F143195CAEE,0x48C381841398D1B8, - 0xFCB750D3B2F98889,0x39A86A998D1CE1B9,0x1F888E0CE473465A,0x7899568376978716, - 0x02CF2AD7EE2341BF,0x85C713B5B3F1A14E,0xFF916FE12B4567E7,0x7C1A0230B7D10575, - 0x0C98FCC85ECA9BA5,0xA3E7F720DA9E06AD,0x6A6031A2BBB1F438,0x973E74947ED7D260, - 0x2CF4663918C0FF9A,0x5F50A7F368678E24,0x34D983B4A449D4CD,0x68AF1B755592B587, - 0x7F3C3D022E6DEA1B,0xABFC5F5B45121F6B,0x0D71E92D29553574,0xDFFDF5106D4F03D8, - 0x081BA87B9F8C19C6,0xDB7EA1A3AC0981BB,0xBBCA12AD66172DFA,0x79704366010829C7, - 0x179326777BFF5F9C,0x0000000000000000,0xEB2476A4C906D715,0x724DD42F0738DF6F, - 0xB752EE6538DDB65F,0x37FFBC863DF53BA3,0x8EFA84FCB5C157E6,0xE9EB5C73272596AA, - 0x1B0BDABF2535C439,0x86E12C872A4D4E20,0x9969A28BCE3E087A,0xFAFB2EB79D9C4B55, - 0x056A4156B6D92CB2,0x5A3AE6A5DEBEA296,0x22A3B026A8292580,0x53C85B3B36AD1581, - 0xB11E900117B87583,0xC51F3A4A3FE56930,0xE019E1EDCF3621BD,0xEC811D2591FCBA18, - 0x445B7D4C4D524A1D,0xA8DA6069DCAEF005,0x58F5CC72309DE329,0xD4C062596B7FF570, - 
0xCE22AD0339D59F98,0x591CD99747024DF8,0x8B90C5AA03187B54,0xF663D27FC356D0F0, - 0xD8589E9135B56ED5,0x35309651D3D67A1C,0x12F96721CD26732E,0xD28C1C3D441A36AC, - 0x492A946164077F69,0x2D1D73DC6F5F514B,0x6F0A70F40D68D88A,0x60B4B30ECA1EAC41, - 0xD36509D83385987D,0x0B3D97490630F6A8,0x9ECCC90A96C46577,0xA20EE2C5AD01A87C, - 0xE49AB55E0E70A3DE,0xA4429CA182646BA0,0xDA97B446DB962F6A,0xCCED87D4D7F6DE27, - 0x2AB8185D37A53C46,0x9F25DCEFE15BCBA6,0xC19C6EF9FEA3EB53,0xA764A3931BD884CE, - 0x2FD2590B817C10F4,0x56A21A6D80743933,0xE573A0BB79EF0D0F,0x155C0CA095DC1E23, - 0x6C2C4FC694D437E4,0x10364DF623053291,0xDD32DFC7836C4267,0x03263F3299BCEF6E, - 0x66F8CD6AE57B6F9D,0x8C35AE2B5BE21659,0x31B3C2E21290F87F,0x93BD2027BF915003, - 0x69460E90220D1B56,0x299E276FAE19D328,0x63928C3C53A2432F,0x7082FEF8E91B9ED0, - 0xBC6F792C3EED40F7,0x4C40D537D2DE53DB,0x75E8BFAE5FC2B262,0x4DA9C0D2A541FD0A, - 0x4E8FFFE03CFD1264,0x2620E495696FA7E3,0xE1F0F408B8A98F6C,0xD1AA230FDDA6D9C2, - 0xC7D0109DD1C6288F,0x8A79D04F7487D585,0x4694579BA3710BA2,0x38417F7CFA834F68, - 0x1D47A4DB0A5007E5,0x206C9AF1460A643F,0xA128DDF734BD4712,0x8144470672B7232D, - 0xF2E086CC02105293,0x182DE58DBC892B57,0xCAA1F9B0F8931DFB,0x6B892447CC2E5AE9, - 0xF9DD11850420A43B,0x4BE5BEB68A243ED6,0x5584255F19C8D65D,0x3B67404E633FA006, - 0xA68DB6766C472A1F,0xF78AC79AB4C97E21,0xC353442E1080AAEC,0x9A4F9DB95782E714 -},{ - 0x05BA7BC82C9B3220,0x31A54665F8B65E4F,0xB1B651F77547F4D4,0x8BFA0D857BA46682, - 0x85A96C5AA16A98BB,0x990FAEF908EB79C9,0xA15E37A247F4A62D,0x76857DCD5D27741E, - 0xF8C50B800A1820BC,0xBE65DCB201F7A2B4,0x666D1B986F9426E7,0x4CC921BF53C4E648, - 0x95410A0F93D9CA42,0x20CDCCAA647BA4EF,0x429A4060890A1871,0x0C4EA4F69B32B38B, - 0xCCDA362DDE354CD3,0x96DC23BC7C5B2FA9,0xC309BB68AA851AB3,0xD26131A73648E013, - 0x021DC52941FC4DB2,0xCD5ADAB7704BE48A,0xA77965D984ED71E6,0x32386FD61734BBA4, - 0xE82D6DD538AB7245,0x5C2147EA6177B4B1,0x5DA1AB70CF091CE8,0xAC907FCE72B8BDFF, - 0x57C85DFD972278A8,0xA4E44C6A6B6F940D,0x3851995B4F1FDFE4,0x62578CCAED71BC9E, - 
0xD9882BB0C01D2C0A,0x917B9D5D113C503B,0xA2C31E11A87643C6,0xE463C923A399C1CE, - 0xF71686C57EA876DC,0x87B4A973E096D509,0xAF0D567D9D3A5814,0xB40C2A3F59DCC6F4, - 0x3602F88495D121DD,0xD3E1DD3D9836484A,0xF945E71AA46688E5,0x7518547EB2A591F5, - 0x9366587450C01D89,0x9EA81018658C065B,0x4F54080CBC4603A3,0x2D0384C65137BF3D, - 0xDC325078EC861E2A,0xEA30A8FC79573FF7,0x214D2030CA050CB6,0x65F0322B8016C30C, - 0x69BE96DD1B247087,0xDB95EE9981E161B8,0xD1FC1814D9CA05F8,0x820ED2BBCC0DE729, - 0x63D76050430F14C7,0x3BCCB0E8A09D3A0F,0x8E40764D573F54A2,0x39D175C1E16177BD, - 0x12F5A37C734F1F4B,0xAB37C12F1FDFC26D,0x5648B167395CD0F1,0x6C04ED1537BF42A7, - 0xED97161D14304065,0x7D6C67DAAB72B807,0xEC17FA87BA4EE83C,0xDFAF79CB0304FBC1, - 0x733F060571BC463E,0x78D61C1287E98A27,0xD07CF48E77B4ADA1,0xB9C262536C90DD26, - 0xE2449B5860801605,0x8FC09AD7F941FCFB,0xFAD8CEA94BE46D0E,0xA343F28B0608EB9F, - 0x9B126BD04917347B,0x9A92874AE7699C22,0x1B017C42C4E69EE0,0x3A4C5C720EE39256, - 0x4B6E9F5E3EA399DA,0x6BA353F45AD83D35,0xE7FEE0904C1B2425,0x22D009832587E95D, - 0x842980C00F1430E2,0xC6B3C0A0861E2893,0x087433A419D729F2,0x341F3DADD42D6C6F, - 0xEE0A3FAEFBB2A58E,0x4AEE73C490DD3183,0xAAB72DB5B1A16A34,0xA92A04065E238FDF, - 0x7B4B35A1686B6FCC,0x6A23BF6EF4A6956C,0x191CB96B851AD352,0x55D598D4D6DE351A, - 0xC9604DE5F2AE7EF3,0x1CA6C2A3A981E172,0xDE2F9551AD7A5398,0x3025AAFF56C8F616, - 0x15521D9D1E2860D9,0x506FE31CFA45073A,0x189C55F12B647B0B,0x0180EC9AAE7EA859, - 0x7CEC8B40050C105E,0x2350E5198BF94104,0xEF8AD33455CC0DD7,0x07A7BEE16D677F92, - 0xE5E325B90DE76997,0x5A061591A26E637A,0xB611EF1618208B46,0x09F4DF3EB7A981AB, - 0x1EBB078AE87DACC0,0xB791038CB65E231F,0x0FD38D4574B05660,0x67EDF702C1EA8EBE, - 0xBA5F4BE0831238CD,0xE3C477C2CEFEBE5C,0x0DCE486C354C1BD2,0x8C5DB36416C31910, - 0x26EA9ED1A7627324,0x039D29B3EF82E5EB,0x9F28FC82CBF2AE02,0xA8AAE89CF05D2786, - 0x431AACFA2774B028,0xCF471F9E31B7A938,0x581BD0B8E3922EC8,0xBC78199B400BEF06, - 0x90FB71C7BF42F862,0x1F3BEB1046030499,0x683E7A47B55AD8DE,0x988F4263A695D190, - 
0xD808C72A6E638453,0x0627527BC319D7CB,0xEBB04466D72997AE,0xE67E0C0AE2658C7C, - 0x14D2F107B056C880,0x7122C32C30400B8C,0x8A7AE11FD5DACEDB,0xA0DEDB38E98A0E74, - 0xAD109354DCC615A6,0x0BE91A17F655CC19,0x8DDD5FFEB8BDB149,0xBFE53028AF890AED, - 0xD65BA6F5B4AD7A6A,0x7956F0882997227E,0x10E8665532B352F9,0x0E5361DFDACEFE39, - 0xCEC7F3049FC90161,0xFF62B561677F5F2E,0x975CCF26D22587F0,0x51EF0F86543BAF63, - 0x2F1E41EF10CBF28F,0x52722635BBB94A88,0xAE8DBAE73344F04D,0x410769D36688FD9A, - 0xB3AB94DE34BBB966,0x801317928DF1AA9B,0xA564A0F0C5113C54,0xF131D4BEBDB1A117, - 0x7F71A2F3EA8EF5B5,0x40878549C8F655C3,0x7EF14E6944F05DEC,0xD44663DCF55137D8, - 0xF2ACFD0D523344FC,0x0000000000000000,0x5FBC6E598EF5515A,0x16CF342EF1AA8532, - 0xB036BD6DDB395C8D,0x13754FE6DD31B712,0xBBDFA77A2D6C9094,0x89E7C8AC3A582B30, - 0x3C6B0E09CDFA459D,0xC4AE0589C7E26521,0x49735A777F5FD468,0xCAFD64561D2C9B18, - 0xDA1502032F9FC9E1,0x8867243694268369,0x3782141E3BAF8984,0x9CB5D53124704BE9, - 0xD7DB4A6F1AD3D233,0xA6F989432A93D9BF,0x9D3539AB8A0EE3B0,0x53F2CAAF15C7E2D1, - 0x6E19283C76430F15,0x3DEBE2936384EDC4,0x5E3C82C3208BF903,0x33B8834CB94A13FD, - 0x6470DEB12E686B55,0x359FD1377A53C436,0x61CAA57902F35975,0x043A975282E59A79, - 0xFD7F70482683129C,0xC52EE913699CCD78,0x28B9FF0E7DAC8D1D,0x5455744E78A09D43, - 0xCB7D88CCB3523341,0x44BD121B4A13CFBA,0x4D49CD25FDBA4E11,0x3E76CB208C06082F, - 0x3FF627BA2278A076,0xC28957F204FBB2EA,0x453DFE81E46D67E3,0x94C1E6953DA7621B, - 0x2C83685CFF491764,0xF32C1197FC4DECA5,0x2B24D6BD922E68F6,0xB22B78449AC5113F, - 0x48F3B6EDD1217C31,0x2E9EAD75BEB55AD6,0x174FD8B45FD42D6B,0x4ED4E4961238ABFA, - 0x92E6B4EEFEBEB5D0,0x46A0D7320BEF8208,0x47203BA8A5912A51,0x24F75BF8E69E3E96, - 0xF0B1382413CF094E,0xFEE259FBC901F777,0x276A724B091CDB7D,0xBDF8F501EE75475F, - 0x599B3C224DEC8691,0x6D84018F99C1EAFE,0x7498B8E41CDB39AC,0xE0595E71217C5BB7, - 0x2AA43A273C50C0AF,0xF50B43EC3F543B6E,0x838E3E2162734F70,0xC09492DB4507FF58, - 0x72BFEA9FDFC2EE67,0x11688ACF9CCDFAA0,0x1A8190D86A9836B9,0x7ACBD93BC615C795, - 
0xC7332C3A286080CA,0x863445E94EE87D50,0xF6966A5FD0D6DE85,0xE9AD814F96D5DA1C, - 0x70A22FB69E3EA3D5,0x0A69F68D582B6440,0xB8428EC9C2EE757F,0x604A49E3AC8DF12C, - 0x5B86F90B0C10CB23,0xE1D9B2EB8F02F3EE,0x29391394D3D22544,0xC8E0A17F5CD0D6AA, - 0xB58CC6A5F7A26EAD,0x8193FB08238F02C2,0xD5C68F465B2F9F81,0xFCFF9CD288FDBAC5, - 0x77059157F359DC47,0x1D262E3907FF492B,0xFB582233E59AC557,0xDDB2BCE242F8B673, - 0x2577B76248E096CF,0x6F99C4A6D83DA74C,0xC1147E41EB795701,0xF48BAF76912A9337 -},{ - 0x3EF29D249B2C0A19,0xE9E16322B6F8622F,0x5536994047757F7A,0x9F4D56D5A47B0B33, - 0x822567466AA1174C,0xB8F5057DEB082FB2,0xCC48C10BF4475F53,0x373088D4275DEC3A, - 0x968F4325180AED10,0x173D232CF7016151,0xAE4ED09F946FCC13,0xFD4B4741C4539873, - 0x1B5B3F0DD9933765,0x2FFCB0967B644052,0xE02376D20A89840C,0xA3AE3A70329B18D7, - 0x419CBD2335DE8526,0xFAFEBF115B7C3199,0x0397074F85AA9B0D,0xC58AD4FB4836B970, - 0xBEC60BE3FC4104A8,0x1EFF36DC4B708772,0x131FDC33ED8453B6,0x0844E33E341764D3, - 0x0FF11B6EAB38CD39,0x64351F0A7761B85A,0x3B5694F509CFBA0E,0x30857084B87245D0, - 0x47AFB3BD2297AE3C,0xF2BA5C2F6F6B554A,0x74BDC4761F4F70E1,0xCFDFC64471EDC45E, - 0xE610784C1DC0AF16,0x7ACA29D63C113F28,0x2DED411776A859AF,0xAC5F211E99A3D5EE, - 0xD484F949A87EF33B,0x3CE36CA596E013E4,0xD120F0983A9D432C,0x6BC40464DC597563, - 0x69D5F5E5D1956C9E,0x9AE95F043698BB24,0xC9ECC8DA66A4EF44,0xD69508C8A5B2EAC6, - 0xC40C2235C0503B80,0x38C193BA8C652103,0x1CEEC75D46BC9E8F,0xD331011937515AD1, - 0xD8E2E56886ECA50F,0xB137108D5779C991,0x709F3B6905CA4206,0x4FEB50831680CAEF, - 0xEC456AF3241BD238,0x58D673AFE181ABBE,0x242F54E7CAD9BF8C,0x0211F1810DCC19FD, - 0x90BC4DBB0F43C60A,0x9518446A9DA0761D,0xA1BFCBF13F57012A,0x2BDE4F8961E172B5, - 0x27B853A84F732481,0xB0B1E643DF1F4B61,0x18CC38425C39AC68,0xD2B7F7D7BF37D821, - 0x3103864A3014C720,0x14AA246372ABFA5C,0x6E600DB54EBAC574,0x394765740403A3F3, - 0x09C215F0BC71E623,0x2A58B947E987F045,0x7B4CDF18B477BDD8,0x9709B5EB906C6FE0, - 0x73083C268060D90B,0xFEDC400E41F9037E,0x284948C6E44BE9B8,0x728ECAE808065BFB, - 
0x06330E9E17492B1A,0x5950856169E7294E,0xBAE4F4FCE6C4364F,0xCA7BCF95E30E7449, - 0x7D7FD186A33E96C2,0x52836110D85AD690,0x4DFAA1021B4CD312,0x913ABB75872544FA, - 0xDD46ECB9140F1518,0x3D659A6B1E869114,0xC23F2CABD719109A,0xD713FE062DD46836, - 0xD0A60656B2FBC1DC,0x221C5A79DD909496,0xEFD26DBCA1B14935,0x0E77EDA0235E4FC9, - 0xCBFD395B6B68F6B9,0x0DE0EAEFA6F4D4C4,0x0422FF1F1A8532E7,0xF969B85EDED6AA94, - 0x7F6E2007AEF28F3F,0x3AD0623B81A938FE,0x6624EE8B7AADA1A7,0xB682E8DDC856607B, - 0xA78CC56F281E2A30,0xC79B257A45FAA08D,0x5B4174E0642B30B3,0x5F638BFF7EAE0254, - 0x4BC9AF9C0C05F808,0xCE59308AF98B46AE,0x8FC58DA9CC55C388,0x803496C7676D0EB1, - 0xF33CAAE1E70DD7BA,0xBB6202326EA2B4BF,0xD5020F87201871CB,0x9D5CA754A9B712CE, - 0x841669D87DE83C56,0x8A6184785EB6739F,0x420BBA6CB0741E2B,0xF12D5B60EAC1CE47, - 0x76AC35F71283691C,0x2C6BB7D9FECEDB5F,0xFCCDB18F4C351A83,0x1F79C012C3160582, - 0xF0ABADAE62A74CB7,0xE1A5801C82EF06FC,0x67A21845F2CB2357,0x5114665F5DF04D9D, - 0xBF40FD2D74278658,0xA0393D3FB73183DA,0x05A409D192E3B017,0xA9FB28CF0B4065F9, - 0x25A9A22942BF3D7C,0xDB75E22703463E02,0xB326E10C5AB5D06C,0xE7968E8295A62DE6, - 0xB973F3B3636EAD42,0xDF571D3819C30CE5,0xEE549B7229D7CBC5,0x12992AFD65E2D146, - 0xF8EF4E9056B02864,0xB7041E134030E28B,0xC02EDD2ADAD50967,0x932B4AF48AE95D07, - 0x6FE6FB7BC6DC4784,0x239AACB755F61666,0x401A4BEDBDB807D6,0x485EA8D389AF6305, - 0xA41BC220ADB4B13D,0x753B32B89729F211,0x997E584BB3322029,0x1D683193CEDA1C7F, - 0xFF5AB6C0C99F818E,0x16BBD5E27F67E3A1,0xA59D34EE25D233CD,0x98F8AE853B54A2D9, - 0x6DF70AFACB105E79,0x795D2E99B9BBA425,0x8E437B6744334178,0x0186F6CE886682F0, - 0xEBF092A3BB347BD2,0xBCD7FA62F18D1D55,0xADD9D7D011C5571E,0x0BD3E471B1BDFFDE, - 0xAA6C2F808EEAFEF4,0x5EE57D31F6C880A4,0xF50FA47FF044FCA0,0x1ADDC9C351F5B595, - 0xEA76646D3352F922,0x0000000000000000,0x85909F16F58EBEA6,0x46294573AAF12CCC, - 0x0A5512BF39DB7D2E,0x78DBD85731DD26D5,0x29CFBE086C2D6B48,0x218B5D36583A0F9B, - 0x152CD2ADFACD78AC,0x83A39188E2C795BC,0xC3B9DA655F7F926A,0x9ECBA01B2C1D89C3, - 
0x07B5F8509F2FA9EA,0x7EE8D6C926940DCF,0x36B67E1AAF3B6ECA,0x86079859702425AB, - 0xFB7849DFD31AB369,0x4C7C57CC932A51E2,0xD96413A60E8A27FF,0x263EA566C715A671, - 0x6C71FC344376DC89,0x4A4F595284637AF8,0xDAF314E98B20BCF2,0x572768C14AB96687, - 0x1088DB7C682EC8BB,0x887075F9537A6A62,0x2E7A4658F302C2A2,0x619116DBE582084D, - 0xA87DDE018326E709,0xDCC01A779C6997E8,0xEDC39C3DAC7D50C8,0xA60A33A1A078A8C0, - 0xC1A82BE452B38B97,0x3F746BEA134A88E9,0xA228CCBEBAFD9A27,0xABEAD94E068C7C04, - 0xF48952B178227E50,0x5CF48CB0FB049959,0x6017E0156DE48ABD,0x4438B4F2A73D3531, - 0x8C528AE649FF5885,0xB515EF924DFCFB76,0x0C661C212E925634,0xB493195CC59A7986, - 0x9CDA519A21D1903E,0x32948105B5BE5C2D,0x194ACE8CD45F2E98,0x438D4CA238129CDB, - 0x9B6FA9CABEFE39D4,0x81B26009EF0B8C41,0xDED1EBF691A58E15,0x4E6DA64D9EE6481F, - 0x54B06F8ECF13FD8A,0x49D85E1D01C9E1F5,0xAFC826511C094EE3,0xF698A33075EE67AD, - 0x5AC7822EEC4DB243,0x8DD47C28C199DA75,0x89F68337DB1CE892,0xCDCE37C57C21DDA3, - 0x530597DE503C5460,0x6A42F2AA543FF793,0x5D727A7E73621BA9,0xE232875307459DF1, - 0x56A19E0FC2DFE477,0xC61DD3B4CD9C227D,0xE5877F03986A341B,0x949EB2A415C6F4ED, - 0x6206119460289340,0x6380E75AE84E11B0,0x8BE772B6D6D0F16F,0x50929091D596CF6D, - 0xE86795EC3E9EE0DF,0x7CF927482B581432,0xC86A3E14EEC26DB4,0x7119CDA78DACC0F6, - 0xE40189CD100CB6EB,0x92ADBC3A028FDFF7,0xB2A017C2D2D3529C,0x200DABF8D05C8D6B, - 0x34A78F9BA2F77737,0xE3B4719D8F231F01,0x45BE423C2F5BB7C1,0xF71E55FEFD88E55D, - 0x6853032B59F3EE6E,0x65B3E9C4FF073AAA,0x772AC3399AE5EBEC,0x87816E97F842A75B, - 0x110E2DB2E0484A4B,0x331277CB3DD8DEDD,0xBD510CAC79EB9FA5,0x352179552A91F5C7 -},{ - 0x8AB0A96846E06A6D,0x43C7E80B4BF0B33A,0x08C9B3546B161EE5,0x39F1C235EBA990BE, - 0xC1BEF2376606C7B2,0x2C209233614569AA,0xEB01523B6FC3289A,0x946953AB935ACEDD, - 0x272838F63E13340E,0x8B0455ECA12BA052,0x77A1B2C4978FF8A2,0xA55122CA13E54086, - 0x2276135862D3F1CD,0xDB8DDFDE08B76CFE,0x5D1E12C89E4A178A,0x0E56816B03969867, - 0xEE5F79953303ED59,0xAFED748BAB78D71D,0x6D929F2DF93E53EE,0xF5D8A8F8BA798C2A, - 
0xF619B1698E39CF6B,0x95DDAF2F749104E2,0xEC2A9C80E0886427,0xCE5C8FD8825B95EA, - 0xC4E0D9993AC60271,0x4699C3A5173076F9,0x3D1B151F50A29F42,0x9ED505EA2BC75946, - 0x34665ACFDC7F4B98,0x61B1FB53292342F7,0xC721C0080E864130,0x8693CD1696FD7B74, - 0x872731927136B14B,0xD3446C8A63A1721B,0x669A35E8A6680E4A,0xCAB658F239509A16, - 0xA4E5DE4EF42E8AB9,0x37A7435EE83F08D9,0x134E6239E26C7F96,0x82791A3C2DF67488, - 0x3F6EF00A8329163C,0x8E5A7E42FDEB6591,0x5CAAEE4C7981DDB5,0x19F234785AF1E80D, - 0x255DDDE3ED98BD70,0x50898A32A99CCCAC,0x28CA4519DA4E6656,0xAE59880F4CB31D22, - 0x0D9798FA37D6DB26,0x32F968F0B4FFCD1A,0xA00F09644F258545,0xFA3AD5175E24DE72, - 0xF46C547C5DB24615,0x713E80FBFF0F7E20,0x7843CF2B73D2AAFA,0xBD17EA36AEDF62B4, - 0xFD111BACD16F92CF,0x4ABAA7DBC72D67E0,0xB3416B5DAD49FAD3,0xBCA316B24914A88B, - 0x15D150068AECF914,0xE27C1DEBE31EFC40,0x4FE48C759BEDA223,0x7EDCFD141B522C78, - 0x4E5070F17C26681C,0xE696CAC15815F3BC,0x35D2A64B3BB481A7,0x800CFF29FE7DFDF6, - 0x1ED9FAC3D5BAA4B0,0x6C2663A91EF599D1,0x03C1199134404341,0xF7AD4DED69F20554, - 0xCD9D9649B61BD6AB,0xC8C3BDE7EADB1368,0xD131899FB02AFB65,0x1D18E352E1FAE7F1, - 0xDA39235AEF7CA6C1,0xA1BBF5E0A8EE4F7A,0x91377805CF9A0B1E,0x3138716180BF8E5B, - 0xD9F83ACBDB3CE580,0x0275E515D38B897E,0x472D3F21F0FBBCC6,0x2D946EB7868EA395, - 0xBA3C248D21942E09,0xE7223645BFDE3983,0xFF64FEB902E41BB1,0xC97741630D10D957, - 0xC3CB1722B58D4ECC,0xA27AEC719CAE0C3B,0x99FECB51A48C15FB,0x1465AC826D27332B, - 0xE1BD047AD75EBF01,0x79F733AF941960C5,0x672EC96C41A3C475,0xC27FEBA6524684F3, - 0x64EFD0FD75E38734,0xED9E60040743AE18,0xFB8E2993B9EF144D,0x38453EB10C625A81, - 0x6978480742355C12,0x48CF42CE14A6EE9E,0x1CAC1FD606312DCE,0x7B82D6BA4792E9BB, - 0x9D141C7B1F871A07,0x5616B80DC11C4A2E,0xB849C198F21FA777,0x7CA91801C8D9A506, - 0xB1348E487EC273AD,0x41B20D1E987B3A44,0x7460AB55A3CFBBE3,0x84E628034576F20A, - 0x1B87D16D897A6173,0x0FE27DEFE45D5258,0x83CDE6B8CA3DBEB7,0x0C23647ED01D1119, - 0x7A362A3EA0592384,0xB61F40F3F1893F10,0x75D457D1440471DC,0x4558DA34237035B8, - 
0xDCA6116587FC2043,0x8D9B67D3C9AB26D0,0x2B0B5C88EE0E2517,0x6FE77A382AB5DA90, - 0x269CC472D9D8FE31,0x63C41E46FAA8CB89,0xB7ABBC771642F52F,0x7D1DE4852F126F39, - 0xA8C6BA3024339BA0,0x600507D7CEE888C8,0x8FEE82C61A20AFAE,0x57A2448926D78011, - 0xFCA5E72836A458F0,0x072BCEBB8F4B4CBD,0x497BBE4AF36D24A1,0x3CAFE99BB769557D, - 0x12FA9EBD05A7B5A9,0xE8C04BAA5B836BDB,0x4273148FAC3B7905,0x908384812851C121, - 0xE557D3506C55B0FD,0x72FF996ACB4F3D61,0x3EDA0C8E64E2DC03,0xF0868356E6B949E9, - 0x04EAD72ABB0B0FFC,0x17A4B5135967706A,0xE3C8E16F04D5367F,0xF84F30028DAF570C, - 0x1846C8FCBD3A2232,0x5B8120F7F6CA9108,0xD46FA231ECEA3EA6,0x334D947453340725, - 0x58403966C28AD249,0xBED6F3A79A9F21F5,0x68CCB483A5FE962D,0xD085751B57E1315A, - 0xFED0023DE52FD18E,0x4B0E5B5F20E6ADDF,0x1A332DE96EB1AB4C,0xA3CE10F57B65C604, - 0x108F7BA8D62C3CD7,0xAB07A3A11073D8E1,0x6B0DAD1291BED56C,0xF2F366433532C097, - 0x2E557726B2CEE0D4,0x0000000000000000,0xCB02A476DE9B5029,0xE4E32FD48B9E7AC2, - 0x734B65EE2C84F75E,0x6E5386BCCD7E10AF,0x01B4FC84E7CBCA3F,0xCFE8735C65905FD5, - 0x3613BFDA0FF4C2E6,0x113B872C31E7F6E8,0x2FE18BA255052AEB,0xE974B72EBC48A1E4, - 0x0ABC5641B89D979B,0xB46AA5E62202B66E,0x44EC26B0C4BBFF87,0xA6903B5B27A503C7, - 0x7F680190FC99E647,0x97A84A3AA71A8D9C,0xDD12EDE16037EA7C,0xC554251DDD0DC84E, - 0x88C54C7D956BE313,0x4D91696048662B5D,0xB08072CC9909B992,0xB5DE5962C5C97C51, - 0x81B803AD19B637C9,0xB2F597D94A8230EC,0x0B08AAC55F565DA4,0xF1327FD2017283D6, - 0xAD98919E78F35E63,0x6AB9519676751F53,0x24E921670A53774F,0xB9FD3D1C15D46D48, - 0x92F66194FBDA485F,0x5A35DC7311015B37,0xDED3F4705477A93D,0xC00A0EB381CD0D8D, - 0xBB88D809C65FE436,0x16104997BEACBA55,0x21B70AC95693B28C,0x59F4C5E225411876, - 0xD5DB5EB50B21F499,0x55D7A19CF55C096F,0xA97246B4C3F8519F,0x8552D487A2BD3835, - 0x54635D181297C350,0x23C2EFDC85183BF2,0x9F61F96ECC0C9379,0x534893A39DDC8FED, - 0x5EDF0B59AA0A54CB,0xAC2C6D1A9F38945C,0xD7AEBBA0D8AA7DE7,0x2ABFA00C09C5EF28, - 0xD84CC64F3CF72FBF,0x2003F64DB15878B3,0xA724C7DFC06EC9F8,0x069F323F68808682, - 
0xCC296ACD51D01C94,0x055E2BAE5CC0C5C3,0x6270E2C21D6301B6,0x3B842720382219C0, - 0xD2F0900E846AB824,0x52FC6F277A1745D2,0xC6953C8CE94D8B0F,0xE009F8FE3095753E, - 0x655B2C7992284D0B,0x984A37D54347DFC4,0xEAB5AEBF8808E2A5,0x9A3FD2C090CC56BA, - 0x9CA0E0FFF84CD038,0x4C2595E4AFADE162,0xDF6708F4B3BC6302,0xBF620F237D54EBCA, - 0x93429D101C118260,0x097D4FD08CDDD4DA,0x8C2F9B572E60ECEF,0x708A7C7F18C4B41F, - 0x3A30DBA4DFE9D3FF,0x4006F19A7FB0F07B,0x5F6BF7DD4DC19EF4,0x1F6D064732716E8F, - 0xF9FBCC866A649D33,0x308C8DE567744464,0x8971B0F972A0292C,0xD61A47243F61B7D8, - 0xEFEB8511D4C82766,0x961CB6BE40D147A3,0xAAB35F25F7B812DE,0x76154E407044329D, - 0x513D76B64E570693,0xF3479AC7D2F90AA8,0x9B8B2E4477079C85,0x297EB99D3D85AC69 -},{ - 0x7E37E62DFC7D40C3,0x776F25A4EE939E5B,0xE045C850DD8FB5AD,0x86ED5BA711FF1952, - 0xE91D0BD9CF616B35,0x37E0AB256E408FFB,0x9607F6C031025A7A,0x0B02F5E116D23C9D, - 0xF3D8486BFB50650C,0x621CFF27C40875F5,0x7D40CB71FA5FD34A,0x6DAA6616DAA29062, - 0x9F5F354923EC84E2,0xEC847C3DC507C3B3,0x025A3668043CE205,0xA8BF9E6C4DAC0B19, - 0xFA808BE2E9BEBB94,0xB5B99C5277C74FA3,0x78D9BC95F0397BCC,0xE332E50CDBAD2624, - 0xC74FCE129332797E,0x1729ECEB2EA709AB,0xC2D6B9F69954D1F8,0x5D898CBFBAB8551A, - 0x859A76FB17DD8ADB,0x1BE85886362F7FB5,0xF6413F8FF136CD8A,0xD3110FA5BBB7E35C, - 0x0A2FEED514CC4D11,0xE83010EDCD7F1AB9,0xA1E75DE55F42D581,0xEEDE4A55C13B21B6, - 0xF2F5535FF94E1480,0x0CC1B46D1888761E,0xBCE15FDB6529913B,0x2D25E8975A7181C2, - 0x71817F1CE2D7A554,0x2E52C5CB5C53124B,0xF9F7A6BEEF9C281D,0x9E722E7D21F2F56E, - 0xCE170D9B81DCA7E6,0x0E9B82051CB4941B,0x1E712F623C49D733,0x21E45CFA42F9F7DC, - 0xCB8E7A7F8BBA0F60,0x8E98831A010FB646,0x474CCF0D8E895B23,0xA99285584FB27A95, - 0x8CC2B57205335443,0x42D5B8E984EFF3A5,0x012D1B34021E718C,0x57A6626AAE74180B, - 0xFF19FC06E3D81312,0x35BA9D4D6A7C6DFE,0xC9D44C178F86ED65,0x506523E6A02E5288, - 0x03772D5C06229389,0x8B01F4FE0B691EC0,0xF8DABD8AED825991,0x4C4E3AEC985B67BE, - 0xB10DF0827FBF96A9,0x6A69279AD4F8DAE1,0xE78689DCD3D5FF2E,0x812E1A2B1FA553D1, - 
0xFBAD90D6EBA0CA18,0x1AC543B234310E39,0x1604F7DF2CB97827,0xA6241C6951189F02, - 0x753513CCEAAF7C5E,0x64F2A59FC84C4EFA,0x247D2B1E489F5F5A,0xDB64D718AB474C48, - 0x79F4A7A1F2270A40,0x1573DA832A9BEBAE,0x3497867968621C72,0x514838D2A2302304, - 0xF0AF6537FD72F685,0x1D06023E3A6B44BA,0x678588C3CE6EDD73,0x66A893F7CC70ACFF, - 0xD4D24E29B5EDA9DF,0x3856321470EA6A6C,0x07C3418C0E5A4A83,0x2BCBB22F5635BACD, - 0x04B46CD00878D90A,0x06EE5AB80C443B0F,0x3B211F4876C8F9E5,0x0958C38912EEDE98, - 0xD14B39CDBF8B0159,0x397B292072F41BE0,0x87C0409313E168DE,0xAD26E98847CAA39F, - 0x4E140C849C6785BB,0xD5FF551DB7F3D853,0xA0CA46D15D5CA40D,0xCD6020C787FE346F, - 0x84B76DCF15C3FB57,0xDEFDA0FCA121E4CE,0x4B8D7B6096012D3D,0x9AC642AD298A2C64, - 0x0875D8BD10F0AF14,0xB357C6EA7B8374AC,0x4D6321D89A451632,0xEDA96709C719B23F, - 0xF76C24BBF328BC06,0xC662D526912C08F2,0x3CE25EC47892B366,0xB978283F6F4F39BD, - 0xC08C8F9E9D6833FD,0x4F3917B09E79F437,0x593DE06FB2C08C10,0xD6887841B1D14BDA, - 0x19B26EEE32139DB0,0xB494876675D93E2F,0x825937771987C058,0x90E9AC783D466175, - 0xF1827E03FF6C8709,0x945DC0A8353EB87F,0x4516F9658AB5B926,0x3F9573987EB020EF, - 0xB855330B6D514831,0x2AE6A91B542BCB41,0x6331E413C6160479,0x408F8E8180D311A0, - 0xEFF35161C325503A,0xD06622F9BD9570D5,0x8876D9A20D4B8D49,0xA5533135573A0C8B, - 0xE168D364DF91C421,0xF41B09E7F50A2F8F,0x12B09B0F24C1A12D,0xDA49CC2CA9593DC4, - 0x1F5C34563E57A6BF,0x54D14F36A8568B82,0xAF7CDFE043F6419A,0xEA6A2685C943F8BC, - 0xE5DCBFB4D7E91D2B,0xB27ADDDE799D0520,0x6B443CAED6E6AB6D,0x7BAE91C9F61BE845, - 0x3EB868AC7CAE5163,0x11C7B65322E332A4,0xD23C1491B9A992D0,0x8FB5982E0311C7CA, - 0x70AC6428E0C9D4D8,0x895BC2960F55FCC5,0x76423E90EC8DEFD7,0x6FF0507EDE9E7267, - 0x3DCF45F07A8CC2EA,0x4AA06054941F5CB1,0x5810FB5BB0DEFD9C,0x5EFEA1E3BC9AC693, - 0x6EDD4B4ADC8003EB,0x741808F8E8B10DD2,0x145EC1B728859A22,0x28BC9F7350172944, - 0x270A06424EBDCCD3,0x972AEDF4331C2BF6,0x059977E40A66A886,0x2550302A4A812ED6, - 0xDD8A8DA0A7037747,0xC515F87A970E9B7B,0x3023EAA9601AC578,0xB7E3AA3A73FBADA6, - 
0x0FB699311EAAE597,0x0000000000000000,0x310EF19D6204B4F4,0x229371A644DB6455, - 0x0DECAF591A960792,0x5CA4978BB8A62496,0x1C2B190A38753536,0x41A295B582CD602C, - 0x3279DCC16426277D,0xC1A194AA9F764271,0x139D803B26DFD0A1,0xAE51C4D441E83016, - 0xD813FA44AD65DFC1,0xAC0BF2BC45D4D213,0x23BE6A9246C515D9,0x49D74D08923DCF38, - 0x9D05032127D066E7,0x2F7FDEFF5E4D63C7,0xA47E2A0155247D07,0x99B16FF12FA8BFED, - 0x4661D4398C972AAF,0xDFD0BBC8A33F9542,0xDCA79694A51D06CB,0xB020EBB67DA1E725, - 0xBA0F0563696DAA34,0xE4F1A480D5F76CA7,0xC438E34E9510EAF7,0x939E81243B64F2FC, - 0x8DEFAE46072D25CF,0x2C08F3A3586FF04E,0xD7A56375B3CF3A56,0x20C947CE40E78650, - 0x43F8A3DD86F18229,0x568B795EAC6A6987,0x8003011F1DBB225D,0xF53612D3F7145E03, - 0x189F75DA300DEC3C,0x9570DB9C3720C9F3,0xBB221E576B73DBB8,0x72F65240E4F536DD, - 0x443BE25188ABC8AA,0xE21FFE38D9B357A8,0xFD43CA6EE7E4F117,0xCAA3614B89A47EEC, - 0xFE34E732E1C6629E,0x83742C431B99B1D4,0xCF3A16AF83C2D66A,0xAAE5A8044990E91C, - 0x26271D764CA3BD5F,0x91C4B74C3F5810F9,0x7C6DD045F841A2C6,0x7F1AFD19FE63314F, - 0xC8F957238D989CE9,0xA709075D5306EE8E,0x55FC5402AA48FA0E,0x48FA563C9023BEB4, - 0x65DFBEABCA523F76,0x6C877D22D8BCE1EE,0xCC4D3BF385E045E3,0xBEBB69B36115733E, - 0x10EAAD6720FD4328,0xB6CEB10E71E5DC2A,0xBDCC44EF6737E0B7,0x523F158EA412B08D, - 0x989C74C52DB6CE61,0x9BEB59992B945DE8,0x8A2CEFCA09776F4C,0xA3BD6B8D5B7E3784, - 0xEB473DB1CB5D8930,0xC3FBA2C29B4AA074,0x9C28181525CE176B,0x683311F2D0C438E4, - 0x5FD3BAD7BE84B71F,0xFC6ED15AE5FA809B,0x36CDB0116C5EFE77,0x29918447520958C8, - 0xA29070B959604608,0x53120EBAA60CC101,0x3A0C047C74D68869,0x691E0AC6D2DA4968, - 0x73DB4974E6EB4751,0x7A838AFDF40599C9,0x5A4ACD33B4E21F99,0x6046C94FC03497F0, - 0xE6AB92E8D1CB8EA2,0x3354C7F5663856F1,0xD93EE170AF7BAE4D,0x616BD27BC22AE67C, - 0x92B39A10397A8370,0xABC8B3304B8E9890,0xBF967287630B02B2,0x5B67D607B6FC6E15 -},{ - 0xD031C397CE553FE6,0x16BA5B01B006B525,0xA89BADE6296E70C8,0x6A1F525D77D3435B, - 0x6E103570573DFA0B,0x660EFB2A17FC95AB,0x76327A9E97634BF6,0x4BAD9D6462458BF5, - 
0xF1830CAEDBC3F748,0xC5C8F542669131FF,0x95044A1CDC48B0CB,0x892962DF3CF8B866, - 0xB0B9E208E930C135,0xA14FB3F0611A767C,0x8D2605F21C160136,0xD6B71922FECC549E, - 0x37089438A5907D8B,0x0B5DA38E5803D49C,0x5A5BCC9CEA6F3CBC,0xEDAE246D3B73FFE5, - 0xD2B87E0FDE22EDCE,0x5E54ABB1CA8185EC,0x1DE7F88FE80561B9,0xAD5E1A870135A08C, - 0x2F2ADBD665CECC76,0x5780B5A782F58358,0x3EDC8A2EEDE47B3F,0xC9D95C3506BEE70F, - 0x83BE111D6C4E05EE,0xA603B90959367410,0x103C81B4809FDE5D,0x2C69B6027D0C774A, - 0x399080D7D5C87953,0x09D41E16487406B4,0xCDD63B1826505E5F,0xF99DC2F49B0298E8, - 0x9CD0540A943CB67F,0xBCA84B7F891F17C5,0x723D1DB3B78DF2A6,0x78AA6E71E73B4F2E, - 0x1433E699A071670D,0x84F21BE454620782,0x98DF3327B4D20F2F,0xF049DCE2D3769E5C, - 0xDB6C60199656EB7A,0x648746B2078B4783,0x32CD23598DCBADCF,0x1EA4955BF0C7DA85, - 0xE9A143401B9D46B5,0xFD92A5D9BBEC21B8,0xC8138C790E0B8E1B,0x2EE00B9A6D7BA562, - 0xF85712B893B7F1FC,0xEB28FED80BEA949D,0x564A65EB8A40EA4C,0x6C9988E8474A2823, - 0x4535898B121D8F2D,0xABD8C03231ACCBF4,0xBA2E91CAB9867CBD,0x7960BE3DEF8E263A, - 0x0C11A977602FD6F0,0xCB50E1AD16C93527,0xEAE22E94035FFD89,0x2866D12F5DE2CE1A, - 0xFF1B1841AB9BF390,0x9F9339DE8CFE0D43,0x964727C8C48A0BF7,0x524502C6AAAE531C, - 0x9B9C5EF3AC10B413,0x4FA2FA4942AB32A5,0x3F165A62E551122B,0xC74148DA76E6E3D7, - 0x924840E5E464B2A7,0xD372AE43D69784DA,0x233B72A105E11A86,0xA48A04914941A638, - 0xB4B68525C9DE7865,0xDDEABAACA6CF8002,0x0A9773C250B6BD88,0xC284FFBB5EBD3393, - 0x8BA0DF472C8F6A4E,0x2AEF6CB74D951C32,0x427983722A318D41,0x73F7CDFFBF389BB2, - 0x074C0AF9382C026C,0x8A6A0F0B243A035A,0x6FDAE53C5F88931F,0xC68B98967E538AC3, - 0x44FF59C71AA8E639,0xE2FCE0CE439E9229,0xA20CDE2479D8CD40,0x19E89FA2C8EBD8E9, - 0xF446BBCFF398270C,0x43B3533E2284E455,0xD82F0DCD8E945046,0x51066F12B26CE820, - 0xE73957AF6BC5426D,0x081ECE5A40C16FA0,0x3B193D4FC5BFAB7B,0x7FE66488DF174D42, - 0x0E9814EF705804D8,0x8137AC857C39D7C6,0xB1733244E185A821,0x695C3F896F11F867, - 0xF6CF0657E3EFF524,0x1AABF276D02963D5,0x2DA3664E75B91E5E,0x0289BD981077D228, - 
0x90C1FD7DF413608F,0x3C5537B6FD93A917,0xAA12107E3919A2E0,0x0686DAB530996B78, - 0xDAA6B0559EE3826E,0xC34E2FF756085A87,0x6D5358A44FFF4137,0xFC587595B35948AC, - 0x7CA5095CC7D5F67E,0xFB147F6C8B754AC0,0xBFEB26AB91DDACF9,0x6896EFC567A49173, - 0xCA9A31E11E7C5C33,0xBBE44186B13315A9,0x0DDB793B689ABFE4,0x70B4A02BA7FA208E, - 0xE47A3A7B7307F951,0x8CECD5BE14A36822,0xEEED49B923B144D9,0x17708B4DB8B3DC31, - 0x6088219F2765FED3,0xB3FA8FDCF1F27A09,0x910B2D31FCA6099B,0x0F52C4A378ED6DCC, - 0x50CCBF5EBAD98134,0x6BD582117F662A4F,0x94CE9A50D4FDD9DF,0x2B25BCFB45207526, - 0x67C42B661F49FCBF,0x492420FC723259DD,0x03436DD418C2BB3C,0x1F6E4517F872B391, - 0xA08563BC69AF1F68,0xD43EA4BAEEBB86B6,0x01CAD04C08B56914,0xAC94CACB0980C998, - 0x54C3D8739A373864,0x26FEC5C02DBACAC2,0xDEA9D778BE0D3B3E,0x040F672D20EEB950, - 0xE5B0EA377BB29045,0xF30AB136CBB42560,0x62019C0737122CFB,0xE86B930C13282FA1, - 0xCC1CEB542EE5374B,0x538FD28AA21B3A08,0x1B61223AD89C0AC1,0x36C24474AD25149F, - 0x7A23D3E9F74C9D06,0xBE21F6E79968C5ED,0xCF5F868036278C77,0xF705D61BEB5A9C30, - 0x4D2B47D152DCE08D,0x5F9E7BFDC234ECF8,0x247778583DCD18EA,0x867BA67C4415D5AA, - 0x4CE1979D5A698999,0x0000000000000000,0xEC64F42133C696F1,0xB57C5569C16B1171, - 0xC1C7926F467F88AF,0x654D96FE0F3E2E97,0x15F936D5A8C40E19,0xB8A72C52A9F1AE95, - 0xA9517DAA21DB19DC,0x58D27104FA18EE94,0x5918A148F2AD8780,0x5CDD1629DAF657C4, - 0x8274C15164FB6CFA,0xD1FB13DBC6E056F2,0x7D6FD910CF609F6A,0xB63F38BDD9A9AA4D, - 0x3D9FE7FAF526C003,0x74BBC706871499DE,0xDF630734B6B8522A,0x3AD3ED03CD0AC26F, - 0xFADEAF2083C023D4,0xC00D42234ECAE1BB,0x8538CBA85CD76E96,0xC402250E6E2458EB, - 0x47BC3413026A5D05,0xAFD7A71F114272A4,0x978DF784CC3F62E3,0xB96DFC1EA144C781, - 0x21B2CF391596C8AE,0x318E4E8D950916F3,0xCE9556CC3E92E563,0x385A509BDD7D1047, - 0x358129A0B5E7AFA3,0xE6F387E363702B79,0xE0755D5653E94001,0x7BE903A5FFF9F412, - 0x12B53C2C90E80C75,0x3307F315857EC4DB,0x8FAFB86A0C61D31E,0xD9E5DD8186213952, - 0x77F8AAD29FD622E2,0x25BDA814357871FE,0x7571174A8FA1F0CA,0x137FEC60985D6561, - 
0x30449EC19DBC7FE7,0xA540D4DD41F4CF2C,0xDC206AE0AE7AE916,0x5B911CD0E2DA55A8, - 0xB2305F90F947131D,0x344BF9ECBD52C6B7,0x5D17C665D2433ED0,0x18224FEEC05EB1FD, - 0x9E59E992844B6457,0x9A568EBFA4A5DD07,0xA3C60E68716DA454,0x7E2CB4C4D7A22456, - 0x87B176304CA0BCBE,0x413AEEA632F3367D,0x9915E36BBC67663B,0x40F03EEA3A465F69, - 0x1C2D28C3E0B008AD,0x4E682A054A1E5BB1,0x05C5B761285BD044,0xE1BF8D1A5B5C2915, - 0xF2C0617AC3014C74,0xB7F5E8F1D11CC359,0x63CB4C4B3FA745EF,0x9D1A84469C89DF6B, - 0xE33630824B2BFB3D,0xD5F474F6E60EEFA2,0xF58C6B83FB2D4E18,0x4676E45F0ADF3411, - 0x20781F751D23A1BA,0xBD629B3381AA7ED1,0xAE1D775319F71BB0,0xFED1C80DA32E9A84, - 0x5509083F92825170,0x29AC01635557A70E,0xA7C9694551831D04,0x8E65682604D4BA0A, - 0x11F651F8882AB749,0xD77DC96EF6793D8A,0xEF2799F52B042DCD,0x48EEF0B07A8730C9, - 0x22F1A2ED0D547392,0x6142F1D32FD097C7,0x4A674D286AF0E2E1,0x80FD7CC9748CBED2, - 0x717E7067AF4F499A,0x938290A9ECD1DBB3,0x88E3B293344DD172,0x2734158C250FA3D6 -}}; - -// Constant values for KeySchedule function -const unsigned char C[12][64] = {{ - 0xB1,0x08,0x5B,0xDA,0x1E,0xCA,0xDA,0xE9,0xEB,0xCB,0x2F,0x81,0xC0,0x65,0x7C,0x1F, - 0x2F,0x6A,0x76,0x43,0x2E,0x45,0xD0,0x16,0x71,0x4E,0xB8,0x8D,0x75,0x85,0xC4,0xFC, - 0x4B,0x7C,0xE0,0x91,0x92,0x67,0x69,0x01,0xA2,0x42,0x2A,0x08,0xA4,0x60,0xD3,0x15, - 0x05,0x76,0x74,0x36,0xCC,0x74,0x4D,0x23,0xDD,0x80,0x65,0x59,0xF2,0xA6,0x45,0x07 -},{ - 0x6F,0xA3,0xB5,0x8A,0xA9,0x9D,0x2F,0x1A,0x4F,0xE3,0x9D,0x46,0x0F,0x70,0xB5,0xD7, - 0xF3,0xFE,0xEA,0x72,0x0A,0x23,0x2B,0x98,0x61,0xD5,0x5E,0x0F,0x16,0xB5,0x01,0x31, - 0x9A,0xB5,0x17,0x6B,0x12,0xD6,0x99,0x58,0x5C,0xB5,0x61,0xC2,0xDB,0x0A,0xA7,0xCA, - 0x55,0xDD,0xA2,0x1B,0xD7,0xCB,0xCD,0x56,0xE6,0x79,0x04,0x70,0x21,0xB1,0x9B,0xB7 -},{ - 0xF5,0x74,0xDC,0xAC,0x2B,0xCE,0x2F,0xC7,0x0A,0x39,0xFC,0x28,0x6A,0x3D,0x84,0x35, - 0x06,0xF1,0x5E,0x5F,0x52,0x9C,0x1F,0x8B,0xF2,0xEA,0x75,0x14,0xB1,0x29,0x7B,0x7B, - 0xD3,0xE2,0x0F,0xE4,0x90,0x35,0x9E,0xB1,0xC1,0xC9,0x3A,0x37,0x60,0x62,0xDB,0x09, - 
0xC2,0xB6,0xF4,0x43,0x86,0x7A,0xDB,0x31,0x99,0x1E,0x96,0xF5,0x0A,0xBA,0x0A,0xB2 -},{ - 0xEF,0x1F,0xDF,0xB3,0xE8,0x15,0x66,0xD2,0xF9,0x48,0xE1,0xA0,0x5D,0x71,0xE4,0xDD, - 0x48,0x8E,0x85,0x7E,0x33,0x5C,0x3C,0x7D,0x9D,0x72,0x1C,0xAD,0x68,0x5E,0x35,0x3F, - 0xA9,0xD7,0x2C,0x82,0xED,0x03,0xD6,0x75,0xD8,0xB7,0x13,0x33,0x93,0x52,0x03,0xBE, - 0x34,0x53,0xEA,0xA1,0x93,0xE8,0x37,0xF1,0x22,0x0C,0xBE,0xBC,0x84,0xE3,0xD1,0x2E -},{ - 0x4B,0xEA,0x6B,0xAC,0xAD,0x47,0x47,0x99,0x9A,0x3F,0x41,0x0C,0x6C,0xA9,0x23,0x63, - 0x7F,0x15,0x1C,0x1F,0x16,0x86,0x10,0x4A,0x35,0x9E,0x35,0xD7,0x80,0x0F,0xFF,0xBD, - 0xBF,0xCD,0x17,0x47,0x25,0x3A,0xF5,0xA3,0xDF,0xFF,0x00,0xB7,0x23,0x27,0x1A,0x16, - 0x7A,0x56,0xA2,0x7E,0xA9,0xEA,0x63,0xF5,0x60,0x17,0x58,0xFD,0x7C,0x6C,0xFE,0x57 -},{ - 0xAE,0x4F,0xAE,0xAE,0x1D,0x3A,0xD3,0xD9,0x6F,0xA4,0xC3,0x3B,0x7A,0x30,0x39,0xC0, - 0x2D,0x66,0xC4,0xF9,0x51,0x42,0xA4,0x6C,0x18,0x7F,0x9A,0xB4,0x9A,0xF0,0x8E,0xC6, - 0xCF,0xFA,0xA6,0xB7,0x1C,0x9A,0xB7,0xB4,0x0A,0xF2,0x1F,0x66,0xC2,0xBE,0xC6,0xB6, - 0xBF,0x71,0xC5,0x72,0x36,0x90,0x4F,0x35,0xFA,0x68,0x40,0x7A,0x46,0x64,0x7D,0x6E -},{ - 0xF4,0xC7,0x0E,0x16,0xEE,0xAA,0xC5,0xEC,0x51,0xAC,0x86,0xFE,0xBF,0x24,0x09,0x54, - 0x39,0x9E,0xC6,0xC7,0xE6,0xBF,0x87,0xC9,0xD3,0x47,0x3E,0x33,0x19,0x7A,0x93,0xC9, - 0x09,0x92,0xAB,0xC5,0x2D,0x82,0x2C,0x37,0x06,0x47,0x69,0x83,0x28,0x4A,0x05,0x04, - 0x35,0x17,0x45,0x4C,0xA2,0x3C,0x4A,0xF3,0x88,0x86,0x56,0x4D,0x3A,0x14,0xD4,0x93 -},{ - 0x9B,0x1F,0x5B,0x42,0x4D,0x93,0xC9,0xA7,0x03,0xE7,0xAA,0x02,0x0C,0x6E,0x41,0x41, - 0x4E,0xB7,0xF8,0x71,0x9C,0x36,0xDE,0x1E,0x89,0xB4,0x44,0x3B,0x4D,0xDB,0xC4,0x9A, - 0xF4,0x89,0x2B,0xCB,0x92,0x9B,0x06,0x90,0x69,0xD1,0x8D,0x2B,0xD1,0xA5,0xC4,0x2F, - 0x36,0xAC,0xC2,0x35,0x59,0x51,0xA8,0xD9,0xA4,0x7F,0x0D,0xD4,0xBF,0x02,0xE7,0x1E -},{ - 0x37,0x8F,0x5A,0x54,0x16,0x31,0x22,0x9B,0x94,0x4C,0x9A,0xD8,0xEC,0x16,0x5F,0xDE, - 0x3A,0x7D,0x3A,0x1B,0x25,0x89,0x42,0x24,0x3C,0xD9,0x55,0xB7,0xE0,0x0D,0x09,0x84, - 
0x80,0x0A,0x44,0x0B,0xDB,0xB2,0xCE,0xB1,0x7B,0x2B,0x8A,0x9A,0xA6,0x07,0x9C,0x54, - 0x0E,0x38,0xDC,0x92,0xCB,0x1F,0x2A,0x60,0x72,0x61,0x44,0x51,0x83,0x23,0x5A,0xDB -},{ - 0xAB,0xBE,0xDE,0xA6,0x80,0x05,0x6F,0x52,0x38,0x2A,0xE5,0x48,0xB2,0xE4,0xF3,0xF3, - 0x89,0x41,0xE7,0x1C,0xFF,0x8A,0x78,0xDB,0x1F,0xFF,0xE1,0x8A,0x1B,0x33,0x61,0x03, - 0x9F,0xE7,0x67,0x02,0xAF,0x69,0x33,0x4B,0x7A,0x1E,0x6C,0x30,0x3B,0x76,0x52,0xF4, - 0x36,0x98,0xFA,0xD1,0x15,0x3B,0xB6,0xC3,0x74,0xB4,0xC7,0xFB,0x98,0x45,0x9C,0xED -},{ - 0x7B,0xCD,0x9E,0xD0,0xEF,0xC8,0x89,0xFB,0x30,0x02,0xC6,0xCD,0x63,0x5A,0xFE,0x94, - 0xD8,0xFA,0x6B,0xBB,0xEB,0xAB,0x07,0x61,0x20,0x01,0x80,0x21,0x14,0x84,0x66,0x79, - 0x8A,0x1D,0x71,0xEF,0xEA,0x48,0xB9,0xCA,0xEF,0xBA,0xCD,0x1D,0x7D,0x47,0x6E,0x98, - 0xDE,0xA2,0x59,0x4A,0xC0,0x6F,0xD8,0x5D,0x6B,0xCA,0xA4,0xCD,0x81,0xF3,0x2D,0x1B -},{ - 0x37,0x8E,0xE7,0x67,0xF1,0x16,0x31,0xBA,0xD2,0x13,0x80,0xB0,0x04,0x49,0xB1,0x7A, - 0xCD,0xA4,0x3C,0x32,0xBC,0xDF,0x1D,0x77,0xF8,0x20,0x12,0xD4,0x30,0x21,0x9F,0x9B, - 0x5D,0x80,0xEF,0x9D,0x18,0x91,0xCC,0x86,0xE7,0x1D,0xA4,0xAA,0x88,0xE1,0x28,0x52, - 0xFA,0xF4,0x17,0xD5,0xD9,0xB2,0x1B,0x99,0x48,0xBC,0x92,0x4A,0xF1,0x1B,0xD7,0x20 -}}; - - -static void AddModulo512(const void *a,const void *b,void *c) -{ - const unsigned char *A=a, *B=b; - unsigned char *C=c; - int t = 0; -#ifdef FULL_UNROLL -#define ADDBYTE_8(i) t = A[i] + B[i] + (t >> 8); C[i] = t & 0xFF; - - ADDBYTE_8(63) - ADDBYTE_8(62) - ADDBYTE_8(61) - ADDBYTE_8(60) - ADDBYTE_8(59) - ADDBYTE_8(58) - ADDBYTE_8(57) - ADDBYTE_8(56) - ADDBYTE_8(55) - ADDBYTE_8(54) - ADDBYTE_8(53) - ADDBYTE_8(52) - ADDBYTE_8(51) - ADDBYTE_8(50) - ADDBYTE_8(49) - ADDBYTE_8(48) - ADDBYTE_8(47) - ADDBYTE_8(46) - ADDBYTE_8(45) - ADDBYTE_8(44) - ADDBYTE_8(43) - ADDBYTE_8(42) - ADDBYTE_8(41) - ADDBYTE_8(40) - ADDBYTE_8(39) - ADDBYTE_8(38) - ADDBYTE_8(37) - ADDBYTE_8(36) - ADDBYTE_8(35) - ADDBYTE_8(34) - ADDBYTE_8(33) - ADDBYTE_8(32) - ADDBYTE_8(31) - ADDBYTE_8(30) - ADDBYTE_8(29) - ADDBYTE_8(28) - ADDBYTE_8(27) - 
ADDBYTE_8(26) - ADDBYTE_8(25) - ADDBYTE_8(24) - ADDBYTE_8(23) - ADDBYTE_8(22) - ADDBYTE_8(21) - ADDBYTE_8(20) - ADDBYTE_8(19) - ADDBYTE_8(18) - ADDBYTE_8(17) - ADDBYTE_8(16) - ADDBYTE_8(15) - ADDBYTE_8(14) - ADDBYTE_8(13) - ADDBYTE_8(12) - ADDBYTE_8(11) - ADDBYTE_8(10) - ADDBYTE_8(9) - ADDBYTE_8(8) - ADDBYTE_8(7) - ADDBYTE_8(6) - ADDBYTE_8(5) - ADDBYTE_8(4) - ADDBYTE_8(3) - ADDBYTE_8(2) - ADDBYTE_8(1) - ADDBYTE_8(0) - -#else - int i = 0; - - for(i=63;i>=0;i--) - { - t = A[i] + B[i] + (t >> 8); - C[i] = t & 0xFF; - } -#endif -} - -static void AddXor512(const void *a,const void *b,void *c) -{ - const unsigned long long *A=a, *B=b; - unsigned long long *C=c; -#ifdef FULL_UNROLL - C[0] = A[0] ^ B[0]; - C[1] = A[1] ^ B[1]; - C[2] = A[2] ^ B[2]; - C[3] = A[3] ^ B[3]; - C[4] = A[4] ^ B[4]; - C[5] = A[5] ^ B[5]; - C[6] = A[6] ^ B[6]; - C[7] = A[7] ^ B[7]; -#else - int i = 0; - - for(i=0; i<8; i++) { - C[i] = A[i] ^ B[i]; - } -#endif -} - -static void F(unsigned char *state) -{ - unsigned long long return_state[8]; - register unsigned long long r = 0; - r ^= TG[0][state[56]]; - r ^= TG[1][state[48]]; - r ^= TG[2][state[40]]; - r ^= TG[3][state[32]]; - r ^= TG[4][state[24]]; - r ^= TG[5][state[16]]; - r ^= TG[6][state[8]]; - r ^= TG[7][state[0]]; - return_state[0] = r; - r = 0; - - r ^= TG[0][state[57]]; - r ^= TG[1][state[49]]; - r ^= TG[2][state[41]]; - r ^= TG[3][state[33]]; - r ^= TG[4][state[25]]; - r ^= TG[5][state[17]]; - r ^= TG[6][state[9]]; - r ^= TG[7][state[1]]; - return_state[1] = r; - r = 0; - - r ^= TG[0][state[58]]; - r ^= TG[1][state[50]]; - r ^= TG[2][state[42]]; - r ^= TG[3][state[34]]; - r ^= TG[4][state[26]]; - r ^= TG[5][state[18]]; - r ^= TG[6][state[10]]; - r ^= TG[7][state[2]]; - return_state[2] = r; - r = 0; - - r ^= TG[0][state[59]]; - r ^= TG[1][state[51]]; - r ^= TG[2][state[43]]; - r ^= TG[3][state[35]]; - r ^= TG[4][state[27]]; - r ^= TG[5][state[19]]; - r ^= TG[6][state[11]]; - r ^= TG[7][state[3]]; - return_state[3] = r; - r = 0; - - r ^= 
TG[0][state[60]]; - r ^= TG[1][state[52]]; - r ^= TG[2][state[44]]; - r ^= TG[3][state[36]]; - r ^= TG[4][state[28]]; - r ^= TG[5][state[20]]; - r ^= TG[6][state[12]]; - r ^= TG[7][state[4]]; - return_state[4] = r; - r = 0; - - r ^= TG[0][state[61]]; - r ^= TG[1][state[53]]; - r ^= TG[2][state[45]]; - r ^= TG[3][state[37]]; - r ^= TG[4][state[29]]; - r ^= TG[5][state[21]]; - r ^= TG[6][state[13]]; - r ^= TG[7][state[5]]; - return_state[5] = r; - r = 0; - - r ^= TG[0][state[62]]; - r ^= TG[1][state[54]]; - r ^= TG[2][state[46]]; - r ^= TG[3][state[38]]; - r ^= TG[4][state[30]]; - r ^= TG[5][state[22]]; - r ^= TG[6][state[14]]; - r ^= TG[7][state[6]]; - return_state[6] = r; - r = 0; - - r ^= TG[0][state[63]]; - r ^= TG[1][state[55]]; - r ^= TG[2][state[47]]; - r ^= TG[3][state[39]]; - r ^= TG[4][state[31]]; - r ^= TG[5][state[23]]; - r ^= TG[6][state[15]]; - r ^= TG[7][state[7]]; - return_state[7] = r; - - memcpy(state,(unsigned char*)return_state,64); -} - -#define KeySchedule(K,i) AddXor512(K,C[i],K); F(K); - -static void E(unsigned char *K,const unsigned char *m, unsigned char *state) -{ -#ifdef FULL_UNROLL - AddXor512(m,K,state); - - F(state); - KeySchedule(K,0); - AddXor512(state,K,state); - - F(state); - KeySchedule(K,1); - AddXor512(state,K,state); - - F(state); - KeySchedule(K,2); - AddXor512(state,K,state); - - F(state); - KeySchedule(K,3); - AddXor512(state,K,state); - - F(state); - KeySchedule(K,4); - AddXor512(state,K,state); - - F(state); - KeySchedule(K,5); - AddXor512(state,K,state); - - F(state); - KeySchedule(K,6); - AddXor512(state,K,state); - - F(state); - KeySchedule(K,7); - AddXor512(state,K,state); - - F(state); - KeySchedule(K,8); - AddXor512(state,K,state); - - F(state); - KeySchedule(K,9); - AddXor512(state,K,state); - - F(state); - KeySchedule(K,10); - AddXor512(state,K,state); - - F(state); - KeySchedule(K,11); - AddXor512(state,K,state); -#else - int i = 0; - - AddXor512(m,K,state); - - for(i=0;i<12;i++) { - F(state); - KeySchedule(K,i); - 
AddXor512(state,K,state); - } -#endif -} - -static void g_N(const unsigned char *N,unsigned char *h,const unsigned char *m) -{ - unsigned char t[64], K[64]; - - AddXor512(N,h,K); - - F(K); - - E(K,m,t); - - AddXor512(t,h,t); - AddXor512(t,m,h); -} - -static void hash_X(unsigned char *IV,const unsigned char *message,unsigned long long length,unsigned char *out) -{ - unsigned char v512[64] = { - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x02,0x00 - }; - unsigned char v0[64] = { - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 - }; - unsigned char Sigma[64] = { - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 - }; - unsigned char N[64] = { - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 - }; - unsigned char m[64], *hash = IV; - unsigned long long len = length; - - // Stage 2 - while (len >= 512) - { - memcpy(m, message + len/8 - 63 - ( (len & 0x7) == 0 ), 64); - - 
g_N(N,hash,m); - AddModulo512(N,v512,N); - AddModulo512(Sigma,m,Sigma); - len -= 512; - } - - memset(m,0,64); - memcpy(m + 63 - len/8 + ( (len & 0x7) == 0 ), message, len/8 + 1 - ( (len & 0x7) == 0 )); - - // Stage 3 - m[ 63 - len/8 ] |= (1 << (len & 0x7)); - - g_N(N,hash,m); - v512[63] = len & 0xFF; - v512[62] = (unsigned char) (len >> 8); - AddModulo512(N,v512,N); - - AddModulo512(Sigma,m,Sigma); - - g_N(v0,hash,N); - g_N(v0,hash,Sigma); - - memcpy(out, hash, 64); -} - -static void hash_512(const unsigned char *message, unsigned long long length, unsigned char *out) -{ - unsigned char IV[64] = { - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 - }; - - hash_X(IV,message,length,out); -} - -static void hash_256(const unsigned char *message, unsigned long long length, unsigned char *out) -{ - unsigned char IV[64] = { - 0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, - 0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, - 0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, - 0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01 - }; - unsigned char hash[64]; - - hash_X(IV,message,length,hash); - - memcpy(out,hash,32); -} - - - - - -/* see sph_gost.h */ -void -sph_gost256_init(void *cc) -{ - //gost_init(cc, 256); -} - -/* see sph_gost.h */ -void -sph_gost256(void *cc, const void *data, size_t len) -{ - hash_256(data, 8*len, cc); -} - -/* see sph_gost.h */ -void -sph_gost256_close(void *cc, void *dst) -{ - //sph_gost256_addbits_and_close(cc, 0, 0, dst); - memcpy(dst, cc, 32); -} - -/* see sph_gost.h */ -void -sph_gost256_addbits_and_close(void *cc, unsigned ub, unsigned n, void 
*dst) -{ - //gost_close32(cc, ub, n, dst); -} - -/* see sph_gost.h */ -void -sph_gost512_init(void *cc) -{ - //gost_init(cc, 512); -} - -/* see sph_gost.h */ -void -sph_gost512(void *cc, const void *data, size_t len) -{ - hash_512(data, 8*len, cc); -} - -/* see sph_gost.h */ -void -sph_gost512_close(void *cc, void *dst) -{ - //sph_gost512_addbits_and_close(cc, 0, 0, dst); - memcpy(dst, cc, 64); -} - -/* see sph_gost.h */ -void -sph_gost512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - //gost_close64(cc, ub, n, dst); -} - - -#ifdef __cplusplus -} -#endif diff --git a/algo/gost/sph_gost.h b/algo/gost/sph_gost.h deleted file mode 100644 index 6defe67..0000000 --- a/algo/gost/sph_gost.h +++ /dev/null @@ -1,185 +0,0 @@ -/* $Id: sph_gost.h 216 2010-06-08 09:46:57Z tp $ */ -/** - * GOST interface. This is the interface for GOST R 12 with the - * recommended parameters for SHA-3, with output lengths 256 - * and 512 bits. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_gost.h - * @author Mish - */ - -#ifndef SPH_GOST_H__ -#define SPH_GOST_H__ - -#ifdef __cplusplus -extern "C"{ -#endif - -#include -#include "algo/sha/sph_types.h" - -/** - * Output size (in bits) for GOST-256. - */ -#define SPH_SIZE_gost256 256 - -/** - * Output size (in bits) for GOST-512. - */ -#define SPH_SIZE_gost512 512 - -/** - * This structure is a context for Keccak computations: it contains the - * intermediate values and some data from the last entered block. Once a - * GOST computation has been performed, the context can be reused for - * another computation. - * - * The contents of this structure are private. A running GOST computation - * can be cloned by copying the context (e.g. with a simple - * memcpy()). - */ - -/** - * This structure is a context for Gost-256 computations. - */ - -typedef struct { -#ifndef DOXYGEN_IGNORE - unsigned char buf[32]; /* first field, for alignment */ - size_t ptr; - sph_u32 V[3][8]; -#endif -} sph_gost256_context; - -/** - * This structure is a context for Gost-512 computations. - */ -typedef struct { -#ifndef DOXYGEN_IGNORE - unsigned char buf[64]; /* first field, for alignment */ - size_t ptr; - sph_u32 V[5][8]; -#endif -} sph_gost512_context; - - -/** - * Initialize a GOST-256 context. This process performs no memory allocation. - * - * @param cc the GOST-256 context (pointer to a - * sph_gost256_context) - */ -void sph_gost256_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). 
- * - * @param cc the Gost-256 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_gost256(void *cc, const void *data, size_t len); - -/** - * Terminate the current GOST-256 computation and output the result into - * the provided buffer. The destination buffer must be wide enough to - * accomodate the result (32 bytes). The context is automatically - * reinitialized. - * - * @param cc the GOST-256 context - * @param dst the destination buffer - */ -void sph_gost256_close(void *cc, void *dst); - -/** - * Add a few additional bits (0 to 7) to the current computation, then - * terminate it and output the result in the provided buffer, which must - * be wide enough to accomodate the result (32 bytes). If bit number i - * in ub has value 2^i, then the extra bits are those - * numbered 7 downto 8-n (this is the big-endian convention at the byte - * level). The context is automatically reinitialized. - * - * @param cc the GOST-256 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the destination buffer - */ -void sph_gost256_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - -/** - * Initialize a Gost-512 context. This process performs no memory allocation. - * - * @param cc the GOST-512 context (pointer to a - * sph_gost512_context) - */ -void sph_gost512_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). - * - * @param cc the GOST-512 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_gost512(void *cc, const void *data, size_t len); - -/** - * Terminate the current GOST-512 computation and output the result into - * the provided buffer. The destination buffer must be wide enough to - * accomodate the result (64 bytes). The context is automatically - * reinitialized. 
- * - * @param cc the GOST-512 context - * @param dst the destination buffer - */ -void sph_gost512_close(void *cc, void *dst); - -/** - * Add a few additional bits (0 to 7) to the current computation, then - * terminate it and output the result in the provided buffer, which must - * be wide enough to accomodate the result (64 bytes). If bit number i - * in ub has value 2^i, then the extra bits are those - * numbered 7 downto 8-n (this is the big-endian convention at the byte - * level). The context is automatically reinitialized. - * - * @param cc the GOST-512 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the destination buffer - */ -void sph_gost512_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/algo/gr/cryptonote/crypto/.dirstamp b/algo/gr/cryptonote/crypto/.dirstamp new file mode 100644 index 0000000..e69de29 diff --git a/algo/gr/cryptonote/cryptonight.c b/algo/gr/cryptonote/cryptonight.c deleted file mode 100644 index a80d61d..0000000 --- a/algo/gr/cryptonote/cryptonight.c +++ /dev/null @@ -1,300 +0,0 @@ -// Copyright (c) 2012-2013 The Cryptonote developers -// Distributed under the MIT/X11 software license, see the accompanying -// file COPYING or http://www.opensource.org/licenses/mit-license.php. 
-// Portions Copyright (c) 2018 The Monero developers -// Portions Copyright (c) 2018 The TurtleCoin Developers - -#include -#include -#include "crypto/oaes_lib.h" -#include "crypto/c_keccak.h" -#include "crypto/c_groestl.h" -#include "crypto/c_blake256.h" -#include "crypto/c_jh.h" -#include "crypto/c_skein.h" -#include "crypto/int-util.h" -#include "crypto/hash-ops.h" -#include "crypto/variant2_int_sqrt.h" - -#if defined(_MSC_VER) -#include -#endif - -#define MEMORY 2097152 /* 2 MiB 2^21 */ -#define ITER 1048576 /* 2^20 */ -#define ITER_DIV 524288 /* 2^19 */ -#define AES_BLOCK_SIZE 16 -#define AES_KEY_SIZE 32 /*16*/ -#define INIT_SIZE_BLK 8 -#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) -#define CN_INIT (MEMORY / INIT_SIZE_BYTE) -#define CN_AES_INIT (MEMORY / AES_BLOCK_SIZE) - -#define VARIANT1_1(p) \ - do if (variant == 1) \ - { \ - const uint8_t tmp = ((const uint8_t*)(p))[11]; \ - static const uint32_t table = 0x75310; \ - const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; \ - ((uint8_t*)(p))[11] = tmp ^ ((table >> index) & 0x30); \ - } while(0) - -#define VARIANT1_2(p) \ - do if (variant == 1) \ - { \ - ((uint64_t*)p)[1] ^= tweak1_2; \ - } while(0) - -#define VARIANT1_INIT() \ - if (variant == 1 && len < 43) \ - { \ - fprintf(stderr, "Cryptonight variant 1 needs at least 43 bytes of data"); \ - _exit(1); \ - } \ - const uint64_t tweak1_2 = (variant == 1) ? 
*(const uint64_t*)(((const uint8_t*)input)+35) ^ ctx->state.hs.w[24] : 0 - -#define U64(p) ((uint64_t*)(p)) - -#define VARIANT2_INIT(b, state) \ - uint64_t division_result; \ - uint64_t sqrt_result; \ - do if (variant >= 2) \ - { \ - U64(b)[2] = state.hs.w[8] ^ state.hs.w[10]; \ - U64(b)[3] = state.hs.w[9] ^ state.hs.w[11]; \ - division_result = state.hs.w[12]; \ - sqrt_result = state.hs.w[13]; \ - } while (0) - -#define VARIANT2_SHUFFLE_ADD(base_ptr, offset, a, b) \ - do if (variant >= 2) \ - { \ - uint64_t* chunk1 = U64((base_ptr) + ((offset) ^ 0x10)); \ - uint64_t* chunk2 = U64((base_ptr) + ((offset) ^ 0x20)); \ - uint64_t* chunk3 = U64((base_ptr) + ((offset) ^ 0x30)); \ - \ - const uint64_t chunk1_old[2] = { chunk1[0], chunk1[1] }; \ - \ - chunk1[0] = chunk3[0] + U64(b + 16)[0]; \ - chunk1[1] = chunk3[1] + U64(b + 16)[1]; \ - \ - chunk3[0] = chunk2[0] + U64(a)[0]; \ - chunk3[1] = chunk2[1] + U64(a)[1]; \ - \ - chunk2[0] = chunk1_old[0] + U64(b)[0]; \ - chunk2[1] = chunk1_old[1] + U64(b)[1]; \ - } while (0) - -#define VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr) \ - ((uint64_t*)(b))[0] ^= division_result ^ (sqrt_result << 32); \ - { \ - const uint64_t dividend = ((uint64_t*)(ptr))[1]; \ - const uint32_t divisor = (((uint32_t*)(ptr))[0] + (uint32_t)(sqrt_result << 1)) | 0x80000001UL; \ - division_result = ((uint32_t)(dividend / divisor)) + \ - (((uint64_t)(dividend % divisor)) << 32); \ - } \ - const uint64_t sqrt_input = ((uint64_t*)(ptr))[0] + division_result - -#define VARIANT2_INTEGER_MATH(b, ptr) \ - do if (variant >= 2) \ - { \ - VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr); \ - VARIANT2_INTEGER_MATH_SQRT_STEP_FP64(); \ - VARIANT2_INTEGER_MATH_SQRT_FIXUP(sqrt_result); \ - } while (0) - -#define VARIANT2_2() \ - do if (variant >= 2) { \ - ((uint64_t*)(ctx->long_state + ((j * AES_BLOCK_SIZE) ^ 0x10)))[0] ^= hi; \ - ((uint64_t*)(ctx->long_state + ((j * AES_BLOCK_SIZE) ^ 0x10)))[1] ^= lo; \ - hi ^= ((uint64_t*)(ctx->long_state + ((j * AES_BLOCK_SIZE) ^ 
0x20)))[0]; \ - lo ^= ((uint64_t*)(ctx->long_state + ((j * AES_BLOCK_SIZE) ^ 0x20)))[1]; \ - } while (0) - -#pragma pack(push, 1) -union cn_slow_hash_state { - union hash_state hs; - struct { - uint8_t k[64]; - uint8_t init[INIT_SIZE_BYTE]; - }; -}; -#pragma pack(pop) - -static void do_blake_hash(const void* input, size_t len, char* output) { - blake256_hash((uint8_t*)output, input, len); -} - -void do_groestl_hash(const void* input, size_t len, char* output) { - groestl(input, len * 8, (uint8_t*)output); -} - -static void do_jh_hash(const void* input, size_t len, char* output) { - int r = jh_hash(HASH_SIZE * 8, input, 8 * len, (uint8_t*)output); - assert(SUCCESS == r); -} - -static void do_skein_hash(const void* input, size_t len, char* output) { - int r = c_skein_hash(8 * HASH_SIZE, input, 8 * len, (uint8_t*)output); - assert(SKEIN_SUCCESS == r); -} - -static void (* const extra_hashes[4])(const void *, size_t, char *) = { - do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash -}; - -extern int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey); -extern int aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey); - -static inline size_t e2i(const uint8_t* a) { - return (*((uint64_t*) a) / AES_BLOCK_SIZE) & (CN_AES_INIT - 1); -} - -static void mul(const uint8_t* a, const uint8_t* b, uint8_t* res) { - ((uint64_t*) res)[1] = mul128(((uint64_t*) a)[0], ((uint64_t*) b)[0], (uint64_t*) res); -} - -static void sum_half_blocks(uint8_t* a, const uint8_t* b) { - uint64_t a0, a1, b0, b1; - - a0 = SWAP64LE(((uint64_t*) a)[0]); - a1 = SWAP64LE(((uint64_t*) a)[1]); - b0 = SWAP64LE(((uint64_t*) b)[0]); - b1 = SWAP64LE(((uint64_t*) b)[1]); - a0 += b0; - a1 += b1; - ((uint64_t*) a)[0] = SWAP64LE(a0); - ((uint64_t*) a)[1] = SWAP64LE(a1); -} - -static inline void copy_block(uint8_t* dst, const uint8_t* src) { - ((uint64_t*) dst)[0] = ((uint64_t*) src)[0]; - ((uint64_t*) dst)[1] = ((uint64_t*) src)[1]; -} - -static void 
swap_blocks(uint8_t* a, uint8_t* b) { - size_t i; - uint8_t t; - for (i = 0; i < AES_BLOCK_SIZE; i++) { - t = a[i]; - a[i] = b[i]; - b[i] = t; - } -} - -static inline void xor_blocks(uint8_t* a, const uint8_t* b) { - ((uint64_t*) a)[0] ^= ((uint64_t*) b)[0]; - ((uint64_t*) a)[1] ^= ((uint64_t*) b)[1]; -} - -static inline void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) { - ((uint64_t*) dst)[0] = ((uint64_t*) a)[0] ^ ((uint64_t*) b)[0]; - ((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1]; -} - -struct cryptonight_ctx { - uint8_t long_state[MEMORY]; - union cn_slow_hash_state state; - uint8_t text[INIT_SIZE_BYTE]; - uint8_t a[AES_BLOCK_SIZE]; - uint8_t b[AES_BLOCK_SIZE * 2]; - uint8_t c[AES_BLOCK_SIZE]; - uint8_t aes_key[AES_KEY_SIZE]; - oaes_ctx* aes_ctx; -}; - -void cryptonight_hash(const char* input, char* output, uint32_t len, int variant) { -#if defined(_MSC_VER) - struct cryptonight_ctx *ctx = _malloca(sizeof(struct cryptonight_ctx)); -#else - struct cryptonight_ctx *ctx = alloca(sizeof(struct cryptonight_ctx)); -#endif - hash_process(&ctx->state.hs, (const uint8_t*) input, len); - memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); - memcpy(ctx->aes_key, ctx->state.hs.b, AES_KEY_SIZE); - ctx->aes_ctx = (oaes_ctx*) oaes_alloc(); - size_t i, j; - - VARIANT1_INIT(); - VARIANT2_INIT(ctx->b, ctx->state); - - oaes_key_import_data(ctx->aes_ctx, ctx->aes_key, AES_KEY_SIZE); - for (i = 0; i < CN_INIT; i++) { - for (j = 0; j < INIT_SIZE_BLK; j++) { - aesb_pseudo_round(&ctx->text[AES_BLOCK_SIZE * j], - &ctx->text[AES_BLOCK_SIZE * j], - ctx->aes_ctx->key->exp_data); - } - memcpy(&ctx->long_state[i * INIT_SIZE_BYTE], ctx->text, INIT_SIZE_BYTE); - } - - for (i = 0; i < 16; i++) { - ctx->a[i] = ctx->state.k[i] ^ ctx->state.k[32 + i]; - ctx->b[i] = ctx->state.k[16 + i] ^ ctx->state.k[48 + i]; - } - - for (i = 0; i < ITER_DIV; i++) { - /* Dependency chain: address -> read value ------+ - * written value <-+ hard function (AES or MUL) <+ - * 
next address <-+ - */ - /* Iteration 1 */ - j = e2i(ctx->a); - aesb_single_round(&ctx->long_state[j * AES_BLOCK_SIZE], ctx->c, ctx->a); - VARIANT2_SHUFFLE_ADD(ctx->long_state, j * AES_BLOCK_SIZE, ctx->a, ctx->b); - xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j * AES_BLOCK_SIZE]); - VARIANT1_1((uint8_t*)&ctx->long_state[j * AES_BLOCK_SIZE]); - /* Iteration 2 */ - j = e2i(ctx->c); - - uint64_t* dst = (uint64_t*)&ctx->long_state[j * AES_BLOCK_SIZE]; - - uint64_t t[2]; - t[0] = dst[0]; - t[1] = dst[1]; - - VARIANT2_INTEGER_MATH(t, ctx->c); - - uint64_t hi; - uint64_t lo = mul128(((uint64_t*)ctx->c)[0], t[0], &hi); - - VARIANT2_2(); - VARIANT2_SHUFFLE_ADD(ctx->long_state, j * AES_BLOCK_SIZE, ctx->a, ctx->b); - - ((uint64_t*)ctx->a)[0] += hi; - ((uint64_t*)ctx->a)[1] += lo; - - dst[0] = ((uint64_t*)ctx->a)[0]; - dst[1] = ((uint64_t*)ctx->a)[1]; - - ((uint64_t*)ctx->a)[0] ^= t[0]; - ((uint64_t*)ctx->a)[1] ^= t[1]; - - VARIANT1_2((uint8_t*)&ctx->long_state[j * AES_BLOCK_SIZE]); - copy_block(ctx->b + AES_BLOCK_SIZE, ctx->b); - copy_block(ctx->b, ctx->c); - } - - memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); - oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE); - for (i = 0; i < CN_INIT; i++) { - for (j = 0; j < INIT_SIZE_BLK; j++) { - xor_blocks(&ctx->text[j * AES_BLOCK_SIZE], - &ctx->long_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]); - aesb_pseudo_round(&ctx->text[j * AES_BLOCK_SIZE], - &ctx->text[j * AES_BLOCK_SIZE], - ctx->aes_ctx->key->exp_data); - } - } - memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE); - hash_permutation(&ctx->state.hs); - /*memcpy(hash, &state, 32);*/ - extra_hashes[ctx->state.hs.b[0] & 3](&ctx->state, 200, output); - oaes_free((OAES_CTX **) &ctx->aes_ctx); -} - -void cryptonight_fast_hash(const char* input, char* output, uint32_t len) { - union hash_state state; - hash_process(&state, (const uint8_t*) input, len); - memcpy(output, &state, HASH_SIZE); -} diff --git a/algo/gr/cryptonote/cryptonight.h 
b/algo/gr/cryptonote/cryptonight.h deleted file mode 100644 index 7d99ec0..0000000 --- a/algo/gr/cryptonote/cryptonight.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef CRYPTONIGHT_H -#define CRYPTONIGHT_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include - -void cryptonight_hash(const char* input, char* output, uint32_t len, int variant); -void cryptonight_fast_hash(const char* input, char* output, uint32_t len); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/algo/gr/cryptonote/cryptonight_dark.c b/algo/gr/cryptonote/cryptonight_dark.c index 9c8af92..ab57e7b 100644 --- a/algo/gr/cryptonote/cryptonight_dark.c +++ b/algo/gr/cryptonote/cryptonight_dark.c @@ -6,6 +6,7 @@ #include #include +#include #include "crypto/oaes_lib.h" #include "crypto/c_keccak.h" #include "crypto/c_groestl.h" @@ -150,38 +151,11 @@ static inline size_t e2i(const uint8_t* a) { return (*((uint64_t*) a) / AES_BLOCK_SIZE) & (CN_AES_INIT - 1); } -static void mul(const uint8_t* a, const uint8_t* b, uint8_t* res) { - ((uint64_t*) res)[1] = mul128(((uint64_t*) a)[0], ((uint64_t*) b)[0], (uint64_t*) res); -} - -static void sum_half_blocks(uint8_t* a, const uint8_t* b) { - uint64_t a0, a1, b0, b1; - - a0 = SWAP64LE(((uint64_t*) a)[0]); - a1 = SWAP64LE(((uint64_t*) a)[1]); - b0 = SWAP64LE(((uint64_t*) b)[0]); - b1 = SWAP64LE(((uint64_t*) b)[1]); - a0 += b0; - a1 += b1; - ((uint64_t*) a)[0] = SWAP64LE(a0); - ((uint64_t*) a)[1] = SWAP64LE(a1); -} - static inline void copy_block(uint8_t* dst, const uint8_t* src) { ((uint64_t*) dst)[0] = ((uint64_t*) src)[0]; ((uint64_t*) dst)[1] = ((uint64_t*) src)[1]; } -static void swap_blocks(uint8_t* a, uint8_t* b) { - size_t i; - uint8_t t; - for (i = 0; i < AES_BLOCK_SIZE; i++) { - t = a[i]; - a[i] = b[i]; - b[i] = t; - } -} - static inline void xor_blocks(uint8_t* a, const uint8_t* b) { ((uint64_t*) a)[0] ^= ((uint64_t*) b)[0]; ((uint64_t*) a)[1] ^= ((uint64_t*) b)[1]; diff --git a/algo/gr/cryptonote/cryptonight_dark_lite.c 
b/algo/gr/cryptonote/cryptonight_dark_lite.c index 15b3e09..16ad0f5 100644 --- a/algo/gr/cryptonote/cryptonight_dark_lite.c +++ b/algo/gr/cryptonote/cryptonight_dark_lite.c @@ -6,6 +6,7 @@ #include #include +#include #include "crypto/oaes_lib.h" #include "crypto/c_keccak.h" #include "crypto/c_groestl.h" @@ -150,38 +151,11 @@ static inline size_t e2i(const uint8_t* a) { return (*((uint64_t*) a) / AES_BLOCK_SIZE) & (CN_AES_INIT - 1); } -static void mul(const uint8_t* a, const uint8_t* b, uint8_t* res) { - ((uint64_t*) res)[1] = mul128(((uint64_t*) a)[0], ((uint64_t*) b)[0], (uint64_t*) res); -} - -static void sum_half_blocks(uint8_t* a, const uint8_t* b) { - uint64_t a0, a1, b0, b1; - - a0 = SWAP64LE(((uint64_t*) a)[0]); - a1 = SWAP64LE(((uint64_t*) a)[1]); - b0 = SWAP64LE(((uint64_t*) b)[0]); - b1 = SWAP64LE(((uint64_t*) b)[1]); - a0 += b0; - a1 += b1; - ((uint64_t*) a)[0] = SWAP64LE(a0); - ((uint64_t*) a)[1] = SWAP64LE(a1); -} - static inline void copy_block(uint8_t* dst, const uint8_t* src) { ((uint64_t*) dst)[0] = ((uint64_t*) src)[0]; ((uint64_t*) dst)[1] = ((uint64_t*) src)[1]; } -static void swap_blocks(uint8_t* a, uint8_t* b) { - size_t i; - uint8_t t; - for (i = 0; i < AES_BLOCK_SIZE; i++) { - t = a[i]; - a[i] = b[i]; - b[i] = t; - } -} - static inline void xor_blocks(uint8_t* a, const uint8_t* b) { ((uint64_t*) a)[0] ^= ((uint64_t*) b)[0]; ((uint64_t*) a)[1] ^= ((uint64_t*) b)[1]; diff --git a/algo/gr/cryptonote/cryptonight_fast.c b/algo/gr/cryptonote/cryptonight_fast.c index 8145f76..a404eb9 100644 --- a/algo/gr/cryptonote/cryptonight_fast.c +++ b/algo/gr/cryptonote/cryptonight_fast.c @@ -6,6 +6,7 @@ #include #include +#include #include "crypto/oaes_lib.h" #include "crypto/c_keccak.h" #include "crypto/c_groestl.h" @@ -150,38 +151,11 @@ static inline size_t e2i(const uint8_t* a) { return (*((uint64_t*) a) / AES_BLOCK_SIZE) & (CN_AES_INIT - 1); } -static void mul(const uint8_t* a, const uint8_t* b, uint8_t* res) { - ((uint64_t*) res)[1] = mul128(((uint64_t*) 
a)[0], ((uint64_t*) b)[0], (uint64_t*) res); -} - -static void sum_half_blocks(uint8_t* a, const uint8_t* b) { - uint64_t a0, a1, b0, b1; - - a0 = SWAP64LE(((uint64_t*) a)[0]); - a1 = SWAP64LE(((uint64_t*) a)[1]); - b0 = SWAP64LE(((uint64_t*) b)[0]); - b1 = SWAP64LE(((uint64_t*) b)[1]); - a0 += b0; - a1 += b1; - ((uint64_t*) a)[0] = SWAP64LE(a0); - ((uint64_t*) a)[1] = SWAP64LE(a1); -} - static inline void copy_block(uint8_t* dst, const uint8_t* src) { ((uint64_t*) dst)[0] = ((uint64_t*) src)[0]; ((uint64_t*) dst)[1] = ((uint64_t*) src)[1]; } -static void swap_blocks(uint8_t* a, uint8_t* b) { - size_t i; - uint8_t t; - for (i = 0; i < AES_BLOCK_SIZE; i++) { - t = a[i]; - a[i] = b[i]; - b[i] = t; - } -} - static inline void xor_blocks(uint8_t* a, const uint8_t* b) { ((uint64_t*) a)[0] ^= ((uint64_t*) b)[0]; ((uint64_t*) a)[1] ^= ((uint64_t*) b)[1]; diff --git a/algo/gr/cryptonote/cryptonight_lite.c b/algo/gr/cryptonote/cryptonight_lite.c index d8af93f..52a407b 100644 --- a/algo/gr/cryptonote/cryptonight_lite.c +++ b/algo/gr/cryptonote/cryptonight_lite.c @@ -6,6 +6,7 @@ #include #include +#include #include "crypto/oaes_lib.h" #include "crypto/c_keccak.h" #include "crypto/c_groestl.h" @@ -150,38 +151,11 @@ static inline size_t e2i(const uint8_t* a) { return (*((uint64_t*) a) / AES_BLOCK_SIZE) & (CN_AES_INIT - 1); } -static void mul(const uint8_t* a, const uint8_t* b, uint8_t* res) { - ((uint64_t*) res)[1] = mul128(((uint64_t*) a)[0], ((uint64_t*) b)[0], (uint64_t*) res); -} - -static void sum_half_blocks(uint8_t* a, const uint8_t* b) { - uint64_t a0, a1, b0, b1; - - a0 = SWAP64LE(((uint64_t*) a)[0]); - a1 = SWAP64LE(((uint64_t*) a)[1]); - b0 = SWAP64LE(((uint64_t*) b)[0]); - b1 = SWAP64LE(((uint64_t*) b)[1]); - a0 += b0; - a1 += b1; - ((uint64_t*) a)[0] = SWAP64LE(a0); - ((uint64_t*) a)[1] = SWAP64LE(a1); -} - static inline void copy_block(uint8_t* dst, const uint8_t* src) { ((uint64_t*) dst)[0] = ((uint64_t*) src)[0]; ((uint64_t*) dst)[1] = ((uint64_t*) src)[1]; } 
-static void swap_blocks(uint8_t* a, uint8_t* b) { - size_t i; - uint8_t t; - for (i = 0; i < AES_BLOCK_SIZE; i++) { - t = a[i]; - a[i] = b[i]; - b[i] = t; - } -} - static inline void xor_blocks(uint8_t* a, const uint8_t* b) { ((uint64_t*) a)[0] ^= ((uint64_t*) b)[0]; ((uint64_t*) a)[1] ^= ((uint64_t*) b)[1]; diff --git a/algo/gr/cryptonote/cryptonight_soft_shell.c b/algo/gr/cryptonote/cryptonight_soft_shell.c deleted file mode 100644 index 9299b4c..0000000 --- a/algo/gr/cryptonote/cryptonight_soft_shell.c +++ /dev/null @@ -1,298 +0,0 @@ -// Copyright (c) 2012-2013 The Cryptonote developers -// Distributed under the MIT/X11 software license, see the accompanying -// file COPYING or http://www.opensource.org/licenses/mit-license.php. -// Portions Copyright (c) 2018 The Monero developers -// Portions Copyright (c) 2018 The TurtleCoin Developers - -#include -#include -#include "crypto/oaes_lib.h" -#include "crypto/c_keccak.h" -#include "crypto/c_groestl.h" -#include "crypto/c_blake256.h" -#include "crypto/c_jh.h" -#include "crypto/c_skein.h" -#include "crypto/int-util.h" -#include "crypto/hash-ops.h" -#include "crypto/variant2_int_sqrt.h" - -#if defined(_MSC_VER) -#include -#endif - -// Standard Crypto Definitions -#define AES_BLOCK_SIZE 16 -#define AES_KEY_SIZE 32 -#define INIT_SIZE_BLK 8 -#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) - -#define VARIANT1_1(p) \ - do if (variant == 1) \ - { \ - const uint8_t tmp = ((const uint8_t*)(p))[11]; \ - static const uint32_t table = 0x75310; \ - const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; \ - ((uint8_t*)(p))[11] = tmp ^ ((table >> index) & 0x30); \ - } while(0) - -#define VARIANT1_2(p) \ - do if (variant == 1) \ - { \ - ((uint64_t*)p)[1] ^= tweak1_2; \ - } while(0) - -#define VARIANT1_INIT() \ - if (variant == 1 && len < 43) \ - { \ - fprintf(stderr, "Cryptonight variant 1 needs at least 43 bytes of data"); \ - _exit(1); \ - } \ - const uint64_t tweak1_2 = (variant == 1) ? 
*(const uint64_t*)(((const uint8_t*)input)+35) ^ state.hs.w[24] : 0 - -#define U64(p) ((uint64_t*)(p)) - -#define VARIANT2_INIT(b, state) \ - uint64_t division_result; \ - uint64_t sqrt_result; \ - do if (variant >= 2) \ - { \ - U64(b)[2] = state.hs.w[8] ^ state.hs.w[10]; \ - U64(b)[3] = state.hs.w[9] ^ state.hs.w[11]; \ - division_result = state.hs.w[12]; \ - sqrt_result = state.hs.w[13]; \ - } while (0) - -#define VARIANT2_SHUFFLE_ADD(base_ptr, offset, a, b) \ - do if (variant >= 2) \ - { \ - uint64_t* chunk1 = U64((base_ptr) + ((offset) ^ 0x10)); \ - uint64_t* chunk2 = U64((base_ptr) + ((offset) ^ 0x20)); \ - uint64_t* chunk3 = U64((base_ptr) + ((offset) ^ 0x30)); \ - \ - const uint64_t chunk1_old[2] = { chunk1[0], chunk1[1] }; \ - \ - chunk1[0] = chunk3[0] + U64(b + 16)[0]; \ - chunk1[1] = chunk3[1] + U64(b + 16)[1]; \ - \ - chunk3[0] = chunk2[0] + U64(a)[0]; \ - chunk3[1] = chunk2[1] + U64(a)[1]; \ - \ - chunk2[0] = chunk1_old[0] + U64(b)[0]; \ - chunk2[1] = chunk1_old[1] + U64(b)[1]; \ - } while (0) - -#define VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr) \ - ((uint64_t*)(b))[0] ^= division_result ^ (sqrt_result << 32); \ - { \ - const uint64_t dividend = ((uint64_t*)(ptr))[1]; \ - const uint32_t divisor = (((uint32_t*)(ptr))[0] + (uint32_t)(sqrt_result << 1)) | 0x80000001UL; \ - division_result = ((uint32_t)(dividend / divisor)) + \ - (((uint64_t)(dividend % divisor)) << 32); \ - } \ - const uint64_t sqrt_input = ((uint64_t*)(ptr))[0] + division_result - -#define VARIANT2_INTEGER_MATH(b, ptr) \ - do if (variant >= 2) \ - { \ - VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr); \ - VARIANT2_INTEGER_MATH_SQRT_STEP_FP64(); \ - VARIANT2_INTEGER_MATH_SQRT_FIXUP(sqrt_result); \ - } while (0) - -#define VARIANT2_2() \ - do if (variant >= 2) { \ - ((uint64_t*)(long_state + ((j * AES_BLOCK_SIZE) ^ 0x10)))[0] ^= hi; \ - ((uint64_t*)(long_state + ((j * AES_BLOCK_SIZE) ^ 0x10)))[1] ^= lo; \ - hi ^= ((uint64_t*)(long_state + ((j * AES_BLOCK_SIZE) ^ 0x20)))[0]; \ - lo ^= 
((uint64_t*)(long_state + ((j * AES_BLOCK_SIZE) ^ 0x20)))[1]; \ - } while (0) - -#pragma pack(push, 1) -union cn_slow_hash_state { - union hash_state hs; - struct { - uint8_t k[64]; - uint8_t init[INIT_SIZE_BYTE]; - }; -}; -#pragma pack(pop) - -static void do_soft_shell_blake_hash(const void* input, size_t len, char* output) { - blake256_hash((uint8_t*)output, input, len); -} - -void do_soft_shell_groestl_hash(const void* input, size_t len, char* output) { - groestl(input, len * 8, (uint8_t*)output); -} - -static void do_soft_shell_jh_hash(const void* input, size_t len, char* output) { - int r = jh_hash(HASH_SIZE * 8, input, 8 * len, (uint8_t*)output); - assert(SUCCESS == r); -} - -static void do_soft_shell_skein_hash(const void* input, size_t len, char* output) { - int r = c_skein_hash(8 * HASH_SIZE, input, 8 * len, (uint8_t*)output); - assert(SKEIN_SUCCESS == r); -} - -static void (* const extra_hashes[4])(const void *, size_t, char *) = { - do_soft_shell_blake_hash, do_soft_shell_groestl_hash, do_soft_shell_jh_hash, do_soft_shell_skein_hash -}; - -extern int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey); -extern int aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey); - -static inline size_t e2i(const uint8_t* a, size_t count) { - return (*((uint64_t*) a) / AES_BLOCK_SIZE) & (count - 1); -} - -static void mul(const uint8_t* a, const uint8_t* b, uint8_t* res) { - ((uint64_t*) res)[1] = mul128(((uint64_t*) a)[0], ((uint64_t*) b)[0], (uint64_t*) res); -} - -static void sum_half_blocks(uint8_t* a, const uint8_t* b) { - uint64_t a0, a1, b0, b1; - - a0 = SWAP64LE(((uint64_t*) a)[0]); - a1 = SWAP64LE(((uint64_t*) a)[1]); - b0 = SWAP64LE(((uint64_t*) b)[0]); - b1 = SWAP64LE(((uint64_t*) b)[1]); - a0 += b0; - a1 += b1; - ((uint64_t*) a)[0] = SWAP64LE(a0); - ((uint64_t*) a)[1] = SWAP64LE(a1); -} - -static inline void copy_block(uint8_t* dst, const uint8_t* src) { - ((uint64_t*) dst)[0] = ((uint64_t*) src)[0]; - 
((uint64_t*) dst)[1] = ((uint64_t*) src)[1]; -} - -static void swap_blocks(uint8_t* a, uint8_t* b) { - size_t i; - uint8_t t; - for (i = 0; i < AES_BLOCK_SIZE; i++) { - t = a[i]; - a[i] = b[i]; - b[i] = t; - } -} - -static inline void xor_blocks(uint8_t* a, const uint8_t* b) { - ((uint64_t*) a)[0] ^= ((uint64_t*) b)[0]; - ((uint64_t*) a)[1] ^= ((uint64_t*) b)[1]; -} - -static inline void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) { - ((uint64_t*) dst)[0] = ((uint64_t*) a)[0] ^ ((uint64_t*) b)[0]; - ((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1]; -} - -void cryptonight_soft_shell_hash(const char* input, char* output, uint32_t len, int variant, uint32_t scratchpad, uint32_t iterations) { - union cn_slow_hash_state state; - uint8_t text[INIT_SIZE_BYTE]; - uint8_t a[AES_BLOCK_SIZE]; - uint8_t b[AES_BLOCK_SIZE]; - uint8_t c[AES_BLOCK_SIZE]; - uint8_t aes_key[AES_KEY_SIZE]; - oaes_ctx* aes_ctx; - -#if defined(_MSC_VER) - uint8_t *long_state = (uint8_t *)_malloca(scratchpad); -#else - uint8_t *long_state = (uint8_t *)malloc(scratchpad); -#endif - - size_t CN_INIT = (scratchpad / INIT_SIZE_BYTE); - size_t ITER_DIV = (iterations / 2); - size_t CN_AES_INIT = (scratchpad / AES_BLOCK_SIZE) / 2; - - hash_process(&state.hs, (const uint8_t*) input, len); - memcpy(text, state.init, INIT_SIZE_BYTE); - memcpy(aes_key, state.hs.b, AES_KEY_SIZE); - aes_ctx = (oaes_ctx*) oaes_alloc(); - size_t i, j; - - VARIANT1_INIT(); - VARIANT2_INIT(b, state); - - oaes_key_import_data(aes_ctx, aes_key, AES_KEY_SIZE); - for (i = 0; i < CN_INIT; i++) { - for (j = 0; j < INIT_SIZE_BLK; j++) { - aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], - &text[AES_BLOCK_SIZE * j], - aes_ctx->key->exp_data); - } - memcpy(&long_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE); - } - - for (i = 0; i < 16; i++) { - a[i] = state.k[i] ^ state.k[32 + i]; - b[i] = state.k[16 + i] ^ state.k[48 + i]; - } - - for (i = 0; i < ITER_DIV; i++) { - /* Dependency chain: address -> read value 
------+ - * written value <-+ hard function (AES or MUL) <+ - * next address <-+ - */ - /* Iteration 1 */ - j = e2i(a, CN_AES_INIT); - aesb_single_round(&long_state[j * AES_BLOCK_SIZE], c, a); - VARIANT2_SHUFFLE_ADD(long_state, j * AES_BLOCK_SIZE, a, b); - xor_blocks_dst(c, b, &long_state[j * AES_BLOCK_SIZE]); - VARIANT1_1((uint8_t*)&long_state[j * AES_BLOCK_SIZE]); - /* Iteration 2 */ - j = e2i(c, CN_AES_INIT); - - uint64_t* dst = (uint64_t*)&long_state[j * AES_BLOCK_SIZE]; - - uint64_t t[2]; - t[0] = dst[0]; - t[1] = dst[1]; - - VARIANT2_INTEGER_MATH(t, c); - - uint64_t hi; - uint64_t lo = mul128(((uint64_t*)c)[0], t[0], &hi); - - VARIANT2_2(); - VARIANT2_SHUFFLE_ADD(long_state, j * AES_BLOCK_SIZE, a, b); - - ((uint64_t*)a)[0] += hi; - ((uint64_t*)a)[1] += lo; - - dst[0] = ((uint64_t*)a)[0]; - dst[1] = ((uint64_t*)a)[1]; - - ((uint64_t*)a)[0] ^= t[0]; - ((uint64_t*)a)[1] ^= t[1]; - - VARIANT1_2((uint8_t*)&long_state[j * AES_BLOCK_SIZE]); - copy_block(b + AES_BLOCK_SIZE, b); - copy_block(b, c); - } - - memcpy(text, state.init, INIT_SIZE_BYTE); - oaes_key_import_data(aes_ctx, &state.hs.b[32], AES_KEY_SIZE); - for (i = 0; i < CN_INIT; i++) { - for (j = 0; j < INIT_SIZE_BLK; j++) { - xor_blocks(&text[j * AES_BLOCK_SIZE], - &long_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]); - aesb_pseudo_round(&text[j * AES_BLOCK_SIZE], - &text[j * AES_BLOCK_SIZE], - aes_ctx->key->exp_data); - } - } - memcpy(state.init, text, INIT_SIZE_BYTE); - hash_permutation(&state.hs); - /*memcpy(hash, &state, 32);*/ - extra_hashes[state.hs.b[0] & 3](&state, 200, output); - oaes_free((OAES_CTX **) &aes_ctx); -} - -void cryptonight_soft_shell_fast_hash(const char* input, char* output, uint32_t len) { - union hash_state state; - hash_process(&state, (const uint8_t*) input, len); - memcpy(output, &state, HASH_SIZE); -} diff --git a/algo/gr/cryptonote/cryptonight_soft_shell.h b/algo/gr/cryptonote/cryptonight_soft_shell.h deleted file mode 100644 index d32570b..0000000 --- 
a/algo/gr/cryptonote/cryptonight_soft_shell.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef CRYPTONIGHT_SOFT_SHELL_H -#define CRYPTONIGHT_SOFT_SHELL_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include - -void cryptonight_soft_shell_hash(const char* input, char* output, uint32_t len, int variant, uint32_t scratchpad, uint32_t iterations); -void cryptonight_soft_shell_fast_hash(const char* input, char* output, uint32_t len); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/algo/gr/cryptonote/cryptonight_turtle.c b/algo/gr/cryptonote/cryptonight_turtle.c index c6705e3..eaa819b 100644 --- a/algo/gr/cryptonote/cryptonight_turtle.c +++ b/algo/gr/cryptonote/cryptonight_turtle.c @@ -6,6 +6,7 @@ #include #include +#include #include "crypto/oaes_lib.h" #include "crypto/c_keccak.h" #include "crypto/c_groestl.h" @@ -150,38 +151,11 @@ static inline size_t e2i(const uint8_t* a) { return (*((uint64_t*) a) / AES_BLOCK_SIZE) & (CN_AES_INIT - 1); } -static void mul(const uint8_t* a, const uint8_t* b, uint8_t* res) { - ((uint64_t*) res)[1] = mul128(((uint64_t*) a)[0], ((uint64_t*) b)[0], (uint64_t*) res); -} - -static void sum_half_blocks(uint8_t* a, const uint8_t* b) { - uint64_t a0, a1, b0, b1; - - a0 = SWAP64LE(((uint64_t*) a)[0]); - a1 = SWAP64LE(((uint64_t*) a)[1]); - b0 = SWAP64LE(((uint64_t*) b)[0]); - b1 = SWAP64LE(((uint64_t*) b)[1]); - a0 += b0; - a1 += b1; - ((uint64_t*) a)[0] = SWAP64LE(a0); - ((uint64_t*) a)[1] = SWAP64LE(a1); -} - static inline void copy_block(uint8_t* dst, const uint8_t* src) { ((uint64_t*) dst)[0] = ((uint64_t*) src)[0]; ((uint64_t*) dst)[1] = ((uint64_t*) src)[1]; } -static void swap_blocks(uint8_t* a, uint8_t* b) { - size_t i; - uint8_t t; - for (i = 0; i < AES_BLOCK_SIZE; i++) { - t = a[i]; - a[i] = b[i]; - b[i] = t; - } -} - static inline void xor_blocks(uint8_t* a, const uint8_t* b) { ((uint64_t*) a)[0] ^= ((uint64_t*) b)[0]; ((uint64_t*) a)[1] ^= ((uint64_t*) b)[1]; diff --git a/algo/gr/cryptonote/cryptonight_turtle_lite.c 
b/algo/gr/cryptonote/cryptonight_turtle_lite.c index 4731537..76dbdd0 100644 --- a/algo/gr/cryptonote/cryptonight_turtle_lite.c +++ b/algo/gr/cryptonote/cryptonight_turtle_lite.c @@ -6,6 +6,7 @@ #include #include +#include #include "crypto/oaes_lib.h" #include "crypto/c_keccak.h" #include "crypto/c_groestl.h" @@ -150,38 +151,11 @@ static inline size_t e2i(const uint8_t* a) { return (*((uint64_t*) a) / AES_BLOCK_SIZE) & (CN_AES_INIT - 1); } -static void mul(const uint8_t* a, const uint8_t* b, uint8_t* res) { - ((uint64_t*) res)[1] = mul128(((uint64_t*) a)[0], ((uint64_t*) b)[0], (uint64_t*) res); -} - -static void sum_half_blocks(uint8_t* a, const uint8_t* b) { - uint64_t a0, a1, b0, b1; - - a0 = SWAP64LE(((uint64_t*) a)[0]); - a1 = SWAP64LE(((uint64_t*) a)[1]); - b0 = SWAP64LE(((uint64_t*) b)[0]); - b1 = SWAP64LE(((uint64_t*) b)[1]); - a0 += b0; - a1 += b1; - ((uint64_t*) a)[0] = SWAP64LE(a0); - ((uint64_t*) a)[1] = SWAP64LE(a1); -} - static inline void copy_block(uint8_t* dst, const uint8_t* src) { ((uint64_t*) dst)[0] = ((uint64_t*) src)[0]; ((uint64_t*) dst)[1] = ((uint64_t*) src)[1]; } -static void swap_blocks(uint8_t* a, uint8_t* b) { - size_t i; - uint8_t t; - for (i = 0; i < AES_BLOCK_SIZE; i++) { - t = a[i]; - a[i] = b[i]; - b[i] = t; - } -} - static inline void xor_blocks(uint8_t* a, const uint8_t* b) { ((uint64_t*) a)[0] ^= ((uint64_t*) b)[0]; ((uint64_t*) a)[1] ^= ((uint64_t*) b)[1]; diff --git a/algo/gr/gr-gate.c b/algo/gr/gr-gate.c index 64f0898..3d5b7cf 100644 --- a/algo/gr/gr-gate.c +++ b/algo/gr/gr-gate.c @@ -1,327 +1,471 @@ #include "gr-gate.h" -#include -#include -#include -#include #include "../blake/sph_blake.h" #include "../bmw/sph_bmw.h" +#include "../cubehash/sph_cubehash.h" +#include "../echo/sph_echo.h" +#include "../fugue/sph_fugue.h" #include "../groestl/sph_groestl.h" +#include "../hamsi/sph_hamsi.h" #include "../jh/sph_jh.h" #include "../keccak/sph_keccak.h" -#include "../skein/sph_skein.h" #include "../luffa/sph_luffa.h" -#include 
"../cubehash/sph_cubehash.h" +#include "../lyra2/lyra2.h" +#include "../sha/sph_sha2.h" +#include "../shabal/sph_shabal.h" #include "../shavite/sph_shavite.h" #include "../simd/sph_simd.h" -#include "../echo/sph_echo.h" -#include "../hamsi/sph_hamsi.h" -#include "../fugue/sph_fugue.h" -#include "../shabal/sph_shabal.h" +#include "../skein/sph_skein.h" #include "../whirlpool/sph_whirlpool.h" -#include "../sha/sph_sha2.h" -#include "../tiger/sph_tiger.h" -#include "../lyra2/lyra2.h" -#include "../haval/sph-haval.h" -#include "../gost/sph_gost.h" +#include "cryptonote/crypto/c_keccak.h" +#include "cryptonote/crypto/hash.h" #include "cryptonote/cryptonight_dark.h" #include "cryptonote/cryptonight_dark_lite.h" #include "cryptonote/cryptonight_fast.h" -#include "cryptonote/cryptonight.h" #include "cryptonote/cryptonight_lite.h" -#include "cryptonote/cryptonight_soft_shell.h" #include "cryptonote/cryptonight_turtle.h" #include "cryptonote/cryptonight_turtle_lite.h" +#include +#include +#include +#include -int64_t gr_get_max64() -{ - return 0x7ffLL; -} +int64_t gr_get_max64() { return 0x7ffLL; } -bool register_gr_algo( algo_gate_t* gate ) -{ - gate->scanhash = (void*)&scanhash_gr; - gate->hash = (void*)&gr_hash; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->get_max64 = (void*)&gr_get_max64; - gate->set_target = (void*)&scrypt_set_target; +bool register_gr_algo(algo_gate_t *gate) { + gate->scanhash = (void *)&scanhash_gr; + gate->hash = (void *)&gr_hash; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->get_max64 = (void *)&gr_get_max64; + opt_target_factor = 65536.0; return true; }; enum Algo { - BLAKE = 0, - BMW, - GROESTL, - JH, - KECCAK, - SKEIN, - LUFFA, - CUBEHASH, - SHAVITE, - SIMD, - ECHO, - HAMSI, - FUGUE, - SHABAL, - WHIRLPOOL, - HASH_FUNC_COUNT + BLAKE = 0, + BMW, + GROESTL, + JH, + KECCAK, + SKEIN, + LUFFA, + CUBEHASH, + SHAVITE, + SIMD, + ECHO, + HAMSI, + FUGUE, + SHABAL, + WHIRLPOOL, + HASH_FUNC_COUNT }; enum CNAlgo { - CNDark = 
0, - CNDarklite, - CNFast, - CNLite, - CNTurtle, - CNTurtlelite, - CN_HASH_FUNC_COUNT + CNDark = 0, + CNDarklite, + CNFast, + CNLite, + CNTurtle, + CNTurtlelite, + CN_HASH_FUNC_COUNT }; -static void selectAlgo(unsigned char nibble, bool* selectedAlgos, uint8_t* selectedIndex, int algoCount, int* currentCount) { - uint8_t algoDigit = (nibble & 0x0F) % algoCount; - if(!selectedAlgos[algoDigit]) { - selectedAlgos[algoDigit] = true; - selectedIndex[currentCount[0]] = algoDigit; - currentCount[0] = currentCount[0] + 1; - } - algoDigit = (nibble >> 4) % algoCount; - if(!selectedAlgos[algoDigit]) { - selectedAlgos[algoDigit] = true; - selectedIndex[currentCount[0]] = algoDigit; - currentCount[0] = currentCount[0] + 1; - } +static void selectAlgo(unsigned char nibble, bool *selectedAlgos, + uint8_t *selectedIndex, int algoCount, + int *currentCount) { + uint8_t algoDigit = (nibble & 0x0F) % algoCount; + if (!selectedAlgos[algoDigit]) { + selectedAlgos[algoDigit] = true; + selectedIndex[currentCount[0]] = algoDigit; + currentCount[0] = currentCount[0] + 1; + } + algoDigit = (nibble >> 4) % algoCount; + if (!selectedAlgos[algoDigit]) { + selectedAlgos[algoDigit] = true; + selectedIndex[currentCount[0]] = algoDigit; + currentCount[0] = currentCount[0] + 1; + } } -static void getAlgoString(void *mem, unsigned int size, uint8_t* selectedAlgoOutput, int algoCount) { +static void getAlgoString(const void *mem, unsigned int size, + uint8_t *selectedAlgoOutput, int algoCount) { int i; unsigned char *p = (unsigned char *)mem; - unsigned int len = size/2; - unsigned char j = 0; + unsigned int len = size / 2; bool selectedAlgo[algoCount]; - for(int z=0; z < algoCount; z++) { - selectedAlgo[z] = false; + for (int z = 0; z < algoCount; z++) { + selectedAlgo[z] = false; } int selectedCount = 0; - for (i=0;i %.3lf %sH/s per " + "thread.\r", + prefix, hashrate, hr_units, gr_bench_time / opt_n_threads, + hashrate / opt_n_threads, hr_units); + fflush(stdout); + 
pthread_mutex_unlock(&applog_lock); + + } else { + applog(LOG_BLUE, "%s\t%.2lf %sH/s (%.2lfs)\t-> %.3lf %sH/s per thread.", + prefix, hashrate, hr_units, gr_bench_time / opt_n_threads, + hashrate / opt_n_threads, hr_units); + } + if (reset) { + gr_bench_time = 0; + gr_bench_hashes = 0; + } + pthread_mutex_unlock(&stats_lock); +} + +static void sync() { + static volatile int done = 0; + + pthread_mutex_lock(&stats_lock); + done++; + if (done != opt_n_threads) { + pthread_cond_wait(&sync_cond, &stats_lock); + } else { + done = 0; + pthread_cond_broadcast(&sync_cond); + } + pthread_mutex_unlock(&stats_lock); +} + +static void gr_extensive_bench(void *input, int thr_id) { + char prefix[50]; + if (opt_benchmark_extended) { + int i; + if (thr_id == 0) { + applog(LOG_BLUE, "Testing Cryptonight algorithms (10s per algorithm)"); + } + for (i = 0; i < 6; i++) { + gr_bench(0, i, input, 10., 0); + sync(); + if (thr_id == 0) { + sprintf(prefix, "Type %d:", i + 1); + print_stats(prefix, true, false); + } + } + if (thr_id == 0) { + applog(LOG_BLUE, "Testing Core algorithms (2s per algorithm)"); + } + for (i = 0; i < 15; i++) { + gr_bench(1, i, input, 2., 0); + + sync(); + if (thr_id == 0) { + sprintf(prefix, "Type %d:", i + 1); + print_stats(prefix, true, false); + } + } + if (thr_id == 0) { + applog(LOG_BLUE, "Testing CN Rotations (10s per rotation)"); + } + static volatile int rot = 0; + while (rot < 20) { + gr_bench(2, -1, input, 10., rot); + + sync(); + if (thr_id == 0) { + sprintf(prefix, "Rotation %d %d %d:", cc[rot][0], cc[rot][1], + cc[rot][2]); + print_stats(prefix, true, false); + rot++; + } + // Make sure rot is updated. + sync(); + } } + + // Default benchmark that goes through all CN scenarios with Random Core. 
+ double target_time = 30; + double target_multi = 1; + + if (thr_id == 0) { + applog(LOG_BLUE, "Testing Average performance"); + } + while (true) { + gr_bench(3, -1, input, 2., 0); + + sync(); + if (thr_id == 0) { + if (target_time * target_multi > gr_bench_time / opt_n_threads) { + // Update line. + print_stats("Hashrate (Avg):", false, true); + } else { + // Print stats for good. + print_stats("Hashrate (Avg):", false, false); + target_multi++; + } + } + } + exit(0); } -void gr_hash(void* output, const void* input) { - - uint32_t hash[64/4]; - sph_blake512_context ctx_blake; - sph_bmw512_context ctx_bmw; - sph_groestl512_context ctx_groestl; - sph_jh512_context ctx_jh; - sph_keccak512_context ctx_keccak; - sph_skein512_context ctx_skein; - sph_luffa512_context ctx_luffa; - sph_cubehash512_context ctx_cubehash; - sph_shavite512_context ctx_shavite; - sph_simd512_context ctx_simd; - sph_echo512_context ctx_echo; - sph_hamsi512_context ctx_hamsi; - sph_fugue512_context ctx_fugue; - sph_shabal512_context ctx_shabal; - sph_whirlpool_context ctx_whirlpool; - sph_haval256_5_context ctx_haval; - sph_tiger_context ctx_tiger; - sph_gost512_context ctx_gost; - sph_sha256_context ctx_sha; - - void *in = (void*) input; - int size = 80; - uint8_t selectedAlgoOutput[15] = {0}; - uint8_t selectedCNAlgoOutput[6] = {0}; - getAlgoString(&input[4], 64, selectedAlgoOutput, 15); - getAlgoString(&input[4], 64, selectedCNAlgoOutput, 6); - int i; - for (i = 0; i < 18; i++) - { - uint8_t algo; - uint8_t cnAlgo; - int coreSelection; - int cnSelection = -1; - if(i < 5) { - coreSelection = i; - } else if(i < 11) { - coreSelection = i-1; - } else { - coreSelection = i-2; - } - if(i==5) { - coreSelection = -1; - cnSelection = 0; - } - if(i==11) { - coreSelection = -1; - cnSelection = 1; - } - if(i==17) { - coreSelection = -1; - cnSelection = 2; - } - if(coreSelection >= 0) { - algo = selectedAlgoOutput[(uint8_t)coreSelection]; - } else { - algo = 16; // skip core hashing for this loop 
iteration - } - if(cnSelection >=0) { - cnAlgo = selectedCNAlgoOutput[(uint8_t)cnSelection]; - } else { - cnAlgo = 14; // skip cn hashing for this loop iteration - } - //selection cnAlgo. if a CN algo is selected then core algo will not be selected - switch(cnAlgo) - { - case CNDark: - cryptonightdark_hash(in, hash, size, 1); - break; - case CNDarklite: - cryptonightdarklite_hash(in, hash, size, 1); - break; - case CNFast: - cryptonightfast_hash(in, hash, size, 1); - break; - case CNLite: - cryptonightlite_hash(in, hash, size, 1); - break; - case CNTurtle: - cryptonightturtle_hash(in, hash, size, 1); - break; - case CNTurtlelite: - cryptonightturtlelite_hash(in, hash, size, 1); - break; - } - //selection core algo - switch (algo) { - case BLAKE: - sph_blake512_init(&ctx_blake); - sph_blake512(&ctx_blake, in, size); - sph_blake512_close(&ctx_blake, hash); - break; - case BMW: - sph_bmw512_init(&ctx_bmw); - sph_bmw512(&ctx_bmw, in, size); - sph_bmw512_close(&ctx_bmw, hash); - break; - case GROESTL: - sph_groestl512_init(&ctx_groestl); - sph_groestl512(&ctx_groestl, in, size); - sph_groestl512_close(&ctx_groestl, hash); - break; - case SKEIN: - sph_skein512_init(&ctx_skein); - sph_skein512(&ctx_skein, in, size); - sph_skein512_close(&ctx_skein, hash); - break; - case JH: - sph_jh512_init(&ctx_jh); - sph_jh512(&ctx_jh, in, size); - sph_jh512_close(&ctx_jh, hash); - break; - case KECCAK: - sph_keccak512_init(&ctx_keccak); - sph_keccak512(&ctx_keccak, in, size); - sph_keccak512_close(&ctx_keccak, hash); - break; - case LUFFA: - sph_luffa512_init(&ctx_luffa); - sph_luffa512(&ctx_luffa, in, size); - sph_luffa512_close(&ctx_luffa, hash); - break; - case CUBEHASH: - sph_cubehash512_init(&ctx_cubehash); - sph_cubehash512(&ctx_cubehash, in, size); - sph_cubehash512_close(&ctx_cubehash, hash); - break; - case SHAVITE: - sph_shavite512_init(&ctx_shavite); - sph_shavite512(&ctx_shavite, in, size); - sph_shavite512_close(&ctx_shavite, hash); - break; - case SIMD: - 
sph_simd512_init(&ctx_simd); - sph_simd512(&ctx_simd, in, size); - sph_simd512_close(&ctx_simd, hash); - break; - case ECHO: - sph_echo512_init(&ctx_echo); - sph_echo512(&ctx_echo, in, size); - sph_echo512_close(&ctx_echo, hash); - break; - case HAMSI: - sph_hamsi512_init(&ctx_hamsi); - sph_hamsi512(&ctx_hamsi, in, size); - sph_hamsi512_close(&ctx_hamsi, hash); - break; - case FUGUE: - sph_fugue512_init(&ctx_fugue); - sph_fugue512(&ctx_fugue, in, size); - sph_fugue512_close(&ctx_fugue, hash); - break; - case SHABAL: - sph_shabal512_init(&ctx_shabal); - sph_shabal512(&ctx_shabal, in, size); - sph_shabal512_close(&ctx_shabal, hash); - break; - case WHIRLPOOL: - sph_whirlpool_init(&ctx_whirlpool); - sph_whirlpool(&ctx_whirlpool, in, size); - sph_whirlpool_close(&ctx_whirlpool, hash); - break; - } - if(cnSelection >= 0) { - memset(&hash[8], 0, 32); - } - in = (void*) hash; - size = 64; - } - memcpy(output, hash, 32); +void gr_hash(void *output, const void *input, uint8_t cn) { + static __thread uint8_t hash_1[64]; + static __thread uint8_t hash_2[64]; + + static __thread uint8_t selectedAlgoOutput[15] = {0}; + static __thread uint8_t selectedCNAlgoOutput[6] = {0}; + + getAlgoString(input + 4, 64, selectedAlgoOutput, 15); + if (cn > 19) { + getAlgoString(input + 4, 64, selectedCNAlgoOutput, 6); + } else { + // Benchmarking. + selectedCNAlgoOutput[0] = cc[cn][0]; + selectedCNAlgoOutput[1] = cc[cn][1]; + selectedCNAlgoOutput[2] = cc[cn][2]; + } + + // First phasee uses full 80 bytes. Ther rest usees shorter 64 bytes. 
+ doCoreAlgo(selectedAlgoOutput[0], input, hash_1, 80); + doCoreAlgo(selectedAlgoOutput[1], hash_1, hash_2, 64); + doCoreAlgo(selectedAlgoOutput[2], hash_2, hash_1, 64); + doCoreAlgo(selectedAlgoOutput[3], hash_1, hash_2, 64); + doCoreAlgo(selectedAlgoOutput[4], hash_2, hash_1, 64); + doCNAlgo(selectedCNAlgoOutput[0], hash_1, hash_2, 64); + memset(hash_2 + 32, 0, 32); + + doCoreAlgo(selectedAlgoOutput[5], hash_2, hash_1, 64); + doCoreAlgo(selectedAlgoOutput[6], hash_1, hash_2, 64); + doCoreAlgo(selectedAlgoOutput[7], hash_2, hash_1, 64); + doCoreAlgo(selectedAlgoOutput[8], hash_1, hash_2, 64); + doCoreAlgo(selectedAlgoOutput[9], hash_2, hash_1, 64); + doCNAlgo(selectedCNAlgoOutput[1], hash_1, hash_2, 64); + memset(hash_2 + 32, 0, 32); + + doCoreAlgo(selectedAlgoOutput[10], hash_2, hash_1, 64); + doCoreAlgo(selectedAlgoOutput[11], hash_1, hash_2, 64); + doCoreAlgo(selectedAlgoOutput[12], hash_2, hash_1, 64); + doCoreAlgo(selectedAlgoOutput[13], hash_1, hash_2, 64); + doCoreAlgo(selectedAlgoOutput[14], hash_2, hash_1, 64); + doCNAlgo(selectedCNAlgoOutput[2], hash_1, hash_2, 64); + // memset(hash_2 + 32, 0, 32); + + memcpy(output, hash_2, 32); } -int scanhash_gr( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; +int scanhash_gr(struct work *work, uint32_t max_nonce, uint64_t *hashes_done, + struct thr_info *mythr) { + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; - uint32_t _ALIGN(64) endiandata[20]; - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - int thr_id = mythr->id; + uint32_t _ALIGN(64) endiandata[20]; + const uint32_t first_nonce = pdata[19]; + uint32_t nonce = first_nonce; + int thr_id = mythr->id; + + if (opt_benchmark) { + gr_extensive_bench(endiandata, thr_id); + return 0; + } - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; + swab32_array(endiandata, pdata, 20); - swab32_array( endiandata, 
pdata, 20 ); + uint32_t hash[8]; + const uint32_t Htarg = ptarget[7]; + do { + be32enc(&endiandata[19], nonce); - do { - const uint32_t Htarg = ptarget[7]; - uint32_t hash[8]; - be32enc(&endiandata[19], nonce); - gr_hash(hash, endiandata); + gr_hash(hash, endiandata, 0xFF); - if (hash[7] <= Htarg) { - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce; - return 1; - } - nonce++; + if (hash[7] <= Htarg) { + pdata[19] = nonce; + *hashes_done = pdata[19] - first_nonce; + submit_solution(work, hash, mythr); + } + ++nonce; - } while (nonce < max_nonce && !work_restart[thr_id].restart); + } while (nonce < max_nonce && !work_restart[thr_id].restart); - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; + pdata[19] = nonce; + *hashes_done = pdata[19] - first_nonce + 1; + return 0; } diff --git a/algo/gr/gr-gate.h b/algo/gr/gr-gate.h index f9ca804..8bd5f3c 100644 --- a/algo/gr/gr-gate.h +++ b/algo/gr/gr-gate.h @@ -4,7 +4,8 @@ #include "algo-gate-api.h" #include -void gr_hash( void *state, const void *input ); -int scanhash_gr(struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr); +void gr_hash(void *state, const void *input, uint8_t rot); +int scanhash_gr(struct work *work, uint32_t max_nonce, uint64_t *hashes_done, + struct thr_info *mythr); #endif diff --git a/algo/groestl/aes_ni/groestl-intr-aes.h b/algo/groestl/aes_ni/groestl-intr-aes.h index 2a56aad..e09e8de 100644 --- a/algo/groestl/aes_ni/groestl-intr-aes.h +++ b/algo/groestl/aes_ni/groestl-intr-aes.h @@ -1,3 +1,6 @@ +#if !defined GROESTL_INTR_AES_H__ +#define GROESTL_INTR_AES_H__ + /* groestl-intr-aes.h Aug 2011 * * Groestl implementation with intrinsics using ssse3, sse4.1, and aes @@ -11,16 +14,51 @@ #include #include "hash-groestl.h" -/* global constants */ -__m128i ROUND_CONST_Lx; -//__m128i ROUND_CONST_L0[ROUNDS512]; -//__m128i ROUND_CONST_L7[ROUNDS512]; -__m128i ROUND_CONST_P[ROUNDS1024]; -__m128i ROUND_CONST_Q[ROUNDS1024]; -__m128i 
TRANSP_MASK; -__m128i SUBSH_MASK[8]; -__m128i ALL_1B; -__m128i ALL_FF; +static const __m128i round_const_p[] __attribute__ ((aligned (64))) = +{ + { 0x7060504030201000, 0xf0e0d0c0b0a09080 }, + { 0x7161514131211101, 0xf1e1d1c1b1a19181 }, + { 0x7262524232221202, 0xf2e2d2c2b2a29282 }, + { 0x7363534333231303, 0xf3e3d3c3b3a39383 }, + { 0x7464544434241404, 0xf4e4d4c4b4a49484 }, + { 0x7565554535251505, 0xf5e5d5c5b5a59585 }, + { 0x7666564636261606, 0xf6e6d6c6b6a69686 }, + { 0x7767574737271707, 0xf7e7d7c7b7a79787 }, + { 0x7868584838281808, 0xf8e8d8c8b8a89888 }, + { 0x7969594939291909, 0xf9e9d9c9b9a99989 }, + { 0x7a6a5a4a3a2a1a0a, 0xfaeadacabaaa9a8a }, + { 0x7b6b5b4b3b2b1b0b, 0xfbebdbcbbbab9b8b }, + { 0x7c6c5c4c3c2c1c0c, 0xfcecdcccbcac9c8c }, + { 0x7d6d5d4d3d2d1d0d, 0xfdedddcdbdad9d8d } +}; + +static const __m128i round_const_q[] __attribute__ ((aligned (64))) = +{ + { 0x8f9fafbfcfdfefff, 0x0f1f2f3f4f5f6f7f }, + { 0x8e9eaebecedeeefe, 0x0e1e2e3e4e5e6e7e }, + { 0x8d9dadbdcdddedfd, 0x0d1d2d3d4d5d6d7d }, + { 0x8c9cacbcccdcecfc, 0x0c1c2c3c4c5c6c7c }, + { 0x8b9babbbcbdbebfb, 0x0b1b2b3b4b5b6b7b }, + { 0x8a9aaabacadaeafa, 0x0a1a2a3a4a5a6a7a }, + { 0x8999a9b9c9d9e9f9, 0x0919293949596979 }, + { 0x8898a8b8c8d8e8f8, 0x0818283848586878 }, + { 0x8797a7b7c7d7e7f7, 0x0717273747576777 }, + { 0x8696a6b6c6d6e6f6, 0x0616263646566676 }, + { 0x8595a5b5c5d5e5f5, 0x0515253545556575 }, + { 0x8494a4b4c4d4e4f4, 0x0414243444546474 }, + { 0x8393a3b3c3d3e3f3, 0x0313233343536373 }, + { 0x8292a2b2c2d2e2f2, 0x0212223242526272 } +}; + +static const __m128i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02 }; +static const __m128i SUBSH_MASK0 = { 0x0b0e0104070a0d00, 0x0306090c0f020508 }; +static const __m128i SUBSH_MASK1 = { 0x0c0f0205080b0e01, 0x04070a0d00030609 }; +static const __m128i SUBSH_MASK2 = { 0x0d000306090c0f02, 0x05080b0e0104070a }; +static const __m128i SUBSH_MASK3 = { 0x0e0104070a0d0003, 0x06090c0f0205080b }; +static const __m128i SUBSH_MASK4 = { 0x0f0205080b0e0104, 0x070a0d000306090c }; 
+static const __m128i SUBSH_MASK5 = { 0x000306090c0f0205, 0x080b0e0104070a0d }; +static const __m128i SUBSH_MASK6 = { 0x0104070a0d000306, 0x090c0f0205080b0e }; +static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 }; #define tos(a) #a #define tostr(a) tos(a) @@ -73,7 +111,7 @@ __m128i ALL_FF; b5 = a7;\ a6 = _mm_xor_si128(a6, a7);\ a7 = _mm_xor_si128(a7, b6);\ - \ + \ /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ b0 = _mm_xor_si128(b0, a4);\ b6 = _mm_xor_si128(b6, a4);\ @@ -111,7 +149,7 @@ __m128i ALL_FF; \ /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ /* compute w_i : add y_{i+4} */\ - b1 = ALL_1B;\ + b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\ MUL2(a0, b0, b1);\ a0 = _mm_xor_si128(a0, TEMP0);\ MUL2(a1, b0, b1);\ @@ -152,25 +190,6 @@ __m128i ALL_FF; }/*MixBytes*/ -#define SET_CONSTANTS(){\ - ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ - ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ - TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ - SUBSH_MASK[0] = _mm_set_epi32(0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00);\ - SUBSH_MASK[1] = _mm_set_epi32(0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01);\ - SUBSH_MASK[2] = _mm_set_epi32(0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02);\ - SUBSH_MASK[3] = _mm_set_epi32(0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003);\ - SUBSH_MASK[4] = _mm_set_epi32(0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104);\ - SUBSH_MASK[5] = _mm_set_epi32(0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205);\ - SUBSH_MASK[6] = _mm_set_epi32(0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306);\ - SUBSH_MASK[7] = _mm_set_epi32(0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b);\ - for(i = 0; i < ROUNDS1024; i++)\ - {\ - ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ - ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 
0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\ - }\ -}while(0);\ - /* one round * a0-a7 = input rows * b0-b7 = output rows @@ -194,32 +213,34 @@ __m128i ALL_FF; u8 round_counter = 0;\ for(round_counter = 0; round_counter < 14; round_counter+=2) {\ /* AddRoundConstant P1024 */\ - xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ - xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\ - xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\ - xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\ - xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[3]));\ - xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[4]));\ - xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[5]));\ - xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[6]));\ - xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[7]));\ + xmm8 = _mm_xor_si128( xmm8, \ + casti_m128i( round_const_p, round_counter ) ); \ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + xmm8 = _mm_shuffle_epi8( xmm8, SUBSH_MASK0 ); \ + xmm9 = _mm_shuffle_epi8( xmm9, SUBSH_MASK1 ); \ + xmm10 = _mm_shuffle_epi8( xmm10, SUBSH_MASK2 ); \ + xmm11 = _mm_shuffle_epi8( xmm11, SUBSH_MASK3 ); \ + xmm12 = _mm_shuffle_epi8( xmm12, SUBSH_MASK4 ); \ + xmm13 = _mm_shuffle_epi8( xmm13, SUBSH_MASK5 ); \ + xmm14 = _mm_shuffle_epi8( xmm14, SUBSH_MASK6 ); \ + xmm15 = _mm_shuffle_epi8( xmm15, SUBSH_MASK7 ); \ /* SubBytes + MixBytes */\ - SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \ + xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ); \ \ /* AddRoundConstant P1024 */\ - xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ - xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\ - xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\ - xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\ - xmm3 = _mm_shuffle_epi8(xmm3, 
(SUBSH_MASK[3]));\ - xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[4]));\ - xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\ - xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\ - xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\ - /* SubBytes + MixBytes */\ - SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + xmm0 = _mm_xor_si128( xmm0, \ + casti_m128i( round_const_p, round_counter+1 ) ); \ + xmm0 = _mm_shuffle_epi8( xmm0, SUBSH_MASK0 ); \ + xmm1 = _mm_shuffle_epi8( xmm1, SUBSH_MASK1 ); \ + xmm2 = _mm_shuffle_epi8( xmm2, SUBSH_MASK2 ); \ + xmm3 = _mm_shuffle_epi8( xmm3, SUBSH_MASK3 ); \ + xmm4 = _mm_shuffle_epi8( xmm4, SUBSH_MASK4 ); \ + xmm5 = _mm_shuffle_epi8( xmm5, SUBSH_MASK5 ); \ + xmm6 = _mm_shuffle_epi8( xmm6, SUBSH_MASK6 ); \ + xmm7 = _mm_shuffle_epi8( xmm7, SUBSH_MASK7 ); \ + SUBMIX( xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \ + xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \ }\ } @@ -227,48 +248,52 @@ __m128i ALL_FF; u8 round_counter = 0;\ for(round_counter = 0; round_counter < 14; round_counter+=2) {\ /* AddRoundConstant Q1024 */\ - xmm1 = ALL_FF;\ - xmm8 = _mm_xor_si128(xmm8, xmm1);\ - xmm9 = _mm_xor_si128(xmm9, xmm1);\ - xmm10 = _mm_xor_si128(xmm10, xmm1);\ - xmm11 = _mm_xor_si128(xmm11, xmm1);\ - xmm12 = _mm_xor_si128(xmm12, xmm1);\ - xmm13 = _mm_xor_si128(xmm13, xmm1);\ - xmm14 = _mm_xor_si128(xmm14, xmm1);\ - xmm15 = _mm_xor_si128(xmm15, (ROUND_CONST_Q[round_counter]));\ + xmm1 = m128_neg1;\ + xmm8 = _mm_xor_si128( xmm8, xmm1 ); \ + xmm9 = _mm_xor_si128( xmm9, xmm1 ); \ + xmm10 = _mm_xor_si128( xmm10, xmm1 ); \ + xmm11 = _mm_xor_si128( xmm11, xmm1 ); \ + xmm12 = _mm_xor_si128( xmm12, xmm1 ); \ + xmm13 = _mm_xor_si128( xmm13, xmm1 ); \ + xmm14 = _mm_xor_si128( xmm14, xmm1 ); \ + xmm15 = _mm_xor_si128( xmm15, \ + casti_m128i( round_const_q, round_counter ) ); \ /* ShiftBytes Q1024 + pre-AESENCLAST */\ - xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[1]));\ - xmm9 = 
_mm_shuffle_epi8(xmm9, (SUBSH_MASK[3]));\ - xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[5]));\ - xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[7]));\ - xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[0]));\ - xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[2]));\ - xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[4]));\ - xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[6]));\ + xmm8 = _mm_shuffle_epi8( xmm8, SUBSH_MASK1 ); \ + xmm9 = _mm_shuffle_epi8( xmm9, SUBSH_MASK3 ); \ + xmm10 = _mm_shuffle_epi8( xmm10, SUBSH_MASK5 ); \ + xmm11 = _mm_shuffle_epi8( xmm11, SUBSH_MASK7 ); \ + xmm12 = _mm_shuffle_epi8( xmm12, SUBSH_MASK0 ); \ + xmm13 = _mm_shuffle_epi8( xmm13, SUBSH_MASK2 ); \ + xmm14 = _mm_shuffle_epi8( xmm14, SUBSH_MASK4 ); \ + xmm15 = _mm_shuffle_epi8( xmm15, SUBSH_MASK6 ); \ /* SubBytes + MixBytes */\ - SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \ + xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 , xmm7 ); \ \ /* AddRoundConstant Q1024 */\ - xmm9 = ALL_FF;\ - xmm0 = _mm_xor_si128(xmm0, xmm9);\ - xmm1 = _mm_xor_si128(xmm1, xmm9);\ - xmm2 = _mm_xor_si128(xmm2, xmm9);\ - xmm3 = _mm_xor_si128(xmm3, xmm9);\ - xmm4 = _mm_xor_si128(xmm4, xmm9);\ - xmm5 = _mm_xor_si128(xmm5, xmm9);\ - xmm6 = _mm_xor_si128(xmm6, xmm9);\ - xmm7 = _mm_xor_si128(xmm7, (ROUND_CONST_Q[round_counter+1]));\ + xmm9 = m128_neg1;\ + xmm0 = _mm_xor_si128( xmm0, xmm9 ); \ + xmm1 = _mm_xor_si128( xmm1, xmm9 ); \ + xmm2 = _mm_xor_si128( xmm2, xmm9 ); \ + xmm3 = _mm_xor_si128( xmm3, xmm9 ); \ + xmm4 = _mm_xor_si128( xmm4, xmm9 ); \ + xmm5 = _mm_xor_si128( xmm5, xmm9 ); \ + xmm6 = _mm_xor_si128( xmm6, xmm9 ); \ + xmm7 = _mm_xor_si128( xmm7, \ + casti_m128i( round_const_q, round_counter+1 ) ); \ /* ShiftBytes Q1024 + pre-AESENCLAST */\ - xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[1]));\ - xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[3]));\ - xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[5]));\ - xmm3 = 
_mm_shuffle_epi8(xmm3, (SUBSH_MASK[7]));\ - xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[0]));\ - xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[2]));\ - xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[4]));\ - xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[6]));\ + xmm0 = _mm_shuffle_epi8( xmm0, SUBSH_MASK1 ); \ + xmm1 = _mm_shuffle_epi8( xmm1, SUBSH_MASK3 ); \ + xmm2 = _mm_shuffle_epi8( xmm2, SUBSH_MASK5 ); \ + xmm3 = _mm_shuffle_epi8( xmm3, SUBSH_MASK7 ); \ + xmm4 = _mm_shuffle_epi8( xmm4, SUBSH_MASK0 ); \ + xmm5 = _mm_shuffle_epi8( xmm5, SUBSH_MASK2 ); \ + xmm6 = _mm_shuffle_epi8( xmm6, SUBSH_MASK4 ); \ + xmm7 = _mm_shuffle_epi8( xmm7, SUBSH_MASK6 ); \ /* SubBytes + MixBytes */\ - SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + SUBMIX( xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \ + xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \ }\ } @@ -280,7 +305,7 @@ __m128i ALL_FF; * clobbers: t0-t7 */ #define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ - t0 = TRANSP_MASK;\ + t0 = TRANSP_MASK; \ \ i6 = _mm_shuffle_epi8(i6, t0);\ i0 = _mm_shuffle_epi8(i0, t0);\ @@ -368,7 +393,7 @@ __m128i ALL_FF; i4 = _mm_unpacklo_epi64(i4, i5);\ t1 = _mm_unpackhi_epi64(t1, i5);\ t2 = i6;\ - o0 = TRANSP_MASK;\ + o0 = TRANSP_MASK; \ i6 = _mm_unpacklo_epi64(i6, i7);\ t2 = _mm_unpackhi_epi64(t2, i7);\ /* load transpose mask into a register, because it will be used 8 times */\ @@ -609,3 +634,4 @@ void OF1024( __m128i* chaining ) return; } +#endif diff --git a/algo/groestl/aes_ni/groestl256-intr-aes.h b/algo/groestl/aes_ni/groestl256-intr-aes.h index 57dd930..61c1b7b 100644 --- a/algo/groestl/aes_ni/groestl256-intr-aes.h +++ b/algo/groestl/aes_ni/groestl256-intr-aes.h @@ -11,17 +11,44 @@ #include #include "hash-groestl256.h" -/* global constants */ -__m128i ROUND_CONST_Lx; -__m128i ROUND_CONST_L0[ROUNDS512]; -__m128i ROUND_CONST_L7[ROUNDS512]; -//__m128i ROUND_CONST_P[ROUNDS1024]; -//__m128i 
ROUND_CONST_Q[ROUNDS1024]; -__m128i TRANSP_MASK; -__m128i SUBSH_MASK[8]; -__m128i ALL_1B; -__m128i ALL_FF; - +static const __m128i round_const_l0[] __attribute__ ((aligned (64))) = +{ + { 0x7060504030201000, 0xffffffffffffffff }, + { 0x7161514131211101, 0xffffffffffffffff }, + { 0x7262524232221202, 0xffffffffffffffff }, + { 0x7363534333231303, 0xffffffffffffffff }, + { 0x7464544434241404, 0xffffffffffffffff }, + { 0x7565554535251505, 0xffffffffffffffff }, + { 0x7666564636261606, 0xffffffffffffffff }, + { 0x7767574737271707, 0xffffffffffffffff }, + { 0x7868584838281808, 0xffffffffffffffff }, + { 0x7969594939291909, 0xffffffffffffffff } +}; + +static const __m128i round_const_l7[] __attribute__ ((aligned (64))) = +{ + { 0x0000000000000000, 0x8f9fafbfcfdfefff }, + { 0x0000000000000000, 0x8e9eaebecedeeefe }, + { 0x0000000000000000, 0x8d9dadbdcdddedfd }, + { 0x0000000000000000, 0x8c9cacbcccdcecfc }, + { 0x0000000000000000, 0x8b9babbbcbdbebfb }, + { 0x0000000000000000, 0x8a9aaabacadaeafa }, + { 0x0000000000000000, 0x8999a9b9c9d9e9f9 }, + { 0x0000000000000000, 0x8898a8b8c8d8e8f8 }, + { 0x0000000000000000, 0x8797a7b7c7d7e7f7 }, + { 0x0000000000000000, 0x8696a6b6c6d6e6f6 } +}; + +static const __m128i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02 }; + +static const __m128i SUBSH_MASK0 = { 0x0c0f0104070b0e00, 0x03060a0d08020509 }; +static const __m128i SUBSH_MASK1 = { 0x0e090205000d0801, 0x04070c0f0a03060b }; +static const __m128i SUBSH_MASK2 = { 0x080b0306010f0a02, 0x05000e090c04070d }; +static const __m128i SUBSH_MASK3 = { 0x0a0d040702090c03, 0x0601080b0e05000f }; +static const __m128i SUBSH_MASK4 = { 0x0b0e0500030a0d04, 0x0702090c0f060108 }; +static const __m128i SUBSH_MASK5 = { 0x0d080601040c0f05, 0x00030b0e0907020a }; +static const __m128i SUBSH_MASK6 = { 0x0f0a0702050e0906, 0x01040d080b00030c }; +static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e }; #define tos(a) #a #define tostr(a) tos(a) @@ -38,8 +65,6 @@ __m128i ALL_FF; i = 
_mm_xor_si128(i, j);\ } - /**/ - /* Yet another implementation of MixBytes. This time we use the formulae (3) from the paper "Byte Slicing Groestl". Input: a0, ..., a7 @@ -113,7 +138,7 @@ __m128i ALL_FF; \ /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ /* compute w_i : add y_{i+4} */\ - b1 = ALL_1B;\ + b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\ MUL2(a0, b0, b1);\ a0 = _mm_xor_si128(a0, TEMP0);\ MUL2(a1, b0, b1);\ @@ -153,25 +178,6 @@ __m128i ALL_FF; b1 = _mm_xor_si128(b1, a4);\ }/*MixBytes*/ -#define SET_CONSTANTS(){\ - ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ - TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ - SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\ - SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\ - SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\ - SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\ - SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\ - SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\ - SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\ - SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ - ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ - }\ - ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ -}while(0); \ - /* one round * i = round number * a0-a7 = input rows @@ -179,34 +185,34 @@ __m128i ALL_FF; */ #define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* AddRoundConstant */\ - b1 = ROUND_CONST_Lx;\ - a0 = _mm_xor_si128(a0, 
(ROUND_CONST_L0[i]));\ - a1 = _mm_xor_si128(a1, b1);\ - a2 = _mm_xor_si128(a2, b1);\ - a3 = _mm_xor_si128(a3, b1);\ - a4 = _mm_xor_si128(a4, b1);\ - a5 = _mm_xor_si128(a5, b1);\ - a6 = _mm_xor_si128(a6, b1);\ - a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ + b1 = m128_const_64( 0xffffffffffffffff, 0 ); \ + a0 = _mm_xor_si128( a0, casti_m128i( round_const_l0, i ) ); \ + a1 = _mm_xor_si128( a1, b1 ); \ + a2 = _mm_xor_si128( a2, b1 ); \ + a3 = _mm_xor_si128( a3, b1 ); \ + a4 = _mm_xor_si128( a4, b1 ); \ + a5 = _mm_xor_si128( a5, b1 ); \ + a6 = _mm_xor_si128( a6, b1 ); \ + a7 = _mm_xor_si128( a7, casti_m128i( round_const_l7, i ) ); \ \ /* ShiftBytes + SubBytes (interleaved) */\ b0 = _mm_xor_si128(b0, b0);\ - a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ - a0 = _mm_aesenclast_si128(a0, b0);\ - a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ - a1 = _mm_aesenclast_si128(a1, b0);\ - a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ - a2 = _mm_aesenclast_si128(a2, b0);\ - a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ - a3 = _mm_aesenclast_si128(a3, b0);\ - a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ - a4 = _mm_aesenclast_si128(a4, b0);\ - a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ - a5 = _mm_aesenclast_si128(a5, b0);\ - a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ - a6 = _mm_aesenclast_si128(a6, b0);\ - a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ - a7 = _mm_aesenclast_si128(a7, b0);\ + a0 = _mm_shuffle_epi8( a0, SUBSH_MASK0 ); \ + a0 = _mm_aesenclast_si128( a0, b0 );\ + a1 = _mm_shuffle_epi8( a1, SUBSH_MASK1 ); \ + a1 = _mm_aesenclast_si128( a1, b0 );\ + a2 = _mm_shuffle_epi8( a2, SUBSH_MASK2 ); \ + a2 = _mm_aesenclast_si128( a2, b0 );\ + a3 = _mm_shuffle_epi8( a3, SUBSH_MASK3 ); \ + a3 = _mm_aesenclast_si128( a3, b0 );\ + a4 = _mm_shuffle_epi8( a4, SUBSH_MASK4 ); \ + a4 = _mm_aesenclast_si128( a4, b0 );\ + a5 = _mm_shuffle_epi8( a5, SUBSH_MASK5 ); \ + a5 = _mm_aesenclast_si128( a5, b0 );\ + a6 = _mm_shuffle_epi8( a6, SUBSH_MASK6 ); \ + a6 = _mm_aesenclast_si128( a6, b0 );\ + a7 
= _mm_shuffle_epi8( a7, SUBSH_MASK7 ); \ + a7 = _mm_aesenclast_si128( a7, b0 );\ \ /* MixBytes */\ MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ @@ -234,8 +240,9 @@ __m128i ALL_FF; * outputs: i0, o1-o3 * clobbers: t0 */ + #define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - t0 = TRANSP_MASK;\ + t0 = TRANSP_MASK; \ \ i0 = _mm_shuffle_epi8(i0, t0);\ i1 = _mm_shuffle_epi8(i1, t0);\ diff --git a/algo/groestl/aes_ni/hash-groestl.c b/algo/groestl/aes_ni/hash-groestl.c index e77aab9..d26ef27 100644 --- a/algo/groestl/aes_ni/hash-groestl.c +++ b/algo/groestl/aes_ni/hash-groestl.c @@ -14,50 +14,15 @@ #include "miner.h" #include "simd-utils.h" -#ifndef NO_AES_NI - -#include "groestl-version.h" - -#ifdef TASM - #ifdef VAES - #include "groestl-asm-aes.h" - #else - #ifdef VAVX - #include "groestl-asm-avx.h" - #else - #ifdef VVPERM - #include "groestl-asm-vperm.h" - #else - #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) - #endif - #endif - #endif -#else - #ifdef TINTR - #ifdef VAES - #include "groestl-intr-aes.h" - #else - #ifdef VAVX - #include "groestl-intr-avx.h" - #else - #ifdef VVPERM - #include "groestl-intr-vperm.h" - #else - #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) - #endif - #endif - #endif - #else - #error NO TYPE SPECIFIED (-DT[ASM/INTR]) - #endif -#endif +#ifdef __AES__ + +#include "groestl-intr-aes.h" HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen ) { int i; ctx->hashlen = hashlen; - SET_CONSTANTS(); if (ctx->chaining == NULL || ctx->buffer == NULL) return FAIL_GR; @@ -67,8 +32,10 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen ) ctx->chaining[i] = _mm_setzero_si128(); ctx->buffer[i] = _mm_setzero_si128(); } - ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH); - INIT(ctx->chaining); + + // The only non-zero in the IV is len. It can be hard coded. 
+ ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 ); + ctx->buf_ptr = 0; ctx->rem_ptr = 0; @@ -87,8 +54,7 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx ) ctx->chaining[i] = _mm_setzero_si128(); ctx->buffer[i] = _mm_setzero_si128(); } - ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH); - INIT(ctx->chaining); + ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 ); ctx->buf_ptr = 0; ctx->rem_ptr = 0; @@ -104,7 +70,7 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx ) // 5. Midstate will work at reduced impact than full hash, if total hash // (midstate + tail) is less than 1 block. // This, unfortunately, is the case with all current users. -// 6. the morefull blocks the bigger the gain +// 6. the more full blocks the bigger the gain // use only for midstate precalc HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input, @@ -138,12 +104,11 @@ HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input, // deprecated do not use HashReturn_gr final_groestl( hashState_groestl* ctx, void* output ) { - const int len = (int)ctx->databitlen / 128; // bits to __m128i - const int blocks = ctx->blk_count + 1; // adjust for final block - - const int rem_ptr = ctx->rem_ptr; // end of data start of padding - const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i - const int hash_offset = SIZE512 - hashlen_m128i; // where in buffer + const int len = (int)ctx->databitlen / 128; // bits to __m128i + const uint64_t blocks = ctx->blk_count + 1; // adjust for final block + const int rem_ptr = ctx->rem_ptr; // end of data start of padding + const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i + const int hash_offset = SIZE512 - hashlen_m128i; // where in buffer int i; // first pad byte = 0x80, last pad byte = block count @@ -152,21 +117,18 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output ) if ( rem_ptr == len - 1 ) { // only 128 bits left in buffer, all padding at once - 
ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0x80 ); + ctx->buffer[rem_ptr] = _mm_set_epi64x( blocks << 56, 0x80 ); } else { // add first padding - ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0x80 ); + ctx->buffer[rem_ptr] = m128_const_64( 0, 0x80 ); // add zero padding for ( i = rem_ptr + 1; i < SIZE512 - 1; i++ ) ctx->buffer[i] = _mm_setzero_si128(); // add length padding, second last byte is zero unless blocks > 255 - ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0, - 0, 0 ,0,0, 0,0,0,0 ); + ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 ); } // digest final padding block and do output transform @@ -180,6 +142,75 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output ) return SUCCESS_GR; } +int groestl512_full( hashState_groestl* ctx, void* output, + const void* input, uint64_t databitlen ) +{ + + int i; + ctx->hashlen = 64; + + for ( i = 0; i < SIZE512; i++ ) + { + ctx->chaining[i] = _mm_setzero_si128(); + ctx->buffer[i] = _mm_setzero_si128(); + } + ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 ); + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; + + // --- update --- + + const int len = (int)databitlen / 128; + const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i + const int hash_offset = SIZE512 - hashlen_m128i; + int rem = ctx->rem_ptr; + uint64_t blocks = len / SIZE512; + __m128i* in = (__m128i*)input; + + // digest any full blocks, process directly from input + for ( i = 0; i < blocks; i++ ) + TF1024( ctx->chaining, &in[ i * SIZE512 ] ); + ctx->buf_ptr = blocks * SIZE512; + + // copy any remaining data to buffer, it may already contain data + // from a previous update for a midstate precalc + for ( i = 0; i < len % SIZE512; i++ ) + ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ]; + i += rem; // use i as rem_ptr in final + + //--- final --- + + blocks++; // adjust for final block + + if ( i == len -1 ) + { + // only 128 bits left in buffer, all padding 
at once + ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0x80 ); + } + else + { + // add first padding + ctx->buffer[i] = m128_const_64( 0, 0x80 ); + // add zero padding + for ( i += 1; i < SIZE512 - 1; i++ ) + ctx->buffer[i] = _mm_setzero_si128(); + + // add length padding, second last byte is zero unless blocks > 255 + ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 ); + } + + // digest final padding block and do output transform + TF1024( ctx->chaining, ctx->buffer ); + OF1024( ctx->chaining ); + + // store hash result in output + for ( i = 0; i < hashlen_m128i; i++ ) + casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ]; + + return 0; +} + + HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output, const void* input, DataLength_gr databitlen ) { @@ -187,7 +218,7 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output, const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i const int hash_offset = SIZE512 - hashlen_m128i; int rem = ctx->rem_ptr; - int blocks = len / SIZE512; + uint64_t blocks = len / SIZE512; __m128i* in = (__m128i*)input; int i; @@ -211,21 +242,18 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output, if ( i == len -1 ) { // only 128 bits left in buffer, all padding at once - ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0x80 ); + ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0x80 ); } else { // add first padding - ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0x80 ); + ctx->buffer[i] = m128_const_64( 0, 0x80 ); // add zero padding for ( i += 1; i < SIZE512 - 1; i++ ) ctx->buffer[i] = _mm_setzero_si128(); // add length padding, second last byte is zero unless blocks > 255 - ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0, - 0, 0 ,0,0, 0,0,0,0 ); + ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 ); } // digest final padding block and do output transform diff --git 
a/algo/groestl/aes_ni/hash-groestl.h b/algo/groestl/aes_ni/hash-groestl.h index b537490..595dc3d 100644 --- a/algo/groestl/aes_ni/hash-groestl.h +++ b/algo/groestl/aes_ni/hash-groestl.h @@ -87,5 +87,6 @@ HashReturn_gr final_groestl( hashState_groestl*, void* ); HashReturn_gr update_and_final_groestl( hashState_groestl*, void*, const void*, DataLength_gr ); +int groestl512_full( hashState_groestl*, void*, const void*, uint64_t ); #endif /* __hash_h */ diff --git a/algo/groestl/aes_ni/hash-groestl256.c b/algo/groestl/aes_ni/hash-groestl256.c index cee3eac..53f45a6 100644 --- a/algo/groestl/aes_ni/hash-groestl256.c +++ b/algo/groestl/aes_ni/hash-groestl256.c @@ -11,43 +11,9 @@ #include "miner.h" #include "simd-utils.h" -#ifndef NO_AES_NI - -#include "groestl-version.h" - -#ifdef TASM - #ifdef VAES - #include "groestl256-asm-aes.h" - #else - #ifdef VAVX - #include "groestl256-asm-avx.h" - #else - #ifdef VVPERM - #include "groestl256-asm-vperm.h" - #else - #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) - #endif - #endif - #endif -#else - #ifdef TINTR - #ifdef VAES - #include "groestl256-intr-aes.h" - #else - #ifdef VAVX - #include "groestl256-intr-avx.h" - #else - #ifdef VVPERM - #include "groestl256-intr-vperm.h" - #else - #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) - #endif - #endif - #endif - #else - #error NO TYPE SPECIFIED (-DT[ASM/INTR]) - #endif -#endif +#ifdef __AES__ + +#include "groestl256-intr-aes.h" /* initialise context */ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen ) @@ -55,7 +21,6 @@ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen ) int i; ctx->hashlen = hashlen; - SET_CONSTANTS(); if (ctx->chaining == NULL || ctx->buffer == NULL) return FAIL_GR; @@ -86,8 +51,11 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx) ctx->chaining[i] = _mm_setzero_si128(); ctx->buffer[i] = _mm_setzero_si128(); } - ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH); - INIT256(ctx->chaining); + + ctx->chaining[ 
3 ] = m128_const_64( 0, 0x0100000000000000 ); + +// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH); +// INIT256(ctx->chaining); ctx->buf_ptr = 0; ctx->rem_ptr = 0; @@ -246,6 +214,98 @@ HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx, return SUCCESS_GR; } +int groestl256_full( hashState_groestl256* ctx, + void* output, const void* input, DataLength_gr databitlen ) +{ + int i; + ctx->hashlen = 32; + for ( i = 0; i < SIZE256; i++ ) + { + ctx->chaining[i] = _mm_setzero_si128(); + ctx->buffer[i] = _mm_setzero_si128(); + } + ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH); + INIT256( ctx->chaining ); + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; + + const int len = (int)databitlen / 128; + const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i + const int hash_offset = SIZE256 - hashlen_m128i; + int rem = ctx->rem_ptr; + int blocks = len / SIZE256; + __m128i* in = (__m128i*)input; + + // --- update --- + + // digest any full blocks, process directly from input + for ( i = 0; i < blocks; i++ ) + TF512( ctx->chaining, &in[ i * SIZE256 ] ); + ctx->buf_ptr = blocks * SIZE256; + + // cryptonight has 200 byte input, an odd number of __m128i + // remainder is only 8 bytes, ie u64. 
+ if ( databitlen % 128 !=0 ) + { + // must be cryptonight, copy 64 bits of data + *(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] ); + i = -1; // signal for odd length + } + else + { + // Copy any remaining data to buffer for final transform + for ( i = 0; i < len % SIZE256; i++ ) + ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ]; + i += rem; // use i as rem_ptr in final + } + + //--- final --- + + // adjust for final block + blocks++; + + if ( i == len - 1 ) + { + // all padding at once + ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0, + 0, 0,0,0, 0,0,0,0x80 ); + } + else + { + if ( i == -1 ) + { + // cryptonight odd length + ((uint64_t*)ctx->buffer)[ 1 ] = 0x80ull; + // finish the block with zero and length padding as normal + i = 0; + } + else + { + // add first padding + ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0x80 ); + } + // add zero padding + for ( i += 1; i < SIZE256 - 1; i++ ) + ctx->buffer[i] = _mm_setzero_si128(); + // add length padding + // cheat since we know the block count is trivial, good if block < 256 + ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0, + 0, 0,0,0, 0,0,0,0 ); + } + + // digest final padding block and do output transform + TF512( ctx->chaining, ctx->buffer ); + OF512( ctx->chaining ); + + // store hash result in output + for ( i = 0; i < hashlen_m128i; i++ ) + casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ]; + + return SUCCESS_GR; +} + + /* hash bit sequence */ HashReturn_gr hash_groestl256(int hashbitlen, const BitSequence_gr* data, diff --git a/algo/groestl/aes_ni/hash-groestl256.h b/algo/groestl/aes_ni/hash-groestl256.h index f82c1de..9410266 100644 --- a/algo/groestl/aes_ni/hash-groestl256.h +++ b/algo/groestl/aes_ni/hash-groestl256.h @@ -93,9 +93,6 @@ typedef enum typedef struct { __attribute__ ((aligned (32))) __m128i chaining[SIZE256]; __attribute__ ((aligned (32))) __m128i buffer[SIZE256]; -// __attribute__ ((aligned (32))) u64 chaining[SIZE/8]; 
/* actual state */ -// __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */ -// u64 block_counter; /* message block counter */ int hashlen; // bytes int blk_count; int buf_ptr; /* data buffer pointer */ @@ -118,4 +115,7 @@ HashReturn_gr hash_groestli256( int, const BitSequence_gr*, DataLength_gr, HashReturn_gr update_and_final_groestl256( hashState_groestl256*, void*, const void*, DataLength_gr ); +int groestl256_full( hashState_groestl256* ctx, + void* output, const void* input, DataLength_gr databitlen ); + #endif /* __hash_h */ diff --git a/algo/groestl/groestl.c b/algo/groestl/groestl.c deleted file mode 100644 index 571c4c0..0000000 --- a/algo/groestl/groestl.c +++ /dev/null @@ -1,119 +0,0 @@ -#include "algo-gate-api.h" - -#include -#include -#include -#include - -#ifdef NO_AES_NI - #include "sph_groestl.h" -#else - #include "algo/groestl/aes_ni/hash-groestl.h" -#endif - -typedef struct -{ -#ifdef NO_AES_NI - sph_groestl512_context groestl1, groestl2; -#else - hashState_groestl groestl1, groestl2; -#endif - -} groestl_ctx_holder; - -static groestl_ctx_holder groestl_ctx; - -void init_groestl_ctx() -{ -#ifdef NO_AES_NI - sph_groestl512_init( &groestl_ctx.groestl1 ); - sph_groestl512_init( &groestl_ctx.groestl2 ); -#else - init_groestl( &groestl_ctx.groestl1, 64 ); - init_groestl( &groestl_ctx.groestl2, 64 ); -#endif -} - -void groestlhash( void *output, const void *input ) -{ - uint32_t hash[16] __attribute__ ((aligned (64))); - groestl_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &groestl_ctx, sizeof(groestl_ctx) ); - -#ifdef NO_AES_NI - sph_groestl512(&ctx.groestl1, input, 80); - sph_groestl512_close(&ctx.groestl1, hash); - - sph_groestl512(&ctx.groestl2, hash, 64); - sph_groestl512_close(&ctx.groestl2, hash); -#else - update_and_final_groestl( &ctx.groestl1, (char*)hash, - (const char*)input, 640 ); - - update_and_final_groestl( &ctx.groestl2, (char*)hash, - (const char*)hash, 512 ); -#endif - memcpy(output, hash, 
32); - } - -int scanhash_groestl( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t endiandata[20] __attribute__ ((aligned (64))); - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; - - swab32_array( endiandata, pdata, 20 ); - - do { - const uint32_t Htarg = ptarget[7]; - uint32_t hash[8] __attribute__ ((aligned (64))); - be32enc(&endiandata[19], nonce); - groestlhash(hash, endiandata); - - if (hash[7] <= Htarg ) - if ( fulltest(hash, ptarget)) - { - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce; - return 1; - } - - nonce++; - - } while (nonce < max_nonce && !work_restart[thr_id].restart); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} - -void groestl_set_target( struct work* work, double job_diff ) -{ - work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); -} - -bool register_dmd_gr_algo( algo_gate_t* gate ) -{ - init_groestl_ctx(); - gate->optimizations = SSE2_OPT | AES_OPT; - gate->scanhash = (void*)&scanhash_groestl; - gate->hash = (void*)&groestlhash; - gate->set_target = (void*)&groestl_set_target; - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -}; - -bool register_groestl_algo( algo_gate_t* gate ) -{ - register_dmd_gr_algo( gate ); - gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; - return true; -}; - diff --git a/algo/groestl/myr-groestl.c b/algo/groestl/myr-groestl.c deleted file mode 100644 index d66260a..0000000 --- a/algo/groestl/myr-groestl.c +++ /dev/null @@ -1,103 +0,0 @@ -#include "myrgr-gate.h" - -#include -#include -#include -#include - -#ifdef NO_AES_NI - #include "sph_groestl.h" -#else - #include "aes_ni/hash-groestl.h" -#endif -#include "algo/sha/sph_sha2.h" - -typedef struct { -#ifdef NO_AES_NI - 
sph_groestl512_context groestl; -#else - hashState_groestl groestl; -#endif - sph_sha256_context sha; -} myrgr_ctx_holder; - -myrgr_ctx_holder myrgr_ctx; - -void init_myrgr_ctx() -{ -#ifdef NO_AES_NI - sph_groestl512_init( &myrgr_ctx.groestl ); -#else - init_groestl (&myrgr_ctx.groestl, 64 ); -#endif - sph_sha256_init(&myrgr_ctx.sha); -} - -void myriad_hash(void *output, const void *input) -{ - myrgr_ctx_holder ctx; - memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) ); - - uint32_t _ALIGN(32) hash[16]; - -#ifdef NO_AES_NI - sph_groestl512(&ctx.groestl, input, 80); - sph_groestl512_close(&ctx.groestl, hash); -#else - update_groestl( &ctx.groestl, (char*)input, 640 ); - final_groestl( &ctx.groestl, (char*)hash); -#endif - - sph_sha256(&ctx.sha, hash, 64); - sph_sha256_close(&ctx.sha, hash); - - memcpy(output, hash, 32); -} - -int scanhash_myriad( struct work *work, - uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - - uint32_t _ALIGN(64) endiandata[20]; - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; - - swab32_array( endiandata, pdata, 20 ); - - do { - const uint32_t Htarg = ptarget[7]; - uint32_t hash[8]; - be32enc(&endiandata[19], nonce); - myriad_hash(hash, endiandata); - - if (hash[7] <= Htarg && fulltest(hash, ptarget)) { - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce; - return 1; - } - nonce++; - - } while (nonce < max_nonce && !work_restart[thr_id].restart); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} -/* -bool register_myriad_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT; - init_myrgr_ctx(); - gate->scanhash = (void*)&scanhash_myriad; - gate->hash = (void*)&myriadhash; -// gate->hash_alt = (void*)&myriadhash; - gate->get_max64 = (void*)&get_max64_0x3ffff; - 
return true; -}; -*/ diff --git a/algo/groestl/myrgr-4way.c b/algo/groestl/myrgr-4way.c deleted file mode 100644 index b7ba0eb..0000000 --- a/algo/groestl/myrgr-4way.c +++ /dev/null @@ -1,94 +0,0 @@ -#include "myrgr-gate.h" - -#if defined(MYRGR_4WAY) - -#include -#include -#include -#include - -#include "aes_ni/hash-groestl.h" -#include "algo/sha/sha2-hash-4way.h" - -typedef struct { - hashState_groestl groestl; - sha256_4way_context sha; -} myrgr_4way_ctx_holder; - -myrgr_4way_ctx_holder myrgr_4way_ctx; - -void init_myrgr_4way_ctx() -{ - init_groestl (&myrgr_4way_ctx.groestl, 64 ); - sha256_4way_init( &myrgr_4way_ctx.sha ); -} - -void myriad_4way_hash( void *output, const void *input ) -{ - uint32_t hash0[20] __attribute__ ((aligned (64))); - uint32_t hash1[20] __attribute__ ((aligned (64))); - uint32_t hash2[20] __attribute__ ((aligned (64))); - uint32_t hash3[20] __attribute__ ((aligned (64))); - uint32_t vhash[16*4] __attribute__ ((aligned (64))); - myrgr_4way_ctx_holder ctx; - memcpy( &ctx, &myrgr_4way_ctx, sizeof(myrgr_4way_ctx) ); - - dintrlv_4x32( hash0, hash1, hash2, hash3, input, 640 ); - - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 ); - memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 ); - - intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); - - sha256_4way( &ctx.sha, vhash, 64 ); - sha256_4way_close( &ctx.sha, output ); -} - -int scanhash_myriad_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[8*4] __attribute__ ((aligned (64))); - uint32_t vdata[20*4] 
__attribute__ ((aligned (64))); - uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[7<<2]); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - __m128i *noncev = (__m128i*)vdata + 19; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - - if ( opt_benchmark ) - ( (uint32_t*)ptarget )[7] = 0x0000ff; - - mm128_bswap32_intrlv80_4x32( vdata, pdata ); - do { - *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) ); - - myriad_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int lane = 0; lane < 4; lane++ ) - if ( hash7[ lane ] <= Htarg ) - { - extr_lane_4x32( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_lane_solution( work, lane_hash, mythr, lane ); - } - } - n += 4; - } while ( (n < max_nonce-4) && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/groestl/myrgr-gate.c b/algo/groestl/myrgr-gate.c deleted file mode 100644 index aa8ebd8..0000000 --- a/algo/groestl/myrgr-gate.c +++ /dev/null @@ -1,18 +0,0 @@ -#include "myrgr-gate.h" - -bool register_myriad_algo( algo_gate_t* gate ) -{ -#if defined (MYRGR_4WAY) - init_myrgr_4way_ctx(); - gate->scanhash = (void*)&scanhash_myriad_4way; - gate->hash = (void*)&myriad_4way_hash; -#else - init_myrgr_ctx(); - gate->scanhash = (void*)&scanhash_myriad; - gate->hash = (void*)&myriad_hash; -#endif - gate->optimizations = AES_OPT | AVX2_OPT; - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -}; - diff --git a/algo/groestl/myrgr-gate.h b/algo/groestl/myrgr-gate.h deleted file mode 100644 index 89fc5f1..0000000 --- a/algo/groestl/myrgr-gate.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef MYRGR_GATE_H__ -#define MYRGR_GATE_H__ - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - 
#define MYRGR_4WAY -#endif - -#if defined(MYRGR_4WAY) - -void myriad_4way_hash( void *state, const void *input ); - -int scanhash_myriad_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void init_myrgr_4way_ctx(); - -#endif - -void myriad_hash( void *state, const void *input ); - -int scanhash_myriad( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void init_myrgr_ctx(); - -#endif - diff --git a/algo/groestl/sph_groestl.hpp b/algo/groestl/sph_groestl.hpp new file mode 100644 index 0000000..bf954e1 --- /dev/null +++ b/algo/groestl/sph_groestl.hpp @@ -0,0 +1,329 @@ +/* $Id: sph_groestl.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * Groestl interface. This code implements Groestl with the recommended + * parameters for SHA-3, with outputs of 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_groestl.h + * @author Thomas Pornin + */ + +#ifndef SPH_GROESTL_H__ +#define SPH_GROESTL_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "algo/sha/sph_types.h" +#include + +/** + * Output size (in bits) for Groestl-224. + */ +#define SPH_SIZE_groestl224 224 + +/** + * Output size (in bits) for Groestl-256. + */ +#define SPH_SIZE_groestl256 256 + +/** + * Output size (in bits) for Groestl-384. + */ +#define SPH_SIZE_groestl384 384 + +/** + * Output size (in bits) for Groestl-512. + */ +#define SPH_SIZE_groestl512 512 + +/** + * This structure is a context for Groestl-224 and Groestl-256 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a Groestl computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running Groestl + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + union { +#if SPH_64 + sph_u64 wide[8]; +#endif + sph_u32 narrow[16]; + } state; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif +#endif +} sph_groestl_small_context; + +/** + * This structure is a context for Groestl-224 computations. It is + * identical to the common sph_groestl_small_context. + */ +typedef sph_groestl_small_context sph_groestl224_context; + +/** + * This structure is a context for Groestl-256 computations. It is + * identical to the common sph_groestl_small_context. 
+ */ +typedef sph_groestl_small_context sph_groestl256_context; + +/** + * This structure is a context for Groestl-384 and Groestl-512 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a Groestl computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running Groestl + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + union { +#if SPH_64 + sph_u64 wide[16]; +#endif + sph_u32 narrow[32]; + } state; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif +#endif +} sph_groestl_big_context; + +/** + * This structure is a context for Groestl-384 computations. It is + * identical to the common sph_groestl_small_context. + */ +typedef sph_groestl_big_context sph_groestl384_context; + +/** + * This structure is a context for Groestl-512 computations. It is + * identical to the common sph_groestl_small_context. + */ +typedef sph_groestl_big_context sph_groestl512_context; + +/** + * Initialize a Groestl-224 context. This process performs no memory allocation. + * + * @param cc the Groestl-224 context (pointer to a + * sph_groestl224_context) + */ +void sph_groestl224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Groestl-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_groestl224(void *cc, const void *data, size_t len); + +/** + * Terminate the current Groestl-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. 
+ * + * @param cc the Groestl-224 context + * @param dst the destination buffer + */ +void sph_groestl224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Groestl-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_groestl224_addbits_and_close(void *cc, unsigned ub, unsigned n, + void *dst); + +/** + * Initialize a Groestl-256 context. This process performs no memory allocation. + * + * @param cc the Groestl-256 context (pointer to a + * sph_groestl256_context) + */ +void sph_groestl256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Groestl-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_groestl256(void *cc, const void *data, size_t len); + +/** + * Terminate the current Groestl-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the Groestl-256 context + * @param dst the destination buffer + */ +void sph_groestl256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). 
If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Groestl-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_groestl256_addbits_and_close(void *cc, unsigned ub, unsigned n, + void *dst); + +/** + * Initialize a Groestl-384 context. This process performs no memory allocation. + * + * @param cc the Groestl-384 context (pointer to a + * sph_groestl384_context) + */ +void sph_groestl384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Groestl-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_groestl384(void *cc, const void *data, size_t len); + +/** + * Terminate the current Groestl-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the Groestl-384 context + * @param dst the destination buffer + */ +void sph_groestl384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. 
+ * + * @param cc the Groestl-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_groestl384_addbits_and_close(void *cc, unsigned ub, unsigned n, + void *dst); + +/** + * Initialize a Groestl-512 context. This process performs no memory allocation. + * + * @param cc the Groestl-512 context (pointer to a + * sph_groestl512_context) + */ +void sph_groestl512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Groestl-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_groestl512(void *cc, const void *data, size_t len); + +/** + * Terminate the current Groestl-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the Groestl-512 context + * @param dst the destination buffer + */ +void sph_groestl512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. 
+ * + * @param cc the Groestl-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, + void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c deleted file mode 100644 index 53ba5e3..0000000 --- a/algo/hamsi/hamsi-hash-4way.c +++ /dev/null @@ -1,934 +0,0 @@ -/* $Id: hamsi.c 251 2010-10-19 14:31:51Z tp $ */ -/* - * Hamsi implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include -#include - -//#include "miner.h" -#include "hamsi-hash-4way.h" - -#if defined(__AVX2__) - -#ifdef __cplusplus -extern "C"{ -#endif - -/* - * The SPH_HAMSI_EXPAND_* define how many input bits we handle in one - * table lookup during message expansion (1 to 8, inclusive). If we note - * w the number of bits per message word (w=32 for Hamsi-224/256, w=64 - * for Hamsi-384/512), r the size of a "row" in 32-bit words (r=8 for - * Hamsi-224/256, r=16 for Hamsi-384/512), and n the expansion level, - * then we will get t tables (where t=ceil(w/n)) of individual size - * 2^n*r*4 (in bytes). The last table may be shorter (e.g. with w=32 and - * n=5, there are 7 tables, but the last one uses only two bits on - * input, not five). - * - * Also, we read t rows of r words from RAM. Words in a given row are - * concatenated in RAM in that order, so most of the cost is about - * reading the first row word; comparatively, cache misses are thus - * less expensive with Hamsi-512 (r=16) than with Hamsi-256 (r=8). - * - * When n=1, tables are "special" in that we omit the first entry of - * each table (which always contains 0), so that total table size is - * halved. - * - * We thus have the following (size1 is the cumulative table size of - * Hamsi-224/256; size2 is for Hamsi-384/512; similarly, t1 and t2 - * are for Hamsi-224/256 and Hamsi-384/512, respectively). - * - * n size1 size2 t1 t2 - * --------------------------------------- - * 1 1024 4096 32 64 - * 2 2048 8192 16 32 - * 3 2688 10880 11 22 - * 4 4096 16384 8 16 - * 5 6272 25600 7 13 - * 6 10368 41984 6 11 - * 7 16896 73856 5 10 - * 8 32768 131072 4 8 - * - * So there is a trade-off: a lower n makes the tables fit better in - * L1 cache, but increases the number of memory accesses. The optimal - * value depends on the amount of available L1 cache and the relative - * impact of a cache miss. 
- * - * Experimentally, in ideal benchmark conditions (which are not necessarily - * realistic with regards to L1 cache contention), it seems that n=8 is - * the best value on "big" architectures (those with 32 kB or more of L1 - * cache), while n=4 is better on "small" architectures. This was tested - * on an Intel Core2 Q6600 (both 32-bit and 64-bit mode), a PowerPC G3 - * (32 kB L1 cache, hence "big"), and a MIPS-compatible Broadcom BCM3302 - * (8 kB L1 cache). - * - * Note: with n=1, the 32 tables (actually implemented as one big table) - * are read entirely and sequentially, regardless of the input data, - * thus avoiding any data-dependent table access pattern. - */ - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -//#include "hamsi-helper-4way.c" - -static const sph_u32 IV512[] = { - SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172), - SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062), - SPH_C32(0x75732032), SPH_C32(0x3434362c), SPH_C32(0x20422d33), - SPH_C32(0x30303120), SPH_C32(0x4c657576), SPH_C32(0x656e2d48), - SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c), - SPH_C32(0x6769756d) -}; - -static const sph_u32 alpha_n[] = { - SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc), - SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), - SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc), - SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0), - SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00), - SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc), - SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0), - SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0) -}; - -static const sph_u32 alpha_f[] = { - SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), 
SPH_C32(0x639c0ff0), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), - SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0), - SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c), - SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9), - SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0), - SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c), - SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c) -}; - -// imported from hamsi helper - -/* Note: this table lists bits within each byte from least - siginificant to most significant. */ -static const sph_u32 T512[64][16] = { - { SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000), - SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9), - SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030), - SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000), - SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984), - SPH_C32(0x9e69af68) }, - { SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000), - SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), - SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240), - SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000), - SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5), - SPH_C32(0x0c26f262) }, - { SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000), - SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78), - SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400), - SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000), - SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f), - SPH_C32(0xdc24e61f) }, - { SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), - SPH_C32(0xfb750000), SPH_C32(0x73cd2465), 
SPH_C32(0xf8a6a549), - SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800), - SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000), - SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f), - SPH_C32(0x3daac2da) }, - { SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000), - SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1), - SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800), - SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000), - SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da), - SPH_C32(0x78cace29) }, - { SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000), - SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), - SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400), - SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000), - SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247), - SPH_C32(0x2dd1f9ab) }, - { SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000), - SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745), - SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00), - SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000), - SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f), - SPH_C32(0xbf2c0be2) }, - { SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000), - SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93), - SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000), - SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000), - SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36), - SPH_C32(0x32219526) }, - { SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000), - SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae), - SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001), - SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000), - SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f), - SPH_C32(0xac8e6c88) 
}, - { SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000), - SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), - SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004), - SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000), - SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96), - SPH_C32(0x7b1bd6b9) }, - { SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000), - SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba), - SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000), - SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000), - SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604), - SPH_C32(0xf746c320) }, - { SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), - SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), - SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009), - SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000), - SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a), - SPH_C32(0x69505b3a) }, - { SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000), - SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25), - SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050), - SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000), - SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2), - SPH_C32(0x8a341574) }, - { SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000), - SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), - SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0), - SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000), - SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc), - SPH_C32(0x450360bf) }, - { SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000), - SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543), - SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060), - SPH_C32(0x0c4c0000), 
SPH_C32(0x56c20000), SPH_C32(0x5cae0000), - SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d), - SPH_C32(0xf3d45758) }, - { SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), - SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), - SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110), - SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000), - SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25), - SPH_C32(0x925c44e9) }, - { SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000), - SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514), - SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000), - SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000), - SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315), - SPH_C32(0xa123ff9f) }, - { SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000), - SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860), - SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000), - SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000), - SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e), - SPH_C32(0x1568ff0f) }, - { SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000), - SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6), - SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000), - SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000), - SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616), - SPH_C32(0xc5c1eb3e) }, - { SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000), - SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145), - SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000), - SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000), - SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6), - SPH_C32(0x1af21fe1) }, - { SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000), - SPH_C32(0x90f50000), 
SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae), - SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000), - SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000), - SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17), - SPH_C32(0x857f3c2b) }, - { SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), - SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), - SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000), - SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000), - SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94), - SPH_C32(0x2ba05a55) }, - { SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000), - SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757), - SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001), - SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000), - SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba), - SPH_C32(0xfeabf254) }, - { SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000), - SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), - SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002), - SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000), - SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7), - SPH_C32(0xfe1cdc7f) }, - { SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000), - SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea), - SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000), - SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000), - SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea), - SPH_C32(0xb0a51834) }, - { SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), - SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), - SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000), - SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000), - SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae), - 
SPH_C32(0xa6b8c28d) }, - { SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000), - SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75), - SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000), - SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000), - SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156), - SPH_C32(0x3a4e99d7) }, - { SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), - SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), - SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000), - SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000), - SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6), - SPH_C32(0xe1844257) }, - { SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000), - SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512), - SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000), - SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000), - SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37), - SPH_C32(0x2c3b504e) }, - { SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000), - SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856), - SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000), - SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000), - SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4), - SPH_C32(0x524a0d59) }, - { SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000), - SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc), - SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000), - SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000), - SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88), - SPH_C32(0x378dd173) }, - { SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), - SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f), - SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000), - 
SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000), - SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4), - SPH_C32(0x8b6c72bd) }, - { SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780), - SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418), - SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000), - SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000), - SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d), - SPH_C32(0x8e67b7fa) }, - { SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280), - SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), - SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000), - SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000), - SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec), - SPH_C32(0x443d3004) }, - { SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80), - SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924), - SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000), - SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000), - SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a), - SPH_C32(0xf4f6ea7b) }, - { SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300), - SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), - SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000), - SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000), - SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8), - SPH_C32(0x979961d0) }, - { SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380), - SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6), - SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000), - SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000), - SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812), - SPH_C32(0x98aa496e) }, - { SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180), - 
SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), - SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000), - SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000), - SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec), - SPH_C32(0x094e3198) }, - { SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000), - SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736), - SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000), - SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000), - SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76), - SPH_C32(0xe86cba2e) }, - { SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000), - SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431), - SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000), - SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000), - SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9), - SPH_C32(0x4b7eec55) }, - { SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001), - SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd), - SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000), - SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800), - SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429), - SPH_C32(0x1e7536a6) }, - { SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000), - SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), - SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000), - SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000), - SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46), - SPH_C32(0x24314f17) }, - { SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e), - SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d), - SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000), - SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000), - SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), 
SPH_C32(0xf4786222), - SPH_C32(0x9075b1ce) }, - { SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a), - SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), - SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000), - SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000), - SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa), - SPH_C32(0x9b6ef888) }, - { SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e), - SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167), - SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000), - SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000), - SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e), - SPH_C32(0xd8b61463) }, - { SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c), - SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), - SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000), - SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000), - SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2), - SPH_C32(0x3ea660f7) }, - { SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e), - SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce), - SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000), - SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000), - SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018), - SPH_C32(0x7f975691) }, - { SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), - SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), - SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000), - SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000), - SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd), - SPH_C32(0x2c94459e) }, - { SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000), - SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da), - SPH_C32(0x289506b4), SPH_C32(0xd75a4897), 
SPH_C32(0xf0c50000), - SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0), - SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c), - SPH_C32(0x56a7b19f) }, - { SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000), - SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), - SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000), - SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220), - SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8), - SPH_C32(0x81fdf908) }, - { SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000), - SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d), - SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000), - SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060), - SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06), - SPH_C32(0x5bd61539) }, - { SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000), - SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), - SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000), - SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480), - SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f), - SPH_C32(0x15b961e7) }, - { SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000), - SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6), - SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000), - SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800), - SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14), - SPH_C32(0x2a2c18f0) }, - { SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), - SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), - SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000), - SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000), - SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23), - SPH_C32(0x551e3d6e) }, - { SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), 
SPH_C32(0x8a2e0000), - SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da), - SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000), - SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000), - SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254), - SPH_C32(0x33c5244f) }, - { SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000), - SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), - SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000), - SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800), - SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c), - SPH_C32(0x8a58e6a4) }, - { SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000), - SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f), - SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000), - SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002), - SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808), - SPH_C32(0xda878000) }, - { SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), - SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a), - SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000), - SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005), - SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb), - SPH_C32(0x3c5dfffe) }, - { SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000), - SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e), - SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000), - SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003), - SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752), - SPH_C32(0x7b1675d7) }, - { SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000), - SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), - SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000), - SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008), - SPH_C32(0x3739ae4e), 
SPH_C32(0xe64c1722), SPH_C32(0x96f896b3), - SPH_C32(0x2879ebac) }, - { SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000), - SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e), - SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000), - SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001), - SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60), - SPH_C32(0xbe0a679e) }, - { SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000), - SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), - SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000), - SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012), - SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf), - SPH_C32(0x30aebcf7) }, - { SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000), - SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57), - SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000), - SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0), - SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03), - SPH_C32(0xc7ff60f0) }, - { SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000), - SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), - SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000), - SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140), - SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877), - SPH_C32(0xe7e00a94) } -}; - -#define INPUT_BIG \ -do { \ - const __m256i zero = _mm256_setzero_si256(); \ - __m256i db = *buf; \ - const sph_u32 *tp = &T512[0][0]; \ - m0 = zero; \ - m1 = zero; \ - m2 = zero; \ - m3 = zero; \ - m4 = zero; \ - m5 = zero; \ - m6 = zero; \ - m7 = zero; \ - for ( int u = 0; u < 64; u++ ) \ - { \ - __m256i dm = _mm256_and_si256( db, m256_one_64 ) ; \ - dm = mm256_negate_32( _mm256_or_si256( dm, \ - _mm256_slli_epi64( dm, 32 ) ) ); \ - m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \ - _mm256_set_epi32( tp[0x1], 
tp[0x0], tp[0x1], tp[0x0], \ - tp[0x1], tp[0x0], tp[0x1], tp[0x0] ) ) ); \ - m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \ - _mm256_set_epi32( tp[0x3], tp[0x2], tp[0x3], tp[0x2], \ - tp[0x3], tp[0x2], tp[0x3], tp[0x2] ) ) ); \ - m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, \ - _mm256_set_epi32( tp[0x5], tp[0x4], tp[0x5], tp[0x4], \ - tp[0x5], tp[0x4], tp[0x5], tp[0x4] ) ) ); \ - m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, \ - _mm256_set_epi32( tp[0x7], tp[0x6], tp[0x7], tp[0x6], \ - tp[0x7], tp[0x6], tp[0x7], tp[0x6] ) ) ); \ - m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, \ - _mm256_set_epi32( tp[0x9], tp[0x8], tp[0x9], tp[0x8], \ - tp[0x9], tp[0x8], tp[0x9], tp[0x8] ) ) ); \ - m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, \ - _mm256_set_epi32( tp[0xB], tp[0xA], tp[0xB], tp[0xA], \ - tp[0xB], tp[0xA], tp[0xB], tp[0xA] ) ) ); \ - m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, \ - _mm256_set_epi32( tp[0xD], tp[0xC], tp[0xD], tp[0xC], \ - tp[0xD], tp[0xC], tp[0xD], tp[0xC] ) ) ); \ - m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \ - _mm256_set_epi32( tp[0xF], tp[0xE], tp[0xF], tp[0xE], \ - tp[0xF], tp[0xE], tp[0xF], tp[0xE] ) ) ); \ - tp += 0x10; \ - db = _mm256_srli_epi64( db, 1 ); \ - } \ -} while (0) - -#define SBOX( a, b, c, d ) \ -do { \ - __m256i t; \ - t = a; \ - a = _mm256_and_si256( a, c ); \ - a = _mm256_xor_si256( a, d ); \ - c = _mm256_xor_si256( c, b ); \ - c = _mm256_xor_si256( c, a ); \ - d = _mm256_or_si256( d, t ); \ - d = _mm256_xor_si256( d, b ); \ - t = _mm256_xor_si256( t, c ); \ - b = d; \ - d = _mm256_or_si256( d, t ); \ - d = _mm256_xor_si256( d, a ); \ - a = _mm256_and_si256( a, b ); \ - t = _mm256_xor_si256( t, a ); \ - b = _mm256_xor_si256( b, d ); \ - b = _mm256_xor_si256( b, t ); \ - a = c; \ - c = b; \ - b = d; \ - d = mm256_not( t ); \ -} while (0) - -#define L( a, b, c, d ) \ -do { \ - a = mm256_rol_32( a, 13 ); \ - c = mm256_rol_32( c, 3 ); \ - b = _mm256_xor_si256( b, _mm256_xor_si256( a, c ) ); \ - d = 
_mm256_xor_si256( d, _mm256_xor_si256( c, \ - _mm256_slli_epi32( a, 3 ) ) ); \ - b = mm256_rol_32( b, 1 ); \ - d = mm256_rol_32( d, 7 ); \ - a = _mm256_xor_si256( a, _mm256_xor_si256( b, d ) ); \ - c = _mm256_xor_si256( c, _mm256_xor_si256( d, \ - _mm256_slli_epi32( b, 7 ) ) ); \ - a = mm256_rol_32( a, 5 ); \ - c = mm256_rol_32( c, 22 ); \ -} while (0) - -#define DECL_STATE_BIG \ - __m256i c0, c1, c2, c3, c4, c5, c6, c7; \ - -#define READ_STATE_BIG(sc) \ -do { \ - c0 = sc->h[0x0]; \ - c1 = sc->h[0x1]; \ - c2 = sc->h[0x2]; \ - c3 = sc->h[0x3]; \ - c4 = sc->h[0x4]; \ - c5 = sc->h[0x5]; \ - c6 = sc->h[0x6]; \ - c7 = sc->h[0x7]; \ -} while (0) - -#define WRITE_STATE_BIG(sc) \ -do { \ - sc->h[0x0] = c0; \ - sc->h[0x1] = c1; \ - sc->h[0x2] = c2; \ - sc->h[0x3] = c3; \ - sc->h[0x4] = c4; \ - sc->h[0x5] = c5; \ - sc->h[0x6] = c6; \ - sc->h[0x7] = c7; \ -} while (0) - -#define s0 m0 -#define s1 c0 -#define s2 m1 -#define s3 c1 -#define s4 c2 -#define s5 m2 -#define s6 c3 -#define s7 m3 -#define s8 m4 -#define s9 c4 -#define sA m5 -#define sB c5 -#define sC c6 -#define sD m6 -#define sE c7 -#define sF m7 - -#define ROUND_BIG(rc, alpha) \ -do { \ - __m256i t0, t1, t2, t3; \ - s0 = _mm256_xor_si256( s0, _mm256_set_epi32( \ - alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00], \ - alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00] ) ); \ - s1 = _mm256_xor_si256( s1, _mm256_set_epi32( \ - alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02], \ - alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02] ) ); \ - s2 = _mm256_xor_si256( s2, _mm256_set_epi32( \ - alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04], \ - alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04] ) ); \ - s3 = _mm256_xor_si256( s3, _mm256_set_epi32( \ - alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06], \ - alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06] ) ); \ - s4 = _mm256_xor_si256( s4, _mm256_set_epi32( \ - alpha[0x09], alpha[0x08], alpha[0x09], alpha[0x08], \ - alpha[0x09], 
alpha[0x08], alpha[0x09], alpha[0x08] ) ); \ - s5 = _mm256_xor_si256( s5, _mm256_set_epi32( \ - alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A], \ - alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A] ) ); \ - s6 = _mm256_xor_si256( s6, _mm256_set_epi32( \ - alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C], \ - alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C] ) ); \ - s7 = _mm256_xor_si256( s7, _mm256_set_epi32( \ - alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E], \ - alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E] ) ); \ - s8 = _mm256_xor_si256( s8, _mm256_set_epi32( \ - alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10], \ - alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10] ) ); \ - s9 = _mm256_xor_si256( s9, _mm256_set_epi32( \ - alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12], \ - alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12] ) ); \ - sA = _mm256_xor_si256( sA, _mm256_set_epi32( \ - alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14], \ - alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14] ) ); \ - sB = _mm256_xor_si256( sB, _mm256_set_epi32( \ - alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16], \ - alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16] ) ); \ - sC = _mm256_xor_si256( sC, _mm256_set_epi32( \ - alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18], \ - alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18] ) ); \ - sD = _mm256_xor_si256( sD, _mm256_set_epi32( \ - alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A], \ - alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A] ) ); \ - sE = _mm256_xor_si256( sE, _mm256_set_epi32( \ - alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C], \ - alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C] ) ); \ - sF = _mm256_xor_si256( sF, _mm256_set_epi32( \ - alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E], \ - alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E] ) ); \ -\ - SBOX( s0, s4, s8, sC ); \ - SBOX( s1, s5, s9, sD ); \ - SBOX( s2, s6, sA, sE ); \ - SBOX( s3, s7, sB, sF ); \ -\ - t1 = 
_mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), \ - _mm256_bslli_epi128( s5, 4 ), 0xAA ); \ - t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sD, 4 ), \ - _mm256_bslli_epi128( sE, 4 ), 0xAA ); \ - L( s0, t1, s9, t3 ); \ - s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ - s5 = _mm256_blend_epi32( s5, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ - sD = _mm256_blend_epi32( sD, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ - sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ -\ - t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \ - _mm256_bslli_epi128( s6, 4 ), 0xAA ); \ - t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sE, 4 ), \ - _mm256_bslli_epi128( sF, 4 ), 0xAA ); \ - L( s1, t1, sA, t3 ); \ - s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ - s6 = _mm256_blend_epi32( s6, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ - sE = _mm256_blend_epi32( sE, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ - sF = _mm256_blend_epi32( sF, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ -\ - t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s6, 4 ), \ - _mm256_bslli_epi128( s7, 4 ), 0xAA ); \ - t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sF, 4 ), \ - _mm256_bslli_epi128( sC, 4 ), 0xAA ); \ - L( s2, t1, sB, t3 ); \ - s6 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ - s7 = _mm256_blend_epi32( s7, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ - sF = _mm256_blend_epi32( sF, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ - sC = _mm256_blend_epi32( sC, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ -\ - t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s7, 4 ), \ - _mm256_bslli_epi128( s4, 4 ), 0xAA ); \ - t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sC, 4 ), \ - _mm256_bslli_epi128( sD, 4 ), 0xAA ); \ - L( s3, t1, s8, t3 ); \ - s7 = _mm256_blend_epi32( s7, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ - s4 = _mm256_blend_epi32( s4, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ - sC = _mm256_blend_epi32( sC, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ - sD = _mm256_blend_epi32( sD, 
_mm256_bsrli_epi128( t3, 4 ), 0x55 );\ -\ - t0 = _mm256_blend_epi32( s0, _mm256_bslli_epi128( s8, 4 ), 0xAA ); \ - t1 = _mm256_blend_epi32( s1, s9, 0xAA ); \ - t2 = _mm256_blend_epi32( _mm256_bsrli_epi128( s2, 4 ), sA, 0xAA ); \ - t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( s3, 4 ), \ - _mm256_bslli_epi128( sB, 4 ), 0xAA ); \ - L( t0, t1, t2, t3 ); \ - s0 = _mm256_blend_epi32( s0, t0, 0x55 ); \ - s8 = _mm256_blend_epi32( s8, _mm256_bsrli_epi128( t0, 4 ), 0x55 ); \ - s1 = _mm256_blend_epi32( s1, t1, 0x55 ); \ - s9 = _mm256_blend_epi32( s9, t1, 0xAA ); \ - s2 = _mm256_blend_epi32( s2, _mm256_bslli_epi128( t2, 4 ), 0xAA ); \ - sA = _mm256_blend_epi32( sA, t2, 0xAA ); \ - s3 = _mm256_blend_epi32( s3, _mm256_bslli_epi128( t3, 4 ), 0xAA ); \ - sB = _mm256_blend_epi32( sB, _mm256_bsrli_epi128( t3, 4 ), 0x55 ); \ -\ - t0 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), sC, 0xAA ); \ - t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \ - _mm256_bslli_epi128( sD, 4 ), 0xAA ); \ - t2 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( sE, 4 ), 0xAA ); \ - t3 = _mm256_blend_epi32( s7, sF, 0xAA ); \ - L( t0, t1, t2, t3 ); \ - s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t0, 4 ), 0xAA ); \ - sC = _mm256_blend_epi32( sC, t0, 0xAA ); \ - s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA ); \ - sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t1, 4 ), 0x55 ); \ - s6 = _mm256_blend_epi32( s6, t2, 0x55 ); \ - sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t2, 4 ), 0x55 ); \ - s7 = _mm256_blend_epi32( s7, t3, 0x55 ); \ - sF = _mm256_blend_epi32( sF, t3, 0xAA ); \ -} while (0) - -#define P_BIG \ -do { \ - ROUND_BIG(0, alpha_n); \ - ROUND_BIG(1, alpha_n); \ - ROUND_BIG(2, alpha_n); \ - ROUND_BIG(3, alpha_n); \ - ROUND_BIG(4, alpha_n); \ - ROUND_BIG(5, alpha_n); \ -} while (0) - -#define PF_BIG \ -do { \ - ROUND_BIG( 0, alpha_f); \ - ROUND_BIG( 1, alpha_f); \ - ROUND_BIG( 2, alpha_f); \ - ROUND_BIG( 3, alpha_f); \ - ROUND_BIG( 4, alpha_f); \ - ROUND_BIG( 5, 
alpha_f); \ - ROUND_BIG( 6, alpha_f); \ - ROUND_BIG( 7, alpha_f); \ - ROUND_BIG( 8, alpha_f); \ - ROUND_BIG( 9, alpha_f); \ - ROUND_BIG(10, alpha_f); \ - ROUND_BIG(11, alpha_f); \ -} while (0) - -#define T_BIG \ -do { /* order is important */ \ - c7 = sc->h[ 0x7 ] = _mm256_xor_si256( sc->h[ 0x7 ], sB ); \ - c6 = sc->h[ 0x6 ] = _mm256_xor_si256( sc->h[ 0x6 ], sA ); \ - c5 = sc->h[ 0x5 ] = _mm256_xor_si256( sc->h[ 0x5 ], s9 ); \ - c4 = sc->h[ 0x4 ] = _mm256_xor_si256( sc->h[ 0x4 ], s8 ); \ - c3 = sc->h[ 0x3 ] = _mm256_xor_si256( sc->h[ 0x3 ], s3 ); \ - c2 = sc->h[ 0x2 ] = _mm256_xor_si256( sc->h[ 0x2 ], s2 ); \ - c1 = sc->h[ 0x1 ] = _mm256_xor_si256( sc->h[ 0x1 ], s1 ); \ - c0 = sc->h[ 0x0 ] = _mm256_xor_si256( sc->h[ 0x0 ], s0 ); \ -} while (0) - -void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num ) -{ - DECL_STATE_BIG - sph_u32 tmp; - - tmp = SPH_T32( (sph_u32)num << 6 ); - sc->count_low = SPH_T32( sc->count_low + tmp ); - sc->count_high += (sph_u32)( (num >> 13) >> 13 ); - if ( sc->count_low < tmp ) - sc->count_high++; - - READ_STATE_BIG( sc ); - while ( num-- > 0 ) - { - __m256i m0, m1, m2, m3, m4, m5, m6, m7; - - INPUT_BIG; - P_BIG; - T_BIG; - buf++; - } - WRITE_STATE_BIG( sc ); -} - -void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf ) -{ - __m256i m0, m1, m2, m3, m4, m5, m6, m7; - DECL_STATE_BIG - READ_STATE_BIG( sc ); - INPUT_BIG; - PF_BIG; - T_BIG; - WRITE_STATE_BIG( sc ); -} - -void hamsi512_4way_init( hamsi_4way_big_context *sc ) -{ - sc->partial_len = 0; - sph_u32 lo, hi; - sc->count_high = sc->count_low = 0; - for ( int i = 0; i < 8; i++ ) - { - lo = 2*i; - hi = 2*i + 1; - sc->h[i] = _mm256_set_epi32( IV512[hi], IV512[lo], IV512[hi], IV512[lo], - IV512[hi], IV512[lo], IV512[hi], IV512[lo] ); - } -} - -void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len ) -{ - __m256i *vdata = (__m256i*)data; - -// It looks like the only way to get in here is if core was previously called -// with a very small len -// 
That's not likely even with 80 byte input so deprecate partial len -/* - if ( sc->partial_len != 0 ) - { - size_t mlen; - - mlen = 8 - sc->partial_len; - if ( len < mlen ) - { - memcpy_256( sc->partial + (sc->partial_len >> 3), data, len>>3 ); - sc->partial_len += len; - return; - } - else - { - memcpy_256( sc->partial + (sc->partial_len >> 3), data, mlen>>3 ); - len -= mlen; - vdata += mlen>>3; - hamsi_big( sc, sc->partial, 1 ); - sc->partial_len = 0; - } - } -*/ - - hamsi_big( sc, vdata, len>>3 ); - vdata += ( (len& ~(size_t)7) >> 3 ); - len &= (size_t)7; - memcpy_256( sc->buf, vdata, len>>3 ); - sc->partial_len = len; -} - -void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst ) -{ - __m256i pad[1]; - int ch, cl; - - sph_enc32be( &ch, sc->count_high ); - sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) ); - pad[0] = _mm256_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch ); - sc->buf[0] = _mm256_set_epi32( 0UL, 0x80UL, 0UL, 0x80UL, - 0UL, 0x80UL, 0UL, 0x80UL ); - hamsi_big( sc, sc->buf, 1 ); - hamsi_big_final( sc, pad ); - - mm256_block_bswap_32( (__m256i*)dst, sc->h ); -} - -#ifdef __cplusplus -} -#endif -#endif diff --git a/algo/hamsi/hamsi-hash-4way.h b/algo/hamsi/hamsi-hash-4way.h deleted file mode 100644 index f70f3fe..0000000 --- a/algo/hamsi/hamsi-hash-4way.h +++ /dev/null @@ -1,72 +0,0 @@ -/* $Id: sph_hamsi.h 216 2010-06-08 09:46:57Z tp $ */ -/** - * Hamsi interface. This code implements Hamsi with the recommended - * parameters for SHA-3, with outputs of 224, 256, 384 and 512 bits. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @file sph_hamsi.h - * @author Thomas Pornin - */ - -#ifndef HAMSI_4WAY_H__ -#define HAMSI_4WAY_H__ - -#include -#include "algo/sha/sph_types.h" - -#if defined (__AVX2__) - -#include "simd-utils.h" - -#ifdef __cplusplus -extern "C"{ -#endif - -#define SPH_SIZE_hamsi512 512 - -// Partial is only scalar but needs pointer ref for hamsi-helper -// deprecate partial_len -typedef struct { - __m256i h[8]; - __m256i buf[1]; - size_t partial_len; - sph_u32 count_high, count_low; -} hamsi_4way_big_context; - -typedef hamsi_4way_big_context hamsi512_4way_context; - -void hamsi512_4way_init( hamsi512_4way_context *sc ); -void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len ); -void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst ); - -#ifdef __cplusplus -} -#endif - -#endif - -#endif diff --git a/algo/haval/haval-4way-helper.c b/algo/haval/haval-4way-helper.c deleted file mode 100644 index c9e7ad8..0000000 --- a/algo/haval/haval-4way-helper.c +++ /dev/null @@ -1,115 +0,0 @@ -/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */ -/* - * Helper code, included (three times !) by HAVAL implementation. - * - * TODO: try to merge this with md_helper.c. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#undef SPH_XCAT -#define SPH_XCAT(a, b) SPH_XCAT_(a, b) -#undef SPH_XCAT_ -#define SPH_XCAT_(a, b) a ## b - -static void -SPH_XCAT(SPH_XCAT(haval, PASSES), _4way) -( haval_4way_context *sc, const void *data, size_t len ) -{ - __m128i *vdata = (__m128i*)data; - unsigned current; - - current = (unsigned)sc->count_low & 127U; - while ( len > 0 ) - { - unsigned clen; - sph_u32 clow, clow2; - - clen = 128U - current; - if ( clen > len ) - clen = len; - memcpy_128( sc->buf + (current>>2), vdata, clen>>2 ); - vdata += clen>>2; - current += clen; - len -= clen; - if ( current == 128U ) - { - DSTATE; - IN_PREPARE(sc->buf); - RSTATE; - SPH_XCAT(CORE, PASSES)(INW); - WSTATE; - current = 0; - } - clow = sc->count_low; - clow2 = SPH_T32(clow + clen); - sc->count_low = clow2; - if ( clow2 < clow ) - sc->count_high ++; - } -} - -static void -SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc, - void *dst) -{ - unsigned current; - DSTATE; - - current = (unsigned)sc->count_low & 127UL; - - sc->buf[ current>>2 ] = m128_one_32; - current += 4; - RSTATE; - if ( current > 116UL ) - { - memset_zero_128( sc->buf + ( current>>2 ), (128UL-current) >> 2 ); - do - { - IN_PREPARE(sc->buf); - SPH_XCAT(CORE, PASSES)(INW); - } while (0); - current = 0; - } - - uint32_t t1, t2; - memset_zero_128( sc->buf + ( current>>2 ), (116UL-current) >> 2 ); - t1 = 0x01 | (PASSES << 3); - t2 = sc->olen << 3; - sc->buf[ 116>>2 ] = _mm_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) ); - sc->buf[ 120>>2 ] = _mm_set1_epi32( sc->count_low << 3 ); - sc->buf[ 124>>2 ] = _mm_set1_epi32( (sc->count_high << 3) - | (sc->count_low >> 29) ); - do - { - IN_PREPARE(sc->buf); - SPH_XCAT(CORE, PASSES)(INW); - } while (0); - WSTATE; - haval_4way_out( sc, dst ); -} diff --git a/algo/haval/haval-hash-4way.c b/algo/haval/haval-hash-4way.c deleted file mode 100644 index 35cfd17..0000000 --- 
a/algo/haval/haval-hash-4way.c +++ /dev/null @@ -1,524 +0,0 @@ -/* $Id: haval.c 227 2010-06-16 17:28:38Z tp $ */ -/* - * HAVAL implementation. - * - * The HAVAL reference paper is of questionable clarity with regards to - * some details such as endianness of bits within a byte, bytes within - * a 32-bit word, or the actual ordering of words within a stream of - * words. This implementation has been made compatible with the reference - * implementation available on: http://labs.calyptix.com/haval.php - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include -#include -#include "haval-hash-4way.h" - -// won't compile with sse4.2 -//#if defined (__SSE4_2__) -#if defined(__AVX__) - -#ifdef __cplusplus -extern "C"{ -#endif - -//#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_HAVAL -#define SPH_SMALL_FOOTPRINT_HAVAL 1 -//#endif - -#define F1(x6, x5, x4, x3, x2, x1, x0) \ - _mm_xor_si128( x0, \ - _mm_xor_si128( _mm_and_si128(_mm_xor_si128( x0, x4 ), x1 ), \ - _mm_xor_si128( _mm_and_si128( x2, x5 ), \ - _mm_and_si128( x3, x6 ) ) ) ) \ - -#define F2(x6, x5, x4, x3, x2, x1, x0) \ - _mm_xor_si128( \ - _mm_and_si128( x2, \ - _mm_xor_si128( _mm_andnot_si128( x3, x1 ), \ - _mm_xor_si128( _mm_and_si128( x4, x5 ), \ - _mm_xor_si128( x6, x0 ) ) ) ), \ - _mm_xor_si128( \ - _mm_and_si128( x4, _mm_xor_si128( x1, x5 ) ), \ - _mm_xor_si128( _mm_and_si128( x3, x5 ), x0 ) ) ) \ - -#define F3(x6, x5, x4, x3, x2, x1, x0) \ - _mm_xor_si128( \ - _mm_and_si128( x3, \ - _mm_xor_si128( _mm_and_si128( x1, x2 ), \ - _mm_xor_si128( x6, x0 ) ) ), \ - _mm_xor_si128( _mm_xor_si128(_mm_and_si128( x1, x4 ), \ - _mm_and_si128( x2, x5 ) ), x0 ) ) - -#define F4(x6, x5, x4, x3, x2, x1, x0) \ - _mm_xor_si128( \ - _mm_xor_si128( \ - _mm_and_si128( x3, \ - _mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x2 ), \ - _mm_or_si128( x4, x6 ) ), x5 ) ), \ - _mm_and_si128( x4, \ - _mm_xor_si128( _mm_xor_si128( _mm_and_si128( mm128_not(x2), x5 ), \ - _mm_xor_si128( x1, x6 ) ), x0 ) ) ), \ - _mm_xor_si128( _mm_and_si128( x2, x6 ), x0 ) ) - - -#define F5(x6, x5, x4, x3, x2, x1, x0) \ - _mm_xor_si128( \ - _mm_and_si128( x0, \ - mm128_not( _mm_xor_si128( \ - _mm_and_si128( _mm_and_si128( x1, x2 ), x3 ), x5 ) ) ), \ - _mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x4 ), \ - _mm_and_si128( x2, x5 ) ), \ - _mm_and_si128( x3, x6 ) ) ) - -/* - * The macros below integrate the phi() permutations, depending on the - * pass and the total number of 
passes. - */ - -#define FP3_1(x6, x5, x4, x3, x2, x1, x0) \ - F1(x1, x0, x3, x5, x6, x2, x4) -#define FP3_2(x6, x5, x4, x3, x2, x1, x0) \ - F2(x4, x2, x1, x0, x5, x3, x6) -#define FP3_3(x6, x5, x4, x3, x2, x1, x0) \ - F3(x6, x1, x2, x3, x4, x5, x0) - -#define FP4_1(x6, x5, x4, x3, x2, x1, x0) \ - F1(x2, x6, x1, x4, x5, x3, x0) -#define FP4_2(x6, x5, x4, x3, x2, x1, x0) \ - F2(x3, x5, x2, x0, x1, x6, x4) -#define FP4_3(x6, x5, x4, x3, x2, x1, x0) \ - F3(x1, x4, x3, x6, x0, x2, x5) -#define FP4_4(x6, x5, x4, x3, x2, x1, x0) \ - F4(x6, x4, x0, x5, x2, x1, x3) - -#define FP5_1(x6, x5, x4, x3, x2, x1, x0) \ - F1(x3, x4, x1, x0, x5, x2, x6) -#define FP5_2(x6, x5, x4, x3, x2, x1, x0) \ - F2(x6, x2, x1, x0, x3, x4, x5) -#define FP5_3(x6, x5, x4, x3, x2, x1, x0) \ - F3(x2, x6, x0, x4, x3, x1, x5) -#define FP5_4(x6, x5, x4, x3, x2, x1, x0) \ - F4(x1, x5, x3, x2, x0, x4, x6) -#define FP5_5(x6, x5, x4, x3, x2, x1, x0) \ - F5(x2, x5, x0, x6, x4, x3, x1) - -/* - * One step, for "n" passes, pass number "p" (1 <= p <= n), using - * input word number "w" and step constant "c". - */ -#define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \ -do { \ - __m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \ - x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \ - mm128_ror_32( x7, 11 ) ), \ - _mm_add_epi32( w, _mm_set1_epi32( c ) ) ); \ -} while (0) - -/* - * PASSy(n, in) computes pass number "y", for a total of "n", using the - * one-argument macro "in" to access input words. Current state is assumed - * to be held in variables "s0" to "s7". 
- */ - -//#if SPH_SMALL_FOOTPRINT_HAVAL - -#define PASS1(n, in) do { \ - unsigned pass_count; \ - for (pass_count = 0; pass_count < 32; pass_count += 8) { \ - STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \ - in(pass_count + 0), SPH_C32(0x00000000)); \ - STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \ - in(pass_count + 1), SPH_C32(0x00000000)); \ - STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \ - in(pass_count + 2), SPH_C32(0x00000000)); \ - STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \ - in(pass_count + 3), SPH_C32(0x00000000)); \ - STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \ - in(pass_count + 4), SPH_C32(0x00000000)); \ - STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \ - in(pass_count + 5), SPH_C32(0x00000000)); \ - STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \ - in(pass_count + 6), SPH_C32(0x00000000)); \ - STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \ - in(pass_count + 7), SPH_C32(0x00000000)); \ - } \ - } while (0) - -#define PASSG(p, n, in) do { \ - unsigned pass_count; \ - for (pass_count = 0; pass_count < 32; pass_count += 8) { \ - STEP(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \ - in(MP ## p[pass_count + 0]), \ - RK ## p[pass_count + 0]); \ - STEP(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \ - in(MP ## p[pass_count + 1]), \ - RK ## p[pass_count + 1]); \ - STEP(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \ - in(MP ## p[pass_count + 2]), \ - RK ## p[pass_count + 2]); \ - STEP(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \ - in(MP ## p[pass_count + 3]), \ - RK ## p[pass_count + 3]); \ - STEP(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \ - in(MP ## p[pass_count + 4]), \ - RK ## p[pass_count + 4]); \ - STEP(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \ - in(MP ## p[pass_count + 5]), \ - RK ## p[pass_count + 5]); \ - STEP(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \ - in(MP ## p[pass_count + 6]), \ - RK ## p[pass_count + 6]); \ - STEP(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \ - in(MP ## p[pass_count + 7]), \ - RK ## p[pass_count + 7]); \ - } \ - } while (0) - -#define PASS2(n, in) PASSG(2, n, in) -#define PASS3(n, 
in) PASSG(3, n, in) -#define PASS4(n, in) PASSG(4, n, in) -#define PASS5(n, in) PASSG(5, n, in) - -static const unsigned MP2[32] = { - 5, 14, 26, 18, 11, 28, 7, 16, - 0, 23, 20, 22, 1, 10, 4, 8, - 30, 3, 21, 9, 17, 24, 29, 6, - 19, 12, 15, 13, 2, 25, 31, 27 -}; - -static const unsigned MP3[32] = { - 19, 9, 4, 20, 28, 17, 8, 22, - 29, 14, 25, 12, 24, 30, 16, 26, - 31, 15, 7, 3, 1, 0, 18, 27, - 13, 6, 21, 10, 23, 11, 5, 2 -}; - -static const unsigned MP4[32] = { - 24, 4, 0, 14, 2, 7, 28, 23, - 26, 6, 30, 20, 18, 25, 19, 3, - 22, 11, 31, 21, 8, 27, 12, 9, - 1, 29, 5, 15, 17, 10, 16, 13 -}; - -static const unsigned MP5[32] = { - 27, 3, 21, 26, 17, 11, 20, 29, - 19, 0, 12, 7, 13, 8, 31, 10, - 5, 9, 14, 30, 18, 6, 28, 24, - 2, 23, 16, 22, 4, 1, 25, 15 -}; - -static const sph_u32 RK2[32] = { - SPH_C32(0x452821E6), SPH_C32(0x38D01377), - SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C), - SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD), - SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917), - SPH_C32(0x9216D5D9), SPH_C32(0x8979FB1B), - SPH_C32(0xD1310BA6), SPH_C32(0x98DFB5AC), - SPH_C32(0x2FFD72DB), SPH_C32(0xD01ADFB7), - SPH_C32(0xB8E1AFED), SPH_C32(0x6A267E96), - SPH_C32(0xBA7C9045), SPH_C32(0xF12C7F99), - SPH_C32(0x24A19947), SPH_C32(0xB3916CF7), - SPH_C32(0x0801F2E2), SPH_C32(0x858EFC16), - SPH_C32(0x636920D8), SPH_C32(0x71574E69), - SPH_C32(0xA458FEA3), SPH_C32(0xF4933D7E), - SPH_C32(0x0D95748F), SPH_C32(0x728EB658), - SPH_C32(0x718BCD58), SPH_C32(0x82154AEE), - SPH_C32(0x7B54A41D), SPH_C32(0xC25A59B5) -}; - -static const sph_u32 RK3[32] = { - SPH_C32(0x9C30D539), SPH_C32(0x2AF26013), - SPH_C32(0xC5D1B023), SPH_C32(0x286085F0), - SPH_C32(0xCA417918), SPH_C32(0xB8DB38EF), - SPH_C32(0x8E79DCB0), SPH_C32(0x603A180E), - SPH_C32(0x6C9E0E8B), SPH_C32(0xB01E8A3E), - SPH_C32(0xD71577C1), SPH_C32(0xBD314B27), - SPH_C32(0x78AF2FDA), SPH_C32(0x55605C60), - SPH_C32(0xE65525F3), SPH_C32(0xAA55AB94), - SPH_C32(0x57489862), SPH_C32(0x63E81440), - SPH_C32(0x55CA396A), SPH_C32(0x2AAB10B6), - 
SPH_C32(0xB4CC5C34), SPH_C32(0x1141E8CE), - SPH_C32(0xA15486AF), SPH_C32(0x7C72E993), - SPH_C32(0xB3EE1411), SPH_C32(0x636FBC2A), - SPH_C32(0x2BA9C55D), SPH_C32(0x741831F6), - SPH_C32(0xCE5C3E16), SPH_C32(0x9B87931E), - SPH_C32(0xAFD6BA33), SPH_C32(0x6C24CF5C) -}; - -static const sph_u32 RK4[32] = { - SPH_C32(0x7A325381), SPH_C32(0x28958677), - SPH_C32(0x3B8F4898), SPH_C32(0x6B4BB9AF), - SPH_C32(0xC4BFE81B), SPH_C32(0x66282193), - SPH_C32(0x61D809CC), SPH_C32(0xFB21A991), - SPH_C32(0x487CAC60), SPH_C32(0x5DEC8032), - SPH_C32(0xEF845D5D), SPH_C32(0xE98575B1), - SPH_C32(0xDC262302), SPH_C32(0xEB651B88), - SPH_C32(0x23893E81), SPH_C32(0xD396ACC5), - SPH_C32(0x0F6D6FF3), SPH_C32(0x83F44239), - SPH_C32(0x2E0B4482), SPH_C32(0xA4842004), - SPH_C32(0x69C8F04A), SPH_C32(0x9E1F9B5E), - SPH_C32(0x21C66842), SPH_C32(0xF6E96C9A), - SPH_C32(0x670C9C61), SPH_C32(0xABD388F0), - SPH_C32(0x6A51A0D2), SPH_C32(0xD8542F68), - SPH_C32(0x960FA728), SPH_C32(0xAB5133A3), - SPH_C32(0x6EEF0B6C), SPH_C32(0x137A3BE4) -}; - -static const sph_u32 RK5[32] = { - SPH_C32(0xBA3BF050), SPH_C32(0x7EFB2A98), - SPH_C32(0xA1F1651D), SPH_C32(0x39AF0176), - SPH_C32(0x66CA593E), SPH_C32(0x82430E88), - SPH_C32(0x8CEE8619), SPH_C32(0x456F9FB4), - SPH_C32(0x7D84A5C3), SPH_C32(0x3B8B5EBE), - SPH_C32(0xE06F75D8), SPH_C32(0x85C12073), - SPH_C32(0x401A449F), SPH_C32(0x56C16AA6), - SPH_C32(0x4ED3AA62), SPH_C32(0x363F7706), - SPH_C32(0x1BFEDF72), SPH_C32(0x429B023D), - SPH_C32(0x37D0D724), SPH_C32(0xD00A1248), - SPH_C32(0xDB0FEAD3), SPH_C32(0x49F1C09B), - SPH_C32(0x075372C9), SPH_C32(0x80991B7B), - SPH_C32(0x25D479D8), SPH_C32(0xF6E8DEF7), - SPH_C32(0xE3FE501A), SPH_C32(0xB6794C3B), - SPH_C32(0x976CE0BD), SPH_C32(0x04C006BA), - SPH_C32(0xC1A94FB6), SPH_C32(0x409F60C4) -}; - -#define SAVE_STATE \ - __m128i u0, u1, u2, u3, u4, u5, u6, u7; \ - do { \ - u0 = s0; \ - u1 = s1; \ - u2 = s2; \ - u3 = s3; \ - u4 = s4; \ - u5 = s5; \ - u6 = s6; \ - u7 = s7; \ - } while (0) - -#define UPDATE_STATE \ -do { \ - s0 = 
_mm_add_epi32( s0, u0 ); \ - s1 = _mm_add_epi32( s1, u1 ); \ - s2 = _mm_add_epi32( s2, u2 ); \ - s3 = _mm_add_epi32( s3, u3 ); \ - s4 = _mm_add_epi32( s4, u4 ); \ - s5 = _mm_add_epi32( s5, u5 ); \ - s6 = _mm_add_epi32( s6, u6 ); \ - s7 = _mm_add_epi32( s7, u7 ); \ -} while (0) - -/* - * COREn(in) performs the core HAVAL computation for "n" passes, using - * the one-argument macro "in" to access the input words. Running state - * is held in variable "s0" to "s7". - */ -/* -#define CORE3(in) do { \ - SAVE_STATE; \ - PASS1(3, in); \ - PASS2(3, in); \ - PASS3(3, in); \ - UPDATE_STATE; \ - } while (0) - -#define CORE4(in) do { \ - SAVE_STATE; \ - PASS1(4, in); \ - PASS2(4, in); \ - PASS3(4, in); \ - PASS4(4, in); \ - UPDATE_STATE; \ - } while (0) -*/ -#define CORE5(in) do { \ - SAVE_STATE; \ - PASS1(5, in); \ - PASS2(5, in); \ - PASS3(5, in); \ - PASS4(5, in); \ - PASS5(5, in); \ - UPDATE_STATE; \ - } while (0) - -/* - * DSTATE declares the state variables "s0" to "s7". - */ -#define DSTATE __m128i s0, s1, s2, s3, s4, s5, s6, s7 - -/* - * RSTATE fills the state variables from the context "sc". - */ -#define RSTATE \ -do { \ - s0 = sc->s0; \ - s1 = sc->s1; \ - s2 = sc->s2; \ - s3 = sc->s3; \ - s4 = sc->s4; \ - s5 = sc->s5; \ - s6 = sc->s6; \ - s7 = sc->s7; \ -} while (0) - -/* - * WSTATE updates the context "sc" from the state variables. - */ -#define WSTATE \ -do { \ - sc->s0 = s0; \ - sc->s1 = s1; \ - sc->s2 = s2; \ - sc->s3 = s3; \ - sc->s4 = s4; \ - sc->s5 = s5; \ - sc->s6 = s6; \ - sc->s7 = s7; \ -} while (0) - -/* - * Initialize a context. "olen" is the output length, in 32-bit words - * (between 4 and 8, inclusive). "passes" is the number of passes - * (3, 4 or 5). 
- */ -static void -haval_4way_init( haval_4way_context *sc, unsigned olen, unsigned passes ) -{ - sc->s0 = _mm_set1_epi32( 0x243F6A88UL ); - sc->s1 = _mm_set1_epi32( 0x85A308D3UL ); - sc->s2 = _mm_set1_epi32( 0x13198A2EUL ); - sc->s3 = _mm_set1_epi32( 0x03707344UL ); - sc->s4 = _mm_set1_epi32( 0xA4093822UL ); - sc->s5 = _mm_set1_epi32( 0x299F31D0UL ); - sc->s6 = _mm_set1_epi32( 0x082EFA98UL ); - sc->s7 = _mm_set1_epi32( 0xEC4E6C89UL ); - sc->olen = olen; - sc->passes = passes; - sc->count_high = 0; - sc->count_low = 0; - -} - -#define IN_PREPARE(indata) const __m128i *const load_ptr = (indata) - -#define INW(i) load_ptr[ i ] - -/* - * Write out HAVAL output. The output length is tailored to the requested - * length. - */ -static void -haval_4way_out( haval_4way_context *sc, void *dst ) -{ - __m128i *buf = (__m128i*)dst; - DSTATE; - RSTATE; - - buf[0] = s0; - buf[1] = s1; - buf[2] = s2; - buf[3] = s3; - buf[4] = s4; - buf[5] = s5; - buf[6] = s6; - buf[7] = s7; -} - -/* - * The main core functions inline the code with the COREx() macros. We - * use a helper file, included three times, which avoids code copying. 
- */ -/* -#undef PASSES -#define PASSES 3 -#include "haval-helper.c" - -#undef PASSES -#define PASSES 4 -#include "haval-helper.c" -*/ - -#undef PASSES -#define PASSES 5 -#include "haval-4way-helper.c" - -/* ====================================================================== */ - -#define API(xxx, y) \ -void \ -haval ## xxx ## _ ## y ## _4way_init(void *cc) \ -{ \ - haval_4way_init(cc, xxx >> 5, y); \ -} \ - \ -void \ -haval ## xxx ## _ ## y ## _4way (void *cc, const void *data, size_t len) \ -{ \ - haval ## y ## _4way(cc, data, len); \ -} \ - \ -void \ -haval ## xxx ## _ ## y ## _4way_close(void *cc, void *dst) \ -{ \ - haval ## y ## _4way_close(cc, dst); \ -} \ - -API(256, 5) - -#define RVAL \ -do { \ - s0 = val[0]; \ - s1 = val[1]; \ - s2 = val[2]; \ - s3 = val[3]; \ - s4 = val[4]; \ - s5 = val[5]; \ - s6 = val[6]; \ - s7 = val[7]; \ -} while (0) - -#define WVAL \ -do { \ - val[0] = s0; \ - val[1] = s1; \ - val[2] = s2; \ - val[3] = s3; \ - val[4] = s4; \ - val[5] = s5; \ - val[6] = s6; \ - val[7] = s7; \ -} while (0) - -#define INMSG(i) msg[i] - -#ifdef __cplusplus -} -#endif -#endif diff --git a/algo/haval/haval-hash-4way.h b/algo/haval/haval-hash-4way.h deleted file mode 100644 index 47338ce..0000000 --- a/algo/haval/haval-hash-4way.h +++ /dev/null @@ -1,95 +0,0 @@ -/* $Id: sph_haval.h 218 2010-06-08 17:06:34Z tp $ */ -/** - * HAVAL interface. - * - * HAVAL is actually a family of 15 hash functions, depending on whether - * the internal computation uses 3, 4 or 5 passes, and on the output - * length, which is 128, 160, 192, 224 or 256 bits. This implementation - * provides interface functions for all 15, which internally map to - * three cores (depending on the number of passes). Note that output - * lengths other than 256 bits are not obtained by a simple truncation - * of a longer result; the requested length is encoded within the - * padding data. 
- * - * HAVAL was published in: Yuliang Zheng, Josef Pieprzyk and Jennifer - * Seberry: "HAVAL -- a one-way hashing algorithm with variable length - * of output", Advances in Cryptology -- AUSCRYPT'92, Lecture Notes in - * Computer Science, Vol.718, pp.83-104, Springer-Verlag, 1993. - * - * This paper, and a reference implementation, are available on the - * Calyptix web site: http://labs.calyptix.com/haval.php - * - * The HAVAL reference paper is quite unclear on the data encoding - * details, i.e. endianness (both byte order within a 32-bit word, and - * word order within a message block). This implementation has been - * made compatible with the reference implementation referenced above. - * - * @warning A collision for HAVAL-128/3 (HAVAL with three passes and - * 128-bit output) has been published; this function is thus considered - * as cryptographically broken. The status for other variants is unclear; - * use only with care. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_haval.h - * @author Thomas Pornin - */ - -#ifndef HAVAL_HASH_4WAY_H__ -#define HAVAL_HASH_4WAY_H__ - -#if defined(__AVX__) - -#ifdef __cplusplus -extern "C"{ -#endif - -#include -#include "algo/sha/sph_types.h" -#include "simd-utils.h" - -#define SPH_SIZE_haval256_5 256 - -typedef struct { - __m128i buf[32]; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - unsigned olen, passes; - sph_u32 count_high, count_low; -} haval_4way_context; - -typedef haval_4way_context haval256_5_4way_context; - -void haval256_5_4way_init( void *cc ); - -void haval256_5_4way( void *cc, const void *data, size_t len ); - -void haval256_5_4way_close( void *cc, void *dst ); - -#ifdef __cplusplus -} -#endif -#endif -#endif diff --git a/algo/haval/haval-helper.c b/algo/haval/haval-helper.c deleted file mode 100644 index d12f7d6..0000000 --- a/algo/haval/haval-helper.c +++ /dev/null @@ -1,190 +0,0 @@ -/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */ -/* - * Helper code, included (three times !) by HAVAL implementation. - * - * TODO: try to merge this with md_helper.c. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#undef SPH_XCAT -#define SPH_XCAT(a, b) SPH_XCAT_(a, b) -#undef SPH_XCAT_ -#define SPH_XCAT_(a, b) a ## b - -static void -#ifdef SPH_UPTR -SPH_XCAT(SPH_XCAT(haval, PASSES), _short) -#else -SPH_XCAT(haval, PASSES) -#endif -(sph_haval_context *sc, const void *data, size_t len) -{ - unsigned current; - -#if SPH_64 - current = (unsigned)sc->count & 127U; -#else - current = (unsigned)sc->count_low & 127U; -#endif - while (len > 0) { - unsigned clen; -#if !SPH_64 - sph_u32 clow, clow2; -#endif - - clen = 128U - current; - if (clen > len) - clen = len; - memcpy(sc->buf + current, data, clen); - data = (const unsigned char *)data + clen; - current += clen; - len -= clen; - if (current == 128U) { - DSTATE; - IN_PREPARE(sc->buf); - RSTATE; - SPH_XCAT(CORE, PASSES)(INW); - WSTATE; - current = 0; - } -#if SPH_64 - sc->count += clen; -#else - clow = sc->count_low; - clow2 = SPH_T32(clow + clen); - sc->count_low = clow2; - if (clow2 < clow) - sc->count_high ++; -#endif - } -} - -#ifdef SPH_UPTR -static void -SPH_XCAT(haval, PASSES)(sph_haval_context *sc, const void *data, size_t len) -{ - unsigned current; - size_t orig_len; -#if !SPH_64 - sph_u32 clow, clow2; -#endif - DSTATE; - - if (len < 256U) { - SPH_XCAT(SPH_XCAT(haval, PASSES), _short)(sc, data, len); - return; - } -#if SPH_64 - current = (unsigned)sc->count & 127U; -#else - current = (unsigned)sc->count_low & 127U; -#endif - if (current > 0) { - unsigned clen; - clen = 128U - current; - SPH_XCAT(SPH_XCAT(haval, PASSES), _short)(sc, data, clen); - data = (const unsigned char *)data + clen; - len -= clen; - } -#if !SPH_UNALIGNED - if (((SPH_UPTR)data & 3U) != 0) { - SPH_XCAT(SPH_XCAT(haval, PASSES), _short)(sc, data, len); - return; - } -#endif - orig_len = len; - RSTATE; - while (len >= 128U) { - IN_PREPARE(data); - SPH_XCAT(CORE, PASSES)(INW); - data = (const unsigned char *)data + 128U; - len -= 128U; - } 
- WSTATE; - if (len > 0) - memcpy(sc->buf, data, len); -#if SPH_64 - sc->count += (sph_u64)orig_len; -#else - clow = sc->count_low; - clow2 = SPH_T32(clow + orig_len); - sc->count_low = clow2; - if (clow2 < clow) - sc->count_high ++; - orig_len >>= 12; - orig_len >>= 10; - orig_len >>= 10; - sc->count_high += orig_len; -#endif -} -#endif - -static void -SPH_XCAT(SPH_XCAT(haval, PASSES), _close)(sph_haval_context *sc, - unsigned ub, unsigned n, void *dst) -{ - unsigned current; - DSTATE; - -#if SPH_64 - current = (unsigned)sc->count & 127U; -#else - current = (unsigned)sc->count_low & 127U; -#endif - sc->buf[current ++] = (0x01 << n) | ((ub & 0xFF) >> (8 - n)); - RSTATE; - if (current > 118U) { - memset(sc->buf + current, 0, 128U - current); - - do { - IN_PREPARE(sc->buf); - SPH_XCAT(CORE, PASSES)(INW); - } while (0); - current = 0; - } - memset(sc->buf + current, 0, 118U - current); - sc->buf[118] = 0x01 | (PASSES << 3); - sc->buf[119] = sc->olen << 3; -#if SPH_64 - sph_enc64le_aligned(sc->buf + 120, SPH_T64(sc->count << 3)); -#else - sph_enc32le_aligned(sc->buf + 120, SPH_T32(sc->count_low << 3)); - sph_enc32le_aligned(sc->buf + 124, - SPH_T32((sc->count_high << 3) | (sc->count_low >> 29))); -#endif - - do { - IN_PREPARE(sc->buf); - SPH_XCAT(CORE, PASSES)(INW); - } while (0); - WSTATE; - - haval_out(sc, dst); -// haval_init(sc, sc->olen, sc->passes); -} diff --git a/algo/haval/haval.c b/algo/haval/haval.c deleted file mode 100644 index 269005f..0000000 --- a/algo/haval/haval.c +++ /dev/null @@ -1,983 +0,0 @@ -/* $Id: haval.c 227 2010-06-16 17:28:38Z tp $ */ -/* - * HAVAL implementation. - * - * The HAVAL reference paper is of questionable clarity with regards to - * some details such as endianness of bits within a byte, bytes within - * a 32-bit word, or the actual ordering of words within a stream of - * words. 
This implementation has been made compatible with the reference - * implementation available on: http://labs.calyptix.com/haval.php - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include -#include - -#include "sph-haval.h" - -#ifdef __cplusplus -extern "C"{ -#endif - -#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_HAVAL -#define SPH_SMALL_FOOTPRINT_HAVAL 1 -#endif - -/* - * Basic definition from the reference paper. 
- * -#define F1(x6, x5, x4, x3, x2, x1, x0) \ - (((x1) & (x4)) ^ ((x2) & (x5)) ^ ((x3) & (x6)) ^ ((x0) & (x1)) ^ (x0)) - * - */ - -#define F1(x6, x5, x4, x3, x2, x1, x0) \ - (((x1) & ((x0) ^ (x4))) ^ ((x2) & (x5)) ^ ((x3) & (x6)) ^ (x0)) - -/* - * Basic definition from the reference paper. - * -#define F2(x6, x5, x4, x3, x2, x1, x0) \ - (((x1) & (x2) & (x3)) ^ ((x2) & (x4) & (x5)) ^ ((x1) & (x2)) \ - ^ ((x1) & (x4)) ^ ((x2) & (x6)) ^ ((x3) & (x5)) \ - ^ ((x4) & (x5)) ^ ((x0) & (x2)) ^ (x0)) - * - */ - -#define F2(x6, x5, x4, x3, x2, x1, x0) \ - (((x2) & (((x1) & ~(x3)) ^ ((x4) & (x5)) ^ (x6) ^ (x0))) \ - ^ ((x4) & ((x1) ^ (x5))) ^ ((x3 & (x5)) ^ (x0))) - -/* - * Basic definition from the reference paper. - * -#define F3(x6, x5, x4, x3, x2, x1, x0) \ - (((x1) & (x2) & (x3)) ^ ((x1) & (x4)) ^ ((x2) & (x5)) \ - ^ ((x3) & (x6)) ^ ((x0) & (x3)) ^ (x0)) - * - */ - -#define F3(x6, x5, x4, x3, x2, x1, x0) \ - (((x3) & (((x1) & (x2)) ^ (x6) ^ (x0))) \ - ^ ((x1) & (x4)) ^ ((x2) & (x5)) ^ (x0)) - -/* - * Basic definition from the reference paper. - * -#define F4(x6, x5, x4, x3, x2, x1, x0) \ - (((x1) & (x2) & (x3)) ^ ((x2) & (x4) & (x5)) ^ ((x3) & (x4) & (x6)) \ - ^ ((x1) & (x4)) ^ ((x2) & (x6)) ^ ((x3) & (x4)) ^ ((x3) & (x5)) \ - ^ ((x3) & (x6)) ^ ((x4) & (x5)) ^ ((x4) & (x6)) ^ ((x0) & (x4)) ^ (x0)) - * - */ - -#define F4(x6, x5, x4, x3, x2, x1, x0) \ - (((x3) & (((x1) & (x2)) ^ ((x4) | (x6)) ^ (x5))) \ - ^ ((x4) & ((~(x2) & (x5)) ^ (x1) ^ (x6) ^ (x0))) \ - ^ ((x2) & (x6)) ^ (x0)) - -/* - * Basic definition from the reference paper. - * -#define F5(x6, x5, x4, x3, x2, x1, x0) \ - (((x1) & (x4)) ^ ((x2) & (x5)) ^ ((x3) & (x6)) \ - ^ ((x0) & (x1) & (x2) & (x3)) ^ ((x0) & (x5)) ^ (x0)) - * - */ - -#define F5(x6, x5, x4, x3, x2, x1, x0) \ - (((x0) & ~(((x1) & (x2) & (x3)) ^ (x5))) \ - ^ ((x1) & (x4)) ^ ((x2) & (x5)) ^ ((x3) & (x6))) - -/* - * The macros below integrate the phi() permutations, depending on the - * pass and the total number of passes. 
- */ - -#define FP3_1(x6, x5, x4, x3, x2, x1, x0) \ - F1(x1, x0, x3, x5, x6, x2, x4) -#define FP3_2(x6, x5, x4, x3, x2, x1, x0) \ - F2(x4, x2, x1, x0, x5, x3, x6) -#define FP3_3(x6, x5, x4, x3, x2, x1, x0) \ - F3(x6, x1, x2, x3, x4, x5, x0) - -#define FP4_1(x6, x5, x4, x3, x2, x1, x0) \ - F1(x2, x6, x1, x4, x5, x3, x0) -#define FP4_2(x6, x5, x4, x3, x2, x1, x0) \ - F2(x3, x5, x2, x0, x1, x6, x4) -#define FP4_3(x6, x5, x4, x3, x2, x1, x0) \ - F3(x1, x4, x3, x6, x0, x2, x5) -#define FP4_4(x6, x5, x4, x3, x2, x1, x0) \ - F4(x6, x4, x0, x5, x2, x1, x3) - -#define FP5_1(x6, x5, x4, x3, x2, x1, x0) \ - F1(x3, x4, x1, x0, x5, x2, x6) -#define FP5_2(x6, x5, x4, x3, x2, x1, x0) \ - F2(x6, x2, x1, x0, x3, x4, x5) -#define FP5_3(x6, x5, x4, x3, x2, x1, x0) \ - F3(x2, x6, x0, x4, x3, x1, x5) -#define FP5_4(x6, x5, x4, x3, x2, x1, x0) \ - F4(x1, x5, x3, x2, x0, x4, x6) -#define FP5_5(x6, x5, x4, x3, x2, x1, x0) \ - F5(x2, x5, x0, x6, x4, x3, x1) - -/* - * One step, for "n" passes, pass number "p" (1 <= p <= n), using - * input word number "w" and step constant "c". - */ -#define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) do { \ - sph_u32 t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \ - (x7) = SPH_T32(SPH_ROTR32(t, 7) + SPH_ROTR32((x7), 11) \ - + (w) + (c)); \ - } while (0) - -/* - * PASSy(n, in) computes pass number "y", for a total of "n", using the - * one-argument macro "in" to access input words. Current state is assumed - * to be held in variables "s0" to "s7". 
- */ - -#if SPH_SMALL_FOOTPRINT_HAVAL - -#define PASS1(n, in) do { \ - unsigned pass_count; \ - for (pass_count = 0; pass_count < 32; pass_count += 8) { \ - STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \ - in(pass_count + 0), SPH_C32(0x00000000)); \ - STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \ - in(pass_count + 1), SPH_C32(0x00000000)); \ - STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \ - in(pass_count + 2), SPH_C32(0x00000000)); \ - STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \ - in(pass_count + 3), SPH_C32(0x00000000)); \ - STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \ - in(pass_count + 4), SPH_C32(0x00000000)); \ - STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \ - in(pass_count + 5), SPH_C32(0x00000000)); \ - STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \ - in(pass_count + 6), SPH_C32(0x00000000)); \ - STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \ - in(pass_count + 7), SPH_C32(0x00000000)); \ - } \ - } while (0) - -#define PASSG(p, n, in) do { \ - unsigned pass_count; \ - for (pass_count = 0; pass_count < 32; pass_count += 8) { \ - STEP(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \ - in(MP ## p[pass_count + 0]), \ - RK ## p[pass_count + 0]); \ - STEP(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \ - in(MP ## p[pass_count + 1]), \ - RK ## p[pass_count + 1]); \ - STEP(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \ - in(MP ## p[pass_count + 2]), \ - RK ## p[pass_count + 2]); \ - STEP(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \ - in(MP ## p[pass_count + 3]), \ - RK ## p[pass_count + 3]); \ - STEP(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \ - in(MP ## p[pass_count + 4]), \ - RK ## p[pass_count + 4]); \ - STEP(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \ - in(MP ## p[pass_count + 5]), \ - RK ## p[pass_count + 5]); \ - STEP(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \ - in(MP ## p[pass_count + 6]), \ - RK ## p[pass_count + 6]); \ - STEP(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \ - in(MP ## p[pass_count + 7]), \ - RK ## p[pass_count + 7]); \ - } \ - } while (0) - -#define PASS2(n, in) PASSG(2, n, in) -#define PASS3(n, 
in) PASSG(3, n, in) -#define PASS4(n, in) PASSG(4, n, in) -#define PASS5(n, in) PASSG(5, n, in) - -static const unsigned MP2[32] = { - 5, 14, 26, 18, 11, 28, 7, 16, - 0, 23, 20, 22, 1, 10, 4, 8, - 30, 3, 21, 9, 17, 24, 29, 6, - 19, 12, 15, 13, 2, 25, 31, 27 -}; - -static const unsigned MP3[32] = { - 19, 9, 4, 20, 28, 17, 8, 22, - 29, 14, 25, 12, 24, 30, 16, 26, - 31, 15, 7, 3, 1, 0, 18, 27, - 13, 6, 21, 10, 23, 11, 5, 2 -}; - -static const unsigned MP4[32] = { - 24, 4, 0, 14, 2, 7, 28, 23, - 26, 6, 30, 20, 18, 25, 19, 3, - 22, 11, 31, 21, 8, 27, 12, 9, - 1, 29, 5, 15, 17, 10, 16, 13 -}; - -static const unsigned MP5[32] = { - 27, 3, 21, 26, 17, 11, 20, 29, - 19, 0, 12, 7, 13, 8, 31, 10, - 5, 9, 14, 30, 18, 6, 28, 24, - 2, 23, 16, 22, 4, 1, 25, 15 -}; - -static const sph_u32 RK2[32] = { - SPH_C32(0x452821E6), SPH_C32(0x38D01377), - SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C), - SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD), - SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917), - SPH_C32(0x9216D5D9), SPH_C32(0x8979FB1B), - SPH_C32(0xD1310BA6), SPH_C32(0x98DFB5AC), - SPH_C32(0x2FFD72DB), SPH_C32(0xD01ADFB7), - SPH_C32(0xB8E1AFED), SPH_C32(0x6A267E96), - SPH_C32(0xBA7C9045), SPH_C32(0xF12C7F99), - SPH_C32(0x24A19947), SPH_C32(0xB3916CF7), - SPH_C32(0x0801F2E2), SPH_C32(0x858EFC16), - SPH_C32(0x636920D8), SPH_C32(0x71574E69), - SPH_C32(0xA458FEA3), SPH_C32(0xF4933D7E), - SPH_C32(0x0D95748F), SPH_C32(0x728EB658), - SPH_C32(0x718BCD58), SPH_C32(0x82154AEE), - SPH_C32(0x7B54A41D), SPH_C32(0xC25A59B5) -}; - -static const sph_u32 RK3[32] = { - SPH_C32(0x9C30D539), SPH_C32(0x2AF26013), - SPH_C32(0xC5D1B023), SPH_C32(0x286085F0), - SPH_C32(0xCA417918), SPH_C32(0xB8DB38EF), - SPH_C32(0x8E79DCB0), SPH_C32(0x603A180E), - SPH_C32(0x6C9E0E8B), SPH_C32(0xB01E8A3E), - SPH_C32(0xD71577C1), SPH_C32(0xBD314B27), - SPH_C32(0x78AF2FDA), SPH_C32(0x55605C60), - SPH_C32(0xE65525F3), SPH_C32(0xAA55AB94), - SPH_C32(0x57489862), SPH_C32(0x63E81440), - SPH_C32(0x55CA396A), SPH_C32(0x2AAB10B6), - 
SPH_C32(0xB4CC5C34), SPH_C32(0x1141E8CE), - SPH_C32(0xA15486AF), SPH_C32(0x7C72E993), - SPH_C32(0xB3EE1411), SPH_C32(0x636FBC2A), - SPH_C32(0x2BA9C55D), SPH_C32(0x741831F6), - SPH_C32(0xCE5C3E16), SPH_C32(0x9B87931E), - SPH_C32(0xAFD6BA33), SPH_C32(0x6C24CF5C) -}; - -static const sph_u32 RK4[32] = { - SPH_C32(0x7A325381), SPH_C32(0x28958677), - SPH_C32(0x3B8F4898), SPH_C32(0x6B4BB9AF), - SPH_C32(0xC4BFE81B), SPH_C32(0x66282193), - SPH_C32(0x61D809CC), SPH_C32(0xFB21A991), - SPH_C32(0x487CAC60), SPH_C32(0x5DEC8032), - SPH_C32(0xEF845D5D), SPH_C32(0xE98575B1), - SPH_C32(0xDC262302), SPH_C32(0xEB651B88), - SPH_C32(0x23893E81), SPH_C32(0xD396ACC5), - SPH_C32(0x0F6D6FF3), SPH_C32(0x83F44239), - SPH_C32(0x2E0B4482), SPH_C32(0xA4842004), - SPH_C32(0x69C8F04A), SPH_C32(0x9E1F9B5E), - SPH_C32(0x21C66842), SPH_C32(0xF6E96C9A), - SPH_C32(0x670C9C61), SPH_C32(0xABD388F0), - SPH_C32(0x6A51A0D2), SPH_C32(0xD8542F68), - SPH_C32(0x960FA728), SPH_C32(0xAB5133A3), - SPH_C32(0x6EEF0B6C), SPH_C32(0x137A3BE4) -}; - -static const sph_u32 RK5[32] = { - SPH_C32(0xBA3BF050), SPH_C32(0x7EFB2A98), - SPH_C32(0xA1F1651D), SPH_C32(0x39AF0176), - SPH_C32(0x66CA593E), SPH_C32(0x82430E88), - SPH_C32(0x8CEE8619), SPH_C32(0x456F9FB4), - SPH_C32(0x7D84A5C3), SPH_C32(0x3B8B5EBE), - SPH_C32(0xE06F75D8), SPH_C32(0x85C12073), - SPH_C32(0x401A449F), SPH_C32(0x56C16AA6), - SPH_C32(0x4ED3AA62), SPH_C32(0x363F7706), - SPH_C32(0x1BFEDF72), SPH_C32(0x429B023D), - SPH_C32(0x37D0D724), SPH_C32(0xD00A1248), - SPH_C32(0xDB0FEAD3), SPH_C32(0x49F1C09B), - SPH_C32(0x075372C9), SPH_C32(0x80991B7B), - SPH_C32(0x25D479D8), SPH_C32(0xF6E8DEF7), - SPH_C32(0xE3FE501A), SPH_C32(0xB6794C3B), - SPH_C32(0x976CE0BD), SPH_C32(0x04C006BA), - SPH_C32(0xC1A94FB6), SPH_C32(0x409F60C4) -}; - -#else - -#define PASS1(n, in) do { \ - STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in( 0), SPH_C32(0x00000000)); \ - STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in( 1), SPH_C32(0x00000000)); \ - STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in( 2), 
SPH_C32(0x00000000)); \ - STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in( 3), SPH_C32(0x00000000)); \ - STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in( 4), SPH_C32(0x00000000)); \ - STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in( 5), SPH_C32(0x00000000)); \ - STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in( 6), SPH_C32(0x00000000)); \ - STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in( 7), SPH_C32(0x00000000)); \ - \ - STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in( 8), SPH_C32(0x00000000)); \ - STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in( 9), SPH_C32(0x00000000)); \ - STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in(10), SPH_C32(0x00000000)); \ - STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in(11), SPH_C32(0x00000000)); \ - STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in(12), SPH_C32(0x00000000)); \ - STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in(13), SPH_C32(0x00000000)); \ - STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in(14), SPH_C32(0x00000000)); \ - STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in(15), SPH_C32(0x00000000)); \ - \ - STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in(16), SPH_C32(0x00000000)); \ - STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in(17), SPH_C32(0x00000000)); \ - STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in(18), SPH_C32(0x00000000)); \ - STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in(19), SPH_C32(0x00000000)); \ - STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in(20), SPH_C32(0x00000000)); \ - STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in(21), SPH_C32(0x00000000)); \ - STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in(22), SPH_C32(0x00000000)); \ - STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in(23), SPH_C32(0x00000000)); \ - \ - STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in(24), SPH_C32(0x00000000)); \ - STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in(25), SPH_C32(0x00000000)); \ - STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in(26), SPH_C32(0x00000000)); \ - STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in(27), SPH_C32(0x00000000)); \ - STEP(n, 1, s3, s2, s1, s0, s7, s6, 
s5, s4, in(28), SPH_C32(0x00000000)); \ - STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in(29), SPH_C32(0x00000000)); \ - STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in(30), SPH_C32(0x00000000)); \ - STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in(31), SPH_C32(0x00000000)); \ - } while (0) - -#define PASS2(n, in) do { \ - STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in( 5), SPH_C32(0x452821E6)); \ - STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in(14), SPH_C32(0x38D01377)); \ - STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in(26), SPH_C32(0xBE5466CF)); \ - STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in(18), SPH_C32(0x34E90C6C)); \ - STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in(11), SPH_C32(0xC0AC29B7)); \ - STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in(28), SPH_C32(0xC97C50DD)); \ - STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in( 7), SPH_C32(0x3F84D5B5)); \ - STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in(16), SPH_C32(0xB5470917)); \ - \ - STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in( 0), SPH_C32(0x9216D5D9)); \ - STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in(23), SPH_C32(0x8979FB1B)); \ - STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in(20), SPH_C32(0xD1310BA6)); \ - STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in(22), SPH_C32(0x98DFB5AC)); \ - STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in( 1), SPH_C32(0x2FFD72DB)); \ - STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in(10), SPH_C32(0xD01ADFB7)); \ - STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in( 4), SPH_C32(0xB8E1AFED)); \ - STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in( 8), SPH_C32(0x6A267E96)); \ - \ - STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in(30), SPH_C32(0xBA7C9045)); \ - STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in( 3), SPH_C32(0xF12C7F99)); \ - STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in(21), SPH_C32(0x24A19947)); \ - STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in( 9), SPH_C32(0xB3916CF7)); \ - STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in(17), SPH_C32(0x0801F2E2)); \ - STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in(24), 
SPH_C32(0x858EFC16)); \ - STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in(29), SPH_C32(0x636920D8)); \ - STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in( 6), SPH_C32(0x71574E69)); \ - \ - STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in(19), SPH_C32(0xA458FEA3)); \ - STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in(12), SPH_C32(0xF4933D7E)); \ - STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in(15), SPH_C32(0x0D95748F)); \ - STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in(13), SPH_C32(0x728EB658)); \ - STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in( 2), SPH_C32(0x718BCD58)); \ - STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in(25), SPH_C32(0x82154AEE)); \ - STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in(31), SPH_C32(0x7B54A41D)); \ - STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in(27), SPH_C32(0xC25A59B5)); \ - } while (0) - -#define PASS3(n, in) do { \ - STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in(19), SPH_C32(0x9C30D539)); \ - STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in( 9), SPH_C32(0x2AF26013)); \ - STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in( 4), SPH_C32(0xC5D1B023)); \ - STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in(20), SPH_C32(0x286085F0)); \ - STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in(28), SPH_C32(0xCA417918)); \ - STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in(17), SPH_C32(0xB8DB38EF)); \ - STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in( 8), SPH_C32(0x8E79DCB0)); \ - STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in(22), SPH_C32(0x603A180E)); \ - \ - STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in(29), SPH_C32(0x6C9E0E8B)); \ - STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in(14), SPH_C32(0xB01E8A3E)); \ - STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in(25), SPH_C32(0xD71577C1)); \ - STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in(12), SPH_C32(0xBD314B27)); \ - STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in(24), SPH_C32(0x78AF2FDA)); \ - STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in(30), SPH_C32(0x55605C60)); \ - STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in(16), SPH_C32(0xE65525F3)); 
\ - STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in(26), SPH_C32(0xAA55AB94)); \ - \ - STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in(31), SPH_C32(0x57489862)); \ - STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in(15), SPH_C32(0x63E81440)); \ - STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in( 7), SPH_C32(0x55CA396A)); \ - STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in( 3), SPH_C32(0x2AAB10B6)); \ - STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in( 1), SPH_C32(0xB4CC5C34)); \ - STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in( 0), SPH_C32(0x1141E8CE)); \ - STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in(18), SPH_C32(0xA15486AF)); \ - STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in(27), SPH_C32(0x7C72E993)); \ - \ - STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in(13), SPH_C32(0xB3EE1411)); \ - STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in( 6), SPH_C32(0x636FBC2A)); \ - STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in(21), SPH_C32(0x2BA9C55D)); \ - STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in(10), SPH_C32(0x741831F6)); \ - STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in(23), SPH_C32(0xCE5C3E16)); \ - STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in(11), SPH_C32(0x9B87931E)); \ - STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in( 5), SPH_C32(0xAFD6BA33)); \ - STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in( 2), SPH_C32(0x6C24CF5C)); \ - } while (0) - -#define PASS4(n, in) do { \ - STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in(24), SPH_C32(0x7A325381)); \ - STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in( 4), SPH_C32(0x28958677)); \ - STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in( 0), SPH_C32(0x3B8F4898)); \ - STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in(14), SPH_C32(0x6B4BB9AF)); \ - STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in( 2), SPH_C32(0xC4BFE81B)); \ - STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in( 7), SPH_C32(0x66282193)); \ - STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in(28), SPH_C32(0x61D809CC)); \ - STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in(23), SPH_C32(0xFB21A991)); \ - \ - STEP(n, 4, 
s7, s6, s5, s4, s3, s2, s1, s0, in(26), SPH_C32(0x487CAC60)); \ - STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in( 6), SPH_C32(0x5DEC8032)); \ - STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in(30), SPH_C32(0xEF845D5D)); \ - STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in(20), SPH_C32(0xE98575B1)); \ - STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in(18), SPH_C32(0xDC262302)); \ - STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in(25), SPH_C32(0xEB651B88)); \ - STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in(19), SPH_C32(0x23893E81)); \ - STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in( 3), SPH_C32(0xD396ACC5)); \ - \ - STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in(22), SPH_C32(0x0F6D6FF3)); \ - STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in(11), SPH_C32(0x83F44239)); \ - STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in(31), SPH_C32(0x2E0B4482)); \ - STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in(21), SPH_C32(0xA4842004)); \ - STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in( 8), SPH_C32(0x69C8F04A)); \ - STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in(27), SPH_C32(0x9E1F9B5E)); \ - STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in(12), SPH_C32(0x21C66842)); \ - STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in( 9), SPH_C32(0xF6E96C9A)); \ - \ - STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in( 1), SPH_C32(0x670C9C61)); \ - STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in(29), SPH_C32(0xABD388F0)); \ - STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in( 5), SPH_C32(0x6A51A0D2)); \ - STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in(15), SPH_C32(0xD8542F68)); \ - STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in(17), SPH_C32(0x960FA728)); \ - STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in(10), SPH_C32(0xAB5133A3)); \ - STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in(16), SPH_C32(0x6EEF0B6C)); \ - STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in(13), SPH_C32(0x137A3BE4)); \ - } while (0) - -#define PASS5(n, in) do { \ - STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in(27), SPH_C32(0xBA3BF050)); \ - STEP(n, 5, s6, s5, s4, s3, s2, 
s1, s0, s7, in( 3), SPH_C32(0x7EFB2A98)); \ - STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in(21), SPH_C32(0xA1F1651D)); \ - STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in(26), SPH_C32(0x39AF0176)); \ - STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in(17), SPH_C32(0x66CA593E)); \ - STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in(11), SPH_C32(0x82430E88)); \ - STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in(20), SPH_C32(0x8CEE8619)); \ - STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in(29), SPH_C32(0x456F9FB4)); \ - \ - STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in(19), SPH_C32(0x7D84A5C3)); \ - STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in( 0), SPH_C32(0x3B8B5EBE)); \ - STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in(12), SPH_C32(0xE06F75D8)); \ - STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in( 7), SPH_C32(0x85C12073)); \ - STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in(13), SPH_C32(0x401A449F)); \ - STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in( 8), SPH_C32(0x56C16AA6)); \ - STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in(31), SPH_C32(0x4ED3AA62)); \ - STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in(10), SPH_C32(0x363F7706)); \ - \ - STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in( 5), SPH_C32(0x1BFEDF72)); \ - STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in( 9), SPH_C32(0x429B023D)); \ - STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in(14), SPH_C32(0x37D0D724)); \ - STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in(30), SPH_C32(0xD00A1248)); \ - STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in(18), SPH_C32(0xDB0FEAD3)); \ - STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in( 6), SPH_C32(0x49F1C09B)); \ - STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in(28), SPH_C32(0x075372C9)); \ - STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in(24), SPH_C32(0x80991B7B)); \ - \ - STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in( 2), SPH_C32(0x25D479D8)); \ - STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in(23), SPH_C32(0xF6E8DEF7)); \ - STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in(16), SPH_C32(0xE3FE501A)); \ - STEP(n, 5, s4, 
s3, s2, s1, s0, s7, s6, s5, in(22), SPH_C32(0xB6794C3B)); \ - STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in( 4), SPH_C32(0x976CE0BD)); \ - STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in( 1), SPH_C32(0x04C006BA)); \ - STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in(25), SPH_C32(0xC1A94FB6)); \ - STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in(15), SPH_C32(0x409F60C4)); \ - } while (0) - -#endif - -#define SAVE_STATE \ - sph_u32 u0, u1, u2, u3, u4, u5, u6, u7; \ - do { \ - u0 = s0; \ - u1 = s1; \ - u2 = s2; \ - u3 = s3; \ - u4 = s4; \ - u5 = s5; \ - u6 = s6; \ - u7 = s7; \ - } while (0) - -#define UPDATE_STATE do { \ - s0 = SPH_T32(s0 + u0); \ - s1 = SPH_T32(s1 + u1); \ - s2 = SPH_T32(s2 + u2); \ - s3 = SPH_T32(s3 + u3); \ - s4 = SPH_T32(s4 + u4); \ - s5 = SPH_T32(s5 + u5); \ - s6 = SPH_T32(s6 + u6); \ - s7 = SPH_T32(s7 + u7); \ - } while (0) - -/* - * COREn(in) performs the core HAVAL computation for "n" passes, using - * the one-argument macro "in" to access the input words. Running state - * is held in variable "s0" to "s7". - */ - -#define CORE3(in) do { \ - SAVE_STATE; \ - PASS1(3, in); \ - PASS2(3, in); \ - PASS3(3, in); \ - UPDATE_STATE; \ - } while (0) - -#define CORE4(in) do { \ - SAVE_STATE; \ - PASS1(4, in); \ - PASS2(4, in); \ - PASS3(4, in); \ - PASS4(4, in); \ - UPDATE_STATE; \ - } while (0) - -#define CORE5(in) do { \ - SAVE_STATE; \ - PASS1(5, in); \ - PASS2(5, in); \ - PASS3(5, in); \ - PASS4(5, in); \ - PASS5(5, in); \ - UPDATE_STATE; \ - } while (0) - -/* - * DSTATE declares the state variables "s0" to "s7". - */ -#define DSTATE sph_u32 s0, s1, s2, s3, s4, s5, s6, s7 - -/* - * RSTATE fills the state variables from the context "sc". - */ -#define RSTATE do { \ - s0 = sc->s0; \ - s1 = sc->s1; \ - s2 = sc->s2; \ - s3 = sc->s3; \ - s4 = sc->s4; \ - s5 = sc->s5; \ - s6 = sc->s6; \ - s7 = sc->s7; \ - } while (0) - -/* - * WSTATE updates the context "sc" from the state variables. 
- */ -#define WSTATE do { \ - sc->s0 = s0; \ - sc->s1 = s1; \ - sc->s2 = s2; \ - sc->s3 = s3; \ - sc->s4 = s4; \ - sc->s5 = s5; \ - sc->s6 = s6; \ - sc->s7 = s7; \ - } while (0) - -/* - * Initialize a context. "olen" is the output length, in 32-bit words - * (between 4 and 8, inclusive). "passes" is the number of passes - * (3, 4 or 5). - */ -static void -haval_init(sph_haval_context *sc, unsigned olen, unsigned passes) -{ - sc->s0 = SPH_C32(0x243F6A88); - sc->s1 = SPH_C32(0x85A308D3); - sc->s2 = SPH_C32(0x13198A2E); - sc->s3 = SPH_C32(0x03707344); - sc->s4 = SPH_C32(0xA4093822); - sc->s5 = SPH_C32(0x299F31D0); - sc->s6 = SPH_C32(0x082EFA98); - sc->s7 = SPH_C32(0xEC4E6C89); - sc->olen = olen; - sc->passes = passes; -#if SPH_64 - sc->count = 0; -#else - sc->count_high = 0; - sc->count_low = 0; -#endif - -} - -/* - * IN_PREPARE(data) contains declarations and code to prepare for - * reading input words pointed to by "data". - * INW(i) reads the word number "i" (from 0 to 31). - */ -#if SPH_LITTLE_FAST -#define IN_PREPARE(indata) const unsigned char *const load_ptr = \ - (const unsigned char *)(indata) -#define INW(i) sph_dec32le_aligned(load_ptr + 4 * (i)) -#else -#define IN_PREPARE(indata) \ - sph_u32 X_var[32]; \ - int load_index; \ - \ - for (load_index = 0; load_index < 32; load_index ++) \ - X_var[load_index] = sph_dec32le_aligned( \ - (const unsigned char *)(indata) + 4 * load_index) -#define INW(i) X_var[i] -#endif - -/* - * Mixing operation used for 128-bit output tailoring. This function - * takes the byte 0 from a0, byte 1 from a1, byte 2 from a2 and byte 3 - * from a3, and combines them into a 32-bit word, which is then rotated - * to the left by n bits. 
- */ -static SPH_INLINE sph_u32 -mix128(sph_u32 a0, sph_u32 a1, sph_u32 a2, sph_u32 a3, int n) -{ - sph_u32 tmp; - - tmp = (a0 & SPH_C32(0x000000FF)) - | (a1 & SPH_C32(0x0000FF00)) - | (a2 & SPH_C32(0x00FF0000)) - | (a3 & SPH_C32(0xFF000000)); - if (n > 0) - tmp = SPH_ROTL32(tmp, n); - return tmp; -} - -/* - * Mixing operation used to compute output word 0 for 160-bit output. - */ -static SPH_INLINE sph_u32 -mix160_0(sph_u32 x5, sph_u32 x6, sph_u32 x7) -{ - sph_u32 tmp; - - tmp = (x5 & SPH_C32(0x01F80000)) - | (x6 & SPH_C32(0xFE000000)) - | (x7 & SPH_C32(0x0000003F)); - return SPH_ROTL32(tmp, 13); -} - -/* - * Mixing operation used to compute output word 1 for 160-bit output. - */ -static SPH_INLINE sph_u32 -mix160_1(sph_u32 x5, sph_u32 x6, sph_u32 x7) -{ - sph_u32 tmp; - - tmp = (x5 & SPH_C32(0xFE000000)) - | (x6 & SPH_C32(0x0000003F)) - | (x7 & SPH_C32(0x00000FC0)); - return SPH_ROTL32(tmp, 7); -} - -/* - * Mixing operation used to compute output word 2 for 160-bit output. - */ -static SPH_INLINE sph_u32 -mix160_2(sph_u32 x5, sph_u32 x6, sph_u32 x7) -{ - sph_u32 tmp; - - tmp = (x5 & SPH_C32(0x0000003F)) - | (x6 & SPH_C32(0x00000FC0)) - | (x7 & SPH_C32(0x0007F000)); - return tmp; -} - -/* - * Mixing operation used to compute output word 3 for 160-bit output. - */ -static SPH_INLINE sph_u32 -mix160_3(sph_u32 x5, sph_u32 x6, sph_u32 x7) -{ - sph_u32 tmp; - - tmp = (x5 & SPH_C32(0x00000FC0)) - | (x6 & SPH_C32(0x0007F000)) - | (x7 & SPH_C32(0x01F80000)); - return tmp >> 6; -} - -/* - * Mixing operation used to compute output word 4 for 160-bit output. - */ -static SPH_INLINE sph_u32 -mix160_4(sph_u32 x5, sph_u32 x6, sph_u32 x7) -{ - sph_u32 tmp; - - tmp = (x5 & SPH_C32(0x0007F000)) - | (x6 & SPH_C32(0x01F80000)) - | (x7 & SPH_C32(0xFE000000)); - return tmp >> 12; -} - -/* - * Mixing operation used to compute output word 0 for 192-bit output. 
- */ -static SPH_INLINE sph_u32 -mix192_0(sph_u32 x6, sph_u32 x7) -{ - sph_u32 tmp; - - tmp = (x6 & SPH_C32(0xFC000000)) | (x7 & SPH_C32(0x0000001F)); - return SPH_ROTL32(tmp, 6); -} - -/* - * Mixing operation used to compute output word 1 for 192-bit output. - */ -static SPH_INLINE sph_u32 -mix192_1(sph_u32 x6, sph_u32 x7) -{ - return (x6 & SPH_C32(0x0000001F)) | (x7 & SPH_C32(0x000003E0)); -} - -/* - * Mixing operation used to compute output word 2 for 192-bit output. - */ -static SPH_INLINE sph_u32 -mix192_2(sph_u32 x6, sph_u32 x7) -{ - return ((x6 & SPH_C32(0x000003E0)) | (x7 & SPH_C32(0x0000FC00))) >> 5; -} - -/* - * Mixing operation used to compute output word 3 for 192-bit output. - */ -static SPH_INLINE sph_u32 -mix192_3(sph_u32 x6, sph_u32 x7) -{ - return ((x6 & SPH_C32(0x0000FC00)) | (x7 & SPH_C32(0x001F0000))) >> 10; -} - -/* - * Mixing operation used to compute output word 4 for 192-bit output. - */ -static SPH_INLINE sph_u32 -mix192_4(sph_u32 x6, sph_u32 x7) -{ - return ((x6 & SPH_C32(0x001F0000)) | (x7 & SPH_C32(0x03E00000))) >> 16; -} - -/* - * Mixing operation used to compute output word 5 for 192-bit output. - */ -static SPH_INLINE sph_u32 -mix192_5(sph_u32 x6, sph_u32 x7) -{ - return ((x6 & SPH_C32(0x03E00000)) | (x7 & SPH_C32(0xFC000000))) >> 21; -} - -/* - * Write out HAVAL output. The output length is tailored to the requested - * length. 
- */ -static void -haval_out(sph_haval_context *sc, void *dst) -{ - DSTATE; - unsigned char *buf; - - buf = dst; - RSTATE; - switch (sc->olen) { - case 4: - sph_enc32le(buf, SPH_T32(s0 + mix128(s7, s4, s5, s6, 24))); - sph_enc32le(buf + 4, SPH_T32(s1 + mix128(s6, s7, s4, s5, 16))); - sph_enc32le(buf + 8, SPH_T32(s2 + mix128(s5, s6, s7, s4, 8))); - sph_enc32le(buf + 12, SPH_T32(s3 + mix128(s4, s5, s6, s7, 0))); - break; - case 5: - sph_enc32le(buf, SPH_T32(s0 + mix160_0(s5, s6, s7))); - sph_enc32le(buf + 4, SPH_T32(s1 + mix160_1(s5, s6, s7))); - sph_enc32le(buf + 8, SPH_T32(s2 + mix160_2(s5, s6, s7))); - sph_enc32le(buf + 12, SPH_T32(s3 + mix160_3(s5, s6, s7))); - sph_enc32le(buf + 16, SPH_T32(s4 + mix160_4(s5, s6, s7))); - break; - case 6: - sph_enc32le(buf, SPH_T32(s0 + mix192_0(s6, s7))); - sph_enc32le(buf + 4, SPH_T32(s1 + mix192_1(s6, s7))); - sph_enc32le(buf + 8, SPH_T32(s2 + mix192_2(s6, s7))); - sph_enc32le(buf + 12, SPH_T32(s3 + mix192_3(s6, s7))); - sph_enc32le(buf + 16, SPH_T32(s4 + mix192_4(s6, s7))); - sph_enc32le(buf + 20, SPH_T32(s5 + mix192_5(s6, s7))); - break; - case 7: - sph_enc32le(buf, SPH_T32(s0 + ((s7 >> 27) & 0x1F))); - sph_enc32le(buf + 4, SPH_T32(s1 + ((s7 >> 22) & 0x1F))); - sph_enc32le(buf + 8, SPH_T32(s2 + ((s7 >> 18) & 0x0F))); - sph_enc32le(buf + 12, SPH_T32(s3 + ((s7 >> 13) & 0x1F))); - sph_enc32le(buf + 16, SPH_T32(s4 + ((s7 >> 9) & 0x0F))); - sph_enc32le(buf + 20, SPH_T32(s5 + ((s7 >> 4) & 0x1F))); - sph_enc32le(buf + 24, SPH_T32(s6 + ((s7 ) & 0x0F))); - break; - case 8: - sph_enc32le(buf, s0); - sph_enc32le(buf + 4, s1); - sph_enc32le(buf + 8, s2); - sph_enc32le(buf + 12, s3); - sph_enc32le(buf + 16, s4); - sph_enc32le(buf + 20, s5); - sph_enc32le(buf + 24, s6); - sph_enc32le(buf + 28, s7); - break; - } -} - -/* - * The main core functions inline the code with the COREx() macros. We - * use a helper file, included three times, which avoids code copying. 
- */ - -#undef PASSES -#define PASSES 3 -#include "haval-helper.c" - -#undef PASSES -#define PASSES 4 -#include "haval-helper.c" - -#undef PASSES -#define PASSES 5 -#include "haval-helper.c" - -/* ====================================================================== */ - -#define API(xxx, y) \ -void \ -sph_haval ## xxx ## _ ## y ## _init(void *cc) \ -{ \ - haval_init(cc, xxx >> 5, y); \ -} \ - \ -void \ -sph_haval ## xxx ## _ ## y (void *cc, const void *data, size_t len) \ -{ \ - haval ## y(cc, data, len); \ -} \ - \ -void \ -sph_haval ## xxx ## _ ## y ## _close(void *cc, void *dst) \ -{ \ - haval ## y ## _close(cc, 0, 0, dst); \ -} \ - \ -void \ -sph_haval ## xxx ## _ ## y ## addbits_and_close( \ - void *cc, unsigned ub, unsigned n, void *dst) \ -{ \ - haval ## y ## _close(cc, ub, n, dst); \ -} - -API(128, 3) -API(128, 4) -API(128, 5) -API(160, 3) -API(160, 4) -API(160, 5) -API(192, 3) -API(192, 4) -API(192, 5) -API(224, 3) -API(224, 4) -API(224, 5) -API(256, 3) -API(256, 4) -API(256, 5) - -#define RVAL do { \ - s0 = val[0]; \ - s1 = val[1]; \ - s2 = val[2]; \ - s3 = val[3]; \ - s4 = val[4]; \ - s5 = val[5]; \ - s6 = val[6]; \ - s7 = val[7]; \ - } while (0) - -#define WVAL do { \ - val[0] = s0; \ - val[1] = s1; \ - val[2] = s2; \ - val[3] = s3; \ - val[4] = s4; \ - val[5] = s5; \ - val[6] = s6; \ - val[7] = s7; \ - } while (0) - -#define INMSG(i) msg[i] - -/* see sph_haval.h */ -void -sph_haval_3_comp(const sph_u32 msg[32], sph_u32 val[8]) -{ - DSTATE; - - RVAL; - CORE3(INMSG); - WVAL; -} - -/* see sph_haval.h */ -void -sph_haval_4_comp(const sph_u32 msg[32], sph_u32 val[8]) -{ - DSTATE; - - RVAL; - CORE4(INMSG); - WVAL; -} - -/* see sph_haval.h */ -void -sph_haval_5_comp(const sph_u32 msg[32], sph_u32 val[8]) -{ - DSTATE; - - RVAL; - CORE5(INMSG); - WVAL; -} - -#ifdef __cplusplus -} -#endif diff --git a/algo/haval/sph-haval.h b/algo/haval/sph-haval.h deleted file mode 100644 index 9ec5772..0000000 --- a/algo/haval/sph-haval.h +++ /dev/null @@ -1,976 +0,0 @@ -/* 
$Id: sph_haval.h 218 2010-06-08 17:06:34Z tp $ */ -/** - * HAVAL interface. - * - * HAVAL is actually a family of 15 hash functions, depending on whether - * the internal computation uses 3, 4 or 5 passes, and on the output - * length, which is 128, 160, 192, 224 or 256 bits. This implementation - * provides interface functions for all 15, which internally map to - * three cores (depending on the number of passes). Note that output - * lengths other than 256 bits are not obtained by a simple truncation - * of a longer result; the requested length is encoded within the - * padding data. - * - * HAVAL was published in: Yuliang Zheng, Josef Pieprzyk and Jennifer - * Seberry: "HAVAL -- a one-way hashing algorithm with variable length - * of output", Advances in Cryptology -- AUSCRYPT'92, Lecture Notes in - * Computer Science, Vol.718, pp.83-104, Springer-Verlag, 1993. - * - * This paper, and a reference implementation, are available on the - * Calyptix web site: http://labs.calyptix.com/haval.php - * - * The HAVAL reference paper is quite unclear on the data encoding - * details, i.e. endianness (both byte order within a 32-bit word, and - * word order within a message block). This implementation has been - * made compatible with the reference implementation referenced above. - * - * @warning A collision for HAVAL-128/3 (HAVAL with three passes and - * 128-bit output) has been published; this function is thus considered - * as cryptographically broken. The status for other variants is unclear; - * use only with care. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_haval.h - * @author Thomas Pornin - */ - -#ifndef SPH_HAVAL_H__ -#define SPH_HAVAL_H__ - -#ifdef __cplusplus -extern "C"{ -#endif - -#include -#include "algo/sha/sph_types.h" - -/** - * Output size (in bits) for HAVAL-128/3. - */ -#define SPH_SIZE_haval128_3 128 - -/** - * Output size (in bits) for HAVAL-128/4. - */ -#define SPH_SIZE_haval128_4 128 - -/** - * Output size (in bits) for HAVAL-128/5. - */ -#define SPH_SIZE_haval128_5 128 - -/** - * Output size (in bits) for HAVAL-160/3. - */ -#define SPH_SIZE_haval160_3 160 - -/** - * Output size (in bits) for HAVAL-160/4. - */ -#define SPH_SIZE_haval160_4 160 - -/** - * Output size (in bits) for HAVAL-160/5. 
- */ -#define SPH_SIZE_haval160_5 160 - -/** - * Output size (in bits) for HAVAL-192/3. - */ -#define SPH_SIZE_haval192_3 192 - -/** - * Output size (in bits) for HAVAL-192/4. - */ -#define SPH_SIZE_haval192_4 192 - -/** - * Output size (in bits) for HAVAL-192/5. - */ -#define SPH_SIZE_haval192_5 192 - -/** - * Output size (in bits) for HAVAL-224/3. - */ -#define SPH_SIZE_haval224_3 224 - -/** - * Output size (in bits) for HAVAL-224/4. - */ -#define SPH_SIZE_haval224_4 224 - -/** - * Output size (in bits) for HAVAL-224/5. - */ -#define SPH_SIZE_haval224_5 224 - -/** - * Output size (in bits) for HAVAL-256/3. - */ -#define SPH_SIZE_haval256_3 256 - -/** - * Output size (in bits) for HAVAL-256/4. - */ -#define SPH_SIZE_haval256_4 256 - -/** - * Output size (in bits) for HAVAL-256/5. - */ -#define SPH_SIZE_haval256_5 256 - -/** - * This structure is a context for HAVAL computations: it contains the - * intermediate values and some data from the last entered block. Once - * a HAVAL computation has been performed, the context can be reused for - * another computation. - * - * The contents of this structure are private. A running HAVAL computation - * can be cloned by copying the context (e.g. with a simple - * memcpy()). - */ -typedef struct { -#ifndef DOXYGEN_IGNORE - unsigned char buf[128]; /* first field, for alignment */ - sph_u32 s0, s1, s2, s3, s4, s5, s6, s7; - unsigned olen, passes; -#if SPH_64 - sph_u64 count; -#else - sph_u32 count_high, count_low; -#endif -#endif -} sph_haval_context; - -/** - * Type for a HAVAL-128/3 context (identical to the common context). - */ -typedef sph_haval_context sph_haval128_3_context; - -/** - * Type for a HAVAL-128/4 context (identical to the common context). - */ -typedef sph_haval_context sph_haval128_4_context; - -/** - * Type for a HAVAL-128/5 context (identical to the common context). - */ -typedef sph_haval_context sph_haval128_5_context; - -/** - * Type for a HAVAL-160/3 context (identical to the common context). 
- */ -typedef sph_haval_context sph_haval160_3_context; - -/** - * Type for a HAVAL-160/4 context (identical to the common context). - */ -typedef sph_haval_context sph_haval160_4_context; - -/** - * Type for a HAVAL-160/5 context (identical to the common context). - */ -typedef sph_haval_context sph_haval160_5_context; - -/** - * Type for a HAVAL-192/3 context (identical to the common context). - */ -typedef sph_haval_context sph_haval192_3_context; - -/** - * Type for a HAVAL-192/4 context (identical to the common context). - */ -typedef sph_haval_context sph_haval192_4_context; - -/** - * Type for a HAVAL-192/5 context (identical to the common context). - */ -typedef sph_haval_context sph_haval192_5_context; - -/** - * Type for a HAVAL-224/3 context (identical to the common context). - */ -typedef sph_haval_context sph_haval224_3_context; - -/** - * Type for a HAVAL-224/4 context (identical to the common context). - */ -typedef sph_haval_context sph_haval224_4_context; - -/** - * Type for a HAVAL-224/5 context (identical to the common context). - */ -typedef sph_haval_context sph_haval224_5_context; - -/** - * Type for a HAVAL-256/3 context (identical to the common context). - */ -typedef sph_haval_context sph_haval256_3_context; - -/** - * Type for a HAVAL-256/4 context (identical to the common context). - */ -typedef sph_haval_context sph_haval256_4_context; - -/** - * Type for a HAVAL-256/5 context (identical to the common context). - */ -typedef sph_haval_context sph_haval256_5_context; - -/** - * Initialize the context for HAVAL-128/3. - * - * @param cc context to initialize (pointer to a - * sph_haval128_3_context structure) - */ -void sph_haval128_3_init(void *cc); - -/** - * Process some data bytes for HAVAL-128/3. If len is 0, - * then this function does nothing. 
- * - * @param cc the HAVAL-128/3 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_haval128_3(void *cc, const void *data, size_t len); - -/** - * Close a HAVAL-128/3 computation. The output buffer must be wide - * enough to accomodate the result (16 bytes). The context is automatically - * reinitialized. - * - * @param cc the HAVAL-128/3 context - * @param dst the output buffer - */ -void sph_haval128_3_close(void *cc, void *dst); - -/** - * Close a HAVAL-128/3 computation. Up to 7 extra input bits may be added - * to the input message; these are the n upper bits of - * the ub byte (i.e. the first extra bit has value 128 in - * ub, the second extra bit has value 64, and so on). Other - * bits in ub are ignored. - * - * The output buffer must be wide enough to accomodate the result (16 - * bytes). The context is automatically reinitialized. - * - * @param cc the HAVAL-128/3 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the output buffer - */ -void sph_haval128_3_addbits_and_close(void *cc, - unsigned ub, unsigned n, void *dst); - -/** - * Initialize the context for HAVAL-128/4. - * - * @param cc context to initialize (pointer to a - * sph_haval128_4_context structure) - */ -void sph_haval128_4_init(void *cc); - -/** - * Process some data bytes for HAVAL-128/4. If len is 0, - * then this function does nothing. - * - * @param cc the HAVAL-128/4 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_haval128_4(void *cc, const void *data, size_t len); - -/** - * Close a HAVAL-128/4 computation. The output buffer must be wide - * enough to accomodate the result (16 bytes). The context is automatically - * reinitialized. - * - * @param cc the HAVAL-128/4 context - * @param dst the output buffer - */ -void sph_haval128_4_close(void *cc, void *dst); - -/** - * Close a HAVAL-128/4 computation. 
Up to 7 extra input bits may be added - * to the input message; these are the n upper bits of - * the ub byte (i.e. the first extra bit has value 128 in - * ub, the second extra bit has value 64, and so on). Other - * bits in ub are ignored. - * - * The output buffer must be wide enough to accomodate the result (16 - * bytes). The context is automatically reinitialized. - * - * @param cc the HAVAL-128/4 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the output buffer - */ -void sph_haval128_4_addbits_and_close(void *cc, - unsigned ub, unsigned n, void *dst); - -/** - * Initialize the context for HAVAL-128/5. - * - * @param cc context to initialize (pointer to a - * sph_haval128_5_context structure) - */ -void sph_haval128_5_init(void *cc); - -/** - * Process some data bytes for HAVAL-128/5. If len is 0, - * then this function does nothing. - * - * @param cc the HAVAL-128/5 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_haval128_5(void *cc, const void *data, size_t len); - -/** - * Close a HAVAL-128/5 computation. The output buffer must be wide - * enough to accomodate the result (16 bytes). The context is automatically - * reinitialized. - * - * @param cc the HAVAL-128/5 context - * @param dst the output buffer - */ -void sph_haval128_5_close(void *cc, void *dst); - -/** - * Close a HAVAL-128/5 computation. Up to 7 extra input bits may be added - * to the input message; these are the n upper bits of - * the ub byte (i.e. the first extra bit has value 128 in - * ub, the second extra bit has value 64, and so on). Other - * bits in ub are ignored. - * - * The output buffer must be wide enough to accomodate the result (16 - * bytes). The context is automatically reinitialized. 
- * - * @param cc the HAVAL-128/5 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the output buffer - */ -void sph_haval128_5_addbits_and_close(void *cc, - unsigned ub, unsigned n, void *dst); - -/** - * Initialize the context for HAVAL-160/3. - * - * @param cc context to initialize (pointer to a - * sph_haval160_3_context structure) - */ -void sph_haval160_3_init(void *cc); - -/** - * Process some data bytes for HAVAL-160/3. If len is 0, - * then this function does nothing. - * - * @param cc the HAVAL-160/3 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_haval160_3(void *cc, const void *data, size_t len); - -/** - * Close a HAVAL-160/3 computation. The output buffer must be wide - * enough to accomodate the result (20 bytes). The context is automatically - * reinitialized. - * - * @param cc the HAVAL-160/3 context - * @param dst the output buffer - */ -void sph_haval160_3_close(void *cc, void *dst); - -/** - * Close a HAVAL-160/3 computation. Up to 7 extra input bits may be added - * to the input message; these are the n upper bits of - * the ub byte (i.e. the first extra bit has value 128 in - * ub, the second extra bit has value 64, and so on). Other - * bits in ub are ignored. - * - * The output buffer must be wide enough to accomodate the result (20 - * bytes). The context is automatically reinitialized. - * - * @param cc the HAVAL-160/3 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the output buffer - */ -void sph_haval160_3_addbits_and_close(void *cc, - unsigned ub, unsigned n, void *dst); - -/** - * Initialize the context for HAVAL-160/4. - * - * @param cc context to initialize (pointer to a - * sph_haval160_4_context structure) - */ -void sph_haval160_4_init(void *cc); - -/** - * Process some data bytes for HAVAL-160/4. If len is 0, - * then this function does nothing. 
- * - * @param cc the HAVAL-160/4 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_haval160_4(void *cc, const void *data, size_t len); - -/** - * Close a HAVAL-160/4 computation. The output buffer must be wide - * enough to accomodate the result (20 bytes). The context is automatically - * reinitialized. - * - * @param cc the HAVAL-160/4 context - * @param dst the output buffer - */ -void sph_haval160_4_close(void *cc, void *dst); - -/** - * Close a HAVAL-160/4 computation. Up to 7 extra input bits may be added - * to the input message; these are the n upper bits of - * the ub byte (i.e. the first extra bit has value 128 in - * ub, the second extra bit has value 64, and so on). Other - * bits in ub are ignored. - * - * The output buffer must be wide enough to accomodate the result (20 - * bytes). The context is automatically reinitialized. - * - * @param cc the HAVAL-160/4 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the output buffer - */ -void sph_haval160_3_addbits_and_close(void *cc, - unsigned ub, unsigned n, void *dst); - -/** - * Initialize the context for HAVAL-160/5. - * - * @param cc context to initialize (pointer to a - * sph_haval160_5_context structure) - */ -void sph_haval160_5_init(void *cc); - -/** - * Process some data bytes for HAVAL-160/5. If len is 0, - * then this function does nothing. - * - * @param cc the HAVAL-160/5 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_haval160_5(void *cc, const void *data, size_t len); - -/** - * Close a HAVAL-160/5 computation. The output buffer must be wide - * enough to accomodate the result (20 bytes). The context is automatically - * reinitialized. - * - * @param cc the HAVAL-160/5 context - * @param dst the output buffer - */ -void sph_haval160_5_close(void *cc, void *dst); - -/** - * Close a HAVAL-160/5 computation. 
Up to 7 extra input bits may be added - * to the input message; these are the n upper bits of - * the ub byte (i.e. the first extra bit has value 128 in - * ub, the second extra bit has value 64, and so on). Other - * bits in ub are ignored. - * - * The output buffer must be wide enough to accomodate the result (20 - * bytes). The context is automatically reinitialized. - * - * @param cc the HAVAL-160/5 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the output buffer - */ -void sph_haval160_5_addbits_and_close(void *cc, - unsigned ub, unsigned n, void *dst); - -/** - * Initialize the context for HAVAL-192/3. - * - * @param cc context to initialize (pointer to a - * sph_haval192_3_context structure) - */ -void sph_haval192_3_init(void *cc); - -/** - * Process some data bytes for HAVAL-192/3. If len is 0, - * then this function does nothing. - * - * @param cc the HAVAL-192/3 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_haval192_3(void *cc, const void *data, size_t len); - -/** - * Close a HAVAL-192/3 computation. The output buffer must be wide - * enough to accomodate the result (24 bytes). The context is automatically - * reinitialized. - * - * @param cc the HAVAL-192/3 context - * @param dst the output buffer - */ -void sph_haval192_3_close(void *cc, void *dst); - -/** - * Close a HAVAL-192/3 computation. Up to 7 extra input bits may be added - * to the input message; these are the n upper bits of - * the ub byte (i.e. the first extra bit has value 128 in - * ub, the second extra bit has value 64, and so on). Other - * bits in ub are ignored. - * - * The output buffer must be wide enough to accomodate the result (24 - * bytes). The context is automatically reinitialized. 
- * - * @param cc the HAVAL-192/3 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the output buffer - */ -void sph_haval192_3_addbits_and_close(void *cc, - unsigned ub, unsigned n, void *dst); - -/** - * Initialize the context for HAVAL-192/4. - * - * @param cc context to initialize (pointer to a - * sph_haval192_4_context structure) - */ -void sph_haval192_4_init(void *cc); - -/** - * Process some data bytes for HAVAL-192/4. If len is 0, - * then this function does nothing. - * - * @param cc the HAVAL-192/4 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_haval192_4(void *cc, const void *data, size_t len); - -/** - * Close a HAVAL-192/4 computation. The output buffer must be wide - * enough to accomodate the result (24 bytes). The context is automatically - * reinitialized. - * - * @param cc the HAVAL-192/4 context - * @param dst the output buffer - */ -void sph_haval192_4_close(void *cc, void *dst); - -/** - * Close a HAVAL-192/4 computation. Up to 7 extra input bits may be added - * to the input message; these are the n upper bits of - * the ub byte (i.e. the first extra bit has value 128 in - * ub, the second extra bit has value 64, and so on). Other - * bits in ub are ignored. - * - * The output buffer must be wide enough to accomodate the result (24 - * bytes). The context is automatically reinitialized. - * - * @param cc the HAVAL-192/4 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the output buffer - */ -void sph_haval192_4_addbits_and_close(void *cc, - unsigned ub, unsigned n, void *dst); - -/** - * Initialize the context for HAVAL-192/5. - * - * @param cc context to initialize (pointer to a - * sph_haval192_5_context structure) - */ -void sph_haval192_5_init(void *cc); - -/** - * Process some data bytes for HAVAL-192/5. If len is 0, - * then this function does nothing. 
- * - * @param cc the HAVAL-192/5 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_haval192_5(void *cc, const void *data, size_t len); - -/** - * Close a HAVAL-192/5 computation. The output buffer must be wide - * enough to accomodate the result (24 bytes). The context is automatically - * reinitialized. - * - * @param cc the HAVAL-192/5 context - * @param dst the output buffer - */ -void sph_haval192_5_close(void *cc, void *dst); - -/** - * Close a HAVAL-192/5 computation. Up to 7 extra input bits may be added - * to the input message; these are the n upper bits of - * the ub byte (i.e. the first extra bit has value 128 in - * ub, the second extra bit has value 64, and so on). Other - * bits in ub are ignored. - * - * The output buffer must be wide enough to accomodate the result (24 - * bytes). The context is automatically reinitialized. - * - * @param cc the HAVAL-192/5 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the output buffer - */ -void sph_haval192_5_addbits_and_close(void *cc, - unsigned ub, unsigned n, void *dst); - -/** - * Initialize the context for HAVAL-224/3. - * - * @param cc context to initialize (pointer to a - * sph_haval224_3_context structure) - */ -void sph_haval224_3_init(void *cc); - -/** - * Process some data bytes for HAVAL-224/3. If len is 0, - * then this function does nothing. - * - * @param cc the HAVAL-224/3 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_haval224_3(void *cc, const void *data, size_t len); - -/** - * Close a HAVAL-224/3 computation. The output buffer must be wide - * enough to accomodate the result (28 bytes). The context is automatically - * reinitialized. - * - * @param cc the HAVAL-224/3 context - * @param dst the output buffer - */ -void sph_haval224_3_close(void *cc, void *dst); - -/** - * Close a HAVAL-224/3 computation. 
Up to 7 extra input bits may be added - * to the input message; these are the n upper bits of - * the ub byte (i.e. the first extra bit has value 128 in - * ub, the second extra bit has value 64, and so on). Other - * bits in ub are ignored. - * - * The output buffer must be wide enough to accomodate the result (28 - * bytes). The context is automatically reinitialized. - * - * @param cc the HAVAL-224/3 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the output buffer - */ -void sph_haval224_3_addbits_and_close(void *cc, - unsigned ub, unsigned n, void *dst); - -/** - * Initialize the context for HAVAL-224/4. - * - * @param cc context to initialize (pointer to a - * sph_haval224_4_context structure) - */ -void sph_haval224_4_init(void *cc); - -/** - * Process some data bytes for HAVAL-224/4. If len is 0, - * then this function does nothing. - * - * @param cc the HAVAL-224/4 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_haval224_4(void *cc, const void *data, size_t len); - -/** - * Close a HAVAL-224/4 computation. The output buffer must be wide - * enough to accomodate the result (28 bytes). The context is automatically - * reinitialized. - * - * @param cc the HAVAL-224/4 context - * @param dst the output buffer - */ -void sph_haval224_4_close(void *cc, void *dst); - -/** - * Close a HAVAL-224/4 computation. Up to 7 extra input bits may be added - * to the input message; these are the n upper bits of - * the ub byte (i.e. the first extra bit has value 128 in - * ub, the second extra bit has value 64, and so on). Other - * bits in ub are ignored. - * - * The output buffer must be wide enough to accomodate the result (28 - * bytes). The context is automatically reinitialized. 
- * - * @param cc the HAVAL-224/4 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the output buffer - */ -void sph_haval224_4_addbits_and_close(void *cc, - unsigned ub, unsigned n, void *dst); - -/** - * Initialize the context for HAVAL-224/5. - * - * @param cc context to initialize (pointer to a - * sph_haval224_5_context structure) - */ -void sph_haval224_5_init(void *cc); - -/** - * Process some data bytes for HAVAL-224/5. If len is 0, - * then this function does nothing. - * - * @param cc the HAVAL-224/5 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_haval224_5(void *cc, const void *data, size_t len); - -/** - * Close a HAVAL-224/5 computation. The output buffer must be wide - * enough to accomodate the result (28 bytes). The context is automatically - * reinitialized. - * - * @param cc the HAVAL-224/5 context - * @param dst the output buffer - */ -void sph_haval224_5_close(void *cc, void *dst); - -/** - * Close a HAVAL-224/5 computation. Up to 7 extra input bits may be added - * to the input message; these are the n upper bits of - * the ub byte (i.e. the first extra bit has value 128 in - * ub, the second extra bit has value 64, and so on). Other - * bits in ub are ignored. - * - * The output buffer must be wide enough to accomodate the result (28 - * bytes). The context is automatically reinitialized. - * - * @param cc the HAVAL-224/5 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the output buffer - */ -void sph_haval224_5_addbits_and_close(void *cc, - unsigned ub, unsigned n, void *dst); - -/** - * Initialize the context for HAVAL-256/3. - * - * @param cc context to initialize (pointer to a - * sph_haval256_3_context structure) - */ -void sph_haval256_3_init(void *cc); - -/** - * Process some data bytes for HAVAL-256/3. If len is 0, - * then this function does nothing. 
- * - * @param cc the HAVAL-256/3 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_haval256_3(void *cc, const void *data, size_t len); - -/** - * Close a HAVAL-256/3 computation. The output buffer must be wide - * enough to accomodate the result (32 bytes). The context is automatically - * reinitialized. - * - * @param cc the HAVAL-256/3 context - * @param dst the output buffer - */ -void sph_haval256_3_close(void *cc, void *dst); - -/** - * Close a HAVAL-256/3 computation. Up to 7 extra input bits may be added - * to the input message; these are the n upper bits of - * the ub byte (i.e. the first extra bit has value 128 in - * ub, the second extra bit has value 64, and so on). Other - * bits in ub are ignored. - * - * The output buffer must be wide enough to accomodate the result (32 - * bytes). The context is automatically reinitialized. - * - * @param cc the HAVAL-256/3 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the output buffer - */ -void sph_haval256_3_addbits_and_close(void *cc, - unsigned ub, unsigned n, void *dst); - -/** - * Initialize the context for HAVAL-256/4. - * - * @param cc context to initialize (pointer to a - * sph_haval256_4_context structure) - */ -void sph_haval256_4_init(void *cc); - -/** - * Process some data bytes for HAVAL-256/4. If len is 0, - * then this function does nothing. - * - * @param cc the HAVAL-256/4 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_haval256_4(void *cc, const void *data, size_t len); - -/** - * Close a HAVAL-256/4 computation. The output buffer must be wide - * enough to accomodate the result (32 bytes). The context is automatically - * reinitialized. - * - * @param cc the HAVAL-256/4 context - * @param dst the output buffer - */ -void sph_haval256_4_close(void *cc, void *dst); - -/** - * Close a HAVAL-256/4 computation. 
Up to 7 extra input bits may be added - * to the input message; these are the n upper bits of - * the ub byte (i.e. the first extra bit has value 128 in - * ub, the second extra bit has value 64, and so on). Other - * bits in ub are ignored. - * - * The output buffer must be wide enough to accomodate the result (32 - * bytes). The context is automatically reinitialized. - * - * @param cc the HAVAL-256/4 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the output buffer - */ -void sph_haval256_4_addbits_and_close(void *cc, - unsigned ub, unsigned n, void *dst); - -/** - * Initialize the context for HAVAL-256/5. - * - * @param cc context to initialize (pointer to a - * sph_haval256_5_context structure) - */ -void sph_haval256_5_init(void *cc); - -/** - * Process some data bytes for HAVAL-256/5. If len is 0, - * then this function does nothing. - * - * @param cc the HAVAL-256/5 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_haval256_5(void *cc, const void *data, size_t len); - -/** - * Close a HAVAL-256/5 computation. The output buffer must be wide - * enough to accomodate the result (32 bytes). The context is automatically - * reinitialized. - * - * @param cc the HAVAL-256/5 context - * @param dst the output buffer - */ -void sph_haval256_5_close(void *cc, void *dst); - -/** - * Close a HAVAL-256/5 computation. Up to 7 extra input bits may be added - * to the input message; these are the n upper bits of - * the ub byte (i.e. the first extra bit has value 128 in - * ub, the second extra bit has value 64, and so on). Other - * bits in ub are ignored. - * - * The output buffer must be wide enough to accomodate the result (32 - * bytes). The context is automatically reinitialized. 
- * - * @param cc the HAVAL-256/5 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the output buffer - */ -void sph_haval256_5_addbits_and_close(void *cc, - unsigned ub, unsigned n, void *dst); - -/** - * Apply the HAVAL compression function on the provided data. The - * msg parameter contains the 32 32-bit input blocks, - * as numerical values (hence after the little-endian decoding). The - * val parameter contains the 8 32-bit input blocks for - * the compression function; the output is written in place in this - * array. This function uses three internal passes. - * - * @param msg the message block (32 values) - * @param val the function 256-bit input and output - */ -void sph_haval_3_comp(const sph_u32 msg[32], sph_u32 val[8]); - -/** - * Apply the HAVAL compression function on the provided data. The - * msg parameter contains the 32 32-bit input blocks, - * as numerical values (hence after the little-endian decoding). The - * val parameter contains the 8 32-bit input blocks for - * the compression function; the output is written in place in this - * array. This function uses four internal passes. - * - * @param msg the message block (32 values) - * @param val the function 256-bit input and output - */ -void sph_haval_4_comp(const sph_u32 msg[32], sph_u32 val[8]); - -/** - * Apply the HAVAL compression function on the provided data. The - * msg parameter contains the 32 32-bit input blocks, - * as numerical values (hence after the little-endian decoding). The - * val parameter contains the 8 32-bit input blocks for - * the compression function; the output is written in place in this - * array. This function uses five internal passes. 
- * - * @param msg the message block (32 values) - * @param val the function 256-bit input and output - */ -void sph_haval_5_comp(const sph_u32 msg[32], sph_u32 val[8]); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/algo/heavy/bastion.c b/algo/heavy/bastion.c deleted file mode 100644 index afbbdab..0000000 --- a/algo/heavy/bastion.c +++ /dev/null @@ -1,177 +0,0 @@ -#include "algo-gate-api.h" - -#include -#include -#include -#include -#include - -#include "sph_hefty1.h" - -#include "algo/luffa/sph_luffa.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/skein/sph_skein.h" -#include "algo/whirlpool/sph_whirlpool.h" -#include "algo/shabal/sph_shabal.h" -#include "algo/echo/sph_echo.h" -#include "algo/hamsi/sph_hamsi.h" -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/skein/sse2/skein.c" - -#ifndef NO_AES_NI - #include "algo/echo/aes_ni/hash_api.h" -#endif - -void bastionhash(void *output, const void *input) -{ - unsigned char hash[64] __attribute__ ((aligned (64))); - -#ifdef NO_AES_NI - sph_echo512_context ctx_echo; -#else - hashState_echo ctx_echo; -#endif - hashState_luffa ctx_luffa; - sph_fugue512_context ctx_fugue; - sph_whirlpool_context ctx_whirlpool; - sph_shabal512_context ctx_shabal; - sph_hamsi512_context ctx_hamsi; - - unsigned char hashbuf[128] __attribute__ ((aligned (16))); - sph_u64 hashctA; -// sph_u64 hashctB; - size_t hashptr; - - HEFTY1(input, 80, hash); - - init_luffa( &ctx_luffa, 512 ); - update_and_final_luffa( &ctx_luffa, (BitSequence*)hash, - (const BitSequence*)hash, 64 ); -// update_luffa( &ctx_luffa, hash, 64 ); -// final_luffa( &ctx_luffa, hash ); - - if (hash[0] & 0x8) - { - sph_fugue512_init(&ctx_fugue); - sph_fugue512(&ctx_fugue, hash, 64); - sph_fugue512_close(&ctx_fugue, hash); - } else { - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; - } - - sph_whirlpool_init(&ctx_whirlpool); - sph_whirlpool(&ctx_whirlpool, hash, 64); - sph_whirlpool_close(&ctx_whirlpool, hash); - - sph_fugue512_init(&ctx_fugue); - sph_fugue512(&ctx_fugue, 
hash, 64); - sph_fugue512_close(&ctx_fugue, hash); - - if (hash[0] & 0x8) - { -#ifdef NO_AES_NI - sph_echo512_init(&ctx_echo); - sph_echo512(&ctx_echo, hash, 64); - sph_echo512_close(&ctx_echo, hash); -#else - init_echo( &ctx_echo, 512 ); - update_final_echo ( &ctx_echo,(BitSequence*)hash, - (const BitSequence*)hash, 512 ); -// update_echo ( &ctx_echo, hash, 512 ); -// final_echo( &ctx_echo, hash ); -#endif - } else { - init_luffa( &ctx_luffa, 512 ); - update_and_final_luffa( &ctx_luffa, (BitSequence*)hash, - (const BitSequence*)hash, 64 ); -// update_luffa( &ctx_luffa, hash, 64 ); -// final_luffa( &ctx_luffa, hash ); - } - - sph_shabal512_init(&ctx_shabal); - sph_shabal512(&ctx_shabal, hash, 64); - sph_shabal512_close(&ctx_shabal, hash); - - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; - - if (hash[0] & 0x8) - { - sph_shabal512_init(&ctx_shabal); - sph_shabal512(&ctx_shabal, hash, 64); - sph_shabal512_close(&ctx_shabal, hash); - } else { - sph_whirlpool_init(&ctx_whirlpool); - sph_whirlpool(&ctx_whirlpool, hash, 64); - sph_whirlpool_close(&ctx_whirlpool, hash); - } - - sph_shabal512_init(&ctx_shabal); - sph_shabal512(&ctx_shabal, hash, 64); - sph_shabal512_close(&ctx_shabal, hash); - - if (hash[0] & 0x8) - { - sph_hamsi512_init(&ctx_hamsi); - sph_hamsi512(&ctx_hamsi, hash, 64); - sph_hamsi512_close(&ctx_hamsi, hash); - } else { - init_luffa( &ctx_luffa, 512 ); - update_and_final_luffa( &ctx_luffa, (BitSequence*)hash, - (const BitSequence*)hash, 64 ); -// update_luffa( &ctx_luffa, hash, 64 ); -// final_luffa( &ctx_luffa, hash ); - } - - memcpy(output, hash, 32); -} - -int scanhash_bastion( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr) -{ - uint32_t _ALIGN(64) hash32[8]; - uint32_t _ALIGN(64) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - int thr_id = mythr->id; // thr_id arg is deprecated - - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - - uint32_t n = 
first_nonce; - - for (int i=0; i < 19; i++) - be32enc(&endiandata[i], pdata[i]); - - do { - be32enc(&endiandata[19], n); - bastionhash(hash32, endiandata); - if (hash32[7] < Htarg && fulltest(hash32, ptarget)) { - work_set_target_ratio(work, hash32); - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return true; - } - n++; - - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - - return 0; -} - -bool register_bastion_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT; - gate->scanhash = (void*)&scanhash_bastion; - gate->hash = (void*)&bastionhash; - return true; -}; - diff --git a/algo/heavy/heavy.c b/algo/heavy/heavy.c deleted file mode 100644 index 68e5bc7..0000000 --- a/algo/heavy/heavy.c +++ /dev/null @@ -1,111 +0,0 @@ -#include -#include -#include - -#include "algo-gate-api.h" -#include "sph_hefty1.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/blake/sph_blake.h" -#include "algo/groestl/sph_groestl.h" - -/* Combines top 64-bits from each hash into a single hash */ -static void combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4) -{ - uint32_t *hash[4] = { hash1, hash2, hash3, hash4 }; - - /* Transpose first 64 bits of each hash into out */ - memset(out, 0, 32); - int bits = 0; - for (unsigned int i = 7; i >= 6; i--) { - for (uint32_t mask = 0x80000000; mask; mask >>= 1) { - for (unsigned int k = 0; k < 4; k++) { - out[(255 - bits)/32] <<= 1; - if ((hash[k][i] & mask) != 0) - out[(255 - bits)/32] |= 1; - bits++; - } - } - } -} - -extern void heavyhash(unsigned char* output, const unsigned char* input, int len) -{ - unsigned char hash1[32]; - HEFTY1(input, len, hash1); - -// HEFTY1 is new, so take an extra security measure to eliminate -// * the possiblity of collisions: -// * -// * Hash(x) = SHA256(x + HEFTY1(x)) -// * -// * N.B. '+' is concatenation. 
-// - unsigned char hash2[32];; - SHA256_CTX ctx; - SHA256_Init(&ctx); - SHA256_Update(&ctx, input, len); - SHA256_Update(&ctx, hash1, sizeof(hash1)); - SHA256_Final(hash2, &ctx); - -// * Additional security: Do not rely on a single cryptographic hash -// * function. Instead, combine the outputs of 4 of the most secure -// * cryptographic hash functions-- SHA256, KECCAK512, GROESTL512 -// * and BLAKE512. - - - uint32_t hash3[16]; - sph_keccak512_context keccakCtx; - sph_keccak512_init(&keccakCtx); - sph_keccak512(&keccakCtx, input, len); - sph_keccak512(&keccakCtx, hash1, sizeof(hash1)); - sph_keccak512_close(&keccakCtx, (void *)&hash3); - - uint32_t hash4[16]; - sph_groestl512_context groestlCtx; - sph_groestl512_init(&groestlCtx); - sph_groestl512(&groestlCtx, input, len); - sph_groestl512(&groestlCtx, hash1, sizeof(hash1)); - sph_groestl512_close(&groestlCtx, (void *)&hash4); - - uint32_t hash5[16]; - sph_blake512_context blakeCtx; - sph_blake512_init(&blakeCtx); - sph_blake512(&blakeCtx, input, len); - sph_blake512(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1)); - sph_blake512_close(&blakeCtx, (void *)&hash5); - - uint32_t *final = (uint32_t *)output; - combine_hashes(final, (uint32_t *)hash2, hash3, hash4, hash5); - -} - -int scanhash_heavy( uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) -{ - uint32_t hash[8]; - uint32_t start_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - - do { - heavyhash((unsigned char *)hash, (unsigned char *)pdata, 80); - - if (hash[7] <= ptarget[7]) { - if (fulltest(hash, ptarget)) { - *hashes_done = pdata[19] - start_nonce; - return 1; - break; - } - } - pdata[19]++; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); - *hashes_done = pdata[19] - start_nonce; - return 0; -} - -bool register_heavy_algo( algo_gate_t* gate ) -{ - gate->scanhash = (void*)&scanhash_heavy; - gate->hash = (void*)&heavyhash; - return true; -}; - diff 
--git a/algo/heavy/sph_hefty1.c b/algo/heavy/sph_hefty1.c deleted file mode 100644 index 8a8203c..0000000 --- a/algo/heavy/sph_hefty1.c +++ /dev/null @@ -1,382 +0,0 @@ -/* - * HEFTY1 cryptographic hash function - * - * Copyright (c) 2014, dbcc14 - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the FreeBSD Project. - */ - -#include -#include - -#ifdef _MSC_VER -#define inline __inline -#endif - -#include "sph_hefty1.h" - -#define Min(A, B) (A <= B ? 
A : B) -#define RoundFunc(ctx, A, B, C, D, E, F, G, H, W, K) \ - { \ - /* To thwart parallelism, Br modifies itself each time it's \ - * called. This also means that calling it in different \ - * orders yeilds different results. In C the order of \ - * evaluation of function arguments and + operands are \ - * unspecified (and depends on the compiler), so we must make \ - * the order of Br calls explicit. \ - */ \ - uint32_t brG = Br(ctx, G); \ - uint32_t tmp1 = Ch(E, Br(ctx, F), brG) + H + W + K; \ - uint32_t tmp2 = tmp1 + Sigma1(Br(ctx, E)); \ - uint32_t brC = Br(ctx, C); \ - uint32_t brB = Br(ctx, B); \ - uint32_t tmp3 = Ma(Br(ctx, A), brB, brC); \ - uint32_t tmp4 = tmp3 + Sigma0(Br(ctx, A)); \ - H = G; \ - G = F; \ - F = E; \ - E = D + Br(ctx, tmp2); \ - D = C; \ - C = B; \ - B = A; \ - A = tmp2 + tmp4; \ - } \ - -/* Nothing up my sleeve constants */ -const static uint32_t K[64] = { - 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, - 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, - 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, - 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, - 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, - 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, - 0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, - 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL, - 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL, - 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, - 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, - 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, - 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL, - 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL, - 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, - 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL -}; - -/* Initial hash values */ -const static uint32_t H[HEFTY1_STATE_WORDS] = { - 0x6a09e667UL, - 0xbb67ae85UL, - 0x3c6ef372UL, - 0xa54ff53aUL, - 0x510e527fUL, - 
0x9b05688cUL, - 0x1f83d9abUL, - 0x5be0cd19UL -}; - -static inline uint32_t Rr(uint32_t X, uint8_t n) -{ - return (X >> n) | (X << (32 - n)); -} - -static inline uint32_t Ch(uint32_t E, uint32_t F, uint32_t G) -{ - return (E & F) ^ (~E & G); -} - -static inline uint32_t Sigma1(uint32_t E) -{ - return Rr(E, 6) ^ Rr(E, 11) ^ Rr(E, 25); -} - -static inline uint32_t sigma1(uint32_t X) -{ - return Rr(X, 17) ^ Rr(X, 19) ^ (X >> 10); -} - -static inline uint32_t Ma(uint32_t A, uint32_t B, uint32_t C) -{ - return (A & B) ^ (A & C) ^ (B & C); -} - -static inline uint32_t Sigma0(uint32_t A) -{ - return Rr(A, 2) ^ Rr(A, 13) ^ Rr(A, 22); -} - -static inline uint32_t sigma0(uint32_t X) -{ - return Rr(X, 7) ^ Rr(X, 18) ^ (X >> 3); -} - -static inline uint32_t Reverse32(uint32_t n) -{ - #if BYTE_ORDER == LITTLE_ENDIAN - return n << 24 | (n & 0x0000ff00) << 8 | (n & 0x00ff0000) >> 8 | n >> 24; - #else - return n; - #endif -} - -static inline uint64_t Reverse64(uint64_t n) -{ - #if BYTE_ORDER == LITTLE_ENDIAN - uint32_t a = n >> 32; - uint32_t b = (n << 32) >> 32; - - return (uint64_t)Reverse32(b) << 32 | Reverse32(a); - #else - return n; - #endif -} - -/* Smoosh byte into nibble */ -static inline uint8_t Smoosh4(uint8_t X) -{ - return (X >> 4) ^ (X & 0xf); -} - -/* Smoosh 32-bit word into 2-bits */ -static inline uint8_t Smoosh2(uint32_t X) -{ - uint16_t w = (X >> 16) ^ (X & 0xffff); - uint8_t n = Smoosh4((w >> 8) ^ (w & 0xff)); - return (n >> 2) ^ (n & 0x3); -} - -static void Mangle(uint32_t *S) -{ - uint32_t *R = S; - uint32_t *C = &S[1]; - - uint8_t r0 = Smoosh4(R[0] >> 24); - uint8_t r1 = Smoosh4(R[0] >> 16); - uint8_t r2 = Smoosh4(R[0] >> 8); - uint8_t r3 = Smoosh4(R[0] & 0xff); - - int i; - - /* Diffuse */ - uint32_t tmp = 0; - for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++) { - uint8_t r = Smoosh2(tmp); - switch (r) { - case 0: - C[i] ^= Rr(R[0], i + r0); - break; - case 1: - C[i] += Rr(~R[0], i + r1); - break; - case 2: - C[i] &= Rr(~R[0], i + r2); - break; - case 3: - C[i] 
^= Rr(R[0], i + r3); - break; - } - tmp ^= C[i]; - } - - /* Compress */ - tmp = 0; - for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++) - if (i % 2) - tmp ^= C[i]; - else - tmp += C[i]; - R[0] ^= tmp; -} - -static void Absorb(uint32_t *S, uint32_t X) -{ - uint32_t *R = S; - R[0] ^= X; - Mangle(S); -} - -static uint32_t Squeeze(uint32_t *S) -{ - uint32_t Y = S[0]; - Mangle(S); - return Y; -} - -/* Branch, compress and serialize function */ -static inline uint32_t Br(HEFTY1_CTX *ctx, uint32_t X) -{ - uint32_t R = Squeeze(ctx->sponge); - - uint8_t r0 = R >> 8; - uint8_t r1 = R & 0xff; - - uint32_t Y = 1 << (r0 % 32); - - switch (r1 % 4) - { - case 0: - /* Do nothing */ - break; - case 1: - return X & ~Y; - case 2: - return X | Y; - case 3: - return X ^ Y; - } - - return X; -} - -static void HashBlock(HEFTY1_CTX *ctx) -{ - uint32_t A, B, C, D, E, F, G, H; - uint32_t W[HEFTY1_BLOCK_BYTES]; - - assert(ctx); - - A = ctx->h[0]; - B = ctx->h[1]; - C = ctx->h[2]; - D = ctx->h[3]; - E = ctx->h[4]; - F = ctx->h[5]; - G = ctx->h[6]; - H = ctx->h[7]; - - int t = 0; - for (; t < 16; t++) { - W[t] = Reverse32(((uint32_t *)&ctx->block[0])[t]); /* To host byte order */ - Absorb(ctx->sponge, W[t] ^ K[t]); - } - - for (t = 0; t < 16; t++) { - Absorb(ctx->sponge, D ^ H); - RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]); - } - for (t = 16; t < 64; t++) { - Absorb(ctx->sponge, H + D); - W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16]; - RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]); - } - - ctx->h[0] += A; - ctx->h[1] += B; - ctx->h[2] += C; - ctx->h[3] += D; - ctx->h[4] += E; - ctx->h[5] += F; - ctx->h[6] += G; - ctx->h[7] += H; - - A = 0; - B = 0; - C = 0; - D = 0; - E = 0; - F = 0; - G = 0; - H = 0; - - memset(W, 0, sizeof(W)); -} - -/* Public interface */ - -void HEFTY1_Init(HEFTY1_CTX *ctx) -{ - assert(ctx); - - memcpy(ctx->h, H, sizeof(ctx->h)); - memset(ctx->block, 0, sizeof(ctx->block)); - ctx->written = 0; - memset(ctx->sponge, 0, sizeof(ctx->sponge)); -} 
- -void HEFTY1_Update(HEFTY1_CTX *ctx, const void *buf, size_t len) -{ - assert(ctx); - - uint64_t read = 0; - while (len) { - size_t end = (size_t)(ctx->written % HEFTY1_BLOCK_BYTES); - size_t count = Min(len, HEFTY1_BLOCK_BYTES - end); - memcpy(&ctx->block[end], &((unsigned char *)buf)[read], count); - len -= count; - read += count; - ctx->written += count; - if (!(ctx->written % HEFTY1_BLOCK_BYTES)) - HashBlock(ctx); - } -} - -void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *ctx) -{ - assert(digest); - assert(ctx); - - /* Pad message (FIPS 180 Section 5.1.1) */ - size_t used = (size_t)(ctx->written % HEFTY1_BLOCK_BYTES); - ctx->block[used++] = 0x80; /* Append 1 to end of message */ - if (used > HEFTY1_BLOCK_BYTES - 8) { - /* We have already written into the last 64bits, so - * we must continue into the next block. */ - memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - used); - HashBlock(ctx); - used = 0; /* Create a new block (below) */ - } - - /* All remaining bits to zero */ - memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - 8 - used); - - /* The last 64bits encode the length (in network byte order) */ - uint64_t *len = (uint64_t *)&ctx->block[HEFTY1_BLOCK_BYTES - 8]; - *len = Reverse64(ctx->written*8); - - HashBlock(ctx); - - /* Convert back to network byte order */ - int i = 0; - for (; i < HEFTY1_STATE_WORDS; i++) - ctx->h[i] = Reverse32(ctx->h[i]); - - memcpy(digest, ctx->h, sizeof(ctx->h)); - memset(ctx, 0, sizeof(HEFTY1_CTX)); -} - -unsigned char* HEFTY1(const unsigned char *buf, size_t len, unsigned char *digest) -{ - HEFTY1_CTX ctx; - static unsigned char m[HEFTY1_DIGEST_BYTES]; - - if (!digest) - digest = m; - - HEFTY1_Init(&ctx); - HEFTY1_Update(&ctx, buf, len); - HEFTY1_Final(digest, &ctx); - - return digest; -} \ No newline at end of file diff --git a/algo/heavy/sph_hefty1.h b/algo/heavy/sph_hefty1.h deleted file mode 100644 index afcd274..0000000 --- a/algo/heavy/sph_hefty1.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * HEFTY1 cryptographic hash 
function - * - * Copyright (c) 2014, dbcc14 - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the FreeBSD Project. 
- */ - -#ifndef __HEFTY1_H__ -#define __HEFTY1_H__ - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef WIN32 -#include -#endif - -#include - -#define HEFTY1_DIGEST_BYTES 32 -#define HEFTY1_BLOCK_BYTES 64 -#define HEFTY1_STATE_WORDS 8 -#define HEFTY1_SPONGE_WORDS 4 - -typedef struct HEFTY1_CTX { - uint32_t h[HEFTY1_STATE_WORDS]; - uint8_t block[HEFTY1_BLOCK_BYTES]; - uint64_t written; - uint32_t sponge[HEFTY1_SPONGE_WORDS]; -} HEFTY1_CTX; - -void HEFTY1_Init(HEFTY1_CTX *cxt); -void HEFTY1_Update(HEFTY1_CTX *cxt, const void *data, size_t len); -void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *cxt); -unsigned char* HEFTY1(const unsigned char *data, size_t len, unsigned char *digest); - -#ifdef __cplusplus -} -#endif - -#endif /* __HEFTY1_H__ */ \ No newline at end of file diff --git a/algo/hodl/aes.c b/algo/hodl/aes.c deleted file mode 100644 index 5be2af3..0000000 --- a/algo/hodl/aes.c +++ /dev/null @@ -1,182 +0,0 @@ -#include -#include -#include "wolf-aes.h" -#include "miner.h" - -#if defined(__AES__) - -static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2) -{ - __m128i tmp4; - *tmp2 = _mm_shuffle_epi32(*tmp2, 0xFF); - tmp4 = _mm_slli_si128(*tmp1, 0x04); - *tmp1 = _mm_xor_si128(*tmp1, tmp4); - tmp4 = _mm_slli_si128(tmp4, 0x04); - *tmp1 = _mm_xor_si128(*tmp1, tmp4); - tmp4 = _mm_slli_si128(tmp4, 0x04); - *tmp1 = _mm_xor_si128(*tmp1, tmp4); - *tmp1 = _mm_xor_si128(*tmp1, *tmp2); -} - -static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3) -{ - __m128i tmp2, tmp4; - - tmp4 = _mm_aeskeygenassist_si128(*tmp1, 0x00); - tmp2 = _mm_shuffle_epi32(tmp4, 0xAA); - tmp4 = _mm_slli_si128(*tmp3, 0x04); - *tmp3 = _mm_xor_si128(*tmp3, tmp4); - tmp4 = _mm_slli_si128(tmp4, 0x04); - *tmp3 = _mm_xor_si128(*tmp3, tmp4); - tmp4 = _mm_slli_si128(tmp4, 0x04); - *tmp3 = _mm_xor_si128(*tmp3, tmp4); - *tmp3 = _mm_xor_si128(*tmp3, tmp2); -} - -// Special thanks to Intel for helping me -// with ExpandAESKey256() and its subroutines -void 
ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf) -{ - __m128i tmp1, tmp2, tmp3; - - tmp1 = keys[0] = KeyBuf[0]; - tmp3 = keys[1] = KeyBuf[1]; - - tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x01); - ExpandAESKey256_sub1(&tmp1, &tmp2); - keys[2] = tmp1; - ExpandAESKey256_sub2(&tmp1, &tmp3); - keys[3] = tmp3; - - tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x02); - ExpandAESKey256_sub1(&tmp1, &tmp2); - keys[4] = tmp1; - ExpandAESKey256_sub2(&tmp1, &tmp3); - keys[5] = tmp3; - - tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x04); - ExpandAESKey256_sub1(&tmp1, &tmp2); - keys[6] = tmp1; - ExpandAESKey256_sub2(&tmp1, &tmp3); - keys[7] = tmp3; - - tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x08); - ExpandAESKey256_sub1(&tmp1, &tmp2); - keys[8] = tmp1; - ExpandAESKey256_sub2(&tmp1, &tmp3); - keys[9] = tmp3; - - tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x10); - ExpandAESKey256_sub1(&tmp1, &tmp2); - keys[10] = tmp1; - ExpandAESKey256_sub2(&tmp1, &tmp3); - keys[11] = tmp3; - - tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x20); - ExpandAESKey256_sub1(&tmp1, &tmp2); - keys[12] = tmp1; - ExpandAESKey256_sub2(&tmp1, &tmp3); - keys[13] = tmp3; - - tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40); - ExpandAESKey256_sub1(&tmp1, &tmp2); - keys[14] = tmp1; -} - -#if defined(__SSE4_2__) -//#ifdef __AVX__ - -#define AESENC(i,j) \ - State[j] = _mm_aesenc_si128(State[j], ExpandedKey[j][i]); - -#define AESENC_N(i) \ - AESENC(i,0) \ - AESENC(i,1) \ - AESENC(i,2) \ - AESENC(i,3) \ - AESENC(i,4) \ - AESENC(i,5) \ - AESENC(i,6) \ - AESENC(i,7) \ - - -static inline void AES256Core(__m128i* State, __m128i ExpandedKey[][16]) -{ - const uint32_t N = AES_PARALLEL_N; - - for(int j=0; j> 8) & 0xff) | (((x) & 0xff) << 8))) - -static __inline unsigned short int -__bswap_16 (unsigned short int __bsx) -{ - return __bswap_constant_16 (__bsx); -} - -// LE -# define htobe16(x) __bswap_16 (x) -# define htole16(x) (x) -# define be16toh(x) __bswap_16 (x) -# define le16toh(x) (x) - -// BE -//# define htole16(x) __bswap_16 (x) -//# define 
htobe16(x) (x) -//# define le16toh(x) __bswap_16 (x) -//# define be16toh(x) (x) - -#define __bswap_constant_32(x) \ - ((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \ - (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) - -static __inline unsigned int -__bswap_32 (unsigned int __bsx) -{ - return __builtin_bswap32 (__bsx); -} - -// LE -# define htobe32(x) __bswap_32 (x) -# define htole32(x) (x) -# define be32toh(x) __bswap_32 (x) -# define le32toh(x) (x) - -// BE -//# define htole32(x) __bswap_32 (x) -//# define htobe32(x) (x) -//# define le32toh(x) __bswap_32 (x) -//# define be32toh(x) (x) - -# define __bswap_constant_64(x) \ - ((((x) & 0xff00000000000000ull) >> 56) \ - | (((x) & 0x00ff000000000000ull) >> 40) \ - | (((x) & 0x0000ff0000000000ull) >> 24) \ - | (((x) & 0x000000ff00000000ull) >> 8) \ - | (((x) & 0x00000000ff000000ull) << 8) \ - | (((x) & 0x0000000000ff0000ull) << 24) \ - | (((x) & 0x000000000000ff00ull) << 40) \ - | (((x) & 0x00000000000000ffull) << 56)) - -static __inline uint64_t -__bswap_64 (uint64_t __bsx) -{ - return __bswap_constant_64 (__bsx); -} - -// LE -# define htobe64(x) __bswap_64 (x) -# define htole64(x) (x) -# define be64toh(x) __bswap_64 (x) -# define le64toh(x) (x) - -// BE -//# define htole64(x) __bswap_64 (x) -//# define htobe64(x) (x) -//# define le64toh(x) __bswap_64 (x) -//# define be64toh(x) (x) - -#endif \ No newline at end of file diff --git a/algo/hodl/hodl-gate.c b/algo/hodl/hodl-gate.c deleted file mode 100644 index e3df7d1..0000000 --- a/algo/hodl/hodl-gate.c +++ /dev/null @@ -1,185 +0,0 @@ -#include -#include - -#include "hodl-gate.h" -#include "hodl-wolf.h" - -#define HODL_NSTARTLOC_INDEX 20 -#define HODL_NFINALCALC_INDEX 21 - -static struct work hodl_work; - -pthread_barrier_t hodl_barrier; - -// All references to this buffer are local to this file, so no args -// need to be passed. 
-unsigned char *hodl_scratchbuf = NULL; - -void hodl_set_target( struct work* work, double diff ) -{ - diff_to_target(work->target, diff / 8388608.0 ); -} - -void hodl_le_build_stratum_request( char* req, struct work* work, - struct stratum_ctx *sctx ) -{ - uint32_t ntime, nonce, nstartloc, nfinalcalc; - char ntimestr[9], noncestr[9], nstartlocstr[9], nfinalcalcstr[9]; - unsigned char *xnonce2str; - - le32enc( &ntime, work->data[ algo_gate.ntime_index ] ); - le32enc( &nonce, work->data[ algo_gate.nonce_index ] ); - bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) ); - bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) ); - xnonce2str = abin2hex(work->xnonce2, work->xnonce2_len ); - le32enc( &nstartloc, work->data[ HODL_NSTARTLOC_INDEX ] ); - le32enc( &nfinalcalc, work->data[ HODL_NFINALCALC_INDEX ] ); - bin2hex( nstartlocstr, (char*)(&nstartloc), sizeof(uint32_t) ); - bin2hex( nfinalcalcstr, (char*)(&nfinalcalc), sizeof(uint32_t) ); - sprintf( req, "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", - rpc_user, work->job_id, xnonce2str, ntimestr, noncestr, - nstartlocstr, nfinalcalcstr ); - free( xnonce2str ); -} - -char* hodl_malloc_txs_request( struct work *work ) -{ - char* req; - json_t *val; - char data_str[2 * sizeof(work->data) + 1]; - int i; - - for ( i = 0; i < ARRAY_SIZE(work->data); i++ ) - be32enc( work->data + i, work->data[i] ); - - bin2hex( data_str, (unsigned char *)work->data, 88 ); - if ( work->workid ) - { - char *params; - val = json_object(); - json_object_set_new( val, "workid", json_string( work->workid ) ); - params = json_dumps( val, 0 ); - json_decref( val ); - req = malloc( 128 + 2*88 + strlen( work->txs ) + strlen( params ) ); - sprintf( req, - "{\"method\": \"submitblock\", \"params\": [\"%s%s\", %s], \"id\":1}\r\n", - data_str, work->txs, params); - free( params ); - } - else - { - req = malloc( 128 + 2*88 + strlen(work->txs)); - sprintf( req, - "{\"method\": 
\"submitblock\", \"params\": [\"%s%s\"], \"id\":1}\r\n", - data_str, work->txs); - } - return req; -} - -void hodl_build_block_header( struct work* g_work, uint32_t version, - uint32_t *prevhash, uint32_t *merkle_tree, - uint32_t ntime, uint32_t nbits ) -{ - int i; - - memset( g_work->data, 0, sizeof(g_work->data) ); - g_work->data[0] = version; - - if ( have_stratum ) - for ( i = 0; i < 8; i++ ) - g_work->data[ 1+i ] = le32dec( prevhash + i ); - else - for (i = 0; i < 8; i++) - g_work->data[ 8-i ] = le32dec( prevhash + i ); - - for ( i = 0; i < 8; i++ ) - g_work->data[ 9+i ] = be32dec( merkle_tree + i ); - - g_work->data[ algo_gate.ntime_index ] = ntime; - g_work->data[ algo_gate.nbits_index ] = nbits; - g_work->data[22] = 0x80000000; - g_work->data[31] = 0x00000280; -} - -// called only by thread 0, saves a backup of g_work -void hodl_get_new_work( struct work* work, struct work* g_work) -{ - work_free( &hodl_work ); - work_copy( &hodl_work, g_work ); - hodl_work.data[ algo_gate.nonce_index ] = ( clock() + rand() ) % 9999; -} - -json_t *hodl_longpoll_rpc_call( CURL *curl, int *err, char* lp_url ) -{ - json_t *val; - char *req = NULL; - - if ( have_gbt ) - { - req = malloc( strlen( gbt_lp_req ) + strlen( lp_id ) + 1 ); - sprintf( req, gbt_lp_req, lp_id ); - } - val = json_rpc_call( curl, lp_url, rpc_userpass, - req ? req : getwork_req, err, JSON_RPC_LONGPOLL ); - free( req ); - return val; -} - -// called by every thread, copies the backup to each thread's work. 
-void hodl_resync_threads( struct work* work ) -{ - int nonce_index = algo_gate.nonce_index; - pthread_barrier_wait( &hodl_barrier ); - if ( memcmp( work->data, hodl_work.data, algo_gate.work_cmp_size ) ) - { - work_free( work ); - work_copy( work, &hodl_work ); - } - work->data[ nonce_index ] = swab32( hodl_work.data[ nonce_index ] ); -} - -bool hodl_do_this_thread( int thr_id ) -{ - return ( thr_id == 0 ); -} - -int hodl_scanhash( struct work* work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ -#if defined(__AES__) - GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, mythr->id ); - pthread_barrier_wait( &hodl_barrier ); - return scanhash_hodl_wolf( work, max_nonce, hashes_done, thr_info ); -#endif - return false; -} - -bool register_hodl_algo( algo_gate_t* gate ) -{ -#if !defined(__AES__) - applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version."); - return false; -#endif -// if ( TOTAL_CHUNKS % opt_n_threads ) -// { -// applog(LOG_ERR,"Thread count must be power of 2."); -// return false; -// } - pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads ); - gate->optimizations = AES_OPT | AVX_OPT | AVX2_OPT; - gate->scanhash = (void*)&hodl_scanhash; - gate->get_new_work = (void*)&hodl_get_new_work; - gate->longpoll_rpc_call = (void*)&hodl_longpoll_rpc_call; - gate->set_target = (void*)&hodl_set_target; - gate->build_stratum_request = (void*)&hodl_le_build_stratum_request; - gate->malloc_txs_request = (void*)&hodl_malloc_txs_request; - gate->build_block_header = (void*)&hodl_build_block_header; - gate->resync_threads = (void*)&hodl_resync_threads; - gate->do_this_thread = (void*)&hodl_do_this_thread; - gate->work_cmp_size = 76; - hodl_scratchbuf = (unsigned char*)malloc( 1 << 30 ); - allow_getwork = false; - return ( hodl_scratchbuf != NULL ); -} - - diff --git a/algo/hodl/hodl-gate.h b/algo/hodl/hodl-gate.h deleted file mode 100644 index 9a8ecf7..0000000 --- a/algo/hodl/hodl-gate.h +++ /dev/null @@ -1,6 +0,0 
@@ -#include "algo-gate-api.h" - -extern unsigned char *hodl_scratchbuf; - -bool register_hodl_algo ( algo_gate_t* gate ); - diff --git a/algo/hodl/hodl-wolf.c b/algo/hodl/hodl-wolf.c deleted file mode 100644 index d84dfb9..0000000 --- a/algo/hodl/hodl-wolf.c +++ /dev/null @@ -1,222 +0,0 @@ -#include -#include -#include -#include -#include "sha512-avx.h" -#include "wolf-aes.h" -#include "hodl-gate.h" -#include "hodl-wolf.h" -#include "miner.h" - -#if defined(__AES__) - -void GenerateGarbageCore( CacheEntry *Garbage, int ThreadID, int ThreadCount, - void *MidHash ) -{ - const int Chunk = TOTAL_CHUNKS / ThreadCount; - const uint32_t StartChunk = ThreadID * Chunk; - const uint32_t EndChunk = StartChunk + Chunk; - -#if defined(__SSE4_2__) -//#ifdef __AVX__ - uint64_t* TempBufs[ SHA512_PARALLEL_N ] ; - uint64_t* desination[ SHA512_PARALLEL_N ]; - - for ( int i=0; i < SHA512_PARALLEL_N; ++i ) - { - TempBufs[i] = (uint64_t*)malloc( 32 ); - memcpy( TempBufs[i], MidHash, 32 ); - } - - for ( uint32_t i = StartChunk; i < EndChunk; i += SHA512_PARALLEL_N ) - { - for ( int j = 0; j < SHA512_PARALLEL_N; ++j ) - { - ( (uint32_t*)TempBufs[j] )[0] = i + j; - desination[j] = (uint64_t*)( (uint8_t *)Garbage + ( (i+j) - * GARBAGE_CHUNK_SIZE ) ); - } - sha512Compute32b_parallel( TempBufs, desination ); - } - - for ( int i = 0; i < SHA512_PARALLEL_N; ++i ) - free( TempBufs[i] ); -#else - uint32_t TempBuf[8]; - memcpy( TempBuf, MidHash, 32 ); - - for ( uint32_t i = StartChunk; i < EndChunk; ++i ) - { - TempBuf[0] = i; - SHA512( ( uint8_t *)TempBuf, 32, - ( (uint8_t *)Garbage ) + ( i * GARBAGE_CHUNK_SIZE ) ); - } -#endif -} - -/* -void Rev256(uint32_t *Dest, const uint32_t *Src) -{ - for(int i = 0; i < 8; ++i) Dest[i] = swab32(Src[i]); -} -*/ - -int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ -#if defined(__SSE4_2__) -//#ifdef __AVX__ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - int threadNumber = 
mythr->id; - CacheEntry *Garbage = (CacheEntry*)hodl_scratchbuf; - CacheEntry Cache[AES_PARALLEL_N]; - __m128i* data[AES_PARALLEL_N]; - const __m128i* next[AES_PARALLEL_N]; - uint32_t CollisionCount = 0; - - for ( int n=0; n> 2) - 1] & (COMPARE_SIZE - 1); //% COMPARE_SIZE; - next[n] = Garbage[nextLocation].dqwords; - - __m128i last[2]; - last[0] = _mm_xor_si128(Cache[n].dqwords[254], next[n][254]); - last[1] = _mm_xor_si128(Cache[n].dqwords[255], next[n][255]); - - // Key is last 32b of Cache - // IV is last 16b of Cache - ExpandAESKey256(ExpKey[n], last); - ivs[n] = last[1]; - } - AES256CBC(data, next, ExpKey, ivs); - } - - for(int n=0; n> 2) - 1] & (COMPARE_SIZE - 1)) < 1000) - { - uint32_t BlockHdr[22], FinalPoW[8]; - - swab32_array( BlockHdr, pdata, 20 ); - - BlockHdr[20] = k + n; - BlockHdr[21] = Cache[n].dwords[(GARBAGE_SLICE_SIZE >> 2) - 2]; - - sha256d( (uint8_t *)FinalPoW, (uint8_t *)BlockHdr, 88 ); - CollisionCount++; - if( FinalPoW[7] <= ptarget[7] ) - { - pdata[20] = swab32( BlockHdr[20] ); - pdata[21] = swab32( BlockHdr[21] ); - *hashes_done = CollisionCount; - return(1); - } - } - } - - *hashes_done = CollisionCount; - return(0); - - -#else // no AVX - - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t BlockHdr[22], FinalPoW[8]; - CacheEntry *Garbage = (CacheEntry*)hodl_scratchbuf; - CacheEntry Cache; - uint32_t CollisionCount = 0; - int threadNumber = mythr->id; - - swab32_array( BlockHdr, pdata, 20 ); - // Search for pattern in psuedorandom data - int searchNumber = COMPARE_SIZE / opt_n_threads; - int startLoc = threadNumber * searchNumber; - - if ( opt_debug ) - applog( LOG_DEBUG,"Hash target= %08lx", ptarget[7] ); - - for(int32_t k = startLoc; k < startLoc + searchNumber && !work_restart[threadNumber].restart; k++) - { - // copy data to first l2 cache - memcpy(Cache.dwords, Garbage + k, GARBAGE_SLICE_SIZE); - for(int j = 0; j < AES_ITERATIONS; j++) - { - CacheEntry TmpXOR; - __m128i ExpKey[16]; - - // use last 4 bytes 
of first cache as next location - uint32_t nextLocation = Cache.dwords[(GARBAGE_SLICE_SIZE >> 2) - - 1] & (COMPARE_SIZE - 1); //% COMPARE_SIZE; - - // Copy data from indicated location to second l2 cache - - memcpy(&TmpXOR, Garbage + nextLocation, GARBAGE_SLICE_SIZE); - //XOR location data into second cache - for( int i = 0; i < (GARBAGE_SLICE_SIZE >> 4); ++i ) - TmpXOR.dqwords[i] = _mm_xor_si128( Cache.dqwords[i], - TmpXOR.dqwords[i] ); - // Key is last 32b of TmpXOR - // IV is last 16b of TmpXOR - - ExpandAESKey256( ExpKey, TmpXOR.dqwords + - (GARBAGE_SLICE_SIZE / sizeof(__m128i)) - 2 ); - AES256CBC( Cache.dqwords, TmpXOR.dqwords, ExpKey, - TmpXOR.dqwords[ (GARBAGE_SLICE_SIZE / sizeof(__m128i)) - - 1 ], 256 ); } - // use last X bits as solution - if( ( Cache.dwords[ (GARBAGE_SLICE_SIZE >> 2) - 1 ] - & (COMPARE_SIZE - 1) ) < 1000 ) - { - BlockHdr[20] = k; - BlockHdr[21] = Cache.dwords[ (GARBAGE_SLICE_SIZE >> 2) - 2 ]; - sha256d( (uint8_t *)FinalPoW, (uint8_t *)BlockHdr, 88 ); - CollisionCount++; - if( FinalPoW[7] <= ptarget[7] ) - { - pdata[20] = swab32( BlockHdr[20] ); - pdata[21] = swab32( BlockHdr[21] ); - *hashes_done = CollisionCount; - return(1); - } - } - } - - *hashes_done = CollisionCount; - return(0); - -#endif // AVX else - -} - -void GenRandomGarbage(CacheEntry *Garbage, uint32_t *pdata, int thr_id) -{ - uint32_t BlockHdr[20], MidHash[8]; - swab32_array( BlockHdr, pdata, 20 ); - sha256d((uint8_t *)MidHash, (uint8_t *)BlockHdr, 80); - GenerateGarbageCore(Garbage, thr_id, opt_n_threads, MidHash); -} - -#endif // AES - diff --git a/algo/hodl/hodl-wolf.h b/algo/hodl/hodl-wolf.h deleted file mode 100644 index 47c8fb8..0000000 --- a/algo/hodl/hodl-wolf.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef __HODL_H -#define __HODL_H - -#include -#include -#include "miner.h" - -#define AES_ITERATIONS 15 - -#define GARBAGE_SIZE (1 << 30) -#define GARBAGE_CHUNK_SIZE (1 << 6) -#define GARBAGE_SLICE_SIZE (1 << 12) -#define TOTAL_CHUNKS (1 << 24) // GARBAGE_SIZE / 
GARBAGE_CHUNK_SIZE -#define COMPARE_SIZE (1 << 18) // GARBAGE_SIZE / GARBAGE_SLICE_SIZE - -typedef union _CacheEntry -{ - uint32_t dwords[GARBAGE_SLICE_SIZE >> 2] __attribute__((aligned(16))); - __m128i dqwords[GARBAGE_SLICE_SIZE >> 4] __attribute__((aligned(16))); -} CacheEntry; - -int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void GenRandomGarbage( CacheEntry *Garbage, uint32_t *pdata, int thr_id); - -#endif // __HODL_H diff --git a/algo/hodl/hodlminer.1 b/algo/hodl/hodlminer.1 deleted file mode 100644 index da855e6..0000000 --- a/algo/hodl/hodlminer.1 +++ /dev/null @@ -1,208 +0,0 @@ -.TH MINERD 1 "March 2016" "cpuminer 2.4.3" -.SH NAME -hodlminer \- CPU miner for Hodlcoin -.SH SYNOPSIS -.B hodlminer -[\fIOPTION\fR]... -.SH DESCRIPTION -.B hodlminer -is a multi-threaded CPU miner for Hodlcoin. -It supports the getwork and getblocktemplate (BIP 22) methods, -as well as the Stratum mining protocol. -.PP -In its normal mode of operation, \fBhodlminer\fR connects to a mining server -(specified with the \fB\-o\fR option), receives work from it and starts hashing. -As soon as a solution is found, it is submitted to the same mining server, -which can accept or reject it. -When using getwork or getblocktemplate, -\fBhodlminer\fR can take advantage of long polling, if the server supports it; -in any case, fresh work is fetched as needed. -When using the Stratum protocol this is not possible, -and the server is responsible for sending fresh work at least every minute; -if it fails to do so, -\fBhodlminer\fR may drop the connection and try reconnecting again. -.PP -By default, \fBhodlminer\fR writes all its messages to standard error. -On systems that have a syslog, the \fB\-\-syslog\fR option can be used -to write to it instead. -.PP -On start, the nice value of all miner threads is set to 19. -On Linux, the scheduling policy is also changed to SCHED_IDLE, -or to SCHED_BATCH if that fails. 
-On multiprocessor systems, \fBhodlminer\fR -automatically sets the CPU affinity of miner threads -if the number of threads is a multiple of the number of processors. -.SH EXAMPLES -To connect to the Hodlcoin mining pool that provides a Stratum server -at hodl.blockquarry.com on port 8332, authenticating as worker "user.worker" with password "x": -.PP -.nf -.RS -hodlminer \-o stratum+tcp://hodl.blockquarry.com:8332 \-u user.worker -p x -q -.RE -.fi -.PP -To mine to a local Hodlcoin instance running on port 18332, -authenticating with username "rpcuser" and password "rpcpass": -.PP -.nf -.RS -hodlminer \-a hodl \-o http://localhost:18332 \-O rpcuser:rpcpass \\ - \-\-coinbase\-addr=mpXwg4jMtRhuSpVq4xS3HFHmCmWp9NyGKt -.RE -.fi -.PP -.SH OPTIONS -.TP -\fB\-a\fR, \fB\-\-algo\fR=\fIALGORITHM\fR -Set the hashing algorithm to use. -Default is hodl. -Possible values are: -.RS 11 -.TP 10 -.B hodl -.TP -\fB\-\-benchmark\fR -Run in offline benchmark mode. -.TP -\fB\-B\fR, \fB\-\-background\fR -Run in the background as a daemon. -.TP -\fB\-\-cert\fR=\fIFILE\fR -Set an SSL certificate to use with the mining server. -Only supported when using the HTTPS protocol. -.TP -\fB\-\-coinbase\-addr\fR=\fIADDRESS\fR -Set a payout address for solo mining. -This is only used in getblocktemplate mode, -and only if the server does not provide a coinbase transaction. -.TP -\fB\-\-coinbase\-sig\fR=\fITEXT\fR -Set a string to be included in the coinbase (if allowed by the server). -This is only used in getblocktemplate mode. -.TP -\fB\-c\fR, \fB\-\-config\fR=\fIFILE\fR -Load options from a configuration file. -\fIFILE\fR must contain a JSON object -mapping long options to their arguments (as strings), -or to \fBtrue\fR if no argument is required. -Sample configuration file: - -.nf - { - "url": "stratum+tcp://hodl.blockquarry.com:8332", - "userpass": "foo:bar", - "retry-pause": "10", - "quiet": true - } -.fi -.TP -\fB\-D\fR, \fB\-\-debug\fR -Enable debug output. 
-.TP -\fB\-h\fR, \fB\-\-help\fR -Print a help message and exit. -.TP -\fB\-\-no\-gbt\fR -Do not use the getblocktemplate RPC method. -.TP -\fB\-\-no\-getwork\fR -Do not use the getwork RPC method. -.TP -\fB\-\-no\-longpoll\fR -Do not use long polling. -.TP -\fB\-\-no\-redirect\fR -Ignore requests from the server to switch to a different URL. -.TP -\fB\-\-no\-stratum\fR -Do not switch to Stratum, even if the server advertises support for it. -.TP -\fB\-o\fR, \fB\-\-url\fR=[\fISCHEME\fR://][\fIUSERNAME\fR[:\fIPASSWORD\fR]@]\fIHOST\fR:\fIPORT\fR[/\fIPATH\fR] -Set the URL of the mining server to connect to. -Supported schemes are \fBhttp\fR, \fBhttps\fR, \fBstratum+tcp\fR -and \fBstratum+tcps\fR. -If no scheme is specified, http is assumed. -Specifying a \fIPATH\fR is only supported for HTTP and HTTPS. -Specifying credentials has the same effect as using the \fB\-O\fR option. - -By default, on HTTP and HTTPS, -the miner tries to use the getblocktemplate RPC method, -and falls back to using getwork if getblocktemplate is unavailable. -This behavior can be modified by using the \fB\-\-no\-gbt\fR -and \fB\-\-no\-getwork\fR options. -.TP -\fB\-O\fR, \fB\-\-userpass\fR=\fIUSERNAME\fR:\fIPASSWORD\fR -Set the credentials to use for connecting to the mining server. -Any value previously set with \fB\-u\fR or \fB\-p\fR is discarded. -.TP -\fB\-p\fR, \fB\-\-pass\fR=\fIPASSWORD\fR -Set the password to use for connecting to the mining server. -Any password previously set with \fB\-O\fR is discarded. -.TP -\fB\-P\fR, \fB\-\-protocol\-dump\fR -Enable output of all protocol-level activities. -.TP -\fB\-q\fR, \fB\-\-quiet\fR -Disable per-thread hashmeter output. -.TP -\fB\-r\fR, \fB\-\-retries\fR=\fIN\fR -Set the maximum number of times to retry if a network call fails. -If not specified, the miner will retry indefinitely. -.TP -\fB\-R\fR, \fB\-\-retry\-pause\fR=\fISECONDS\fR -Set how long to wait between retries. Default is 30 seconds. 
-.TP -\fB\-s\fR, \fB\-\-scantime\fR=\fISECONDS\fR -Set an upper bound on the time the miner can go without fetching fresh work. -This setting has no effect in Stratum mode or when long polling is activated. -Default is 5 seconds. -.TP -\fB\-S\fR, \fB\-\-syslog\fR -Log to the syslog facility instead of standard error. -.TP -\fB\-t\fR, \fB\-\-threads\fR=\fIN\fR -Set the number of miner threads. -If not specified, the miner will try to detect the number of available processors -and use that. -.TP -\fB\-T\fR, \fB\-\-timeout\fR=\fISECONDS\fR -Set a timeout for long polling. -.TP -\fB\-u\fR, \fB\-\-user\fR=\fIUSERNAME\fR -Set the username to use for connecting to the mining server. -Any username previously set with \fB\-O\fR is discarded. -.TP -\fB\-V\fR, \fB\-\-version\fR -Display version information and quit. -.TP -\fB\-x\fR, \fB\-\-proxy\fR=[\fISCHEME\fR://][\fIUSERNAME\fR:\fIPASSWORD\fR@]\fIHOST\fR:\fIPORT\fR -Connect to the mining server through a proxy. -Supported schemes are: \fBhttp\fR, \fBsocks4\fR, \fBsocks5\fR. -Since libcurl 7.18.0, the following are also supported: -\fBsocks4a\fR, \fBsocks5h\fR (SOCKS5 with remote name resolving). -If no scheme is specified, the proxy is treated as an HTTP proxy. -.SH ENVIRONMENT -The following environment variables can be specified in lower case or upper case; -the lower-case version has precedence. \fBhttp_proxy\fR is an exception -as it is only available in lower case. -.PP -.RS -.TP -\fBhttp_proxy\fR [\fISCHEME\fR://]\fIHOST\fR:\fIPORT\fR -Sets the proxy server to use for HTTP. -.TP -\fBHTTPS_PROXY\fR [\fISCHEME\fR://]\fIHOST\fR:\fIPORT\fR -Sets the proxy server to use for HTTPS. -.TP -\fBALL_PROXY\fR [\fISCHEME\fR://]\fIHOST\fR:\fIPORT\fR -Sets the proxy server to use if no protocol-specific proxy is set. -.RE -.PP -Using an environment variable to set the proxy has the same effect as -using the \fB\-x\fR option. 
-.SH AUTHOR -Most of the code in the current version of minerd was written by -Pooler with contributions from others. - -The original minerd was written by Jeff Garzik . diff --git a/algo/hodl/sha512-avx.h b/algo/hodl/sha512-avx.h deleted file mode 100644 index eb7f094..0000000 --- a/algo/hodl/sha512-avx.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef _SHA512_H -#define _SHA512_H - -#include -#include "emmintrin.h" - -//SHA-512 block size -#define SHA512_BLOCK_SIZE 128 -//SHA-512 digest size -#define SHA512_DIGEST_SIZE 64 - -/* -#ifndef __AVX2__ -#ifndef __AVX__ -#error "Either AVX or AVX2 supported needed" -#endif // __AVX__ -#endif // __AVX2__ -*/ - -typedef struct -{ -#ifdef __AVX2__ - __m256i h[8]; - __m256i w[80]; -#elif defined(__SSE4_2__) -//#elif defined(__AVX__) - __m128i h[8]; - __m128i w[80]; -#else - int dummy; -#endif -} Sha512Context; - -#ifdef __AVX2__ -#define SHA512_PARALLEL_N 8 -#elif defined(__SSE4_2__) -//#elif defined(__AVX__) -#define SHA512_PARALLEL_N 4 -#else -#define SHA512_PARALLEL_N 1 // dummy value -#endif - -//SHA-512 related functions -void sha512Compute32b_parallel( - uint64_t *data[SHA512_PARALLEL_N], - uint64_t *digest[SHA512_PARALLEL_N]); - -void sha512ProcessBlock(Sha512Context *context); - -#endif diff --git a/algo/hodl/sha512_avx.c b/algo/hodl/sha512_avx.c deleted file mode 100644 index 1c7c089..0000000 --- a/algo/hodl/sha512_avx.c +++ /dev/null @@ -1,235 +0,0 @@ -#ifndef __AVX2__ - -#if defined(__SSE4_2__) -//#ifdef __AVX__ - -//Dependencies -#include -#include - -#ifdef __FreeBSD__ -#include -#endif - -#if defined(__CYGWIN__) -#include -#endif - -#include "tmmintrin.h" -#include "smmintrin.h" - -#include "sha512-avx.h" -#if ((defined(_WIN64) || defined(__WINDOWS__))) -#include "hodl-endian.h" -#endif - -//SHA-512 auxiliary functions -#define Ch(x, y, z) (((x) & (y)) | (~(x) & (z))) -#define Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z))) -#define SIGMA1(x) (ROR64(x, 28) ^ ROR64(x, 34) ^ ROR64(x, 39)) -#define SIGMA2(x) 
(ROR64(x, 14) ^ ROR64(x, 18) ^ ROR64(x, 41)) -#define SIGMA3(x) (ROR64(x, 1) ^ ROR64(x, 8) ^ SHR64(x, 7)) -#define SIGMA4(x) (ROR64(x, 19) ^ ROR64(x, 61) ^ SHR64(x, 6)) - -//Rotate right operation -#define ROR64(a, n) _mm_or_si128(_mm_srli_epi64(a, n), _mm_slli_epi64(a, 64 - n)) - -//Shift right operation -#define SHR64(a, n) _mm_srli_epi64(a, n) - -__m128i mm_htobe_epi64(__m128i a) { - __m128i mask = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); - return _mm_shuffle_epi8(a, mask); -} - -__m128i mm_betoh_epi64(__m128i a) { - return mm_htobe_epi64(a); -} - -//SHA-512 padding -static const uint8_t padding[128] = -{ - 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 -}; - -//SHA-512 constants -static const uint64_t k[80] = -{ - 0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC, - 0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118, - 0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2, - 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694, - 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65, - 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5, - 
0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4, - 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70, - 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF, - 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B, - 0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30, - 0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8, - 0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8, - 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3, - 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC, - 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B, - 0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178, - 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B, - 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C, - 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817 -}; - - -void sha512Compute32b_parallel(uint64_t *data[SHA512_PARALLEL_N], uint64_t *digest[SHA512_PARALLEL_N]) { - Sha512Context context[2]; - context[0].h[0] = _mm_set1_epi64x(0x6A09E667F3BCC908); - context[0].h[1] = _mm_set1_epi64x(0xBB67AE8584CAA73B); - context[0].h[2] = _mm_set1_epi64x(0x3C6EF372FE94F82B); - context[0].h[3] = _mm_set1_epi64x(0xA54FF53A5F1D36F1); - context[0].h[4] = _mm_set1_epi64x(0x510E527FADE682D1); - context[0].h[5] = _mm_set1_epi64x(0x9B05688C2B3E6C1F); - context[0].h[6] = _mm_set1_epi64x(0x1F83D9ABFB41BD6B); - context[0].h[7] = _mm_set1_epi64x(0x5BE0CD19137E2179); - - context[1].h[0] = _mm_set1_epi64x(0x6A09E667F3BCC908); - context[1].h[1] = _mm_set1_epi64x(0xBB67AE8584CAA73B); - context[1].h[2] = _mm_set1_epi64x(0x3C6EF372FE94F82B); - context[1].h[3] = _mm_set1_epi64x(0xA54FF53A5F1D36F1); - context[1].h[4] = 
_mm_set1_epi64x(0x510E527FADE682D1); - context[1].h[5] = _mm_set1_epi64x(0x9B05688C2B3E6C1F); - context[1].h[6] = _mm_set1_epi64x(0x1F83D9ABFB41BD6B); - context[1].h[7] = _mm_set1_epi64x(0x5BE0CD19137E2179); - - for(int i=0; i<4; ++i) { - context[0].w[i] = _mm_set_epi64x ( data[1][i], data[0][i] ); - context[1].w[i] = _mm_set_epi64x ( data[3][i], data[2][i] ); - } - for(int i=0; i<10; ++i) { - context[0].w[i+4] = _mm_set1_epi64x( ((uint64_t*)padding)[i] ); - context[1].w[i+4] = _mm_set1_epi64x( ((uint64_t*)padding)[i] ); - } - - //Length of the original message (before padding) - uint64_t totalSize = 32 * 8; - - //Append the length of the original message - context[0].w[14] = _mm_set1_epi64x(0); - context[0].w[15] = _mm_set1_epi64x(htobe64(totalSize)); - - context[1].w[14] = _mm_set1_epi64x(0); - context[1].w[15] = _mm_set1_epi64x(htobe64(totalSize)); - - //Calculate the message digest - sha512ProcessBlock(context); - - //Convert from host byte order to big-endian byte order - for (int i = 0; i < 8; i++) { - context[0].h[i] = mm_htobe_epi64(context[0].h[i]); - context[1].h[i] = mm_htobe_epi64(context[1].h[i]); - } - - //Copy the resulting digest - for(int i=0; i<8; ++i) { - digest[0][i] = _mm_extract_epi64(context[0].h[i], 0); - digest[1][i] = _mm_extract_epi64(context[0].h[i], 1); - digest[2][i] = _mm_extract_epi64(context[1].h[i], 0); - digest[3][i] = _mm_extract_epi64(context[1].h[i], 1); - } -} - -#define blk0(n, i) (block[n][i] = mm_betoh_epi64(block[n][i])) -#define blk(n, i) (block[n][i] = block[n][i - 16] + SIGMA3(block[n][i - 15]) + \ - SIGMA4(block[n][i - 2]) + block[n][i - 7]) - -#define ROUND512(a,b,c,d,e,f,g,h) \ - T0 += (h[0]) + SIGMA2(e[0]) + Ch((e[0]), (f[0]), (g[0])) + k[i]; \ - T1 += (h[1]) + SIGMA2(e[1]) + Ch((e[1]), (f[1]), (g[1])) + k[i]; \ - (d[0]) += T0; \ - (d[1]) += T1; \ - (h[0]) = T0 + SIGMA1(a[0]) + Maj((a[0]), (b[0]), (c[0])); \ - (h[1]) = T1 + SIGMA1(a[1]) + Maj((a[1]), (b[1]), (c[1])); \ - i++ - -#define 
ROUND512_0_TO_15(a,b,c,d,e,f,g,h) \ - T0 = blk0(0, i); \ - T1 = blk0(1, i); \ - ROUND512(a,b,c,d,e,f,g,h) - -#define ROUND512_16_TO_80(a,b,c,d,e,f,g,h) \ - T0 = blk(0, i); \ - T1 = blk(1, i); \ - ROUND512(a,b,c,d,e,f,g,h) - -#define R512_0 \ - ROUND512_0_TO_15(a, b, c, d, e, f, g, h); \ - ROUND512_0_TO_15(h, a, b, c, d, e, f, g); \ - ROUND512_0_TO_15(g, h, a, b, c, d, e, f); \ - ROUND512_0_TO_15(f, g, h, a, b, c, d, e); \ - ROUND512_0_TO_15(e, f, g, h, a, b, c, d); \ - ROUND512_0_TO_15(d, e, f, g, h, a, b, c); \ - ROUND512_0_TO_15(c, d, e, f, g, h, a, b); \ - ROUND512_0_TO_15(b, c, d, e, f, g, h, a) - -#define R512_16 \ - ROUND512_16_TO_80(a, b, c, d, e, f, g, h); \ - ROUND512_16_TO_80(h, a, b, c, d, e, f, g); \ - ROUND512_16_TO_80(g, h, a, b, c, d, e, f); \ - ROUND512_16_TO_80(f, g, h, a, b, c, d, e); \ - ROUND512_16_TO_80(e, f, g, h, a, b, c, d); \ - ROUND512_16_TO_80(d, e, f, g, h, a, b, c); \ - ROUND512_16_TO_80(c, d, e, f, g, h, a, b); \ - ROUND512_16_TO_80(b, c, d, e, f, g, h, a) - -#define INIT(x,n) \ - x[0] = context[0].h[n]; \ - x[1] = context[1].h[n]; \ - -void sha512ProcessBlock(Sha512Context context[2]) -{ - __m128i* block[2]; - block[0] = context[0].w; - block[1] = context[1].w; - - __m128i T0, T1; - __m128i a[2], b[2], c[2], d[2], e[2], f[2], g[2], h[2]; - INIT(a, 0) - INIT(b, 1) - INIT(c, 2) - INIT(d, 3) - INIT(e, 4) - INIT(f, 5) - INIT(g, 6) - INIT(h, 7) - - int i = 0; - R512_0; R512_0; - for(int j=0; j<8; ++j) { - R512_16; - } - - context[0].h[0] += a[0]; - context[0].h[1] += b[0]; - context[0].h[2] += c[0]; - context[0].h[3] += d[0]; - context[0].h[4] += e[0]; - context[0].h[5] += f[0]; - context[0].h[6] += g[0]; - context[0].h[7] += h[0]; - - context[1].h[0] += a[1]; - context[1].h[1] += b[1]; - context[1].h[2] += c[1]; - context[1].h[3] += d[1]; - context[1].h[4] += e[1]; - context[1].h[5] += f[1]; - context[1].h[6] += g[1]; - context[1].h[7] += h[1]; -} - -#endif // __AVX__ -#endif // __AVX2__ diff --git a/algo/hodl/sha512_avx2.c 
b/algo/hodl/sha512_avx2.c deleted file mode 100644 index 58e421c..0000000 --- a/algo/hodl/sha512_avx2.c +++ /dev/null @@ -1,241 +0,0 @@ -#ifdef __AVX2__ - -//Dependencies -#include -#include - -#ifdef __FreeBSD__ -#include -#endif - -#if defined(__CYGWIN__) -#include -#endif - -#include "tmmintrin.h" -#include "smmintrin.h" -#include "immintrin.h" - -#include "sha512-avx.h" -#if ((defined(_WIN64) || defined(__WINDOWS__))) -#include "hodl-endian.h" -#endif - -//SHA-512 auxiliary functions -#define Ch(x, y, z) (((x) & (y)) | (~(x) & (z))) -#define Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z))) -#define SIGMA1(x) (ROR64(x, 28) ^ ROR64(x, 34) ^ ROR64(x, 39)) -#define SIGMA2(x) (ROR64(x, 14) ^ ROR64(x, 18) ^ ROR64(x, 41)) -#define SIGMA3(x) (ROR64(x, 1) ^ ROR64(x, 8) ^ SHR64(x, 7)) -#define SIGMA4(x) (ROR64(x, 19) ^ ROR64(x, 61) ^ SHR64(x, 6)) - -//Rotate right operation -#define ROR64(a, n) _mm256_or_si256(_mm256_srli_epi64(a, n), _mm256_slli_epi64(a, 64 - n)) - -//Shift right operation -#define SHR64(a, n) _mm256_srli_epi64(a, n) - -__m256i mm256_htobe_epi64(__m256i a) { - __m256i mask = _mm256_set_epi8( - 24,25,26,27,28,29,30,31, - 16,17,18,19,20,21,22,23, - 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7); - return _mm256_shuffle_epi8(a, mask); -} - -__m256i mm256_betoh_epi64(__m256i a) { - return mm256_htobe_epi64(a); -} - -//SHA-512 padding -static const uint8_t padding[128] = -{ - 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 -}; - -//SHA-512 constants -static const uint64_t k[80] = -{ - 0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC, - 0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118, - 0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2, - 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694, - 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65, - 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5, - 0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4, - 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70, - 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF, - 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B, - 0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30, - 0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8, - 0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8, - 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3, - 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC, - 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B, - 0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178, - 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B, - 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C, - 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817 -}; - - -void sha512Compute32b_parallel(uint64_t *data[SHA512_PARALLEL_N], uint64_t 
*digest[SHA512_PARALLEL_N]) { - Sha512Context context[2]; - context[0].h[0] = _mm256_set1_epi64x(0x6A09E667F3BCC908); - context[0].h[1] = _mm256_set1_epi64x(0xBB67AE8584CAA73B); - context[0].h[2] = _mm256_set1_epi64x(0x3C6EF372FE94F82B); - context[0].h[3] = _mm256_set1_epi64x(0xA54FF53A5F1D36F1); - context[0].h[4] = _mm256_set1_epi64x(0x510E527FADE682D1); - context[0].h[5] = _mm256_set1_epi64x(0x9B05688C2B3E6C1F); - context[0].h[6] = _mm256_set1_epi64x(0x1F83D9ABFB41BD6B); - context[0].h[7] = _mm256_set1_epi64x(0x5BE0CD19137E2179); - - context[1].h[0] = _mm256_set1_epi64x(0x6A09E667F3BCC908); - context[1].h[1] = _mm256_set1_epi64x(0xBB67AE8584CAA73B); - context[1].h[2] = _mm256_set1_epi64x(0x3C6EF372FE94F82B); - context[1].h[3] = _mm256_set1_epi64x(0xA54FF53A5F1D36F1); - context[1].h[4] = _mm256_set1_epi64x(0x510E527FADE682D1); - context[1].h[5] = _mm256_set1_epi64x(0x9B05688C2B3E6C1F); - context[1].h[6] = _mm256_set1_epi64x(0x1F83D9ABFB41BD6B); - context[1].h[7] = _mm256_set1_epi64x(0x5BE0CD19137E2179); - - for(int i=0; i<4; ++i) { - context[0].w[i] = _mm256_set_epi64x ( data[3][i], data[2][i], data[1][i], data[0][i] ); - context[1].w[i] = _mm256_set_epi64x ( data[7][i], data[6][i], data[5][i], data[4][i] ); - } - for(int i=0; i<10; ++i) { - context[0].w[i+4] = _mm256_set1_epi64x( ((uint64_t*)padding)[i] ); - context[1].w[i+4] = _mm256_set1_epi64x( ((uint64_t*)padding)[i] ); - } - - //Length of the original message (before padding) - uint64_t totalSize = 32 * 8; - - //Append the length of the original message - context[0].w[14] = _mm256_set1_epi64x(0); - context[0].w[15] = _mm256_set1_epi64x(htobe64(totalSize)); - - context[1].w[14] = _mm256_set1_epi64x(0); - context[1].w[15] = _mm256_set1_epi64x(htobe64(totalSize)); - - //Calculate the message digest - sha512ProcessBlock(context); - - //Convert from host byte order to big-endian byte order - for (int i = 0; i < 8; i++) { - context[0].h[i] = mm256_htobe_epi64(context[0].h[i]); - context[1].h[i] = 
mm256_htobe_epi64(context[1].h[i]); - } - - //Copy the resulting digest - for(int i=0; i<8; ++i) { - digest[0][i] = _mm256_extract_epi64(context[0].h[i], 0); - digest[1][i] = _mm256_extract_epi64(context[0].h[i], 1); - digest[2][i] = _mm256_extract_epi64(context[0].h[i], 2); - digest[3][i] = _mm256_extract_epi64(context[0].h[i], 3); - - digest[4][i] = _mm256_extract_epi64(context[1].h[i], 0); - digest[5][i] = _mm256_extract_epi64(context[1].h[i], 1); - digest[6][i] = _mm256_extract_epi64(context[1].h[i], 2); - digest[7][i] = _mm256_extract_epi64(context[1].h[i], 3); - } -} - -#define blk0(n, i) (block[n][i] = mm256_betoh_epi64(block[n][i])) -#define blk(n, i) (block[n][i] = block[n][i - 16] + SIGMA3(block[n][i - 15]) + \ - SIGMA4(block[n][i - 2]) + block[n][i - 7]) - -#define ROUND512(a,b,c,d,e,f,g,h) \ - T0 += (h[0]) + SIGMA2(e[0]) + Ch((e[0]), (f[0]), (g[0])) + k[i]; \ - T1 += (h[1]) + SIGMA2(e[1]) + Ch((e[1]), (f[1]), (g[1])) + k[i]; \ - (d[0]) += T0; \ - (d[1]) += T1; \ - (h[0]) = T0 + SIGMA1(a[0]) + Maj((a[0]), (b[0]), (c[0])); \ - (h[1]) = T1 + SIGMA1(a[1]) + Maj((a[1]), (b[1]), (c[1])); \ - i++ - -#define ROUND512_0_TO_15(a,b,c,d,e,f,g,h) \ - T0 = blk0(0, i); \ - T1 = blk0(1, i); \ - ROUND512(a,b,c,d,e,f,g,h) - -#define ROUND512_16_TO_80(a,b,c,d,e,f,g,h) \ - T0 = blk(0, i); \ - T1 = blk(1, i); \ - ROUND512(a,b,c,d,e,f,g,h) - -#define R512_0 \ - ROUND512_0_TO_15(a, b, c, d, e, f, g, h); \ - ROUND512_0_TO_15(h, a, b, c, d, e, f, g); \ - ROUND512_0_TO_15(g, h, a, b, c, d, e, f); \ - ROUND512_0_TO_15(f, g, h, a, b, c, d, e); \ - ROUND512_0_TO_15(e, f, g, h, a, b, c, d); \ - ROUND512_0_TO_15(d, e, f, g, h, a, b, c); \ - ROUND512_0_TO_15(c, d, e, f, g, h, a, b); \ - ROUND512_0_TO_15(b, c, d, e, f, g, h, a) - -#define R512_16 \ - ROUND512_16_TO_80(a, b, c, d, e, f, g, h); \ - ROUND512_16_TO_80(h, a, b, c, d, e, f, g); \ - ROUND512_16_TO_80(g, h, a, b, c, d, e, f); \ - ROUND512_16_TO_80(f, g, h, a, b, c, d, e); \ - ROUND512_16_TO_80(e, f, g, h, a, b, c, d); \ - 
ROUND512_16_TO_80(d, e, f, g, h, a, b, c); \ - ROUND512_16_TO_80(c, d, e, f, g, h, a, b); \ - ROUND512_16_TO_80(b, c, d, e, f, g, h, a) - -#define INIT(x,n) \ - x[0] = context[0].h[n]; \ - x[1] = context[1].h[n]; \ - -void sha512ProcessBlock(Sha512Context context[2]) -{ - __m256i* block[2]; - block[0] = context[0].w; - block[1] = context[1].w; - - __m256i T0, T1; - __m256i a[2], b[2], c[2], d[2], e[2], f[2], g[2], h[2]; - INIT(a, 0) - INIT(b, 1) - INIT(c, 2) - INIT(d, 3) - INIT(e, 4) - INIT(f, 5) - INIT(g, 6) - INIT(h, 7) - - int i = 0; - R512_0; R512_0; - for(int j=0; j<8; ++j) { - R512_16; - } - - context[0].h[0] += a[0]; - context[0].h[1] += b[0]; - context[0].h[2] += c[0]; - context[0].h[3] += d[0]; - context[0].h[4] += e[0]; - context[0].h[5] += f[0]; - context[0].h[6] += g[0]; - context[0].h[7] += h[0]; - - context[1].h[0] += a[1]; - context[1].h[1] += b[1]; - context[1].h[2] += c[1]; - context[1].h[3] += d[1]; - context[1].h[4] += e[1]; - context[1].h[5] += f[1]; - context[1].h[6] += g[1]; - context[1].h[7] += h[1]; -} - -#endif // __AVX2__ diff --git a/algo/hodl/wolf-aes.h b/algo/hodl/wolf-aes.h deleted file mode 100644 index b33407f..0000000 --- a/algo/hodl/wolf-aes.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef __WOLF_AES_H -#define __WOLF_AES_H - -#include -#include - -void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf); - -#if defined(__SSE4_2__) -//#ifdef __AVX__ - -#define AES_PARALLEL_N 8 -#define BLOCK_COUNT 256 - -void AES256CBC( __m128i** data, const __m128i** next, __m128i ExpandedKey[][16], - __m128i* IV ); - -#else - -void AES256CBC( __m128i *Ciphertext, const __m128i *Plaintext, - const __m128i *ExpandedKey, __m128i IV, uint32_t BlockCount ); - -#endif - -#endif // __WOLF_AES_H diff --git a/algo/jh/jh-hash-4way.c b/algo/jh/jh-hash-4way.c deleted file mode 100644 index 111e5f3..0000000 --- a/algo/jh/jh-hash-4way.c +++ /dev/null @@ -1,609 +0,0 @@ -/* $Id: jh.c 255 2011-06-07 19:50:20Z tp $ */ -/* - * JH implementation. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#ifdef __AVX2__ - -#include -#include - -#include "jh-hash-4way.h" - -#ifdef __cplusplus -extern "C"{ -#endif - - -#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_JH -#define SPH_SMALL_FOOTPRINT_JH 1 -#endif - -#if !defined SPH_JH_64 && SPH_64_TRUE -#define SPH_JH_64 1 -#endif - -#if !SPH_64 -#undef SPH_JH_64 -#endif - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -/* - * The internal bitslice representation may use either big-endian or - * little-endian (true bitslice operations do not care about the bit - * ordering, and the bit-swapping linear operations in JH happen to - * be invariant through endianness-swapping). 
The constants must be - * defined according to the chosen endianness; we use some - * byte-swapping macros for that. - */ - -#if SPH_LITTLE_ENDIAN - -#if SPH_64 -#define C64e(x) ((SPH_C64(x) >> 56) \ - | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \ - | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \ - | ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \ - | ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \ - | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \ - | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \ - | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000))) -#define dec64e_aligned sph_dec64le_aligned -#define enc64e sph_enc64le -#endif - -#else - -#if SPH_64 -#define C64e(x) SPH_C64(x) -#define dec64e_aligned sph_dec64be_aligned -#define enc64e sph_enc64be -#endif - -#endif - -#define Sb(x0, x1, x2, x3, c) \ -do { \ - __m256i cc = _mm256_set_epi64x( c, c, c, c ); \ - x3 = mm256_not( x3 ); \ - x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \ - tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \ - x0 = _mm256_xor_si256( x0, _mm256_and_si256( x2, x3 ) ); \ - x3 = _mm256_xor_si256( x3, _mm256_andnot_si256( x1, x2 ) ); \ - x1 = _mm256_xor_si256( x1, _mm256_and_si256( x0, x2 ) ); \ - x2 = _mm256_xor_si256( x2, _mm256_andnot_si256( x3, x0 ) ); \ - x0 = _mm256_xor_si256( x0, _mm256_or_si256( x1, x3 ) ); \ - x3 = _mm256_xor_si256( x3, _mm256_and_si256( x1, x2 ) ); \ - x1 = _mm256_xor_si256( x1, _mm256_and_si256( tmp, x0 ) ); \ - x2 = _mm256_xor_si256( x2, tmp ); \ -} while (0) - -#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) \ -do { \ - x4 = _mm256_xor_si256( x4, x1 ); \ - x5 = _mm256_xor_si256( x5, x2 ); \ - x6 = _mm256_xor_si256( x6, _mm256_xor_si256( x3, x0 ) ); \ - x7 = _mm256_xor_si256( x7, x0 ); \ - x0 = _mm256_xor_si256( x0, x5 ); \ - x1 = _mm256_xor_si256( x1, x6 ); \ - x2 = _mm256_xor_si256( x2, _mm256_xor_si256( x7, x4 ) ); \ - x3 = _mm256_xor_si256( x3, x4 ); \ -} while (0) - -#if SPH_JH_64 - -static const sph_u64 
C[] = { - C64e(0x72d5dea2df15f867), C64e(0x7b84150ab7231557), - C64e(0x81abd6904d5a87f6), C64e(0x4e9f4fc5c3d12b40), - C64e(0xea983ae05c45fa9c), C64e(0x03c5d29966b2999a), - C64e(0x660296b4f2bb538a), C64e(0xb556141a88dba231), - C64e(0x03a35a5c9a190edb), C64e(0x403fb20a87c14410), - C64e(0x1c051980849e951d), C64e(0x6f33ebad5ee7cddc), - C64e(0x10ba139202bf6b41), C64e(0xdc786515f7bb27d0), - C64e(0x0a2c813937aa7850), C64e(0x3f1abfd2410091d3), - C64e(0x422d5a0df6cc7e90), C64e(0xdd629f9c92c097ce), - C64e(0x185ca70bc72b44ac), C64e(0xd1df65d663c6fc23), - C64e(0x976e6c039ee0b81a), C64e(0x2105457e446ceca8), - C64e(0xeef103bb5d8e61fa), C64e(0xfd9697b294838197), - C64e(0x4a8e8537db03302f), C64e(0x2a678d2dfb9f6a95), - C64e(0x8afe7381f8b8696c), C64e(0x8ac77246c07f4214), - C64e(0xc5f4158fbdc75ec4), C64e(0x75446fa78f11bb80), - C64e(0x52de75b7aee488bc), C64e(0x82b8001e98a6a3f4), - C64e(0x8ef48f33a9a36315), C64e(0xaa5f5624d5b7f989), - C64e(0xb6f1ed207c5ae0fd), C64e(0x36cae95a06422c36), - C64e(0xce2935434efe983d), C64e(0x533af974739a4ba7), - C64e(0xd0f51f596f4e8186), C64e(0x0e9dad81afd85a9f), - C64e(0xa7050667ee34626a), C64e(0x8b0b28be6eb91727), - C64e(0x47740726c680103f), C64e(0xe0a07e6fc67e487b), - C64e(0x0d550aa54af8a4c0), C64e(0x91e3e79f978ef19e), - C64e(0x8676728150608dd4), C64e(0x7e9e5a41f3e5b062), - C64e(0xfc9f1fec4054207a), C64e(0xe3e41a00cef4c984), - C64e(0x4fd794f59dfa95d8), C64e(0x552e7e1124c354a5), - C64e(0x5bdf7228bdfe6e28), C64e(0x78f57fe20fa5c4b2), - C64e(0x05897cefee49d32e), C64e(0x447e9385eb28597f), - C64e(0x705f6937b324314a), C64e(0x5e8628f11dd6e465), - C64e(0xc71b770451b920e7), C64e(0x74fe43e823d4878a), - C64e(0x7d29e8a3927694f2), C64e(0xddcb7a099b30d9c1), - C64e(0x1d1b30fb5bdc1be0), C64e(0xda24494ff29c82bf), - C64e(0xa4e7ba31b470bfff), C64e(0x0d324405def8bc48), - C64e(0x3baefc3253bbd339), C64e(0x459fc3c1e0298ba0), - C64e(0xe5c905fdf7ae090f), C64e(0x947034124290f134), - C64e(0xa271b701e344ed95), C64e(0xe93b8e364f2f984a), - C64e(0x88401d63a06cf615), 
C64e(0x47c1444b8752afff), - C64e(0x7ebb4af1e20ac630), C64e(0x4670b6c5cc6e8ce6), - C64e(0xa4d5a456bd4fca00), C64e(0xda9d844bc83e18ae), - C64e(0x7357ce453064d1ad), C64e(0xe8a6ce68145c2567), - C64e(0xa3da8cf2cb0ee116), C64e(0x33e906589a94999a), - C64e(0x1f60b220c26f847b), C64e(0xd1ceac7fa0d18518), - C64e(0x32595ba18ddd19d3), C64e(0x509a1cc0aaa5b446), - C64e(0x9f3d6367e4046bba), C64e(0xf6ca19ab0b56ee7e), - C64e(0x1fb179eaa9282174), C64e(0xe9bdf7353b3651ee), - C64e(0x1d57ac5a7550d376), C64e(0x3a46c2fea37d7001), - C64e(0xf735c1af98a4d842), C64e(0x78edec209e6b6779), - C64e(0x41836315ea3adba8), C64e(0xfac33b4d32832c83), - C64e(0xa7403b1f1c2747f3), C64e(0x5940f034b72d769a), - C64e(0xe73e4e6cd2214ffd), C64e(0xb8fd8d39dc5759ef), - C64e(0x8d9b0c492b49ebda), C64e(0x5ba2d74968f3700d), - C64e(0x7d3baed07a8d5584), C64e(0xf5a5e9f0e4f88e65), - C64e(0xa0b8a2f436103b53), C64e(0x0ca8079e753eec5a), - C64e(0x9168949256e8884f), C64e(0x5bb05c55f8babc4c), - C64e(0xe3bb3b99f387947b), C64e(0x75daf4d6726b1c5d), - C64e(0x64aeac28dc34b36d), C64e(0x6c34a550b828db71), - C64e(0xf861e2f2108d512a), C64e(0xe3db643359dd75fc), - C64e(0x1cacbcf143ce3fa2), C64e(0x67bbd13c02e843b0), - C64e(0x330a5bca8829a175), C64e(0x7f34194db416535c), - C64e(0x923b94c30e794d1e), C64e(0x797475d7b6eeaf3f), - C64e(0xeaa8d4f7be1a3921), C64e(0x5cf47e094c232751), - C64e(0x26a32453ba323cd2), C64e(0x44a3174a6da6d5ad), - C64e(0xb51d3ea6aff2c908), C64e(0x83593d98916b3c56), - C64e(0x4cf87ca17286604d), C64e(0x46e23ecc086ec7f6), - C64e(0x2f9833b3b1bc765e), C64e(0x2bd666a5efc4e62a), - C64e(0x06f4b6e8bec1d436), C64e(0x74ee8215bcef2163), - C64e(0xfdc14e0df453c969), C64e(0xa77d5ac406585826), - C64e(0x7ec1141606e0fa16), C64e(0x7e90af3d28639d3f), - C64e(0xd2c9f2e3009bd20c), C64e(0x5faace30b7d40c30), - C64e(0x742a5116f2e03298), C64e(0x0deb30d8e3cef89a), - C64e(0x4bc59e7bb5f17992), C64e(0xff51e66e048668d3), - C64e(0x9b234d57e6966731), C64e(0xcce6a6f3170a7505), - C64e(0xb17681d913326cce), C64e(0x3c175284f805a262), - C64e(0xf42bcbb378471547), 
C64e(0xff46548223936a48), - C64e(0x38df58074e5e6565), C64e(0xf2fc7c89fc86508e), - C64e(0x31702e44d00bca86), C64e(0xf04009a23078474e), - C64e(0x65a0ee39d1f73883), C64e(0xf75ee937e42c3abd), - C64e(0x2197b2260113f86f), C64e(0xa344edd1ef9fdee7), - C64e(0x8ba0df15762592d9), C64e(0x3c85f7f612dc42be), - C64e(0xd8a7ec7cab27b07e), C64e(0x538d7ddaaa3ea8de), - C64e(0xaa25ce93bd0269d8), C64e(0x5af643fd1a7308f9), - C64e(0xc05fefda174a19a5), C64e(0x974d66334cfd216a), - C64e(0x35b49831db411570), C64e(0xea1e0fbbedcd549b), - C64e(0x9ad063a151974072), C64e(0xf6759dbf91476fe2) -}; - -#define Ceven_hi(r) (C[((r) << 2) + 0]) -#define Ceven_lo(r) (C[((r) << 2) + 1]) -#define Codd_hi(r) (C[((r) << 2) + 2]) -#define Codd_lo(r) (C[((r) << 2) + 3]) - -#define S(x0, x1, x2, x3, cb, r) do { \ - Sb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \ - Sb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \ - } while (0) - -#define L(x0, x1, x2, x3, x4, x5, x6, x7) do { \ - Lb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \ - x4 ## h, x5 ## h, x6 ## h, x7 ## h); \ - Lb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \ - x4 ## l, x5 ## l, x6 ## l, x7 ## l); \ - } while (0) - - -#define Wz(x, c, n) \ -do { \ - __m256i t = _mm256_slli_epi64( _mm256_and_si256(x ## h, (c)), (n) ); \ - x ## h = _mm256_or_si256( _mm256_and_si256( \ - _mm256_srli_epi64(x ## h, (n)), (c)), t ); \ - t = _mm256_slli_epi64( _mm256_and_si256(x ## l, (c)), (n) ); \ - x ## l = _mm256_or_si256( _mm256_and_si256((x ## l >> (n)), (c)), t ); \ -} while (0) - - -/* -#define Wz(x, c, n) do { \ - sph_u64 t = (x ## h & (c)) << (n); \ - x ## h = ((x ## h >> (n)) & (c)) | t; \ - t = (x ## l & (c)) << (n); \ - x ## l = ((x ## l >> (n)) & (c)) | t; \ - } while (0) -*/ - -#define W0(x) Wz(x, _mm256_set_epi64x( 0x5555555555555555, \ - 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 ), 1 ) -#define W1(x) Wz(x, _mm256_set_epi64x( 0x3333333333333333, \ - 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 ), 2 ) -#define W2(x) Wz(x, 
_mm256_set_epi64x( 0x0F0F0F0F0F0F0F0F, \ - 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F ), 4 ) -#define W3(x) Wz(x, _mm256_set_epi64x( 0x00FF00FF00FF00FF, \ - 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF ), 8 ) -#define W4(x) Wz(x, _mm256_set_epi64x( 0x0000FFFF0000FFFF, \ - 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF ), 16 ) -#define W5(x) Wz(x, _mm256_set_epi64x( 0x00000000FFFFFFFF, \ - 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF ), 32 ) -#define W6(x) \ -do { \ - __m256i t = x ## h; \ - x ## h = x ## l; \ - x ## l = t; \ -} while (0) - -/* -#define W0(x) Wz(x, SPH_C64(0x5555555555555555), 1) -#define W1(x) Wz(x, SPH_C64(0x3333333333333333), 2) -#define W2(x) Wz(x, SPH_C64(0x0F0F0F0F0F0F0F0F), 4) -#define W3(x) Wz(x, SPH_C64(0x00FF00FF00FF00FF), 8) -#define W4(x) Wz(x, SPH_C64(0x0000FFFF0000FFFF), 16) -#define W5(x) Wz(x, SPH_C64(0x00000000FFFFFFFF), 32) -#define W6(x) do { \ - sph_u64 t = x ## h; \ - x ## h = x ## l; \ - x ## l = t; \ - } while (0) -*/ - -#define DECL_STATE \ - __m256i h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \ - __m256i h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \ - __m256i tmp; - -#define READ_STATE(state) do { \ - h0h = (state)->H[ 0]; \ - h0l = (state)->H[ 1]; \ - h1h = (state)->H[ 2]; \ - h1l = (state)->H[ 3]; \ - h2h = (state)->H[ 4]; \ - h2l = (state)->H[ 5]; \ - h3h = (state)->H[ 6]; \ - h3l = (state)->H[ 7]; \ - h4h = (state)->H[ 8]; \ - h4l = (state)->H[ 9]; \ - h5h = (state)->H[10]; \ - h5l = (state)->H[11]; \ - h6h = (state)->H[12]; \ - h6l = (state)->H[13]; \ - h7h = (state)->H[14]; \ - h7l = (state)->H[15]; \ - } while (0) - -#define WRITE_STATE(state) do { \ - (state)->H[ 0] = h0h; \ - (state)->H[ 1] = h0l; \ - (state)->H[ 2] = h1h; \ - (state)->H[ 3] = h1l; \ - (state)->H[ 4] = h2h; \ - (state)->H[ 5] = h2l; \ - (state)->H[ 6] = h3h; \ - (state)->H[ 7] = h3l; \ - (state)->H[ 8] = h4h; \ - (state)->H[ 9] = h4l; \ - (state)->H[10] = h5h; \ - (state)->H[11] = h5l; \ - 
(state)->H[12] = h6h; \ - (state)->H[13] = h6l; \ - (state)->H[14] = h7h; \ - (state)->H[15] = h7l; \ - } while (0) - -#define INPUT_BUF1 \ - __m256i m0h = buf[0]; \ - __m256i m0l = buf[1]; \ - __m256i m1h = buf[2]; \ - __m256i m1l = buf[3]; \ - __m256i m2h = buf[4]; \ - __m256i m2l = buf[5]; \ - __m256i m3h = buf[6]; \ - __m256i m3l = buf[7]; \ - h0h = _mm256_xor_si256( h0h, m0h ); \ - h0l = _mm256_xor_si256( h0l, m0l ); \ - h1h = _mm256_xor_si256( h1h, m1h ); \ - h1l = _mm256_xor_si256( h1l, m1l ); \ - h2h = _mm256_xor_si256( h2h, m2h ); \ - h2l = _mm256_xor_si256( h2l, m2l ); \ - h3h = _mm256_xor_si256( h3h, m3h ); \ - h3l = _mm256_xor_si256( h3l, m3l ); \ - -#define INPUT_BUF2 \ - h4h = _mm256_xor_si256( h4h, m0h ); \ - h4l = _mm256_xor_si256( h4l, m0l ); \ - h5h = _mm256_xor_si256( h5h, m1h ); \ - h5l = _mm256_xor_si256( h5l, m1l ); \ - h6h = _mm256_xor_si256( h6h, m2h ); \ - h6l = _mm256_xor_si256( h6l, m2l ); \ - h7h = _mm256_xor_si256( h7h, m3h ); \ - h7l = _mm256_xor_si256( h7l, m3l ); \ - -static const sph_u64 IV256[] = { - C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1), - C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03), - C64e(0xa4239e267726b945), C64e(0xe0fb1a48d41a9477), - C64e(0xcdb5ab26026b177a), C64e(0x56f024420fff2fa8), - C64e(0x71a396897f2e4d75), C64e(0x1d144908f77de262), - C64e(0x277695f776248f94), C64e(0x87d5b6574780296c), - C64e(0x5c5e272dac8e0d6c), C64e(0x518450c657057a0f), - C64e(0x7be4d367702412ea), C64e(0x89e3ab13d31cd769) -}; - - -static const sph_u64 IV512[] = { - C64e(0x6fd14b963e00aa17), C64e(0x636a2e057a15d543), - C64e(0x8a225e8d0c97ef0b), C64e(0xe9341259f2b3c361), - C64e(0x891da0c1536f801e), C64e(0x2aa9056bea2b6d80), - C64e(0x588eccdb2075baa6), C64e(0xa90f3a76baf83bf7), - C64e(0x0169e60541e34a69), C64e(0x46b58a8e2e6fe65a), - C64e(0x1047a7d0c1843c24), C64e(0x3b6e71b12d5ac199), - C64e(0xcf57f6ec9db1f856), C64e(0xa706887c5716b156), - C64e(0xe3c2fcdfe68517fb), C64e(0x545a4678cc8cdd4b) -}; - -#else - - -#endif - -#define SL(ro) 
SLu(r + ro, ro) - -#define SLu(r, ro) do { \ - S(h0, h2, h4, h6, Ceven_, r); \ - S(h1, h3, h5, h7, Codd_, r); \ - L(h0, h2, h4, h6, h1, h3, h5, h7); \ - W ## ro(h1); \ - W ## ro(h3); \ - W ## ro(h5); \ - W ## ro(h7); \ - } while (0) - -#if SPH_SMALL_FOOTPRINT_JH - -#if SPH_JH_64 - -/* - * The "small footprint" 64-bit version just uses a partially unrolled - * loop. - */ - -#define E8 do { \ - unsigned r; \ - for (r = 0; r < 42; r += 7) { \ - SL(0); \ - SL(1); \ - SL(2); \ - SL(3); \ - SL(4); \ - SL(5); \ - SL(6); \ - } \ - } while (0) - -#else - - -#endif - -#else - -#if SPH_JH_64 - -/* - * On a "true 64-bit" architecture, we can unroll at will. - */ - -#define E8 do { \ - SLu( 0, 0); \ - SLu( 1, 1); \ - SLu( 2, 2); \ - SLu( 3, 3); \ - SLu( 4, 4); \ - SLu( 5, 5); \ - SLu( 6, 6); \ - SLu( 7, 0); \ - SLu( 8, 1); \ - SLu( 9, 2); \ - SLu(10, 3); \ - SLu(11, 4); \ - SLu(12, 5); \ - SLu(13, 6); \ - SLu(14, 0); \ - SLu(15, 1); \ - SLu(16, 2); \ - SLu(17, 3); \ - SLu(18, 4); \ - SLu(19, 5); \ - SLu(20, 6); \ - SLu(21, 0); \ - SLu(22, 1); \ - SLu(23, 2); \ - SLu(24, 3); \ - SLu(25, 4); \ - SLu(26, 5); \ - SLu(27, 6); \ - SLu(28, 0); \ - SLu(29, 1); \ - SLu(30, 2); \ - SLu(31, 3); \ - SLu(32, 4); \ - SLu(33, 5); \ - SLu(34, 6); \ - SLu(35, 0); \ - SLu(36, 1); \ - SLu(37, 2); \ - SLu(38, 3); \ - SLu(39, 4); \ - SLu(40, 5); \ - SLu(41, 6); \ - } while (0) - -#else - - -#endif - -#endif - -static void -jh_4way_init( jh_4way_context *sc, const void *iv ) -{ - uint64_t *v = (uint64_t*)iv; - - for ( int i = 0; i < 16; i++ ) - sc->H[i] = _mm256_set_epi64x( v[i], v[i], v[i], v[i] ); - sc->ptr = 0; - sc->block_count = 0; -} - -static void -jh_4way_core( jh_4way_context *sc, const void *data, size_t len ) -{ - __m256i *buf; - __m256i *vdata = (__m256i*)data; - const int buf_size = 64; // 64 * _m256i - size_t ptr; - DECL_STATE - - buf = sc->buf; - ptr = sc->ptr; - - if ( len < (buf_size - ptr) ) - { - memcpy_256( buf + (ptr>>3), vdata, len>>3 ); - ptr += len; - sc->ptr = ptr; - return; 
- } - - READ_STATE(sc); - while ( len > 0 ) - { - size_t clen; - clen = buf_size - ptr; - if ( clen > len ) - clen = len; - - memcpy_256( buf + (ptr>>3), vdata, clen>>3 ); - ptr += clen; - vdata += (clen>>3); - len -= clen; - if ( ptr == buf_size ) - { - INPUT_BUF1; - E8; - INPUT_BUF2; - sc->block_count ++; - ptr = 0; - } - } - WRITE_STATE(sc); - sc->ptr = ptr; -} - -static void -jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst, - size_t out_size_w32, const void *iv ) -{ - __m256i buf[16*4]; - __m256i *dst256 = (__m256i*)dst; - size_t numz, u; - sph_u64 l0, l1, l0e, l1e; - - buf[0] = _mm256_set_epi64x( 0x80, 0x80, 0x80, 0x80 ); - - if ( sc->ptr == 0 ) - numz = 48; - else - numz = 112 - sc->ptr; - - memset_zero_256( buf+1, (numz>>3) - 1 ); - - l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3); - l1 = SPH_T64(sc->block_count >> 55); - sph_enc64be( &l0e, l0 ); - sph_enc64be( &l1e, l1 ); - *(buf + (numz>>3) ) = _mm256_set_epi64x( l1e, l1e, l1e, l1e ); - *(buf + (numz>>3) + 1) = _mm256_set_epi64x( l0e, l0e, l0e, l0e ); - - jh_4way_core( sc, buf, numz + 16 ); - - for ( u=0; u < 8; u++ ) - buf[u] = sc->H[u+8]; - - memcpy_256( dst256, buf, 8 ); -} - -void -jh256_4way_init(void *cc) -{ - jh_4way_init(cc, IV256); -} - -void -jh256_4way(void *cc, const void *data, size_t len) -{ - jh_4way_core(cc, data, len); -} - -void -jh256_4way_close(void *cc, void *dst) -{ - jh_4way_close(cc, 0, 0, dst, 8, IV256); -} - -void -jh512_4way_init(void *cc) -{ - jh_4way_init(cc, IV512); -} - -void -jh512_4way(void *cc, const void *data, size_t len) -{ - jh_4way_core(cc, data, len); -} - -void -jh512_4way_close(void *cc, void *dst) -{ - jh_4way_close(cc, 0, 0, dst, 16, IV512); -} - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/algo/jh/jh-hash-4way.h b/algo/jh/jh-hash-4way.h deleted file mode 100644 index 14ad113..0000000 --- a/algo/jh/jh-hash-4way.h +++ /dev/null @@ -1,100 +0,0 @@ -/* $Id: sph_jh.h 216 2010-06-08 09:46:57Z tp $ */ -/** - * JH interface. 
JH is a family of functions which differ by - * their output size; this implementation defines JH for output - * sizes 224, 256, 384 and 512 bits. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_jh.h - * @author Thomas Pornin - */ - -#ifndef JH_HASH_4WAY_H__ -#define JH_HASH_4WAY_H__ - -#ifdef __AVX2__ - -#ifdef __cplusplus -extern "C"{ -#endif - -#include -#include "algo/sha/sph_types.h" -#include "simd-utils.h" - -#define SPH_SIZE_jh256 256 - -#define SPH_SIZE_jh512 512 - -/** - * This structure is a context for JH computations: it contains the - * intermediate values and some data from the last entered block. Once - * a JH computation has been performed, the context can be reused for - * another computation. 
- * - * The contents of this structure are private. A running JH computation - * can be cloned by copying the context (e.g. with a simple - * memcpy()). - */ -typedef struct { - __m256i buf[8] __attribute__ ((aligned (64))); - __m256i H[16]; - size_t ptr; - uint64_t block_count; -/* - unsigned char buf[64]; - size_t ptr; - union { - sph_u64 wide[16]; - } H; - sph_u64 block_count; -*/ -} jh_4way_context; - -typedef jh_4way_context jh256_4way_context; - -typedef jh_4way_context jh512_4way_context; - -void jh256_4way_init(void *cc); - -void jh256_4way(void *cc, const void *data, size_t len); - -void jh256_4way_close(void *cc, void *dst); - -void jh512_4way_init(void *cc); - -void jh512_4way(void *cc, const void *data, size_t len); - -void jh512_4way_close(void *cc, void *dst); - -#ifdef __cplusplus -} -#endif - -#endif - -#endif diff --git a/algo/jh/jha-4way.c b/algo/jh/jha-4way.c deleted file mode 100644 index 2c76a33..0000000 --- a/algo/jh/jha-4way.c +++ /dev/null @@ -1,143 +0,0 @@ -#include "jha-gate.h" -#include -#include -#include -#include - -#if defined(JHA_4WAY) - -#include "algo/blake/blake-hash-4way.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" - -void jha_hash_4way( void *out, const void *input ) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - uint64_t vhashA[8*4] __attribute__ ((aligned (64))); - uint64_t vhashB[8*4] __attribute__ ((aligned (64))); - __m256i* vh = (__m256i*)vhash; - __m256i* vhA = (__m256i*)vhashA; - __m256i* vhB = (__m256i*)vhashB; - __m256i vh_mask; - - blake512_4way_context ctx_blake; - hashState_groestl ctx_groestl; - jh512_4way_context ctx_jh; - skein512_4way_context ctx_skein; - 
keccak512_4way_context ctx_keccak; - - keccak512_4way_init( &ctx_keccak ); - keccak512_4way( &ctx_keccak, input, 80 ); - keccak512_4way_close( &ctx_keccak, vhash ); - - // Heavy & Light Pair Loop - for ( int round = 0; round < 3; round++ ) - { - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( - vh[0], _mm256_set1_epi64x( 1 ) ), m256_zero ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - init_groestl( &ctx_groestl, 64 ); - update_and_final_groestl( &ctx_groestl, (char*)hash0, - (char*)hash0, 512 ); - init_groestl( &ctx_groestl, 64 ); - update_and_final_groestl( &ctx_groestl, (char*)hash1, - (char*)hash1, 512 ); - init_groestl( &ctx_groestl, 64 ); - update_and_final_groestl( &ctx_groestl, (char*)hash2, - (char*)hash2, 512 ); - init_groestl( &ctx_groestl, 64 ); - update_and_final_groestl( &ctx_groestl, (char*)hash3, - (char*)hash3, 512 ); - intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); - - skein512_4way_init( &ctx_skein ); - skein512_4way( &ctx_skein, vhash, 64 ); - skein512_4way_close( &ctx_skein, vhashB ); - - for ( int i = 0; i < 8; i++ ) - vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask ); - - blake512_4way_init( &ctx_blake ); - blake512_4way( &ctx_blake, vhash, 64 ); - blake512_4way_close( &ctx_blake, vhashA ); - - jh512_4way_init( &ctx_jh ); - jh512_4way( &ctx_jh, vhash, 64 ); - jh512_4way_close( &ctx_jh, vhashB ); - - for ( int i = 0; i < 8; i++ ) - casti_m256i( out, i ) = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask ); - } -} - -int scanhash_jha_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[8*4] __attribute__ ((aligned (64))); - uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[25]); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - uint32_t n = pdata[19]; - __m256i *noncev = 
(__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - - for ( int m = 0; m < 6; m++ ) - { - if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - jha_hash_4way( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) if ( !( (hash7[i] & mask ) == 0 ) ) - { - extr_lane_4x64( lane_hash, hash, i, 256 ); - if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, lane_hash, mythr, i ); - } - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } - } - *hashes_done = n - first_nonce + 1; - return 0; -} -#endif diff --git a/algo/jh/jha-gate.c b/algo/jh/jha-gate.c deleted file mode 100644 index ca3d4fa..0000000 --- a/algo/jh/jha-gate.c +++ /dev/null @@ -1,18 +0,0 @@ -#include "jha-gate.h" - - -bool register_jha_algo( algo_gate_t* gate ) -{ -#if defined (JHA_4WAY) - four_way_not_tested(); - gate->scanhash = (void*)&scanhash_jha_4way; - gate->hash = (void*)&jha_hash_4way; -#else - gate->scanhash = (void*)&scanhash_jha; - gate->hash = (void*)&jha_hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->set_target = (void*)&scrypt_set_target; - return true; -}; - diff --git a/algo/jh/jha-gate.h b/algo/jh/jha-gate.h deleted file mode 100644 index 8a0ddad..0000000 --- a/algo/jh/jha-gate.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef JHA_GATE_H__ -#define JHA_GATE_H__ - -#include "algo-gate-api.h" -#include - - -#if defined(__AVX2__) && defined(__AES__) - #define JHA_4WAY -#endif - -#if defined JHA_4WAY -void jha_hash_4way( void *state, const void *input ); - -int 
scanhash_jha_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -#endif - -void jha_hash( void *state, const void *input ); - -int scanhash_jha( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -#endif - diff --git a/algo/jh/jha.c b/algo/jh/jha.c deleted file mode 100644 index 42767e9..0000000 --- a/algo/jh/jha.c +++ /dev/null @@ -1,157 +0,0 @@ -#include "jha-gate.h" - -#include -#include -#include -#include - -#include "algo/blake/sph_blake.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" - -#ifdef NO_AES_NI - #include "algo/groestl/sph_groestl.h" -#else - #include "algo/groestl/aes_ni/hash-groestl.h" -#endif - -static __thread sph_keccak512_context jha_kec_mid __attribute__ ((aligned (64))); - -void jha_kec_midstate( const void* input ) -{ - sph_keccak512_init( &jha_kec_mid ); - sph_keccak512( &jha_kec_mid, input, 64 ); -} - -void jha_hash(void *output, const void *input) -{ - uint8_t _ALIGN(128) hash[64]; - -#ifdef NO_AES_NI - sph_groestl512_context ctx_groestl; -#else - hashState_groestl ctx_groestl; -#endif - sph_blake512_context ctx_blake; - sph_jh512_context ctx_jh; - sph_keccak512_context ctx_keccak; - sph_skein512_context ctx_skein; - - memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid ); - sph_keccak512(&ctx_keccak, input+64, 16 ); - sph_keccak512_close(&ctx_keccak, hash ); - - // Heavy & Light Pair Loop - for (int round = 0; round < 3; round++) - { - if (hash[0] & 0x01) - { -#ifdef NO_AES_NI - sph_groestl512_init(&ctx_groestl); - sph_groestl512(&ctx_groestl, hash, 64 ); - sph_groestl512_close(&ctx_groestl, hash ); -#else - init_groestl( &ctx_groestl, 64 ); - update_and_final_groestl( &ctx_groestl, (char*)hash, - (char*)hash, 512 ); -#endif - } - else - { - sph_skein512_init(&ctx_skein); - sph_skein512(&ctx_skein, hash, 64); - sph_skein512_close(&ctx_skein, hash ); - } - - if (hash[0] & 0x01) - { - 
sph_blake512_init(&ctx_blake); - sph_blake512(&ctx_blake, hash, 64); - sph_blake512_close(&ctx_blake, hash ); - } - else - { - sph_jh512_init(&ctx_jh); - sph_jh512(&ctx_jh, hash, 64 ); - sph_jh512_close(&ctx_jh, hash ); - } - } - - memcpy(output, hash, 32); -} - -int scanhash_jha( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(128) hash32[8]; - uint32_t _ALIGN(128) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - uint32_t n = pdata[19] - 1; - int thr_id = mythr->id; // thr_id arg is deprecated - - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - // we need bigendian data... - for (int i=0; i < 19; i++) { - be32enc(&endiandata[i], pdata[i]); - } - - jha_kec_midstate( endiandata ); - -#ifdef DEBUG_ALGO - printf("[%d] Htarg=%X\n", thr_id, Htarg); -#endif - for (int m=0; m < 6; m++) { - if (Htarg <= htmax[m]) { - uint32_t mask = masks[m]; - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - jha_hash(hash32, endiandata); -#ifndef DEBUG_ALGO - if ((!(hash32[7] & mask)) && fulltest(hash32, ptarget)) { - work_set_target_ratio(work, hash32); - *hashes_done = n - first_nonce + 1; - return 1; - } -#else - if (!(n % 0x1000) && !thr_id) printf("."); - if (!(hash32[7] & mask)) { - printf("[%d]",thr_id); - if (fulltest(hash32, ptarget)) { - work_set_target_ratio(work, hash32); - *hashes_done = n - first_nonce + 1; - return 1; - } - } -#endif - } while (n < max_nonce && !work_restart[thr_id].restart); - // see blake.c if else to understand the loop on htmax => mask - break; - } - } - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - diff --git a/algo/jh/sse2/jh.c b/algo/jh/sse2/jh.c deleted file mode 100644 index 41487a5..0000000 --- 
a/algo/jh/sse2/jh.c +++ /dev/null @@ -1,1116 +0,0 @@ -/* $Id: jh.c 255 2011-06-07 19:50:20Z tp $ */ -/* - * JH implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include -#include - -#include "sph_jh.h" - -#ifdef __cplusplus -extern "C"{ -#endif - - -#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_JH -#define SPH_SMALL_FOOTPRINT_JH 1 -#endif - -#if !defined SPH_JH_64 && SPH_64_TRUE -#define SPH_JH_64 1 -#endif - -#if !SPH_64 -#undef SPH_JH_64 -#endif - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -/* - * The internal bitslice representation may use either big-endian or - * little-endian (true bitslice operations do not care about the bit - * ordering, and the bit-swapping linear operations in JH happen to - * be invariant through endianness-swapping). The constants must be - * defined according to the chosen endianness; we use some - * byte-swapping macros for that. - */ - -#if SPH_LITTLE_ENDIAN - -#define C32e(x) ((SPH_C32(x) >> 24) \ - | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ - | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ - | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) -#define dec32e_aligned sph_dec32le_aligned -#define enc32e sph_enc32le - -#if SPH_64 -#define C64e(x) ((SPH_C64(x) >> 56) \ - | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \ - | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \ - | ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \ - | ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \ - | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \ - | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \ - | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000))) -#define dec64e_aligned sph_dec64le_aligned -#define enc64e sph_enc64le -#endif - -#else - -#define C32e(x) SPH_C32(x) -#define dec32e_aligned sph_dec32be_aligned -#define enc32e sph_enc32be -#if SPH_64 -#define C64e(x) SPH_C64(x) -#define dec64e_aligned sph_dec64be_aligned -#define enc64e sph_enc64be -#endif - -#endif - -#define Sb(x0, x1, x2, x3, c) do { \ - x3 = ~x3; \ - x0 ^= (c) & ~x2; \ - tmp = 
(c) ^ (x0 & x1); \ - x0 ^= x2 & x3; \ - x3 ^= ~x1 & x2; \ - x1 ^= x0 & x2; \ - x2 ^= x0 & ~x3; \ - x0 ^= x1 | x3; \ - x3 ^= x1 & x2; \ - x1 ^= tmp & x0; \ - x2 ^= tmp; \ - } while (0) - -#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) do { \ - x4 ^= x1; \ - x5 ^= x2; \ - x6 ^= x3 ^ x0; \ - x7 ^= x0; \ - x0 ^= x5; \ - x1 ^= x6; \ - x2 ^= x7 ^ x4; \ - x3 ^= x4; \ - } while (0) - -#if SPH_JH_64 - -static const sph_u64 C[] = { - C64e(0x72d5dea2df15f867), C64e(0x7b84150ab7231557), - C64e(0x81abd6904d5a87f6), C64e(0x4e9f4fc5c3d12b40), - C64e(0xea983ae05c45fa9c), C64e(0x03c5d29966b2999a), - C64e(0x660296b4f2bb538a), C64e(0xb556141a88dba231), - C64e(0x03a35a5c9a190edb), C64e(0x403fb20a87c14410), - C64e(0x1c051980849e951d), C64e(0x6f33ebad5ee7cddc), - C64e(0x10ba139202bf6b41), C64e(0xdc786515f7bb27d0), - C64e(0x0a2c813937aa7850), C64e(0x3f1abfd2410091d3), - C64e(0x422d5a0df6cc7e90), C64e(0xdd629f9c92c097ce), - C64e(0x185ca70bc72b44ac), C64e(0xd1df65d663c6fc23), - C64e(0x976e6c039ee0b81a), C64e(0x2105457e446ceca8), - C64e(0xeef103bb5d8e61fa), C64e(0xfd9697b294838197), - C64e(0x4a8e8537db03302f), C64e(0x2a678d2dfb9f6a95), - C64e(0x8afe7381f8b8696c), C64e(0x8ac77246c07f4214), - C64e(0xc5f4158fbdc75ec4), C64e(0x75446fa78f11bb80), - C64e(0x52de75b7aee488bc), C64e(0x82b8001e98a6a3f4), - C64e(0x8ef48f33a9a36315), C64e(0xaa5f5624d5b7f989), - C64e(0xb6f1ed207c5ae0fd), C64e(0x36cae95a06422c36), - C64e(0xce2935434efe983d), C64e(0x533af974739a4ba7), - C64e(0xd0f51f596f4e8186), C64e(0x0e9dad81afd85a9f), - C64e(0xa7050667ee34626a), C64e(0x8b0b28be6eb91727), - C64e(0x47740726c680103f), C64e(0xe0a07e6fc67e487b), - C64e(0x0d550aa54af8a4c0), C64e(0x91e3e79f978ef19e), - C64e(0x8676728150608dd4), C64e(0x7e9e5a41f3e5b062), - C64e(0xfc9f1fec4054207a), C64e(0xe3e41a00cef4c984), - C64e(0x4fd794f59dfa95d8), C64e(0x552e7e1124c354a5), - C64e(0x5bdf7228bdfe6e28), C64e(0x78f57fe20fa5c4b2), - C64e(0x05897cefee49d32e), C64e(0x447e9385eb28597f), - C64e(0x705f6937b324314a), C64e(0x5e8628f11dd6e465), - 
C64e(0xc71b770451b920e7), C64e(0x74fe43e823d4878a), - C64e(0x7d29e8a3927694f2), C64e(0xddcb7a099b30d9c1), - C64e(0x1d1b30fb5bdc1be0), C64e(0xda24494ff29c82bf), - C64e(0xa4e7ba31b470bfff), C64e(0x0d324405def8bc48), - C64e(0x3baefc3253bbd339), C64e(0x459fc3c1e0298ba0), - C64e(0xe5c905fdf7ae090f), C64e(0x947034124290f134), - C64e(0xa271b701e344ed95), C64e(0xe93b8e364f2f984a), - C64e(0x88401d63a06cf615), C64e(0x47c1444b8752afff), - C64e(0x7ebb4af1e20ac630), C64e(0x4670b6c5cc6e8ce6), - C64e(0xa4d5a456bd4fca00), C64e(0xda9d844bc83e18ae), - C64e(0x7357ce453064d1ad), C64e(0xe8a6ce68145c2567), - C64e(0xa3da8cf2cb0ee116), C64e(0x33e906589a94999a), - C64e(0x1f60b220c26f847b), C64e(0xd1ceac7fa0d18518), - C64e(0x32595ba18ddd19d3), C64e(0x509a1cc0aaa5b446), - C64e(0x9f3d6367e4046bba), C64e(0xf6ca19ab0b56ee7e), - C64e(0x1fb179eaa9282174), C64e(0xe9bdf7353b3651ee), - C64e(0x1d57ac5a7550d376), C64e(0x3a46c2fea37d7001), - C64e(0xf735c1af98a4d842), C64e(0x78edec209e6b6779), - C64e(0x41836315ea3adba8), C64e(0xfac33b4d32832c83), - C64e(0xa7403b1f1c2747f3), C64e(0x5940f034b72d769a), - C64e(0xe73e4e6cd2214ffd), C64e(0xb8fd8d39dc5759ef), - C64e(0x8d9b0c492b49ebda), C64e(0x5ba2d74968f3700d), - C64e(0x7d3baed07a8d5584), C64e(0xf5a5e9f0e4f88e65), - C64e(0xa0b8a2f436103b53), C64e(0x0ca8079e753eec5a), - C64e(0x9168949256e8884f), C64e(0x5bb05c55f8babc4c), - C64e(0xe3bb3b99f387947b), C64e(0x75daf4d6726b1c5d), - C64e(0x64aeac28dc34b36d), C64e(0x6c34a550b828db71), - C64e(0xf861e2f2108d512a), C64e(0xe3db643359dd75fc), - C64e(0x1cacbcf143ce3fa2), C64e(0x67bbd13c02e843b0), - C64e(0x330a5bca8829a175), C64e(0x7f34194db416535c), - C64e(0x923b94c30e794d1e), C64e(0x797475d7b6eeaf3f), - C64e(0xeaa8d4f7be1a3921), C64e(0x5cf47e094c232751), - C64e(0x26a32453ba323cd2), C64e(0x44a3174a6da6d5ad), - C64e(0xb51d3ea6aff2c908), C64e(0x83593d98916b3c56), - C64e(0x4cf87ca17286604d), C64e(0x46e23ecc086ec7f6), - C64e(0x2f9833b3b1bc765e), C64e(0x2bd666a5efc4e62a), - C64e(0x06f4b6e8bec1d436), C64e(0x74ee8215bcef2163), - 
C64e(0xfdc14e0df453c969), C64e(0xa77d5ac406585826), - C64e(0x7ec1141606e0fa16), C64e(0x7e90af3d28639d3f), - C64e(0xd2c9f2e3009bd20c), C64e(0x5faace30b7d40c30), - C64e(0x742a5116f2e03298), C64e(0x0deb30d8e3cef89a), - C64e(0x4bc59e7bb5f17992), C64e(0xff51e66e048668d3), - C64e(0x9b234d57e6966731), C64e(0xcce6a6f3170a7505), - C64e(0xb17681d913326cce), C64e(0x3c175284f805a262), - C64e(0xf42bcbb378471547), C64e(0xff46548223936a48), - C64e(0x38df58074e5e6565), C64e(0xf2fc7c89fc86508e), - C64e(0x31702e44d00bca86), C64e(0xf04009a23078474e), - C64e(0x65a0ee39d1f73883), C64e(0xf75ee937e42c3abd), - C64e(0x2197b2260113f86f), C64e(0xa344edd1ef9fdee7), - C64e(0x8ba0df15762592d9), C64e(0x3c85f7f612dc42be), - C64e(0xd8a7ec7cab27b07e), C64e(0x538d7ddaaa3ea8de), - C64e(0xaa25ce93bd0269d8), C64e(0x5af643fd1a7308f9), - C64e(0xc05fefda174a19a5), C64e(0x974d66334cfd216a), - C64e(0x35b49831db411570), C64e(0xea1e0fbbedcd549b), - C64e(0x9ad063a151974072), C64e(0xf6759dbf91476fe2) -}; - -#define Ceven_hi(r) (C[((r) << 2) + 0]) -#define Ceven_lo(r) (C[((r) << 2) + 1]) -#define Codd_hi(r) (C[((r) << 2) + 2]) -#define Codd_lo(r) (C[((r) << 2) + 3]) - -#define S(x0, x1, x2, x3, cb, r) do { \ - Sb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \ - Sb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \ - } while (0) - -#define L(x0, x1, x2, x3, x4, x5, x6, x7) do { \ - Lb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \ - x4 ## h, x5 ## h, x6 ## h, x7 ## h); \ - Lb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \ - x4 ## l, x5 ## l, x6 ## l, x7 ## l); \ - } while (0) - -#define Wz(x, c, n) do { \ - sph_u64 t = (x ## h & (c)) << (n); \ - x ## h = ((x ## h >> (n)) & (c)) | t; \ - t = (x ## l & (c)) << (n); \ - x ## l = ((x ## l >> (n)) & (c)) | t; \ - } while (0) - -#define W0(x) Wz(x, SPH_C64(0x5555555555555555), 1) -#define W1(x) Wz(x, SPH_C64(0x3333333333333333), 2) -#define W2(x) Wz(x, SPH_C64(0x0F0F0F0F0F0F0F0F), 4) -#define W3(x) Wz(x, SPH_C64(0x00FF00FF00FF00FF), 8) -#define W4(x) Wz(x, 
SPH_C64(0x0000FFFF0000FFFF), 16) -#define W5(x) Wz(x, SPH_C64(0x00000000FFFFFFFF), 32) -#define W6(x) do { \ - sph_u64 t = x ## h; \ - x ## h = x ## l; \ - x ## l = t; \ - } while (0) - -#define DECL_STATE \ - sph_u64 h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \ - sph_u64 h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \ - sph_u64 tmp; - -#define READ_STATE(state) do { \ - h0h = (state)->H.wide[ 0]; \ - h0l = (state)->H.wide[ 1]; \ - h1h = (state)->H.wide[ 2]; \ - h1l = (state)->H.wide[ 3]; \ - h2h = (state)->H.wide[ 4]; \ - h2l = (state)->H.wide[ 5]; \ - h3h = (state)->H.wide[ 6]; \ - h3l = (state)->H.wide[ 7]; \ - h4h = (state)->H.wide[ 8]; \ - h4l = (state)->H.wide[ 9]; \ - h5h = (state)->H.wide[10]; \ - h5l = (state)->H.wide[11]; \ - h6h = (state)->H.wide[12]; \ - h6l = (state)->H.wide[13]; \ - h7h = (state)->H.wide[14]; \ - h7l = (state)->H.wide[15]; \ - } while (0) - -#define WRITE_STATE(state) do { \ - (state)->H.wide[ 0] = h0h; \ - (state)->H.wide[ 1] = h0l; \ - (state)->H.wide[ 2] = h1h; \ - (state)->H.wide[ 3] = h1l; \ - (state)->H.wide[ 4] = h2h; \ - (state)->H.wide[ 5] = h2l; \ - (state)->H.wide[ 6] = h3h; \ - (state)->H.wide[ 7] = h3l; \ - (state)->H.wide[ 8] = h4h; \ - (state)->H.wide[ 9] = h4l; \ - (state)->H.wide[10] = h5h; \ - (state)->H.wide[11] = h5l; \ - (state)->H.wide[12] = h6h; \ - (state)->H.wide[13] = h6l; \ - (state)->H.wide[14] = h7h; \ - (state)->H.wide[15] = h7l; \ - } while (0) - -#define INPUT_BUF1 \ - sph_u64 m0h = dec64e_aligned(buf + 0); \ - sph_u64 m0l = dec64e_aligned(buf + 8); \ - sph_u64 m1h = dec64e_aligned(buf + 16); \ - sph_u64 m1l = dec64e_aligned(buf + 24); \ - sph_u64 m2h = dec64e_aligned(buf + 32); \ - sph_u64 m2l = dec64e_aligned(buf + 40); \ - sph_u64 m3h = dec64e_aligned(buf + 48); \ - sph_u64 m3l = dec64e_aligned(buf + 56); \ - h0h ^= m0h; \ - h0l ^= m0l; \ - h1h ^= m1h; \ - h1l ^= m1l; \ - h2h ^= m2h; \ - h2l ^= m2l; \ - h3h ^= m3h; \ - h3l ^= m3l; - -#define INPUT_BUF2 \ - h4h ^= m0h; \ - h4l ^= m0l; \ - h5h ^= m1h; \ - h5l 
^= m1l; \ - h6h ^= m2h; \ - h6l ^= m2l; \ - h7h ^= m3h; \ - h7l ^= m3l; - -static const sph_u64 IV224[] = { - C64e(0x2dfedd62f99a98ac), C64e(0xae7cacd619d634e7), - C64e(0xa4831005bc301216), C64e(0xb86038c6c9661494), - C64e(0x66d9899f2580706f), C64e(0xce9ea31b1d9b1adc), - C64e(0x11e8325f7b366e10), C64e(0xf994857f02fa06c1), - C64e(0x1b4f1b5cd8c840b3), C64e(0x97f6a17f6e738099), - C64e(0xdcdf93a5adeaa3d3), C64e(0xa431e8dec9539a68), - C64e(0x22b4a98aec86a1e4), C64e(0xd574ac959ce56cf0), - C64e(0x15960deab5ab2bbf), C64e(0x9611dcf0dd64ea6e) -}; - -static const sph_u64 IV256[] = { - C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1), - C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03), - C64e(0xa4239e267726b945), C64e(0xe0fb1a48d41a9477), - C64e(0xcdb5ab26026b177a), C64e(0x56f024420fff2fa8), - C64e(0x71a396897f2e4d75), C64e(0x1d144908f77de262), - C64e(0x277695f776248f94), C64e(0x87d5b6574780296c), - C64e(0x5c5e272dac8e0d6c), C64e(0x518450c657057a0f), - C64e(0x7be4d367702412ea), C64e(0x89e3ab13d31cd769) -}; - -static const sph_u64 IV384[] = { - C64e(0x481e3bc6d813398a), C64e(0x6d3b5e894ade879b), - C64e(0x63faea68d480ad2e), C64e(0x332ccb21480f8267), - C64e(0x98aec84d9082b928), C64e(0xd455ea3041114249), - C64e(0x36f555b2924847ec), C64e(0xc7250a93baf43ce1), - C64e(0x569b7f8a27db454c), C64e(0x9efcbd496397af0e), - C64e(0x589fc27d26aa80cd), C64e(0x80c08b8c9deb2eda), - C64e(0x8a7981e8f8d5373a), C64e(0xf43967adddd17a71), - C64e(0xa9b4d3bda475d394), C64e(0x976c3fba9842737f) -}; - -static const sph_u64 IV512[] = { - C64e(0x6fd14b963e00aa17), C64e(0x636a2e057a15d543), - C64e(0x8a225e8d0c97ef0b), C64e(0xe9341259f2b3c361), - C64e(0x891da0c1536f801e), C64e(0x2aa9056bea2b6d80), - C64e(0x588eccdb2075baa6), C64e(0xa90f3a76baf83bf7), - C64e(0x0169e60541e34a69), C64e(0x46b58a8e2e6fe65a), - C64e(0x1047a7d0c1843c24), C64e(0x3b6e71b12d5ac199), - C64e(0xcf57f6ec9db1f856), C64e(0xa706887c5716b156), - C64e(0xe3c2fcdfe68517fb), C64e(0x545a4678cc8cdd4b) -}; - -#else - -static const sph_u32 C[] = { - 
C32e(0x72d5dea2), C32e(0xdf15f867), C32e(0x7b84150a), - C32e(0xb7231557), C32e(0x81abd690), C32e(0x4d5a87f6), - C32e(0x4e9f4fc5), C32e(0xc3d12b40), C32e(0xea983ae0), - C32e(0x5c45fa9c), C32e(0x03c5d299), C32e(0x66b2999a), - C32e(0x660296b4), C32e(0xf2bb538a), C32e(0xb556141a), - C32e(0x88dba231), C32e(0x03a35a5c), C32e(0x9a190edb), - C32e(0x403fb20a), C32e(0x87c14410), C32e(0x1c051980), - C32e(0x849e951d), C32e(0x6f33ebad), C32e(0x5ee7cddc), - C32e(0x10ba1392), C32e(0x02bf6b41), C32e(0xdc786515), - C32e(0xf7bb27d0), C32e(0x0a2c8139), C32e(0x37aa7850), - C32e(0x3f1abfd2), C32e(0x410091d3), C32e(0x422d5a0d), - C32e(0xf6cc7e90), C32e(0xdd629f9c), C32e(0x92c097ce), - C32e(0x185ca70b), C32e(0xc72b44ac), C32e(0xd1df65d6), - C32e(0x63c6fc23), C32e(0x976e6c03), C32e(0x9ee0b81a), - C32e(0x2105457e), C32e(0x446ceca8), C32e(0xeef103bb), - C32e(0x5d8e61fa), C32e(0xfd9697b2), C32e(0x94838197), - C32e(0x4a8e8537), C32e(0xdb03302f), C32e(0x2a678d2d), - C32e(0xfb9f6a95), C32e(0x8afe7381), C32e(0xf8b8696c), - C32e(0x8ac77246), C32e(0xc07f4214), C32e(0xc5f4158f), - C32e(0xbdc75ec4), C32e(0x75446fa7), C32e(0x8f11bb80), - C32e(0x52de75b7), C32e(0xaee488bc), C32e(0x82b8001e), - C32e(0x98a6a3f4), C32e(0x8ef48f33), C32e(0xa9a36315), - C32e(0xaa5f5624), C32e(0xd5b7f989), C32e(0xb6f1ed20), - C32e(0x7c5ae0fd), C32e(0x36cae95a), C32e(0x06422c36), - C32e(0xce293543), C32e(0x4efe983d), C32e(0x533af974), - C32e(0x739a4ba7), C32e(0xd0f51f59), C32e(0x6f4e8186), - C32e(0x0e9dad81), C32e(0xafd85a9f), C32e(0xa7050667), - C32e(0xee34626a), C32e(0x8b0b28be), C32e(0x6eb91727), - C32e(0x47740726), C32e(0xc680103f), C32e(0xe0a07e6f), - C32e(0xc67e487b), C32e(0x0d550aa5), C32e(0x4af8a4c0), - C32e(0x91e3e79f), C32e(0x978ef19e), C32e(0x86767281), - C32e(0x50608dd4), C32e(0x7e9e5a41), C32e(0xf3e5b062), - C32e(0xfc9f1fec), C32e(0x4054207a), C32e(0xe3e41a00), - C32e(0xcef4c984), C32e(0x4fd794f5), C32e(0x9dfa95d8), - C32e(0x552e7e11), C32e(0x24c354a5), C32e(0x5bdf7228), - C32e(0xbdfe6e28), C32e(0x78f57fe2), 
C32e(0x0fa5c4b2), - C32e(0x05897cef), C32e(0xee49d32e), C32e(0x447e9385), - C32e(0xeb28597f), C32e(0x705f6937), C32e(0xb324314a), - C32e(0x5e8628f1), C32e(0x1dd6e465), C32e(0xc71b7704), - C32e(0x51b920e7), C32e(0x74fe43e8), C32e(0x23d4878a), - C32e(0x7d29e8a3), C32e(0x927694f2), C32e(0xddcb7a09), - C32e(0x9b30d9c1), C32e(0x1d1b30fb), C32e(0x5bdc1be0), - C32e(0xda24494f), C32e(0xf29c82bf), C32e(0xa4e7ba31), - C32e(0xb470bfff), C32e(0x0d324405), C32e(0xdef8bc48), - C32e(0x3baefc32), C32e(0x53bbd339), C32e(0x459fc3c1), - C32e(0xe0298ba0), C32e(0xe5c905fd), C32e(0xf7ae090f), - C32e(0x94703412), C32e(0x4290f134), C32e(0xa271b701), - C32e(0xe344ed95), C32e(0xe93b8e36), C32e(0x4f2f984a), - C32e(0x88401d63), C32e(0xa06cf615), C32e(0x47c1444b), - C32e(0x8752afff), C32e(0x7ebb4af1), C32e(0xe20ac630), - C32e(0x4670b6c5), C32e(0xcc6e8ce6), C32e(0xa4d5a456), - C32e(0xbd4fca00), C32e(0xda9d844b), C32e(0xc83e18ae), - C32e(0x7357ce45), C32e(0x3064d1ad), C32e(0xe8a6ce68), - C32e(0x145c2567), C32e(0xa3da8cf2), C32e(0xcb0ee116), - C32e(0x33e90658), C32e(0x9a94999a), C32e(0x1f60b220), - C32e(0xc26f847b), C32e(0xd1ceac7f), C32e(0xa0d18518), - C32e(0x32595ba1), C32e(0x8ddd19d3), C32e(0x509a1cc0), - C32e(0xaaa5b446), C32e(0x9f3d6367), C32e(0xe4046bba), - C32e(0xf6ca19ab), C32e(0x0b56ee7e), C32e(0x1fb179ea), - C32e(0xa9282174), C32e(0xe9bdf735), C32e(0x3b3651ee), - C32e(0x1d57ac5a), C32e(0x7550d376), C32e(0x3a46c2fe), - C32e(0xa37d7001), C32e(0xf735c1af), C32e(0x98a4d842), - C32e(0x78edec20), C32e(0x9e6b6779), C32e(0x41836315), - C32e(0xea3adba8), C32e(0xfac33b4d), C32e(0x32832c83), - C32e(0xa7403b1f), C32e(0x1c2747f3), C32e(0x5940f034), - C32e(0xb72d769a), C32e(0xe73e4e6c), C32e(0xd2214ffd), - C32e(0xb8fd8d39), C32e(0xdc5759ef), C32e(0x8d9b0c49), - C32e(0x2b49ebda), C32e(0x5ba2d749), C32e(0x68f3700d), - C32e(0x7d3baed0), C32e(0x7a8d5584), C32e(0xf5a5e9f0), - C32e(0xe4f88e65), C32e(0xa0b8a2f4), C32e(0x36103b53), - C32e(0x0ca8079e), C32e(0x753eec5a), C32e(0x91689492), - C32e(0x56e8884f), 
C32e(0x5bb05c55), C32e(0xf8babc4c), - C32e(0xe3bb3b99), C32e(0xf387947b), C32e(0x75daf4d6), - C32e(0x726b1c5d), C32e(0x64aeac28), C32e(0xdc34b36d), - C32e(0x6c34a550), C32e(0xb828db71), C32e(0xf861e2f2), - C32e(0x108d512a), C32e(0xe3db6433), C32e(0x59dd75fc), - C32e(0x1cacbcf1), C32e(0x43ce3fa2), C32e(0x67bbd13c), - C32e(0x02e843b0), C32e(0x330a5bca), C32e(0x8829a175), - C32e(0x7f34194d), C32e(0xb416535c), C32e(0x923b94c3), - C32e(0x0e794d1e), C32e(0x797475d7), C32e(0xb6eeaf3f), - C32e(0xeaa8d4f7), C32e(0xbe1a3921), C32e(0x5cf47e09), - C32e(0x4c232751), C32e(0x26a32453), C32e(0xba323cd2), - C32e(0x44a3174a), C32e(0x6da6d5ad), C32e(0xb51d3ea6), - C32e(0xaff2c908), C32e(0x83593d98), C32e(0x916b3c56), - C32e(0x4cf87ca1), C32e(0x7286604d), C32e(0x46e23ecc), - C32e(0x086ec7f6), C32e(0x2f9833b3), C32e(0xb1bc765e), - C32e(0x2bd666a5), C32e(0xefc4e62a), C32e(0x06f4b6e8), - C32e(0xbec1d436), C32e(0x74ee8215), C32e(0xbcef2163), - C32e(0xfdc14e0d), C32e(0xf453c969), C32e(0xa77d5ac4), - C32e(0x06585826), C32e(0x7ec11416), C32e(0x06e0fa16), - C32e(0x7e90af3d), C32e(0x28639d3f), C32e(0xd2c9f2e3), - C32e(0x009bd20c), C32e(0x5faace30), C32e(0xb7d40c30), - C32e(0x742a5116), C32e(0xf2e03298), C32e(0x0deb30d8), - C32e(0xe3cef89a), C32e(0x4bc59e7b), C32e(0xb5f17992), - C32e(0xff51e66e), C32e(0x048668d3), C32e(0x9b234d57), - C32e(0xe6966731), C32e(0xcce6a6f3), C32e(0x170a7505), - C32e(0xb17681d9), C32e(0x13326cce), C32e(0x3c175284), - C32e(0xf805a262), C32e(0xf42bcbb3), C32e(0x78471547), - C32e(0xff465482), C32e(0x23936a48), C32e(0x38df5807), - C32e(0x4e5e6565), C32e(0xf2fc7c89), C32e(0xfc86508e), - C32e(0x31702e44), C32e(0xd00bca86), C32e(0xf04009a2), - C32e(0x3078474e), C32e(0x65a0ee39), C32e(0xd1f73883), - C32e(0xf75ee937), C32e(0xe42c3abd), C32e(0x2197b226), - C32e(0x0113f86f), C32e(0xa344edd1), C32e(0xef9fdee7), - C32e(0x8ba0df15), C32e(0x762592d9), C32e(0x3c85f7f6), - C32e(0x12dc42be), C32e(0xd8a7ec7c), C32e(0xab27b07e), - C32e(0x538d7dda), C32e(0xaa3ea8de), C32e(0xaa25ce93), - 
C32e(0xbd0269d8), C32e(0x5af643fd), C32e(0x1a7308f9), - C32e(0xc05fefda), C32e(0x174a19a5), C32e(0x974d6633), - C32e(0x4cfd216a), C32e(0x35b49831), C32e(0xdb411570), - C32e(0xea1e0fbb), C32e(0xedcd549b), C32e(0x9ad063a1), - C32e(0x51974072), C32e(0xf6759dbf), C32e(0x91476fe2) -}; - -#define Ceven_w3(r) (C[((r) << 3) + 0]) -#define Ceven_w2(r) (C[((r) << 3) + 1]) -#define Ceven_w1(r) (C[((r) << 3) + 2]) -#define Ceven_w0(r) (C[((r) << 3) + 3]) -#define Codd_w3(r) (C[((r) << 3) + 4]) -#define Codd_w2(r) (C[((r) << 3) + 5]) -#define Codd_w1(r) (C[((r) << 3) + 6]) -#define Codd_w0(r) (C[((r) << 3) + 7]) - -#define S(x0, x1, x2, x3, cb, r) do { \ - Sb(x0 ## 3, x1 ## 3, x2 ## 3, x3 ## 3, cb ## w3(r)); \ - Sb(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, cb ## w2(r)); \ - Sb(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, cb ## w1(r)); \ - Sb(x0 ## 0, x1 ## 0, x2 ## 0, x3 ## 0, cb ## w0(r)); \ - } while (0) - -#define L(x0, x1, x2, x3, x4, x5, x6, x7) do { \ - Lb(x0 ## 3, x1 ## 3, x2 ## 3, x3 ## 3, \ - x4 ## 3, x5 ## 3, x6 ## 3, x7 ## 3); \ - Lb(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, \ - x4 ## 2, x5 ## 2, x6 ## 2, x7 ## 2); \ - Lb(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, \ - x4 ## 1, x5 ## 1, x6 ## 1, x7 ## 1); \ - Lb(x0 ## 0, x1 ## 0, x2 ## 0, x3 ## 0, \ - x4 ## 0, x5 ## 0, x6 ## 0, x7 ## 0); \ - } while (0) - -#define Wz(x, c, n) do { \ - sph_u32 t = (x ## 3 & (c)) << (n); \ - x ## 3 = ((x ## 3 >> (n)) & (c)) | t; \ - t = (x ## 2 & (c)) << (n); \ - x ## 2 = ((x ## 2 >> (n)) & (c)) | t; \ - t = (x ## 1 & (c)) << (n); \ - x ## 1 = ((x ## 1 >> (n)) & (c)) | t; \ - t = (x ## 0 & (c)) << (n); \ - x ## 0 = ((x ## 0 >> (n)) & (c)) | t; \ - } while (0) - -#define W0(x) Wz(x, SPH_C32(0x55555555), 1) -#define W1(x) Wz(x, SPH_C32(0x33333333), 2) -#define W2(x) Wz(x, SPH_C32(0x0F0F0F0F), 4) -#define W3(x) Wz(x, SPH_C32(0x00FF00FF), 8) -#define W4(x) Wz(x, SPH_C32(0x0000FFFF), 16) -#define W5(x) do { \ - sph_u32 t = x ## 3; \ - x ## 3 = x ## 2; \ - x ## 2 = t; \ - t = x ## 1; \ - x ## 1 = x ## 0; \ - x ## 0 = t; 
\ - } while (0) -#define W6(x) do { \ - sph_u32 t = x ## 3; \ - x ## 3 = x ## 1; \ - x ## 1 = t; \ - t = x ## 2; \ - x ## 2 = x ## 0; \ - x ## 0 = t; \ - } while (0) - -#define DECL_STATE \ - sph_u32 h03, h02, h01, h00, h13, h12, h11, h10; \ - sph_u32 h23, h22, h21, h20, h33, h32, h31, h30; \ - sph_u32 h43, h42, h41, h40, h53, h52, h51, h50; \ - sph_u32 h63, h62, h61, h60, h73, h72, h71, h70; \ - sph_u32 tmp; - -#define READ_STATE(state) do { \ - h03 = (state)->H.narrow[ 0]; \ - h02 = (state)->H.narrow[ 1]; \ - h01 = (state)->H.narrow[ 2]; \ - h00 = (state)->H.narrow[ 3]; \ - h13 = (state)->H.narrow[ 4]; \ - h12 = (state)->H.narrow[ 5]; \ - h11 = (state)->H.narrow[ 6]; \ - h10 = (state)->H.narrow[ 7]; \ - h23 = (state)->H.narrow[ 8]; \ - h22 = (state)->H.narrow[ 9]; \ - h21 = (state)->H.narrow[10]; \ - h20 = (state)->H.narrow[11]; \ - h33 = (state)->H.narrow[12]; \ - h32 = (state)->H.narrow[13]; \ - h31 = (state)->H.narrow[14]; \ - h30 = (state)->H.narrow[15]; \ - h43 = (state)->H.narrow[16]; \ - h42 = (state)->H.narrow[17]; \ - h41 = (state)->H.narrow[18]; \ - h40 = (state)->H.narrow[19]; \ - h53 = (state)->H.narrow[20]; \ - h52 = (state)->H.narrow[21]; \ - h51 = (state)->H.narrow[22]; \ - h50 = (state)->H.narrow[23]; \ - h63 = (state)->H.narrow[24]; \ - h62 = (state)->H.narrow[25]; \ - h61 = (state)->H.narrow[26]; \ - h60 = (state)->H.narrow[27]; \ - h73 = (state)->H.narrow[28]; \ - h72 = (state)->H.narrow[29]; \ - h71 = (state)->H.narrow[30]; \ - h70 = (state)->H.narrow[31]; \ - } while (0) - -#define WRITE_STATE(state) do { \ - (state)->H.narrow[ 0] = h03; \ - (state)->H.narrow[ 1] = h02; \ - (state)->H.narrow[ 2] = h01; \ - (state)->H.narrow[ 3] = h00; \ - (state)->H.narrow[ 4] = h13; \ - (state)->H.narrow[ 5] = h12; \ - (state)->H.narrow[ 6] = h11; \ - (state)->H.narrow[ 7] = h10; \ - (state)->H.narrow[ 8] = h23; \ - (state)->H.narrow[ 9] = h22; \ - (state)->H.narrow[10] = h21; \ - (state)->H.narrow[11] = h20; \ - (state)->H.narrow[12] = h33; \ - 
(state)->H.narrow[13] = h32; \ - (state)->H.narrow[14] = h31; \ - (state)->H.narrow[15] = h30; \ - (state)->H.narrow[16] = h43; \ - (state)->H.narrow[17] = h42; \ - (state)->H.narrow[18] = h41; \ - (state)->H.narrow[19] = h40; \ - (state)->H.narrow[20] = h53; \ - (state)->H.narrow[21] = h52; \ - (state)->H.narrow[22] = h51; \ - (state)->H.narrow[23] = h50; \ - (state)->H.narrow[24] = h63; \ - (state)->H.narrow[25] = h62; \ - (state)->H.narrow[26] = h61; \ - (state)->H.narrow[27] = h60; \ - (state)->H.narrow[28] = h73; \ - (state)->H.narrow[29] = h72; \ - (state)->H.narrow[30] = h71; \ - (state)->H.narrow[31] = h70; \ - } while (0) - -#define INPUT_BUF1 \ - sph_u32 m03 = dec32e_aligned(buf + 0); \ - sph_u32 m02 = dec32e_aligned(buf + 4); \ - sph_u32 m01 = dec32e_aligned(buf + 8); \ - sph_u32 m00 = dec32e_aligned(buf + 12); \ - sph_u32 m13 = dec32e_aligned(buf + 16); \ - sph_u32 m12 = dec32e_aligned(buf + 20); \ - sph_u32 m11 = dec32e_aligned(buf + 24); \ - sph_u32 m10 = dec32e_aligned(buf + 28); \ - sph_u32 m23 = dec32e_aligned(buf + 32); \ - sph_u32 m22 = dec32e_aligned(buf + 36); \ - sph_u32 m21 = dec32e_aligned(buf + 40); \ - sph_u32 m20 = dec32e_aligned(buf + 44); \ - sph_u32 m33 = dec32e_aligned(buf + 48); \ - sph_u32 m32 = dec32e_aligned(buf + 52); \ - sph_u32 m31 = dec32e_aligned(buf + 56); \ - sph_u32 m30 = dec32e_aligned(buf + 60); \ - h03 ^= m03; \ - h02 ^= m02; \ - h01 ^= m01; \ - h00 ^= m00; \ - h13 ^= m13; \ - h12 ^= m12; \ - h11 ^= m11; \ - h10 ^= m10; \ - h23 ^= m23; \ - h22 ^= m22; \ - h21 ^= m21; \ - h20 ^= m20; \ - h33 ^= m33; \ - h32 ^= m32; \ - h31 ^= m31; \ - h30 ^= m30; - -#define INPUT_BUF2 \ - h43 ^= m03; \ - h42 ^= m02; \ - h41 ^= m01; \ - h40 ^= m00; \ - h53 ^= m13; \ - h52 ^= m12; \ - h51 ^= m11; \ - h50 ^= m10; \ - h63 ^= m23; \ - h62 ^= m22; \ - h61 ^= m21; \ - h60 ^= m20; \ - h73 ^= m33; \ - h72 ^= m32; \ - h71 ^= m31; \ - h70 ^= m30; - -static const sph_u32 IV224[] = { - C32e(0x2dfedd62), C32e(0xf99a98ac), C32e(0xae7cacd6), 
C32e(0x19d634e7), - C32e(0xa4831005), C32e(0xbc301216), C32e(0xb86038c6), C32e(0xc9661494), - C32e(0x66d9899f), C32e(0x2580706f), C32e(0xce9ea31b), C32e(0x1d9b1adc), - C32e(0x11e8325f), C32e(0x7b366e10), C32e(0xf994857f), C32e(0x02fa06c1), - C32e(0x1b4f1b5c), C32e(0xd8c840b3), C32e(0x97f6a17f), C32e(0x6e738099), - C32e(0xdcdf93a5), C32e(0xadeaa3d3), C32e(0xa431e8de), C32e(0xc9539a68), - C32e(0x22b4a98a), C32e(0xec86a1e4), C32e(0xd574ac95), C32e(0x9ce56cf0), - C32e(0x15960dea), C32e(0xb5ab2bbf), C32e(0x9611dcf0), C32e(0xdd64ea6e) -}; - -static const sph_u32 IV256[] = { - C32e(0xeb98a341), C32e(0x2c20d3eb), C32e(0x92cdbe7b), C32e(0x9cb245c1), - C32e(0x1c935191), C32e(0x60d4c7fa), C32e(0x260082d6), C32e(0x7e508a03), - C32e(0xa4239e26), C32e(0x7726b945), C32e(0xe0fb1a48), C32e(0xd41a9477), - C32e(0xcdb5ab26), C32e(0x026b177a), C32e(0x56f02442), C32e(0x0fff2fa8), - C32e(0x71a39689), C32e(0x7f2e4d75), C32e(0x1d144908), C32e(0xf77de262), - C32e(0x277695f7), C32e(0x76248f94), C32e(0x87d5b657), C32e(0x4780296c), - C32e(0x5c5e272d), C32e(0xac8e0d6c), C32e(0x518450c6), C32e(0x57057a0f), - C32e(0x7be4d367), C32e(0x702412ea), C32e(0x89e3ab13), C32e(0xd31cd769) -}; - -static const sph_u32 IV384[] = { - C32e(0x481e3bc6), C32e(0xd813398a), C32e(0x6d3b5e89), C32e(0x4ade879b), - C32e(0x63faea68), C32e(0xd480ad2e), C32e(0x332ccb21), C32e(0x480f8267), - C32e(0x98aec84d), C32e(0x9082b928), C32e(0xd455ea30), C32e(0x41114249), - C32e(0x36f555b2), C32e(0x924847ec), C32e(0xc7250a93), C32e(0xbaf43ce1), - C32e(0x569b7f8a), C32e(0x27db454c), C32e(0x9efcbd49), C32e(0x6397af0e), - C32e(0x589fc27d), C32e(0x26aa80cd), C32e(0x80c08b8c), C32e(0x9deb2eda), - C32e(0x8a7981e8), C32e(0xf8d5373a), C32e(0xf43967ad), C32e(0xddd17a71), - C32e(0xa9b4d3bd), C32e(0xa475d394), C32e(0x976c3fba), C32e(0x9842737f) -}; - -static const sph_u32 IV512[] = { - C32e(0x6fd14b96), C32e(0x3e00aa17), C32e(0x636a2e05), C32e(0x7a15d543), - C32e(0x8a225e8d), C32e(0x0c97ef0b), C32e(0xe9341259), C32e(0xf2b3c361), - 
C32e(0x891da0c1), C32e(0x536f801e), C32e(0x2aa9056b), C32e(0xea2b6d80), - C32e(0x588eccdb), C32e(0x2075baa6), C32e(0xa90f3a76), C32e(0xbaf83bf7), - C32e(0x0169e605), C32e(0x41e34a69), C32e(0x46b58a8e), C32e(0x2e6fe65a), - C32e(0x1047a7d0), C32e(0xc1843c24), C32e(0x3b6e71b1), C32e(0x2d5ac199), - C32e(0xcf57f6ec), C32e(0x9db1f856), C32e(0xa706887c), C32e(0x5716b156), - C32e(0xe3c2fcdf), C32e(0xe68517fb), C32e(0x545a4678), C32e(0xcc8cdd4b) -}; - -#endif - -#define SL(ro) SLu(r + ro, ro) - -#define SLu(r, ro) do { \ - S(h0, h2, h4, h6, Ceven_, r); \ - S(h1, h3, h5, h7, Codd_, r); \ - L(h0, h2, h4, h6, h1, h3, h5, h7); \ - W ## ro(h1); \ - W ## ro(h3); \ - W ## ro(h5); \ - W ## ro(h7); \ - } while (0) - -#if SPH_SMALL_FOOTPRINT_JH - -#if SPH_JH_64 - -/* - * The "small footprint" 64-bit version just uses a partially unrolled - * loop. - */ - -#define E8 do { \ - unsigned r; \ - for (r = 0; r < 42; r += 7) { \ - SL(0); \ - SL(1); \ - SL(2); \ - SL(3); \ - SL(4); \ - SL(5); \ - SL(6); \ - } \ - } while (0) - -#else - -#define E8 do { \ - unsigned r, g; \ - for (r = g = 0; r < 42; r ++) { \ - S(h0, h2, h4, h6, Ceven_, r); \ - S(h1, h3, h5, h7, Codd_, r); \ - L(h0, h2, h4, h6, h1, h3, h5, h7); \ - switch (g) { \ - case 0: \ - W0(h1); \ - W0(h3); \ - W0(h5); \ - W0(h7); \ - break; \ - case 1: \ - W1(h1); \ - W1(h3); \ - W1(h5); \ - W1(h7); \ - break; \ - case 2: \ - W2(h1); \ - W2(h3); \ - W2(h5); \ - W2(h7); \ - break; \ - case 3: \ - W3(h1); \ - W3(h3); \ - W3(h5); \ - W3(h7); \ - break; \ - case 4: \ - W4(h1); \ - W4(h3); \ - W4(h5); \ - W4(h7); \ - break; \ - case 5: \ - W5(h1); \ - W5(h3); \ - W5(h5); \ - W5(h7); \ - break; \ - case 6: \ - W6(h1); \ - W6(h3); \ - W6(h5); \ - W6(h7); \ - break; \ - } \ - if (++ g == 7) \ - g = 0; \ - } \ - } while (0) - -#endif - -#else - -#if SPH_JH_64 - -/* - * On a "true 64-bit" architecture, we can unroll at will. 
- */ - -#define E8 do { \ - SLu( 0, 0); \ - SLu( 1, 1); \ - SLu( 2, 2); \ - SLu( 3, 3); \ - SLu( 4, 4); \ - SLu( 5, 5); \ - SLu( 6, 6); \ - SLu( 7, 0); \ - SLu( 8, 1); \ - SLu( 9, 2); \ - SLu(10, 3); \ - SLu(11, 4); \ - SLu(12, 5); \ - SLu(13, 6); \ - SLu(14, 0); \ - SLu(15, 1); \ - SLu(16, 2); \ - SLu(17, 3); \ - SLu(18, 4); \ - SLu(19, 5); \ - SLu(20, 6); \ - SLu(21, 0); \ - SLu(22, 1); \ - SLu(23, 2); \ - SLu(24, 3); \ - SLu(25, 4); \ - SLu(26, 5); \ - SLu(27, 6); \ - SLu(28, 0); \ - SLu(29, 1); \ - SLu(30, 2); \ - SLu(31, 3); \ - SLu(32, 4); \ - SLu(33, 5); \ - SLu(34, 6); \ - SLu(35, 0); \ - SLu(36, 1); \ - SLu(37, 2); \ - SLu(38, 3); \ - SLu(39, 4); \ - SLu(40, 5); \ - SLu(41, 6); \ - } while (0) - -#else - -/* - * We are not aiming at a small footprint, but we are still using a - * 32-bit implementation. Full loop unrolling would smash the L1 - * cache on some "big" architectures (32 kB L1 cache). - */ - -#define E8 do { \ - unsigned r; \ - for (r = 0; r < 42; r += 7) { \ - SL(0); \ - SL(1); \ - SL(2); \ - SL(3); \ - SL(4); \ - SL(5); \ - SL(6); \ - } \ - } while (0) - -#endif - -#endif - -static void -jh_init(sph_jh_context *sc, const void *iv) -{ - sc->ptr = 0; -#if SPH_JH_64 - memcpy(sc->H.wide, iv, sizeof sc->H.wide); -#else - memcpy(sc->H.narrow, iv, sizeof sc->H.narrow); -#endif -#if SPH_64 - sc->block_count = 0; -#else - sc->block_count_high = 0; - sc->block_count_low = 0; -#endif -} - -static void -jh_core(sph_jh_context *sc, const void *data, size_t len) -{ - unsigned char *buf; - size_t ptr; - DECL_STATE - - buf = sc->buf; - ptr = sc->ptr; - if (len < (sizeof sc->buf) - ptr) { - memcpy(buf + ptr, data, len); - ptr += len; - sc->ptr = ptr; - return; - } - - READ_STATE(sc); - while (len > 0) { - size_t clen; - - clen = (sizeof sc->buf) - ptr; - if (clen > len) - clen = len; - memcpy(buf + ptr, data, clen); - ptr += clen; - data = (const unsigned char *)data + clen; - len -= clen; - if (ptr == sizeof sc->buf) { - INPUT_BUF1; - E8; - INPUT_BUF2; -#if 
SPH_64 - sc->block_count ++; -#else - if ((sc->block_count_low = SPH_T32( - sc->block_count_low + 1)) == 0) - sc->block_count_high ++; -#endif - ptr = 0; - } - } - WRITE_STATE(sc); - sc->ptr = ptr; -} - -static void -jh_close(sph_jh_context *sc, unsigned ub, unsigned n, - void *dst, size_t out_size_w32, const void *iv) -{ - unsigned z; - unsigned char buf[128]; - size_t numz, u; -#if SPH_64 - sph_u64 l0, l1; -#else - sph_u32 l0, l1, l2, l3; -#endif - - z = 0x80 >> n; - buf[0] = ((ub & -z) | z) & 0xFF; - if (sc->ptr == 0 && n == 0) { - numz = 47; - } else { - numz = 111 - sc->ptr; - } - memset(buf + 1, 0, numz); -#if SPH_64 - l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3) + n; - l1 = SPH_T64(sc->block_count >> 55); - sph_enc64be(buf + numz + 1, l1); - sph_enc64be(buf + numz + 9, l0); -#else - l0 = SPH_T32(sc->block_count_low << 9) + (sc->ptr << 3) + n; - l1 = SPH_T32(sc->block_count_low >> 23) - + SPH_T32(sc->block_count_high << 9); - l2 = SPH_T32(sc->block_count_high >> 23); - l3 = 0; - sph_enc32be(buf + numz + 1, l3); - sph_enc32be(buf + numz + 5, l2); - sph_enc32be(buf + numz + 9, l1); - sph_enc32be(buf + numz + 13, l0); -#endif - jh_core(sc, buf, numz + 17); -#if SPH_JH_64 - for (u = 0; u < 8; u ++) - enc64e(buf + (u << 3), sc->H.wide[u + 8]); -#else - for (u = 0; u < 16; u ++) - enc32e(buf + (u << 2), sc->H.narrow[u + 16]); -#endif - memcpy(dst, buf + ((16 - out_size_w32) << 2), out_size_w32 << 2); - jh_init(sc, iv); -} - -/* see sph_jh.h */ -void -sph_jh224_init(void *cc) -{ - jh_init(cc, IV224); -} - -/* see sph_jh.h */ -void -sph_jh224(void *cc, const void *data, size_t len) -{ - jh_core(cc, data, len); -} - -/* see sph_jh.h */ -void -sph_jh224_close(void *cc, void *dst) -{ - jh_close(cc, 0, 0, dst, 7, IV224); -} - -/* see sph_jh.h */ -void -sph_jh224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - jh_close(cc, ub, n, dst, 7, IV224); -} - -/* see sph_jh.h */ -void -sph_jh256_init(void *cc) -{ - jh_init(cc, IV256); -} - -/* see 
sph_jh.h */ -void -sph_jh256(void *cc, const void *data, size_t len) -{ - jh_core(cc, data, len); -} - -/* see sph_jh.h */ -void -sph_jh256_close(void *cc, void *dst) -{ - jh_close(cc, 0, 0, dst, 8, IV256); -} - -/* see sph_jh.h */ -void -sph_jh256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - jh_close(cc, ub, n, dst, 8, IV256); -} - -/* see sph_jh.h */ -void -sph_jh384_init(void *cc) -{ - jh_init(cc, IV384); -} - -/* see sph_jh.h */ -void -sph_jh384(void *cc, const void *data, size_t len) -{ - jh_core(cc, data, len); -} - -/* see sph_jh.h */ -void -sph_jh384_close(void *cc, void *dst) -{ - jh_close(cc, 0, 0, dst, 12, IV384); -} - -/* see sph_jh.h */ -void -sph_jh384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - jh_close(cc, ub, n, dst, 12, IV384); -} - -/* see sph_jh.h */ -void -sph_jh512_init(void *cc) -{ - jh_init(cc, IV512); -} - -/* see sph_jh.h */ -void -sph_jh512(void *cc, const void *data, size_t len) -{ - jh_core(cc, data, len); -} - -/* see sph_jh.h */ -void -sph_jh512_close(void *cc, void *dst) -{ - jh_close(cc, 0, 0, dst, 16, IV512); -} - -/* see sph_jh.h */ -void -sph_jh512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - jh_close(cc, ub, n, dst, 16, IV512); -} - -#ifdef __cplusplus -} -#endif diff --git a/algo/jh/sse2/jh_sse2_opt32.h b/algo/jh/sse2/jh_sse2_opt32.h deleted file mode 100644 index ecbd229..0000000 --- a/algo/jh/sse2/jh_sse2_opt32.h +++ /dev/null @@ -1,465 +0,0 @@ -/* This program gives the optimized SSE2 bitslice implementation of JH for 32-bit platform (with 8 128-bit XMM registers). 
- - ----------------------------------------- - Performance: - - Microprocessor: Intel CORE 2 processor (Core 2 Duo Mobile T6600 2.2GHz) - Operating System: 32-bit Ubuntu 10.04 (Linux kernel 2.6.32-22-generic) - Speed for long message: - 1) 23.6 cycles/byte compiler: Intel C++ Compiler 11.1 compilation option: icc -O2 - 2) 24.1 cycles/byte compiler: gcc 4.4.3 compilation option: gcc -msse2 -O3 - - ------------------------------------------ - Comparing with the original JH sse2 code for 32-bit platform, the following modifications are made: - a) The Sbox implementation follows exactly the description given in the document - b) Data alignment definition is improved so that the code can be compiled by GCC, Intel C++ compiler and Microsoft Visual C compiler - c) Using y0,y1,..,y7 variables in Function F8 for performance improvement (local variable in function F8 so that compiler can optimize the code easily) - d) Removed a number of intermediate variables from the program (so as to given compiler more freedom to optimize the code) - e) Using "for" loop to implement 42 rounds (with 7 rounds in each loop), so as to reduce the code size. 
- ------------------------------------------ - - Last Modified: January 16, 2011 -*/ - - - -#include -#include - -typedef unsigned int uint32; -typedef __m128i word128; /*word128 defines a 128-bit SSE2 word*/ - -typedef unsigned char BitSequence; -typedef unsigned long long DataLength; -typedef enum {SUCCESS = 0, FAIL = 1, BAD_HASHLEN = 2} HashReturn; - -/*define data alignment for different C compilers*/ -#if defined(__GNUC__) - #define DATA_ALIGN16(x) x __attribute__ ((aligned(16))) -#else - #define DATA_ALIGN16(x) __declspec(align(16)) x -#endif - -typedef struct { - int hashbitlen; /*the message digest size*/ - unsigned long long databitlen; /*the message size in bits*/ - unsigned long long datasize_in_buffer; /*the size of the message remained in buffer; assumed to be multiple of 8bits except for the last partial block at the end of the message*/ - word128 x0,x1,x2,x3,x4,x5,x6,x7; /*1024-bit state;*/ - unsigned char buffer[64]; /*512-bit message block;*/ -} hashState; - -/*The initial hash value H(0)*/ -DATA_ALIGN16(const unsigned char JH224_H0[128])={0x2d,0xfe,0xdd,0x62,0xf9,0x9a,0x98,0xac,0xae,0x7c,0xac,0xd6,0x19,0xd6,0x34,0xe7,0xa4,0x83,0x10,0x5,0xbc,0x30,0x12,0x16,0xb8,0x60,0x38,0xc6,0xc9,0x66,0x14,0x94,0x66,0xd9,0x89,0x9f,0x25,0x80,0x70,0x6f,0xce,0x9e,0xa3,0x1b,0x1d,0x9b,0x1a,0xdc,0x11,0xe8,0x32,0x5f,0x7b,0x36,0x6e,0x10,0xf9,0x94,0x85,0x7f,0x2,0xfa,0x6,0xc1,0x1b,0x4f,0x1b,0x5c,0xd8,0xc8,0x40,0xb3,0x97,0xf6,0xa1,0x7f,0x6e,0x73,0x80,0x99,0xdc,0xdf,0x93,0xa5,0xad,0xea,0xa3,0xd3,0xa4,0x31,0xe8,0xde,0xc9,0x53,0x9a,0x68,0x22,0xb4,0xa9,0x8a,0xec,0x86,0xa1,0xe4,0xd5,0x74,0xac,0x95,0x9c,0xe5,0x6c,0xf0,0x15,0x96,0xd,0xea,0xb5,0xab,0x2b,0xbf,0x96,0x11,0xdc,0xf0,0xdd,0x64,0xea,0x6e}; -DATA_ALIGN16(const unsigned char 
JH256_H0[128])={0xeb,0x98,0xa3,0x41,0x2c,0x20,0xd3,0xeb,0x92,0xcd,0xbe,0x7b,0x9c,0xb2,0x45,0xc1,0x1c,0x93,0x51,0x91,0x60,0xd4,0xc7,0xfa,0x26,0x0,0x82,0xd6,0x7e,0x50,0x8a,0x3,0xa4,0x23,0x9e,0x26,0x77,0x26,0xb9,0x45,0xe0,0xfb,0x1a,0x48,0xd4,0x1a,0x94,0x77,0xcd,0xb5,0xab,0x26,0x2,0x6b,0x17,0x7a,0x56,0xf0,0x24,0x42,0xf,0xff,0x2f,0xa8,0x71,0xa3,0x96,0x89,0x7f,0x2e,0x4d,0x75,0x1d,0x14,0x49,0x8,0xf7,0x7d,0xe2,0x62,0x27,0x76,0x95,0xf7,0x76,0x24,0x8f,0x94,0x87,0xd5,0xb6,0x57,0x47,0x80,0x29,0x6c,0x5c,0x5e,0x27,0x2d,0xac,0x8e,0xd,0x6c,0x51,0x84,0x50,0xc6,0x57,0x5,0x7a,0xf,0x7b,0xe4,0xd3,0x67,0x70,0x24,0x12,0xea,0x89,0xe3,0xab,0x13,0xd3,0x1c,0xd7,0x69}; -DATA_ALIGN16(const unsigned char JH384_H0[128])={0x48,0x1e,0x3b,0xc6,0xd8,0x13,0x39,0x8a,0x6d,0x3b,0x5e,0x89,0x4a,0xde,0x87,0x9b,0x63,0xfa,0xea,0x68,0xd4,0x80,0xad,0x2e,0x33,0x2c,0xcb,0x21,0x48,0xf,0x82,0x67,0x98,0xae,0xc8,0x4d,0x90,0x82,0xb9,0x28,0xd4,0x55,0xea,0x30,0x41,0x11,0x42,0x49,0x36,0xf5,0x55,0xb2,0x92,0x48,0x47,0xec,0xc7,0x25,0xa,0x93,0xba,0xf4,0x3c,0xe1,0x56,0x9b,0x7f,0x8a,0x27,0xdb,0x45,0x4c,0x9e,0xfc,0xbd,0x49,0x63,0x97,0xaf,0xe,0x58,0x9f,0xc2,0x7d,0x26,0xaa,0x80,0xcd,0x80,0xc0,0x8b,0x8c,0x9d,0xeb,0x2e,0xda,0x8a,0x79,0x81,0xe8,0xf8,0xd5,0x37,0x3a,0xf4,0x39,0x67,0xad,0xdd,0xd1,0x7a,0x71,0xa9,0xb4,0xd3,0xbd,0xa4,0x75,0xd3,0x94,0x97,0x6c,0x3f,0xba,0x98,0x42,0x73,0x7f}; -DATA_ALIGN16(const unsigned char 
JH512_H0[128])={0x6f,0xd1,0x4b,0x96,0x3e,0x0,0xaa,0x17,0x63,0x6a,0x2e,0x5,0x7a,0x15,0xd5,0x43,0x8a,0x22,0x5e,0x8d,0xc,0x97,0xef,0xb,0xe9,0x34,0x12,0x59,0xf2,0xb3,0xc3,0x61,0x89,0x1d,0xa0,0xc1,0x53,0x6f,0x80,0x1e,0x2a,0xa9,0x5,0x6b,0xea,0x2b,0x6d,0x80,0x58,0x8e,0xcc,0xdb,0x20,0x75,0xba,0xa6,0xa9,0xf,0x3a,0x76,0xba,0xf8,0x3b,0xf7,0x1,0x69,0xe6,0x5,0x41,0xe3,0x4a,0x69,0x46,0xb5,0x8a,0x8e,0x2e,0x6f,0xe6,0x5a,0x10,0x47,0xa7,0xd0,0xc1,0x84,0x3c,0x24,0x3b,0x6e,0x71,0xb1,0x2d,0x5a,0xc1,0x99,0xcf,0x57,0xf6,0xec,0x9d,0xb1,0xf8,0x56,0xa7,0x6,0x88,0x7c,0x57,0x16,0xb1,0x56,0xe3,0xc2,0xfc,0xdf,0xe6,0x85,0x17,0xfb,0x54,0x5a,0x46,0x78,0xcc,0x8c,0xdd,0x4b}; - -/*42 round constants, each round constant is 32-byte (256-bit)*/ -DATA_ALIGN16(const unsigned char E8_bitslice_roundconstant[42][32])={ -{0x72,0xd5,0xde,0xa2,0xdf,0x15,0xf8,0x67,0x7b,0x84,0x15,0xa,0xb7,0x23,0x15,0x57,0x81,0xab,0xd6,0x90,0x4d,0x5a,0x87,0xf6,0x4e,0x9f,0x4f,0xc5,0xc3,0xd1,0x2b,0x40}, -{0xea,0x98,0x3a,0xe0,0x5c,0x45,0xfa,0x9c,0x3,0xc5,0xd2,0x99,0x66,0xb2,0x99,0x9a,0x66,0x2,0x96,0xb4,0xf2,0xbb,0x53,0x8a,0xb5,0x56,0x14,0x1a,0x88,0xdb,0xa2,0x31}, -{0x3,0xa3,0x5a,0x5c,0x9a,0x19,0xe,0xdb,0x40,0x3f,0xb2,0xa,0x87,0xc1,0x44,0x10,0x1c,0x5,0x19,0x80,0x84,0x9e,0x95,0x1d,0x6f,0x33,0xeb,0xad,0x5e,0xe7,0xcd,0xdc}, -{0x10,0xba,0x13,0x92,0x2,0xbf,0x6b,0x41,0xdc,0x78,0x65,0x15,0xf7,0xbb,0x27,0xd0,0xa,0x2c,0x81,0x39,0x37,0xaa,0x78,0x50,0x3f,0x1a,0xbf,0xd2,0x41,0x0,0x91,0xd3}, -{0x42,0x2d,0x5a,0xd,0xf6,0xcc,0x7e,0x90,0xdd,0x62,0x9f,0x9c,0x92,0xc0,0x97,0xce,0x18,0x5c,0xa7,0xb,0xc7,0x2b,0x44,0xac,0xd1,0xdf,0x65,0xd6,0x63,0xc6,0xfc,0x23}, -{0x97,0x6e,0x6c,0x3,0x9e,0xe0,0xb8,0x1a,0x21,0x5,0x45,0x7e,0x44,0x6c,0xec,0xa8,0xee,0xf1,0x3,0xbb,0x5d,0x8e,0x61,0xfa,0xfd,0x96,0x97,0xb2,0x94,0x83,0x81,0x97}, -{0x4a,0x8e,0x85,0x37,0xdb,0x3,0x30,0x2f,0x2a,0x67,0x8d,0x2d,0xfb,0x9f,0x6a,0x95,0x8a,0xfe,0x73,0x81,0xf8,0xb8,0x69,0x6c,0x8a,0xc7,0x72,0x46,0xc0,0x7f,0x42,0x14}, 
-{0xc5,0xf4,0x15,0x8f,0xbd,0xc7,0x5e,0xc4,0x75,0x44,0x6f,0xa7,0x8f,0x11,0xbb,0x80,0x52,0xde,0x75,0xb7,0xae,0xe4,0x88,0xbc,0x82,0xb8,0x0,0x1e,0x98,0xa6,0xa3,0xf4}, -{0x8e,0xf4,0x8f,0x33,0xa9,0xa3,0x63,0x15,0xaa,0x5f,0x56,0x24,0xd5,0xb7,0xf9,0x89,0xb6,0xf1,0xed,0x20,0x7c,0x5a,0xe0,0xfd,0x36,0xca,0xe9,0x5a,0x6,0x42,0x2c,0x36}, -{0xce,0x29,0x35,0x43,0x4e,0xfe,0x98,0x3d,0x53,0x3a,0xf9,0x74,0x73,0x9a,0x4b,0xa7,0xd0,0xf5,0x1f,0x59,0x6f,0x4e,0x81,0x86,0xe,0x9d,0xad,0x81,0xaf,0xd8,0x5a,0x9f}, -{0xa7,0x5,0x6,0x67,0xee,0x34,0x62,0x6a,0x8b,0xb,0x28,0xbe,0x6e,0xb9,0x17,0x27,0x47,0x74,0x7,0x26,0xc6,0x80,0x10,0x3f,0xe0,0xa0,0x7e,0x6f,0xc6,0x7e,0x48,0x7b}, -{0xd,0x55,0xa,0xa5,0x4a,0xf8,0xa4,0xc0,0x91,0xe3,0xe7,0x9f,0x97,0x8e,0xf1,0x9e,0x86,0x76,0x72,0x81,0x50,0x60,0x8d,0xd4,0x7e,0x9e,0x5a,0x41,0xf3,0xe5,0xb0,0x62}, -{0xfc,0x9f,0x1f,0xec,0x40,0x54,0x20,0x7a,0xe3,0xe4,0x1a,0x0,0xce,0xf4,0xc9,0x84,0x4f,0xd7,0x94,0xf5,0x9d,0xfa,0x95,0xd8,0x55,0x2e,0x7e,0x11,0x24,0xc3,0x54,0xa5}, -{0x5b,0xdf,0x72,0x28,0xbd,0xfe,0x6e,0x28,0x78,0xf5,0x7f,0xe2,0xf,0xa5,0xc4,0xb2,0x5,0x89,0x7c,0xef,0xee,0x49,0xd3,0x2e,0x44,0x7e,0x93,0x85,0xeb,0x28,0x59,0x7f}, -{0x70,0x5f,0x69,0x37,0xb3,0x24,0x31,0x4a,0x5e,0x86,0x28,0xf1,0x1d,0xd6,0xe4,0x65,0xc7,0x1b,0x77,0x4,0x51,0xb9,0x20,0xe7,0x74,0xfe,0x43,0xe8,0x23,0xd4,0x87,0x8a}, -{0x7d,0x29,0xe8,0xa3,0x92,0x76,0x94,0xf2,0xdd,0xcb,0x7a,0x9,0x9b,0x30,0xd9,0xc1,0x1d,0x1b,0x30,0xfb,0x5b,0xdc,0x1b,0xe0,0xda,0x24,0x49,0x4f,0xf2,0x9c,0x82,0xbf}, -{0xa4,0xe7,0xba,0x31,0xb4,0x70,0xbf,0xff,0xd,0x32,0x44,0x5,0xde,0xf8,0xbc,0x48,0x3b,0xae,0xfc,0x32,0x53,0xbb,0xd3,0x39,0x45,0x9f,0xc3,0xc1,0xe0,0x29,0x8b,0xa0}, -{0xe5,0xc9,0x5,0xfd,0xf7,0xae,0x9,0xf,0x94,0x70,0x34,0x12,0x42,0x90,0xf1,0x34,0xa2,0x71,0xb7,0x1,0xe3,0x44,0xed,0x95,0xe9,0x3b,0x8e,0x36,0x4f,0x2f,0x98,0x4a}, -{0x88,0x40,0x1d,0x63,0xa0,0x6c,0xf6,0x15,0x47,0xc1,0x44,0x4b,0x87,0x52,0xaf,0xff,0x7e,0xbb,0x4a,0xf1,0xe2,0xa,0xc6,0x30,0x46,0x70,0xb6,0xc5,0xcc,0x6e,0x8c,0xe6}, 
-{0xa4,0xd5,0xa4,0x56,0xbd,0x4f,0xca,0x0,0xda,0x9d,0x84,0x4b,0xc8,0x3e,0x18,0xae,0x73,0x57,0xce,0x45,0x30,0x64,0xd1,0xad,0xe8,0xa6,0xce,0x68,0x14,0x5c,0x25,0x67}, -{0xa3,0xda,0x8c,0xf2,0xcb,0xe,0xe1,0x16,0x33,0xe9,0x6,0x58,0x9a,0x94,0x99,0x9a,0x1f,0x60,0xb2,0x20,0xc2,0x6f,0x84,0x7b,0xd1,0xce,0xac,0x7f,0xa0,0xd1,0x85,0x18}, -{0x32,0x59,0x5b,0xa1,0x8d,0xdd,0x19,0xd3,0x50,0x9a,0x1c,0xc0,0xaa,0xa5,0xb4,0x46,0x9f,0x3d,0x63,0x67,0xe4,0x4,0x6b,0xba,0xf6,0xca,0x19,0xab,0xb,0x56,0xee,0x7e}, -{0x1f,0xb1,0x79,0xea,0xa9,0x28,0x21,0x74,0xe9,0xbd,0xf7,0x35,0x3b,0x36,0x51,0xee,0x1d,0x57,0xac,0x5a,0x75,0x50,0xd3,0x76,0x3a,0x46,0xc2,0xfe,0xa3,0x7d,0x70,0x1}, -{0xf7,0x35,0xc1,0xaf,0x98,0xa4,0xd8,0x42,0x78,0xed,0xec,0x20,0x9e,0x6b,0x67,0x79,0x41,0x83,0x63,0x15,0xea,0x3a,0xdb,0xa8,0xfa,0xc3,0x3b,0x4d,0x32,0x83,0x2c,0x83}, -{0xa7,0x40,0x3b,0x1f,0x1c,0x27,0x47,0xf3,0x59,0x40,0xf0,0x34,0xb7,0x2d,0x76,0x9a,0xe7,0x3e,0x4e,0x6c,0xd2,0x21,0x4f,0xfd,0xb8,0xfd,0x8d,0x39,0xdc,0x57,0x59,0xef}, -{0x8d,0x9b,0xc,0x49,0x2b,0x49,0xeb,0xda,0x5b,0xa2,0xd7,0x49,0x68,0xf3,0x70,0xd,0x7d,0x3b,0xae,0xd0,0x7a,0x8d,0x55,0x84,0xf5,0xa5,0xe9,0xf0,0xe4,0xf8,0x8e,0x65}, -{0xa0,0xb8,0xa2,0xf4,0x36,0x10,0x3b,0x53,0xc,0xa8,0x7,0x9e,0x75,0x3e,0xec,0x5a,0x91,0x68,0x94,0x92,0x56,0xe8,0x88,0x4f,0x5b,0xb0,0x5c,0x55,0xf8,0xba,0xbc,0x4c}, -{0xe3,0xbb,0x3b,0x99,0xf3,0x87,0x94,0x7b,0x75,0xda,0xf4,0xd6,0x72,0x6b,0x1c,0x5d,0x64,0xae,0xac,0x28,0xdc,0x34,0xb3,0x6d,0x6c,0x34,0xa5,0x50,0xb8,0x28,0xdb,0x71}, -{0xf8,0x61,0xe2,0xf2,0x10,0x8d,0x51,0x2a,0xe3,0xdb,0x64,0x33,0x59,0xdd,0x75,0xfc,0x1c,0xac,0xbc,0xf1,0x43,0xce,0x3f,0xa2,0x67,0xbb,0xd1,0x3c,0x2,0xe8,0x43,0xb0}, -{0x33,0xa,0x5b,0xca,0x88,0x29,0xa1,0x75,0x7f,0x34,0x19,0x4d,0xb4,0x16,0x53,0x5c,0x92,0x3b,0x94,0xc3,0xe,0x79,0x4d,0x1e,0x79,0x74,0x75,0xd7,0xb6,0xee,0xaf,0x3f}, -{0xea,0xa8,0xd4,0xf7,0xbe,0x1a,0x39,0x21,0x5c,0xf4,0x7e,0x9,0x4c,0x23,0x27,0x51,0x26,0xa3,0x24,0x53,0xba,0x32,0x3c,0xd2,0x44,0xa3,0x17,0x4a,0x6d,0xa6,0xd5,0xad}, 
-{0xb5,0x1d,0x3e,0xa6,0xaf,0xf2,0xc9,0x8,0x83,0x59,0x3d,0x98,0x91,0x6b,0x3c,0x56,0x4c,0xf8,0x7c,0xa1,0x72,0x86,0x60,0x4d,0x46,0xe2,0x3e,0xcc,0x8,0x6e,0xc7,0xf6}, -{0x2f,0x98,0x33,0xb3,0xb1,0xbc,0x76,0x5e,0x2b,0xd6,0x66,0xa5,0xef,0xc4,0xe6,0x2a,0x6,0xf4,0xb6,0xe8,0xbe,0xc1,0xd4,0x36,0x74,0xee,0x82,0x15,0xbc,0xef,0x21,0x63}, -{0xfd,0xc1,0x4e,0xd,0xf4,0x53,0xc9,0x69,0xa7,0x7d,0x5a,0xc4,0x6,0x58,0x58,0x26,0x7e,0xc1,0x14,0x16,0x6,0xe0,0xfa,0x16,0x7e,0x90,0xaf,0x3d,0x28,0x63,0x9d,0x3f}, -{0xd2,0xc9,0xf2,0xe3,0x0,0x9b,0xd2,0xc,0x5f,0xaa,0xce,0x30,0xb7,0xd4,0xc,0x30,0x74,0x2a,0x51,0x16,0xf2,0xe0,0x32,0x98,0xd,0xeb,0x30,0xd8,0xe3,0xce,0xf8,0x9a}, -{0x4b,0xc5,0x9e,0x7b,0xb5,0xf1,0x79,0x92,0xff,0x51,0xe6,0x6e,0x4,0x86,0x68,0xd3,0x9b,0x23,0x4d,0x57,0xe6,0x96,0x67,0x31,0xcc,0xe6,0xa6,0xf3,0x17,0xa,0x75,0x5}, -{0xb1,0x76,0x81,0xd9,0x13,0x32,0x6c,0xce,0x3c,0x17,0x52,0x84,0xf8,0x5,0xa2,0x62,0xf4,0x2b,0xcb,0xb3,0x78,0x47,0x15,0x47,0xff,0x46,0x54,0x82,0x23,0x93,0x6a,0x48}, -{0x38,0xdf,0x58,0x7,0x4e,0x5e,0x65,0x65,0xf2,0xfc,0x7c,0x89,0xfc,0x86,0x50,0x8e,0x31,0x70,0x2e,0x44,0xd0,0xb,0xca,0x86,0xf0,0x40,0x9,0xa2,0x30,0x78,0x47,0x4e}, -{0x65,0xa0,0xee,0x39,0xd1,0xf7,0x38,0x83,0xf7,0x5e,0xe9,0x37,0xe4,0x2c,0x3a,0xbd,0x21,0x97,0xb2,0x26,0x1,0x13,0xf8,0x6f,0xa3,0x44,0xed,0xd1,0xef,0x9f,0xde,0xe7}, -{0x8b,0xa0,0xdf,0x15,0x76,0x25,0x92,0xd9,0x3c,0x85,0xf7,0xf6,0x12,0xdc,0x42,0xbe,0xd8,0xa7,0xec,0x7c,0xab,0x27,0xb0,0x7e,0x53,0x8d,0x7d,0xda,0xaa,0x3e,0xa8,0xde}, -{0xaa,0x25,0xce,0x93,0xbd,0x2,0x69,0xd8,0x5a,0xf6,0x43,0xfd,0x1a,0x73,0x8,0xf9,0xc0,0x5f,0xef,0xda,0x17,0x4a,0x19,0xa5,0x97,0x4d,0x66,0x33,0x4c,0xfd,0x21,0x6a}, -{0x35,0xb4,0x98,0x31,0xdb,0x41,0x15,0x70,0xea,0x1e,0xf,0xbb,0xed,0xcd,0x54,0x9b,0x9a,0xd0,0x63,0xa1,0x51,0x97,0x40,0x72,0xf6,0x75,0x9d,0xbf,0x91,0x47,0x6f,0xe2}}; - - -void F8(hashState *state); /* the compression function F8 */ - -/*The API functions*/ -HashReturn Init(hashState *state, int hashbitlen); -HashReturn Update(hashState *state, const BitSequence *data, DataLength 
databitlen); -HashReturn Final(hashState *state, BitSequence *hashval); -HashReturn Hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval); - -/*The following defines operations on 128-bit word(s)*/ -#define CONSTANT(b) _mm_set1_epi8((b)) /*set each byte in a 128-bit register to be "b"*/ - -#define XOR(x,y) _mm_xor_si128((x),(y)) /*XOR(x,y) = x ^ y, where x and y are two 128-bit word*/ -#define AND(x,y) _mm_and_si128((x),(y)) /*AND(x,y) = x & y, where x and y are two 128-bit word*/ -#define ANDNOT(x,y) _mm_andnot_si128((x),(y)) /*ANDNOT(x,y) = (!x) & y, where x and y are two 128-bit word*/ -#define OR(x,y) _mm_or_si128((x),(y)) /*OR(x,y) = x | y, where x and y are two 128-bit word*/ - -#define SHR1(x) _mm_srli_epi16((x), 1) /*SHR1(x) = x >> 1, where x is a 128 bit word*/ -#define SHR2(x) _mm_srli_epi16((x), 2) /*SHR2(x) = x >> 2, where x is a 128 bit word*/ -#define SHR4(x) _mm_srli_epi16((x), 4) /*SHR4(x) = x >> 4, where x is a 128 bit word*/ -#define SHR8(x) _mm_slli_epi16((x), 8) /*SHR8(x) = x >> 8, where x is a 128 bit word*/ -#define SHR16(x) _mm_slli_epi32((x), 16) /*SHR16(x) = x >> 16, where x is a 128 bit word*/ -#define SHR32(x) _mm_slli_epi64((x), 32) /*SHR32(x) = x >> 32, where x is a 128 bit word*/ -#define SHR64(x) _mm_slli_si128((x), 8) /*SHR64(x) = x >> 64, where x is a 128 bit word*/ - -#define SHL1(x) _mm_slli_epi16((x), 1) /*SHL1(x) = x << 1, where x is a 128 bit word*/ -#define SHL2(x) _mm_slli_epi16((x), 2) /*SHL2(x) = x << 2, where x is a 128 bit word*/ -#define SHL4(x) _mm_slli_epi16((x), 4) /*SHL4(x) = x << 4, where x is a 128 bit word*/ -#define SHL8(x) _mm_srli_epi16((x), 8) /*SHL8(x) = x << 8, where x is a 128 bit word*/ -#define SHL16(x) _mm_srli_epi32((x), 16) /*SHL16(x) = x << 16, where x is a 128 bit word*/ -#define SHL32(x) _mm_srli_epi64((x), 32) /*SHL32(x) = x << 32, where x is a 128 bit word*/ -#define SHL64(x) _mm_srli_si128((x), 8) /*SHL64(x) = x << 64, where x is a 128 bit word*/ - -#define 
SWAP1(x) OR(SHR1(AND((x),CONSTANT(0xaa))),SHL1(AND((x),CONSTANT(0x55)))) /*swapping bit 2i with bit 2i+1 of the 128-bit x */ -#define SWAP2(x) OR(SHR2(AND((x),CONSTANT(0xcc))),SHL2(AND((x),CONSTANT(0x33)))) /*swapping bit 4i||4i+1 with bit 4i+2||4i+3 of the 128-bit x */ -#define SWAP4(x) OR(SHR4(AND((x),CONSTANT(0xf0))),SHL4(AND((x),CONSTANT(0xf)))) /*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 of the 128-bit x */ -#define SWAP8(x) OR(SHR8(x),SHL8(x)) /*swapping bits 16i||16i+1||...||16i+7 with bits 16i+8||16i+9||...||16i+15 of the 128-bit x */ -#define SWAP16(x) OR(SHR16(x),SHL16(x)) /*swapping bits 32i||32i+1||...||32i+15 with bits 32i+16||32i+17||...||32i+31 of the 128-bit x */ -#define SWAP32(x) _mm_shuffle_epi32((x),_MM_SHUFFLE(2,3,0,1)) /*swapping bits 64i||64i+1||...||64i+31 with bits 64i+32||64i+33||...||64i+63 of the 128-bit x*/ -#define SWAP64(x) _mm_shuffle_epi32((x),_MM_SHUFFLE(1,0,3,2)) /*swapping bits 128i||128i+1||...||128i+63 with bits 128i+64||128i+65||...||128i+127 of the 128-bit x*/ - -#define STORE(x,p) _mm_store_si128((__m128i *)(p), (x)) /*store the 128-bit word x into memeory address p, where p is the multile of 16 bytes*/ -#define LOAD(p) _mm_load_si128((__m128i *)(p)) /*load 16 bytes from the memory address p, return a 128-bit word, where p is the multile of 16 bytes*/ - -/*The MDS code*/ -#define L(m0,m1,m2,m3,m4,m5,m6,m7) \ - (m4) = XOR((m4),(m1)); \ - (m5) = XOR((m5),(m2)); \ - (m6) = XOR(XOR((m6),(m3)),(m0)); \ - (m7) = XOR((m7),(m0)); \ - (m0) = XOR((m0),(m5)); \ - (m1) = XOR((m1),(m6)); \ - (m2) = XOR(XOR((m2),(m7)),(m4)); \ - (m3) = XOR((m3),(m4)); - -/*The Sbox, it implements S0 and S1, selected by a constant bit*/ -#define S(m0,m1,m2,m3,c0) \ - m3 = XOR(m3,CONSTANT(0xff)); \ - m0 = XOR(m0,ANDNOT(m2,c0)); \ - temp0 = XOR(c0,AND(m0,m1)); \ - m0 = XOR(m0,AND(m3,m2)); \ - m3 = XOR(m3,ANDNOT(m1,m2)); \ - m1 = XOR(m1,AND(m0,m2)); \ - m2 = XOR(m2,ANDNOT(m3,m0)); \ - m0 = XOR(m0,OR(m1,m3)); \ - m3 = 
XOR(m3,AND(m1,m2)); \ - m2 = XOR(m2,temp0); \ - m1 = XOR(m1,AND(temp0,m0)); - -/* The linear transform of the (7i+0)th round*/ -#define lineartransform_R00(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - L(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bit 2i with bit 2i+1 for m4,m5,m6 and m7 */ \ - m4 = SWAP1(m4); m5 = SWAP1(m5); m6 = SWAP1(m6); m7 = SWAP1(m7); - -/* The linear transform of the (7i+1)th round*/ -#define lineartransform_R01(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - L(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bit 4i||4i+1 with bit 4i+2||4i+3 for m4,m5,m6 and m7 */ \ - m4 = SWAP2(m4); m5 = SWAP2(m5); m6 = SWAP2(m6); m7 = SWAP2(m7); - -/* The linear transform of the (7i+2)th round*/ -#define lineartransform_R02(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - L(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 for m4,m5,m6 and m7*/ \ - m4 = SWAP4(m4); m5 = SWAP4(m5); m6 = SWAP4(m6); m7 = SWAP4(m7); - -/* The linear transform of the (7i+3)th round*/ -#define lineartransform_R03(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - L(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bits 16i||16i+1||...||16i+7 with bits 16i+8||16i+9||...||16i+15 for m4,m5,m6 and m7*/ \ - m4 = SWAP8(m4); m5 = SWAP8(m5); m6 = SWAP8(m6); m7 = SWAP8(m7); - -/* The linear transform of the (7i+4)th round*/ -#define lineartransform_R04(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - L(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bits 32i||32i+1||...||32i+15 with bits 32i+16||32i+17||...||32i+31 for m0,m1,m2 and m3*/ \ - m4 = SWAP16(m4); m5 = SWAP16(m5); m6 = SWAP16(m6); m7 = SWAP16(m7); - -/* The linear transform of the (7i+5)th round -- faster*/ -#define lineartransform_R05(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - L(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bits 64i||64i+1||...||64i+31 with bits 64i+32||64i+33||...||64i+63 for m0,m1,m2 and m3*/ \ - m4 = SWAP32(m4); m5 = SWAP32(m5); m6 = SWAP32(m6); m7 = SWAP32(m7); - -/* The linear transform of 
the (7i+6)th round -- faster*/ -#define lineartransform_R06(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - L(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bits 128i||128i+1||...||128i+63 with bits 128i+64||128i+65||...||128i+127 for m0,m1,m2 and m3*/ \ - m4 = SWAP64(m4); m5 = SWAP64(m5); m6 = SWAP64(m6); m7 = SWAP64(m7); - -/*the round function of E8 */ -#define round_function(nn,r) \ - S(y0,y2,y4,y6, LOAD(E8_bitslice_roundconstant[r]) ); \ - S(y1,y3,y5,y7, LOAD(E8_bitslice_roundconstant[r]+16) ); \ - lineartransform_R##nn(y0,y2,y4,y6,y1,y3,y5,y7); - -/*the compression function F8 */ -void F8(hashState *state) -{ - uint32 i; - word128 y0,y1,y2,y3,y4,y5,y6,y7; - word128 temp0; - - y0 = state->x0; - y1 = state->x1; - y2 = state->x2; - y3 = state->x3; - y4 = state->x4; - y5 = state->x5; - y6 = state->x6; - y7 = state->x7; - - /*xor the 512-bit message with the fist half of the 1024-bit hash state*/ - - y0 = XOR(y0, LOAD(state->buffer)); - y1 = XOR(y1, LOAD(state->buffer+16)); - y2 = XOR(y2, LOAD(state->buffer+32)); - y3 = XOR(y3, LOAD(state->buffer+48)); - - /*perform 42 rounds*/ - for (i = 0; i < 42; i = i+7) { - round_function(00,i); - round_function(01,i+1); - round_function(02,i+2); - round_function(03,i+3); - round_function(04,i+4); - round_function(05,i+5); - round_function(06,i+6); - } - - /*xor the 512-bit message with the second half of the 1024-bit hash state*/ - - y4 = XOR(y4, LOAD(state->buffer)); - y5 = XOR(y5, LOAD(state->buffer+16)); - y6 = XOR(y6, LOAD(state->buffer+32)); - y7 = XOR(y7, LOAD(state->buffer+48)); - - state->x0 = y0; - state->x1 = y1; - state->x2 = y2; - state->x3 = y3; - state->x4 = y4; - state->x5 = y5; - state->x6 = y6; - state->x7 = y7; -} - -/*before hashing a message, initialize the hash state as H0 */ -HashReturn Init(hashState *state, int hashbitlen) -{ - - state->databitlen = 0; - state->datasize_in_buffer = 0; - - state->hashbitlen = hashbitlen; - - /*initialize the initial hash value of JH*/ - /*load the intital hash value into 
state*/ - - switch(hashbitlen) - { - case 224: - state->x0 = LOAD(JH224_H0); - state->x1 = LOAD(JH224_H0+16); - state->x2 = LOAD(JH224_H0+32); - state->x3 = LOAD(JH224_H0+48); - state->x4 = LOAD(JH224_H0+64); - state->x5 = LOAD(JH224_H0+80); - state->x6 = LOAD(JH224_H0+96); - state->x7 = LOAD(JH224_H0+112); - break; - - case 256: - state->x0 = LOAD(JH256_H0); - state->x1 = LOAD(JH256_H0+16); - state->x2 = LOAD(JH256_H0+32); - state->x3 = LOAD(JH256_H0+48); - state->x4 = LOAD(JH256_H0+64); - state->x5 = LOAD(JH256_H0+80); - state->x6 = LOAD(JH256_H0+96); - state->x7 = LOAD(JH256_H0+112); - break; - - case 384: - state->x0 = LOAD(JH384_H0); - state->x1 = LOAD(JH384_H0+16); - state->x2 = LOAD(JH384_H0+32); - state->x3 = LOAD(JH384_H0+48); - state->x4 = LOAD(JH384_H0+64); - state->x5 = LOAD(JH384_H0+80); - state->x6 = LOAD(JH384_H0+96); - state->x7 = LOAD(JH384_H0+112); - break; - - case 512: - state->x0 = LOAD(JH512_H0); - state->x1 = LOAD(JH512_H0+16); - state->x2 = LOAD(JH512_H0+32); - state->x3 = LOAD(JH512_H0+48); - state->x4 = LOAD(JH512_H0+64); - state->x5 = LOAD(JH512_H0+80); - state->x6 = LOAD(JH512_H0+96); - state->x7 = LOAD(JH512_H0+112); - break; - } - - return(SUCCESS); -} - -/*hash each 512-bit message block, except the last partial block*/ -HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen) -{ - DataLength index; /*the starting address of the data to be compressed*/ - - state->databitlen += databitlen; - index = 0; - - /*if there is remaining data in the buffer, fill it to a full message block first*/ - /*we assume that the size of the data in the buffer is the multiple of 8 bits if it is not at the end of a message*/ - - /*There is data in the buffer, but the incoming data is insufficient for a full block*/ - if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512) ) { - if ( (databitlen & 7) == 0 ) { - memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 
64-(state->datasize_in_buffer >> 3)) ; - } - else memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)+1) ; - state->datasize_in_buffer += databitlen; - databitlen = 0; - } - - /*There is data in the buffer, and the incoming data is sufficient for a full block*/ - if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) { - memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ) ; - index = 64-(state->datasize_in_buffer >> 3); - databitlen = databitlen - (512 - state->datasize_in_buffer); - F8(state); - state->datasize_in_buffer = 0; - } - - /*hash the remaining full message blocks*/ - for ( ; databitlen >= 512; index = index+64, databitlen = databitlen - 512) { - memcpy(state->buffer, data+index, 64); - F8(state); - } - - /*store the partial block into buffer, assume that -- if part of the last byte is not part of the message, then that part consists of 0 bits*/ - if ( databitlen > 0) { - if ((databitlen & 7) == 0) - memcpy(state->buffer, data+index, (databitlen & 0x1ff) >> 3); - else - memcpy(state->buffer, data+index, ((databitlen & 0x1ff) >> 3)+1); - state->datasize_in_buffer = databitlen; - } - - return(SUCCESS); -} - -/*pad the message, process the padded block(s), truncate the hash value H to obtain the message digest*/ -HashReturn Final(hashState *state, BitSequence *hashval) -{ - unsigned int i; - DATA_ALIGN16(unsigned char t[64]); - - if ( (state->databitlen & 0x1ff) == 0 ) - { - /*pad the message when databitlen is multiple of 512 bits, then process the padded block*/ - memset(state->buffer,0,64); - state->buffer[0] = 0x80; - state->buffer[63] = state->databitlen & 0xff; - state->buffer[62] = (state->databitlen >> 8) & 0xff; - state->buffer[61] = (state->databitlen >> 16) & 0xff; - state->buffer[60] = (state->databitlen >> 24) & 0xff; - state->buffer[59] = (state->databitlen >> 32) & 0xff; - state->buffer[58] = 
(state->databitlen >> 40) & 0xff; - state->buffer[57] = (state->databitlen >> 48) & 0xff; - state->buffer[56] = (state->databitlen >> 56) & 0xff; - F8(state); - } - else { - /*set the rest of the bytes in the buffer to 0*/ - if ( (state->datasize_in_buffer & 7) == 0) - for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) state->buffer[i] = 0; - else - for (i = ((state->databitlen & 0x1ff) >> 3)+1; i < 64; i++) state->buffer[i] = 0; - - /*pad and process the partial block when databitlen is not multiple of 512 bits, then hash the padded blocks*/ - state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7- (state->databitlen & 7)); - F8(state); - memset(state->buffer,0,64); - state->buffer[63] = state->databitlen & 0xff; - state->buffer[62] = (state->databitlen >> 8) & 0xff; - state->buffer[61] = (state->databitlen >> 16) & 0xff; - state->buffer[60] = (state->databitlen >> 24) & 0xff; - state->buffer[59] = (state->databitlen >> 32) & 0xff; - state->buffer[58] = (state->databitlen >> 40) & 0xff; - state->buffer[57] = (state->databitlen >> 48) & 0xff; - state->buffer[56] = (state->databitlen >> 56) & 0xff; - F8(state); - } - - /*truncting the final hash value to generate the message digest*/ - - STORE(state->x4,t); - STORE(state->x5,t+16); - STORE(state->x6,t+32); - STORE(state->x7,t+48); - - switch (state->hashbitlen) - { - case 224: memcpy(hashval,t+36,28); break; - case 256: memcpy(hashval,t+32,32); break; - case 384: memcpy(hashval,t+16,48); break; - case 512: memcpy(hashval,t,64); break; - } - - return(SUCCESS); -} - -/* hash a message, - three inputs: message digest size in bits (hashbitlen); message (data); message length in bits (databitlen) - one output: message digest (hashval) -*/ -HashReturn Hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval) -{ - hashState state; - - if ( hashbitlen == 224 || hashbitlen == 256 || hashbitlen == 384 || hashbitlen == 512 ) - { - Init(&state, hashbitlen); - Update(&state, data, 
databitlen); - Final(&state, hashval); - return SUCCESS; - } - else - return(BAD_HASHLEN); -} diff --git a/algo/jh/sse2/jh_sse2_opt64.h b/algo/jh/sse2/jh_sse2_opt64.h deleted file mode 100644 index 06195b3..0000000 --- a/algo/jh/sse2/jh_sse2_opt64.h +++ /dev/null @@ -1,357 +0,0 @@ -/*This program gives the optimized SSE2 bitslice implementation of JH for 64-bit platform (with 16 128-bit XMM registers). - - -------------------------------- - Performance - - Microprocessor: Intel CORE 2 processor (Core 2 Duo Mobile T6600 2.2GHz) - Operating System: 64-bit Ubuntu 10.04 (Linux kernel 2.6.32-22-generic) - Speed for long message: - 1) 19.9 cycles/byte compiler: Intel C++ Compiler 11.1 compilation option: icc -O3 - 2) 20.9 cycles/byte compiler: gcc 4.4.3 compilation option: gcc -msse2 -O3 - - -------------------------------- - Compare with the original JH sse2 code (October 2008) for 64-bit platform, we made the modifications: - a) The Sbox implementation follows exactly the description given in the document - b) Data alignment definition is improved so that the code can be compiled by GCC, Intel C++ compiler and Microsoft Visual C compiler - c) Using y0,y1,..,y7 variables in Function F8 for performance improvement (local variable in function F8 so that compiler can optimize the code easily) - d) Removed a number of intermediate variables from the program (so as to given compiler more freedom to optimize the code) - e) Using "for" loop to implement 42 rounds (with 7 rounds in each loop), so as to reduce the code size. 
- - -------------------------------- - Last Modified: January 16, 2011 -*/ - - -#include -#include -#include -#include "algo/sha/sha3-defs.h" - -typedef __m128i word128; /*word128 defines a 128-bit SSE2 word*/ -typedef enum {jhSUCCESS = 0, jhFAIL = 1, jhBAD_HASHLEN = 2} jhReturn; - -/*define data alignment for different C compilers*/ -#if defined(__GNUC__) - #define DATA_ALIGN16(x) x __attribute__ ((aligned(16))) -#else - #define DATA_ALIGN16(x) __declspec(align(16)) x -#endif - -typedef struct { - DataLength jhbitlen; /*the message digest size*/ - DataLength databitlen; /*the message size in bits*/ - DataLength datasize_in_buffer; /*the size of the message remained in buffer; assumed to be multiple of 8bits except for the last partial block at the end of the message*/ - word128 x0,x1,x2,x3,x4,x5,x6,x7; /*1024-bit state;*/ - unsigned char buffer[64]; /*512-bit message block;*/ -} jhState; - -#define DECL_JH \ - word128 jhSx0,jhSx1,jhSx2,jhSx3,jhSx4,jhSx5,jhSx6,jhSx7; \ - unsigned char jhSbuffer[64]; - - -/*The initial hash value H(0)*/ -static DATA_ALIGN16(const unsigned char JH512_H0[128])={0x6f,0xd1,0x4b,0x96,0x3e,0x0,0xaa,0x17,0x63,0x6a,0x2e,0x5,0x7a,0x15,0xd5,0x43,0x8a,0x22,0x5e,0x8d,0xc,0x97,0xef,0xb,0xe9,0x34,0x12,0x59,0xf2,0xb3,0xc3,0x61,0x89,0x1d,0xa0,0xc1,0x53,0x6f,0x80,0x1e,0x2a,0xa9,0x5,0x6b,0xea,0x2b,0x6d,0x80,0x58,0x8e,0xcc,0xdb,0x20,0x75,0xba,0xa6,0xa9,0xf,0x3a,0x76,0xba,0xf8,0x3b,0xf7,0x1,0x69,0xe6,0x5,0x41,0xe3,0x4a,0x69,0x46,0xb5,0x8a,0x8e,0x2e,0x6f,0xe6,0x5a,0x10,0x47,0xa7,0xd0,0xc1,0x84,0x3c,0x24,0x3b,0x6e,0x71,0xb1,0x2d,0x5a,0xc1,0x99,0xcf,0x57,0xf6,0xec,0x9d,0xb1,0xf8,0x56,0xa7,0x6,0x88,0x7c,0x57,0x16,0xb1,0x56,0xe3,0xc2,0xfc,0xdf,0xe6,0x85,0x17,0xfb,0x54,0x5a,0x46,0x78,0xcc,0x8c,0xdd,0x4b}; - -/*42 round constants, each round constant is 32-byte (256-bit)*/ -static DATA_ALIGN16(const unsigned char jhE8_bitslice_roundconstant[42][32])={ 
-{0x72,0xd5,0xde,0xa2,0xdf,0x15,0xf8,0x67,0x7b,0x84,0x15,0xa,0xb7,0x23,0x15,0x57,0x81,0xab,0xd6,0x90,0x4d,0x5a,0x87,0xf6,0x4e,0x9f,0x4f,0xc5,0xc3,0xd1,0x2b,0x40}, -{0xea,0x98,0x3a,0xe0,0x5c,0x45,0xfa,0x9c,0x3,0xc5,0xd2,0x99,0x66,0xb2,0x99,0x9a,0x66,0x2,0x96,0xb4,0xf2,0xbb,0x53,0x8a,0xb5,0x56,0x14,0x1a,0x88,0xdb,0xa2,0x31}, -{0x3,0xa3,0x5a,0x5c,0x9a,0x19,0xe,0xdb,0x40,0x3f,0xb2,0xa,0x87,0xc1,0x44,0x10,0x1c,0x5,0x19,0x80,0x84,0x9e,0x95,0x1d,0x6f,0x33,0xeb,0xad,0x5e,0xe7,0xcd,0xdc}, -{0x10,0xba,0x13,0x92,0x2,0xbf,0x6b,0x41,0xdc,0x78,0x65,0x15,0xf7,0xbb,0x27,0xd0,0xa,0x2c,0x81,0x39,0x37,0xaa,0x78,0x50,0x3f,0x1a,0xbf,0xd2,0x41,0x0,0x91,0xd3}, -{0x42,0x2d,0x5a,0xd,0xf6,0xcc,0x7e,0x90,0xdd,0x62,0x9f,0x9c,0x92,0xc0,0x97,0xce,0x18,0x5c,0xa7,0xb,0xc7,0x2b,0x44,0xac,0xd1,0xdf,0x65,0xd6,0x63,0xc6,0xfc,0x23}, -{0x97,0x6e,0x6c,0x3,0x9e,0xe0,0xb8,0x1a,0x21,0x5,0x45,0x7e,0x44,0x6c,0xec,0xa8,0xee,0xf1,0x3,0xbb,0x5d,0x8e,0x61,0xfa,0xfd,0x96,0x97,0xb2,0x94,0x83,0x81,0x97}, -{0x4a,0x8e,0x85,0x37,0xdb,0x3,0x30,0x2f,0x2a,0x67,0x8d,0x2d,0xfb,0x9f,0x6a,0x95,0x8a,0xfe,0x73,0x81,0xf8,0xb8,0x69,0x6c,0x8a,0xc7,0x72,0x46,0xc0,0x7f,0x42,0x14}, -{0xc5,0xf4,0x15,0x8f,0xbd,0xc7,0x5e,0xc4,0x75,0x44,0x6f,0xa7,0x8f,0x11,0xbb,0x80,0x52,0xde,0x75,0xb7,0xae,0xe4,0x88,0xbc,0x82,0xb8,0x0,0x1e,0x98,0xa6,0xa3,0xf4}, -{0x8e,0xf4,0x8f,0x33,0xa9,0xa3,0x63,0x15,0xaa,0x5f,0x56,0x24,0xd5,0xb7,0xf9,0x89,0xb6,0xf1,0xed,0x20,0x7c,0x5a,0xe0,0xfd,0x36,0xca,0xe9,0x5a,0x6,0x42,0x2c,0x36}, -{0xce,0x29,0x35,0x43,0x4e,0xfe,0x98,0x3d,0x53,0x3a,0xf9,0x74,0x73,0x9a,0x4b,0xa7,0xd0,0xf5,0x1f,0x59,0x6f,0x4e,0x81,0x86,0xe,0x9d,0xad,0x81,0xaf,0xd8,0x5a,0x9f}, -{0xa7,0x5,0x6,0x67,0xee,0x34,0x62,0x6a,0x8b,0xb,0x28,0xbe,0x6e,0xb9,0x17,0x27,0x47,0x74,0x7,0x26,0xc6,0x80,0x10,0x3f,0xe0,0xa0,0x7e,0x6f,0xc6,0x7e,0x48,0x7b}, -{0xd,0x55,0xa,0xa5,0x4a,0xf8,0xa4,0xc0,0x91,0xe3,0xe7,0x9f,0x97,0x8e,0xf1,0x9e,0x86,0x76,0x72,0x81,0x50,0x60,0x8d,0xd4,0x7e,0x9e,0x5a,0x41,0xf3,0xe5,0xb0,0x62}, 
-{0xfc,0x9f,0x1f,0xec,0x40,0x54,0x20,0x7a,0xe3,0xe4,0x1a,0x0,0xce,0xf4,0xc9,0x84,0x4f,0xd7,0x94,0xf5,0x9d,0xfa,0x95,0xd8,0x55,0x2e,0x7e,0x11,0x24,0xc3,0x54,0xa5}, -{0x5b,0xdf,0x72,0x28,0xbd,0xfe,0x6e,0x28,0x78,0xf5,0x7f,0xe2,0xf,0xa5,0xc4,0xb2,0x5,0x89,0x7c,0xef,0xee,0x49,0xd3,0x2e,0x44,0x7e,0x93,0x85,0xeb,0x28,0x59,0x7f}, -{0x70,0x5f,0x69,0x37,0xb3,0x24,0x31,0x4a,0x5e,0x86,0x28,0xf1,0x1d,0xd6,0xe4,0x65,0xc7,0x1b,0x77,0x4,0x51,0xb9,0x20,0xe7,0x74,0xfe,0x43,0xe8,0x23,0xd4,0x87,0x8a}, -{0x7d,0x29,0xe8,0xa3,0x92,0x76,0x94,0xf2,0xdd,0xcb,0x7a,0x9,0x9b,0x30,0xd9,0xc1,0x1d,0x1b,0x30,0xfb,0x5b,0xdc,0x1b,0xe0,0xda,0x24,0x49,0x4f,0xf2,0x9c,0x82,0xbf}, -{0xa4,0xe7,0xba,0x31,0xb4,0x70,0xbf,0xff,0xd,0x32,0x44,0x5,0xde,0xf8,0xbc,0x48,0x3b,0xae,0xfc,0x32,0x53,0xbb,0xd3,0x39,0x45,0x9f,0xc3,0xc1,0xe0,0x29,0x8b,0xa0}, -{0xe5,0xc9,0x5,0xfd,0xf7,0xae,0x9,0xf,0x94,0x70,0x34,0x12,0x42,0x90,0xf1,0x34,0xa2,0x71,0xb7,0x1,0xe3,0x44,0xed,0x95,0xe9,0x3b,0x8e,0x36,0x4f,0x2f,0x98,0x4a}, -{0x88,0x40,0x1d,0x63,0xa0,0x6c,0xf6,0x15,0x47,0xc1,0x44,0x4b,0x87,0x52,0xaf,0xff,0x7e,0xbb,0x4a,0xf1,0xe2,0xa,0xc6,0x30,0x46,0x70,0xb6,0xc5,0xcc,0x6e,0x8c,0xe6}, -{0xa4,0xd5,0xa4,0x56,0xbd,0x4f,0xca,0x0,0xda,0x9d,0x84,0x4b,0xc8,0x3e,0x18,0xae,0x73,0x57,0xce,0x45,0x30,0x64,0xd1,0xad,0xe8,0xa6,0xce,0x68,0x14,0x5c,0x25,0x67}, -{0xa3,0xda,0x8c,0xf2,0xcb,0xe,0xe1,0x16,0x33,0xe9,0x6,0x58,0x9a,0x94,0x99,0x9a,0x1f,0x60,0xb2,0x20,0xc2,0x6f,0x84,0x7b,0xd1,0xce,0xac,0x7f,0xa0,0xd1,0x85,0x18}, -{0x32,0x59,0x5b,0xa1,0x8d,0xdd,0x19,0xd3,0x50,0x9a,0x1c,0xc0,0xaa,0xa5,0xb4,0x46,0x9f,0x3d,0x63,0x67,0xe4,0x4,0x6b,0xba,0xf6,0xca,0x19,0xab,0xb,0x56,0xee,0x7e}, -{0x1f,0xb1,0x79,0xea,0xa9,0x28,0x21,0x74,0xe9,0xbd,0xf7,0x35,0x3b,0x36,0x51,0xee,0x1d,0x57,0xac,0x5a,0x75,0x50,0xd3,0x76,0x3a,0x46,0xc2,0xfe,0xa3,0x7d,0x70,0x1}, -{0xf7,0x35,0xc1,0xaf,0x98,0xa4,0xd8,0x42,0x78,0xed,0xec,0x20,0x9e,0x6b,0x67,0x79,0x41,0x83,0x63,0x15,0xea,0x3a,0xdb,0xa8,0xfa,0xc3,0x3b,0x4d,0x32,0x83,0x2c,0x83}, 
-{0xa7,0x40,0x3b,0x1f,0x1c,0x27,0x47,0xf3,0x59,0x40,0xf0,0x34,0xb7,0x2d,0x76,0x9a,0xe7,0x3e,0x4e,0x6c,0xd2,0x21,0x4f,0xfd,0xb8,0xfd,0x8d,0x39,0xdc,0x57,0x59,0xef}, -{0x8d,0x9b,0xc,0x49,0x2b,0x49,0xeb,0xda,0x5b,0xa2,0xd7,0x49,0x68,0xf3,0x70,0xd,0x7d,0x3b,0xae,0xd0,0x7a,0x8d,0x55,0x84,0xf5,0xa5,0xe9,0xf0,0xe4,0xf8,0x8e,0x65}, -{0xa0,0xb8,0xa2,0xf4,0x36,0x10,0x3b,0x53,0xc,0xa8,0x7,0x9e,0x75,0x3e,0xec,0x5a,0x91,0x68,0x94,0x92,0x56,0xe8,0x88,0x4f,0x5b,0xb0,0x5c,0x55,0xf8,0xba,0xbc,0x4c}, -{0xe3,0xbb,0x3b,0x99,0xf3,0x87,0x94,0x7b,0x75,0xda,0xf4,0xd6,0x72,0x6b,0x1c,0x5d,0x64,0xae,0xac,0x28,0xdc,0x34,0xb3,0x6d,0x6c,0x34,0xa5,0x50,0xb8,0x28,0xdb,0x71}, -{0xf8,0x61,0xe2,0xf2,0x10,0x8d,0x51,0x2a,0xe3,0xdb,0x64,0x33,0x59,0xdd,0x75,0xfc,0x1c,0xac,0xbc,0xf1,0x43,0xce,0x3f,0xa2,0x67,0xbb,0xd1,0x3c,0x2,0xe8,0x43,0xb0}, -{0x33,0xa,0x5b,0xca,0x88,0x29,0xa1,0x75,0x7f,0x34,0x19,0x4d,0xb4,0x16,0x53,0x5c,0x92,0x3b,0x94,0xc3,0xe,0x79,0x4d,0x1e,0x79,0x74,0x75,0xd7,0xb6,0xee,0xaf,0x3f}, -{0xea,0xa8,0xd4,0xf7,0xbe,0x1a,0x39,0x21,0x5c,0xf4,0x7e,0x9,0x4c,0x23,0x27,0x51,0x26,0xa3,0x24,0x53,0xba,0x32,0x3c,0xd2,0x44,0xa3,0x17,0x4a,0x6d,0xa6,0xd5,0xad}, -{0xb5,0x1d,0x3e,0xa6,0xaf,0xf2,0xc9,0x8,0x83,0x59,0x3d,0x98,0x91,0x6b,0x3c,0x56,0x4c,0xf8,0x7c,0xa1,0x72,0x86,0x60,0x4d,0x46,0xe2,0x3e,0xcc,0x8,0x6e,0xc7,0xf6}, -{0x2f,0x98,0x33,0xb3,0xb1,0xbc,0x76,0x5e,0x2b,0xd6,0x66,0xa5,0xef,0xc4,0xe6,0x2a,0x6,0xf4,0xb6,0xe8,0xbe,0xc1,0xd4,0x36,0x74,0xee,0x82,0x15,0xbc,0xef,0x21,0x63}, -{0xfd,0xc1,0x4e,0xd,0xf4,0x53,0xc9,0x69,0xa7,0x7d,0x5a,0xc4,0x6,0x58,0x58,0x26,0x7e,0xc1,0x14,0x16,0x6,0xe0,0xfa,0x16,0x7e,0x90,0xaf,0x3d,0x28,0x63,0x9d,0x3f}, -{0xd2,0xc9,0xf2,0xe3,0x0,0x9b,0xd2,0xc,0x5f,0xaa,0xce,0x30,0xb7,0xd4,0xc,0x30,0x74,0x2a,0x51,0x16,0xf2,0xe0,0x32,0x98,0xd,0xeb,0x30,0xd8,0xe3,0xce,0xf8,0x9a}, -{0x4b,0xc5,0x9e,0x7b,0xb5,0xf1,0x79,0x92,0xff,0x51,0xe6,0x6e,0x4,0x86,0x68,0xd3,0x9b,0x23,0x4d,0x57,0xe6,0x96,0x67,0x31,0xcc,0xe6,0xa6,0xf3,0x17,0xa,0x75,0x5}, 
-{0xb1,0x76,0x81,0xd9,0x13,0x32,0x6c,0xce,0x3c,0x17,0x52,0x84,0xf8,0x5,0xa2,0x62,0xf4,0x2b,0xcb,0xb3,0x78,0x47,0x15,0x47,0xff,0x46,0x54,0x82,0x23,0x93,0x6a,0x48}, -{0x38,0xdf,0x58,0x7,0x4e,0x5e,0x65,0x65,0xf2,0xfc,0x7c,0x89,0xfc,0x86,0x50,0x8e,0x31,0x70,0x2e,0x44,0xd0,0xb,0xca,0x86,0xf0,0x40,0x9,0xa2,0x30,0x78,0x47,0x4e}, -{0x65,0xa0,0xee,0x39,0xd1,0xf7,0x38,0x83,0xf7,0x5e,0xe9,0x37,0xe4,0x2c,0x3a,0xbd,0x21,0x97,0xb2,0x26,0x1,0x13,0xf8,0x6f,0xa3,0x44,0xed,0xd1,0xef,0x9f,0xde,0xe7}, -{0x8b,0xa0,0xdf,0x15,0x76,0x25,0x92,0xd9,0x3c,0x85,0xf7,0xf6,0x12,0xdc,0x42,0xbe,0xd8,0xa7,0xec,0x7c,0xab,0x27,0xb0,0x7e,0x53,0x8d,0x7d,0xda,0xaa,0x3e,0xa8,0xde}, -{0xaa,0x25,0xce,0x93,0xbd,0x2,0x69,0xd8,0x5a,0xf6,0x43,0xfd,0x1a,0x73,0x8,0xf9,0xc0,0x5f,0xef,0xda,0x17,0x4a,0x19,0xa5,0x97,0x4d,0x66,0x33,0x4c,0xfd,0x21,0x6a}, -{0x35,0xb4,0x98,0x31,0xdb,0x41,0x15,0x70,0xea,0x1e,0xf,0xbb,0xed,0xcd,0x54,0x9b,0x9a,0xd0,0x63,0xa1,0x51,0x97,0x40,0x72,0xf6,0x75,0x9d,0xbf,0x91,0x47,0x6f,0xe2}}; - - -//static void jhF8(jhState *state); /* the compression function F8 */ - -/*The API functions*/ - -/*The following defines operations on 128-bit word(s)*/ -#define jhCONSTANT(b) _mm_set1_epi8((b)) /*set each byte in a 128-bit register to be "b"*/ - -#define jhXOR(x,y) _mm_xor_si128((x),(y)) /*jhXOR(x,y) = x ^ y, where x and y are two 128-bit word*/ -#define jhAND(x,y) _mm_and_si128((x),(y)) /*jhAND(x,y) = x & y, where x and y are two 128-bit word*/ -#define jhANDNOT(x,y) _mm_andnot_si128((x),(y)) /*jhANDNOT(x,y) = (!x) & y, where x and y are two 128-bit word*/ -#define jhOR(x,y) _mm_or_si128((x),(y)) /*jhOR(x,y) = x | y, where x and y are two 128-bit word*/ - -#define jhSHR1(x) _mm_srli_epi16((x), 1) /*jhSHR1(x) = x >> 1, where x is a 128 bit word*/ -#define jhSHR2(x) _mm_srli_epi16((x), 2) /*jhSHR2(x) = x >> 2, where x is a 128 bit word*/ -#define jhSHR4(x) _mm_srli_epi16((x), 4) /*jhSHR4(x) = x >> 4, where x is a 128 bit word*/ -#define jhSHR8(x) _mm_slli_epi16((x), 8) /*jhSHR8(x) = x >> 8, where x is 
a 128 bit word*/ -#define jhSHR16(x) _mm_slli_epi32((x), 16) /*jhSHR16(x) = x >> 16, where x is a 128 bit word*/ -#define jhSHR32(x) _mm_slli_epi64((x), 32) /*jhSHR32(x) = x >> 32, where x is a 128 bit word*/ -#define jhSHR64(x) _mm_slli_si128((x), 8) /*jhSHR64(x) = x >> 64, where x is a 128 bit word*/ - -#define jhSHL1(x) _mm_slli_epi16((x), 1) /*jhSHL1(x) = x << 1, where x is a 128 bit word*/ -#define jhSHL2(x) _mm_slli_epi16((x), 2) /*jhSHL2(x) = x << 2, where x is a 128 bit word*/ -#define jhSHL4(x) _mm_slli_epi16((x), 4) /*jhSHL4(x) = x << 4, where x is a 128 bit word*/ -#define jhSHL8(x) _mm_srli_epi16((x), 8) /*jhSHL8(x) = x << 8, where x is a 128 bit word*/ -#define jhSHL16(x) _mm_srli_epi32((x), 16) /*jhSHL16(x) = x << 16, where x is a 128 bit word*/ -#define jhSHL32(x) _mm_srli_epi64((x), 32) /*jhSHL32(x) = x << 32, where x is a 128 bit word*/ -#define jhSHL64(x) _mm_srli_si128((x), 8) /*jhSHL64(x) = x << 64, where x is a 128 bit word*/ - -#define jhSWAP1(x) jhOR(jhSHR1(jhAND((x),jhCONSTANT(0xaa))),jhSHL1(jhAND((x),jhCONSTANT(0x55)))) /*swapping bit 2i with bit 2i+1 of the 128-bit x */ -#define jhSWAP2(x) jhOR(jhSHR2(jhAND((x),jhCONSTANT(0xcc))),jhSHL2(jhAND((x),jhCONSTANT(0x33)))) /*swapping bit 4i||4i+1 with bit 4i+2||4i+3 of the 128-bit x */ -#define jhSWAP4(x) jhOR(jhSHR4(jhAND((x),jhCONSTANT(0xf0))),jhSHL4(jhAND((x),jhCONSTANT(0xf)))) /*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 of the 128-bit x */ -#define jhSWAP8(x) jhOR(jhSHR8(x),jhSHL8(x)) /*swapping bits 16i||16i+1||...||16i+7 with bits 16i+8||16i+9||...||16i+15 of the 128-bit x */ -#define jhSWAP16(x) jhOR(jhSHR16(x),jhSHL16(x)) /*swapping bits 32i||32i+1||...||32i+15 with bits 32i+16||32i+17||...||32i+31 of the 128-bit x */ -#define jhSWAP32(x) _mm_shuffle_epi32((x),_MM_SHUFFLE(2,3,0,1)) /*swapping bits 64i||64i+1||...||64i+31 with bits 64i+32||64i+33||...||64i+63 of the 128-bit x*/ -#define jhSWAP64(x) _mm_shuffle_epi32((x),_MM_SHUFFLE(1,0,3,2)) /*swapping bits 
128i||128i+1||...||128i+63 with bits 128i+64||128i+65||...||128i+127 of the 128-bit x*/ -#define jhSTORE(x,p) _mm_store_si128((__m128i *)(p), (x)) /*store the 128-bit word x into memeory address p, where p is the multile of 16 bytes*/ -#define jhLOAD(p) _mm_load_si128((__m128i *)(p)) /*load 16 bytes from the memory address p, return a 128-bit word, where p is the multile of 16 bytes*/ - -/*The MDS code*/ -#define jhL(m0,m1,m2,m3,m4,m5,m6,m7) \ - (m4) = jhXOR((m4),(m1)); \ - (m5) = jhXOR((m5),(m2)); \ - (m6) = jhXOR(jhXOR((m6),(m3)),(m0)); \ - (m7) = jhXOR((m7),(m0)); \ - (m0) = jhXOR((m0),(m5)); \ - (m1) = jhXOR((m1),(m6)); \ - (m2) = jhXOR(jhXOR((m2),(m7)),(m4)); \ - (m3) = jhXOR((m3),(m4)); - -/*Two Sboxes computed in parallel, each Sbox implements S0 and S1, selected by a constant bit*/ -/*The reason to compute two Sboxes in parallel is to try to fully utilize the parallel processing power of SSE2 instructions*/ -#define jhSS(m0,m1,m2,m3,m4,m5,m6,m7,constant0,constant1) \ - m3 = jhXOR(m3,jhCONSTANT(0xff)); \ - m7 = jhXOR(m7,jhCONSTANT(0xff)); \ - m0 = jhXOR(m0,jhANDNOT(m2,constant0)); \ - m4 = jhXOR(m4,jhANDNOT(m6,constant1)); \ - a0 = jhXOR(constant0,jhAND(m0,m1)); \ - a1 = jhXOR(constant1,jhAND(m4,m5)); \ - m0 = jhXOR(m0,jhAND(m3,m2)); \ - m4 = jhXOR(m4,jhAND(m7,m6)); \ - m3 = jhXOR(m3,jhANDNOT(m1,m2)); \ - m7 = jhXOR(m7,jhANDNOT(m5,m6)); \ - m1 = jhXOR(m1,jhAND(m0,m2)); \ - m5 = jhXOR(m5,jhAND(m4,m6)); \ - m2 = jhXOR(m2,jhANDNOT(m3,m0)); \ - m6 = jhXOR(m6,jhANDNOT(m7,m4)); \ - m0 = jhXOR(m0,jhOR(m1,m3)); \ - m4 = jhXOR(m4,jhOR(m5,m7)); \ - m3 = jhXOR(m3,jhAND(m1,m2)); \ - m7 = jhXOR(m7,jhAND(m5,m6)); \ - m2 = jhXOR(m2,a0); \ - m6 = jhXOR(m6,a1); \ - m1 = jhXOR(m1,jhAND(a0,m0)); \ - m5 = jhXOR(m5,jhAND(a1,m4)); - -/* The linear transform of the (7*i+0)th round*/ -#define jhlineartransform_R00(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - jhL(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bit 2i with bit 2i+1 for m4,m5,m6 and m7 */ \ - m4 = jhSWAP1(m4); m5 = 
jhSWAP1(m5); m6 = jhSWAP1(m6); m7 = jhSWAP1(m7); - -/* The linear transform of the (7*i+1)th round*/ -#define jhlineartransform_R01(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - jhL(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bit 4i||4i+1 with bit 4i+2||4i+3 for m4,m5,m6 and m7 */ \ - m4 = jhSWAP2(m4); m5 = jhSWAP2(m5); m6 = jhSWAP2(m6); m7 = jhSWAP2(m7); - -/* The linear transform of the (7*i+2)th round*/ -#define jhlineartransform_R02(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - jhL(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 for m4,m5,m6 and m7*/ \ - m4 = jhSWAP4(m4); m5 = jhSWAP4(m5); m6 = jhSWAP4(m6); m7 = jhSWAP4(m7); - -/* The linear transform of the (7*i+3)th round*/ -#define jhlineartransform_R03(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - jhL(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bits 16i||16i+1||...||16i+7 with bits 16i+8||16i+9||...||16i+15 for m4,m5,m6 and m7*/ \ - m4 = jhSWAP8(m4); m5 = jhSWAP8(m5); m6 = jhSWAP8(m6); m7 = jhSWAP8(m7); - -/* The linear transform of the (7*i+4)th round*/ -#define jhlineartransform_R04(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - jhL(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bits 32i||32i+1||...||32i+15 with bits 32i+16||32i+17||...||32i+31 for m0,m1,m2 and m3*/ \ - m4 = jhSWAP16(m4); m5 = jhSWAP16(m5); m6 = jhSWAP16(m6); m7 = jhSWAP16(m7); - -/* The linear transform of the (7*i+5)th round -- faster*/ -#define jhlineartransform_R05(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - jhL(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bits 64i||64i+1||...||64i+31 with bits 64i+32||64i+33||...||64i+63 for m0,m1,m2 and m3*/ \ - m4 = jhSWAP32(m4); m5 = jhSWAP32(m5); m6 = jhSWAP32(m6); m7 = jhSWAP32(m7); - -/* The linear transform of the (7*i+6)th round -- faster*/ -#define jhlineartransform_R06(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - jhL(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bits 128i||128i+1||...||128i+63 with bits 128i+64||128i+65||...||128i+127 for m0,m1,m2 and 
m3*/ \ - m4 = jhSWAP64(m4); m5 = jhSWAP64(m5); m6 = jhSWAP64(m6); m7 = jhSWAP64(m7); - -/*the round function of E8 */ -#define jhround_function(nn,r) \ - jhSS(y0,y2,y4,y6,y1,y3,y5,y7, jhLOAD(jhE8_bitslice_roundconstant[r]), jhLOAD(jhE8_bitslice_roundconstant[r]+16) ); \ - jhlineartransform_R##nn(y0,y2,y4,y6,y1,y3,y5,y7); - -/*the round function of E8 */ -#define jhround_functionI(nn,r) \ - jhSS(jhSx0,jhSx2,jhSx4,jhSx6,jhSx1,jhSx3,jhSx5,jhSx7, jhLOAD(jhE8_bitslice_roundconstant[r]), jhLOAD(jhE8_bitslice_roundconstant[r]+16) ); \ - jhlineartransform_R##nn(jhSx0,jhSx2,jhSx4,jhSx6,jhSx1,jhSx3,jhSx5,jhSx7); - -/* -//the compression function F8 -static void jhF8(jhState *state) -{ - return; - uint64_t i; - word128 y0,y1,y2,y3,y4,y5,y6,y7; - word128 a0,a1; - - y0 = state->x0, - y0 = jhXOR(y0, jhLOAD(state->buffer)); - y1 = state->x1, - y1 = jhXOR(y1, jhLOAD(state->buffer+16)); - y2 = state->x2, - y2 = jhXOR(y2, jhLOAD(state->buffer+32)); - y3 = state->x3, - y3 = jhXOR(y3, jhLOAD(state->buffer+48)); - y4 = state->x4; - y5 = state->x5; - y6 = state->x6; - y7 = state->x7; - - //xor the 512-bit message with the fist half of the 1024-bit hash state - - //perform 42 rounds - for (i = 0; i < 42; i = i+7) { - jhround_function(00,i); - jhround_function(01,i+1); - jhround_function(02,i+2); - jhround_function(03,i+3); - jhround_function(04,i+4); - jhround_function(05,i+5); - jhround_function(06,i+6); - } - - //xor the 512-bit message with the second half of the 1024-bit hash state - - state->x0 = y0; - state->x1 = y1; - state->x2 = y2; - state->x3 = y3; - y4 = jhXOR(y4, jhLOAD(state->buffer)), - state->x4 = y4; - y5 = jhXOR(y5, jhLOAD(state->buffer+16)), - state->x5 = y5; - y6 = jhXOR(y6, jhLOAD(state->buffer+32)), - state->x6 = y6; - y7 = jhXOR(y7, jhLOAD(state->buffer+48)), - state->x7 = y7; -} -*/ - -#define jhF8I \ -do { \ - uint64_t i; \ - word128 a0,a1; \ - jhSx0 = jhXOR(jhSx0, jhLOAD(jhSbuffer)); \ - jhSx1 = jhXOR(jhSx1, jhLOAD(jhSbuffer+16)); \ - jhSx2 = jhXOR(jhSx2, 
jhLOAD(jhSbuffer+32)); \ - jhSx3 = jhXOR(jhSx3, jhLOAD(jhSbuffer+48)); \ - for (i = 0; i < 42; i = i+7) { \ - jhround_functionI(00,i); \ - jhround_functionI(01,i+1); \ - jhround_functionI(02,i+2); \ - jhround_functionI(03,i+3); \ - jhround_functionI(04,i+4); \ - jhround_functionI(05,i+5); \ - jhround_functionI(06,i+6); \ - } \ - jhSx4 = jhXOR(jhSx4, jhLOAD(jhSbuffer)); \ - jhSx5 = jhXOR(jhSx5, jhLOAD(jhSbuffer+16)); \ - jhSx6 = jhXOR(jhSx6, jhLOAD(jhSbuffer+32)); \ - jhSx7 = jhXOR(jhSx7, jhLOAD(jhSbuffer+48)); \ -} while (0) - -/* the whole thing - * load from hash - * hash = JH512(loaded) - */ -#define JH_H \ -do { \ - jhSx0 = jhLOAD(JH512_H0); \ - jhSx1 = jhLOAD(JH512_H0+16); \ - jhSx2 = jhLOAD(JH512_H0+32); \ - jhSx3 = jhLOAD(JH512_H0+48); \ - jhSx4 = jhLOAD(JH512_H0+64); \ - jhSx5 = jhLOAD(JH512_H0+80); \ - jhSx6 = jhLOAD(JH512_H0+96); \ - jhSx7 = jhLOAD(JH512_H0+112); \ - /* for break loop */ \ - /* one inlined copy of JHF8i */ \ - int b = false; \ - memcpy(jhSbuffer, hash, 64); \ - for(;;) { \ - jhF8I; \ - if (b) break; \ - memset(jhSbuffer,0,48); \ - jhSbuffer[0] = 0x80; \ - jhSbuffer[48] = 0x00, \ - jhSbuffer[49] = 0x00, \ - jhSbuffer[50] = 0x00, \ - jhSbuffer[51] = 0x00, \ - jhSbuffer[52] = 0x00, \ - jhSbuffer[53] = 0x00, \ - jhSbuffer[54] = 0x00, \ - jhSbuffer[55] = 0x00; \ - jhSbuffer[56] = ((char)((uint64_t)(64*8) >> 56)) & 0xff, \ - jhSbuffer[57] = ((char)((uint64_t)(64*8) >> 48)) & 0xff, \ - jhSbuffer[58] = ((char)((uint64_t)(64*8) >> 40)) & 0xff, \ - jhSbuffer[59] = ((char)((uint64_t)(64*8) >> 32)) & 0xff, \ - jhSbuffer[60] = ((char)((uint64_t)(64*8) >> 24)) & 0xff, \ - jhSbuffer[61] = ((char)((uint64_t)(64*8) >> 16)) & 0xff, \ - jhSbuffer[62] = ((char)((uint64_t)(64*8) >> 8)) & 0xff, \ - jhSbuffer[63] = (64*8) & 0xff; \ - b = true; \ - } \ -jhSTORE(jhSx4,(char *)(hash)); \ -jhSTORE(jhSx5,(char *)(hash)+16); \ -jhSTORE(jhSx6,(char *)(hash)+32); \ -jhSTORE(jhSx7,(char *)(hash)+48); \ -} while (0) - diff --git a/algo/jh/sse2/sph_jh.h 
b/algo/jh/sse2/sph_jh.h deleted file mode 100644 index 473d7e2..0000000 --- a/algo/jh/sse2/sph_jh.h +++ /dev/null @@ -1,127 +0,0 @@ -/* $Id: sph_jh.h 216 2010-06-08 09:46:57Z tp $ */ -/** - * JH interface. JH is a family of functions which differ by - * their output size; this implementation defines JH for output - * sizes 224, 256, 384 and 512 bits. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_jh.h - * @author Thomas Pornin - */ - -#ifndef SPH_JH_H__ -#define SPH_JH_H__ - -#ifdef __cplusplus -extern "C"{ -#endif - -#include -#include "sph_types.h" - -#define QSTATIC static - -/** - * Output size (in bits) for JH-512. 
- */ -#define SPH_SIZE_jh512 512 - -/** - * This structure is a context for JH computations: it contains the - * intermediate values and some data from the last entered block. Once - * a JH computation has been performed, the context can be reused for - * another computation. - * - * The contents of this structure are private. A running JH computation - * can be cloned by copying the context (e.g. with a simple - * memcpy()). - */ -typedef struct { -#ifndef DOXYGEN_IGNORE - size_t ptr; - union { - sph_u64 wide[16]; - sph_u32 narrow[32]; - } H; - sph_u64 block_count; -} sph_jh_context; - -/** - * Type for a JH-512 context (identical to the common context). - */ -typedef sph_jh_context sph_jh512_context; - -/** - * Initialize a JH-512 context. This process performs no memory allocation. - * - * @param cc the JH-512 context (pointer to a - * sph_jh512_context) - */ -QSTATIC void sph_jh512_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). - * - * @param cc the JH-512 context - * @param data the input data - * @param len the input data length (in bytes) - */ -QSTATIC void sph_jh512(void *cc, const void *data, size_t len); - -/** - * Terminate the current JH-512 computation and output the result into - * the provided buffer. The destination buffer must be wide enough to - * accomodate the result (64 bytes). The context is automatically - * reinitialized. - * - * @param cc the JH-512 context - * @param dst the destination buffer - */ -QSTATIC void sph_jh512_close(void *cc, void *dst); - -/** - * Add a few additional bits (0 to 7) to the current computation, then - * terminate it and output the result in the provided buffer, which must - * be wide enough to accomodate the result (64 bytes). If bit number i - * in ub has value 2^i, then the extra bits are those - * numbered 7 downto 8-n (this is the big-endian convention at the byte - * level). The context is automatically reinitialized. 
- * - * @param cc the JH-512 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the destination buffer - */ -QSTATIC void sph_jh512_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/algo/keccak/keccak-4way.c b/algo/keccak/keccak-4way.c index 5d2c87d..514fc75 100644 --- a/algo/keccak/keccak-4way.c +++ b/algo/keccak/keccak-4way.c @@ -1,18 +1,70 @@ #include "keccak-gate.h" - -#ifdef KECCAK_4WAY - #include #include #include #include "sph_keccak.h" #include "keccak-hash-4way.h" +#if defined(KECCAK_8WAY) + +void keccakhash_8way(void *state, const void *input) +{ + keccak256_8way_context ctx; + keccak256_8way_init( &ctx ); + keccak256_8way_update( &ctx, input, 80 ); + keccak256_8way_close( &ctx, state ); +} + +int scanhash_keccak_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t vdata[24*8] __attribute__ ((aligned (128))); + uint32_t hash[16*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[49]); // 3*16+1 + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + const uint32_t Htarg = ptarget[7]; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + *noncev = mm512_intrlv_blend_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev ); + do { + keccakhash_8way( hash, vdata ); + + for ( int lane = 0; lane < 8; lane++ ) + if unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) + { + extr_lane_8x64( lane_hash, hash, lane, 256 ); + if ( valid_hash( lane_hash, ptarget ) ) + { + pdata[19] = bswap_32( n + lane ); + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = _mm512_add_epi32( *noncev, 
+ m512_const1_64( 0x0000000800000000 ) ); + n += 8; + + } while ( (n < max_nonce-8) && !work_restart[thr_id].restart); + pdata[19] = n; + *hashes_done = n - first_nonce + 1; + return 0; +} + +#elif defined(KECCAK_4WAY) + void keccakhash_4way(void *state, const void *input) { keccak256_4way_context ctx; keccak256_4way_init( &ctx ); - keccak256_4way( &ctx, input, 80 ); + keccak256_4way_update( &ctx, input, 80 ); keccak256_4way_close( &ctx, state ); } @@ -28,30 +80,31 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce, uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; __m256i *noncev = (__m256i*)vdata + 9; // aligned -// const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated + const uint32_t Htarg = ptarget[7]; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; mm256_bswap32_intrlv80_4x64( vdata, pdata ); + *noncev = mm256_intrlv_blend_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - keccakhash_4way( hash, vdata ); for ( int lane = 0; lane < 4; lane++ ) - if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) ) + if unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) { extr_lane_4x64( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) ) + if ( valid_hash( lane_hash, ptarget )) { - pdata[19] = n + lane; - submit_lane_solution( work, lane_hash, mythr, lane ); + pdata[19] = bswap_32( n + lane ); + submit_solution( work, lane_hash, mythr ); } } + *noncev = _mm256_add_epi32( *noncev, + m256_const1_64( 0x0000000400000000 ) ); n += 4; - } while ( (n < max_nonce-4) && !work_restart[thr_id].restart); - + pdata[19] = n; *hashes_done = n - first_nonce + 1; return 0; } diff --git a/algo/keccak/keccak-gate.c b/algo/keccak/keccak-gate.c index 215b0e9..96be623 100644 --- a/algo/keccak/keccak-gate.c +++ b/algo/keccak/keccak-gate.c @@ -1,46 +1,85 @@ #include 
"keccak-gate.h" +#include "sph_keccak.h" -void keccak_set_target( struct work* work, double job_diff ) -{ - work_set_target( work, job_diff / (128.0 * opt_diff_factor) ); -} +int hard_coded_eb = 1; + +// KECCAK -int64_t keccak_get_max64() { return 0x7ffffLL; } - -bool register_keccak_algo( algo_gate_t* gate ) -{ - gate->optimizations = AVX2_OPT; - gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; - gate->set_target = (void*)&keccak_set_target; - gate->get_max64 = (void*)&keccak_get_max64; -#if defined (KECCAK_4WAY) - gate->scanhash = (void*)&scanhash_keccak_4way; - gate->hash = (void*)&keccakhash_4way; +bool register_keccak_algo(algo_gate_t *gate) { + gate->optimizations = AVX2_OPT | AVX512_OPT; + gate->gen_merkle_root = (void *)&SHA256_gen_merkle_root; + opt_target_factor = 128.0; +#if defined(KECCAK_8WAY) + gate->scanhash = (void *)&scanhash_keccak_8way; + gate->hash = (void *)&keccakhash_8way; +#elif defined(KECCAK_4WAY) + gate->scanhash = (void *)&scanhash_keccak_4way; + gate->hash = (void *)&keccakhash_4way; #else - gate->scanhash = (void*)&scanhash_keccak; - gate->hash = (void*)&keccakhash; + gate->scanhash = (void *)&scanhash_keccak; + gate->hash = (void *)&keccakhash; #endif return true; }; -void keccakc_set_target( struct work* work, double job_diff ) -{ - work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); -} +// KECCAKC -bool register_keccakc_algo( algo_gate_t* gate ) -{ - gate->optimizations = AVX2_OPT; - gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root; - gate->set_target = (void*)&keccakc_set_target; - gate->get_max64 = (void*)&keccak_get_max64; -#if defined (KECCAK_4WAY) - gate->scanhash = (void*)&scanhash_keccak_4way; - gate->hash = (void*)&keccakhash_4way; +bool register_keccakc_algo(algo_gate_t *gate) { + gate->optimizations = AVX2_OPT | AVX512_OPT; + gate->gen_merkle_root = (void *)&sha256d_gen_merkle_root; + opt_target_factor = 256.0; +#if defined(KECCAK_8WAY) + gate->scanhash = (void *)&scanhash_keccak_8way; + gate->hash 
= (void *)&keccakhash_8way; +#elif defined(KECCAK_4WAY) + gate->scanhash = (void *)&scanhash_keccak_4way; + gate->hash = (void *)&keccakhash_4way; #else - gate->scanhash = (void*)&scanhash_keccak; - gate->hash = (void*)&keccakhash; + gate->scanhash = (void *)&scanhash_keccak; + gate->hash = (void *)&keccakhash; #endif return true; }; +// SHA3D + +void sha3d(void *state, const void *input, int len) { + uint32_t _ALIGN(64) buffer[16], hash[16]; + sph_keccak_context ctx_keccak; + + sph_keccak256_init(&ctx_keccak); + sph_keccak256(&ctx_keccak, input, len); + sph_keccak256_close(&ctx_keccak, (void *)buffer); + + sph_keccak256_init(&ctx_keccak); + sph_keccak256(&ctx_keccak, buffer, 32); + sph_keccak256_close(&ctx_keccak, (void *)hash); + + memcpy(state, hash, 32); +} + +void sha3d_gen_merkle_root(char *merkle_root, struct stratum_ctx *sctx) { + sha3d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); + for (int i = 0; i < sctx->job.merkle_count; i++) { + memcpy(merkle_root + 32, sctx->job.merkle[i], 32); + sha256d(merkle_root, merkle_root, 64); + } +} + +bool register_sha3d_algo(algo_gate_t *gate) { + hard_coded_eb = 6; + // opt_extranonce = false; + gate->optimizations = AVX2_OPT | AVX512_OPT; + gate->gen_merkle_root = (void *)&sha3d_gen_merkle_root; +#if defined(KECCAK_8WAY) + gate->scanhash = (void *)&scanhash_sha3d_8way; + gate->hash = (void *)&sha3d_hash_8way; +#elif defined(KECCAK_4WAY) + gate->scanhash = (void *)&scanhash_sha3d_4way; + gate->hash = (void *)&sha3d_hash_4way; +#else + gate->scanhash = (void *)&scanhash_sha3d; + gate->hash = (void *)&sha3d_hash; +#endif + return true; +}; diff --git a/algo/keccak/keccak-gate.h b/algo/keccak/keccak-gate.h index e9fc5e7..cee3d00 100644 --- a/algo/keccak/keccak-gate.h +++ b/algo/keccak/keccak-gate.h @@ -1,23 +1,46 @@ #ifndef KECCAK_GATE_H__ -#define KECCAK_GATE_H__ +#define KECCAK_GATE_H__ 1 #include "algo-gate-api.h" #include -#if defined(__AVX2__) - #define KECCAK_4WAY +#if defined(__AVX512F__) && 
defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define KECCAK_8WAY 1 +#elif defined(__AVX2__) + #define KECCAK_4WAY 1 #endif -#if defined(KECCAK_4WAY) +extern int hard_coded_eb; + +#if defined(KECCAK_8WAY) + +void keccakhash_8way( void *state, const void *input ); +int scanhash_keccak_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +void sha3d_hash_8way( void *state, const void *input ); +int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#elif defined(KECCAK_4WAY) void keccakhash_4way( void *state, const void *input ); int scanhash_keccak_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -#endif +void sha3d_hash_4way( void *state, const void *input ); +int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#else void keccakhash( void *state, const void *input ); int scanhash_keccak( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); +void sha3d_hash( void *state, const void *input ); +int scanhash_sha3d( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#endif #endif diff --git a/algo/keccak/keccak-hash-4way.c b/algo/keccak/keccak-hash-4way.c index b8646d0..cc88332 100644 --- a/algo/keccak/keccak-hash-4way.c +++ b/algo/keccak/keccak-hash-4way.c @@ -1,23 +1,27 @@ #include +#include #include "keccak-hash-4way.h" +#include "keccak-gate.h" #if defined(__AVX2__) -static const sph_u64 RC[] = { - SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), - SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), - SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001), - SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009), - SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088), - SPH_C64(0x0000000080008009), 
SPH_C64(0x000000008000000A), - SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B), - SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003), - SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080), - SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A), - SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), - SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) +static const uint64_t RC[] = { + 0x0000000000000001, 0x0000000000008082, + 0x800000000000808A, 0x8000000080008000, + 0x000000000000808B, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, + 0x000000000000008A, 0x0000000000000088, + 0x0000000080008009, 0x000000008000000A, + 0x000000008000808B, 0x800000000000008B, + 0x8000000000008089, 0x8000000000008003, + 0x8000000000008002, 0x8000000000000080, + 0x000000000000800A, 0x800000008000000A, + 0x8000000080008081, 0x8000000000008080, + 0x0000000080000001, 0x8000000080008008 }; +// generic macros + #define a00 (kc->w[ 0]) #define a10 (kc->w[ 1]) #define a20 (kc->w[ 2]) @@ -48,6 +52,197 @@ static const sph_u64 RC[] = { #define READ_STATE(sc) #define WRITE_STATE(sc) +#define MOV64(d, s) (d = s) +#define XOR64_IOTA XOR64 + +#define LPAR ( +#define RPAR ) + +#define DO(x) x + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +#define INPUT_BUF(size) do { \ + size_t j; \ + for (j = 0; j < (size>>3); j++ ) \ + kc->w[j ] = _mm512_xor_si512( kc->w[j], buf[j] ); \ +} while (0) + +// Targetted macros, keccak-macros.h is included for each target. 
+ +#define DECL64(x) __m512i x +#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b)) +#define AND64(d, a, b) (d = _mm512_and_si512(a,b)) +#define OR64(d, a, b) (d = _mm512_or_si512(a,b)) +#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1)) +#define ROL64(d, v, n) (d = mm512_rol_64(v, n)) + +#include "keccak-macros.c" + +#define KECCAK_F_1600 DO(KECCAK_F_1600_512) + +#define KECCAK_F_1600_512 do { \ + int j; \ + for (j = 0; j < 24; j += 8) \ + { \ + KF_ELT( 0, 1, _mm512_set1_epi64( RC[j + 0] ) ); \ + KF_ELT( 1, 2, _mm512_set1_epi64( RC[j + 1] ) ); \ + KF_ELT( 2, 3, _mm512_set1_epi64( RC[j + 2] ) ); \ + KF_ELT( 3, 4, _mm512_set1_epi64( RC[j + 3] ) ); \ + KF_ELT( 4, 5, _mm512_set1_epi64( RC[j + 4] ) ); \ + KF_ELT( 5, 6, _mm512_set1_epi64( RC[j + 5] ) ); \ + KF_ELT( 6, 7, _mm512_set1_epi64( RC[j + 6] ) ); \ + KF_ELT( 7, 8, _mm512_set1_epi64( RC[j + 7] ) ); \ + P8_TO_P0; \ + } \ +} while (0) + +static void keccak64_8way_init( keccak64_ctx_m512i *kc, unsigned out_size ) +{ + __m512i zero = m512_zero; + __m512i neg1 = m512_neg1; + + // Initialization for the "lane complement". 
+ kc->w[ 0] = zero; kc->w[ 1] = neg1; + kc->w[ 2] = neg1; kc->w[ 3] = zero; + kc->w[ 4] = zero; kc->w[ 5] = zero; + kc->w[ 6] = zero; kc->w[ 7] = zero; + kc->w[ 8] = neg1; kc->w[ 9] = zero; + kc->w[10] = zero; kc->w[11] = zero; + kc->w[12] = neg1; kc->w[13] = zero; + kc->w[14] = zero; kc->w[15] = zero; + kc->w[16] = zero; kc->w[17] = neg1; + kc->w[18] = zero; kc->w[19] = zero; + kc->w[20] = neg1; kc->w[21] = zero; + kc->w[22] = zero; kc->w[23] = zero; + kc->w[24] = zero; kc->ptr = 0; + kc->lim = 200 - (out_size >> 2); +} + +static void +keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len, + size_t lim ) +{ + __m512i *buf; + __m512i *vdata = (__m512i*)data; + size_t ptr; + DECL_STATE + + buf = kc->buf; + ptr = kc->ptr; + + if ( len < (lim - ptr) ) + { + memcpy_512( buf + (ptr>>3), vdata, len>>3 ); + kc->ptr = ptr + len; + return; + } + READ_STATE( kc ); + while ( len > 0 ) + { + size_t clen; + + clen = (lim - ptr); + if ( clen > len ) + clen = len; + memcpy_512( buf + (ptr>>3), vdata, clen>>3 ); + ptr += clen; + vdata = vdata + (clen>>3); + len -= clen; + if ( ptr == lim ) + { + INPUT_BUF( lim ); + KECCAK_F_1600; + ptr = 0; + } + } + WRITE_STATE( kc ); + kc->ptr = ptr; +} + +static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst, + size_t byte_len, size_t lim ) +{ + unsigned eb; + union { + __m512i tmp[lim + 1]; + uint64_t dummy; /* for alignment */ + } u; + size_t j; + size_t m512_len = byte_len >> 3; + + eb = hard_coded_eb; + if ( kc->ptr == (lim - 8) ) + { + const uint64_t t = eb | 0x8000000000000000; + u.tmp[0] = m512_const1_64( t ); + j = 8; + } + else + { + j = lim - kc->ptr; + u.tmp[0] = m512_const1_64( eb ); + memset_zero_512( u.tmp + 1, (j>>3) - 2 ); + u.tmp[ (j>>3) - 1] = m512_const1_64( 0x8000000000000000 ); + } + keccak64_8way_core( kc, u.tmp, j, lim ); + /* Finalize the "lane complement" */ + NOT64( kc->w[ 1], kc->w[ 1] ); + NOT64( kc->w[ 2], kc->w[ 2] ); + NOT64( kc->w[ 8], kc->w[ 8] ); + NOT64( kc->w[12], kc->w[12] ); 
+ NOT64( kc->w[17], kc->w[17] ); + NOT64( kc->w[20], kc->w[20] ); + memcpy_512( dst, kc->w, m512_len ); +} + +void keccak256_8way_init( void *kc ) +{ + keccak64_8way_init( kc, 256 ); +} + +void +keccak256_8way_update(void *cc, const void *data, size_t len) +{ + keccak64_8way_core(cc, data, len, 136); +} + +void +keccak256_8way_close(void *cc, void *dst) +{ + keccak64_8way_close(cc, dst, 32, 136); +} + +void keccak512_8way_init( void *kc ) +{ + keccak64_8way_init( kc, 512 ); +} + +void +keccak512_8way_update(void *cc, const void *data, size_t len) +{ + keccak64_8way_core(cc, data, len, 72); +} + +void +keccak512_8way_close(void *cc, void *dst) +{ + keccak64_8way_close(cc, dst, 64, 72); +} + +#undef INPUT_BUF +#undef DECL64 +#undef XOR64 +#undef AND64 +#undef OR64 +#undef NOT64 +#undef ROL64 +#undef KECCAK_F_1600 + +#endif // AVX512 + +// AVX2 + #define INPUT_BUF(size) do { \ size_t j; \ for (j = 0; j < (size>>3); j++ ) \ @@ -55,314 +250,28 @@ static const sph_u64 RC[] = { } while (0) #define DECL64(x) __m256i x -#define MOV64(d, s) (d = s) #define XOR64(d, a, b) (d = _mm256_xor_si256(a,b)) #define AND64(d, a, b) (d = _mm256_and_si256(a,b)) #define OR64(d, a, b) (d = _mm256_or_si256(a,b)) #define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1)) #define ROL64(d, v, n) (d = mm256_rol_64(v, n)) -#define XOR64_IOTA XOR64 -#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \ - DECL64(tt0); \ - DECL64(tt1); \ - DECL64(tt2); \ - DECL64(tt3); \ - XOR64(tt0, d0, d1); \ - XOR64(tt1, d2, d3); \ - XOR64(tt0, tt0, d4); \ - XOR64(tt0, tt0, tt1); \ - ROL64(tt0, tt0, 1); \ - XOR64(tt2, c0, c1); \ - XOR64(tt3, c2, c3); \ - XOR64(tt0, tt0, c4); \ - XOR64(tt2, tt2, tt3); \ - XOR64(t, tt0, tt2); \ - } while (0) - -#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ - b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ - b40, b41, b42, b43, b44) \ - do { \ - DECL64(t0); \ - DECL64(t1); \ - DECL64(t2); \ - DECL64(t3); \ - DECL64(t4); \ - TH_ELT(t0, b40, b41, b42, 
b43, b44, b10, b11, b12, b13, b14); \ - TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \ - TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \ - TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \ - TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \ - XOR64(b00, b00, t0); \ - XOR64(b01, b01, t0); \ - XOR64(b02, b02, t0); \ - XOR64(b03, b03, t0); \ - XOR64(b04, b04, t0); \ - XOR64(b10, b10, t1); \ - XOR64(b11, b11, t1); \ - XOR64(b12, b12, t1); \ - XOR64(b13, b13, t1); \ - XOR64(b14, b14, t1); \ - XOR64(b20, b20, t2); \ - XOR64(b21, b21, t2); \ - XOR64(b22, b22, t2); \ - XOR64(b23, b23, t2); \ - XOR64(b24, b24, t2); \ - XOR64(b30, b30, t3); \ - XOR64(b31, b31, t3); \ - XOR64(b32, b32, t3); \ - XOR64(b33, b33, t3); \ - XOR64(b34, b34, t3); \ - XOR64(b40, b40, t4); \ - XOR64(b41, b41, t4); \ - XOR64(b42, b42, t4); \ - XOR64(b43, b43, t4); \ - XOR64(b44, b44, t4); \ - } while (0) - -#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ - b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ - b40, b41, b42, b43, b44) \ - do { \ - /* ROL64(b00, b00, 0); */ \ - ROL64(b01, b01, 36); \ - ROL64(b02, b02, 3); \ - ROL64(b03, b03, 41); \ - ROL64(b04, b04, 18); \ - ROL64(b10, b10, 1); \ - ROL64(b11, b11, 44); \ - ROL64(b12, b12, 10); \ - ROL64(b13, b13, 45); \ - ROL64(b14, b14, 2); \ - ROL64(b20, b20, 62); \ - ROL64(b21, b21, 6); \ - ROL64(b22, b22, 43); \ - ROL64(b23, b23, 15); \ - ROL64(b24, b24, 61); \ - ROL64(b30, b30, 28); \ - ROL64(b31, b31, 55); \ - ROL64(b32, b32, 25); \ - ROL64(b33, b33, 21); \ - ROL64(b34, b34, 56); \ - ROL64(b40, b40, 27); \ - ROL64(b41, b41, 20); \ - ROL64(b42, b42, 39); \ - ROL64(b43, b43, 8); \ - ROL64(b44, b44, 14); \ - } while (0) - -/* - * The KHI macro integrates the "lane complement" optimization. 
On input, - * some words are complemented: - * a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43 - * On output, the following words are complemented: - * a04 a10 a20 a22 a23 a31 - * - * The (implicit) permutation and the theta expansion will bring back - * the input mask for the next round. - */ - -#define KHI_XO(d, a, b, c) do { \ - DECL64(kt); \ - OR64(kt, b, c); \ - XOR64(d, a, kt); \ - } while (0) - -#define KHI_XA(d, a, b, c) do { \ - DECL64(kt); \ - AND64(kt, b, c); \ - XOR64(d, a, kt); \ - } while (0) - -#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ - b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ - b40, b41, b42, b43, b44) \ - do { \ - DECL64(c0); \ - DECL64(c1); \ - DECL64(c2); \ - DECL64(c3); \ - DECL64(c4); \ - DECL64(bnn); \ - NOT64(bnn, b20); \ - KHI_XO(c0, b00, b10, b20); \ - KHI_XO(c1, b10, bnn, b30); \ - KHI_XA(c2, b20, b30, b40); \ - KHI_XO(c3, b30, b40, b00); \ - KHI_XA(c4, b40, b00, b10); \ - MOV64(b00, c0); \ - MOV64(b10, c1); \ - MOV64(b20, c2); \ - MOV64(b30, c3); \ - MOV64(b40, c4); \ - NOT64(bnn, b41); \ - KHI_XO(c0, b01, b11, b21); \ - KHI_XA(c1, b11, b21, b31); \ - KHI_XO(c2, b21, b31, bnn); \ - KHI_XO(c3, b31, b41, b01); \ - KHI_XA(c4, b41, b01, b11); \ - MOV64(b01, c0); \ - MOV64(b11, c1); \ - MOV64(b21, c2); \ - MOV64(b31, c3); \ - MOV64(b41, c4); \ - NOT64(bnn, b32); \ - KHI_XO(c0, b02, b12, b22); \ - KHI_XA(c1, b12, b22, b32); \ - KHI_XA(c2, b22, bnn, b42); \ - KHI_XO(c3, bnn, b42, b02); \ - KHI_XA(c4, b42, b02, b12); \ - MOV64(b02, c0); \ - MOV64(b12, c1); \ - MOV64(b22, c2); \ - MOV64(b32, c3); \ - MOV64(b42, c4); \ - NOT64(bnn, b33); \ - KHI_XA(c0, b03, b13, b23); \ - KHI_XO(c1, b13, b23, b33); \ - KHI_XO(c2, b23, bnn, b43); \ - KHI_XA(c3, bnn, b43, b03); \ - KHI_XO(c4, b43, b03, b13); \ - MOV64(b03, c0); \ - MOV64(b13, c1); \ - MOV64(b23, c2); \ - MOV64(b33, c3); \ - MOV64(b43, c4); \ - NOT64(bnn, b14); \ - KHI_XA(c0, b04, bnn, b24); \ - KHI_XO(c1, bnn, b24, b34); \ - KHI_XA(c2, b24, b34, b44); \ - KHI_XO(c3, 
b34, b44, b04); \ - KHI_XA(c4, b44, b04, b14); \ - MOV64(b04, c0); \ - MOV64(b14, c1); \ - MOV64(b24, c2); \ - MOV64(b34, c3); \ - MOV64(b44, c4); \ - } while (0) - -#define IOTA(r) XOR64_IOTA(a00, a00, r) - -#define P0 a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \ - a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44 -#define P1 a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \ - a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14 -#define P2 a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \ - a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31 -#define P3 a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \ - a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13 -#define P4 a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \ - a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01 -#define P5 a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \ - a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30 -#define P6 a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \ - a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33 -#define P7 a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \ - a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23 -#define P8 a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \ - a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12 -#define P9 a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \ - a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21 -#define P10 a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \ - a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02 -#define P11 a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \ - a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10 -#define P12 a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \ - a33, a32, a31, a20, a24, a23, 
a22, a21, a10, a14, a13, a12, a11 -#define P13 a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \ - a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41 -#define P14 a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \ - a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24 -#define P15 a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \ - a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42 -#define P16 a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \ - a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04 -#define P17 a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \ - a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20 -#define P18 a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \ - a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22 -#define P19 a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \ - a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32 -#define P20 a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \ - a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43 -#define P21 a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \ - a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34 -#define P22 a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \ - a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03 -#define P23 a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \ - a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40 - -#define P8_TO_P0 do { \ - DECL64(t); \ - MOV64(t, a01); \ - MOV64(a01, a11); \ - MOV64(a11, a43); \ - MOV64(a43, t); \ - MOV64(t, a02); \ - MOV64(a02, a22); \ - MOV64(a22, a31); \ - MOV64(a31, t); \ - MOV64(t, a03); \ - MOV64(a03, a33); \ - MOV64(a33, a24); \ - MOV64(a24, t); \ - MOV64(t, a04); \ - MOV64(a04, a44); \ - MOV64(a44, a12); \ - MOV64(a12, t); \ - MOV64(t, a10); \ - MOV64(a10, a32); \ - MOV64(a32, 
a13); \ - MOV64(a13, t); \ - MOV64(t, a14); \ - MOV64(a14, a21); \ - MOV64(a21, a20); \ - MOV64(a20, t); \ - MOV64(t, a23); \ - MOV64(a23, a42); \ - MOV64(a42, a40); \ - MOV64(a40, t); \ - MOV64(t, a30); \ - MOV64(a30, a41); \ - MOV64(a41, a34); \ - MOV64(a34, t); \ - } while (0) +#include "keccak-macros.c" -#define LPAR ( -#define RPAR ) +#define KECCAK_F_1600 DO(KECCAK_F_1600_256) -#define KF_ELT(r, s, k) do { \ - THETA LPAR P ## r RPAR; \ - RHO LPAR P ## r RPAR; \ - KHI LPAR P ## s RPAR; \ - IOTA(k); \ - } while (0) - -#define DO(x) x - -#define KECCAK_F_1600 DO(KECCAK_F_1600_) - -#define KECCAK_F_1600_ do { \ +#define KECCAK_F_1600_256 do { \ int j; \ for (j = 0; j < 24; j += 8) \ { \ - KF_ELT( 0, 1, (_mm256_set_epi64x( RC[j + 0], RC[j + 0], \ - RC[j + 0], RC[j + 0])) ); \ - KF_ELT( 1, 2, (_mm256_set_epi64x( RC[j + 1], RC[j + 1], \ - RC[j + 1], RC[j + 1])) ); \ - KF_ELT( 2, 3, (_mm256_set_epi64x( RC[j + 2], RC[j + 2], \ - RC[j + 2], RC[j + 2])) ); \ - KF_ELT( 3, 4, (_mm256_set_epi64x( RC[j + 3], RC[j + 3], \ - RC[j + 3], RC[j + 3])) ); \ - KF_ELT( 4, 5, (_mm256_set_epi64x( RC[j + 4], RC[j + 4], \ - RC[j + 4], RC[j + 4])) ); \ - KF_ELT( 5, 6, (_mm256_set_epi64x( RC[j + 5], RC[j + 5], \ - RC[j + 5], RC[j + 5])) ); \ - KF_ELT( 6, 7, (_mm256_set_epi64x( RC[j + 6], RC[j + 6], \ - RC[j + 6], RC[j + 6])) ); \ - KF_ELT( 7, 8, (_mm256_set_epi64x( RC[j + 7], RC[j + 7], \ - RC[j + 7], RC[j + 7])) ); \ + KF_ELT( 0, 1, _mm256_set1_epi64x( RC[j + 0] ) ); \ + KF_ELT( 1, 2, _mm256_set1_epi64x( RC[j + 1] ) ); \ + KF_ELT( 2, 3, _mm256_set1_epi64x( RC[j + 2] ) ); \ + KF_ELT( 3, 4, _mm256_set1_epi64x( RC[j + 3] ) ); \ + KF_ELT( 4, 5, _mm256_set1_epi64x( RC[j + 4] ) ); \ + KF_ELT( 5, 6, _mm256_set1_epi64x( RC[j + 5] ) ); \ + KF_ELT( 6, 7, _mm256_set1_epi64x( RC[j + 6] ) ); \ + KF_ELT( 7, 8, _mm256_set1_epi64x( RC[j + 7] ) ); \ P8_TO_P0; \ } \ } while (0) @@ -370,18 +279,23 @@ static const sph_u64 RC[] = { static void keccak64_init( keccak64_ctx_m256i *kc, unsigned out_size ) { - 
int i; - for (i = 0; i < 25; i ++) - kc->w[i] = _mm256_setzero_si256(); + __m256i zero = m256_zero; + __m256i neg1 = m256_neg1; // Initialization for the "lane complement". - kc->w[ 1] = m256_neg1; - kc->w[ 2] = m256_neg1; - kc->w[ 8] = m256_neg1; - kc->w[12] = m256_neg1; - kc->w[17] = m256_neg1; - kc->w[20] = m256_neg1; - kc->ptr = 0; + kc->w[ 0] = zero; kc->w[ 1] = neg1; + kc->w[ 2] = neg1; kc->w[ 3] = zero; + kc->w[ 4] = zero; kc->w[ 5] = zero; + kc->w[ 6] = zero; kc->w[ 7] = zero; + kc->w[ 8] = neg1; kc->w[ 9] = zero; + kc->w[10] = zero; kc->w[11] = zero; + kc->w[12] = neg1; kc->w[13] = zero; + kc->w[14] = zero; kc->w[15] = zero; + kc->w[16] = zero; kc->w[17] = neg1; + kc->w[18] = zero; kc->w[19] = zero; + kc->w[20] = neg1; kc->w[21] = zero; + kc->w[22] = zero; kc->w[23] = zero; + kc->w[24] = zero; kc->ptr = 0; kc->lim = 200 - (out_size >> 2); } @@ -433,25 +347,24 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len, unsigned eb; union { __m256i tmp[lim + 1]; - sph_u64 dummy; /* for alignment */ + uint64_t dummy; /* for alignment */ } u; size_t j; size_t m256_len = byte_len >> 3; - eb = 0x100 >> 8; + eb = hard_coded_eb; if ( kc->ptr == (lim - 8) ) { - uint64_t t = eb | 0x8000000000000000; - u.tmp[0] = _mm256_set_epi64x( t, t, t, t ); + const uint64_t t = eb | 0x8000000000000000; + u.tmp[0] = m256_const1_64( t ); j = 8; } else { j = lim - kc->ptr; - u.tmp[0] = _mm256_set_epi64x( eb, eb, eb, eb ); + u.tmp[0] = m256_const1_64( eb ); memset_zero_256( u.tmp + 1, (j>>3) - 2 ); - u.tmp[ (j>>3) - 1] = _mm256_set_epi64x( 0x8000000000000000, - 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); + u.tmp[ (j>>3) - 1] = m256_const1_64( 0x8000000000000000 ); } keccak64_core( kc, u.tmp, j, lim ); /* Finalize the "lane complement" */ @@ -461,9 +374,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len, NOT64( kc->w[12], kc->w[12] ); NOT64( kc->w[17], kc->w[17] ); NOT64( kc->w[20], kc->w[20] ); - for ( j = 0; j 
< m256_len; j++ ) - u.tmp[j] = kc->w[j]; - memcpy_256( dst, u.tmp, m256_len ); + memcpy_256( dst, kc->w, m256_len ); } void keccak256_4way_init( void *kc ) @@ -472,7 +383,7 @@ void keccak256_4way_init( void *kc ) } void -keccak256_4way(void *cc, const void *data, size_t len) +keccak256_4way_update(void *cc, const void *data, size_t len) { keccak64_core(cc, data, len, 136); } @@ -489,15 +400,24 @@ void keccak512_4way_init( void *kc ) } void -keccak512_4way(void *cc, const void *data, size_t len) +keccak512_4way_update(void *cc, const void *data, size_t len) { - keccak64_core(cc, data, len, 72); + keccak64_core(cc, data, len, 72); } void keccak512_4way_close(void *cc, void *dst) { - keccak64_close(cc, dst, 64, 72); + keccak64_close(cc, dst, 64, 72); } -#endif +#undef INPUT_BUF +#undef DECL64 +#undef XOR64 +#undef AND64 +#undef OR64 +#undef NOT64 +#undef ROL64 +#undef KECCAK_F_1600 + +#endif // AVX2 diff --git a/algo/keccak/keccak-hash-4way.h b/algo/keccak/keccak-hash-4way.h index 8f6d6a3..5b91bcf 100644 --- a/algo/keccak/keccak-hash-4way.h +++ b/algo/keccak/keccak-hash-4way.h @@ -43,16 +43,8 @@ extern "C"{ #ifdef __AVX2__ #include -#include "algo/sha/sph_types.h" #include "simd-utils.h" -#define SPH_SIZE_keccak256 256 - -/** - * Output size (in bits) for Keccak-512. - */ -#define SPH_SIZE_keccak512 512 - /** * This structure is a context for Keccak computations: it contains the * intermediate values and some data from the last entered block. Once a @@ -64,23 +56,44 @@ extern "C"{ * memcpy()). 
*/ +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +typedef struct { + __m512i buf[144*8]; + __m512i w[25]; + size_t ptr, lim; +} keccak64_ctx_m512i __attribute__((aligned(128))); + +typedef keccak64_ctx_m512i keccak256_8way_context; +typedef keccak64_ctx_m512i keccak512_8way_context; + +void keccak256_8way_init(void *cc); +void keccak256_8way_update(void *cc, const void *data, size_t len); +void keccak256_8way_close(void *cc, void *dst); + +void keccak512_8way_init(void *cc); +void keccak512_8way_update(void *cc, const void *data, size_t len); +void keccak512_8way_close(void *cc, void *dst); +void keccak512_8way_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#endif + typedef struct { - __m256i buf[144*8]; /* first field, for alignment */ + __m256i buf[144*8]; __m256i w[25]; size_t ptr, lim; -// sph_u64 wide[25]; -} keccak64_ctx_m256i; +} keccak64_ctx_m256i __attribute__((aligned(128))); typedef keccak64_ctx_m256i keccak256_4way_context; typedef keccak64_ctx_m256i keccak512_4way_context; void keccak256_4way_init(void *cc); -void keccak256_4way(void *cc, const void *data, size_t len); +void keccak256_4way_update(void *cc, const void *data, size_t len); void keccak256_4way_close(void *cc, void *dst); - void keccak512_4way_init(void *cc); -void keccak512_4way(void *cc, const void *data, size_t len); +void keccak512_4way_update(void *cc, const void *data, size_t len); void keccak512_4way_close(void *cc, void *dst); void keccak512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); diff --git a/algo/keccak/keccak-macros.c b/algo/keccak/keccak-macros.c new file mode 100644 index 0000000..8d5197c --- /dev/null +++ b/algo/keccak/keccak-macros.c @@ -0,0 +1,307 @@ +#ifdef TH_ELT +#undef TH_ELT +#endif +#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \ + DECL64(tt0); \ + DECL64(tt1); \ + DECL64(tt2); \ + DECL64(tt3); \ + XOR64(tt0, d0, d1); \ + XOR64(tt1, d2, d3); 
\ + XOR64(tt0, tt0, d4); \ + XOR64(tt0, tt0, tt1); \ + ROL64(tt0, tt0, 1); \ + XOR64(tt2, c0, c1); \ + XOR64(tt3, c2, c3); \ + XOR64(tt0, tt0, c4); \ + XOR64(tt2, tt2, tt3); \ + XOR64(t, tt0, tt2); \ + } while (0) + +#ifdef THETA +#undef THETA +#endif +#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ + b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ + b40, b41, b42, b43, b44) \ + do { \ + DECL64(t0); \ + DECL64(t1); \ + DECL64(t2); \ + DECL64(t3); \ + DECL64(t4); \ + TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \ + TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \ + TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \ + TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \ + TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \ + XOR64(b00, b00, t0); \ + XOR64(b01, b01, t0); \ + XOR64(b02, b02, t0); \ + XOR64(b03, b03, t0); \ + XOR64(b04, b04, t0); \ + XOR64(b10, b10, t1); \ + XOR64(b11, b11, t1); \ + XOR64(b12, b12, t1); \ + XOR64(b13, b13, t1); \ + XOR64(b14, b14, t1); \ + XOR64(b20, b20, t2); \ + XOR64(b21, b21, t2); \ + XOR64(b22, b22, t2); \ + XOR64(b23, b23, t2); \ + XOR64(b24, b24, t2); \ + XOR64(b30, b30, t3); \ + XOR64(b31, b31, t3); \ + XOR64(b32, b32, t3); \ + XOR64(b33, b33, t3); \ + XOR64(b34, b34, t3); \ + XOR64(b40, b40, t4); \ + XOR64(b41, b41, t4); \ + XOR64(b42, b42, t4); \ + XOR64(b43, b43, t4); \ + XOR64(b44, b44, t4); \ + } while (0) + +#ifdef RHO +#undef RHO +#endif +#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ + b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ + b40, b41, b42, b43, b44) \ + do { \ + /* ROL64(b00, b00, 0); */ \ + ROL64(b01, b01, 36); \ + ROL64(b02, b02, 3); \ + ROL64(b03, b03, 41); \ + ROL64(b04, b04, 18); \ + ROL64(b10, b10, 1); \ + ROL64(b11, b11, 44); \ + ROL64(b12, b12, 10); \ + ROL64(b13, b13, 45); \ + ROL64(b14, b14, 2); \ + ROL64(b20, b20, 62); \ + ROL64(b21, b21, 6); \ + ROL64(b22, b22, 43); \ + ROL64(b23, b23, 15); 
\ + ROL64(b24, b24, 61); \ + ROL64(b30, b30, 28); \ + ROL64(b31, b31, 55); \ + ROL64(b32, b32, 25); \ + ROL64(b33, b33, 21); \ + ROL64(b34, b34, 56); \ + ROL64(b40, b40, 27); \ + ROL64(b41, b41, 20); \ + ROL64(b42, b42, 39); \ + ROL64(b43, b43, 8); \ + ROL64(b44, b44, 14); \ + } while (0) + +/* + * The KHI macro integrates the "lane complement" optimization. On input, + * some words are complemented: + * a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43 + * On output, the following words are complemented: + * a04 a10 a20 a22 a23 a31 + * + * The (implicit) permutation and the theta expansion will bring back + * the input mask for the next round. + */ + +#ifdef KHI_XO +#undef KHI_XO +#endif +#define KHI_XO(d, a, b, c) do { \ + DECL64(kt); \ + OR64(kt, b, c); \ + XOR64(d, a, kt); \ + } while (0) + +#ifdef KHI_XA +#undef KHI_XA +#endif +#define KHI_XA(d, a, b, c) do { \ + DECL64(kt); \ + AND64(kt, b, c); \ + XOR64(d, a, kt); \ + } while (0) + +#ifdef KHI +#undef KHI +#endif +#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ + b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ + b40, b41, b42, b43, b44) \ + do { \ + DECL64(c0); \ + DECL64(c1); \ + DECL64(bnn); \ + NOT64(bnn, b20); \ + KHI_XO(c0, b00, b10, b20); \ + KHI_XO(c1, b10, bnn, b30); \ + KHI_XA(b20, b20, b30, b40); \ + KHI_XO(b30, b30, b40, b00); \ + KHI_XA(b40, b40, b00, b10); \ + MOV64(b00, c0); \ + MOV64(b10, c1); \ + NOT64(bnn, b41); \ + KHI_XO(c0, b01, b11, b21); \ + KHI_XA(c1, b11, b21, b31); \ + KHI_XO(b21, b21, b31, bnn); \ + KHI_XO(b31, b31, b41, b01); \ + KHI_XA(b41, b41, b01, b11); \ + MOV64(b01, c0); \ + MOV64(b11, c1); \ + NOT64(bnn, b32); \ + KHI_XO(c0, b02, b12, b22); \ + KHI_XA(c1, b12, b22, b32); \ + KHI_XA(b22, b22, bnn, b42); \ + KHI_XO(b32, bnn, b42, b02); \ + KHI_XA(b42, b42, b02, b12); \ + MOV64(b02, c0); \ + MOV64(b12, c1); \ + NOT64(bnn, b33); \ + KHI_XA(c0, b03, b13, b23); \ + KHI_XO(c1, b13, b23, b33); \ + KHI_XO(b23, b23, bnn, b43); \ + KHI_XA(b33, bnn, b43, b03); \ + 
KHI_XO(b43, b43, b03, b13); \ + MOV64(b03, c0); \ + MOV64(b13, c1); \ + NOT64(bnn, b14); \ + KHI_XA(c0, b04, bnn, b24); \ + KHI_XO(c1, bnn, b24, b34); \ + KHI_XA(b24, b24, b34, b44); \ + KHI_XO(b34, b34, b44, b04); \ + KHI_XA(b44, b44, b04, b14); \ + MOV64(b04, c0); \ + MOV64(b14, c1); \ + } while (0) + +#ifdef IOTA +#undef IOTA +#endif +#define IOTA(r) XOR64_IOTA(a00, a00, r) + +#ifdef P0 +#undef P0 +#undef P1 +#undef P2 +#undef P3 +#undef P4 +#undef P5 +#undef P6 +#undef P7 +#undef P8 +#undef P9 +#undef P10 +#undef p11 +#undef P12 +#undef P13 +#undef P14 +#undef P15 +#undef P16 +#undef P17 +#undef P18 +#undef P19 +#undef P20 +#undef P21 +#undef P22 +#undef P23 +#endif + +#define P0 a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \ + a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44 +#define P1 a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \ + a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14 +#define P2 a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \ + a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31 +#define P3 a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \ + a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13 +#define P4 a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \ + a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01 +#define P5 a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \ + a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30 +#define P6 a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \ + a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33 +#define P7 a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \ + a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23 +#define P8 a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \ + a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12 +#define P9 a00, a41, a32, a23, a14, a43, a34, a20, a11, 
a02, a31, a22, \ + a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21 +#define P10 a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \ + a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02 +#define P11 a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \ + a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10 +#define P12 a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \ + a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11 +#define P13 a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \ + a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41 +#define P14 a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \ + a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24 +#define P15 a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \ + a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42 +#define P16 a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \ + a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04 +#define P17 a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \ + a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20 +#define P18 a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \ + a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22 +#define P19 a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \ + a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32 +#define P20 a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \ + a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43 +#define P21 a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \ + a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34 +#define P22 a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \ + a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03 +#define P23 a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \ + a20, 
a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40 + +#ifdef P8_TO_P0 +#undef P8_TO_P0 +#endif +#define P8_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a11); \ + MOV64(a11, a43); \ + MOV64(a43, t); \ + MOV64(t, a02); \ + MOV64(a02, a22); \ + MOV64(a22, a31); \ + MOV64(a31, t); \ + MOV64(t, a03); \ + MOV64(a03, a33); \ + MOV64(a33, a24); \ + MOV64(a24, t); \ + MOV64(t, a04); \ + MOV64(a04, a44); \ + MOV64(a44, a12); \ + MOV64(a12, t); \ + MOV64(t, a10); \ + MOV64(a10, a32); \ + MOV64(a32, a13); \ + MOV64(a13, t); \ + MOV64(t, a14); \ + MOV64(a14, a21); \ + MOV64(a21, a20); \ + MOV64(a20, t); \ + MOV64(t, a23); \ + MOV64(a23, a42); \ + MOV64(a42, a40); \ + MOV64(a40, t); \ + MOV64(t, a30); \ + MOV64(a30, a41); \ + MOV64(a41, a34); \ + MOV64(a34, t); \ + } while (0) + +#define KF_ELT(r, s, k) do { \ + THETA LPAR P ## r RPAR; \ + RHO LPAR P ## r RPAR; \ + KHI LPAR P ## s RPAR; \ + IOTA(k); \ + } while (0) + + diff --git a/algo/keccak/keccak.c b/algo/keccak/keccak.c index 1a66bc1..2dde233 100644 --- a/algo/keccak/keccak.c +++ b/algo/keccak/keccak.c @@ -1,4 +1,6 @@ -#include "algo-gate-api.h" +#include "keccak-gate.h" + +#if !defined(KECCAK_8WAY) && !defined(KECCAK_4WAY) #include #include @@ -18,36 +20,35 @@ void keccakhash(void *state, const void *input) memcpy(state, hash, 32); } -int scanhash_keccak( struct work *work, - uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) +int scanhash_keccak( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - //const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - - uint32_t _ALIGN(32) hash64[8]; - uint32_t endiandata[32]; - - for (int i=0; i < 19; i++) - be32enc(&endiandata[i], pdata[i]); - - do { - - pdata[19] = ++n; - be32enc(&endiandata[19], n); - keccakhash(hash64, 
endiandata); - if (((hash64[7]&0xFFFFFF00)==0) && - fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; + uint32_t _ALIGN(64) hash64[8]; + uint32_t _ALIGN(64) endiandata[32]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce; + const int thr_id = mythr->id; + + for ( int i=0; i < 19; i++ ) + be32enc( &endiandata[i], pdata[i] ); + + do { + be32enc( &endiandata[19], n ); + keccakhash( hash64, endiandata ); + if ( valid_hash( hash64, ptarget ) && !opt_benchmark ) + { + pdata[19] = n; + submit_solution( work, hash64, mythr ); + } + n++; + } while ( n < last_nonce && !work_restart[thr_id].restart ); + + *hashes_done = n - first_nonce; + pdata[19] = n; + return 0; } +#endif diff --git a/algo/keccak/sha3d-4way.c b/algo/keccak/sha3d-4way.c new file mode 100644 index 0000000..14451c0 --- /dev/null +++ b/algo/keccak/sha3d-4way.c @@ -0,0 +1,126 @@ +#include "keccak-gate.h" +#include +#include +#include +#include "sph_keccak.h" +#include "keccak-hash-4way.h" + +#if defined(KECCAK_8WAY) + +void sha3d_hash_8way(void *state, const void *input) +{ + uint32_t buffer[16*8] __attribute__ ((aligned (128))); + keccak256_8way_context ctx; + + keccak256_8way_init( &ctx ); + keccak256_8way_update( &ctx, input, 80 ); + keccak256_8way_close( &ctx, buffer ); + + keccak256_8way_init( &ctx ); + keccak256_8way_update( &ctx, buffer, 32 ); + keccak256_8way_close( &ctx, state ); +} + +int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t vdata[24*8] __attribute__ ((aligned (128))); + uint32_t hash[16*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[49]); // 3*16+1 + 
uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + const uint32_t Htarg = ptarget[7]; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + *noncev = mm512_intrlv_blend_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev ); + do { + sha3d_hash_8way( hash, vdata ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) ) + { + extr_lane_8x64( lane_hash, hash, lane, 256 ); + if ( valid_hash( lane_hash, ptarget ) ) + { + pdata[19] = bswap_32( n + lane ); + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = _mm512_add_epi32( *noncev, + m512_const1_64( 0x0000000800000000 ) ); + n += 8; + + } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined(KECCAK_4WAY) + +void sha3d_hash_4way(void *state, const void *input) +{ + uint32_t buffer[16*4] __attribute__ ((aligned (64))); + keccak256_4way_context ctx; + + keccak256_4way_init( &ctx ); + keccak256_4way_update( &ctx, input, 80 ); + keccak256_4way_close( &ctx, buffer ); + + keccak256_4way_init( &ctx ); + keccak256_4way_update( &ctx, buffer, 32 ); + keccak256_4way_close( &ctx, state ); +} + +int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t hash[16*4] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t *hash7 = &(hash[25]); // 3*8+1 + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; 
+ __m256i *noncev = (__m256i*)vdata + 9; // aligned + const uint32_t Htarg = ptarget[7]; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + *noncev = mm256_intrlv_blend_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); + do { + sha3d_hash_4way( hash, vdata ); + + for ( int lane = 0; lane < 4; lane++ ) + if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) ) + { + extr_lane_4x64( lane_hash, hash, lane, 256 ); + if ( valid_hash( lane_hash, ptarget ) ) + { + pdata[19] = bswap_32( n + lane ); + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = _mm256_add_epi32( *noncev, + m256_const1_64( 0x0000000400000000 ) ); + n += 4; + } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#endif diff --git a/algo/keccak/sha3d.c b/algo/keccak/sha3d.c new file mode 100644 index 0000000..43d8c4f --- /dev/null +++ b/algo/keccak/sha3d.c @@ -0,0 +1,54 @@ +#include "keccak-gate.h" + +#if !defined(KECCAK_8WAY) && !defined(KECCAK_4WAY) + +#include +#include +#include +#include "sph_keccak.h" + +void sha3d_hash(void *state, const void *input) +{ + uint32_t buffer[16]; + sph_keccak256_context ctx_keccak; + + sph_keccak256_init( &ctx_keccak ); + sph_keccak256 ( &ctx_keccak, input, 80 ); + sph_keccak256_close( &ctx_keccak, buffer ); + sph_keccak256_init( &ctx_keccak ); + sph_keccak256 ( &ctx_keccak, buffer, 32 ); + sph_keccak256_close( &ctx_keccak, state ); +} + +int scanhash_sha3d( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t _ALIGN(64) hash64[8]; + uint32_t _ALIGN(64) endiandata[32]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce; + const int thr_id = mythr->id; + + for ( int i=0; i < 19; i++ ) + be32enc( &endiandata[i], 
pdata[i] ); + + do { + be32enc( &endiandata[19], n ); + sha3d_hash( hash64, endiandata ); + if ( valid_hash( hash64, ptarget ) && !opt_benchmark ) + { + pdata[19] = n; + submit_solution( work, hash64, mythr ); + } + n++; + } while ( n < last_nonce && !work_restart[thr_id].restart ); + + *hashes_done = n - first_nonce; + pdata[19] = n; + return 0; +} + +#endif diff --git a/algo/keccak/sph_keccak.c b/algo/keccak/sph_keccak.c index de7784f..45f3d37 100644 --- a/algo/keccak/sph_keccak.c +++ b/algo/keccak/sph_keccak.c @@ -32,8 +32,8 @@ #include #include - #include "sph_keccak.h" +#include "keccak-gate.h" #ifdef __cplusplus extern "C"{ @@ -1616,7 +1616,7 @@ keccak_core(sph_keccak_context *kc, const void *data, size_t len, size_t lim) } u; \ size_t j; \ \ - eb = (0x100 | (ub & 0xFF)) >> (8 - n); \ + eb = hard_coded_eb; \ if (kc->ptr == (lim - 1)) { \ if (n == 7) { \ u.tmp[0] = eb; \ diff --git a/algo/keccak/sse2/keccak.c b/algo/keccak/sse2/keccak.c deleted file mode 100644 index a1b4674..0000000 --- a/algo/keccak/sse2/keccak.c +++ /dev/null @@ -1,845 +0,0 @@ -/* $Id: keccak.c 259 2011-07-19 22:11:27Z tp $ */ -/* - * Keccak implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#define QSTATIC static - -#include -#include -#include - -#include "sph_keccak.h" - -#ifdef __cplusplus -extern "C"{ -#endif - -/* - * Parameters: - * - * SPH_KECCAK_64 use a 64-bit type - * SPH_KECCAK_INTERLEAVE use bit-interleaving (32-bit type only) - * SPH_KECCAK_NOCOPY do not copy the state into local variables - * - * If there is no usable 64-bit type, the code automatically switches - * back to the 32-bit implementation. - * - * Some tests on an Intel Core2 Q6600 (both 64-bit and 32-bit, 32 kB L1 - * code cache), a PowerPC (G3, 32 kB L1 code cache), an ARM920T core - * (16 kB L1 code cache), and a small MIPS-compatible CPU (Broadcom BCM3302, - * 8 kB L1 code cache), seem to show that the following are optimal: - * - * -- x86, 64-bit: use the 64-bit implementation, unroll 8 rounds, - * do not copy the state; unrolling 2, 6 or all rounds also provides - * near-optimal performance. - * -- x86, 32-bit: use the 32-bit implementation, unroll 6 rounds, - * interleave, do not copy the state. Unrolling 1, 2, 4 or 8 rounds - * also provides near-optimal performance. - * -- PowerPC: use the 64-bit implementation, unroll 8 rounds, - * copy the state. Unrolling 4 or 6 rounds is near-optimal. - * -- ARM: use the 64-bit implementation, unroll 2 or 4 rounds, - * copy the state. - * -- MIPS: use the 64-bit implementation, unroll 2 rounds, copy - * the state. 
Unrolling only 1 round is also near-optimal. - * - * Also, interleaving does not always yield actual improvements when - * using a 32-bit implementation; in particular when the architecture - * does not offer a native rotation opcode (interleaving replaces one - * 64-bit rotation with two 32-bit rotations, which is a gain only if - * there is a native 32-bit rotation opcode and not a native 64-bit - * rotation opcode; also, interleaving implies a small overhead when - * processing input words). - * - * To sum up: - * -- when possible, use the 64-bit code - * -- exception: on 32-bit x86, use 32-bit code - * -- when using 32-bit code, use interleaving - * -- copy the state, except on x86 - * -- unroll 8 rounds on "big" machine, 2 rounds on "small" machines - */ - - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -/* -static const sph_u64 RC[] = { - SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), - SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), - SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001), - SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009), - SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088), - SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A), - SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B), - SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003), - SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080), - SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A), - SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), - SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) -}; -*/ -#define kekDECL_STATE \ - sph_u64 keca00, keca01, keca02, keca03, keca04; \ - sph_u64 keca10, keca11, keca12, keca13, keca14; \ - sph_u64 keca20, keca21, keca22, keca23, keca24; \ - sph_u64 keca30, keca31, keca32, keca33, keca34; \ - sph_u64 keca40, keca41, keca42, keca43, keca44; - -#define kekREAD_STATE(state) do { \ - keca00 = (state)->kecu.wide[ 0]; \ - keca10 = (state)->kecu.wide[ 1]; \ - 
keca20 = (state)->kecu.wide[ 2]; \ - keca30 = (state)->kecu.wide[ 3]; \ - keca40 = (state)->kecu.wide[ 4]; \ - keca01 = (state)->kecu.wide[ 5]; \ - keca11 = (state)->kecu.wide[ 6]; \ - keca21 = (state)->kecu.wide[ 7]; \ - keca31 = (state)->kecu.wide[ 8]; \ - keca41 = (state)->kecu.wide[ 9]; \ - keca02 = (state)->kecu.wide[10]; \ - keca12 = (state)->kecu.wide[11]; \ - keca22 = (state)->kecu.wide[12]; \ - keca32 = (state)->kecu.wide[13]; \ - keca42 = (state)->kecu.wide[14]; \ - keca03 = (state)->kecu.wide[15]; \ - keca13 = (state)->kecu.wide[16]; \ - keca23 = (state)->kecu.wide[17]; \ - keca33 = (state)->kecu.wide[18]; \ - keca43 = (state)->kecu.wide[19]; \ - keca04 = (state)->kecu.wide[20]; \ - keca14 = (state)->kecu.wide[21]; \ - keca24 = (state)->kecu.wide[22]; \ - keca34 = (state)->kecu.wide[23]; \ - keca44 = (state)->kecu.wide[24]; \ - } while (0) - -#define kecREAD_STATE(state) do { \ - keca00 = kecu.wide[ 0]; \ - keca10 = kecu.wide[ 1]; \ - keca20 = kecu.wide[ 2]; \ - keca30 = kecu.wide[ 3]; \ - keca40 = kecu.wide[ 4]; \ - keca01 = kecu.wide[ 5]; \ - keca11 = kecu.wide[ 6]; \ - keca21 = kecu.wide[ 7]; \ - keca31 = kecu.wide[ 8]; \ - keca41 = kecu.wide[ 9]; \ - keca02 = kecu.wide[10]; \ - keca12 = kecu.wide[11]; \ - keca22 = kecu.wide[12]; \ - keca32 = kecu.wide[13]; \ - keca42 = kecu.wide[14]; \ - keca03 = kecu.wide[15]; \ - keca13 = kecu.wide[16]; \ - keca23 = kecu.wide[17]; \ - keca33 = kecu.wide[18]; \ - keca43 = kecu.wide[19]; \ - keca04 = kecu.wide[20]; \ - keca14 = kecu.wide[21]; \ - keca24 = kecu.wide[22]; \ - keca34 = kecu.wide[23]; \ - keca44 = kecu.wide[24]; \ - } while (0) - -#define kecINIT_STATE() do { \ - keca00 = 0x0000000000000000 \ - ^ sph_dec64le_aligned(buf + 0); \ - keca10 = 0xFFFFFFFFFFFFFFFF \ - ^ sph_dec64le_aligned(buf + 8); \ - keca20 = 0xFFFFFFFFFFFFFFFF \ - ^ sph_dec64le_aligned(buf + 16); \ - keca30 = 0x0000000000000000 \ - ^ sph_dec64le_aligned(buf + 24); \ - keca40 = 0x0000000000000000 \ - ^ sph_dec64le_aligned(buf + 32); \ - 
keca01 = 0x0000000000000000 \ - ^ sph_dec64le_aligned(buf + 40); \ - keca11 = 0x0000000000000000 \ - ^ sph_dec64le_aligned(buf + 48); \ - keca21 = 0x0000000000000000 \ - ^ sph_dec64le_aligned(buf + 56); \ - keca31 = 0xFFFFFFFFFFFFFFFF \ - ^ sph_dec64le_aligned(buf + 64); \ - keca41 = 0x0000000000000000, \ - keca02 = 0x0000000000000000, \ - keca12 = 0x0000000000000000, \ - keca32 = 0x0000000000000000, \ - keca42 = 0x0000000000000000, \ - keca03 = 0x0000000000000000, \ - keca13 = 0x0000000000000000, \ - keca33 = 0x0000000000000000, \ - keca43 = 0x0000000000000000, \ - keca14 = 0x0000000000000000, \ - keca24 = 0x0000000000000000, \ - keca34 = 0x0000000000000000, \ - keca44 = 0x0000000000000000; \ - keca23 = 0xFFFFFFFFFFFFFFFF, \ - keca04 = 0xFFFFFFFFFFFFFFFF, \ - keca22 = 0xFFFFFFFFFFFFFFFF; \ - } while (0) - -#define kekWRITE_STATE(state) do { \ - (state)->kecu.wide[ 0] = keca00; \ - (state)->kecu.wide[ 1] = ~keca10; \ - (state)->kecu.wide[ 2] = ~keca20; \ - (state)->kecu.wide[ 3] = keca30; \ - (state)->kecu.wide[ 4] = keca40; \ - (state)->kecu.wide[ 5] = keca01; \ - (state)->kecu.wide[ 6] = keca11; \ - (state)->kecu.wide[ 7] = keca21; \ - (state)->kecu.wide[ 8] = ~keca31; \ - (state)->kecu.wide[ 9] = keca41; \ - (state)->kecu.wide[10] = keca02; \ - (state)->kecu.wide[11] = keca12; \ - (state)->kecu.wide[12] = ~keca22; \ - (state)->kecu.wide[13] = keca32; \ - (state)->kecu.wide[14] = keca42; \ - (state)->kecu.wide[15] = keca03; \ - (state)->kecu.wide[16] = keca13; \ - (state)->kecu.wide[17] = ~keca23; \ - (state)->kecu.wide[18] = keca33; \ - (state)->kecu.wide[19] = keca43; \ - (state)->kecu.wide[20] = ~keca04; \ - (state)->kecu.wide[21] = keca14; \ - (state)->kecu.wide[22] = keca24; \ - (state)->kecu.wide[23] = keca34; \ - (state)->kecu.wide[24] = keca44; \ - } while (0) - -/* only usefull for one round final */ -#define kecWRITE_STATE(state) do { \ - kecu.wide[ 0] = keca00; \ - kecu.wide[ 1] = ~keca10; \ - kecu.wide[ 2] = ~keca20; \ - kecu.wide[ 3] = keca30; \ - 
kecu.wide[ 4] = keca40; \ - kecu.wide[ 5] = keca01; \ - kecu.wide[ 6] = keca11; \ - kecu.wide[ 7] = keca21; \ - kecu.wide[ 8] = ~keca31; \ - kecu.wide[ 9] = keca41; \ - kecu.wide[10] = keca02; \ - kecu.wide[11] = keca12; \ - kecu.wide[12] = ~keca22; \ - kecu.wide[13] = keca32; \ - kecu.wide[14] = keca42; \ - kecu.wide[15] = keca03; \ - kecu.wide[16] = keca13; \ - kecu.wide[17] = ~keca23; \ - kecu.wide[18] = keca33; \ - kecu.wide[19] = keca43; \ - kecu.wide[20] = ~keca04; \ - kecu.wide[21] = keca14; \ - kecu.wide[22] = keca24; \ - kecu.wide[23] = keca34; \ - kecu.wide[24] = keca44; \ - } while (0) - -#define kecPRINT_STATE(state) do { \ - printf("keca00=%lX\n", keca00); \ - printf("keca10=%lX\n", keca10); \ - printf("keca20=%lX\n", keca20); \ - printf("keca30=%lX\n", keca30); \ - printf("keca40=%lX\n", keca40); \ - printf("keca01=%lX\n", keca01); \ - printf("keca11=%lX\n", keca11); \ - printf("keca21=%lX\n", keca21); \ - printf("keca31=%lX\n", keca31); \ - printf("keca41=%lX\n", keca41); \ - printf("keca02=%lX\n", keca02); \ - printf("keca12=%lX\n", keca12); \ - printf("keca22=%lX\n", keca22); \ - printf("keca32=%lX\n", keca32); \ - printf("keca42=%lX\n", keca42); \ - printf("keca03=%lX\n", keca03); \ - printf("keca13=%lX\n", keca13); \ - printf("keca23=%lX\n", keca23); \ - printf("keca33=%lX\n", keca33); \ - printf("keca43=%lX\n", keca43); \ - printf("keca04=%lX\n", keca04); \ - printf("keca14=%lX\n", keca14); \ - printf("keca24=%lX\n", keca24); \ - printf("keca34=%lX\n", keca34); \ - printf("keca44=%lX\n", keca44); \ - abort(); \ - } while (0) - -#define kekINPUT_BUF() do { \ - } while (0) - - -#define kekDECL64(x) sph_u64 x -#define MOV64(d, s) (d = s) -#define XOR64(d, a, b) (d = a ^ b) -#define AND64(d, a, b) (d = a & b) -#define OR64(d, a, b) (d = a | b) -#define NOT64(d, s) (d = SPH_T64(~s)) -#define ROL64(d, v, n) (d = SPH_ROTL64(v, n)) -#define XOR64_IOTA XOR64 - -#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \ - kekDECL64(tt0); \ - 
kekDECL64(tt1); \ - kekDECL64(tt2); \ - kekDECL64(tt3); \ - XOR64(tt0, d0, d1); \ - XOR64(tt1, d2, d3); \ - XOR64(tt0, tt0, d4); \ - XOR64(tt0, tt0, tt1); \ - ROL64(tt0, tt0, 1); \ - XOR64(tt2, c0, c1); \ - XOR64(tt3, c2, c3); \ - XOR64(tt0, tt0, c4); \ - XOR64(tt2, tt2, tt3); \ - XOR64(t, tt0, tt2); \ - } while (0) - -#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ - b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ - b40, b41, b42, b43, b44) \ - do { \ - kekDECL64(t0); \ - kekDECL64(t1); \ - kekDECL64(t2); \ - kekDECL64(t3); \ - kekDECL64(t4); \ - TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \ - TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \ - TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \ - TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \ - TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \ - XOR64(b00, b00, t0); \ - XOR64(b01, b01, t0); \ - XOR64(b02, b02, t0); \ - XOR64(b03, b03, t0); \ - XOR64(b04, b04, t0); \ - XOR64(b10, b10, t1); \ - XOR64(b11, b11, t1); \ - XOR64(b12, b12, t1); \ - XOR64(b13, b13, t1); \ - XOR64(b14, b14, t1); \ - XOR64(b20, b20, t2); \ - XOR64(b21, b21, t2); \ - XOR64(b22, b22, t2); \ - XOR64(b23, b23, t2); \ - XOR64(b24, b24, t2); \ - XOR64(b30, b30, t3); \ - XOR64(b31, b31, t3); \ - XOR64(b32, b32, t3); \ - XOR64(b33, b33, t3); \ - XOR64(b34, b34, t3); \ - XOR64(b40, b40, t4); \ - XOR64(b41, b41, t4); \ - XOR64(b42, b42, t4); \ - XOR64(b43, b43, t4); \ - XOR64(b44, b44, t4); \ - } while (0) - -#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ - b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ - b40, b41, b42, b43, b44) \ - do { \ - /* ROL64(b00, b00, 0); */ \ - ROL64(b01, b01, 36); \ - ROL64(b02, b02, 3); \ - ROL64(b03, b03, 41); \ - ROL64(b04, b04, 18); \ - ROL64(b10, b10, 1); \ - ROL64(b11, b11, 44); \ - ROL64(b12, b12, 10); \ - ROL64(b13, b13, 45); \ - ROL64(b14, b14, 2); \ - ROL64(b20, b20, 62); \ - ROL64(b21, b21, 
6); \ - ROL64(b22, b22, 43); \ - ROL64(b23, b23, 15); \ - ROL64(b24, b24, 61); \ - ROL64(b30, b30, 28); \ - ROL64(b31, b31, 55); \ - ROL64(b32, b32, 25); \ - ROL64(b33, b33, 21); \ - ROL64(b34, b34, 56); \ - ROL64(b40, b40, 27); \ - ROL64(b41, b41, 20); \ - ROL64(b42, b42, 39); \ - ROL64(b43, b43, 8); \ - ROL64(b44, b44, 14); \ - } while (0) - -/* - * The KHI macro integrates the "lane complement" optimization. On input, - * some words are complemented: - * keca00 keca01 keca02 keca04 keca13 keca20 keca21 keca22 keca30 keca33 keca34 keca43 - * On output, the following words are complemented: - * keca04 keca10 keca20 keca22 keca23 keca31 - * - * The (implicit) permutation and the theta expansion will bring back - * the input mask for the next round. - */ - -#define KHI_XO(d, a, b, c) do { \ - kekDECL64(kt); \ - OR64(kt, b, c); \ - XOR64(d, a, kt); \ - } while (0) - -#define KHI_XA(d, a, b, c) do { \ - kekDECL64(kt); \ - AND64(kt, b, c); \ - XOR64(d, a, kt); \ - } while (0) - -#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ - b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ - b40, b41, b42, b43, b44) \ - do { \ - kekDECL64(c0); \ - kekDECL64(c1); \ - kekDECL64(c2); \ - kekDECL64(c3); \ - kekDECL64(c4); \ - kekDECL64(bnn); \ - NOT64(bnn, b20); \ - KHI_XO(c0, b00, b10, b20); \ - KHI_XO(c1, b10, bnn, b30); \ - KHI_XA(c2, b20, b30, b40); \ - KHI_XO(c3, b30, b40, b00); \ - KHI_XA(c4, b40, b00, b10); \ - MOV64(b00, c0); \ - MOV64(b10, c1); \ - MOV64(b20, c2); \ - MOV64(b30, c3); \ - MOV64(b40, c4); \ - NOT64(bnn, b41); \ - KHI_XO(c0, b01, b11, b21); \ - KHI_XA(c1, b11, b21, b31); \ - KHI_XO(c2, b21, b31, bnn); \ - KHI_XO(c3, b31, b41, b01); \ - KHI_XA(c4, b41, b01, b11); \ - MOV64(b01, c0); \ - MOV64(b11, c1); \ - MOV64(b21, c2); \ - MOV64(b31, c3); \ - MOV64(b41, c4); \ - NOT64(bnn, b32); \ - KHI_XO(c0, b02, b12, b22); \ - KHI_XA(c1, b12, b22, b32); \ - KHI_XA(c2, b22, bnn, b42); \ - KHI_XO(c3, bnn, b42, b02); \ - KHI_XA(c4, b42, b02, b12); \ - 
MOV64(b02, c0); \ - MOV64(b12, c1); \ - MOV64(b22, c2); \ - MOV64(b32, c3); \ - MOV64(b42, c4); \ - NOT64(bnn, b33); \ - KHI_XA(c0, b03, b13, b23); \ - KHI_XO(c1, b13, b23, b33); \ - KHI_XO(c2, b23, bnn, b43); \ - KHI_XA(c3, bnn, b43, b03); \ - KHI_XO(c4, b43, b03, b13); \ - MOV64(b03, c0); \ - MOV64(b13, c1); \ - MOV64(b23, c2); \ - MOV64(b33, c3); \ - MOV64(b43, c4); \ - NOT64(bnn, b14); \ - KHI_XA(c0, b04, bnn, b24); \ - KHI_XO(c1, bnn, b24, b34); \ - KHI_XA(c2, b24, b34, b44); \ - KHI_XO(c3, b34, b44, b04); \ - KHI_XA(c4, b44, b04, b14); \ - MOV64(b04, c0); \ - MOV64(b14, c1); \ - MOV64(b24, c2); \ - MOV64(b34, c3); \ - MOV64(b44, c4); \ - } while (0) - -#define IOTA(r) XOR64_IOTA(keca00, keca00, r) - -#define P0 keca00, keca01, keca02, keca03, keca04, keca10, keca11, keca12, keca13, keca14, keca20, keca21, \ - keca22, keca23, keca24, keca30, keca31, keca32, keca33, keca34, keca40, keca41, keca42, keca43, keca44 -#define P1 keca00, keca30, keca10, keca40, keca20, keca11, keca41, keca21, keca01, keca31, keca22, keca02, \ - keca32, keca12, keca42, keca33, keca13, keca43, keca23, keca03, keca44, keca24, keca04, keca34, keca14 -#define P2 keca00, keca33, keca11, keca44, keca22, keca41, keca24, keca02, keca30, keca13, keca32, keca10, \ - keca43, keca21, keca04, keca23, keca01, keca34, keca12, keca40, keca14, keca42, keca20, keca03, keca31 -#define P3 keca00, keca23, keca41, keca14, keca32, keca24, keca42, keca10, keca33, keca01, keca43, keca11, \ - keca34, keca02, keca20, keca12, keca30, keca03, keca21, keca44, keca31, keca04, keca22, keca40, keca13 -#define P4 keca00, keca12, keca24, keca31, keca43, keca42, keca04, keca11, keca23, keca30, keca34, keca41, \ - keca03, keca10, keca22, keca21, keca33, keca40, keca02, keca14, keca13, keca20, keca32, keca44, keca01 -#define P5 keca00, keca21, keca42, keca13, keca34, keca04, keca20, keca41, keca12, keca33, keca03, keca24, \ - keca40, keca11, keca32, keca02, keca23, keca44, keca10, keca31, keca01, keca22, keca43, keca14, 
keca30 -#define P6 keca00, keca02, keca04, keca01, keca03, keca20, keca22, keca24, keca21, keca23, keca40, keca42, \ - keca44, keca41, keca43, keca10, keca12, keca14, keca11, keca13, keca30, keca32, keca34, keca31, keca33 -#define P7 keca00, keca10, keca20, keca30, keca40, keca22, keca32, keca42, keca02, keca12, keca44, keca04, \ - keca14, keca24, keca34, keca11, keca21, keca31, keca41, keca01, keca33, keca43, keca03, keca13, keca23 -#define P8 keca00, keca11, keca22, keca33, keca44, keca32, keca43, keca04, keca10, keca21, keca14, keca20, \ - keca31, keca42, keca03, keca41, keca02, keca13, keca24, keca30, keca23, keca34, keca40, keca01, keca12 -#define P9 keca00, keca41, keca32, keca23, keca14, keca43, keca34, keca20, keca11, keca02, keca31, keca22, \ - keca13, keca04, keca40, keca24, keca10, keca01, keca42, keca33, keca12, keca03, keca44, keca30, keca21 -#define P10 keca00, keca24, keca43, keca12, keca31, keca34, keca03, keca22, keca41, keca10, keca13, keca32, \ - keca01, keca20, keca44, keca42, keca11, keca30, keca04, keca23, keca21, keca40, keca14, keca33, keca02 -#define P11 keca00, keca42, keca34, keca21, keca13, keca03, keca40, keca32, keca24, keca11, keca01, keca43, \ - keca30, keca22, keca14, keca04, keca41, keca33, keca20, keca12, keca02, keca44, keca31, keca23, keca10 -#define P12 keca00, keca04, keca03, keca02, keca01, keca40, keca44, keca43, keca42, keca41, keca30, keca34, \ - keca33, keca32, keca31, keca20, keca24, keca23, keca22, keca21, keca10, keca14, keca13, keca12, keca11 -#define P13 keca00, keca20, keca40, keca10, keca30, keca44, keca14, keca34, keca04, keca24, keca33, keca03, \ - keca23, keca43, keca13, keca22, keca42, keca12, keca32, keca02, keca11, keca31, keca01, keca21, keca41 -#define P14 keca00, keca22, keca44, keca11, keca33, keca14, keca31, keca03, keca20, keca42, keca23, keca40, \ - keca12, keca34, keca01, keca32, keca04, keca21, keca43, keca10, keca41, keca13, keca30, keca02, keca24 -#define P15 keca00, keca32, keca14, keca41, keca23, 
keca31, keca13, keca40, keca22, keca04, keca12, keca44, \ - keca21, keca03, keca30, keca43, keca20, keca02, keca34, keca11, keca24, keca01, keca33, keca10, keca42 -#define P16 keca00, keca43, keca31, keca24, keca12, keca13, keca01, keca44, keca32, keca20, keca21, keca14, \ - keca02, keca40, keca33, keca34, keca22, keca10, keca03, keca41, keca42, keca30, keca23, keca11, keca04 -#define P17 keca00, keca34, keca13, keca42, keca21, keca01, keca30, keca14, keca43, keca22, keca02, keca31, \ - keca10, keca44, keca23, keca03, keca32, keca11, keca40, keca24, keca04, keca33, keca12, keca41, keca20 -#define P18 keca00, keca03, keca01, keca04, keca02, keca30, keca33, keca31, keca34, keca32, keca10, keca13, \ - keca11, keca14, keca12, keca40, keca43, keca41, keca44, keca42, keca20, keca23, keca21, keca24, keca22 -#define P19 keca00, keca40, keca30, keca20, keca10, keca33, keca23, keca13, keca03, keca43, keca11, keca01, \ - keca41, keca31, keca21, keca44, keca34, keca24, keca14, keca04, keca22, keca12, keca02, keca42, keca32 -#define P20 keca00, keca44, keca33, keca22, keca11, keca23, keca12, keca01, keca40, keca34, keca41, keca30, \ - keca24, keca13, keca02, keca14, keca03, keca42, keca31, keca20, keca32, keca21, keca10, keca04, keca43 -#define P21 keca00, keca14, keca23, keca32, keca41, keca12, keca21, keca30, keca44, keca03, keca24, keca33, \ - keca42, keca01, keca10, keca31, keca40, keca04, keca13, keca22, keca43, keca02, keca11, keca20, keca34 -#define P22 keca00, keca31, keca12, keca43, keca24, keca21, keca02, keca33, keca14, keca40, keca42, keca23, \ - keca04, keca30, keca11, keca13, keca44, keca20, keca01, keca32, keca34, keca10, keca41, keca22, keca03 -#define P23 keca00, keca13, keca21, keca34, keca42, keca02, keca10, keca23, keca31, keca44, keca04, keca12, \ - keca20, keca33, keca41, keca01, keca14, keca22, keca30, keca43, keca03, keca11, keca24, keca32, keca40 - -#define P1_TO_P0 do { \ - kekDECL64(t); \ - MOV64(t, keca01); \ - MOV64(keca01, keca30); \ - 
MOV64(keca30, keca33); \ - MOV64(keca33, keca23); \ - MOV64(keca23, keca12); \ - MOV64(keca12, keca21); \ - MOV64(keca21, keca02); \ - MOV64(keca02, keca10); \ - MOV64(keca10, keca11); \ - MOV64(keca11, keca41); \ - MOV64(keca41, keca24); \ - MOV64(keca24, keca42); \ - MOV64(keca42, keca04); \ - MOV64(keca04, keca20); \ - MOV64(keca20, keca22); \ - MOV64(keca22, keca32); \ - MOV64(keca32, keca43); \ - MOV64(keca43, keca34); \ - MOV64(keca34, keca03); \ - MOV64(keca03, keca40); \ - MOV64(keca40, keca44); \ - MOV64(keca44, keca14); \ - MOV64(keca14, keca31); \ - MOV64(keca31, keca13); \ - MOV64(keca13, t); \ - } while (0) - -#define P2_TO_P0 do { \ - kekDECL64(t); \ - MOV64(t, keca01); \ - MOV64(keca01, keca33); \ - MOV64(keca33, keca12); \ - MOV64(keca12, keca02); \ - MOV64(keca02, keca11); \ - MOV64(keca11, keca24); \ - MOV64(keca24, keca04); \ - MOV64(keca04, keca22); \ - MOV64(keca22, keca43); \ - MOV64(keca43, keca03); \ - MOV64(keca03, keca44); \ - MOV64(keca44, keca31); \ - MOV64(keca31, t); \ - MOV64(t, keca10); \ - MOV64(keca10, keca41); \ - MOV64(keca41, keca42); \ - MOV64(keca42, keca20); \ - MOV64(keca20, keca32); \ - MOV64(keca32, keca34); \ - MOV64(keca34, keca40); \ - MOV64(keca40, keca14); \ - MOV64(keca14, keca13); \ - MOV64(keca13, keca30); \ - MOV64(keca30, keca23); \ - MOV64(keca23, keca21); \ - MOV64(keca21, t); \ - } while (0) - -#define P4_TO_P0 do { \ - kekDECL64(t); \ - MOV64(t, keca01); \ - MOV64(keca01, keca12); \ - MOV64(keca12, keca11); \ - MOV64(keca11, keca04); \ - MOV64(keca04, keca43); \ - MOV64(keca43, keca44); \ - MOV64(keca44, t); \ - MOV64(t, keca02); \ - MOV64(keca02, keca24); \ - MOV64(keca24, keca22); \ - MOV64(keca22, keca03); \ - MOV64(keca03, keca31); \ - MOV64(keca31, keca33); \ - MOV64(keca33, t); \ - MOV64(t, keca10); \ - MOV64(keca10, keca42); \ - MOV64(keca42, keca32); \ - MOV64(keca32, keca40); \ - MOV64(keca40, keca13); \ - MOV64(keca13, keca23); \ - MOV64(keca23, t); \ - MOV64(t, keca14); \ - MOV64(keca14, keca30); \ 
- MOV64(keca30, keca21); \ - MOV64(keca21, keca41); \ - MOV64(keca41, keca20); \ - MOV64(keca20, keca34); \ - MOV64(keca34, t); \ - } while (0) - -#define P6_TO_P0 do { \ - kekDECL64(t); \ - MOV64(t, keca01); \ - MOV64(keca01, keca02); \ - MOV64(keca02, keca04); \ - MOV64(keca04, keca03); \ - MOV64(keca03, t); \ - MOV64(t, keca10); \ - MOV64(keca10, keca20); \ - MOV64(keca20, keca40); \ - MOV64(keca40, keca30); \ - MOV64(keca30, t); \ - MOV64(t, keca11); \ - MOV64(keca11, keca22); \ - MOV64(keca22, keca44); \ - MOV64(keca44, keca33); \ - MOV64(keca33, t); \ - MOV64(t, keca12); \ - MOV64(keca12, keca24); \ - MOV64(keca24, keca43); \ - MOV64(keca43, keca31); \ - MOV64(keca31, t); \ - MOV64(t, keca13); \ - MOV64(keca13, keca21); \ - MOV64(keca21, keca42); \ - MOV64(keca42, keca34); \ - MOV64(keca34, t); \ - MOV64(t, keca14); \ - MOV64(keca14, keca23); \ - MOV64(keca23, keca41); \ - MOV64(keca41, keca32); \ - MOV64(keca32, t); \ - } while (0) - -#define P8_TO_P0 do { \ - kekDECL64(t); \ - MOV64(t, keca01); \ - MOV64(keca01, keca11); \ - MOV64(keca11, keca43); \ - MOV64(keca43, t); \ - MOV64(t, keca02); \ - MOV64(keca02, keca22); \ - MOV64(keca22, keca31); \ - MOV64(keca31, t); \ - MOV64(t, keca03); \ - MOV64(keca03, keca33); \ - MOV64(keca33, keca24); \ - MOV64(keca24, t); \ - MOV64(t, keca04); \ - MOV64(keca04, keca44); \ - MOV64(keca44, keca12); \ - MOV64(keca12, t); \ - MOV64(t, keca10); \ - MOV64(keca10, keca32); \ - MOV64(keca32, keca13); \ - MOV64(keca13, t); \ - MOV64(t, keca14); \ - MOV64(keca14, keca21); \ - MOV64(keca21, keca20); \ - MOV64(keca20, t); \ - MOV64(t, keca23); \ - MOV64(keca23, keca42); \ - MOV64(keca42, keca40); \ - MOV64(keca40, t); \ - MOV64(t, keca30); \ - MOV64(keca30, keca41); \ - MOV64(keca41, keca34); \ - MOV64(keca34, t); \ - } while (0) - -#define P12_TO_P0 do { \ - kekDECL64(t); \ - MOV64(t, keca01); \ - MOV64(keca01, keca04); \ - MOV64(keca04, t); \ - MOV64(t, keca02); \ - MOV64(keca02, keca03); \ - MOV64(keca03, t); \ - MOV64(t, 
keca10); \ - MOV64(keca10, keca40); \ - MOV64(keca40, t); \ - MOV64(t, keca11); \ - MOV64(keca11, keca44); \ - MOV64(keca44, t); \ - MOV64(t, keca12); \ - MOV64(keca12, keca43); \ - MOV64(keca43, t); \ - MOV64(t, keca13); \ - MOV64(keca13, keca42); \ - MOV64(keca42, t); \ - MOV64(t, keca14); \ - MOV64(keca14, keca41); \ - MOV64(keca41, t); \ - MOV64(t, keca20); \ - MOV64(keca20, keca30); \ - MOV64(keca30, t); \ - MOV64(t, keca21); \ - MOV64(keca21, keca34); \ - MOV64(keca34, t); \ - MOV64(t, keca22); \ - MOV64(keca22, keca33); \ - MOV64(keca33, t); \ - MOV64(t, keca23); \ - MOV64(keca23, keca32); \ - MOV64(keca32, t); \ - MOV64(t, keca24); \ - MOV64(keca24, keca31); \ - MOV64(keca31, t); \ - } while (0) - -#define LPAR ( -#define RPAR ) - -#define KF_ELT(r, s, k) do { \ - THETA LPAR P ## r RPAR; \ - RHO LPAR P ## r RPAR; \ - KHI LPAR P ## s RPAR; \ - IOTA(k); \ - } while (0) - -#define DO(x) x - -#define KECCAK_F_1600 DO(KECCAK_F_1600_) - -/* - * removed loop unrolling - * tested faster saving space -*/ -#define KECCAK_F_1600_ do { \ -static const sph_u64 RC[] = { \ - SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), \ - SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), \ - SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001), \ - SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009), \ - SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088), \ - SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A), \ - SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B), \ - SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003), \ - SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080), \ - SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A), \ - SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), \ - SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) \ -}; \ - int j; \ - for (j = 0; j < 24; j += 4) { \ - KF_ELT( 0, 1, RC[j + 0]); \ - KF_ELT( 1, 2, RC[j + 1]); \ - KF_ELT( 2, 3, RC[j + 2]); \ - KF_ELT( 3, 4, RC[j + 3]); \ - 
P4_TO_P0; \ - } \ - } while (0) - -/* - KF_ELT( 0, 1, RC[j + 0]); \ - KF_ELT( 1, 2, RC[j + 1]); \ - KF_ELT( 2, 3, RC[j + 2]); \ - KF_ELT( 3, 4, RC[j + 3]); \ - KF_ELT( 4, 5, RC[j + 4]); \ - KF_ELT( 5, 6, RC[j + 5]); \ - KF_ELT( 6, 7, RC[j + 6]); \ - KF_ELT( 7, 8, RC[j + 7]); \ - kekDECL_STATE \ -*/ -#define DECL_KEC - - -/* - sph_u64 keca00, keca01, keca02, keca03, keca04; \ - sph_u64 keca10, keca11, keca12, keca13, keca14; \ - sph_u64 keca20, keca21, keca22, keca23, keca24; \ - sph_u64 keca30, keca31, keca32, keca33, keca34; \ - sph_u64 keca40, keca41, keca42, keca43, keca44; -*/ - -/* load initial constants */ -#define KEC_I - -//static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; -/* - unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; \ -*/ - -/* load hash for loop */ -#define KEC_U \ -do { \ -static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; \ - /*memcpy(hashbuf, hash, 64); */ \ - memcpy(hash + 64, keczword, 8); \ -} while (0); - -/* keccak512 hash loaded */ -/* hash = keccak512(loaded */ - -#define KEC_C \ -do { \ - kekDECL_STATE \ - unsigned char *buf = hash; \ - /*BEGIN CORE */ \ - kecINIT_STATE(); \ - KECCAK_F_1600; \ - /*END CORE */ \ - /* Finalize the "lane complement" */ \ - sph_enc64le_aligned((unsigned char*)(hash) + 0, keca00); \ - sph_enc64le_aligned((unsigned char*)(hash) + 8, ~keca10); \ - sph_enc64le_aligned((unsigned char*)(hash) + 16, ~keca20); \ - sph_enc64le_aligned((unsigned char*)(hash) + 24, keca30); \ - sph_enc64le_aligned((unsigned char*)(hash) + 32, keca40); \ - sph_enc64le_aligned((unsigned char*)(hash) + 40, keca01); \ - sph_enc64le_aligned((unsigned char*)(hash) + 48, keca11); \ - sph_enc64le_aligned((unsigned char*)(hash) + 56, keca21); \ -} while (0); - -#ifdef __cplusplus -} -#endif diff --git a/algo/keccak/sse2/sph_keccak.h b/algo/keccak/sse2/sph_keccak.h deleted file mode 100644 index b66d6d4..0000000 --- 
a/algo/keccak/sse2/sph_keccak.h +++ /dev/null @@ -1,102 +0,0 @@ -/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */ -/** - * Keccak interface. This is the interface for Keccak with the - * recommended parameters for SHA-3, with output lengths 224, 256, - * 384 and 512 bits. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_keccak.h - * @author Thomas Pornin - */ - -#ifndef SPH_KECCAK_H__ -#define SPH_KECCAK_H__ - -#ifdef __cplusplus -extern "C"{ -#endif - -#include -#include "algo/sha/sph_types.h" - -#define QSTATIC static - -/** - * Output size (in bits) for Keccak-512. 
- */ -#define SPH_SIZE_keccak512 512 - -/** - * This structure is a context for Keccak computations: it contains the - * intermediate values and some data from the last entered block. Once a - * Keccak computation has been performed, the context can be reused for - * another computation. - * - * The contents of this structure are private. A running Keccak computation - * can be cloned by copying the context (e.g. with a simple - * memcpy()). - */ -/** - * Type for a Keccak-512 context (identical to the common context). - */ - -/** - * Initialize a Keccak-512 context. This process performs no memory allocation. - * - * @param cc the Keccak-512 context (pointer to a - * sph_keccak512_context) - */ - -/** - * Terminate the current Keccak-512 computation and output the result into - * the provided buffer. The destination buffer must be wide enough to - * accomodate the result (64 bytes). The context is automatically - * reinitialized. - * - * @param cc the Keccak-512 context - * @param dst the destination buffer - */ - -/** - * Add a few additional bits (0 to 7) to the current computation, then - * terminate it and output the result in the provided buffer, which must - * be wide enough to accomodate the result (64 bytes). If bit number i - * in ub has value 2^i, then the extra bits are those - * numbered 7 downto 8-n (this is the big-endian convention at the byte - * level). The context is automatically reinitialized. - * - * @param cc the Keccak-512 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the destination buffer - */ - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/algo/luffa/luffa-hash-2way.c b/algo/luffa/luffa-hash-2way.c deleted file mode 100644 index 3f22423..0000000 --- a/algo/luffa/luffa-hash-2way.c +++ /dev/null @@ -1,583 +0,0 @@ -/* - * luffa_for_sse2.c - * Version 2.0 (Sep 15th 2009) - * - * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved. - * - * Hitachi, Ltd. 
is the owner of this software and hereby grant - * the U.S. Government and any interested party the right to use - * this software for the purposes of the SHA-3 evaluation process, - * notwithstanding that this software is copyrighted. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include "luffa-hash-2way.h" - -#if defined(__AVX2__) - -#include "simd-utils.h" - -#define MASK _mm256_set_epi32( 0UL, 0UL, 0UL, 0xffffffffUL, \ - 0UL, 0UL, 0UL, 0xffffffffUL ) - -#define ADD_CONSTANT(a,b,c0,c1)\ - a = _mm256_xor_si256(a,c0);\ - b = _mm256_xor_si256(b,c1);\ - -#define MULT2(a0,a1) \ -do { \ - register __m256i b = _mm256_xor_si256( a0, \ - _mm256_shuffle_epi32( _mm256_and_si256(a1,MASK), 16 ) ); \ - a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \ - a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \ -} while(0) - -// confirm pointer arithmetic -// ok but use array indexes -#define STEP_PART(x,c,t)\ - SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\ - SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\ - MIXWORD(*x,*(x+4),*t,*(t+1));\ - MIXWORD(*(x+1),*(x+5),*t,*(t+1));\ - MIXWORD(*(x+2),*(x+6),*t,*(t+1));\ - MIXWORD(*(x+3),*(x+7),*t,*(t+1));\ - ADD_CONSTANT(*x, *(x+4), *c, *(c+1)); - -#define SUBCRUMB(a0,a1,a2,a3,t)\ - t = _mm256_load_si256(&a0);\ - a0 = _mm256_or_si256(a0,a1);\ - a2 = _mm256_xor_si256(a2,a3);\ - a1 = _mm256_andnot_si256(a1, m256_neg1 );\ - a0 = _mm256_xor_si256(a0,a3);\ - a3 = _mm256_and_si256(a3,t);\ - a1 = 
_mm256_xor_si256(a1,a3);\ - a3 = _mm256_xor_si256(a3,a2);\ - a2 = _mm256_and_si256(a2,a0);\ - a0 = _mm256_andnot_si256(a0, m256_neg1 );\ - a2 = _mm256_xor_si256(a2,a1);\ - a1 = _mm256_or_si256(a1,a3);\ - t = _mm256_xor_si256(t,a1);\ - a3 = _mm256_xor_si256(a3,a2);\ - a2 = _mm256_and_si256(a2,a1);\ - a1 = _mm256_xor_si256(a1,a0);\ - a0 = _mm256_load_si256(&t);\ - -#define MIXWORD(a,b,t1,t2)\ - b = _mm256_xor_si256(a,b);\ - t1 = _mm256_slli_epi32(a,2);\ - t2 = _mm256_srli_epi32(a,30);\ - a = _mm256_or_si256(t1,t2);\ - a = _mm256_xor_si256(a,b);\ - t1 = _mm256_slli_epi32(b,14);\ - t2 = _mm256_srli_epi32(b,18);\ - b = _mm256_or_si256(t1,t2);\ - b = _mm256_xor_si256(a,b);\ - t1 = _mm256_slli_epi32(a,10);\ - t2 = _mm256_srli_epi32(a,22);\ - a = _mm256_or_si256(t1,t2);\ - a = _mm256_xor_si256(a,b);\ - t1 = _mm256_slli_epi32(b,1);\ - t2 = _mm256_srli_epi32(b,31);\ - b = _mm256_or_si256(t1,t2); - -#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\ - a1 = _mm256_shuffle_epi32(a1,147);\ - t0 = _mm256_load_si256(&a1);\ - a1 = _mm256_unpacklo_epi32(a1,a0);\ - t0 = _mm256_unpackhi_epi32(t0,a0);\ - t1 = _mm256_shuffle_epi32(t0,78);\ - a0 = _mm256_shuffle_epi32(a1,78);\ - SUBCRUMB(t1,t0,a0,a1,tmp0);\ - t0 = _mm256_unpacklo_epi32(t0,t1);\ - a1 = _mm256_unpacklo_epi32(a1,a0);\ - a0 = _mm256_load_si256(&a1);\ - a0 = _mm256_unpackhi_epi64(a0,t0);\ - a1 = _mm256_unpacklo_epi64(a1,t0);\ - a1 = _mm256_shuffle_epi32(a1,57);\ - MIXWORD(a0,a1,tmp0,tmp1);\ - ADD_CONSTANT(a0,a1,c0,c1); - -#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\ - s2 = _mm256_load_si256(&r1);\ - q2 = _mm256_load_si256(&p1);\ - r2 = _mm256_shuffle_epi32(r2,216);\ - p2 = _mm256_shuffle_epi32(p2,216);\ - r1 = _mm256_unpacklo_epi32(r1,r0);\ - p1 = _mm256_unpacklo_epi32(p1,p0);\ - s2 = _mm256_unpackhi_epi32(s2,r0);\ - q2 = _mm256_unpackhi_epi32(q2,p0);\ - s0 = _mm256_load_si256(&r2);\ - q0 = _mm256_load_si256(&p2);\ - r2 = _mm256_unpacklo_epi64(r2,r1);\ - p2 = _mm256_unpacklo_epi64(p2,p1);\ - s1 = 
_mm256_load_si256(&s0);\ - q1 = _mm256_load_si256(&q0);\ - s0 = _mm256_unpackhi_epi64(s0,r1);\ - q0 = _mm256_unpackhi_epi64(q0,p1);\ - r2 = _mm256_shuffle_epi32(r2,225);\ - p2 = _mm256_shuffle_epi32(p2,225);\ - r0 = _mm256_load_si256(&s1);\ - p0 = _mm256_load_si256(&q1);\ - s0 = _mm256_shuffle_epi32(s0,225);\ - q0 = _mm256_shuffle_epi32(q0,225);\ - s1 = _mm256_unpacklo_epi64(s1,s2);\ - q1 = _mm256_unpacklo_epi64(q1,q2);\ - r0 = _mm256_unpackhi_epi64(r0,s2);\ - p0 = _mm256_unpackhi_epi64(p0,q2);\ - s2 = _mm256_load_si256(&r0);\ - q2 = _mm256_load_si256(&p0);\ - s3 = _mm256_load_si256(&r2);\ - q3 = _mm256_load_si256(&p2);\ - -#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\ - s0 = _mm256_load_si256(&r0);\ - q0 = _mm256_load_si256(&p0);\ - s1 = _mm256_load_si256(&r2);\ - q1 = _mm256_load_si256(&p2);\ - r0 = _mm256_unpackhi_epi32(r0,r1);\ - p0 = _mm256_unpackhi_epi32(p0,p1);\ - r2 = _mm256_unpackhi_epi32(r2,r3);\ - p2 = _mm256_unpackhi_epi32(p2,p3);\ - s0 = _mm256_unpacklo_epi32(s0,r1);\ - q0 = _mm256_unpacklo_epi32(q0,p1);\ - s1 = _mm256_unpacklo_epi32(s1,r3);\ - q1 = _mm256_unpacklo_epi32(q1,p3);\ - r1 = _mm256_load_si256(&r0);\ - p1 = _mm256_load_si256(&p0);\ - r0 = _mm256_unpackhi_epi64(r0,r2);\ - p0 = _mm256_unpackhi_epi64(p0,p2);\ - s0 = _mm256_unpackhi_epi64(s0,s1);\ - q0 = _mm256_unpackhi_epi64(q0,q1);\ - r1 = _mm256_unpacklo_epi64(r1,r2);\ - p1 = _mm256_unpacklo_epi64(p1,p2);\ - s2 = _mm256_load_si256(&r0);\ - q2 = _mm256_load_si256(&p0);\ - s1 = _mm256_load_si256(&r1);\ - q1 = _mm256_load_si256(&p1);\ - -#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\ - s1 = _mm256_load_si256(&r3);\ - q1 = _mm256_load_si256(&p3);\ - s3 = _mm256_load_si256(&r3);\ - q3 = _mm256_load_si256(&p3);\ - s1 = _mm256_unpackhi_epi32(s1,r2);\ - q1 = _mm256_unpackhi_epi32(q1,p2);\ - s3 = _mm256_unpacklo_epi32(s3,r2);\ - q3 = _mm256_unpacklo_epi32(q3,p2);\ - s0 = _mm256_load_si256(&s1);\ - q0 = _mm256_load_si256(&q1);\ - s2 = _mm256_load_si256(&s3);\ - q2 
= _mm256_load_si256(&q3);\ - r3 = _mm256_load_si256(&r1);\ - p3 = _mm256_load_si256(&p1);\ - r1 = _mm256_unpacklo_epi32(r1,r0);\ - p1 = _mm256_unpacklo_epi32(p1,p0);\ - r3 = _mm256_unpackhi_epi32(r3,r0);\ - p3 = _mm256_unpackhi_epi32(p3,p0);\ - s0 = _mm256_unpackhi_epi64(s0,r3);\ - q0 = _mm256_unpackhi_epi64(q0,p3);\ - s1 = _mm256_unpacklo_epi64(s1,r3);\ - q1 = _mm256_unpacklo_epi64(q1,p3);\ - s2 = _mm256_unpackhi_epi64(s2,r1);\ - q2 = _mm256_unpackhi_epi64(q2,p1);\ - s3 = _mm256_unpacklo_epi64(s3,r1);\ - q3 = _mm256_unpacklo_epi64(q3,p1); - -#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\ - NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3); - -/* initial values of chaining variables */ -static const uint32 IV[40] __attribute((aligned(32))) = { - 0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69, - 0xdef610bb,0xee058139,0x90152df4,0x6e292011, - 0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95, - 0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557, - 0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d, - 0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f, - 0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5, - 0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a, - 0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be, - 0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999 -}; - -/* Round Constants */ -static const uint32 CNS_INIT[128] __attribute((aligned(32))) = { - 0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6, - 0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818, - 0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299, - 0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d, - 0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12, - 0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442, - 0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e, - 0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f, - 0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f, - 0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6, - 0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d, - 0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4, - 0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882, - 
0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7, - 0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12, - 0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d, - 0x00000000,0x00000000,0x00000000,0xf0d2e9e3, - 0x00000000,0x00000000,0x00000000,0x5090d577, - 0x00000000,0x00000000,0x00000000,0xac11d7fa, - 0x00000000,0x00000000,0x00000000,0x2d1925ab, - 0x00000000,0x00000000,0x00000000,0x1bcb66f2, - 0x00000000,0x00000000,0x00000000,0xb46496ac, - 0x00000000,0x00000000,0x00000000,0x6f2d9bc9, - 0x00000000,0x00000000,0x00000000,0xd1925ab0, - 0x00000000,0x00000000,0x00000000,0x78602649, - 0x00000000,0x00000000,0x00000000,0x29131ab6, - 0x00000000,0x00000000,0x00000000,0x8edae952, - 0x00000000,0x00000000,0x00000000,0x0fc053c3, - 0x00000000,0x00000000,0x00000000,0x3b6ba548, - 0x00000000,0x00000000,0x00000000,0x3f014f0c, - 0x00000000,0x00000000,0x00000000,0xedae9520, - 0x00000000,0x00000000,0x00000000,0xfc053c31 -}; - -__m256i CNS[32]; - -/***************************************************/ -/* Round function */ -/* state: hash context */ - -void rnd512_2way( luffa_2way_context *state, __m256i *msg ) -{ - __m256i t0, t1; - __m256i *chainv = state->chainv; - __m256i msg0, msg1; - __m256i tmp[2]; - __m256i x[8]; - - t0 = chainv[0]; - t1 = chainv[1]; - - t0 = _mm256_xor_si256( t0, chainv[2] ); - t1 = _mm256_xor_si256( t1, chainv[3] ); - t0 = _mm256_xor_si256( t0, chainv[4] ); - t1 = _mm256_xor_si256( t1, chainv[5] ); - t0 = _mm256_xor_si256( t0, chainv[6] ); - t1 = _mm256_xor_si256( t1, chainv[7] ); - t0 = _mm256_xor_si256( t0, chainv[8] ); - t1 = _mm256_xor_si256( t1, chainv[9] ); - - MULT2( t0, t1 ); - - msg0 = _mm256_shuffle_epi32( msg[0], 27 ); - msg1 = _mm256_shuffle_epi32( msg[1], 27 ); - - chainv[0] = _mm256_xor_si256( chainv[0], t0 ); - chainv[1] = _mm256_xor_si256( chainv[1], t1 ); - chainv[2] = _mm256_xor_si256( chainv[2], t0 ); - chainv[3] = _mm256_xor_si256( chainv[3], t1 ); - chainv[4] = _mm256_xor_si256( chainv[4], t0 ); - chainv[5] = _mm256_xor_si256( chainv[5], t1 ); - chainv[6] = 
_mm256_xor_si256( chainv[6], t0 ); - chainv[7] = _mm256_xor_si256( chainv[7], t1 ); - chainv[8] = _mm256_xor_si256( chainv[8], t0 ); - chainv[9] = _mm256_xor_si256( chainv[9], t1 ); - - t0 = chainv[0]; - t1 = chainv[1]; - - MULT2( chainv[0], chainv[1]); - chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] ); - chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] ); - - MULT2( chainv[2], chainv[3]); - chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]); - chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]); - - MULT2( chainv[4], chainv[5]); - chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]); - chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]); - - MULT2( chainv[6], chainv[7]); - chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]); - chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]); - - MULT2( chainv[8], chainv[9]); - chainv[8] = _mm256_xor_si256( chainv[8], t0 ); - chainv[9] = _mm256_xor_si256( chainv[9], t1 ); - - t0 = chainv[8]; - t1 = chainv[9]; - - MULT2( chainv[8], chainv[9]); - chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] ); - chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] ); - - MULT2( chainv[6], chainv[7]); - chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] ); - chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] ); - - MULT2( chainv[4], chainv[5]); - chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] ); - chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] ); - - MULT2( chainv[2], chainv[3] ); - chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] ); - chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] ); - - MULT2( chainv[0], chainv[1] ); - chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 ); - chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 ); - - MULT2( msg0, msg1); - chainv[2] = _mm256_xor_si256( chainv[2], msg0 ); - chainv[3] = _mm256_xor_si256( chainv[3], msg1 ); - - MULT2( msg0, msg1); - chainv[4] = _mm256_xor_si256( chainv[4], msg0 ); - chainv[5] = _mm256_xor_si256( chainv[5], msg1 ); - - MULT2( 
msg0, msg1); - chainv[6] = _mm256_xor_si256( chainv[6], msg0 ); - chainv[7] = _mm256_xor_si256( chainv[7], msg1 ); - - MULT2( msg0, msg1); - chainv[8] = _mm256_xor_si256( chainv[8], msg0 ); - chainv[9] = _mm256_xor_si256( chainv[9], msg1 ); - - MULT2( msg0, msg1); - - chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3], 1 ), - _mm256_srli_epi32( chainv[3], 31 ) ); - chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5], 2 ), - _mm256_srli_epi32( chainv[5], 30 ) ); - chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7], 3 ), - _mm256_srli_epi32( chainv[7], 29 ) ); - chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9], 4 ), - _mm256_srli_epi32( chainv[9], 28 ) ); - - NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6], - x[0], x[1], x[2], x[3], - chainv[1],chainv[3],chainv[5],chainv[7], - x[4], x[5], x[6], x[7] ); - - STEP_PART( &x[0], &CNS[ 0], &tmp[0] ); - STEP_PART( &x[0], &CNS[ 2], &tmp[0] ); - STEP_PART( &x[0], &CNS[ 4], &tmp[0] ); - STEP_PART( &x[0], &CNS[ 6], &tmp[0] ); - STEP_PART( &x[0], &CNS[ 8], &tmp[0] ); - STEP_PART( &x[0], &CNS[10], &tmp[0] ); - STEP_PART( &x[0], &CNS[12], &tmp[0] ); - STEP_PART( &x[0], &CNS[14], &tmp[0] ); - - MIXTON1024( x[0], x[1], x[2], x[3], - chainv[0], chainv[2], chainv[4],chainv[6], - x[4], x[5], x[6], x[7], - chainv[1],chainv[3],chainv[5],chainv[7]); - - /* Process last 256-bit block */ - STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[16], CNS[17], - tmp[0], tmp[1] ); - STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[18], CNS[19], - tmp[0], tmp[1] ); - STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[20], CNS[21], - tmp[0], tmp[1] ); - STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[22], CNS[23], - tmp[0], tmp[1] ); - STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[24], CNS[25], - tmp[0], tmp[1] ); - STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[26], CNS[27], - tmp[0], tmp[1] ); - STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[28], CNS[29], - tmp[0], tmp[1] ); - STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[30], 
CNS[31], - tmp[0], tmp[1] ); -} - - -/***************************************************/ -/* Finalization function */ -/* state: hash context */ -/* b[8]: hash values */ - -void finalization512_2way( luffa_2way_context *state, uint32 *b ) -{ - uint32 hash[8] __attribute((aligned(64))); - __m256i* chainv = state->chainv; - __m256i t[2]; - __m256i zero[2]; - zero[0] = zero[1] = _mm256_setzero_si256(); - - /*---- blank round with m=0 ----*/ - rnd512_2way( state, zero ); - - t[0] = chainv[0]; - t[1] = chainv[1]; - - t[0] = _mm256_xor_si256( t[0], chainv[2] ); - t[1] = _mm256_xor_si256( t[1], chainv[3] ); - t[0] = _mm256_xor_si256( t[0], chainv[4] ); - t[1] = _mm256_xor_si256( t[1], chainv[5] ); - t[0] = _mm256_xor_si256( t[0], chainv[6] ); - t[1] = _mm256_xor_si256( t[1], chainv[7] ); - t[0] = _mm256_xor_si256( t[0], chainv[8] ); - t[1] = _mm256_xor_si256( t[1], chainv[9] ); - - t[0] = _mm256_shuffle_epi32( t[0], 27 ); - t[1] = _mm256_shuffle_epi32( t[1], 27 ); - - _mm256_store_si256( (__m256i*)&hash[0], t[0] ); - _mm256_store_si256( (__m256i*)&hash[8], t[1] ); - - casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); - casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) ); - - rnd512_2way( state, zero ); - - t[0] = chainv[0]; - t[1] = chainv[1]; - t[0] = _mm256_xor_si256( t[0], chainv[2] ); - t[1] = _mm256_xor_si256( t[1], chainv[3] ); - t[0] = _mm256_xor_si256( t[0], chainv[4] ); - t[1] = _mm256_xor_si256( t[1], chainv[5] ); - t[0] = _mm256_xor_si256( t[0], chainv[6] ); - t[1] = _mm256_xor_si256( t[1], chainv[7] ); - t[0] = _mm256_xor_si256( t[0], chainv[8] ); - t[1] = _mm256_xor_si256( t[1], chainv[9] ); - - t[0] = _mm256_shuffle_epi32( t[0], 27 ); - t[1] = _mm256_shuffle_epi32( t[1], 27 ); - - _mm256_store_si256( (__m256i*)&hash[0], t[0] ); - _mm256_store_si256( (__m256i*)&hash[8], t[1] ); - - casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); - casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) ); -} - -int 
luffa_2way_init( luffa_2way_context *state, int hashbitlen ) -{ - int i; - state->hashbitlen = hashbitlen; - - for ( i=0; i<32; i++ ) CNS[i] = - _mm256_set_epi32( CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ], - CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2) ], - CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ], - CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2) ] ); - - for ( i=0; i<10; i++ ) state->chainv[i] = - _mm256_set_epi32( IV[ (i<<2) +3 ], IV[ (i<<2) +2 ], - IV[ (i<<2) +1 ], IV[ (i<<2) ], - IV[ (i<<2) +3 ], IV[ (i<<2) +2 ], - IV[ (i<<2) +1 ], IV[ (i<<2) ] ); - - ((__m256i*)state->buffer)[0] = m256_zero; - ((__m256i*)state->buffer)[1] = m256_zero; - - return 0; -} - -// Do not call luffa_update_close after having called luffa_update. -// Once luffa_update has been called only call luffa_update or luffa_close. -int luffa_2way_update( luffa_2way_context *state, const void *data, - size_t len ) -{ - __m256i *vdata = (__m256i*)data; - __m256i *buffer = (__m256i*)state->buffer; - __m256i msg[2]; - int i; - int blocks = (int)len >> 5; - state-> rembytes = (int)len & 0x1F; - - // full blocks - for ( i = 0; i < blocks; i++, vdata+=2 ) - { - msg[0] = mm256_bswap_32( vdata[ 0] ); - msg[1] = mm256_bswap_32( vdata[ 1 ] ); - rnd512_2way( state, msg ); - } - - // 16 byte partial block exists for 80 byte len - // store in buffer for transform in final for midstate to work - if ( state->rembytes ) - { - // remaining data bytes - buffer[0] = mm256_bswap_32( vdata[0] ); - buffer[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0, - 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ); - } - return 0; -} - -int luffa_2way_close( luffa_2way_context *state, void *hashval ) -{ - __m256i *buffer = (__m256i*)state->buffer; - __m256i msg[2]; - - // transform pad block - if ( state->rembytes ) - // not empty, data is in buffer - rnd512_2way( state, buffer ); - else - { // empty pad block, constant data - msg[0] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0, - 0,0,0,0, 0,0,0,0, 0,0,0,0, 
0x80,0,0,0 ); - msg[1] = m256_zero; - rnd512_2way( state, msg ); - } - finalization512_2way( state, (uint32*)hashval ); - - if ( state->hashbitlen > 512 ) - finalization512_2way( state, (uint32*)( hashval+32 ) ); - return 0; -} - -int luffa_2way_update_close( luffa_2way_context *state, - void *output, const void *data, size_t inlen ) -{ -// Optimized for integrals of 16 bytes, good for 64 and 80 byte len - const __m256i *vdata = (__m256i*)data; - __m256i msg[2]; - int i; - const int blocks = (int)( inlen >> 5 ); - state->rembytes = inlen & 0x1F; - - // full blocks - for ( i = 0; i < blocks; i++, vdata+=2 ) - { - msg[0] = mm256_bswap_32( vdata[ 0 ] ); - msg[1] = mm256_bswap_32( vdata[ 1 ] ); - rnd512_2way( state, msg ); - } - - // 16 byte partial block exists for 80 byte len - if ( state->rembytes ) - { - // padding of partial block - msg[0] = mm256_bswap_32( vdata[0] ); - msg[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0, - 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ); - rnd512_2way( state, msg ); - } - else - { - // empty pad block - msg[0] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0, - 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ); - msg[1] = m256_zero; - rnd512_2way( state, msg ); - } - - finalization512_2way( state, (uint32*)output ); - if ( state->hashbitlen > 512 ) - finalization512_2way( state, (uint32*)( output+32 ) ); - - return 0; -} - -#endif diff --git a/algo/luffa/luffa-hash-2way.h b/algo/luffa/luffa-hash-2way.h deleted file mode 100644 index fac3c89..0000000 --- a/algo/luffa/luffa-hash-2way.h +++ /dev/null @@ -1,69 +0,0 @@ -#if !defined(LUFFA_HASH_2WAY_H__) -#define LUFFA_HASH_2WAY_H__ 1 -/* - * luffa_for_sse2.h - * Version 2.0 (Sep 15th 2009) - * - * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved. - * - * Hitachi, Ltd. is the owner of this software and hereby grant - * the U.S. 
Government and any interested party the right to use - * this software for the purposes of the SHA-3 evaluation process, - * notwithstanding that this software is copyrighted. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#if defined(__AVX2__) - -#include -#include "algo/sha/sha3-defs.h" -#include "simd-utils.h" - -/* The length of digests*/ -#define DIGEST_BIT_LEN_224 224 -#define DIGEST_BIT_LEN_256 256 -#define DIGEST_BIT_LEN_384 384 -#define DIGEST_BIT_LEN_512 512 - -/*********************************/ -/* The parameters of Luffa */ -#define MSG_BLOCK_BIT_LEN 256 /*The bit length of a message block*/ -#define MSG_BLOCK_BYTE_LEN (MSG_BLOCK_BIT_LEN >> 3) /* The byte length - * of a message block*/ - -/* The number of blocks in Luffa */ -#define WIDTH_224 3 -#define WIDTH_256 3 -#define WIDTH_384 4 -#define WIDTH_512 5 - -/* The limit of the length of message */ -#define LIMIT_224 64 -#define LIMIT_256 64 -#define LIMIT_384 128 -#define LIMIT_512 128 -/*********************************/ - -typedef struct { - uint32 buffer[8*2] __attribute((aligned(64))); - __m256i chainv[10] __attribute((aligned(32))); /* Chaining values */ - int hashbitlen; - int rembytes; -} luffa_2way_context; - -int luffa_2way_init( luffa_2way_context *state, int hashbitlen ); -int luffa_2way_update( luffa_2way_context *state, const void *data, - size_t len ); -int luffa_2way_close( luffa_2way_context *state, void *hashval ); -int luffa_2way_update_close( luffa_2way_context *state, void *output, - const void *data, size_t 
inlen ); - -#endif -#endif diff --git a/algo/luffa/luffa.c b/algo/luffa/luffa.c deleted file mode 100644 index 7d68af6..0000000 --- a/algo/luffa/luffa.c +++ /dev/null @@ -1,63 +0,0 @@ -#include "algo-gate-api.h" - -#include -#include -#include -#include - -#include "sph_luffa.h" - -void luffahash(void *output, const void *input) -{ - unsigned char _ALIGN(128) hash[64]; - sph_luffa512_context ctx_luffa; - - sph_luffa512_init(&ctx_luffa); - sph_luffa512 (&ctx_luffa, input, 80); - sph_luffa512_close(&ctx_luffa, (void*) hash); - - memcpy(output, hash, 32); -} - -int scanhash_luffa(int thr_id, struct work *work, - uint32_t max_nonce, uint64_t *hashes_done) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - - uint32_t _ALIGN(64) hash64[8]; - uint32_t _ALIGN(64) endiandata[20]; - - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - - uint32_t n = first_nonce; - - for (int i=0; i < 19; i++) - be32enc(&endiandata[i], pdata[i]); - - do { - be32enc(&endiandata[19], n); - luffahash(hash64, endiandata); - if (hash64[7] < Htarg && fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return true; - } - n++; - - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - - return 0; -} - -bool register_luffa_algo( algo_gate_t* gate ) -{ - gate->scanhash = (void*)&scanhash_luffa; - gate->hash = (void*)&luffahash; - return true; -}; - diff --git a/algo/luffa/luffa_for_sse2.c b/algo/luffa/luffa_for_sse2.c deleted file mode 100644 index 10c0736..0000000 --- a/algo/luffa/luffa_for_sse2.c +++ /dev/null @@ -1,630 +0,0 @@ -/* - * luffa_for_sse2.c - * Version 2.0 (Sep 15th 2009) - * - * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved. - * - * Hitachi, Ltd. is the owner of this software and hereby grant - * the U.S. 
Government and any interested party the right to use - * this software for the purposes of the SHA-3 evaluation process, - * notwithstanding that this software is copyrighted. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include "simd-utils.h" -#include "luffa_for_sse2.h" - -#define MULT2(a0,a1) do \ -{ \ - __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) ); \ - a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) ); \ - a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) ); \ -} while(0) - -/* -static inline __m256i mult2_avx2( a ) -{ - __m128 a0, a0, b; - a0 = mm128_extractlo_256( a ); - a1 = mm128_extracthi_256( a ); - b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) ); - a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) ); - a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) ); - return mm256_concat_128( a1, a0 ); -} -*/ - -#define STEP_PART(x,c,t)\ - SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\ - SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\ - MIXWORD(*x,*(x+4),*t,*(t+1));\ - MIXWORD(*(x+1),*(x+5),*t,*(t+1));\ - MIXWORD(*(x+2),*(x+6),*t,*(t+1));\ - MIXWORD(*(x+3),*(x+7),*t,*(t+1));\ - ADD_CONSTANT(*x, *(x+4), *c, *(c+1)); - -#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\ - a1 = _mm_shuffle_epi32(a1,147);\ - t0 = _mm_load_si128(&a1);\ - a1 = _mm_unpacklo_epi32(a1,a0);\ - t0 = _mm_unpackhi_epi32(t0,a0);\ - t1 = _mm_shuffle_epi32(t0,78);\ - a0 = _mm_shuffle_epi32(a1,78);\ - 
SUBCRUMB(t1,t0,a0,a1,tmp0);\ - t0 = _mm_unpacklo_epi32(t0,t1);\ - a1 = _mm_unpacklo_epi32(a1,a0);\ - a0 = _mm_load_si128(&a1);\ - a0 = _mm_unpackhi_epi64(a0,t0);\ - a1 = _mm_unpacklo_epi64(a1,t0);\ - a1 = _mm_shuffle_epi32(a1,57);\ - MIXWORD(a0,a1,tmp0,tmp1);\ - ADD_CONSTANT(a0,a1,c0,c1); - -#define SUBCRUMB(a0,a1,a2,a3,t)\ - t = _mm_load_si128(&a0);\ - a0 = _mm_or_si128(a0,a1);\ - a2 = _mm_xor_si128(a2,a3);\ - a1 = _mm_andnot_si128(a1,ALLONE);\ - a0 = _mm_xor_si128(a0,a3);\ - a3 = _mm_and_si128(a3,t);\ - a1 = _mm_xor_si128(a1,a3);\ - a3 = _mm_xor_si128(a3,a2);\ - a2 = _mm_and_si128(a2,a0);\ - a0 = _mm_andnot_si128(a0,ALLONE);\ - a2 = _mm_xor_si128(a2,a1);\ - a1 = _mm_or_si128(a1,a3);\ - t = _mm_xor_si128(t,a1);\ - a3 = _mm_xor_si128(a3,a2);\ - a2 = _mm_and_si128(a2,a1);\ - a1 = _mm_xor_si128(a1,a0);\ - a0 = _mm_load_si128(&t);\ - -#define MIXWORD(a,b,t1,t2)\ - b = _mm_xor_si128(a,b);\ - t1 = _mm_slli_epi32(a,2);\ - t2 = _mm_srli_epi32(a,30);\ - a = _mm_or_si128(t1,t2);\ - a = _mm_xor_si128(a,b);\ - t1 = _mm_slli_epi32(b,14);\ - t2 = _mm_srli_epi32(b,18);\ - b = _mm_or_si128(t1,t2);\ - b = _mm_xor_si128(a,b);\ - t1 = _mm_slli_epi32(a,10);\ - t2 = _mm_srli_epi32(a,22);\ - a = _mm_or_si128(t1,t2);\ - a = _mm_xor_si128(a,b);\ - t1 = _mm_slli_epi32(b,1);\ - t2 = _mm_srli_epi32(b,31);\ - b = _mm_or_si128(t1,t2); - -#define ADD_CONSTANT(a,b,c0,c1)\ - a = _mm_xor_si128(a,c0);\ - b = _mm_xor_si128(b,c1);\ - -#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\ - s2 = _mm_load_si128(&r1);\ - q2 = _mm_load_si128(&p1);\ - r2 = _mm_shuffle_epi32(r2,216);\ - p2 = _mm_shuffle_epi32(p2,216);\ - r1 = _mm_unpacklo_epi32(r1,r0);\ - p1 = _mm_unpacklo_epi32(p1,p0);\ - s2 = _mm_unpackhi_epi32(s2,r0);\ - q2 = _mm_unpackhi_epi32(q2,p0);\ - s0 = _mm_load_si128(&r2);\ - q0 = _mm_load_si128(&p2);\ - r2 = _mm_unpacklo_epi64(r2,r1);\ - p2 = _mm_unpacklo_epi64(p2,p1);\ - s1 = _mm_load_si128(&s0);\ - q1 = _mm_load_si128(&q0);\ - s0 = _mm_unpackhi_epi64(s0,r1);\ - q0 = 
_mm_unpackhi_epi64(q0,p1);\ - r2 = _mm_shuffle_epi32(r2,225);\ - p2 = _mm_shuffle_epi32(p2,225);\ - r0 = _mm_load_si128(&s1);\ - p0 = _mm_load_si128(&q1);\ - s0 = _mm_shuffle_epi32(s0,225);\ - q0 = _mm_shuffle_epi32(q0,225);\ - s1 = _mm_unpacklo_epi64(s1,s2);\ - q1 = _mm_unpacklo_epi64(q1,q2);\ - r0 = _mm_unpackhi_epi64(r0,s2);\ - p0 = _mm_unpackhi_epi64(p0,q2);\ - s2 = _mm_load_si128(&r0);\ - q2 = _mm_load_si128(&p0);\ - s3 = _mm_load_si128(&r2);\ - q3 = _mm_load_si128(&p2);\ - -#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\ - s0 = _mm_load_si128(&r0);\ - q0 = _mm_load_si128(&p0);\ - s1 = _mm_load_si128(&r2);\ - q1 = _mm_load_si128(&p2);\ - r0 = _mm_unpackhi_epi32(r0,r1);\ - p0 = _mm_unpackhi_epi32(p0,p1);\ - r2 = _mm_unpackhi_epi32(r2,r3);\ - p2 = _mm_unpackhi_epi32(p2,p3);\ - s0 = _mm_unpacklo_epi32(s0,r1);\ - q0 = _mm_unpacklo_epi32(q0,p1);\ - s1 = _mm_unpacklo_epi32(s1,r3);\ - q1 = _mm_unpacklo_epi32(q1,p3);\ - r1 = _mm_load_si128(&r0);\ - p1 = _mm_load_si128(&p0);\ - r0 = _mm_unpackhi_epi64(r0,r2);\ - p0 = _mm_unpackhi_epi64(p0,p2);\ - s0 = _mm_unpackhi_epi64(s0,s1);\ - q0 = _mm_unpackhi_epi64(q0,q1);\ - r1 = _mm_unpacklo_epi64(r1,r2);\ - p1 = _mm_unpacklo_epi64(p1,p2);\ - s2 = _mm_load_si128(&r0);\ - q2 = _mm_load_si128(&p0);\ - s1 = _mm_load_si128(&r1);\ - q1 = _mm_load_si128(&p1);\ - -#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\ - s1 = _mm_load_si128(&r3);\ - q1 = _mm_load_si128(&p3);\ - s3 = _mm_load_si128(&r3);\ - q3 = _mm_load_si128(&p3);\ - s1 = _mm_unpackhi_epi32(s1,r2);\ - q1 = _mm_unpackhi_epi32(q1,p2);\ - s3 = _mm_unpacklo_epi32(s3,r2);\ - q3 = _mm_unpacklo_epi32(q3,p2);\ - s0 = _mm_load_si128(&s1);\ - q0 = _mm_load_si128(&q1);\ - s2 = _mm_load_si128(&s3);\ - q2 = _mm_load_si128(&q3);\ - r3 = _mm_load_si128(&r1);\ - p3 = _mm_load_si128(&p1);\ - r1 = _mm_unpacklo_epi32(r1,r0);\ - p1 = _mm_unpacklo_epi32(p1,p0);\ - r3 = _mm_unpackhi_epi32(r3,r0);\ - p3 = _mm_unpackhi_epi32(p3,p0);\ - s0 = 
_mm_unpackhi_epi64(s0,r3);\ - q0 = _mm_unpackhi_epi64(q0,p3);\ - s1 = _mm_unpacklo_epi64(s1,r3);\ - q1 = _mm_unpacklo_epi64(q1,p3);\ - s2 = _mm_unpackhi_epi64(s2,r1);\ - q2 = _mm_unpackhi_epi64(q2,p1);\ - s3 = _mm_unpacklo_epi64(s3,r1);\ - q3 = _mm_unpacklo_epi64(q3,p1); - -#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\ - NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3); - -static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 ); - -static void finalization512( hashState_luffa *state, uint32 *b ); - -/* initial values of chaining variables */ -static const uint32 IV[40] __attribute((aligned(16))) = { - 0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69, - 0xdef610bb,0xee058139,0x90152df4,0x6e292011, - 0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95, - 0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557, - 0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d, - 0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f, - 0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5, - 0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a, - 0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be, - 0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999 -}; - -/* Round Constants */ -static const uint32 CNS_INIT[128] __attribute((aligned(16))) = { - 0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6, - 0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818, - 0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299, - 0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d, - 0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12, - 0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442, - 0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e, - 0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f, - 0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f, - 0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6, - 0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d, - 0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4, - 0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882, - 0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7, - 0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12, - 0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d, - 
0x00000000,0x00000000,0x00000000,0xf0d2e9e3, - 0x00000000,0x00000000,0x00000000,0x5090d577, - 0x00000000,0x00000000,0x00000000,0xac11d7fa, - 0x00000000,0x00000000,0x00000000,0x2d1925ab, - 0x00000000,0x00000000,0x00000000,0x1bcb66f2, - 0x00000000,0x00000000,0x00000000,0xb46496ac, - 0x00000000,0x00000000,0x00000000,0x6f2d9bc9, - 0x00000000,0x00000000,0x00000000,0xd1925ab0, - 0x00000000,0x00000000,0x00000000,0x78602649, - 0x00000000,0x00000000,0x00000000,0x29131ab6, - 0x00000000,0x00000000,0x00000000,0x8edae952, - 0x00000000,0x00000000,0x00000000,0x0fc053c3, - 0x00000000,0x00000000,0x00000000,0x3b6ba548, - 0x00000000,0x00000000,0x00000000,0x3f014f0c, - 0x00000000,0x00000000,0x00000000,0xedae9520, - 0x00000000,0x00000000,0x00000000,0xfc053c31 -}; - - -__m128i CNS128[32]; -__m128i ALLONE; -__m128i MASK; - -HashReturn init_luffa(hashState_luffa *state, int hashbitlen) -{ - int i; - state->hashbitlen = hashbitlen; - /* set the lower 32 bits to '1' */ - MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff); - /* set all bits to '1' */ - ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff); - /* set the 32-bit round constant values to the 128-bit data field */ - for ( i=0; i<32; i++ ) - CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] ); - for ( i=0; i<10; i++ ) - state->chainv[i] = _mm_load_si128( (__m128i*)&IV[i*4] ); - memset(state->buffer, 0, sizeof state->buffer ); - return SUCCESS; -} - -HashReturn update_luffa( hashState_luffa *state, const BitSequence *data, - size_t len ) -{ - int i; - int blocks = (int)len / 32; - state-> rembytes = (int)len % 32; - - // full blocks - for ( i = 0; i < blocks; i++ ) - { - rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ), - mm128_bswap_32( casti_m128i( data, 0 ) ) ); - data += MSG_BLOCK_BYTE_LEN; - } - - // 16 byte partial block exists for 80 byte len - // store in buffer for transform in final for midstate to work - if ( state->rembytes ) - { - // remaining data bytes - casti_m128i( 
state->buffer, 0 ) = mm128_bswap_32( cast_m128i( data ) ); - // padding of partial block - casti_m128i( state->buffer, 1 ) = - _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ); - } - - return SUCCESS; -} - -HashReturn final_luffa(hashState_luffa *state, BitSequence *hashval) -{ - // transform pad block - if ( state->rembytes ) - { - // not empty, data is in buffer - rnd512( state, casti_m128i( state->buffer, 1 ), - casti_m128i( state->buffer, 0 ) ); - } - else - { - // empty pad block, constant data - rnd512( state, _mm_setzero_si128(), - _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) ); - } - - finalization512(state, (uint32*) hashval); - if ( state->hashbitlen > 512 ) - finalization512( state, (uint32*)( hashval+128 ) ); - return SUCCESS; -} - -HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, - const BitSequence* data, size_t inlen ) -{ -// Optimized for integrals of 16 bytes, good for 64 and 80 byte len - int i; - int blocks = (int)( inlen / 32 ); - state->rembytes = inlen % 32; - - // full blocks - for ( i = 0; i < blocks; i++ ) - { - rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ), - mm128_bswap_32( casti_m128i( data, 0 ) ) ); - data += MSG_BLOCK_BYTE_LEN; - } - - // 16 byte partial block exists for 80 byte len - if ( state->rembytes ) - { - // padding of partial block - rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ), - mm128_bswap_32( cast_m128i( data ) ) ); - } - else - { - // empty pad block - rnd512( state, _mm_setzero_si128(), - _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) ); - } - - finalization512( state, (uint32*) output ); - if ( state->hashbitlen > 512 ) - finalization512( state, (uint32*)( output+128 ) ); - - return SUCCESS; -} - -/***************************************************/ -/* Round function */ -/* state: hash context */ - -static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 ) -{ - __m128i t[2]; - __m128i *chainv = state->chainv; - __m128i 
tmp[2]; - __m128i x[8]; - - t[0] = chainv[0]; - t[1] = chainv[1]; - - t[0] = _mm_xor_si128( t[0], chainv[2] ); - t[1] = _mm_xor_si128( t[1], chainv[3] ); - t[0] = _mm_xor_si128( t[0], chainv[4] ); - t[1] = _mm_xor_si128( t[1], chainv[5] ); - t[0] = _mm_xor_si128( t[0], chainv[6] ); - t[1] = _mm_xor_si128( t[1], chainv[7] ); - t[0] = _mm_xor_si128( t[0], chainv[8] ); - t[1] = _mm_xor_si128( t[1], chainv[9] ); - - MULT2( t[0], t[1] ); - - msg0 = _mm_shuffle_epi32( msg0, 27 ); - msg1 = _mm_shuffle_epi32( msg1, 27 ); - - chainv[0] = _mm_xor_si128( chainv[0], t[0] ); - chainv[1] = _mm_xor_si128( chainv[1], t[1] ); - chainv[2] = _mm_xor_si128( chainv[2], t[0] ); - chainv[3] = _mm_xor_si128( chainv[3], t[1] ); - chainv[4] = _mm_xor_si128( chainv[4], t[0] ); - chainv[5] = _mm_xor_si128( chainv[5], t[1] ); - chainv[6] = _mm_xor_si128( chainv[6], t[0] ); - chainv[7] = _mm_xor_si128( chainv[7], t[1] ); - chainv[8] = _mm_xor_si128( chainv[8], t[0] ); - chainv[9] = _mm_xor_si128( chainv[9], t[1] ); - - t[0] = chainv[0]; - t[1] = chainv[1]; - - MULT2( chainv[0], chainv[1]); - - chainv[0] = _mm_xor_si128( chainv[0], chainv[2] ); - chainv[1] = _mm_xor_si128( chainv[1], chainv[3] ); - - MULT2( chainv[2], chainv[3]); - - chainv[2] = _mm_xor_si128(chainv[2], chainv[4]); - chainv[3] = _mm_xor_si128(chainv[3], chainv[5]); - - MULT2( chainv[4], chainv[5]); - - chainv[4] = _mm_xor_si128(chainv[4], chainv[6]); - chainv[5] = _mm_xor_si128(chainv[5], chainv[7]); - - MULT2( chainv[6], chainv[7]); - - chainv[6] = _mm_xor_si128(chainv[6], chainv[8]); - chainv[7] = _mm_xor_si128(chainv[7], chainv[9]); - - MULT2( chainv[8], chainv[9]); - - chainv[8] = _mm_xor_si128( chainv[8], t[0] ); - chainv[9] = _mm_xor_si128( chainv[9], t[1] ); - - t[0] = chainv[8]; - t[1] = chainv[9]; - - MULT2( chainv[8], chainv[9]); - - chainv[8] = _mm_xor_si128( chainv[8], chainv[6] ); - chainv[9] = _mm_xor_si128( chainv[9], chainv[7] ); - - MULT2( chainv[6], chainv[7]); - - chainv[6] = _mm_xor_si128( chainv[6], 
chainv[4] ); - chainv[7] = _mm_xor_si128( chainv[7], chainv[5] ); - - MULT2( chainv[4], chainv[5]); - - chainv[4] = _mm_xor_si128( chainv[4], chainv[2] ); - chainv[5] = _mm_xor_si128( chainv[5], chainv[3] ); - - MULT2( chainv[2], chainv[3] ); - - chainv[2] = _mm_xor_si128( chainv[2], chainv[0] ); - chainv[3] = _mm_xor_si128( chainv[3], chainv[1] ); - - MULT2( chainv[0], chainv[1] ); - - chainv[0] = _mm_xor_si128( _mm_xor_si128( chainv[0], t[0] ), msg0 ); - chainv[1] = _mm_xor_si128( _mm_xor_si128( chainv[1], t[1] ), msg1 ); - - MULT2( msg0, msg1); - - chainv[2] = _mm_xor_si128( chainv[2], msg0 ); - chainv[3] = _mm_xor_si128( chainv[3], msg1 ); - - MULT2( msg0, msg1); - - chainv[4] = _mm_xor_si128( chainv[4], msg0 ); - chainv[5] = _mm_xor_si128( chainv[5], msg1 ); - - MULT2( msg0, msg1); - - chainv[6] = _mm_xor_si128( chainv[6], msg0 ); - chainv[7] = _mm_xor_si128( chainv[7], msg1 ); - - MULT2( msg0, msg1); - - chainv[8] = _mm_xor_si128( chainv[8], msg0 ); - chainv[9] = _mm_xor_si128( chainv[9], msg1 ); - - MULT2( msg0, msg1); - - chainv[3] = _mm_or_si128( _mm_slli_epi32(chainv[3], 1), - _mm_srli_epi32(chainv[3], 31) ); - chainv[5] = _mm_or_si128( _mm_slli_epi32(chainv[5], 2), - _mm_srli_epi32(chainv[5], 30) ); - chainv[7] = _mm_or_si128( _mm_slli_epi32(chainv[7], 3), - _mm_srli_epi32(chainv[7], 29) ); - chainv[9] = _mm_or_si128( _mm_slli_epi32(chainv[9], 4), - _mm_srli_epi32(chainv[9], 28) ); - - - NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6], - x[0], x[1], x[2], x[3], - chainv[1],chainv[3],chainv[5],chainv[7], - x[4], x[5], x[6], x[7] ); - - STEP_PART( &x[0], &CNS128[ 0], &tmp[0] ); - STEP_PART( &x[0], &CNS128[ 2], &tmp[0] ); - STEP_PART( &x[0], &CNS128[ 4], &tmp[0] ); - STEP_PART( &x[0], &CNS128[ 6], &tmp[0] ); - STEP_PART( &x[0], &CNS128[ 8], &tmp[0] ); - STEP_PART( &x[0], &CNS128[10], &tmp[0] ); - STEP_PART( &x[0], &CNS128[12], &tmp[0] ); - STEP_PART( &x[0], &CNS128[14], &tmp[0] ); - - MIXTON1024( x[0], x[1], x[2], x[3], - chainv[0], chainv[2], 
chainv[4],chainv[6], - x[4], x[5], x[6], x[7], - chainv[1],chainv[3],chainv[5],chainv[7]); - - /* Process last 256-bit block */ - STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[16], CNS128[17], - tmp[0], tmp[1] ); - STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[18], CNS128[19], - tmp[0], tmp[1] ); - STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[20], CNS128[21], - tmp[0], tmp[1] ); - STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[22], CNS128[23], - tmp[0], tmp[1] ); - STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[24], CNS128[25], - tmp[0], tmp[1] ); - STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[26], CNS128[27], - tmp[0], tmp[1] ); - STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[28], CNS128[29], - tmp[0], tmp[1] ); - STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[30], CNS128[31], - tmp[0], tmp[1] ); -} - - -/***************************************************/ -/* Finalization function */ -/* state: hash context */ -/* b[8]: hash values */ - -#if defined (__AVX2__) - -static void finalization512( hashState_luffa *state, uint32 *b ) -{ - uint32 hash[8] __attribute((aligned(64))); - __m256i* chainv = (__m256i*)state->chainv; - __m256i t; - const __m128i zero = _mm_setzero_si128(); - - rnd512( state, zero, zero ); - - t = chainv[0]; - t = _mm256_xor_si256( t, chainv[1] ); - t = _mm256_xor_si256( t, chainv[2] ); - t = _mm256_xor_si256( t, chainv[3] ); - t = _mm256_xor_si256( t, chainv[4] ); - - t = _mm256_shuffle_epi32( t, 27 ); - - _mm256_store_si256( (__m256i*)hash, t ); - - casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); - - rnd512( state, zero, zero ); - - t = chainv[0]; - t = _mm256_xor_si256( t, chainv[1] ); - t = _mm256_xor_si256( t, chainv[2] ); - t = _mm256_xor_si256( t, chainv[3] ); - t = _mm256_xor_si256( t, chainv[4] ); - t = _mm256_shuffle_epi32( t, 27 ); - - _mm256_store_si256( (__m256i*)hash, t ); - - casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); -} - -#else - -static 
void finalization512( hashState_luffa *state, uint32 *b ) -{ - uint32 hash[8] __attribute((aligned(64))); - __m128i* chainv = state->chainv; - __m128i t[2]; - const __m128i zero = _mm_setzero_si128(); - - /*---- blank round with m=0 ----*/ - rnd512( state, zero, zero ); - - t[0] = chainv[0]; - t[1] = chainv[1]; - t[0] = _mm_xor_si128(t[0], chainv[2]); - t[1] = _mm_xor_si128(t[1], chainv[3]); - t[0] = _mm_xor_si128(t[0], chainv[4]); - t[1] = _mm_xor_si128(t[1], chainv[5]); - t[0] = _mm_xor_si128(t[0], chainv[6]); - t[1] = _mm_xor_si128(t[1], chainv[7]); - t[0] = _mm_xor_si128(t[0], chainv[8]); - t[1] = _mm_xor_si128(t[1], chainv[9]); - - t[0] = _mm_shuffle_epi32(t[0], 27); - t[1] = _mm_shuffle_epi32(t[1], 27); - - _mm_store_si128((__m128i*)&hash[0], t[0]); - _mm_store_si128((__m128i*)&hash[4], t[1]); - - casti_m128i( b, 0 ) = mm128_bswap_32( casti_m128i( hash, 0 ) ); - casti_m128i( b, 1 ) = mm128_bswap_32( casti_m128i( hash, 1 ) ); - - rnd512( state, zero, zero ); - - t[0] = chainv[0]; - t[1] = chainv[1]; - t[0] = _mm_xor_si128(t[0], chainv[2]); - t[1] = _mm_xor_si128(t[1], chainv[3]); - t[0] = _mm_xor_si128(t[0], chainv[4]); - t[1] = _mm_xor_si128(t[1], chainv[5]); - t[0] = _mm_xor_si128(t[0], chainv[6]); - t[1] = _mm_xor_si128(t[1], chainv[7]); - t[0] = _mm_xor_si128(t[0], chainv[8]); - t[1] = _mm_xor_si128(t[1], chainv[9]); - - t[0] = _mm_shuffle_epi32(t[0], 27); - t[1] = _mm_shuffle_epi32(t[1], 27); - - _mm_store_si128((__m128i*)&hash[0], t[0]); - _mm_store_si128((__m128i*)&hash[4], t[1]); - - casti_m128i( b, 2 ) = mm128_bswap_32( casti_m128i( hash, 0 ) ); - casti_m128i( b, 3 ) = mm128_bswap_32( casti_m128i( hash, 1 ) ); -} -#endif - -/***************************************************/ diff --git a/algo/luffa/luffa_for_sse2.h b/algo/luffa/luffa_for_sse2.h deleted file mode 100644 index d21b34c..0000000 --- a/algo/luffa/luffa_for_sse2.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * luffa_for_sse2.h - * Version 2.0 (Sep 15th 2009) - * - * Copyright (C) 2008-2009 
Hitachi, Ltd. All rights reserved. - * - * Hitachi, Ltd. is the owner of this software and hereby grant - * the U.S. Government and any interested party the right to use - * this software for the purposes of the SHA-3 evaluation process, - * notwithstanding that this software is copyrighted. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include "algo/sha/sha3-defs.h" -/* The length of digests*/ -#define DIGEST_BIT_LEN_224 224 -#define DIGEST_BIT_LEN_256 256 -#define DIGEST_BIT_LEN_384 384 -#define DIGEST_BIT_LEN_512 512 - -/*********************************/ -/* The parameters of Luffa */ -#define MSG_BLOCK_BIT_LEN 256 /*The bit length of a message block*/ -#define MSG_BLOCK_BYTE_LEN (MSG_BLOCK_BIT_LEN >> 3) /* The byte length - * of a message block*/ - -/* The number of blocks in Luffa */ -#define WIDTH_224 3 -#define WIDTH_256 3 -#define WIDTH_384 4 -#define WIDTH_512 5 - -/* The limit of the length of message */ -#define LIMIT_224 64 -#define LIMIT_256 64 -#define LIMIT_384 128 -#define LIMIT_512 128 -/*********************************/ - -typedef struct { - uint32 buffer[8] __attribute((aligned(32))); - __m128i chainv[10] __attribute((aligned(32))); /* Chaining values */ -// uint64 bitlen[2]; /* Message length in bits */ -// uint32 rembitlen; /* Length of buffer data to be hashed */ - int hashbitlen; - int rembytes; -} hashState_luffa; - -HashReturn init_luffa( hashState_luffa *state, int hashbitlen ); - -// len is in bytes -HashReturn update_luffa( hashState_luffa *state, const 
BitSequence *data, - size_t len ); - -HashReturn final_luffa( hashState_luffa *state, BitSequence *hashval ); - -HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, - const BitSequence* data, size_t inlen ); - - - diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c deleted file mode 100644 index d1471a5..0000000 --- a/algo/lyra2/allium-4way.c +++ /dev/null @@ -1,131 +0,0 @@ -#include "lyra2-gate.h" -#include -#include - -#if defined (ALLIUM_4WAY) - -#include "algo/blake/blake-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/groestl/aes_ni/hash-groestl256.h" - -typedef struct { - blake256_4way_context blake; - keccak256_4way_context keccak; - cubehashParam cube; - skein256_4way_context skein; - hashState_groestl256 groestl; - -} allium_4way_ctx_holder; - -static __thread allium_4way_ctx_holder allium_4way_ctx; - -bool init_allium_4way_ctx() -{ - keccak256_4way_init( &allium_4way_ctx.keccak ); - cubehashInit( &allium_4way_ctx.cube, 256, 16, 32 ); - skein256_4way_init( &allium_4way_ctx.skein ); - init_groestl256( &allium_4way_ctx.groestl, 32 ); - return true; -} - -void allium_4way_hash( void *state, const void *input ) -{ - uint32_t hash0[8] __attribute__ ((aligned (64))); - uint32_t hash1[8] __attribute__ ((aligned (32))); - uint32_t hash2[8] __attribute__ ((aligned (32))); - uint32_t hash3[8] __attribute__ ((aligned (32))); - uint32_t vhash32[8*4] __attribute__ ((aligned (64))); - uint32_t vhash64[8*4] __attribute__ ((aligned (64))); - allium_4way_ctx_holder ctx __attribute__ ((aligned (64))); - - memcpy( &ctx, &allium_4way_ctx, sizeof(allium_4way_ctx) ); - blake256_4way( &ctx.blake, input + (64<<2), 16 ); - blake256_4way_close( &ctx.blake, vhash32 ); - - rintrlv_4x32_4x64( vhash64, vhash32, 256 ); - keccak256_4way( &ctx.keccak, vhash64, 32 ); - keccak256_4way_close( &ctx.keccak, vhash64 ); - - dintrlv_4x64( hash0, 
hash1, hash2, hash3, vhash64, 256 ); - - LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 ); - LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 ); - LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 ); - LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 ); - - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 ); - - LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 ); - LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 ); - LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 ); - LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 ); - - intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 ); - - skein256_4way( &ctx.skein, vhash64, 32 ); - skein256_4way_close( &ctx.skein, vhash64 ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 ); - - update_and_final_groestl256( &ctx.groestl, state, hash0, 256 ); - memcpy( &ctx.groestl, &allium_4way_ctx.groestl, - sizeof(hashState_groestl256) ); - update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 ); - memcpy( &ctx.groestl, &allium_4way_ctx.groestl, - sizeof(hashState_groestl256) ); - update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 ); - memcpy( &ctx.groestl, &allium_4way_ctx.groestl, - sizeof(hashState_groestl256) ); - update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 ); -} - -int scanhash_allium_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[8*4] __attribute__ ((aligned (64))); - uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - 
uint32_t n = first_nonce; - const uint32_t Htarg = ptarget[7]; - __m128i *noncev = (__m128i*)vdata + 19; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - - if ( opt_benchmark ) - ( (uint32_t*)ptarget )[7] = 0x0000ff; - - mm128_bswap32_intrlv80_4x32( vdata, pdata ); - blake256_4way_init( &allium_4way_ctx.blake ); - blake256_4way( &allium_4way_ctx.blake, vdata, 64 ); - - do { - *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) ); - - allium_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int lane = 0; lane < 4; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg ) - { - if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_lane_solution( work, hash+(lane<<3), mythr, lane ); - } - } - n += 4; - } while ( (n < max_nonce-4) && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/lyra2/allium.c b/algo/lyra2/allium.c deleted file mode 100644 index 593a997..0000000 --- a/algo/lyra2/allium.c +++ /dev/null @@ -1,109 +0,0 @@ -#include "lyra2-gate.h" -#include -#include "algo/blake/sph_blake.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" -#include "algo/cubehash/cubehash_sse2.h" -#if defined(__AES__) -#include "algo/groestl/aes_ni/hash-groestl256.h" -#else -#include "algo/groestl/sph_groestl.h" -#endif -#include "lyra2.h" - -typedef struct { - sph_blake256_context blake; - sph_keccak256_context keccak; - cubehashParam cube; - sph_skein256_context skein; -#if defined (__AES__) - hashState_groestl256 groestl; -#else - sph_groestl256_context groestl; -#endif -} allium_ctx_holder; - -static __thread allium_ctx_holder allium_ctx; - -bool init_allium_ctx() -{ - sph_keccak256_init( &allium_ctx.keccak ); - cubehashInit( &allium_ctx.cube, 256, 16, 32 ); - sph_skein256_init( &allium_ctx.skein ); -#if defined (__AES__) - init_groestl256( &allium_ctx.groestl, 32 ); -#else - sph_groestl256_init( &allium_ctx.groestl ); -#endif - return 
true; -} - -void allium_hash(void *state, const void *input) -{ - uint32_t hash[8] __attribute__ ((aligned (64))); - allium_ctx_holder ctx __attribute__ ((aligned (32))); - - memcpy( &ctx, &allium_ctx, sizeof(allium_ctx) ); - sph_blake256( &ctx.blake, input + 64, 16 ); - sph_blake256_close( &ctx.blake, hash ); - - sph_keccak256( &ctx.keccak, hash, 32 ); - sph_keccak256_close( &ctx.keccak, hash ); - - LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 ); - - cubehashUpdateDigest( &ctx.cube, (byte*)hash, (const byte*)hash, 32 ); - - LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 ); - - sph_skein256( &ctx.skein, hash, 32 ); - sph_skein256_close( &ctx.skein, hash ); - -#if defined (__AES__) - update_and_final_groestl256( &ctx.groestl, hash, hash, 256 ); -#else - sph_groestl256( &ctx.groestl, hash, 32 ); - sph_groestl256_close( &ctx.groestl, hash ); -#endif - - memcpy(state, hash, 32); -} - -int scanhash_allium( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(128) hash[8]; - uint32_t _ALIGN(128) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - - if ( opt_benchmark ) - ptarget[7] = 0x3ffff; - - for ( int i = 0; i < 19; i++ ) - be32enc( &endiandata[i], pdata[i] ); - - sph_blake256_init( &allium_ctx.blake ); - sph_blake256( &allium_ctx.blake, endiandata, 64 ); - - do { - be32enc( &endiandata[19], nonce ); - allium_hash( hash, endiandata ); - if ( hash[7] <= Htarg ) - if ( fulltest( hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = nonce; - submit_solution( work, hash, mythr ); - } - nonce++; - } while ( nonce < max_nonce && !work_restart[thr_id].restart ); - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} - diff --git a/algo/lyra2/lyra2-gate.c b/algo/lyra2/lyra2-gate.c deleted 
file mode 100644 index 66e3a25..0000000 --- a/algo/lyra2/lyra2-gate.c +++ /dev/null @@ -1,225 +0,0 @@ -#include "lyra2-gate.h" - - -// huge pages -// -// Use MAP_PRIVATE instead -// In register algo: -// replace thread safe whole matrix with a char** -// alloc huge pages matrixsize * threads -// make pointers to each thread to each thread, creating an -// array[thread][matrix]. -// Each thread can create its own matrix pointer: -// my_matrix = the matrix + ( thread_id * matrix_size ) -// -// Compiler version check? -// Fallback? -// -// create a generic utility to map & unmap huge pages. -// ptr = malloc_huge( size ); -// Yespower wrapper checks for 64 byte alignment, seems unnecessary as -// it should be aligned to the page boundary. It may be desireable to -// have the matrix size rounded up if necessary to something bigger -// than 64 byte, say 4 kbytes a small page size. - -// Define some constants for indivual parameters and matrix size for -// each algo. Use the parameter constants where apropriate. -// Convert algos that don't yet do so to use dynamic alllocation. -// Alloc huge pages globally. If ok each thread will create a pointer to -// its chunk. If fail each thread will use use _mm_alloc for itself. -// BLOCK_LEN_BYTES is 768. 
- -#define LYRA2REV3_NROWS 4 -#define LYRA2REV3_NCOLS 4 -/* -#define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)*(LYRA2REV3_NCOLS)* \ - (LYRA2REV3_NROWS)*8) -*/ - -#define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)<<4) - -__thread uint64_t* l2v3_wholeMatrix; - -bool lyra2rev3_thread_init() -{ - const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols - const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; - - int size = (int64_t)ROW_LEN_BYTES * 4; // nRows; - l2v3_wholeMatrix = _mm_malloc( size, 64 ); -#if defined (LYRA2REV3_8WAY) - init_lyra2rev3_8way_ctx();; -#elif defined (LYRA2REV3_4WAY) - init_lyra2rev3_4way_ctx();; -#else - init_lyra2rev3_ctx(); -#endif - return l2v3_wholeMatrix; -} - -bool register_lyra2rev3_algo( algo_gate_t* gate ) -{ -#if defined (LYRA2REV3_8WAY) - gate->scanhash = (void*)&scanhash_lyra2rev3_8way; - gate->hash = (void*)&lyra2rev3_8way_hash; -#elif defined (LYRA2REV3_4WAY) - gate->scanhash = (void*)&scanhash_lyra2rev3_4way; - gate->hash = (void*)&lyra2rev3_4way_hash; -#else - gate->scanhash = (void*)&scanhash_lyra2rev3; - gate->hash = (void*)&lyra2rev3_hash; -#endif - gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT; - gate->miner_thread_init = (void*)&lyra2rev3_thread_init; - gate->set_target = (void*)&alt_set_target; - return true; -}; - -////////////////////////////////// - -__thread uint64_t* l2v2_wholeMatrix; - -bool lyra2rev2_thread_init() -{ - const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols - const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; - - int size = (int64_t)ROW_LEN_BYTES * 4; // nRows; - l2v2_wholeMatrix = _mm_malloc( size, 64 ); -#if defined (LYRA2REV2_4WAY) - init_lyra2rev2_4way_ctx();; -#else - init_lyra2rev2_ctx(); -#endif - return l2v2_wholeMatrix; -} - -bool register_lyra2rev2_algo( algo_gate_t* gate ) -{ -#if defined (LYRA2REV2_4WAY) - gate->scanhash = (void*)&scanhash_lyra2rev2_4way; - gate->hash = (void*)&lyra2rev2_4way_hash; -#else - gate->scanhash = (void*)&scanhash_lyra2rev2; - gate->hash = 
(void*)&lyra2rev2_hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT; - gate->miner_thread_init = (void*)&lyra2rev2_thread_init; - gate->set_target = (void*)&alt_set_target; - return true; -}; - -///////////////////////////// - -bool register_lyra2z_algo( algo_gate_t* gate ) -{ -#if defined(LYRA2Z_8WAY) - gate->miner_thread_init = (void*)&lyra2z_8way_thread_init; - gate->scanhash = (void*)&scanhash_lyra2z_8way; - gate->hash = (void*)&lyra2z_8way_hash; -#elif defined(LYRA2Z_4WAY) - gate->miner_thread_init = (void*)&lyra2z_4way_thread_init; - gate->scanhash = (void*)&scanhash_lyra2z_4way; - gate->hash = (void*)&lyra2z_4way_hash; -#else - gate->miner_thread_init = (void*)&lyra2z_thread_init; - gate->scanhash = (void*)&scanhash_lyra2z; - gate->hash = (void*)&lyra2z_hash; -#endif - gate->optimizations = SSE42_OPT | AVX2_OPT; - gate->get_max64 = (void*)&get_max64_0xffffLL; - gate->set_target = (void*)&alt_set_target; - return true; -}; - - -//////////////////////// - -bool register_lyra2h_algo( algo_gate_t* gate ) -{ -#ifdef LYRA2H_4WAY - gate->miner_thread_init = (void*)&lyra2h_4way_thread_init; - gate->scanhash = (void*)&scanhash_lyra2h_4way; - gate->hash = (void*)&lyra2h_4way_hash; -#else - gate->miner_thread_init = (void*)&lyra2h_thread_init; - gate->scanhash = (void*)&scanhash_lyra2h; - gate->hash = (void*)&lyra2h_hash; -#endif - gate->optimizations = SSE42_OPT | AVX2_OPT; - gate->get_max64 = (void*)&get_max64_0xffffLL; - gate->set_target = (void*)&alt_set_target; - return true; -}; - -///////////////////////////////// - -int64_t allium_get_max64_0xFFFFLL() { return 0xFFFFLL; } - -bool register_allium_algo( algo_gate_t* gate ) -{ -#if defined (ALLIUM_4WAY) - gate->miner_thread_init = (void*)&init_allium_4way_ctx; - gate->scanhash = (void*)&scanhash_allium_4way; - gate->hash = (void*)&allium_4way_hash; -#else - gate->miner_thread_init = (void*)&init_allium_ctx; - gate->scanhash = (void*)&scanhash_allium; - gate->hash = 
(void*)&allium_hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT; - gate->set_target = (void*)&alt_set_target; - gate->get_max64 = (void*)&allium_get_max64_0xFFFFLL; - return true; -}; - -///////////////////////////////////////// - -bool phi2_has_roots; -bool phi2_use_roots = false; - -int phi2_get_work_data_size() { return phi2_use_roots ? 144 : 128; } - -void phi2_decode_extra_data( struct work *work ) -{ - if ( work->data[0] & ( 1<<30 ) ) phi2_use_roots = true; - else for ( int i = 20; i < 36; i++ ) - { - if (work->data[i]) { phi2_use_roots = true; break; } - } -} - -void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) -{ - uchar merkle_tree[64] = { 0 }; - size_t t; - - algo_gate.gen_merkle_root( merkle_tree, sctx ); - // Increment extranonce2 - for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); - // Assemble block header - algo_gate.build_block_header( g_work, le32dec( sctx->job.version ), - (uint32_t*) sctx->job.prevhash, (uint32_t*) merkle_tree, - le32dec( sctx->job.ntime ), le32dec(sctx->job.nbits) ); - for ( t = 0; t < 16; t++ ) - g_work->data[ 20+t ] = ((uint32_t*)sctx->job.extra)[t]; -} - - -bool register_phi2_algo( algo_gate_t* gate ) -{ -// init_phi2_ctx(); - gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT; - gate->get_work_data_size = (void*)&phi2_get_work_data_size; - gate->decode_extra_data = (void*)&phi2_decode_extra_data; - gate->build_extraheader = (void*)&phi2_build_extraheader; - gate->set_target = (void*)&alt_set_target; - gate->get_max64 = (void*)&get_max64_0xffffLL; -#if defined(PHI2_4WAY) - gate->scanhash = (void*)&scanhash_phi2_4way; -#else - init_phi2_ctx(); - gate->scanhash = (void*)&scanhash_phi2; -#endif - return true; -} diff --git a/algo/lyra2/lyra2-gate.h b/algo/lyra2/lyra2-gate.h deleted file mode 100644 index 8a392ca..0000000 --- a/algo/lyra2/lyra2-gate.h +++ /dev/null @@ -1,178 +0,0 @@ -#ifndef LYRA2_GATE_H__ -#define LYRA2_GATE_H__ 1 - 
-#include "algo-gate-api.h" -#include -#include "lyra2.h" - -#if defined(__AVX2__) - #define LYRA2REV3_8WAY -#endif - -#if defined(__SSE2__) - #define LYRA2REV3_4WAY -#endif - -extern __thread uint64_t* l2v3_wholeMatrix; - -bool register_lyra2rev3_algo( algo_gate_t* gate ); -#if defined(LYRA2REV3_8WAY) - -void lyra2rev3_8way_hash( void *state, const void *input ); -int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -bool init_lyra2rev3_8way_ctx(); - -#elif defined(LYRA2REV3_4WAY) - -void lyra2rev3_4way_hash( void *state, const void *input ); -int scanhash_lyra2rev3_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -bool init_lyra2rev3_4way_ctx(); - -#else - -void lyra2rev3_hash( void *state, const void *input ); -int scanhash_lyra2rev3( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -bool init_lyra2rev3_ctx(); - -#endif - -////////////////////////////////// - -#if defined(__AVX2__) - #define LYRA2REV2_4WAY -#endif - -extern __thread uint64_t* l2v2_wholeMatrix; - -bool register_lyra2rev2_algo( algo_gate_t* gate ); - -#if defined(LYRA2REV2_4WAY) - -void lyra2rev2_4way_hash( void *state, const void *input ); -int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -bool init_lyra2rev2_4way_ctx(); - -#else - -void lyra2rev2_hash( void *state, const void *input ); -int scanhash_lyra2rev2( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -bool init_lyra2rev2_ctx(); - -#endif - -///////////////////////// - -#if defined(__SSE2__) - #define LYRA2Z_4WAY -#endif -#if defined(__AVX2__) - #define LYRA2Z_8WAY -#endif - - -#define LYRA2Z_MATRIX_SIZE BLOCK_LEN_INT64 * 8 * 8 * 8 - -#if defined(LYRA2Z_8WAY) - -void lyra2z_8way_hash( void *state, const void *input ); -int scanhash_lyra2z_8way( struct work *work, uint32_t 
max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -bool lyra2z_8way_thread_init(); - -#elif defined(LYRA2Z_4WAY) - -void lyra2z_4way_hash( void *state, const void *input ); -int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -bool lyra2z_4way_thread_init(); - -#else - -void lyra2z_hash( void *state, const void *input ); -int scanhash_lyra2z( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -bool lyra2z_thread_init(); - -#endif - -//////////////////// - -#if defined(__AVX2__) - #define LYRA2H_4WAY -#endif - -#define LYRA2H_MATRIX_SIZE BLOCK_LEN_INT64 * 16 * 16 * 8 - -#if defined(LYRA2H_4WAY) - -void lyra2h_4way_hash( void *state, const void *input ); -int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -bool lyra2h_4way_thread_init(); - -#else - -void lyra2h_hash( void *state, const void *input ); -int scanhash_lyra2h( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -bool lyra2h_thread_init(); - -#endif - -////////////////////////////////// - -#if defined(__AVX2__) && defined(__AES__) - #define ALLIUM_4WAY -#endif - -bool register_allium_algo( algo_gate_t* gate ); - -#if defined(ALLIUM_4WAY) - -void allium_4way_hash( void *state, const void *input ); -int scanhash_allium_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -bool init_allium_4way_ctx(); - -#else - -void allium_hash( void *state, const void *input ); -int scanhash_allium( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -bool init_allium_ctx(); - -#endif - -///////////////////////////////////////// - -#if defined(__AVX2__) && defined(__AES__) -// #define PHI2_4WAY -#endif - -bool phi2_has_roots; - -bool register_phi2_algo( algo_gate_t* gate ); -#if defined(PHI2_4WAY) - -void phi2_hash_4way( void 
*state, const void *input ); -int scanhash_phi2_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -//void init_phi2_ctx(); - -#else - -void phi2_hash( void *state, const void *input ); -int scanhash_phi2( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_phi2_ctx(); - -#endif - -#endif // LYRA2_GATE_H__ - - diff --git a/algo/lyra2/lyra2.c b/algo/lyra2/lyra2.c index 5c65d4e..a89d3fb 100644 --- a/algo/lyra2/lyra2.c +++ b/algo/lyra2/lyra2.c @@ -17,21 +17,28 @@ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include "lyra2.h" +#include "compat.h" +#include "sponge.h" #include #include #include #include + +#if defined(__arm__) || defined(__aarch64__) +#include "sse2neon.h" +#else #include -#include "compat.h" -#include "lyra2.h" -#include "sponge.h" +#endif /** - * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords - * whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits, - * where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all - * integer parameters (treated as type "unsigned int") in the order they are provided, plus the value - * of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols). + * Executes Lyra2 based on the G function from Blake2b. This version supports + * salts and passwords whose combined length is smaller than the size of the + * memory matrix, (i.e., (nRows x nCols x b) bits, where "b" is the underlying + * sponge's bitrate). In this implementation, the "basil" is composed by all + * integer parameters (treated as type "unsigned int") in the order they are + * provided, plus the value of nCols, (i.e., basil = kLen || pwdlen || saltlen + * || timeCost || nRows || nCols). 
* * @param K The derived key to be output by the algorithm * @param kLen Desired key length @@ -43,669 +50,739 @@ * @param nRows Number or rows of the memory matrix (R) * @param nCols Number of columns of the memory matrix (C) * - * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation) + * @return 0 if the key is generated correctly; -1 if there is an error (usually + * due to lack of memory for allocation) */ -int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, - const uint64_t pwdlen, const void *salt, const uint64_t saltlen, - const uint64_t timeCost, const uint64_t nRows, - const uint64_t nCols ) -{ - //====================== Basic variables ============================// - uint64_t _ALIGN(256) state[16]; - int64_t row = 2; //index of row to be processed - int64_t prev = 1; //index of prev (last row ever computed/modified) - int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) - int64_t tau; //Time Loop iterator - int64_t step = 1; //Visitation step (used during Setup and Wandering phases) - int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) - int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 - int64_t i; //auxiliary iteration counter - int64_t v64; // 64bit var for memcpy - //====================================================================/ - - //=== Initializing the Memory Matrix and pointers to it =============// - //Tries to allocate enough space for the whole memory matrix - - const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; -// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; - // for Lyra2REv2, nCols = 4, v1 was using 8 - const int64_t BLOCK_LEN = (nCols == 4) ? 
BLOCK_LEN_BLAKE2_SAFE_INT64 - : BLOCK_LEN_BLAKE2_SAFE_BYTES; - uint64_t *ptrWord = wholeMatrix; - -// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows ); - - //=== Getting the password + salt + basil padded with 10*1 ==========// - //OBS.:The memory matrix will temporarily hold the password: not for saving memory, - //but this ensures that the password copied locally will be overwritten as soon as possible - - //First, we clean enough blocks for the password, salt, basil and padding - int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) ) - / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1; - - byte *ptrByte = (byte*) wholeMatrix; - - //Prepends the password - memcpy(ptrByte, pwd, pwdlen); - ptrByte += pwdlen; - - //Concatenates the salt - memcpy(ptrByte, salt, saltlen); - ptrByte += saltlen; - - memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - - (saltlen + pwdlen) ); - - //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface - memcpy(ptrByte, &kLen, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = pwdlen; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = saltlen; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = timeCost; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = nRows; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = nCols; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - - //Now comes the padding - *ptrByte = 0x80; //first byte of padding: right after the password - ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix - ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block - *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block - -// from here on it's all simd acces to state and matrix -// define 
vector pointers and adjust sizes and pointer offsets - - //================= Initializing the Sponge State ====================// - //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) - - initState( state ); - - //========================= Setup Phase =============================// - //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits - - ptrWord = wholeMatrix; - for (i = 0; i < nBlocksInput; i++) - { - absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil) - ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil) - } - //Initializes M[0] and M[1] - reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here - - reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], - nCols); - - do - { - //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) - - reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64], - &wholeMatrix[rowa*ROW_LEN_INT64], - &wholeMatrix[row*ROW_LEN_INT64], nCols ); - - //updates the value of row* (deterministically picked during Setup)) - rowa = (rowa + step) & (window - 1); - //update prev: it now points to the last row ever computed +int LYRA2REV2(uint64_t *wholeMatrix, void *K, uint64_t kLen, const void *pwd, + const uint64_t pwdlen, const void *salt, const uint64_t saltlen, + const uint64_t timeCost, const uint64_t nRows, + const uint64_t nCols) { + //====================== Basic variables ============================// + uint64_t _ALIGN(256) state[16]; + int64_t row = 2; // index of row to be processed + int64_t prev = 1; // index of prev (last row ever computed/modified) + int64_t rowa = 0; // index of row* (a previous row, deterministically picked + // during Setup and randomly picked while Wandering) + int64_t tau; // Time Loop iterator + int64_t step = 1; // Visitation step (used during 
Setup and Wandering phases) + int64_t window = 2; // Visitation window (used to define which rows can be + // revisited during Setup) + int64_t gap = 1; // Modifier to the step, assuming the values 1 or -1 + // int64_t i; //auxiliary iteration counter + int64_t v64; // 64bit var for memcpy + //====================================================================/ + + //=== Initializing the Memory Matrix and pointers to it =============// + // Tries to allocate enough space for the whole memory matrix + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + // const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + // for Lyra2REv2, nCols = 4, v1 was using 8 + const int64_t BLOCK_LEN = + (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES; + uint64_t *ptrWord = wholeMatrix; + + // memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows ); + + //=== Getting the password + salt + basil padded with 10*1 ==========// + // OBS.:The memory matrix will temporarily hold the password: not for saving + // memory, but this ensures that the password copied locally will be + // overwritten as soon as possible + + // First, we clean enough blocks for the password, salt, basil and padding + int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / + BLOCK_LEN_BLAKE2_SAFE_BYTES) + + 1; + + byte *ptrByte = (byte *)wholeMatrix; + + // Prepends the password + memcpy(ptrByte, pwd, pwdlen); + ptrByte += pwdlen; + + // Concatenates the salt + memcpy(ptrByte, salt, saltlen); + ptrByte += saltlen; + + memset(ptrByte, 0, + nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - (saltlen + pwdlen)); + + // Concatenates the basil: every integer passed as parameter, in the order + // they are provided by the interface + memcpy(ptrByte, &kLen, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = pwdlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = saltlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = timeCost; + 
memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nRows; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nCols; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + + // Now comes the padding + *ptrByte = 0x80; // first byte of padding: right after the password + ptrByte = (byte *) + wholeMatrix; // resets the pointer to the start of the memory matrix + ptrByte += + nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - + 1; // sets the pointer to the correct position: end of incomplete block + *ptrByte ^= + 0x01; // last byte of padding: at the end of the last incomplete block + + // from here on it's all simd acces to state and matrix + // define vector pointers and adjust sizes and pointer offsets + + //================= Initializing the Sponge State ====================// + // Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate + // (b) and the remainder for the capacity (c) + + // initState( state ); + + //========================= Setup Phase =============================// + // Absorbing salt, password and basil: this is the only place in which the + // block length is hard-coded to 512 bits + + ptrWord = wholeMatrix; + + absorbBlockBlake2Safe(state, ptrWord, nBlocksInput, BLOCK_LEN); + /* + for (i = 0; i < nBlocksInput; i++) + { + absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of + pad(pwd || salt || basil) ptrWord += BLOCK_LEN; //goes to next block of + pad(pwd || salt || basil) + } + */ + + // Initializes M[0] and M[1] + reducedSqueezeRow0( + state, &wholeMatrix[0], + nCols); // The locally copied password is most likely overwritten here + + reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols); + + do { + // M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) + + reducedDuplexRowSetup(state, &wholeMatrix[prev * ROW_LEN_INT64], + &wholeMatrix[rowa * ROW_LEN_INT64], + &wholeMatrix[row * ROW_LEN_INT64], nCols); + + // updates 
the value of row* (deterministically picked during Setup)) + rowa = (rowa + step) & (window - 1); + // update prev: it now points to the last row ever computed + + prev = row; + // updates row: goes to the next row to be computed + row++; + + // Checks if all rows in the window where visited. + if (rowa == 0) { + step = window + gap; // changes the step: approximately doubles its value + window *= 2; // doubles the size of the re-visitation window + gap = -gap; // inverts the modifier to the step + } + } while (row < nRows); + + //===================== Wandering Phase =============================// + row = 0; // Resets the visitation to the first row of the memory matrix + for (tau = 1; tau <= timeCost; tau++) { + // Step is approximately half the number of all rows of the memory matrix + // for an odd tau; otherwise, it is -1 + step = (tau % 2 == 0) ? -1 : nRows / 2 - 1; + do { + // Selects a pseudorandom index row* + //----------------------------------------------- + rowa = state[0] & + (unsigned int)(nRows - 1); //(USE THIS IF nRows IS A POWER OF 2) + + // rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------- + + // Performs a reduced-round duplexing operation over M[row*] XOR M[prev], + // updating both M[row*] and M[row] + reducedDuplexRow(state, &wholeMatrix[prev * ROW_LEN_INT64], + &wholeMatrix[rowa * ROW_LEN_INT64], + &wholeMatrix[row * ROW_LEN_INT64], nCols); + // update prev: it now points to the last row ever computed prev = row; - //updates row: goes to the next row to be computed - row++; - //Checks if all rows in the window where visited. 
- if (rowa == 0) - { - step = window + gap; //changes the step: approximately doubles its value - window *= 2; //doubles the size of the re-visitation window - gap = -gap; //inverts the modifier to the step - } + // updates row: goes to the next row to be computed + //---------------------------------------------------- + row = (row + step) & + (unsigned int)(nRows - 1); //(USE THIS IF nRows IS A POWER OF 2) + // row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //---------------------------------------------------- - } while (row < nRows); - - //===================== Wandering Phase =============================// - row = 0; //Resets the visitation to the first row of the memory matrix - for (tau = 1; tau <= timeCost; tau++) - { - //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 - step = (tau % 2 == 0) ? -1 : nRows / 2 - 1; - do - { - //Selects a pseudorandom index row* - //----------------------------------------------- - rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) - - //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE) - //------------------------------------------- - - //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] - reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64], - &wholeMatrix[rowa*ROW_LEN_INT64], - &wholeMatrix[row*ROW_LEN_INT64], nCols ); - //update prev: it now points to the last row ever computed - prev = row; - - //updates row: goes to the next row to be computed - //---------------------------------------------------- - row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) - //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) - //---------------------------------------------------- - - } while (row != 0); - } - - //===================== Wrap-up Phase ===============================// - //Absorbs the last block of the 
memory matrix - absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]); - //Squeezes the key - squeeze(state, K, (unsigned int) kLen); - - return 0; + } while (row != 0); + } + + //===================== Wrap-up Phase ===============================// + // Absorbs the last block of the memory matrix + absorbBlock(state, &wholeMatrix[rowa * ROW_LEN_INT64]); + // Squeezes the key + squeeze(state, K, (unsigned int)kLen); + + return 0; } ///////////////////////////////////////////////// -int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, - const uint64_t pwdlen, const void *salt, const uint64_t saltlen, - const uint64_t timeCost, const uint64_t nRows, - const uint64_t nCols ) -{ - //====================== Basic variables ============================// - uint64_t _ALIGN(256) state[16]; - int64_t row = 2; //index of row to be processed - int64_t prev = 1; //index of prev (last row ever computed/modified) - int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) - int64_t tau; //Time Loop iterator - int64_t step = 1; //Visitation step (used during Setup and Wandering phases) - int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) - int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 - int64_t i; //auxiliary iteration counter - int64_t v64; // 64bit var for memcpy - uint64_t instance = 0; - //====================================================================/ - - //=== Initializing the Memory Matrix and pointers to it =============// - //Tries to allocate enough space for the whole memory matrix - - const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; -// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; - const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64; -/* - const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; -// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; - // for Lyra2REv2, nCols = 4, v1 was using 8 
- const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 - : BLOCK_LEN_BLAKE2_SAFE_BYTES; -*/ - - uint64_t *ptrWord = wholeMatrix; - -// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows ); - - //=== Getting the password + salt + basil padded with 10*1 ==========// - //OBS.:The memory matrix will temporarily hold the password: not for saving memory, - //but this ensures that the password copied locally will be overwritten as soon as possible - - //First, we clean enough blocks for the password, salt, basil and padding - int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) ) - / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1; - - byte *ptrByte = (byte*) wholeMatrix; - - //Prepends the password - memcpy(ptrByte, pwd, pwdlen); - ptrByte += pwdlen; - - //Concatenates the salt - memcpy(ptrByte, salt, saltlen); - ptrByte += saltlen; - - memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - - (saltlen + pwdlen) ); - - //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface - memcpy(ptrByte, &kLen, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = pwdlen; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = saltlen; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = timeCost; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = nRows; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = nCols; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - - //Now comes the padding - *ptrByte = 0x80; //first byte of padding: right after the password - ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix - ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block - *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block - -// from here on it's 
all simd acces to state and matrix -// define vector pointers and adjust sizes and pointer offsets - - //================= Initializing the Sponge State ====================// - //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) - - initState( state ); - - //========================= Setup Phase =============================// - //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits - - ptrWord = wholeMatrix; - for (i = 0; i < nBlocksInput; i++) - { - absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil) - ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil) - } - //Initializes M[0] and M[1] - reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here - - reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], - nCols); - - do - { - //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) - - reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64], - &wholeMatrix[rowa*ROW_LEN_INT64], - &wholeMatrix[row*ROW_LEN_INT64], nCols ); - - //updates the value of row* (deterministically picked during Setup)) - rowa = (rowa + step) & (window - 1); - //update prev: it now points to the last row ever computed +int LYRA2REV3(uint64_t *wholeMatrix, void *K, uint64_t kLen, const void *pwd, + const uint64_t pwdlen, const void *salt, const uint64_t saltlen, + const uint64_t timeCost, const uint64_t nRows, + const uint64_t nCols) { + //====================== Basic variables ============================// + uint64_t _ALIGN(256) state[16]; + int64_t row = 2; // index of row to be processed + int64_t prev = 1; // index of prev (last row ever computed/modified) + int64_t rowa = 0; // index of row* (a previous row, deterministically picked + // during Setup and randomly picked while Wandering) + int64_t tau; // Time Loop iterator + 
int64_t step = 1; // Visitation step (used during Setup and Wandering phases) + int64_t window = 2; // Visitation window (used to define which rows can be + // revisited during Setup) + int64_t gap = 1; // Modifier to the step, assuming the values 1 or -1 + // int64_t i; //auxiliary iteration counter + int64_t v64; // 64bit var for memcpy + uint64_t instance = 0; + //====================================================================/ + + //=== Initializing the Memory Matrix and pointers to it =============// + // Tries to allocate enough space for the whole memory matrix + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + // const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64; + /* + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + // const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + // for Lyra2REv2, nCols = 4, v1 was using 8 + const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 + : BLOCK_LEN_BLAKE2_SAFE_BYTES; + */ + + uint64_t *ptrWord = wholeMatrix; + + // memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows ); + + //=== Getting the password + salt + basil padded with 10*1 ==========// + // OBS.:The memory matrix will temporarily hold the password: not for saving + // memory, but this ensures that the password copied locally will be + // overwritten as soon as possible + + // First, we clean enough blocks for the password, salt, basil and padding + int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / + BLOCK_LEN_BLAKE2_SAFE_BYTES) + + 1; + + byte *ptrByte = (byte *)wholeMatrix; + + // Prepends the password + memcpy(ptrByte, pwd, pwdlen); + ptrByte += pwdlen; + + // Concatenates the salt + memcpy(ptrByte, salt, saltlen); + ptrByte += saltlen; + + memset(ptrByte, 0, + nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - (saltlen + pwdlen)); + + // Concatenates the basil: every integer passed as parameter, in the order + // they are provided by the interface + memcpy(ptrByte, 
&kLen, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = pwdlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = saltlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = timeCost; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nRows; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nCols; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + + // Now comes the padding + *ptrByte = 0x80; // first byte of padding: right after the password + ptrByte = (byte *) + wholeMatrix; // resets the pointer to the start of the memory matrix + ptrByte += + nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - + 1; // sets the pointer to the correct position: end of incomplete block + *ptrByte ^= + 0x01; // last byte of padding: at the end of the last incomplete block + + // from here on it's all simd acces to state and matrix + // define vector pointers and adjust sizes and pointer offsets + + //================= Initializing the Sponge State ====================// + // Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate + // (b) and the remainder for the capacity (c) + + // initState( state ); + + //========================= Setup Phase =============================// + // Absorbing salt, password and basil: this is the only place in which the + // block length is hard-coded to 512 bits + + ptrWord = wholeMatrix; + + absorbBlockBlake2Safe(state, ptrWord, nBlocksInput, BLOCK_LEN); + /* + for (i = 0; i < nBlocksInput; i++) + { + absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of + pad(pwd || salt || basil) ptrWord += BLOCK_LEN; //goes to next block of + pad(pwd || salt || basil) + } + */ + // Initializes M[0] and M[1] + reducedSqueezeRow0( + state, &wholeMatrix[0], + nCols); // The locally copied password is most likely overwritten here + + reducedDuplexRow1(state, &wholeMatrix[0], 
&wholeMatrix[ROW_LEN_INT64], nCols); + do { + // M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) + + reducedDuplexRowSetup(state, &wholeMatrix[prev * ROW_LEN_INT64], + &wholeMatrix[rowa * ROW_LEN_INT64], + &wholeMatrix[row * ROW_LEN_INT64], nCols); + + // updates the value of row* (deterministically picked during Setup)) + rowa = (rowa + step) & (window - 1); + // update prev: it now points to the last row ever computed + + prev = row; + // updates row: goes to the next row to be computed + row++; + + // Checks if all rows in the window where visited. + if (rowa == 0) { + step = window + gap; // changes the step: approximately doubles its value + window *= 2; // doubles the size of the re-visitation window + gap = -gap; // inverts the modifier to the step + } + } while (row < nRows); + + //===================== Wandering Phase =============================// + row = 0; // Resets the visitation to the first row of the memory matrix + for (tau = 1; tau <= timeCost; tau++) { + // Step is approximately half the number of all rows of the memory matrix + // for an odd tau; otherwise, it is -1 + step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1; + // step = (tau % 2 == 0) ? 
-1 : nRows / 2 - 1; + do { + // Selects a pseudorandom index row* + //----------------------------------------------- + instance = state[instance & 0xF]; + rowa = state[instance & 0xF] & (unsigned int)(nRows - 1); + // rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF + // nRows IS A POWER OF 2) + + // rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------- + + // Performs a reduced-round duplexing operation over M[row*] XOR M[prev], + // updating both M[row*] and M[row] + reducedDuplexRow(state, &wholeMatrix[prev * ROW_LEN_INT64], + &wholeMatrix[rowa * ROW_LEN_INT64], + &wholeMatrix[row * ROW_LEN_INT64], nCols); + // update prev: it now points to the last row ever computed prev = row; - //updates row: goes to the next row to be computed - row++; - //Checks if all rows in the window where visited. - if (rowa == 0) - { - step = window + gap; //changes the step: approximately doubles its value - window *= 2; //doubles the size of the re-visitation window - gap = -gap; //inverts the modifier to the step - } + // updates row: goes to the next row to be computed + //---------------------------------------------------- + row = (row + step) & + (unsigned int)(nRows - 1); //(USE THIS IF nRows IS A POWER OF 2) + // row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //---------------------------------------------------- - } while (row < nRows); - - //===================== Wandering Phase =============================// - row = 0; //Resets the visitation to the first row of the memory matrix - for (tau = 1; tau <= timeCost; tau++) - { - //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 - step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1; -// step = (tau % 2 == 0) ? 
-1 : nRows / 2 - 1; - do - { - //Selects a pseudorandom index row* - //----------------------------------------------- - instance = state[instance & 0xF]; - rowa = state[instance & 0xF] & (unsigned int)(nRows-1); -// rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) - - //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE) - //------------------------------------------- - - //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] - reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64], - &wholeMatrix[rowa*ROW_LEN_INT64], - &wholeMatrix[row*ROW_LEN_INT64], nCols ); - //update prev: it now points to the last row ever computed - prev = row; - - //updates row: goes to the next row to be computed - //---------------------------------------------------- - row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) - //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) - //---------------------------------------------------- - - } while (row != 0); - } - - //===================== Wrap-up Phase ===============================// - //Absorbs the last block of the memory matrix - absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]); - //Squeezes the key - squeeze(state, K, (unsigned int) kLen); - - return 0; -} + } while (row != 0); + } + //===================== Wrap-up Phase ===============================// + // Absorbs the last block of the memory matrix + absorbBlock(state, &wholeMatrix[rowa * ROW_LEN_INT64]); + // Squeezes the key + squeeze(state, K, (unsigned int)kLen); + return 0; +} ////////////////////////////////////////////////// -int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, - const uint64_t pwdlen, const void *salt, const uint64_t saltlen, - const uint64_t timeCost, const uint64_t nRows, - const uint64_t nCols ) -{ - //========================== Basic variables ============================// - uint64_t 
_ALIGN(256) state[16]; - int64_t row = 2; //index of row to be processed - int64_t prev = 1; //index of prev (last row ever computed/modified) - int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) - int64_t tau; //Time Loop iterator - int64_t step = 1; //Visitation step (used during Setup and Wandering phases) - int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) - int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 - int64_t i; //auxiliary iteration counter - //=======================================================================/ - - //======= Initializing the Memory Matrix and pointers to it =============// - //Tries to allocate enough space for the whole memory matrix - - const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; -// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; - -// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows ); - - //==== Getting the password + salt + basil padded with 10*1 ============// - //OBS.:The memory matrix will temporarily hold the password: not for saving memory, - //but this ensures that the password copied locally will be overwritten as soon as possible - - //First, we clean enough blocks for the password, salt, basil and padding - uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * - sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1; - byte *ptrByte = (byte*) wholeMatrix; - memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES ); - - //Prepends the password - memcpy(ptrByte, pwd, pwdlen); - ptrByte += pwdlen; - - //Concatenates the salt - memcpy(ptrByte, salt, saltlen); - ptrByte += saltlen; - //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface - memcpy(ptrByte, &kLen, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &pwdlen, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, 
&saltlen, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &timeCost, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &nRows, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &nCols, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - - //Now comes the padding - *ptrByte = 0x80; //first byte of padding: right after the password - ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix - ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block - *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block - - //=================== Initializing the Sponge State ====================// - //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) -// uint64_t *state = _mm_malloc(16 * sizeof(uint64_t), 32); -// if (state == NULL) { -// return -1; -// } - initState( state ); - - //============================== Setup Phase =============================// - //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits - uint64_t *ptrWord = wholeMatrix; - for ( i = 0; i < nBlocksInput; i++ ) - { - absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil) - ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil) +int LYRA2Z(uint64_t *wholeMatrix, void *K, uint64_t kLen, const void *pwd, + const uint64_t pwdlen, const void *salt, const uint64_t saltlen, + const uint64_t timeCost, const uint64_t nRows, + const uint64_t nCols) { + //========================== Basic variables ============================// + uint64_t _ALIGN(256) state[16]; + int64_t row = 2; // index of row to be processed + int64_t prev = 1; // index of prev (last row ever computed/modified) + int64_t rowa = 0; // index of row* (a previous row, 
deterministically picked + // during Setup and randomly picked while Wandering) + int64_t tau; // Time Loop iterator + int64_t step = 1; // Visitation step (used during Setup and Wandering phases) + int64_t window = 2; // Visitation window (used to define which rows can be + // revisited during Setup) + int64_t gap = 1; // Modifier to the step, assuming the values 1 or -1 + // int64_t i; //auxiliary iteration counter + //=======================================================================/ + + //======= Initializing the Memory Matrix and pointers to it =============// + // Tries to allocate enough space for the whole memory matrix + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + // const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + + // memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows ); + + //==== Getting the password + salt + basil padded with 10*1 ============// + // OBS.:The memory matrix will temporarily hold the password: not for saving + // memory, but this ensures that the password copied locally will be + // overwritten as soon as possible + + // First, we clean enough blocks for the password, salt, basil and padding + uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / + BLOCK_LEN_BLAKE2_SAFE_BYTES) + + 1; + byte *ptrByte = (byte *)wholeMatrix; + memset(ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES); + + // Prepends the password + memcpy(ptrByte, pwd, pwdlen); + ptrByte += pwdlen; + + // Concatenates the salt + memcpy(ptrByte, salt, saltlen); + ptrByte += saltlen; + // Concatenates the basil: every integer passed as parameter, in the order + // they are provided by the interface + memcpy(ptrByte, &kLen, sizeof(uint64_t)); + ptrByte += sizeof(uint64_t); + memcpy(ptrByte, &pwdlen, sizeof(uint64_t)); + ptrByte += sizeof(uint64_t); + memcpy(ptrByte, &saltlen, sizeof(uint64_t)); + ptrByte += sizeof(uint64_t); + memcpy(ptrByte, &timeCost, sizeof(uint64_t)); + ptrByte += sizeof(uint64_t); + memcpy(ptrByte, &nRows, 
sizeof(uint64_t)); + ptrByte += sizeof(uint64_t); + memcpy(ptrByte, &nCols, sizeof(uint64_t)); + ptrByte += sizeof(uint64_t); + + // Now comes the padding + *ptrByte = 0x80; // first byte of padding: right after the password + ptrByte = (byte *) + wholeMatrix; // resets the pointer to the start of the memory matrix + ptrByte += + nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - + 1; // sets the pointer to the correct position: end of incomplete block + *ptrByte ^= + 0x01; // last byte of padding: at the end of the last incomplete block + + //=================== Initializing the Sponge State ====================// + // Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate + // (b) and the remainder for the capacity (c) + // uint64_t *state = _mm_malloc(16 * sizeof(uint64_t), 32); + // if (state == NULL) { + // return -1; + // } + // initState( state ); + + //============================== Setup Phase =============================// + // Absorbing salt, password and basil: this is the only place in which the + // block length is hard-coded to 512 bits + uint64_t *ptrWord = wholeMatrix; + + absorbBlockBlake2Safe(state, ptrWord, nBlocksInput, + BLOCK_LEN_BLAKE2_SAFE_INT64); + /* + for ( i = 0; i < nBlocksInput; i++ ) + { + absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd + || salt || basil) ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next + block of pad(pwd || salt || basil) + } + */ + // Initializes M[0] and M[1] + reducedSqueezeRow0( + state, &wholeMatrix[0], + nCols); // The locally copied password is most likely overwritten here + reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols); + + do { + // M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) + reducedDuplexRowSetup(state, &wholeMatrix[prev * ROW_LEN_INT64], + &wholeMatrix[rowa * ROW_LEN_INT64], + &wholeMatrix[row * ROW_LEN_INT64], nCols); + + // updates the value of row* (deterministically picked during Setup)) + rowa = (rowa + step) & 
(window - 1); + // update prev: it now points to the last row ever computed + prev = row; + // updates row: goes to the next row to be computed + row++; + + // Checks if all rows in the window where visited. + if (rowa == 0) { + step = window + gap; // changes the step: approximately doubles its value + window *= 2; // doubles the size of the re-visitation window + gap = -gap; // inverts the modifier to the step } - //Initializes M[0] and M[1] - reducedSqueezeRow0(state, &wholeMatrix[0], nCols); //The locally copied password is most likely overwritten here - reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols); - - do { - //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) - reducedDuplexRowSetup(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols); - - //updates the value of row* (deterministically picked during Setup)) - rowa = (rowa + step) & (window - 1); - //update prev: it now points to the last row ever computed - prev = row; - //updates row: goes to the next row to be computed - row++; - - //Checks if all rows in the window where visited. - if (rowa == 0) { - step = window + gap; //changes the step: approximately doubles its value - window *= 2; //doubles the size of the re-visitation window - gap = -gap; //inverts the modifier to the step - } - - } while (row < nRows); - - //======================== Wandering Phase =============================// - row = 0; //Resets the visitation to the first row of the memory matrix - for ( tau = 1; tau <= timeCost; tau++ ) - { - //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 - step = (tau % 2 == 0) ? 
-1 : nRows / 2 - 1; - do { - //Selects a pseudorandom index row* - //---------------------------------------------------------------------- - //rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) - rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE) - //----------------------------------------------------------------- - - //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] - reducedDuplexRow(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols); - - //update prev: it now points to the last row ever computed - prev = row; - - //updates row: goes to the next row to be computed - //--------------------------------------------------------------- - //row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) - row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) - //-------------------------------------------------------------------- - - } while (row != 0); - } + } while (row < nRows); + + //======================== Wandering Phase =============================// + row = 0; // Resets the visitation to the first row of the memory matrix + for (tau = 1; tau <= timeCost; tau++) { + // Step is approximately half the number of all rows of the memory matrix + // for an odd tau; otherwise, it is -1 + step = (tau % 2 == 0) ? 
-1 : nRows / 2 - 1; + do { + // Selects a pseudorandom index row* + //---------------------------------------------------------------------- + // rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A + // POWER OF 2) + rowa = ((uint64_t)(state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //----------------------------------------------------------------- + + // Performs a reduced-round duplexing operation over M[row*] XOR M[prev], + // updating both M[row*] and M[row] + reducedDuplexRow(state, &wholeMatrix[prev * ROW_LEN_INT64], + &wholeMatrix[rowa * ROW_LEN_INT64], + &wholeMatrix[row * ROW_LEN_INT64], nCols); + + // update prev: it now points to the last row ever computed + prev = row; + + // updates row: goes to the next row to be computed + //--------------------------------------------------------------- + // row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER + // OF 2) + row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //-------------------------------------------------------------------- - //========================= Wrap-up Phase ===============================// - //Absorbs the last block of the memory matrix - absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]); + } while (row != 0); + } - //Squeezes the key - squeeze( state, K, kLen ); + //========================= Wrap-up Phase ===============================// + // Absorbs the last block of the memory matrix + absorbBlock(state, &wholeMatrix[rowa * ROW_LEN_INT64]); - return 0; + // Squeezes the key + squeeze(state, K, kLen); + + return 0; } // Lyra2RE doesn't like the new wholeMatrix implementation -int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen, - const void *salt, const uint64_t saltlen, const uint64_t timeCost, - const uint64_t nRows, const uint64_t nCols ) -{ - //====================== Basic variables ============================// - uint64_t _ALIGN(256) state[16]; - int64_t row = 2; //index of row to be processed - 
int64_t prev = 1; //index of prev (last row ever computed/modified) - int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) - int64_t tau; //Time Loop iterator - int64_t step = 1; //Visitation step (used during Setup and Wandering phases) - int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) - int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 - int64_t i; //auxiliary iteration counter - int64_t v64; // 64bit var for memcpy - //====================================================================/ - - //=== Initializing the Memory Matrix and pointers to it =============// - //Tries to allocate enough space for the whole memory matrix - - const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; - const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; - // for Lyra2REv2, nCols = 4, v1 was using 8 - const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 - : BLOCK_LEN_BLAKE2_SAFE_BYTES; - - i = (int64_t)ROW_LEN_BYTES * nRows; - uint64_t *wholeMatrix = _mm_malloc( i, 64 ); - if (wholeMatrix == NULL) - return -1; +int LYRA2RE(void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen, + const void *salt, const uint64_t saltlen, const uint64_t timeCost, + const uint64_t nRows, const uint64_t nCols) { + //====================== Basic variables ============================// + uint64_t _ALIGN(256) state[16]; + int64_t row = 2; // index of row to be processed + int64_t prev = 1; // index of prev (last row ever computed/modified) + int64_t rowa = 0; // index of row* (a previous row, deterministically picked + // during Setup and randomly picked while Wandering) + int64_t tau; // Time Loop iterator + int64_t step = 1; // Visitation step (used during Setup and Wandering phases) + int64_t window = 2; // Visitation window (used to define which rows can be + // revisited during Setup) + int64_t gap = 1; // Modifier to the step, assuming 
the values 1 or -1 + int64_t i; // auxiliary iteration counter + int64_t v64; // 64bit var for memcpy + //====================================================================/ + + //=== Initializing the Memory Matrix and pointers to it =============// + // Tries to allocate enough space for the whole memory matrix + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + // for Lyra2REv2, nCols = 4, v1 was using 8 + const int64_t BLOCK_LEN = + (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES; + + i = (int64_t)ROW_LEN_BYTES * nRows; + uint64_t *wholeMatrix = _mm_malloc(i, 64); + if (wholeMatrix == NULL) + return -1; #if defined(__AVX2__) - memset_zero_256( (__m256i*)wholeMatrix, i>>5 ); + memset_zero_256((__m256i *)wholeMatrix, i >> 5); #elif defined(__SSE2__) - memset_zero_128( (__m128i*)wholeMatrix, i>>4 ); + memset_zero_128((__m128i *)wholeMatrix, i >> 4); #else - memset( wholeMatrix, 0, i ); + memset(wholeMatrix, 0, i); #endif - uint64_t *ptrWord = wholeMatrix; - - //=== Getting the password + salt + basil padded with 10*1 ==========// - //OBS.:The memory matrix will temporarily hold the password: not for saving memory, - //but this ensures that the password copied locally will be overwritten as soon as possible - - //First, we clean enough blocks for the password, salt, basil and padding - int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) ) - / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1; - - byte *ptrByte = (byte*) wholeMatrix; - - //Prepends the password - memcpy(ptrByte, pwd, pwdlen); - ptrByte += pwdlen; - - //Concatenates the salt - memcpy(ptrByte, salt, saltlen); - ptrByte += saltlen; - -// memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES -// - (saltlen + pwdlen) ); - - //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface - memcpy(ptrByte, &kLen, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = 
pwdlen; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = saltlen; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = timeCost; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = nRows; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = nCols; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - - //Now comes the padding - *ptrByte = 0x80; //first byte of padding: right after the password - ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix - ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block - *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block - - //================= Initializing the Sponge State ====================// - //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) - - initState( state ); - - //========================= Setup Phase =============================// - //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits - - ptrWord = wholeMatrix; - for (i = 0; i < nBlocksInput; i++) - { - absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil) - ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil) - } - //Initializes M[0] and M[1] - reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here - - reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], - nCols); - - do - { - //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) - - reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64], - &wholeMatrix[rowa*ROW_LEN_INT64], - &wholeMatrix[row*ROW_LEN_INT64], nCols ); - - //updates the value of row* (deterministically picked during 
Setup)) - rowa = (rowa + step) & (window - 1); - //update prev: it now points to the last row ever computed + uint64_t *ptrWord = wholeMatrix; + + //=== Getting the password + salt + basil padded with 10*1 ==========// + // OBS.:The memory matrix will temporarily hold the password: not for saving + // memory, but this ensures that the password copied locally will be + // overwritten as soon as possible + + // First, we clean enough blocks for the password, salt, basil and padding + int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / + BLOCK_LEN_BLAKE2_SAFE_BYTES) + + 1; + + byte *ptrByte = (byte *)wholeMatrix; + + // Prepends the password + memcpy(ptrByte, pwd, pwdlen); + ptrByte += pwdlen; + + // Concatenates the salt + memcpy(ptrByte, salt, saltlen); + ptrByte += saltlen; + + // memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES + // - (saltlen + pwdlen) ); + + // Concatenates the basil: every integer passed as parameter, in the order + // they are provided by the interface + memcpy(ptrByte, &kLen, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = pwdlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = saltlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = timeCost; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nRows; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nCols; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + + // Now comes the padding + *ptrByte = 0x80; // first byte of padding: right after the password + ptrByte = (byte *) + wholeMatrix; // resets the pointer to the start of the memory matrix + ptrByte += + nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - + 1; // sets the pointer to the correct position: end of incomplete block + *ptrByte ^= + 0x01; // last byte of padding: at the end of the last incomplete block + + //================= Initializing the Sponge 
State ====================// + // Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate + // (b) and the remainder for the capacity (c) + + // initState( state ); + + //========================= Setup Phase =============================// + // Absorbing salt, password and basil: this is the only place in which the + // block length is hard-coded to 512 bits + + ptrWord = wholeMatrix; + + absorbBlockBlake2Safe(state, ptrWord, nBlocksInput, BLOCK_LEN); + /* + for (i = 0; i < nBlocksInput; i++) + { + absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of + pad(pwd || salt || basil) ptrWord += BLOCK_LEN; //goes to next block of + pad(pwd || salt || basil) + } + */ + // Initializes M[0] and M[1] + reducedSqueezeRow0( + state, &wholeMatrix[0], + nCols); // The locally copied password is most likely overwritten here + + reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols); + + do { + // M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) + + reducedDuplexRowSetup(state, &wholeMatrix[prev * ROW_LEN_INT64], + &wholeMatrix[rowa * ROW_LEN_INT64], + &wholeMatrix[row * ROW_LEN_INT64], nCols); + + // updates the value of row* (deterministically picked during Setup)) + rowa = (rowa + step) & (window - 1); + // update prev: it now points to the last row ever computed + + prev = row; + // updates row: goes to the next row to be computed + row++; + + // Checks if all rows in the window where visited. 
+ if (rowa == 0) { + step = window + gap; // changes the step: approximately doubles its value + window *= 2; // doubles the size of the re-visitation window + gap = -gap; // inverts the modifier to the step + } + } while (row < nRows); + + //===================== Wandering Phase =============================// + row = 0; // Resets the visitation to the first row of the memory matrix + for (tau = 1; tau <= timeCost; tau++) { + // Step is approximately half the number of all rows of the memory matrix + // for an odd tau; otherwise, it is -1 + step = (tau % 2 == 0) ? -1 : nRows / 2 - 1; + do { + // Selects a pseudorandom index row* + //----------------------------------------------- + rowa = state[0] & + (unsigned int)(nRows - 1); //(USE THIS IF nRows IS A POWER OF 2) + + // rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------- + + // Performs a reduced-round duplexing operation over M[row*] XOR M[prev], + // updating both M[row*] and M[row] + reducedDuplexRow(state, &wholeMatrix[prev * ROW_LEN_INT64], + &wholeMatrix[rowa * ROW_LEN_INT64], + &wholeMatrix[row * ROW_LEN_INT64], nCols); + // update prev: it now points to the last row ever computed prev = row; - //updates row: goes to the next row to be computed - row++; - //Checks if all rows in the window where visited. 
- if (rowa == 0) - { - step = window + gap; //changes the step: approximately doubles its value - window *= 2; //doubles the size of the re-visitation window - gap = -gap; //inverts the modifier to the step - } + // updates row: goes to the next row to be computed + //---------------------------------------------------- + row = (row + step) & + (unsigned int)(nRows - 1); //(USE THIS IF nRows IS A POWER OF 2) + // row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //---------------------------------------------------- - } while (row < nRows); - - //===================== Wandering Phase =============================// - row = 0; //Resets the visitation to the first row of the memory matrix - for (tau = 1; tau <= timeCost; tau++) - { - //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 - step = (tau % 2 == 0) ? -1 : nRows / 2 - 1; - do - { - //Selects a pseudorandom index row* - //----------------------------------------------- - rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) - - //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE) - //------------------------------------------- - - //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] - reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64], - &wholeMatrix[rowa*ROW_LEN_INT64], - &wholeMatrix[row*ROW_LEN_INT64], nCols ); - //update prev: it now points to the last row ever computed - prev = row; - - //updates row: goes to the next row to be computed - //---------------------------------------------------- - row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) - //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) - //---------------------------------------------------- - - } while (row != 0); - } - - //===================== Wrap-up Phase ===============================// - //Absorbs the last block of the 
memory matrix - absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]); - //Squeezes the key - squeeze(state, K, (unsigned int) kLen); - - //================== Freeing the memory =============================// - _mm_free(wholeMatrix); - - return 0; -} + } while (row != 0); + } + + //===================== Wrap-up Phase ===============================// + // Absorbs the last block of the memory matrix + absorbBlock(state, &wholeMatrix[rowa * ROW_LEN_INT64]); + // Squeezes the key + squeeze(state, K, (unsigned int)kLen); + //================== Freeing the memory =============================// + _mm_free(wholeMatrix); + + return 0; +} diff --git a/algo/lyra2/lyra2.h b/algo/lyra2/lyra2.h index 3c2399e..5ab0b81 100644 --- a/algo/lyra2/lyra2.h +++ b/algo/lyra2/lyra2.h @@ -60,4 +60,23 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd, int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols); +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, + uint64_t timeCost, uint64_t nRows, uint64_t nCols ); + +int LYRA2REV2_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd, + uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols ); + +int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd, + uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols ); + +int LYRA2Z_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd, + uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols ); + +int LYRA2X_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, + uint64_t timeCost, uint64_t nRows, uint64_t nCols ); + +#endif + #endif /* LYRA2_H_ */ diff --git a/algo/lyra2/lyra2h-4way.c b/algo/lyra2/lyra2h-4way.c deleted file mode 100644 index a76e68c..0000000 --- 
a/algo/lyra2/lyra2h-4way.c +++ /dev/null @@ -1,89 +0,0 @@ -#include "lyra2-gate.h" - -#ifdef LYRA2H_4WAY - -#include -#include -#include "lyra2.h" -//#include "algo/blake/sph_blake.h" -#include "algo/blake/blake-hash-4way.h" - -__thread uint64_t* lyra2h_4way_matrix; - -bool lyra2h_4way_thread_init() -{ - return ( lyra2h_4way_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) ); -} - -static __thread blake256_4way_context l2h_4way_blake_mid; - -void lyra2h_4way_midstate( const void* input ) -{ - blake256_4way_init( &l2h_4way_blake_mid ); - blake256_4way( &l2h_4way_blake_mid, input, 64 ); -} - -void lyra2h_4way_hash( void *state, const void *input ) -{ - uint32_t hash0[8] __attribute__ ((aligned (64))); - uint32_t hash1[8] __attribute__ ((aligned (64))); - uint32_t hash2[8] __attribute__ ((aligned (64))); - uint32_t hash3[8] __attribute__ ((aligned (64))); - uint32_t vhash[8*4] __attribute__ ((aligned (64))); - blake256_4way_context ctx_blake __attribute__ ((aligned (64))); - - memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid ); - blake256_4way( &ctx_blake, input + (64*4), 16 ); - blake256_4way_close( &ctx_blake, vhash ); - - dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 ); - - LYRA2Z( lyra2h_4way_matrix, state, 32, hash0, 32, hash0, 32, - 16, 16, 16 ); - LYRA2Z( lyra2h_4way_matrix, state+32, 32, hash1, 32, hash1, - 32, 16, 16, 16 ); - LYRA2Z( lyra2h_4way_matrix, state+64, 32, hash2, 32, hash2, - 32, 16, 16, 16 ); - LYRA2Z( lyra2h_4way_matrix, state+96, 32, hash3, 32, hash3, - 32, 16, 16, 16 ); -} - -int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[8*4] __attribute__ ((aligned (64))); - uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - __m128i *noncev = (__m128i*)vdata + 19; // aligned - 
int thr_id = mythr->id; // thr_id arg is deprecated - - if ( opt_benchmark ) - ptarget[7] = 0x0000ff; - - mm128_bswap32_intrlv80_4x32( vdata, pdata ); - lyra2h_4way_midstate( vdata ); - - do { - *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) ); - lyra2h_4way_hash( hash, vdata ); - - for ( int i = 0; i < 4; i++ ) - if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) - && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( (n < max_nonce-4) && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif - diff --git a/algo/lyra2/lyra2h.c b/algo/lyra2/lyra2h.c deleted file mode 100644 index 27b5a53..0000000 --- a/algo/lyra2/lyra2h.c +++ /dev/null @@ -1,73 +0,0 @@ -#include "lyra2-gate.h" -#include -#include -#include "lyra2.h" -#include "algo/blake/sph_blake.h" - -__thread uint64_t* lyra2h_matrix; - -bool lyra2h_thread_init() -{ - lyra2h_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 ); - return lyra2h_matrix; -} - -static __thread sph_blake256_context lyra2h_blake_mid; - -void lyra2h_midstate( const void* input ) -{ - sph_blake256_init( &lyra2h_blake_mid ); - sph_blake256( &lyra2h_blake_mid, input, 64 ); -} - -void lyra2h_hash( void *state, const void *input ) -{ - uint32_t _ALIGN(64) hash[16]; - - sph_blake256_context ctx_blake __attribute__ ((aligned (64))); - - memcpy( &ctx_blake, &lyra2h_blake_mid, sizeof lyra2h_blake_mid ); - sph_blake256( &ctx_blake, input + 64, 16 ); - sph_blake256_close( &ctx_blake, hash ); - - LYRA2Z( lyra2h_matrix, hash, 32, hash, 32, hash, 32, 16, 16, 16 ); - - memcpy(state, hash, 32); -} - -int scanhash_lyra2h( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(64) hash[8]; - uint32_t _ALIGN(64) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = 
pdata[19]; - uint32_t nonce = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - - if (opt_benchmark) - ptarget[7] = 0x0000ff; - - for (int i=0; i < 19; i++) { - be32enc(&endiandata[i], pdata[i]); - } - - lyra2h_midstate( endiandata ); - do { - be32enc(&endiandata[19], nonce); - lyra2h_hash( hash, endiandata ); - - if ( hash[7] <= Htarg ) - if ( fulltest( hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = nonce; - submit_solution( work, hash, mythr ); - } - nonce++; - } while (nonce < max_nonce && !work_restart[thr_id].restart); - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} diff --git a/algo/lyra2/lyra2re.c b/algo/lyra2/lyra2re.c deleted file mode 100644 index 5766b79..0000000 --- a/algo/lyra2/lyra2re.c +++ /dev/null @@ -1,136 +0,0 @@ -#include - -#include "algo/blake/sph_blake.h" -#include "algo/groestl/sph_groestl.h" -#include "algo/skein/sph_skein.h" -#include "algo/keccak/sph_keccak.h" -#include "lyra2.h" -#include "algo-gate-api.h" -#include "simd-utils.h" -#if defined(__AES__) - #include "algo/groestl/aes_ni/hash-groestl256.h" -#endif - -//__thread uint64_t* lyra2re_wholeMatrix; - -typedef struct { - sph_blake256_context blake; - sph_keccak256_context keccak; - sph_skein256_context skein; -#if defined(__AES__) - hashState_groestl256 groestl; -#else - sph_groestl256_context groestl; -#endif -} lyra2re_ctx_holder; - -lyra2re_ctx_holder lyra2re_ctx; -static __thread sph_blake256_context lyra2_blake_mid; - -void init_lyra2re_ctx() -{ - sph_blake256_init(&lyra2re_ctx.blake); - sph_keccak256_init(&lyra2re_ctx.keccak); - sph_skein256_init(&lyra2re_ctx.skein); -#if defined(__AES__) - init_groestl256( &lyra2re_ctx.groestl, 32 ); -#else - sph_groestl256_init(&lyra2re_ctx.groestl); -#endif -} - -void lyra2_blake256_midstate( const void* input ) -{ - memcpy( &lyra2_blake_mid, &lyra2re_ctx.blake, sizeof lyra2_blake_mid ); - sph_blake256( &lyra2_blake_mid, input, 64 ); -} - -void lyra2re_hash(void *state, const void 
*input) -{ - lyra2re_ctx_holder ctx __attribute__ ((aligned (64))) ; - memcpy(&ctx, &lyra2re_ctx, sizeof(lyra2re_ctx)); - - uint8_t _ALIGN(64) hash[32*8]; - #define hashA hash - #define hashB hash+16 - - const int midlen = 64; // bytes - const int tail = 80 - midlen; // 16 - - memcpy( &ctx.blake, &lyra2_blake_mid, sizeof lyra2_blake_mid ); - sph_blake256( &ctx.blake, input + midlen, tail ); - - sph_blake256_close(&ctx.blake, hashA); - - sph_keccak256(&ctx.keccak, hashA, 32); - sph_keccak256_close(&ctx.keccak, hashB); - - LYRA2RE( hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); -// LYRA2RE( lyra2re_wholeMatrix, hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); - - sph_skein256(&ctx.skein, hashA, 32); - sph_skein256_close(&ctx.skein, hashB); - -#if defined(__AES__) - update_and_final_groestl256( &ctx.groestl, hashA, hashB, 256 ); -#else - sph_groestl256( &ctx.groestl, hashB, 32 ); - sph_groestl256_close( &ctx.groestl, hashA ); -#endif - - memcpy(state, hashA, 32); -} - -int scanhash_lyra2re( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t _ALIGN(64) endiandata[20]; - uint32_t hash[8] __attribute__((aligned(64))); - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - - swab32_array( endiandata, pdata, 20 ); - - lyra2_blake256_midstate( endiandata ); - - do { - be32enc(&endiandata[19], nonce); - lyra2re_hash(hash, endiandata); - if ( hash[7] <= Htarg ) - if ( fulltest(hash, ptarget) && !opt_benchmark ) - { - pdata[19] = nonce; - submit_solution( work, hash, mythr ); - } - nonce++; - } while (nonce < max_nonce && !work_restart[thr_id].restart); - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} - -int64_t lyra2re_get_max64 () -{ - return 0xffffLL; -} - -void lyra2re_set_target ( struct work* work, double job_diff ) -{ - 
work_set_target(work, job_diff / (128.0 * opt_diff_factor) ); -} - -bool register_lyra2re_algo( algo_gate_t* gate ) -{ - init_lyra2re_ctx(); - gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT; - gate->scanhash = (void*)&scanhash_lyra2re; - gate->hash = (void*)&lyra2re_hash; - gate->get_max64 = (void*)&lyra2re_get_max64; - gate->set_target = (void*)&lyra2re_set_target; - return true; -}; - diff --git a/algo/lyra2/lyra2rev2-4way.c b/algo/lyra2/lyra2rev2-4way.c deleted file mode 100644 index 9832fb1..0000000 --- a/algo/lyra2/lyra2rev2-4way.c +++ /dev/null @@ -1,132 +0,0 @@ -#include "lyra2-gate.h" -#include - -#if defined (LYRA2REV2_4WAY) - -#include "algo/blake/blake-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/cubehash/cubehash_sse2.h" - -typedef struct { - blake256_4way_context blake; - keccak256_4way_context keccak; - cubehashParam cube; - skein256_4way_context skein; - bmw256_4way_context bmw; -} lyra2v2_4way_ctx_holder; - -static lyra2v2_4way_ctx_holder l2v2_4way_ctx; - -bool init_lyra2rev2_4way_ctx() -{ - keccak256_4way_init( &l2v2_4way_ctx.keccak ); - cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 ); - skein256_4way_init( &l2v2_4way_ctx.skein ); - bmw256_4way_init( &l2v2_4way_ctx.bmw ); - return true; -} - -void lyra2rev2_4way_hash( void *state, const void *input ) -{ - uint32_t hash0[8] __attribute__ ((aligned (64))); - uint32_t hash1[8] __attribute__ ((aligned (32))); - uint32_t hash2[8] __attribute__ ((aligned (32))); - uint32_t hash3[8] __attribute__ ((aligned (32))); - uint32_t vhash[8*4] __attribute__ ((aligned (64))); - uint64_t vhash64[4*4] __attribute__ ((aligned (64))); - lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) ); - - blake256_4way( &ctx.blake, input + (64<<2), 16 ); - blake256_4way_close( &ctx.blake, vhash ); - - rintrlv_4x32_4x64( vhash64, vhash, 256 ); - - 
keccak256_4way( &ctx.keccak, vhash64, 32 ); - keccak256_4way_close( &ctx.keccak, vhash64 ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 ); - - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 ); - - LYRA2REV2( l2v2_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 ); - LYRA2REV2( l2v2_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 ); - LYRA2REV2( l2v2_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 ); - LYRA2REV2( l2v2_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 ); - - intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 ); - - skein256_4way( &ctx.skein, vhash64, 32 ); - skein256_4way_close( &ctx.skein, vhash64 ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 ); - - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 ); - - intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 ); - - bmw256_4way( &ctx.bmw, vhash, 32 ); - bmw256_4way_close( &ctx.bmw, state ); -} - -int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[8*4] __attribute__ ((aligned (64))); - uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[7<<2]); - 
uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - const uint32_t Htarg = ptarget[7]; - __m128i *noncev = (__m128i*)vdata + 19; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - - if ( opt_benchmark ) - ( (uint32_t*)ptarget )[7] = 0x0000ff; - - mm128_bswap32_intrlv80_4x32( vdata, pdata ); - - blake256_4way_init( &l2v2_4way_ctx.blake ); - blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 ); - - do - { - *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) ); - - lyra2rev2_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg ) - { - extr_lane_4x32( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_lane_solution( work, lane_hash, mythr, lane ); - } - } - n += 4; - } while ( (n < max_nonce-4) && !work_restart[thr_id].restart); - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/lyra2/lyra2rev2.c b/algo/lyra2/lyra2rev2.c deleted file mode 100644 index 618c045..0000000 --- a/algo/lyra2/lyra2rev2.c +++ /dev/null @@ -1,110 +0,0 @@ -#include "lyra2-gate.h" -#include -#include "algo/blake/sph_blake.h" -#include "algo/cubehash/sph_cubehash.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" -#include "algo/bmw/sph_bmw.h" -#include "algo/cubehash/cubehash_sse2.h" -//#include "lyra2.h" - -typedef struct { - cubehashParam cube1; - cubehashParam cube2; - sph_blake256_context blake; - sph_keccak256_context keccak; - sph_skein256_context skein; - sph_bmw256_context bmw; - -} lyra2v2_ctx_holder; - -static lyra2v2_ctx_holder lyra2v2_ctx; -static __thread sph_blake256_context l2v2_blake_mid; - -bool init_lyra2rev2_ctx() -{ - cubehashInit( &lyra2v2_ctx.cube1, 256, 16, 32 ); - cubehashInit( &lyra2v2_ctx.cube2, 256, 16, 32 ); - 
sph_blake256_init( &lyra2v2_ctx.blake ); - sph_keccak256_init( &lyra2v2_ctx.keccak ); - sph_skein256_init( &lyra2v2_ctx.skein ); - sph_bmw256_init( &lyra2v2_ctx.bmw ); - return true; -} - -void l2v2_blake256_midstate( const void* input ) -{ - memcpy( &l2v2_blake_mid, &lyra2v2_ctx.blake, sizeof l2v2_blake_mid ); - sph_blake256( &l2v2_blake_mid, input, 64 ); -} - -void lyra2rev2_hash( void *state, const void *input ) -{ - lyra2v2_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &lyra2v2_ctx, sizeof(lyra2v2_ctx) ); - uint8_t hash[128] __attribute__ ((aligned (64))); - #define hashA hash - #define hashB hash+64 - const int midlen = 64; // bytes - const int tail = 80 - midlen; // 16 - - memcpy( &ctx.blake, &l2v2_blake_mid, sizeof l2v2_blake_mid ); - sph_blake256( &ctx.blake, (uint8_t*)input + midlen, tail ); - sph_blake256_close( &ctx.blake, hashA ); - - sph_keccak256( &ctx.keccak, hashA, 32 ); - sph_keccak256_close(&ctx.keccak, hashB); - - cubehashUpdateDigest( &ctx.cube1, (byte*) hashA, - (const byte*) hashB, 32 ); - - LYRA2REV2( l2v2_wholeMatrix, hashA, 32, hashA, 32, hashA, 32, 1, 4, 4 ); - - sph_skein256( &ctx.skein, hashA, 32 ); - sph_skein256_close( &ctx.skein, hashB ); - - cubehashUpdateDigest( &ctx.cube2, (byte*) hashA, - (const byte*) hashB, 32 ); - - sph_bmw256( &ctx.bmw, hashA, 32 ); - sph_bmw256_close( &ctx.bmw, hashB ); - - memcpy( state, hashB, 32 ); -} - -int scanhash_lyra2rev2( struct work *work, - uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t endiandata[20] __attribute__ ((aligned (64))); - uint32_t hash[8] __attribute__((aligned(64))); - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; - - swab32_array( endiandata, pdata, 20 ); - - l2v2_blake256_midstate( 
endiandata ); - - do { - be32enc(&endiandata[19], nonce); - lyra2rev2_hash(hash, endiandata); - - if (hash[7] <= Htarg ) - if( fulltest( hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = nonce; - submit_solution( work, hash, mythr ); - } - nonce++; - } while ( nonce < max_nonce && !work_restart[thr_id].restart ); - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} - diff --git a/algo/lyra2/lyra2rev3-4way.c b/algo/lyra2/lyra2rev3-4way.c deleted file mode 100644 index 389aebf..0000000 --- a/algo/lyra2/lyra2rev3-4way.c +++ /dev/null @@ -1,230 +0,0 @@ -#include "lyra2-gate.h" -#include - -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/cubehash/cubehash_sse2.h" - - -#if defined (LYRA2REV3_8WAY) - -typedef struct { - blake256_8way_context blake; - cubehashParam cube; - bmw256_8way_context bmw; -} lyra2v3_8way_ctx_holder; - -static lyra2v3_8way_ctx_holder l2v3_8way_ctx; - -bool init_lyra2rev3_8way_ctx() -{ - blake256_8way_init( &l2v3_8way_ctx.blake ); - cubehashInit( &l2v3_8way_ctx.cube, 256, 16, 32 ); - bmw256_8way_init( &l2v3_8way_ctx.bmw ); - return true; -} - -void lyra2rev3_8way_hash( void *state, const void *input ) -{ - uint32_t vhash[8*8] __attribute__ ((aligned (64))); - uint32_t hash0[8] __attribute__ ((aligned (64))); - uint32_t hash1[8] __attribute__ ((aligned (32))); - uint32_t hash2[8] __attribute__ ((aligned (32))); - uint32_t hash3[8] __attribute__ ((aligned (32))); - uint32_t hash4[8] __attribute__ ((aligned (32))); - uint32_t hash5[8] __attribute__ ((aligned (32))); - uint32_t hash6[8] __attribute__ ((aligned (32))); - uint32_t hash7[8] __attribute__ ((aligned (32))); - lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) ); - - blake256_8way( &ctx.blake, input, 80 ); - blake256_8way_close( &ctx.blake, vhash ); - - dintrlv_8x32( hash0, hash1, hash2, hash3, - hash4, hash5, hash6, hash7, vhash, 256 ); - - LYRA2REV3( 
l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 ); - LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 ); - LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 ); - LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 ); - LYRA2REV3( l2v3_wholeMatrix, hash4, 32, hash4, 32, hash4, 32, 1, 4, 4 ); - LYRA2REV3( l2v3_wholeMatrix, hash5, 32, hash5, 32, hash5, 32, 1, 4, 4 ); - LYRA2REV3( l2v3_wholeMatrix, hash6, 32, hash6, 32, hash6, 32, 1, 4, 4 ); - LYRA2REV3( l2v3_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 ); - - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash4, (const byte*) hash4, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash5, (const byte*) hash5, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash6, (const byte*) hash6, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash7, (const byte*) hash7, 32 ); - - LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 ); - LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 ); - LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 ); - LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 ); - LYRA2REV3( l2v3_wholeMatrix, hash4, 32, hash4, 32, hash4, 32, 1, 4, 4 ); - LYRA2REV3( l2v3_wholeMatrix, hash5, 32, hash5, 32, hash5, 32, 1, 4, 4 ); - LYRA2REV3( l2v3_wholeMatrix, hash6, 32, hash6, 
32, hash6, 32, 1, 4, 4 ); - LYRA2REV3( l2v3_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 ); - - intrlv_8x32( vhash, hash0, hash1, hash2, hash3, - hash4, hash5, hash6, hash7, 256 ); - - bmw256_8way( &ctx.bmw, vhash, 32 ); - bmw256_8way_close( &ctx.bmw, state ); - - } - -int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[8*8] __attribute__ ((aligned (64))); - uint32_t vdata[20*8] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[7<<3]); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - const uint32_t Htarg = ptarget[7]; - __m256i *noncev = (__m256i*)vdata + 19; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - - if ( opt_benchmark ) - ( (uint32_t*)ptarget )[7] = 0x0000ff; - - mm256_bswap32_intrlv80_8x32( vdata, pdata ); - do - { - *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4, - n+3, n+2, n+1, n ) ); - - lyra2rev3_8way_hash( hash, vdata ); - pdata[19] = n; - - for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg ) - { - extr_lane_8x32( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_lane_solution( work, lane_hash, mythr, lane ); - } - } - n += 8; - } while ( (n < max_nonce-8) && !work_restart[thr_id].restart); - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif - -#if defined (LYRA2REV3_4WAY) - - -typedef struct { - blake256_4way_context blake; - cubehashParam cube; - bmw256_4way_context bmw; -} lyra2v3_4way_ctx_holder; - -static lyra2v3_4way_ctx_holder l2v3_4way_ctx; - -bool init_lyra2rev3_4way_ctx() -{ - blake256_4way_init( &l2v3_4way_ctx.blake ); - cubehashInit( &l2v3_4way_ctx.cube, 256, 16, 32 ); - bmw256_4way_init( &l2v3_4way_ctx.bmw ); - return true; -} - -void lyra2rev3_4way_hash( 
void *state, const void *input ) -{ - uint32_t vhash[8*4] __attribute__ ((aligned (64))); - uint32_t hash0[8] __attribute__ ((aligned (64))); - uint32_t hash1[8] __attribute__ ((aligned (32))); - uint32_t hash2[8] __attribute__ ((aligned (32))); - uint32_t hash3[8] __attribute__ ((aligned (32))); - lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) ); - - blake256_4way( &ctx.blake, input, 80 ); - blake256_4way_close( &ctx.blake, vhash ); - dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 ); - - LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 ); - LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 ); - LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 ); - LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 ); - - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 ); - - LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 ); - LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 ); - LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 ); - LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 ); - - intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 ); - bmw256_4way( &ctx.bmw, vhash, 32 ); - bmw256_4way_close( &ctx.bmw, state ); -} - -int scanhash_lyra2rev3_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[8*4] __attribute__ ((aligned (64))); - uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t *hash7 = 
&(hash[7<<2]); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - const uint32_t Htarg = ptarget[7]; - __m128i *noncev = (__m128i*)vdata + 19; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - - if ( opt_benchmark ) - ( (uint32_t*)ptarget )[7] = 0x0000ff; - - mm128_bswap32_intrlv80_4x32( vdata, pdata ); - do - { - *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) ); - - lyra2rev3_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg ) - { - extr_lane_4x32( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_lane_solution( work, lane_hash, mythr, lane ); - } - } - n += 4; - } while ( (n < max_nonce-4) && !work_restart[thr_id].restart); - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/lyra2/lyra2rev3.c b/algo/lyra2/lyra2rev3.c deleted file mode 100644 index 83380d9..0000000 --- a/algo/lyra2/lyra2rev3.c +++ /dev/null @@ -1,99 +0,0 @@ -#include "lyra2-gate.h" -#include -#include "algo/blake/sph_blake.h" -#include "algo/cubehash/sph_cubehash.h" -#include "algo/bmw/sph_bmw.h" -#include "algo/cubehash/cubehash_sse2.h" -//#include "lyra2.h" - -typedef struct { - cubehashParam cube; - sph_blake256_context blake; - sph_bmw256_context bmw; - -} lyra2v3_ctx_holder; - -static lyra2v3_ctx_holder lyra2v3_ctx; -static __thread sph_blake256_context l2v3_blake_mid; - -bool init_lyra2rev3_ctx() -{ - cubehashInit( &lyra2v3_ctx.cube, 256, 16, 32 ); - sph_blake256_init( &lyra2v3_ctx.blake ); - sph_bmw256_init( &lyra2v3_ctx.bmw ); - return true; -} - -void l2v3_blake256_midstate( const void* input ) -{ - memcpy( &l2v3_blake_mid, &lyra2v3_ctx.blake, sizeof l2v3_blake_mid ); - sph_blake256( &l2v3_blake_mid, input, 64 ); -} - -void lyra2rev3_hash( void *state, 
const void *input ) -{ - lyra2v3_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &lyra2v3_ctx, sizeof(lyra2v3_ctx) ); - uint8_t hash[128] __attribute__ ((aligned (64))); - #define hashA hash - #define hashB hash+64 - const int midlen = 64; // bytes - const int tail = 80 - midlen; // 16 - - memcpy( &ctx.blake, &l2v3_blake_mid, sizeof l2v3_blake_mid ); - sph_blake256( &ctx.blake, (uint8_t*)input + midlen, tail ); - sph_blake256_close( &ctx.blake, hash ); - - LYRA2REV3( l2v3_wholeMatrix, hash, 32, hash, 32, hash, 32, 1, 4, 4 ); - - cubehashUpdateDigest( &ctx.cube, (byte*) hashA, - (const byte*) hash, 32 ); - - LYRA2REV3( l2v3_wholeMatrix, hash, 32, hash, 32, hash, 32, 1, 4, 4 ); - - sph_bmw256( &ctx.bmw, hash, 32 ); - sph_bmw256_close( &ctx.bmw, hash ); - - memcpy( state, hash, 32 ); -} - -int scanhash_lyra2rev3( struct work *work, - uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t endiandata[20] __attribute__ ((aligned (64))); - uint32_t hash[8] __attribute__((aligned(64))); - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; - - // need big endian data - casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - l2v3_blake256_midstate( endiandata ); - do - { - be32enc(&endiandata[19], nonce); - lyra2rev3_hash(hash, endiandata); - - if (hash[7] <= Htarg ) - if( fulltest( hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = nonce; - submit_solution( work, hash, mythr 
); - } - nonce++; - } while ( nonce < max_nonce && !work_restart[thr_id].restart ); - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} - diff --git a/algo/lyra2/lyra2z-4way.c b/algo/lyra2/lyra2z-4way.c deleted file mode 100644 index cf8ca0c..0000000 --- a/algo/lyra2/lyra2z-4way.c +++ /dev/null @@ -1,184 +0,0 @@ -#include "lyra2-gate.h" - -#ifdef LYRA2Z_4WAY - -#include -#include -#include "lyra2.h" -#include "algo/blake/sph_blake.h" -#include "algo/blake/blake-hash-4way.h" - -__thread uint64_t* lyra2z_4way_matrix; - -bool lyra2z_4way_thread_init() -{ - return ( lyra2z_4way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) ); -} - -static __thread blake256_4way_context l2z_4way_blake_mid; - -void lyra2z_4way_midstate( const void* input ) -{ - blake256_4way_init( &l2z_4way_blake_mid ); - blake256_4way( &l2z_4way_blake_mid, input, 64 ); -} - -void lyra2z_4way_hash( void *state, const void *input ) -{ - uint32_t hash0[8] __attribute__ ((aligned (64))); - uint32_t hash1[8] __attribute__ ((aligned (64))); - uint32_t hash2[8] __attribute__ ((aligned (64))); - uint32_t hash3[8] __attribute__ ((aligned (64))); - uint32_t vhash[8*4] __attribute__ ((aligned (64))); - blake256_4way_context ctx_blake __attribute__ ((aligned (64))); - - memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid ); - blake256_4way( &ctx_blake, input + (64*4), 16 ); - blake256_4way_close( &ctx_blake, vhash ); - - dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 ); - - LYRA2Z( lyra2z_4way_matrix, state , 32, hash0, 32, hash0, 32, 8, 8, 8 ); - LYRA2Z( lyra2z_4way_matrix, state+32, 32, hash1, 32, hash1, 32, 8, 8, 8 ); - LYRA2Z( lyra2z_4way_matrix, state+64, 32, hash2, 32, hash2, 32, 8, 8, 8 ); - LYRA2Z( lyra2z_4way_matrix, state+96, 32, hash3, 32, hash3, 32, 8, 8, 8 ); -} - -int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[8*4] __attribute__ ((aligned (64))); - uint32_t 
vdata[20*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - __m128i *noncev = (__m128i*)vdata + 19; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - - if ( opt_benchmark ) - ptarget[7] = 0x0000ff; - - mm128_bswap32_intrlv80_4x32( vdata, pdata ); - lyra2z_4way_midstate( vdata ); - - do { - *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) ); - - lyra2z_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) - && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( (n < max_nonce-4) && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif - -#if defined(LYRA2Z_8WAY) - -__thread uint64_t* lyra2z_8way_matrix; - -bool lyra2z_8way_thread_init() -{ - return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) ); -} - -static __thread blake256_8way_context l2z_8way_blake_mid; - -void lyra2z_8way_midstate( const void* input ) -{ - blake256_8way_init( &l2z_8way_blake_mid ); - blake256_8way( &l2z_8way_blake_mid, input, 64 ); -} - -void lyra2z_8way_hash( void *state, const void *input ) -{ - uint32_t hash0[8] __attribute__ ((aligned (64))); - uint32_t hash1[8] __attribute__ ((aligned (64))); - uint32_t hash2[8] __attribute__ ((aligned (64))); - uint32_t hash3[8] __attribute__ ((aligned (64))); - uint32_t hash4[8] __attribute__ ((aligned (64))); - uint32_t hash5[8] __attribute__ ((aligned (64))); - uint32_t hash6[8] __attribute__ ((aligned (64))); - uint32_t hash7[8] __attribute__ ((aligned (64))); - uint32_t vhash[8*8] __attribute__ ((aligned (64))); - blake256_8way_context ctx_blake __attribute__ ((aligned (64))); - - memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid ); 
- blake256_8way( &ctx_blake, input + (64*8), 16 ); - blake256_8way_close( &ctx_blake, vhash ); - - dintrlv_8x32( hash0, hash1, hash2, hash3, - hash4, hash5, hash6, hash7, vhash, 256 ); - - LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 ); - LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 ); - LYRA2Z( lyra2z_8way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 ); - LYRA2Z( lyra2z_8way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 ); - LYRA2Z( lyra2z_8way_matrix, hash4, 32, hash4, 32, hash4, 32, 8, 8, 8 ); - LYRA2Z( lyra2z_8way_matrix, hash5, 32, hash5, 32, hash5, 32, 8, 8, 8 ); - LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 ); - LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 ); - - memcpy( state, hash0, 32 ); - memcpy( state+ 32, hash1, 32 ); - memcpy( state+ 64, hash2, 32 ); - memcpy( state+ 96, hash3, 32 ); - memcpy( state+128, hash4, 32 ); - memcpy( state+160, hash5, 32 ); - memcpy( state+192, hash6, 32 ); - memcpy( state+224, hash7, 32 ); -} - -int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[8*8] __attribute__ ((aligned (64))); - uint32_t vdata[20*8] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - __m256i *noncev = (__m256i*)vdata + 19; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - - if ( opt_benchmark ) - ptarget[7] = 0x0000ff; - - mm256_bswap32_intrlv80_8x32( vdata, pdata ); - lyra2z_8way_midstate( vdata ); - - do { - *noncev = mm256_bswap_32( - _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) ); - lyra2z_8way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 8; i++ ) - if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) - && !opt_benchmark ) - { - pdata[19] = n+i; - 
submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 8; - } while ( (n < max_nonce-8) && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - return 0; -} - - -#endif diff --git a/algo/lyra2/lyra2z.c b/algo/lyra2/lyra2z.c deleted file mode 100644 index b1ab094..0000000 --- a/algo/lyra2/lyra2z.c +++ /dev/null @@ -1,83 +0,0 @@ -#include -#include -#include "lyra2-gate.h" -#include "lyra2.h" -#include "algo/blake/sph_blake.h" -#include "simd-utils.h" - -__thread uint64_t* lyra2z_matrix; - -bool lyra2z_thread_init() -{ -// const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols -// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; -// int i = (int64_t)ROW_LEN_BYTES * 8; // nRows; - const int i = BLOCK_LEN_INT64 * 8 * 8 * 8; - lyra2z_matrix = _mm_malloc( i, 64 ); - return lyra2z_matrix; -} - -static __thread sph_blake256_context lyra2z_blake_mid; - -void lyra2z_midstate( const void* input ) -{ - sph_blake256_init( &lyra2z_blake_mid ); - sph_blake256( &lyra2z_blake_mid, input, 64 ); -} - -// block 2050 new algo, blake plus new lyra parms. 
new input -// is power of 2 so normal lyra can be used -//void zcoin_hash(void *state, const void *input, uint32_t height) -void lyra2z_hash( void *state, const void *input ) -{ - uint32_t _ALIGN(64) hash[16]; - - sph_blake256_context ctx_blake __attribute__ ((aligned (64))); - - memcpy( &ctx_blake, &lyra2z_blake_mid, sizeof lyra2z_blake_mid ); - sph_blake256( &ctx_blake, input + 64, 16 ); - sph_blake256_close( &ctx_blake, hash ); - - LYRA2Z( lyra2z_matrix, hash, 32, hash, 32, hash, 32, 8, 8, 8); - - memcpy(state, hash, 32); -} - -int scanhash_lyra2z( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(64) hash[8]; - uint32_t _ALIGN(64) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - - if (opt_benchmark) - ptarget[7] = 0x0000ff; - - for (int i=0; i < 19; i++) { - be32enc(&endiandata[i], pdata[i]); - } - - lyra2z_midstate( endiandata ); - - do { - be32enc(&endiandata[19], nonce); - lyra2z_hash( hash, endiandata ); - - if ( hash[7] <= Htarg ) - if ( fulltest( hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = nonce; - submit_solution( work, hash, mythr ); - } - nonce++; - } while ( nonce < max_nonce && !work_restart[thr_id].restart ); - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} - diff --git a/algo/lyra2/lyra2z330.c b/algo/lyra2/lyra2z330.c deleted file mode 100644 index 8a6eeec..0000000 --- a/algo/lyra2/lyra2z330.c +++ /dev/null @@ -1,82 +0,0 @@ -#include -#include "algo-gate-api.h" -#include "lyra2.h" -#include "simd-utils.h" - -__thread uint64_t* lyra2z330_wholeMatrix; - -void lyra2z330_hash(void *state, const void *input, uint32_t height) -{ - uint32_t _ALIGN(256) hash[16]; - - LYRA2Z( lyra2z330_wholeMatrix, hash, 32, input, 80, input, 80, - 2, 330, 256 ); - - 
memcpy(state, hash, 32); -} - -int scanhash_lyra2z330( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[8] __attribute__ ((aligned (64))); - uint32_t endiandata[20] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - - if (opt_benchmark) - ptarget[7] = 0x0000ff; - - casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - - do - { - be32enc( &endiandata[19], nonce ); - lyra2z330_hash( hash, endiandata, work->height ); - if ( hash[7] <= Htarg ) - if ( fulltest( hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = nonce; - submit_solution( work, hash, mythr ); - } - nonce++; - } while ( nonce < max_nonce && !work_restart[thr_id].restart ); - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} - -void lyra2z330_set_target( struct work* work, double job_diff ) -{ - work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); -} - -bool lyra2z330_thread_init() -{ - const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 256; // nCols - const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; - - int i = (int64_t)ROW_LEN_BYTES * 330; // nRows; - lyra2z330_wholeMatrix = _mm_malloc( i, 64 ); - - return lyra2z330_wholeMatrix; -} - -bool register_lyra2z330_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE42_OPT | AVX2_OPT; - gate->miner_thread_init = (void*)&lyra2z330_thread_init; - gate->scanhash = (void*)&scanhash_lyra2z330; - gate->hash = (void*)&lyra2z330_hash; - 
gate->get_max64 = (void*)&get_max64_0xffffLL; - gate->set_target = (void*)&lyra2z330_set_target; - return true; -}; - diff --git a/algo/lyra2/phi2-4way.c b/algo/lyra2/phi2-4way.c deleted file mode 100644 index 1c3f759..0000000 --- a/algo/lyra2/phi2-4way.c +++ /dev/null @@ -1,233 +0,0 @@ -/** - * Phi-2 algo Implementation - */ - -#include "lyra2-gate.h" - -#if defined(PHI2_4WAY) - -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/gost/sph_gost.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/echo/aes_ni/hash_api.h" - -typedef struct { - cubehashParam cube; - jh512_4way_context jh; - hashState_echo echo; -// hashState_echo echo2; - sph_gost512_context gost; - skein512_4way_context skein; -} phi2_ctx_holder; -/* -phi2_ctx_holder phi2_ctx; - -void init_phi2_ctx() -{ - cubehashInit( &phi2_ctx.cube, 512, 16, 32 ); - sph_jh512_init(&phi2_ctx.jh); - init_echo( &phi2_ctx.echo1, 512 ); - init_echo( &phi2_ctx.echo2, 512 ); - sph_gost512_init(&phi2_ctx.gost); - sph_skein512_init(&phi2_ctx.skein); -}; -*/ -void phi2_hash_4way( void *state, const void *input ) -{ - uint32_t hash[4][16] __attribute__ ((aligned (64))); - uint32_t hashA[4][16] __attribute__ ((aligned (64))); - uint32_t hashB[4][16] __attribute__ ((aligned (64))); - uint32_t vhash[4*16] __attribute__ ((aligned (64))); - -// unsigned char _ALIGN(128) hash[64]; -// unsigned char _ALIGN(128) hashA[64]; -// unsigned char _ALIGN(128) hashB[64]; - - phi2_ctx_holder ctx __attribute__ ((aligned (64))); -// memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) ); - - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hashB[0], (const byte*)input, - phi2_has_roots ? 144 : 80 ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hashB[1], (const byte*)input+144, - phi2_has_roots ? 
144 : 80 ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hashB[2], (const byte*)input+288, - phi2_has_roots ? 144 : 80 ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hashB[3], (const byte*)input+432, - phi2_has_roots ? 144 : 80 ); - - LYRA2RE( &hashA[0][0], 32, &hashB[0][0], 32, &hashB[0][0], 32, 1, 8, 8 ); - LYRA2RE( &hashA[0][8], 32, &hashB[0][8], 32, &hashB[0][8], 32, 1, 8, 8 ); - LYRA2RE( &hashA[1][0], 32, &hashB[1][0], 32, &hashB[1][0], 32, 1, 8, 8 ); - LYRA2RE( &hashA[1][8], 32, &hashB[1][8], 32, &hashB[1][8], 32, 1, 8, 8 ); - LYRA2RE( &hashA[2][0], 32, &hashB[2][0], 32, &hashB[2][0], 32, 1, 8, 8 ); - LYRA2RE( &hashA[2][8], 32, &hashB[2][8], 32, &hashB[2][8], 32, 1, 8, 8 ); - LYRA2RE( &hashA[3][0], 32, &hashB[3][0], 32, &hashB[3][0], 32, 1, 8, 8 ); - LYRA2RE( &hashA[3][8], 32, &hashB[3][8], 32, &hashB[3][8], 32, 1, 8, 8 ); - - intrlv_4x64( vhash, hashA[0], hashA[1], hashA[2], hashA[3], 512 ); - - jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - dintrlv_4x64( hash[0], hash[1], hash[2], hash[3], vhash, 512 ); - - if ( hash[0][0] & 1 ) - { - sph_gost512_init( &ctx.gost ); - sph_gost512( &ctx.gost, (const void*)hash[0], 64 ); - sph_gost512_close( &ctx.gost, (void*)hash[0] ); - } - else - { - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash[0], - (const BitSequence *)hash[0], 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash[0], - (const BitSequence *)hash[0], 512 ); - } - - if ( hash[1][0] & 1 ) - { - sph_gost512_init( &ctx.gost ); - sph_gost512( &ctx.gost, (const void*)hash[1], 64 ); - sph_gost512_close( &ctx.gost, (void*)hash[1] ); - } - else - { - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash[1], - (const BitSequence *)hash[1], 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence 
*)hash[1], - (const BitSequence *)hash[1], 512 ); - } - - if ( hash[2][0] & 1 ) - { - sph_gost512_init( &ctx.gost ); - sph_gost512( &ctx.gost, (const void*)hash[2], 64 ); - sph_gost512_close( &ctx.gost, (void*)hash[2] ); - } - else - { - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash[2], - (const BitSequence *)hash[2], 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash[2], - (const BitSequence *)hash[2], 512 ); - } - - if ( hash[3][0] & 1 ) - { - sph_gost512_init( &ctx.gost ); - sph_gost512( &ctx.gost, (const void*)hash[3], 64 ); - sph_gost512_close( &ctx.gost, (void*)hash[3] ); - } - else - { - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash[3], - (const BitSequence *)hash[3], 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash[3], - (const BitSequence *)hash[3], 512 ); - } - - intrlv_4x64( vhash, hash[0], hash[1], hash[2], hash[3], 512 ); - - skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - - for (int i=0; i<4; i++) - { - ( (uint64_t*)vhash )[i] ^= ( (uint64_t*)vhash )[i+4]; - ( (uint64_t*)vhash+ 8 )[i] ^= ( (uint64_t*)vhash+ 8 )[i+4]; - ( (uint64_t*)vhash+16 )[i] ^= ( (uint64_t*)vhash+16 )[i+4]; - ( (uint64_t*)vhash+24 )[i] ^= ( (uint64_t*)vhash+24 )[i+4]; - } -// for ( int i = 0; i < 4; i++ ) -// casti_m256i( vhash, i ) = _mm256_xor_si256( casti_m256i( vhash, i ), -// casti_m256i( vhash, i+4 ) ); - - memcpy( state, vhash, 128 ); -} - -int scanhash_phi2_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(128) hash[8]; - uint32_t _ALIGN(128) edata[36]; - uint32_t vdata[4][36] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[25]); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg 
= ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - - if(opt_benchmark){ - ptarget[7] = 0x00ff; - } - -// Data is not interleaved, but hash is. -// any non-zero data at index 20 or above sets roots true. -// Split up the operations, bswap first, then set roots. - - phi2_has_roots = false; - for ( int i=0; i < 36; i++ ) - { - be32enc(&edata[i], pdata[i]); - if (i >= 20 && pdata[i]) phi2_has_roots = true; - } -/* - casti_m256i( vdata[0], 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) ); - casti_m256i( vdata[0], 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) ); - casti_m256i( vdata[0], 2 ) = mm256_bswap_32( casti_m256i( pdata, 2 ) ); - casti_m256i( vdata[0], 3 ) = mm256_bswap_32( casti_m256i( pdata, 3 ) ); - casti_m128i( vdata[0], 8 ) = mm128_bswap_32( casti_m128i( pdata, 8 ) ); - phi2_has_roots = mm128_anybits1( casti_m128i( vdata[0], 5 ) ) || - mm128_anybits1( casti_m128i( vdata[0], 6 ) ) || - mm128_anybits1( casti_m128i( vdata[0], 7 ) ) || - mm128_anybits1( casti_m128i( vdata[0], 8 ) ); -*/ - - memcpy( vdata[0], edata, 144 ); - memcpy( vdata[1], edata, 144 ); - memcpy( vdata[2], edata, 144 ); - memcpy( vdata[3], edata, 144 ); - - do { - be32enc( &vdata[0][19], n ); - be32enc( &vdata[1][19], n+1 ); - be32enc( &vdata[2][19], n+2 ); - be32enc( &vdata[3][19], n+3 ); - - phi2_hash_4way( hash, vdata ); - - for ( int lane = 0; lane < 4; lane++ ) if ( hash7[ lane<<1 ] < Htarg ) - { - extr_lane_4x64( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_lane_solution( work, lane_hash, mythr, lane ); - } - } - n += 4; - } while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart ); - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif // PHI2_4WAY diff --git a/algo/lyra2/phi2.c b/algo/lyra2/phi2.c deleted file mode 100644 index cad10b3..0000000 --- a/algo/lyra2/phi2.c +++ /dev/null @@ -1,132 +0,0 @@ -/** - 
* Phi-2 algo Implementation - */ - -#include "lyra2-gate.h" -#include "algo/skein/sph_skein.h" -#include "algo/jh/sph_jh.h" -#include "algo/gost/sph_gost.h" -#include "algo/cubehash/cubehash_sse2.h" -#ifdef __AES__ - #include "algo/echo/aes_ni/hash_api.h" -#else - #include "algo/echo/sph_echo.h" -#endif - -typedef struct { - cubehashParam cube; - sph_jh512_context jh; -#if defined(__AES__) - hashState_echo echo1; - hashState_echo echo2; -#else - sph_echo512_context echo1; - sph_echo512_context echo2; -#endif - sph_gost512_context gost; - sph_skein512_context skein; -} phi2_ctx_holder; - -phi2_ctx_holder phi2_ctx; - -void init_phi2_ctx() -{ - cubehashInit( &phi2_ctx.cube, 512, 16, 32 ); - sph_jh512_init(&phi2_ctx.jh); -#if defined(__AES__) - init_echo( &phi2_ctx.echo1, 512 ); - init_echo( &phi2_ctx.echo2, 512 ); -#else - sph_echo512_init(&phi2_ctx.echo1); - sph_echo512_init(&phi2_ctx.echo2); -#endif - sph_gost512_init(&phi2_ctx.gost); - sph_skein512_init(&phi2_ctx.skein); -}; - -void phi2_hash(void *state, const void *input) -{ - unsigned char _ALIGN(128) hash[64]; - unsigned char _ALIGN(128) hashA[64]; - unsigned char _ALIGN(128) hashB[64]; - - phi2_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) ); - - cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)input, - phi2_has_roots ? 
144 : 80 ); - - LYRA2RE( &hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8 ); - LYRA2RE( &hashA[32], 32, &hashB[32], 32, &hashB[32], 32, 1, 8, 8 ); - - sph_jh512( &ctx.jh, (const void*)hashA, 64 ); - sph_jh512_close( &ctx.jh, (void*)hash ); - - if ( hash[0] & 1 ) - { - sph_gost512( &ctx.gost, (const void*)hash, 64 ); - sph_gost512_close( &ctx.gost, (void*)hash ); - } - else - { -#if defined(__AES__) - update_final_echo ( &ctx.echo1, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); - update_final_echo ( &ctx.echo2, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#else - sph_echo512( &ctx.echo1, (const void*)hash, 64 ); - sph_echo512_close( &ctx.echo1, (void*)hash ); - - sph_echo512( &ctx.echo2, (const void*)hash, 64 ); - sph_echo512_close( &ctx.echo2, (void*)hash ); -#endif - } - - sph_skein512( &ctx.skein, (const void*)hash, 64 ); - sph_skein512_close( &ctx.skein, (void*)hash ); - - for (int i=0; i<4; i++) - ((uint64_t*)hash)[i] ^= ((uint64_t*)hash)[i+4]; - - memcpy(state, hash, 32); -} - -int scanhash_phi2( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(128) hash[8]; - uint32_t _ALIGN(128) endiandata[36]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - - if(opt_benchmark){ - ptarget[7] = 0x00ff; - } - - phi2_has_roots = false; - for ( int i=0; i < 36; i++ ) - { - be32enc(&endiandata[i], pdata[i]); - if ( i >= 20 && pdata[i] ) phi2_has_roots = true; - } - - do { - be32enc( &endiandata[19], n ); - phi2_hash( hash, endiandata ); - if ( hash[7] < Htarg ) - if ( fulltest( hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n; - submit_solution( work, hash, mythr ); - } - n++; - } while ( n < max_nonce && !work_restart[thr_id].restart ); - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; 
-} diff --git a/algo/lyra2/sponge.c b/algo/lyra2/sponge.c index f465960..4a0796f 100644 --- a/algo/lyra2/sponge.c +++ b/algo/lyra2/sponge.c @@ -19,95 +19,106 @@ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include -#include -#include -#include #include "sponge.h" #include "lyra2.h" +#include +#include +#include +#if defined(__arm__) || defined(__aarch64__) +#include "sse2neon.h" +#else +#include +#endif /** - * Initializes the Sponge State. The first 512 bits are set to zeros and the remainder - * receive Blake2b's IV as per Blake2b's specification. Note: Even though sponges - * typically have their internal state initialized with zeros, Blake2b's G function - * has a fixed point: if the internal state and message are both filled with zeros. the - * resulting permutation will always be a block filled with zeros; this happens because - * Blake2b does not use the constants originally employed in Blake2 inside its G function, - * relying on the IV for avoiding possible fixed points. + * Initializes the Sponge State. The first 512 bits are set to zeros and the + * remainder receive Blake2b's IV as per Blake2b's specification. Note: + * Even though sponges typically have their internal state initialized with + * zeros, Blake2b's G function has a fixed point: if the internal state and + * message are both filled with zeros. the resulting permutation will always be + * a block filled with zeros; this happens because Blake2b does not use the + * constants originally employed in Blake2 inside its G function, relying on the + * IV for avoiding possible fixed points. 
* * @param state The 1024-bit array to be initialized */ -inline void initState( uint64_t State[/*16*/] ) -{ +inline void initState(uint64_t State[/*16*/]) { + + /* #if defined (__AVX2__) - __m256i* state = (__m256i*)State; - - state[0] = _mm256_setzero_si256(); - state[1] = _mm256_setzero_si256(); - state[2] = _mm256_set_epi64x( blake2b_IV[3], blake2b_IV[2], - blake2b_IV[1], blake2b_IV[0] ); - state[3] = _mm256_set_epi64x( blake2b_IV[7], blake2b_IV[6], - blake2b_IV[5], blake2b_IV[4] ); + __m256i* state = (__m256i*)State; + const __m256i zero = m256_zero; + state[0] = zero; + state[1] = zero; + state[2] = m256_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL, + 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL ); + state[3] = m256_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL, + 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL ); #elif defined (__SSE2__) - __m128i* state = (__m128i*)State; + __m128i* state = (__m128i*)State; + const __m128i zero = m128_zero; - state[0] = _mm_setzero_si128(); - state[1] = _mm_setzero_si128(); - state[2] = _mm_setzero_si128(); - state[3] = _mm_setzero_si128(); - state[4] = _mm_set_epi64x( blake2b_IV[1], blake2b_IV[0] ); - state[5] = _mm_set_epi64x( blake2b_IV[3], blake2b_IV[2] ); - state[6] = _mm_set_epi64x( blake2b_IV[5], blake2b_IV[4] ); - state[7] = _mm_set_epi64x( blake2b_IV[7], blake2b_IV[6] ); + state[0] = zero; + state[1] = zero; + state[2] = zero; + state[3] = zero; + state[4] = m128_const_64( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL ); + state[5] = m128_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL ); + state[6] = m128_const_64( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL ); + state[7] = m128_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL ); #else - //First 512 bis are zeros - memset( State, 0, 64 ); - //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV - State[8] = blake2b_IV[0]; - State[9] = blake2b_IV[1]; - State[10] = blake2b_IV[2]; - State[11] = blake2b_IV[3]; - State[12] = 
blake2b_IV[4]; - State[13] = blake2b_IV[5]; - State[14] = blake2b_IV[6]; - State[15] = blake2b_IV[7]; + //First 512 bis are zeros + memset( State, 0, 64 ); + //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV + State[8] = blake2b_IV[0]; + State[9] = blake2b_IV[1]; + State[10] = blake2b_IV[2]; + State[11] = blake2b_IV[3]; + State[12] = blake2b_IV[4]; + State[13] = blake2b_IV[5]; + State[14] = blake2b_IV[6]; + State[15] = blake2b_IV[7]; #endif +*/ } /** * Execute Blake2b's G function, with all 12 rounds. * - * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function + * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G + * function */ -inline static void blake2bLyra( uint64_t *v ) -{ - ROUND_LYRA(0); - ROUND_LYRA(1); - ROUND_LYRA(2); - ROUND_LYRA(3); - ROUND_LYRA(4); - ROUND_LYRA(5); - ROUND_LYRA(6); - ROUND_LYRA(7); - ROUND_LYRA(8); - ROUND_LYRA(9); - ROUND_LYRA(10); - ROUND_LYRA(11); + +#if !defined(__AVX512F__) && !defined(__AVX2__) && !defined(__SSE2__) + +inline static void blake2bLyra(uint64_t *v) { + ROUND_LYRA(0); + ROUND_LYRA(1); + ROUND_LYRA(2); + ROUND_LYRA(3); + ROUND_LYRA(4); + ROUND_LYRA(5); + ROUND_LYRA(6); + ROUND_LYRA(7); + ROUND_LYRA(8); + ROUND_LYRA(9); + ROUND_LYRA(10); + ROUND_LYRA(11); } /** * Executes a reduced version of Blake2b's G function with only one round - * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function + * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G + * function */ -inline static void reducedBlake2bLyra( uint64_t *v ) -{ - ROUND_LYRA(0); -} +inline static void reducedBlake2bLyra(uint64_t *v) { ROUND_LYRA(0); } + +#endif /** * Performs a squeeze operation, using Blake2b's G function as the @@ -117,60 +128,56 @@ inline static void reducedBlake2bLyra( uint64_t *v ) * @param out Array that will receive the data squeezed * @param len The number of bytes to be squeezed into the "out" array */ -inline void squeeze( uint64_t 
*State, byte *Out, unsigned int len ) -{ -#if defined (__AVX2__) - - const int len_m256i = len / 32; - const int fullBlocks = len_m256i / BLOCK_LEN_M256I; - __m256i* state = (__m256i*)State; - __m256i* out = (__m256i*)Out; - int i; - - //Squeezes full blocks - for ( i = 0; i < fullBlocks; i++ ) - { - memcpy_256( out, state, BLOCK_LEN_M256I ); - LYRA_ROUND_AVX2( state[0], state[1], state[2], state[3] ); - out += BLOCK_LEN_M256I; - } - //Squeezes remaining bytes - memcpy_256( out, state, ( len_m256i % BLOCK_LEN_M256I ) ); - -#elif defined (__SSE2__) - - const int len_m128i = len / 16; - const int fullBlocks = len_m128i / BLOCK_LEN_M128I; - __m128i* state = (__m128i*)State; - __m128i* out = (__m128i*)Out; - int i; - - //Squeezes full blocks - for ( i = 0; i < fullBlocks; i++ ) - { - memcpy_128( out, state, BLOCK_LEN_M128I ); - LYRA_ROUND_AVX( state[0], state[1], state[2], state[3], - state[4], state[5], state[6], state[7] ); - out += BLOCK_LEN_M128I; - } - //Squeezes remaining bytes - memcpy_128( out, state, ( len_m128i % BLOCK_LEN_M128I ) ); +inline void squeeze(uint64_t *State, byte *Out, unsigned int len) { +#if defined(__AVX2__) + + const int len_m256i = len / 32; + const int fullBlocks = len_m256i / BLOCK_LEN_M256I; + __m256i *state = (__m256i *)State; + __m256i *out = (__m256i *)Out; + int i; + + // Squeezes full blocks + for (i = 0; i < fullBlocks; i++) { + memcpy_256(out, state, BLOCK_LEN_M256I); + LYRA_ROUND_AVX2(state[0], state[1], state[2], state[3]); + out += BLOCK_LEN_M256I; + } + // Squeezes remaining bytes + memcpy_256(out, state, (len_m256i % BLOCK_LEN_M256I)); + +#elif defined(__SSE2__) + + const int len_m128i = len / 16; + const int fullBlocks = len_m128i / BLOCK_LEN_M128I; + __m128i *state = (__m128i *)State; + __m128i *out = (__m128i *)Out; + int i; + + // Squeezes full blocks + for (i = 0; i < fullBlocks; i++) { + memcpy_128(out, state, BLOCK_LEN_M128I); + LYRA_ROUND_AVX(state[0], state[1], state[2], state[3], state[4], state[5], + state[6], 
state[7]); + out += BLOCK_LEN_M128I; + } + // Squeezes remaining bytes + memcpy_128(out, state, (len_m128i % BLOCK_LEN_M128I)); #else - int fullBlocks = len / BLOCK_LEN_BYTES; - byte *out = Out; - int i; + int fullBlocks = len / BLOCK_LEN_BYTES; + byte *out = Out; + int i; - //Squeezes full blocks - for ( i = 0; i < fullBlocks; i++ ) - { - memcpy( out, State, BLOCK_LEN_BYTES ); - blake2bLyra( State ); - out += BLOCK_LEN_BYTES; - } - //Squeezes remaining bytes - memcpy( out, State, (len % BLOCK_LEN_BYTES) ); + // Squeezes full blocks + for (i = 0; i < fullBlocks; i++) { + memcpy(out, State, BLOCK_LEN_BYTES); + blake2bLyra(State); + out += BLOCK_LEN_BYTES; + } + // Squeezes remaining bytes + memcpy(out, State, (len % BLOCK_LEN_BYTES)); #endif } @@ -182,126 +189,150 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len ) * @param state The current state of the sponge * @param in The block to be absorbed (BLOCK_LEN_INT64 words) */ -inline void absorbBlock( uint64_t *State, const uint64_t *In ) -{ -#if defined (__AVX2__) +inline void absorbBlock(uint64_t *State, const uint64_t *In) { +#if defined(__AVX2__) - register __m256i state0, state1, state2, state3; - __m256i *in = (__m256i*)In; + register __m256i state0, state1, state2, state3; + __m256i *in = (__m256i *)In; - state0 = _mm256_load_si256( (__m256i*)State ); - state1 = _mm256_load_si256( (__m256i*)State + 1 ); - state2 = _mm256_load_si256( (__m256i*)State + 2 ); - state3 = _mm256_load_si256( (__m256i*)State + 3 ); + state0 = _mm256_load_si256((__m256i *)State); + state1 = _mm256_load_si256((__m256i *)State + 1); + state2 = _mm256_load_si256((__m256i *)State + 2); + state3 = _mm256_load_si256((__m256i *)State + 3); - state0 = _mm256_xor_si256( state0, in[0] ); - state1 = _mm256_xor_si256( state1, in[1] ); - state2 = _mm256_xor_si256( state2, in[2] ); + state0 = _mm256_xor_si256(state0, in[0]); + state1 = _mm256_xor_si256(state1, in[1]); + state2 = _mm256_xor_si256(state2, in[2]); - 
LYRA_12_ROUNDS_AVX2( state0, state1, state2, state3 ); + LYRA_12_ROUNDS_AVX2(state0, state1, state2, state3); - _mm256_store_si256( (__m256i*)State, state0 ); - _mm256_store_si256( (__m256i*)State + 1, state1 ); - _mm256_store_si256( (__m256i*)State + 2, state2 ); - _mm256_store_si256( (__m256i*)State + 3, state3 ); + _mm256_store_si256((__m256i *)State, state0); + _mm256_store_si256((__m256i *)State + 1, state1); + _mm256_store_si256((__m256i *)State + 2, state2); + _mm256_store_si256((__m256i *)State + 3, state3); -#elif defined (__SSE2__) +#elif defined(__SSE2__) - __m128i* state = (__m128i*)State; - __m128i* in = (__m128i*)In; + __m128i *state = (__m128i *)State; + __m128i *in = (__m128i *)In; - state[0] = _mm_xor_si128( state[0], in[0] ); - state[1] = _mm_xor_si128( state[1], in[1] ); - state[2] = _mm_xor_si128( state[2], in[2] ); - state[3] = _mm_xor_si128( state[3], in[3] ); - state[4] = _mm_xor_si128( state[4], in[4] ); - state[5] = _mm_xor_si128( state[5], in[5] ); + state[0] = _mm_xor_si128(state[0], in[0]); + state[1] = _mm_xor_si128(state[1], in[1]); + state[2] = _mm_xor_si128(state[2], in[2]); + state[3] = _mm_xor_si128(state[3], in[3]); + state[4] = _mm_xor_si128(state[4], in[4]); + state[5] = _mm_xor_si128(state[5], in[5]); - //Applies the transformation f to the sponge's state - LYRA_12_ROUNDS_AVX( state[0], state[1], state[2], state[3], - state[4], state[5], state[6], state[7] ); + // Applies the transformation f to the sponge's state + LYRA_12_ROUNDS_AVX(state[0], state[1], state[2], state[3], state[4], state[5], + state[6], state[7]); #else - //XORs the first BLOCK_LEN_INT64 words of "in" with the current state - State[0] ^= In[0]; - State[1] ^= In[1]; - State[2] ^= In[2]; - State[3] ^= In[3]; - State[4] ^= In[4]; - State[5] ^= In[5]; - State[6] ^= In[6]; - State[7] ^= In[7]; - State[8] ^= In[8]; - State[9] ^= In[9]; - State[10] ^= In[10]; - State[11] ^= In[11]; - - //Applies the transformation f to the sponge's state - blake2bLyra(State); + // 
XORs the first BLOCK_LEN_INT64 words of "in" with the current state + State[0] ^= In[0]; + State[1] ^= In[1]; + State[2] ^= In[2]; + State[3] ^= In[3]; + State[4] ^= In[4]; + State[5] ^= In[5]; + State[6] ^= In[6]; + State[7] ^= In[7]; + State[8] ^= In[8]; + State[9] ^= In[9]; + State[10] ^= In[10]; + State[11] ^= In[11]; + + // Applies the transformation f to the sponge's state + blake2bLyra(State); #endif } /** * Performs an absorb operation for a single block (BLOCK_LEN_BLAKE2_SAFE_INT64 - * words of type uint64_t), using Blake2b's G function as the internal permutation + * words of type uint64_t), using Blake2b's G function as the internal + * permutation * * @param state The current state of the sponge * @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words) */ -inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In ) -{ - //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state -#if defined (__AVX2__) - - register __m256i state0, state1, state2, state3; - __m256i *in = (__m256i*)In; - - state0 = _mm256_load_si256( (__m256i*)State ); - state1 = _mm256_load_si256( (__m256i*)State + 1 ); - state2 = _mm256_load_si256( (__m256i*)State + 2 ); - state3 = _mm256_load_si256( (__m256i*)State + 3 ); - - state0 = _mm256_xor_si256( state0, in[0] ); - state1 = _mm256_xor_si256( state1, in[1] ); - - LYRA_12_ROUNDS_AVX2( state0, state1, state2, state3 ); - - _mm256_store_si256( (__m256i*)State, state0 ); - _mm256_store_si256( (__m256i*)State + 1, state1 ); - _mm256_store_si256( (__m256i*)State + 2, state2 ); - _mm256_store_si256( (__m256i*)State + 3, state3 ); - -#elif defined (__SSE2__) - - __m128i* state = (__m128i*)State; - __m128i* in = (__m128i*)In; - - state[0] = _mm_xor_si128( state[0], in[0] ); - state[1] = _mm_xor_si128( state[1], in[1] ); - state[2] = _mm_xor_si128( state[2], in[2] ); - state[3] = _mm_xor_si128( state[3], in[3] ); - - //Applies the transformation f to the sponge's state - LYRA_12_ROUNDS_AVX( 
state[0], state[1], state[2], state[3], - state[4], state[5], state[6], state[7] ); +inline void absorbBlockBlake2Safe(uint64_t *State, const uint64_t *In, + const uint64_t nBlocks, + const uint64_t block_len) { +// XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with +// the IV. +#if defined(__AVX2__) + + register __m256i state0, state1, state2, state3; + + state0 = state1 = m256_zero; + state2 = m256_const_64(0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL, + 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL); + state3 = m256_const_64(0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL, + 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL); + + for (int i = 0; i < nBlocks; i++) { + __m256i *in = (__m256i *)In; + state0 = _mm256_xor_si256(state0, in[0]); + state1 = _mm256_xor_si256(state1, in[1]); + + LYRA_12_ROUNDS_AVX2(state0, state1, state2, state3); + In += block_len; + } + + _mm256_store_si256((__m256i *)State, state0); + _mm256_store_si256((__m256i *)State + 1, state1); + _mm256_store_si256((__m256i *)State + 2, state2); + _mm256_store_si256((__m256i *)State + 3, state3); + +#elif defined(__SSE2__) + + __m128i state0, state1, state2, state3, state4, state5, state6, state7; + + state0 = state1 = state2 = state3 = m128_zero; + state4 = m128_const_64(0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL); + state5 = m128_const_64(0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL); + state6 = m128_const_64(0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL); + state7 = m128_const_64(0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL); + + for (int i = 0; i < nBlocks; i++) { + __m128i *in = (__m128i *)In; + + state0 = _mm_xor_si128(state0, in[0]); + state1 = _mm_xor_si128(state1, in[1]); + state2 = _mm_xor_si128(state2, in[2]); + state3 = _mm_xor_si128(state3, in[3]); + + // Applies the transformation f to the sponge's state + LYRA_12_ROUNDS_AVX(state0, state1, state2, state3, state4, state5, state6, + state7); + In += block_len; + } + + _mm_store_si128((__m128i *)State, state0); + 
_mm_store_si128((__m128i *)State + 1, state1); + _mm_store_si128((__m128i *)State + 2, state2); + _mm_store_si128((__m128i *)State + 3, state3); + _mm_store_si128((__m128i *)State + 4, state4); + _mm_store_si128((__m128i *)State + 5, state5); + _mm_store_si128((__m128i *)State + 6, state6); + _mm_store_si128((__m128i *)State + 7, state7); #else - State[0] ^= In[0]; - State[1] ^= In[1]; - State[2] ^= In[2]; - State[3] ^= In[3]; - State[4] ^= In[4]; - State[5] ^= In[5]; - State[6] ^= In[6]; - State[7] ^= In[7]; - - //Applies the transformation f to the sponge's state - blake2bLyra(State); + State[0] ^= In[0]; + State[1] ^= In[1]; + State[2] ^= In[2]; + State[3] ^= In[3]; + State[4] ^= In[4]; + State[5] ^= In[5]; + State[6] ^= In[6]; + State[7] ^= In[7]; + + // Applies the transformation f to the sponge's state + blake2bLyra(State); #endif - } /** @@ -312,123 +343,120 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In ) * @param state The current state of the sponge * @param rowOut Row to receive the data squeezed */ -inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut, - uint64_t nCols ) -{ - int i; - - //M[row][C-1-col] = H.reduced_squeeze() - -#if defined (__AVX2__) - - register __m256i state0, state1, state2, state3; - __m256i* out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I ); - - state0 = _mm256_load_si256( (__m256i*)State ); - state1 = _mm256_load_si256( (__m256i*)State + 1 ); - state2 = _mm256_load_si256( (__m256i*)State + 2 ); - state3 = _mm256_load_si256( (__m256i*)State + 3 ); - - for ( i = 0; i < 9; i += 3) - { - _mm_prefetch( out - i, _MM_HINT_T0 ); - _mm_prefetch( out - i - 2, _MM_HINT_T0 ); - } - - for ( i = 0; i < nCols; i++ ) - { - _mm_prefetch( out - 9, _MM_HINT_T0 ); - _mm_prefetch( out - 11, _MM_HINT_T0 ); - - out[0] = state0; - out[1] = state1; - out[2] = state2; - - //Goes to next block (column) that will receive the squeezed data - out -= BLOCK_LEN_M256I; - - LYRA_ROUND_AVX2( state0, state1, state2, 
state3 ); - } - - _mm256_store_si256( (__m256i*)State, state0 ); - _mm256_store_si256( (__m256i*)State + 1, state1 ); - _mm256_store_si256( (__m256i*)State + 2, state2 ); - _mm256_store_si256( (__m256i*)State + 3, state3 ); - -#elif defined (__SSE2__) - - __m128i* state = (__m128i*)State; - __m128i state0 = _mm_load_si128( state ); - __m128i state1 = _mm_load_si128( &state[1] ); - __m128i state2 = _mm_load_si128( &state[2] ); - __m128i state3 = _mm_load_si128( &state[3] ); - __m128i state4 = _mm_load_si128( &state[4] ); - __m128i state5 = _mm_load_si128( &state[5] ); - __m128i state6 = _mm_load_si128( &state[6] ); - __m128i state7 = _mm_load_si128( &state[7] ); - - __m128i* out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I ); - - for ( i = 0; i < 6; i += 3) - { - _mm_prefetch( out - i, _MM_HINT_T0 ); - _mm_prefetch( out - i - 2, _MM_HINT_T0 ); - } - - for ( i = 0; i < nCols; i++ ) - { - _mm_prefetch( out - 6, _MM_HINT_T0 ); - _mm_prefetch( out - 7, _MM_HINT_T0 ); - - out[0] = state0; - out[1] = state1; - out[2] = state2; - out[3] = state3; - out[4] = state4; - out[5] = state5; - - //Goes to next block (column) that will receive the squeezed data - out -= BLOCK_LEN_M128I; - - //Applies the reduced-round transformation f to the sponge's state - LYRA_ROUND_AVX( state0, state1, state2, state3, - state4, state5, state6, state7 ); - } - - _mm_store_si128( state, state0 ); - _mm_store_si128( &state[1], state1 ); - _mm_store_si128( &state[2], state2 ); - _mm_store_si128( &state[3], state3 ); - _mm_store_si128( &state[4], state4 ); - _mm_store_si128( &state[5], state5 ); - _mm_store_si128( &state[6], state6 ); - _mm_store_si128( &state[7], state7 ); +inline void reducedSqueezeRow0(uint64_t *State, uint64_t *rowOut, + uint64_t nCols) { + int i; + + // M[row][C-1-col] = H.reduced_squeeze() + +#if defined(__AVX2__) + + register __m256i state0, state1, state2, state3; + __m256i *out = (__m256i *)rowOut + ((nCols - 1) * BLOCK_LEN_M256I); + + state0 = 
_mm256_load_si256((__m256i *)State); + state1 = _mm256_load_si256((__m256i *)State + 1); + state2 = _mm256_load_si256((__m256i *)State + 2); + state3 = _mm256_load_si256((__m256i *)State + 3); + + for (i = 0; i < 9; i += 3) { + _mm_prefetch(out - i, _MM_HINT_T0); + _mm_prefetch(out - i - 2, _MM_HINT_T0); + } + + for (i = 0; i < nCols; i++) { + _mm_prefetch(out - 9, _MM_HINT_T0); + _mm_prefetch(out - 11, _MM_HINT_T0); + + // printf("S RSR0 col= %d, out= %x\n",i,out); + + out[0] = state0; + out[1] = state1; + out[2] = state2; + + // Goes to next block (column) that will receive the squeezed data + out -= BLOCK_LEN_M256I; + + LYRA_ROUND_AVX2(state0, state1, state2, state3); + } + + _mm256_store_si256((__m256i *)State, state0); + _mm256_store_si256((__m256i *)State + 1, state1); + _mm256_store_si256((__m256i *)State + 2, state2); + _mm256_store_si256((__m256i *)State + 3, state3); + +#elif defined(__SSE2__) + + __m128i *state = (__m128i *)State; + __m128i state0 = _mm_load_si128(state); + __m128i state1 = _mm_load_si128(&state[1]); + __m128i state2 = _mm_load_si128(&state[2]); + __m128i state3 = _mm_load_si128(&state[3]); + __m128i state4 = _mm_load_si128(&state[4]); + __m128i state5 = _mm_load_si128(&state[5]); + __m128i state6 = _mm_load_si128(&state[6]); + __m128i state7 = _mm_load_si128(&state[7]); + + __m128i *out = (__m128i *)rowOut + ((nCols - 1) * BLOCK_LEN_M128I); + + for (i = 0; i < 6; i += 3) { + _mm_prefetch(out - i, _MM_HINT_T0); + _mm_prefetch(out - i - 2, _MM_HINT_T0); + } + + for (i = 0; i < nCols; i++) { + _mm_prefetch(out - 6, _MM_HINT_T0); + _mm_prefetch(out - 7, _MM_HINT_T0); + + out[0] = state0; + out[1] = state1; + out[2] = state2; + out[3] = state3; + out[4] = state4; + out[5] = state5; + + // Goes to next block (column) that will receive the squeezed data + out -= BLOCK_LEN_M128I; + + // Applies the reduced-round transformation f to the sponge's state + LYRA_ROUND_AVX(state0, state1, state2, state3, state4, state5, state6, + state7); + } + + 
_mm_store_si128(state, state0); + _mm_store_si128(&state[1], state1); + _mm_store_si128(&state[2], state2); + _mm_store_si128(&state[3], state3); + _mm_store_si128(&state[4], state4); + _mm_store_si128(&state[5], state5); + _mm_store_si128(&state[6], state6); + _mm_store_si128(&state[7], state7); #else - uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] - - for ( i = 0; i < nCols; i++ ) - { - ptrWord[0] = State[0]; - ptrWord[1] = State[1]; - ptrWord[2] = State[2]; - ptrWord[3] = State[3]; - ptrWord[4] = State[4]; - ptrWord[5] = State[5]; - ptrWord[6] = State[6]; - ptrWord[7] = State[7]; - ptrWord[8] = State[8]; - ptrWord[9] = State[9]; - ptrWord[10] = State[10]; - ptrWord[11] = State[11]; - - //Goes to next block (column) that will receive the squeezed data - ptrWord -= BLOCK_LEN_INT64; - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra( State); - } + uint64_t *ptrWord = + rowOut + (nCols - 1) * BLOCK_LEN_INT64; // In Lyra2: pointer to M[0][C-1] + + for (i = 0; i < nCols; i++) { + ptrWord[0] = State[0]; + ptrWord[1] = State[1]; + ptrWord[2] = State[2]; + ptrWord[3] = State[3]; + ptrWord[4] = State[4]; + ptrWord[5] = State[5]; + ptrWord[6] = State[6]; + ptrWord[7] = State[7]; + ptrWord[8] = State[8]; + ptrWord[9] = State[9]; + ptrWord[10] = State[10]; + ptrWord[11] = State[11]; + + // Goes to next block (column) that will receive the squeezed data + ptrWord -= BLOCK_LEN_INT64; + + // Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(State); + } #endif } @@ -441,174 +469,168 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut, * @param rowIn Row to feed the sponge * @param rowOut Row to receive the sponge's output */ -inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn, - uint64_t *rowOut, uint64_t nCols ) -{ - int i; - -#if defined (__AVX2__) - - register __m256i state0, state1, state2, state3; - __m256i* in = 
(__m256i*)rowIn; - __m256i* out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I ); - - state0 = _mm256_load_si256( (__m256i*)State ); - state1 = _mm256_load_si256( (__m256i*)State + 1 ); - state2 = _mm256_load_si256( (__m256i*)State + 2 ); - state3 = _mm256_load_si256( (__m256i*)State + 3 ); - - for ( i = 0; i < 9; i += 3) - { - _mm_prefetch( in + i, _MM_HINT_T0 ); - _mm_prefetch( in + i + 2, _MM_HINT_T0 ); - _mm_prefetch( out - i, _MM_HINT_T0 ); - _mm_prefetch( out - i - 2, _MM_HINT_T0 ); - } - - for ( i = 0; i < nCols; i++ ) - { - - _mm_prefetch( in + 9, _MM_HINT_T0 ); - _mm_prefetch( in + 11, _MM_HINT_T0 ); - _mm_prefetch( out - 9, _MM_HINT_T0 ); - _mm_prefetch( out - 11, _MM_HINT_T0 ); - - state0 = _mm256_xor_si256( state0, in[0] ); - state1 = _mm256_xor_si256( state1, in[1] ); - state2 = _mm256_xor_si256( state2, in[2] ); - - LYRA_ROUND_AVX2( state0, state1, state2, state3 ); - - out[0] = _mm256_xor_si256( state0, in[0] ); - out[1] = _mm256_xor_si256( state1, in[1] ); - out[2] = _mm256_xor_si256( state2, in[2] ); - - //Input: next column (i.e., next block in sequence) - in += BLOCK_LEN_M256I; - //Output: goes to previous column - out -= BLOCK_LEN_M256I; - } - - _mm256_store_si256( (__m256i*)State, state0 ); - _mm256_store_si256( (__m256i*)State + 1, state1 ); - _mm256_store_si256( (__m256i*)State + 2, state2 ); - _mm256_store_si256( (__m256i*)State + 3, state3 ); - -#elif defined (__SSE2__) - - __m128i* state = (__m128i*)State; - __m128i state0 = _mm_load_si128( state ); - __m128i state1 = _mm_load_si128( &state[1] ); - __m128i state2 = _mm_load_si128( &state[2] ); - __m128i state3 = _mm_load_si128( &state[3] ); - __m128i state4 = _mm_load_si128( &state[4] ); - __m128i state5 = _mm_load_si128( &state[5] ); - __m128i state6 = _mm_load_si128( &state[6] ); - __m128i state7 = _mm_load_si128( &state[7] ); - - __m128i* in = (__m128i*)rowIn; - __m128i* out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I ); - - for ( i = 0; i < 6; i += 3) - { - _mm_prefetch( in 
+ i, _MM_HINT_T0 ); - _mm_prefetch( in + i + 2, _MM_HINT_T0 ); - _mm_prefetch( out - i, _MM_HINT_T0 ); - _mm_prefetch( out - i - 2, _MM_HINT_T0 ); - } - - for ( i = 0; i < nCols; i++ ) - { - _mm_prefetch( in - 6, _MM_HINT_T0 ); - _mm_prefetch( in - 7, _MM_HINT_T0 ); - _mm_prefetch( out - 6, _MM_HINT_T0 ); - _mm_prefetch( out - 7, _MM_HINT_T0 ); - - state0 = _mm_xor_si128( state0, in[0] ); - state1 = _mm_xor_si128( state1, in[1] ); - state2 = _mm_xor_si128( state2, in[2] ); - state3 = _mm_xor_si128( state3, in[3] ); - state4 = _mm_xor_si128( state4, in[4] ); - state5 = _mm_xor_si128( state5, in[5] ); - - //Applies the reduced-round transformation f to the sponge's state - LYRA_ROUND_AVX( state0, state1, state2, state3, - state4, state5, state6, state7 ); - - out[0] = _mm_xor_si128( state0, in[0] ); - out[1] = _mm_xor_si128( state1, in[1] ); - out[2] = _mm_xor_si128( state2, in[2] ); - out[3] = _mm_xor_si128( state3, in[3] ); - out[4] = _mm_xor_si128( state4, in[4] ); - out[5] = _mm_xor_si128( state5, in[5] ); - - //Input: next column (i.e., next block in sequence) - in += BLOCK_LEN_M128I; - //Output: goes to previous column - out -= BLOCK_LEN_M128I; - } - - _mm_store_si128( state, state0 ); - _mm_store_si128( &state[1], state1 ); - _mm_store_si128( &state[2], state2 ); - _mm_store_si128( &state[3], state3 ); - _mm_store_si128( &state[4], state4 ); - _mm_store_si128( &state[5], state5 ); - _mm_store_si128( &state[6], state6 ); - _mm_store_si128( &state[7], state7 ); +inline void reducedDuplexRow1(uint64_t *State, uint64_t *rowIn, + uint64_t *rowOut, uint64_t nCols) { + int i; + +#if defined(__AVX2__) + + register __m256i state0, state1, state2, state3; + __m256i *in = (__m256i *)rowIn; + __m256i *out = (__m256i *)rowOut + ((nCols - 1) * BLOCK_LEN_M256I); + + state0 = _mm256_load_si256((__m256i *)State); + state1 = _mm256_load_si256((__m256i *)State + 1); + state2 = _mm256_load_si256((__m256i *)State + 2); + state3 = _mm256_load_si256((__m256i *)State + 3); + + for (i 
= 0; i < 9; i += 3) { + _mm_prefetch(in + i, _MM_HINT_T0); + _mm_prefetch(in + i + 2, _MM_HINT_T0); + _mm_prefetch(out - i, _MM_HINT_T0); + _mm_prefetch(out - i - 2, _MM_HINT_T0); + } + + for (i = 0; i < nCols; i++) { + + _mm_prefetch(in + 9, _MM_HINT_T0); + _mm_prefetch(in + 11, _MM_HINT_T0); + _mm_prefetch(out - 9, _MM_HINT_T0); + _mm_prefetch(out - 11, _MM_HINT_T0); + + state0 = _mm256_xor_si256(state0, in[0]); + state1 = _mm256_xor_si256(state1, in[1]); + state2 = _mm256_xor_si256(state2, in[2]); + + LYRA_ROUND_AVX2(state0, state1, state2, state3); + + out[0] = _mm256_xor_si256(state0, in[0]); + out[1] = _mm256_xor_si256(state1, in[1]); + out[2] = _mm256_xor_si256(state2, in[2]); + + // Input: next column (i.e., next block in sequence) + in += BLOCK_LEN_M256I; + // Output: goes to previous column + out -= BLOCK_LEN_M256I; + } + + _mm256_store_si256((__m256i *)State, state0); + _mm256_store_si256((__m256i *)State + 1, state1); + _mm256_store_si256((__m256i *)State + 2, state2); + _mm256_store_si256((__m256i *)State + 3, state3); + +#elif defined(__SSE2__) + + __m128i *state = (__m128i *)State; + __m128i state0 = _mm_load_si128(state); + __m128i state1 = _mm_load_si128(&state[1]); + __m128i state2 = _mm_load_si128(&state[2]); + __m128i state3 = _mm_load_si128(&state[3]); + __m128i state4 = _mm_load_si128(&state[4]); + __m128i state5 = _mm_load_si128(&state[5]); + __m128i state6 = _mm_load_si128(&state[6]); + __m128i state7 = _mm_load_si128(&state[7]); + + __m128i *in = (__m128i *)rowIn; + __m128i *out = (__m128i *)rowOut + ((nCols - 1) * BLOCK_LEN_M128I); + + for (i = 0; i < 6; i += 3) { + _mm_prefetch(in + i, _MM_HINT_T0); + _mm_prefetch(in + i + 2, _MM_HINT_T0); + _mm_prefetch(out - i, _MM_HINT_T0); + _mm_prefetch(out - i - 2, _MM_HINT_T0); + } + + for (i = 0; i < nCols; i++) { + _mm_prefetch(in - 6, _MM_HINT_T0); + _mm_prefetch(in - 7, _MM_HINT_T0); + _mm_prefetch(out - 6, _MM_HINT_T0); + _mm_prefetch(out - 7, _MM_HINT_T0); + + state0 = _mm_xor_si128(state0, 
in[0]); + state1 = _mm_xor_si128(state1, in[1]); + state2 = _mm_xor_si128(state2, in[2]); + state3 = _mm_xor_si128(state3, in[3]); + state4 = _mm_xor_si128(state4, in[4]); + state5 = _mm_xor_si128(state5, in[5]); + + // Applies the reduced-round transformation f to the sponge's state + LYRA_ROUND_AVX(state0, state1, state2, state3, state4, state5, state6, + state7); + + out[0] = _mm_xor_si128(state0, in[0]); + out[1] = _mm_xor_si128(state1, in[1]); + out[2] = _mm_xor_si128(state2, in[2]); + out[3] = _mm_xor_si128(state3, in[3]); + out[4] = _mm_xor_si128(state4, in[4]); + out[5] = _mm_xor_si128(state5, in[5]); + + // Input: next column (i.e., next block in sequence) + in += BLOCK_LEN_M128I; + // Output: goes to previous column + out -= BLOCK_LEN_M128I; + } + + _mm_store_si128(state, state0); + _mm_store_si128(&state[1], state1); + _mm_store_si128(&state[2], state2); + _mm_store_si128(&state[3], state3); + _mm_store_si128(&state[4], state4); + _mm_store_si128(&state[5], state5); + _mm_store_si128(&state[6], state6); + _mm_store_si128(&state[7], state7); #else - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row - - for ( i = 0; i < nCols; i++ ) - { - - //Absorbing "M[prev][col]" - State[0] ^= (ptrWordIn[0]); - State[1] ^= (ptrWordIn[1]); - State[2] ^= (ptrWordIn[2]); - State[3] ^= (ptrWordIn[3]); - State[4] ^= (ptrWordIn[4]); - State[5] ^= (ptrWordIn[5]); - State[6] ^= (ptrWordIn[6]); - State[7] ^= (ptrWordIn[7]); - State[8] ^= (ptrWordIn[8]); - State[9] ^= (ptrWordIn[9]); - State[10] ^= (ptrWordIn[10]); - State[11] ^= (ptrWordIn[11]); - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra( State ); - - //M[row][C-1-col] = M[prev][col] XOR rand - ptrWordOut[0] = ptrWordIn[0] ^ State[0]; - ptrWordOut[1] = ptrWordIn[1] ^ State[1]; - ptrWordOut[2] = ptrWordIn[2] ^ State[2]; - ptrWordOut[3] = ptrWordIn[3] ^ State[3]; - ptrWordOut[4] = 
ptrWordIn[4] ^ State[4]; - ptrWordOut[5] = ptrWordIn[5] ^ State[5]; - ptrWordOut[6] = ptrWordIn[6] ^ State[6]; - ptrWordOut[7] = ptrWordIn[7] ^ State[7]; - ptrWordOut[8] = ptrWordIn[8] ^ State[8]; - ptrWordOut[9] = ptrWordIn[9] ^ State[9]; - ptrWordOut[10] = ptrWordIn[10] ^ State[10]; - ptrWordOut[11] = ptrWordIn[11] ^ State[11]; - - //Input: next column (i.e., next block in sequence) - ptrWordIn += BLOCK_LEN_INT64; - //Output: goes to previous column - ptrWordOut -= BLOCK_LEN_INT64; - - } + uint64_t *ptrWordIn = rowIn; // In Lyra2: pointer to prev + uint64_t *ptrWordOut = + rowOut + (nCols - 1) * BLOCK_LEN_INT64; // In Lyra2: pointer to row + + for (i = 0; i < nCols; i++) { + + // Absorbing "M[prev][col]" + State[0] ^= (ptrWordIn[0]); + State[1] ^= (ptrWordIn[1]); + State[2] ^= (ptrWordIn[2]); + State[3] ^= (ptrWordIn[3]); + State[4] ^= (ptrWordIn[4]); + State[5] ^= (ptrWordIn[5]); + State[6] ^= (ptrWordIn[6]); + State[7] ^= (ptrWordIn[7]); + State[8] ^= (ptrWordIn[8]); + State[9] ^= (ptrWordIn[9]); + State[10] ^= (ptrWordIn[10]); + State[11] ^= (ptrWordIn[11]); + + // Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(State); + + // M[row][C-1-col] = M[prev][col] XOR rand + ptrWordOut[0] = ptrWordIn[0] ^ State[0]; + ptrWordOut[1] = ptrWordIn[1] ^ State[1]; + ptrWordOut[2] = ptrWordIn[2] ^ State[2]; + ptrWordOut[3] = ptrWordIn[3] ^ State[3]; + ptrWordOut[4] = ptrWordIn[4] ^ State[4]; + ptrWordOut[5] = ptrWordIn[5] ^ State[5]; + ptrWordOut[6] = ptrWordIn[6] ^ State[6]; + ptrWordOut[7] = ptrWordIn[7] ^ State[7]; + ptrWordOut[8] = ptrWordIn[8] ^ State[8]; + ptrWordOut[9] = ptrWordIn[9] ^ State[9]; + ptrWordOut[10] = ptrWordIn[10] ^ State[10]; + ptrWordOut[11] = ptrWordIn[11] ^ State[11]; + + // Input: next column (i.e., next block in sequence) + ptrWordIn += BLOCK_LEN_INT64; + // Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } #endif } /** - * Performs a duplexing operation over "M[rowInOut][col] [+] 
M[rowIn][col]" (i.e., - * the wordwise addition of two columns, ignoring carries between words). The - * output of this operation, "rand", is then used to make + * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" + * (i.e., the wordwise addition of two columns, ignoring carries between words). + * The output of this operation, "rand", is then used to make * "M[rowOut][(N_COLS-1)-col] = M[rowIn][col] XOR rand" and * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit * rotation to the left and N_COLS is a system parameter. @@ -619,254 +641,268 @@ inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn, * @param rowOut Row receiving the output * */ -inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn, - uint64_t *rowInOut, uint64_t *rowOut, - uint64_t nCols ) -{ - int i; - -#if defined (__AVX2__) - - register __m256i state0, state1, state2, state3; - __m256i* in = (__m256i*)rowIn; - __m256i* inout = (__m256i*)rowInOut; - __m256i* out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I ); - __m256i t0, t1, t2; - - state0 = _mm256_load_si256( (__m256i*)State ); - state1 = _mm256_load_si256( (__m256i*)State + 1 ); - state2 = _mm256_load_si256( (__m256i*)State + 2 ); - state3 = _mm256_load_si256( (__m256i*)State + 3 ); - - for ( i = 0; i < 9; i += 3) - { - _mm_prefetch( in + i, _MM_HINT_T0 ); - _mm_prefetch( in + i + 2, _MM_HINT_T0 ); - _mm_prefetch( inout + i, _MM_HINT_T0 ); - _mm_prefetch( inout + i + 2, _MM_HINT_T0 ); - _mm_prefetch( out - i, _MM_HINT_T0 ); - _mm_prefetch( out - i - 2, _MM_HINT_T0 ); - } - - for ( i = 0; i < nCols; i++ ) - { - _mm_prefetch( in + 9, _MM_HINT_T0 ); - _mm_prefetch( in + 11, _MM_HINT_T0 ); - _mm_prefetch( inout + 9, _MM_HINT_T0 ); - _mm_prefetch( inout + 11, _MM_HINT_T0 ); - _mm_prefetch( out - 9, _MM_HINT_T0 ); - _mm_prefetch( out - 11, _MM_HINT_T0 ); - - state0 = _mm256_xor_si256( state0, - _mm256_add_epi64( in[0], inout[0] ) ); - state1 = _mm256_xor_si256( state1, 
- _mm256_add_epi64( in[1], inout[1] ) ); - state2 = _mm256_xor_si256( state2, - _mm256_add_epi64( in[2], inout[2] ) ); - - LYRA_ROUND_AVX2( state0, state1, state2, state3 ); - - out[0] = _mm256_xor_si256( state0, in[0] ); - out[1] = _mm256_xor_si256( state1, in[1] ); - out[2] = _mm256_xor_si256( state2, in[2] ); - - //M[row*][col] = M[row*][col] XOR rotW(rand) - t0 = _mm256_permute4x64_epi64( state0, 0x93 ); - t1 = _mm256_permute4x64_epi64( state1, 0x93 ); - t2 = _mm256_permute4x64_epi64( state2, 0x93 ); - - inout[0] = _mm256_xor_si256( inout[0], - _mm256_blend_epi32( t0, t2, 0x03 ) ); - inout[1] = _mm256_xor_si256( inout[1], - _mm256_blend_epi32( t1, t0, 0x03 ) ); - inout[2] = _mm256_xor_si256( inout[2], - _mm256_blend_epi32( t2, t1, 0x03 ) ); - - //Inputs: next column (i.e., next block in sequence) - in += BLOCK_LEN_M256I; - inout += BLOCK_LEN_M256I; - //Output: goes to previous column - out -= BLOCK_LEN_M256I; - } - - _mm256_store_si256( (__m256i*)State, state0 ); - _mm256_store_si256( (__m256i*)State + 1, state1 ); - _mm256_store_si256( (__m256i*)State + 2, state2 ); - _mm256_store_si256( (__m256i*)State + 3, state3 ); - -#elif defined (__SSE2__) - - __m128i* in = (__m128i*)rowIn; - __m128i* inout = (__m128i*)rowInOut; - __m128i* out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I ); - - for ( i = 0; i < 6; i += 3) - { - _mm_prefetch( in + i, _MM_HINT_T0 ); - _mm_prefetch( in + i + 2, _MM_HINT_T0 ); - _mm_prefetch( inout + i, _MM_HINT_T0 ); - _mm_prefetch( inout + i + 2, _MM_HINT_T0 ); - _mm_prefetch( out - i, _MM_HINT_T0 ); - _mm_prefetch( out - i - 2, _MM_HINT_T0 ); - } - - __m128i* state = (__m128i*)State; - - // For the last round in this function not optimized for AVX -// uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev -// uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* -// uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row - - for ( i = 0; i < nCols; i++ ) - { - _mm_prefetch( in + 6, 
_MM_HINT_T0 ); - _mm_prefetch( in + 7, _MM_HINT_T0 ); - _mm_prefetch( inout + 6, _MM_HINT_T0 ); - _mm_prefetch( inout + 7, _MM_HINT_T0 ); - _mm_prefetch( out - 6, _MM_HINT_T0 ); - _mm_prefetch( out - 7, _MM_HINT_T0 ); - - state[0] = _mm_xor_si128( state[0], - _mm_add_epi64( in[0], inout[0] ) ); - state[1] = _mm_xor_si128( state[1], - _mm_add_epi64( in[1], inout[1] ) ); - state[2] = _mm_xor_si128( state[2], - _mm_add_epi64( in[2], inout[2] ) ); - state[3] = _mm_xor_si128( state[3], - _mm_add_epi64( in[3], inout[3] ) ); - state[4] = _mm_xor_si128( state[4], - _mm_add_epi64( in[4], inout[4] ) ); - state[5] = _mm_xor_si128( state[5], - _mm_add_epi64( in[5], inout[5] ) ); - - //Applies the reduced-round transformation f to the sponge's state - LYRA_ROUND_AVX( state[0], state[1], state[2], state[3], - state[4], state[5], state[6], state[7] ); - - out[0] = _mm_xor_si128( state[0], in[0] ); - out[1] = _mm_xor_si128( state[1], in[1] ); - out[2] = _mm_xor_si128( state[2], in[2] ); - out[3] = _mm_xor_si128( state[3], in[3] ); - out[4] = _mm_xor_si128( state[4], in[4] ); - out[5] = _mm_xor_si128( state[5], in[5] ); - - - __m128i t0, t1; - t0 = _mm_srli_si128( state[0], 8 ); - t1 = _mm_srli_si128( state[1], 8 ); - inout[0] = _mm_xor_si128( inout[0], - _mm_or_si128( _mm_slli_si128( state[0], 8 ), - _mm_srli_si128( state[5], 8 ) ) ); - inout[1] = _mm_xor_si128( inout[1], - _mm_or_si128( _mm_slli_si128( state[1], 8 ), t0 ) ); - t0 = _mm_srli_si128( state[2], 8 ); - inout[2] = _mm_xor_si128( inout[2], - _mm_or_si128( _mm_slli_si128( state[2], 8 ), t1 ) ); - t1 = _mm_srli_si128( state[3], 8 ); - inout[3] = _mm_xor_si128( inout[3], - _mm_or_si128( _mm_slli_si128( state[3], 8 ), t0 ) ); - t0 = _mm_srli_si128( state[4], 8 ); - inout[4] = _mm_xor_si128( inout[4], - _mm_or_si128( _mm_slli_si128( state[4], 8 ), t1 ) ); - inout[5] = _mm_xor_si128( inout[5], - _mm_or_si128( _mm_slli_si128( state[5], 8 ), t0 ) ); - -/* - ptrWordInOut[0] ^= State[11]; - ptrWordInOut[1] ^= State[0]; - 
ptrWordInOut[2] ^= State[1]; - ptrWordInOut[3] ^= State[2]; - ptrWordInOut[4] ^= State[3]; - ptrWordInOut[5] ^= State[4]; - ptrWordInOut[6] ^= State[5]; - ptrWordInOut[7] ^= State[6]; - ptrWordInOut[8] ^= State[7]; - ptrWordInOut[9] ^= State[8]; - ptrWordInOut[10] ^= State[9]; - ptrWordInOut[11] ^= State[10]; - - //Inputs: next column (i.e., next block in sequence) - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - //Output: goes to previous column - ptrWordOut -= BLOCK_LEN_INT64; -*/ - inout += BLOCK_LEN_M128I; - in += BLOCK_LEN_M128I; - out -= BLOCK_LEN_M128I; - } +inline void reducedDuplexRowSetup(uint64_t *State, uint64_t *rowIn, + uint64_t *rowInOut, uint64_t *rowOut, + uint64_t nCols) { + int i; + +#if defined(__AVX2__) + + register __m256i state0, state1, state2, state3; + __m256i *in = (__m256i *)rowIn; + __m256i *inout = (__m256i *)rowInOut; + __m256i *out = (__m256i *)rowOut + ((nCols - 1) * BLOCK_LEN_M256I); + __m256i t0, t1, t2; + + state0 = _mm256_load_si256((__m256i *)State); + state1 = _mm256_load_si256((__m256i *)State + 1); + state2 = _mm256_load_si256((__m256i *)State + 2); + state3 = _mm256_load_si256((__m256i *)State + 3); + + for (i = 0; i < 9; i += 3) { + _mm_prefetch(in + i, _MM_HINT_T0); + _mm_prefetch(in + i + 2, _MM_HINT_T0); + _mm_prefetch(inout + i, _MM_HINT_T0); + _mm_prefetch(inout + i + 2, _MM_HINT_T0); + _mm_prefetch(out - i, _MM_HINT_T0); + _mm_prefetch(out - i - 2, _MM_HINT_T0); + } + + for (i = 0; i < nCols; i++) { + _mm_prefetch(in + 9, _MM_HINT_T0); + _mm_prefetch(in + 11, _MM_HINT_T0); + _mm_prefetch(inout + 9, _MM_HINT_T0); + _mm_prefetch(inout + 11, _MM_HINT_T0); + _mm_prefetch(out - 9, _MM_HINT_T0); + _mm_prefetch(out - 11, _MM_HINT_T0); + + state0 = _mm256_xor_si256(state0, _mm256_add_epi64(in[0], inout[0])); + state1 = _mm256_xor_si256(state1, _mm256_add_epi64(in[1], inout[1])); + state2 = _mm256_xor_si256(state2, _mm256_add_epi64(in[2], inout[2])); + + LYRA_ROUND_AVX2(state0, state1, state2, state3); + + 
out[0] = _mm256_xor_si256(state0, in[0]); + out[1] = _mm256_xor_si256(state1, in[1]); + out[2] = _mm256_xor_si256(state2, in[2]); + + /* + printf("s duplexsetup col= %d\n",i); + uint64_t * o = (uint64_t*)out; + printf("S out %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]); + printf("S out %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]); + printf("S out %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]); + printf("S out %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]); + printf("S out %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]); + printf("S out %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]); + */ + + // M[row*][col] = M[row*][col] XOR rotW(rand) + t0 = _mm256_permute4x64_epi64(state0, 0x93); + t1 = _mm256_permute4x64_epi64(state1, 0x93); + t2 = _mm256_permute4x64_epi64(state2, 0x93); + + /* + uint64_t *t = (uint64_t*)&t0; + printf("S t0 %016lx %016lx %016lx %016lx\n",t[0],t[1],t[2],t[3]); + + o = (uint64_t*)inout; + printf("S inout0 %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]); + printf("S inout0 %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]); + printf("S inout0 %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]); + printf("S inout0 %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]); + printf("S inout0 %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]); + printf("S inout0 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]); + */ + inout[0] = _mm256_xor_si256(inout[0], _mm256_blend_epi32(t0, t2, 0x03)); + inout[1] = _mm256_xor_si256(inout[1], _mm256_blend_epi32(t1, t0, 0x03)); + inout[2] = _mm256_xor_si256(inout[2], _mm256_blend_epi32(t2, t1, 0x03)); + + /* + o = (uint64_t*)inout; + printf("S inout1 %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]); + printf("S inout1 %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]); + printf("S inout1 %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]); + printf("S inout1 %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]); + printf("S inout1 %016lx %016lx 
%016lx %016lx\n",o[16],o[17],o[18],o[19]); + printf("S inout1 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]); + */ + + // Inputs: next column (i.e., next block in sequence) + in += BLOCK_LEN_M256I; + inout += BLOCK_LEN_M256I; + // Output: goes to previous column + out -= BLOCK_LEN_M256I; + } + + _mm256_store_si256((__m256i *)State, state0); + _mm256_store_si256((__m256i *)State + 1, state1); + _mm256_store_si256((__m256i *)State + 2, state2); + _mm256_store_si256((__m256i *)State + 3, state3); + +#elif defined(__SSE2__) + + __m128i *in = (__m128i *)rowIn; + __m128i *inout = (__m128i *)rowInOut; + __m128i *out = (__m128i *)rowOut + ((nCols - 1) * BLOCK_LEN_M128I); + + for (i = 0; i < 6; i += 3) { + _mm_prefetch(in + i, _MM_HINT_T0); + _mm_prefetch(in + i + 2, _MM_HINT_T0); + _mm_prefetch(inout + i, _MM_HINT_T0); + _mm_prefetch(inout + i + 2, _MM_HINT_T0); + _mm_prefetch(out - i, _MM_HINT_T0); + _mm_prefetch(out - i - 2, _MM_HINT_T0); + } + + __m128i *state = (__m128i *)State; + + // For the last round in this function not optimized for AVX + // uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + // uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + // uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: + // pointer to row + + for (i = 0; i < nCols; i++) { + _mm_prefetch(in + 6, _MM_HINT_T0); + _mm_prefetch(in + 7, _MM_HINT_T0); + _mm_prefetch(inout + 6, _MM_HINT_T0); + _mm_prefetch(inout + 7, _MM_HINT_T0); + _mm_prefetch(out - 6, _MM_HINT_T0); + _mm_prefetch(out - 7, _MM_HINT_T0); + + state[0] = _mm_xor_si128(state[0], _mm_add_epi64(in[0], inout[0])); + state[1] = _mm_xor_si128(state[1], _mm_add_epi64(in[1], inout[1])); + state[2] = _mm_xor_si128(state[2], _mm_add_epi64(in[2], inout[2])); + state[3] = _mm_xor_si128(state[3], _mm_add_epi64(in[3], inout[3])); + state[4] = _mm_xor_si128(state[4], _mm_add_epi64(in[4], inout[4])); + state[5] = _mm_xor_si128(state[5], _mm_add_epi64(in[5], inout[5])); + + // Applies the 
reduced-round transformation f to the sponge's state + LYRA_ROUND_AVX(state[0], state[1], state[2], state[3], state[4], state[5], + state[6], state[7]); + + out[0] = _mm_xor_si128(state[0], in[0]); + out[1] = _mm_xor_si128(state[1], in[1]); + out[2] = _mm_xor_si128(state[2], in[2]); + out[3] = _mm_xor_si128(state[3], in[3]); + out[4] = _mm_xor_si128(state[4], in[4]); + out[5] = _mm_xor_si128(state[5], in[5]); + + __m128i t0, t1; + t0 = _mm_srli_si128(state[0], 8); + t1 = _mm_srli_si128(state[1], 8); + inout[0] = + _mm_xor_si128(inout[0], _mm_or_si128(_mm_slli_si128(state[0], 8), + _mm_srli_si128(state[5], 8))); + inout[1] = + _mm_xor_si128(inout[1], _mm_or_si128(_mm_slli_si128(state[1], 8), t0)); + t0 = _mm_srli_si128(state[2], 8); + inout[2] = + _mm_xor_si128(inout[2], _mm_or_si128(_mm_slli_si128(state[2], 8), t1)); + t1 = _mm_srli_si128(state[3], 8); + inout[3] = + _mm_xor_si128(inout[3], _mm_or_si128(_mm_slli_si128(state[3], 8), t0)); + t0 = _mm_srli_si128(state[4], 8); + inout[4] = + _mm_xor_si128(inout[4], _mm_or_si128(_mm_slli_si128(state[4], 8), t1)); + inout[5] = + _mm_xor_si128(inout[5], _mm_or_si128(_mm_slli_si128(state[5], 8), t0)); + + /* + ptrWordInOut[0] ^= State[11]; + ptrWordInOut[1] ^= State[0]; + ptrWordInOut[2] ^= State[1]; + ptrWordInOut[3] ^= State[2]; + ptrWordInOut[4] ^= State[3]; + ptrWordInOut[5] ^= State[4]; + ptrWordInOut[6] ^= State[5]; + ptrWordInOut[7] ^= State[6]; + ptrWordInOut[8] ^= State[7]; + ptrWordInOut[9] ^= State[8]; + ptrWordInOut[10] ^= State[9]; + ptrWordInOut[11] ^= State[10]; + + //Inputs: next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + */ + inout += BLOCK_LEN_M128I; + in += BLOCK_LEN_M128I; + out -= BLOCK_LEN_M128I; + } #else - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordOut = rowOut + 
(nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row - - for ( i = 0; i < nCols; i++ ) - { - - //Absorbing "M[prev] [+] M[row*]" - State[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); - State[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); - State[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); - State[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); - State[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); - State[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); - State[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); - State[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); - State[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); - State[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); - State[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); - State[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra( State ); - - //M[row][col] = M[prev][col] XOR rand - ptrWordOut[0] = ptrWordIn[0] ^ State[0]; - ptrWordOut[1] = ptrWordIn[1] ^ State[1]; - ptrWordOut[2] = ptrWordIn[2] ^ State[2]; - ptrWordOut[3] = ptrWordIn[3] ^ State[3]; - ptrWordOut[4] = ptrWordIn[4] ^ State[4]; - ptrWordOut[5] = ptrWordIn[5] ^ State[5]; - ptrWordOut[6] = ptrWordIn[6] ^ State[6]; - ptrWordOut[7] = ptrWordIn[7] ^ State[7]; - ptrWordOut[8] = ptrWordIn[8] ^ State[8]; - ptrWordOut[9] = ptrWordIn[9] ^ State[9]; - ptrWordOut[10] = ptrWordIn[10] ^ State[10]; - ptrWordOut[11] = ptrWordIn[11] ^ State[11]; - - ptrWordInOut[0] ^= State[11]; - ptrWordInOut[1] ^= State[0]; - ptrWordInOut[2] ^= State[1]; - ptrWordInOut[3] ^= State[2]; - ptrWordInOut[4] ^= State[3]; - ptrWordInOut[5] ^= State[4]; - ptrWordInOut[6] ^= State[5]; - ptrWordInOut[7] ^= State[6]; - ptrWordInOut[8] ^= State[7]; - ptrWordInOut[9] ^= State[8]; - ptrWordInOut[10] ^= State[9]; - ptrWordInOut[11] ^= State[10]; - - //Inputs: next column (i.e., next block in sequence) - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - //Output: goes to previous column - ptrWordOut -= BLOCK_LEN_INT64; - } + uint64_t *ptrWordIn = rowIn; // In Lyra2: 
pointer to prev + uint64_t *ptrWordInOut = rowInOut; // In Lyra2: pointer to row* + uint64_t *ptrWordOut = + rowOut + (nCols - 1) * BLOCK_LEN_INT64; // In Lyra2: pointer to row + + for (i = 0; i < nCols; i++) { + + // Absorbing "M[prev] [+] M[row*]" + State[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); + State[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); + State[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); + State[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); + State[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); + State[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); + State[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); + State[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); + State[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); + State[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); + State[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); + State[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); + + // Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(State); + + // M[row][col] = M[prev][col] XOR rand + ptrWordOut[0] = ptrWordIn[0] ^ State[0]; + ptrWordOut[1] = ptrWordIn[1] ^ State[1]; + ptrWordOut[2] = ptrWordIn[2] ^ State[2]; + ptrWordOut[3] = ptrWordIn[3] ^ State[3]; + ptrWordOut[4] = ptrWordIn[4] ^ State[4]; + ptrWordOut[5] = ptrWordIn[5] ^ State[5]; + ptrWordOut[6] = ptrWordIn[6] ^ State[6]; + ptrWordOut[7] = ptrWordIn[7] ^ State[7]; + ptrWordOut[8] = ptrWordIn[8] ^ State[8]; + ptrWordOut[9] = ptrWordIn[9] ^ State[9]; + ptrWordOut[10] = ptrWordIn[10] ^ State[10]; + ptrWordOut[11] = ptrWordIn[11] ^ State[11]; + + ptrWordInOut[0] ^= State[11]; + ptrWordInOut[1] ^= State[0]; + ptrWordInOut[2] ^= State[1]; + ptrWordInOut[3] ^= State[2]; + ptrWordInOut[4] ^= State[3]; + ptrWordInOut[5] ^= State[4]; + ptrWordInOut[6] ^= State[5]; + ptrWordInOut[7] ^= State[6]; + ptrWordInOut[8] ^= State[7]; + ptrWordInOut[9] ^= State[8]; + ptrWordInOut[10] ^= State[9]; + ptrWordInOut[11] ^= State[10]; + + // Inputs: next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += 
BLOCK_LEN_INT64; + // Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } #endif - } /** - * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., - * the wordwise addition of two columns, ignoring carries between words). The - * output of this operation, "rand", is then used to make - * "M[rowOut][col] = M[rowOut][col] XOR rand" and - * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit - * rotation to the left. + * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" + * (i.e., the wordwise addition of two columns, ignoring carries between words). + * The output of this operation, "rand", is then used to make "M[rowOut][col] = + * M[rowOut][col] XOR rand" and "M[rowInOut][col] = M[rowInOut][col] XOR + * rotW(rand)", where rotW is a 64-bit rotation to the left. * * @param state The current state of the sponge * @param rowIn Row used only as input @@ -875,226 +911,217 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn, * */ -inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn, - uint64_t *rowInOut, uint64_t *rowOut, - uint64_t nCols ) -{ - int i; +inline void reducedDuplexRow(uint64_t *State, uint64_t *rowIn, + uint64_t *rowInOut, uint64_t *rowOut, + uint64_t nCols) { + int i; #if defined __AVX2__ - register __m256i state0, state1, state2, state3; - __m256i* in = (__m256i*)rowIn; - __m256i* inout = (__m256i*)rowInOut; - __m256i* out = (__m256i*)rowOut; - __m256i t0, t1, t2; - - state0 = _mm256_load_si256( (__m256i*)State ); - state1 = _mm256_load_si256( (__m256i*)State + 1 ); - state2 = _mm256_load_si256( (__m256i*)State + 2 ); - state3 = _mm256_load_si256( (__m256i*)State + 3 ); - - for ( i = 0; i < 9; i += 3) - { - _mm_prefetch( in + i, _MM_HINT_T0 ); - _mm_prefetch( in + i + 2, _MM_HINT_T0 ); - _mm_prefetch( out + i, _MM_HINT_T0 ); - _mm_prefetch( out + i + 2, _MM_HINT_T0 ); - _mm_prefetch( inout + i, _MM_HINT_T0 ); - _mm_prefetch( inout + i + 
2, _MM_HINT_T0 ); - } - - for ( i = 0; i < nCols; i++ ) - { - _mm_prefetch( in + 9, _MM_HINT_T0 ); - _mm_prefetch( in + 11, _MM_HINT_T0 ); - _mm_prefetch( out + 9, _MM_HINT_T0 ); - _mm_prefetch( out + 11, _MM_HINT_T0 ); - _mm_prefetch( inout + 9, _MM_HINT_T0 ); - _mm_prefetch( inout + 11, _MM_HINT_T0 ); - - //Absorbing "M[prev] [+] M[row*]" - state0 = _mm256_xor_si256( state0, - _mm256_add_epi64( in[0], inout[0] ) ); - state1 = _mm256_xor_si256( state1, - _mm256_add_epi64( in[1], inout[1] ) ); - state2 = _mm256_xor_si256( state2, - _mm256_add_epi64( in[2], inout[2] ) ); - - //Applies the reduced-round transformation f to the sponge's state - LYRA_ROUND_AVX2( state0, state1, state2, state3 ); - - //M[rowOut][col] = M[rowOut][col] XOR rand - out[0] = _mm256_xor_si256( out[0], state0 ); - out[1] = _mm256_xor_si256( out[1], state1 ); - out[2] = _mm256_xor_si256( out[2], state2 ); - - //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) - t0 = _mm256_permute4x64_epi64( state0, 0x93 ); - t1 = _mm256_permute4x64_epi64( state1, 0x93 ); - t2 = _mm256_permute4x64_epi64( state2, 0x93 ); - - inout[0] = _mm256_xor_si256( inout[0], - _mm256_blend_epi32( t0, t2, 0x03 ) ); - inout[1] = _mm256_xor_si256( inout[1], - _mm256_blend_epi32( t1, t0, 0x03 ) ); - inout[2] = _mm256_xor_si256( inout[2], - _mm256_blend_epi32( t2, t1, 0x03 ) ); - - //Goes to next block - in += BLOCK_LEN_M256I; - out += BLOCK_LEN_M256I; - inout += BLOCK_LEN_M256I; - } - - _mm256_store_si256( (__m256i*)State, state0 ); - _mm256_store_si256( (__m256i*)State + 1, state1 ); - _mm256_store_si256( (__m256i*)State + 2, state2 ); - _mm256_store_si256( (__m256i*)State + 3, state3 ); - -#elif defined (__SSE2__) - - __m128i* state = (__m128i*)State; - __m128i* in = (__m128i*)rowIn; - __m128i* inout = (__m128i*)rowInOut; - __m128i* out = (__m128i*)rowOut; - - for ( i = 0; i < 6; i += 3) - { - _mm_prefetch( in + i, _MM_HINT_T0 ); - _mm_prefetch( in + i + 2, _MM_HINT_T0 ); - _mm_prefetch( out - i, _MM_HINT_T0 ); - 
_mm_prefetch( out - i - 2, _MM_HINT_T0 ); - _mm_prefetch( inout + i, _MM_HINT_T0 ); - _mm_prefetch( inout + i + 2, _MM_HINT_T0 ); - } - - // for the last round in this function that isn't optimized for AVX - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row - - for ( i = 0; i < nCols; i++) - { - _mm_prefetch( in + 6, _MM_HINT_T0 ); - _mm_prefetch( in + 7, _MM_HINT_T0 ); - _mm_prefetch( out - 6, _MM_HINT_T0 ); - _mm_prefetch( out - 7, _MM_HINT_T0 ); - _mm_prefetch( inout + 6, _MM_HINT_T0 ); - _mm_prefetch( inout + 7, _MM_HINT_T0 ); - - state[0] = _mm_xor_si128( state[0], - _mm_add_epi64( in[0], inout[0] ) ); - state[1] = _mm_xor_si128( state[1], - _mm_add_epi64( in[1], - inout[1] ) ); - state[2] = _mm_xor_si128( state[2], - _mm_add_epi64( in[2], - inout[2] ) ); - state[3] = _mm_xor_si128( state[3], - _mm_add_epi64( in[3], - inout[3] ) ); - state[4] = _mm_xor_si128( state[4], - _mm_add_epi64( in[4], - inout[4] ) ); - state[5] = _mm_xor_si128( state[5], - _mm_add_epi64( in[5], - inout[5] ) ); - - //Applies the reduced-round transformation f to the sponge's state - LYRA_ROUND_AVX( state[0], state[1], state[2], state[3], - state[4], state[5], state[6], state[7] ); - - out[0] = _mm_xor_si128( state[0], out[0] ); - out[1] = _mm_xor_si128( state[1], out[1] ); - out[2] = _mm_xor_si128( state[2], out[2] ); - out[3] = _mm_xor_si128( state[3], out[3] ); - out[4] = _mm_xor_si128( state[4], out[4] ); - out[5] = _mm_xor_si128( state[5], out[5] ); - - //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) - ptrWordInOut[0] ^= State[11]; - ptrWordInOut[1] ^= State[0]; - ptrWordInOut[2] ^= State[1]; - ptrWordInOut[3] ^= State[2]; - ptrWordInOut[4] ^= State[3]; - ptrWordInOut[5] ^= State[4]; - ptrWordInOut[6] ^= State[5]; - ptrWordInOut[7] ^= State[6]; - ptrWordInOut[8] ^= State[7]; - ptrWordInOut[9] ^= State[8]; - ptrWordInOut[10] ^= State[9]; - 
ptrWordInOut[11] ^= State[10]; - - //Goes to next block - ptrWordOut += BLOCK_LEN_INT64; - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - - out += BLOCK_LEN_M128I; - inout += BLOCK_LEN_M128I; - in += BLOCK_LEN_M128I; - } + register __m256i state0, state1, state2, state3; + __m256i *in = (__m256i *)rowIn; + __m256i *inout = (__m256i *)rowInOut; + __m256i *out = (__m256i *)rowOut; + __m256i t0, t1, t2; + + state0 = _mm256_load_si256((__m256i *)State); + state1 = _mm256_load_si256((__m256i *)State + 1); + state2 = _mm256_load_si256((__m256i *)State + 2); + state3 = _mm256_load_si256((__m256i *)State + 3); + + for (i = 0; i < 9; i += 3) { + _mm_prefetch(in + i, _MM_HINT_T0); + _mm_prefetch(in + i + 2, _MM_HINT_T0); + _mm_prefetch(out + i, _MM_HINT_T0); + _mm_prefetch(out + i + 2, _MM_HINT_T0); + _mm_prefetch(inout + i, _MM_HINT_T0); + _mm_prefetch(inout + i + 2, _MM_HINT_T0); + } + + for (i = 0; i < nCols; i++) { + _mm_prefetch(in + 9, _MM_HINT_T0); + _mm_prefetch(in + 11, _MM_HINT_T0); + _mm_prefetch(out + 9, _MM_HINT_T0); + _mm_prefetch(out + 11, _MM_HINT_T0); + _mm_prefetch(inout + 9, _MM_HINT_T0); + _mm_prefetch(inout + 11, _MM_HINT_T0); + + /* + uint64_t *io = (uint64_t*)inout; + uint64_t *ii = (uint64_t*)in; + + printf("RDRS1 col= %d\n", i); + printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[0],io[1],io[2],io[3]); + printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[4],io[5],io[6],io[7]); + printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[8],io[9],io[10],io[11]); + printf("RDRS1 IO %016lx %016lx %016lx + %016lx\n",io[12],io[13],io[14],io[15]); printf("RDRS1 IN %016lx %016lx + %016lx %016lx\n",ii[0],ii[1],ii[2],ii[3]); printf("RDRS1 IN %016lx %016lx + %016lx %016lx\n",ii[4],ii[5],ii[6],ii[7]); printf("RDRS1 IN %016lx %016lx + %016lx %016lx\n",ii[8],ii[9],ii[10],ii[11]); printf("RDRS1 IN %016lx %016lx + %016lx %016lx\n",ii[12],ii[13],ii[14],ii[15]); + */ + + // Absorbing "M[prev] [+] M[row*]" + state0 = _mm256_xor_si256(state0, 
_mm256_add_epi64(in[0], inout[0])); + state1 = _mm256_xor_si256(state1, _mm256_add_epi64(in[1], inout[1])); + state2 = _mm256_xor_si256(state2, _mm256_add_epi64(in[2], inout[2])); + + // Applies the reduced-round transformation f to the sponge's state + LYRA_ROUND_AVX2(state0, state1, state2, state3); + + // M[rowOut][col] = M[rowOut][col] XOR rand + out[0] = _mm256_xor_si256(out[0], state0); + out[1] = _mm256_xor_si256(out[1], state1); + out[2] = _mm256_xor_si256(out[2], state2); + + // M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) + t0 = _mm256_permute4x64_epi64(state0, 0x93); + t1 = _mm256_permute4x64_epi64(state1, 0x93); + t2 = _mm256_permute4x64_epi64(state2, 0x93); + + inout[0] = _mm256_xor_si256(inout[0], _mm256_blend_epi32(t0, t2, 0x03)); + inout[1] = _mm256_xor_si256(inout[1], _mm256_blend_epi32(t1, t0, 0x03)); + inout[2] = _mm256_xor_si256(inout[2], _mm256_blend_epi32(t2, t1, 0x03)); + + // Goes to next block + in += BLOCK_LEN_M256I; + out += BLOCK_LEN_M256I; + inout += BLOCK_LEN_M256I; + } + + _mm256_store_si256((__m256i *)State, state0); + _mm256_store_si256((__m256i *)State + 1, state1); + _mm256_store_si256((__m256i *)State + 2, state2); + _mm256_store_si256((__m256i *)State + 3, state3); + +#elif defined(__SSE2__) + + __m128i *state = (__m128i *)State; + __m128i *in = (__m128i *)rowIn; + __m128i *inout = (__m128i *)rowInOut; + __m128i *out = (__m128i *)rowOut; + + for (i = 0; i < 6; i += 3) { + _mm_prefetch(in + i, _MM_HINT_T0); + _mm_prefetch(in + i + 2, _MM_HINT_T0); + _mm_prefetch(out - i, _MM_HINT_T0); + _mm_prefetch(out - i - 2, _MM_HINT_T0); + _mm_prefetch(inout + i, _MM_HINT_T0); + _mm_prefetch(inout + i + 2, _MM_HINT_T0); + } + + // for the last round in this function that isn't optimized for AVX + uint64_t *ptrWordInOut = rowInOut; // In Lyra2: pointer to row* + uint64_t *ptrWordIn = rowIn; // In Lyra2: pointer to prev + uint64_t *ptrWordOut = rowOut; // In Lyra2: pointer to row + + for (i = 0; i < nCols; i++) { + _mm_prefetch(in + 6, 
_MM_HINT_T0); + _mm_prefetch(in + 7, _MM_HINT_T0); + _mm_prefetch(out - 6, _MM_HINT_T0); + _mm_prefetch(out - 7, _MM_HINT_T0); + _mm_prefetch(inout + 6, _MM_HINT_T0); + _mm_prefetch(inout + 7, _MM_HINT_T0); + + state[0] = _mm_xor_si128(state[0], _mm_add_epi64(in[0], inout[0])); + state[1] = _mm_xor_si128(state[1], _mm_add_epi64(in[1], inout[1])); + state[2] = _mm_xor_si128(state[2], _mm_add_epi64(in[2], inout[2])); + state[3] = _mm_xor_si128(state[3], _mm_add_epi64(in[3], inout[3])); + state[4] = _mm_xor_si128(state[4], _mm_add_epi64(in[4], inout[4])); + state[5] = _mm_xor_si128(state[5], _mm_add_epi64(in[5], inout[5])); + + // Applies the reduced-round transformation f to the sponge's state + LYRA_ROUND_AVX(state[0], state[1], state[2], state[3], state[4], state[5], + state[6], state[7]); + + out[0] = _mm_xor_si128(state[0], out[0]); + out[1] = _mm_xor_si128(state[1], out[1]); + out[2] = _mm_xor_si128(state[2], out[2]); + out[3] = _mm_xor_si128(state[3], out[3]); + out[4] = _mm_xor_si128(state[4], out[4]); + out[5] = _mm_xor_si128(state[5], out[5]); + + // M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) + ptrWordInOut[0] ^= State[11]; + ptrWordInOut[1] ^= State[0]; + ptrWordInOut[2] ^= State[1]; + ptrWordInOut[3] ^= State[2]; + ptrWordInOut[4] ^= State[3]; + ptrWordInOut[5] ^= State[4]; + ptrWordInOut[6] ^= State[5]; + ptrWordInOut[7] ^= State[6]; + ptrWordInOut[8] ^= State[7]; + ptrWordInOut[9] ^= State[8]; + ptrWordInOut[10] ^= State[9]; + ptrWordInOut[11] ^= State[10]; + + // Goes to next block + ptrWordOut += BLOCK_LEN_INT64; + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + + out += BLOCK_LEN_M128I; + inout += BLOCK_LEN_M128I; + in += BLOCK_LEN_M128I; + } #else - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row - - for ( i = 0; i < nCols; i++) - { - - //Absorbing "M[prev] [+] M[row*]" - State[0] ^= 
(ptrWordIn[0] + ptrWordInOut[0]); - State[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); - State[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); - State[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); - State[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); - State[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); - State[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); - State[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); - State[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); - State[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); - State[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); - State[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra( State); - - ptrWordOut[0] ^= State[0]; - ptrWordOut[1] ^= State[1]; - ptrWordOut[2] ^= State[2]; - ptrWordOut[3] ^= State[3]; - ptrWordOut[4] ^= State[4]; - ptrWordOut[5] ^= State[5]; - ptrWordOut[6] ^= State[6]; - ptrWordOut[7] ^= State[7]; - ptrWordOut[8] ^= State[8]; - ptrWordOut[9] ^= State[9]; - ptrWordOut[10] ^= State[10]; - ptrWordOut[11] ^= State[11]; - - //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) - ptrWordInOut[0] ^= State[11]; - ptrWordInOut[1] ^= State[0]; - ptrWordInOut[2] ^= State[1]; - ptrWordInOut[3] ^= State[2]; - ptrWordInOut[4] ^= State[3]; - ptrWordInOut[5] ^= State[4]; - ptrWordInOut[6] ^= State[5]; - ptrWordInOut[7] ^= State[6]; - ptrWordInOut[8] ^= State[7]; - ptrWordInOut[9] ^= State[8]; - ptrWordInOut[10] ^= State[9]; - ptrWordInOut[11] ^= State[10]; - - //Goes to next block - ptrWordOut += BLOCK_LEN_INT64; - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - } + uint64_t *ptrWordInOut = rowInOut; // In Lyra2: pointer to row* + uint64_t *ptrWordIn = rowIn; // In Lyra2: pointer to prev + uint64_t *ptrWordOut = rowOut; // In Lyra2: pointer to row + + for (i = 0; i < nCols; i++) { + + // Absorbing "M[prev] [+] M[row*]" + State[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); + State[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); + State[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); + State[3] 
^= (ptrWordIn[3] + ptrWordInOut[3]); + State[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); + State[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); + State[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); + State[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); + State[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); + State[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); + State[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); + State[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); + + // Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(State); + + ptrWordOut[0] ^= State[0]; + ptrWordOut[1] ^= State[1]; + ptrWordOut[2] ^= State[2]; + ptrWordOut[3] ^= State[3]; + ptrWordOut[4] ^= State[4]; + ptrWordOut[5] ^= State[5]; + ptrWordOut[6] ^= State[6]; + ptrWordOut[7] ^= State[7]; + ptrWordOut[8] ^= State[8]; + ptrWordOut[9] ^= State[9]; + ptrWordOut[10] ^= State[10]; + ptrWordOut[11] ^= State[11]; + + // M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) + ptrWordInOut[0] ^= State[11]; + ptrWordInOut[1] ^= State[0]; + ptrWordInOut[2] ^= State[1]; + ptrWordInOut[3] ^= State[2]; + ptrWordInOut[4] ^= State[3]; + ptrWordInOut[5] ^= State[4]; + ptrWordInOut[6] ^= State[5]; + ptrWordInOut[7] ^= State[6]; + ptrWordInOut[8] ^= State[7]; + ptrWordInOut[9] ^= State[8]; + ptrWordInOut[10] ^= State[9]; + ptrWordInOut[11] ^= State[10]; + + // Goes to next block + ptrWordOut += BLOCK_LEN_INT64; + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + } #endif } - - diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h index 76be768..b24b173 100644 --- a/algo/lyra2/sponge.h +++ b/algo/lyra2/sponge.h @@ -52,8 +52,46 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ // However, 2 way parallel looks trivial to code for AVX512 except for // a data dependency with rowa. 
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +#define G2W_4X64(a,b,c,d) \ + a = _mm512_add_epi64( a, b ); \ + d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \ + c = _mm512_add_epi64( c, d ); \ + b = mm512_ror_64( _mm512_xor_si512( b, c ), 24 ); \ + a = _mm512_add_epi64( a, b ); \ + d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \ + c = _mm512_add_epi64( c, d ); \ + b = mm512_ror_64( _mm512_xor_si512( b, c ), 63 ); + +#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ + G2W_4X64( s0, s1, s2, s3 ); \ + s1 = mm512_ror256_64( s1); \ + s2 = mm512_swap256_128( s2 ); \ + s3 = mm512_rol256_64( s3 ); \ + G2W_4X64( s0, s1, s2, s3 ); \ + s1 = mm512_rol256_64( s1 ); \ + s2 = mm512_swap256_128( s2 ); \ + s3 = mm512_ror256_64( s3 ); + +#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \ + LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ + LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ + LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ + LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ + LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ + LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ + LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ + LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ + LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ + LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ + LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ + LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) + + +#endif // AVX512 + #if defined __AVX2__ -// only available with avx2 // process 4 columns in parallel // returns void, updates all args @@ -89,9 +127,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ - LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ + LYRA_ROUND_AVX2( s0, s1, s2, s3 ) + +#endif -#elif defined(__SSE2__) +#if defined(__SSE2__) // process 2 columns in parallel // returns void, all args updated @@ -108,14 +148,14 @@ static inline uint64_t rotr64( const uint64_t w, 
const unsigned c ){ #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ G_2X64( s0, s2, s4, s6 ); \ G_2X64( s1, s3, s5, s7 ); \ - mm128_ror1x64_256( s2, s3 ); \ - mm128_swap128_256( s4, s5 ); \ - mm128_rol1x64_256( s6, s7 ); \ + mm128_ror256_64( s2, s3 ); \ + mm128_swap256_128( s4, s5 ); \ + mm128_rol256_64( s6, s7 ); \ G_2X64( s0, s2, s4, s6 ); \ G_2X64( s1, s3, s5, s7 ); \ - mm128_rol1x64_256( s2, s3 ); \ - mm128_swap128_256( s4, s5 ); \ - mm128_ror1x64_256( s6, s7 ); + mm128_rol256_64( s2, s3 ); \ + mm128_swap256_128( s4, s5 ); \ + mm128_ror256_64( s6, s7 ); #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ @@ -129,8 +169,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ - LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ - + LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) #endif // AVX2 else SSE2 @@ -160,6 +199,56 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ G(r,7,v[ 3],v[ 4],v[ 9],v[14]); +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +union _ovly_512 +{ + __m512i v512; + struct + { + __m256i v256lo; + __m256i v256hi; + }; +}; +typedef union _ovly_512 ovly_512; + + +union _inout_ovly +{ + __m512i v512[3]; + __m256i v256[6]; +}; +typedef union _inout_ovly inout_ovly; + +//---- Housekeeping +void initState_2way( uint64_t State[/*16*/] ); + +//---- Squeezes +void squeeze_2way( uint64_t *State, unsigned char *out, unsigned int len ); +void reducedSqueezeRow0_2way( uint64_t* state, uint64_t* row, uint64_t nCols ); + +//---- Absorbs +void absorbBlock_2way( uint64_t *State, const uint64_t *In0, + const uint64_t *In1 ); +void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In, + const uint64_t nBlocks, const uint64_t block_len ); + +//---- Duplexes +void 
reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn, + uint64_t *rowOut, uint64_t nCols); +void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn, + uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols ); + +void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn, + uint64_t *rowInOut0, uint64_t *rowInOut1, + uint64_t *rowOut, uint64_t nCols); + +void reducedDuplexRow_2way_X( uint64_t *State, uint64_t *rowIn, + uint64_t *rowInOut0, uint64_t *rowInOut1, + uint64_t *rowOut, uint64_t nCols); + +#endif + //---- Housekeeping void initState(uint64_t state[/*16*/]); @@ -170,27 +259,12 @@ void reducedSqueezeRow0(uint64_t* state, uint64_t* row, uint64_t nCols); //---- Absorbs void absorbBlock(uint64_t *state, const uint64_t *in); -void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in); +void absorbBlockBlake2Safe( uint64_t *state, const uint64_t *in, + const uint64_t nBlocks, const uint64_t block_len ); //---- Duplexes void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols); void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols); void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols); -//---- Misc -void printArray(unsigned char *array, unsigned int size, char *name); - -//////////////////////////////////////////////////////////////////////////////////////////////// - - -////TESTS//// -//void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -//void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -//void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2); -//void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -//void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t 
*rowOut); -//void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -///////////// - - #endif /* SPONGE_H_ */ diff --git a/algo/m7m.c b/algo/m7m.c deleted file mode 100644 index a45ec34..0000000 --- a/algo/m7m.c +++ /dev/null @@ -1,343 +0,0 @@ -#include "cpuminer-config.h" -#include "algo-gate-api.h" - -#include -#include -#include -#include -#include -#include -#include "algo/keccak/sph_keccak.h" -#include "algo/haval/sph-haval.h" -#include "algo/tiger/sph_tiger.h" -#include "algo/whirlpool/sph_whirlpool.h" -#include "algo/ripemd/sph_ripemd.h" -#include - - -#define EPSa DBL_EPSILON -#define EPS1 DBL_EPSILON -#define EPS2 3.0e-11 - -inline double exp_n(double xt) -{ - if(xt < -700.0) - return 0; - else if(xt > 700.0) - return 1e200; - else if(xt > -0.8e-8 && xt < 0.8e-8) - return (1.0 + xt); - else - return exp(xt); -} - -inline double exp_n2(double x1, double x2) -{ - double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8, p5 = 37., p6 = 700.; - double xt = x1 - x2; - if (xt < p1+1.e-200) - return 1.; - else if (xt > p1 && xt < p2 + 1.e-200) - return ( 1. - exp(xt) ); - else if (xt > p2 && xt < p3 + 1.e-200) - return ( 1. / (1. + exp(xt)) ); - else if (xt > p3 && xt < p4) - return ( 1. / (2. + xt) ); - else if (xt > p4 - 1.e-200 && xt < p5) - return ( exp(-xt) / (1. + exp(-xt)) ); - else if (xt > p5 - 1.e-200 && xt < p6) - return ( exp(-xt) ); - else if (xt > p6 - 1.e-200) - return 0.; -} - -double swit2_(double wvnmb) -{ - return pow( (5.55243*(exp_n(-0.3*wvnmb/15.762) - exp_n(-0.6*wvnmb/15.762)))*wvnmb, 0.5) - / 1034.66 * pow(sin(wvnmb/65.), 2.); -} - - -double GaussianQuad_N2(const double x1, const double x2) -{ - double s=0.0; - double x[6], w[6]; - //gauleg(a2, b2, x, w); - - double z1, z, xm, xl, pp, p3, p2, p1; - xm=0.5*(x2+x1); - xl=0.5*(x2-x1); - for(int i=1;i<=3;i++) - { - z = (i == 1) ? 0.909632 : -0.0; - z = (i == 2) ? 
0.540641 : z; - do - { - p1 = z; - p2 = 1; - p3 = 0; - - p3=1; - p2=z; - p1=((3.0 * z * z) - 1) / 2; - - p3=p2; - p2=p1; - p1=((5.0 * z * p2) - (2.0 * z)) / 3; - - p3=p2; - p2=p1; - p1=((7.0 * z * p2) - (3.0 * p3)) / 4; - - p3=p2; - p2=p1; - p1=((9.0 * z * p2) - (4.0 * p3)) / 5; - - pp=5*(z*p1-p2)/(z*z-1.0); - z1=z; - z=z1-p1/pp; - } while (fabs(z-z1) > 3.0e-11); - - x[i]=xm-xl*z; - x[5+1-i]=xm+xl*z; - w[i]=2.0*xl/((1.0-z*z)*pp*pp); - w[5+1-i]=w[i]; - } - - for(int j=1; j<=5; j++) s += w[j]*swit2_(x[j]); - - return s; -} - -uint32_t sw2_(int nnounce) -{ - double wmax = ((sqrt((double)(nnounce))*(1.+EPSa))/450+100); - return ((uint32_t)(GaussianQuad_N2(0., wmax)*(1.+EPSa)*1.e6)); -} - -typedef struct { - SHA256_CTX sha256; - SHA512_CTX sha512; - sph_keccak512_context keccak; - sph_whirlpool_context whirlpool; - sph_haval256_5_context haval; - sph_tiger_context tiger; - sph_ripemd160_context ripemd; -} m7m_ctx_holder; - -m7m_ctx_holder m7m_ctx; - -void init_m7m_ctx() -{ - SHA256_Init( &m7m_ctx.sha256 ); - SHA512_Init( &m7m_ctx.sha512 ); - sph_keccak512_init( &m7m_ctx.keccak ); - sph_whirlpool_init( &m7m_ctx.whirlpool ); - sph_haval256_5_init( &m7m_ctx.haval ); - sph_tiger_init( &m7m_ctx.tiger ); - sph_ripemd160_init( &m7m_ctx.ripemd ); -} - -#define BITS_PER_DIGIT 3.32192809488736234787 -#define EPS (DBL_EPSILON) - -#define NM7M 5 -#define SW_DIVS 5 -#define M7_MIDSTATE_LEN 76 -int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, - unsigned long *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t data[32] __attribute__((aligned(64))); - uint32_t *data_p64 = data + (M7_MIDSTATE_LEN / sizeof(data[0])); - uint32_t hash[8] __attribute__((aligned(64))); - uint8_t bhash[7][64] __attribute__((aligned(64))); - uint32_t n = pdata[19] - 1; - int thr_id = mythr->id; // thr_id arg is deprecated - uint32_t usw_, mpzscale; - const uint32_t first_nonce = pdata[19]; - char data_str[161], hash_str[65], 
target_str[65]; - //uint8_t *bdata = 0; - uint8_t bdata[8192] __attribute__ ((aligned (64))); - int rc = 0, i, digits; - int bytes; - size_t p = sizeof(unsigned long), a = 64/p, b = 32/p; - - m7m_ctx_holder ctx1, ctx2 __attribute__ ((aligned (64))); - memcpy( &ctx1, &m7m_ctx, sizeof(m7m_ctx) ); - SHA256_CTX ctxf_sha256; - - memcpy(data, pdata, 80); - - SHA256_Update( &ctx1.sha256, data, M7_MIDSTATE_LEN ); - SHA512_Update( &ctx1.sha512, data, M7_MIDSTATE_LEN ); - sph_keccak512( &ctx1.keccak, data, M7_MIDSTATE_LEN ); - sph_whirlpool( &ctx1.whirlpool, data, M7_MIDSTATE_LEN ); - sph_haval256_5( &ctx1.haval, data, M7_MIDSTATE_LEN ); - sph_tiger( &ctx1.tiger, data, M7_MIDSTATE_LEN ); - sph_ripemd160( &ctx1.ripemd, data, M7_MIDSTATE_LEN ); - - mpz_t magipi, magisw, product, bns0, bns1; - mpf_t magifpi, magifpi0, mpt1, mpt2, mptmp, mpten; - - mpz_inits(magipi, magisw, bns0, bns1, NULL); - mpz_init2(product, 512); - - mp_bitcnt_t prec0 = (long int)((int)((sqrt((double)(INT_MAX))*(1.+EPS))/9000+75)*BITS_PER_DIGIT+16); - mpf_set_default_prec(prec0); - - mpf_init(magifpi); - mpf_init(magifpi0); - mpf_init(mpt1); - mpf_init(mpt2); - mpf_init(mptmp); - mpf_init_set_ui(mpten, 10); - mpf_set_str(mpt2, "0.8e3b1a9b359805c2e54c6415037f2e336893b6457f7754f6b4ae045eb6c5f2bedb26a114030846be7", 16); - mpf_set_str(magifpi0, "0.b7bfc6837e20bdb22653f1fc419f6bc33ca80eb65b7b0246f7f3b65689560aea1a2f2fd95f254d68c", 16); - - do { - data[19] = ++n; - memset(bhash, 0, 7 * 64); - - memcpy( &ctx2, &ctx1, sizeof(m7m_ctx) ); - - SHA256_Update( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN ); - SHA256_Final( (unsigned char*) (bhash[0]), &ctx2.sha256 ); - - SHA512_Update( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN ); - SHA512_Final( (unsigned char*) (bhash[1]), &ctx2.sha512 ); - - sph_keccak512( &ctx2.keccak, data_p64, 80 - M7_MIDSTATE_LEN ); - sph_keccak512_close( &ctx2.keccak, (void*)(bhash[2]) ); - - sph_whirlpool( &ctx2.whirlpool, data_p64, 80 - M7_MIDSTATE_LEN ); - sph_whirlpool_close( 
&ctx2.whirlpool, (void*)(bhash[3]) ); - - sph_haval256_5( &ctx2.haval, data_p64, 80 - M7_MIDSTATE_LEN ); - sph_haval256_5_close( &ctx2.haval, (void*)(bhash[4])) ; - - sph_tiger( &ctx2.tiger, data_p64, 80 - M7_MIDSTATE_LEN ); - sph_tiger_close( &ctx2.tiger, (void*)(bhash[5]) ); - - sph_ripemd160( &ctx2.ripemd, data_p64, 80 - M7_MIDSTATE_LEN ); - sph_ripemd160_close( &ctx2.ripemd, (void*)(bhash[6]) ); - - mpz_import(bns0, a, -1, p, -1, 0, bhash[0]); - mpz_set(bns1, bns0); - mpz_set(product, bns0); - for ( i=1; i < 7; i++ ) - { - mpz_import(bns0, a, -1, p, -1, 0, bhash[i]); - mpz_add(bns1, bns1, bns0); - mpz_mul(product, product, bns0); - } - mpz_mul(product, product, bns1); - - mpz_mul(product, product, product); - bytes = mpz_sizeinbase(product, 256); - mpz_export((void *)bdata, NULL, -1, 1, 0, 0, product); - - SHA256_Init( &ctxf_sha256 ); - SHA256_Update( &ctxf_sha256, bdata, bytes ); - SHA256_Final( (unsigned char*) hash, &ctxf_sha256 ); - - digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75); - mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16); - mpf_set_prec_raw(magifpi, prec); - mpf_set_prec_raw(mptmp, prec); - mpf_set_prec_raw(mpt1, prec); - mpf_set_prec_raw(mpt2, prec); - - usw_ = sw2_(n/2); - mpzscale = 1; - mpz_set_ui(magisw, usw_); - - for ( i = 0; i < 5; i++ ) - { - mpf_set_d(mpt1, 0.25*mpzscale); - mpf_sub(mpt1, mpt1, mpt2); - mpf_abs(mpt1, mpt1); - mpf_div(magifpi, magifpi0, mpt1); - mpf_pow_ui(mptmp, mpten, digits >> 1); - mpf_mul(magifpi, magifpi, mptmp); - mpz_set_f(magipi, magifpi); - mpz_add(magipi,magipi,magisw); - mpz_add(product,product,magipi); - mpz_import(bns0, b, -1, p, -1, 0, (void*)(hash)); - mpz_add(bns1, bns1, bns0); - mpz_mul(product,product,bns1); - mpz_cdiv_q (product, product, bns0); - - bytes = mpz_sizeinbase(product, 256); - mpzscale=bytes; - mpz_export(bdata, NULL, -1, 1, 0, 0, product); - - SHA256_Init( &ctxf_sha256 ); - SHA256_Update( &ctxf_sha256, bdata, bytes ); - SHA256_Final( (unsigned char*) hash, &ctxf_sha256 ); - } 
- - const unsigned char *hash_ = (const unsigned char *)hash; - const unsigned char *target_ = (const unsigned char *)ptarget; - for ( i = 31; i >= 0; i-- ) - { - if ( hash_[i] != target_[i] ) - { - rc = hash_[i] < target_[i]; - break; - } - } - if ( unlikely(rc) ) - { - if ( opt_debug ) - { - bin2hex(hash_str, (unsigned char *)hash, 32); - bin2hex(target_str, (unsigned char *)ptarget, 32); - bin2hex(data_str, (unsigned char *)data, 80); - applog(LOG_DEBUG, "DEBUG: [%d thread] Found share!\ndata %s\nhash %s\ntarget %s", thr_id, - data_str, - hash_str, - target_str); - } - pdata[19] = data[19]; - submit_solution( work, hash, mythr ); - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - pdata[19] = n; - -// can this be skipped after finding a share? Seems to work ok. -//out: - mpf_set_prec_raw(magifpi, prec0); - mpf_set_prec_raw(magifpi0, prec0); - mpf_set_prec_raw(mptmp, prec0); - mpf_set_prec_raw(mpt1, prec0); - mpf_set_prec_raw(mpt2, prec0); - mpf_clear(magifpi); - mpf_clear(magifpi0); - mpf_clear(mpten); - mpf_clear(mptmp); - mpf_clear(mpt1); - mpf_clear(mpt2); - mpz_clears(magipi, magisw, product, bns0, bns1, NULL); - - *hashes_done = n - first_nonce + 1; - return 0; -} - -bool register_m7m_algo( algo_gate_t *gate ) -{ - gate->optimizations = SHA_OPT; - init_m7m_ctx(); - gate->scanhash = (void*)scanhash_m7m_hash; - gate->build_stratum_request = (void*)&std_be_build_stratum_request; - gate->work_decode = (void*)&std_be_work_decode; - gate->submit_getwork_result = (void*)&std_be_submit_getwork_result; - gate->set_target = (void*)&scrypt_set_target; - gate->get_max64 = (void*)&get_max64_0x1ffff; - gate->set_work_data_endian = (void*)&set_work_data_big_endian; - return true; -} - - diff --git a/algo/nist5/nist5-4way.c b/algo/nist5/nist5-4way.c deleted file mode 100644 index 1d09c72..0000000 --- a/algo/nist5/nist5-4way.c +++ /dev/null @@ -1,124 +0,0 @@ -#include "nist5-gate.h" -#include -#include -#include -#include - -#if defined(NIST5_4WAY) - 
-#include "algo/blake/blake-hash-4way.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" - -void nist5hash_4way( void *out, const void *input ) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - blake512_4way_context ctx_blake; - hashState_groestl ctx_groestl; - jh512_4way_context ctx_jh; - skein512_4way_context ctx_skein; - keccak512_4way_context ctx_keccak; - - blake512_4way_init( &ctx_blake ); - blake512_4way( &ctx_blake, input, 80 ); - blake512_4way_close( &ctx_blake, vhash ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - init_groestl( &ctx_groestl, 64 ); - update_and_final_groestl( &ctx_groestl, (char*)hash0, - (const char*)hash0, 512 ); - init_groestl( &ctx_groestl, 64 ); - update_and_final_groestl( &ctx_groestl, (char*)hash1, - (const char*)hash1, 512 ); - init_groestl( &ctx_groestl, 64 ); - update_and_final_groestl( &ctx_groestl, (char*)hash2, - (const char*)hash2, 512 ); - init_groestl( &ctx_groestl, 64 ); - update_and_final_groestl( &ctx_groestl, (char*)hash3, - (const char*)hash3, 512 ); - - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - jh512_4way_init( &ctx_jh ); - jh512_4way( &ctx_jh, vhash, 64 ); - jh512_4way_close( &ctx_jh, vhash ); - - keccak512_4way_init( &ctx_keccak ); - keccak512_4way( &ctx_keccak, vhash, 64 ); - keccak512_4way_close( &ctx_keccak, vhash ); - - skein512_4way_init( &ctx_skein ); - skein512_4way( &ctx_skein, vhash, 64 ); - skein512_4way_close( &ctx_skein, out ); -} - -int scanhash_nist5_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*16] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[25]); - 
uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - - uint64_t htmax[] = { 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 }; - - uint32_t masks[] = { 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 }; - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - - for ( int m=0; m < 6; m++ ) - { - if (Htarg <= htmax[m]) - { - uint32_t mask = masks[m]; - - do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - nist5hash_4way( hash, vdata ); - - for ( int lane = 0; lane < 4; lane++ ) - if ( ( hash7[ lane ] & mask ) == 0 ) - { - extr_lane_4x64( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_lane_solution( work, lane_hash, mythr, lane ); - } - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } - } - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/nist5/nist5-gate.c b/algo/nist5/nist5-gate.c deleted file mode 100644 index 7cc69f6..0000000 --- a/algo/nist5/nist5-gate.c +++ /dev/null @@ -1,16 +0,0 @@ -#include "nist5-gate.h" - -bool register_nist5_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; -#if defined (NIST5_4WAY) - gate->scanhash = (void*)&scanhash_nist5_4way; - gate->hash = (void*)&nist5hash_4way; -#else - init_nist5_ctx(); - gate->scanhash = (void*)&scanhash_nist5; - gate->hash = (void*)&nist5hash; -#endif - return true; -}; - diff --git a/algo/nist5/nist5-gate.h b/algo/nist5/nist5-gate.h deleted file mode 100644 index 80828b7..0000000 --- a/algo/nist5/nist5-gate.h 
+++ /dev/null @@ -1,27 +0,0 @@ -#ifndef __NIST5_GATE_H__ -#define __NIST5_GATE_H__ - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define NIST5_4WAY -#endif - -#if defined(NIST5_4WAY) - -void nist5hash_4way( void *state, const void *input ); - -int scanhash_nist5_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -#else - -void nist5hash( void *state, const void *input ); - -int scanhash_nist5( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_nist5_ctx(); -#endif - -#endif diff --git a/algo/nist5/nist5.c b/algo/nist5/nist5.c deleted file mode 100644 index 431fb71..0000000 --- a/algo/nist5/nist5.c +++ /dev/null @@ -1,161 +0,0 @@ -#include "nist5-gate.h" - -#include -#include -#include -#include - -#include "algo/blake/sph_blake.h" -#include "algo/groestl/sph_groestl.h" -#include "algo/skein/sph_skein.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" - -#include "algo/blake/sse2/blake.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" - -#ifndef NO_AES_NI - #include "algo/groestl/aes_ni/hash-groestl.h" -#endif - -typedef struct { -#ifdef NO_AES_NI - sph_groestl512_context groestl; -#else - hashState_groestl groestl; -#endif -} nist5_ctx_holder; - -nist5_ctx_holder nist5_ctx; - -void init_nist5_ctx() -{ -#ifdef NO_AES_NI - sph_groestl512_init( &nist5_ctx.groestl ); -#else - init_groestl( &nist5_ctx.groestl, 64 ); -#endif -} - -void nist5hash(void *output, const void *input) -{ - size_t hashptr; - unsigned char hashbuf[128]; - sph_u64 hashctA; - sph_u64 hashctB; - unsigned char hash[128] __attribute__ ((aligned (64))) ; - #define hashA hash - #define hashB hash+64 - - nist5_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &nist5_ctx, sizeof(nist5_ctx) ); - - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; - - #ifdef NO_AES_NI - sph_groestl512 
(&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); - #else - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); - #endif - - DECL_JH; - JH_H; - - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; - - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; - - memcpy(output, hash, 32); -} - -int scanhash_nist5( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr) -{ - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t hash64[8] __attribute__((aligned(32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - // we need bigendian data... - swab32_array( endiandata, pdata, 20 ); - -#ifdef DEBUG_ALGO - printf("[%d] Htarg=%X\n", thr_id, Htarg); -#endif - for (int m=0; m < 6; m++) { - if (Htarg <= htmax[m]) { - uint32_t mask = masks[m]; - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - nist5hash(hash64, endiandata); -#ifndef DEBUG_ALGO - if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } -#else - if (!(n % 0x1000) && !thr_id) printf("."); - if (!(hash64[7] & mask)) { - printf("[%d]",thr_id); - if (fulltest(hash64, ptarget)) { - work_set_target_ratio( work, hash64 ); - *hashes_done = n - first_nonce + 1; - return true; - } - } -#endif - } while (n < max_nonce && !work_restart[thr_id].restart); - // see blake.c if else to understand the loop on htmax => mask - break; - } - } - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} -/* -bool register_nist5_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT; - init_nist5_ctx(); - 
gate->scanhash = (void*)&scanhash_nist5; - gate->hash = (void*)&nist5hash; - return true; -}; -*/ diff --git a/algo/nist5/zr5.c b/algo/nist5/zr5.c deleted file mode 100644 index 9ec6e19..0000000 --- a/algo/nist5/zr5.c +++ /dev/null @@ -1,242 +0,0 @@ -/* - * Copyright 2014 mkimid - * - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - */ - -#include "cpuminer-config.h" -#include "algo-gate-api.h" -#include -#include - -#include "algo/groestl/sph_groestl.h" -#include "algo/keccak/sph_keccak.h" - -#ifndef NO_AES_NI - #include "algo/groestl/aes_ni/hash-groestl.h" -#endif - -#include "algo/jh/sse2/jh_sse2_opt64.h" -#include "algo/skein/sse2/skein.c" -#include "algo/blake/sse2/blake.c" - -/*define data alignment for different C compilers*/ -#if defined(__GNUC__) - #define DATA_ALIGN16(x) x __attribute__ ((aligned(16))) -#else - #define DATA_ALIGN16(x) __declspec(align(16)) x -#endif - -#define ZR_BLAKE 0 -#define ZR_GROESTL 1 -#define ZR_JH 2 -#define ZR_SKEIN 3 -#define POK_BOOL_MASK 0x00008000 -#define POK_DATA_MASK 0xFFFF0000 - -typedef struct { - #ifdef NO_AES_NI - sph_groestl512_context groestl; - #else - hashState_groestl groestl; - #endif - sph_keccak512_context keccak; -} zr5_ctx_holder; - -zr5_ctx_holder zr5_ctx; - -void init_zr5_ctx() -{ - #ifdef NO_AES_NI - sph_groestl512_init( &zr5_ctx.groestl ); - #else - init_groestl( &zr5_ctx.groestl, 64 ); - #endif - sph_keccak512_init(&zr5_ctx.keccak); -} - -static void zr5hash(void *state, const void *input) -{ - -DATA_ALIGN16(unsigned char hashbuf[128]); -DATA_ALIGN16(unsigned char hash[128]); -DATA_ALIGN16(size_t hashptr); -DATA_ALIGN16(sph_u64 hashctA); -DATA_ALIGN16(sph_u64 hashctB); - -//memset(hash, 0, 128); - -static const int arrOrder[][4] = -{ - { 0, 1, 2, 3 }, { 0, 1, 3, 2 }, { 0, 2, 1, 3 }, { 0, 2, 3, 1 }, - { 0, 3, 1, 2 }, { 0, 3, 2, 1 }, { 1, 0, 2, 3 }, { 1, 0, 3, 2 }, - { 1, 2, 0, 3 }, { 1, 2, 3, 0 }, { 1, 3, 0, 2 }, { 1, 3, 2, 0 }, - { 2, 0, 1, 3 }, { 2, 0, 3, 1 }, { 2, 1, 0, 3 }, { 2, 1, 3, 0 }, - { 2, 3, 0, 1 }, { 2, 3, 1, 0 }, { 3, 0, 1, 2 }, { 3, 0, 2, 1 }, - { 3, 1, 0, 2 }, { 3, 1, 2, 0 }, { 3, 2, 0, 1 }, { 3, 2, 1, 0 } -}; - - zr5_ctx_holder ctx; - memcpy( &ctx, &zr5_ctx, sizeof(zr5_ctx) ); - - sph_keccak512 (&ctx.keccak, input, 80); - sph_keccak512_close(&ctx.keccak, hash); - - unsigned int nOrder = *(unsigned int 
*)(&hash) % 24; - unsigned int i = 0; - - for (i = 0; i < 4; i++) - { - switch (arrOrder[nOrder][i]) - { - case 0: - {DECL_BLK; - BLK_I; - BLK_U; - BLK_C;} - break; - case 1: - #ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); - #else - update_groestl( &ctx.groestl, (char*)hash,512); - final_groestl( &ctx.groestl, (char*)hash); - #endif - break; - case 2: - {DECL_JH; - JH_H;} - break; - case 3: - {DECL_SKN; - SKN_I; - SKN_U; - SKN_C; } - break; - default: - break; - } - } - asm volatile ("emms"); - memcpy(state, hash, 32); -} - -int scanhash_zr5( struct work *work, uint32_t max_nonce, - unsigned long *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t hash[16] __attribute__((aligned(64))); - uint32_t tmpdata[20] __attribute__((aligned(64))); - const uint32_t version = pdata[0] & (~POK_DATA_MASK); - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - - memcpy(tmpdata, pdata, 80); - - do - { - #define Htarg ptarget[7] - tmpdata[0] = version; - tmpdata[19] = nonce; - zr5hash(hash, tmpdata); - tmpdata[0] = version | (hash[0] & POK_DATA_MASK); - zr5hash(hash, tmpdata); - if (hash[7] <= Htarg ) - { - if( fulltest(hash, ptarget) ) - { - pdata[0] = tmpdata[0]; - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - work_set_target_ratio( work, hash ); - if (opt_debug) - applog(LOG_INFO, "found nonce %x", nonce); - return 1; - } - } - nonce++; - } while (nonce < max_nonce && !work_restart[thr_id].restart); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} - -void zr5_get_new_work( struct work* work, struct work* g_work, int thr_id, - uint32_t* end_nonce_ptr, bool clean_job ) -{ - // ignore POK in first word -// const int nonce_i = 19; - const int wkcmp_sz = 72; // (19-1) * sizeof(uint32_t) - uint32_t *nonceptr = 
algo_gate.get_nonceptr( work->data ); - if ( memcmp( &work->data[1], &g_work->data[1], wkcmp_sz ) - && ( clean_job || ( *nonceptr >= *end_nonce_ptr ) ) ) - { - work_free( work ); - work_copy( work, g_work ); - *nonceptr = ( 0xffffffffU / opt_n_threads ) * thr_id; - if ( opt_randomize ) - *nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads; - *end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20; - } - else - ++(*nonceptr); -} - -int64_t zr5_get_max64 () -{ -// return 0x1ffffLL; - return 0x1fffffLL; -} - -void zr5_display_pok( struct work* work ) -{ - if ( work->data[0] & 0x00008000 ) - applog(LOG_BLUE, "POK received: %08xx", work->data[0] ); -} - -int zr5_get_work_data_size() { return 80; } - -bool register_zr5_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT; - init_zr5_ctx(); - gate->get_new_work = (void*)&zr5_get_new_work; - gate->scanhash = (void*)&scanhash_zr5; - gate->hash = (void*)&zr5hash; - gate->get_max64 = (void*)&zr5_get_max64; - gate->decode_extra_data = (void*)&zr5_display_pok; - gate->build_stratum_request = (void*)&std_be_build_stratum_request; - gate->work_decode = (void*)&std_be_work_decode; - gate->submit_getwork_result = (void*)&std_be_submit_getwork_result; - gate->set_work_data_endian = (void*)&set_work_data_big_endian; - gate->get_work_data_size = (void*)&zr5_get_work_data_size; - gate->work_cmp_size = 72; - return true; -}; - diff --git a/algo/panama/sph_panama.c b/algo/panama/sph_panama.c deleted file mode 100644 index f3c27c7..0000000 --- a/algo/panama/sph_panama.c +++ /dev/null @@ -1,334 +0,0 @@ -/* $Id: panama.c 216 2010-06-08 09:46:57Z tp $ */ -/* - * PANAMA implementation. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include -#include - -#include "sph_panama.h" - -#define LVAR17(b) sph_u32 \ - b ## 0, b ## 1, b ## 2, b ## 3, b ## 4, b ## 5, \ - b ## 6, b ## 7, b ## 8, b ## 9, b ## 10, b ## 11, \ - b ## 12, b ## 13, b ## 14, b ## 15, b ## 16; - -#define LVARS \ - LVAR17(a) \ - LVAR17(g) \ - LVAR17(p) \ - LVAR17(t) - -#define M17(macro) do { \ - macro( 0, 1, 2, 4); \ - macro( 1, 2, 3, 5); \ - macro( 2, 3, 4, 6); \ - macro( 3, 4, 5, 7); \ - macro( 4, 5, 6, 8); \ - macro( 5, 6, 7, 9); \ - macro( 6, 7, 8, 10); \ - macro( 7, 8, 9, 11); \ - macro( 8, 9, 10, 12); \ - macro( 9, 10, 11, 13); \ - macro(10, 11, 12, 14); \ - macro(11, 12, 13, 15); \ - macro(12, 13, 14, 16); \ - macro(13, 14, 15, 0); \ - macro(14, 15, 16, 1); \ - macro(15, 16, 0, 2); \ - macro(16, 0, 1, 3); \ - } while (0) - -#define BUPDATE1(n0, n2) do { \ - sc->buffer[ptr24][n0] ^= sc->buffer[ptr31][n2]; \ - sc->buffer[ptr31][n2] ^= INW1(n2); \ - } while (0) - -#define BUPDATE do { \ - BUPDATE1(0, 2); \ - BUPDATE1(1, 3); \ - BUPDATE1(2, 4); \ - BUPDATE1(3, 5); \ - BUPDATE1(4, 6); \ - BUPDATE1(5, 7); \ - BUPDATE1(6, 0); \ - BUPDATE1(7, 1); \ - } while (0) - -#define RSTATE(n0, n1, n2, n4) (a ## n0 = sc->state[n0]) - -#define WSTATE(n0, n1, n2, n4) (sc->state[n0] = a ## n0) - -#define GAMMA(n0, n1, n2, n4) \ - (g ## n0 = a ## n0 ^ (a ## n1 | SPH_T32(~a ## n2))) - -#define PI_ALL do { \ - p0 = g0; \ - p1 = SPH_ROTL32( g7, 1); \ - p2 = SPH_ROTL32(g14, 3); \ - p3 = SPH_ROTL32( g4, 6); \ - p4 = SPH_ROTL32(g11, 10); \ - p5 = SPH_ROTL32( g1, 15); \ - p6 = SPH_ROTL32( g8, 21); \ - p7 = SPH_ROTL32(g15, 28); \ - p8 = SPH_ROTL32( g5, 4); \ - p9 = SPH_ROTL32(g12, 13); \ - p10 = SPH_ROTL32( g2, 23); \ - p11 = SPH_ROTL32( g9, 2); \ - p12 = SPH_ROTL32(g16, 14); \ - p13 = SPH_ROTL32( g6, 27); \ - p14 = SPH_ROTL32(g13, 9); \ - p15 = SPH_ROTL32( g3, 24); \ - p16 = SPH_ROTL32(g10, 8); \ - } while (0) - -#define THETA(n0, n1, 
n2, n4) \ - (t ## n0 = p ## n0 ^ p ## n1 ^ p ## n4) - -#define SIGMA_ALL do { \ - a0 = t0 ^ 1; \ - a1 = t1 ^ INW2(0); \ - a2 = t2 ^ INW2(1); \ - a3 = t3 ^ INW2(2); \ - a4 = t4 ^ INW2(3); \ - a5 = t5 ^ INW2(4); \ - a6 = t6 ^ INW2(5); \ - a7 = t7 ^ INW2(6); \ - a8 = t8 ^ INW2(7); \ - a9 = t9 ^ sc->buffer[ptr16][0]; \ - a10 = t10 ^ sc->buffer[ptr16][1]; \ - a11 = t11 ^ sc->buffer[ptr16][2]; \ - a12 = t12 ^ sc->buffer[ptr16][3]; \ - a13 = t13 ^ sc->buffer[ptr16][4]; \ - a14 = t14 ^ sc->buffer[ptr16][5]; \ - a15 = t15 ^ sc->buffer[ptr16][6]; \ - a16 = t16 ^ sc->buffer[ptr16][7]; \ - } while (0) - -#define PANAMA_STEP do { \ - unsigned ptr16, ptr24, ptr31; \ - \ - ptr24 = (ptr0 - 8) & 31; \ - ptr31 = (ptr0 - 1) & 31; \ - BUPDATE; \ - M17(GAMMA); \ - PI_ALL; \ - M17(THETA); \ - ptr16 = ptr0 ^ 16; \ - SIGMA_ALL; \ - ptr0 = ptr31; \ - } while (0) - -/* - * These macros are used to compute - */ -#define INC0 1 -#define INC1 2 -#define INC2 3 -#define INC3 4 -#define INC4 5 -#define INC5 6 -#define INC6 7 -#define INC7 8 - -/* - * Push data by blocks of 32 bytes. "pbuf" must be 32-bit aligned. Each - * iteration processes 32 data bytes; "num" contains the number of - * iterations. - */ -static void -panama_push(sph_panama_context *sc, const unsigned char *pbuf, size_t num) -{ - LVARS - unsigned ptr0; -#if SPH_LITTLE_FAST -#define INW1(i) sph_dec32le_aligned(pbuf + 4 * (i)) -#else - sph_u32 X_var[8]; -#define INW1(i) X_var[i] -#endif -#define INW2(i) INW1(i) - - M17(RSTATE); - ptr0 = sc->buffer_ptr; - while (num -- > 0) { -#if !SPH_LITTLE_FAST - int i; - - for (i = 0; i < 8; i ++) - X_var[i] = sph_dec32le_aligned(pbuf + 4 * (i)); -#endif - PANAMA_STEP; - pbuf = (const unsigned char *)pbuf + 32; - } - M17(WSTATE); - sc->buffer_ptr = ptr0; - -#undef INW1 -#undef INW2 -} - -/* - * Perform the "pull" operation repeatedly ("num" times). The hash output - * will be extracted from the state afterwards. 
- */ -static void -panama_pull(sph_panama_context *sc, unsigned num) -{ - LVARS - unsigned ptr0; -#define INW1(i) INW_H1(INC ## i) -#define INW_H1(i) INW_H2(i) -#define INW_H2(i) a ## i -#define INW2(i) sc->buffer[ptr4][i] - - M17(RSTATE); - ptr0 = sc->buffer_ptr; - while (num -- > 0) { - unsigned ptr4; - - ptr4 = (ptr0 + 4) & 31; - PANAMA_STEP; - } - M17(WSTATE); - -#undef INW1 -#undef INW_H1 -#undef INW_H2 -#undef INW2 -} - -/* see sph_panama.h */ -void -sph_panama_init(void *cc) -{ - sph_panama_context *sc; - - sc = cc; - /* - * This is not completely conformant, but "it will work - * everywhere". Initial state consists of zeroes everywhere. - * Conceptually, the sph_u32 type may have padding bits which - * must not be set to 0; but such an architecture remains to - * be seen. - */ - sc->data_ptr = 0; - memset(sc->buffer, 0, sizeof sc->buffer); - sc->buffer_ptr = 0; - memset(sc->state, 0, sizeof sc->state); -} - -#ifdef SPH_UPTR -static void -panama_short(void *cc, const void *data, size_t len) -#else -void -sph_panama(void *cc, const void *data, size_t len) -#endif -{ - sph_panama_context *sc; - unsigned current; - - sc = cc; - current = sc->data_ptr; - while (len > 0) { - unsigned clen; - - clen = (sizeof sc->data) - current; - if (clen > len) - clen = len; - memcpy(sc->data + current, data, clen); - data = (const unsigned char *)data + clen; - len -= clen; - current += clen; - if (current == sizeof sc->data) { - current = 0; - panama_push(sc, sc->data, 1); - } - } - sc->data_ptr = current; -} - -#ifdef SPH_UPTR -/* see sph_panama.h */ -void -sph_panama(void *cc, const void *data, size_t len) -{ - sph_panama_context *sc; - unsigned current; - size_t rlen; - - if (len < (2 * sizeof sc->data)) { - panama_short(cc, data, len); - return; - } - sc = cc; - current = sc->data_ptr; - if (current > 0) { - unsigned t; - - t = (sizeof sc->data) - current; - panama_short(sc, data, t); - data = (const unsigned char *)data + t; - len -= t; - } -#if !SPH_UNALIGNED - if 
(((SPH_UPTR)data & 3) != 0) { - panama_short(sc, data, len); - return; - } -#endif - panama_push(sc, data, len >> 5); - rlen = len & 31; - if (rlen > 0) - memcpy(sc->data, - (const unsigned char *)data + len - rlen, rlen); - sc->data_ptr = rlen; -} -#endif - -/* see sph_panama.h */ -void -sph_panama_close(void *cc, void *dst) -{ - sph_panama_context *sc; - unsigned current; - int i; - - sc = cc; - current = sc->data_ptr; - sc->data[current ++] = 0x01; - memset(sc->data + current, 0, (sizeof sc->data) - current); - panama_push(sc, sc->data, 1); - panama_pull(sc, 32); - for (i = 0; i < 8; i ++) - sph_enc32le((unsigned char *)dst + 4 * i, sc->state[i + 9]); - sph_panama_init(sc); -} diff --git a/algo/panama/sph_panama.h b/algo/panama/sph_panama.h deleted file mode 100644 index 6f9d3e8..0000000 --- a/algo/panama/sph_panama.h +++ /dev/null @@ -1,118 +0,0 @@ -/* $Id: sph_panama.h 154 2010-04-26 17:00:24Z tp $ */ -/** - * PANAMA interface. - * - * PANAMA has been published in: J. Daemen and C. Clapp, "Fast Hashing - * and Stream Encryption with PANAMA", Fast Software Encryption - - * FSE'98, LNCS 1372, Springer (1998), pp. 60--74. - * - * PANAMA is not fully defined with regards to endianness and related - * topics. This implementation follows strict little-endian conventions: - *
    - *
  • Each 32-byte input block is split into eight 32-bit words, the - * first (leftmost) word being numbered 0.
  • - *
  • Each such 32-bit word is decoded from memory in little-endian - * convention.
  • - *
  • The additional padding bit equal to "1" is added by considering - * the least significant bit in a byte to come first; practically, this - * means that a single byte of value 0x01 is appended to the (byte-oriented) - * message, and then 0 to 31 bytes of value 0x00.
  • - *
  • The output consists of eight 32-bit words; the word numbered 0 is - * written first (in leftmost position) and it is encoded in little-endian - * convention. - *
- * With these conventions, PANAMA is sometimes known as "PANAMA-LE". The - * PANAMA reference implementation uses our conventions for input, but - * prescribes no convention for output. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_panama.h - * @author Thomas Pornin - */ - -#ifndef SPH_PANAMA_H__ -#define SPH_PANAMA_H__ - -#include -#include "algo/sha/sph_types.h" - -/** - * Output size (in bits) for PANAMA. - */ -#define SPH_SIZE_panama 256 - -/** - * This structure is a context for PANAMA computations: it contains the - * intermediate values and some data from the last entered block. Once - * a PANAMA computation has been performed, the context can be reused for - * another computation. 
- * - * The contents of this structure are private. A running PANAMA computation - * can be cloned by copying the context (e.g. with a simple - * memcpy()). - */ -typedef struct { -#ifndef DOXYGEN_IGNORE - unsigned char data[32]; /* first field, for alignment */ - unsigned data_ptr; - - sph_u32 buffer[32][8]; - unsigned buffer_ptr; - - sph_u32 state[17]; -#endif -} sph_panama_context; - -/** - * Initialize a PANAMA context. This process performs no memory allocation. - * - * @param cc the PANAMA context (pointer to a sph_panama_context) - */ -void sph_panama_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). - * - * @param cc the PANAMA context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_panama(void *cc, const void *data, size_t len); - -/** - * Terminate the current PANAMA computation and output the result into the - * provided buffer. The destination buffer must be wide enough to - * accomodate the result (32 bytes). The context is automatically - * reinitialized. 
- * - * @param cc the PANAMA context - * @param dst the destination buffer - */ -void sph_panama_close(void *cc, void *dst); - -#endif diff --git a/algo/quark/anime-4way.c b/algo/quark/anime-4way.c deleted file mode 100644 index c5bdde3..0000000 --- a/algo/quark/anime-4way.c +++ /dev/null @@ -1,221 +0,0 @@ -#include "cpuminer-config.h" -#include "anime-gate.h" - -#if defined (ANIME_4WAY) - -#include -#include -#include - -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" - -typedef struct { - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_groestl groestl; - jh512_4way_context jh; - skein512_4way_context skein; - keccak512_4way_context keccak; -} anime_4way_ctx_holder; - -anime_4way_ctx_holder anime_4way_ctx __attribute__ ((aligned (64))); - -void init_anime_4way_ctx() -{ - blake512_4way_init( &anime_4way_ctx.blake ); - bmw512_4way_init( &anime_4way_ctx.bmw ); - init_groestl( &anime_4way_ctx.groestl, 64 ); - skein512_4way_init( &anime_4way_ctx.skein ); - jh512_4way_init( &anime_4way_ctx.jh ); - keccak512_4way_init( &anime_4way_ctx.keccak ); -} - -void anime_4way_hash( void *state, const void *input ) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - uint64_t vhashA[8*4] __attribute__ ((aligned (64))); - uint64_t vhashB[8*4] __attribute__ ((aligned (64))); - __m256i* vh = (__m256i*)vhash; - __m256i* vhA = (__m256i*)vhashA; - __m256i* vhB = (__m256i*)vhashB; - __m256i vh_mask; - const uint32_t mask = 8; - const __m256i bit3_mask = _mm256_set1_epi64x( 8 ); - const __m256i zero = _mm256_setzero_si256(); - anime_4way_ctx_holder ctx; - memcpy( &ctx, 
&anime_4way_ctx, sizeof(anime_4way_ctx) ); - - bmw512_4way( &ctx.bmw, input, 80 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - blake512_4way( &ctx.blake, vhash, 64 ); - blake512_4way_close( &ctx.blake, vhash ); - - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - if ( hash0[0] & mask ) - { - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (char*)hash0, 512 ); - } - if ( hash1[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (char*)hash1, 512 ); - } - if ( hash2[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (char*)hash2, 512 ); - } - if ( hash3[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (char*)hash3, 512 ); - } - - intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); - - if ( mm256_anybits0( vh_mask ) ) - { - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhashB ); - } - - mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); - - if ( mm256_anybits1( vh_mask ) ) - { - blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, vhash, 64 ); - 
blake512_4way_close( &ctx.blake, vhashA ); - } - if ( mm256_anybits0( vh_mask ) ) - { - bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhashB ); - } - - mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - - skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); - - if ( mm256_anybits1( vh_mask ) ) - { - keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhashA ); - } - if ( mm256_anybits0( vh_mask ) ) - { - jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhashB ); - } - - mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - - dintrlv_4x64( state, state+32, state+64, state+96, vhash, 256 ); -} - -int scanhash_anime_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - - for (int m=0; m < 6; m++) - if (Htarg <= htmax[m]) - { - uint32_t mask = masks[m]; - - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - anime_4way_hash( hash, vdata ); - pdata[19] = n; 
- - for ( int i = 0; i < 4; i++ ) - if ( ( ( (hash+(i<<3))[7] & mask ) == 0 ) - && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/quark/anime-gate.c b/algo/quark/anime-gate.c deleted file mode 100644 index 53a06e1..0000000 --- a/algo/quark/anime-gate.c +++ /dev/null @@ -1,17 +0,0 @@ -#include "anime-gate.h" - -bool register_anime_algo( algo_gate_t* gate ) -{ -#if defined (ANIME_4WAY) - init_anime_4way_ctx(); - gate->scanhash = (void*)&scanhash_anime_4way; - gate->hash = (void*)&anime_4way_hash; -#else - init_anime_ctx(); - gate->scanhash = (void*)&scanhash_anime; - gate->hash = (void*)&anime_hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - return true; -}; - diff --git a/algo/quark/anime-gate.h b/algo/quark/anime-gate.h deleted file mode 100644 index fdf34b4..0000000 --- a/algo/quark/anime-gate.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef ANIME_GATE_H__ -#define ANIME_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define ANIME_4WAY -#endif - -bool register_anime_algo( algo_gate_t* gate ); - -#if defined(ANIME_4WAY) - -void anime_4way_hash( void *state, const void *input ); -int scanhash_anime_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_anime_4way_ctx(); - -#endif - -void anime_hash( void *state, const void *input ); -int scanhash_anime( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_anime_ctx(); - -#endif - diff --git a/algo/quark/anime.c b/algo/quark/anime.c deleted file mode 100644 index 545f273..0000000 --- a/algo/quark/anime.c +++ /dev/null @@ -1,176 +0,0 @@ -#include "cpuminer-config.h" -#include "anime-gate.h" -#include 
-#include -#include -#include "algo/blake/sph_blake.h" -#include "algo/bmw/sph_bmw.h" -#include "algo/skein/sph_skein.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" -#ifdef __AES__ - #include "algo/groestl/aes_ni/hash-groestl.h" -#else - #include "algo/groestl/sph_groestl.h" -#endif - -typedef struct { - sph_blake512_context blake; - sph_bmw512_context bmw; -#ifdef __AES__ - hashState_groestl groestl; -#else - sph_groestl512_context groestl; -#endif - sph_jh512_context jh; - sph_skein512_context skein; - sph_keccak512_context keccak; -} anime_ctx_holder; - -anime_ctx_holder anime_ctx __attribute__ ((aligned (64))); - -void init_anime_ctx() -{ - sph_blake512_init( &anime_ctx.blake ); - sph_bmw512_init( &anime_ctx.bmw ); -#ifdef __AES__ - init_groestl( &anime_ctx.groestl, 64 ); -#else - sph_groestl512_init( &anime_ctx.groestl ); -#endif - sph_skein512_init( &anime_ctx.skein ); - sph_jh512_init( &anime_ctx.jh ); - sph_keccak512_init( &anime_ctx.keccak ); -} - -void anime_hash( void *state, const void *input ) -{ - unsigned char hash[128] __attribute__ ((aligned (32))); - uint32_t mask = 8; - anime_ctx_holder ctx; - memcpy( &ctx, &anime_ctx, sizeof(anime_ctx) ); - - sph_bmw512( &ctx.bmw, input, 80 ); - sph_bmw512_close( &ctx.bmw, hash ); - - sph_blake512( &ctx.blake, hash, 64 ); - sph_blake512_close( &ctx.blake, hash ); - - if ( ( hash[0] & mask ) != 0 ) - { -#ifdef __AES__ - update_and_final_groestl( &ctx.groestl, (char*)hash, (char*)hash, 512 ); - reinit_groestl( &ctx.groestl ); -#else - sph_groestl512 ( &ctx.groestl, hash, 64 ); - sph_groestl512_close( &ctx.groestl, hash ); - sph_groestl512_init( &ctx.groestl ); -#endif - } - else - { - sph_skein512( &ctx.skein, hash, 64 ); - sph_skein512_close( &ctx.skein, hash ); - sph_skein512_init( &ctx.skein ); - } - -#ifdef __AES__ - update_and_final_groestl( &ctx.groestl, (char*)hash, (char*)hash, 512 ); -#else - sph_groestl512 ( &ctx.groestl, hash, 64 ); - sph_groestl512_close( &ctx.groestl, hash ); 
-#endif - - sph_jh512( &ctx.jh, hash, 64 ); - sph_jh512_close( &ctx.jh, hash ); - - if ( ( hash[0] & mask ) != 0 ) - { - sph_blake512_init( &ctx.blake ); - sph_blake512( &ctx.blake, hash, 64 ); - sph_blake512_close( &ctx.blake, hash ); - } - else - { - sph_bmw512_init( &ctx.bmw ); - sph_bmw512( &ctx.bmw, hash, 64 ); - sph_bmw512_close( &ctx.bmw, hash ); - } - - sph_keccak512( &ctx.keccak, hash, 64 ); - sph_keccak512_close( &ctx.keccak, hash ); - - sph_skein512( &ctx.skein, hash, 64 ); - sph_skein512_close( &ctx.skein, hash ); - - if ( ( hash[0] & mask ) != 0 ) - { - sph_keccak512_init( &ctx.keccak ); - sph_keccak512( &ctx.keccak, hash, 64 ); - sph_keccak512_close( &ctx.keccak, hash ); - } - else - { - sph_jh512_init( &ctx.jh ); - sph_jh512( &ctx.jh, hash, 64 ); - sph_jh512_close( &ctx.jh, hash ); - } - - memcpy( state, hash, 32 ); -} - -int scanhash_anime( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr) -{ - uint32_t hash[8] __attribute__ ((aligned (64))); - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - swab32_array( endiandata, pdata, 20 ); - - for (int m=0; m < 6; m++) - if (Htarg <= htmax[m]) - { - uint32_t mask = masks[m]; - do - { - be32enc( &endiandata[19], n ); - anime_hash( hash, endiandata ); - pdata[19] = n; - - if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) ) - { - work_set_target_ratio( work, hash ); - *hashes_done = n - first_nonce + 1; - return true; - } - n++; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } - - pdata[19] = n; - return 0; -} - diff --git 
a/algo/quark/hmq1725-4way.c b/algo/quark/hmq1725-4way.c deleted file mode 100644 index 3645f19..0000000 --- a/algo/quark/hmq1725-4way.c +++ /dev/null @@ -1,618 +0,0 @@ -#include "hmq1725-gate.h" - -#if defined(HMQ1725_4WAY) - -#include -#include -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/nist.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/simd/simd-hash-2way.h" -#include "algo/echo/aes_ni/hash_api.h" -#include "algo/hamsi/hamsi-hash-4way.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/shabal/shabal-hash-4way.h" -#include "algo/whirlpool/sph_whirlpool.h" -#include "algo/haval/haval-hash-4way.h" -#include "algo/sha/sha2-hash-4way.h" - -union _hmq1725_4way_context_overlay -{ - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_groestl groestl; - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; - hashState_luffa luffa; - cubehashParam cube; - sph_shavite512_context shavite; - hashState_sd simd; - hashState_echo echo; - hamsi512_4way_context hamsi; - sph_fugue512_context fugue; - shabal512_4way_context shabal; - sph_whirlpool_context whirlpool; - sha512_4way_context sha512; - haval256_5_4way_context haval; -}; -typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay; - -extern void hmq1725_4way_hash(void *state, const void *input) -{ -// why so big? only really need 16. 
- uint32_t hash0 [32] __attribute__ ((aligned (64))); - uint32_t hash1 [32] __attribute__ ((aligned (64))); - uint32_t hash2 [32] __attribute__ ((aligned (64))); - uint32_t hash3 [32] __attribute__ ((aligned (64))); - uint32_t vhash [32<<2] __attribute__ ((aligned (64))); - uint32_t vhashA[32<<2] __attribute__ ((aligned (64))); - uint32_t vhashB[32<<2] __attribute__ ((aligned (64))); - hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64))); - __m256i vh_mask; - const __m256i vmask = _mm256_set1_epi64x( 24 ); - const uint32_t mask = 24; - __m256i* vh = (__m256i*)vhash; - __m256i* vhA = (__m256i*)vhashA; - __m256i* vhB = (__m256i*)vhashB; - - bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, input, 80 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - -// first fork, A is groestl serial, B is skein parallel. 
- - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), - m256_zero ); - -// A - -// if ( hash0[0] & mask ) -// { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (char*)hash0, 512 ); -// } -// if ( hash1[0] & mask ) -// { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (char*)hash1, 512 ); -// } -// if ( hash2[0] & mask ) -// { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (char*)hash2, 512 ); -// } -// if ( hash3[0] & mask ) -// { - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (char*)hash3, 512 ); -// } - - intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); - -// B - -// if ( mm256_any_clr_256( vh_mask ) ) -// { - skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhashB ); -// } - - mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - - jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - -// second fork, A = blake parallel, B= bmw parallel. 
- - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), - m256_zero ); - -// if ( mm256_any_set_256( vh_mask ) ) -// { - blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, vhash, 64 ); - blake512_4way_close( &ctx.blake, vhashA ); -// } - -// if ( mm256_any_clr_256( vh_mask ) ) -// { - bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhashB ); -// } - - mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, 64 ); - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); - - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash0, - (const BitSequence *)hash0, 64 ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash1, - (const BitSequence *)hash1, 64 ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash2, - (const BitSequence *)hash2, 64 ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash3, - (const BitSequence *)hash3, 64 ); - - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - -// A= keccak parallel, B= jh parallel - - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), - m256_zero ); - -// if ( mm256_any_set_256( vh_mask ) ) -// { - keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhashA ); -// } - -// if ( mm256_any_clr_256( vh_mask ) ) 
-// { - jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhashB ); -// } - - mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512 ( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); - -// A is whirlpool serial, B is haval parallel. 
- - - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), - m256_zero ); - // A - -// if ( hash0[0] & mask ) -// { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); -// } -// if ( hash1[0] & mask ) -// { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); -// } -// if ( hash2[0] & mask ) -// { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); -// } -// if ( hash3[0] & mask ) -// { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); -// } - - intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); - -// B - -// if ( mm256_any_clr_256( vh_mask ) ) -// { - haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhash, 64 ); - haval256_5_4way_close( &ctx.haval, vhashB ); - memset( &vhashB[8<<2], 0, 32<<2); -// } - - mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); - - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, vhash, 64 ); - blake512_4way_close( &ctx.blake, vhash ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - -// shavite & luffa, 
both serial, select individually. - - if ( hash0[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash0, 64 ); // - sph_shavite512_close( &ctx.shavite, hash0 ); //8 - } - else - { - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence *)hash0, - (const BitSequence *)hash0, 64 ); - } - - if ( hash1[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); // - sph_shavite512_close( &ctx.shavite, hash1 ); //8 - } - else - { - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence *)hash1, - (const BitSequence *)hash1, 64 ); - } - - if ( hash2[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); // - sph_shavite512_close( &ctx.shavite, hash2 ); //8 - } - else - { - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence *)hash2, - (const BitSequence *)hash2, 64 ); - } - - if ( hash3[0] & mask ) - { - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); // - sph_shavite512_close( &ctx.shavite, hash3 ); //8 - } - else - { - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence *)hash3, - (const BitSequence *)hash3, 64 ); - } - - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - 
sph_fugue512_close( &ctx.fugue, hash3 ); - - -// A echo, B sd both serial - - if ( hash0[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - } - else - { - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - } - - if ( hash1[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - } - else - { - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - } - - if ( hash2[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - } - else - { - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - } - - if ( hash3[0] & mask ) //4 - { - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); - } - else - { - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); - } - - intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); - - shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); - shabal512_4way_close( &ctx.shabal, vhash ); - - dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); - - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - 
sph_whirlpool_close( &ctx.whirlpool, hash3 ); - -// A = fugue serial, B = sha512 prarallel - - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), - m256_zero ); - -// if ( hash0[0] & mask ) -// { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); -// } -// if ( hash1[0] & mask ) -// { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); -// } -// if ( hash2[0] & mask ) -// { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); -// } -// if ( hash3[0] & mask ) -// { - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); -// } - - intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); - -// if ( mm256_any_clr_256( vh_mask ) ) -// { - sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, 64 ); - sha512_4way_close( &ctx.sha512, vhashB ); -// } - - mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, 64 ); - sha512_4way_close( &ctx.sha512, vhash ); - -// A = haval parallel, B = Whirlpool serial - - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), - m256_zero ); - - dintrlv_4x64( 
hash0, hash1, hash2, hash3, vhash, 512 ); - -// if ( mm256_any_set_256( vh_mask ) ) //4 -// { - haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhash, 64 ); - haval256_5_4way_close( &ctx.haval, vhashA ); - memset( &vhashA[8<<2], 0, 32<<2 ); -// } - -// if ( !( hash0[0] & mask ) ) -// { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); -// } -// if ( !( hash2[0] & mask ) ) -// { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); -// } -// if ( !( hash2[0] & mask ) ) -// { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); -// } -// if ( !( hash3[0] & mask ) ) -// { - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); -// } - - intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, 512 ); - - mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - - bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - memcpy(state, vhash, 32<<2 ); -} - -int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); -// uint32_t *hash7 = &(hash[25]); -// uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 
0 }; - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[ m ]; - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - hmq1725_4way_hash( hash, vdata ); - for ( int i = 0; i < 4; i++ ) - if ( ( (hash+(i<<3))[7] & mask ) == 0 ) - { - if ( fulltest( (hash+(i<<3)), ptarget ) && !opt_benchmark ) - { - pdata[19] = n + i; - submit_lane_solution( work, (hash+(i<<3)), mythr, i ); - } - } - n += 4; - } while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart ); - break; - } - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif // HMQ1725_4WAY diff --git a/algo/quark/hmq1725-gate.c b/algo/quark/hmq1725-gate.c deleted file mode 100644 index 7fd327c..0000000 --- a/algo/quark/hmq1725-gate.c +++ /dev/null @@ -1,17 +0,0 @@ -#include "hmq1725-gate.h" - -bool register_hmq1725_algo( algo_gate_t* gate ) -{ -#if defined(HMQ1725_4WAY) - gate->scanhash = (void*)&scanhash_hmq1725_4way; - gate->hash = (void*)&hmq1725_4way_hash; -#else - init_hmq1725_ctx(); - gate->scanhash = (void*)&scanhash_hmq1725; - gate->hash = (void*)&hmq1725hash; -#endif - gate->set_target = (void*)&scrypt_set_target; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - return true; -}; - diff --git a/algo/quark/hmq1725-gate.h b/algo/quark/hmq1725-gate.h deleted file mode 100644 index 4f77fd0..0000000 --- a/algo/quark/hmq1725-gate.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef HMQ1725_GATE_H__ -#define HMQ1725_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) -// #define HMQ1725_4WAY -#endif - -bool register_hmq1725_algo( algo_gate_t* gate ); - -#if defined(HMQ1725_4WAY) - -void hmq1725_4way_hash( void *state, const void *input ); -int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -#else - -void hmq1725hash( void *state, const void *input ); 
-int scanhash_hmq1725( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_hmq1725_ctx(); - -#endif - -#endif // HMQ1725_GATE_H__ diff --git a/algo/quark/hmq1725.c b/algo/quark/hmq1725.c deleted file mode 100644 index 8b71911..0000000 --- a/algo/quark/hmq1725.c +++ /dev/null @@ -1,422 +0,0 @@ -#include "hmq1725-gate.h" -#include -#include -#include "algo/blake/sph_blake.h" -#include "algo/bmw/sph_bmw.h" -#include "algo/groestl/sph_groestl.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" -#include "algo/luffa/sph_luffa.h" -#include "algo/cubehash/sph_cubehash.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/simd/sph_simd.h" -#include "algo/echo/sph_echo.h" -#include "algo/hamsi/sph_hamsi.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/shabal/sph_shabal.h" -#include "algo/whirlpool/sph_whirlpool.h" -#include "algo/haval/sph-haval.h" -#include -#if defined(__AES__) - #include "algo/groestl/aes_ni/hash-groestl.h" - #include "algo/echo/aes_ni/hash_api.h" -#endif -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/nist.h" -#include "algo/jh/sse2/jh_sse2_opt64.h" - -typedef struct { - sph_blake512_context blake1, blake2; - sph_bmw512_context bmw1, bmw2, bmw3; - sph_skein512_context skein1, skein2; - sph_jh512_context jh1, jh2; - sph_keccak512_context keccak1, keccak2; - hashState_luffa luffa1, luffa2; - cubehashParam cube; - sph_shavite512_context shavite1, shavite2; - hashState_sd simd1, simd2; - sph_hamsi512_context hamsi1; - sph_fugue512_context fugue1, fugue2; - sph_shabal512_context shabal1; - sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4; - SHA512_CTX sha1, sha2; - sph_haval256_5_context haval1, haval2; -#if defined(__AES__) - hashState_echo echo1, echo2; - hashState_groestl groestl1, groestl2; -#else - sph_groestl512_context groestl1, groestl2; - sph_echo512_context echo1, 
echo2; -#endif -} hmq1725_ctx_holder; - -static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64))); -static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64))); - -void init_hmq1725_ctx() -{ - sph_blake512_init(&hmq1725_ctx.blake1); - sph_blake512_init(&hmq1725_ctx.blake2); - - sph_bmw512_init(&hmq1725_ctx.bmw1); - sph_bmw512_init(&hmq1725_ctx.bmw2); - sph_bmw512_init(&hmq1725_ctx.bmw3); - - sph_skein512_init(&hmq1725_ctx.skein1); - sph_skein512_init(&hmq1725_ctx.skein2); - - sph_jh512_init(&hmq1725_ctx.jh1); - sph_jh512_init(&hmq1725_ctx.jh2); - - sph_keccak512_init(&hmq1725_ctx.keccak1); - sph_keccak512_init(&hmq1725_ctx.keccak2); - - init_luffa( &hmq1725_ctx.luffa1, 512 ); - init_luffa( &hmq1725_ctx.luffa2, 512 ); - - cubehashInit( &hmq1725_ctx.cube, 512, 16, 32 ); - - sph_shavite512_init(&hmq1725_ctx.shavite1); - sph_shavite512_init(&hmq1725_ctx.shavite2); - - init_sd( &hmq1725_ctx.simd1, 512 ); - init_sd( &hmq1725_ctx.simd2, 512 ); - - sph_hamsi512_init(&hmq1725_ctx.hamsi1); - - sph_fugue512_init(&hmq1725_ctx.fugue1); - sph_fugue512_init(&hmq1725_ctx.fugue2); - - sph_shabal512_init(&hmq1725_ctx.shabal1); - - sph_whirlpool_init(&hmq1725_ctx.whirlpool1); - sph_whirlpool_init(&hmq1725_ctx.whirlpool2); - sph_whirlpool_init(&hmq1725_ctx.whirlpool3); - sph_whirlpool_init(&hmq1725_ctx.whirlpool4); - - SHA512_Init( &hmq1725_ctx.sha1 ); - SHA512_Init( &hmq1725_ctx.sha2 ); - - sph_haval256_5_init(&hmq1725_ctx.haval1); - sph_haval256_5_init(&hmq1725_ctx.haval2); - -#if defined(__AES__) - init_echo( &hmq1725_ctx.echo1, 512 ); - init_echo( &hmq1725_ctx.echo2, 512 ); - init_groestl( &hmq1725_ctx.groestl1, 64 ); - init_groestl( &hmq1725_ctx.groestl2, 64 ); -#else - sph_groestl512_init( &hmq1725_ctx.groestl1 ); - sph_groestl512_init( &hmq1725_ctx.groestl2 ); - sph_echo512_init( &hmq1725_ctx.echo1 ); - sph_echo512_init( &hmq1725_ctx.echo2 ); -#endif -} - -void hmq_bmw512_midstate( const void* input ) -{ - memcpy( &hmq_bmw_mid, &hmq1725_ctx.bmw1, 
sizeof hmq_bmw_mid ); - sph_bmw512( &hmq_bmw_mid, input, 64 ); -} - -__thread hmq1725_ctx_holder h_ctx __attribute__ ((aligned (64))); - -extern void hmq1725hash(void *state, const void *input) -{ - const uint32_t mask = 24; - uint32_t hashA[32] __attribute__((aligned(64))); - uint32_t hashB[32] __attribute__((aligned(64))); - const int midlen = 64; // bytes - const int tail = 80 - midlen; // 16 - - memcpy(&h_ctx, &hmq1725_ctx, sizeof(hmq1725_ctx)); - - memcpy( &h_ctx.bmw1, &hmq_bmw_mid, sizeof hmq_bmw_mid ); - sph_bmw512( &h_ctx.bmw1, input + midlen, tail ); - sph_bmw512_close(&h_ctx.bmw1, hashA); //1 - - sph_whirlpool (&h_ctx.whirlpool1, hashA, 64); //0 - sph_whirlpool_close(&h_ctx.whirlpool1, hashB); //1 - - if ( hashB[0] & mask ) //1 - { -#if defined(__AES__) - update_and_final_groestl( &h_ctx.groestl1, (char*)hashA, - (const char*)hashB, 512 ); -#else - sph_groestl512 (&h_ctx.groestl1, hashB, 64); //1 - sph_groestl512_close(&h_ctx.groestl1, hashA); //2 -#endif - } - else - { - sph_skein512 (&h_ctx.skein1, hashB, 64); //1 - sph_skein512_close(&h_ctx.skein1, hashA); //2 - } - - sph_jh512 (&h_ctx.jh1, hashA, 64); //3 - sph_jh512_close(&h_ctx.jh1, hashB); //4 - - sph_keccak512 (&h_ctx.keccak1, hashB, 64); //2 - sph_keccak512_close(&h_ctx.keccak1, hashA); //3 - - if ( hashA[0] & mask ) //4 - { - sph_blake512 (&h_ctx.blake1, hashA, 64); // - sph_blake512_close(&h_ctx.blake1, hashB); //5 - } - else - { - sph_bmw512 (&h_ctx.bmw2, hashA, 64); //4 - sph_bmw512_close(&h_ctx.bmw2, hashB); //5 - } - - update_and_final_luffa( &h_ctx.luffa1, (BitSequence*)hashA, - (const BitSequence*)hashB, 64 ); - - cubehashUpdateDigest( &h_ctx.cube, (BitSequence *)hashB, - (const BitSequence *)hashA, 64 ); - - if ( hashB[0] & mask ) //7 - { - sph_keccak512 (&h_ctx.keccak2, hashB, 64); // - sph_keccak512_close(&h_ctx.keccak2, hashA); //8 - } - else - { - sph_jh512 (&h_ctx.jh2, hashB, 64); //7 - sph_jh512_close(&h_ctx.jh2, hashA); //8 - } - - sph_shavite512 (&h_ctx.shavite1, hashA, 64); //3 
- sph_shavite512_close(&h_ctx.shavite1, hashB); //4 - - update_final_sd( &h_ctx.simd1, (BitSequence *)hashA, - (const BitSequence *)hashB, 512 ); - - if ( hashA[0] & mask ) //4 - { - sph_whirlpool (&h_ctx.whirlpool2, hashA, 64); // - sph_whirlpool_close(&h_ctx.whirlpool2, hashB); //5 - } - else - { - sph_haval256_5 (&h_ctx.haval1, hashA, 64); //4 - sph_haval256_5_close(&h_ctx.haval1, hashB); //5 - memset(&hashB[8], 0, 32); - } - -#if defined(__AES__) - update_final_echo ( &h_ctx.echo1, (BitSequence *)hashA, - (const BitSequence *)hashB, 512 ); -#else - sph_echo512 (&h_ctx.echo1, hashB, 64); //5 - sph_echo512_close(&h_ctx.echo1, hashA); //6 -#endif - - sph_blake512 (&h_ctx.blake2, hashA, 64); //6 - sph_blake512_close(&h_ctx.blake2, hashB); //7 - - if ( hashB[0] & mask ) //7 - { - sph_shavite512 (&h_ctx.shavite2, hashB, 64); // - sph_shavite512_close(&h_ctx.shavite2, hashA); //8 - } - else - { - update_and_final_luffa( &h_ctx.luffa2, (BitSequence *)hashA, - (const BitSequence *)hashB, 64 ); - } - - sph_hamsi512 (&h_ctx.hamsi1, hashA, 64); //3 - sph_hamsi512_close(&h_ctx.hamsi1, hashB); //4 - - sph_fugue512 (&h_ctx.fugue1, hashB, 64); //2 //// - sph_fugue512_close(&h_ctx.fugue1, hashA); //3 - - if ( hashA[0] & mask ) //4 - { -#if defined(__AES__) - update_final_echo ( &h_ctx.echo2, (BitSequence *)hashB, - (const BitSequence *)hashA, 512 ); -#else - sph_echo512 (&h_ctx.echo2, hashA, 64); // - sph_echo512_close(&h_ctx.echo2, hashB); //5 -#endif - } - else - { - update_final_sd( &h_ctx.simd2, (BitSequence *)hashB, - (const BitSequence *)hashA, 512 ); - } - - sph_shabal512 (&h_ctx.shabal1, hashB, 64); //5 - sph_shabal512_close(&h_ctx.shabal1, hashA); //6 - - sph_whirlpool (&h_ctx.whirlpool3, hashA, 64); //6 - sph_whirlpool_close(&h_ctx.whirlpool3, hashB); //7 - - if ( hashB[0] & mask ) //7 - { - sph_fugue512 (&h_ctx.fugue2, hashB, 64); // - sph_fugue512_close(&h_ctx.fugue2, hashA); //8 - } - else - { - SHA512_Update( &h_ctx.sha1, hashB, 64 ); - SHA512_Final( (unsigned 
char*) hashA, &h_ctx.sha1 ); - } - -#if defined(__AES__) - update_and_final_groestl( &h_ctx.groestl2, (char*)hashB, - (const char*)hashA, 512 ); -#else - sph_groestl512 (&h_ctx.groestl2, hashA, 64); //3 - sph_groestl512_close(&h_ctx.groestl2, hashB); //4 -#endif - - SHA512_Update( &h_ctx.sha2, hashB, 64 ); - SHA512_Final( (unsigned char*) hashA, &h_ctx.sha2 ); - - if ( hashA[0] & mask ) //4 - { - sph_haval256_5 (&h_ctx.haval2, hashA, 64); // - sph_haval256_5_close(&h_ctx.haval2, hashB); //5 - memset(&hashB[8], 0, 32); - } - else - { - sph_whirlpool (&h_ctx.whirlpool4, hashA, 64); //4 - sph_whirlpool_close(&h_ctx.whirlpool4, hashB); //5 - } - - sph_bmw512 (&h_ctx.bmw3, hashB, 64); //5 - sph_bmw512_close(&h_ctx.bmw3, hashA); //6 - - memcpy(state, hashA, 32); -} - -int scanhash_hmq1725( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ -// uint32_t endiandata[32] __attribute__((aligned(64))); - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t hash64[8] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - //const uint32_t Htarg = ptarget[7]; - - //we need bigendian data... -// for (int k = 0; k < 32; k++) - for (int k = 0; k < 20; k++) - be32enc(&endiandata[k], pdata[k]); - - hmq_bmw512_midstate( endiandata ); - -// if (opt_debug) -// { -// applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce); -// } - - /* I'm to lazy to put the loop in an inline function... so dirty copy'n'paste.... 
*/ - /* i know that i could set a variable, but i don't know how the compiler will optimize it, not that then the cpu needs to load the value *everytime* in a register */ - if (ptarget[7]==0) { - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - hmq1725hash(hash64, endiandata); - if (((hash64[7]&0xFFFFFFFF)==0) && - fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } while (n < max_nonce && !work_restart[thr_id].restart); - } - else if (ptarget[7]<=0xF) - { - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - hmq1725hash(hash64, endiandata); - if (((hash64[7]&0xFFFFFFF0)==0) && - fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } while (n < max_nonce && !work_restart[thr_id].restart); - } - else if (ptarget[7]<=0xFF) - { - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - hmq1725hash(hash64, endiandata); - if (((hash64[7]&0xFFFFFF00)==0) && - fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } while (n < max_nonce && !work_restart[thr_id].restart); - } - else if (ptarget[7]<=0xFFF) - { - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - hmq1725hash(hash64, endiandata); - if (((hash64[7]&0xFFFFF000)==0) && - fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - } - else if (ptarget[7]<=0xFFFF) - { - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - hmq1725hash(hash64, endiandata); - if (((hash64[7]&0xFFFF0000)==0) && - fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - } - else - { - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - hmq1725hash(hash64, endiandata); - if (fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } while (n < max_nonce && !work_restart[thr_id].restart); - } - - 
- *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} -/* -bool register_hmq1725_algo( algo_gate_t* gate ) -{ - init_hmq1725_ctx(); - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->set_target = (void*)&scrypt_set_target; - gate->scanhash = (void*)&scanhash_hmq1725; - gate->hash = (void*)&hmq1725hash; - return true; -}; -*/ diff --git a/algo/quark/quark-4way.c b/algo/quark/quark-4way.c deleted file mode 100644 index 9c0fb5d..0000000 --- a/algo/quark/quark-4way.c +++ /dev/null @@ -1,206 +0,0 @@ -#include "cpuminer-config.h" -#include "quark-gate.h" - -#if defined (QUARK_4WAY) - -#include -#include -#include - -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" - -typedef struct { - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_groestl groestl; - jh512_4way_context jh; - skein512_4way_context skein; - keccak512_4way_context keccak; -} quark_4way_ctx_holder; - -quark_4way_ctx_holder quark_4way_ctx __attribute__ ((aligned (64))); - -void init_quark_4way_ctx() -{ - blake512_4way_init( &quark_4way_ctx.blake ); - bmw512_4way_init( &quark_4way_ctx.bmw ); - init_groestl( &quark_4way_ctx.groestl, 64 ); - skein512_4way_init( &quark_4way_ctx.skein ); - jh512_4way_init( &quark_4way_ctx.jh ); - keccak512_4way_init( &quark_4way_ctx.keccak ); -} - -void quark_4way_hash( void *state, const void *input ) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - uint64_t vhashA[8*4] __attribute__ ((aligned (64))); - uint64_t vhashB[8*4] __attribute__ ((aligned (64))); - __m256i* vh = (__m256i*)vhash; - __m256i* vhA = (__m256i*)vhashA; - 
__m256i* vhB = (__m256i*)vhashB; - __m256i vh_mask; - quark_4way_ctx_holder ctx; - const __m256i bit3_mask = _mm256_set1_epi64x( 8 ); - const uint32_t mask = 8; - const __m256i zero = _mm256_setzero_si256(); - - memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) ); - - blake512_4way( &ctx.blake, input, 80 ); - blake512_4way_close( &ctx.blake, vhash ); - - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - if ( hash0[0] & mask ) - { - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (char*)hash0, 512 ); - } - if ( hash1[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (char*)hash1, 512 ); - } - if ( hash2[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (char*)hash2, 512 ); - } - if ( hash3[0] & mask ) - { - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (char*)hash3, 512 ); - } - - intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); - - if ( mm256_anybits0( vh_mask ) ) - { - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhashB ); - } - - mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( 
&ctx.jh, vhash ); - - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); - - if ( mm256_anybits1( vh_mask ) ) - { - blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, vhash, 64 ); - blake512_4way_close( &ctx.blake, vhashA ); - } - - if ( mm256_anybits0( vh_mask ) ) - { - bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhashB ); - } - - mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - - skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - - vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); - - if ( mm256_anybits1( vh_mask ) ) - { - keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhashA ); - } - - if ( mm256_anybits0( vh_mask ) ) - { - jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhashB ); - } - - // Final blend, directly to state, only need 32 bytes. 
- casti_m256i( state, 0 ) = _mm256_blendv_epi8( vhA[0], vhB[0], vh_mask ); - casti_m256i( state, 1 ) = _mm256_blendv_epi8( vhA[1], vhB[1], vh_mask ); - casti_m256i( state, 2 ) = _mm256_blendv_epi8( vhA[2], vhB[2], vh_mask ); - casti_m256i( state, 3 ) = _mm256_blendv_epi8( vhA[3], vhB[3], vh_mask ); -} - -int scanhash_quark_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[25]); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - quark_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( ( hash7[ i<<1 ] & 0xFFFFFF00 ) == 0 ) - { - extr_lane_4x64( lane_hash, hash, i, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, lane_hash, mythr, i ); - } - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/quark/quark-gate.c b/algo/quark/quark-gate.c deleted file mode 100644 index 4d7018a..0000000 --- a/algo/quark/quark-gate.c +++ /dev/null @@ -1,17 +0,0 @@ -#include "quark-gate.h" - -bool register_quark_algo( algo_gate_t* gate ) -{ -#if defined (QUARK_4WAY) - init_quark_4way_ctx(); - gate->scanhash = (void*)&scanhash_quark_4way; - gate->hash = (void*)&quark_4way_hash; -#else - init_quark_ctx(); - gate->scanhash = (void*)&scanhash_quark; - gate->hash = (void*)&quark_hash; 
-#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - return true; -}; - diff --git a/algo/quark/quark-gate.h b/algo/quark/quark-gate.h deleted file mode 100644 index e97b20d..0000000 --- a/algo/quark/quark-gate.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef QUARK_GATE_H__ -#define QUARK_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define QUARK_4WAY -#endif - -bool register_quark_algo( algo_gate_t* gate ); - -#if defined(QUARK_4WAY) - -void quark_4way_hash( void *state, const void *input ); -int scanhash_quark_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_quark_4way_ctx(); - -#endif - -void quark_hash( void *state, const void *input ); -int scanhash_quark( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_quark_ctx(); - -#endif - diff --git a/algo/quark/quark.c b/algo/quark/quark.c deleted file mode 100644 index 638e629..0000000 --- a/algo/quark/quark.c +++ /dev/null @@ -1,207 +0,0 @@ -#include "cpuminer-config.h" -#include "quark-gate.h" - -#include -#include -#include - -#include "algo/blake/sph_blake.h" -#include "algo/bmw/sph_bmw.h" -#include "algo/groestl/sph_groestl.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" - -#include "algo/blake/sse2/blake.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" - -#ifndef NO_AES_NI - #include "algo/groestl/aes_ni/hash-groestl.h" -#endif - -/*define data alignment for different C compilers*/ -#if defined(__GNUC__) - #define DATA_ALIGN16(x) x __attribute__ ((aligned(16))) - #define DATA_ALIGNXY(x,y) x __attribute__ ((aligned(y))) - -#else - #define DATA_ALIGN16(x) __declspec(align(16)) x - #define DATA_ALIGNXY(x,y) __declspec(align(y)) x -#endif - -#ifdef NO_AES_NI - sph_groestl512_context quark_ctx; 
-#else - hashState_groestl quark_ctx; -#endif - -void init_quark_ctx() -{ -#ifdef NO_AES_NI - sph_groestl512_init( &quark_ctx ); -#else - init_groestl( &quark_ctx, 64 ); -#endif -} - -void quark_hash(void *state, const void *input) -{ - unsigned char hashbuf[128]; - size_t hashptr; - sph_u64 hashctA; - sph_u64 hashctB; - int i; - unsigned char hash[128] __attribute__ ((aligned (32))); -#ifdef NO_AES_NI - sph_groestl512_context ctx; -#else - hashState_groestl ctx; -#endif - - memcpy( &ctx, &quark_ctx, sizeof(ctx) ); - - // Blake - DECL_BLK; - BLK_I; - BLK_W; - for(i=0; i<9; i++) - { - /* blake is split between 64byte hashes and the 80byte initial block */ - //DECL_BLK; - switch (i+(16*((hash[0] & (uint32_t)(8)) == (uint32_t)(0)))) - { - // Blake - case 5 : - BLK_I; - BLK_U; - case 0: - case 16: - BLK_C; - break; - case 1: - case 17: - case 21: - - // BMW - do - { - DECL_BMW; - BMW_I; - BMW_U; - /* bmw compress uses some defines */ - /* i havent gotten around to rewriting these */ - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - BMW_C; - #undef M - #undef H - #undef dH - } while(0); continue;; - - case 2: - // dos this entry point represent a second groestl round? 
- - case 3: - case 19: - // Groestl - do - { - -#ifdef NO_AES_NI - sph_groestl512_init( &ctx ); - sph_groestl512 ( &ctx, hash, 64 ); - sph_groestl512_close( &ctx, hash ); -#else - reinit_groestl( &ctx ); - update_and_final_groestl( &ctx, (char*)hash, (char*)hash, 512 ); -// update_groestl( &ctx, (char*)hash, 512 ); -// final_groestl( &ctx, (char*)hash ); -#endif - - } while(0); continue; - - case 4: - case 20: - case 24: - // JH - do - { - DECL_JH; - JH_H; - } while(0); continue; - - case 6: - case 22: - case 8: - // Keccak - do - { - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; - } while(0); continue; - - case 18: - case 7: - case 23: - // Skein - do - { - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; /* is a magintue faster than others, done */ - } while(0); continue; - - default: - /* bad things happend, i counted to potato */ - abort(); - } - /* only blake shouuld get here without continue */ - /* blake finishs from top split */ - //BLK_C; - } - - -// asm volatile ("emms"); - memcpy(state, hash, 32); -} - -int scanhash_quark( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t hash64[8] __attribute__((aligned(32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - - swab32_array( endiandata, pdata, 20 ); - - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - quark_hash(hash64, &endiandata); - if ((hash64[7]&0xFFFFFF00)==0) - { - if (fulltest(hash64, ptarget)) - { - work_set_target_ratio( work, hash64 ); - *hashes_done = n - first_nonce + 1; - return true; - } - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - diff --git a/algo/qubit/deep-2way.c b/algo/qubit/deep-2way.c deleted file mode 100644 index 9ca6608..0000000 --- 
a/algo/qubit/deep-2way.c +++ /dev/null @@ -1,125 +0,0 @@ -#include "deep-gate.h" - -#if defined(DEEP_2WAY) - -#include -#include -#include -#include -#include "algo/luffa/luffa-hash-2way.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/echo/aes_ni/hash_api.h" - -typedef struct -{ - luffa_2way_context luffa; - cubehashParam cube; - sph_shavite512_context shavite; - hashState_echo echo; -} deep_2way_ctx_holder; - -deep_2way_ctx_holder deep_2way_ctx; - -void init_deep_2way_ctx() -{ - luffa_2way_init( &deep_2way_ctx.luffa, 512 ); - cubehashInit(&deep_2way_ctx.cube,512,16,32); - sph_shavite512_init(&deep_2way_ctx.shavite); - init_echo(&deep_2way_ctx.echo, 512); -}; - -void deep_2way_hash( void *output, const void *input ) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*2] __attribute__ ((aligned (64))); - deep_2way_ctx_holder ctx; - - memcpy( &ctx, &deep_2way_ctx, sizeof(deep_2way_ctx) ); - luffa_2way_update( &ctx.luffa, input + (64<<1), 16 ); - luffa_2way_close( &ctx.luffa, vhash ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, - (const byte*) hash0, 64 ); - memcpy( &ctx.cube, &deep_2way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); - - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - memcpy( &ctx.shavite, &deep_2way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - memcpy( &ctx.echo, &deep_2way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - - memcpy( output, hash0, 32 ); - memcpy( output+32, hash1, 32 ); -} - -int 
scanhash_deep_2way( struct work *work,uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - uint32_t *noncep = vdata + 32+3; // 4*8 + 3 - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; - - casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) ); - casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) ); - casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - - uint64_t *edata = (uint64_t*)endiandata; - intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 ); - - luffa_2way_init( &deep_2way_ctx.luffa, 512 ); - luffa_2way_update( &deep_2way_ctx.luffa, vdata, 64 ); - - for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - be32enc( noncep, n ); - be32enc( noncep+4, n+1 ); - - deep_2way_hash( hash, vdata ); - pdata[19] = n; - - if ( !( hash[7] & mask ) ) - if ( fulltest( hash, ptarget) && !opt_benchmark ) - { - pdata[19] = n; - submit_lane_solution( work, hash, mythr, 0 ); - } - if ( !( (hash+8)[7] & mask ) ) - if ( fulltest( hash+8, ptarget) && !opt_benchmark ) - { - pdata[19] = n+1; - submit_lane_solution( work, hash+8, mythr, 1 ); - } - n += 2; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/qubit/deep-gate.c b/algo/qubit/deep-gate.c deleted file mode 100644 index bae0a8d..0000000 --- a/algo/qubit/deep-gate.c +++ /dev/null @@ -1,17 +0,0 @@ -#include 
"deep-gate.h" - -bool register_deep_algo( algo_gate_t* gate ) -{ -#if defined (DEEP_2WAY) - init_deep_2way_ctx(); - gate->scanhash = (void*)&scanhash_deep_2way; - gate->hash = (void*)&deep_2way_hash; -#else - init_deep_ctx(); - gate->scanhash = (void*)&scanhash_deep; - gate->hash = (void*)&deep_hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - return true; -}; - diff --git a/algo/qubit/deep-gate.h b/algo/qubit/deep-gate.h deleted file mode 100644 index 1d1c932..0000000 --- a/algo/qubit/deep-gate.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef DEEP_GATE_H__ -#define DEEP_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define DEEP_2WAY -#endif - -bool register_deep_algo( algo_gate_t* gate ); - -#if defined(DEEP_2WAY) - -void deep_2way_hash( void *state, const void *input ); -int scanhash_deep_2way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_deep_2way_ctx(); - -#endif - -void deep_hash( void *state, const void *input ); -int scanhash_deep( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_deep_ctx(); - -#endif - diff --git a/algo/qubit/deep.c b/algo/qubit/deep.c deleted file mode 100644 index b48f0d0..0000000 --- a/algo/qubit/deep.c +++ /dev/null @@ -1,142 +0,0 @@ -#include "deep-gate.h" -#include -#include -#include -#include -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#ifndef NO_AES_NI -#include "algo/echo/aes_ni/hash_api.h" -#else -#include "algo/echo/sph_echo.h" -#endif - -typedef struct -{ - hashState_luffa luffa; - cubehashParam cubehash; -#ifdef NO_AES_NI - sph_echo512_context echo; -#else - hashState_echo echo; -#endif -} deep_ctx_holder; - -deep_ctx_holder deep_ctx __attribute((aligned(64))); -static __thread hashState_luffa deep_luffa_mid; - -void init_deep_ctx() -{ - init_luffa( &deep_ctx.luffa, 512 ); - cubehashInit( &deep_ctx.cubehash, 512, 16, 
32 ); -#ifdef NO_AES_NI - sph_echo512_init( &deep_ctx.echo ); -#else - init_echo( &deep_ctx.echo, 512 ); -#endif -}; - -void deep_luffa_midstate( const void* input ) -{ - memcpy( &deep_luffa_mid, &deep_ctx.luffa, sizeof deep_luffa_mid ); - update_luffa( &deep_luffa_mid, input, 64 ); -} - -void deep_hash(void *output, const void *input) -{ - unsigned char hash[128] __attribute((aligned(64))); - #define hashB hash+64 - - deep_ctx_holder ctx __attribute((aligned(64))); - memcpy( &ctx, &deep_ctx, sizeof(deep_ctx) ); - - const int midlen = 64; // bytes - const int tail = 80 - midlen; // 16 - memcpy( &ctx.luffa, &deep_luffa_mid, sizeof deep_luffa_mid ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)input + midlen, tail ); - - cubehashUpdateDigest( &ctx.cubehash, (byte*)hash, - (const byte*) hash,64); - -#ifdef NO_AES_NI - sph_echo512 (&ctx.echo, (const void*) hash, 64); - sph_echo512_close(&ctx.echo, (void*) hash); -#else - update_final_echo ( &ctx.echo, (BitSequence *) hash, - (const BitSequence *) hash, 512); -#endif - - asm volatile ("emms"); - memcpy(output, hash, 32); -} - -int scanhash_deep( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t hash64[8] __attribute__((aligned(32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - - uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = - { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0 }; - - // we need bigendian data... 
- swab32_array( endiandata, pdata, 20 ); - - deep_luffa_midstate( endiandata ); - -#ifdef DEBUG_ALGO - printf("[%d] Htarg=%X\n", thr_id, Htarg); -#endif - for ( int m=0; m < 6; m++ ) - { - if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - pdata[19] = ++n; - be32enc( &endiandata[19], n ); - deep_hash( hash64, endiandata ); -#ifndef DEBUG_ALGO - if (!(hash64[7] & mask)) - { - if ( fulltest(hash64, ptarget) ) - { - *hashes_done = n - first_nonce + 1; - return true; - } -// else -// { -// applog(LOG_INFO, "Result does not validate on CPU!"); -// } - } -#else - if (!(n % 0x1000) && !thr_id) printf("."); - if (!(hash64[7] & mask)) { - printf("[%d]",thr_id); - if (fulltest(hash64, ptarget)) { - work_set_target_ratio( work, hash64 ); - *hashes_done = n - first_nonce + 1; - return true; - } - } -#endif - } while ( n < max_nonce && !work_restart[thr_id].restart ); - // see blake.c if else to understand the loop on htmax => mask - break; - } - } - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - diff --git a/algo/qubit/qubit-2way.c b/algo/qubit/qubit-2way.c deleted file mode 100644 index 8dc04b9..0000000 --- a/algo/qubit/qubit-2way.c +++ /dev/null @@ -1,130 +0,0 @@ -#include "qubit-gate.h" - -#if defined(QUBIT_2WAY) - -#include -#include -#include -#include -#include "algo/luffa/luffa-hash-2way.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/simd-hash-2way.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/echo/aes_ni/hash_api.h" - -typedef struct -{ - luffa_2way_context luffa; - cubehashParam cube; - sph_shavite512_context shavite; - simd_2way_context simd; - hashState_echo echo; -} qubit_2way_ctx_holder; - -qubit_2way_ctx_holder qubit_2way_ctx; - -void init_qubit_2way_ctx() -{ - cubehashInit(&qubit_2way_ctx.cube,512,16,32); - sph_shavite512_init(&qubit_2way_ctx.shavite); - simd_2way_init( &qubit_2way_ctx.simd, 512 ); - init_echo(&qubit_2way_ctx.echo, 512); -}; - -void qubit_2way_hash( void *output, const void 
*input ) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*2] __attribute__ ((aligned (64))); - qubit_2way_ctx_holder ctx; - - memcpy( &ctx, &qubit_2way_ctx, sizeof(qubit_2way_ctx) ); - luffa_2way_update( &ctx.luffa, input + (64<<1), 16 ); - luffa_2way_close( &ctx.luffa, vhash ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, - (const byte*) hash0, 64 ); - memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); - - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - memcpy( &ctx.shavite, &qubit_2way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - - intrlv_2x128( vhash, hash0, hash1, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - - memcpy( output, hash0, 32 ); - memcpy( output+32, hash1, 32 ); -} - -int scanhash_qubit_2way( struct work *work,uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - uint32_t *noncep = vdata + 32+3; // 4*8 + 3 - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - 
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; - - casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) ); - casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) ); - casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - - uint64_t *edata = (uint64_t*)endiandata; - intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 ); - - luffa_2way_init( &qubit_2way_ctx.luffa, 512 ); - luffa_2way_update( &qubit_2way_ctx.luffa, vdata, 64 ); - - for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - be32enc( noncep, n ); - be32enc( noncep+4, n+1 ); - qubit_2way_hash( hash, vdata ); - pdata[19] = n; - - if ( !( hash[7] & mask ) ) - if ( fulltest( hash, ptarget) && !opt_benchmark ) - { - pdata[19] = n; - submit_lane_solution( work, hash, mythr, 0 ); - } - if ( !( (hash+8)[7] & mask ) ) - if ( fulltest( hash+8, ptarget) && !opt_benchmark ) - { - pdata[19] = n+1; - submit_lane_solution( work, hash+8, mythr, 1 ); - } - n += 2; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/qubit/qubit-gate.c b/algo/qubit/qubit-gate.c deleted file mode 100644 index b3df5c1..0000000 --- a/algo/qubit/qubit-gate.c +++ /dev/null @@ -1,17 +0,0 @@ -#include "qubit-gate.h" - -bool register_qubit_algo( algo_gate_t* gate ) -{ -#if defined (QUBIT_2WAY) - init_qubit_2way_ctx(); - gate->scanhash = (void*)&scanhash_qubit_2way; - gate->hash = (void*)&qubit_2way_hash; -#else - init_qubit_ctx(); - gate->scanhash = (void*)&scanhash_qubit; - gate->hash = (void*)&qubit_hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - return true; -}; - diff --git a/algo/qubit/qubit-gate.h b/algo/qubit/qubit-gate.h deleted file mode 100644 index 741a71a..0000000 --- a/algo/qubit/qubit-gate.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef QUBIT_GATE_H__ -#define QUBIT_GATE_H__ 1 - 
-#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define QUBIT_2WAY -#endif - -bool register_qubit_algo( algo_gate_t* gate ); - -#if defined(QUBIT_2WAY) - -void qubit_2way_hash( void *state, const void *input ); -int scanhash_qubit_2way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_qubit_2way_ctx(); - -#endif - -void qubit_hash( void *state, const void *input ); -int scanhash_qubit( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_qubit_ctx(); - -#endif - diff --git a/algo/qubit/qubit.c b/algo/qubit/qubit.c deleted file mode 100644 index fc953be..0000000 --- a/algo/qubit/qubit.c +++ /dev/null @@ -1,154 +0,0 @@ -#include "qubit-gate.h" -#include -#include -#include -#include -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/nist.h" -#include "algo/shavite/sph_shavite.h" -#ifndef NO_AES_NI -#include "algo/echo/aes_ni/hash_api.h" -#else -#include "algo/echo/sph_echo.h" -#endif - -typedef struct -{ - hashState_luffa luffa; - cubehashParam cubehash; - sph_shavite512_context shavite; - hashState_sd simd; -#ifdef NO_AES_NI - sph_echo512_context echo; -#else - hashState_echo echo; -#endif -} qubit_ctx_holder; - -qubit_ctx_holder qubit_ctx; -static __thread hashState_luffa qubit_luffa_mid; - -void init_qubit_ctx() -{ - init_luffa(&qubit_ctx.luffa,512); - cubehashInit(&qubit_ctx.cubehash,512,16,32); - sph_shavite512_init(&qubit_ctx.shavite); - init_sd(&qubit_ctx.simd,512); -#ifdef NO_AES_NI - sph_echo512_init(&qubit_ctx.echo); -#else - init_echo(&qubit_ctx.echo, 512); -#endif -}; - -void qubit_luffa_midstate( const void* input ) -{ - memcpy( &qubit_luffa_mid, &qubit_ctx.luffa, sizeof qubit_luffa_mid ); - update_luffa( &qubit_luffa_mid, input, 64 ); -} - -void qubit_hash(void *output, const void *input) -{ - unsigned char hash[128] __attribute((aligned(64))); - #define hashB hash+64 
- - qubit_ctx_holder ctx; - memcpy( &ctx, &qubit_ctx, sizeof(qubit_ctx) ); - - const int midlen = 64; // bytes - const int tail = 80 - midlen; // 16 - memcpy( &ctx.luffa, &qubit_luffa_mid, sizeof qubit_luffa_mid ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)input + midlen, tail ); - - cubehashUpdateDigest( &ctx.cubehash, (byte*)hash, - (const byte*) hash, 64 ); - - sph_shavite512( &ctx.shavite, hash, 64); - sph_shavite512_close( &ctx.shavite, hash); - - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence*)hash, 512 ); - -#ifdef NO_AES_NI - sph_echo512 (&ctx.echo, (const void*) hash, 64); - sph_echo512_close(&ctx.echo, (void*) hash); -#else - update_final_echo( &ctx.echo, (BitSequence *) hash, - (const BitSequence *) hash, 512 ); -#endif - - asm volatile ("emms"); - memcpy(output, hash, 32); -} - -int scanhash_qubit( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t hash64[8] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - - uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = - { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0 }; - - // we need bigendian data... 
- swab32_array( endiandata, pdata, 20 ); - - qubit_luffa_midstate( endiandata ); - -#ifdef DEBUG_ALGO - printf("[%d] Htarg=%X\n", thr_id, Htarg); -#endif - for ( int m=0; m < 6; m++ ) - { - if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - qubit_hash(hash64, endiandata); -#ifndef DEBUG_ALGO - if (!(hash64[7] & mask)) - { - if ( fulltest(hash64, ptarget) ) - { - *hashes_done = n - first_nonce + 1; - return true; - } -// else -// { -// applog(LOG_INFO, "Result does not validate on CPU!"); -// } - } -#else - if (!(n % 0x1000) && !thr_id) printf("."); - if (!(hash64[7] & mask)) { - printf("[%d]",thr_id); - if (fulltest(hash64, ptarget)) { - work_set_target_ratio( work, hash64 ); - *hashes_done = n - first_nonce + 1; - return true; - } - } -#endif - } while ( n < max_nonce && !work_restart[thr_id].restart ); - // see blake.c if else to understand the loop on htmax => mask - break; - } - } - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - diff --git a/algo/radiogatun/sph_radiogatun.c b/algo/radiogatun/sph_radiogatun.c deleted file mode 100644 index 888b028..0000000 --- a/algo/radiogatun/sph_radiogatun.c +++ /dev/null @@ -1,1003 +0,0 @@ -/* $Id: radiogatun.c 226 2010-06-16 17:28:08Z tp $ */ -/* - * RadioGatun implementation. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include -#include - -#include "sph_radiogatun.h" - -#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_RADIOGATUN -#define SPH_SMALL_FOOTPRINT_RADIOGATUN 1 -#endif - -/* ======================================================================= */ -/* - * The core macros. We want to unroll 13 successive rounds so that the - * belt rotation becomes pure routing, solved at compilation time, with - * no unnecessary copying. We also wish all state variables to be - * independant local variables, so that the C compiler becomes free to - * map these on registers at it sees fit. 
This requires some heavy - * preprocessor trickeries, including a full addition macro modulo 13. - * - * These macros are size-independent. Some macros must be defined before - * use: - * WT evaluates to the type for a word (32-bit or 64-bit) - * T truncates a value to the proper word size - * ROR(x, n) right rotation of a word x, with explicit modular - * reduction of the rotation count n by the word size - * INW(i, j) input word j (0, 1, or 2) of block i (0 to 12) - * - * For INW, the input buffer is pointed to by "buf" which has type - * "const unsigned char *". - */ - -#define MUL19(action) do { \ - action(0); \ - action(1); \ - action(2); \ - action(3); \ - action(4); \ - action(5); \ - action(6); \ - action(7); \ - action(8); \ - action(9); \ - action(10); \ - action(11); \ - action(12); \ - action(13); \ - action(14); \ - action(15); \ - action(16); \ - action(17); \ - action(18); \ - } while (0) - -#define DECL19(b) b ## 0, b ## 1, b ## 2, b ## 3, b ## 4, b ## 5, \ - b ## 6, b ## 7, b ## 8, b ## 9, b ## 10, b ## 11, \ - b ## 12, b ## 13, b ## 14, b ## 15, b ## 16, \ - b ## 17, b ## 18 - -#define M19_T7(i) M19_T7_(i) -#define M19_T7_(i) M19_T7_ ## i -#define M19_T7_0 0 -#define M19_T7_1 7 -#define M19_T7_2 14 -#define M19_T7_3 2 -#define M19_T7_4 9 -#define M19_T7_5 16 -#define M19_T7_6 4 -#define M19_T7_7 11 -#define M19_T7_8 18 -#define M19_T7_9 6 -#define M19_T7_10 13 -#define M19_T7_11 1 -#define M19_T7_12 8 -#define M19_T7_13 15 -#define M19_T7_14 3 -#define M19_T7_15 10 -#define M19_T7_16 17 -#define M19_T7_17 5 -#define M19_T7_18 12 - -#define M19_A1(i) M19_A1_(i) -#define M19_A1_(i) M19_A1_ ## i -#define M19_A1_0 1 -#define M19_A1_1 2 -#define M19_A1_2 3 -#define M19_A1_3 4 -#define M19_A1_4 5 -#define M19_A1_5 6 -#define M19_A1_6 7 -#define M19_A1_7 8 -#define M19_A1_8 9 -#define M19_A1_9 10 -#define M19_A1_10 11 -#define M19_A1_11 12 -#define M19_A1_12 13 -#define M19_A1_13 14 -#define M19_A1_14 15 -#define M19_A1_15 16 -#define M19_A1_16 17 
-#define M19_A1_17 18 -#define M19_A1_18 0 - -#define M19_A2(i) M19_A2_(i) -#define M19_A2_(i) M19_A2_ ## i -#define M19_A2_0 2 -#define M19_A2_1 3 -#define M19_A2_2 4 -#define M19_A2_3 5 -#define M19_A2_4 6 -#define M19_A2_5 7 -#define M19_A2_6 8 -#define M19_A2_7 9 -#define M19_A2_8 10 -#define M19_A2_9 11 -#define M19_A2_10 12 -#define M19_A2_11 13 -#define M19_A2_12 14 -#define M19_A2_13 15 -#define M19_A2_14 16 -#define M19_A2_15 17 -#define M19_A2_16 18 -#define M19_A2_17 0 -#define M19_A2_18 1 - -#define M19_A4(i) M19_A4_(i) -#define M19_A4_(i) M19_A4_ ## i -#define M19_A4_0 4 -#define M19_A4_1 5 -#define M19_A4_2 6 -#define M19_A4_3 7 -#define M19_A4_4 8 -#define M19_A4_5 9 -#define M19_A4_6 10 -#define M19_A4_7 11 -#define M19_A4_8 12 -#define M19_A4_9 13 -#define M19_A4_10 14 -#define M19_A4_11 15 -#define M19_A4_12 16 -#define M19_A4_13 17 -#define M19_A4_14 18 -#define M19_A4_15 0 -#define M19_A4_16 1 -#define M19_A4_17 2 -#define M19_A4_18 3 - -#define ACC_a(i) ACC_a_(i) -#define ACC_a_(i) a ## i -#define ACC_atmp(i) ACC_atmp_(i) -#define ACC_atmp_(i) atmp ## i - -#define MILL1(i) (atmp ## i = a ## i ^ T(ACC_a(M19_A1(i)) \ - | ~ACC_a(M19_A2(i)))) -#define MILL2(i) (a ## i = ROR(ACC_atmp(M19_T7(i)), ((i * (i + 1)) >> 1))) -#define MILL3(i) (atmp ## i = a ## i ^ ACC_a(M19_A1(i)) ^ ACC_a(M19_A4(i))) -#define MILL4(i) (a ## i = atmp ## i ^ (i == 0)) - -#define MILL do { \ - WT DECL19(atmp); \ - MUL19(MILL1); \ - MUL19(MILL2); \ - MUL19(MILL3); \ - MUL19(MILL4); \ - } while (0) - -#define DECL13(b) b ## 0 ## _0, b ## 0 ## _1, b ## 0 ## _2, \ - b ## 1 ## _0, b ## 1 ## _1, b ## 1 ## _2, \ - b ## 2 ## _0, b ## 2 ## _1, b ## 2 ## _2, \ - b ## 3 ## _0, b ## 3 ## _1, b ## 3 ## _2, \ - b ## 4 ## _0, b ## 4 ## _1, b ## 4 ## _2, \ - b ## 5 ## _0, b ## 5 ## _1, b ## 5 ## _2, \ - b ## 6 ## _0, b ## 6 ## _1, b ## 6 ## _2, \ - b ## 7 ## _0, b ## 7 ## _1, b ## 7 ## _2, \ - b ## 8 ## _0, b ## 8 ## _1, b ## 8 ## _2, \ - b ## 9 ## _0, b ## 9 ## _1, b ## 9 ## _2, \ - b ## 10 
## _0, b ## 10 ## _1, b ## 10 ## _2, \ - b ## 11 ## _0, b ## 11 ## _1, b ## 11 ## _2, \ - b ## 12 ## _0, b ## 12 ## _1, b ## 12 ## _2 - -#define M13_A(i, j) M13_A_(i, j) -#define M13_A_(i, j) M13_A_ ## i ## _ ## j -#define M13_A_0_0 0 -#define M13_A_0_1 1 -#define M13_A_0_2 2 -#define M13_A_0_3 3 -#define M13_A_0_4 4 -#define M13_A_0_5 5 -#define M13_A_0_6 6 -#define M13_A_0_7 7 -#define M13_A_0_8 8 -#define M13_A_0_9 9 -#define M13_A_0_10 10 -#define M13_A_0_11 11 -#define M13_A_0_12 12 -#define M13_A_1_0 1 -#define M13_A_1_1 2 -#define M13_A_1_2 3 -#define M13_A_1_3 4 -#define M13_A_1_4 5 -#define M13_A_1_5 6 -#define M13_A_1_6 7 -#define M13_A_1_7 8 -#define M13_A_1_8 9 -#define M13_A_1_9 10 -#define M13_A_1_10 11 -#define M13_A_1_11 12 -#define M13_A_1_12 0 -#define M13_A_2_0 2 -#define M13_A_2_1 3 -#define M13_A_2_2 4 -#define M13_A_2_3 5 -#define M13_A_2_4 6 -#define M13_A_2_5 7 -#define M13_A_2_6 8 -#define M13_A_2_7 9 -#define M13_A_2_8 10 -#define M13_A_2_9 11 -#define M13_A_2_10 12 -#define M13_A_2_11 0 -#define M13_A_2_12 1 -#define M13_A_3_0 3 -#define M13_A_3_1 4 -#define M13_A_3_2 5 -#define M13_A_3_3 6 -#define M13_A_3_4 7 -#define M13_A_3_5 8 -#define M13_A_3_6 9 -#define M13_A_3_7 10 -#define M13_A_3_8 11 -#define M13_A_3_9 12 -#define M13_A_3_10 0 -#define M13_A_3_11 1 -#define M13_A_3_12 2 -#define M13_A_4_0 4 -#define M13_A_4_1 5 -#define M13_A_4_2 6 -#define M13_A_4_3 7 -#define M13_A_4_4 8 -#define M13_A_4_5 9 -#define M13_A_4_6 10 -#define M13_A_4_7 11 -#define M13_A_4_8 12 -#define M13_A_4_9 0 -#define M13_A_4_10 1 -#define M13_A_4_11 2 -#define M13_A_4_12 3 -#define M13_A_5_0 5 -#define M13_A_5_1 6 -#define M13_A_5_2 7 -#define M13_A_5_3 8 -#define M13_A_5_4 9 -#define M13_A_5_5 10 -#define M13_A_5_6 11 -#define M13_A_5_7 12 -#define M13_A_5_8 0 -#define M13_A_5_9 1 -#define M13_A_5_10 2 -#define M13_A_5_11 3 -#define M13_A_5_12 4 -#define M13_A_6_0 6 -#define M13_A_6_1 7 -#define M13_A_6_2 8 -#define M13_A_6_3 9 -#define M13_A_6_4 10 
-#define M13_A_6_5 11 -#define M13_A_6_6 12 -#define M13_A_6_7 0 -#define M13_A_6_8 1 -#define M13_A_6_9 2 -#define M13_A_6_10 3 -#define M13_A_6_11 4 -#define M13_A_6_12 5 -#define M13_A_7_0 7 -#define M13_A_7_1 8 -#define M13_A_7_2 9 -#define M13_A_7_3 10 -#define M13_A_7_4 11 -#define M13_A_7_5 12 -#define M13_A_7_6 0 -#define M13_A_7_7 1 -#define M13_A_7_8 2 -#define M13_A_7_9 3 -#define M13_A_7_10 4 -#define M13_A_7_11 5 -#define M13_A_7_12 6 -#define M13_A_8_0 8 -#define M13_A_8_1 9 -#define M13_A_8_2 10 -#define M13_A_8_3 11 -#define M13_A_8_4 12 -#define M13_A_8_5 0 -#define M13_A_8_6 1 -#define M13_A_8_7 2 -#define M13_A_8_8 3 -#define M13_A_8_9 4 -#define M13_A_8_10 5 -#define M13_A_8_11 6 -#define M13_A_8_12 7 -#define M13_A_9_0 9 -#define M13_A_9_1 10 -#define M13_A_9_2 11 -#define M13_A_9_3 12 -#define M13_A_9_4 0 -#define M13_A_9_5 1 -#define M13_A_9_6 2 -#define M13_A_9_7 3 -#define M13_A_9_8 4 -#define M13_A_9_9 5 -#define M13_A_9_10 6 -#define M13_A_9_11 7 -#define M13_A_9_12 8 -#define M13_A_10_0 10 -#define M13_A_10_1 11 -#define M13_A_10_2 12 -#define M13_A_10_3 0 -#define M13_A_10_4 1 -#define M13_A_10_5 2 -#define M13_A_10_6 3 -#define M13_A_10_7 4 -#define M13_A_10_8 5 -#define M13_A_10_9 6 -#define M13_A_10_10 7 -#define M13_A_10_11 8 -#define M13_A_10_12 9 -#define M13_A_11_0 11 -#define M13_A_11_1 12 -#define M13_A_11_2 0 -#define M13_A_11_3 1 -#define M13_A_11_4 2 -#define M13_A_11_5 3 -#define M13_A_11_6 4 -#define M13_A_11_7 5 -#define M13_A_11_8 6 -#define M13_A_11_9 7 -#define M13_A_11_10 8 -#define M13_A_11_11 9 -#define M13_A_11_12 10 -#define M13_A_12_0 12 -#define M13_A_12_1 0 -#define M13_A_12_2 1 -#define M13_A_12_3 2 -#define M13_A_12_4 3 -#define M13_A_12_5 4 -#define M13_A_12_6 5 -#define M13_A_12_7 6 -#define M13_A_12_8 7 -#define M13_A_12_9 8 -#define M13_A_12_10 9 -#define M13_A_12_11 10 -#define M13_A_12_12 11 - -#define M13_N(i) M13_N_(i) -#define M13_N_(i) M13_N_ ## i -#define M13_N_0 12 -#define M13_N_1 11 -#define 
M13_N_2 10 -#define M13_N_3 9 -#define M13_N_4 8 -#define M13_N_5 7 -#define M13_N_6 6 -#define M13_N_7 5 -#define M13_N_8 4 -#define M13_N_9 3 -#define M13_N_10 2 -#define M13_N_11 1 -#define M13_N_12 0 - -#define ACC_b(i, k) ACC_b_(i, k) -#define ACC_b_(i, k) b ## i ## _ ## k - -#define ROUND_ELT(k, s) do { \ - if ((bj += 3) == 39) \ - bj = 0; \ - sc->b[bj + s] ^= a ## k; \ - } while (0) - -#define ROUND_SF(j) do { \ - size_t bj = (j) * 3; \ - ROUND_ELT(1, 0); \ - ROUND_ELT(2, 1); \ - ROUND_ELT(3, 2); \ - ROUND_ELT(4, 0); \ - ROUND_ELT(5, 1); \ - ROUND_ELT(6, 2); \ - ROUND_ELT(7, 0); \ - ROUND_ELT(8, 1); \ - ROUND_ELT(9, 2); \ - ROUND_ELT(10, 0); \ - ROUND_ELT(11, 1); \ - ROUND_ELT(12, 2); \ - MILL; \ - bj = (j) * 3; \ - a ## 13 ^= sc->b[bj + 0]; \ - a ## 14 ^= sc->b[bj + 1]; \ - a ## 15 ^= sc->b[bj + 2]; \ - } while (0) - -#define INPUT_SF(j, p0, p1, p2) do { \ - size_t bj = ((j) + 1) * 3; \ - if (bj == 39) \ - bj = 0; \ - sc->b[bj + 0] ^= (p0); \ - sc->b[bj + 1] ^= (p1); \ - sc->b[bj + 2] ^= (p2); \ - a16 ^= (p0); \ - a17 ^= (p1); \ - a18 ^= (p2); \ - } while (0) - - -#if SPH_SMALL_FOOTPRINT_RADIOGATUN - -#define ROUND ROUND_SF -#define INPUT INPUT_SF - -#else - -/* - * Round function R, on base j. The value j is such that B[0] is actually - * b[j] after the initial rotation. On the 13-round macro, j has the - * successive values 12, 11, 10... 1, 0. 
- */ -#define ROUND(j) do { \ - ACC_b(M13_A(1, j), 0) ^= a ## 1; \ - ACC_b(M13_A(2, j), 1) ^= a ## 2; \ - ACC_b(M13_A(3, j), 2) ^= a ## 3; \ - ACC_b(M13_A(4, j), 0) ^= a ## 4; \ - ACC_b(M13_A(5, j), 1) ^= a ## 5; \ - ACC_b(M13_A(6, j), 2) ^= a ## 6; \ - ACC_b(M13_A(7, j), 0) ^= a ## 7; \ - ACC_b(M13_A(8, j), 1) ^= a ## 8; \ - ACC_b(M13_A(9, j), 2) ^= a ## 9; \ - ACC_b(M13_A(10, j), 0) ^= a ## 10; \ - ACC_b(M13_A(11, j), 1) ^= a ## 11; \ - ACC_b(M13_A(12, j), 2) ^= a ## 12; \ - MILL; \ - a ## 13 ^= ACC_b(j, 0); \ - a ## 14 ^= ACC_b(j, 1); \ - a ## 15 ^= ACC_b(j, 2); \ - } while (0) - -#define INPUT(j, p0, p1, p2) do { \ - ACC_b(M13_A(1, j), 0) ^= (p0); \ - ACC_b(M13_A(1, j), 1) ^= (p1); \ - ACC_b(M13_A(1, j), 2) ^= (p2); \ - a16 ^= (p0); \ - a17 ^= (p1); \ - a18 ^= (p2); \ - } while (0) - -#endif - -#define MUL13(action) do { \ - action(0); \ - action(1); \ - action(2); \ - action(3); \ - action(4); \ - action(5); \ - action(6); \ - action(7); \ - action(8); \ - action(9); \ - action(10); \ - action(11); \ - action(12); \ - } while (0) - -#define MILL_READ_ELT(i) do { \ - a ## i = sc->a[i]; \ - } while (0) - -#define MILL_WRITE_ELT(i) do { \ - sc->a[i] = a ## i; \ - } while (0) - -#define STATE_READ_SF do { \ - MUL19(MILL_READ_ELT); \ - } while (0) - -#define STATE_WRITE_SF do { \ - MUL19(MILL_WRITE_ELT); \ - } while (0) - -#define PUSH13_SF do { \ - WT DECL19(a); \ - const unsigned char *buf; \ - \ - buf = data; \ - STATE_READ_SF; \ - while (len >= sizeof sc->data) { \ - size_t mk; \ - for (mk = 13; mk > 0; mk --) { \ - WT p0 = INW(0, 0); \ - WT p1 = INW(0, 1); \ - WT p2 = INW(0, 2); \ - INPUT_SF(mk - 1, p0, p1, p2); \ - ROUND_SF(mk - 1); \ - buf += (sizeof sc->data) / 13; \ - len -= (sizeof sc->data) / 13; \ - } \ - } \ - STATE_WRITE_SF; \ - return len; \ - } while (0) - -#if SPH_SMALL_FOOTPRINT_RADIOGATUN - -#define STATE_READ STATE_READ_SF -#define STATE_WRITE STATE_WRITE_SF -#define PUSH13 PUSH13_SF - -#else - -#define BELT_READ_ELT(i) do { \ - b ## i ## _0 = 
sc->b[3 * i + 0]; \ - b ## i ## _1 = sc->b[3 * i + 1]; \ - b ## i ## _2 = sc->b[3 * i + 2]; \ - } while (0) - -#define BELT_WRITE_ELT(i) do { \ - sc->b[3 * i + 0] = b ## i ## _0; \ - sc->b[3 * i + 1] = b ## i ## _1; \ - sc->b[3 * i + 2] = b ## i ## _2; \ - } while (0) - -#define STATE_READ do { \ - MUL13(BELT_READ_ELT); \ - MUL19(MILL_READ_ELT); \ - } while (0) - -#define STATE_WRITE do { \ - MUL13(BELT_WRITE_ELT); \ - MUL19(MILL_WRITE_ELT); \ - } while (0) - -/* - * Input data by chunks of 13*3 blocks. This is the body of the - * radiogatun32_push13() and radiogatun64_push13() functions. - */ -#define PUSH13 do { \ - WT DECL19(a), DECL13(b); \ - const unsigned char *buf; \ - \ - buf = data; \ - STATE_READ; \ - while (len >= sizeof sc->data) { \ - WT p0, p1, p2; \ - MUL13(PUSH13_ELT); \ - buf += sizeof sc->data; \ - len -= sizeof sc->data; \ - } \ - STATE_WRITE; \ - return len; \ - } while (0) - -#define PUSH13_ELT(k) do { \ - p0 = INW(k, 0); \ - p1 = INW(k, 1); \ - p2 = INW(k, 2); \ - INPUT(M13_N(k), p0, p1, p2); \ - ROUND(M13_N(k)); \ - } while (0) - -#endif - -#define BLANK13_SF do { \ - size_t mk = 13; \ - while (mk -- > 0) \ - ROUND_SF(mk); \ - } while (0) - -#define BLANK1_SF do { \ - WT tmp0, tmp1, tmp2; \ - ROUND_SF(12); \ - tmp0 = sc->b[36]; \ - tmp1 = sc->b[37]; \ - tmp2 = sc->b[38]; \ - memmove(sc->b + 3, sc->b, 36 * sizeof sc->b[0]); \ - sc->b[0] = tmp0; \ - sc->b[1] = tmp1; \ - sc->b[2] = tmp2; \ - } while (0) - -#if SPH_SMALL_FOOTPRINT_RADIOGATUN - -#define BLANK13 BLANK13_SF -#define BLANK1 BLANK1_SF - -#else - -/* - * Run 13 blank rounds. This macro expects the "a" and "b" state variables - * to be alread declared. 
- */ -#define BLANK13 MUL13(BLANK13_ELT) - -#define BLANK13_ELT(k) ROUND(M13_N(k)) - -#define MUL12(action) do { \ - action(0); \ - action(1); \ - action(2); \ - action(3); \ - action(4); \ - action(5); \ - action(6); \ - action(7); \ - action(8); \ - action(9); \ - action(10); \ - action(11); \ - } while (0) - -/* - * Run a single blank round, and physically rotate the belt. This is used - * for the last blank rounds, and the output rounds. This macro expects the - * "a" abd "b" state variables to be already declared. - */ -#define BLANK1 do { \ - WT tmp0, tmp1, tmp2; \ - ROUND(12); \ - tmp0 = b0_0; \ - tmp1 = b0_1; \ - tmp2 = b0_2; \ - MUL12(BLANK1_ELT); \ - b1_0 = tmp0; \ - b1_1 = tmp1; \ - b1_2 = tmp2; \ - } while (0) - -#define BLANK1_ELT(i) do { \ - ACC_b(M13_A(M13_N(i), 1), 0) = ACC_b(M13_N(i), 0); \ - ACC_b(M13_A(M13_N(i), 1), 1) = ACC_b(M13_N(i), 1); \ - ACC_b(M13_A(M13_N(i), 1), 2) = ACC_b(M13_N(i), 2); \ - } while (0) - -#endif - -#define NO_TOKEN - -/* - * Perform padding, then blank rounds, then output some words. This is - * the body of sph_radiogatun32_close() and sph_radiogatun64_close(). 
- */ -#define CLOSE_SF(width) CLOSE_GEN(width, \ - NO_TOKEN, STATE_READ_SF, BLANK1_SF, BLANK13_SF) - -#if SPH_SMALL_FOOTPRINT_RADIOGATUN -#define CLOSE CLOSE_SF -#else -#define CLOSE(width) CLOSE_GEN(width, \ - WT DECL13(b);, STATE_READ, BLANK1, BLANK13) -#endif - -#define CLOSE_GEN(width, WTb13, state_read, blank1, blank13) do { \ - unsigned ptr, num; \ - unsigned char *out; \ - WT DECL19(a); \ - WTb13 \ - \ - ptr = sc->data_ptr; \ - sc->data[ptr ++] = 0x01; \ - memset(sc->data + ptr, 0, (sizeof sc->data) - ptr); \ - radiogatun ## width ## _push13(sc, sc->data, sizeof sc->data); \ - \ - num = 17; \ - for (;;) { \ - ptr += 3 * (width >> 3); \ - if (ptr > sizeof sc->data) \ - break; \ - num --; \ - } \ - \ - state_read; \ - if (num >= 13) { \ - blank13; \ - num -= 13; \ - } \ - while (num -- > 0) \ - blank1; \ - \ - num = 0; \ - out = dst; \ - for (;;) { \ - OUTW(out, a1); \ - out += width >> 3; \ - OUTW(out, a2); \ - out += width >> 3; \ - num += 2 * (width >> 3); \ - if (num >= 32) \ - break; \ - blank1; \ - } \ - INIT; \ - } while (0) - -/* - * Initialize context structure. - */ -#if SPH_LITTLE_ENDIAN || SPH_BIG_ENDIAN - -#define INIT do { \ - memset(sc->a, 0, sizeof sc->a); \ - memset(sc->b, 0, sizeof sc->b); \ - sc->data_ptr = 0; \ - } while (0) - -#else - -#define INIT do { \ - size_t u; \ - for (u = 0; u < 19; u ++) \ - sc->a[u] = 0; \ - for (u = 0; u < 39; u ++) \ - sc->b[u] = 0; \ - sc->data_ptr = 0; \ - } while (0) - -#endif - -/* ======================================================================= */ -/* - * RadioGatun[32]. - */ - -#if !SPH_NO_RG32 - -#undef WT -#define WT sph_u32 -#undef T -#define T SPH_T32 -#undef ROR -#define ROR(x, n) SPH_T32(((x) << ((32 - (n)) & 31)) | ((x) >> ((n) & 31))) -#undef INW -#define INW(i, j) sph_dec32le_aligned(buf + (4 * (3 * (i) + (j)))) -#undef OUTW -#define OUTW(b, v) sph_enc32le(b, v) - -/* - * Insert data by big chunks of 13*12 = 156 bytes. 
Returned value is the - * number of remaining bytes (between 0 and 155). This method assumes that - * the input data is suitably aligned. - */ -static size_t -radiogatun32_push13(sph_radiogatun32_context *sc, const void *data, size_t len) -{ - PUSH13; -} - -/* see sph_radiogatun.h */ -void -sph_radiogatun32_init(void *cc) -{ - sph_radiogatun32_context *sc; - - sc = cc; - INIT; -} - -#ifdef SPH_UPTR -static void -radiogatun32_short(void *cc, const void *data, size_t len) -#else -/* see sph_radiogatun.h */ -void -sph_radiogatun32(void *cc, const void *data, size_t len) -#endif -{ - sph_radiogatun32_context *sc; - unsigned ptr; - - sc = cc; - ptr = sc->data_ptr; - while (len > 0) { - size_t clen; - - clen = (sizeof sc->data) - ptr; - if (clen > len) - clen = len; - memcpy(sc->data + ptr, data, clen); - data = (const unsigned char *)data + clen; - len -= clen; - ptr += clen; - if (ptr == sizeof sc->data) { - radiogatun32_push13(sc, sc->data, sizeof sc->data); - ptr = 0; - } - } - sc->data_ptr = ptr; -} - -#ifdef SPH_UPTR -/* see sph_radiogatun.h */ -void -sph_radiogatun32(void *cc, const void *data, size_t len) -{ - sph_radiogatun32_context *sc; - unsigned ptr; - size_t rlen; - - if (len < (2 * sizeof sc->data)) { - radiogatun32_short(cc, data, len); - return; - } - sc = cc; - ptr = sc->data_ptr; - if (ptr > 0) { - unsigned t; - - t = (sizeof sc->data) - ptr; - radiogatun32_short(sc, data, t); - data = (const unsigned char *)data + t; - len -= t; - } -#if !SPH_UNALIGNED - if (((SPH_UPTR)data & 3) != 0) { - radiogatun32_short(sc, data, len); - return; - } -#endif - rlen = radiogatun32_push13(sc, data, len); - memcpy(sc->data, (const unsigned char *)data + len - rlen, rlen); - sc->data_ptr = rlen; -} -#endif - -/* see sph_radiogatun.h */ -void -sph_radiogatun32_close(void *cc, void *dst) -{ - sph_radiogatun32_context *sc; - - sc = cc; - CLOSE(32); -} - -#endif - -/* ======================================================================= */ -/* - * RadioGatun[64]. 
Compiled only if a 64-bit or more type is available. - */ - -#if SPH_64 - -#if !SPH_NO_RG64 - -#undef WT -#define WT sph_u64 -#undef T -#define T SPH_T64 -#undef ROR -#define ROR(x, n) SPH_T64(((x) << ((64 - (n)) & 63)) | ((x) >> ((n) & 63))) -#undef INW -#define INW(i, j) sph_dec64le_aligned(buf + (8 * (3 * (i) + (j)))) -#undef OUTW -#define OUTW(b, v) sph_enc64le(b, v) - -/* - * On 32-bit x86, register pressure is such that using the small - * footprint version is a net gain (x2 speed), because that variant - * uses fewer local variables. - */ -#if SPH_I386_MSVC || SPH_I386_GCC || defined __i386__ -#undef PUSH13 -#define PUSH13 PUSH13_SF -#undef CLOSE -#define CLOSE CLOSE_SF -#endif - -/* - * Insert data by big chunks of 13*24 = 312 bytes. Returned value is the - * number of remaining bytes (between 0 and 311). This method assumes that - * the input data is suitably aligned. - */ -static size_t -radiogatun64_push13(sph_radiogatun64_context *sc, const void *data, size_t len) -{ - PUSH13; -} - -/* see sph_radiogatun.h */ -void -sph_radiogatun64_init(void *cc) -{ - sph_radiogatun64_context *sc; - - sc = cc; - INIT; -} - -#ifdef SPH_UPTR -static void -radiogatun64_short(void *cc, const void *data, size_t len) -#else -/* see sph_radiogatun.h */ -void -sph_radiogatun64(void *cc, const void *data, size_t len) -#endif -{ - sph_radiogatun64_context *sc; - unsigned ptr; - - sc = cc; - ptr = sc->data_ptr; - while (len > 0) { - size_t clen; - - clen = (sizeof sc->data) - ptr; - if (clen > len) - clen = len; - memcpy(sc->data + ptr, data, clen); - data = (const unsigned char *)data + clen; - len -= clen; - ptr += clen; - if (ptr == sizeof sc->data) { - radiogatun64_push13(sc, sc->data, sizeof sc->data); - ptr = 0; - } - } - sc->data_ptr = ptr; -} - -#ifdef SPH_UPTR -/* see sph_radiogatun.h */ -void -sph_radiogatun64(void *cc, const void *data, size_t len) -{ - sph_radiogatun64_context *sc; - unsigned ptr; - size_t rlen; - - if (len < (2 * sizeof sc->data)) { - 
radiogatun64_short(cc, data, len); - return; - } - sc = cc; - ptr = sc->data_ptr; - if (ptr > 0) { - unsigned t; - - t = (sizeof sc->data) - ptr; - radiogatun64_short(sc, data, t); - data = (const unsigned char *)data + t; - len -= t; - } -#if !SPH_UNALIGNED - if (((SPH_UPTR)data & 7) != 0) { - radiogatun64_short(sc, data, len); - return; - } -#endif - rlen = radiogatun64_push13(sc, data, len); - memcpy(sc->data, (const unsigned char *)data + len - rlen, rlen); - sc->data_ptr = rlen; -} -#endif - -/* see sph_radiogatun.h */ -void -sph_radiogatun64_close(void *cc, void *dst) -{ - sph_radiogatun64_context *sc; - - sc = cc; - CLOSE(64); -} - -#endif - -#endif diff --git a/algo/radiogatun/sph_radiogatun.h b/algo/radiogatun/sph_radiogatun.h deleted file mode 100644 index 4e3888c..0000000 --- a/algo/radiogatun/sph_radiogatun.h +++ /dev/null @@ -1,186 +0,0 @@ -/* $Id: sph_radiogatun.h 226 2010-06-16 17:28:08Z tp $ */ -/** - * RadioGatun interface. - * - * RadioGatun has been published in: G. Bertoni, J. Daemen, M. Peeters - * and G. Van Assche, "RadioGatun, a belt-and-mill hash function", - * presented at the Second Cryptographic Hash Workshop, Santa Barbara, - * August 24-25, 2006. The main Web site, containing that article, the - * reference code and some test vectors, appears to be currently located - * at the following URL: http://radiogatun.noekeon.org/ - * - * The presentation article does not specify endianness or padding. The - * reference code uses the following conventions, which we also apply - * here: - *
    - *
  • The input message is an integral number of sequences of three - * words. Each word is either a 32-bit of 64-bit word (depending on - * the version of RadioGatun).
  • - *
  • Input bytes are decoded into words using little-endian - * convention.
  • - *
  • Padding consists of a single bit of value 1, using little-endian - * convention within bytes (i.e. for a byte-oriented input, a single - * byte of value 0x01 is appended), then enough bits of value 0 to finish - * the current block.
  • - *
  • Output consists of 256 bits. Successive output words are encoded - * with little-endian convention.
  • - *
- * These conventions are very close to those we use for PANAMA, which is - * a close ancestor or RadioGatun. - * - * RadioGatun is actually a family of functions, depending on some - * internal parameters. We implement here two functions, with a "belt - * length" of 13, a "belt width" of 3, and a "mill length" of 19. The - * RadioGatun[32] version uses 32-bit words, while the RadioGatun[64] - * variant uses 64-bit words. - * - * Strictly speaking, the name "RadioGatun" should use an acute accent - * on the "u", which we omitted here to keep strict ASCII-compatibility - * of this file. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @file sph_radiogatun.h - * @author Thomas Pornin - */ - -#ifndef SPH_RADIOGATUN_H__ -#define SPH_RADIOGATUN_H__ - -#include -#include "algo/sha/sph_types.h" - -/** - * Output size (in bits) for RadioGatun[32]. - */ -#define SPH_SIZE_radiogatun32 256 - -/** - * This structure is a context for RadioGatun[32] computations: it - * contains intermediate values and some data from the last entered - * block. Once a RadioGatun[32] computation has been performed, the - * context can be reused for another computation. - * - * The contents of this structure are private. A running RadioGatun[32] - * computation can be cloned by copying the context (e.g. with a - * simple memcpy()). - */ -typedef struct { -#ifndef DOXYGEN_IGNORE - unsigned char data[156]; /* first field, for alignment */ - unsigned data_ptr; - sph_u32 a[19], b[39]; -#endif -} sph_radiogatun32_context; - -/** - * Initialize a RadioGatun[32] context. This process performs no - * memory allocation. - * - * @param cc the RadioGatun[32] context (pointer to a - * sph_radiogatun32_context) - */ -void sph_radiogatun32_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). - * - * @param cc the RadioGatun[32] context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_radiogatun32(void *cc, const void *data, size_t len); - -/** - * Terminate the current RadioGatun[32] computation and output the - * result into the provided buffer. The destination buffer must be wide - * enough to accomodate the result (32 bytes). The context is - * automatically reinitialized. - * - * @param cc the RadioGatun[32] context - * @param dst the destination buffer - */ -void sph_radiogatun32_close(void *cc, void *dst); - -#if SPH_64 - -/** - * Output size (in bits) for RadioGatun[64]. 
- */ -#define SPH_SIZE_radiogatun64 256 - -/** - * This structure is a context for RadioGatun[64] computations: it - * contains intermediate values and some data from the last entered - * block. Once a RadioGatun[64] computation has been performed, the - * context can be reused for another computation. - * - * The contents of this structure are private. A running RadioGatun[64] - * computation can be cloned by copying the context (e.g. with a - * simple memcpy()). - */ -typedef struct { -#ifndef DOXYGEN_IGNORE - unsigned char data[312]; /* first field, for alignment */ - unsigned data_ptr; - sph_u64 a[19], b[39]; -#endif -} sph_radiogatun64_context; - -/** - * Initialize a RadioGatun[64] context. This process performs no - * memory allocation. - * - * @param cc the RadioGatun[64] context (pointer to a - * sph_radiogatun64_context) - */ -void sph_radiogatun64_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). - * - * @param cc the RadioGatun[64] context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_radiogatun64(void *cc, const void *data, size_t len); - -/** - * Terminate the current RadioGatun[64] computation and output the - * result into the provided buffer. The destination buffer must be wide - * enough to accomodate the result (32 bytes). The context is - * automatically reinitialized. 
- * - * @param cc the RadioGatun[64] context - * @param dst the destination buffer - */ -void sph_radiogatun64_close(void *cc, void *dst); - -#endif - -#endif diff --git a/algo/ripemd/lbry-4way.c b/algo/ripemd/lbry-4way.c deleted file mode 100644 index 9ac5f53..0000000 --- a/algo/ripemd/lbry-4way.c +++ /dev/null @@ -1,140 +0,0 @@ -#include "lbry-gate.h" -#include -#include -#include -#include -#include "algo/sha/sha2-hash-4way.h" -#include "ripemd-hash-4way.h" - -#define LBRY_INPUT_SIZE 112 -#define LBRY_MIDSTATE 64 -#define LBRY_TAIL (LBRY_INPUT_SIZE) - (LBRY_MIDSTATE) - -#if defined(LBRY_8WAY) - -static __thread sha256_8way_context sha256_8w_mid; - -void lbry_8way_hash( void* output, const void* input ) -{ - uint32_t _ALIGN(64) vhashA[16<<3]; - uint32_t _ALIGN(64) vhashB[16<<3]; - uint32_t _ALIGN(64) vhashC[16<<3]; - uint32_t _ALIGN(32) h0[32]; - uint32_t _ALIGN(32) h1[32]; - uint32_t _ALIGN(32) h2[32]; - uint32_t _ALIGN(32) h3[32]; - uint32_t _ALIGN(32) h4[32]; - uint32_t _ALIGN(32) h5[32]; - uint32_t _ALIGN(32) h6[32]; - uint32_t _ALIGN(32) h7[32]; - sha256_8way_context ctx_sha256 __attribute__ ((aligned (64))); - sha512_4way_context ctx_sha512; - ripemd160_8way_context ctx_ripemd; - - memcpy( &ctx_sha256, &sha256_8w_mid, sizeof(ctx_sha256) ); - sha256_8way( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL ); - sha256_8way_close( &ctx_sha256, vhashA ); - - sha256_8way_init( &ctx_sha256 ); - sha256_8way( &ctx_sha256, vhashA, 32 ); - sha256_8way_close( &ctx_sha256, vhashA ); - - // reinterleave to do sha512 4-way 64 bit twice. 
- dintrlv_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 256 ); - intrlv_4x64( vhashA, h0, h1, h2, h3, 256 ); - intrlv_4x64( vhashB, h4, h5, h6, h7, 256 ); - - sha512_4way_init( &ctx_sha512 ); - sha512_4way( &ctx_sha512, vhashA, 32 ); - sha512_4way_close( &ctx_sha512, vhashA ); - - sha512_4way_init( &ctx_sha512 ); - sha512_4way( &ctx_sha512, vhashB, 32 ); - sha512_4way_close( &ctx_sha512, vhashB ); - - // back to 8-way 32 bit - dintrlv_4x64( h0, h1, h2, h3, vhashA, 512 ); - dintrlv_4x64( h4, h5, h6, h7, vhashB, 512 ); - intrlv_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 ); - - ripemd160_8way_init( &ctx_ripemd ); - ripemd160_8way( &ctx_ripemd, vhashA, 32 ); - ripemd160_8way_close( &ctx_ripemd, vhashB ); - - ripemd160_8way_init( &ctx_ripemd ); - ripemd160_8way( &ctx_ripemd, vhashA+(8<<3), 32 ); - ripemd160_8way_close( &ctx_ripemd, vhashC ); - - sha256_8way_init( &ctx_sha256 ); - sha256_8way( &ctx_sha256, vhashB, 20 ); - sha256_8way( &ctx_sha256, vhashC, 20 ); - sha256_8way_close( &ctx_sha256, vhashA ); - - sha256_8way_init( &ctx_sha256 ); - sha256_8way( &ctx_sha256, vhashA, 32 ); - sha256_8way_close( &ctx_sha256, output ); -} - -int scanhash_lbry_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[8*8] __attribute__ ((aligned (64))); - uint32_t vdata[32*8] __attribute__ ((aligned (64))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *hash7 = &(hash[7<<3]); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[27]; - const uint32_t first_nonce = pdata[27]; - const uint32_t Htarg = ptarget[7]; - uint32_t edata[32] __attribute__ ((aligned (64))); - __m256i *noncev = (__m256i*)vdata + 27; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; - - // we need bigendian data... 
- casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) ); - casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) ); - casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) ); - intrlv_8x32( vdata, edata, edata, edata, edata, - edata, edata, edata, edata, 1024 ); - sha256_8way_init( &sha256_8w_mid ); - sha256_8way( &sha256_8w_mid, vdata, LBRY_MIDSTATE ); - - for ( int m = 0; m < sizeof(masks); m++ ) if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - *noncev = mm256_bswap_32( _mm256_set_epi32( - n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) ); - lbry_8way_hash( hash, vdata ); - - for ( int i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) ) - { - // deinterleave hash for lane - extr_lane_8x32( lane_hash, hash, i, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[27] = n + i; - submit_lane_solution( work, lane_hash, mythr, i ); - } - } - n += 8; - } while ( (n < max_nonce-10) && !work_restart[thr_id].restart ); - break; - } - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/ripemd/lbry-gate.c b/algo/ripemd/lbry-gate.c deleted file mode 100644 index 10b3c2f..0000000 --- a/algo/ripemd/lbry-gate.c +++ /dev/null @@ -1,129 +0,0 @@ -#include "lbry-gate.h" -#include -#include -#include -#include - -double lbry_calc_network_diff( struct work *work ) -{ - // sample for diff 43.281 : 1c05ea29 - // todo: endian reversed on longpoll could be zr5 specific... 
- - uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] ); - uint32_t bits = (nbits & 0xffffff); - int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28 - double d = (double)0x0000ffff / (double)bits; - - for (int m=shift; m < 29; m++) d *= 256.0; - for (int m=29; m < shift; m++) d /= 256.0; - if (opt_debug_diff) - applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits); - - return d; -} - -// std_le should work but it doesn't -void lbry_le_build_stratum_request( char *req, struct work *work, - struct stratum_ctx *sctx ) -{ - unsigned char *xnonce2str; - uint32_t ntime, nonce; - char ntimestr[9], noncestr[9]; - - le32enc( &ntime, work->data[ LBRY_NTIME_INDEX ] ); - le32enc( &nonce, work->data[ LBRY_NONCE_INDEX ] ); - bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) ); - bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) ); - xnonce2str = abin2hex( work->xnonce2, work->xnonce2_len); - snprintf( req, JSON_BUF_LEN, - "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", - rpc_user, work->job_id, xnonce2str, ntimestr, noncestr ); - free(xnonce2str); -} - -void lbry_build_block_header( struct work* g_work, uint32_t version, - uint32_t *prevhash, uint32_t *merkle_root, - uint32_t ntime, uint32_t nbits ) -{ - int i; - memset( g_work->data, 0, sizeof(g_work->data) ); - g_work->data[0] = version; - - if ( have_stratum ) - for ( i = 0; i < 8; i++ ) - g_work->data[1 + i] = le32dec( prevhash + i ); - else - for (i = 0; i < 8; i++) - g_work->data[ 8-i ] = le32dec( prevhash + i ); - - for ( i = 0; i < 8; i++ ) - g_work->data[9 + i] = be32dec( merkle_root + i ); - - g_work->data[ LBRY_NTIME_INDEX ] = ntime; - g_work->data[ LBRY_NBITS_INDEX ] = nbits; - g_work->data[28] = 0x80000000; -} - -void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) -{ - unsigned char merkle_root[64] = { 0 }; - size_t t; - int i; - - algo_gate.gen_merkle_root( merkle_root, sctx ); - // Increment extranonce2 - 
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); - // Assemble block header - - memset( g_work->data, 0, sizeof(g_work->data) ); - g_work->data[0] = le32dec( sctx->job.version ); - - for ( i = 0; i < 8; i++ ) - g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i ); - - for ( i = 0; i < 8; i++ ) - g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i ); - - for ( int i = 0; i < 8; i++ ) - g_work->data[17 + i] = ((uint32_t*)sctx->job.extra)[i]; - - g_work->data[ LBRY_NTIME_INDEX ] = le32dec(sctx->job.ntime); - g_work->data[ LBRY_NBITS_INDEX ] = le32dec(sctx->job.nbits); - g_work->data[28] = 0x80000000; -} - -void lbry_set_target( struct work* work, double job_diff ) -{ - work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); -} - -int64_t lbry_get_max64() { return 0x1ffffLL; } - -int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; } - -bool register_lbry_algo( algo_gate_t* gate ) -{ - gate->optimizations = AVX2_OPT | SHA_OPT; -#if defined (LBRY_8WAY) - gate->scanhash = (void*)&scanhash_lbry_8way; - gate->hash = (void*)&lbry_8way_hash; -#elif defined (LBRY_4WAY) - gate->scanhash = (void*)&scanhash_lbry_4way; - gate->hash = (void*)&lbry_4way_hash; -#else - gate->scanhash = (void*)&scanhash_lbry; - gate->hash = (void*)&lbry_hash; -#endif - gate->calc_network_diff = (void*)&lbry_calc_network_diff; - gate->get_max64 = (void*)&lbry_get_max64; - gate->build_stratum_request = (void*)&lbry_le_build_stratum_request; -// gate->build_block_header = (void*)&build_block_header; - gate->build_extraheader = (void*)&lbry_build_extraheader; - gate->set_target = (void*)&lbry_set_target; - gate->ntime_index = LBRY_NTIME_INDEX; - gate->nbits_index = LBRY_NBITS_INDEX; - gate->nonce_index = LBRY_NONCE_INDEX; - gate->get_work_data_size = (void*)&lbry_get_work_data_size; - return true; -} - diff --git a/algo/ripemd/lbry-gate.h b/algo/ripemd/lbry-gate.h deleted file mode 100644 index e6d9263..0000000 --- a/algo/ripemd/lbry-gate.h 
+++ /dev/null @@ -1,39 +0,0 @@ -#ifndef LBRY_GATE_H__ -#define LBRY_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if !defined(__SHA__) - #if defined(__AVX2__) - #define LBRY_8WAY - #endif -#endif - -#define LBRY_NTIME_INDEX 25 -#define LBRY_NBITS_INDEX 26 -#define LBRY_NONCE_INDEX 27 -#define LBRY_WORK_DATA_SIZE 192 -#define LBRY_WORK_CMP_SIZE 76 // same as default - -bool register_lbry_algo( algo_gate_t* gate ); - -#if defined(LBRY_8WAY) - -void lbry_8way_hash( void *state, const void *input ); -int scanhash_lbry_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -/* -#elif defined(LBRY_4WAY) - -void lbry_4way_hash( void *state, const void *input ); -int scanhash_lbry_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); -*/ -#else - -void lbry_hash( void *state, const void *input ); -int scanhash_lbry( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -#endif -#endif diff --git a/algo/ripemd/lbry.c b/algo/ripemd/lbry.c deleted file mode 100644 index 57f9a82..0000000 --- a/algo/ripemd/lbry.c +++ /dev/null @@ -1,117 +0,0 @@ -#include "lbry-gate.h" -#include -#include -#include -#include -#include "sph_ripemd.h" -#include - -void lbry_hash(void* output, const void* input) -{ - SHA256_CTX ctx_sha256 __attribute__ ((aligned (64))); - SHA512_CTX ctx_sha512 __attribute__ ((aligned (64))); - sph_ripemd160_context ctx_ripemd __attribute__ ((aligned (64))); - uint32_t _ALIGN(64) hashA[16]; - uint32_t _ALIGN(64) hashB[16]; - uint32_t _ALIGN(64) hashC[16]; - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, input, 112 ); - SHA256_Final( (unsigned char*) hashA, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, hashA, 32 ); - SHA256_Final( (unsigned char*) hashA, &ctx_sha256 ); - - SHA512_Init( &ctx_sha512 ); - SHA512_Update( &ctx_sha512, hashA, 32 ); - SHA512_Final( (unsigned char*) hashA, &ctx_sha512 ); - - sph_ripemd160_init( 
&ctx_ripemd ); - sph_ripemd160 ( &ctx_ripemd, hashA, 32 ); - sph_ripemd160_close( &ctx_ripemd, hashB ); - - sph_ripemd160_init( &ctx_ripemd ); - sph_ripemd160 ( &ctx_ripemd, hashA+8, 32 ); - sph_ripemd160_close( &ctx_ripemd, hashC ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, hashB, 20 ); - SHA256_Update( &ctx_sha256, hashC, 20 ); - SHA256_Final( (unsigned char*) hashA, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, hashA, 32 ); - SHA256_Final( (unsigned char*) hashA, &ctx_sha256 ); - - memcpy( output, hashA, 32 ); -} - -int scanhash_lbry( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[27] - 1; - const uint32_t first_nonce = pdata[27]; - const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - - uint32_t hash64[8] __attribute__((aligned(64))); - uint32_t endiandata[32] __attribute__ ((aligned (64))); - - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - // we need bigendian data... 
- swab32_array( endiandata, pdata, 32 ); - -#ifdef DEBUG_ALGO - printf("[%d] Htarg=%X\n", thr_id, Htarg); -#endif - for (int m=0; m < sizeof(masks); m++) { - if (Htarg <= htmax[m]) { - uint32_t mask = masks[m]; - do { - pdata[27] = ++n; - be32enc(&endiandata[27], n); - lbry_hash(hash64, &endiandata); -#ifndef DEBUG_ALGO - if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } -#else - if (!(n % 0x1000) && !thr_id) printf("."); - if (!(hash64[7] & mask)) { - printf("[%d]",thr_id); - if (fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } -#endif - } while (n < max_nonce && !work_restart[thr_id].restart); - // see blake.c if else to understand the loop on htmax => mask - break; - } - } - - *hashes_done = n - first_nonce + 1; - pdata[27] = n; - return 0; -} diff --git a/algo/ripemd/ripemd-hash-4way.c b/algo/ripemd/ripemd-hash-4way.c deleted file mode 100644 index 046e36d..0000000 --- a/algo/ripemd/ripemd-hash-4way.c +++ /dev/null @@ -1,622 +0,0 @@ -#include "ripemd-hash-4way.h" - -#if defined(__SSE4_2__) - -#include -#include - -static const uint32_t IV[5] = -{ 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 }; - -/* - * Round constants for RIPEMD-160. 
- */ -#define K11 0x00000000 -#define K12 0x5A827999 -#define K13 0x6ED9EBA1 -#define K14 0x8F1BBCDC -#define K15 0xA953FD4E - -#define K21 0x50A28BE6 -#define K22 0x5C4DD124 -#define K23 0x6D703EF3 -#define K24 0x7A6D76E9 -#define K25 0x00000000 - -// RIPEMD-160 4 way - -#define F1(x, y, z) \ - _mm_xor_si128( _mm_xor_si128( x, y ), z ) - -#define F2(x, y, z) \ - _mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z ) - -#define F3(x, y, z) \ - _mm_xor_si128( _mm_or_si128( x, mm128_not( y ) ), z ) - -#define F4(x, y, z) \ - _mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y ) - -#define F5(x, y, z) \ - _mm_xor_si128( x, _mm_or_si128( y, mm128_not( z ) ) ) - -#define RR(a, b, c, d, e, f, s, r, k) \ -do{ \ - a = _mm_add_epi32( mm128_rol_32( _mm_add_epi32( _mm_add_epi32( \ - _mm_add_epi32( a, f( b ,c, d ) ), r ), \ - _mm_set1_epi32( k ) ), s ), e ); \ - c = mm128_rol_32( c, 10 );\ -} while (0) - -#define ROUND1(a, b, c, d, e, f, s, r, k) \ - RR(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k) - -#define ROUND2(a, b, c, d, e, f, s, r, k) \ - RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k) - -static void ripemd160_4way_round( ripemd160_4way_context *sc ) -{ - const __m128i *in = (__m128i*)sc->buf; - __m128i *h = (__m128i*)sc->val; - register __m128i A1, B1, C1, D1, E1; - register __m128i A2, B2, C2, D2, E2; - __m128i tmp; - - A1 = A2 = h[0]; - B1 = B2 = h[1]; - C1 = C2 = h[2]; - D1 = D2 = h[3]; - E1 = E2 = h[4]; - - ROUND1( A, B, C, D, E, F1, 11, in[ 0], 1 ); - ROUND1( E, A, B, C, D, F1, 14, in[ 1], 1 ); - ROUND1( D, E, A, B, C, F1, 15, in[ 2], 1 ); - ROUND1( C, D, E, A, B, F1, 12, in[ 3], 1 ); - ROUND1( B, C, D, E, A, F1, 5, in[ 4], 1 ); - ROUND1( A, B, C, D, E, F1, 8, in[ 5], 1 ); - ROUND1( E, A, B, C, D, F1, 7, in[ 6], 1 ); - ROUND1( D, E, A, B, C, F1, 9, in[ 7], 1 ); - ROUND1( C, D, E, A, B, F1, 11, in[ 8], 1 ); - ROUND1( B, C, D, E, A, F1, 13, in[ 9], 1 ); - ROUND1( A, B, C, D, E, F1, 14, in[10], 1 ); - ROUND1( E, A, B, C, D, 
F1, 15, in[11], 1 ); - ROUND1( D, E, A, B, C, F1, 6, in[12], 1 ); - ROUND1( C, D, E, A, B, F1, 7, in[13], 1 ); - ROUND1( B, C, D, E, A, F1, 9, in[14], 1 ); - ROUND1( A, B, C, D, E, F1, 8, in[15], 1 ); - - ROUND1( E, A, B, C, D, F2, 7, in[ 7], 2 ); - ROUND1( D, E, A, B, C, F2, 6, in[ 4], 2 ); - ROUND1( C, D, E, A, B, F2, 8, in[13], 2 ); - ROUND1( B, C, D, E, A, F2, 13, in[ 1], 2 ); - ROUND1( A, B, C, D, E, F2, 11, in[10], 2 ); - ROUND1( E, A, B, C, D, F2, 9, in[ 6], 2 ); - ROUND1( D, E, A, B, C, F2, 7, in[15], 2 ); - ROUND1( C, D, E, A, B, F2, 15, in[ 3], 2 ); - ROUND1( B, C, D, E, A, F2, 7, in[12], 2 ); - ROUND1( A, B, C, D, E, F2, 12, in[ 0], 2 ); - ROUND1( E, A, B, C, D, F2, 15, in[ 9], 2 ); - ROUND1( D, E, A, B, C, F2, 9, in[ 5], 2 ); - ROUND1( C, D, E, A, B, F2, 11, in[ 2], 2 ); - ROUND1( B, C, D, E, A, F2, 7, in[14], 2 ); - ROUND1( A, B, C, D, E, F2, 13, in[11], 2 ); - ROUND1( E, A, B, C, D, F2, 12, in[ 8], 2 ); - - ROUND1( D, E, A, B, C, F3, 11, in[ 3], 3 ); - ROUND1( C, D, E, A, B, F3, 13, in[10], 3 ); - ROUND1( B, C, D, E, A, F3, 6, in[14], 3 ); - ROUND1( A, B, C, D, E, F3, 7, in[ 4], 3 ); - ROUND1( E, A, B, C, D, F3, 14, in[ 9], 3 ); - ROUND1( D, E, A, B, C, F3, 9, in[15], 3 ); - ROUND1( C, D, E, A, B, F3, 13, in[ 8], 3 ); - ROUND1( B, C, D, E, A, F3, 15, in[ 1], 3 ); - ROUND1( A, B, C, D, E, F3, 14, in[ 2], 3 ); - ROUND1( E, A, B, C, D, F3, 8, in[ 7], 3 ); - ROUND1( D, E, A, B, C, F3, 13, in[ 0], 3 ); - ROUND1( C, D, E, A, B, F3, 6, in[ 6], 3 ); - ROUND1( B, C, D, E, A, F3, 5, in[13], 3 ); - ROUND1( A, B, C, D, E, F3, 12, in[11], 3 ); - ROUND1( E, A, B, C, D, F3, 7, in[ 5], 3 ); - ROUND1( D, E, A, B, C, F3, 5, in[12], 3 ); - - ROUND1( C, D, E, A, B, F4, 11, in[ 1], 4 ); - ROUND1( B, C, D, E, A, F4, 12, in[ 9], 4 ); - ROUND1( A, B, C, D, E, F4, 14, in[11], 4 ); - ROUND1( E, A, B, C, D, F4, 15, in[10], 4 ); - ROUND1( D, E, A, B, C, F4, 14, in[ 0], 4 ); - ROUND1( C, D, E, A, B, F4, 15, in[ 8], 4 ); - ROUND1( B, C, D, E, A, F4, 9, in[12], 4 ); - ROUND1( A, B, 
C, D, E, F4, 8, in[ 4], 4 ); - ROUND1( E, A, B, C, D, F4, 9, in[13], 4 ); - ROUND1( D, E, A, B, C, F4, 14, in[ 3], 4 ); - ROUND1( C, D, E, A, B, F4, 5, in[ 7], 4 ); - ROUND1( B, C, D, E, A, F4, 6, in[15], 4 ); - ROUND1( A, B, C, D, E, F4, 8, in[14], 4 ); - ROUND1( E, A, B, C, D, F4, 6, in[ 5], 4 ); - ROUND1( D, E, A, B, C, F4, 5, in[ 6], 4 ); - ROUND1( C, D, E, A, B, F4, 12, in[ 2], 4 ); - - ROUND1( B, C, D, E, A, F5, 9, in[ 4], 5 ); - ROUND1( A, B, C, D, E, F5, 15, in[ 0], 5 ); - ROUND1( E, A, B, C, D, F5, 5, in[ 5], 5 ); - ROUND1( D, E, A, B, C, F5, 11, in[ 9], 5 ); - ROUND1( C, D, E, A, B, F5, 6, in[ 7], 5 ); - ROUND1( B, C, D, E, A, F5, 8, in[12], 5 ); - ROUND1( A, B, C, D, E, F5, 13, in[ 2], 5 ); - ROUND1( E, A, B, C, D, F5, 12, in[10], 5 ); - ROUND1( D, E, A, B, C, F5, 5, in[14], 5 ); - ROUND1( C, D, E, A, B, F5, 12, in[ 1], 5 ); - ROUND1( B, C, D, E, A, F5, 13, in[ 3], 5 ); - ROUND1( A, B, C, D, E, F5, 14, in[ 8], 5 ); - ROUND1( E, A, B, C, D, F5, 11, in[11], 5 ); - ROUND1( D, E, A, B, C, F5, 8, in[ 6], 5 ); - ROUND1( C, D, E, A, B, F5, 5, in[15], 5 ); - ROUND1( B, C, D, E, A, F5, 6, in[13], 5 ); - - ROUND2( A, B, C, D, E, F5, 8, in[ 5], 1 ); - ROUND2( E, A, B, C, D, F5, 9, in[14], 1 ); - ROUND2( D, E, A, B, C, F5, 9, in[ 7], 1 ); - ROUND2( C, D, E, A, B, F5, 11, in[ 0], 1 ); - ROUND2( B, C, D, E, A, F5, 13, in[ 9], 1 ); - ROUND2( A, B, C, D, E, F5, 15, in[ 2], 1 ); - ROUND2( E, A, B, C, D, F5, 15, in[11], 1 ); - ROUND2( D, E, A, B, C, F5, 5, in[ 4], 1 ); - ROUND2( C, D, E, A, B, F5, 7, in[13], 1 ); - ROUND2( B, C, D, E, A, F5, 7, in[ 6], 1 ); - ROUND2( A, B, C, D, E, F5, 8, in[15], 1 ); - ROUND2( E, A, B, C, D, F5, 11, in[ 8], 1 ); - ROUND2( D, E, A, B, C, F5, 14, in[ 1], 1 ); - ROUND2( C, D, E, A, B, F5, 14, in[10], 1 ); - ROUND2( B, C, D, E, A, F5, 12, in[ 3], 1 ); - ROUND2( A, B, C, D, E, F5, 6, in[12], 1 ); - - ROUND2( E, A, B, C, D, F4, 9, in[ 6], 2 ); - ROUND2( D, E, A, B, C, F4, 13, in[11], 2 ); - ROUND2( C, D, E, A, B, F4, 15, in[ 3], 2 ); - ROUND2( 
B, C, D, E, A, F4, 7, in[ 7], 2 ); - ROUND2( A, B, C, D, E, F4, 12, in[ 0], 2 ); - ROUND2( E, A, B, C, D, F4, 8, in[13], 2 ); - ROUND2( D, E, A, B, C, F4, 9, in[ 5], 2 ); - ROUND2( C, D, E, A, B, F4, 11, in[10], 2 ); - ROUND2( B, C, D, E, A, F4, 7, in[14], 2 ); - ROUND2( A, B, C, D, E, F4, 7, in[15], 2 ); - ROUND2( E, A, B, C, D, F4, 12, in[ 8], 2 ); - ROUND2( D, E, A, B, C, F4, 7, in[12], 2 ); - ROUND2( C, D, E, A, B, F4, 6, in[ 4], 2 ); - ROUND2( B, C, D, E, A, F4, 15, in[ 9], 2 ); - ROUND2( A, B, C, D, E, F4, 13, in[ 1], 2 ); - ROUND2( E, A, B, C, D, F4, 11, in[ 2], 2 ); - - ROUND2( D, E, A, B, C, F3, 9, in[15], 3 ); - ROUND2( C, D, E, A, B, F3, 7, in[ 5], 3 ); - ROUND2( B, C, D, E, A, F3, 15, in[ 1], 3 ); - ROUND2( A, B, C, D, E, F3, 11, in[ 3], 3 ); - ROUND2( E, A, B, C, D, F3, 8, in[ 7], 3 ); - ROUND2( D, E, A, B, C, F3, 6, in[14], 3 ); - ROUND2( C, D, E, A, B, F3, 6, in[ 6], 3 ); - ROUND2( B, C, D, E, A, F3, 14, in[ 9], 3 ); - ROUND2( A, B, C, D, E, F3, 12, in[11], 3 ); - ROUND2( E, A, B, C, D, F3, 13, in[ 8], 3 ); - ROUND2( D, E, A, B, C, F3, 5, in[12], 3 ); - ROUND2( C, D, E, A, B, F3, 14, in[ 2], 3 ); - ROUND2( B, C, D, E, A, F3, 13, in[10], 3 ); - ROUND2( A, B, C, D, E, F3, 13, in[ 0], 3 ); - ROUND2( E, A, B, C, D, F3, 7, in[ 4], 3 ); - ROUND2( D, E, A, B, C, F3, 5, in[13], 3 ); - - ROUND2( C, D, E, A, B, F2, 15, in[ 8], 4 ); - ROUND2( B, C, D, E, A, F2, 5, in[ 6], 4 ); - ROUND2( A, B, C, D, E, F2, 8, in[ 4], 4 ); - ROUND2( E, A, B, C, D, F2, 11, in[ 1], 4 ); - ROUND2( D, E, A, B, C, F2, 14, in[ 3], 4 ); - ROUND2( C, D, E, A, B, F2, 14, in[11], 4 ); - ROUND2( B, C, D, E, A, F2, 6, in[15], 4 ); - ROUND2( A, B, C, D, E, F2, 14, in[ 0], 4 ); - ROUND2( E, A, B, C, D, F2, 6, in[ 5], 4 ); - ROUND2( D, E, A, B, C, F2, 9, in[12], 4 ); - ROUND2( C, D, E, A, B, F2, 12, in[ 2], 4 ); - ROUND2( B, C, D, E, A, F2, 9, in[13], 4 ); - ROUND2( A, B, C, D, E, F2, 12, in[ 9], 4 ); - ROUND2( E, A, B, C, D, F2, 5, in[ 7], 4 ); - ROUND2( D, E, A, B, C, F2, 15, in[10], 4 ); - 
ROUND2( C, D, E, A, B, F2, 8, in[14], 4 ); - - ROUND2( B, C, D, E, A, F1, 8, in[12], 5 ); - ROUND2( A, B, C, D, E, F1, 5, in[15], 5 ); - ROUND2( E, A, B, C, D, F1, 12, in[10], 5 ); - ROUND2( D, E, A, B, C, F1, 9, in[ 4], 5 ); - ROUND2( C, D, E, A, B, F1, 12, in[ 1], 5 ); - ROUND2( B, C, D, E, A, F1, 5, in[ 5], 5 ); - ROUND2( A, B, C, D, E, F1, 14, in[ 8], 5 ); - ROUND2( E, A, B, C, D, F1, 6, in[ 7], 5 ); - ROUND2( D, E, A, B, C, F1, 8, in[ 6], 5 ); - ROUND2( C, D, E, A, B, F1, 13, in[ 2], 5 ); - ROUND2( B, C, D, E, A, F1, 6, in[13], 5 ); - ROUND2( A, B, C, D, E, F1, 5, in[14], 5 ); - ROUND2( E, A, B, C, D, F1, 15, in[ 0], 5 ); - ROUND2( D, E, A, B, C, F1, 13, in[ 3], 5 ); - ROUND2( C, D, E, A, B, F1, 11, in[ 9], 5 ); - ROUND2( B, C, D, E, A, F1, 11, in[11], 5 ); - - tmp = _mm_add_epi32( _mm_add_epi32( h[1], C1 ), D2 ); - h[1] = _mm_add_epi32( _mm_add_epi32( h[2], D1 ), E2 ); - h[2] = _mm_add_epi32( _mm_add_epi32( h[3], E1 ), A2 ); - h[3] = _mm_add_epi32( _mm_add_epi32( h[4], A1 ), B2 ); - h[4] = _mm_add_epi32( _mm_add_epi32( h[0], B1 ), C2 ); - h[0] = tmp; -} - -void ripemd160_4way_init( ripemd160_4way_context *sc ) -{ - sc->val[0] = _mm_set1_epi32( IV[0] ); - sc->val[1] = _mm_set1_epi32( IV[1] ); - sc->val[2] = _mm_set1_epi32( IV[2] ); - sc->val[3] = _mm_set1_epi32( IV[3] ); - sc->val[4] = _mm_set1_epi32( IV[4] ); - sc->count_high = sc->count_low = 0; -} - -void ripemd160_4way( ripemd160_4way_context *sc, const void *data, size_t len ) -{ - __m128i *vdata = (__m128i*)data; - size_t ptr; - const int block_size = 64; - - ptr = (unsigned)sc->count_low & (block_size - 1U); - while ( len > 0 ) - { - size_t clen; - uint32_t clow, clow2; - - clen = block_size - ptr; - if ( clen > len ) - clen = len; - memcpy_128( sc->buf + (ptr>>2), vdata, clen>>2 ); - vdata = vdata + (clen>>2); - ptr += clen; - len -= clen; - if ( ptr == block_size ) - { - ripemd160_4way_round( sc ); - ptr = 0; - } - clow = sc->count_low; - clow2 = clow + clen; - sc->count_low = clow2; - if ( clow2 < 
clow ) - sc->count_high++; - } -} - -void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst ) -{ - unsigned ptr, u; - uint32_t low, high; - const int block_size = 64; - const int pad = block_size - 8; - - ptr = (unsigned)sc->count_low & ( block_size - 1U); - sc->buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 ); - ptr += 4; - - if ( ptr > pad ) - { - memset_zero_128( sc->buf + (ptr>>2), (block_size - ptr) >> 2 ); - ripemd160_4way_round( sc ); - memset_zero_128( sc->buf, pad>>2 ); - } - else - memset_zero_128( sc->buf + (ptr>>2), (pad - ptr) >> 2 ); - - low = sc->count_low; - high = (sc->count_high << 3) | (low >> 29); - low = low << 3; - sc->buf[ pad>>2 ] = _mm_set1_epi32( low ); - sc->buf[ (pad>>2) + 1 ] = _mm_set1_epi32( high ); - ripemd160_4way_round( sc ); - for (u = 0; u < 5; u ++) - casti_m128i( dst, u ) = sc->val[u]; -} - -#endif - -#if defined(__AVX2__) - -// Ripemd-160 8 way - -#define F8W_1(x, y, z) \ - _mm256_xor_si256( _mm256_xor_si256( x, y ), z ) - -#define F8W_2(x, y, z) \ - _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( y, z ), x ), z ) - -#define F8W_3(x, y, z) \ - _mm256_xor_si256( _mm256_or_si256( x, mm256_not( y ) ), z ) - -#define F8W_4(x, y, z) \ - _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( x, y ), z ), y ) - -#define F8W_5(x, y, z) \ - _mm256_xor_si256( x, _mm256_or_si256( y, mm256_not( z ) ) ) - -#define RR_8W(a, b, c, d, e, f, s, r, k) \ -do{ \ - a = _mm256_add_epi32( mm256_rol_32( _mm256_add_epi32( _mm256_add_epi32( \ - _mm256_add_epi32( a, f( b ,c, d ) ), r ), \ - _mm256_set1_epi32( k ) ), s ), e ); \ - c = mm256_rol_32( c, 10 );\ -} while (0) - -#define ROUND1_8W(a, b, c, d, e, f, s, r, k) \ - RR_8W(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k) - -#define ROUND2_8W(a, b, c, d, e, f, s, r, k) \ - RR_8W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k) - -static void ripemd160_8way_round( ripemd160_8way_context *sc ) -{ - const __m256i *in = (__m256i*)sc->buf; - __m256i *h = (__m256i*)sc->val; - 
register __m256i A1, B1, C1, D1, E1; - register __m256i A2, B2, C2, D2, E2; - __m256i tmp; - - A1 = A2 = h[0]; - B1 = B2 = h[1]; - C1 = C2 = h[2]; - D1 = D2 = h[3]; - E1 = E2 = h[4]; - - ROUND1_8W( A, B, C, D, E, F8W_1, 11, in[ 0], 1 ); - ROUND1_8W( E, A, B, C, D, F8W_1, 14, in[ 1], 1 ); - ROUND1_8W( D, E, A, B, C, F8W_1, 15, in[ 2], 1 ); - ROUND1_8W( C, D, E, A, B, F8W_1, 12, in[ 3], 1 ); - ROUND1_8W( B, C, D, E, A, F8W_1, 5, in[ 4], 1 ); - ROUND1_8W( A, B, C, D, E, F8W_1, 8, in[ 5], 1 ); - ROUND1_8W( E, A, B, C, D, F8W_1, 7, in[ 6], 1 ); - ROUND1_8W( D, E, A, B, C, F8W_1, 9, in[ 7], 1 ); - ROUND1_8W( C, D, E, A, B, F8W_1, 11, in[ 8], 1 ); - ROUND1_8W( B, C, D, E, A, F8W_1, 13, in[ 9], 1 ); - ROUND1_8W( A, B, C, D, E, F8W_1, 14, in[10], 1 ); - ROUND1_8W( E, A, B, C, D, F8W_1, 15, in[11], 1 ); - ROUND1_8W( D, E, A, B, C, F8W_1, 6, in[12], 1 ); - ROUND1_8W( C, D, E, A, B, F8W_1, 7, in[13], 1 ); - ROUND1_8W( B, C, D, E, A, F8W_1, 9, in[14], 1 ); - ROUND1_8W( A, B, C, D, E, F8W_1, 8, in[15], 1 ); - - ROUND1_8W( E, A, B, C, D, F8W_2, 7, in[ 7], 2 ); - ROUND1_8W( D, E, A, B, C, F8W_2, 6, in[ 4], 2 ); - ROUND1_8W( C, D, E, A, B, F8W_2, 8, in[13], 2 ); - ROUND1_8W( B, C, D, E, A, F8W_2, 13, in[ 1], 2 ); - ROUND1_8W( A, B, C, D, E, F8W_2, 11, in[10], 2 ); - ROUND1_8W( E, A, B, C, D, F8W_2, 9, in[ 6], 2 ); - ROUND1_8W( D, E, A, B, C, F8W_2, 7, in[15], 2 ); - ROUND1_8W( C, D, E, A, B, F8W_2, 15, in[ 3], 2 ); - ROUND1_8W( B, C, D, E, A, F8W_2, 7, in[12], 2 ); - ROUND1_8W( A, B, C, D, E, F8W_2, 12, in[ 0], 2 ); - ROUND1_8W( E, A, B, C, D, F8W_2, 15, in[ 9], 2 ); - ROUND1_8W( D, E, A, B, C, F8W_2, 9, in[ 5], 2 ); - ROUND1_8W( C, D, E, A, B, F8W_2, 11, in[ 2], 2 ); - ROUND1_8W( B, C, D, E, A, F8W_2, 7, in[14], 2 ); - ROUND1_8W( A, B, C, D, E, F8W_2, 13, in[11], 2 ); - ROUND1_8W( E, A, B, C, D, F8W_2, 12, in[ 8], 2 ); - - ROUND1_8W( D, E, A, B, C, F8W_3, 11, in[ 3], 3 ); - ROUND1_8W( C, D, E, A, B, F8W_3, 13, in[10], 3 ); - ROUND1_8W( B, C, D, E, A, F8W_3, 6, in[14], 3 ); - 
ROUND1_8W( A, B, C, D, E, F8W_3, 7, in[ 4], 3 ); - ROUND1_8W( E, A, B, C, D, F8W_3, 14, in[ 9], 3 ); - ROUND1_8W( D, E, A, B, C, F8W_3, 9, in[15], 3 ); - ROUND1_8W( C, D, E, A, B, F8W_3, 13, in[ 8], 3 ); - ROUND1_8W( B, C, D, E, A, F8W_3, 15, in[ 1], 3 ); - ROUND1_8W( A, B, C, D, E, F8W_3, 14, in[ 2], 3 ); - ROUND1_8W( E, A, B, C, D, F8W_3, 8, in[ 7], 3 ); - ROUND1_8W( D, E, A, B, C, F8W_3, 13, in[ 0], 3 ); - ROUND1_8W( C, D, E, A, B, F8W_3, 6, in[ 6], 3 ); - ROUND1_8W( B, C, D, E, A, F8W_3, 5, in[13], 3 ); - ROUND1_8W( A, B, C, D, E, F8W_3, 12, in[11], 3 ); - ROUND1_8W( E, A, B, C, D, F8W_3, 7, in[ 5], 3 ); - ROUND1_8W( D, E, A, B, C, F8W_3, 5, in[12], 3 ); - - ROUND1_8W( C, D, E, A, B, F8W_4, 11, in[ 1], 4 ); - ROUND1_8W( B, C, D, E, A, F8W_4, 12, in[ 9], 4 ); - ROUND1_8W( A, B, C, D, E, F8W_4, 14, in[11], 4 ); - ROUND1_8W( E, A, B, C, D, F8W_4, 15, in[10], 4 ); - ROUND1_8W( D, E, A, B, C, F8W_4, 14, in[ 0], 4 ); - ROUND1_8W( C, D, E, A, B, F8W_4, 15, in[ 8], 4 ); - ROUND1_8W( B, C, D, E, A, F8W_4, 9, in[12], 4 ); - ROUND1_8W( A, B, C, D, E, F8W_4, 8, in[ 4], 4 ); - ROUND1_8W( E, A, B, C, D, F8W_4, 9, in[13], 4 ); - ROUND1_8W( D, E, A, B, C, F8W_4, 14, in[ 3], 4 ); - ROUND1_8W( C, D, E, A, B, F8W_4, 5, in[ 7], 4 ); - ROUND1_8W( B, C, D, E, A, F8W_4, 6, in[15], 4 ); - ROUND1_8W( A, B, C, D, E, F8W_4, 8, in[14], 4 ); - ROUND1_8W( E, A, B, C, D, F8W_4, 6, in[ 5], 4 ); - ROUND1_8W( D, E, A, B, C, F8W_4, 5, in[ 6], 4 ); - ROUND1_8W( C, D, E, A, B, F8W_4, 12, in[ 2], 4 ); - - ROUND1_8W( B, C, D, E, A, F8W_5, 9, in[ 4], 5 ); - ROUND1_8W( A, B, C, D, E, F8W_5, 15, in[ 0], 5 ); - ROUND1_8W( E, A, B, C, D, F8W_5, 5, in[ 5], 5 ); - ROUND1_8W( D, E, A, B, C, F8W_5, 11, in[ 9], 5 ); - ROUND1_8W( C, D, E, A, B, F8W_5, 6, in[ 7], 5 ); - ROUND1_8W( B, C, D, E, A, F8W_5, 8, in[12], 5 ); - ROUND1_8W( A, B, C, D, E, F8W_5, 13, in[ 2], 5 ); - ROUND1_8W( E, A, B, C, D, F8W_5, 12, in[10], 5 ); - ROUND1_8W( D, E, A, B, C, F8W_5, 5, in[14], 5 ); - ROUND1_8W( C, D, E, A, B, F8W_5, 12, 
in[ 1], 5 ); - ROUND1_8W( B, C, D, E, A, F8W_5, 13, in[ 3], 5 ); - ROUND1_8W( A, B, C, D, E, F8W_5, 14, in[ 8], 5 ); - ROUND1_8W( E, A, B, C, D, F8W_5, 11, in[11], 5 ); - ROUND1_8W( D, E, A, B, C, F8W_5, 8, in[ 6], 5 ); - ROUND1_8W( C, D, E, A, B, F8W_5, 5, in[15], 5 ); - ROUND1_8W( B, C, D, E, A, F8W_5, 6, in[13], 5 ); - - ROUND2_8W( A, B, C, D, E, F8W_5, 8, in[ 5], 1 ); - ROUND2_8W( E, A, B, C, D, F8W_5, 9, in[14], 1 ); - ROUND2_8W( D, E, A, B, C, F8W_5, 9, in[ 7], 1 ); - ROUND2_8W( C, D, E, A, B, F8W_5, 11, in[ 0], 1 ); - ROUND2_8W( B, C, D, E, A, F8W_5, 13, in[ 9], 1 ); - ROUND2_8W( A, B, C, D, E, F8W_5, 15, in[ 2], 1 ); - ROUND2_8W( E, A, B, C, D, F8W_5, 15, in[11], 1 ); - ROUND2_8W( D, E, A, B, C, F8W_5, 5, in[ 4], 1 ); - ROUND2_8W( C, D, E, A, B, F8W_5, 7, in[13], 1 ); - ROUND2_8W( B, C, D, E, A, F8W_5, 7, in[ 6], 1 ); - ROUND2_8W( A, B, C, D, E, F8W_5, 8, in[15], 1 ); - ROUND2_8W( E, A, B, C, D, F8W_5, 11, in[ 8], 1 ); - ROUND2_8W( D, E, A, B, C, F8W_5, 14, in[ 1], 1 ); - ROUND2_8W( C, D, E, A, B, F8W_5, 14, in[10], 1 ); - ROUND2_8W( B, C, D, E, A, F8W_5, 12, in[ 3], 1 ); - ROUND2_8W( A, B, C, D, E, F8W_5, 6, in[12], 1 ); - - ROUND2_8W( E, A, B, C, D, F8W_4, 9, in[ 6], 2 ); - ROUND2_8W( D, E, A, B, C, F8W_4, 13, in[11], 2 ); - ROUND2_8W( C, D, E, A, B, F8W_4, 15, in[ 3], 2 ); - ROUND2_8W( B, C, D, E, A, F8W_4, 7, in[ 7], 2 ); - ROUND2_8W( A, B, C, D, E, F8W_4, 12, in[ 0], 2 ); - ROUND2_8W( E, A, B, C, D, F8W_4, 8, in[13], 2 ); - ROUND2_8W( D, E, A, B, C, F8W_4, 9, in[ 5], 2 ); - ROUND2_8W( C, D, E, A, B, F8W_4, 11, in[10], 2 ); - ROUND2_8W( B, C, D, E, A, F8W_4, 7, in[14], 2 ); - ROUND2_8W( A, B, C, D, E, F8W_4, 7, in[15], 2 ); - ROUND2_8W( E, A, B, C, D, F8W_4, 12, in[ 8], 2 ); - ROUND2_8W( D, E, A, B, C, F8W_4, 7, in[12], 2 ); - ROUND2_8W( C, D, E, A, B, F8W_4, 6, in[ 4], 2 ); - ROUND2_8W( B, C, D, E, A, F8W_4, 15, in[ 9], 2 ); - ROUND2_8W( A, B, C, D, E, F8W_4, 13, in[ 1], 2 ); - ROUND2_8W( E, A, B, C, D, F8W_4, 11, in[ 2], 2 ); - - ROUND2_8W( D, E, A, 
B, C, F8W_3, 9, in[15], 3 ); - ROUND2_8W( C, D, E, A, B, F8W_3, 7, in[ 5], 3 ); - ROUND2_8W( B, C, D, E, A, F8W_3, 15, in[ 1], 3 ); - ROUND2_8W( A, B, C, D, E, F8W_3, 11, in[ 3], 3 ); - ROUND2_8W( E, A, B, C, D, F8W_3, 8, in[ 7], 3 ); - ROUND2_8W( D, E, A, B, C, F8W_3, 6, in[14], 3 ); - ROUND2_8W( C, D, E, A, B, F8W_3, 6, in[ 6], 3 ); - ROUND2_8W( B, C, D, E, A, F8W_3, 14, in[ 9], 3 ); - ROUND2_8W( A, B, C, D, E, F8W_3, 12, in[11], 3 ); - ROUND2_8W( E, A, B, C, D, F8W_3, 13, in[ 8], 3 ); - ROUND2_8W( D, E, A, B, C, F8W_3, 5, in[12], 3 ); - ROUND2_8W( C, D, E, A, B, F8W_3, 14, in[ 2], 3 ); - ROUND2_8W( B, C, D, E, A, F8W_3, 13, in[10], 3 ); - ROUND2_8W( A, B, C, D, E, F8W_3, 13, in[ 0], 3 ); - ROUND2_8W( E, A, B, C, D, F8W_3, 7, in[ 4], 3 ); - ROUND2_8W( D, E, A, B, C, F8W_3, 5, in[13], 3 ); - - ROUND2_8W( C, D, E, A, B, F8W_2, 15, in[ 8], 4 ); - ROUND2_8W( B, C, D, E, A, F8W_2, 5, in[ 6], 4 ); - ROUND2_8W( A, B, C, D, E, F8W_2, 8, in[ 4], 4 ); - ROUND2_8W( E, A, B, C, D, F8W_2, 11, in[ 1], 4 ); - ROUND2_8W( D, E, A, B, C, F8W_2, 14, in[ 3], 4 ); - ROUND2_8W( C, D, E, A, B, F8W_2, 14, in[11], 4 ); - ROUND2_8W( B, C, D, E, A, F8W_2, 6, in[15], 4 ); - ROUND2_8W( A, B, C, D, E, F8W_2, 14, in[ 0], 4 ); - ROUND2_8W( E, A, B, C, D, F8W_2, 6, in[ 5], 4 ); - ROUND2_8W( D, E, A, B, C, F8W_2, 9, in[12], 4 ); - ROUND2_8W( C, D, E, A, B, F8W_2, 12, in[ 2], 4 ); - ROUND2_8W( B, C, D, E, A, F8W_2, 9, in[13], 4 ); - ROUND2_8W( A, B, C, D, E, F8W_2, 12, in[ 9], 4 ); - ROUND2_8W( E, A, B, C, D, F8W_2, 5, in[ 7], 4 ); - ROUND2_8W( D, E, A, B, C, F8W_2, 15, in[10], 4 ); - ROUND2_8W( C, D, E, A, B, F8W_2, 8, in[14], 4 ); - - ROUND2_8W( B, C, D, E, A, F8W_1, 8, in[12], 5 ); - ROUND2_8W( A, B, C, D, E, F8W_1, 5, in[15], 5 ); - ROUND2_8W( E, A, B, C, D, F8W_1, 12, in[10], 5 ); - ROUND2_8W( D, E, A, B, C, F8W_1, 9, in[ 4], 5 ); - ROUND2_8W( C, D, E, A, B, F8W_1, 12, in[ 1], 5 ); - ROUND2_8W( B, C, D, E, A, F8W_1, 5, in[ 5], 5 ); - ROUND2_8W( A, B, C, D, E, F8W_1, 14, in[ 8], 5 ); - 
ROUND2_8W( E, A, B, C, D, F8W_1, 6, in[ 7], 5 ); - ROUND2_8W( D, E, A, B, C, F8W_1, 8, in[ 6], 5 ); - ROUND2_8W( C, D, E, A, B, F8W_1, 13, in[ 2], 5 ); - ROUND2_8W( B, C, D, E, A, F8W_1, 6, in[13], 5 ); - ROUND2_8W( A, B, C, D, E, F8W_1, 5, in[14], 5 ); - ROUND2_8W( E, A, B, C, D, F8W_1, 15, in[ 0], 5 ); - ROUND2_8W( D, E, A, B, C, F8W_1, 13, in[ 3], 5 ); - ROUND2_8W( C, D, E, A, B, F8W_1, 11, in[ 9], 5 ); - ROUND2_8W( B, C, D, E, A, F8W_1, 11, in[11], 5 ); - - tmp = _mm256_add_epi32( _mm256_add_epi32( h[1], C1 ), D2 ); - h[1] = _mm256_add_epi32( _mm256_add_epi32( h[2], D1 ), E2 ); - h[2] = _mm256_add_epi32( _mm256_add_epi32( h[3], E1 ), A2 ); - h[3] = _mm256_add_epi32( _mm256_add_epi32( h[4], A1 ), B2 ); - h[4] = _mm256_add_epi32( _mm256_add_epi32( h[0], B1 ), C2 ); - h[0] = tmp; -} - - -void ripemd160_8way_init( ripemd160_8way_context *sc ) -{ - sc->val[0] = _mm256_set1_epi32( IV[0] ); - sc->val[1] = _mm256_set1_epi32( IV[1] ); - sc->val[2] = _mm256_set1_epi32( IV[2] ); - sc->val[3] = _mm256_set1_epi32( IV[3] ); - sc->val[4] = _mm256_set1_epi32( IV[4] ); - sc->count_high = sc->count_low = 0; -} - -void ripemd160_8way( ripemd160_8way_context *sc, const void *data, size_t len ) -{ - __m256i *vdata = (__m256i*)data; - size_t ptr; - const int block_size = 64; - - ptr = (unsigned)sc->count_low & (block_size - 1U); - while ( len > 0 ) - { - size_t clen; - uint32_t clow, clow2; - - clen = block_size - ptr; - if ( clen > len ) - clen = len; - memcpy_256( sc->buf + (ptr>>2), vdata, clen>>2 ); - vdata = vdata + (clen>>2); - ptr += clen; - len -= clen; - if ( ptr == block_size ) - { - ripemd160_8way_round( sc ); - ptr = 0; - } - clow = sc->count_low; - clow2 = clow + clen; - sc->count_low = clow2; - if ( clow2 < clow ) - sc->count_high++; - } -} - -void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst ) -{ - unsigned ptr, u; - uint32_t low, high; - const int block_size = 64; - const int pad = block_size - 8; - - ptr = (unsigned)sc->count_low & ( block_size - 1U); 
- sc->buf[ ptr>>2 ] = _mm256_set1_epi32( 0x80 ); - ptr += 4; - - if ( ptr > pad ) - { - memset_zero_256( sc->buf + (ptr>>2), (block_size - ptr) >> 2 ); - ripemd160_8way_round( sc ); - memset_zero_256( sc->buf, pad>>2 ); - } - else - memset_zero_256( sc->buf + (ptr>>2), (pad - ptr) >> 2 ); - - low = sc->count_low; - high = (sc->count_high << 3) | (low >> 29); - low = low << 3; - sc->buf[ pad>>2 ] = _mm256_set1_epi32( low ); - sc->buf[ (pad>>2) + 1 ] = _mm256_set1_epi32( high ); - ripemd160_8way_round( sc ); - for (u = 0; u < 5; u ++) - casti_m256i( dst, u ) = sc->val[u]; -} - -#endif // __AVX2__ - diff --git a/algo/ripemd/ripemd-hash-4way.h b/algo/ripemd/ripemd-hash-4way.h deleted file mode 100644 index 51878be..0000000 --- a/algo/ripemd/ripemd-hash-4way.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef RIPEMD_HASH_4WAY_H__ -#define RIPEMD_HASH_4WAY_H__ - -#include -#include "algo/sha/sph_types.h" - -#if defined(__SSE4_2__) - -#include "simd-utils.h" - -typedef struct -{ - __m128i buf[64>>2]; - __m128i val[5]; - uint32_t count_high, count_low; -} __attribute__ ((aligned (64))) ripemd160_4way_context; - -void ripemd160_4way_init( ripemd160_4way_context *sc ); -void ripemd160_4way( ripemd160_4way_context *sc, const void *data, size_t len ); -void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst ); - -#if defined (__AVX2__) - -typedef struct -{ - __m256i buf[64>>2]; - __m256i val[5]; - uint32_t count_high, count_low; -} __attribute__ ((aligned (64))) ripemd160_8way_context; - -void ripemd160_8way_init( ripemd160_8way_context *sc ); -void ripemd160_8way( ripemd160_8way_context *sc, const void *data, size_t len ); -void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst ); - - -#endif // __AVX2__ -#endif // __SSE4_2__ -#endif // RIPEMD_HASH_4WAY_H__ diff --git a/algo/ripemd/sph_ripemd.c b/algo/ripemd/sph_ripemd.c deleted file mode 100644 index f295497..0000000 --- a/algo/ripemd/sph_ripemd.c +++ /dev/null @@ -1,834 +0,0 @@ -/* $Id: ripemd.c 216 2010-06-08 
09:46:57Z tp $ */ -/* - * RIPEMD-160 implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include -#include - -#include "sph_ripemd.h" - -/* - * Round functions for RIPEMD (original). - */ -#define F(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) -#define G(x, y, z) (((x) & (y)) | (((x) | (y)) & (z))) -#define H(x, y, z) ((x) ^ (y) ^ (z)) - -static const sph_u32 oIV[5] = { - SPH_C32(0x67452301), SPH_C32(0xEFCDAB89), - SPH_C32(0x98BADCFE), SPH_C32(0x10325476) -}; - -/* - * Round functions for RIPEMD-128 and RIPEMD-160. 
- */ -#define F1(x, y, z) ((x) ^ (y) ^ (z)) -#define F2(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) -#define F3(x, y, z) (((x) | ~(y)) ^ (z)) -#define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y)) -#define F5(x, y, z) ((x) ^ ((y) | ~(z))) - -static const sph_u32 IV[5] = { - SPH_C32(0x67452301), SPH_C32(0xEFCDAB89), SPH_C32(0x98BADCFE), - SPH_C32(0x10325476), SPH_C32(0xC3D2E1F0) -}; - -#define ROTL SPH_ROTL32 - -/* ===================================================================== */ -/* - * RIPEMD (original hash, deprecated). - */ - -#define FF1(A, B, C, D, X, s) do { \ - sph_u32 tmp = SPH_T32((A) + F(B, C, D) + (X)); \ - (A) = ROTL(tmp, (s)); \ - } while (0) - -#define GG1(A, B, C, D, X, s) do { \ - sph_u32 tmp = SPH_T32((A) + G(B, C, D) \ - + (X) + SPH_C32(0x5A827999)); \ - (A) = ROTL(tmp, (s)); \ - } while (0) - -#define HH1(A, B, C, D, X, s) do { \ - sph_u32 tmp = SPH_T32((A) + H(B, C, D) \ - + (X) + SPH_C32(0x6ED9EBA1)); \ - (A) = ROTL(tmp, (s)); \ - } while (0) - -#define FF2(A, B, C, D, X, s) do { \ - sph_u32 tmp = SPH_T32((A) + F(B, C, D) \ - + (X) + SPH_C32(0x50A28BE6)); \ - (A) = ROTL(tmp, (s)); \ - } while (0) - -#define GG2(A, B, C, D, X, s) do { \ - sph_u32 tmp = SPH_T32((A) + G(B, C, D) + (X)); \ - (A) = ROTL(tmp, (s)); \ - } while (0) - -#define HH2(A, B, C, D, X, s) do { \ - sph_u32 tmp = SPH_T32((A) + H(B, C, D) \ - + (X) + SPH_C32(0x5C4DD124)); \ - (A) = ROTL(tmp, (s)); \ - } while (0) - -#define RIPEMD_ROUND_BODY(in, h) do { \ - sph_u32 A1, B1, C1, D1; \ - sph_u32 A2, B2, C2, D2; \ - sph_u32 tmp; \ - \ - A1 = A2 = (h)[0]; \ - B1 = B2 = (h)[1]; \ - C1 = C2 = (h)[2]; \ - D1 = D2 = (h)[3]; \ - \ - FF1(A1, B1, C1, D1, in( 0), 11); \ - FF1(D1, A1, B1, C1, in( 1), 14); \ - FF1(C1, D1, A1, B1, in( 2), 15); \ - FF1(B1, C1, D1, A1, in( 3), 12); \ - FF1(A1, B1, C1, D1, in( 4), 5); \ - FF1(D1, A1, B1, C1, in( 5), 8); \ - FF1(C1, D1, A1, B1, in( 6), 7); \ - FF1(B1, C1, D1, A1, in( 7), 9); \ - FF1(A1, B1, C1, D1, in( 8), 11); \ - FF1(D1, A1, B1, C1, in( 9), 13); \ - 
FF1(C1, D1, A1, B1, in(10), 14); \ - FF1(B1, C1, D1, A1, in(11), 15); \ - FF1(A1, B1, C1, D1, in(12), 6); \ - FF1(D1, A1, B1, C1, in(13), 7); \ - FF1(C1, D1, A1, B1, in(14), 9); \ - FF1(B1, C1, D1, A1, in(15), 8); \ - \ - GG1(A1, B1, C1, D1, in( 7), 7); \ - GG1(D1, A1, B1, C1, in( 4), 6); \ - GG1(C1, D1, A1, B1, in(13), 8); \ - GG1(B1, C1, D1, A1, in( 1), 13); \ - GG1(A1, B1, C1, D1, in(10), 11); \ - GG1(D1, A1, B1, C1, in( 6), 9); \ - GG1(C1, D1, A1, B1, in(15), 7); \ - GG1(B1, C1, D1, A1, in( 3), 15); \ - GG1(A1, B1, C1, D1, in(12), 7); \ - GG1(D1, A1, B1, C1, in( 0), 12); \ - GG1(C1, D1, A1, B1, in( 9), 15); \ - GG1(B1, C1, D1, A1, in( 5), 9); \ - GG1(A1, B1, C1, D1, in(14), 7); \ - GG1(D1, A1, B1, C1, in( 2), 11); \ - GG1(C1, D1, A1, B1, in(11), 13); \ - GG1(B1, C1, D1, A1, in( 8), 12); \ - \ - HH1(A1, B1, C1, D1, in( 3), 11); \ - HH1(D1, A1, B1, C1, in(10), 13); \ - HH1(C1, D1, A1, B1, in( 2), 14); \ - HH1(B1, C1, D1, A1, in( 4), 7); \ - HH1(A1, B1, C1, D1, in( 9), 14); \ - HH1(D1, A1, B1, C1, in(15), 9); \ - HH1(C1, D1, A1, B1, in( 8), 13); \ - HH1(B1, C1, D1, A1, in( 1), 15); \ - HH1(A1, B1, C1, D1, in(14), 6); \ - HH1(D1, A1, B1, C1, in( 7), 8); \ - HH1(C1, D1, A1, B1, in( 0), 13); \ - HH1(B1, C1, D1, A1, in( 6), 6); \ - HH1(A1, B1, C1, D1, in(11), 12); \ - HH1(D1, A1, B1, C1, in(13), 5); \ - HH1(C1, D1, A1, B1, in( 5), 7); \ - HH1(B1, C1, D1, A1, in(12), 5); \ - \ - FF2(A2, B2, C2, D2, in( 0), 11); \ - FF2(D2, A2, B2, C2, in( 1), 14); \ - FF2(C2, D2, A2, B2, in( 2), 15); \ - FF2(B2, C2, D2, A2, in( 3), 12); \ - FF2(A2, B2, C2, D2, in( 4), 5); \ - FF2(D2, A2, B2, C2, in( 5), 8); \ - FF2(C2, D2, A2, B2, in( 6), 7); \ - FF2(B2, C2, D2, A2, in( 7), 9); \ - FF2(A2, B2, C2, D2, in( 8), 11); \ - FF2(D2, A2, B2, C2, in( 9), 13); \ - FF2(C2, D2, A2, B2, in(10), 14); \ - FF2(B2, C2, D2, A2, in(11), 15); \ - FF2(A2, B2, C2, D2, in(12), 6); \ - FF2(D2, A2, B2, C2, in(13), 7); \ - FF2(C2, D2, A2, B2, in(14), 9); \ - FF2(B2, C2, D2, A2, in(15), 8); \ - \ - GG2(A2, B2, 
C2, D2, in( 7), 7); \ - GG2(D2, A2, B2, C2, in( 4), 6); \ - GG2(C2, D2, A2, B2, in(13), 8); \ - GG2(B2, C2, D2, A2, in( 1), 13); \ - GG2(A2, B2, C2, D2, in(10), 11); \ - GG2(D2, A2, B2, C2, in( 6), 9); \ - GG2(C2, D2, A2, B2, in(15), 7); \ - GG2(B2, C2, D2, A2, in( 3), 15); \ - GG2(A2, B2, C2, D2, in(12), 7); \ - GG2(D2, A2, B2, C2, in( 0), 12); \ - GG2(C2, D2, A2, B2, in( 9), 15); \ - GG2(B2, C2, D2, A2, in( 5), 9); \ - GG2(A2, B2, C2, D2, in(14), 7); \ - GG2(D2, A2, B2, C2, in( 2), 11); \ - GG2(C2, D2, A2, B2, in(11), 13); \ - GG2(B2, C2, D2, A2, in( 8), 12); \ - \ - HH2(A2, B2, C2, D2, in( 3), 11); \ - HH2(D2, A2, B2, C2, in(10), 13); \ - HH2(C2, D2, A2, B2, in( 2), 14); \ - HH2(B2, C2, D2, A2, in( 4), 7); \ - HH2(A2, B2, C2, D2, in( 9), 14); \ - HH2(D2, A2, B2, C2, in(15), 9); \ - HH2(C2, D2, A2, B2, in( 8), 13); \ - HH2(B2, C2, D2, A2, in( 1), 15); \ - HH2(A2, B2, C2, D2, in(14), 6); \ - HH2(D2, A2, B2, C2, in( 7), 8); \ - HH2(C2, D2, A2, B2, in( 0), 13); \ - HH2(B2, C2, D2, A2, in( 6), 6); \ - HH2(A2, B2, C2, D2, in(11), 12); \ - HH2(D2, A2, B2, C2, in(13), 5); \ - HH2(C2, D2, A2, B2, in( 5), 7); \ - HH2(B2, C2, D2, A2, in(12), 5); \ - \ - tmp = SPH_T32((h)[1] + C1 + D2); \ - (h)[1] = SPH_T32((h)[2] + D1 + A2); \ - (h)[2] = SPH_T32((h)[3] + A1 + B2); \ - (h)[3] = SPH_T32((h)[0] + B1 + C2); \ - (h)[0] = tmp; \ - } while (0) - -/* - * One round of RIPEMD. The data must be aligned for 32-bit access. 
- */ -static void -ripemd_round(const unsigned char *data, sph_u32 r[5]) -{ -#if SPH_LITTLE_FAST - -#define RIPEMD_IN(x) sph_dec32le_aligned(data + (4 * (x))) - -#else - - sph_u32 X_var[16]; - int i; - - for (i = 0; i < 16; i ++) - X_var[i] = sph_dec32le_aligned(data + 4 * i); -#define RIPEMD_IN(x) X_var[x] - -#endif - RIPEMD_ROUND_BODY(RIPEMD_IN, r); -#undef RIPEMD_IN -} - -/* see sph_ripemd.h */ -void -sph_ripemd_init(void *cc) -{ - sph_ripemd_context *sc; - - sc = (sph_ripemd_context*)cc; - memcpy(sc->val, oIV, sizeof sc->val); -#if SPH_64 - sc->count = 0; -#else - sc->count_high = sc->count_low = 0; -#endif -} - -#define RFUN ripemd_round -#define HASH ripemd -#define LE32 1 -#include "algo/sha/md_helper.c" -#undef RFUN -#undef HASH -#undef LE32 - -/* see sph_ripemd.h */ -void -sph_ripemd_close(void *cc, void *dst) -{ - ripemd_close(cc, dst, 4); -// sph_ripemd_init(cc); -} - -/* see sph_ripemd.h */ -void -sph_ripemd_comp(const sph_u32 msg[16], sph_u32 val[4]) -{ -#define RIPEMD_IN(x) msg[x] - RIPEMD_ROUND_BODY(RIPEMD_IN, val); -#undef RIPEMD_IN -} - -/* ===================================================================== */ -/* - * RIPEMD-128. - */ - -/* - * Round constants for RIPEMD-128. - */ -#define sK11 SPH_C32(0x00000000) -#define sK12 SPH_C32(0x5A827999) -#define sK13 SPH_C32(0x6ED9EBA1) -#define sK14 SPH_C32(0x8F1BBCDC) - -#define sK21 SPH_C32(0x50A28BE6) -#define sK22 SPH_C32(0x5C4DD124) -#define sK23 SPH_C32(0x6D703EF3) -#define sK24 SPH_C32(0x00000000) - -#define sRR(a, b, c, d, f, s, r, k) do { \ - a = ROTL(SPH_T32(a + f(b, c, d) + r + k), s); \ - } while (0) - -#define sROUND1(a, b, c, d, f, s, r, k) \ - sRR(a ## 1, b ## 1, c ## 1, d ## 1, f, s, r, sK1 ## k) - -#define sROUND2(a, b, c, d, f, s, r, k) \ - sRR(a ## 2, b ## 2, c ## 2, d ## 2, f, s, r, sK2 ## k) - -/* - * This macro defines the body for a RIPEMD-128 compression function - * implementation. 
The "in" parameter should evaluate, when applied to a - * numerical input parameter from 0 to 15, to an expression which yields - * the corresponding input block. The "h" parameter should evaluate to - * an array or pointer expression designating the array of 4 words which - * contains the input and output of the compression function. - */ - -#define RIPEMD128_ROUND_BODY(in, h) do { \ - sph_u32 A1, B1, C1, D1; \ - sph_u32 A2, B2, C2, D2; \ - sph_u32 tmp; \ - \ - A1 = A2 = (h)[0]; \ - B1 = B2 = (h)[1]; \ - C1 = C2 = (h)[2]; \ - D1 = D2 = (h)[3]; \ - \ - sROUND1(A, B, C, D, F1, 11, in( 0), 1); \ - sROUND1(D, A, B, C, F1, 14, in( 1), 1); \ - sROUND1(C, D, A, B, F1, 15, in( 2), 1); \ - sROUND1(B, C, D, A, F1, 12, in( 3), 1); \ - sROUND1(A, B, C, D, F1, 5, in( 4), 1); \ - sROUND1(D, A, B, C, F1, 8, in( 5), 1); \ - sROUND1(C, D, A, B, F1, 7, in( 6), 1); \ - sROUND1(B, C, D, A, F1, 9, in( 7), 1); \ - sROUND1(A, B, C, D, F1, 11, in( 8), 1); \ - sROUND1(D, A, B, C, F1, 13, in( 9), 1); \ - sROUND1(C, D, A, B, F1, 14, in(10), 1); \ - sROUND1(B, C, D, A, F1, 15, in(11), 1); \ - sROUND1(A, B, C, D, F1, 6, in(12), 1); \ - sROUND1(D, A, B, C, F1, 7, in(13), 1); \ - sROUND1(C, D, A, B, F1, 9, in(14), 1); \ - sROUND1(B, C, D, A, F1, 8, in(15), 1); \ - \ - sROUND1(A, B, C, D, F2, 7, in( 7), 2); \ - sROUND1(D, A, B, C, F2, 6, in( 4), 2); \ - sROUND1(C, D, A, B, F2, 8, in(13), 2); \ - sROUND1(B, C, D, A, F2, 13, in( 1), 2); \ - sROUND1(A, B, C, D, F2, 11, in(10), 2); \ - sROUND1(D, A, B, C, F2, 9, in( 6), 2); \ - sROUND1(C, D, A, B, F2, 7, in(15), 2); \ - sROUND1(B, C, D, A, F2, 15, in( 3), 2); \ - sROUND1(A, B, C, D, F2, 7, in(12), 2); \ - sROUND1(D, A, B, C, F2, 12, in( 0), 2); \ - sROUND1(C, D, A, B, F2, 15, in( 9), 2); \ - sROUND1(B, C, D, A, F2, 9, in( 5), 2); \ - sROUND1(A, B, C, D, F2, 11, in( 2), 2); \ - sROUND1(D, A, B, C, F2, 7, in(14), 2); \ - sROUND1(C, D, A, B, F2, 13, in(11), 2); \ - sROUND1(B, C, D, A, F2, 12, in( 8), 2); \ - \ - sROUND1(A, B, C, D, F3, 11, in( 3), 3); 
\ - sROUND1(D, A, B, C, F3, 13, in(10), 3); \ - sROUND1(C, D, A, B, F3, 6, in(14), 3); \ - sROUND1(B, C, D, A, F3, 7, in( 4), 3); \ - sROUND1(A, B, C, D, F3, 14, in( 9), 3); \ - sROUND1(D, A, B, C, F3, 9, in(15), 3); \ - sROUND1(C, D, A, B, F3, 13, in( 8), 3); \ - sROUND1(B, C, D, A, F3, 15, in( 1), 3); \ - sROUND1(A, B, C, D, F3, 14, in( 2), 3); \ - sROUND1(D, A, B, C, F3, 8, in( 7), 3); \ - sROUND1(C, D, A, B, F3, 13, in( 0), 3); \ - sROUND1(B, C, D, A, F3, 6, in( 6), 3); \ - sROUND1(A, B, C, D, F3, 5, in(13), 3); \ - sROUND1(D, A, B, C, F3, 12, in(11), 3); \ - sROUND1(C, D, A, B, F3, 7, in( 5), 3); \ - sROUND1(B, C, D, A, F3, 5, in(12), 3); \ - \ - sROUND1(A, B, C, D, F4, 11, in( 1), 4); \ - sROUND1(D, A, B, C, F4, 12, in( 9), 4); \ - sROUND1(C, D, A, B, F4, 14, in(11), 4); \ - sROUND1(B, C, D, A, F4, 15, in(10), 4); \ - sROUND1(A, B, C, D, F4, 14, in( 0), 4); \ - sROUND1(D, A, B, C, F4, 15, in( 8), 4); \ - sROUND1(C, D, A, B, F4, 9, in(12), 4); \ - sROUND1(B, C, D, A, F4, 8, in( 4), 4); \ - sROUND1(A, B, C, D, F4, 9, in(13), 4); \ - sROUND1(D, A, B, C, F4, 14, in( 3), 4); \ - sROUND1(C, D, A, B, F4, 5, in( 7), 4); \ - sROUND1(B, C, D, A, F4, 6, in(15), 4); \ - sROUND1(A, B, C, D, F4, 8, in(14), 4); \ - sROUND1(D, A, B, C, F4, 6, in( 5), 4); \ - sROUND1(C, D, A, B, F4, 5, in( 6), 4); \ - sROUND1(B, C, D, A, F4, 12, in( 2), 4); \ - \ - sROUND2(A, B, C, D, F4, 8, in( 5), 1); \ - sROUND2(D, A, B, C, F4, 9, in(14), 1); \ - sROUND2(C, D, A, B, F4, 9, in( 7), 1); \ - sROUND2(B, C, D, A, F4, 11, in( 0), 1); \ - sROUND2(A, B, C, D, F4, 13, in( 9), 1); \ - sROUND2(D, A, B, C, F4, 15, in( 2), 1); \ - sROUND2(C, D, A, B, F4, 15, in(11), 1); \ - sROUND2(B, C, D, A, F4, 5, in( 4), 1); \ - sROUND2(A, B, C, D, F4, 7, in(13), 1); \ - sROUND2(D, A, B, C, F4, 7, in( 6), 1); \ - sROUND2(C, D, A, B, F4, 8, in(15), 1); \ - sROUND2(B, C, D, A, F4, 11, in( 8), 1); \ - sROUND2(A, B, C, D, F4, 14, in( 1), 1); \ - sROUND2(D, A, B, C, F4, 14, in(10), 1); \ - sROUND2(C, D, A, B, F4, 12, 
in( 3), 1); \ - sROUND2(B, C, D, A, F4, 6, in(12), 1); \ - \ - sROUND2(A, B, C, D, F3, 9, in( 6), 2); \ - sROUND2(D, A, B, C, F3, 13, in(11), 2); \ - sROUND2(C, D, A, B, F3, 15, in( 3), 2); \ - sROUND2(B, C, D, A, F3, 7, in( 7), 2); \ - sROUND2(A, B, C, D, F3, 12, in( 0), 2); \ - sROUND2(D, A, B, C, F3, 8, in(13), 2); \ - sROUND2(C, D, A, B, F3, 9, in( 5), 2); \ - sROUND2(B, C, D, A, F3, 11, in(10), 2); \ - sROUND2(A, B, C, D, F3, 7, in(14), 2); \ - sROUND2(D, A, B, C, F3, 7, in(15), 2); \ - sROUND2(C, D, A, B, F3, 12, in( 8), 2); \ - sROUND2(B, C, D, A, F3, 7, in(12), 2); \ - sROUND2(A, B, C, D, F3, 6, in( 4), 2); \ - sROUND2(D, A, B, C, F3, 15, in( 9), 2); \ - sROUND2(C, D, A, B, F3, 13, in( 1), 2); \ - sROUND2(B, C, D, A, F3, 11, in( 2), 2); \ - \ - sROUND2(A, B, C, D, F2, 9, in(15), 3); \ - sROUND2(D, A, B, C, F2, 7, in( 5), 3); \ - sROUND2(C, D, A, B, F2, 15, in( 1), 3); \ - sROUND2(B, C, D, A, F2, 11, in( 3), 3); \ - sROUND2(A, B, C, D, F2, 8, in( 7), 3); \ - sROUND2(D, A, B, C, F2, 6, in(14), 3); \ - sROUND2(C, D, A, B, F2, 6, in( 6), 3); \ - sROUND2(B, C, D, A, F2, 14, in( 9), 3); \ - sROUND2(A, B, C, D, F2, 12, in(11), 3); \ - sROUND2(D, A, B, C, F2, 13, in( 8), 3); \ - sROUND2(C, D, A, B, F2, 5, in(12), 3); \ - sROUND2(B, C, D, A, F2, 14, in( 2), 3); \ - sROUND2(A, B, C, D, F2, 13, in(10), 3); \ - sROUND2(D, A, B, C, F2, 13, in( 0), 3); \ - sROUND2(C, D, A, B, F2, 7, in( 4), 3); \ - sROUND2(B, C, D, A, F2, 5, in(13), 3); \ - \ - sROUND2(A, B, C, D, F1, 15, in( 8), 4); \ - sROUND2(D, A, B, C, F1, 5, in( 6), 4); \ - sROUND2(C, D, A, B, F1, 8, in( 4), 4); \ - sROUND2(B, C, D, A, F1, 11, in( 1), 4); \ - sROUND2(A, B, C, D, F1, 14, in( 3), 4); \ - sROUND2(D, A, B, C, F1, 14, in(11), 4); \ - sROUND2(C, D, A, B, F1, 6, in(15), 4); \ - sROUND2(B, C, D, A, F1, 14, in( 0), 4); \ - sROUND2(A, B, C, D, F1, 6, in( 5), 4); \ - sROUND2(D, A, B, C, F1, 9, in(12), 4); \ - sROUND2(C, D, A, B, F1, 12, in( 2), 4); \ - sROUND2(B, C, D, A, F1, 9, in(13), 4); \ - sROUND2(A, B, 
C, D, F1, 12, in( 9), 4); \ - sROUND2(D, A, B, C, F1, 5, in( 7), 4); \ - sROUND2(C, D, A, B, F1, 15, in(10), 4); \ - sROUND2(B, C, D, A, F1, 8, in(14), 4); \ - \ - tmp = SPH_T32((h)[1] + C1 + D2); \ - (h)[1] = SPH_T32((h)[2] + D1 + A2); \ - (h)[2] = SPH_T32((h)[3] + A1 + B2); \ - (h)[3] = SPH_T32((h)[0] + B1 + C2); \ - (h)[0] = tmp; \ - } while (0) - -/* - * One round of RIPEMD-128. The data must be aligned for 32-bit access. - */ -static void -ripemd128_round(const unsigned char *data, sph_u32 r[5]) -{ -#if SPH_LITTLE_FAST - -#define RIPEMD128_IN(x) sph_dec32le_aligned(data + (4 * (x))) - -#else - - sph_u32 X_var[16]; - int i; - - for (i = 0; i < 16; i ++) - X_var[i] = sph_dec32le_aligned(data + 4 * i); -#define RIPEMD128_IN(x) X_var[x] - -#endif - RIPEMD128_ROUND_BODY(RIPEMD128_IN, r); -#undef RIPEMD128_IN -} - -/* see sph_ripemd.h */ -void -sph_ripemd128_init(void *cc) -{ - sph_ripemd128_context *sc; - - sc = (sph_ripemd128_context*)cc; - memcpy(sc->val, IV, sizeof sc->val); -#if SPH_64 - sc->count = 0; -#else - sc->count_high = sc->count_low = 0; -#endif -} - -#define RFUN ripemd128_round -#define HASH ripemd128 -#define LE32 1 -#include "algo/sha/md_helper.c" -#undef RFUN -#undef HASH -#undef LE32 - -/* see sph_ripemd.h */ -void -sph_ripemd128_close(void *cc, void *dst) -{ - ripemd128_close(cc, dst, 4); -// sph_ripemd128_init(cc); -} - -/* see sph_ripemd.h */ -void -sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4]) -{ -#define RIPEMD128_IN(x) msg[x] - RIPEMD128_ROUND_BODY(RIPEMD128_IN, val); -#undef RIPEMD128_IN -} - -/* ===================================================================== */ -/* - * RIPEMD-160. - */ - -/* - * Round constants for RIPEMD-160. 
- */ -#define K11 SPH_C32(0x00000000) -#define K12 SPH_C32(0x5A827999) -#define K13 SPH_C32(0x6ED9EBA1) -#define K14 SPH_C32(0x8F1BBCDC) -#define K15 SPH_C32(0xA953FD4E) - -#define K21 SPH_C32(0x50A28BE6) -#define K22 SPH_C32(0x5C4DD124) -#define K23 SPH_C32(0x6D703EF3) -#define K24 SPH_C32(0x7A6D76E9) -#define K25 SPH_C32(0x00000000) - -#define RR(a, b, c, d, e, f, s, r, k) do { \ - a = SPH_T32(ROTL(SPH_T32(a + f(b, c, d) + r + k), s) + e); \ - c = ROTL(c, 10); \ - } while (0) - -#define ROUND1(a, b, c, d, e, f, s, r, k) \ - RR(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k) - -#define ROUND2(a, b, c, d, e, f, s, r, k) \ - RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k) - -/* - * This macro defines the body for a RIPEMD-160 compression function - * implementation. The "in" parameter should evaluate, when applied to a - * numerical input parameter from 0 to 15, to an expression which yields - * the corresponding input block. The "h" parameter should evaluate to - * an array or pointer expression designating the array of 5 words which - * contains the input and output of the compression function. 
- */ - -#define RIPEMD160_ROUND_BODY(in, h) do { \ - sph_u32 A1, B1, C1, D1, E1; \ - sph_u32 A2, B2, C2, D2, E2; \ - sph_u32 tmp; \ - \ - A1 = A2 = (h)[0]; \ - B1 = B2 = (h)[1]; \ - C1 = C2 = (h)[2]; \ - D1 = D2 = (h)[3]; \ - E1 = E2 = (h)[4]; \ - \ - ROUND1(A, B, C, D, E, F1, 11, in( 0), 1); \ - ROUND1(E, A, B, C, D, F1, 14, in( 1), 1); \ - ROUND1(D, E, A, B, C, F1, 15, in( 2), 1); \ - ROUND1(C, D, E, A, B, F1, 12, in( 3), 1); \ - ROUND1(B, C, D, E, A, F1, 5, in( 4), 1); \ - ROUND1(A, B, C, D, E, F1, 8, in( 5), 1); \ - ROUND1(E, A, B, C, D, F1, 7, in( 6), 1); \ - ROUND1(D, E, A, B, C, F1, 9, in( 7), 1); \ - ROUND1(C, D, E, A, B, F1, 11, in( 8), 1); \ - ROUND1(B, C, D, E, A, F1, 13, in( 9), 1); \ - ROUND1(A, B, C, D, E, F1, 14, in(10), 1); \ - ROUND1(E, A, B, C, D, F1, 15, in(11), 1); \ - ROUND1(D, E, A, B, C, F1, 6, in(12), 1); \ - ROUND1(C, D, E, A, B, F1, 7, in(13), 1); \ - ROUND1(B, C, D, E, A, F1, 9, in(14), 1); \ - ROUND1(A, B, C, D, E, F1, 8, in(15), 1); \ - \ - ROUND1(E, A, B, C, D, F2, 7, in( 7), 2); \ - ROUND1(D, E, A, B, C, F2, 6, in( 4), 2); \ - ROUND1(C, D, E, A, B, F2, 8, in(13), 2); \ - ROUND1(B, C, D, E, A, F2, 13, in( 1), 2); \ - ROUND1(A, B, C, D, E, F2, 11, in(10), 2); \ - ROUND1(E, A, B, C, D, F2, 9, in( 6), 2); \ - ROUND1(D, E, A, B, C, F2, 7, in(15), 2); \ - ROUND1(C, D, E, A, B, F2, 15, in( 3), 2); \ - ROUND1(B, C, D, E, A, F2, 7, in(12), 2); \ - ROUND1(A, B, C, D, E, F2, 12, in( 0), 2); \ - ROUND1(E, A, B, C, D, F2, 15, in( 9), 2); \ - ROUND1(D, E, A, B, C, F2, 9, in( 5), 2); \ - ROUND1(C, D, E, A, B, F2, 11, in( 2), 2); \ - ROUND1(B, C, D, E, A, F2, 7, in(14), 2); \ - ROUND1(A, B, C, D, E, F2, 13, in(11), 2); \ - ROUND1(E, A, B, C, D, F2, 12, in( 8), 2); \ - \ - ROUND1(D, E, A, B, C, F3, 11, in( 3), 3); \ - ROUND1(C, D, E, A, B, F3, 13, in(10), 3); \ - ROUND1(B, C, D, E, A, F3, 6, in(14), 3); \ - ROUND1(A, B, C, D, E, F3, 7, in( 4), 3); \ - ROUND1(E, A, B, C, D, F3, 14, in( 9), 3); \ - ROUND1(D, E, A, B, C, F3, 9, in(15), 3); \ - ROUND1(C, 
D, E, A, B, F3, 13, in( 8), 3); \ - ROUND1(B, C, D, E, A, F3, 15, in( 1), 3); \ - ROUND1(A, B, C, D, E, F3, 14, in( 2), 3); \ - ROUND1(E, A, B, C, D, F3, 8, in( 7), 3); \ - ROUND1(D, E, A, B, C, F3, 13, in( 0), 3); \ - ROUND1(C, D, E, A, B, F3, 6, in( 6), 3); \ - ROUND1(B, C, D, E, A, F3, 5, in(13), 3); \ - ROUND1(A, B, C, D, E, F3, 12, in(11), 3); \ - ROUND1(E, A, B, C, D, F3, 7, in( 5), 3); \ - ROUND1(D, E, A, B, C, F3, 5, in(12), 3); \ - \ - ROUND1(C, D, E, A, B, F4, 11, in( 1), 4); \ - ROUND1(B, C, D, E, A, F4, 12, in( 9), 4); \ - ROUND1(A, B, C, D, E, F4, 14, in(11), 4); \ - ROUND1(E, A, B, C, D, F4, 15, in(10), 4); \ - ROUND1(D, E, A, B, C, F4, 14, in( 0), 4); \ - ROUND1(C, D, E, A, B, F4, 15, in( 8), 4); \ - ROUND1(B, C, D, E, A, F4, 9, in(12), 4); \ - ROUND1(A, B, C, D, E, F4, 8, in( 4), 4); \ - ROUND1(E, A, B, C, D, F4, 9, in(13), 4); \ - ROUND1(D, E, A, B, C, F4, 14, in( 3), 4); \ - ROUND1(C, D, E, A, B, F4, 5, in( 7), 4); \ - ROUND1(B, C, D, E, A, F4, 6, in(15), 4); \ - ROUND1(A, B, C, D, E, F4, 8, in(14), 4); \ - ROUND1(E, A, B, C, D, F4, 6, in( 5), 4); \ - ROUND1(D, E, A, B, C, F4, 5, in( 6), 4); \ - ROUND1(C, D, E, A, B, F4, 12, in( 2), 4); \ - \ - ROUND1(B, C, D, E, A, F5, 9, in( 4), 5); \ - ROUND1(A, B, C, D, E, F5, 15, in( 0), 5); \ - ROUND1(E, A, B, C, D, F5, 5, in( 5), 5); \ - ROUND1(D, E, A, B, C, F5, 11, in( 9), 5); \ - ROUND1(C, D, E, A, B, F5, 6, in( 7), 5); \ - ROUND1(B, C, D, E, A, F5, 8, in(12), 5); \ - ROUND1(A, B, C, D, E, F5, 13, in( 2), 5); \ - ROUND1(E, A, B, C, D, F5, 12, in(10), 5); \ - ROUND1(D, E, A, B, C, F5, 5, in(14), 5); \ - ROUND1(C, D, E, A, B, F5, 12, in( 1), 5); \ - ROUND1(B, C, D, E, A, F5, 13, in( 3), 5); \ - ROUND1(A, B, C, D, E, F5, 14, in( 8), 5); \ - ROUND1(E, A, B, C, D, F5, 11, in(11), 5); \ - ROUND1(D, E, A, B, C, F5, 8, in( 6), 5); \ - ROUND1(C, D, E, A, B, F5, 5, in(15), 5); \ - ROUND1(B, C, D, E, A, F5, 6, in(13), 5); \ - \ - ROUND2(A, B, C, D, E, F5, 8, in( 5), 1); \ - ROUND2(E, A, B, C, D, F5, 9, in(14), 1); 
\ - ROUND2(D, E, A, B, C, F5, 9, in( 7), 1); \ - ROUND2(C, D, E, A, B, F5, 11, in( 0), 1); \ - ROUND2(B, C, D, E, A, F5, 13, in( 9), 1); \ - ROUND2(A, B, C, D, E, F5, 15, in( 2), 1); \ - ROUND2(E, A, B, C, D, F5, 15, in(11), 1); \ - ROUND2(D, E, A, B, C, F5, 5, in( 4), 1); \ - ROUND2(C, D, E, A, B, F5, 7, in(13), 1); \ - ROUND2(B, C, D, E, A, F5, 7, in( 6), 1); \ - ROUND2(A, B, C, D, E, F5, 8, in(15), 1); \ - ROUND2(E, A, B, C, D, F5, 11, in( 8), 1); \ - ROUND2(D, E, A, B, C, F5, 14, in( 1), 1); \ - ROUND2(C, D, E, A, B, F5, 14, in(10), 1); \ - ROUND2(B, C, D, E, A, F5, 12, in( 3), 1); \ - ROUND2(A, B, C, D, E, F5, 6, in(12), 1); \ - \ - ROUND2(E, A, B, C, D, F4, 9, in( 6), 2); \ - ROUND2(D, E, A, B, C, F4, 13, in(11), 2); \ - ROUND2(C, D, E, A, B, F4, 15, in( 3), 2); \ - ROUND2(B, C, D, E, A, F4, 7, in( 7), 2); \ - ROUND2(A, B, C, D, E, F4, 12, in( 0), 2); \ - ROUND2(E, A, B, C, D, F4, 8, in(13), 2); \ - ROUND2(D, E, A, B, C, F4, 9, in( 5), 2); \ - ROUND2(C, D, E, A, B, F4, 11, in(10), 2); \ - ROUND2(B, C, D, E, A, F4, 7, in(14), 2); \ - ROUND2(A, B, C, D, E, F4, 7, in(15), 2); \ - ROUND2(E, A, B, C, D, F4, 12, in( 8), 2); \ - ROUND2(D, E, A, B, C, F4, 7, in(12), 2); \ - ROUND2(C, D, E, A, B, F4, 6, in( 4), 2); \ - ROUND2(B, C, D, E, A, F4, 15, in( 9), 2); \ - ROUND2(A, B, C, D, E, F4, 13, in( 1), 2); \ - ROUND2(E, A, B, C, D, F4, 11, in( 2), 2); \ - \ - ROUND2(D, E, A, B, C, F3, 9, in(15), 3); \ - ROUND2(C, D, E, A, B, F3, 7, in( 5), 3); \ - ROUND2(B, C, D, E, A, F3, 15, in( 1), 3); \ - ROUND2(A, B, C, D, E, F3, 11, in( 3), 3); \ - ROUND2(E, A, B, C, D, F3, 8, in( 7), 3); \ - ROUND2(D, E, A, B, C, F3, 6, in(14), 3); \ - ROUND2(C, D, E, A, B, F3, 6, in( 6), 3); \ - ROUND2(B, C, D, E, A, F3, 14, in( 9), 3); \ - ROUND2(A, B, C, D, E, F3, 12, in(11), 3); \ - ROUND2(E, A, B, C, D, F3, 13, in( 8), 3); \ - ROUND2(D, E, A, B, C, F3, 5, in(12), 3); \ - ROUND2(C, D, E, A, B, F3, 14, in( 2), 3); \ - ROUND2(B, C, D, E, A, F3, 13, in(10), 3); \ - ROUND2(A, B, C, D, E, F3, 13, 
in( 0), 3); \ - ROUND2(E, A, B, C, D, F3, 7, in( 4), 3); \ - ROUND2(D, E, A, B, C, F3, 5, in(13), 3); \ - \ - ROUND2(C, D, E, A, B, F2, 15, in( 8), 4); \ - ROUND2(B, C, D, E, A, F2, 5, in( 6), 4); \ - ROUND2(A, B, C, D, E, F2, 8, in( 4), 4); \ - ROUND2(E, A, B, C, D, F2, 11, in( 1), 4); \ - ROUND2(D, E, A, B, C, F2, 14, in( 3), 4); \ - ROUND2(C, D, E, A, B, F2, 14, in(11), 4); \ - ROUND2(B, C, D, E, A, F2, 6, in(15), 4); \ - ROUND2(A, B, C, D, E, F2, 14, in( 0), 4); \ - ROUND2(E, A, B, C, D, F2, 6, in( 5), 4); \ - ROUND2(D, E, A, B, C, F2, 9, in(12), 4); \ - ROUND2(C, D, E, A, B, F2, 12, in( 2), 4); \ - ROUND2(B, C, D, E, A, F2, 9, in(13), 4); \ - ROUND2(A, B, C, D, E, F2, 12, in( 9), 4); \ - ROUND2(E, A, B, C, D, F2, 5, in( 7), 4); \ - ROUND2(D, E, A, B, C, F2, 15, in(10), 4); \ - ROUND2(C, D, E, A, B, F2, 8, in(14), 4); \ - \ - ROUND2(B, C, D, E, A, F1, 8, in(12), 5); \ - ROUND2(A, B, C, D, E, F1, 5, in(15), 5); \ - ROUND2(E, A, B, C, D, F1, 12, in(10), 5); \ - ROUND2(D, E, A, B, C, F1, 9, in( 4), 5); \ - ROUND2(C, D, E, A, B, F1, 12, in( 1), 5); \ - ROUND2(B, C, D, E, A, F1, 5, in( 5), 5); \ - ROUND2(A, B, C, D, E, F1, 14, in( 8), 5); \ - ROUND2(E, A, B, C, D, F1, 6, in( 7), 5); \ - ROUND2(D, E, A, B, C, F1, 8, in( 6), 5); \ - ROUND2(C, D, E, A, B, F1, 13, in( 2), 5); \ - ROUND2(B, C, D, E, A, F1, 6, in(13), 5); \ - ROUND2(A, B, C, D, E, F1, 5, in(14), 5); \ - ROUND2(E, A, B, C, D, F1, 15, in( 0), 5); \ - ROUND2(D, E, A, B, C, F1, 13, in( 3), 5); \ - ROUND2(C, D, E, A, B, F1, 11, in( 9), 5); \ - ROUND2(B, C, D, E, A, F1, 11, in(11), 5); \ - \ - tmp = SPH_T32((h)[1] + C1 + D2); \ - (h)[1] = SPH_T32((h)[2] + D1 + E2); \ - (h)[2] = SPH_T32((h)[3] + E1 + A2); \ - (h)[3] = SPH_T32((h)[4] + A1 + B2); \ - (h)[4] = SPH_T32((h)[0] + B1 + C2); \ - (h)[0] = tmp; \ - } while (0) - -/* - * One round of RIPEMD-160. The data must be aligned for 32-bit access. 
- */ -static void -ripemd160_round(const unsigned char *data, sph_u32 r[5]) -{ -#if SPH_LITTLE_FAST - -#define RIPEMD160_IN(x) sph_dec32le_aligned(data + (4 * (x))) - -#else - - sph_u32 X_var[16]; - int i; - - for (i = 0; i < 16; i ++) - X_var[i] = sph_dec32le_aligned(data + 4 * i); -#define RIPEMD160_IN(x) X_var[x] - -#endif - RIPEMD160_ROUND_BODY(RIPEMD160_IN, r); -#undef RIPEMD160_IN -} - -/* see sph_ripemd.h */ -void -sph_ripemd160_init(void *cc) -{ - sph_ripemd160_context *sc; - - sc = (sph_ripemd160_context*)cc; - memcpy(sc->val, IV, sizeof sc->val); -#if SPH_64 - sc->count = 0; -#else - sc->count_high = sc->count_low = 0; -#endif -} - -#define RFUN ripemd160_round -#define HASH ripemd160 -#define LE32 1 -#include "algo/sha/md_helper.c" -#undef RFUN -#undef HASH -#undef LE32 - -/* see sph_ripemd.h */ -void -sph_ripemd160_close(void *cc, void *dst) -{ - ripemd160_close(cc, dst, 5); -// sph_ripemd160_init(cc); -} - -/* see sph_ripemd.h */ -void -sph_ripemd160_comp(const sph_u32 msg[16], sph_u32 val[5]) -{ -#define RIPEMD160_IN(x) msg[x] - RIPEMD160_ROUND_BODY(RIPEMD160_IN, val); -#undef RIPEMD160_IN -} - diff --git a/algo/ripemd/sph_ripemd.h b/algo/ripemd/sph_ripemd.h deleted file mode 100644 index 39fe5d1..0000000 --- a/algo/ripemd/sph_ripemd.h +++ /dev/null @@ -1,274 +0,0 @@ -/* $Id: sph_ripemd.h 216 2010-06-08 09:46:57Z tp $ */ -/** - * RIPEMD, RIPEMD-128 and RIPEMD-160 interface. - * - * RIPEMD was first described in: Research and Development in Advanced - * Communication Technologies in Europe, "RIPE Integrity Primitives: - * Final Report of RACE Integrity Primitives Evaluation (R1040)", RACE, - * June 1992. - * - * A new, strengthened version, dubbed RIPEMD-160, was published in: H. - * Dobbertin, A. Bosselaers, and B. Preneel, "RIPEMD-160, a strengthened - * version of RIPEMD", Fast Software Encryption - FSE'96, LNCS 1039, - * Springer (1996), pp. 71--82. 
- * - * This article describes both RIPEMD-160, with a 160-bit output, and a - * reduced version called RIPEMD-128, which has a 128-bit output. RIPEMD-128 - * was meant as a "drop-in" replacement for any hash function with 128-bit - * output, especially the original RIPEMD. - * - * @warning Collisions, and an efficient method to build other collisions, - * have been published for the original RIPEMD, which is thus considered as - * cryptographically broken. It is also very rarely encountered, and there - * seems to exist no free description or implementation of RIPEMD (except - * the sphlib code, of course). As of january 2007, RIPEMD-128 and RIPEMD-160 - * seem as secure as their output length allows. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @file sph_ripemd.h - * @author Thomas Pornin - */ - -#ifndef SPH_RIPEMD_H__ -#define SPH_RIPEMD_H__ - -#include -#include "algo/sha/sph_types.h" - -/** - * Output size (in bits) for RIPEMD. - */ -#define SPH_SIZE_ripemd 128 - -/** - * Output size (in bits) for RIPEMD-128. - */ -#define SPH_SIZE_ripemd128 128 - -/** - * Output size (in bits) for RIPEMD-160. - */ -#define SPH_SIZE_ripemd160 160 - -/** - * This structure is a context for RIPEMD computations: it contains the - * intermediate values and some data from the last entered block. Once - * a RIPEMD computation has been performed, the context can be reused for - * another computation. - * - * The contents of this structure are private. A running RIPEMD computation - * can be cloned by copying the context (e.g. with a simple - * memcpy()). - */ -typedef struct { -#ifndef DOXYGEN_IGNORE - unsigned char buf[64]; /* first field, for alignment */ - sph_u32 val[4]; -#if SPH_64 - sph_u64 count; -#else - sph_u32 count_high, count_low; -#endif -#endif -} sph_ripemd_context; - -/** - * Initialize a RIPEMD context. This process performs no memory allocation. - * - * @param cc the RIPEMD context (pointer to - * a sph_ripemd_context) - */ -void sph_ripemd_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). - * - * @param cc the RIPEMD context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_ripemd(void *cc, const void *data, size_t len); - -/** - * Terminate the current RIPEMD computation and output the result into the - * provided buffer. The destination buffer must be wide enough to - * accomodate the result (16 bytes). The context is automatically - * reinitialized. 
- * - * @param cc the RIPEMD context - * @param dst the destination buffer - */ -void sph_ripemd_close(void *cc, void *dst); - -/** - * Apply the RIPEMD compression function on the provided data. The - * msg parameter contains the 16 32-bit input blocks, - * as numerical values (hence after the little-endian decoding). The - * val parameter contains the 5 32-bit input blocks for - * the compression function; the output is written in place in this - * array. - * - * @param msg the message block (16 values) - * @param val the function 128-bit input and output - */ -void sph_ripemd_comp(const sph_u32 msg[16], sph_u32 val[4]); - -/* ===================================================================== */ - -/** - * This structure is a context for RIPEMD-128 computations: it contains the - * intermediate values and some data from the last entered block. Once - * a RIPEMD-128 computation has been performed, the context can be reused for - * another computation. - * - * The contents of this structure are private. A running RIPEMD-128 computation - * can be cloned by copying the context (e.g. with a simple - * memcpy()). - */ -typedef struct { -#ifndef DOXYGEN_IGNORE - unsigned char buf[64]; /* first field, for alignment */ - sph_u32 val[4]; -#if SPH_64 - sph_u64 count; -#else - sph_u32 count_high, count_low; -#endif -#endif -} sph_ripemd128_context; - -/** - * Initialize a RIPEMD-128 context. This process performs no memory allocation. - * - * @param cc the RIPEMD-128 context (pointer to - * a sph_ripemd128_context) - */ -void sph_ripemd128_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). - * - * @param cc the RIPEMD-128 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_ripemd128(void *cc, const void *data, size_t len); - -/** - * Terminate the current RIPEMD-128 computation and output the result into the - * provided buffer. 
The destination buffer must be wide enough to - * accomodate the result (16 bytes). The context is automatically - * reinitialized. - * - * @param cc the RIPEMD-128 context - * @param dst the destination buffer - */ -void sph_ripemd128_close(void *cc, void *dst); - -/** - * Apply the RIPEMD-128 compression function on the provided data. The - * msg parameter contains the 16 32-bit input blocks, - * as numerical values (hence after the little-endian decoding). The - * val parameter contains the 5 32-bit input blocks for - * the compression function; the output is written in place in this - * array. - * - * @param msg the message block (16 values) - * @param val the function 128-bit input and output - */ -void sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4]); - -/* ===================================================================== */ - -/** - * This structure is a context for RIPEMD-160 computations: it contains the - * intermediate values and some data from the last entered block. Once - * a RIPEMD-160 computation has been performed, the context can be reused for - * another computation. - * - * The contents of this structure are private. A running RIPEMD-160 computation - * can be cloned by copying the context (e.g. with a simple - * memcpy()). - */ -typedef struct { -#ifndef DOXYGEN_IGNORE - unsigned char buf[64]; /* first field, for alignment */ - sph_u32 val[5]; -#if SPH_64 - sph_u64 count; -#else - sph_u32 count_high, count_low; -#endif -#endif -} sph_ripemd160_context; - -/** - * Initialize a RIPEMD-160 context. This process performs no memory allocation. - * - * @param cc the RIPEMD-160 context (pointer to - * a sph_ripemd160_context) - */ -void sph_ripemd160_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). 
- * - * @param cc the RIPEMD-160 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_ripemd160(void *cc, const void *data, size_t len); - -/** - * Terminate the current RIPEMD-160 computation and output the result into the - * provided buffer. The destination buffer must be wide enough to - * accomodate the result (20 bytes). The context is automatically - * reinitialized. - * - * @param cc the RIPEMD-160 context - * @param dst the destination buffer - */ -void sph_ripemd160_close(void *cc, void *dst); - -/** - * Apply the RIPEMD-160 compression function on the provided data. The - * msg parameter contains the 16 32-bit input blocks, - * as numerical values (hence after the little-endian decoding). The - * val parameter contains the 5 32-bit input blocks for - * the compression function; the output is written in place in this - * array. - * - * @param msg the message block (16 values) - * @param val the function 160-bit input and output - */ -void sph_ripemd160_comp(const sph_u32 msg[16], sph_u32 val[5]); - -#endif - diff --git a/algo/scrypt/neoscrypt.c b/algo/scrypt/neoscrypt.c deleted file mode 100644 index 3349afb..0000000 --- a/algo/scrypt/neoscrypt.c +++ /dev/null @@ -1,1102 +0,0 @@ -/* - * Copyright (c) 2009 Colin Percival, 2011 ArtForz - * Copyright (c) 2012 Andrew Moon (floodyberry) - * Copyright (c) 2012 Samuel Neves - * Copyright (c) 2014 John Doering - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ -#include -#include -#include -#include - -#include "algo-gate-api.h" - -#define USE_CUSTOM_BLAKE2S -// TODO: try blake2sp -//#include "crypto/blake2s.h" - -#define STACK_ALIGN 0x40 - -#ifdef _MSC_VER // todo: msvc -#define ASM 0 -#elif defined(__arm__) -#define ASM 0 -#endif - -#ifdef __GNUC__ -#if defined(NOASM) || defined(__arm__) -#define ASM 0 -#else -#define ASM 1 -#endif -#endif - -#if (WINDOWS) -/* sizeof(unsigned long) = 4 for MinGW64 */ -typedef unsigned long long ulong; -#else -typedef unsigned long ulong; -#endif -typedef unsigned int uint; - - -#define MIN(a, b) ((a) < (b) ? a : b) -#define MAX(a, b) ((a) > (b) ? 
a : b) - -#define SCRYPT_BLOCK_SIZE 64U -#define SCRYPT_HASH_BLOCK_SIZE 64U -#define SCRYPT_HASH_DIGEST_SIZE 32U - -#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) -#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) - -#define U8TO32_BE(p) \ - (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ - ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]))) - -#define U32TO8_BE(p, v) \ - (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ - (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); - -#define U64TO8_BE(p, v) \ - U32TO8_BE((p), (uint32_t)((v) >> 32)); \ - U32TO8_BE((p) + 4, (uint32_t)((v) )); - - -typedef uint8_t hash_digest[SCRYPT_HASH_DIGEST_SIZE] __attribute__ ((aligned (16))); - - -/* SHA-256 */ - -static const uint32_t sha256_constants[64] __attribute__ ((aligned (16))) = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - -#define Ch(x,y,z) (z ^ (x & (y ^ z))) -#define Maj(x,y,z) (((x | y) & z) | (x & y)) -#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) -#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25)) -#define G0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ (x >> 3)) -#define G1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ (x >> 10)) -#define W0(in,i) (U8TO32_BE(&in[i * 4])) -#define W1(i) 
(G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16]) -#define STEP(i) \ - t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \ - t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha256_constants[i] + w[i]; \ - r[7] = r[6]; \ - r[6] = r[5]; \ - r[5] = r[4]; \ - r[4] = r[3] + t0; \ - r[3] = r[2]; \ - r[2] = r[1]; \ - r[1] = r[0]; \ - r[0] = t0 + t1; - - -typedef struct sha256_hash_state_t { - uint32_t H[8] __attribute__ ((aligned (16))); - uint64_t T; - uint32_t leftover; - uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE] __attribute__ ((aligned (16))); -} sha256_hash_state; - - -static void sha256_blocks(sha256_hash_state *S, const uint8_t *in, size_t blocks) { - uint32_t r[8], w[64], t0, t1; - size_t i; - - for(i = 0; i < 8; i++) - r[i] = S->H[i]; - - while(blocks--) { - - for(i = 0; i < 16; i++) { - w[i] = W0(in, i); - } - for(i = 16; i < 64; i++) { - w[i] = W1(i); - } - - for(i = 0; i < 64; i++) { - STEP(i); - } - - for(i = 0; i < 8; i++) { - r[i] += S->H[i]; - S->H[i] = r[i]; - } - - S->T += SCRYPT_HASH_BLOCK_SIZE * 8; - in += SCRYPT_HASH_BLOCK_SIZE; - } -} - -static void neoscrypt_hash_init_sha256(sha256_hash_state *S) { - S->H[0] = 0x6a09e667; - S->H[1] = 0xbb67ae85; - S->H[2] = 0x3c6ef372; - S->H[3] = 0xa54ff53a; - S->H[4] = 0x510e527f; - S->H[5] = 0x9b05688c; - S->H[6] = 0x1f83d9ab; - S->H[7] = 0x5be0cd19; - S->T = 0; - S->leftover = 0; -} - -static void neoscrypt_hash_update_sha256(sha256_hash_state *S, const uint8_t *in, size_t inlen) { - size_t blocks, want; - - /* handle the previous data */ - if(S->leftover) { - want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); - want = (want < inlen) ? 
want : inlen; - memcpy(S->buffer + S->leftover, in, want); - S->leftover += (uint32_t)want; - if(S->leftover < SCRYPT_HASH_BLOCK_SIZE) - return; - in += want; - inlen -= want; - sha256_blocks(S, S->buffer, 1); - } - - /* handle the current data */ - blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); - S->leftover = (uint32_t)(inlen - blocks); - if(blocks) { - sha256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); - in += blocks; - } - - /* handle leftover data */ - if(S->leftover) - memcpy(S->buffer, in, S->leftover); -} - -static void neoscrypt_hash_finish_sha256(sha256_hash_state *S, uint8_t *hash) { - uint64_t t = S->T + (S->leftover * 8); - - S->buffer[S->leftover] = 0x80; - if(S->leftover <= 55) { - memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover); - } else { - memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover); - sha256_blocks(S, S->buffer, 1); - memset(S->buffer, 0, 56); - } - - U64TO8_BE(S->buffer + 56, t); - sha256_blocks(S, S->buffer, 1); - - U32TO8_BE(&hash[ 0], S->H[0]); - U32TO8_BE(&hash[ 4], S->H[1]); - U32TO8_BE(&hash[ 8], S->H[2]); - U32TO8_BE(&hash[12], S->H[3]); - U32TO8_BE(&hash[16], S->H[4]); - U32TO8_BE(&hash[20], S->H[5]); - U32TO8_BE(&hash[24], S->H[6]); - U32TO8_BE(&hash[28], S->H[7]); -} - -static void neoscrypt_hash_sha256(hash_digest hash, const uint8_t *m, size_t mlen) { - sha256_hash_state st; - neoscrypt_hash_init_sha256(&st); - neoscrypt_hash_update_sha256(&st, m, mlen); - neoscrypt_hash_finish_sha256(&st, hash); -} - - -/* HMAC for SHA-256 */ - -typedef struct sha256_hmac_state_t { - sha256_hash_state inner, outer; -} sha256_hmac_state; - -static void neoscrypt_hmac_init_sha256(sha256_hmac_state *st, const uint8_t *key, size_t keylen) { - uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] __attribute__ ((aligned (16))) = {0}; - size_t i; - - neoscrypt_hash_init_sha256(&st->inner); - neoscrypt_hash_init_sha256(&st->outer); - - if(keylen <= SCRYPT_HASH_BLOCK_SIZE) { - /* use the key directly if it's <= blocksize bytes */ - memcpy(pad, 
key, keylen); - } else { - /* if it's > blocksize bytes, hash it */ - neoscrypt_hash_sha256(pad, key, keylen); - } - - /* inner = (key ^ 0x36) */ - /* h(inner || ...) */ - for(i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) - pad[i] ^= 0x36; - - neoscrypt_hash_update_sha256(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE); - - /* outer = (key ^ 0x5c) */ - /* h(outer || ...) */ - for(i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) - pad[i] ^= (0x5c ^ 0x36); - - neoscrypt_hash_update_sha256(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE); -} - -static void neoscrypt_hmac_update_sha256(sha256_hmac_state *st, const uint8_t *m, size_t mlen) { - /* h(inner || m...) */ - neoscrypt_hash_update_sha256(&st->inner, m, mlen); -} - -static void neoscrypt_hmac_finish_sha256(sha256_hmac_state *st, hash_digest mac) { - /* h(inner || m) */ - hash_digest innerhash; - neoscrypt_hash_finish_sha256(&st->inner, innerhash); - - /* h(outer || h(inner || m)) */ - neoscrypt_hash_update_sha256(&st->outer, innerhash, sizeof(innerhash)); - neoscrypt_hash_finish_sha256(&st->outer, mac); -} - - -/* PBKDF2 for SHA-256 */ - -static void neoscrypt_pbkdf2_sha256(const uint8_t *password, size_t password_len, - const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *output, size_t output_len) { - sha256_hmac_state hmac_pw, hmac_pw_salt, work; - hash_digest ti, u; - uint8_t be[4]; - uint32_t i, j, k, blocks; - - /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ - - /* hmac(password, ...) */ - neoscrypt_hmac_init_sha256(&hmac_pw, password, password_len); - - /* hmac(password, salt...) 
*/ - hmac_pw_salt = hmac_pw; - neoscrypt_hmac_update_sha256(&hmac_pw_salt, salt, salt_len); - - blocks = ((uint32_t)output_len + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; - for(i = 1; i <= blocks; i++) { - /* U1 = hmac(password, salt || be(i)) */ - U32TO8_BE(be, i); - work = hmac_pw_salt; - neoscrypt_hmac_update_sha256(&work, be, 4); - neoscrypt_hmac_finish_sha256(&work, ti); - memcpy(u, ti, sizeof(u)); - - /* T[i] = U1 ^ U2 ^ U3... */ - for(j = 0; j < N - 1; j++) { - /* UX = hmac(password, U{X-1}) */ - work = hmac_pw; - neoscrypt_hmac_update_sha256(&work, u, SCRYPT_HASH_DIGEST_SIZE); - neoscrypt_hmac_finish_sha256(&work, u); - - /* T[i] ^= UX */ - for(k = 0; k < sizeof(u); k++) - ti[k] ^= u[k]; - } - - memcpy(output, ti, (output_len > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : output_len); - output += SCRYPT_HASH_DIGEST_SIZE; - output_len -= SCRYPT_HASH_DIGEST_SIZE; - } -} - - -/* NeoScrypt */ - -#if (ASM) - -extern void neoscrypt_salsa(uint *X, uint rounds); -extern void neoscrypt_salsa_tangle(uint *X, uint count); -extern void neoscrypt_chacha(uint *X, uint rounds); - -extern void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len); -extern void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len); -extern void neoscrypt_blkxor(void *dstp, const void *srcp, uint len); - -#else - -/* Salsa20, rounds must be a multiple of 2 */ -static void neoscrypt_salsa(uint *X, uint rounds) { - uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, t; - - x0 = X[0]; x1 = X[1]; x2 = X[2]; x3 = X[3]; - x4 = X[4]; x5 = X[5]; x6 = X[6]; x7 = X[7]; - x8 = X[8]; x9 = X[9]; x10 = X[10]; x11 = X[11]; - x12 = X[12]; x13 = X[13]; x14 = X[14]; x15 = X[15]; - -#define quarter(a, b, c, d) \ - t = a + d; t = ROTL32(t, 7); b ^= t; \ - t = b + a; t = ROTL32(t, 9); c ^= t; \ - t = c + b; t = ROTL32(t, 13); d ^= t; \ - t = d + c; t = ROTL32(t, 18); a ^= t; - - for(; rounds; rounds -= 2) { - quarter( x0, x4, x8, x12); - quarter( x5, x9, x13, x1); 
- quarter(x10, x14, x2, x6); - quarter(x15, x3, x7, x11); - quarter( x0, x1, x2, x3); - quarter( x5, x6, x7, x4); - quarter(x10, x11, x8, x9); - quarter(x15, x12, x13, x14); - } - - X[0] += x0; X[1] += x1; X[2] += x2; X[3] += x3; - X[4] += x4; X[5] += x5; X[6] += x6; X[7] += x7; - X[8] += x8; X[9] += x9; X[10] += x10; X[11] += x11; - X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15; - -#undef quarter -} - -/* ChaCha20, rounds must be a multiple of 2 */ -static void neoscrypt_chacha(uint *X, uint rounds) { - uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, t; - - x0 = X[0]; x1 = X[1]; x2 = X[2]; x3 = X[3]; - x4 = X[4]; x5 = X[5]; x6 = X[6]; x7 = X[7]; - x8 = X[8]; x9 = X[9]; x10 = X[10]; x11 = X[11]; - x12 = X[12]; x13 = X[13]; x14 = X[14]; x15 = X[15]; - -#define quarter(a,b,c,d) \ - a += b; t = d ^ a; d = ROTL32(t, 16); \ - c += d; t = b ^ c; b = ROTL32(t, 12); \ - a += b; t = d ^ a; d = ROTL32(t, 8); \ - c += d; t = b ^ c; b = ROTL32(t, 7); - - for(; rounds; rounds -= 2) { - quarter( x0, x4, x8, x12); - quarter( x1, x5, x9, x13); - quarter( x2, x6, x10, x14); - quarter( x3, x7, x11, x15); - quarter( x0, x5, x10, x15); - quarter( x1, x6, x11, x12); - quarter( x2, x7, x8, x13); - quarter( x3, x4, x9, x14); - } - - X[0] += x0; X[1] += x1; X[2] += x2; X[3] += x3; - X[4] += x4; X[5] += x5; X[6] += x6; X[7] += x7; - X[8] += x8; X[9] += x9; X[10] += x10; X[11] += x11; - X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15; - -#undef quarter -} - - -/* Fast 32-bit / 64-bit memcpy(); - * len must be a multiple of 32 bytes */ -static void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len) { - ulong *dst = (ulong *) dstp; - ulong *src = (ulong *) srcp; - uint i; - - for(i = 0; i < (len / sizeof(ulong)); i += 4) { - dst[i] = src[i]; - dst[i + 1] = src[i + 1]; - dst[i + 2] = src[i + 2]; - dst[i + 3] = src[i + 3]; - } -} - -/* Fast 32-bit / 64-bit block swapper; - * len must be a multiple of 32 bytes */ -static void neoscrypt_blkswp(void 
*blkAp, void *blkBp, uint len) { - ulong *blkA = (ulong *) blkAp; - ulong *blkB = (ulong *) blkBp; - register ulong t0, t1, t2, t3; - uint i; - - for(i = 0; i < (len / sizeof(ulong)); i += 4) { - t0 = blkA[i]; - t1 = blkA[i + 1]; - t2 = blkA[i + 2]; - t3 = blkA[i + 3]; - blkA[i] = blkB[i]; - blkA[i + 1] = blkB[i + 1]; - blkA[i + 2] = blkB[i + 2]; - blkA[i + 3] = blkB[i + 3]; - blkB[i] = t0; - blkB[i + 1] = t1; - blkB[i + 2] = t2; - blkB[i + 3] = t3; - } -} - -/* Fast 32-bit / 64-bit block XOR engine; - * len must be a multiple of 32 bytes */ -static void neoscrypt_blkxor(void *dstp, const void *srcp, uint len) { - ulong *dst = (ulong *) dstp; - ulong *src = (ulong *) srcp; - uint i; - - for(i = 0; i < (len / sizeof(ulong)); i += 4) { - dst[i] ^= src[i]; - dst[i + 1] ^= src[i + 1]; - dst[i + 2] ^= src[i + 2]; - dst[i + 3] ^= src[i + 3]; - } -} - -#endif - -/* 32-bit / 64-bit optimised memcpy() */ -static void neoscrypt_copy(void *dstp, const void *srcp, uint len) { - ulong *dst = (ulong *) dstp; - ulong *src = (ulong *) srcp; - uint i, tail; - - for(i = 0; i < (len / sizeof(ulong)); i++) - dst[i] = src[i]; - - tail = len & (sizeof(ulong) - 1); - if(tail) { - uchar *dstb = (uchar *) dstp; - uchar *srcb = (uchar *) srcp; - - for(i = len - tail; i < len; i++) - dstb[i] = srcb[i]; - } -} - -/* 32-bit / 64-bit optimised memory erase aka memset() to zero */ -static void neoscrypt_erase(void *dstp, uint len) { - const ulong null = 0; - ulong *dst = (ulong *) dstp; - uint i, tail; - - for(i = 0; i < (len / sizeof(ulong)); i++) - dst[i] = null; - - tail = len & (sizeof(ulong) - 1); - if(tail) { - uchar *dstb = (uchar *) dstp; - - for(i = len - tail; i < len; i++) - dstb[i] = (uchar)null; - } -} - -/* 32-bit / 64-bit optimised XOR engine */ -static void neoscrypt_xor(void *dstp, const void *srcp, uint len) { - ulong *dst = (ulong *) dstp; - ulong *src = (ulong *) srcp; - uint i, tail; - - for(i = 0; i < (len / sizeof(ulong)); i++) - dst[i] ^= src[i]; - - tail = len & 
(sizeof(ulong) - 1); - if(tail) { - uchar *dstb = (uchar *) dstp; - uchar *srcb = (uchar *) srcp; - - for(i = len - tail; i < len; i++) - dstb[i] ^= srcb[i]; - } -} - -/* BLAKE2s */ - -#define BLAKE2S_BLOCK_SIZE 64U -#define BLAKE2S_OUT_SIZE 32U -#define BLAKE2S_KEY_SIZE 32U - -static const uint blake2s_IV[8] = { - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, - 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 -}; - -#ifdef USE_CUSTOM_BLAKE2S - -static const uint8_t blake2s_sigma[10][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , -}; - -/* Parameter block of 32 bytes */ -typedef struct blake2s_param_t { - uchar digest_length; - uchar key_length; - uchar fanout; - uchar depth; - uint leaf_length; - uchar node_offset[6]; - uchar node_depth; - uchar inner_length; - uchar salt[8]; - uchar personal[8]; -} blake2s_param; - -/* State block of 180 bytes */ -typedef struct blake2s_state_t { - uint h[8] __attribute__ ((aligned (16))); - uint t[2]; - uint f[2]; - uchar buf[2 * BLAKE2S_BLOCK_SIZE] __attribute__ ((aligned (16))); - uint buflen; -} blake2s_state; - -static void blake2s_compress(blake2s_state *S, const void *buf) { - uint i; - uint m[16] __attribute__ ((aligned (16))); - uint v[16] __attribute__ ((aligned (16))); - - neoscrypt_copy(m, buf, 64); - neoscrypt_copy(v, S, 32); - - v[ 8] = blake2s_IV[0]; - v[ 9] = blake2s_IV[1]; - v[10] = blake2s_IV[2]; - v[11] = blake2s_IV[3]; - v[12] = S->t[0] ^ 
blake2s_IV[4]; - v[13] = S->t[1] ^ blake2s_IV[5]; - v[14] = S->f[0] ^ blake2s_IV[6]; - v[15] = S->f[1] ^ blake2s_IV[7]; -#define G(r,i,a,b,c,d) \ - do { \ - a = a + b + m[blake2s_sigma[r][2*i+0]]; \ - d = ROTR32(d ^ a, 16); \ - c = c + d; \ - b = ROTR32(b ^ c, 12); \ - a = a + b + m[blake2s_sigma[r][2*i+1]]; \ - d = ROTR32(d ^ a, 8); \ - c = c + d; \ - b = ROTR32(b ^ c, 7); \ - } while(0) -#define ROUND(r) \ - do { \ - G(r, 0, v[ 0], v[ 4], v[ 8], v[12]); \ - G(r, 1, v[ 1], v[ 5], v[ 9], v[13]); \ - G(r, 2, v[ 2], v[ 6], v[10], v[14]); \ - G(r, 3, v[ 3], v[ 7], v[11], v[15]); \ - G(r, 4, v[ 0], v[ 5], v[10], v[15]); \ - G(r, 5, v[ 1], v[ 6], v[11], v[12]); \ - G(r, 6, v[ 2], v[ 7], v[ 8], v[13]); \ - G(r, 7, v[ 3], v[ 4], v[ 9], v[14]); \ - } while(0) - ROUND(0); - ROUND(1); - ROUND(2); - ROUND(3); - ROUND(4); - ROUND(5); - ROUND(6); - ROUND(7); - ROUND(8); - ROUND(9); - - for(i = 0; i < 8; i++) - S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; - -#undef G -#undef ROUND -} - -static void blake2s_update(blake2s_state *S, const uchar *input, uint input_size) { - uint left, fill; - - while(input_size > 0) { - left = S->buflen; - fill = 2 * BLAKE2S_BLOCK_SIZE - left; - if(input_size > fill) { - /* Buffer fill */ - neoscrypt_copy(S->buf + left, input, fill); - S->buflen += fill; - /* Counter increment */ - S->t[0] += BLAKE2S_BLOCK_SIZE; - /* Compress */ - blake2s_compress(S, (void *) S->buf); - /* Shift buffer left */ - neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, BLAKE2S_BLOCK_SIZE); - S->buflen -= BLAKE2S_BLOCK_SIZE; - input += fill; - input_size -= fill; - } else { - neoscrypt_copy(S->buf + left, input, input_size); - S->buflen += input_size; - /* Do not compress */ - input += input_size; - input_size = 0; - } - } -} -#endif - -static void neoscrypt_blake2s(const void *input, const uint input_size, const void *key, const uchar key_size, - void *output, const uchar output_size) { - uchar block[BLAKE2S_BLOCK_SIZE]; - blake2s_param P[1]; - blake2s_state S[1]; - - /* 
Initialise */ - neoscrypt_erase(P, 32); - P->digest_length = output_size; - P->key_length = key_size; - P->fanout = 1; - P->depth = 1; - - neoscrypt_erase(S, 180); - neoscrypt_copy(S, blake2s_IV, 32); - neoscrypt_xor(S, P, 32); - - neoscrypt_erase(block, BLAKE2S_BLOCK_SIZE); - neoscrypt_copy(block, key, key_size); - blake2s_update(S, (uchar *) block, BLAKE2S_BLOCK_SIZE); - - /* Update */ - blake2s_update(S, (uchar *) input, input_size); - - /* Finish */ - if(S->buflen > BLAKE2S_BLOCK_SIZE) { - S->t[0] += BLAKE2S_BLOCK_SIZE; - blake2s_compress(S, (void *) S->buf); - S->buflen -= BLAKE2S_BLOCK_SIZE; - neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, S->buflen); - } - S->t[0] += S->buflen; - S->f[0] = ~0U; - neoscrypt_erase(S->buf + S->buflen, 2 * BLAKE2S_BLOCK_SIZE - S->buflen); - blake2s_compress(S, (void *) S->buf); - - /* Write back */ - neoscrypt_copy(output, S, output_size); -} - - -#define FASTKDF_BUFFER_SIZE 256U - -/* FastKDF, a fast buffered key derivation function: - * FASTKDF_BUFFER_SIZE must be a power of 2; - * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE; - * prf_output_size must be <= prf_key_size; */ -static void neoscrypt_fastkdf(const uchar *password, uint password_len, const uchar *salt, uint salt_len, - uint N, uchar *output, uint output_len) { - -#define kdf_buf_size FASTKDF_BUFFER_SIZE -#define prf_input_size BLAKE2S_BLOCK_SIZE -#define prf_key_size BLAKE2S_KEY_SIZE -#define prf_output_size BLAKE2S_OUT_SIZE - - uint bufptr, a, b, i, j; - uchar *A, *B, *prf_input, *prf_key, *prf_output; - - /* Align and set up the buffers in stack */ - uchar stack[2 * kdf_buf_size + prf_input_size + prf_key_size + prf_output_size + STACK_ALIGN]; - A = &stack[STACK_ALIGN & ~(STACK_ALIGN - 1)]; - B = &A[kdf_buf_size + prf_input_size]; - prf_output = &A[2 * kdf_buf_size + prf_input_size + prf_key_size]; - - /* Initialise the password buffer */ - if(password_len > kdf_buf_size) - password_len = kdf_buf_size; - - a = kdf_buf_size / 
password_len; - for(i = 0; i < a; i++) - neoscrypt_copy(&A[i * password_len], &password[0], password_len); - b = kdf_buf_size - a * password_len; - if(b) - neoscrypt_copy(&A[a * password_len], &password[0], b); - neoscrypt_copy(&A[kdf_buf_size], &password[0], prf_input_size); - - /* Initialise the salt buffer */ - if(salt_len > kdf_buf_size) - salt_len = kdf_buf_size; - - a = kdf_buf_size / salt_len; - for(i = 0; i < a; i++) - neoscrypt_copy(&B[i * salt_len], &salt[0], salt_len); - b = kdf_buf_size - a * salt_len; - if(b) - neoscrypt_copy(&B[a * salt_len], &salt[0], b); - neoscrypt_copy(&B[kdf_buf_size], &salt[0], prf_key_size); - - /* The primary iteration */ - for(i = 0, bufptr = 0; i < N; i++) { - - /* Map the PRF input buffer */ - prf_input = &A[bufptr]; - - /* Map the PRF key buffer */ - prf_key = &B[bufptr]; - - /* PRF */ - neoscrypt_blake2s(prf_input, prf_input_size, prf_key, prf_key_size, prf_output, prf_output_size); - - /* Calculate the next buffer pointer */ - for(j = 0, bufptr = 0; j < prf_output_size; j++) - bufptr += prf_output[j]; - bufptr &= (kdf_buf_size - 1); - - /* Modify the salt buffer */ - neoscrypt_xor(&B[bufptr], &prf_output[0], prf_output_size); - - /* Head modified, tail updated */ - if(bufptr < prf_key_size) - neoscrypt_copy(&B[kdf_buf_size + bufptr], &B[bufptr], MIN(prf_output_size, prf_key_size - bufptr)); - - /* Tail modified, head updated */ - if((kdf_buf_size - bufptr) < prf_output_size) - neoscrypt_copy(&B[0], &B[kdf_buf_size], prf_output_size - (kdf_buf_size - bufptr)); - - } - - /* Modify and copy into the output buffer */ - if(output_len > kdf_buf_size) - output_len = kdf_buf_size; - - a = kdf_buf_size - bufptr; - if(a >= output_len) { - neoscrypt_xor(&B[bufptr], &A[0], output_len); - neoscrypt_copy(&output[0], &B[bufptr], output_len); - } else { - neoscrypt_xor(&B[bufptr], &A[0], a); - neoscrypt_xor(&B[0], &A[a], output_len - a); - neoscrypt_copy(&output[0], &B[bufptr], a); - neoscrypt_copy(&output[a], &B[0], output_len - a); - 
} - -} - - -/* Configurable optimised block mixer */ -static void neoscrypt_blkmix(uint *X, uint *Y, uint r, uint mixmode) { - uint i, mixer, rounds; - - mixer = mixmode >> 8; - rounds = mixmode & 0xFF; - - /* NeoScrypt flow: Scrypt flow: - Xa ^= Xd; M(Xa'); Ya = Xa"; Xa ^= Xb; M(Xa'); Ya = Xa"; - Xb ^= Xa"; M(Xb'); Yb = Xb"; Xb ^= Xa"; M(Xb'); Yb = Xb"; - Xc ^= Xb"; M(Xc'); Yc = Xc"; Xa" = Ya; - Xd ^= Xc"; M(Xd'); Yd = Xd"; Xb" = Yb; - Xa" = Ya; Xb" = Yc; - Xc" = Yb; Xd" = Yd; */ - - if(r == 1) { - neoscrypt_blkxor(&X[0], &X[16], SCRYPT_BLOCK_SIZE); - if(mixer) - neoscrypt_chacha(&X[0], rounds); - else - neoscrypt_salsa(&X[0], rounds); - neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE); - if(mixer) - neoscrypt_chacha(&X[16], rounds); - else - neoscrypt_salsa(&X[16], rounds); - return; - } - - if(r == 2) { - neoscrypt_blkxor(&X[0], &X[48], SCRYPT_BLOCK_SIZE); - if(mixer) - neoscrypt_chacha(&X[0], rounds); - else - neoscrypt_salsa(&X[0], rounds); - neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE); - if(mixer) - neoscrypt_chacha(&X[16], rounds); - else - neoscrypt_salsa(&X[16], rounds); - neoscrypt_blkxor(&X[32], &X[16], SCRYPT_BLOCK_SIZE); - if(mixer) - neoscrypt_chacha(&X[32], rounds); - else - neoscrypt_salsa(&X[32], rounds); - neoscrypt_blkxor(&X[48], &X[32], SCRYPT_BLOCK_SIZE); - if(mixer) - neoscrypt_chacha(&X[48], rounds); - else - neoscrypt_salsa(&X[48], rounds); - neoscrypt_blkswp(&X[16], &X[32], SCRYPT_BLOCK_SIZE); - return; - } - - /* Reference code for any reasonable r */ - for(i = 0; i < 2 * r; i++) { - if(i) neoscrypt_blkxor(&X[16 * i], &X[16 * (i - 1)], SCRYPT_BLOCK_SIZE); - else neoscrypt_blkxor(&X[0], &X[16 * (2 * r - 1)], SCRYPT_BLOCK_SIZE); - if(mixer) - neoscrypt_chacha(&X[16 * i], rounds); - else - neoscrypt_salsa(&X[16 * i], rounds); - neoscrypt_blkcpy(&Y[16 * i], &X[16 * i], SCRYPT_BLOCK_SIZE); - } - for(i = 0; i < r; i++) - neoscrypt_blkcpy(&X[16 * i], &Y[16 * 2 * i], SCRYPT_BLOCK_SIZE); - for(i = 0; i < r; i++) - neoscrypt_blkcpy(&X[16 * 
(i + r)], &Y[16 * (2 * i + 1)], SCRYPT_BLOCK_SIZE); -} - -/* NeoScrypt core engine: - * p = 1, salt = password; - * Basic customisation (required): - * profile bit 0: - * 0 = NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20; - * 1 = Scrypt(1024, 1, 1) with Salsa20/8; - * profile bits 4 to 1: - * 0000 = FastKDF-BLAKE2s; - * 0001 = PBKDF2-HMAC-SHA256; - * Extended customisation (optional): - * profile bit 31: - * 0 = extended customisation absent; - * 1 = extended customisation present; - * profile bits 7 to 5 (rfactor): - * 000 = r of 1; - * 001 = r of 2; - * 010 = r of 4; - * ... - * 111 = r of 128; - * profile bits 12 to 8 (Nfactor): - * 00000 = N of 2; - * 00001 = N of 4; - * 00010 = N of 8; - * ..... - * 00110 = N of 128; - * ..... - * 01001 = N of 1024; - * ..... - * 11110 = N of 2147483648; - * profile bits 30 to 13 are reserved */ -void neoscrypt(uchar *output, const uchar *password) -{ - uint N = 128, r = 2, dblmix = 1, mixmode = 0x14; - uint kdf, i, j; - uint *X, *Y, *Z, *V; - - // default, option not yet required - uint32_t profile = 0x80000020| (6 << 8) ; - - if(profile & 0x1) { - N = 1024; /* N = (1 << (Nfactor + 1)); */ - r = 1; /* r = (1 << rfactor); */ - dblmix = 0; /* Salsa only */ - mixmode = 0x08; /* 8 rounds */ - } - - if(profile >> 31) { - N = (1 << (((profile >> 8) & 0x1F) + 1)); - r = (1 << ((profile >> 5) & 0x7)); - } - - uchar *stack = (uchar*) malloc((N + 3) * r * 2 * SCRYPT_BLOCK_SIZE + STACK_ALIGN); - /* X = r * 2 * SCRYPT_BLOCK_SIZE */ - X = (uint *) &stack[STACK_ALIGN & ~(STACK_ALIGN - 1)]; - /* Z is a copy of X for ChaCha */ - Z = &X[32 * r]; - /* Y is an X sized temporal space */ - Y = &X[64 * r]; - /* V = N * r * 2 * SCRYPT_BLOCK_SIZE */ - V = &X[96 * r]; - - /* X = KDF(password, salt) */ - kdf = (profile >> 1) & 0xF; - - switch(kdf) { - - default: - case(0x0): - neoscrypt_fastkdf(password, 80, password, 80, 32, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE); - break; - - case(0x1): - neoscrypt_pbkdf2_sha256(password, 80, password, 80, 
1, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE); - break; - - } - - /* Process ChaCha 1st, Salsa 2nd and XOR them into FastKDF; otherwise Salsa only */ - - if(dblmix) { - /* blkcpy(Z, X) */ - neoscrypt_blkcpy(&Z[0], &X[0], r * 2 * SCRYPT_BLOCK_SIZE); - - /* Z = SMix(Z) */ - for(i = 0; i < N; i++) { - /* blkcpy(V, Z) */ - neoscrypt_blkcpy(&V[i * (32 * r)], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE); - /* blkmix(Z, Y) */ - neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100)); - } - for(i = 0; i < N; i++) { - /* integerify(Z) mod N */ - j = (32 * r) * (Z[16 * (2 * r - 1)] & (N - 1)); - /* blkxor(Z, V) */ - neoscrypt_blkxor(&Z[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE); - /* blkmix(Z, Y) */ - neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100)); - } - } - -#if (ASM) - /* Must be called before and after SSE2 Salsa */ - neoscrypt_salsa_tangle(&X[0], r * 2); -#endif - - /* X = SMix(X) */ - for(i = 0; i < N; i++) { - /* blkcpy(V, X) */ - neoscrypt_blkcpy(&V[i * (32 * r)], &X[0], r * 2 * SCRYPT_BLOCK_SIZE); - /* blkmix(X, Y) */ - neoscrypt_blkmix(&X[0], &Y[0], r, mixmode); - } - for(i = 0; i < N; i++) { - /* integerify(X) mod N */ - j = (32 * r) * (X[16 * (2 * r - 1)] & (N - 1)); - /* blkxor(X, V) */ - neoscrypt_blkxor(&X[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE); - /* blkmix(X, Y) */ - neoscrypt_blkmix(&X[0], &Y[0], r, mixmode); - } - -#if (ASM) - neoscrypt_salsa_tangle(&X[0], r * 2); -#endif - - if(dblmix) - /* blkxor(X, Z) */ - neoscrypt_blkxor(&X[0], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE); - - /* output = KDF(password, X) */ - switch(kdf) { - default: - case(0x0): - neoscrypt_fastkdf(password, 80, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE, 32, output, 32); - break; - - case(0x1): - neoscrypt_pbkdf2_sha256(password, 80, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE, 1, output, 32); - break; - } - - free(stack); -} - -static bool fulltest_le(const uint *hash, const uint *target) -{ - bool rc = false; - - for (int i = 7; i >= 0; i--) { - if (hash[i] > target[i]) { - rc = false; - break; - } - if(hash[i] < 
target[i]) { - rc = true; - break; - } - } - - if (opt_debug) { - uchar hash_str[65], target_str[65]; - - bin2hex(hash_str, (uint8_t *) hash, 32); - bin2hex(target_str, (uint8_t *) target, 32); - - applog(LOG_DEBUG, "DEBUG (little endian): %s\nHash: %sx0\nTarget: %sx0", - rc ? "hash <= target" : "hash > target (false positive)", - hash_str, target_str); - } - - return(rc); -} - -int scanhash_neoscrypt( struct work *work, - uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) - -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t _ALIGN(64) hash[8]; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - - while (pdata[19] < max_nonce && !work_restart[thr_id].restart) - { - neoscrypt((uint8_t *) hash, (uint8_t *) pdata ); - - /* Quick hash check */ - if (hash[7] <= Htarg && fulltest_le(hash, ptarget)) { - *hashes_done = pdata[19] - first_nonce + 1; - return 1; - } - - pdata[19]++; - } - - *hashes_done = pdata[19] - first_nonce; - return 0; -} - -int64_t get_neoscrypt_max64() { return 0x3ffff; } - -void neoscrypt_wait_for_diff( struct stratum_ctx *stratum ) -{ - while ( !stratum->job.diff ) - { -// applog(LOG_DEBUG, "Waiting for Stratum to set the job difficulty"); - sleep(1); - } -} - -int neoscrypt_get_work_data_size () { return 80; } - -bool register_neoscrypt_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT; - gate->scanhash = (void*)&scanhash_neoscrypt; - gate->hash = (void*)&neoscrypt; - gate->get_max64 = (void*)&get_neoscrypt_max64; - gate->set_target = (void*)&scrypt_set_target; - gate->wait_for_diff = (void*)&neoscrypt_wait_for_diff; - gate->build_stratum_request = (void*)&std_be_build_stratum_request; - gate->work_decode = (void*)&std_be_work_decode; - gate->submit_getwork_result = (void*)&std_be_submit_getwork_result; - gate->set_work_data_endian = (void*)&set_work_data_big_endian; - gate->get_work_data_size = 
(void*)&neoscrypt_get_work_data_size; - return true; -}; - - diff --git a/algo/scrypt/pluck.c b/algo/scrypt/pluck.c deleted file mode 100644 index 01f1c9f..0000000 --- a/algo/scrypt/pluck.c +++ /dev/null @@ -1,511 +0,0 @@ -/* - * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2014 pooler, 2015 Jordan Earls - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#include "cpuminer-config.h" -#include "algo-gate-api.h" - -#include -#include - -#define BLOCK_HEADER_SIZE 80 - -// windows -#ifndef htobe32 -#define htobe32(x) ((uint32_t)htonl((uint32_t)(x))) -#endif - -#ifdef _MSC_VER -#define ROTL(a, b) _rotl(a,b) -#define ROTR(a, b) _rotr(a,b) -#else -#define ROTL(a, b) (((a) << b) | ((a) >> (32 - b))) -#define ROTR(a, b) ((a >> b) | (a << (32 - b))) -#endif - -#if defined(_MSC_VER) && defined(_M_X64) -#define _VECTOR __vectorcall -#include -//#include //SSE2 -//#include //SSE3 -//#include //SSSE3 -//#include //SSE4.1 -//#include //SSE4.2 -//#include //SSE4A -//#include //AES -//#include //AVX -#define OPT_COMPATIBLE -#elif defined(__GNUC__) && defined(__x86_64__) -#include -#define _VECTOR -#endif - -static __thread char *scratchbuf; - -#ifdef OPT_COMPATIBLE -static void _VECTOR xor_salsa8(__m128i B[4], const __m128i Bx[4], int i) -{ - __m128i X0, X1, X2, X3; - - if (i <= 128) { - // a xor 0 = a - X0 = B[0] = Bx[0]; - X1 = B[1] = Bx[1]; - X2 = B[2] = Bx[2]; - X3 = B[3] = Bx[3]; - } else { - X0 = B[0] = _mm_xor_si128(B[0], Bx[0]); - X1 = B[1] = _mm_xor_si128(B[1], Bx[1]); - X2 = B[2] = _mm_xor_si128(B[2], Bx[2]); - X3 = B[3] = _mm_xor_si128(B[3], Bx[3]); - } - - for (i = 0; i < 4; i++) { - /* Operate on columns. 
*/ - X1.m128i_u32[0] ^= ROTL(X0.m128i_u32[0] + X3.m128i_u32[0], 7); - X2.m128i_u32[1] ^= ROTL(X1.m128i_u32[1] + X0.m128i_u32[1], 7); - X3.m128i_u32[2] ^= ROTL(X2.m128i_u32[2] + X1.m128i_u32[2], 7); - X0.m128i_u32[3] ^= ROTL(X3.m128i_u32[3] + X2.m128i_u32[3], 7); - - X2.m128i_u32[0] ^= ROTL(X1.m128i_u32[0] + X0.m128i_u32[0], 9); - X3.m128i_u32[1] ^= ROTL(X2.m128i_u32[1] + X1.m128i_u32[1], 9); - X0.m128i_u32[2] ^= ROTL(X3.m128i_u32[2] + X2.m128i_u32[2], 9); - X1.m128i_u32[3] ^= ROTL(X0.m128i_u32[3] + X3.m128i_u32[3], 9); - - X3.m128i_u32[0] ^= ROTL(X2.m128i_u32[0] + X1.m128i_u32[0], 13); - X0.m128i_u32[1] ^= ROTL(X3.m128i_u32[1] + X2.m128i_u32[1], 13); - X1.m128i_u32[2] ^= ROTL(X0.m128i_u32[2] + X3.m128i_u32[2], 13); - X2.m128i_u32[3] ^= ROTL(X1.m128i_u32[3] + X0.m128i_u32[3], 13); - - X0.m128i_u32[0] ^= ROTL(X3.m128i_u32[0] + X2.m128i_u32[0], 18); - X1.m128i_u32[1] ^= ROTL(X0.m128i_u32[1] + X3.m128i_u32[1], 18); - X2.m128i_u32[2] ^= ROTL(X1.m128i_u32[2] + X0.m128i_u32[2], 18); - X3.m128i_u32[3] ^= ROTL(X2.m128i_u32[3] + X1.m128i_u32[3], 18); - - /* Operate on rows. 
*/ - X0.m128i_u32[1] ^= ROTL(X0.m128i_u32[0] + X0.m128i_u32[3], 7); X1.m128i_u32[2] ^= ROTL(X1.m128i_u32[1] + X1.m128i_u32[0], 7); - X2.m128i_u32[3] ^= ROTL(X2.m128i_u32[2] + X2.m128i_u32[1], 7); X3.m128i_u32[0] ^= ROTL(X3.m128i_u32[3] + X3.m128i_u32[2], 7); - X0.m128i_u32[2] ^= ROTL(X0.m128i_u32[1] + X0.m128i_u32[0], 9); X1.m128i_u32[3] ^= ROTL(X1.m128i_u32[2] + X1.m128i_u32[1], 9); - X2.m128i_u32[0] ^= ROTL(X2.m128i_u32[3] + X2.m128i_u32[2], 9); X3.m128i_u32[1] ^= ROTL(X3.m128i_u32[0] + X3.m128i_u32[3], 9); - - X0.m128i_u32[3] ^= ROTL(X0.m128i_u32[2] + X0.m128i_u32[1], 13); X1.m128i_u32[0] ^= ROTL(X1.m128i_u32[3] + X1.m128i_u32[2], 13); - X2.m128i_u32[1] ^= ROTL(X2.m128i_u32[0] + X2.m128i_u32[3], 13); X3.m128i_u32[2] ^= ROTL(X3.m128i_u32[1] + X3.m128i_u32[0], 13); - X0.m128i_u32[0] ^= ROTL(X0.m128i_u32[3] + X0.m128i_u32[2], 18); X1.m128i_u32[1] ^= ROTL(X1.m128i_u32[0] + X1.m128i_u32[3], 18); - X2.m128i_u32[2] ^= ROTL(X2.m128i_u32[1] + X2.m128i_u32[0], 18); X3.m128i_u32[3] ^= ROTL(X3.m128i_u32[2] + X3.m128i_u32[1], 18); - } - - B[0] = _mm_add_epi32(B[0], X0); - B[1] = _mm_add_epi32(B[1], X1); - B[2] = _mm_add_epi32(B[2], X2); - B[3] = _mm_add_epi32(B[3], X3); -} - -#else - -static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16], int i) -{ - uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15; - - if (i <= 128) { - // a xor 0 = a - x00 = B[ 0] = Bx[ 0]; x01 = B[ 1] = Bx[ 1]; x02 = B[ 2] = Bx[ 2]; x03 = B[ 3] = Bx[ 3]; - x04 = B[ 4] = Bx[ 4]; x05 = B[ 5] = Bx[ 5]; x06 = B[ 6] = Bx[ 6]; x07 = B[ 7] = Bx[ 7]; - x08 = B[ 8] = Bx[ 8]; x09 = B[ 9] = Bx[ 9]; x10 = B[10] = Bx[10]; x11 = B[11] = Bx[11]; - x12 = B[12] = Bx[12]; x13 = B[13] = Bx[13]; x14 = B[14] = Bx[14]; x15 = B[15] = Bx[15]; - } else { - x00 = (B[ 0] ^= Bx[ 0]); - x01 = (B[ 1] ^= Bx[ 1]); - x02 = (B[ 2] ^= Bx[ 2]); - x03 = (B[ 3] ^= Bx[ 3]); - x04 = (B[ 4] ^= Bx[ 4]); - x05 = (B[ 5] ^= Bx[ 5]); - x06 = (B[ 6] ^= Bx[ 6]); - x07 = (B[ 7] ^= Bx[ 7]); - x08 = (B[ 8] ^= Bx[ 8]); 
- x09 = (B[ 9] ^= Bx[ 9]); - x10 = (B[10] ^= Bx[10]); - x11 = (B[11] ^= Bx[11]); - x12 = (B[12] ^= Bx[12]); - x13 = (B[13] ^= Bx[13]); - x14 = (B[14] ^= Bx[14]); - x15 = (B[15] ^= Bx[15]); - } - - for (i = 0; i < 8; i += 2) { - /* Operate on columns. */ - x04 ^= ROTL(x00 + x12, 7); x09 ^= ROTL(x05 + x01, 7); - x14 ^= ROTL(x10 + x06, 7); x03 ^= ROTL(x15 + x11, 7); - - x08 ^= ROTL(x04 + x00, 9); x13 ^= ROTL(x09 + x05, 9); - x02 ^= ROTL(x14 + x10, 9); x07 ^= ROTL(x03 + x15, 9); - - x12 ^= ROTL(x08 + x04, 13); x01 ^= ROTL(x13 + x09, 13); - x06 ^= ROTL(x02 + x14, 13); x11 ^= ROTL(x07 + x03, 13); - - x00 ^= ROTL(x12 + x08, 18); x05 ^= ROTL(x01 + x13, 18); - x10 ^= ROTL(x06 + x02, 18); x15 ^= ROTL(x11 + x07, 18); - - /* Operate on rows. */ - x01 ^= ROTL(x00 + x03, 7); x06 ^= ROTL(x05 + x04, 7); - x11 ^= ROTL(x10 + x09, 7); x12 ^= ROTL(x15 + x14, 7); - - x02 ^= ROTL(x01 + x00, 9); x07 ^= ROTL(x06 + x05, 9); - x08 ^= ROTL(x11 + x10, 9); x13 ^= ROTL(x12 + x15, 9); - - x03 ^= ROTL(x02 + x01, 13); x04 ^= ROTL(x07 + x06, 13); - x09 ^= ROTL(x08 + x11, 13); x14 ^= ROTL(x13 + x12, 13); - - x00 ^= ROTL(x03 + x02, 18); x05 ^= ROTL(x04 + x07, 18); - x10 ^= ROTL(x09 + x08, 18); x15 ^= ROTL(x14 + x13, 18); - } - B[ 0] += x00; - B[ 1] += x01; - B[ 2] += x02; - B[ 3] += x03; - B[ 4] += x04; - B[ 5] += x05; - B[ 6] += x06; - B[ 7] += x07; - B[ 8] += x08; - B[ 9] += x09; - B[10] += x10; - B[11] += x11; - B[12] += x12; - B[13] += x13; - B[14] += x14; - B[15] += x15; -} - -#endif - -static const uint32_t sha256_k[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 
0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - -/* Elementary functions used by SHA256 */ -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) -#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) -#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3)) -#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10)) - -/* SHA256 round function */ -#define RND(a, b, c, d, e, f, g, h, k) \ - do { \ - t0 = h + S1(e) + Ch(e, f, g) + k; \ - t1 = S0(a) + Maj(a, b, c); \ - d += t0; \ - h = t0 + t1; \ - } while (0) - -/* Adjusted round function for rotating state */ -#define RNDr(S, W, i) \ - RND(S[(64 - i) % 8], S[(65 - i) % 8], \ - S[(66 - i) % 8], S[(67 - i) % 8], \ - S[(68 - i) % 8], S[(69 - i) % 8], \ - S[(70 - i) % 8], S[(71 - i) % 8], \ - W[i] + sha256_k[i]) - - -static void sha256_transform_volatile(uint32_t *state, uint32_t *block) -{ - uint32_t* W=block; //note: block needs to be a mutable 64 int32_t - uint32_t S[8]; - uint32_t t0, t1; - int i; - - for (i = 16; i < 64; i += 2) { - W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; - W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; - } - - /* 2. Initialize working variables. */ - memcpy(S, state, 32); - - /* 3. Mix. 
*/ - RNDr(S, W, 0); - RNDr(S, W, 1); - RNDr(S, W, 2); - RNDr(S, W, 3); - RNDr(S, W, 4); - RNDr(S, W, 5); - RNDr(S, W, 6); - RNDr(S, W, 7); - RNDr(S, W, 8); - RNDr(S, W, 9); - RNDr(S, W, 10); - RNDr(S, W, 11); - RNDr(S, W, 12); - RNDr(S, W, 13); - RNDr(S, W, 14); - RNDr(S, W, 15); - RNDr(S, W, 16); - RNDr(S, W, 17); - RNDr(S, W, 18); - RNDr(S, W, 19); - RNDr(S, W, 20); - RNDr(S, W, 21); - RNDr(S, W, 22); - RNDr(S, W, 23); - RNDr(S, W, 24); - RNDr(S, W, 25); - RNDr(S, W, 26); - RNDr(S, W, 27); - RNDr(S, W, 28); - RNDr(S, W, 29); - RNDr(S, W, 30); - RNDr(S, W, 31); - RNDr(S, W, 32); - RNDr(S, W, 33); - RNDr(S, W, 34); - RNDr(S, W, 35); - RNDr(S, W, 36); - RNDr(S, W, 37); - RNDr(S, W, 38); - RNDr(S, W, 39); - RNDr(S, W, 40); - RNDr(S, W, 41); - RNDr(S, W, 42); - RNDr(S, W, 43); - RNDr(S, W, 44); - RNDr(S, W, 45); - RNDr(S, W, 46); - RNDr(S, W, 47); - RNDr(S, W, 48); - RNDr(S, W, 49); - RNDr(S, W, 50); - RNDr(S, W, 51); - RNDr(S, W, 52); - RNDr(S, W, 53); - RNDr(S, W, 54); - RNDr(S, W, 55); - RNDr(S, W, 56); - RNDr(S, W, 57); - RNDr(S, W, 58); - RNDr(S, W, 59); - RNDr(S, W, 60); - RNDr(S, W, 61); - RNDr(S, W, 62); - RNDr(S, W, 63); - - /* 4. Mix local working variables into global state */ - for (i = 0; i < 8; i++) - state[i] += S[i]; -} - -// standard sha256 hash -#if 1 -static void sha256_hash(unsigned char *hash, const unsigned char *data, int len) -{ - uint32_t _ALIGN(64) S[16]; - uint32_t _ALIGN(64) T[64]; - int i, r; - - sha256_init(S); - for (r = len; r > -9; r -= 64) { - if (r < 64) - memset(T, 0, 64); - memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 
0 : r)); - if (r >= 0 && r < 64) - ((unsigned char *)T)[r] = 0x80; - for (i = 0; i < 16; i++) - T[i] = be32dec(T + i); - if (r < 56) - T[15] = 8 * len; - //sha256_transform(S, T, 0); - sha256_transform_volatile(S, T); - } - for (i = 0; i < 8; i++) - be32enc((uint32_t *)hash + i, S[i]); -} -#else -#include -static void sha256_hash(unsigned char *hash, const unsigned char *data, int len) -{ - SHA256_CTX ctx; - SHA256_Init(&ctx); - SHA256_Update(&ctx, data, len); - SHA256_Final(hash, &ctx); -} -#endif - -// hash exactly 64 bytes (ie, sha256 block size) -static void sha256_hash512(uint32_t *hash, const uint32_t *data) -{ - uint32_t _ALIGN(64) S[16]; - uint32_t _ALIGN(64) T[64]; - uchar _ALIGN(64) E[64*4] = { 0 }; - int i; - - sha256_init(S); - - for (i = 0; i < 16; i++) - T[i] = be32dec(&data[i]); - sha256_transform_volatile(S, T); - - E[3] = 0x80; - E[61] = 0x02; // T[15] = 8 * 64 => 0x200; - sha256_transform_volatile(S, (uint32_t*)E); - - for (i = 0; i < 8; i++) - be32enc(&hash[i], S[i]); -} - -void pluck_hash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const int N) -{ - int size = N * 1024; - sha256_hash(hashbuffer, (void*)data, BLOCK_HEADER_SIZE); - memset(&hashbuffer[32], 0, 32); - - for(int i = 64; i < size - 32; i += 32) - { - uint32_t _ALIGN(64) randseed[16]; - uint32_t _ALIGN(64) randbuffer[16]; - uint32_t _ALIGN(64) joint[16]; - //i-4 because we use integers for all references against this, and we don't want to go 3 bytes over the defined area - //we could use size here, but then it's probable to use 0 as the value in most cases - int randmax = i - 4; - - //setup randbuffer to be an array of random indexes - memcpy(randseed, &hashbuffer[i - 64], 64); - - if(i > 128) memcpy(randbuffer, &hashbuffer[i - 128], 64); - //else memset(randbuffer, 0, 64); - - xor_salsa8((void*)randbuffer, (void*)randseed, i); - memcpy(joint, &hashbuffer[i - 32], 32); - - //use the last hash value as the seed - for (int j = 32; j < 64; j += 4) - { - //every other time, 
change to next random index - //randmax - 32 as otherwise we go beyond memory that's already been written to - uint32_t rand = randbuffer[(j - 32) >> 2] % (randmax - 32); - joint[j >> 2] = *((uint32_t *)&hashbuffer[rand]); - } - - sha256_hash512((uint32_t*) &hashbuffer[i], joint); - - //setup randbuffer to be an array of random indexes - //use last hash value and previous hash value(post-mixing) - memcpy(randseed, &hashbuffer[i - 32], 64); - - if(i > 128) memcpy(randbuffer, &hashbuffer[i - 128], 64); - //else memset(randbuffer, 0, 64); - - xor_salsa8((void*)randbuffer, (void*)randseed, i); - - //use the last hash value as the seed - for (int j = 0; j < 32; j += 2) - { - uint32_t rand = randbuffer[j >> 1] % randmax; - *((uint32_t *)(hashbuffer + rand)) = *((uint32_t *)(hashbuffer + j + randmax)); - } - } - - memcpy(hash, hashbuffer, 32); -} - -int scanhash_pluck( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t _ALIGN(64) endiandata[20]; - uint32_t _ALIGN(64) hash[8]; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - volatile uint8_t *restart = &(work_restart[thr_id].restart); - uint32_t n = first_nonce; - - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0ffff; - - for (int i=0; i < 19; i++) - be32enc(&endiandata[i], pdata[i]); - - const uint32_t Htarg = ptarget[7]; - do { - //be32enc(&endiandata[19], n); - endiandata[19] = n; - pluck_hash(hash, endiandata, scratchbuf, opt_pluck_n); - - if (hash[7] <= Htarg && fulltest(hash, ptarget)) - { - *hashes_done = n - first_nonce + 1; - pdata[19] = htobe32(endiandata[19]); - return 1; - } - n++; - } while (n < max_nonce && !(*restart)); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - -int64_t pluck_get_max64 () -{ - return 0x1ffLL; -} - -bool pluck_miner_thread_init( int thr_id ) -{ - scratchbuf = malloc( 128 * 1024 ); - if ( 
scratchbuf ) - return true; - applog( LOG_ERR, "Thread %u: Pluck buffer allocation failed", thr_id ); - return false; -} - -bool register_pluck_algo( algo_gate_t* gate ) -{ - algo_not_tested(); - gate->miner_thread_init = (void*)&pluck_miner_thread_init; - gate->scanhash = (void*)&scanhash_pluck; - gate->hash = (void*)&pluck_hash; - gate->set_target = (void*)&scrypt_set_target; - gate->get_max64 = (void*)&pluck_get_max64; - return true; -}; - - diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c deleted file mode 100644 index 387afbb..0000000 --- a/algo/scrypt/scrypt.c +++ /dev/null @@ -1,795 +0,0 @@ -/* - * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2014 pooler - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - */ - -#include "algo-gate-api.h" - -#include -#include -#include - -static const uint32_t keypad[12] = { - 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 -}; -static const uint32_t innerpad[11] = { - 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0 -}; -static const uint32_t outerpad[8] = { - 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300 -}; -static const uint32_t finalblk[16] = { - 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620 -}; - -static __thread char *scratchbuf; -int scratchbuf_size = 0; - -static inline void HMAC_SHA256_80_init(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) -{ - uint32_t ihash[8]; - uint32_t pad[16]; - int i; - - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 16, 16); - memcpy(pad + 4, keypad, 48); - sha256_transform(tstate, pad, 0); - memcpy(ihash, tstate, 32); - - sha256_init(ostate); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform(ostate, pad, 0); - - sha256_init(tstate); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 16; i++) - pad[i] = 0x36363636; - sha256_transform(tstate, pad, 0); -} - -static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t 
istate[8], ostate2[8]; - uint32_t ibuf[16], obuf[16]; - int i, j; - - memcpy(istate, tstate, 32); - sha256_transform(istate, salt, 0); - - memcpy(ibuf, salt + 16, 16); - memcpy(ibuf + 5, innerpad, 44); - memcpy(obuf + 8, outerpad, 32); - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 32); - ibuf[4] = i + 1; - sha256_transform(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 32); - sha256_transform(ostate2, obuf, 0); - for (j = 0; j < 8; j++) - output[8 * i + j] = swab32(ostate2[j]); - } -} - -static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate, - const uint32_t *salt, uint32_t *output) -{ - uint32_t buf[16]; - int i; - - sha256_transform(tstate, salt, 1); - sha256_transform(tstate, salt + 16, 1); - sha256_transform(tstate, finalblk, 0); - memcpy(buf, tstate, 32); - memcpy(buf + 8, outerpad, 32); - - sha256_transform(ostate, buf, 0); - for (i = 0; i < 8; i++) - output[i] = swab32(ostate[i]); -} - - -#ifdef HAVE_SHA256_4WAY - -static const uint32_t keypad_4way[4 * 12] = { - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000280, 0x00000280, 0x00000280, 0x00000280 -}; -static const uint32_t innerpad_4way[4 * 11] = { - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 
0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x000004a0, 0x000004a0, 0x000004a0, 0x000004a0 -}; -static const uint32_t outerpad_4way[4 * 8] = { - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000300, 0x00000300, 0x00000300, 0x00000300 -}; -static const uint32_t _ALIGN(16) finalblk_4way[4 * 16] = { - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000620, 0x00000620, 0x00000620, 0x00000620 -}; - -static inline void HMAC_SHA256_80_init_4way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) -{ - uint32_t _ALIGN(16) ihash[4 * 8]; - uint32_t _ALIGN(16) pad[4 * 16]; - int i; - - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 4 * 16, 4 * 16); - memcpy(pad + 4 * 4, keypad_4way, 4 * 48); - sha256_transform_4way(tstate, pad, 0); - memcpy(ihash, tstate, 4 * 32); - - sha256_init_4way(ostate); - for (i = 0; i < 4 * 8; i++) - 
pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 4 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_4way(ostate, pad, 0); - - sha256_init_4way(tstate); - for (i = 0; i < 4 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 4 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_4way(tstate, pad, 0); -} - -static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t _ALIGN(16) istate[4 * 8]; - uint32_t _ALIGN(16) ostate2[4 * 8]; - uint32_t _ALIGN(16) ibuf[4 * 16]; - uint32_t _ALIGN(16) obuf[4 * 16]; - int i, j; - - memcpy(istate, tstate, 4 * 32); - sha256_transform_4way(istate, salt, 0); - - memcpy(ibuf, salt + 4 * 16, 4 * 16); - memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44); - memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32); - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 4 * 32); - ibuf[4 * 4 + 0] = i + 1; - ibuf[4 * 4 + 1] = i + 1; - ibuf[4 * 4 + 2] = i + 1; - ibuf[4 * 4 + 3] = i + 1; - sha256_transform_4way(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 4 * 32); - sha256_transform_4way(ostate2, obuf, 0); - for (j = 0; j < 4 * 8; j++) - output[4 * 8 * i + j] = swab32(ostate2[j]); - } -} - -static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t _ALIGN(16) buf[4 * 16]; - int i; - - sha256_transform_4way(tstate, salt, 1); - sha256_transform_4way(tstate, salt + 4 * 16, 1); - sha256_transform_4way(tstate, finalblk_4way, 0); - memcpy(buf, tstate, 4 * 32); - memcpy(buf + 4 * 8, outerpad_4way, 4 * 32); - - sha256_transform_4way(ostate, buf, 0); - for (i = 0; i < 4 * 8; i++) - output[i] = swab32(ostate[i]); -} - -#endif /* HAVE_SHA256_4WAY */ - - -#ifdef HAVE_SHA256_8WAY - -static const uint32_t _ALIGN(32) finalblk_8way[8 * 16] = { - 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620 -}; - -static inline void HMAC_SHA256_80_init_8way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) -{ - uint32_t _ALIGN(32) ihash[8 * 8]; - uint32_t _ALIGN(32) pad[8 * 16]; - int i; - - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - pad[8 * 4 + i] = 0x80000000; - memset(pad + 8 * 5, 0x00, 8 * 40); - for (i = 0; i < 8; i++) - pad[8 * 15 + i] = 0x00000280; - sha256_transform_8way(tstate, pad, 0); - memcpy(ihash, tstate, 8 * 32); - - sha256_init_8way(ostate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 
0x5c5c5c5c; - for (; i < 8 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_8way(ostate, pad, 0); - - sha256_init_8way(tstate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 8 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_8way(tstate, pad, 0); -} - -static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t _ALIGN(32) istate[8 * 8]; - uint32_t _ALIGN(32) ostate2[8 * 8]; - uint32_t _ALIGN(32) ibuf[8 * 16]; - uint32_t _ALIGN(32) obuf[8 * 16]; - int i, j; - - memcpy(istate, tstate, 8 * 32); - sha256_transform_8way(istate, salt, 0); - - memcpy(ibuf, salt + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - ibuf[8 * 5 + i] = 0x80000000; - memset(ibuf + 8 * 6, 0x00, 8 * 36); - for (i = 0; i < 8; i++) - ibuf[8 * 15 + i] = 0x000004a0; - - for (i = 0; i < 8; i++) - obuf[8 * 8 + i] = 0x80000000; - memset(obuf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - obuf[8 * 15 + i] = 0x00000300; - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 8 * 32); - ibuf[8 * 4 + 0] = i + 1; - ibuf[8 * 4 + 1] = i + 1; - ibuf[8 * 4 + 2] = i + 1; - ibuf[8 * 4 + 3] = i + 1; - ibuf[8 * 4 + 4] = i + 1; - ibuf[8 * 4 + 5] = i + 1; - ibuf[8 * 4 + 6] = i + 1; - ibuf[8 * 4 + 7] = i + 1; - sha256_transform_8way(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 8 * 32); - sha256_transform_8way(ostate2, obuf, 0); - for (j = 0; j < 8 * 8; j++) - output[8 * 8 * i + j] = swab32(ostate2[j]); - } -} - -static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t _ALIGN(32) buf[8 * 16]; - int i; - - sha256_transform_8way(tstate, salt, 1); - sha256_transform_8way(tstate, salt + 8 * 16, 1); - sha256_transform_8way(tstate, finalblk_8way, 0); - - memcpy(buf, tstate, 8 * 32); - for (i = 0; i < 8; i++) - buf[8 * 8 + i] = 0x80000000; - memset(buf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - buf[8 * 15 + i] = 
0x00000300; - sha256_transform_8way(ostate, buf, 0); - - for (i = 0; i < 8 * 8; i++) - output[i] = swab32(ostate[i]); -} - -#endif /* HAVE_SHA256_8WAY */ - - -#if defined(USE_ASM) && defined(__x86_64__) - -#define SCRYPT_MAX_WAYS 12 -#define HAVE_SCRYPT_3WAY 1 -int scrypt_best_throughput(); -void scrypt_core(uint32_t *X, uint32_t *V, int N); -void scrypt_core_3way(uint32_t *X, uint32_t *V, int N); -#if defined(USE_AVX2) -#undef SCRYPT_MAX_WAYS -#define SCRYPT_MAX_WAYS 24 -#define HAVE_SCRYPT_6WAY 1 -void scrypt_core_6way(uint32_t *X, uint32_t *V, int N); -#endif - -#elif defined(USE_ASM) && defined(__i386__) - -#define SCRYPT_MAX_WAYS 4 -#define scrypt_best_throughput() 1 -void scrypt_core(uint32_t *X, uint32_t *V, int N); - -#elif defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__) - -void scrypt_core(uint32_t *X, uint32_t *V, int N); -#if defined(__ARM_NEON__) -#undef HAVE_SHA256_4WAY -#define SCRYPT_MAX_WAYS 3 -#define HAVE_SCRYPT_3WAY 1 -#define scrypt_best_throughput() 3 -void scrypt_core_3way(uint32_t *X, uint32_t *V, int N); -#endif - -#else - -static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16]) -{ - uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15; - int i; - - x00 = (B[ 0] ^= Bx[ 0]); - x01 = (B[ 1] ^= Bx[ 1]); - x02 = (B[ 2] ^= Bx[ 2]); - x03 = (B[ 3] ^= Bx[ 3]); - x04 = (B[ 4] ^= Bx[ 4]); - x05 = (B[ 5] ^= Bx[ 5]); - x06 = (B[ 6] ^= Bx[ 6]); - x07 = (B[ 7] ^= Bx[ 7]); - x08 = (B[ 8] ^= Bx[ 8]); - x09 = (B[ 9] ^= Bx[ 9]); - x10 = (B[10] ^= Bx[10]); - x11 = (B[11] ^= Bx[11]); - x12 = (B[12] ^= Bx[12]); - x13 = (B[13] ^= Bx[13]); - x14 = (B[14] ^= Bx[14]); - x15 = (B[15] ^= Bx[15]); - for (i = 0; i < 8; i += 2) { -#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) - /* Operate on columns. 
*/ - x04 ^= R(x00+x12, 7); x09 ^= R(x05+x01, 7); - x14 ^= R(x10+x06, 7); x03 ^= R(x15+x11, 7); - - x08 ^= R(x04+x00, 9); x13 ^= R(x09+x05, 9); - x02 ^= R(x14+x10, 9); x07 ^= R(x03+x15, 9); - - x12 ^= R(x08+x04,13); x01 ^= R(x13+x09,13); - x06 ^= R(x02+x14,13); x11 ^= R(x07+x03,13); - - x00 ^= R(x12+x08,18); x05 ^= R(x01+x13,18); - x10 ^= R(x06+x02,18); x15 ^= R(x11+x07,18); - - /* Operate on rows. */ - x01 ^= R(x00+x03, 7); x06 ^= R(x05+x04, 7); - x11 ^= R(x10+x09, 7); x12 ^= R(x15+x14, 7); - - x02 ^= R(x01+x00, 9); x07 ^= R(x06+x05, 9); - x08 ^= R(x11+x10, 9); x13 ^= R(x12+x15, 9); - - x03 ^= R(x02+x01,13); x04 ^= R(x07+x06,13); - x09 ^= R(x08+x11,13); x14 ^= R(x13+x12,13); - - x00 ^= R(x03+x02,18); x05 ^= R(x04+x07,18); - x10 ^= R(x09+x08,18); x15 ^= R(x14+x13,18); -#undef R - } - B[ 0] += x00; - B[ 1] += x01; - B[ 2] += x02; - B[ 3] += x03; - B[ 4] += x04; - B[ 5] += x05; - B[ 6] += x06; - B[ 7] += x07; - B[ 8] += x08; - B[ 9] += x09; - B[10] += x10; - B[11] += x11; - B[12] += x12; - B[13] += x13; - B[14] += x14; - B[15] += x15; -} - -static inline void scrypt_core(uint32_t *X, uint32_t *V, int N) -{ - int i; - - for (i = 0; i < N; i++) { - memcpy(&V[i * 32], X, 128); - xor_salsa8(&X[0], &X[16]); - xor_salsa8(&X[16], &X[0]); - } - for (i = 0; i < N; i++) { - uint32_t j = 32 * (X[16] & (N - 1)); - for (uint8_t k = 0; k < 32; k++) - X[k] ^= V[j + k]; - xor_salsa8(&X[0], &X[16]); - xor_salsa8(&X[16], &X[0]); - } -} - -#endif - -#ifndef SCRYPT_MAX_WAYS -#define SCRYPT_MAX_WAYS 1 -#define scrypt_best_throughput() 1 -#endif - -unsigned char *scrypt_buffer_alloc(int N) -{ - return (uchar*) malloc((size_t)N * SCRYPT_MAX_WAYS * 128 + 63); -} - -static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output, - uint32_t *midstate, unsigned char *scratchpad, int N) -{ - uint32_t tstate[8], ostate[8]; - uint32_t X[32]; - uint32_t *V; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - memcpy(tstate, midstate, 32); - 
HMAC_SHA256_80_init(input, tstate, ostate); - PBKDF2_SHA256_80_128(tstate, ostate, input, X); - - scrypt_core(X, V, N); - - PBKDF2_SHA256_128_32(tstate, ostate, X, output); -} - -#ifdef HAVE_SHA256_4WAY -static void scrypt_1024_1_1_256_4way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N) -{ - uint32_t _ALIGN(128) tstate[4 * 8]; - uint32_t _ALIGN(128) ostate[4 * 8]; - uint32_t _ALIGN(128) W[4 * 32]; - uint32_t _ALIGN(128) X[4 * 32]; - uint32_t *V; - int i, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[4 * i + k] = input[k * 20 + i]; - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[4 * i + k] = midstate[i]; - HMAC_SHA256_80_init_4way(W, tstate, ostate); - PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[k * 32 + i] = W[4 * i + k]; - scrypt_core(X + 0 * 32, V, N); - scrypt_core(X + 1 * 32, V, N); - scrypt_core(X + 2 * 32, V, N); - scrypt_core(X + 3 * 32, V, N); - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - W[4 * i + k] = X[k * 32 + i]; - PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W); - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - output[k * 8 + i] = W[4 * i + k]; -} -#endif /* HAVE_SHA256_4WAY */ - -#ifdef HAVE_SCRYPT_3WAY - -static void scrypt_1024_1_1_256_3way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N) -{ - uint32_t _ALIGN(64) tstate[3 * 8], ostate[3 * 8]; - uint32_t _ALIGN(64) X[3 * 32]; - uint32_t *V; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - memcpy(tstate + 0, midstate, 32); - memcpy(tstate + 8, midstate, 32); - memcpy(tstate + 16, midstate, 32); - HMAC_SHA256_80_init(input + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init(input + 20, tstate + 8, ostate + 8); - HMAC_SHA256_80_init(input + 40, tstate + 16, ostate + 16); - PBKDF2_SHA256_80_128(tstate + 
0, ostate + 0, input + 0, X + 0); - PBKDF2_SHA256_80_128(tstate + 8, ostate + 8, input + 20, X + 32); - PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64); - - scrypt_core_3way(X, V, N); - - PBKDF2_SHA256_128_32(tstate + 0, ostate + 0, X + 0, output + 0); - PBKDF2_SHA256_128_32(tstate + 8, ostate + 8, X + 32, output + 8); - PBKDF2_SHA256_128_32(tstate + 16, ostate + 16, X + 64, output + 16); -} - -#ifdef HAVE_SHA256_4WAY -static void scrypt_1024_1_1_256_12way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N) -{ - uint32_t _ALIGN(128) tstate[12 * 8]; - uint32_t _ALIGN(128) ostate[12 * 8]; - uint32_t _ALIGN(128) W[12 * 32]; - uint32_t _ALIGN(128) X[12 * 32]; - uint32_t *V; - int i, j, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (j = 0; j < 3; j++) - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i]; - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[32 * j + 4 * i + k] = midstate[i]; - HMAC_SHA256_80_init_4way(W + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32); - HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64); - PBKDF2_SHA256_80_128_4way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k]; - scrypt_core_3way(X + 0 * 96, V, N); - scrypt_core_3way(X + 1 * 96, V, N); - scrypt_core_3way(X + 2 * 96, V, N); - scrypt_core_3way(X + 3 * 96, V, N); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i]; - PBKDF2_SHA256_128_32_4way(tstate + 0, ostate + 0, W + 0, W + 0); - 
PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256); - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k]; -} -#endif /* HAVE_SHA256_4WAY */ - -#endif /* HAVE_SCRYPT_3WAY */ - -#ifdef HAVE_SCRYPT_6WAY -static void scrypt_1024_1_1_256_24way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N) -{ - uint32_t _ALIGN(128) tstate[24 * 8]; - uint32_t _ALIGN(128) ostate[24 * 8]; - uint32_t _ALIGN(128) W[24 * 32]; - uint32_t _ALIGN(128) X[24 * 32]; - uint32_t *V; - int i, j, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (j = 0; j < 3; j++) - for (i = 0; i < 20; i++) - for (k = 0; k < 8; k++) - W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i]; - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 8; k++) - tstate[8 * 8 * j + 8 * i + k] = midstate[i]; - HMAC_SHA256_80_init_8way(W + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init_8way(W + 256, tstate + 64, ostate + 64); - HMAC_SHA256_80_init_8way(W + 512, tstate + 128, ostate + 128); - PBKDF2_SHA256_80_128_8way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_80_128_8way(tstate + 64, ostate + 64, W + 256, W + 256); - PBKDF2_SHA256_80_128_8way(tstate + 128, ostate + 128, W + 512, W + 512); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 8; k++) - X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k]; - scrypt_core_6way(X + 0 * 32, V, N); - scrypt_core_6way(X + 6 * 32, V, N); - scrypt_core_6way(X + 12 * 32, V, N); - scrypt_core_6way(X + 18 * 32, V, N); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 8; k++) - W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i]; - PBKDF2_SHA256_128_32_8way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_128_32_8way(tstate + 64, ostate + 64, W + 256, W + 256); 
- PBKDF2_SHA256_128_32_8way(tstate + 128, ostate + 128, W + 512, W + 512); - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 8; k++) - output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k]; -} -#endif /* HAVE_SCRYPT_6WAY */ - -extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; - uint32_t midstate[8]; - uint32_t n = pdata[19] - 1; - const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - int throughput = scrypt_best_throughput(); - int i; - -#ifdef HAVE_SHA256_4WAY - if (sha256_use_4way()) - throughput *= 4; -#endif - - for (i = 0; i < throughput; i++) - memcpy(data + i * 20, pdata, 80); - - sha256_init(midstate); - sha256_transform(midstate, data, 0); - - do { - - for (i = 0; i < throughput; i++) - data[i * 20 + 19] = ++n; - -#if defined(HAVE_SHA256_4WAY) - if (throughput == 4) - scrypt_1024_1_1_256_4way(data, hash, midstate, - scratchbuf, scratchbuf_size ); - else -#endif -#if defined(HAVE_SCRYPT_3WAY) && defined(HAVE_SHA256_4WAY) - if (throughput == 12) - scrypt_1024_1_1_256_12way(data, hash, midstate, - scratchbuf, scratchbuf_size ); - else -#endif -#if defined(HAVE_SCRYPT_6WAY) - if (throughput == 24) - scrypt_1024_1_1_256_24way(data, hash, midstate, - scratchbuf, scratchbuf_size ); - else -#endif -#if defined(HAVE_SCRYPT_3WAY) - if (throughput == 3) - scrypt_1024_1_1_256_3way(data, hash, midstate, - scratchbuf, scratchbuf_size ); - else -#endif - scrypt_1024_1_1_256(data, hash, midstate, scratchbuf, - scratchbuf_size ); - - for (i = 0; i < throughput; i++) { - if (unlikely(hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget))) { - *hashes_done = n - pdata[19] + 1; - pdata[19] = data[i * 20 + 19]; - work_set_target_ratio( work, hash ); - return 1; - } - } - } while (likely(n < max_nonce && 
!work_restart[thr_id].restart)); - - *hashes_done = n - pdata[19] + 1; - pdata[19] = n; - return 0; -} - -int64_t scrypt_get_max64() { return 0xfff; } - -bool scrypt_miner_thread_init( int thr_id ) -{ - scratchbuf = scrypt_buffer_alloc( scratchbuf_size ); - if ( scratchbuf ) - return true; - applog( LOG_ERR, "Thread %u: Scrypt buffer allocation failed", thr_id ); - return false; -} - -bool register_scrypt_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AVX2_OPT; - gate->miner_thread_init =(void*)&scrypt_miner_thread_init; - gate->scanhash = (void*)&scanhash_scrypt; -// gate->hash = (void*)&scrypt_1024_1_1_256_24way; - gate->set_target = (void*)&scrypt_set_target; - gate->get_max64 = (void*)&scrypt_get_max64; - - if ( !opt_scrypt_n ) - scratchbuf_size = 1024; - else - scratchbuf_size = opt_scrypt_n; - return true; -}; - diff --git a/algo/scryptjane/scrypt-conf.h b/algo/scryptjane/scrypt-conf.h deleted file mode 100644 index 46685a5..0000000 --- a/algo/scryptjane/scrypt-conf.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - pick the best algo at runtime or compile time? - ---------------------------------------------- - SCRYPT_CHOOSE_COMPILETIME (gcc only!) 
- SCRYPT_CHOOSE_RUNTIME -*/ -#define SCRYPT_CHOOSE_RUNTIME - - -/* - hash function to use - ------------------------------- - SCRYPT_BLAKE256 - SCRYPT_BLAKE512 - SCRYPT_SHA256 - SCRYPT_SHA512 - SCRYPT_SKEIN512 -*/ -//#define SCRYPT_SHA256 - - -/* - block mixer to use - ----------------------------- - SCRYPT_CHACHA - SCRYPT_SALSA -*/ -//#define SCRYPT_SALSA diff --git a/algo/scryptjane/scrypt-jane-chacha.h b/algo/scryptjane/scrypt-jane-chacha.h deleted file mode 100644 index 128e347..0000000 --- a/algo/scryptjane/scrypt-jane-chacha.h +++ /dev/null @@ -1,148 +0,0 @@ -#define SCRYPT_MIX_BASE "ChaCha20/8" - -typedef uint32_t scrypt_mix_word_t; - -#define SCRYPT_WORDTO8_LE U32TO8_LE -#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP - -#define SCRYPT_BLOCK_BYTES 64 -#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) - -/* must have these here in case block bytes is ever != 64 */ -#include "scrypt-jane-romix-basic.h" - -#include "scrypt-jane-mix_chacha-avx.h" -#include "scrypt-jane-mix_chacha-ssse3.h" -#include "scrypt-jane-mix_chacha-sse2.h" -#include "scrypt-jane-mix_chacha.h" - -#if defined(SCRYPT_CHACHA_AVX) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx - #if defined(X86_INTRINSIC_AVX) - #define SCRYPT_CHUNKMIX_1_FN scrypt_ChunkMix_avx_1 - #define SCRYPT_CHUNKMIX_1_XOR_FN scrypt_ChunkMix_avx_1_xor - #endif - #define SCRYPT_ROMIX_FN scrypt_ROMix_avx - #define SCRYPT_MIX_FN chacha_core_avx - #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop - #define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop - #include "scrypt-jane-romix-template.h" -#endif - -#if defined(SCRYPT_CHACHA_SSSE3) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3 - #if defined(X86_INTRINSIC_SSSE3) - #define SCRYPT_CHUNKMIX_1_FN scrypt_ChunkMix_ssse3_1 - #define SCRYPT_CHUNKMIX_1_XOR_FN scrypt_ChunkMix_ssse3_1_xor - #endif - #define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3 - #define SCRYPT_MIX_FN chacha_core_ssse3 - #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop - #define 
SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop - #include "scrypt-jane-romix-template.h" -#endif - -#if defined(SCRYPT_CHACHA_SSE2) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2 - #if defined(X86_INTRINSIC_SSE2) - #define SCRYPT_CHUNKMIX_1_FN scrypt_ChunkMix_sse2_1 - #define SCRYPT_CHUNKMIX_1_XOR_FN scrypt_ChunkMix_sse2_1_xor - #endif - #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2 - #define SCRYPT_MIX_FN chacha_core_sse2 - #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop - #define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop - #include "scrypt-jane-romix-template.h" -#endif - -/* cpu agnostic */ -#define SCRYPT_ROMIX_FN scrypt_ROMix_basic -#define SCRYPT_MIX_FN chacha_core_basic -#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian -#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian -#include "scrypt-jane-romix-template.h" - -#if !defined(SCRYPT_CHOOSE_COMPILETIME) -static scrypt_ROMixfn -scrypt_getROMix() { - size_t cpuflags = detect_cpu(); - -#if defined(SCRYPT_CHACHA_AVX) - if (cpuflags & cpu_avx) - return scrypt_ROMix_avx; - else -#endif - -#if defined(SCRYPT_CHACHA_SSSE3) - if (cpuflags & cpu_ssse3) - return scrypt_ROMix_ssse3; - else -#endif - -#if defined(SCRYPT_CHACHA_SSE2) - if (cpuflags & cpu_sse2) - return scrypt_ROMix_sse2; - else -#endif - - return scrypt_ROMix_basic; -} -#endif - - -#if defined(SCRYPT_TEST_SPEED) -static size_t -available_implementations() { - size_t cpuflags = detect_cpu(); - size_t flags = 0; - -#if defined(SCRYPT_CHACHA_AVX) - if (cpuflags & cpu_avx) - flags |= cpu_avx; -#endif - -#if defined(SCRYPT_CHACHA_SSSE3) - if (cpuflags & cpu_ssse3) - flags |= cpu_ssse3; -#endif - -#if defined(SCRYPT_CHACHA_SSE2) - if (cpuflags & cpu_sse2) - flags |= cpu_sse2; -#endif - - return flags; -} -#endif -/* -static int -scrypt_test_mix() { - static const uint8_t expected[16] = { - 0x48,0x2b,0x2d,0xb8,0xa1,0x33,0x22,0x73,0xcd,0x16,0xc4,0xb4,0xb0,0x7f,0xb1,0x8a, - }; - - int ret = 1; - size_t cpuflags = detect_cpu(); - -#if 
defined(SCRYPT_CHACHA_AVX) - if (cpuflags & cpu_avx) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, scrypt_romix_nop, scrypt_romix_nop, expected); -#endif - -#if defined(SCRYPT_CHACHA_SSSE3) - if (cpuflags & cpu_ssse3) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, scrypt_romix_nop, scrypt_romix_nop, expected); -#endif - -#if defined(SCRYPT_CHACHA_SSE2) - if (cpuflags & cpu_sse2) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, scrypt_romix_nop, scrypt_romix_nop, expected); -#endif - -#if defined(SCRYPT_CHACHA_BASIC) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); -#endif - - return ret; -} -*/ diff --git a/algo/scryptjane/scrypt-jane-hash.h b/algo/scryptjane/scrypt-jane-hash.h deleted file mode 100644 index 264eb48..0000000 --- a/algo/scryptjane/scrypt-jane-hash.h +++ /dev/null @@ -1,48 +0,0 @@ -#if defined(SCRYPT_BLAKE512) -#include "scrypt-jane-hash_blake512.h" -#elif defined(SCRYPT_BLAKE256) -#include "scrypt-jane-hash_blake256.h" -#elif defined(SCRYPT_SHA512) -#include "scrypt-jane-hash_sha512.h" -#elif defined(SCRYPT_SHA256) -#include "scrypt-jane-hash_sha256.h" -#elif defined(SCRYPT_SKEIN512) -#include "scrypt-jane-hash_skein512.h" -#elif defined(SCRYPT_KECCAK512) || defined(SCRYPT_KECCAK256) -#include "scrypt-jane-hash_keccak.h" -#else - #define SCRYPT_HASH "ERROR" - #define SCRYPT_HASH_BLOCK_SIZE 64 - #define SCRYPT_HASH_DIGEST_SIZE 64 - typedef struct scrypt_hash_state_t { size_t dummy; } scrypt_hash_state; - typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; - static void scrypt_hash_init(scrypt_hash_state *S) {} - static void scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {} - static void scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {} - static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {0}; - #error must define a hash function! 
-#endif - -#include "scrypt-jane-pbkdf2.h" - -#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */ -/* -static int -scrypt_test_hash() { - scrypt_hash_state st; - scrypt_hash_digest hash, final; - uint8_t msg[SCRYPT_TEST_HASH_LEN]; - size_t i; - - for (i = 0; i < SCRYPT_TEST_HASH_LEN; i++) - msg[i] = (uint8_t)i; - - scrypt_hash_init(&st); - for (i = 0; i < SCRYPT_TEST_HASH_LEN + 1; i++) { - scrypt_hash(hash, msg, i); - scrypt_hash_update(&st, hash, sizeof(hash)); - } - scrypt_hash_finish(&st, final); - return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE); -} -*/ diff --git a/algo/scryptjane/scrypt-jane-hash_blake256.h b/algo/scryptjane/scrypt-jane-hash_blake256.h deleted file mode 100644 index dee9013..0000000 --- a/algo/scryptjane/scrypt-jane-hash_blake256.h +++ /dev/null @@ -1,177 +0,0 @@ -#define SCRYPT_HASH "BLAKE-256" -#define SCRYPT_HASH_BLOCK_SIZE 64 -#define SCRYPT_HASH_DIGEST_SIZE 32 - -typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; - -const uint8_t blake256_sigma[] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, - 14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3, - 11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4, - 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8, - 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13, - 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9, - 12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11, - 13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10, - 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5, - 10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0, -}; - -const uint32_t blake256_constants[16] = { - 0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344,0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89, - 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c,0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917 -}; - -typedef struct scrypt_hash_state_t { - uint32_t H[8], T[2]; - uint32_t leftover; - uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; -} scrypt_hash_state; - -static void -blake256_blocks(scrypt_hash_state 
*S, const uint8_t *in, size_t blocks) { - const uint8_t *sigma, *sigma_end = blake256_sigma + (10 * 16); - uint32_t m[16], v[16], h[8], t[2]; - uint32_t i; - - for (i = 0; i < 8; i++) h[i] = S->H[i]; - for (i = 0; i < 2; i++) t[i] = S->T[i]; - - while (blocks--) { - t[0] += 512; - t[1] += (t[0] < 512) ? 1 : 0; - - for (i = 0; i < 8; i++) v[i ] = h[i]; - for (i = 0; i < 4; i++) v[i + 8] = blake256_constants[i]; - for (i = 0; i < 2; i++) v[i + 12] = blake256_constants[i+4] ^ t[0]; - for (i = 0; i < 2; i++) v[i + 14] = blake256_constants[i+6] ^ t[1]; - - for (i = 0; i < 16; i++) m[i] = U8TO32_BE(&in[i * 4]); - in += 64; - - #define G(a,b,c,d,e) \ - v[a] += (m[sigma[e+0]] ^ blake256_constants[sigma[e+1]]) + v[b]; \ - v[d] = ROTR32(v[d] ^ v[a],16); \ - v[c] += v[d]; \ - v[b] = ROTR32(v[b] ^ v[c],12); \ - v[a] += (m[sigma[e+1]] ^ blake256_constants[sigma[e+0]]) + v[b]; \ - v[d] = ROTR32(v[d] ^ v[a], 8); \ - v[c] += v[d]; \ - v[b] = ROTR32(v[b] ^ v[c], 7); - - for (i = 0, sigma = blake256_sigma; i < 14; i++) { - G(0, 4, 8,12, 0); - G(1, 5, 9,13, 2); - G(2, 6,10,14, 4); - G(3, 7,11,15, 6); - - G(0, 5,10,15, 8); - G(1, 6,11,12,10); - G(2, 7, 8,13,12); - G(3, 4, 9,14,14); - - sigma += 16; - if (sigma == sigma_end) - sigma = blake256_sigma; - } - - #undef G - - for (i = 0; i < 8; i++) h[i] ^= (v[i] ^ v[i + 8]); - } - - for (i = 0; i < 8; i++) S->H[i] = h[i]; - for (i = 0; i < 2; i++) S->T[i] = t[i]; -} - -static void -scrypt_hash_init(scrypt_hash_state *S) { - S->H[0] = 0x6a09e667ULL; - S->H[1] = 0xbb67ae85ULL; - S->H[2] = 0x3c6ef372ULL; - S->H[3] = 0xa54ff53aULL; - S->H[4] = 0x510e527fULL; - S->H[5] = 0x9b05688cULL; - S->H[6] = 0x1f83d9abULL; - S->H[7] = 0x5be0cd19ULL; - S->T[0] = 0; - S->T[1] = 0; - S->leftover = 0; -} - -static void -scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { - size_t blocks, want; - - /* handle the previous data */ - if (S->leftover) { - want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); - want = (want < inlen) ? 
want : inlen; - memcpy(S->buffer + S->leftover, in, want); - S->leftover += (uint32_t)want; - if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) - return; - in += want; - inlen -= want; - blake256_blocks(S, S->buffer, 1); - } - - /* handle the current data */ - blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); - S->leftover = (uint32_t)(inlen - blocks); - if (blocks) { - blake256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); - in += blocks; - } - - /* handle leftover data */ - if (S->leftover) - memcpy(S->buffer, in, S->leftover); -} - -static void -scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { - uint32_t th, tl, bits; - - bits = (S->leftover << 3); - tl = S->T[0] + bits; - th = S->T[1]; - if (S->leftover == 0) { - S->T[0] = (uint32_t)0 - (uint32_t)512; - S->T[1] = (uint32_t)0 - (uint32_t)1; - } else if (S->T[0] == 0) { - S->T[0] = ((uint32_t)0 - (uint32_t)512) + bits; - S->T[1] = S->T[1] - 1; - } else { - S->T[0] -= (512 - bits); - } - - S->buffer[S->leftover] = 0x80; - if (S->leftover <= 55) { - memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover); - } else { - memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover); - blake256_blocks(S, S->buffer, 1); - S->T[0] = (uint32_t)0 - (uint32_t)512; - S->T[1] = (uint32_t)0 - (uint32_t)1; - memset(S->buffer, 0, 56); - } - S->buffer[55] |= 1; - U32TO8_BE(S->buffer + 56, th); - U32TO8_BE(S->buffer + 60, tl); - blake256_blocks(S, S->buffer, 1); - - U32TO8_BE(&hash[ 0], S->H[0]); - U32TO8_BE(&hash[ 4], S->H[1]); - U32TO8_BE(&hash[ 8], S->H[2]); - U32TO8_BE(&hash[12], S->H[3]); - U32TO8_BE(&hash[16], S->H[4]); - U32TO8_BE(&hash[20], S->H[5]); - U32TO8_BE(&hash[24], S->H[6]); - U32TO8_BE(&hash[28], S->H[7]); -} - -static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { - 0xcc,0xa9,0x1e,0xa9,0x20,0x97,0x37,0x40,0x17,0xc0,0xa0,0x52,0x87,0xfc,0x08,0x20, - 0x40,0xf5,0x81,0x86,0x62,0x75,0x78,0xb2,0x79,0xce,0xde,0x27,0x3c,0x7f,0x85,0xd8, -}; diff --git a/algo/scryptjane/scrypt-jane-hash_blake512.h 
b/algo/scryptjane/scrypt-jane-hash_blake512.h deleted file mode 100644 index ea2a583..0000000 --- a/algo/scryptjane/scrypt-jane-hash_blake512.h +++ /dev/null @@ -1,181 +0,0 @@ -#define SCRYPT_HASH "BLAKE-512" -#define SCRYPT_HASH_BLOCK_SIZE 128 -#define SCRYPT_HASH_DIGEST_SIZE 64 - -typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; - -const uint8_t blake512_sigma[] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, - 14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3, - 11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4, - 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8, - 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13, - 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9, - 12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11, - 13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10, - 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5, - 10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0, -}; - -const uint64_t blake512_constants[16] = { - 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL, - 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL, - 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL, - 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL -}; - -typedef struct scrypt_hash_state_t { - uint64_t H[8], T[2]; - uint32_t leftover; - uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; -} scrypt_hash_state; - -static void -blake512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) { - const uint8_t *sigma, *sigma_end = blake512_sigma + (10 * 16); - uint64_t m[16], v[16], h[8], t[2]; - uint32_t i; - - for (i = 0; i < 8; i++) h[i] = S->H[i]; - for (i = 0; i < 2; i++) t[i] = S->T[i]; - - while (blocks--) { - t[0] += 1024; - t[1] += (t[0] < 1024) ? 
1 : 0; - - for (i = 0; i < 8; i++) v[i ] = h[i]; - for (i = 0; i < 4; i++) v[i + 8] = blake512_constants[i]; - for (i = 0; i < 2; i++) v[i + 12] = blake512_constants[i+4] ^ t[0]; - for (i = 0; i < 2; i++) v[i + 14] = blake512_constants[i+6] ^ t[1]; - - for (i = 0; i < 16; i++) m[i] = U8TO64_BE(&in[i * 8]); - in += 128; - - #define G(a,b,c,d,e) \ - v[a] += (m[sigma[e+0]] ^ blake512_constants[sigma[e+1]]) + v[b]; \ - v[d] = ROTR64(v[d] ^ v[a],32); \ - v[c] += v[d]; \ - v[b] = ROTR64(v[b] ^ v[c],25); \ - v[a] += (m[sigma[e+1]] ^ blake512_constants[sigma[e+0]]) + v[b]; \ - v[d] = ROTR64(v[d] ^ v[a],16); \ - v[c] += v[d]; \ - v[b] = ROTR64(v[b] ^ v[c],11); - - for (i = 0, sigma = blake512_sigma; i < 16; i++) { - G(0, 4, 8,12, 0); - G(1, 5, 9,13, 2); - G(2, 6,10,14, 4); - G(3, 7,11,15, 6); - G(0, 5,10,15, 8); - G(1, 6,11,12,10); - G(2, 7, 8,13,12); - G(3, 4, 9,14,14); - - sigma += 16; - if (sigma == sigma_end) - sigma = blake512_sigma; - } - - #undef G - - for (i = 0; i < 8; i++) h[i] ^= (v[i] ^ v[i + 8]); - } - - for (i = 0; i < 8; i++) S->H[i] = h[i]; - for (i = 0; i < 2; i++) S->T[i] = t[i]; -} - -static void -scrypt_hash_init(scrypt_hash_state *S) { - S->H[0] = 0x6a09e667f3bcc908ULL; - S->H[1] = 0xbb67ae8584caa73bULL; - S->H[2] = 0x3c6ef372fe94f82bULL; - S->H[3] = 0xa54ff53a5f1d36f1ULL; - S->H[4] = 0x510e527fade682d1ULL; - S->H[5] = 0x9b05688c2b3e6c1fULL; - S->H[6] = 0x1f83d9abfb41bd6bULL; - S->H[7] = 0x5be0cd19137e2179ULL; - S->T[0] = 0; - S->T[1] = 0; - S->leftover = 0; -} - -static void -scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { - size_t blocks, want; - - /* handle the previous data */ - if (S->leftover) { - want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); - want = (want < inlen) ? 
want : inlen; - memcpy(S->buffer + S->leftover, in, want); - S->leftover += (uint32_t)want; - if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) - return; - in += want; - inlen -= want; - blake512_blocks(S, S->buffer, 1); - } - - /* handle the current data */ - blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); - S->leftover = (uint32_t)(inlen - blocks); - if (blocks) { - blake512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); - in += blocks; - } - - /* handle leftover data */ - if (S->leftover) - memcpy(S->buffer, in, S->leftover); -} - -static void -scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { - uint64_t th, tl; - size_t bits; - - bits = (S->leftover << 3); - tl = S->T[0] + bits; - th = S->T[1]; - if (S->leftover == 0) { - S->T[0] = (uint64_t)0 - (uint64_t)1024; - S->T[1] = (uint64_t)0 - (uint64_t)1; - } else if (S->T[0] == 0) { - S->T[0] = ((uint64_t)0 - (uint64_t)1024) + bits; - S->T[1] = S->T[1] - 1; - } else { - S->T[0] -= (1024 - bits); - } - - S->buffer[S->leftover] = 0x80; - if (S->leftover <= 111) { - memset(S->buffer + S->leftover + 1, 0, 111 - S->leftover); - } else { - memset(S->buffer + S->leftover + 1, 0, 127 - S->leftover); - blake512_blocks(S, S->buffer, 1); - S->T[0] = (uint64_t)0 - (uint64_t)1024; - S->T[1] = (uint64_t)0 - (uint64_t)1; - memset(S->buffer, 0, 112); - } - S->buffer[111] |= 1; - U64TO8_BE(S->buffer + 112, th); - U64TO8_BE(S->buffer + 120, tl); - blake512_blocks(S, S->buffer, 1); - - U64TO8_BE(&hash[ 0], S->H[0]); - U64TO8_BE(&hash[ 8], S->H[1]); - U64TO8_BE(&hash[16], S->H[2]); - U64TO8_BE(&hash[24], S->H[3]); - U64TO8_BE(&hash[32], S->H[4]); - U64TO8_BE(&hash[40], S->H[5]); - U64TO8_BE(&hash[48], S->H[6]); - U64TO8_BE(&hash[56], S->H[7]); -} - -static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { - 0x2f,0x9d,0x5b,0xbe,0x24,0x0d,0x63,0xd3,0xa0,0xac,0x4f,0xd3,0x01,0xc0,0x23,0x6f, - 0x6d,0xdf,0x6e,0xfb,0x60,0x6f,0xa0,0x74,0xdf,0x9f,0x25,0x65,0xb6,0x11,0x0a,0x83, - 
0x23,0x96,0xba,0x91,0x68,0x4b,0x85,0x15,0x13,0x54,0xba,0x19,0xf3,0x2c,0x5a,0x4a, - 0x1f,0x78,0x31,0x02,0xc9,0x1e,0x56,0xc4,0x54,0xca,0xf9,0x8f,0x2c,0x7f,0x85,0xac -}; diff --git a/algo/scryptjane/scrypt-jane-hash_keccak.h b/algo/scryptjane/scrypt-jane-hash_keccak.h deleted file mode 100644 index 7ed5574..0000000 --- a/algo/scryptjane/scrypt-jane-hash_keccak.h +++ /dev/null @@ -1,168 +0,0 @@ -#if defined(SCRYPT_KECCAK256) - #define SCRYPT_HASH "Keccak-256" - #define SCRYPT_HASH_DIGEST_SIZE 32 -#else - #define SCRYPT_HASH "Keccak-512" - #define SCRYPT_HASH_DIGEST_SIZE 64 -#endif -#define SCRYPT_KECCAK_F 1600 -#define SCRYPT_KECCAK_C (SCRYPT_HASH_DIGEST_SIZE * 8 * 2) /* 256=512, 512=1024 */ -#define SCRYPT_KECCAK_R (SCRYPT_KECCAK_F - SCRYPT_KECCAK_C) /* 256=1088, 512=576 */ -#define SCRYPT_HASH_BLOCK_SIZE (SCRYPT_KECCAK_R / 8) - -typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; - -typedef struct scrypt_hash_state_t { - uint64_t state[SCRYPT_KECCAK_F / 64]; - uint32_t leftover; - uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; -} scrypt_hash_state; - -static const uint64_t keccak_round_constants[24] = { - 0x0000000000000001ull, 0x0000000000008082ull, - 0x800000000000808aull, 0x8000000080008000ull, - 0x000000000000808bull, 0x0000000080000001ull, - 0x8000000080008081ull, 0x8000000000008009ull, - 0x000000000000008aull, 0x0000000000000088ull, - 0x0000000080008009ull, 0x000000008000000aull, - 0x000000008000808bull, 0x800000000000008bull, - 0x8000000000008089ull, 0x8000000000008003ull, - 0x8000000000008002ull, 0x8000000000000080ull, - 0x000000000000800aull, 0x800000008000000aull, - 0x8000000080008081ull, 0x8000000000008080ull, - 0x0000000080000001ull, 0x8000000080008008ull -}; - -static void -keccak_block(scrypt_hash_state *S, const uint8_t *in) { - size_t i; - uint64_t *s = S->state, t[5], u[5], v, w; - - /* absorb input */ - for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE / 8; i++, in += 8) - s[i] ^= U8TO64_LE(in); - - for (i = 0; i < 24; i++) { - /* theta: c = a[0,i] ^ a[1,i] 
^ .. a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROTL64(t[1], 1); - u[1] = t[0] ^ ROTL64(t[2], 1); - u[2] = t[1] ^ ROTL64(t[3], 1); - u[3] = t[2] ^ ROTL64(t[4], 1); - u[4] = t[3] ^ ROTL64(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[ 1]; - s[ 1] = ROTL64(s[ 6], 44); - s[ 6] = ROTL64(s[ 9], 20); - s[ 9] = ROTL64(s[22], 61); - s[22] = ROTL64(s[14], 39); - s[14] = ROTL64(s[20], 18); - s[20] = ROTL64(s[ 2], 62); - s[ 2] = ROTL64(s[12], 43); - s[12] = ROTL64(s[13], 25); - s[13] = ROTL64(s[19], 8); - s[19] = ROTL64(s[23], 56); - s[23] = ROTL64(s[15], 41); - s[15] = ROTL64(s[ 4], 27); - s[ 4] = ROTL64(s[24], 14); - s[24] = ROTL64(s[21], 2); - s[21] = ROTL64(s[ 8], 55); - s[ 8] = ROTL64(s[16], 45); - s[16] = ROTL64(s[ 5], 36); - s[ 5] = ROTL64(s[ 3], 28); - s[ 3] = ROTL64(s[18], 21); - s[18] = ROTL64(s[17], 15); - s[17] = ROTL64(s[11], 10); - s[11] = ROTL64(s[ 7], 6); - s[ 7] = ROTL64(s[10], 3); - s[10] = ROTL64( v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; - v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & 
s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= keccak_round_constants[i]; - } -} - -static void -scrypt_hash_init(scrypt_hash_state *S) { - memset(S, 0, sizeof(*S)); -} - -static void -scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { - size_t want; - - /* handle the previous data */ - if (S->leftover) { - want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); - want = (want < inlen) ? want : inlen; - memcpy(S->buffer + S->leftover, in, want); - S->leftover += (uint32_t)want; - if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) - return; - in += want; - inlen -= want; - keccak_block(S, S->buffer); - } - - /* handle the current data */ - while (inlen >= SCRYPT_HASH_BLOCK_SIZE) { - keccak_block(S, in); - in += SCRYPT_HASH_BLOCK_SIZE; - inlen -= SCRYPT_HASH_BLOCK_SIZE; - } - - /* handle leftover data */ - S->leftover = (uint32_t)inlen; - if (S->leftover) - memcpy(S->buffer, in, S->leftover); -} - -static void -scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { - size_t i; - - S->buffer[S->leftover] = 0x01; - memset(S->buffer + (S->leftover + 1), 0, SCRYPT_HASH_BLOCK_SIZE - (S->leftover + 1)); - S->buffer[SCRYPT_HASH_BLOCK_SIZE - 1] |= 0x80; - keccak_block(S, S->buffer); - - for (i = 0; i < SCRYPT_HASH_DIGEST_SIZE; i += 8) { - U64TO8_LE(&hash[i], S->state[i / 8]); - } -} - -#if defined(SCRYPT_KECCAK256) -static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { - 0x26,0xb7,0x10,0xb3,0x66,0xb1,0xd1,0xb1,0x25,0xfc,0x3e,0xe3,0x1e,0x33,0x1d,0x19, - 0x94,0xaa,0x63,0x7a,0xd5,0x77,0x29,0xb4,0x27,0xe9,0xe0,0xf4,0x19,0xba,0x68,0xea, -}; -#else -static const 
uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { - 0x17,0xc7,0x8c,0xa0,0xd9,0x08,0x1d,0xba,0x8a,0xc8,0x3e,0x07,0x90,0xda,0x91,0x88, - 0x25,0xbd,0xd3,0xf8,0x78,0x4a,0x8d,0x5e,0xe4,0x96,0x9c,0x01,0xf3,0xeb,0xdc,0x12, - 0xea,0x35,0x57,0xba,0x94,0xb8,0xe9,0xb9,0x27,0x45,0x0a,0x48,0x5c,0x3d,0x69,0xf0, - 0xdb,0x22,0x38,0xb5,0x52,0x22,0x29,0xea,0x7a,0xb2,0xe6,0x07,0xaa,0x37,0x4d,0xe6, -}; -#endif - diff --git a/algo/scryptjane/scrypt-jane-hash_sha256.h b/algo/scryptjane/scrypt-jane-hash_sha256.h deleted file mode 100644 index d06d3e1..0000000 --- a/algo/scryptjane/scrypt-jane-hash_sha256.h +++ /dev/null @@ -1,135 +0,0 @@ -#define SCRYPT_HASH "SHA-2-256" -#define SCRYPT_HASH_BLOCK_SIZE 64 -#define SCRYPT_HASH_DIGEST_SIZE 32 - -typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; - -typedef struct scrypt_hash_state_t { - uint32_t H[8]; - uint64_t T; - uint32_t leftover; - uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; -} scrypt_hash_state; - -static const uint32_t sha256_constants[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - -#define Ch(x,y,z) (z ^ (x & (y ^ z))) -#define Maj(x,y,z) (((x | y) & z) | (x & y)) -#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) -#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ 
ROTR32(x, 25)) -#define G0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ (x >> 3)) -#define G1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ (x >> 10)) -#define W0(in,i) (U8TO32_BE(&in[i * 4])) -#define W1(i) (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16]) -#define STEP(i) \ - t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \ - t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha256_constants[i] + w[i]; \ - r[7] = r[6]; \ - r[6] = r[5]; \ - r[5] = r[4]; \ - r[4] = r[3] + t0; \ - r[3] = r[2]; \ - r[2] = r[1]; \ - r[1] = r[0]; \ - r[0] = t0 + t1; - -static void -sha256_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) { - uint32_t r[8], w[64], t0, t1; - size_t i; - - for (i = 0; i < 8; i++) r[i] = S->H[i]; - - while (blocks--) { - for (i = 0; i < 16; i++) { w[i] = W0(in, i); } - for (i = 16; i < 64; i++) { w[i] = W1(i); } - for (i = 0; i < 64; i++) { STEP(i); } - for (i = 0; i < 8; i++) { r[i] += S->H[i]; S->H[i] = r[i]; } - S->T += SCRYPT_HASH_BLOCK_SIZE * 8; - in += SCRYPT_HASH_BLOCK_SIZE; - } -} - -static void -scrypt_hash_init(scrypt_hash_state *S) { - S->H[0] = 0x6a09e667; - S->H[1] = 0xbb67ae85; - S->H[2] = 0x3c6ef372; - S->H[3] = 0xa54ff53a; - S->H[4] = 0x510e527f; - S->H[5] = 0x9b05688c; - S->H[6] = 0x1f83d9ab; - S->H[7] = 0x5be0cd19; - S->T = 0; - S->leftover = 0; -} - -static void -scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { - size_t blocks, want; - - /* handle the previous data */ - if (S->leftover) { - want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); - want = (want < inlen) ? 
want : inlen; - memcpy(S->buffer + S->leftover, in, want); - S->leftover += (uint32_t)want; - if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) - return; - in += want; - inlen -= want; - sha256_blocks(S, S->buffer, 1); - } - - /* handle the current data */ - blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); - S->leftover = (uint32_t)(inlen - blocks); - if (blocks) { - sha256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); - in += blocks; - } - - /* handle leftover data */ - if (S->leftover) - memcpy(S->buffer, in, S->leftover); -} - -static void -scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { - uint64_t t = S->T + (S->leftover * 8); - - S->buffer[S->leftover] = 0x80; - if (S->leftover <= 55) { - memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover); - } else { - memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover); - sha256_blocks(S, S->buffer, 1); - memset(S->buffer, 0, 56); - } - - U64TO8_BE(S->buffer + 56, t); - sha256_blocks(S, S->buffer, 1); - - U32TO8_BE(&hash[ 0], S->H[0]); - U32TO8_BE(&hash[ 4], S->H[1]); - U32TO8_BE(&hash[ 8], S->H[2]); - U32TO8_BE(&hash[12], S->H[3]); - U32TO8_BE(&hash[16], S->H[4]); - U32TO8_BE(&hash[20], S->H[5]); - U32TO8_BE(&hash[24], S->H[6]); - U32TO8_BE(&hash[28], S->H[7]); -} - -static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { - 0xee,0x36,0xae,0xa6,0x65,0xf0,0x28,0x7d,0xc9,0xde,0xd8,0xad,0x48,0x33,0x7d,0xbf, - 0xcb,0xc0,0x48,0xfa,0x5f,0x92,0xfd,0x0a,0x95,0x6f,0x34,0x8e,0x8c,0x1e,0x73,0xad, -}; diff --git a/algo/scryptjane/scrypt-jane-hash_sha512.h b/algo/scryptjane/scrypt-jane-hash_sha512.h deleted file mode 100644 index 3e3997d..0000000 --- a/algo/scryptjane/scrypt-jane-hash_sha512.h +++ /dev/null @@ -1,152 +0,0 @@ -#define SCRYPT_HASH "SHA-2-512" -#define SCRYPT_HASH_BLOCK_SIZE 128 -#define SCRYPT_HASH_DIGEST_SIZE 64 - -typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; - -typedef struct scrypt_hash_state_t { - uint64_t H[8]; - uint64_t T[2]; - uint32_t leftover; - uint8_t 
buffer[SCRYPT_HASH_BLOCK_SIZE]; -} scrypt_hash_state; - -static const uint64_t sha512_constants[80] = { - 0x428a2f98d728ae22ull, 0x7137449123ef65cdull, 0xb5c0fbcfec4d3b2full, 0xe9b5dba58189dbbcull, - 0x3956c25bf348b538ull, 0x59f111f1b605d019ull, 0x923f82a4af194f9bull, 0xab1c5ed5da6d8118ull, - 0xd807aa98a3030242ull, 0x12835b0145706fbeull, 0x243185be4ee4b28cull, 0x550c7dc3d5ffb4e2ull, - 0x72be5d74f27b896full, 0x80deb1fe3b1696b1ull, 0x9bdc06a725c71235ull, 0xc19bf174cf692694ull, - 0xe49b69c19ef14ad2ull, 0xefbe4786384f25e3ull, 0x0fc19dc68b8cd5b5ull, 0x240ca1cc77ac9c65ull, - 0x2de92c6f592b0275ull, 0x4a7484aa6ea6e483ull, 0x5cb0a9dcbd41fbd4ull, 0x76f988da831153b5ull, - 0x983e5152ee66dfabull, 0xa831c66d2db43210ull, 0xb00327c898fb213full, 0xbf597fc7beef0ee4ull, - 0xc6e00bf33da88fc2ull, 0xd5a79147930aa725ull, 0x06ca6351e003826full, 0x142929670a0e6e70ull, - 0x27b70a8546d22ffcull, 0x2e1b21385c26c926ull, 0x4d2c6dfc5ac42aedull, 0x53380d139d95b3dfull, - 0x650a73548baf63deull, 0x766a0abb3c77b2a8ull, 0x81c2c92e47edaee6ull, 0x92722c851482353bull, - 0xa2bfe8a14cf10364ull, 0xa81a664bbc423001ull, 0xc24b8b70d0f89791ull, 0xc76c51a30654be30ull, - 0xd192e819d6ef5218ull, 0xd69906245565a910ull, 0xf40e35855771202aull, 0x106aa07032bbd1b8ull, - 0x19a4c116b8d2d0c8ull, 0x1e376c085141ab53ull, 0x2748774cdf8eeb99ull, 0x34b0bcb5e19b48a8ull, - 0x391c0cb3c5c95a63ull, 0x4ed8aa4ae3418acbull, 0x5b9cca4f7763e373ull, 0x682e6ff3d6b2b8a3ull, - 0x748f82ee5defb2fcull, 0x78a5636f43172f60ull, 0x84c87814a1f0ab72ull, 0x8cc702081a6439ecull, - 0x90befffa23631e28ull, 0xa4506cebde82bde9ull, 0xbef9a3f7b2c67915ull, 0xc67178f2e372532bull, - 0xca273eceea26619cull, 0xd186b8c721c0c207ull, 0xeada7dd6cde0eb1eull, 0xf57d4f7fee6ed178ull, - 0x06f067aa72176fbaull, 0x0a637dc5a2c898a6ull, 0x113f9804bef90daeull, 0x1b710b35131c471bull, - 0x28db77f523047d84ull, 0x32caab7b40c72493ull, 0x3c9ebe0a15c9bebcull, 0x431d67c49c100d4cull, - 0x4cc5d4becb3e42b6ull, 0x597f299cfc657e2aull, 0x5fcb6fab3ad6faecull, 0x6c44198c4a475817ull -}; - -#define 
Ch(x,y,z) (z ^ (x & (y ^ z))) -#define Maj(x,y,z) (((x | y) & z) | (x & y)) -#define S0(x) (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39)) -#define S1(x) (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41)) -#define G0(x) (ROTR64(x, 1) ^ ROTR64(x, 8) ^ (x >> 7)) -#define G1(x) (ROTR64(x, 19) ^ ROTR64(x, 61) ^ (x >> 6)) -#define W0(in,i) (U8TO64_BE(&in[i * 8])) -#define W1(i) (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16]) -#define STEP(i) \ - t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \ - t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha512_constants[i] + w[i]; \ - r[7] = r[6]; \ - r[6] = r[5]; \ - r[5] = r[4]; \ - r[4] = r[3] + t0; \ - r[3] = r[2]; \ - r[2] = r[1]; \ - r[1] = r[0]; \ - r[0] = t0 + t1; - -static void -sha512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) { - uint64_t r[8], w[80], t0, t1; - size_t i; - - for (i = 0; i < 8; i++) r[i] = S->H[i]; - - while (blocks--) { - for (i = 0; i < 16; i++) { w[i] = W0(in, i); } - for (i = 16; i < 80; i++) { w[i] = W1(i); } - for (i = 0; i < 80; i++) { STEP(i); } - for (i = 0; i < 8; i++) { r[i] += S->H[i]; S->H[i] = r[i]; } - S->T[0] += SCRYPT_HASH_BLOCK_SIZE * 8; - S->T[1] += (!S->T[0]) ? 1 : 0; - in += SCRYPT_HASH_BLOCK_SIZE; - } -} - -static void -scrypt_hash_init(scrypt_hash_state *S) { - S->H[0] = 0x6a09e667f3bcc908ull; - S->H[1] = 0xbb67ae8584caa73bull; - S->H[2] = 0x3c6ef372fe94f82bull; - S->H[3] = 0xa54ff53a5f1d36f1ull; - S->H[4] = 0x510e527fade682d1ull; - S->H[5] = 0x9b05688c2b3e6c1full; - S->H[6] = 0x1f83d9abfb41bd6bull; - S->H[7] = 0x5be0cd19137e2179ull; - S->T[0] = 0; - S->T[1] = 0; - S->leftover = 0; -} - -static void -scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { - size_t blocks, want; - - /* handle the previous data */ - if (S->leftover) { - want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); - want = (want < inlen) ? 
want : inlen; - memcpy(S->buffer + S->leftover, in, want); - S->leftover += (uint32_t)want; - if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) - return; - in += want; - inlen -= want; - sha512_blocks(S, S->buffer, 1); - } - - /* handle the current data */ - blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); - S->leftover = (uint32_t)(inlen - blocks); - if (blocks) { - sha512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); - in += blocks; - } - - /* handle leftover data */ - if (S->leftover) - memcpy(S->buffer, in, S->leftover); -} - -static void -scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { - uint64_t t0 = S->T[0] + (S->leftover * 8), t1 = S->T[1]; - - S->buffer[S->leftover] = 0x80; - if (S->leftover <= 111) { - memset(S->buffer + S->leftover + 1, 0, 111 - S->leftover); - } else { - memset(S->buffer + S->leftover + 1, 0, 127 - S->leftover); - sha512_blocks(S, S->buffer, 1); - memset(S->buffer, 0, 112); - } - - U64TO8_BE(S->buffer + 112, t1); - U64TO8_BE(S->buffer + 120, t0); - sha512_blocks(S, S->buffer, 1); - - U64TO8_BE(&hash[ 0], S->H[0]); - U64TO8_BE(&hash[ 8], S->H[1]); - U64TO8_BE(&hash[16], S->H[2]); - U64TO8_BE(&hash[24], S->H[3]); - U64TO8_BE(&hash[32], S->H[4]); - U64TO8_BE(&hash[40], S->H[5]); - U64TO8_BE(&hash[48], S->H[6]); - U64TO8_BE(&hash[56], S->H[7]); -} - -static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { - 0xba,0xc3,0x80,0x2b,0x24,0x56,0x95,0x1f,0x19,0x7c,0xa2,0xd3,0x72,0x7c,0x9a,0x4d, - 0x1d,0x50,0x3a,0xa9,0x12,0x27,0xd8,0xe1,0xbe,0x76,0x53,0x87,0x5a,0x1e,0x82,0xec, - 0xc8,0xe1,0x6b,0x87,0xd0,0xb5,0x25,0x7e,0xe8,0x1e,0xd7,0x58,0xc6,0x2d,0xc2,0x9c, - 0x06,0x31,0x8f,0x5b,0x57,0x8e,0x76,0xba,0xd5,0xf6,0xec,0xfe,0x85,0x1f,0x34,0x0c, -}; diff --git a/algo/scryptjane/scrypt-jane-hash_skein512.h b/algo/scryptjane/scrypt-jane-hash_skein512.h deleted file mode 100644 index a95d46b..0000000 --- a/algo/scryptjane/scrypt-jane-hash_skein512.h +++ /dev/null @@ -1,188 +0,0 @@ -#define SCRYPT_HASH "Skein-512" -#define 
SCRYPT_HASH_BLOCK_SIZE 64 -#define SCRYPT_HASH_DIGEST_SIZE 64 - -typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; - -typedef struct scrypt_hash_state_t { - uint64_t X[8], T[2]; - uint32_t leftover; - uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; -} scrypt_hash_state; - -#include - -static void -skein512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks, size_t add) { - uint64_t X[8], key[8], Xt[9+18], T[3+1]; - size_t r; - - while (blocks--) { - T[0] = S->T[0] + add; - T[1] = S->T[1]; - T[2] = T[0] ^ T[1]; - key[0] = U8TO64_LE(in + 0); Xt[0] = S->X[0]; X[0] = key[0] + Xt[0]; - key[1] = U8TO64_LE(in + 8); Xt[1] = S->X[1]; X[1] = key[1] + Xt[1]; - key[2] = U8TO64_LE(in + 16); Xt[2] = S->X[2]; X[2] = key[2] + Xt[2]; - key[3] = U8TO64_LE(in + 24); Xt[3] = S->X[3]; X[3] = key[3] + Xt[3]; - key[4] = U8TO64_LE(in + 32); Xt[4] = S->X[4]; X[4] = key[4] + Xt[4]; - key[5] = U8TO64_LE(in + 40); Xt[5] = S->X[5]; X[5] = key[5] + Xt[5] + T[0]; - key[6] = U8TO64_LE(in + 48); Xt[6] = S->X[6]; X[6] = key[6] + Xt[6] + T[1]; - key[7] = U8TO64_LE(in + 56); Xt[7] = S->X[7]; X[7] = key[7] + Xt[7]; - Xt[8] = 0x1BD11BDAA9FC1A22ull ^ Xt[0] ^ Xt[1] ^ Xt[2] ^ Xt[3] ^ Xt[4] ^ Xt[5] ^ Xt[6] ^ Xt[7]; - in += SCRYPT_HASH_BLOCK_SIZE; - - for (r = 0; r < 18; r++) - Xt[r + 9] = Xt[r + 0]; - - for (r = 0; r < 18; r += 2) { - X[0] += X[1]; X[1] = ROTL64(X[1], 46) ^ X[0]; - X[2] += X[3]; X[3] = ROTL64(X[3], 36) ^ X[2]; - X[4] += X[5]; X[5] = ROTL64(X[5], 19) ^ X[4]; - X[6] += X[7]; X[7] = ROTL64(X[7], 37) ^ X[6]; - X[2] += X[1]; X[1] = ROTL64(X[1], 33) ^ X[2]; - X[0] += X[3]; X[3] = ROTL64(X[3], 42) ^ X[0]; - X[6] += X[5]; X[5] = ROTL64(X[5], 14) ^ X[6]; - X[4] += X[7]; X[7] = ROTL64(X[7], 27) ^ X[4]; - X[4] += X[1]; X[1] = ROTL64(X[1], 17) ^ X[4]; - X[6] += X[3]; X[3] = ROTL64(X[3], 49) ^ X[6]; - X[0] += X[5]; X[5] = ROTL64(X[5], 36) ^ X[0]; - X[2] += X[7]; X[7] = ROTL64(X[7], 39) ^ X[2]; - X[6] += X[1]; X[1] = ROTL64(X[1], 44) ^ X[6]; - X[4] += X[3]; X[3] = ROTL64(X[3], 56) ^ X[4]; 
- X[2] += X[5]; X[5] = ROTL64(X[5], 54) ^ X[2]; - X[0] += X[7]; X[7] = ROTL64(X[7], 9) ^ X[0]; - - X[0] += Xt[r + 1]; - X[1] += Xt[r + 2]; - X[2] += Xt[r + 3]; - X[3] += Xt[r + 4]; - X[4] += Xt[r + 5]; - X[5] += Xt[r + 6] + T[1]; - X[6] += Xt[r + 7] + T[2]; - X[7] += Xt[r + 8] + r + 1; - - T[3] = T[0]; - T[0] = T[1]; - T[1] = T[2]; - T[2] = T[3]; - - X[0] += X[1]; X[1] = ROTL64(X[1], 39) ^ X[0]; - X[2] += X[3]; X[3] = ROTL64(X[3], 30) ^ X[2]; - X[4] += X[5]; X[5] = ROTL64(X[5], 34) ^ X[4]; - X[6] += X[7]; X[7] = ROTL64(X[7], 24) ^ X[6]; - X[2] += X[1]; X[1] = ROTL64(X[1], 13) ^ X[2]; - X[0] += X[3]; X[3] = ROTL64(X[3], 17) ^ X[0]; - X[6] += X[5]; X[5] = ROTL64(X[5], 10) ^ X[6]; - X[4] += X[7]; X[7] = ROTL64(X[7], 50) ^ X[4]; - X[4] += X[1]; X[1] = ROTL64(X[1], 25) ^ X[4]; - X[6] += X[3]; X[3] = ROTL64(X[3], 29) ^ X[6]; - X[0] += X[5]; X[5] = ROTL64(X[5], 39) ^ X[0]; - X[2] += X[7]; X[7] = ROTL64(X[7], 43) ^ X[2]; - X[6] += X[1]; X[1] = ROTL64(X[1], 8) ^ X[6]; - X[4] += X[3]; X[3] = ROTL64(X[3], 22) ^ X[4]; - X[2] += X[5]; X[5] = ROTL64(X[5], 56) ^ X[2]; - X[0] += X[7]; X[7] = ROTL64(X[7], 35) ^ X[0]; - - X[0] += Xt[r + 2]; - X[1] += Xt[r + 3]; - X[2] += Xt[r + 4]; - X[3] += Xt[r + 5]; - X[4] += Xt[r + 6]; - X[5] += Xt[r + 7] + T[1]; - X[6] += Xt[r + 8] + T[2]; - X[7] += Xt[r + 9] + r + 2; - - T[3] = T[0]; - T[0] = T[1]; - T[1] = T[2]; - T[2] = T[3]; - } - - S->X[0] = key[0] ^ X[0]; - S->X[1] = key[1] ^ X[1]; - S->X[2] = key[2] ^ X[2]; - S->X[3] = key[3] ^ X[3]; - S->X[4] = key[4] ^ X[4]; - S->X[5] = key[5] ^ X[5]; - S->X[6] = key[6] ^ X[6]; - S->X[7] = key[7] ^ X[7]; - - S->T[0] = T[0]; - S->T[1] = T[1] & ~0x4000000000000000ull; - } -} - -static void -scrypt_hash_init(scrypt_hash_state *S) { - S->X[0] = 0x4903ADFF749C51CEull; - S->X[1] = 0x0D95DE399746DF03ull; - S->X[2] = 0x8FD1934127C79BCEull; - S->X[3] = 0x9A255629FF352CB1ull; - S->X[4] = 0x5DB62599DF6CA7B0ull; - S->X[5] = 0xEABE394CA9D5C3F4ull; - S->X[6] = 0x991112C71A75B523ull; - S->X[7] = 
0xAE18A40B660FCC33ull; - S->T[0] = 0x0000000000000000ull; - S->T[1] = 0x7000000000000000ull; - S->leftover = 0; -} - -static void -scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { - size_t blocks, want; - - /* skein processes the final <=64 bytes raw, so we can only update if there are at least 64+1 bytes available */ - if ((S->leftover + inlen) > SCRYPT_HASH_BLOCK_SIZE) { - /* handle the previous data, we know there is enough for at least one block */ - if (S->leftover) { - want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); - memcpy(S->buffer + S->leftover, in, want); - in += want; - inlen -= want; - S->leftover = 0; - skein512_blocks(S, S->buffer, 1, SCRYPT_HASH_BLOCK_SIZE); - } - - /* handle the current data if there's more than one block */ - if (inlen > SCRYPT_HASH_BLOCK_SIZE) { - blocks = ((inlen - 1) & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); - skein512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE, SCRYPT_HASH_BLOCK_SIZE); - inlen -= blocks; - in += blocks; - } - } - - /* handle leftover data */ - memcpy(S->buffer + S->leftover, in, inlen); - S->leftover += inlen; -} - -static void -scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { - memset(S->buffer + S->leftover, 0, SCRYPT_HASH_BLOCK_SIZE - S->leftover); - S->T[1] |= 0x8000000000000000ull; - skein512_blocks(S, S->buffer, 1, S->leftover); - - memset(S->buffer, 0, SCRYPT_HASH_BLOCK_SIZE); - S->T[0] = 0; - S->T[1] = 0xff00000000000000ull; - skein512_blocks(S, S->buffer, 1, 8); - - U64TO8_LE(&hash[ 0], S->X[0]); - U64TO8_LE(&hash[ 8], S->X[1]); - U64TO8_LE(&hash[16], S->X[2]); - U64TO8_LE(&hash[24], S->X[3]); - U64TO8_LE(&hash[32], S->X[4]); - U64TO8_LE(&hash[40], S->X[5]); - U64TO8_LE(&hash[48], S->X[6]); - U64TO8_LE(&hash[56], S->X[7]); -} - - -static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { - 0x4d,0x52,0x29,0xff,0x10,0xbc,0xd2,0x62,0xd1,0x61,0x83,0xc8,0xe6,0xf0,0x83,0xc4, - 0x9f,0xf5,0x6a,0x42,0x75,0x2a,0x26,0x4e,0xf0,0x28,0x72,0x28,0x47,0xe8,0x23,0xdf, - 
0x1e,0x64,0xf1,0x51,0x38,0x35,0x9d,0xc2,0x83,0xfc,0x35,0x4e,0xc0,0x52,0x5f,0x41, - 0x6a,0x0b,0x7d,0xf5,0xce,0x98,0xde,0x6f,0x36,0xd8,0x51,0x15,0x78,0x78,0x93,0x67, -}; diff --git a/algo/scryptjane/scrypt-jane-mix_chacha-avx.h b/algo/scryptjane/scrypt-jane-mix_chacha-avx.h deleted file mode 100644 index 17559d8..0000000 --- a/algo/scryptjane/scrypt-jane-mix_chacha-avx.h +++ /dev/null @@ -1,564 +0,0 @@ -/* x86 */ -#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) - -#define SCRYPT_CHACHA_AVX - -asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_avx) - a1(push ebx) - a1(push edi) - a1(push esi) - a1(push ebp) - a2(mov ebp,esp) - a2(mov edi,[ebp+20]) - a2(mov esi,[ebp+24]) - a2(mov eax,[ebp+28]) - a2(mov ebx,[ebp+32]) - a2(sub esp,64) - a2(and esp,~63) - a2(lea edx,[ebx*2]) - a2(shl edx,6) - a2(lea ecx,[edx-64]) - a2(and eax, eax) - a2(mov ebx, 0x01000302) - a2(vmovd xmm4, ebx) - a2(mov ebx, 0x05040706) - a2(vmovd xmm0, ebx) - a2(mov ebx, 0x09080b0a) - a2(vmovd xmm1, ebx) - a2(mov ebx, 0x0d0c0f0e) - a2(vmovd xmm2, ebx) - a2(mov ebx, 0x02010003) - a2(vmovd xmm5, ebx) - a2(mov ebx, 0x06050407) - a2(vmovd xmm3, ebx) - a2(mov ebx, 0x0a09080b) - a2(vmovd xmm6, ebx) - a2(mov ebx, 0x0e0d0c0f) - a2(vmovd xmm7, ebx) - a3(vpunpckldq xmm4, xmm4, xmm0) - a3(vpunpckldq xmm5, xmm5, xmm3) - a3(vpunpckldq xmm1, xmm1, xmm2) - a3(vpunpckldq xmm6, xmm6, xmm7) - a3(vpunpcklqdq xmm4, xmm4, xmm1) - a3(vpunpcklqdq xmm5, xmm5, xmm6) - a2(vmovdqa xmm0,[ecx+esi+0]) - a2(vmovdqa xmm1,[ecx+esi+16]) - a2(vmovdqa xmm2,[ecx+esi+32]) - a2(vmovdqa xmm3,[ecx+esi+48]) - a1(jz scrypt_ChunkMix_avx_no_xor1) - a3(vpxor xmm0,xmm0,[ecx+eax+0]) - a3(vpxor xmm1,xmm1,[ecx+eax+16]) - a3(vpxor xmm2,xmm2,[ecx+eax+32]) - a3(vpxor xmm3,xmm3,[ecx+eax+48]) - a1(scrypt_ChunkMix_avx_no_xor1:) - a2(xor ecx,ecx) - a2(xor ebx,ebx) - 
a1(scrypt_ChunkMix_avx_loop:) - a2(and eax, eax) - a3(vpxor xmm0,xmm0,[esi+ecx+0]) - a3(vpxor xmm1,xmm1,[esi+ecx+16]) - a3(vpxor xmm2,xmm2,[esi+ecx+32]) - a3(vpxor xmm3,xmm3,[esi+ecx+48]) - a1(jz scrypt_ChunkMix_avx_no_xor2) - a3(vpxor xmm0,xmm0,[eax+ecx+0]) - a3(vpxor xmm1,xmm1,[eax+ecx+16]) - a3(vpxor xmm2,xmm2,[eax+ecx+32]) - a3(vpxor xmm3,xmm3,[eax+ecx+48]) - a1(scrypt_ChunkMix_avx_no_xor2:) - a2(vmovdqa [esp+0],xmm0) - a2(vmovdqa [esp+16],xmm1) - a2(vmovdqa [esp+32],xmm2) - a2(vmovdqa [esp+48],xmm3) - a2(mov eax,8) - a1(scrypt_chacha_avx_loop: ) - a3(vpaddd xmm0,xmm0,xmm1) - a3(vpxor xmm3,xmm3,xmm0) - a3(vpshufb xmm3,xmm3,xmm4) - a3(vpaddd xmm2,xmm2,xmm3) - a3(vpxor xmm1,xmm1,xmm2) - a3(vpsrld xmm6,xmm1,20) - a3(vpslld xmm1,xmm1,12) - a3(vpxor xmm1,xmm1,xmm6) - a3(vpaddd xmm0,xmm0,xmm1) - a3(vpxor xmm3,xmm3,xmm0) - a3(vpshufb xmm3,xmm3,xmm5) - a3(vpshufd xmm0,xmm0,0x93) - a3(vpaddd xmm2,xmm2,xmm3) - a3(vpshufd xmm3,xmm3,0x4e) - a3(vpxor xmm1,xmm1,xmm2) - a3(vpshufd xmm2,xmm2,0x39) - a3(vpsrld xmm6,xmm1,25) - a3(vpslld xmm1,xmm1,7) - a3(vpxor xmm1,xmm1,xmm6) - a2(sub eax,2) - a3(vpaddd xmm0,xmm0,xmm1) - a3(vpxor xmm3,xmm3,xmm0) - a3(vpshufb xmm3,xmm3,xmm4) - a3(vpaddd xmm2,xmm2,xmm3) - a3(vpxor xmm1,xmm1,xmm2) - a3(vpsrld xmm6,xmm1,20) - a3(vpslld xmm1,xmm1,12) - a3(vpxor xmm1,xmm1,xmm6) - a3(vpaddd xmm0,xmm0,xmm1) - a3(vpxor xmm3,xmm3,xmm0) - a3(vpshufb xmm3,xmm3,xmm5) - a3(vpshufd xmm0,xmm0,0x39) - a3(vpaddd xmm2,xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) - a3(vpxor xmm1,xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x93) - a3(vpsrld xmm6,xmm1,25) - a3(vpslld xmm1,xmm1,7) - a3(vpxor xmm1,xmm1,xmm6) - a1(ja scrypt_chacha_avx_loop) - a3(vpaddd xmm0,xmm0,[esp+0]) - a3(vpaddd xmm1,xmm1,[esp+16]) - a3(vpaddd xmm2,xmm2,[esp+32]) - a3(vpaddd xmm3,xmm3,[esp+48]) - a2(lea eax,[ebx+ecx]) - a2(xor ebx,edx) - a2(and eax,~0x7f) - a2(add ecx,64) - a2(shr eax,1) - a2(add eax, edi) - a2(cmp ecx,edx) - a2(vmovdqa [eax+0],xmm0) - a2(vmovdqa [eax+16],xmm1) - a2(vmovdqa [eax+32],xmm2) - 
a2(vmovdqa [eax+48],xmm3) - a2(mov eax,[ebp+28]) - a1(jne scrypt_ChunkMix_avx_loop) - a2(mov esp,ebp) - a1(pop ebp) - a1(pop esi) - a1(pop edi) - a1(pop ebx) - aret(16) -asm_naked_fn_end(scrypt_ChunkMix_avx) - -#endif - - - -/* x64 */ -#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) - -#define SCRYPT_CHACHA_AVX - -asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_avx) - a2(lea rcx,[rcx*2]) - a2(shl rcx,6) - a2(lea r9,[rcx-64]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(vmovdqa xmm0,[rax+0]) - a2(vmovdqa xmm1,[rax+16]) - a2(vmovdqa xmm2,[rax+32]) - a2(vmovdqa xmm3,[rax+48]) - a2(mov r8, 0x0504070601000302) - a2(mov rax, 0x0d0c0f0e09080b0a) - a2(movq xmm4, r8) - a2(movq xmm6, rax) - a2(mov r8, 0x0605040702010003) - a2(mov rax, 0x0e0d0c0f0a09080b) - a2(movq xmm5, r8) - a2(movq xmm7, rax) - a3(vpunpcklqdq xmm4, xmm4, xmm6) - a3(vpunpcklqdq xmm5, xmm5, xmm7) - a1(jz scrypt_ChunkMix_avx_no_xor1) - a3(vpxor xmm0,xmm0,[r9+0]) - a3(vpxor xmm1,xmm1,[r9+16]) - a3(vpxor xmm2,xmm2,[r9+32]) - a3(vpxor xmm3,xmm3,[r9+48]) - a1(scrypt_ChunkMix_avx_no_xor1:) - a2(xor r8,r8) - a2(xor r9,r9) - a1(scrypt_ChunkMix_avx_loop:) - a2(and rdx, rdx) - a3(vpxor xmm0,xmm0,[rsi+r9+0]) - a3(vpxor xmm1,xmm1,[rsi+r9+16]) - a3(vpxor xmm2,xmm2,[rsi+r9+32]) - a3(vpxor xmm3,xmm3,[rsi+r9+48]) - a1(jz scrypt_ChunkMix_avx_no_xor2) - a3(vpxor xmm0,xmm0,[rdx+r9+0]) - a3(vpxor xmm1,xmm1,[rdx+r9+16]) - a3(vpxor xmm2,xmm2,[rdx+r9+32]) - a3(vpxor xmm3,xmm3,[rdx+r9+48]) - a1(scrypt_ChunkMix_avx_no_xor2:) - a2(vmovdqa xmm8,xmm0) - a2(vmovdqa xmm9,xmm1) - a2(vmovdqa xmm10,xmm2) - a2(vmovdqa xmm11,xmm3) - a2(mov rax,8) - a1(scrypt_chacha_avx_loop: ) - a3(vpaddd xmm0,xmm0,xmm1) - a3(vpxor xmm3,xmm3,xmm0) - a3(vpshufb xmm3,xmm3,xmm4) - a3(vpaddd xmm2,xmm2,xmm3) - a3(vpxor xmm1,xmm1,xmm2) - a3(vpsrld 
xmm12,xmm1,20) - a3(vpslld xmm1,xmm1,12) - a3(vpxor xmm1,xmm1,xmm12) - a3(vpaddd xmm0,xmm0,xmm1) - a3(vpxor xmm3,xmm3,xmm0) - a3(vpshufb xmm3,xmm3,xmm5) - a3(vpshufd xmm0,xmm0,0x93) - a3(vpaddd xmm2,xmm2,xmm3) - a3(vpshufd xmm3,xmm3,0x4e) - a3(vpxor xmm1,xmm1,xmm2) - a3(vpshufd xmm2,xmm2,0x39) - a3(vpsrld xmm12,xmm1,25) - a3(vpslld xmm1,xmm1,7) - a3(vpxor xmm1,xmm1,xmm12) - a2(sub rax,2) - a3(vpaddd xmm0,xmm0,xmm1) - a3(vpxor xmm3,xmm3,xmm0) - a3(vpshufb xmm3,xmm3,xmm4) - a3(vpaddd xmm2,xmm2,xmm3) - a3(vpxor xmm1,xmm1,xmm2) - a3(vpsrld xmm12,xmm1,20) - a3(vpslld xmm1,xmm1,12) - a3(vpxor xmm1,xmm1,xmm12) - a3(vpaddd xmm0,xmm0,xmm1) - a3(vpxor xmm3,xmm3,xmm0) - a3(vpshufb xmm3,xmm3,xmm5) - a3(vpshufd xmm0,xmm0,0x39) - a3(vpaddd xmm2,xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) - a3(vpxor xmm1,xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x93) - a3(vpsrld xmm12,xmm1,25) - a3(vpslld xmm1,xmm1,7) - a3(vpxor xmm1,xmm1,xmm12) - a1(ja scrypt_chacha_avx_loop) - a3(vpaddd xmm0,xmm0,xmm8) - a3(vpaddd xmm1,xmm1,xmm9) - a3(vpaddd xmm2,xmm2,xmm10) - a3(vpaddd xmm3,xmm3,xmm11) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0x7f) - a2(add r9,64) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(vmovdqa [rax+0],xmm0) - a2(vmovdqa [rax+16],xmm1) - a2(vmovdqa [rax+32],xmm2) - a2(vmovdqa [rax+48],xmm3) - a1(jne scrypt_ChunkMix_avx_loop) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_avx) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) - -#define SCRYPT_CHACHA_AVX - -static void NOINLINE -scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; - const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 
= xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = _mm_srli_epi32(x1, 20); - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, x6); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x93); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x39); - x6 = _mm_srli_epi32(x1, 25); - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, x6); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = _mm_srli_epi32(x1, 20); - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, x6); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x39); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x93); - x6 = _mm_srli_epi32(x1, 25); - x1 = _mm_slli_epi32(x1, 
7); - x1 = _mm_or_si128(x1, x6); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -/* - * Special version with r = 1 and no XORing - * - mikaelh - */ -static void NOINLINE -scrypt_ChunkMix_avx_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) { - const uint32_t r = 1; - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; - const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = _mm_srli_epi32(x1, 20); - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, x6); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x93); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x39); - x6 = _mm_srli_epi32(x1, 25); - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, x6); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = 
_mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = _mm_srli_epi32(x1, 20); - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, x6); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x39); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x93); - x6 = _mm_srli_epi32(x1, 25); - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, x6); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -/* - * Special version with r = 1 and unconditional XORing - * - mikaelh - */ -static void NOINLINE -scrypt_ChunkMix_avx_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) { - const uint32_t r = 1; - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; - const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = 
_mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = _mm_srli_epi32(x1, 20); - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, x6); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x93); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x39); - x6 = _mm_srli_epi32(x1, 25); - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, x6); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = _mm_srli_epi32(x1, 20); - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, x6); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x39); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x93); - x6 = _mm_srli_epi32(x1, 25); - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, x6); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -#endif - -#if defined(SCRYPT_CHACHA_AVX) - #undef SCRYPT_MIX - #define SCRYPT_MIX "ChaCha/8-AVX" - #undef SCRYPT_CHACHA_INCLUDED - #define SCRYPT_CHACHA_INCLUDED -#endif diff --git a/algo/scryptjane/scrypt-jane-mix_chacha-sse2.h b/algo/scryptjane/scrypt-jane-mix_chacha-sse2.h deleted 
file mode 100644 index 8f79dec..0000000 --- a/algo/scryptjane/scrypt-jane-mix_chacha-sse2.h +++ /dev/null @@ -1,585 +0,0 @@ -/* x86 */ -#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) - -#define SCRYPT_CHACHA_SSE2 - -asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_sse2) - a1(push ebx) - a1(push edi) - a1(push esi) - a1(push ebp) - a2(mov ebp,esp) - a2(mov edi,[ebp+20]) - a2(mov esi,[ebp+24]) - a2(mov eax,[ebp+28]) - a2(mov ebx,[ebp+32]) - a2(sub esp,16) - a2(and esp,~15) - a2(lea edx,[ebx*2]) - a2(shl edx,6) - a2(lea ecx,[edx-64]) - a2(and eax, eax) - a2(movdqa xmm0,[ecx+esi+0]) - a2(movdqa xmm1,[ecx+esi+16]) - a2(movdqa xmm2,[ecx+esi+32]) - a2(movdqa xmm3,[ecx+esi+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor1) - a2(pxor xmm0,[ecx+eax+0]) - a2(pxor xmm1,[ecx+eax+16]) - a2(pxor xmm2,[ecx+eax+32]) - a2(pxor xmm3,[ecx+eax+48]) - a1(scrypt_ChunkMix_sse2_no_xor1:) - a2(xor ecx,ecx) - a2(xor ebx,ebx) - a1(scrypt_ChunkMix_sse2_loop:) - a2(and eax, eax) - a2(pxor xmm0,[esi+ecx+0]) - a2(pxor xmm1,[esi+ecx+16]) - a2(pxor xmm2,[esi+ecx+32]) - a2(pxor xmm3,[esi+ecx+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor2) - a2(pxor xmm0,[eax+ecx+0]) - a2(pxor xmm1,[eax+ecx+16]) - a2(pxor xmm2,[eax+ecx+32]) - a2(pxor xmm3,[eax+ecx+48]) - a1(scrypt_ChunkMix_sse2_no_xor2:) - a2(movdqa [esp+0],xmm0) - a2(movdqa xmm4,xmm1) - a2(movdqa xmm5,xmm2) - a2(movdqa xmm7,xmm3) - a2(mov eax,8) - a1(scrypt_chacha_sse2_loop: ) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,16) - a2(psrld xmm6,16) - a2(pxor xmm3,xmm6) - a2(paddd xmm2,xmm3) - a2(pxor xmm1,xmm2) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,12) - a2(psrld xmm6,20) - a2(pxor xmm1,xmm6) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,8) - a2(psrld xmm6,24) - a2(pxor xmm3,xmm6) - a3(pshufd 
xmm0,xmm0,0x93) - a2(paddd xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) - a2(pxor xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x39) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,7) - a2(psrld xmm6,25) - a2(pxor xmm1,xmm6) - a2(sub eax,2) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,16) - a2(psrld xmm6,16) - a2(pxor xmm3,xmm6) - a2(paddd xmm2,xmm3) - a2(pxor xmm1,xmm2) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,12) - a2(psrld xmm6,20) - a2(pxor xmm1,xmm6) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,8) - a2(psrld xmm6,24) - a2(pxor xmm3,xmm6) - a3(pshufd xmm0,xmm0,0x39) - a2(paddd xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) - a2(pxor xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x93) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,7) - a2(psrld xmm6,25) - a2(pxor xmm1,xmm6) - a1(ja scrypt_chacha_sse2_loop) - a2(paddd xmm0,[esp+0]) - a2(paddd xmm1,xmm4) - a2(paddd xmm2,xmm5) - a2(paddd xmm3,xmm7) - a2(lea eax,[ebx+ecx]) - a2(xor ebx,edx) - a2(and eax,~0x7f) - a2(add ecx,64) - a2(shr eax,1) - a2(add eax, edi) - a2(cmp ecx,edx) - a2(movdqa [eax+0],xmm0) - a2(movdqa [eax+16],xmm1) - a2(movdqa [eax+32],xmm2) - a2(movdqa [eax+48],xmm3) - a2(mov eax,[ebp+28]) - a1(jne scrypt_ChunkMix_sse2_loop) - a2(mov esp,ebp) - a1(pop ebp) - a1(pop esi) - a1(pop edi) - a1(pop ebx) - aret(16) -asm_naked_fn_end(scrypt_ChunkMix_sse2) - -#endif - - - -/* x64 */ -#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) - -#define SCRYPT_CHACHA_SSE2 - -asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_sse2) - a2(lea rcx,[rcx*2]) - a2(shl rcx,6) - a2(lea r9,[rcx-64]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(movdqa xmm0,[rax+0]) - a2(movdqa xmm1,[rax+16]) - a2(movdqa xmm2,[rax+32]) - a2(movdqa xmm3,[rax+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor1) - a2(pxor xmm0,[r9+0]) 
- a2(pxor xmm1,[r9+16]) - a2(pxor xmm2,[r9+32]) - a2(pxor xmm3,[r9+48]) - a1(scrypt_ChunkMix_sse2_no_xor1:) - a2(xor r9,r9) - a2(xor r8,r8) - a1(scrypt_ChunkMix_sse2_loop:) - a2(and rdx, rdx) - a2(pxor xmm0,[rsi+r9+0]) - a2(pxor xmm1,[rsi+r9+16]) - a2(pxor xmm2,[rsi+r9+32]) - a2(pxor xmm3,[rsi+r9+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor2) - a2(pxor xmm0,[rdx+r9+0]) - a2(pxor xmm1,[rdx+r9+16]) - a2(pxor xmm2,[rdx+r9+32]) - a2(pxor xmm3,[rdx+r9+48]) - a1(scrypt_ChunkMix_sse2_no_xor2:) - a2(movdqa xmm8,xmm0) - a2(movdqa xmm9,xmm1) - a2(movdqa xmm10,xmm2) - a2(movdqa xmm11,xmm3) - a2(mov rax,8) - a1(scrypt_chacha_sse2_loop: ) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,16) - a2(psrld xmm6,16) - a2(pxor xmm3,xmm6) - a2(paddd xmm2,xmm3) - a2(pxor xmm1,xmm2) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,12) - a2(psrld xmm6,20) - a2(pxor xmm1,xmm6) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,8) - a2(psrld xmm6,24) - a2(pxor xmm3,xmm6) - a3(pshufd xmm0,xmm0,0x93) - a2(paddd xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) - a2(pxor xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x39) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,7) - a2(psrld xmm6,25) - a2(pxor xmm1,xmm6) - a2(sub rax,2) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,16) - a2(psrld xmm6,16) - a2(pxor xmm3,xmm6) - a2(paddd xmm2,xmm3) - a2(pxor xmm1,xmm2) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,12) - a2(psrld xmm6,20) - a2(pxor xmm1,xmm6) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,8) - a2(psrld xmm6,24) - a2(pxor xmm3,xmm6) - a3(pshufd xmm0,xmm0,0x39) - a2(paddd xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) - a2(pxor xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x93) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,7) - a2(psrld xmm6,25) - a2(pxor xmm1,xmm6) - a1(ja scrypt_chacha_sse2_loop) - a2(paddd xmm0,xmm8) - a2(paddd xmm1,xmm9) - a2(paddd xmm2,xmm10) - a2(paddd xmm3,xmm11) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and 
rax,~0x7f) - a2(add r9,64) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(movdqa [rax+0],xmm0) - a2(movdqa [rax+16],xmm1) - a2(movdqa [rax+32],xmm2) - a2(movdqa [rax+48],xmm3) - a1(jne scrypt_ChunkMix_sse2_loop) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_sse2) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) - -#define SCRYPT_CHACHA_SSE2 - -static void NOINLINE -scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 16); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16)); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x4 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = 
_mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 8); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24)); - x0 = _mm_shuffle_epi32(x0, 0x93); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x39); - x4 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 16); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16)); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x4 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 8); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24)); - x0 = _mm_shuffle_epi32(x0, 0x39); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x93); - x4 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25)); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -/* - * Special version with r = 1 and no XORing - * - mikaelh - */ -static void NOINLINE -scrypt_ChunkMix_sse2_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) { - const uint32_t r = 1; - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = 
(xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 16); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16)); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x4 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 8); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24)); - x0 = _mm_shuffle_epi32(x0, 0x93); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x39); - x4 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 16); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16)); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x4 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 8); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24)); - x0 = _mm_shuffle_epi32(x0, 0x39); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x93); - x4 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25)); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] 
= x3; - } -} - -/* - * Special version with r = 1 and unconditional XORing - * - mikaelh - */ -static void NOINLINE -scrypt_ChunkMix_sse2_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) { - const uint32_t r = 1; - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 16); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16)); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x4 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 8); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24)); - x0 = _mm_shuffle_epi32(x0, 0x93); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x39); - x4 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25)); - x0 = 
_mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 16); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16)); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x4 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 8); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24)); - x0 = _mm_shuffle_epi32(x0, 0x39); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x93); - x4 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25)); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -#endif - -#if defined(SCRYPT_CHACHA_SSE2) - #undef SCRYPT_MIX - #define SCRYPT_MIX "ChaCha/8-SSE2" - #undef SCRYPT_CHACHA_INCLUDED - #define SCRYPT_CHACHA_INCLUDED -#endif diff --git a/algo/scryptjane/scrypt-jane-mix_chacha-ssse3.h b/algo/scryptjane/scrypt-jane-mix_chacha-ssse3.h deleted file mode 100644 index 6a80cac..0000000 --- a/algo/scryptjane/scrypt-jane-mix_chacha-ssse3.h +++ /dev/null @@ -1,572 +0,0 @@ -/* x86 */ -#if defined(X86ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) - -#define SCRYPT_CHACHA_SSSE3 - -asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_ssse3) - a1(push ebx) - a1(push edi) - a1(push esi) - a1(push ebp) - a2(mov ebp,esp) - a2(mov edi,[ebp+20]) - a2(mov esi,[ebp+24]) - a2(mov eax,[ebp+28]) - a2(mov ebx,[ebp+32]) - a2(sub esp,64) - a2(and esp,~63) 
- a2(lea edx,[ebx*2]) - a2(shl edx,6) - a2(lea ecx,[edx-64]) - a2(and eax, eax) - a2(mov ebx, 0x01000302) - a2(movd xmm4, ebx) - a2(mov ebx, 0x05040706) - a2(movd xmm0, ebx) - a2(mov ebx, 0x09080b0a) - a2(movd xmm1, ebx) - a2(mov ebx, 0x0d0c0f0e) - a2(movd xmm2, ebx) - a2(mov ebx, 0x02010003) - a2(movd xmm5, ebx) - a2(mov ebx, 0x06050407) - a2(movd xmm3, ebx) - a2(mov ebx, 0x0a09080b) - a2(movd xmm6, ebx) - a2(mov ebx, 0x0e0d0c0f) - a2(movd xmm7, ebx) - a2(punpckldq xmm4, xmm0) - a2(punpckldq xmm5, xmm3) - a2(punpckldq xmm1, xmm2) - a2(punpckldq xmm6, xmm7) - a2(punpcklqdq xmm4, xmm1) - a2(punpcklqdq xmm5, xmm6) - a2(movdqa xmm0,[ecx+esi+0]) - a2(movdqa xmm1,[ecx+esi+16]) - a2(movdqa xmm2,[ecx+esi+32]) - a2(movdqa xmm3,[ecx+esi+48]) - a1(jz scrypt_ChunkMix_ssse3_no_xor1) - a2(pxor xmm0,[ecx+eax+0]) - a2(pxor xmm1,[ecx+eax+16]) - a2(pxor xmm2,[ecx+eax+32]) - a2(pxor xmm3,[ecx+eax+48]) - a1(scrypt_ChunkMix_ssse3_no_xor1:) - a2(xor ecx,ecx) - a2(xor ebx,ebx) - a1(scrypt_ChunkMix_ssse3_loop:) - a2(and eax, eax) - a2(pxor xmm0,[esi+ecx+0]) - a2(pxor xmm1,[esi+ecx+16]) - a2(pxor xmm2,[esi+ecx+32]) - a2(pxor xmm3,[esi+ecx+48]) - a1(jz scrypt_ChunkMix_ssse3_no_xor2) - a2(pxor xmm0,[eax+ecx+0]) - a2(pxor xmm1,[eax+ecx+16]) - a2(pxor xmm2,[eax+ecx+32]) - a2(pxor xmm3,[eax+ecx+48]) - a1(scrypt_ChunkMix_ssse3_no_xor2:) - a2(movdqa [esp+0],xmm0) - a2(movdqa [esp+16],xmm1) - a2(movdqa [esp+32],xmm2) - a2(movdqa xmm7,xmm3) - a2(mov eax,8) - a1(scrypt_chacha_ssse3_loop: ) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(pshufb xmm3,xmm4) - a2(paddd xmm2,xmm3) - a2(pxor xmm1,xmm2) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,12) - a2(psrld xmm6,20) - a2(pxor xmm1,xmm6) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(pshufb xmm3,xmm5) - a3(pshufd xmm0,xmm0,0x93) - a2(paddd xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) - a2(pxor xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x39) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,7) - a2(psrld xmm6,25) - a2(pxor xmm1,xmm6) - a2(sub eax,2) - a2(paddd xmm0,xmm1) - a2(pxor 
xmm3,xmm0) - a2(pshufb xmm3,xmm4) - a2(paddd xmm2,xmm3) - a2(pxor xmm1,xmm2) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,12) - a2(psrld xmm6,20) - a2(pxor xmm1,xmm6) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(pshufb xmm3,xmm5) - a3(pshufd xmm0,xmm0,0x39) - a2(paddd xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) - a2(pxor xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x93) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,7) - a2(psrld xmm6,25) - a2(pxor xmm1,xmm6) - a1(ja scrypt_chacha_ssse3_loop) - a2(paddd xmm0,[esp+0]) - a2(paddd xmm1,[esp+16]) - a2(paddd xmm2,[esp+32]) - a2(paddd xmm3,xmm7) - a2(lea eax,[ebx+ecx]) - a2(xor ebx,edx) - a2(and eax,~0x7f) - a2(add ecx,64) - a2(shr eax,1) - a2(add eax, edi) - a2(cmp ecx,edx) - a2(movdqa [eax+0],xmm0) - a2(movdqa [eax+16],xmm1) - a2(movdqa [eax+32],xmm2) - a2(movdqa [eax+48],xmm3) - a2(mov eax,[ebp+28]) - a1(jne scrypt_ChunkMix_ssse3_loop) - a2(mov esp,ebp) - a1(pop ebp) - a1(pop esi) - a1(pop edi) - a1(pop ebx) - aret(16) -asm_naked_fn_end(scrypt_ChunkMix_ssse3) - -#endif - - - -/* x64 */ -#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) - -#define SCRYPT_CHACHA_SSSE3 - -asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_ssse3) - a2(lea rcx,[rcx*2]) - a2(shl rcx,6) - a2(lea r9,[rcx-64]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(movdqa xmm0,[rax+0]) - a2(movdqa xmm1,[rax+16]) - a2(movdqa xmm2,[rax+32]) - a2(movdqa xmm3,[rax+48]) - a2(mov r8, 0x0504070601000302) - a2(mov rax, 0x0d0c0f0e09080b0a) - a2(movq xmm4, r8) - a2(movq xmm6, rax) - a2(mov r8, 0x0605040702010003) - a2(mov rax, 0x0e0d0c0f0a09080b) - a2(movq xmm5, r8) - a2(movq xmm7, rax) - a2(punpcklqdq xmm4, xmm6) - a2(punpcklqdq xmm5, xmm7) - a1(jz scrypt_ChunkMix_ssse3_no_xor1) - a2(pxor xmm0,[r9+0]) - a2(pxor xmm1,[r9+16]) - a2(pxor xmm2,[r9+32]) - a2(pxor xmm3,[r9+48]) - 
a1(scrypt_ChunkMix_ssse3_no_xor1:) - a2(xor r8,r8) - a2(xor r9,r9) - a1(scrypt_ChunkMix_ssse3_loop:) - a2(and rdx, rdx) - a2(pxor xmm0,[rsi+r9+0]) - a2(pxor xmm1,[rsi+r9+16]) - a2(pxor xmm2,[rsi+r9+32]) - a2(pxor xmm3,[rsi+r9+48]) - a1(jz scrypt_ChunkMix_ssse3_no_xor2) - a2(pxor xmm0,[rdx+r9+0]) - a2(pxor xmm1,[rdx+r9+16]) - a2(pxor xmm2,[rdx+r9+32]) - a2(pxor xmm3,[rdx+r9+48]) - a1(scrypt_ChunkMix_ssse3_no_xor2:) - a2(movdqa xmm8,xmm0) - a2(movdqa xmm9,xmm1) - a2(movdqa xmm10,xmm2) - a2(movdqa xmm11,xmm3) - a2(mov rax,8) - a1(scrypt_chacha_ssse3_loop: ) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(pshufb xmm3,xmm4) - a2(paddd xmm2,xmm3) - a2(pxor xmm1,xmm2) - a2(movdqa xmm12,xmm1) - a2(pslld xmm1,12) - a2(psrld xmm12,20) - a2(pxor xmm1,xmm12) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(pshufb xmm3,xmm5) - a3(pshufd xmm0,xmm0,0x93) - a2(paddd xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) - a2(pxor xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x39) - a2(movdqa xmm12,xmm1) - a2(pslld xmm1,7) - a2(psrld xmm12,25) - a2(pxor xmm1,xmm12) - a2(sub rax,2) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(pshufb xmm3,xmm4) - a2(paddd xmm2,xmm3) - a2(pxor xmm1,xmm2) - a2(movdqa xmm12,xmm1) - a2(pslld xmm1,12) - a2(psrld xmm12,20) - a2(pxor xmm1,xmm12) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(pshufb xmm3,xmm5) - a3(pshufd xmm0,xmm0,0x39) - a2(paddd xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) - a2(pxor xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x93) - a2(movdqa xmm12,xmm1) - a2(pslld xmm1,7) - a2(psrld xmm12,25) - a2(pxor xmm1,xmm12) - a1(ja scrypt_chacha_ssse3_loop) - a2(paddd xmm0,xmm8) - a2(paddd xmm1,xmm9) - a2(paddd xmm2,xmm10) - a2(paddd xmm3,xmm11) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0x7f) - a2(add r9,64) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(movdqa [rax+0],xmm0) - a2(movdqa [rax+16],xmm1) - a2(movdqa [rax+32],xmm2) - a2(movdqa [rax+48],xmm3) - a1(jne scrypt_ChunkMix_ssse3_loop) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_ssse3) - -#endif - - -/* 
intrinsic */ -#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) - -#define SCRYPT_CHACHA_SSSE3 - -static void NOINLINE -scrypt_ChunkMix_ssse3(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; - const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x93); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x39); - x6 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = 
_mm_or_si128(x1, _mm_srli_epi32(x6, 25)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x39); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x93); - x6 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25)); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -/* - * Special version with r = 1 and no XORing - * - mikaelh - */ -static void NOINLINE -scrypt_ChunkMix_ssse3_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) { - const uint32_t r = 1; - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; - const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = 
_mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x93); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x39); - x6 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x39); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x93); - x6 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25)); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -/* - * Special version with r = 1 and unconditional XORing - * - mikaelh - */ -static void NOINLINE -scrypt_ChunkMix_ssse3_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) { - const uint32_t r = 1; - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; - const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - 
x3 = xmmp[3]; - - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x93); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x39); - x6 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x39); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x93); - x6 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25)); - } - - x0 = _mm_add_epi32(x0, 
t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -#endif - -#if defined(SCRYPT_CHACHA_SSSE3) - #undef SCRYPT_MIX - #define SCRYPT_MIX "ChaCha/8-SSSE3" - #undef SCRYPT_CHACHA_INCLUDED - #define SCRYPT_CHACHA_INCLUDED -#endif diff --git a/algo/scryptjane/scrypt-jane-mix_chacha.h b/algo/scryptjane/scrypt-jane-mix_chacha.h deleted file mode 100644 index 85ee9c1..0000000 --- a/algo/scryptjane/scrypt-jane-mix_chacha.h +++ /dev/null @@ -1,69 +0,0 @@ -#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED) - -#undef SCRYPT_MIX -#define SCRYPT_MIX "ChaCha20/8 Ref" - -#undef SCRYPT_CHACHA_INCLUDED -#define SCRYPT_CHACHA_INCLUDED -#define SCRYPT_CHACHA_BASIC - -static void -chacha_core_basic(uint32_t state[16]) { - size_t rounds = 8; - uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,t; - - x0 = state[0]; - x1 = state[1]; - x2 = state[2]; - x3 = state[3]; - x4 = state[4]; - x5 = state[5]; - x6 = state[6]; - x7 = state[7]; - x8 = state[8]; - x9 = state[9]; - x10 = state[10]; - x11 = state[11]; - x12 = state[12]; - x13 = state[13]; - x14 = state[14]; - x15 = state[15]; - - #define quarter(a,b,c,d) \ - a += b; t = d^a; d = ROTL32(t,16); \ - c += d; t = b^c; b = ROTL32(t,12); \ - a += b; t = d^a; d = ROTL32(t, 8); \ - c += d; t = b^c; b = ROTL32(t, 7); - - for (; rounds; rounds -= 2) { - quarter( x0, x4, x8,x12) - quarter( x1, x5, x9,x13) - quarter( x2, x6,x10,x14) - quarter( x3, x7,x11,x15) - quarter( x0, x5,x10,x15) - quarter( x1, x6,x11,x12) - quarter( x2, x7, x8,x13) - quarter( x3, x4, x9,x14) - } - - state[0] += x0; - state[1] += x1; - state[2] += x2; - state[3] += x3; - state[4] += x4; - state[5] += x5; - state[6] += x6; - state[7] += x7; - state[8] += x8; - state[9] += x9; - state[10] += 
x10; - state[11] += x11; - state[12] += x12; - state[13] += x13; - state[14] += x14; - state[15] += x15; - - #undef quarter -} - -#endif \ No newline at end of file diff --git a/algo/scryptjane/scrypt-jane-mix_salsa-avx.h b/algo/scryptjane/scrypt-jane-mix_salsa-avx.h deleted file mode 100644 index 1ca90b5..0000000 --- a/algo/scryptjane/scrypt-jane-mix_salsa-avx.h +++ /dev/null @@ -1,381 +0,0 @@ -/* x86 */ -#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) - -#define SCRYPT_SALSA_AVX - -asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_avx) - a1(push ebx) - a1(push edi) - a1(push esi) - a1(push ebp) - a2(mov ebp,esp) - a2(mov edi,[ebp+20]) - a2(mov esi,[ebp+24]) - a2(mov eax,[ebp+28]) - a2(mov ebx,[ebp+32]) - a2(sub esp,32) - a2(and esp,~63) - a2(lea edx,[ebx*2]) - a2(shl edx,6) - a2(lea ecx,[edx-64]) - a2(and eax, eax) - a2(movdqa xmm0,[ecx+esi+0]) - a2(movdqa xmm1,[ecx+esi+16]) - a2(movdqa xmm2,[ecx+esi+32]) - a2(movdqa xmm3,[ecx+esi+48]) - a1(jz scrypt_ChunkMix_avx_no_xor1) - a3(vpxor xmm0,xmm0,[ecx+eax+0]) - a3(vpxor xmm1,xmm1,[ecx+eax+16]) - a3(vpxor xmm2,xmm2,[ecx+eax+32]) - a3(vpxor xmm3,xmm3,[ecx+eax+48]) - a1(scrypt_ChunkMix_avx_no_xor1:) - a2(xor ecx,ecx) - a2(xor ebx,ebx) - a1(scrypt_ChunkMix_avx_loop:) - a2(and eax, eax) - a3(vpxor xmm0,xmm0,[esi+ecx+0]) - a3(vpxor xmm1,xmm1,[esi+ecx+16]) - a3(vpxor xmm2,xmm2,[esi+ecx+32]) - a3(vpxor xmm3,xmm3,[esi+ecx+48]) - a1(jz scrypt_ChunkMix_avx_no_xor2) - a3(vpxor xmm0,xmm0,[eax+ecx+0]) - a3(vpxor xmm1,xmm1,[eax+ecx+16]) - a3(vpxor xmm2,xmm2,[eax+ecx+32]) - a3(vpxor xmm3,xmm3,[eax+ecx+48]) - a1(scrypt_ChunkMix_avx_no_xor2:) - a2(vmovdqa [esp+0],xmm0) - a2(vmovdqa [esp+16],xmm1) - a2(vmovdqa xmm6,xmm2) - a2(vmovdqa xmm7,xmm3) - a2(mov eax,8) - a1(scrypt_salsa_avx_loop: ) - a3(vpaddd xmm4, xmm1, xmm0) - a3(vpsrld xmm5, xmm4, 25) - 
a3(vpslld xmm4, xmm4, 7) - a3(vpxor xmm3, xmm3, xmm5) - a3(vpxor xmm3, xmm3, xmm4) - a3(vpaddd xmm4, xmm0, xmm3) - a3(vpsrld xmm5, xmm4, 23) - a3(vpslld xmm4, xmm4, 9) - a3(vpxor xmm2, xmm2, xmm5) - a3(vpxor xmm2, xmm2, xmm4) - a3(vpaddd xmm4, xmm3, xmm2) - a3(vpsrld xmm5, xmm4, 19) - a3(vpslld xmm4, xmm4, 13) - a3(vpxor xmm1, xmm1, xmm5) - a3(pshufd xmm3, xmm3, 0x93) - a3(vpxor xmm1, xmm1, xmm4) - a3(vpaddd xmm4, xmm2, xmm1) - a3(vpsrld xmm5, xmm4, 14) - a3(vpslld xmm4, xmm4, 18) - a3(vpxor xmm0, xmm0, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) - a3(vpxor xmm0, xmm0, xmm4) - a2(sub eax, 2) - a3(vpaddd xmm4, xmm3, xmm0) - a3(pshufd xmm1, xmm1, 0x39) - a3(vpsrld xmm5, xmm4, 25) - a3(vpslld xmm4, xmm4, 7) - a3(vpxor xmm1, xmm1, xmm5) - a3(vpxor xmm1, xmm1, xmm4) - a3(vpaddd xmm4, xmm0, xmm1) - a3(vpsrld xmm5, xmm4, 23) - a3(vpslld xmm4, xmm4, 9) - a3(vpxor xmm2, xmm2, xmm5) - a3(vpxor xmm2, xmm2, xmm4) - a3(vpaddd xmm4, xmm1, xmm2) - a3(vpsrld xmm5, xmm4, 19) - a3(vpslld xmm4, xmm4, 13) - a3(vpxor xmm3, xmm3, xmm5) - a3(pshufd xmm1, xmm1, 0x93) - a3(vpxor xmm3, xmm3, xmm4) - a3(vpaddd xmm4, xmm2, xmm3) - a3(vpsrld xmm5, xmm4, 14) - a3(vpslld xmm4, xmm4, 18) - a3(vpxor xmm0, xmm0, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) - a3(vpxor xmm0, xmm0, xmm4) - a3(pshufd xmm3, xmm3, 0x39) - a1(ja scrypt_salsa_avx_loop) - a3(vpaddd xmm0,xmm0,[esp+0]) - a3(vpaddd xmm1,xmm1,[esp+16]) - a3(vpaddd xmm2,xmm2,xmm6) - a3(vpaddd xmm3,xmm3,xmm7) - a2(lea eax,[ebx+ecx]) - a2(xor ebx,edx) - a2(and eax,~0x7f) - a2(add ecx,64) - a2(shr eax,1) - a2(add eax, edi) - a2(cmp ecx,edx) - a2(vmovdqa [eax+0],xmm0) - a2(vmovdqa [eax+16],xmm1) - a2(vmovdqa [eax+32],xmm2) - a2(vmovdqa [eax+48],xmm3) - a2(mov eax,[ebp+28]) - a1(jne scrypt_ChunkMix_avx_loop) - a2(mov esp,ebp) - a1(pop ebp) - a1(pop esi) - a1(pop edi) - a1(pop ebx) - aret(16) -asm_naked_fn_end(scrypt_ChunkMix_avx) - -#endif - - - -/* x64 */ -#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) - 
-#define SCRYPT_SALSA_AVX - -asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_avx) - a2(lea rcx,[rcx*2]) - a2(shl rcx,6) - a2(lea r9,[rcx-64]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(vmovdqa xmm0,[rax+0]) - a2(vmovdqa xmm1,[rax+16]) - a2(vmovdqa xmm2,[rax+32]) - a2(vmovdqa xmm3,[rax+48]) - a1(jz scrypt_ChunkMix_avx_no_xor1) - a3(vpxor xmm0,xmm0,[r9+0]) - a3(vpxor xmm1,xmm1,[r9+16]) - a3(vpxor xmm2,xmm2,[r9+32]) - a3(vpxor xmm3,xmm3,[r9+48]) - a1(scrypt_ChunkMix_avx_no_xor1:) - a2(xor r9,r9) - a2(xor r8,r8) - a1(scrypt_ChunkMix_avx_loop:) - a2(and rdx, rdx) - a3(vpxor xmm0,xmm0,[rsi+r9+0]) - a3(vpxor xmm1,xmm1,[rsi+r9+16]) - a3(vpxor xmm2,xmm2,[rsi+r9+32]) - a3(vpxor xmm3,xmm3,[rsi+r9+48]) - a1(jz scrypt_ChunkMix_avx_no_xor2) - a3(vpxor xmm0,xmm0,[rdx+r9+0]) - a3(vpxor xmm1,xmm1,[rdx+r9+16]) - a3(vpxor xmm2,xmm2,[rdx+r9+32]) - a3(vpxor xmm3,xmm3,[rdx+r9+48]) - a1(scrypt_ChunkMix_avx_no_xor2:) - a2(vmovdqa xmm8,xmm0) - a2(vmovdqa xmm9,xmm1) - a2(vmovdqa xmm10,xmm2) - a2(vmovdqa xmm11,xmm3) - a2(mov rax,8) - a1(scrypt_salsa_avx_loop: ) - a3(vpaddd xmm4, xmm1, xmm0) - a3(vpsrld xmm5, xmm4, 25) - a3(vpslld xmm4, xmm4, 7) - a3(vpxor xmm3, xmm3, xmm5) - a3(vpxor xmm3, xmm3, xmm4) - a3(vpaddd xmm4, xmm0, xmm3) - a3(vpsrld xmm5, xmm4, 23) - a3(vpslld xmm4, xmm4, 9) - a3(vpxor xmm2, xmm2, xmm5) - a3(vpxor xmm2, xmm2, xmm4) - a3(vpaddd xmm4, xmm3, xmm2) - a3(vpsrld xmm5, xmm4, 19) - a3(vpslld xmm4, xmm4, 13) - a3(vpxor xmm1, xmm1, xmm5) - a3(pshufd xmm3, xmm3, 0x93) - a3(vpxor xmm1, xmm1, xmm4) - a3(vpaddd xmm4, xmm2, xmm1) - a3(vpsrld xmm5, xmm4, 14) - a3(vpslld xmm4, xmm4, 18) - a3(vpxor xmm0, xmm0, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) - a3(vpxor xmm0, xmm0, xmm4) - a2(sub rax, 2) - a3(vpaddd xmm4, xmm3, xmm0) - a3(pshufd xmm1, xmm1, 0x39) - a3(vpsrld xmm5, xmm4, 25) - a3(vpslld xmm4, xmm4, 7) - a3(vpxor xmm1, 
xmm1, xmm5) - a3(vpxor xmm1, xmm1, xmm4) - a3(vpaddd xmm4, xmm0, xmm1) - a3(vpsrld xmm5, xmm4, 23) - a3(vpslld xmm4, xmm4, 9) - a3(vpxor xmm2, xmm2, xmm5) - a3(vpxor xmm2, xmm2, xmm4) - a3(vpaddd xmm4, xmm1, xmm2) - a3(vpsrld xmm5, xmm4, 19) - a3(vpslld xmm4, xmm4, 13) - a3(vpxor xmm3, xmm3, xmm5) - a3(pshufd xmm1, xmm1, 0x93) - a3(vpxor xmm3, xmm3, xmm4) - a3(vpaddd xmm4, xmm2, xmm3) - a3(vpsrld xmm5, xmm4, 14) - a3(vpslld xmm4, xmm4, 18) - a3(vpxor xmm0, xmm0, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) - a3(vpxor xmm0, xmm0, xmm4) - a3(pshufd xmm3, xmm3, 0x39) - a1(ja scrypt_salsa_avx_loop) - a3(vpaddd xmm0,xmm0,xmm8) - a3(vpaddd xmm1,xmm1,xmm9) - a3(vpaddd xmm2,xmm2,xmm10) - a3(vpaddd xmm3,xmm3,xmm11) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0x7f) - a2(add r9,64) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(vmovdqa [rax+0],xmm0) - a2(vmovdqa [rax+16],xmm1) - a2(vmovdqa [rax+32],xmm2) - a2(vmovdqa [rax+48],xmm3) - a1(jne scrypt_ChunkMix_avx_loop) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_avx) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) - -#define SCRYPT_SALSA_AVX - -static void NOINLINE -scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = 
_mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x4 = x1; - x4 = _mm_add_epi32(x4, x0); - x5 = x4; - x4 = _mm_slli_epi32(x4, 7); - x5 = _mm_srli_epi32(x5, 25); - x3 = _mm_xor_si128(x3, x4); - x4 = x0; - x3 = _mm_xor_si128(x3, x5); - x4 = _mm_add_epi32(x4, x3); - x5 = x4; - x4 = _mm_slli_epi32(x4, 9); - x5 = _mm_srli_epi32(x5, 23); - x2 = _mm_xor_si128(x2, x4); - x4 = x3; - x2 = _mm_xor_si128(x2, x5); - x3 = _mm_shuffle_epi32(x3, 0x93); - x4 = _mm_add_epi32(x4, x2); - x5 = x4; - x4 = _mm_slli_epi32(x4, 13); - x5 = _mm_srli_epi32(x5, 19); - x1 = _mm_xor_si128(x1, x4); - x4 = x2; - x1 = _mm_xor_si128(x1, x5); - x2 = _mm_shuffle_epi32(x2, 0x4e); - x4 = _mm_add_epi32(x4, x1); - x5 = x4; - x4 = _mm_slli_epi32(x4, 18); - x5 = _mm_srli_epi32(x5, 14); - x0 = _mm_xor_si128(x0, x4); - x4 = x3; - x0 = _mm_xor_si128(x0, x5); - x1 = _mm_shuffle_epi32(x1, 0x39); - x4 = _mm_add_epi32(x4, x0); - x5 = x4; - x4 = _mm_slli_epi32(x4, 7); - x5 = _mm_srli_epi32(x5, 25); - x1 = _mm_xor_si128(x1, x4); - x4 = x0; - x1 = _mm_xor_si128(x1, x5); - x4 = _mm_add_epi32(x4, x1); - x5 = x4; - x4 = _mm_slli_epi32(x4, 9); - x5 = _mm_srli_epi32(x5, 23); - x2 = _mm_xor_si128(x2, x4); - x4 = x1; - x2 = _mm_xor_si128(x2, x5); - x1 = _mm_shuffle_epi32(x1, 0x93); - x4 = _mm_add_epi32(x4, x2); - x5 = x4; - x4 = _mm_slli_epi32(x4, 13); - x5 = _mm_srli_epi32(x5, 19); - x3 = _mm_xor_si128(x3, x4); - x4 = x2; - x3 = _mm_xor_si128(x3, x5); - x2 = _mm_shuffle_epi32(x2, 0x4e); - x4 = _mm_add_epi32(x4, x3); - x5 = x4; - x4 = _mm_slli_epi32(x4, 18); - x5 = _mm_srli_epi32(x5, 14); - x0 = _mm_xor_si128(x0, x4); - x3 = _mm_shuffle_epi32(x3, 0x39); - 
x0 = _mm_xor_si128(x0, x5); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -#endif - -#if defined(SCRYPT_SALSA_AVX) - /* uses salsa_core_tangle_sse2 */ - - #undef SCRYPT_MIX - #define SCRYPT_MIX "Salsa/8-AVX" - #undef SCRYPT_SALSA_INCLUDED - #define SCRYPT_SALSA_INCLUDED -#endif diff --git a/algo/scryptjane/scrypt-jane-mix_salsa-sse2.h b/algo/scryptjane/scrypt-jane-mix_salsa-sse2.h deleted file mode 100644 index ecc5f0f..0000000 --- a/algo/scryptjane/scrypt-jane-mix_salsa-sse2.h +++ /dev/null @@ -1,443 +0,0 @@ -/* x86 */ -#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) - -#define SCRYPT_SALSA_SSE2 - -asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_sse2) - a1(push ebx) - a1(push edi) - a1(push esi) - a1(push ebp) - a2(mov ebp,esp) - a2(mov edi,[ebp+20]) - a2(mov esi,[ebp+24]) - a2(mov eax,[ebp+28]) - a2(mov ebx,[ebp+32]) - a2(sub esp,32) - a2(and esp,~63) - a2(lea edx,[ebx*2]) - a2(shl edx,6) - a2(lea ecx,[edx-64]) - a2(and eax, eax) - a2(movdqa xmm0,[ecx+esi+0]) - a2(movdqa xmm1,[ecx+esi+16]) - a2(movdqa xmm2,[ecx+esi+32]) - a2(movdqa xmm3,[ecx+esi+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor1) - a2(pxor xmm0,[ecx+eax+0]) - a2(pxor xmm1,[ecx+eax+16]) - a2(pxor xmm2,[ecx+eax+32]) - a2(pxor xmm3,[ecx+eax+48]) - a1(scrypt_ChunkMix_sse2_no_xor1:) - a2(xor ecx,ecx) - a2(xor ebx,ebx) - a1(scrypt_ChunkMix_sse2_loop:) - a2(and eax, eax) - a2(pxor xmm0,[esi+ecx+0]) - a2(pxor xmm1,[esi+ecx+16]) - a2(pxor xmm2,[esi+ecx+32]) - a2(pxor xmm3,[esi+ecx+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor2) - a2(pxor 
xmm0,[eax+ecx+0]) - a2(pxor xmm1,[eax+ecx+16]) - a2(pxor xmm2,[eax+ecx+32]) - a2(pxor xmm3,[eax+ecx+48]) - a1(scrypt_ChunkMix_sse2_no_xor2:) - a2(movdqa [esp+0],xmm0) - a2(movdqa [esp+16],xmm1) - a2(movdqa xmm6,xmm2) - a2(movdqa xmm7,xmm3) - a2(mov eax,8) - a1(scrypt_salsa_sse2_loop: ) - a2(movdqa xmm4, xmm1) - a2(paddd xmm4, xmm0) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 7) - a2(psrld xmm5, 25) - a2(pxor xmm3, xmm4) - a2(movdqa xmm4, xmm0) - a2(pxor xmm3, xmm5) - a2(paddd xmm4, xmm3) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 9) - a2(psrld xmm5, 23) - a2(pxor xmm2, xmm4) - a2(movdqa xmm4, xmm3) - a2(pxor xmm2, xmm5) - a3(pshufd xmm3, xmm3, 0x93) - a2(paddd xmm4, xmm2) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 13) - a2(psrld xmm5, 19) - a2(pxor xmm1, xmm4) - a2(movdqa xmm4, xmm2) - a2(pxor xmm1, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) - a2(paddd xmm4, xmm1) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 18) - a2(psrld xmm5, 14) - a2(pxor xmm0, xmm4) - a2(movdqa xmm4, xmm3) - a2(pxor xmm0, xmm5) - a3(pshufd xmm1, xmm1, 0x39) - a2(paddd xmm4, xmm0) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 7) - a2(psrld xmm5, 25) - a2(pxor xmm1, xmm4) - a2(movdqa xmm4, xmm0) - a2(pxor xmm1, xmm5) - a2(paddd xmm4, xmm1) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 9) - a2(psrld xmm5, 23) - a2(pxor xmm2, xmm4) - a2(movdqa xmm4, xmm1) - a2(pxor xmm2, xmm5) - a3(pshufd xmm1, xmm1, 0x93) - a2(paddd xmm4, xmm2) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 13) - a2(psrld xmm5, 19) - a2(pxor xmm3, xmm4) - a2(movdqa xmm4, xmm2) - a2(pxor xmm3, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) - a2(paddd xmm4, xmm3) - a2(sub eax, 2) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 18) - a2(psrld xmm5, 14) - a2(pxor xmm0, xmm4) - a3(pshufd xmm3, xmm3, 0x39) - a2(pxor xmm0, xmm5) - a1(ja scrypt_salsa_sse2_loop) - a2(paddd xmm0,[esp+0]) - a2(paddd xmm1,[esp+16]) - a2(paddd xmm2,xmm6) - a2(paddd xmm3,xmm7) - a2(lea eax,[ebx+ecx]) - a2(xor ebx,edx) - a2(and eax,~0x7f) - a2(add ecx,64) - a2(shr eax,1) - a2(add eax, edi) - a2(cmp ecx,edx) - 
a2(movdqa [eax+0],xmm0) - a2(movdqa [eax+16],xmm1) - a2(movdqa [eax+32],xmm2) - a2(movdqa [eax+48],xmm3) - a2(mov eax,[ebp+28]) - a1(jne scrypt_ChunkMix_sse2_loop) - a2(mov esp,ebp) - a1(pop ebp) - a1(pop esi) - a1(pop edi) - a1(pop ebx) - aret(16) -asm_naked_fn_end(scrypt_ChunkMix_sse2) - -#endif - - - -/* x64 */ -#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) - -#define SCRYPT_SALSA_SSE2 - -asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_sse2) - a2(lea rcx,[rcx*2]) - a2(shl rcx,6) - a2(lea r9,[rcx-64]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(movdqa xmm0,[rax+0]) - a2(movdqa xmm1,[rax+16]) - a2(movdqa xmm2,[rax+32]) - a2(movdqa xmm3,[rax+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor1) - a2(pxor xmm0,[r9+0]) - a2(pxor xmm1,[r9+16]) - a2(pxor xmm2,[r9+32]) - a2(pxor xmm3,[r9+48]) - a1(scrypt_ChunkMix_sse2_no_xor1:) - a2(xor r9,r9) - a2(xor r8,r8) - a1(scrypt_ChunkMix_sse2_loop:) - a2(and rdx, rdx) - a2(pxor xmm0,[rsi+r9+0]) - a2(pxor xmm1,[rsi+r9+16]) - a2(pxor xmm2,[rsi+r9+32]) - a2(pxor xmm3,[rsi+r9+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor2) - a2(pxor xmm0,[rdx+r9+0]) - a2(pxor xmm1,[rdx+r9+16]) - a2(pxor xmm2,[rdx+r9+32]) - a2(pxor xmm3,[rdx+r9+48]) - a1(scrypt_ChunkMix_sse2_no_xor2:) - a2(movdqa xmm8,xmm0) - a2(movdqa xmm9,xmm1) - a2(movdqa xmm10,xmm2) - a2(movdqa xmm11,xmm3) - a2(mov rax,8) - a1(scrypt_salsa_sse2_loop: ) - a2(movdqa xmm4, xmm1) - a2(paddd xmm4, xmm0) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 7) - a2(psrld xmm5, 25) - a2(pxor xmm3, xmm4) - a2(movdqa xmm4, xmm0) - a2(pxor xmm3, xmm5) - a2(paddd xmm4, xmm3) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 9) - a2(psrld xmm5, 23) - a2(pxor xmm2, xmm4) - a2(movdqa xmm4, xmm3) - a2(pxor xmm2, xmm5) - a3(pshufd xmm3, xmm3, 0x93) - a2(paddd xmm4, xmm2) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 13) - 
a2(psrld xmm5, 19) - a2(pxor xmm1, xmm4) - a2(movdqa xmm4, xmm2) - a2(pxor xmm1, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) - a2(paddd xmm4, xmm1) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 18) - a2(psrld xmm5, 14) - a2(pxor xmm0, xmm4) - a2(movdqa xmm4, xmm3) - a2(pxor xmm0, xmm5) - a3(pshufd xmm1, xmm1, 0x39) - a2(paddd xmm4, xmm0) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 7) - a2(psrld xmm5, 25) - a2(pxor xmm1, xmm4) - a2(movdqa xmm4, xmm0) - a2(pxor xmm1, xmm5) - a2(paddd xmm4, xmm1) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 9) - a2(psrld xmm5, 23) - a2(pxor xmm2, xmm4) - a2(movdqa xmm4, xmm1) - a2(pxor xmm2, xmm5) - a3(pshufd xmm1, xmm1, 0x93) - a2(paddd xmm4, xmm2) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 13) - a2(psrld xmm5, 19) - a2(pxor xmm3, xmm4) - a2(movdqa xmm4, xmm2) - a2(pxor xmm3, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) - a2(paddd xmm4, xmm3) - a2(sub rax, 2) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 18) - a2(psrld xmm5, 14) - a2(pxor xmm0, xmm4) - a3(pshufd xmm3, xmm3, 0x39) - a2(pxor xmm0, xmm5) - a1(ja scrypt_salsa_sse2_loop) - a2(paddd xmm0,xmm8) - a2(paddd xmm1,xmm9) - a2(paddd xmm2,xmm10) - a2(paddd xmm3,xmm11) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0x7f) - a2(add r9,64) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(movdqa [rax+0],xmm0) - a2(movdqa [rax+16],xmm1) - a2(movdqa [rax+32],xmm2) - a2(movdqa [rax+48],xmm3) - a1(jne scrypt_ChunkMix_sse2_loop) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_sse2) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) - -#define SCRYPT_SALSA_SSE2 - -static void NOINLINE -scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = 
xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x4 = x1; - x4 = _mm_add_epi32(x4, x0); - x5 = x4; - x4 = _mm_slli_epi32(x4, 7); - x5 = _mm_srli_epi32(x5, 25); - x3 = _mm_xor_si128(x3, x4); - x4 = x0; - x3 = _mm_xor_si128(x3, x5); - x4 = _mm_add_epi32(x4, x3); - x5 = x4; - x4 = _mm_slli_epi32(x4, 9); - x5 = _mm_srli_epi32(x5, 23); - x2 = _mm_xor_si128(x2, x4); - x4 = x3; - x2 = _mm_xor_si128(x2, x5); - x3 = _mm_shuffle_epi32(x3, 0x93); - x4 = _mm_add_epi32(x4, x2); - x5 = x4; - x4 = _mm_slli_epi32(x4, 13); - x5 = _mm_srli_epi32(x5, 19); - x1 = _mm_xor_si128(x1, x4); - x4 = x2; - x1 = _mm_xor_si128(x1, x5); - x2 = _mm_shuffle_epi32(x2, 0x4e); - x4 = _mm_add_epi32(x4, x1); - x5 = x4; - x4 = _mm_slli_epi32(x4, 18); - x5 = _mm_srli_epi32(x5, 14); - x0 = _mm_xor_si128(x0, x4); - x4 = x3; - x0 = _mm_xor_si128(x0, x5); - x1 = _mm_shuffle_epi32(x1, 0x39); - x4 = _mm_add_epi32(x4, x0); - x5 = x4; - x4 = _mm_slli_epi32(x4, 7); - x5 = _mm_srli_epi32(x5, 25); - x1 = _mm_xor_si128(x1, x4); - x4 = x0; - x1 = _mm_xor_si128(x1, x5); - x4 = _mm_add_epi32(x4, x1); - x5 = x4; - x4 = _mm_slli_epi32(x4, 9); - x5 = _mm_srli_epi32(x5, 23); - x2 = _mm_xor_si128(x2, x4); - x4 = x1; - x2 = _mm_xor_si128(x2, x5); 
- x1 = _mm_shuffle_epi32(x1, 0x93); - x4 = _mm_add_epi32(x4, x2); - x5 = x4; - x4 = _mm_slli_epi32(x4, 13); - x5 = _mm_srli_epi32(x5, 19); - x3 = _mm_xor_si128(x3, x4); - x4 = x2; - x3 = _mm_xor_si128(x3, x5); - x2 = _mm_shuffle_epi32(x2, 0x4e); - x4 = _mm_add_epi32(x4, x3); - x5 = x4; - x4 = _mm_slli_epi32(x4, 18); - x5 = _mm_srli_epi32(x5, 14); - x0 = _mm_xor_si128(x0, x4); - x3 = _mm_shuffle_epi32(x3, 0x39); - x0 = _mm_xor_si128(x0, x5); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -#endif - -#if defined(SCRYPT_SALSA_SSE2) - #undef SCRYPT_MIX - #define SCRYPT_MIX "Salsa/8-SSE2" - #undef SCRYPT_SALSA_INCLUDED - #define SCRYPT_SALSA_INCLUDED -#endif - -/* used by avx,etc as well */ -#if defined(SCRYPT_SALSA_INCLUDED) - /* - Default layout: - 0 1 2 3 - 4 5 6 7 - 8 9 10 11 - 12 13 14 15 - - SSE2 layout: - 0 5 10 15 - 12 1 6 11 - 8 13 2 7 - 4 9 14 3 - */ - - static void asm_calling_convention - salsa_core_tangle_sse2(uint32_t *blocks, size_t count) { - uint32_t t; - while (count--) { - t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t; - t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t; - t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t; - t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t; - t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t; - t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t; - blocks += 16; - } - } -#endif - diff --git a/algo/scryptjane/scrypt-jane-mix_salsa.h b/algo/scryptjane/scrypt-jane-mix_salsa.h deleted file mode 100644 index 33f3340..0000000 --- a/algo/scryptjane/scrypt-jane-mix_salsa.h +++ /dev/null @@ -1,70 +0,0 @@ -#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED) - -#undef SCRYPT_MIX -#define SCRYPT_MIX 
"Salsa20/8 Ref" - -#undef SCRYPT_SALSA_INCLUDED -#define SCRYPT_SALSA_INCLUDED -#define SCRYPT_SALSA_BASIC - -static void -salsa_core_basic(uint32_t state[16]) { - size_t rounds = 8; - uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,t; - - x0 = state[0]; - x1 = state[1]; - x2 = state[2]; - x3 = state[3]; - x4 = state[4]; - x5 = state[5]; - x6 = state[6]; - x7 = state[7]; - x8 = state[8]; - x9 = state[9]; - x10 = state[10]; - x11 = state[11]; - x12 = state[12]; - x13 = state[13]; - x14 = state[14]; - x15 = state[15]; - - #define quarter(a,b,c,d) \ - t = a+d; t = ROTL32(t, 7); b ^= t; \ - t = b+a; t = ROTL32(t, 9); c ^= t; \ - t = c+b; t = ROTL32(t, 13); d ^= t; \ - t = d+c; t = ROTL32(t, 18); a ^= t; \ - - for (; rounds; rounds -= 2) { - quarter( x0, x4, x8,x12) - quarter( x5, x9,x13, x1) - quarter(x10,x14, x2, x6) - quarter(x15, x3, x7,x11) - quarter( x0, x1, x2, x3) - quarter( x5, x6, x7, x4) - quarter(x10,x11, x8, x9) - quarter(x15,x12,x13,x14) - } - - state[0] += x0; - state[1] += x1; - state[2] += x2; - state[3] += x3; - state[4] += x4; - state[5] += x5; - state[6] += x6; - state[7] += x7; - state[8] += x8; - state[9] += x9; - state[10] += x10; - state[11] += x11; - state[12] += x12; - state[13] += x13; - state[14] += x14; - state[15] += x15; - - #undef quarter -} - -#endif - diff --git a/algo/scryptjane/scrypt-jane-mix_salsa64-avx.h b/algo/scryptjane/scrypt-jane-mix_salsa64-avx.h deleted file mode 100644 index 50c9902..0000000 --- a/algo/scryptjane/scrypt-jane-mix_salsa64-avx.h +++ /dev/null @@ -1,367 +0,0 @@ -/* x64 */ -#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) - -#define SCRYPT_SALSA64_AVX - -asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_avx) - a1(push rbp) - a2(mov rbp, rsp) - a2(and rsp, ~63) - a2(sub rsp, 128) - a2(lea rcx,[rcx*2]) - a2(shl 
rcx,7) - a2(lea r9,[rcx-128]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(vmovdqa xmm0,[rax+0]) - a2(vmovdqa xmm1,[rax+16]) - a2(vmovdqa xmm2,[rax+32]) - a2(vmovdqa xmm3,[rax+48]) - a2(vmovdqa xmm4,[rax+64]) - a2(vmovdqa xmm5,[rax+80]) - a2(vmovdqa xmm6,[rax+96]) - a2(vmovdqa xmm7,[rax+112]) - a1(jz scrypt_ChunkMix_avx_no_xor1) - a3(vpxor xmm0,xmm0,[r9+0]) - a3(vpxor xmm1,xmm1,[r9+16]) - a3(vpxor xmm2,xmm2,[r9+32]) - a3(vpxor xmm3,xmm3,[r9+48]) - a3(vpxor xmm4,xmm4,[r9+64]) - a3(vpxor xmm5,xmm5,[r9+80]) - a3(vpxor xmm6,xmm6,[r9+96]) - a3(vpxor xmm7,xmm7,[r9+112]) - a1(scrypt_ChunkMix_avx_no_xor1:) - a2(xor r9,r9) - a2(xor r8,r8) - a1(scrypt_ChunkMix_avx_loop:) - a2(and rdx, rdx) - a3(vpxor xmm0,xmm0,[rsi+r9+0]) - a3(vpxor xmm1,xmm1,[rsi+r9+16]) - a3(vpxor xmm2,xmm2,[rsi+r9+32]) - a3(vpxor xmm3,xmm3,[rsi+r9+48]) - a3(vpxor xmm4,xmm4,[rsi+r9+64]) - a3(vpxor xmm5,xmm5,[rsi+r9+80]) - a3(vpxor xmm6,xmm6,[rsi+r9+96]) - a3(vpxor xmm7,xmm7,[rsi+r9+112]) - a1(jz scrypt_ChunkMix_avx_no_xor2) - a3(vpxor xmm0,xmm0,[rdx+r9+0]) - a3(vpxor xmm1,xmm1,[rdx+r9+16]) - a3(vpxor xmm2,xmm2,[rdx+r9+32]) - a3(vpxor xmm3,xmm3,[rdx+r9+48]) - a3(vpxor xmm4,xmm4,[rdx+r9+64]) - a3(vpxor xmm5,xmm5,[rdx+r9+80]) - a3(vpxor xmm6,xmm6,[rdx+r9+96]) - a3(vpxor xmm7,xmm7,[rdx+r9+112]) - a1(scrypt_ChunkMix_avx_no_xor2:) - a2(vmovdqa [rsp+0],xmm0) - a2(vmovdqa [rsp+16],xmm1) - a2(vmovdqa [rsp+32],xmm2) - a2(vmovdqa [rsp+48],xmm3) - a2(vmovdqa [rsp+64],xmm4) - a2(vmovdqa [rsp+80],xmm5) - a2(vmovdqa [rsp+96],xmm6) - a2(vmovdqa [rsp+112],xmm7) - a2(mov rax,8) - a1(scrypt_salsa64_avx_loop: ) - a3(vpaddq xmm8, xmm0, xmm2) - a3(vpaddq xmm9, xmm1, xmm3) - a3(vpshufd xmm8, xmm8, 0xb1) - a3(vpshufd xmm9, xmm9, 0xb1) - a3(vpxor xmm6, xmm6, xmm8) - a3(vpxor xmm7, xmm7, xmm9) - a3(vpaddq xmm10, xmm0, xmm6) - a3(vpaddq xmm11, xmm1, xmm7) - a3(vpsrlq xmm8, xmm10, 51) - a3(vpsrlq xmm9, xmm11, 51) - a3(vpsllq xmm10, xmm10, 13) - a3(vpsllq xmm11, xmm11, 13) - a3(vpxor xmm4, xmm4, xmm8) - 
a3(vpxor xmm5, xmm5, xmm9) - a3(vpxor xmm4, xmm4, xmm10) - a3(vpxor xmm5, xmm5, xmm11) - a3(vpaddq xmm8, xmm6, xmm4) - a3(vpaddq xmm9, xmm7, xmm5) - a3(vpsrlq xmm10, xmm8, 25) - a3(vpsrlq xmm11, xmm9, 25) - a3(vpsllq xmm8, xmm8, 39) - a3(vpsllq xmm9, xmm9, 39) - a3(vpxor xmm2, xmm2, xmm10) - a3(vpxor xmm3, xmm3, xmm11) - a3(vpxor xmm2, xmm2, xmm8) - a3(vpxor xmm3, xmm3, xmm9) - a3(vpaddq xmm10, xmm4, xmm2) - a3(vpaddq xmm11, xmm5, xmm3) - a3(vpshufd xmm10, xmm10, 0xb1) - a3(vpshufd xmm11, xmm11, 0xb1) - a3(vpxor xmm0, xmm0, xmm10) - a3(vpxor xmm1, xmm1, xmm11) - a2(vmovdqa xmm8, xmm2) - a2(vmovdqa xmm9, xmm3) - a4(vpalignr xmm2, xmm6, xmm7, 8) - a4(vpalignr xmm3, xmm7, xmm6, 8) - a4(vpalignr xmm6, xmm9, xmm8, 8) - a4(vpalignr xmm7, xmm8, xmm9, 8) - a2(sub rax, 2) - a3(vpaddq xmm10, xmm0, xmm2) - a3(vpaddq xmm11, xmm1, xmm3) - a3(vpshufd xmm10, xmm10, 0xb1) - a3(vpshufd xmm11, xmm11, 0xb1) - a3(vpxor xmm6, xmm6, xmm10) - a3(vpxor xmm7, xmm7, xmm11) - a3(vpaddq xmm8, xmm0, xmm6) - a3(vpaddq xmm9, xmm1, xmm7) - a3(vpsrlq xmm10, xmm8, 51) - a3(vpsrlq xmm11, xmm9, 51) - a3(vpsllq xmm8, xmm8, 13) - a3(vpsllq xmm9, xmm9, 13) - a3(vpxor xmm5, xmm5, xmm10) - a3(vpxor xmm4, xmm4, xmm11) - a3(vpxor xmm5, xmm5, xmm8) - a3(vpxor xmm4, xmm4, xmm9) - a3(vpaddq xmm10, xmm6, xmm5) - a3(vpaddq xmm11, xmm7, xmm4) - a3(vpsrlq xmm8, xmm10, 25) - a3(vpsrlq xmm9, xmm11, 25) - a3(vpsllq xmm10, xmm10, 39) - a3(vpsllq xmm11, xmm11, 39) - a3(vpxor xmm2, xmm2, xmm8) - a3(vpxor xmm3, xmm3, xmm9) - a3(vpxor xmm2, xmm2, xmm10) - a3(vpxor xmm3, xmm3, xmm11) - a3(vpaddq xmm8, xmm5, xmm2) - a3(vpaddq xmm9, xmm4, xmm3) - a3(vpshufd xmm8, xmm8, 0xb1) - a3(vpshufd xmm9, xmm9, 0xb1) - a3(vpxor xmm0, xmm0, xmm8) - a3(vpxor xmm1, xmm1, xmm9) - a2(vmovdqa xmm10, xmm2) - a2(vmovdqa xmm11, xmm3) - a4(vpalignr xmm2, xmm6, xmm7, 8) - a4(vpalignr xmm3, xmm7, xmm6, 8) - a4(vpalignr xmm6, xmm11, xmm10, 8) - a4(vpalignr xmm7, xmm10, xmm11, 8) - a1(ja scrypt_salsa64_avx_loop) - a3(vpaddq xmm0,xmm0,[rsp+0]) - 
a3(vpaddq xmm1,xmm1,[rsp+16]) - a3(vpaddq xmm2,xmm2,[rsp+32]) - a3(vpaddq xmm3,xmm3,[rsp+48]) - a3(vpaddq xmm4,xmm4,[rsp+64]) - a3(vpaddq xmm5,xmm5,[rsp+80]) - a3(vpaddq xmm6,xmm6,[rsp+96]) - a3(vpaddq xmm7,xmm7,[rsp+112]) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0xff) - a2(add r9,128) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(vmovdqa [rax+0],xmm0) - a2(vmovdqa [rax+16],xmm1) - a2(vmovdqa [rax+32],xmm2) - a2(vmovdqa [rax+48],xmm3) - a2(vmovdqa [rax+64],xmm4) - a2(vmovdqa [rax+80],xmm5) - a2(vmovdqa [rax+96],xmm6) - a2(vmovdqa [rax+112],xmm7) - a1(jne scrypt_ChunkMix_avx_loop) - a2(mov rsp, rbp) - a1(pop rbp) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_avx) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_AVX) - -#define SCRYPT_SALSA64_AVX - -static void asm_calling_convention -scrypt_ChunkMix_avx(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - x4 = xmmp[4]; - x5 = xmmp[5]; - x6 = xmmp[6]; - x7 = xmmp[7]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 
= _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - t4 = x4; - t5 = x5; - t6 = x6; - t7 = x7; - - for (rounds = 8; rounds; rounds -= 2) { - z0 = _mm_add_epi64(x0, x2); - z1 = _mm_add_epi64(x1, x3); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x6 = _mm_xor_si128(x6, z0); - x7 = _mm_xor_si128(x7, z1); - - z0 = _mm_add_epi64(x6, x0); - z1 = _mm_add_epi64(x7, x1); - z2 = _mm_srli_epi64(z0, 64-13); - z3 = _mm_srli_epi64(z1, 64-13); - z0 = _mm_slli_epi64(z0, 13); - z1 = _mm_slli_epi64(z1, 13); - x4 = _mm_xor_si128(x4, z2); - x5 = _mm_xor_si128(x5, z3); - x4 = _mm_xor_si128(x4, z0); - x5 = _mm_xor_si128(x5, z1); - - z0 = _mm_add_epi64(x4, x6); - z1 = _mm_add_epi64(x5, x7); - z2 = _mm_srli_epi64(z0, 64-39); - z3 = _mm_srli_epi64(z1, 64-39); - z0 = _mm_slli_epi64(z0, 39); - z1 = _mm_slli_epi64(z1, 39); - x2 = _mm_xor_si128(x2, z2); - x3 = _mm_xor_si128(x3, z3); - x2 = _mm_xor_si128(x2, z0); - x3 = _mm_xor_si128(x3, z1); - - z0 = _mm_add_epi64(x2, x4); - z1 = _mm_add_epi64(x3, x5); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x0 = _mm_xor_si128(x0, z0); - x1 = _mm_xor_si128(x1, z1); - - z0 = x2; - z1 = x3; - x2 = _mm_alignr_epi8(x6, x7, 8); - x3 = _mm_alignr_epi8(x7, x6, 8); - x6 = _mm_alignr_epi8(z1, z0, 8); - x7 = _mm_alignr_epi8(z0, z1, 8); - - z0 = _mm_add_epi64(x0, x2); - z1 = _mm_add_epi64(x1, x3); - z0 = 
_mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x6 = _mm_xor_si128(x6, z0); - x7 = _mm_xor_si128(x7, z1); - - z0 = _mm_add_epi64(x6, x0); - z1 = _mm_add_epi64(x7, x1); - z2 = _mm_srli_epi64(z0, 64-13); - z3 = _mm_srli_epi64(z1, 64-13); - z0 = _mm_slli_epi64(z0, 13); - z1 = _mm_slli_epi64(z1, 13); - x5 = _mm_xor_si128(x5, z2); - x4 = _mm_xor_si128(x4, z3); - x5 = _mm_xor_si128(x5, z0); - x4 = _mm_xor_si128(x4, z1); - - z0 = _mm_add_epi64(x5, x6); - z1 = _mm_add_epi64(x4, x7); - z2 = _mm_srli_epi64(z0, 64-39); - z3 = _mm_srli_epi64(z1, 64-39); - z0 = _mm_slli_epi64(z0, 39); - z1 = _mm_slli_epi64(z1, 39); - x2 = _mm_xor_si128(x2, z2); - x3 = _mm_xor_si128(x3, z3); - x2 = _mm_xor_si128(x2, z0); - x3 = _mm_xor_si128(x3, z1); - - z0 = _mm_add_epi64(x2, x5); - z1 = _mm_add_epi64(x3, x4); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x0 = _mm_xor_si128(x0, z0); - x1 = _mm_xor_si128(x1, z1); - - z0 = x2; - z1 = x3; - x2 = _mm_alignr_epi8(x6, x7, 8); - x3 = _mm_alignr_epi8(x7, x6, 8); - x6 = _mm_alignr_epi8(z1, z0, 8); - x7 = _mm_alignr_epi8(z0, z1, 8); - } - - x0 = _mm_add_epi64(x0, t0); - x1 = _mm_add_epi64(x1, t1); - x2 = _mm_add_epi64(x2, t2); - x3 = _mm_add_epi64(x3, t3); - x4 = _mm_add_epi64(x4, t4); - x5 = _mm_add_epi64(x5, t5); - x6 = _mm_add_epi64(x6, t6); - x7 = _mm_add_epi64(x7, t7); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - xmmp[4] = x4; - xmmp[5] = x5; - xmmp[6] = x6; - xmmp[7] = x7; - } -} - -#endif - -#if defined(SCRYPT_SALSA64_AVX) - /* uses salsa64_core_tangle_sse2 */ - - #undef SCRYPT_MIX - #define SCRYPT_MIX "Salsa64/8-AVX" - #undef SCRYPT_SALSA64_INCLUDED - #define SCRYPT_SALSA64_INCLUDED -#endif diff --git a/algo/scryptjane/scrypt-jane-mix_salsa64-sse2.h 
b/algo/scryptjane/scrypt-jane-mix_salsa64-sse2.h deleted file mode 100644 index f8d9574..0000000 --- a/algo/scryptjane/scrypt-jane-mix_salsa64-sse2.h +++ /dev/null @@ -1,449 +0,0 @@ -/* x64 */ -#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) - -#define SCRYPT_SALSA64_SSE2 - -asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_sse2) - a1(push rbp) - a2(mov rbp, rsp) - a2(and rsp, ~63) - a2(sub rsp, 128) - a2(lea rcx,[rcx*2]) - a2(shl rcx,7) - a2(lea r9,[rcx-128]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(movdqa xmm0,[rax+0]) - a2(movdqa xmm1,[rax+16]) - a2(movdqa xmm2,[rax+32]) - a2(movdqa xmm3,[rax+48]) - a2(movdqa xmm4,[rax+64]) - a2(movdqa xmm5,[rax+80]) - a2(movdqa xmm6,[rax+96]) - a2(movdqa xmm7,[rax+112]) - a1(jz scrypt_ChunkMix_sse2_no_xor1) - a2(pxor xmm0,[r9+0]) - a2(pxor xmm1,[r9+16]) - a2(pxor xmm2,[r9+32]) - a2(pxor xmm3,[r9+48]) - a2(pxor xmm4,[r9+64]) - a2(pxor xmm5,[r9+80]) - a2(pxor xmm6,[r9+96]) - a2(pxor xmm7,[r9+112]) - a1(scrypt_ChunkMix_sse2_no_xor1:) - a2(xor r9,r9) - a2(xor r8,r8) - a1(scrypt_ChunkMix_sse2_loop:) - a2(and rdx, rdx) - a2(pxor xmm0,[rsi+r9+0]) - a2(pxor xmm1,[rsi+r9+16]) - a2(pxor xmm2,[rsi+r9+32]) - a2(pxor xmm3,[rsi+r9+48]) - a2(pxor xmm4,[rsi+r9+64]) - a2(pxor xmm5,[rsi+r9+80]) - a2(pxor xmm6,[rsi+r9+96]) - a2(pxor xmm7,[rsi+r9+112]) - a1(jz scrypt_ChunkMix_sse2_no_xor2) - a2(pxor xmm0,[rdx+r9+0]) - a2(pxor xmm1,[rdx+r9+16]) - a2(pxor xmm2,[rdx+r9+32]) - a2(pxor xmm3,[rdx+r9+48]) - a2(pxor xmm4,[rdx+r9+64]) - a2(pxor xmm5,[rdx+r9+80]) - a2(pxor xmm6,[rdx+r9+96]) - a2(pxor xmm7,[rdx+r9+112]) - a1(scrypt_ChunkMix_sse2_no_xor2:) - a2(movdqa [rsp+0],xmm0) - a2(movdqa [rsp+16],xmm1) - a2(movdqa [rsp+32],xmm2) - a2(movdqa [rsp+48],xmm3) - a2(movdqa [rsp+64],xmm4) - a2(movdqa [rsp+80],xmm5) - a2(movdqa [rsp+96],xmm6) - 
a2(movdqa [rsp+112],xmm7) - a2(mov rax,8) - a1(scrypt_salsa64_sse2_loop: ) - a2(movdqa xmm8, xmm0) - a2(movdqa xmm9, xmm1) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm6, xmm8) - a2(pxor xmm7, xmm9) - a2(movdqa xmm10, xmm0) - a2(movdqa xmm11, xmm1) - a2(paddq xmm10, xmm6) - a2(paddq xmm11, xmm7) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 51) - a2(psrlq xmm11, 51) - a2(psllq xmm8, 13) - a2(psllq xmm9, 13) - a2(pxor xmm4, xmm10) - a2(pxor xmm5, xmm11) - a2(pxor xmm4, xmm8) - a2(pxor xmm5, xmm9) - a2(movdqa xmm10, xmm6) - a2(movdqa xmm11, xmm7) - a2(paddq xmm10, xmm4) - a2(paddq xmm11, xmm5) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 25) - a2(psrlq xmm11, 25) - a2(psllq xmm8, 39) - a2(psllq xmm9, 39) - a2(pxor xmm2, xmm10) - a2(pxor xmm3, xmm11) - a2(pxor xmm2, xmm8) - a2(pxor xmm3, xmm9) - a2(movdqa xmm8, xmm4) - a2(movdqa xmm9, xmm5) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm0, xmm8) - a2(pxor xmm1, xmm9) - a2(movdqa xmm8, xmm2) - a2(movdqa xmm9, xmm3) - a2(movdqa xmm10, xmm6) - a2(movdqa xmm11, xmm7) - a2(movdqa xmm2, xmm7) - a2(movdqa xmm3, xmm6) - a2(punpcklqdq xmm10, xmm6) - a2(punpcklqdq xmm11, xmm7) - a2(movdqa xmm6, xmm8) - a2(movdqa xmm7, xmm9) - a2(punpcklqdq xmm9, xmm9) - a2(punpcklqdq xmm8, xmm8) - a2(punpckhqdq xmm2, xmm10) - a2(punpckhqdq xmm3, xmm11) - a2(punpckhqdq xmm6, xmm9) - a2(punpckhqdq xmm7, xmm8) - a2(sub rax, 2) - a2(movdqa xmm8, xmm0) - a2(movdqa xmm9, xmm1) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm6, xmm8) - a2(pxor xmm7, xmm9) - a2(movdqa xmm10, xmm0) - a2(movdqa xmm11, xmm1) - a2(paddq xmm10, xmm6) - a2(paddq xmm11, xmm7) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 51) - a2(psrlq xmm11, 51) - a2(psllq xmm8, 13) - a2(psllq xmm9, 13) - a2(pxor 
xmm5, xmm10) - a2(pxor xmm4, xmm11) - a2(pxor xmm5, xmm8) - a2(pxor xmm4, xmm9) - a2(movdqa xmm10, xmm6) - a2(movdqa xmm11, xmm7) - a2(paddq xmm10, xmm5) - a2(paddq xmm11, xmm4) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 25) - a2(psrlq xmm11, 25) - a2(psllq xmm8, 39) - a2(psllq xmm9, 39) - a2(pxor xmm2, xmm10) - a2(pxor xmm3, xmm11) - a2(pxor xmm2, xmm8) - a2(pxor xmm3, xmm9) - a2(movdqa xmm8, xmm5) - a2(movdqa xmm9, xmm4) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm0, xmm8) - a2(pxor xmm1, xmm9) - a2(movdqa xmm8, xmm2) - a2(movdqa xmm9, xmm3) - a2(movdqa xmm10, xmm6) - a2(movdqa xmm11, xmm7) - a2(movdqa xmm2, xmm7) - a2(movdqa xmm3, xmm6) - a2(punpcklqdq xmm10, xmm6) - a2(punpcklqdq xmm11, xmm7) - a2(movdqa xmm6, xmm8) - a2(movdqa xmm7, xmm9) - a2(punpcklqdq xmm9, xmm9) - a2(punpcklqdq xmm8, xmm8) - a2(punpckhqdq xmm2, xmm10) - a2(punpckhqdq xmm3, xmm11) - a2(punpckhqdq xmm6, xmm9) - a2(punpckhqdq xmm7, xmm8) - a1(ja scrypt_salsa64_sse2_loop) - a2(paddq xmm0,[rsp+0]) - a2(paddq xmm1,[rsp+16]) - a2(paddq xmm2,[rsp+32]) - a2(paddq xmm3,[rsp+48]) - a2(paddq xmm4,[rsp+64]) - a2(paddq xmm5,[rsp+80]) - a2(paddq xmm6,[rsp+96]) - a2(paddq xmm7,[rsp+112]) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0xff) - a2(add r9,128) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(movdqa [rax+0],xmm0) - a2(movdqa [rax+16],xmm1) - a2(movdqa [rax+32],xmm2) - a2(movdqa [rax+48],xmm3) - a2(movdqa [rax+64],xmm4) - a2(movdqa [rax+80],xmm5) - a2(movdqa [rax+96],xmm6) - a2(movdqa [rax+112],xmm7) - a1(jne scrypt_ChunkMix_sse2_loop) - a2(mov rsp, rbp) - a1(pop rbp) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_sse2) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSE2) - -#define SCRYPT_SALSA64_SSE2 - -static void asm_calling_convention 
-scrypt_ChunkMix_sse2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - x4 = xmmp[4]; - x5 = xmmp[5]; - x6 = xmmp[6]; - x7 = xmmp[7]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - t4 = x4; - t5 = x5; - t6 = x6; - t7 = x7; - - for (rounds = 8; rounds; rounds -= 2) { - z0 = _mm_add_epi64(x0, x2); - z1 = _mm_add_epi64(x1, x3); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x6 = _mm_xor_si128(x6, z0); - x7 = _mm_xor_si128(x7, z1); - - z0 = _mm_add_epi64(x6, 
x0); - z1 = _mm_add_epi64(x7, x1); - z2 = _mm_srli_epi64(z0, 64-13); - z3 = _mm_srli_epi64(z1, 64-13); - z0 = _mm_slli_epi64(z0, 13); - z1 = _mm_slli_epi64(z1, 13); - x4 = _mm_xor_si128(x4, z2); - x5 = _mm_xor_si128(x5, z3); - x4 = _mm_xor_si128(x4, z0); - x5 = _mm_xor_si128(x5, z1); - - z0 = _mm_add_epi64(x4, x6); - z1 = _mm_add_epi64(x5, x7); - z2 = _mm_srli_epi64(z0, 64-39); - z3 = _mm_srli_epi64(z1, 64-39); - z0 = _mm_slli_epi64(z0, 39); - z1 = _mm_slli_epi64(z1, 39); - x2 = _mm_xor_si128(x2, z2); - x3 = _mm_xor_si128(x3, z3); - x2 = _mm_xor_si128(x2, z0); - x3 = _mm_xor_si128(x3, z1); - - z0 = _mm_add_epi64(x2, x4); - z1 = _mm_add_epi64(x3, x5); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x0 = _mm_xor_si128(x0, z0); - x1 = _mm_xor_si128(x1, z1); - - z0 = x4; - z1 = x5; - z2 = x2; - z3 = x3; - x4 = z1; - x5 = z0; - x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6)); - x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7)); - x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3)); - x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2)); - - z0 = _mm_add_epi64(x0, x2); - z1 = _mm_add_epi64(x1, x3); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x6 = _mm_xor_si128(x6, z0); - x7 = _mm_xor_si128(x7, z1); - - z0 = _mm_add_epi64(x6, x0); - z1 = _mm_add_epi64(x7, x1); - z2 = _mm_srli_epi64(z0, 64-13); - z3 = _mm_srli_epi64(z1, 64-13); - z0 = _mm_slli_epi64(z0, 13); - z1 = _mm_slli_epi64(z1, 13); - x4 = _mm_xor_si128(x4, z2); - x5 = _mm_xor_si128(x5, z3); - x4 = _mm_xor_si128(x4, z0); - x5 = _mm_xor_si128(x5, z1); - - z0 = _mm_add_epi64(x4, x6); - z1 = _mm_add_epi64(x5, x7); - z2 = _mm_srli_epi64(z0, 64-39); - z3 = _mm_srli_epi64(z1, 64-39); - z0 = _mm_slli_epi64(z0, 39); - z1 = _mm_slli_epi64(z1, 39); - x2 = _mm_xor_si128(x2, z2); - x3 = _mm_xor_si128(x3, z3); - x2 = _mm_xor_si128(x2, z0); - x3 = _mm_xor_si128(x3, z1); - - z0 = 
_mm_add_epi64(x2, x4); - z1 = _mm_add_epi64(x3, x5); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x0 = _mm_xor_si128(x0, z0); - x1 = _mm_xor_si128(x1, z1); - - z0 = x4; - z1 = x5; - z2 = x2; - z3 = x3; - x4 = z1; - x5 = z0; - x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6)); - x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7)); - x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3)); - x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2)); - } - - x0 = _mm_add_epi64(x0, t0); - x1 = _mm_add_epi64(x1, t1); - x2 = _mm_add_epi64(x2, t2); - x3 = _mm_add_epi64(x3, t3); - x4 = _mm_add_epi64(x4, t4); - x5 = _mm_add_epi64(x5, t5); - x6 = _mm_add_epi64(x6, t6); - x7 = _mm_add_epi64(x7, t7); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - xmmp[4] = x4; - xmmp[5] = x5; - xmmp[6] = x6; - xmmp[7] = x7; - } -} - -#endif - -#if defined(SCRYPT_SALSA64_SSE2) - #undef SCRYPT_MIX - #define SCRYPT_MIX "Salsa64/8-SSE2" - #undef SCRYPT_SALSA64_INCLUDED - #define SCRYPT_SALSA64_INCLUDED -#endif - -/* sse3/avx use this as well */ -#if defined(SCRYPT_SALSA64_INCLUDED) - /* - Default layout: - 0 1 2 3 - 4 5 6 7 - 8 9 10 11 - 12 13 14 15 - - SSE2 layout: - 0 5 10 15 - 12 1 6 11 - 8 13 2 7 - 4 9 14 3 - */ - - - static void asm_calling_convention - salsa64_core_tangle_sse2(uint64_t *blocks, size_t count) { - uint64_t t; - while (count--) { - t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t; - t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t; - t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t; - t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t; - t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t; - t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t; - blocks += 16; - } - } -#endif \ No newline at end of file diff --git 
a/algo/scryptjane/scrypt-jane-mix_salsa64-ssse3.h b/algo/scryptjane/scrypt-jane-mix_salsa64-ssse3.h deleted file mode 100644 index bebfe5c..0000000 --- a/algo/scryptjane/scrypt-jane-mix_salsa64-ssse3.h +++ /dev/null @@ -1,399 +0,0 @@ -/* x64 */ -#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) - -#define SCRYPT_SALSA64_SSSE3 - -asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_ssse3) - a1(push rbp) - a2(mov rbp, rsp) - a2(and rsp, ~63) - a2(sub rsp, 128) - a2(lea rcx,[rcx*2]) - a2(shl rcx,7) - a2(lea r9,[rcx-128]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(movdqa xmm0,[rax+0]) - a2(movdqa xmm1,[rax+16]) - a2(movdqa xmm2,[rax+32]) - a2(movdqa xmm3,[rax+48]) - a2(movdqa xmm4,[rax+64]) - a2(movdqa xmm5,[rax+80]) - a2(movdqa xmm6,[rax+96]) - a2(movdqa xmm7,[rax+112]) - a1(jz scrypt_ChunkMix_ssse3_no_xor1) - a2(pxor xmm0,[r9+0]) - a2(pxor xmm1,[r9+16]) - a2(pxor xmm2,[r9+32]) - a2(pxor xmm3,[r9+48]) - a2(pxor xmm4,[r9+64]) - a2(pxor xmm5,[r9+80]) - a2(pxor xmm6,[r9+96]) - a2(pxor xmm7,[r9+112]) - a1(scrypt_ChunkMix_ssse3_no_xor1:) - a2(xor r9,r9) - a2(xor r8,r8) - a1(scrypt_ChunkMix_ssse3_loop:) - a2(and rdx, rdx) - a2(pxor xmm0,[rsi+r9+0]) - a2(pxor xmm1,[rsi+r9+16]) - a2(pxor xmm2,[rsi+r9+32]) - a2(pxor xmm3,[rsi+r9+48]) - a2(pxor xmm4,[rsi+r9+64]) - a2(pxor xmm5,[rsi+r9+80]) - a2(pxor xmm6,[rsi+r9+96]) - a2(pxor xmm7,[rsi+r9+112]) - a1(jz scrypt_ChunkMix_ssse3_no_xor2) - a2(pxor xmm0,[rdx+r9+0]) - a2(pxor xmm1,[rdx+r9+16]) - a2(pxor xmm2,[rdx+r9+32]) - a2(pxor xmm3,[rdx+r9+48]) - a2(pxor xmm4,[rdx+r9+64]) - a2(pxor xmm5,[rdx+r9+80]) - a2(pxor xmm6,[rdx+r9+96]) - a2(pxor xmm7,[rdx+r9+112]) - a1(scrypt_ChunkMix_ssse3_no_xor2:) - a2(movdqa [rsp+0],xmm0) - a2(movdqa [rsp+16],xmm1) - a2(movdqa [rsp+32],xmm2) - a2(movdqa [rsp+48],xmm3) - a2(movdqa 
[rsp+64],xmm4) - a2(movdqa [rsp+80],xmm5) - a2(movdqa [rsp+96],xmm6) - a2(movdqa [rsp+112],xmm7) - a2(mov rax,8) - a1(scrypt_salsa64_ssse3_loop: ) - a2(movdqa xmm8, xmm0) - a2(movdqa xmm9, xmm1) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm6, xmm8) - a2(pxor xmm7, xmm9) - a2(movdqa xmm10, xmm0) - a2(movdqa xmm11, xmm1) - a2(paddq xmm10, xmm6) - a2(paddq xmm11, xmm7) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 51) - a2(psrlq xmm11, 51) - a2(psllq xmm8, 13) - a2(psllq xmm9, 13) - a2(pxor xmm4, xmm10) - a2(pxor xmm5, xmm11) - a2(pxor xmm4, xmm8) - a2(pxor xmm5, xmm9) - a2(movdqa xmm10, xmm6) - a2(movdqa xmm11, xmm7) - a2(paddq xmm10, xmm4) - a2(paddq xmm11, xmm5) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 25) - a2(psrlq xmm11, 25) - a2(psllq xmm8, 39) - a2(psllq xmm9, 39) - a2(pxor xmm2, xmm10) - a2(pxor xmm3, xmm11) - a2(pxor xmm2, xmm8) - a2(pxor xmm3, xmm9) - a2(movdqa xmm8, xmm4) - a2(movdqa xmm9, xmm5) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm0, xmm8) - a2(pxor xmm1, xmm9) - a2(movdqa xmm10, xmm2) - a2(movdqa xmm11, xmm3) - a2(movdqa xmm2, xmm6) - a2(movdqa xmm3, xmm7) - a3(palignr xmm2, xmm7, 8) - a3(palignr xmm3, xmm6, 8) - a2(movdqa xmm6, xmm11) - a2(movdqa xmm7, xmm10) - a3(palignr xmm6, xmm10, 8) - a3(palignr xmm7, xmm11, 8) - a2(sub rax, 2) - a2(movdqa xmm8, xmm0) - a2(movdqa xmm9, xmm1) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm6, xmm8) - a2(pxor xmm7, xmm9) - a2(movdqa xmm10, xmm0) - a2(movdqa xmm11, xmm1) - a2(paddq xmm10, xmm6) - a2(paddq xmm11, xmm7) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 51) - a2(psrlq xmm11, 51) - a2(psllq xmm8, 13) - a2(psllq xmm9, 13) - a2(pxor xmm5, xmm10) - a2(pxor xmm4, xmm11) - a2(pxor xmm5, xmm8) - a2(pxor xmm4, xmm9) - 
a2(movdqa xmm10, xmm6) - a2(movdqa xmm11, xmm7) - a2(paddq xmm10, xmm5) - a2(paddq xmm11, xmm4) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 25) - a2(psrlq xmm11, 25) - a2(psllq xmm8, 39) - a2(psllq xmm9, 39) - a2(pxor xmm2, xmm10) - a2(pxor xmm3, xmm11) - a2(pxor xmm2, xmm8) - a2(pxor xmm3, xmm9) - a2(movdqa xmm8, xmm5) - a2(movdqa xmm9, xmm4) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm0, xmm8) - a2(pxor xmm1, xmm9) - a2(movdqa xmm10, xmm2) - a2(movdqa xmm11, xmm3) - a2(movdqa xmm2, xmm6) - a2(movdqa xmm3, xmm7) - a3(palignr xmm2, xmm7, 8) - a3(palignr xmm3, xmm6, 8) - a2(movdqa xmm6, xmm11) - a2(movdqa xmm7, xmm10) - a3(palignr xmm6, xmm10, 8) - a3(palignr xmm7, xmm11, 8) - a1(ja scrypt_salsa64_ssse3_loop) - a2(paddq xmm0,[rsp+0]) - a2(paddq xmm1,[rsp+16]) - a2(paddq xmm2,[rsp+32]) - a2(paddq xmm3,[rsp+48]) - a2(paddq xmm4,[rsp+64]) - a2(paddq xmm5,[rsp+80]) - a2(paddq xmm6,[rsp+96]) - a2(paddq xmm7,[rsp+112]) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0xff) - a2(add r9,128) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(movdqa [rax+0],xmm0) - a2(movdqa [rax+16],xmm1) - a2(movdqa [rax+32],xmm2) - a2(movdqa [rax+48],xmm3) - a2(movdqa [rax+64],xmm4) - a2(movdqa [rax+80],xmm5) - a2(movdqa [rax+96],xmm6) - a2(movdqa [rax+112],xmm7) - a1(jne scrypt_ChunkMix_ssse3_loop) - a2(mov rsp, rbp) - a1(pop rbp) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_ssse3) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSSE3) - -#define SCRYPT_SALSA64_SSSE3 - -static void asm_calling_convention -scrypt_ChunkMix_ssse3(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; - size_t 
rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - x4 = xmmp[4]; - x5 = xmmp[5]; - x6 = xmmp[6]; - x7 = xmmp[7]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - t4 = x4; - t5 = x5; - t6 = x6; - t7 = x7; - - for (rounds = 8; rounds; rounds -= 2) { - z0 = _mm_add_epi64(x0, x2); - z1 = _mm_add_epi64(x1, x3); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x6 = _mm_xor_si128(x6, z0); - x7 = _mm_xor_si128(x7, z1); - - z0 = _mm_add_epi64(x6, x0); - z1 = _mm_add_epi64(x7, x1); - z2 = _mm_srli_epi64(z0, 64-13); - z3 = _mm_srli_epi64(z1, 64-13); - z0 = _mm_slli_epi64(z0, 13); - z1 = _mm_slli_epi64(z1, 13); - x4 = _mm_xor_si128(x4, z2); - x5 = _mm_xor_si128(x5, z3); - x4 = _mm_xor_si128(x4, z0); - x5 = 
_mm_xor_si128(x5, z1); - - z0 = _mm_add_epi64(x4, x6); - z1 = _mm_add_epi64(x5, x7); - z2 = _mm_srli_epi64(z0, 64-39); - z3 = _mm_srli_epi64(z1, 64-39); - z0 = _mm_slli_epi64(z0, 39); - z1 = _mm_slli_epi64(z1, 39); - x2 = _mm_xor_si128(x2, z2); - x3 = _mm_xor_si128(x3, z3); - x2 = _mm_xor_si128(x2, z0); - x3 = _mm_xor_si128(x3, z1); - - z0 = _mm_add_epi64(x2, x4); - z1 = _mm_add_epi64(x3, x5); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x0 = _mm_xor_si128(x0, z0); - x1 = _mm_xor_si128(x1, z1); - - z0 = x2; - z1 = x3; - x2 = _mm_alignr_epi8(x6, x7, 8); - x3 = _mm_alignr_epi8(x7, x6, 8); - x6 = _mm_alignr_epi8(z1, z0, 8); - x7 = _mm_alignr_epi8(z0, z1, 8); - - z0 = _mm_add_epi64(x0, x2); - z1 = _mm_add_epi64(x1, x3); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x6 = _mm_xor_si128(x6, z0); - x7 = _mm_xor_si128(x7, z1); - - z0 = _mm_add_epi64(x6, x0); - z1 = _mm_add_epi64(x7, x1); - z2 = _mm_srli_epi64(z0, 64-13); - z3 = _mm_srli_epi64(z1, 64-13); - z0 = _mm_slli_epi64(z0, 13); - z1 = _mm_slli_epi64(z1, 13); - x5 = _mm_xor_si128(x5, z2); - x4 = _mm_xor_si128(x4, z3); - x5 = _mm_xor_si128(x5, z0); - x4 = _mm_xor_si128(x4, z1); - - z0 = _mm_add_epi64(x5, x6); - z1 = _mm_add_epi64(x4, x7); - z2 = _mm_srli_epi64(z0, 64-39); - z3 = _mm_srli_epi64(z1, 64-39); - z0 = _mm_slli_epi64(z0, 39); - z1 = _mm_slli_epi64(z1, 39); - x2 = _mm_xor_si128(x2, z2); - x3 = _mm_xor_si128(x3, z3); - x2 = _mm_xor_si128(x2, z0); - x3 = _mm_xor_si128(x3, z1); - - z0 = _mm_add_epi64(x2, x5); - z1 = _mm_add_epi64(x3, x4); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x0 = _mm_xor_si128(x0, z0); - x1 = _mm_xor_si128(x1, z1); - - z0 = x2; - z1 = x3; - x2 = _mm_alignr_epi8(x6, x7, 8); - x3 = _mm_alignr_epi8(x7, x6, 8); - x6 = _mm_alignr_epi8(z1, z0, 8); - x7 = _mm_alignr_epi8(z0, z1, 8); - } - - x0 = _mm_add_epi64(x0, 
t0); - x1 = _mm_add_epi64(x1, t1); - x2 = _mm_add_epi64(x2, t2); - x3 = _mm_add_epi64(x3, t3); - x4 = _mm_add_epi64(x4, t4); - x5 = _mm_add_epi64(x5, t5); - x6 = _mm_add_epi64(x6, t6); - x7 = _mm_add_epi64(x7, t7); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - xmmp[4] = x4; - xmmp[5] = x5; - xmmp[6] = x6; - xmmp[7] = x7; - } -} - -#endif - -#if defined(SCRYPT_SALSA64_SSSE3) - /* uses salsa64_core_tangle_sse2 */ - - #undef SCRYPT_MIX - #define SCRYPT_MIX "Salsa64/8-SSSE3" - #undef SCRYPT_SALSA64_INCLUDED - #define SCRYPT_SALSA64_INCLUDED -#endif diff --git a/algo/scryptjane/scrypt-jane-mix_salsa64.h b/algo/scryptjane/scrypt-jane-mix_salsa64.h deleted file mode 100644 index 2aec04f..0000000 --- a/algo/scryptjane/scrypt-jane-mix_salsa64.h +++ /dev/null @@ -1,41 +0,0 @@ -#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED) - -#undef SCRYPT_MIX -#define SCRYPT_MIX "Salsa64/8 Ref" - -#undef SCRYPT_SALSA64_INCLUDED -#define SCRYPT_SALSA64_INCLUDED -#define SCRYPT_SALSA64_BASIC - -static void -salsa64_core_basic(uint64_t state[16]) { - const size_t rounds = 8; - uint64_t v[16], t; - size_t i; - - for (i = 0; i < 16; i++) v[i] = state[i]; - - #define G(a,b,c,d) \ - t = v[a]+v[d]; t = ROTL64(t, 32); v[b] ^= t; \ - t = v[b]+v[a]; t = ROTL64(t, 13); v[c] ^= t; \ - t = v[c]+v[b]; t = ROTL64(t, 39); v[d] ^= t; \ - t = v[d]+v[c]; t = ROTL64(t, 32); v[a] ^= t; \ - - for (i = 0; i < rounds; i += 2) { - G( 0, 4, 8,12); - G( 5, 9,13, 1); - G(10,14, 2, 6); - G(15, 3, 7,11); - G( 0, 1, 2, 3); - G( 5, 6, 7, 4); - G(10,11, 8, 9); - G(15,12,13,14); - } - - for (i = 0; i < 16; i++) state[i] += v[i]; - - #undef G -} - -#endif - diff --git a/algo/scryptjane/scrypt-jane-pbkdf2.h b/algo/scryptjane/scrypt-jane-pbkdf2.h deleted file mode 100644 index 761b812..0000000 --- a/algo/scryptjane/scrypt-jane-pbkdf2.h +++ 
/dev/null @@ -1,161 +0,0 @@ -typedef struct scrypt_hmac_state_t { - scrypt_hash_state inner, outer; -} scrypt_hmac_state; - - -static void -scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) { - scrypt_hash_state st; - scrypt_hash_init(&st); - scrypt_hash_update(&st, m, mlen); - scrypt_hash_finish(&st, hash); -} - -/* hmac */ -static void -scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) { - uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0}; - size_t i; - - scrypt_hash_init(&st->inner); - scrypt_hash_init(&st->outer); - - if (keylen <= SCRYPT_HASH_BLOCK_SIZE) { - /* use the key directly if it's <= blocksize bytes */ - memcpy(pad, key, keylen); - } else { - /* if it's > blocksize bytes, hash it */ - scrypt_hash(pad, key, keylen); - } - - /* inner = (key ^ 0x36) */ - /* h(inner || ...) */ - for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) - pad[i] ^= 0x36; - scrypt_hash_update(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE); - - /* outer = (key ^ 0x5c) */ - /* h(outer || ...) */ - for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) - pad[i] ^= (0x5c ^ 0x36); - scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE); - -#ifdef SCRYPT_PREVENT_STATE_LEAK - scrypt_ensure_zero(pad, sizeof(pad)); -#endif -} - -static void -scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) { - /* h(inner || m...) 
*/ - scrypt_hash_update(&st->inner, m, mlen); -} - -static void -scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) { - /* h(inner || m) */ - scrypt_hash_digest innerhash; - scrypt_hash_finish(&st->inner, innerhash); - - /* h(outer || h(inner || m)) */ - scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash)); - scrypt_hash_finish(&st->outer, mac); - -#ifdef SCRYPT_PREVENT_STATE_LEAK - scrypt_ensure_zero(st, sizeof(*st)); -#endif -} - -static void -scrypt_pbkdf2(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *out, size_t bytes) { - scrypt_hmac_state hmac_pw, hmac_pw_salt, work; - scrypt_hash_digest ti, u; - uint8_t be[4]; - uint32_t i, j, blocks; - uint64_t c; - - /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ - - /* hmac(password, ...) */ - scrypt_hmac_init(&hmac_pw, password, password_len); - - /* hmac(password, salt...) */ - hmac_pw_salt = hmac_pw; - scrypt_hmac_update(&hmac_pw_salt, salt, salt_len); - - blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; - for (i = 1; i <= blocks; i++) { - /* U1 = hmac(password, salt || be(i)) */ - U32TO8_BE(be, i); - work = hmac_pw_salt; - scrypt_hmac_update(&work, be, 4); - scrypt_hmac_finish(&work, ti); - memcpy(u, ti, sizeof(u)); - - /* T[i] = U1 ^ U2 ^ U3... */ - for (c = 0; c < N - 1; c++) { - /* UX = hmac(password, U{X-1}) */ - work = hmac_pw; - scrypt_hmac_update(&work, u, SCRYPT_HASH_DIGEST_SIZE); - scrypt_hmac_finish(&work, u); - - /* T[i] ^= UX */ - for (j = 0; j < sizeof(u); j++) - ti[j] ^= u[j]; - } - - memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? 
SCRYPT_HASH_DIGEST_SIZE : bytes); - out += SCRYPT_HASH_DIGEST_SIZE; - bytes -= SCRYPT_HASH_DIGEST_SIZE; - } - -#ifdef SCRYPT_PREVENT_STATE_LEAK - scrypt_ensure_zero(ti, sizeof(ti)); - scrypt_ensure_zero(u, sizeof(u)); - scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw)); - scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt)); -#endif -} - -/* - * Special version where N = 1 - * - mikaelh - */ -static void -scrypt_pbkdf2_1(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out, size_t bytes) { - scrypt_hmac_state hmac_pw, hmac_pw_salt, work; - scrypt_hash_digest ti, u; - uint8_t be[4]; - uint32_t i, /*j,*/ blocks; - //uint64_t c; - - /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ - - /* hmac(password, ...) */ - scrypt_hmac_init(&hmac_pw, password, password_len); - - /* hmac(password, salt...) */ - hmac_pw_salt = hmac_pw; - scrypt_hmac_update(&hmac_pw_salt, salt, salt_len); - - blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; - for (i = 1; i <= blocks; i++) { - /* U1 = hmac(password, salt || be(i)) */ - U32TO8_BE(be, i); - work = hmac_pw_salt; - scrypt_hmac_update(&work, be, 4); - scrypt_hmac_finish(&work, ti); - memcpy(u, ti, sizeof(u)); - - memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? 
SCRYPT_HASH_DIGEST_SIZE : bytes); - out += SCRYPT_HASH_DIGEST_SIZE; - bytes -= SCRYPT_HASH_DIGEST_SIZE; - } - -#ifdef SCRYPT_PREVENT_STATE_LEAK - scrypt_ensure_zero(ti, sizeof(ti)); - scrypt_ensure_zero(u, sizeof(u)); - scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw)); - scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt)); -#endif -} diff --git a/algo/scryptjane/scrypt-jane-portable-x86.h b/algo/scryptjane/scrypt-jane-portable-x86.h deleted file mode 100644 index 29aaaae..0000000 --- a/algo/scryptjane/scrypt-jane-portable-x86.h +++ /dev/null @@ -1,393 +0,0 @@ -#if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC)) - #define X86ASM - /* gcc 2.95 royally screws up stack alignments on variables */ - #if (defined(COMPILER_MSVC6PP_AND_LATER) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000))) - #define X86ASM_SSE - #define X86ASM_SSE2 - #endif - #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= 1400)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102))) - #define X86ASM_SSSE3 - #endif - #if ((defined(COMPILER_GCC) && (COMPILER_GCC >= 40400))) - #define X86ASM_AVX - #endif -#endif - -#if defined(CPU_X86_64) && defined(COMPILER_GCC) - #define X86_64ASM - #define X86_64ASM_SSE2 - #if (COMPILER_GCC >= 40102) - #define X86_64ASM_SSSE3 - #endif - #if (COMPILER_GCC >= 40400) - #define X86_64ASM_AVX - #endif -#endif - -#if defined(COMPILER_MSVC) - #define X86_INTRINSIC - #if defined(CPU_X86_64) || defined(X86ASM_SSE) - #define X86_INTRINSIC_SSE - #endif - #if defined(CPU_X86_64) || defined(X86ASM_SSE2) - #define X86_INTRINSIC_SSE2 - #endif - #if (COMPILER_MSVC >= 1400) - #define X86_INTRINSIC_SSSE3 - #endif -#endif - -#if defined(COMPILER_MSVC) && defined(CPU_X86_64) - #define X86_64USE_INTRINSIC -#endif - -#if defined(COMPILER_MSVC) && defined(CPU_X86_64) - #define X86_64USE_INTRINSIC -#endif - -#ifdef __AVX__ -#define X86_INTRINSIC_AVX -#endif - -#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS) - #define X86_INTRINSIC - #if 
defined(__SSE__) - #define X86_INTRINSIC_SSE - #endif - #if defined(__SSE2__) - #define X86_INTRINSIC_SSE2 - #endif - #if defined(__SSSE3__) - #define X86_INTRINSIC_SSSE3 - #endif - #if defined(__AVX__) - #define X86_INTRINSIC_AVX - #endif - - /* HACK - I want to use CPU_X86_FORCE_INTRINSICS with mingw64 so these need to be undefined - mikaelh */ - #undef X86_64ASM_SSSE3 - #undef X86_64ASM_AVX - #undef X86_64ASM_SSE2 - #undef X86ASM_AVX - #undef X86ASM_SSSE3 - #undef X86ASM_SSE2 - #undef X86ASM_SSE -#endif - -/* only use simd on windows (or SSE2 on gcc)! */ -#if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC) - #if defined(X86_INTRINSIC_SSE) - #define X86_INTRINSIC - #include - #include - typedef __m64 qmm; - typedef __m128 xmm; - typedef __m128d xmmd; - #endif - #if defined(X86_INTRINSIC_SSE2) - #define X86_INTRINSIC_SSE2 - #include - typedef __m128i xmmi; - #endif - #if defined(X86_INTRINSIC_SSSE3) - #define X86_INTRINSIC_SSSE3 - #include - #endif - #if defined (X86_INTRINSIC_AVX) - #define X86_INTRINSIC_AVX - #include - #endif -#endif - - -#if defined(X86_INTRINSIC_SSE2) - typedef union packedelem8_t { - uint8_t u[16]; - xmmi v; - } packedelem8; - - typedef union packedelem32_t { - uint32_t u[4]; - xmmi v; - } packedelem32; - - typedef union packedelem64_t { - uint64_t u[2]; - xmmi v; - } packedelem64; -#else - typedef union packedelem8_t { - uint8_t u[16]; - uint32_t dw[4]; - } packedelem8; - - typedef union packedelem32_t { - uint32_t u[4]; - uint8_t b[16]; - } packedelem32; - - typedef union packedelem64_t { - uint64_t u[2]; - uint8_t b[16]; - } packedelem64; -#endif - -#if defined(X86_INTRINSIC_SSSE3) - static const packedelem8 MM16 ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}}; - static const packedelem8 MM16 ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}}; -#endif - -/* - x86 inline asm for gcc/msvc. usage: - - asm_naked_fn_proto(return_type, name) (type parm1, type parm2..) - asm_naked_fn(name) - a1(..) 
- a2(.., ..) - a3(.., .., ..) - 64bit OR 0 paramters: a1(ret) - 32bit AND n parameters: aret(4n), eg aret(16) for 4 parameters - asm_naked_fn_end(name) -*/ - -#if defined(X86ASM) || defined(X86_64ASM) - -#if defined(COMPILER_MSVC) - #pragma warning(disable : 4731) /* frame pointer modified by inline assembly */ - #define a1(x) __asm {x} - #define a2(x, y) __asm {x, y} - #define a3(x, y, z) __asm {x, y, z} - #define a4(x, y, z, w) __asm {x, y, z, w} - #define al(x) __asm {label##x:} - #define aj(x, y, z) __asm {x label##y} - #define asm_align8 a1(ALIGN 8) - #define asm_align16 a1(ALIGN 16) - - #define asm_calling_convention STDCALL - #define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn - #define asm_naked_fn(fn) { - #define asm_naked_fn_end(fn) } -#elif defined(COMPILER_GCC) - #define GNU_AS1(x) #x ";\n" - #define GNU_AS2(x, y) #x ", " #y ";\n" - #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n" - #define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n" - #define GNU_ASL(x) "\n" #x ":\n" - #define GNU_ASFN(x) "\n_" #x ":\n" #x ":\n" - #define GNU_ASJ(x, y, z) #x " " #y #z ";" - - #define a1(x) GNU_AS1(x) - #define a2(x, y) GNU_AS2(x, y) - #define a3(x, y, z) GNU_AS3(x, y, z) - #define a4(x, y, z, w) GNU_AS4(x, y, z, w) - #define al(x) GNU_ASL(x) - #define aj(x, y, z) GNU_ASJ(x, y, z) - #define asm_align8 a1(.align 8) - #define asm_align16 a1(.align 16) - - #if defined(OS_WINDOWS) - #define asm_calling_convention CDECL - #define aret(n) a1(ret) - #define asm_naked_fn_end(fn) ".att_syntax prefix;\n" ); - #else - #define asm_calling_convention STDCALL - #define aret(n) a1(ret n) - #define asm_naked_fn_end(fn) ".att_syntax prefix;\n.type " #fn ",@function\n.size " #fn ",.-" #fn "\n" ); - #endif - #define asm_naked_fn_proto(type, fn) extern type asm_calling_convention fn - #define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn) - - #define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n" - 
#define asm_gcc_parms() ".att_syntax prefix;" - #define asm_gcc_trashed() __asm__ __volatile__("" ::: - #define asm_gcc_end() ); -#else - need x86 asm -#endif - -#endif /* X86ASM || X86_64ASM */ - - -#if defined(CPU_X86) || defined(CPU_X86_64) - -typedef enum cpu_flags_x86_t { - cpu_mmx = 1 << 0, - cpu_sse = 1 << 1, - cpu_sse2 = 1 << 2, - cpu_sse3 = 1 << 3, - cpu_ssse3 = 1 << 4, - cpu_sse4_1 = 1 << 5, - cpu_sse4_2 = 1 << 6, - cpu_avx = 1 << 7 -} cpu_flags_x86; - -typedef enum cpu_vendors_x86_t { - cpu_nobody, - cpu_intel, - cpu_amd -} cpu_vendors_x86; - -typedef struct x86_regs_t { - uint32_t eax, ebx, ecx, edx; -} x86_regs; - -#if defined(X86ASM) -asm_naked_fn_proto(int, has_cpuid)(void) -asm_naked_fn(has_cpuid) - a1(pushfd) - a1(pop eax) - a2(mov ecx, eax) - a2(xor eax, 0x200000) - a1(push eax) - a1(popfd) - a1(pushfd) - a1(pop eax) - a2(xor eax, ecx) - a2(shr eax, 21) - a2(and eax, 1) - a1(push ecx) - a1(popfd) - a1(ret) -asm_naked_fn_end(has_cpuid) -#endif /* X86ASM */ - - -static void NOINLINE -get_cpuid(x86_regs *regs, uint32_t flags) { -#if defined(COMPILER_MSVC) - __cpuid((int *)regs, (int)flags); -#else - #if defined(CPU_X86_64) - #define cpuid_bx rbx - #else - #define cpuid_bx ebx - #endif - - asm_gcc() - a1(push cpuid_bx) - a1(cpuid) - a2(mov [%1 + 0], eax) - a2(mov [%1 + 4], ebx) - a2(mov [%1 + 8], ecx) - a2(mov [%1 + 12], edx) - a1(pop cpuid_bx) - asm_gcc_parms() : "+a"(flags) : "S"(regs) : "%ecx", "%edx", "cc" - asm_gcc_end() -#endif -} - -#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) -static uint64_t NOINLINE -get_xgetbv(uint32_t flags) { -#if defined(COMPILER_MSVC) - return _xgetbv(flags); -#else - uint32_t lo, hi; - asm_gcc() - a1(xgetbv) - asm_gcc_parms() : "+c"(flags), "=a" (lo), "=d" (hi) - asm_gcc_end() - return ((uint64_t)lo | ((uint64_t)hi << 32)); -#endif -} -#endif // AVX support - -#if defined(SCRYPT_TEST_SPEED) -size_t cpu_detect_mask = (size_t)-1; -#endif - -#if 0 -static size_t -detect_cpu(void) { - union { uint8_t s[12]; uint32_t 
i[3]; } vendor_string; - cpu_vendors_x86 vendor = cpu_nobody; - x86_regs regs; - uint32_t max_level; - size_t cpu_flags = 0; -#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) - uint64_t xgetbv_flags; -#endif - -#if defined(CPU_X86) - if (!has_cpuid()) - return cpu_flags; -#endif - - get_cpuid(®s, 0); - max_level = regs.eax; - vendor_string.i[0] = regs.ebx; - vendor_string.i[1] = regs.edx; - vendor_string.i[2] = regs.ecx; - - if (scrypt_verify(vendor_string.s, (const uint8_t *)"GenuineIntel", 12)) - vendor = cpu_intel; - else if (scrypt_verify(vendor_string.s, (const uint8_t *)"AuthenticAMD", 12)) - vendor = cpu_amd; - - if (max_level & 0x00000500) { - /* "Intel P5 pre-B0" */ - cpu_flags |= cpu_mmx; - return cpu_flags; - } - - if (max_level < 1) - return cpu_flags; - - get_cpuid(®s, 1); -#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) - /* xsave/xrestore */ - if (regs.ecx & (1 << 27)) { - xgetbv_flags = get_xgetbv(0); - if ((regs.ecx & (1 << 28)) && (xgetbv_flags & 0x6)) cpu_flags |= cpu_avx; - } -#endif - if (regs.ecx & (1 << 20)) cpu_flags |= cpu_sse4_2; - if (regs.ecx & (1 << 19)) cpu_flags |= cpu_sse4_2; - if (regs.ecx & (1 << 9)) cpu_flags |= cpu_ssse3; - if (regs.ecx & (1 )) cpu_flags |= cpu_sse3; - if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2; - if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse; - if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx; - -#if defined(SCRYPT_TEST_SPEED) - cpu_flags &= cpu_detect_mask; -#endif - - return cpu_flags; -} -#endif - -#if defined(SCRYPT_TEST_SPEED) -static const char * -get_top_cpuflag_desc(size_t flag) { - if (flag & cpu_avx) return "AVX"; - else if (flag & cpu_sse4_2) return "SSE4.2"; - else if (flag & cpu_sse4_1) return "SSE4.1"; - else if (flag & cpu_ssse3) return "SSSE3"; - else if (flag & cpu_sse2) return "SSE2"; - else if (flag & cpu_sse) return "SSE"; - else if (flag & cpu_mmx) return "MMX"; - else return "Basic"; -} -#endif - -/* enable the highest system-wide option */ -#if defined(SCRYPT_CHOOSE_COMPILETIME) - 
#if !defined(__AVX__) - #undef X86_64ASM_AVX - #undef X86ASM_AVX - #undef X86_INTRINSIC_AVX - #endif - #if !defined(__SSSE3__) - #undef X86_64ASM_SSSE3 - #undef X86ASM_SSSE3 - #undef X86_INTRINSIC_SSSE3 - #endif - #if !defined(__SSE2__) - #undef X86_64ASM_SSE2 - #undef X86ASM_SSE2 - #undef X86_INTRINSIC_SSE2 - #endif -#endif - -#endif /* defined(CPU_X86) || defined(CPU_X86_64) */ diff --git a/algo/scryptjane/scrypt-jane-portable.h b/algo/scryptjane/scrypt-jane-portable.h deleted file mode 100644 index 939fc98..0000000 --- a/algo/scryptjane/scrypt-jane-portable.h +++ /dev/null @@ -1,280 +0,0 @@ -/* determine os */ -#if defined(_WIN32) || defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__) - #include - #include - #define OS_WINDOWS -#elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__) - #include - #include - #include - - #define OS_SOLARIS -#else - #include - #include - #include /* need this to define BSD */ - #include - #include - - #define OS_NIX - #if defined(__linux__) - #include - #define OS_LINUX - #elif defined(BSD) - #define OS_BSD - - #if defined(MACOS_X) || (defined(__APPLE__) & defined(__MACH__)) - #define OS_OSX - #elif defined(macintosh) || defined(Macintosh) - #define OS_MAC - #elif defined(__OpenBSD__) - #define OS_OPENBSD - #endif - #endif -#endif - - -/* determine compiler */ -#if defined(_MSC_VER) - #define COMPILER_MSVC _MSC_VER - #if ((COMPILER_MSVC > 1200) || defined(_mm_free)) - #define COMPILER_MSVC6PP_AND_LATER - #endif - #if (COMPILER_MSVC >= 1500) - #define COMPILER_HAS_TMMINTRIN - #endif - - #pragma warning(disable : 4127) /* conditional expression is constant */ - #pragma warning(disable : 4100) /* unreferenced formal parameter */ - - #include - #include /* _rotl */ - #include - - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; - typedef signed int int32_t; - typedef unsigned __int64 uint64_t; - typedef signed __int64 int64_t; - - #define ROTL32(a,b) 
_rotl(a,b) - #define ROTR32(a,b) _rotr(a,b) - #define ROTL64(a,b) _rotl64(a,b) - #define ROTR64(a,b) _rotr64(a,b) - #undef NOINLINE - #define NOINLINE __declspec(noinline) - #undef INLINE - #define INLINE __forceinline - #undef FASTCALL - #define FASTCALL __fastcall - #undef CDECL - #define CDECL __cdecl - #undef STDCALL - #define STDCALL __stdcall - #undef NAKED - #define NAKED __declspec(naked) - #define MM16 __declspec(align(16)) -#endif -#if defined(__ICC) - #define COMPILER_INTEL -#endif -#if defined(__GNUC__) - #if (__GNUC__ >= 3) - #define COMPILER_GCC_PATCHLEVEL __GNUC_PATCHLEVEL__ - #else - #define COMPILER_GCC_PATCHLEVEL 0 - #endif - #define COMPILER_GCC (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + COMPILER_GCC_PATCHLEVEL) - #define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) - #define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) - #define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b))) - #define ROTR64(a,b) (((a) >> (b)) | ((a) << (64 - b))) - #undef NOINLINE - #if (COMPILER_GCC >= 30000) - #define NOINLINE __attribute__((noinline)) - #else - #define NOINLINE - #endif - #undef INLINE - #if (COMPILER_GCC >= 30000) - #define INLINE __attribute__((always_inline)) - #else - #define INLINE inline - #endif - #undef FASTCALL - #if (COMPILER_GCC >= 30400) - #define FASTCALL __attribute__((fastcall)) - #else - #define FASTCALL - #endif - #undef CDECL - #define CDECL __attribute__((cdecl)) - #undef STDCALL - #define STDCALL __attribute__((stdcall)) - #define MM16 __attribute__((aligned(16))) - #include -#endif -#if defined(__MINGW32__) || defined(__MINGW64__) - #define COMPILER_MINGW -#endif -#if defined(__PATHCC__) - #define COMPILER_PATHCC -#endif - -#define OPTIONAL_INLINE -#if defined(OPTIONAL_INLINE) - #undef OPTIONAL_INLINE - #define OPTIONAL_INLINE INLINE -#else - #define OPTIONAL_INLINE -#endif - -#define CRYPTO_FN NOINLINE STDCALL - -/* determine cpu */ -#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64) - #define 
CPU_X86_64 -#elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500)) - #define CPU_X86 500 -#elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400)) - #define CPU_X86 400 -#elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__) - #define CPU_X86 300 -#elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64) - #define CPU_IA64 -#endif - -#if defined(__sparc__) || defined(__sparc) || defined(__sparcv9) - #define CPU_SPARC - #if defined(__sparcv9) - #define CPU_SPARC64 - #endif -#endif - -#if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64)) - #define CPU_64BITS - #undef FASTCALL - #define FASTCALL - #undef CDECL - #define CDECL - #undef STDCALL - #define STDCALL -#endif - -#if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC) - #define CPU_PPC - #if defined(_ARCH_PWR7) - #define CPU_POWER7 - #elif defined(__64BIT__) - #define CPU_PPC64 - #else - #define CPU_PPC32 - #endif -#endif - -#if defined(__hppa__) || defined(__hppa) - #define CPU_HPPA -#endif - -#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA) - #define CPU_ALPHA -#endif - -/* endian */ - -#if ((defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \ - (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \ - (defined(CPU_X86) || defined(CPU_X86_64)) || \ - (defined(vax) || defined(MIPSEL) || defined(_MIPSEL))) -#define CPU_LE -#elif ((defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN)) || \ - (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \ - (defined(CPU_SPARC) || defined(CPU_PPC) || 
defined(mc68000) || defined(sel)) || defined(_MIPSEB)) -#define CPU_BE -#else - /* unknown endian! */ -#endif - - -#define U8TO32_BE(p) \ - (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ - ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) )) - -#define U8TO32_LE(p) \ - (((uint32_t)((p)[0]) ) | ((uint32_t)((p)[1]) << 8) | \ - ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24)) - -#define U32TO8_BE(p, v) \ - (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ - (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); - -#define U32TO8_LE(p, v) \ - (p)[0] = (uint8_t)((v) ); (p)[1] = (uint8_t)((v) >> 8); \ - (p)[2] = (uint8_t)((v) >> 16); (p)[3] = (uint8_t)((v) >> 24); - -#define U8TO64_BE(p) \ - (((uint64_t)U8TO32_BE(p) << 32) | (uint64_t)U8TO32_BE((p) + 4)) - -#define U8TO64_LE(p) \ - (((uint64_t)U8TO32_LE(p)) | ((uint64_t)U8TO32_LE((p) + 4) << 32)) - -#define U64TO8_BE(p, v) \ - U32TO8_BE((p), (uint32_t)((v) >> 32)); \ - U32TO8_BE((p) + 4, (uint32_t)((v) )); - -#define U64TO8_LE(p, v) \ - U32TO8_LE((p), (uint32_t)((v) )); \ - U32TO8_LE((p) + 4, (uint32_t)((v) >> 32)); - -#define U32_SWAP(v) { \ - (v) = (((v) << 8) & 0xFF00FF00 ) | (((v) >> 8) & 0xFF00FF ); \ - (v) = ((v) << 16) | ((v) >> 16); \ -} - -#define U64_SWAP(v) { \ - (v) = (((v) << 8) & 0xFF00FF00FF00FF00ull ) | (((v) >> 8) & 0x00FF00FF00FF00FFull ); \ - (v) = (((v) << 16) & 0xFFFF0000FFFF0000ull ) | (((v) >> 16) & 0x0000FFFF0000FFFFull ); \ - (v) = ((v) << 32) | ((v) >> 32); \ -} - -static int -scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) { - uint32_t differentbits = 0; - while (len--) - differentbits |= (*x++ ^ *y++); - return (1 & ((differentbits - 1) >> 8)); -} - -void -scrypt_ensure_zero(void *p, size_t len) { -#if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC)) - __stosb((unsigned char *)p, 0, len); -#elif (defined(CPU_X86) && defined(COMPILER_GCC)) - __asm__ __volatile__( - "pushl %%edi;\n" - "pushl %%ecx;\n" - "rep stosb;\n" - "popl 
%%ecx;\n" - "popl %%edi;\n" - :: "a"(0), "D"(p), "c"(len) : "cc", "memory" - ); -#elif (defined(CPU_X86_64) && defined(COMPILER_GCC)) - __asm__ __volatile__( - "pushq %%rdi;\n" - "pushq %%rcx;\n" - "rep stosb;\n" - "popq %%rcx;\n" - "popq %%rdi;\n" - :: "a"(0), "D"(p), "c"(len) : "cc", "memory" - ); -#else - volatile uint8_t *b = (volatile uint8_t *)p; - size_t i; - for (i = 0; i < len; i++) - b[i] = 0; -#endif -} - -#include "scrypt-jane-portable-x86.h" - diff --git a/algo/scryptjane/scrypt-jane-romix-basic.h b/algo/scryptjane/scrypt-jane-romix-basic.h deleted file mode 100644 index a464e04..0000000 --- a/algo/scryptjane/scrypt-jane-romix-basic.h +++ /dev/null @@ -1,67 +0,0 @@ -#if !defined(SCRYPT_CHOOSE_COMPILETIME) -/* function type returned by scrypt_getROMix, used with cpu detection */ -typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r); -#endif - -/* romix pre/post nop function */ -static void /* asm_calling_convention */ -scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) { -} - -/* romix pre/post endian conversion function */ -static void /* asm_calling_convention */ -scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) { -#if !defined(CPU_LE) - static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}}; - size_t i; - if (endian_test.w == 0x100) { - nblocks *= SCRYPT_BLOCK_WORDS; - for (i = 0; i < nblocks; i++) { - SCRYPT_WORD_ENDIAN_SWAP(blocks[i]); - } - } -#endif -} - -/* chunkmix test function */ -typedef void (*chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r); -typedef void (*blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks); - -static int -scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) { - /* r = 2, (2 * r) = 4 blocks in a 
chunk, 4 * SCRYPT_BLOCK_WORDS total */ - const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS; - scrypt_mix_word_t MM16 chunk[2][4 * SCRYPT_BLOCK_WORDS], v; - uint8_t final[16]; - size_t i; - - for (i = 0; i < words; i++) { - v = (scrypt_mix_word_t)i; - v = (v << 8) | v; - v = (v << 16) | v; - chunk[0][i] = v; - } - - prefn(chunk[0], blocks); - mixfn(chunk[1], chunk[0], NULL, r); - postfn(chunk[1], blocks); - - /* grab the last 16 bytes of the final block */ - for (i = 0; i < 16; i += sizeof(scrypt_mix_word_t)) { - SCRYPT_WORDTO8_LE(final + i, chunk[1][words - (16 / sizeof(scrypt_mix_word_t)) + (i / sizeof(scrypt_mix_word_t))]); - } - - return scrypt_verify(expected, final, 16); -} - -/* returns a pointer to item i, where item is len scrypt_mix_word_t's long */ -static scrypt_mix_word_t * -scrypt_item(scrypt_mix_word_t *base, scrypt_mix_word_t i, scrypt_mix_word_t len) { - return base + (i * len); -} - -/* returns a pointer to block i */ -static scrypt_mix_word_t * -scrypt_block(scrypt_mix_word_t *base, scrypt_mix_word_t i) { - return base + (i * SCRYPT_BLOCK_WORDS); -} diff --git a/algo/scryptjane/scrypt-jane-romix-template.h b/algo/scryptjane/scrypt-jane-romix-template.h deleted file mode 100644 index 53236e4..0000000 --- a/algo/scryptjane/scrypt-jane-romix-template.h +++ /dev/null @@ -1,179 +0,0 @@ -#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) - -#if defined(SCRYPT_CHOOSE_COMPILETIME) -#undef SCRYPT_ROMIX_FN -#define SCRYPT_ROMIX_FN scrypt_ROMix -#endif - -#undef SCRYPT_HAVE_ROMIX -#define SCRYPT_HAVE_ROMIX - -#if !defined(SCRYPT_CHUNKMIX_FN) - -#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_basic - -/* - Bout = ChunkMix(Bin) - - 2*r: number of blocks in the chunk -*/ -static void /* asm_calling_convention */ -SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) { - scrypt_mix_word_t MM16 X[SCRYPT_BLOCK_WORDS], *block; - 
uint32_t i, j, blocksPerChunk = r * 2, half = 0; - - /* 1: X = B_{2r - 1} */ - block = scrypt_block(Bin, blocksPerChunk - 1); - for (i = 0; i < SCRYPT_BLOCK_WORDS; i++) - X[i] = block[i]; - - if (Bxor) { - block = scrypt_block(Bxor, blocksPerChunk - 1); - for (i = 0; i < SCRYPT_BLOCK_WORDS; i++) - X[i] ^= block[i]; - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - block = scrypt_block(Bin, i); - for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) - X[j] ^= block[j]; - - if (Bxor) { - block = scrypt_block(Bxor, i); - for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) - X[j] ^= block[j]; - } - SCRYPT_MIX_FN(X); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - block = scrypt_block(Bout, (i / 2) + half); - for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) - block[j] = X[j]; - } -} -#endif - -/* - X = ROMix(X) - - X: chunk to mix - Y: scratch chunk - N: number of rounds - V[N]: array of chunks to randomly index in to - 2*r: number of blocks in a chunk -*/ - -static void NOINLINE FASTCALL -SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N, uint32_t r) { - uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2; - scrypt_mix_word_t *block = V; - - SCRYPT_ROMIX_TANGLE_FN(X, r * 2); - - /* 1: X = B */ - /* implicit */ - - /* 2: for i = 0 to N - 1 do */ - memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t)); - for (i = 0; i < N - 1; i++, block += chunkWords) { - /* 3: V_i = X */ - /* 4: X = H(X) */ - SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r); - } - SCRYPT_CHUNKMIX_FN(X, block, NULL, r); - - /* 6: for i = 0 to N - 1 do */ - for (i = 0; i < N; i += 2) { - /* 7: j = Integerify(X) % N */ - j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1); - - /* 8: X = H(Y ^ V_j) */ - SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r); - - /* 7: j = Integerify(Y) % N */ - j = Y[chunkWords - 
SCRYPT_BLOCK_WORDS] & (N - 1); - - /* 8: X = H(Y ^ V_j) */ - SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r); - } - - /* 10: B' = X */ - /* implicit */ - - SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2); -} - -/* - * Special version with hard-coded r = 1 - * - mikaelh - */ -static void NOINLINE FASTCALL -scrypt_ROMix_1(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N) { - const uint32_t r = 1; - uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2; - scrypt_mix_word_t *block = V; - - SCRYPT_ROMIX_TANGLE_FN(X, r * 2); - - /* 1: X = B */ - /* implicit */ - - /* 2: for i = 0 to N - 1 do */ - memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t)); - for (i = 0; i < N - 1; i++, block += chunkWords) { - /* 3: V_i = X */ - /* 4: X = H(X) */ -#ifdef SCRYPT_CHUNKMIX_1_FN - SCRYPT_CHUNKMIX_1_FN(block + chunkWords, block); -#else - SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r); -#endif - } -#ifdef SCRYPT_CHUNKMIX_1_FN - SCRYPT_CHUNKMIX_1_FN(X, block); -#else - SCRYPT_CHUNKMIX_FN(X, block, NULL, r); -#endif - - /* 6: for i = 0 to N - 1 do */ - for (i = 0; i < N; i += 2) { - /* 7: j = Integerify(X) % N */ - j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1); - - /* 8: X = H(Y ^ V_j) */ -#ifdef SCRYPT_CHUNKMIX_1_XOR_FN - SCRYPT_CHUNKMIX_1_XOR_FN(Y, X, scrypt_item(V, j, chunkWords)); -#else - SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r); -#endif - - /* 7: j = Integerify(Y) % N */ - j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1); - - /* 8: X = H(Y ^ V_j) */ -#ifdef SCRYPT_CHUNKMIX_1_XOR_FN - SCRYPT_CHUNKMIX_1_XOR_FN(X, Y, scrypt_item(V, j, chunkWords)); -#else - SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r); -#endif - } - - /* 10: B' = X */ - /* implicit */ - - SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2); -} - -#endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */ - - -#undef SCRYPT_CHUNKMIX_FN -#undef SCRYPT_ROMIX_FN -#undef SCRYPT_MIX_FN -#undef 
SCRYPT_ROMIX_TANGLE_FN -#undef SCRYPT_ROMIX_UNTANGLE_FN - diff --git a/algo/scryptjane/scrypt-jane-romix.h b/algo/scryptjane/scrypt-jane-romix.h deleted file mode 100644 index faa655a..0000000 --- a/algo/scryptjane/scrypt-jane-romix.h +++ /dev/null @@ -1,27 +0,0 @@ -#if defined(SCRYPT_CHACHA) -#include "scrypt-jane-chacha.h" -#elif defined(SCRYPT_SALSA) -#include "scrypt-jane-salsa.h" -#elif defined(SCRYPT_SALSA64) -#include "scrypt-jane-salsa64.h" -#else - #define SCRYPT_MIX_BASE "ERROR" - typedef uint32_t scrypt_mix_word_t; - #define SCRYPT_WORDTO8_LE U32TO8_LE - #define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP - #define SCRYPT_BLOCK_BYTES 64 - #define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) - #if !defined(SCRYPT_CHOOSE_COMPILETIME) - static void FASTCALL scrypt_ROMix_error(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r) {} - static scrypt_ROMixfn scrypt_getROMix() { return scrypt_ROMix_error; } - #else - static void FASTCALL scrypt_ROMix(scrypt_mix_word_t *X, scrypt_mix_word_t *Y, scrypt_mix_word_t *V, uint32_t N, uint32_t r) {} - #endif - static int scrypt_test_mix() { return 0; } - #error must define a mix function! 
-#endif - -#if !defined(SCRYPT_CHOOSE_COMPILETIME) -#undef SCRYPT_MIX -#define SCRYPT_MIX SCRYPT_MIX_BASE -#endif diff --git a/algo/scryptjane/scrypt-jane-salsa.h b/algo/scryptjane/scrypt-jane-salsa.h deleted file mode 100644 index 76f3da6..0000000 --- a/algo/scryptjane/scrypt-jane-salsa.h +++ /dev/null @@ -1,109 +0,0 @@ -#define SCRYPT_MIX_BASE "Salsa20/8" - -typedef uint32_t scrypt_mix_word_t; - -#define SCRYPT_WORDTO8_LE U32TO8_LE -#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP - -#define SCRYPT_BLOCK_BYTES 64 -#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) - -/* must have these here in case block bytes is ever != 64 */ -#include "scrypt-jane-romix-basic.h" - -#include "scrypt-jane-mix_salsa-avx.h" -#include "scrypt-jane-mix_salsa-sse2.h" -#include "scrypt-jane-mix_salsa.h" - -#if defined(SCRYPT_SALSA_AVX) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx - #define SCRYPT_ROMIX_FN scrypt_ROMix_avx - #define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2 - #define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2 - #include "scrypt-jane-romix-template.h" -#endif - -#if defined(SCRYPT_SALSA_SSE2) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2 - #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2 - #define SCRYPT_MIX_FN salsa_core_sse2 - #define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2 - #define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2 - #include "scrypt-jane-romix-template.h" -#endif - -/* cpu agnostic */ -#define SCRYPT_ROMIX_FN scrypt_ROMix_basic -#define SCRYPT_MIX_FN salsa_core_basic -#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian -#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian -#include "scrypt-jane-romix-template.h" - -#if !defined(SCRYPT_CHOOSE_COMPILETIME) -static scrypt_ROMixfn -scrypt_getROMix() { - size_t cpuflags = detect_cpu(); - -#if defined(SCRYPT_SALSA_AVX) - if (cpuflags & cpu_avx) - return scrypt_ROMix_avx; - else -#endif - -#if defined(SCRYPT_SALSA_SSE2) - if (cpuflags & cpu_sse2) - return 
scrypt_ROMix_sse2; - else -#endif - - return scrypt_ROMix_basic; -} -#endif - - -#if defined(SCRYPT_TEST_SPEED) -static size_t -available_implementations() { - size_t cpuflags = detect_cpu(); - size_t flags = 0; - -#if defined(SCRYPT_SALSA_AVX) - if (cpuflags & cpu_avx) - flags |= cpu_avx; -#endif - -#if defined(SCRYPT_SALSA_SSE2) - if (cpuflags & cpu_sse2) - flags |= cpu_sse2; -#endif - - return flags; -} -#endif - - -static int -scrypt_test_mix() { - static const uint8_t expected[16] = { - 0x41,0x1f,0x2e,0xa3,0xab,0xa3,0x1a,0x34,0x87,0x1d,0x8a,0x1c,0x76,0xa0,0x27,0x66, - }; - - int ret = 1; - size_t cpuflags = detect_cpu(); - -#if defined(SCRYPT_SALSA_AVX) - if (cpuflags & cpu_avx) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected); -#endif - -#if defined(SCRYPT_SALSA_SSE2) - if (cpuflags & cpu_sse2) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected); -#endif - -#if defined(SCRYPT_SALSA_BASIC) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); -#endif - - return ret; -} diff --git a/algo/scryptjane/scrypt-jane-salsa64.h b/algo/scryptjane/scrypt-jane-salsa64.h deleted file mode 100644 index ecc87f5..0000000 --- a/algo/scryptjane/scrypt-jane-salsa64.h +++ /dev/null @@ -1,133 +0,0 @@ -#define SCRYPT_MIX_BASE "Salsa64/8" - -typedef uint64_t scrypt_mix_word_t; - -#define SCRYPT_WORDTO8_LE U64TO8_LE -#define SCRYPT_WORD_ENDIAN_SWAP U64_SWAP - -#define SCRYPT_BLOCK_BYTES 128 -#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) - -/* must have these here in case block bytes is ever != 64 */ -#include "scrypt-jane-romix-basic.h" - -#include "scrypt-jane-mix_salsa64-avx.h" -#include "scrypt-jane-mix_salsa64-ssse3.h" -#include "scrypt-jane-mix_salsa64-sse2.h" -#include "scrypt-jane-mix_salsa64.h" - -#if defined(SCRYPT_SALSA64_AVX) - #define 
SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx - #define SCRYPT_ROMIX_FN scrypt_ROMix_avx - #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 - #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 - #include "scrypt-jane-romix-template.h" -#endif - -#if defined(SCRYPT_SALSA64_SSSE3) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3 - #define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3 - #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 - #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 - #include "scrypt-jane-romix-template.h" -#endif - -#if defined(SCRYPT_SALSA64_SSE2) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2 - #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2 - #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 - #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 - #include "scrypt-jane-romix-template.h" -#endif - -/* cpu agnostic */ -#define SCRYPT_ROMIX_FN scrypt_ROMix_basic -#define SCRYPT_MIX_FN salsa64_core_basic -#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian -#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian -#include "scrypt-jane-romix-template.h" - -#if !defined(SCRYPT_CHOOSE_COMPILETIME) -static scrypt_ROMixfn -scrypt_getROMix() { - size_t cpuflags = detect_cpu(); - -#if defined(SCRYPT_SALSA64_AVX) - if (cpuflags & cpu_avx) - return scrypt_ROMix_avx; - else -#endif - -#if defined(SCRYPT_SALSA64_SSSE3) - if (cpuflags & cpu_ssse3) - return scrypt_ROMix_ssse3; - else -#endif - -#if defined(SCRYPT_SALSA64_SSE2) - if (cpuflags & cpu_sse2) - return scrypt_ROMix_sse2; - else -#endif - - return scrypt_ROMix_basic; -} -#endif - - -#if defined(SCRYPT_TEST_SPEED) -static size_t -available_implementations() { - size_t cpuflags = detect_cpu(); - size_t flags = 0; - -#if defined(SCRYPT_SALSA64_AVX) - if (cpuflags & cpu_avx) - flags |= cpu_avx; -#endif - -#if defined(SCRYPT_SALSA64_SSSE3) - if (cpuflags & cpu_ssse3) - flags |= cpu_ssse3; -#endif - -#if defined(SCRYPT_SALSA64_SSE2) - if (cpuflags & cpu_sse2) - flags |= 
cpu_sse2; -#endif - - return flags; -} -#endif - -static int -scrypt_test_mix() { - static const uint8_t expected[16] = { - 0xf8,0x92,0x9b,0xf8,0xcc,0x1d,0xce,0x2e,0x13,0x82,0xac,0x96,0xb2,0x6c,0xee,0x2c, - }; - - int ret = 1; - size_t cpuflags = detect_cpu(); - -#if defined(SCRYPT_SALSA64_AVX) - if (cpuflags & cpu_avx) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); -#endif - -#if defined(SCRYPT_SALSA64_SSSE3) - if (cpuflags & cpu_ssse3) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); -#endif - -#if defined(SCRYPT_SALSA64_SSE2) - if (cpuflags & cpu_sse2) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); -#endif - -#if defined(SCRYPT_SALSA64_BASIC) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); -#endif - - return ret; -} - diff --git a/algo/scryptjane/scrypt-jane-test-vectors.h b/algo/scryptjane/scrypt-jane-test-vectors.h deleted file mode 100644 index d774091..0000000 --- a/algo/scryptjane/scrypt-jane-test-vectors.h +++ /dev/null @@ -1,266 +0,0 @@ -typedef struct scrypt_test_setting_t { - const char *pw, *salt; - uint8_t Nfactor, rfactor, pfactor; -} scrypt_test_setting; - -/* - * I'm hardcoding the values of p and r, which means they can't be tested - * anymore. A new test case with a different value for N should maybe be added. - * - mikaelh - */ -static const scrypt_test_setting post_settings[] = { - {"", "", 3, 0, 0}, -// {"password", "NaCl", 9, 3, 4}, - {0} -}; - -#if defined(SCRYPT_SHA256) - #if defined(SCRYPT_SALSA) - /* sha256 + salsa20/8, the only 'official' test vectors! 
*/ - static const uint8_t post_vectors[][64] = { - {0x77,0xd6,0x57,0x62,0x38,0x65,0x7b,0x20,0x3b,0x19,0xca,0x42,0xc1,0x8a,0x04,0x97, - 0xf1,0x6b,0x48,0x44,0xe3,0x07,0x4a,0xe8,0xdf,0xdf,0xfa,0x3f,0xed,0xe2,0x14,0x42, - 0xfc,0xd0,0x06,0x9d,0xed,0x09,0x48,0xf8,0x32,0x6a,0x75,0x3a,0x0f,0xc8,0x1f,0x17, - 0xe8,0xd3,0xe0,0xfb,0x2e,0x0d,0x36,0x28,0xcf,0x35,0xe2,0x0c,0x38,0xd1,0x89,0x06}, - {0xfd,0xba,0xbe,0x1c,0x9d,0x34,0x72,0x00,0x78,0x56,0xe7,0x19,0x0d,0x01,0xe9,0xfe, - 0x7c,0x6a,0xd7,0xcb,0xc8,0x23,0x78,0x30,0xe7,0x73,0x76,0x63,0x4b,0x37,0x31,0x62, - 0x2e,0xaf,0x30,0xd9,0x2e,0x22,0xa3,0x88,0x6f,0xf1,0x09,0x27,0x9d,0x98,0x30,0xda, - 0xc7,0x27,0xaf,0xb9,0x4a,0x83,0xee,0x6d,0x83,0x60,0xcb,0xdf,0xa2,0xcc,0x06,0x40} - }; - #elif defined(SCRYPT_CHACHA) - static const uint8_t post_vectors[][64] = { - {0xef,0x8f,0x44,0x8f,0xc3,0xef,0x78,0x13,0xb2,0x26,0xa7,0x2a,0x40,0xa1,0x98,0x7f, - 0xc8,0x7f,0x0d,0x5f,0x40,0x66,0xa2,0x05,0x07,0x4f,0xc7,0xac,0x3b,0x47,0x07,0x0c, - 0xf5,0x20,0x46,0x76,0x20,0x7b,0xee,0x51,0x6d,0x5f,0xfa,0x9c,0x27,0xac,0xa9,0x36, - 0x62,0xbd,0xde,0x0b,0xa3,0xc0,0x66,0x84,0xde,0x82,0xd0,0x1a,0xb4,0xd1,0xb5,0xfe}, - {0xf1,0x94,0xf7,0x5f,0x15,0x12,0x10,0x4d,0x6e,0xfb,0x04,0x8c,0x35,0xc4,0x51,0xb6, - 0x11,0x04,0xa7,0x9b,0xb0,0x46,0xaf,0x7b,0x47,0x39,0xf0,0xac,0xb2,0x8a,0xfa,0x45, - 0x09,0x86,0x8f,0x10,0x4b,0xc6,0xee,0x00,0x11,0x38,0x73,0x7a,0x6a,0xd8,0x25,0x67, - 0x85,0xa4,0x10,0x4e,0xa9,0x2f,0x15,0xfe,0xcf,0x63,0xe1,0xe8,0xcf,0xab,0xe8,0xbd} - }; - #elif defined(SCRYPT_SALSA64) - static const uint8_t post_vectors[][64] = { - {0xf4,0x87,0x29,0xf4,0xc3,0x31,0x8c,0xe8,0xdf,0xe5,0xd8,0x73,0xff,0xca,0x32,0xcf, - 0xd8,0xac,0xe7,0xf7,0x15,0xda,0x84,0x41,0x60,0x23,0x26,0x4a,0xc8,0x3e,0xee,0xa6, - 0xa5,0x6e,0x52,0xd6,0x64,0x55,0x16,0x31,0x3e,0x66,0x7b,0x65,0xd5,0xe2,0xc9,0x95, - 0x1b,0xf0,0x81,0x40,0xb7,0x2f,0xff,0xa6,0xe6,0x02,0xcc,0x63,0x08,0x4a,0x74,0x31}, - {0x7a,0xd8,0xad,0x02,0x9c,0xa5,0xf4,0x42,0x6a,0x29,0xd2,0xb5,0x53,0xf1,0x6d,0x1d, - 
0x25,0xc8,0x70,0x48,0x80,0xb9,0xa3,0xf6,0x94,0xf8,0xfa,0xb8,0x52,0x42,0xcd,0x14, - 0x26,0x46,0x28,0x06,0xc7,0xf6,0x1f,0xa7,0x89,0x6d,0xc5,0xa0,0x36,0xcc,0xde,0xcb, - 0x73,0x0b,0xa4,0xe2,0xd3,0xd1,0x44,0x06,0x35,0x08,0xe0,0x35,0x5b,0xf8,0xd7,0xe7} - }; - #endif -#elif defined(SCRYPT_SHA512) - #if defined(SCRYPT_SALSA) - static const uint8_t post_vectors[][64] = { - {0xae,0x54,0xe7,0x74,0xe4,0x51,0x6b,0x0f,0xe1,0xe7,0x28,0x03,0x17,0xe4,0x8c,0xfa, - 0x2f,0x66,0x55,0x7f,0xdc,0x3b,0x40,0xab,0x47,0x84,0xc9,0x63,0x36,0x07,0x9d,0xe5, - 0x86,0x43,0x95,0x89,0xb6,0xc0,0x6c,0x72,0x64,0x00,0xc1,0x2a,0xd7,0x69,0x21,0x92, - 0x8e,0xba,0xa4,0x59,0x9f,0x00,0x14,0x3a,0x7c,0x12,0x58,0x91,0x09,0xa0,0x32,0xfe}, - {0xc5,0xb3,0xd6,0xea,0x0a,0x4b,0x1e,0xcc,0x40,0x00,0xe5,0x98,0x5c,0xdc,0x06,0x06, - 0x78,0x34,0x92,0x16,0xcf,0xe4,0x9f,0x03,0x96,0x2d,0x41,0x35,0x00,0x9b,0xff,0x74, - 0x60,0x19,0x6e,0xe6,0xa6,0x46,0xf7,0x37,0xcb,0xfa,0xd0,0x9f,0x80,0x72,0x2e,0x85, - 0x13,0x3e,0x1a,0x91,0x90,0x53,0xa1,0x33,0x85,0x51,0xdc,0x62,0x1c,0x0e,0x4d,0x30} - }; - #elif defined(SCRYPT_CHACHA) - static const uint8_t post_vectors[][64] = { - {0xe2,0x05,0x7c,0x44,0xf9,0x55,0x9f,0x64,0xbe,0xd5,0x7f,0x85,0x69,0xc7,0x8c,0x7f, - 0x2b,0x91,0xd6,0x9a,0x6c,0xf8,0x57,0x55,0x61,0x25,0x3d,0xee,0xb8,0xd5,0x8c,0xdc, - 0x2d,0xd5,0x53,0x84,0x8c,0x06,0xaa,0x37,0x77,0xa6,0xf0,0xf1,0x35,0xfe,0xb5,0xcb, - 0x61,0xd7,0x2c,0x67,0xf3,0x7e,0x8a,0x1b,0x04,0xa3,0xa3,0x43,0xa2,0xb2,0x29,0xf2}, - {0x82,0xda,0x29,0xb2,0x08,0x27,0xfc,0x78,0x22,0xc4,0xb8,0x7e,0xbc,0x36,0xcf,0xcd, - 0x17,0x4b,0xa1,0x30,0x16,0x4a,0x25,0x70,0xc7,0xcb,0xe0,0x2b,0x56,0xd3,0x16,0x4e, - 0x85,0xb6,0x84,0xe7,0x9b,0x7f,0x8b,0xb5,0x94,0x33,0xcf,0x33,0x44,0x65,0xc8,0xa1, - 0x46,0xf9,0xf5,0xfc,0x74,0x29,0x7e,0xd5,0x46,0xec,0xbd,0x95,0xc1,0x80,0x24,0xe4} - }; - #elif defined(SCRYPT_SALSA64) - static const uint8_t post_vectors[][64] = { - {0xa6,0xcb,0x77,0x9a,0x64,0x1f,0x95,0x02,0x53,0xe7,0x5c,0x78,0xdb,0xa3,0x43,0xff, - 
0xbe,0x10,0x4c,0x7b,0xe4,0xe1,0x91,0xcf,0x67,0x69,0x5a,0x2c,0x12,0xd6,0x99,0x49, - 0x92,0xfd,0x5a,0xaa,0x12,0x4c,0x2e,0xf6,0x95,0x46,0x8f,0x5e,0x77,0x62,0x16,0x29, - 0xdb,0xe7,0xab,0x02,0x2b,0x9c,0x35,0x03,0xf8,0xd4,0x04,0x7d,0x2d,0x73,0x85,0xf1}, - {0x54,0xb7,0xca,0xbb,0xaf,0x0f,0xb0,0x5f,0xb7,0x10,0x63,0x48,0xb3,0x15,0xd8,0xb5, - 0x62,0x64,0x89,0x6a,0x59,0xc6,0x0f,0x86,0x96,0x38,0xf0,0xcf,0xd4,0x62,0x90,0x61, - 0x7d,0xce,0xd6,0x13,0x85,0x67,0x4a,0xf5,0x32,0x03,0x74,0x30,0x0b,0x5a,0x2f,0x86, - 0x82,0x6e,0x0c,0x3e,0x40,0x7a,0xde,0xbe,0x42,0x6e,0x80,0x2b,0xaf,0xdb,0xcc,0x94} - }; - #endif -#elif defined(SCRYPT_BLAKE512) - #if defined(SCRYPT_SALSA) - static const uint8_t post_vectors[][64] = { - {0x4a,0x48,0xb3,0xfa,0xdc,0xb0,0xb8,0xdb,0x54,0xee,0xf3,0x5c,0x27,0x65,0x6c,0x20, - 0xab,0x61,0x9a,0x5b,0xd5,0x1d,0xd9,0x95,0xab,0x88,0x0e,0x4d,0x1e,0x71,0x2f,0x11, - 0x43,0x2e,0xef,0x23,0xca,0x8a,0x49,0x3b,0x11,0x38,0xa5,0x28,0x61,0x2f,0xb7,0x89, - 0x5d,0xef,0x42,0x4c,0xc1,0x74,0xea,0x8a,0x56,0xbe,0x4a,0x82,0x76,0x15,0x1a,0x87}, - {0x96,0x24,0xbf,0x40,0xeb,0x03,0x8e,0xfe,0xc0,0xd5,0xa4,0x81,0x85,0x7b,0x09,0x88, - 0x52,0xb5,0xcb,0xc4,0x48,0xe1,0xb9,0x1d,0x3f,0x8b,0x3a,0xc6,0x38,0x32,0xc7,0x55, - 0x30,0x28,0x7a,0x42,0xa9,0x5d,0x54,0x33,0x62,0xf3,0xd9,0x3c,0x96,0x40,0xd1,0x80, - 0xe4,0x0e,0x7e,0xf0,0x64,0x53,0xfe,0x7b,0xd7,0x15,0xba,0xad,0x16,0x80,0x01,0xb5} - }; - #elif defined(SCRYPT_CHACHA) - static const uint8_t post_vectors[][64] = { - {0x45,0x42,0x22,0x31,0x26,0x13,0x5f,0x94,0xa4,0x00,0x04,0x47,0xe8,0x50,0x6d,0xd6, - 0xdd,0xd5,0x08,0xd4,0x90,0x64,0xe0,0x59,0x70,0x46,0xff,0xfc,0x29,0xb3,0x6a,0xc9, - 0x4d,0x45,0x97,0x95,0xa8,0xf0,0x53,0xe7,0xee,0x4b,0x6b,0x5d,0x1e,0xa5,0xb2,0x58, - 0x4b,0x93,0xc9,0x89,0x4c,0xa8,0xab,0x03,0x74,0x38,0xbd,0x54,0x97,0x6b,0xab,0x4a}, - {0x4b,0x4a,0x63,0x96,0x73,0x34,0x9f,0x39,0x64,0x51,0x0e,0x2e,0x3b,0x07,0xd5,0x1c, - 0xd2,0xf7,0xce,0x60,0xab,0xac,0x89,0xa4,0x16,0x0c,0x58,0x82,0xb3,0xd3,0x25,0x5b, - 
0xd5,0x62,0x32,0xf4,0x86,0x5d,0xb2,0x4b,0xbf,0x8e,0xc6,0xc0,0xac,0x40,0x48,0xb4, - 0x69,0x08,0xba,0x40,0x4b,0x07,0x2a,0x13,0x9c,0x98,0x3b,0x8b,0x20,0x0c,0xac,0x9e} - }; - #elif defined(SCRYPT_SALSA64) - static const uint8_t post_vectors[][64] = { - {0xcb,0x4b,0xc2,0xd1,0xf4,0x77,0x32,0x3c,0x42,0x9d,0xf7,0x7d,0x1f,0x22,0x64,0xa4, - 0xe2,0x88,0x30,0x2d,0x54,0x9d,0xb6,0x26,0x89,0x25,0x30,0xc3,0x3d,0xdb,0xba,0x99, - 0xe9,0x8e,0x1e,0x5e,0x57,0x66,0x75,0x7c,0x24,0xda,0x00,0x6f,0x79,0xf7,0x47,0xf5, - 0xea,0x40,0x70,0x37,0xd2,0x91,0xc7,0x4d,0xdf,0x46,0xb6,0x3e,0x95,0x7d,0xcb,0xc1}, - {0x25,0xc2,0xcb,0x7f,0xc8,0x50,0xb7,0x0b,0x11,0x9e,0x1d,0x10,0xb2,0xa8,0x35,0x23, - 0x91,0x39,0xfb,0x45,0xf2,0xbf,0xe4,0xd0,0x84,0xec,0x72,0x33,0x6d,0x09,0xed,0x41, - 0x9a,0x7e,0x4f,0x10,0x73,0x97,0x22,0x76,0x58,0x93,0x39,0x24,0xdf,0xd2,0xaa,0x2f, - 0x6b,0x2b,0x64,0x48,0xa5,0xb7,0xf5,0x56,0x77,0x02,0xa7,0x71,0x46,0xe5,0x0e,0x8d}, - }; - #endif -#elif defined(SCRYPT_BLAKE256) - #if defined(SCRYPT_SALSA) - static const uint8_t post_vectors[][64] = { - {0xf1,0xf1,0x91,0x1a,0x81,0xe6,0x9f,0xc1,0xce,0x43,0xab,0xb1,0x1a,0x02,0x1e,0x16, - 0x08,0xc6,0xf9,0x00,0x50,0x1b,0x6d,0xf1,0x31,0x06,0x95,0x48,0x5d,0xf7,0x6c,0x00, - 0xa2,0x4c,0xb1,0x0e,0x52,0x66,0x94,0x7e,0x84,0xfc,0xa5,0x34,0xfd,0xf0,0xe9,0x57, - 0x85,0x2d,0x8c,0x05,0x5c,0x0f,0x04,0xd4,0x8d,0x3e,0x13,0x52,0x3d,0x90,0x2d,0x2c}, - {0xd5,0x42,0xd2,0x7b,0x06,0xae,0x63,0x90,0x9e,0x30,0x00,0x0e,0xd8,0xa4,0x3a,0x0b, - 0xee,0x4a,0xef,0xb2,0xc4,0x95,0x0d,0x72,0x07,0x70,0xcc,0xa3,0xf9,0x1e,0xc2,0x75, - 0xcf,0xaf,0xe1,0x44,0x1c,0x8c,0xe2,0x3e,0x0c,0x81,0xf3,0x92,0xe1,0x13,0xe6,0x4f, - 0x2d,0x27,0xc3,0x87,0xe5,0xb6,0xf9,0xd7,0x02,0x04,0x37,0x64,0x78,0x36,0x6e,0xb3} - }; - #elif defined(SCRYPT_CHACHA) - static const uint8_t post_vectors[][64] = { - {0xad,0x1b,0x4b,0xca,0xe3,0x26,0x1a,0xfd,0xb7,0x77,0x8c,0xde,0x8d,0x26,0x14,0xe1, - 0x54,0x38,0x42,0xf3,0xb3,0x66,0x29,0xf9,0x90,0x04,0xf1,0x82,0x7c,0x5a,0x6f,0xa8, - 
0x7d,0xd6,0x08,0x0d,0x8b,0x78,0x04,0xad,0x31,0xea,0xd4,0x87,0x2d,0xf7,0x74,0x9a, - 0xe5,0xce,0x97,0xef,0xa3,0xbb,0x90,0x46,0x7c,0xf4,0x51,0x38,0xc7,0x60,0x53,0x21}, - {0x39,0xbb,0x56,0x3d,0x0d,0x7b,0x74,0x82,0xfe,0x5a,0x78,0x3d,0x66,0xe8,0x3a,0xdf, - 0x51,0x6f,0x3e,0xf4,0x86,0x20,0x8d,0xe1,0x81,0x22,0x02,0xf7,0x0d,0xb5,0x1a,0x0f, - 0xfc,0x59,0xb6,0x60,0xc9,0xdb,0x38,0x0b,0x5b,0x95,0xa5,0x94,0xda,0x42,0x2d,0x90, - 0x47,0xeb,0x73,0x31,0x9f,0x20,0xf6,0x81,0xc2,0xef,0x33,0x77,0x51,0xd8,0x2c,0xe4} - }; - #elif defined(SCRYPT_SALSA64) - static const uint8_t post_vectors[][64] = { - {0x9e,0xf2,0x60,0x7c,0xbd,0x7c,0x19,0x5c,0x79,0xc6,0x1b,0x7e,0xb0,0x65,0x1b,0xc3, - 0x70,0x0d,0x89,0xfc,0x72,0xb2,0x03,0x72,0x15,0xcb,0x8e,0x8c,0x49,0x50,0x4c,0x27, - 0x99,0xda,0x47,0x32,0x5e,0xb4,0xa2,0x07,0x83,0x51,0x6b,0x06,0x37,0x60,0x42,0xc4, - 0x59,0x49,0x99,0xdd,0xc0,0xd2,0x08,0x94,0x7f,0xe3,0x9e,0x4e,0x43,0x8e,0x5b,0xba}, - {0x86,0x6f,0x3b,0x11,0xb8,0xca,0x4b,0x6e,0xa7,0x6f,0xc2,0xc9,0x33,0xb7,0x8b,0x9f, - 0xa3,0xb9,0xf5,0xb5,0x62,0xa6,0x17,0x66,0xe4,0xc3,0x9d,0x9b,0xca,0x51,0xb0,0x2f, - 0xda,0x09,0xc1,0x77,0xed,0x8b,0x89,0xc2,0x69,0x5a,0x34,0x05,0x4a,0x1f,0x4d,0x76, - 0xcb,0xd5,0xa4,0x78,0xfa,0x1b,0xb9,0x5b,0xbc,0x3d,0xce,0x04,0x63,0x99,0xad,0x54} - }; - #endif -#elif defined(SCRYPT_SKEIN512) - #if defined(SCRYPT_SALSA) - static const uint8_t post_vectors[][64] = { - {0xe4,0x36,0xa0,0x9a,0xdb,0xf0,0xd1,0x45,0x56,0xda,0x25,0x53,0x00,0xf9,0x2c,0x69, - 0xa4,0xc2,0xa5,0x8e,0x1a,0x85,0xfa,0x53,0xbd,0x55,0x3d,0x11,0x2a,0x44,0x13,0x87, - 0x8f,0x81,0x88,0x13,0x1e,0x49,0xa8,0xc4,0xc5,0xcd,0x1f,0xe1,0x5f,0xf5,0xcb,0x2f, - 0x8b,0xab,0x57,0x38,0x59,0xeb,0x6b,0xac,0x3b,0x73,0x10,0xa6,0xe1,0xfe,0x17,0x3e}, - {0x6d,0x61,0xde,0x43,0xa9,0x38,0x53,0x5f,0xd8,0xf2,0x6d,0xf3,0xe4,0xd6,0xd8,0x5e, - 0x81,0x89,0xd0,0x0b,0x86,0x16,0xb1,0x91,0x65,0x76,0xd8,0xc1,0xf7,0x3b,0xca,0x8b, - 0x35,0x07,0x58,0xba,0x77,0xdf,0x11,0x6c,0xbc,0x58,0xee,0x11,0x59,0xf2,0xfe,0xcb, - 
0x51,0xdc,0xcd,0x35,0x2e,0x46,0x22,0xa0,0xaa,0x55,0x60,0x7c,0x91,0x15,0xb8,0x00} - }; - #elif defined(SCRYPT_CHACHA) - static const uint8_t post_vectors[][64] = { - {0xd1,0x12,0x6d,0x64,0x10,0x0e,0x98,0x6c,0xbe,0x70,0x21,0xd9,0xc6,0x04,0x62,0xa4, - 0x29,0x13,0x9a,0x3c,0xf8,0xe9,0x1e,0x87,0x9f,0x88,0xf4,0x98,0x01,0x41,0x8e,0xce, - 0x60,0xf7,0xbe,0x17,0x0a,0xec,0xd6,0x30,0x80,0xcf,0x6b,0x1e,0xcf,0x95,0xa0,0x4d, - 0x37,0xed,0x3a,0x09,0xd1,0xeb,0x0c,0x80,0x82,0x22,0x8e,0xd3,0xb1,0x7f,0xd6,0xa8}, - {0x5c,0x5c,0x05,0xe2,0x75,0xa5,0xa4,0xec,0x81,0x97,0x9c,0x5b,0xd7,0x26,0xb3,0x16, - 0xb4,0x02,0x8c,0x56,0xe6,0x32,0x57,0x33,0x47,0x19,0x06,0x6c,0xde,0x68,0x41,0x37, - 0x5b,0x7d,0xa7,0xb3,0x73,0xeb,0x82,0xca,0x0f,0x86,0x2e,0x6b,0x47,0xa2,0x70,0x39, - 0x35,0xfd,0x2d,0x2e,0x7b,0xc3,0x68,0xbb,0x52,0x42,0x19,0x3b,0x78,0x96,0xe7,0xc8} - }; - #elif defined(SCRYPT_SALSA64) - static const uint8_t post_vectors[][64] = { - {0xd2,0xad,0x32,0x05,0xee,0x80,0xe3,0x44,0x70,0xc6,0x34,0xde,0x05,0xb6,0xcf,0x60, - 0x89,0x98,0x70,0xc0,0xb8,0xf5,0x54,0xf1,0xa6,0xb2,0xc8,0x76,0x34,0xec,0xc4,0x59, - 0x8e,0x64,0x42,0xd0,0xa9,0xed,0xe7,0x19,0xb2,0x8a,0x11,0xc6,0xa6,0xbf,0xa7,0xa9, - 0x4e,0x44,0x32,0x7e,0x12,0x91,0x9d,0xfe,0x52,0x48,0xa8,0x27,0xb3,0xfc,0xb1,0x89}, - {0xd6,0x67,0xd2,0x3e,0x30,0x1e,0x9d,0xe2,0x55,0x68,0x17,0x3d,0x2b,0x75,0x5a,0xe5, - 0x04,0xfb,0x3d,0x0e,0x86,0xe0,0xaa,0x1d,0xd4,0x72,0xda,0xb0,0x79,0x41,0xb7,0x99, - 0x68,0xe5,0xd9,0x55,0x79,0x7d,0xc3,0xd1,0xa6,0x56,0xc1,0xbe,0x0b,0x6c,0x62,0x23, - 0x66,0x67,0x91,0x47,0x99,0x13,0x6b,0xe3,0xda,0x59,0x55,0x18,0x67,0x8f,0x2e,0x3b} - }; - #endif -#elif defined(SCRYPT_KECCAK512) - #if defined(SCRYPT_SALSA) - static const uint8_t post_vectors[][64] = { - {0xc2,0x7b,0xbe,0x1d,0xf1,0x99,0xd8,0xe7,0x1b,0xac,0xe0,0x9d,0xeb,0x5a,0xfe,0x21, - 0x71,0xff,0x41,0x51,0x4f,0xbe,0x41,0x01,0x15,0xe2,0xb7,0xb9,0x55,0x15,0x25,0xa1, - 0x40,0x4c,0x66,0x29,0x32,0xb7,0xc9,0x62,0x60,0x88,0xe0,0x99,0x39,0xae,0xce,0x25, - 
0x3c,0x11,0x89,0xdd,0xc6,0x14,0xd7,0x3e,0xa3,0x6d,0x07,0x2e,0x56,0xa0,0xff,0x97}, - {0x3c,0x91,0x12,0x4a,0x37,0x7d,0xd6,0x96,0xd2,0x9b,0x5d,0xea,0xb8,0xb9,0x82,0x4e, - 0x4f,0x6b,0x60,0x4c,0x59,0x01,0xe5,0x73,0xfd,0xf6,0xb8,0x9a,0x5a,0xd3,0x7c,0x7a, - 0xd2,0x4f,0x8e,0x74,0xc1,0x90,0x88,0xa0,0x3f,0x55,0x75,0x79,0x10,0xd0,0x09,0x79, - 0x0f,0x6c,0x74,0x0c,0x05,0x08,0x3c,0x8c,0x94,0x7b,0x30,0x56,0xca,0xdf,0xdf,0x34} - }; - #elif defined(SCRYPT_CHACHA) - static const uint8_t post_vectors[][64] = { - {0x77,0xcb,0x70,0xbf,0xae,0xd4,0x4c,0x5b,0xbc,0xd3,0xec,0x8a,0x82,0x43,0x8d,0xb3, - 0x7f,0x1f,0xfb,0x70,0x36,0x32,0x4d,0xa6,0xb7,0x13,0x37,0x77,0x30,0x0c,0x3c,0xfb, - 0x2c,0x20,0x8f,0x2a,0xf4,0x47,0x4d,0x69,0x8e,0xae,0x2d,0xad,0xba,0x35,0xe9,0x2f, - 0xe6,0x99,0x7a,0xf8,0xcf,0x70,0x78,0xbb,0x0c,0x72,0x64,0x95,0x8b,0x36,0x77,0x3d}, - {0xc6,0x43,0x17,0x16,0x87,0x09,0x5f,0x12,0xed,0x21,0xe2,0xb4,0xad,0x55,0xa1,0xa1, - 0x49,0x50,0x90,0x70,0xab,0x81,0x83,0x7a,0xcd,0xdf,0x23,0x52,0x19,0xc0,0xa2,0xd8, - 0x8e,0x98,0xeb,0xf0,0x37,0xab,0xad,0xfd,0x1c,0x04,0x97,0x18,0x42,0x85,0xf7,0x4b, - 0x18,0x2c,0x55,0xd3,0xa9,0xe6,0x89,0xfb,0x58,0x0a,0xb2,0x37,0xb9,0xf8,0xfb,0xc5} - }; - #elif defined(SCRYPT_SALSA64) - static const uint8_t post_vectors[][64] = { - {0xc7,0x34,0x95,0x02,0x5e,0x31,0x0d,0x1f,0x10,0x38,0x9c,0x3f,0x04,0x53,0xed,0x05, - 0x27,0x38,0xc1,0x3f,0x6a,0x0f,0xc5,0xa3,0x9b,0x73,0x8a,0x28,0x7e,0x5d,0x3c,0xdc, - 0x9d,0x5a,0x09,0xbf,0x8c,0x0a,0xad,0xe4,0x73,0x52,0xe3,0x6d,0xaa,0xd1,0x8b,0xbf, - 0xa3,0xb7,0xf0,0x58,0xad,0x22,0x24,0xc9,0xaa,0x96,0xb7,0x5d,0xfc,0x5f,0xb0,0xcf}, - {0x76,0x22,0xfd,0xe8,0xa2,0x79,0x8e,0x9d,0x43,0x8c,0x7a,0xba,0x78,0xb7,0x84,0xf1, - 0xc8,0xee,0x3b,0xae,0x31,0x89,0xbf,0x7e,0xd0,0x4b,0xc1,0x2d,0x58,0x5d,0x84,0x6b, - 0xec,0x86,0x56,0xe0,0x87,0x94,0x7f,0xbc,0xf9,0x48,0x92,0xef,0x54,0x7f,0x23,0x8d, - 0x4f,0x8b,0x0a,0x75,0xa7,0x39,0x0e,0x46,0x6e,0xee,0x58,0xc8,0xfa,0xea,0x90,0x53} - }; - #endif -#elif defined(SCRYPT_KECCAK256) - #if defined(SCRYPT_SALSA) - static 
const uint8_t post_vectors[][64] = { - {0x2e,0x96,0xd8,0x87,0x45,0xcd,0xd6,0xc8,0xf6,0xd2,0x87,0x33,0x50,0xc7,0x04,0xe5, - 0x3c,0x4b,0x48,0x44,0x57,0xc1,0x74,0x09,0x76,0x02,0xaa,0xd3,0x7b,0xf3,0xbf,0xed, - 0x4b,0x72,0xd7,0x1b,0x49,0x6b,0xe0,0x44,0x83,0xee,0x8f,0xaf,0xa1,0xb5,0x33,0xa9, - 0x9e,0x86,0xab,0xe2,0x9f,0xcf,0x68,0x6e,0x7e,0xbd,0xf5,0x7a,0x83,0x4b,0x1c,0x10}, - {0x42,0x7e,0xf9,0x4b,0x72,0x61,0xda,0x2d,0xb3,0x27,0x0e,0xe1,0xd9,0xde,0x5f,0x3e, - 0x64,0x2f,0xd6,0xda,0x90,0x59,0xce,0xbf,0x02,0x5b,0x32,0xf7,0x6d,0x94,0x51,0x7b, - 0xb6,0xa6,0x0d,0x99,0x3e,0x7f,0x39,0xbe,0x1b,0x1d,0x6c,0x97,0x12,0xd8,0xb7,0xfd, - 0x5b,0xb5,0xf3,0x73,0x5a,0x89,0xb2,0xdd,0xcc,0x3d,0x74,0x2e,0x3d,0x9e,0x3c,0x22} - }; - #elif defined(SCRYPT_CHACHA) - static const uint8_t post_vectors[][64] = { - {0x76,0x1d,0x5b,0x8f,0xa9,0xe1,0xa6,0x01,0xcb,0xc5,0x7a,0x5f,0x02,0x23,0xb6,0x82, - 0x57,0x79,0x60,0x2f,0x05,0x7f,0xb8,0x0a,0xcb,0x5e,0x54,0x11,0x49,0x2e,0xdd,0x85, - 0x83,0x30,0x67,0xb3,0x24,0x5c,0xce,0xfc,0x32,0xcf,0x12,0xc3,0xff,0xe0,0x79,0x36, - 0x74,0x17,0xa6,0x3e,0xcd,0xa0,0x7e,0xcb,0x37,0xeb,0xcb,0xb6,0xe1,0xb9,0xf5,0x15}, - {0xf5,0x66,0xa7,0x4c,0xe4,0xdc,0x18,0x56,0x2f,0x3e,0x86,0x4d,0x92,0xa5,0x5c,0x5a, - 0x8f,0xc3,0x6b,0x32,0xdb,0xe5,0x72,0x50,0x84,0xfc,0x6e,0x5d,0x15,0x77,0x3d,0xca, - 0xc5,0x2b,0x20,0x3c,0x78,0x37,0x80,0x78,0x23,0x56,0x91,0xa0,0xce,0xa4,0x06,0x5a, - 0x7f,0xe3,0xbf,0xab,0x51,0x57,0x32,0x2c,0x0a,0xf0,0xc5,0x6f,0xf4,0xcb,0xff,0x42} - }; - #elif defined(SCRYPT_SALSA64) - static const uint8_t post_vectors[][64] = { - {0xb0,0xb7,0x10,0xb5,0x1f,0x2b,0x7f,0xaf,0x9d,0x95,0x5f,0x4c,0x2d,0x98,0x7c,0xc1, - 0xbc,0x37,0x2f,0x50,0x8d,0xb2,0x9f,0xfd,0x48,0x0d,0xe0,0x44,0x19,0xdf,0x28,0x6c, - 0xab,0xbf,0x1e,0x17,0x26,0xcc,0x57,0x95,0x18,0x17,0x83,0x4c,0x12,0x48,0xd9,0xee, - 0x4b,0x00,0x29,0x06,0x31,0x01,0x6b,0x8c,0x26,0x39,0xbf,0xe4,0xe4,0xd4,0x6a,0x26}, - {0xa0,0x40,0xb2,0xf2,0x11,0xb6,0x5f,0x3d,0x4c,0x1e,0xef,0x59,0xd4,0x98,0xdb,0x14, - 
0x01,0xff,0xe3,0x34,0xd7,0x19,0xcd,0xeb,0xde,0x52,0x1c,0xf4,0x86,0x43,0xc9,0xe2, - 0xfb,0xf9,0x4f,0x0a,0xbb,0x1f,0x5c,0x6a,0xdf,0xb9,0x28,0xfa,0xac,0xc4,0x48,0xed, - 0xcc,0xd2,0x2e,0x25,0x5f,0xf3,0x56,0x1d,0x2d,0x23,0x22,0xc1,0xbc,0xff,0x78,0x80} - }; - #endif -#else - static const uint8_t post_vectors[][64] = {{0}}; -#endif - diff --git a/algo/scryptjane/scrypt-jane.c b/algo/scryptjane/scrypt-jane.c deleted file mode 100644 index 6afdc3e..0000000 --- a/algo/scryptjane/scrypt-jane.c +++ /dev/null @@ -1,265 +0,0 @@ -#include -#include -#include "inttypes.h" -#include "algo-gate-api.h" - -/* Hard-coded scrypt parameteres r and p - mikaelh */ -#define SCRYPT_R 1 -#define SCRYPT_P 1 - -/* Only the instrinsics versions are optimized for hard-coded values - mikaelh */ -#define CPU_X86_FORCE_INTRINSICS - -#undef SCRYPT_KECCAK512 -#undef SCRYPT_CHACHA -#undef SCRYPT_CHOOSE_COMPILETIME -#define SCRYPT_KECCAK512 -#define SCRYPT_CHACHA -#define SCRYPT_CHOOSE_COMPILETIME - -//#include "scrypt-jane.h" -#include "../scryptjane/scrypt-jane-portable.h" -#include "../scryptjane/scrypt-jane-hash.h" -#include "../scryptjane/scrypt-jane-romix.h" -#include "../scryptjane/scrypt-jane-test-vectors.h" - -#ifndef min -#define min(a,b) (a>b ? 
b : a) -#endif -#ifndef max -#define max(a,b) (a max_alloc) - return 0; // scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory"); - aa->mem = (uint8_t *)malloc((size_t)size); - aa->ptr = (uint8_t *)(((size_t)aa->mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); - if (!aa->mem) - return 0; // scrypt_fatal_error("scrypt: out of memory"); - return 1; -} - -static void -scrypt_free(scrypt_aligned_alloc *aa) { - free(aa->mem); -} - -void -scrypt_N_1_1(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint32_t N, uint8_t *out, size_t bytes, uint8_t *X, uint8_t *Y, uint8_t *V) { - uint32_t chunk_bytes, i; - const uint32_t r = SCRYPT_R; - const uint32_t p = SCRYPT_P; - -#if !defined(SCRYPT_CHOOSE_COMPILETIME) - scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix(); -#endif - - chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2; - - /* 1: X = PBKDF2(password, salt) */ - scrypt_pbkdf2_1(password, password_len, salt, salt_len, X, chunk_bytes * p); - - /* 2: X = ROMix(X) */ - for (i = 0; i < p; i++) - scrypt_ROMix_1((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V, N); - - /* 3: Out = PBKDF2(password, X) */ - scrypt_pbkdf2_1(password, password_len, X, chunk_bytes * p, out, bytes); - -#ifdef SCRYPT_PREVENT_STATE_LEAK - /* This is an unnecessary security feature - mikaelh */ - scrypt_ensure_zero(Y, (p + 1) * chunk_bytes); -#endif -} - - -// increasing Nfactor gradually -const unsigned char minNfactor = 4; -const unsigned char maxNfactor = 30; - -unsigned char GetNfactor(unsigned int nTimestamp, unsigned int ntime) { - int l = 0; - unsigned long int s; - int n; - unsigned char N; - - if (nTimestamp <= ntime) - return 4; - - s = nTimestamp - ntime; - while ((s >> 1) > 3) { - l += 1; - s >>= 1; - } - - s &= 3; - - n = (l * 170 + s * 25 - 2320) / 100; - - if (n < 0) n = 0; - - if (n > 255) { - n = 255; - // printf("GetNfactor(%d) - something wrong(n == %d)\n", 
nTimestamp, n); - } - - N = (unsigned char)n; - //printf("GetNfactor: %d -> %d %d : %d / %d\n", nTimestamp - nChainStartTime, l, s, n, min(max(N, minNfactor), maxNfactor)); - - if (NmaxNfactor) return maxNfactor; - return N; -} - - -int scanhash_scryptjane( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - scrypt_aligned_alloc YX, V; - uint8_t *X, *Y; -// uint32_t N, chunk_bytes; - uint32_t chunk_bytes; - const uint32_t r = SCRYPT_R; - const uint32_t p = SCRYPT_P; - - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t _ALIGN(64) endiandata[20]; - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - - if (opt_benchmark) - ptarget[7] = 0x00ff; - - for (int k = 0; k < 19; k++) - be32enc(&endiandata[k], pdata[k]); - - //Nfactor = GetNfactor(data[17], ntime); - //if (Nfactor > scrypt_maxN) { - // return 1; - // //scrypt_fatal_error("scrypt: N out of range"); - //} - -// opt_scrypt_n default is 1024 which makes no sense in this context -// and results in N = 2, but it seems to work on Nicehash scryptjanenf16 -// (leocoin). Need to test with proper NF 16 for functionality and performance. -// Also test yacoin (NF 18). 
-// N = (1 << ( opt_scrypt_n + 1)); - - chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2; - if (!scrypt_alloc( sj_N * chunk_bytes, &V ) ) return 1; - if (!scrypt_alloc((p + 1) * chunk_bytes, &YX)) { - scrypt_free(&V); - return 1; - } - - Y = YX.ptr; - X = Y + chunk_bytes; - - do { - const uint32_t Htarg = ptarget[7]; - uint32_t hash[8]; - be32enc(&endiandata[19], nonce); - - scrypt_N_1_1((unsigned char *)endiandata, 80, - (unsigned char *)endiandata, 80, - sj_N, (unsigned char *)hash, 32, X, Y, V.ptr); - - if (hash[7] <= Htarg && fulltest(hash, ptarget)) { - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce; - scrypt_free(&V); - scrypt_free(&YX); - return 1; - } - nonce++; - - } while (nonce < max_nonce && !work_restart[thr_id].restart); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - - scrypt_free(&V); - scrypt_free(&YX); - return 0; -} - -/* simple cpu test (util.c) */ -void scryptjanehash(void *output, const void *input ) -{ - scrypt_aligned_alloc YX, V; - uint8_t *X, *Y; - uint32_t chunk_bytes; - const uint32_t r = SCRYPT_R; - const uint32_t p = SCRYPT_P; - - memset(output, 0, 32); - - chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2; - if (!scrypt_alloc( sj_N * chunk_bytes, &V ) ) return; - if (!scrypt_alloc((p + 1) * chunk_bytes, &YX)) { - scrypt_free(&V); - return; - } - - Y = YX.ptr; - X = Y + chunk_bytes; - - scrypt_N_1_1((unsigned char*)input, 80, (unsigned char*)input, 80, - sj_N, (unsigned char*)output, 32, X, Y, V.ptr); - - scrypt_free(&V); - scrypt_free(&YX); -} - -bool register_scryptjane_algo( algo_gate_t* gate ) -{ - gate->scanhash = (void*)&scanhash_scryptjane; - gate->hash = (void*)&scryptjanehash; - gate->set_target = (void*)&scrypt_set_target; - gate->get_max64 = (void*)&get_max64_0x40LL; - - // figure out if arg in N or Nfactor - if ( !opt_scrypt_n ) - { - applog( LOG_ERR, "The N factor must be specified in the form algo:nf"); - return false; - } - else if ( opt_scrypt_n < 32 ) - { - // arg is Nfactor, calculate N - sj_N = 1 
<< ( opt_scrypt_n + 1 ); - } - else - { - // arg is N - sj_N = opt_scrypt_n; - } - return true; -} - - diff --git a/algo/sha/sha2-hash-4way.c b/algo/sha/sha2-hash-4way.c deleted file mode 100644 index 9516543..0000000 --- a/algo/sha/sha2-hash-4way.c +++ /dev/null @@ -1,684 +0,0 @@ -/* $Id: sha2big.c 216 2010-06-08 09:46:57Z tp $ */ -/* - * SHA-384 / SHA-512 implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#if defined(__SSE2__) - -#include -#include - -#include "sha2-hash-4way.h" - -#include - -// SHA-256 32 bit - -static const sph_u32 H256[8] = { - SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85), - SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A), - SPH_C32(0x510E527F), SPH_C32(0x9B05688C), - SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19) -}; - -static const sph_u32 K256[64] = { - SPH_C32(0x428A2F98), SPH_C32(0x71374491), - SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5), - SPH_C32(0x3956C25B), SPH_C32(0x59F111F1), - SPH_C32(0x923F82A4), SPH_C32(0xAB1C5ED5), - SPH_C32(0xD807AA98), SPH_C32(0x12835B01), - SPH_C32(0x243185BE), SPH_C32(0x550C7DC3), - SPH_C32(0x72BE5D74), SPH_C32(0x80DEB1FE), - SPH_C32(0x9BDC06A7), SPH_C32(0xC19BF174), - SPH_C32(0xE49B69C1), SPH_C32(0xEFBE4786), - SPH_C32(0x0FC19DC6), SPH_C32(0x240CA1CC), - SPH_C32(0x2DE92C6F), SPH_C32(0x4A7484AA), - SPH_C32(0x5CB0A9DC), SPH_C32(0x76F988DA), - SPH_C32(0x983E5152), SPH_C32(0xA831C66D), - SPH_C32(0xB00327C8), SPH_C32(0xBF597FC7), - SPH_C32(0xC6E00BF3), SPH_C32(0xD5A79147), - SPH_C32(0x06CA6351), SPH_C32(0x14292967), - SPH_C32(0x27B70A85), SPH_C32(0x2E1B2138), - SPH_C32(0x4D2C6DFC), SPH_C32(0x53380D13), - SPH_C32(0x650A7354), SPH_C32(0x766A0ABB), - SPH_C32(0x81C2C92E), SPH_C32(0x92722C85), - SPH_C32(0xA2BFE8A1), SPH_C32(0xA81A664B), - SPH_C32(0xC24B8B70), SPH_C32(0xC76C51A3), - SPH_C32(0xD192E819), SPH_C32(0xD6990624), - SPH_C32(0xF40E3585), SPH_C32(0x106AA070), - SPH_C32(0x19A4C116), SPH_C32(0x1E376C08), - SPH_C32(0x2748774C), SPH_C32(0x34B0BCB5), - SPH_C32(0x391C0CB3), SPH_C32(0x4ED8AA4A), - SPH_C32(0x5B9CCA4F), SPH_C32(0x682E6FF3), - SPH_C32(0x748F82EE), SPH_C32(0x78A5636F), - SPH_C32(0x84C87814), SPH_C32(0x8CC70208), - SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB), - SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2) -}; - -// SHA-256 4 way - -#define SHA2s_MEXP( a, b, c, d ) \ - mm128_add4_32( SSG2_1( W[a] ), W[b], SSG2_0( W[c] ), 
W[d] ); - -#define CHs(X, Y, Z) \ - _mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z ) - -#define MAJs(X, Y, Z) \ - _mm_or_si128( _mm_and_si128( X, Y ), \ - _mm_and_si128( _mm_or_si128( X, Y ), Z ) ) - -#define BSG2_0(x) \ - _mm_xor_si128( _mm_xor_si128( \ - mm128_ror_32(x, 2), mm128_ror_32(x, 13) ), mm128_ror_32( x, 22) ) - -#define BSG2_1(x) \ - _mm_xor_si128( _mm_xor_si128( \ - mm128_ror_32(x, 6), mm128_ror_32(x, 11) ), mm128_ror_32( x, 25) ) - -#define SSG2_0(x) \ - _mm_xor_si128( _mm_xor_si128( \ - mm128_ror_32(x, 7), mm128_ror_32(x, 18) ), _mm_srli_epi32(x, 3) ) - -#define SSG2_1(x) \ - _mm_xor_si128( _mm_xor_si128( \ - mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) ) - -#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ -do { \ - register __m128i T1, T2; \ - T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \ - _mm_set1_epi32( K256[( (j)+(i) )] ), W[i] ) ); \ - T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \ - D = _mm_add_epi32( D, T1 ); \ - H = _mm_add_epi32( T1, T2 ); \ -} while (0) - -static void -sha256_4way_round( __m128i *in, __m128i r[8] ) -{ - register __m128i A, B, C, D, E, F, G, H; - __m128i W[16]; - - mm128_block_bswap_32( W, in ); - mm128_block_bswap_32( W+8, in+8 ); - - A = r[0]; - B = r[1]; - C = r[2]; - D = r[3]; - E = r[4]; - F = r[5]; - G = r[6]; - H = r[7]; - - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_4WAY_STEP( E, F, G, H, 
A, B, C, D, 12, 0 ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2s_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2s_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2s_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2s_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2s_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2s_MEXP( 13, 8, 0, 15 ); - - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - r[0] = _mm_add_epi32( r[0], A ); - r[1] = _mm_add_epi32( r[1], B ); - r[2] = _mm_add_epi32( r[2], C ); - r[3] = _mm_add_epi32( r[3], D ); - r[4] = _mm_add_epi32( r[4], E ); - r[5] = _mm_add_epi32( r[5], F ); - r[6] = _mm_add_epi32( r[6], G ); - r[7] = _mm_add_epi32( r[7], H ); -} - -void sha256_4way_init( sha256_4way_context *sc ) -{ - 
sc->count_high = sc->count_low = 0; - sc->val[0] = _mm_set1_epi32( H256[0] ); - sc->val[1] = _mm_set1_epi32( H256[1] ); - sc->val[2] = _mm_set1_epi32( H256[2] ); - sc->val[3] = _mm_set1_epi32( H256[3] ); - sc->val[4] = _mm_set1_epi32( H256[4] ); - sc->val[5] = _mm_set1_epi32( H256[5] ); - sc->val[6] = _mm_set1_epi32( H256[6] ); - sc->val[7] = _mm_set1_epi32( H256[7] ); -} - -void sha256_4way( sha256_4way_context *sc, const void *data, size_t len ) -{ - __m128i *vdata = (__m128i*)data; - size_t ptr; - const int buf_size = 64; - - ptr = (unsigned)sc->count_low & (buf_size - 1U); - while ( len > 0 ) - { - size_t clen; - uint32_t clow, clow2; - - clen = buf_size - ptr; - if ( clen > len ) - clen = len; - memcpy_128( sc->buf + (ptr>>2), vdata, clen>>2 ); - vdata = vdata + (clen>>2); - ptr += clen; - len -= clen; - if ( ptr == buf_size ) - { - sha256_4way_round( sc->buf, sc->val ); - ptr = 0; - } - clow = sc->count_low; - clow2 = SPH_T32( clow + clen ); - sc->count_low = clow2; - if ( clow2 < clow ) - sc->count_high++; - } -} - -void sha256_4way_close( sha256_4way_context *sc, void *dst ) -{ - unsigned ptr; - uint32_t low, high; - const int buf_size = 64; - const int pad = buf_size - 8; - - ptr = (unsigned)sc->count_low & (buf_size - 1U); - sc->buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 ); - ptr += 4; - - if ( ptr > pad ) - { - memset_zero_128( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); - sha256_4way_round( sc->buf, sc->val ); - memset_zero_128( sc->buf, pad >> 2 ); - } - else - memset_zero_128( sc->buf + (ptr>>2), (pad - ptr) >> 2 ); - - low = sc->count_low; - high = (sc->count_high << 3) | (low >> 29); - low = low << 3; - - sc->buf[ pad >> 2 ] = - mm128_bswap_32( _mm_set1_epi32( high ) ); - sc->buf[ ( pad+4 ) >> 2 ] = - mm128_bswap_32( _mm_set1_epi32( low ) ); - sha256_4way_round( sc->buf, sc->val ); - - mm128_block_bswap_32( dst, sc->val ); -} - -#if defined(__AVX2__) - -// SHA-256 8 way - -#define CHx(X, Y, Z) \ - _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, 
Z ), X ), Z ) - -#define MAJx(X, Y, Z) \ - _mm256_or_si256( _mm256_and_si256( X, Y ), \ - _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) ) - -#define BSG2_0x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x, 2), mm256_ror_32(x, 13) ), mm256_ror_32( x, 22) ) - -#define BSG2_1x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x, 6), mm256_ror_32(x, 11) ), mm256_ror_32( x, 25) ) - -#define SSG2_0x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x, 7), mm256_ror_32(x, 18) ), _mm256_srli_epi32(x, 3) ) - -#define SSG2_1x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x, 17), mm256_ror_32(x, 19) ), _mm256_srli_epi32(x, 10) ) - -#define SHA2x_MEXP( a, b, c, d ) \ - mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] ); - -#define SHA2s_8WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ -do { \ - register __m256i T1, T2; \ - T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \ - _mm256_set1_epi32( K256[( (j)+(i) )] ), W[i] ) ); \ - T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \ - D = _mm256_add_epi32( D, T1 ); \ - H = _mm256_add_epi32( T1, T2 ); \ -} while (0) - -static void -sha256_8way_round( __m256i *in, __m256i r[8] ) -{ - register __m256i A, B, C, D, E, F, G, H; - __m256i W[16]; - - mm256_block_bswap_32( W , in ); - mm256_block_bswap_32( W+8, in+8 ); - - A = r[0]; - B = r[1]; - C = r[2]; - D = r[3]; - E = r[4]; - F = r[5]; - G = r[6]; - H = r[7]; - - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); 
- SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2x_MEXP( 13, 8, 0, 15 ); - - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - r[0] = _mm256_add_epi32( r[0], A ); - r[1] = _mm256_add_epi32( r[1], B ); - r[2] = _mm256_add_epi32( r[2], C ); - r[3] = _mm256_add_epi32( r[3], D ); - r[4] = _mm256_add_epi32( r[4], E ); - r[5] = _mm256_add_epi32( r[5], F ); - r[6] = _mm256_add_epi32( r[6], G ); 
- r[7] = _mm256_add_epi32( r[7], H ); -} - - -void sha256_8way_init( sha256_8way_context *sc ) -{ - sc->count_high = sc->count_low = 0; - sc->val[0] = _mm256_set1_epi32( H256[0] ); - sc->val[1] = _mm256_set1_epi32( H256[1] ); - sc->val[2] = _mm256_set1_epi32( H256[2] ); - sc->val[3] = _mm256_set1_epi32( H256[3] ); - sc->val[4] = _mm256_set1_epi32( H256[4] ); - sc->val[5] = _mm256_set1_epi32( H256[5] ); - sc->val[6] = _mm256_set1_epi32( H256[6] ); - sc->val[7] = _mm256_set1_epi32( H256[7] ); -} - -void sha256_8way( sha256_8way_context *sc, const void *data, size_t len ) -{ - __m256i *vdata = (__m256i*)data; - size_t ptr; - const int buf_size = 64; - - ptr = (unsigned)sc->count_low & (buf_size - 1U); - while ( len > 0 ) - { - size_t clen; - uint32_t clow, clow2; - - clen = buf_size - ptr; - if ( clen > len ) - clen = len; - memcpy_256( sc->buf + (ptr>>2), vdata, clen>>2 ); - vdata = vdata + (clen>>2); - ptr += clen; - len -= clen; - if ( ptr == buf_size ) - { - sha256_8way_round( sc->buf, sc->val ); - ptr = 0; - } - clow = sc->count_low; - clow2 = SPH_T32( clow + clen ); - sc->count_low = clow2; - if ( clow2 < clow ) - sc->count_high++; - } -} - -void sha256_8way_close( sha256_8way_context *sc, void *dst ) -{ - unsigned ptr; - uint32_t low, high; - const int buf_size = 64; - const int pad = buf_size - 8; - - ptr = (unsigned)sc->count_low & (buf_size - 1U); - sc->buf[ ptr>>2 ] = _mm256_set1_epi32( 0x80 ); - ptr += 4; - - if ( ptr > pad ) - { - memset_zero_256( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); - sha256_8way_round( sc->buf, sc->val ); - memset_zero_256( sc->buf, pad >> 2 ); - } - else - memset_zero_256( sc->buf + (ptr>>2), (pad - ptr) >> 2 ); - - low = sc->count_low; - high = (sc->count_high << 3) | (low >> 29); - low = low << 3; - - sc->buf[ pad >> 2 ] = - mm256_bswap_32( _mm256_set1_epi32( high ) ); - sc->buf[ ( pad+4 ) >> 2 ] = - mm256_bswap_32( _mm256_set1_epi32( low ) ); - - sha256_8way_round( sc->buf, sc->val ); - - mm256_block_bswap_32( dst, sc->val 
); -} - - -// SHA-512 4 way 64 bit - -static const sph_u64 H512[8] = { - SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B), - SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1), - SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F), - SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179) -}; - -static const sph_u64 K512[80] = { - SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD), - SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC), - SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019), - SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118), - SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE), - SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2), - SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1), - SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694), - SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3), - SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65), - SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483), - SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5), - SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210), - SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4), - SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725), - SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70), - SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926), - SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF), - SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8), - SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B), - SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001), - SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30), - SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910), - SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8), - SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53), - SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8), - SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB), - 
SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3), - SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60), - SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC), - SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9), - SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B), - SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207), - SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178), - SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6), - SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B), - SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493), - SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C), - SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A), - SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817) -}; - -#define CH(X, Y, Z) \ - _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) - -#define MAJ(X, Y, Z) \ - _mm256_or_si256( _mm256_and_si256( X, Y ), \ - _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) ) - -#define BSG5_0(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_64(x, 28), mm256_ror_64(x, 34) ), mm256_ror_64(x, 39) ) - -#define BSG5_1(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_64(x, 14), mm256_ror_64(x, 18) ), mm256_ror_64(x, 41) ) - -#define SSG5_0(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_64(x, 1), mm256_ror_64(x, 8) ), _mm256_srli_epi64(x, 7) ) - -#define SSG5_1(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_64(x, 19), mm256_ror_64(x, 61) ), _mm256_srli_epi64(x, 6) ) - -#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ -do { \ - register __m256i T1, T2; \ - T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \ - _mm256_set1_epi64x( K512[i] ), W[i] ) ); \ - T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \ - D = _mm256_add_epi64( D, T1 ); \ - H = _mm256_add_epi64( T1, T2 ); \ -} while (0) - -static void -sha512_4way_round( __m256i *in, __m256i r[8] ) -{ - int i; - register __m256i A, B, C, D, E, F, G, H; - 
__m256i W[80]; - - mm256_block_bswap_64( W , in ); - mm256_block_bswap_64( W+8, in+8 ); - - for ( i = 16; i < 80; i++ ) - W[i] = mm256_add4_64( SSG5_1( W[ i- 2 ] ), W[ i- 7 ], - SSG5_0( W[ i-15 ] ), W[ i-16 ] ); - - A = r[0]; - B = r[1]; - C = r[2]; - D = r[3]; - E = r[4]; - F = r[5]; - G = r[6]; - H = r[7]; - - for ( i = 0; i < 80; i += 8 ) - { - SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i + 0 ); - SHA3_4WAY_STEP( H, A, B, C, D, E, F, G, i + 1 ); - SHA3_4WAY_STEP( G, H, A, B, C, D, E, F, i + 2 ); - SHA3_4WAY_STEP( F, G, H, A, B, C, D, E, i + 3 ); - SHA3_4WAY_STEP( E, F, G, H, A, B, C, D, i + 4 ); - SHA3_4WAY_STEP( D, E, F, G, H, A, B, C, i + 5 ); - SHA3_4WAY_STEP( C, D, E, F, G, H, A, B, i + 6 ); - SHA3_4WAY_STEP( B, C, D, E, F, G, H, A, i + 7 ); - } - - r[0] = _mm256_add_epi64( r[0], A ); - r[1] = _mm256_add_epi64( r[1], B ); - r[2] = _mm256_add_epi64( r[2], C ); - r[3] = _mm256_add_epi64( r[3], D ); - r[4] = _mm256_add_epi64( r[4], E ); - r[5] = _mm256_add_epi64( r[5], F ); - r[6] = _mm256_add_epi64( r[6], G ); - r[7] = _mm256_add_epi64( r[7], H ); -} - -void sha512_4way_init( sha512_4way_context *sc ) -{ - sc->count = 0; - sc->val[0] = _mm256_set1_epi64x( H512[0] ); - sc->val[1] = _mm256_set1_epi64x( H512[1] ); - sc->val[2] = _mm256_set1_epi64x( H512[2] ); - sc->val[3] = _mm256_set1_epi64x( H512[3] ); - sc->val[4] = _mm256_set1_epi64x( H512[4] ); - sc->val[5] = _mm256_set1_epi64x( H512[5] ); - sc->val[6] = _mm256_set1_epi64x( H512[6] ); - sc->val[7] = _mm256_set1_epi64x( H512[7] ); -} - -void sha512_4way( sha512_4way_context *sc, const void *data, size_t len ) -{ - __m256i *vdata = (__m256i*)data; - size_t ptr; - const int buf_size = 128; - - ptr = (unsigned)sc->count & (buf_size - 1U); - while ( len > 0 ) - { - size_t clen; - clen = buf_size - ptr; - if ( clen > len ) - clen = len; - memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 ); - vdata = vdata + (clen>>3); - ptr += clen; - len -= clen; - if ( ptr == buf_size ) - { - sha512_4way_round( sc->buf, sc->val ); - 
ptr = 0; - } - sc->count += clen; - } -} - -void sha512_4way_close( sha512_4way_context *sc, void *dst ) -{ - unsigned ptr; - const int buf_size = 128; - const int pad = buf_size - 16; - - ptr = (unsigned)sc->count & (buf_size - 1U); - sc->buf[ ptr>>3 ] = m256_const1_64( 0x80 ); - ptr += 8; - if ( ptr > pad ) - { - memset_zero_256( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 ); - sha512_4way_round( sc->buf, sc->val ); - memset_zero_256( sc->buf, pad >> 3 ); - } - else - memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 ); - - sc->buf[ pad >> 3 ] = - mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); - sc->buf[ ( pad+8 ) >> 3 ] = - mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); - sha512_4way_round( sc->buf, sc->val ); - - mm256_block_bswap_64( dst, sc->val ); -} - -#endif // __AVX2__ -#endif // __SSE2__ diff --git a/algo/sha/sha2-hash-4way.h b/algo/sha/sha2-hash-4way.h deleted file mode 100644 index 8ec16f3..0000000 --- a/algo/sha/sha2-hash-4way.h +++ /dev/null @@ -1,132 +0,0 @@ -/* $Id: sph_sha2.h 216 2010-06-08 09:46:57Z tp $ */ -/** - * SHA-224, SHA-256, SHA-384 and SHA-512 interface. - * - * SHA-256 has been published in FIPS 180-2, now amended with a change - * notice to include SHA-224 as well (which is a simple variation on - * SHA-256). SHA-384 and SHA-512 are also defined in FIPS 180-2. 
FIPS - * standards can be found at: - * http://csrc.nist.gov/publications/fips/ - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @file sph_sha2.h - * @author Thomas Pornin - */ - -#ifndef SHA2_HASH_4WAY_H__ -#define SHA2_HASH_4WAY_H__ 1 - -#include -#include "sph_types.h" -#include "simd-utils.h" - -#if defined(__SSE2__) -//#if defined(__SSE4_2__) - -//#define SPH_SIZE_sha256 256 - -// SHA-256 4 way - -typedef struct { - __m128i buf[64>>2]; - __m128i val[8]; - uint32_t count_high, count_low; -} sha256_4way_context; - -void sha256_4way_init( sha256_4way_context *sc ); -void sha256_4way( sha256_4way_context *sc, const void *data, size_t len ); -void sha256_4way_close( sha256_4way_context *sc, void *dst ); - -/* -// SHA-256 7 way hybrid -// Combines SSE, MMX and scalar data to do 8 + 2 + 1 parallel. -typedef struct { - __m128i bufx[64>>2]; - __m128i valx[8]; - __m64 bufy[64>>2]; - __m64 valy[8]; - uint32_t bufz[64>>2]; - uint32_t valz[8]; - uint32_t count_high, count_low; -} sha256_7way_context; - -void sha256_7way_init( sha256_7way_context *ctx ); -void sha256_7way( sha256_7way_context *ctx, const void *datax, - void *datay, void *dataz, size_t len ); -void sha256_7way_close( sha256_7way_context *ctx, void *dstx, void *dstyx, - void *dstz ); -*/ - -#if defined (__AVX2__) - -// SHA-256 8 way - -typedef struct { - __m256i buf[64>>2]; - __m256i val[8]; - uint32_t count_high, count_low; -} sha256_8way_context; - -void sha256_8way_init( sha256_8way_context *sc ); -void sha256_8way( sha256_8way_context *sc, const void *data, size_t len ); -void sha256_8way_close( sha256_8way_context *sc, void *dst ); - -//#define SPH_SIZE_sha512 512 - -// SHA-512 4 way - -typedef struct { - __m256i buf[128>>3]; - __m256i val[8]; - uint64_t count; -} sha512_4way_context; - -void sha512_4way_init( sha512_4way_context *sc); -void sha512_4way( sha512_4way_context *sc, const void *data, size_t len ); -void sha512_4way_close( sha512_4way_context *sc, void *dst ); - -// SHA-256 11 way hybrid -// Combines AVX2, MMX and scalar data to do 8 
+ 2 + 1 parallel. -typedef struct { - __m256i bufx[64>>2]; - __m256i valx[8]; - __m64 bufy[64>>2]; - __m64 valy[8]; - uint32_t bufz[64>>2]; - uint32_t valz[8]; - uint32_t count_high, count_low; -} sha256_11way_context; - -void sha256_11way_init( sha256_11way_context *ctx ); -void sha256_11way_update( sha256_11way_context *ctx, const void *datax, - const void *datay, const void *dataz, size_t len ); -void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dstyx, - void *dstz ); - -#endif // __AVX2__ -#endif // __SSE2__ -#endif // SHA256_4WAY_H__ diff --git a/algo/sha/sha2.c b/algo/sha/sha2.c index 5ab3ee8..33cc6c1 100644 --- a/algo/sha/sha2.c +++ b/algo/sha/sha2.c @@ -197,7 +197,7 @@ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) extern void sha256d(unsigned char *hash, const unsigned char *data, int len) { - uint32_t S[16], T[16]; + uint32_t S[16], T[16]; int i, r; sha256_init(S); @@ -467,8 +467,8 @@ static inline void sha256d_ms(uint32_t *hash, uint32_t *W, void sha256d_ms_4way(uint32_t *hash, uint32_t *data, const uint32_t *midstate, const uint32_t *prehash); -static inline int scanhash_sha256d_4way(int thr_id, struct work *work, - uint32_t max_nonce, uint64_t *hashes_done) +static inline int scanhash_sha256d_4way( struct work *work, + uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; @@ -480,6 +480,7 @@ static inline int scanhash_sha256d_4way(int thr_id, struct work *work, uint32_t n = pdata[19] - 1; const uint32_t first_nonce = pdata[19]; const uint32_t Htarg = ptarget[7]; + int thr_id = mythr->id; int i, j; memcpy(data, pdata + 16, 64); @@ -509,10 +510,8 @@ static inline int scanhash_sha256d_4way(int thr_id, struct work *work, if (swab32(hash[4 * 7 + i]) <= Htarg) { pdata[19] = data[4 * 3 + i]; sha256d_80_swap(hash, pdata); - if (fulltest(hash, ptarget)) { - *hashes_done = n - first_nonce + 1; - return 1; - } + if ( fulltest( hash, ptarget ) && 
!opt_benchmark ) + submit_solution( work, hash, mythr ); } } } while (n < max_nonce && !work_restart[thr_id].restart); @@ -529,8 +528,8 @@ static inline int scanhash_sha256d_4way(int thr_id, struct work *work, void sha256d_ms_8way(uint32_t *hash, uint32_t *data, const uint32_t *midstate, const uint32_t *prehash); -static inline int scanhash_sha256d_8way(int thr_id, struct work *work, - uint32_t max_nonce, uint64_t *hashes_done) +static inline int scanhash_sha256d_8way( struct work *work, + uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; @@ -542,6 +541,7 @@ static inline int scanhash_sha256d_8way(int thr_id, struct work *work, uint32_t n = pdata[19] - 1; const uint32_t first_nonce = pdata[19]; const uint32_t Htarg = ptarget[7]; + int thr_id = mythr->id; int i, j; memcpy(data, pdata + 16, 64); @@ -571,10 +571,8 @@ static inline int scanhash_sha256d_8way(int thr_id, struct work *work, if (swab32(hash[8 * 7 + i]) <= Htarg) { pdata[19] = data[8 * 3 + i]; sha256d_80_swap(hash, pdata); - if (fulltest(hash, ptarget)) { - *hashes_done = n - first_nonce + 1; - return 1; - } + if ( fulltest( hash, ptarget ) && !opt_benchmark ) + submit_solution( work, hash, mythr ); } } } while (n < max_nonce && !work_restart[thr_id].restart); @@ -602,13 +600,11 @@ int scanhash_sha256d( struct work *work, #ifdef HAVE_SHA256_8WAY if (sha256_use_8way()) - return scanhash_sha256d_8way(thr_id, work, - max_nonce, hashes_done); + return scanhash_sha256d_8way( work, max_nonce, hashes_done, mythr ); #endif #ifdef HAVE_SHA256_4WAY if (sha256_use_4way()) - return scanhash_sha256d_4way(thr_id, work, - max_nonce, hashes_done); + return scanhash_sha256d_4way( work, max_nonce, hashes_done, mythr ); #endif memcpy(data, pdata + 16, 64); @@ -635,10 +631,42 @@ int scanhash_sha256d( struct work *work, return 0; } +int scanhash_SHA256d( struct work *work, const uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr 
) +{ + uint32_t _ALIGN(128) hash[8]; + uint32_t _ALIGN(64) data[20]; + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + int thr_id = mythr->id; + + memcpy( data, pdata, 80 ); + + do { + data[19] = ++n; + sha256d( (unsigned char*)hash, (const unsigned char*)data, 80 ); + if ( unlikely( swab32( hash[7] ) <= Htarg ) ) + { + pdata[19] = n; + sha256d_80_swap(hash, pdata); + if ( fulltest( hash, ptarget ) && !opt_benchmark ) + submit_solution( work, hash, mythr ); + } + } while ( likely( n < max_nonce && !work_restart[thr_id].restart ) ); + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} + + bool register_sha256d_algo( algo_gate_t* gate ) { - gate->scanhash = (void*)&scanhash_sha256d; - gate->hash = (void*)&sha256d; - return true; + gate->optimizations = SSE2_OPT | AVX2_OPT; + gate->scanhash = (void*)&scanhash_sha256d; + gate->hash = (void*)&sha256d; + return true; }; diff --git a/algo/sha/sha256-hash-opt.c b/algo/sha/sha256-hash-opt.c new file mode 100644 index 0000000..eac8f31 --- /dev/null +++ b/algo/sha/sha256-hash-opt.c @@ -0,0 +1,214 @@ +/* Intel SHA extensions using C intrinsics */ +/* Written and place in public domain by Jeffrey Walton */ +/* Based on code from Intel, and by Sean Gulley for */ +/* the miTLS project. */ + +// A drop in replacement for the function of the same name in sph_sha2.c. 
+ +#if defined(__SHA__) + +#include "simd-utils.h" + +static void sha2_round(const uint8_t input[], uint32_t state[8]) { + __m128i STATE0, STATE1; + __m128i MSG, TMP, MASK; + __m128i TMSG0, TMSG1, TMSG2, TMSG3; + __m128i ABEF_SAVE, CDGH_SAVE; + + // Load initial values + TMP = _mm_load_si128((__m128i *)&state[0]); + STATE1 = _mm_load_si128((__m128i *)&state[4]); + MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); + + TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB + STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH + STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF + STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH + + // Save current hash + ABEF_SAVE = STATE0; + CDGH_SAVE = STATE1; + + // Rounds 0-3 + MSG = _mm_load_si128((const __m128i *)(input + 0)); + TMSG0 = _mm_shuffle_epi8(MSG, MASK); + MSG = _mm_add_epi32( + TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 4-7 + TMSG1 = _mm_load_si128((const __m128i *)(input + 16)); + TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); + MSG = _mm_add_epi32( + TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 8-11 + TMSG2 = _mm_load_si128((const __m128i *)(input + 32)); + TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); + MSG = _mm_add_epi32( + TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 12-15 + TMSG3 = _mm_load_si128((const __m128i *)(input + 48)); + TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); + 
MSG = _mm_add_epi32( + TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 16-19 + MSG = _mm_add_epi32( + TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 20-23 + MSG = _mm_add_epi32( + TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 24-27 + MSG = _mm_add_epi32( + TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 28-31 + MSG = _mm_add_epi32( + TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = 
_mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 32-35 + MSG = _mm_add_epi32( + TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 36-39 + MSG = _mm_add_epi32( + TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 40-43 + MSG = _mm_add_epi32( + TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 44-47 + MSG = _mm_add_epi32( + TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 48-51 + MSG = _mm_add_epi32( + TMSG0, 
_mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 52-55 + MSG = _mm_add_epi32( + TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 56-59 + MSG = _mm_add_epi32( + TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 60-63 + MSG = _mm_add_epi32( + TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Add values back to state + STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); + STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); + + TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA + STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG + STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA + STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF + + // Save state + _mm_store_si128((__m128i *)&state[0], STATE0); + _mm_store_si128((__m128i *)&state[4], STATE1); +} + +#endif diff --git a/algo/sha/sha256_hash_11way.c b/algo/sha/sha256_hash_11way.c deleted file mode 100644 index 
df06375..0000000 --- a/algo/sha/sha256_hash_11way.c +++ /dev/null @@ -1,538 +0,0 @@ -#if 0 - -#include -#include - -#include "sha2-hash-4way.h" - -#if defined(__AVX2__) - -// naming convention for variables and macros -// VARx: AVX2 8 way 32 bit -// VARy: MMX 2 way 32 bit -// VARz: scalar integer 32 bit - - -static const uint32_t H256[8] = -{ - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, - 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 -}; - -static const uint32_t K256[64] = -{ - 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, - 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, - 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, - 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, - 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, - 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, - 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, - 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967, - 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, - 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, - 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, - 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, - 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, - 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, - 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, - 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 -}; - -#define CHx(X, Y, Z) \ - _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) - -#define CHy(X, Y, Z) \ - _mm_xor_si64( _mm_and_si64( _mm_xor_si64( Y, Z ), X ), Z ) - -#define CHz(X, Y, Z) ((( (Y) ^ (Z) ) & (X) ) ^ (Z) ) - - -#define MAJx(X, Y, Z) \ - _mm256_or_si256( _mm256_and_si256( X, Y ), \ - _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) ) - -#define MAJy(X, Y, Z) \ - _mm_or_si64( _mm_and_si64( X, Y ), \ - _mm_and_si64( _mm_or_si64( X, Y ), Z ) ) - -#define MAJz(X, Y, Z) ( ( (X) & (Y) ) | ( ( (X) | (Y) ) & (Z) ) ) - -#define BSG2_0x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x,2), mm256_ror_32(x,13) ), _mm256_srli_epi32(x,22) ) - -#define BSG2_0y(x) \ - 
_mm_xor_si64( _mm_xor_si64( \ - mm64_ror_32(x,2), mm64_ror_32(x,13) ), _mm_srli_pi32(x,22) ) - -#define BSG2_0z(x) ( u32_ror_32(x,2) ^ u32_ror_32(x,13) ^ ((x)>>22) ) - -#define BSG2_1x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x,6), mm256_ror_32(x,11) ), _mm256_srli_epi32(x,25) ) - -#define BSG2_1y(x) \ - _mm_xor_si64( _mm_xor_si64( \ - mm64_ror_32(x,6), mm64_ror_32(x,11) ), _mm_srli_pi32(x,25) ) - -#define BSG2_1z(x) ( u32_ror_32(x,6) ^ u32_ror_32(x,11) ^ ((x)>>25) ) - -#define SSG2_0x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x,7), mm256_ror_32(x,18) ), _mm256_srli_epi32(x,3) ) - -#define SSG2_0y(x) \ - _mm_xor_si64( _mm_xor_si64( \ - mm64_ror_32(x,7), mm64_ror_32(x,18) ), _mm_srli_pi32(x,3) ) - -#define SSG2_0z(x) (( u32_ror_32(x,7) ^ u32_ror_32(x,18) ) ^ ((x)>>3) ) - -#define SSG2_1x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x,17), mm256_ror_32(x,19) ), _mm256_srli_epi32(x,10) ) - -#define SSG2_1y(x) \ - _mm_xor_si64( _mm_xor_si64( \ - mm64_ror_32(x,17), mm64_ror_32(x,19) ), _mm_srli_pi32(x,10) ) - -#define SSG2_1z(x) ( u32_ror_32(x,17) ^ u32_ror_32(x,19) ^ ((x)>>10) ) - -#define SHA2x_MEXP( a, b, c, d ) \ - _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \ - SSG2_1x( Wx[a] ), Wx[b] ), SSG2_0x( Wx[c] ) ), Wx[d] ) - -#define SHA2y_MEXP( a, b, c, d ) \ - _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \ - SSG2_1y( Wy[a] ), Wy[b] ), SSG2_0y( Wy[c] ) ), Wy[d] ) - -#define SHA2z_MEXP( a, b, c, d ) \ - ( SSG2_1z( Wz[a] ) + Wz[b] + SSG2_0z( Wz[c] ) + Wz[d] ) - - -#define SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, \ - Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, \ - Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, i, j) \ -do { \ - __m256i T1x, T2x; \ - __m64 T1y, T2y; \ - uint32_t T1z, T2z; \ - T1x = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \ - _mm256_add_epi32( Hx, BSG2_1x(Ex) ), CHx(Ex, Fx, Gx) ), \ - _mm256_set1_epi32( K256[( (j)+(i) )] ) ), Wx[i] ); \ - T1y = _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \ - _mm_add_pi32( 
Hy, BSG2_1y(Ey) ), CHy(Ey, Fy, Gy) ), \ - _mm_set1_pi32( K256[( (j)+(i) )] ) ), Wy[i] ); \ - T1z = Hz + BSG2_1z( Ez ) + CHz( Ez, Fz, Gz ) + K256[ ((j)+(i)) ] + Wz[i]; \ - T2x = _mm256_add_epi32( BSG2_0x(Ax), MAJx(Ax, Bx, Cx) ); \ - T2y = _mm_add_pi32( BSG2_0y(Ay), MAJy(Ay, By, Cy) ); \ - T2z = BSG2_0z( Az ) + MAJz( Az, Bz, Cz ); \ - Dx = _mm256_add_epi32( Dx, T1x ); \ - Dy = _mm_add_pi32( Dy, T1y ); \ - Dz = Dz + T1z; \ - Hx = _mm256_add_epi32( T1x, T2x ); \ - Hy = _mm_add_pi32( T1y, T2y ); \ - Hz = T1z + T2z; \ -} while (0) - -void sha256_11way_round( __m256i *inx, __m256i rx[8], __m64 *iny, __m64 ry[8], - uint32_t *inz, uint32_t rz[8] ) -{ - __m256i Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx; - __m256i Wx[16]; - __m64 Ay, By, Cy, Dy, Ey, Fy, Gy, Hy; - __m64 Wy[16]; - uint32_t Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz; - uint32_t Wz[16]; - - Wx[ 0] = mm256_bswap_32( inx[ 0] ); - Wy[ 0] = mm64_bswap_32( iny[ 0] ); - Wz[ 0] = bswap_32( inz[ 0] ); - - Wx[ 1] = mm256_bswap_32( inx[ 1] ); - Wy[ 1] = mm64_bswap_32( iny[ 1] ); - Wz[ 1] = bswap_32( inz[ 1] ); - - Wx[ 2] = mm256_bswap_32( inx[ 2] ); - Wy[ 2] = mm64_bswap_32( iny[ 2] ); - Wz[ 2] = bswap_32( inz[ 2] ); - - Wx[ 3] = mm256_bswap_32( inx[ 3] ); - Wy[ 3] = mm64_bswap_32( iny[ 3] ); - Wz[ 3] = bswap_32( inz[ 3] ); - - Wx[ 4] = mm256_bswap_32( inx[ 4] ); - Wy[ 4] = mm64_bswap_32( iny[ 4] ); - Wz[ 4] = bswap_32( inz[ 4] ); - - Wx[ 5] = mm256_bswap_32( inx[ 5] ); - Wy[ 5] = mm64_bswap_32( iny[ 5] ); - Wz[ 5] = bswap_32( inz[ 5] ); - - Wx[ 6] = mm256_bswap_32( inx[ 6] ); - Wy[ 6] = mm64_bswap_32( iny[ 6] ); - Wz[ 6] = bswap_32( inz[ 6] ); - - Wx[ 7] = mm256_bswap_32( inx[ 7] ); - Wy[ 7] = mm64_bswap_32( iny[ 7] ); - Wz[ 7] = bswap_32( inz[ 7] ); - - Wx[ 8] = mm256_bswap_32( inx[ 8] ); - Wy[ 8] = mm64_bswap_32( iny[ 8] ); - Wz[ 8] = bswap_32( inz[ 8] ); - - Wx[ 9] = mm256_bswap_32( inx[ 9] ); - Wy[ 9] = mm64_bswap_32( iny[ 9] ); - Wz[ 9] = bswap_32( inz[ 9] ); - - Wx[10] = mm256_bswap_32( inx[10] ); - Wy[10] = mm64_bswap_32( iny[10] ); - 
Wz[10] = bswap_32( inz[10] ); - - Wx[11] = mm256_bswap_32( inx[11] ); - Wy[11] = mm64_bswap_32( iny[11] ); - Wz[11] = bswap_32( inz[11] ); - - Wx[12] = mm256_bswap_32( inx[12] ); - Wy[12] = mm64_bswap_32( iny[12] ); - Wz[12] = bswap_32( inz[12] ); - - Wx[13] = mm256_bswap_32( inx[13] ); - Wy[13] = mm64_bswap_32( iny[13] ); - Wz[13] = bswap_32( inz[13] ); - - Wx[14] = mm256_bswap_32( inx[14] ); - Wy[14] = mm64_bswap_32( iny[14] ); - Wz[14] = bswap_32( inz[14] ); - - Wx[15] = mm256_bswap_32( inx[15] ); - Wy[15] = mm64_bswap_32( iny[15] ); - Wz[15] = bswap_32( inz[15] ); - - Ax = rx[0]; Ay = ry[0]; Az = rz[0]; - Bx = rx[1]; By = ry[1]; Bz = rz[1]; - Cx = rx[2]; Cy = ry[2]; Cz = rz[2]; - Dx = rx[3]; Dy = ry[3]; Dz = rz[3]; - Ex = rx[4]; Ey = ry[4]; Ez = rz[4]; - Fx = rx[5]; Fy = ry[5]; Fz = rz[5]; - Gx = rx[6]; Gy = ry[6]; Gz = rz[6]; - Hx = rx[7]; Hy = ry[7]; Hz = rz[7]; - - SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, - Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, - Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 0, 0 ); - SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx, - Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, - Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 1, 0 ); - SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx, - Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, - Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 2, 0 ); - SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex, - Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, - Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 3, 0 ); - SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx, - Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, - Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 4, 0 ); - SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx, - Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, - Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 5, 0 ); - SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx, - Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, - Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 6, 0 ); - SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax, - By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, - Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 7, 0 ); - SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, - Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, - Az, 
Bz, Cz, Dz, Ez, Fz, Gz, Hz, 8, 0 ); - SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx, - Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, - Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 9, 0 ); - SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx, - Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, - Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, 0 ); - SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex, - Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, - Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, 0 ); - SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx, - Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, - Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, 0 ); - SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx, - Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, - Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, 0 ); - SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx, - Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, - Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, 0 ); - SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax, - By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, - Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - Wx[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); - Wy[ 0] = SHA2y_MEXP( 14, 9, 1, 0 ); - Wz[ 0] = SHA2z_MEXP( 14, 9, 1, 0 ); - - Wx[ 1] = SHA2x_MEXP( 15, 10, 2, 1 ); - Wy[ 1] = SHA2y_MEXP( 15, 10, 2, 1 ); - Wz[ 1] = SHA2z_MEXP( 15, 10, 2, 1 ); - - Wx[ 2] = SHA2x_MEXP( 0, 11, 3, 2 ); - Wy[ 2] = SHA2y_MEXP( 0, 11, 3, 2 ); - Wz[ 2] = SHA2z_MEXP( 0, 11, 3, 2 ); - - Wx[ 3] = SHA2x_MEXP( 1, 12, 4, 3 ); - Wy[ 3] = SHA2y_MEXP( 1, 12, 4, 3 ); - Wz[ 3] = SHA2z_MEXP( 1, 12, 4, 3 ); - - Wx[ 4] = SHA2x_MEXP( 2, 13, 5, 4 ); - Wy[ 4] = SHA2y_MEXP( 2, 13, 5, 4 ); - Wz[ 4] = SHA2z_MEXP( 2, 13, 5, 4 ); - - Wx[ 5] = SHA2x_MEXP( 3, 14, 6, 5 ); - Wy[ 5] = SHA2y_MEXP( 3, 14, 6, 5 ); - Wz[ 5] = SHA2z_MEXP( 3, 14, 6, 5 ); - - Wx[ 6] = SHA2x_MEXP( 4, 15, 7, 6 ); - Wy[ 6] = SHA2y_MEXP( 4, 15, 7, 6 ); - Wz[ 6] = SHA2z_MEXP( 4, 15, 7, 6 ); - - Wx[ 7] = SHA2x_MEXP( 5, 0, 8, 7); - Wy[ 7] = SHA2y_MEXP( 5, 0, 8, 7); - Wz[ 7] = SHA2z_MEXP( 5, 0, 8, 7); - - Wx[ 8] = SHA2x_MEXP( 6, 1, 9, 8); - Wy[ 8] = SHA2y_MEXP( 6, 1, 9, 8); - Wz[ 8] = 
SHA2z_MEXP( 6, 1, 9, 8); - - Wx[ 9] = SHA2x_MEXP( 7, 2, 10, 9 ); - Wy[ 9] = SHA2y_MEXP( 7, 2, 10, 9); - Wz[ 9] = SHA2z_MEXP( 7, 2, 10, 9); - - Wx[10] = SHA2x_MEXP( 8, 3, 11, 10 ); - Wy[10] = SHA2y_MEXP( 8, 3, 11, 10); - Wz[10] = SHA2z_MEXP( 8, 3, 11, 10); - - Wx[11] = SHA2x_MEXP( 9, 4, 12, 11); - Wy[11] = SHA2y_MEXP( 9, 4, 12, 11); - Wz[11] = SHA2z_MEXP( 9, 4, 12, 11 ); - - Wx[12] = SHA2x_MEXP( 10, 5, 13, 12 ); - Wy[12] = SHA2y_MEXP( 10, 5, 13, 12 ); - Wz[12] = SHA2z_MEXP( 10, 5, 13, 12 ); - - Wx[13] = SHA2x_MEXP( 11, 6, 14, 13 ); - Wy[13] = SHA2y_MEXP( 11, 6, 14, 13 ); - Wz[13] = SHA2z_MEXP( 11, 6, 14, 13 ); - - Wx[14] = SHA2x_MEXP( 12, 7, 15, 14 ); - Wy[14] = SHA2y_MEXP( 12, 7, 15, 14 ); - Wz[14] = SHA2z_MEXP( 12, 7, 15, 14 ); - - Wx[15] = SHA2x_MEXP( 13, 8, 0, 15 ); - Wy[15] = SHA2y_MEXP( 13, 8, 0, 15 ); - Wz[15] = SHA2z_MEXP( 13, 8, 0, 15 ); - - - SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, - Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, - Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 0, j ); - SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx, - Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, - Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 1, j ); - SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx, - Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, - Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 2, j ); - SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex, - Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, - Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 3, j ); - SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx, - Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, - Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 4, j ); - SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx, - Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, - Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 5, j ); - SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx, - Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, - Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 6, j ); - SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax, - By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, - Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 7, j ); - SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, - Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, - Az, Bz, Cz, Dz, Ez, Fz, 
Gz, Hz, 8, j ); - SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx, - Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, - Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 9, j ); - SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx, - Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, - Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, j ); - SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex, - Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, - Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, j ); - SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx, - Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, - Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, j ); - SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx, - Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, - Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, j ); - SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx, - Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, - Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, j ); - SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax, - By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, - Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, j ); - } - - rx[0] = _mm256_add_epi32( rx[0], Ax ); - ry[0] = _mm_add_pi32( ry[0], Ay ); - rz[0] = rz[0]+ Az; - rx[1] = _mm256_add_epi32( rx[1], Bx ); - ry[1] = _mm_add_pi32( ry[1], By ); - rz[1] = rz[1]+ Bz; - rx[2] = _mm256_add_epi32( rx[2], Cx ); - ry[2] = _mm_add_pi32( ry[2], Cy ); - rz[3] = rz[3]+ Dz; - rx[4] = _mm256_add_epi32( rx[4], Ex ); - ry[4] = _mm_add_pi32( ry[4], Ey ); - rz[4] = rz[4]+ Ez; - rx[5] = _mm256_add_epi32( rx[5], Fx ); - ry[5] = _mm_add_pi32( ry[5], Fy ); - rz[5] = rz[5]+ Fz; - rx[6] = _mm256_add_epi32( rx[6], Gx ); - ry[6] = _mm_add_pi32( ry[6], Gy ); - rz[6] = rz[6]+ Gz; - rx[7] = _mm256_add_epi32( rx[7], Hx ); - ry[7] = _mm_add_pi32( ry[7], Hy ); - rz[7] = rz[7]+ Hz; - -} - -void sha256_11way_init( sha256_11way_context *ctx ) -{ - ctx->count_high = ctx->count_low = 0; - ctx->valx[0] = _mm256_set1_epi32( H256[0] ); - ctx->valy[0] = _mm_set1_pi32( H256[0] ); - ctx->valx[1] = _mm256_set1_epi32( H256[0] ); - ctx->valy[1] = _mm_set1_pi32( H256[0] ); - ctx->valx[2] = _mm256_set1_epi32( H256[0] ); - ctx->valy[2] = _mm_set1_pi32( H256[0] ); - 
ctx->valx[3] = _mm256_set1_epi32( H256[0] ); - ctx->valy[3] = _mm_set1_pi32( H256[0] ); - ctx->valx[4] = _mm256_set1_epi32( H256[0] ); - ctx->valy[4] = _mm_set1_pi32( H256[0] ); - ctx->valx[5] = _mm256_set1_epi32( H256[0] ); - ctx->valy[5] = _mm_set1_pi32( H256[0] ); - ctx->valx[6] = _mm256_set1_epi32( H256[0] ); - ctx->valy[6] = _mm_set1_pi32( H256[0] ); - ctx->valx[7] = _mm256_set1_epi32( H256[0] ); - ctx->valy[7] = _mm_set1_pi32( H256[0] ); - memcpy( ctx->valz, H256, 32 ); -} - - -void sha256_11way_update( sha256_11way_context *ctx, const void *datax, - const void *datay, const void *dataz, size_t len ) -{ - __m256i *vdatax = (__m256i*) datax; - __m64 *vdatay = (__m64*) datay; - uint32_t *idataz = (uint32_t*)dataz; - size_t ptr; - const int buf_size = 64; - - ptr = (unsigned)ctx->count_low & (buf_size - 1U); - while ( len > 0 ) - { - size_t clen; - uint32_t clow, clow2; - - clen = buf_size - ptr; - if ( clen > len ) - clen = len; - memcpy_256( ctx->bufx + (ptr>>2), vdatax + (ptr>>2), clen>>2 ); - memcpy_m64( ctx->bufy + (ptr>>2), vdatay + (ptr>>2), clen>>2 ); - memcpy ( ctx->bufz + ptr, idataz + ptr, clen ); - ptr += clen; - len -= clen; - if ( ptr == buf_size ) - { - sha256_11way_round( ctx->bufx, ctx->valx, - ctx->bufy, ctx->valy, - ctx->bufz, ctx->valz ); - ptr = 0; - } - clow = ctx->count_low; - clow2 = clow + clen; - ctx->count_low = clow2; - if ( clow2 < clow ) - ctx->count_high++; - } -} - - -void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dsty, - void *dstz) -{ - unsigned ptr, u; - uint32_t low, high; - const int buf_size = 64; - const int pad = buf_size - 8; - - ptr = (unsigned)ctx->count_low & (buf_size - 1U); - ctx->bufx[ ptr>>2 ] = _mm256_set1_epi32( 0x80 ); - ctx->bufy[ ptr>>2 ] = _mm_set1_pi32( 0x80 ); - ctx->bufz[ ptr>>2 ] = 0x80; - ptr += 4; - - if ( ptr > pad ) - { - memset_zero_256( ctx->bufx + (ptr>>2), (buf_size - ptr) >> 2 ); - memset_zero_m64( ctx->bufy + (ptr>>2), (buf_size - ptr) >> 2 ); - memset( ctx->bufz + 
(ptr>>2), 0, (buf_size - ptr) >> 2 ); - sha256_11way_round( ctx->bufx, ctx->valx, - ctx->bufy, ctx->valy, - ctx->bufz, ctx->valz ); - memset_zero_256( ctx->bufx, pad >> 2 ); - memset_zero_m64( ctx->bufy, pad >> 2 ); - memset( ctx->bufz, 0, pad >> 2 ); - } - else - { - memset_zero_256( ctx->bufx + (ptr>>2), (pad - ptr) >> 2 ); - memset_zero_m64( ctx->bufy + (ptr>>2), (pad - ptr) >> 2 ); - memset( ctx->bufz + (ptr>>2), 0, (pad - ptr) >> 2 ); - } - - low = ctx->count_low; - high = (ctx->count_high << 3) | (low >> 29); - low = low << 3; - - ctx->bufx[ pad >> 2 ] = - mm256_bswap_32( _mm256_set1_epi32( high ) ); - ctx->bufy[ pad >> 2 ] = - mm64_bswap_32( _mm_set1_pi32( high ) ); - ctx->bufz[ pad >> 2 ] = - bswap_32( high ); - - - ctx->bufx[ ( pad+4 ) >> 2 ] = - mm256_bswap_32( _mm256_set1_epi32( low ) ); - ctx->bufy[ ( pad+4 ) >> 2 ] = - mm64_bswap_32( _mm_set1_pi32( low ) ); - ctx->bufz[ ( pad+4 ) >> 2 ] = - bswap_32( low ); - - sha256_11way_round( ctx->bufx, ctx->valx, - ctx->bufy, ctx->valy, - ctx->bufz, ctx->valz ); - - for ( u = 0; u < 8; u ++ ) - { - casti_m256i( dstx, u ) = mm256_bswap_32( ctx->valx[u] ); - casti_m64 ( dsty, u ) = mm64_bswap_32( ctx->valy[u] ); - ((uint32_t*)dstz)[u] = bswap_32( ctx->valz[u] ); - } -} - -#endif -#endif // 0 diff --git a/algo/sha/sha256q-4way.c b/algo/sha/sha256q-4way.c deleted file mode 100644 index cc47b1c..0000000 --- a/algo/sha/sha256q-4way.c +++ /dev/null @@ -1,188 +0,0 @@ -#include "sha256t-gate.h" -#include -#include -#include -#include -#include "sha2-hash-4way.h" - -#if defined(SHA256T_8WAY) - -static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64))); - -void sha256q_8way_hash( void* output, const void* input ) -{ - uint32_t vhash[8*8] __attribute__ ((aligned (64))); - sha256_8way_context ctx; - memcpy( &ctx, &sha256_ctx8, sizeof ctx ); - - sha256_8way( &ctx, input + (64<<3), 16 ); - sha256_8way_close( &ctx, vhash ); - - sha256_8way_init( &ctx ); - sha256_8way( &ctx, vhash, 32 ); - sha256_8way_close( 
&ctx, vhash ); - - sha256_8way_init( &ctx ); - sha256_8way( &ctx, vhash, 32 ); - sha256_8way_close( &ctx, vhash ); - - sha256_8way_init( &ctx ); - sha256_8way( &ctx, vhash, 32 ); - sha256_8way_close( &ctx, output ); -} - -int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t vdata[20*8] __attribute__ ((aligned (64))); - uint32_t hash[8*8] __attribute__ ((aligned (32))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - __m256i *noncev = (__m256i*)vdata + 19; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - - const uint64_t htmax[] = { 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 }; - const uint32_t masks[] = { 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 }; - - // Need big endian data - mm256_bswap32_intrlv80_8x32( vdata, pdata ); - sha256_8way_init( &sha256_ctx8 ); - sha256_8way( &sha256_ctx8, vdata, 64 ); - - for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - *noncev = mm256_bswap_32( - _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) ); - - pdata[19] = n; - sha256q_8way_hash( hash, vdata ); - - uint32_t *hash7 = &(hash[7<<3]); - - for ( int lane = 0; lane < 8; lane++ ) - if ( !( hash7[ lane ] & mask ) ) - { - // deinterleave hash for lane - extr_lane_8x32( lane_hash, hash, lane, 256 ); - - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_lane_solution( work, lane_hash, mythr, lane ); - } - } - n += 8; - } while ( (n < max_nonce-10) && !work_restart[thr_id].restart ); - break; - } - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif - -#if defined(SHA256T_4WAY) - -static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64))); - -void 
sha256q_4way_hash( void* output, const void* input ) -{ - uint32_t vhash[8*4] __attribute__ ((aligned (64))); - sha256_4way_context ctx; - memcpy( &ctx, &sha256_ctx4, sizeof ctx ); - - sha256_4way( &ctx, input + (64<<2), 16 ); - sha256_4way_close( &ctx, vhash ); - - sha256_4way_init( &ctx ); - sha256_4way( &ctx, vhash, 32 ); - sha256_4way_close( &ctx, vhash ); - - sha256_4way_init( &ctx ); - sha256_4way( &ctx, vhash, 32 ); - sha256_4way_close( &ctx, vhash ); - - sha256_4way_init( &ctx ); - sha256_4way( &ctx, vhash, 32 ); - sha256_4way_close( &ctx, output ); -} - -int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t hash[8*4] __attribute__ ((aligned (32))); - uint32_t *hash7 = &(hash[7<<2]); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - __m128i *noncev = (__m128i*)vdata + 19; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - - const uint64_t htmax[] = { 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 }; - const uint32_t masks[] = { 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 }; - - mm128_bswap32_intrlv80_4x32( vdata, pdata ); - sha256_4way_init( &sha256_ctx4 ); - sha256_4way( &sha256_ctx4, vdata, 64 ); - - for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do { - *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) ); - pdata[19] = n; - - sha256q_4way_hash( hash, vdata ); - - for ( int lane = 0; lane < 4; lane++ ) - if ( !( hash7[ lane ] & mask ) ) - { - extr_lane_4x32( lane_hash, hash, lane, 256 ); - - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_lane_solution( work, lane_hash, mythr, lane ); - } - } - n += 4; - } 
while ( (n < max_nonce - 4) && !work_restart[thr_id].restart ); - break; - } - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif - diff --git a/algo/sha/sha256q.c b/algo/sha/sha256q.c deleted file mode 100644 index 25f7d2d..0000000 --- a/algo/sha/sha256q.c +++ /dev/null @@ -1,104 +0,0 @@ -#include "sha256t-gate.h" -#include -#include -#include -#include -#include - -static __thread SHA256_CTX sha256q_ctx __attribute__ ((aligned (64))); - -void sha256q_midstate( const void* input ) -{ - SHA256_Init( &sha256q_ctx ); - SHA256_Update( &sha256q_ctx, input, 64 ); -} - -void sha256q_hash( void* output, const void* input ) -{ - uint32_t _ALIGN(64) hash[16]; - const int midlen = 64; // bytes - const int tail = 80 - midlen; // 16 - - SHA256_CTX ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &sha256q_ctx, sizeof sha256q_ctx ); - - SHA256_Update( &ctx, input + midlen, tail ); - SHA256_Final( (unsigned char*)hash, &ctx ); - - SHA256_Init( &ctx ); - SHA256_Update( &ctx, hash, 32 ); - SHA256_Final( (unsigned char*)hash, &ctx ); - - SHA256_Init( &ctx ); - SHA256_Update( &ctx, hash, 32 ); - SHA256_Final( (unsigned char*)hash, &ctx ); - - SHA256_Init( &ctx ); - SHA256_Update( &ctx, hash, 32 ); - SHA256_Final( (unsigned char*)hash, &ctx ); - - memcpy( output, hash, 32 ); -} - -int scanhash_sha256q( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; -#ifdef _MSC_VER - uint32_t __declspec(align(32)) hash64[8]; -#else - uint32_t hash64[8] __attribute__((aligned(32))); -#endif - uint32_t endiandata[32]; - int thr_id = mythr->id; // thr_id arg is deprecated - - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - // we need 
bigendian data... - casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - - sha256q_midstate( endiandata ); - - for ( int m = 0; m < 6; m++ ) - { - if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - sha256q_hash( hash64, endiandata ); - if ( !( hash64[7] & mask ) ) - if ( fulltest( hash64, ptarget ) && !opt_benchmark ) - submit_solution( work, hash64, mythr ); - } while ( n < max_nonce && !work_restart[thr_id].restart ); - break; - } - } - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c deleted file mode 100644 index c17ea1d..0000000 --- a/algo/sha/sha256t-4way.c +++ /dev/null @@ -1,307 +0,0 @@ -#include "sha256t-gate.h" -#include -#include -#include -#include -#include "sha2-hash-4way.h" - -#if defined(SHA256T_11WAY) - -static __thread sha256_11way_context sha256_ctx11 __attribute__ ((aligned (64))); - -void sha256t_11way_hash( void *outx, void *outy, void *outz, const void *inpx, - const void *inpy, const void*inpz ) -{ - uint32_t hashx[8*8] __attribute__ ((aligned (64))); - uint32_t hashy[8*2] __attribute__ ((aligned (64))); - uint32_t hashz[8] __attribute__ ((aligned (64))); - sha256_11way_context ctx; - const void *inpx64 = inpx+(64<<3); - const void *inpy64 = inpy+(64<<1); - const void *inpz64 = inpz+ 64; - - memcpy( &ctx, &sha256_ctx11, sizeof ctx ); - sha256_11way_update( &ctx, inpx64, inpy64, inpz64, 16 ); - sha256_11way_close( &ctx, hashx, hashy, hashz ); - - sha256_11way_init( &ctx ); - sha256_11way_update( &ctx, hashx, hashy, hashz, 32 ); - sha256_11way_close( &ctx, hashx, hashy, hashz ); 
- - sha256_11way_init( &ctx ); - sha256_11way_update( &ctx, hashx, hashy, hashz, 32 ); - sha256_11way_close( &ctx, outx, outy, outz ); -} - -int scanhash_sha256t_11way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t datax[20*8] __attribute__ ((aligned (64))); - uint32_t datay[20*2] __attribute__ ((aligned (32))); - uint32_t dataz[20] __attribute__ ((aligned (32))); - uint32_t hashx[8*8] __attribute__ ((aligned (32))); - uint32_t hashy[8*2] __attribute__ ((aligned (32))); - uint32_t hashz[8] __attribute__ ((aligned (32))); - uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hash7; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - __m256i *noncex = (__m256i*) datax + 19; - __m64 *noncey = (__m64*) datay + 19; - uint32_t *noncez = (uint32_t*)dataz + 19; - int thr_id = mythr->id; // thr_id arg is deprecated - int i; - const uint64_t htmax[] = { 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 }; - const uint32_t masks[] = { 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 }; - - // Use dataz (scalar) to stage bswapped data for the vectors. 
- casti_m256i( dataz, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) ); - casti_m256i( dataz, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) ); - casti_m128i( dataz, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - - intrlv_8x32( datax, dataz, dataz, dataz, dataz, - dataz, dataz, dataz, dataz, 640 ); - mm64_interleave_2x32( datay, dataz, dataz, 640 ); - - sha256_11way_init( &sha256_ctx11 ); - sha256_11way_update( &sha256_ctx11, datax, datay, dataz, 64 ); - - for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - *noncex = mm256_bswap_32( - _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) ); - *noncey = mm64_bswap_32( _mm_set_pi32( n+9, n+8 ) ); - *noncez = bswap_32( n+10 ); - - pdata[19] = n; - - sha256t_11way_hash( hashx, hashy, hashz, datax, datay, dataz ); - - if ( opt_benchmark ) { n += 11; continue; } - - hash7 = &(hashx[7<<3]); - for ( i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) ) - { - // deinterleave hash for lane - extr_lane_8x32( lane_hash, hashx, i, 256 ); - if ( fulltest( lane_hash, ptarget ) ) - { - pdata[19] = n + i; - submit_lane_solution( work, lane_hash, mythr, i ); - } - } - - hash7 = &(hashy[7<<1]); - for( i = 0; i < 2; i++ ) if ( !(hash7[ 0] & mask ) ) - - { - mm64_extr_lane_2x32( lane_hash, hashy, i, 256 ); - if ( fulltest( lane_hash, ptarget ) ) - { - pdata[19] = n + 8 + i; - submit_lane_solution( work, lane_hash, mythr, i+8 ); - } - } - - if ( !(hashz[7] & mask ) && fulltest( hashz, ptarget ) ) - { - pdata[19] = n+10; - submit_lane_solution( work, hashz, mythr, 10 ); - } - n += 11; - - } while ( (n < max_nonce-12) && !work_restart[thr_id].restart ); - break; - } - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif - -#if defined(SHA256T_8WAY) - -static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64))); - -void sha256t_8way_hash( void* output, const void* input ) -{ - uint32_t vhash[8*8] __attribute__ ((aligned (64))); - sha256_8way_context ctx; - memcpy( 
&ctx, &sha256_ctx8, sizeof ctx ); - - sha256_8way( &ctx, input + (64<<3), 16 ); - sha256_8way_close( &ctx, vhash ); - - sha256_8way_init( &ctx ); - sha256_8way( &ctx, vhash, 32 ); - sha256_8way_close( &ctx, vhash ); - - sha256_8way_init( &ctx ); - sha256_8way( &ctx, vhash, 32 ); - sha256_8way_close( &ctx, output ); -} - -int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t vdata[20*8] __attribute__ ((aligned (64))); - uint32_t hash[8*8] __attribute__ ((aligned (32))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *hash7 = &(hash[7<<3]); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - __m256i *noncev = (__m256i*)vdata + 19; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - - const uint64_t htmax[] = { 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 }; - const uint32_t masks[] = { 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 }; - - - // Need big endian data - mm256_bswap32_intrlv80_8x32( vdata, pdata ); - sha256_8way_init( &sha256_ctx8 ); - sha256_8way( &sha256_ctx8, vdata, 64 ); - - for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - *noncev = mm256_bswap_32( _mm256_set_epi32( - n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) ); - pdata[19] = n; - sha256t_8way_hash( hash, vdata ); - for ( int lane = 0; lane < 8; lane++ ) - if ( !( hash7[ lane ] & mask ) ) - { - // deinterleave hash for lane - extr_lane_8x32( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_lane_solution( work, lane_hash, mythr, lane ); - } - } - n += 8; - } while ( (n < max_nonce-10) && !work_restart[thr_id].restart ); - break; - } - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif - -#if defined(SHA256T_4WAY) - 
-static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64))); - -void sha256t_4way_hash( void* output, const void* input ) -{ - uint32_t vhash[8*4] __attribute__ ((aligned (64))); - sha256_4way_context ctx; - memcpy( &ctx, &sha256_ctx4, sizeof ctx ); - - sha256_4way( &ctx, input + (64<<2), 16 ); - sha256_4way_close( &ctx, vhash ); - - sha256_4way_init( &ctx ); - sha256_4way( &ctx, vhash, 32 ); - sha256_4way_close( &ctx, vhash ); - - sha256_4way_init( &ctx ); - sha256_4way( &ctx, vhash, 32 ); - sha256_4way_close( &ctx, output ); -} - -int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t hash[8*4] __attribute__ ((aligned (32))); - uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[7<<2]); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - __m128i *noncev = (__m128i*)vdata + 19; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - - const uint64_t htmax[] = { 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 }; - const uint32_t masks[] = { 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 }; - - mm128_bswap32_intrlv80_4x32( vdata, pdata ); - sha256_4way_init( &sha256_ctx4 ); - sha256_4way( &sha256_ctx4, vdata, 64 ); - - for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do { - *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) ); - pdata[19] = n; - - sha256t_4way_hash( hash, vdata ); - - for ( int lane = 0; lane < 4; lane++ ) - if ( !( hash7[ lane ] & mask ) ) - { - extr_lane_4x32( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_lane_solution( work, lane_hash, mythr, lane ); - } - } - n += 4; - } while ( (n < 
max_nonce - 4) && !work_restart[thr_id].restart ); - break; - } - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif - diff --git a/algo/sha/sha256t-gate.c b/algo/sha/sha256t-gate.c deleted file mode 100644 index 0271234..0000000 --- a/algo/sha/sha256t-gate.c +++ /dev/null @@ -1,41 +0,0 @@ -#include "sha256t-gate.h" - -bool register_sha256t_algo( algo_gate_t* gate ) -{ -#if defined(SHA256T_8WAY) - gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT; - gate->scanhash = (void*)&scanhash_sha256t_8way; - gate->hash = (void*)&sha256t_8way_hash; -#elif defined(SHA256T_4WAY) - gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT; - gate->scanhash = (void*)&scanhash_sha256t_4way; - gate->hash = (void*)&sha256t_4way_hash; -#else - gate->optimizations = SHA_OPT; - gate->scanhash = (void*)&scanhash_sha256t; - gate->hash = (void*)&sha256t_hash; -#endif - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -} - -bool register_sha256q_algo( algo_gate_t* gate ) -{ -#if defined(SHA256T_8WAY) - gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT; - gate->scanhash = (void*)&scanhash_sha256q_8way; - gate->hash = (void*)&sha256q_8way_hash; -#elif defined(SHA256T_4WAY) - gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT; - gate->scanhash = (void*)&scanhash_sha256q_4way; - gate->hash = (void*)&sha256q_4way_hash; -#else - gate->optimizations = SHA_OPT; - gate->scanhash = (void*)&scanhash_sha256q; - gate->hash = (void*)&sha256q_hash; -#endif - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; - -} - diff --git a/algo/sha/sha256t-gate.h b/algo/sha/sha256t-gate.h deleted file mode 100644 index 0d519aa..0000000 --- a/algo/sha/sha256t-gate.h +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef __SHA256T_GATE_H__ -#define __SHA256T_GATE_H__ 1 - -#include -#include "algo-gate-api.h" - -// Override multi way on ryzen, SHA is better. 
-#if !defined(__SHA__) - #if defined(__AVX2__) - #define SHA256T_8WAY - #elif defined(__SSE2__) - #define SHA256T_4WAY - #endif -#endif - -bool register_sha256t_algo( algo_gate_t* gate ); -bool register_sha256q_algo( algo_gate_t* gate ); - -#if defined(SHA256T_8WAY) - -void sha256t_8way_hash( void *output, const void *input ); -int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void sha256q_8way_hash( void *output, const void *input ); -int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -#endif - -#if defined(SHA256T_4WAY) - -void sha256t_4way_hash( void *output, const void *input ); -int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void sha256q_4way_hash( void *output, const void *input ); -int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -#endif - -void sha256t_hash( void *output, const void *input ); -int scanhash_sha256t( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void sha256q_hash( void *output, const void *input ); -int scanhash_sha256q( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -#endif - diff --git a/algo/sha/sha256t.c b/algo/sha/sha256t.c deleted file mode 100644 index bb401d0..0000000 --- a/algo/sha/sha256t.c +++ /dev/null @@ -1,100 +0,0 @@ -#include "sha256t-gate.h" -#include -#include -#include -#include -#include - -static __thread SHA256_CTX sha256t_ctx __attribute__ ((aligned (64))); - -void sha256t_midstate( const void* input ) -{ - SHA256_Init( &sha256t_ctx ); - SHA256_Update( &sha256t_ctx, input, 64 ); -} - -void sha256t_hash( void* output, const void* input ) -{ - uint32_t _ALIGN(64) hash[16]; - const int midlen = 64; // bytes - const int tail = 80 - midlen; // 16 - - SHA256_CTX ctx 
__attribute__ ((aligned (64))); - memcpy( &ctx, &sha256t_ctx, sizeof sha256t_ctx ); - - SHA256_Update( &ctx, input + midlen, tail ); - SHA256_Final( (unsigned char*)hash, &ctx ); - - SHA256_Init( &ctx ); - SHA256_Update( &ctx, hash, 32 ); - SHA256_Final( (unsigned char*)hash, &ctx ); - - SHA256_Init( &ctx ); - SHA256_Update( &ctx, hash, 32 ); - SHA256_Final( (unsigned char*)hash, &ctx ); - - memcpy( output, hash, 32 ); -} - -int scanhash_sha256t( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; -#ifdef _MSC_VER - uint32_t __declspec(align(32)) hash64[8]; -#else - uint32_t hash64[8] __attribute__((aligned(32))); -#endif - uint32_t endiandata[32]; - int thr_id = mythr->id; // thr_id arg is deprecated - - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - // we need bigendian data... 
- casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - - sha256t_midstate( endiandata ); - - for ( int m = 0; m < 6; m++ ) - { - if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - sha256t_hash( hash64, endiandata ); - if ( !(hash64[7] & mask) ) - if ( fulltest( hash64, ptarget ) && !opt_benchmark ) - submit_solution( work, hash64, mythr ); - } while ( n < max_nonce && !work_restart[thr_id].restart ); - break; - } - } - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} diff --git a/algo/sha/sha3-defs.h b/algo/sha/sha3-defs.h index 1060737..c3fbb7f 100644 --- a/algo/sha/sha3-defs.h +++ b/algo/sha/sha3-defs.h @@ -1,16 +1,22 @@ #ifndef DEFS_X5_H__ #define DEFS_X5_H__ + +#if defined(__arm__) || defined(__aarch64__) +#include "sse2neon.h" +#else #include +#endif + typedef unsigned char BitSequence; typedef unsigned long long DataLength; -typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2} HashReturn; +typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2 } HashReturn; typedef unsigned char uint8; typedef unsigned int uint32; typedef unsigned long long uint64; -//typedef struct { +// typedef struct { // uint32 buffer[8]; /* Buffer to be hashed */ // __m128i chainv[10]; /* Chaining values */ // uint64 bitlen[2]; /* Message length in bits */ diff --git a/algo/sha/sph_sha2.c b/algo/sha/sph_sha2.c index 1eb225d..e96a2d1 100644 --- a/algo/sha/sph_sha2.c +++ b/algo/sha/sph_sha2.c @@ -71,8 +71,13 @@ static const sph_u32 H256[8] = { * of the compression function. 
*/ -#if SPH_SMALL_FOOTPRINT_SHA2 +#if defined(__SHA__) + +#include "sha256-hash-opt.c" + +#else // no SHA +/* static const sph_u32 K[64] = { SPH_C32(0x428A2F98), SPH_C32(0x71374491), SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5), @@ -107,6 +112,9 @@ static const sph_u32 K[64] = { SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB), SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2) }; +*/ + +#if SPH_SMALL_FOOTPRINT_SHA2 #define SHA2_MEXP1(in, pc) do { \ W[pc] = in(pc); \ @@ -191,7 +199,7 @@ static const sph_u32 K[64] = { (r)[7] = SPH_T32((r)[7] + H); \ } while (0) -#else +#else // large footprint (default) #define SHA2_ROUND_BODY(in, r) do { \ sph_u32 A, B, C, D, E, F, G, H, T1, T2; \ @@ -600,7 +608,7 @@ static const sph_u32 K[64] = { (r)[7] = SPH_T32((r)[7] + H); \ } while (0) -#endif +#endif // small footprint else /* * One round of SHA-224 / SHA-256. The data must be aligned for 32-bit access. @@ -613,6 +621,9 @@ sha2_round(const unsigned char *data, sph_u32 r[8]) #undef SHA2_IN } +#endif // SHA else + + /* see sph_sha2.h */ void sph_sha224_init(void *cc) @@ -653,7 +664,7 @@ void sph_sha224_close(void *cc, void *dst) { sha224_close(cc, dst, 7); - sph_sha224_init(cc); +// sph_sha224_init(cc); } /* see sph_sha2.h */ @@ -661,7 +672,7 @@ void sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) { sha224_addbits_and_close(cc, ub, n, dst, 7); - sph_sha224_init(cc); +// sph_sha224_init(cc); } /* see sph_sha2.h */ @@ -677,14 +688,22 @@ void sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) { sha224_addbits_and_close(cc, ub, n, dst, 8); - sph_sha256_init(cc); +// sph_sha256_init(cc); } -/* see sph_sha2.h */ -void -sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8]) +void sph_sha256_full( void *dst, const void *data, size_t len ) { -#define SHA2_IN(x) msg[x] - SHA2_ROUND_BODY(SHA2_IN, val); -#undef SHA2_IN -} + sph_sha256_context cc; + sph_sha256_init( &cc ); + sph_sha256( &cc, data, len ); + sph_sha256_close( &cc, dst ); +} + +/* see sph_sha2.h */ 
+//void +//sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8]) +//{ +//#define SHA2_IN(x) msg[x] +// SHA2_ROUND_BODY(SHA2_IN, val); +//#undef SHA2_IN +//} diff --git a/algo/sha/sph_sha2.h b/algo/sha/sph_sha2.h index d5bda73..e3a83eb 100644 --- a/algo/sha/sph_sha2.h +++ b/algo/sha/sph_sha2.h @@ -73,7 +73,7 @@ typedef struct { sph_u32 count_high, count_low; #endif #endif -} sph_sha224_context; +} sph_sha224_context __attribute__((aligned(64))); /** * This structure is a context for SHA-256 computations. It is identical @@ -205,6 +205,10 @@ void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]); #define sph_sha256_comp sph_sha224_comp #endif +void sph_sha256_full( void *dst, const void *data, size_t len ); + + + #if SPH_64 /** diff --git a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c deleted file mode 100644 index 383e936..0000000 --- a/algo/shabal/shabal-hash-4way.c +++ /dev/null @@ -1,618 +0,0 @@ -/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */ -/* - * Shabal implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include -#include - -#ifdef __AVX2__ - -#include "shabal-hash-4way.h" -#ifdef __cplusplus -extern "C"{ -#endif - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -/* - * Part of this code was automatically generated (the part between - * the "BEGIN" and "END" markers). - */ - -#define sM 16 - -#define C32 SPH_C32 -#define T32 SPH_T32 - -#define O1 13 -#define O2 9 -#define O3 6 - -/* - * We copy the state into local variables, so that the compiler knows - * that it can optimize them at will. - */ - -/* BEGIN -- automatically generated code. */ - -#define DECL_STATE \ - __m128i A00, A01, A02, A03, A04, A05, A06, A07, \ - A08, A09, A0A, A0B; \ - __m128i B0, B1, B2, B3, B4, B5, B6, B7, \ - B8, B9, BA, BB, BC, BD, BE, BF; \ - __m128i C0, C1, C2, C3, C4, C5, C6, C7, \ - C8, C9, CA, CB, CC, CD, CE, CF; \ - __m128i M0, M1, M2, M3, M4, M5, M6, M7, \ - M8, M9, MA, MB, MC, MD, ME, MF; \ - sph_u32 Wlow, Whigh; - -#define READ_STATE(state) do { \ - A00 = (state)->A[0]; \ - A01 = (state)->A[1]; \ - A02 = (state)->A[2]; \ - A03 = (state)->A[3]; \ - A04 = (state)->A[4]; \ - A05 = (state)->A[5]; \ - A06 = (state)->A[6]; \ - A07 = (state)->A[7]; \ - A08 = (state)->A[8]; \ - A09 = (state)->A[9]; \ - A0A = (state)->A[10]; \ - A0B = (state)->A[11]; \ - B0 = (state)->B[0]; \ - B1 = (state)->B[1]; \ - B2 = (state)->B[2]; \ - B3 = (state)->B[3]; \ - B4 = (state)->B[4]; \ - B5 = (state)->B[5]; \ - B6 = (state)->B[6]; \ - B7 = (state)->B[7]; \ - B8 = (state)->B[8]; \ - B9 = (state)->B[9]; \ - BA = (state)->B[10]; \ - BB = (state)->B[11]; \ - BC = (state)->B[12]; \ - BD = (state)->B[13]; \ - BE 
= (state)->B[14]; \ - BF = (state)->B[15]; \ - C0 = (state)->C[0]; \ - C1 = (state)->C[1]; \ - C2 = (state)->C[2]; \ - C3 = (state)->C[3]; \ - C4 = (state)->C[4]; \ - C5 = (state)->C[5]; \ - C6 = (state)->C[6]; \ - C7 = (state)->C[7]; \ - C8 = (state)->C[8]; \ - C9 = (state)->C[9]; \ - CA = (state)->C[10]; \ - CB = (state)->C[11]; \ - CC = (state)->C[12]; \ - CD = (state)->C[13]; \ - CE = (state)->C[14]; \ - CF = (state)->C[15]; \ - Wlow = (state)->Wlow; \ - Whigh = (state)->Whigh; \ - } while (0) - -#define WRITE_STATE(state) do { \ - (state)->A[0] = A00; \ - (state)->A[1] = A01; \ - (state)->A[2] = A02; \ - (state)->A[3] = A03; \ - (state)->A[4] = A04; \ - (state)->A[5] = A05; \ - (state)->A[6] = A06; \ - (state)->A[7] = A07; \ - (state)->A[8] = A08; \ - (state)->A[9] = A09; \ - (state)->A[10] = A0A; \ - (state)->A[11] = A0B; \ - (state)->B[0] = B0; \ - (state)->B[1] = B1; \ - (state)->B[2] = B2; \ - (state)->B[3] = B3; \ - (state)->B[4] = B4; \ - (state)->B[5] = B5; \ - (state)->B[6] = B6; \ - (state)->B[7] = B7; \ - (state)->B[8] = B8; \ - (state)->B[9] = B9; \ - (state)->B[10] = BA; \ - (state)->B[11] = BB; \ - (state)->B[12] = BC; \ - (state)->B[13] = BD; \ - (state)->B[14] = BE; \ - (state)->B[15] = BF; \ - (state)->C[0] = C0; \ - (state)->C[1] = C1; \ - (state)->C[2] = C2; \ - (state)->C[3] = C3; \ - (state)->C[4] = C4; \ - (state)->C[5] = C5; \ - (state)->C[6] = C6; \ - (state)->C[7] = C7; \ - (state)->C[8] = C8; \ - (state)->C[9] = C9; \ - (state)->C[10] = CA; \ - (state)->C[11] = CB; \ - (state)->C[12] = CC; \ - (state)->C[13] = CD; \ - (state)->C[14] = CE; \ - (state)->C[15] = CF; \ - (state)->Wlow = Wlow; \ - (state)->Whigh = Whigh; \ - } while (0) - -#define DECODE_BLOCK \ -do { \ - M0 = buf[ 0]; \ - M1 = buf[ 1]; \ - M2 = buf[ 2]; \ - M3 = buf[ 3]; \ - M4 = buf[ 4]; \ - M5 = buf[ 5]; \ - M6 = buf[ 6]; \ - M7 = buf[ 7]; \ - M8 = buf[ 8]; \ - M9 = buf[ 9]; \ - MA = buf[10]; \ - MB = buf[11]; \ - MC = buf[12]; \ - MD = buf[13]; \ - ME = buf[14]; \ - MF 
= buf[15]; \ -} while (0) - -#define INPUT_BLOCK_ADD \ -do { \ - B0 = _mm_add_epi32( B0, M0 );\ - B1 = _mm_add_epi32( B1, M1 );\ - B2 = _mm_add_epi32( B2, M2 );\ - B3 = _mm_add_epi32( B3, M3 );\ - B4 = _mm_add_epi32( B4, M4 );\ - B5 = _mm_add_epi32( B5, M5 );\ - B6 = _mm_add_epi32( B6, M6 );\ - B7 = _mm_add_epi32( B7, M7 );\ - B8 = _mm_add_epi32( B8, M8 );\ - B9 = _mm_add_epi32( B9, M9 );\ - BA = _mm_add_epi32( BA, MA );\ - BB = _mm_add_epi32( BB, MB );\ - BC = _mm_add_epi32( BC, MC );\ - BD = _mm_add_epi32( BD, MD );\ - BE = _mm_add_epi32( BE, ME );\ - BF = _mm_add_epi32( BF, MF );\ -} while (0) - -#define INPUT_BLOCK_SUB \ -do { \ - C0 = _mm_sub_epi32( C0, M0 ); \ - C1 = _mm_sub_epi32( C1, M1 ); \ - C2 = _mm_sub_epi32( C2, M2 ); \ - C3 = _mm_sub_epi32( C3, M3 ); \ - C4 = _mm_sub_epi32( C4, M4 ); \ - C5 = _mm_sub_epi32( C5, M5 ); \ - C6 = _mm_sub_epi32( C6, M6 ); \ - C7 = _mm_sub_epi32( C7, M7 ); \ - C8 = _mm_sub_epi32( C8, M8 ); \ - C9 = _mm_sub_epi32( C9, M9 ); \ - CA = _mm_sub_epi32( CA, MA ); \ - CB = _mm_sub_epi32( CB, MB ); \ - CC = _mm_sub_epi32( CC, MC ); \ - CD = _mm_sub_epi32( CD, MD ); \ - CE = _mm_sub_epi32( CE, ME ); \ - CF = _mm_sub_epi32( CF, MF ); \ -} while (0) - -#define XOR_W \ -do { \ - A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \ - A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \ -} while (0) -/* -#define SWAP(v1, v2) do { \ - sph_u32 tmp = (v1); \ - (v1) = (v2); \ - (v2) = tmp; \ - } while (0) -*/ -#define SWAP_BC \ -do { \ - mm128_swap128_256( B0, C0 ); \ - mm128_swap128_256( B1, C1 ); \ - mm128_swap128_256( B2, C2 ); \ - mm128_swap128_256( B3, C3 ); \ - mm128_swap128_256( B4, C4 ); \ - mm128_swap128_256( B5, C5 ); \ - mm128_swap128_256( B6, C6 ); \ - mm128_swap128_256( B7, C7 ); \ - mm128_swap128_256( B8, C8 ); \ - mm128_swap128_256( B9, C9 ); \ - mm128_swap128_256( BA, CA ); \ - mm128_swap128_256( BB, CB ); \ - mm128_swap128_256( BC, CC ); \ - mm128_swap128_256( BD, CD ); \ - mm128_swap128_256( BE, CE ); \ - 
mm128_swap128_256( BF, CF ); \ -} while (0) - -#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ -do { \ - xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \ - _mm_andnot_si128( xb3, xb2 ), \ - _mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \ - _mm_mullo_epi32( mm128_rol_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \ - ) ), _mm_set1_epi32(3UL) ) ) ) ); \ - xb0 = mm128_not( _mm_xor_si128( xa0, mm128_rol_32( xb0, 1 ) ) ); \ -} while (0) - -#define PERM_STEP_0 do { \ - PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \ - PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \ - PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \ - PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \ - PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \ - PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \ - PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \ - PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \ - PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \ - PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \ - PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \ - PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \ - PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \ - PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \ - PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \ - PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \ - } while (0) - -#define PERM_STEP_1 do { \ - PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \ - PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \ - PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \ - PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \ - PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \ - PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \ - PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \ - PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \ - PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \ - PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \ - PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \ - PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \ - PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \ - PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \ - PERM_ELT(A06, A05, BE, 
BB, B7, B4, CA, ME); \ - PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \ - } while (0) - -#define PERM_STEP_2 do { \ - PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \ - PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \ - PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \ - PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \ - PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \ - PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \ - PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \ - PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \ - PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \ - PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \ - PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \ - PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \ - PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \ - PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \ - PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \ - PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \ - } while (0) - -#define APPLY_P \ -do { \ - B0 = mm128_ror_32( B0, 15 ); \ - B1 = mm128_ror_32( B1, 15 ); \ - B2 = mm128_ror_32( B2, 15 ); \ - B3 = mm128_ror_32( B3, 15 ); \ - B4 = mm128_ror_32( B4, 15 ); \ - B5 = mm128_ror_32( B5, 15 ); \ - B6 = mm128_ror_32( B6, 15 ); \ - B7 = mm128_ror_32( B7, 15 ); \ - B8 = mm128_ror_32( B8, 15 ); \ - B9 = mm128_ror_32( B9, 15 ); \ - BA = mm128_ror_32( BA, 15 ); \ - BB = mm128_ror_32( BB, 15 ); \ - BC = mm128_ror_32( BC, 15 ); \ - BD = mm128_ror_32( BD, 15 ); \ - BE = mm128_ror_32( BE, 15 ); \ - BF = mm128_ror_32( BF, 15 ); \ - PERM_STEP_0; \ - PERM_STEP_1; \ - PERM_STEP_2; \ - A0B = _mm_add_epi32( A0B, C6 ); \ - A0A = _mm_add_epi32( A0A, C5 ); \ - A09 = _mm_add_epi32( A09, C4 ); \ - A08 = _mm_add_epi32( A08, C3 ); \ - A07 = _mm_add_epi32( A07, C2 ); \ - A06 = _mm_add_epi32( A06, C1 ); \ - A05 = _mm_add_epi32( A05, C0 ); \ - A04 = _mm_add_epi32( A04, CF ); \ - A03 = _mm_add_epi32( A03, CE ); \ - A02 = _mm_add_epi32( A02, CD ); \ - A01 = _mm_add_epi32( A01, CC ); \ - A00 = _mm_add_epi32( A00, CB ); \ - A0B = _mm_add_epi32( A0B, CA ); \ - A0A = _mm_add_epi32( 
A0A, C9 ); \ - A09 = _mm_add_epi32( A09, C8 ); \ - A08 = _mm_add_epi32( A08, C7 ); \ - A07 = _mm_add_epi32( A07, C6 ); \ - A06 = _mm_add_epi32( A06, C5 ); \ - A05 = _mm_add_epi32( A05, C4 ); \ - A04 = _mm_add_epi32( A04, C3 ); \ - A03 = _mm_add_epi32( A03, C2 ); \ - A02 = _mm_add_epi32( A02, C1 ); \ - A01 = _mm_add_epi32( A01, C0 ); \ - A00 = _mm_add_epi32( A00, CF ); \ - A0B = _mm_add_epi32( A0B, CE ); \ - A0A = _mm_add_epi32( A0A, CD ); \ - A09 = _mm_add_epi32( A09, CC ); \ - A08 = _mm_add_epi32( A08, CB ); \ - A07 = _mm_add_epi32( A07, CA ); \ - A06 = _mm_add_epi32( A06, C9 ); \ - A05 = _mm_add_epi32( A05, C8 ); \ - A04 = _mm_add_epi32( A04, C7 ); \ - A03 = _mm_add_epi32( A03, C6 ); \ - A02 = _mm_add_epi32( A02, C5 ); \ - A01 = _mm_add_epi32( A01, C4 ); \ - A00 = _mm_add_epi32( A00, C3 ); \ -} while (0) - -#define INCR_W do { \ - if ((Wlow = T32(Wlow + 1)) == 0) \ - Whigh = T32(Whigh + 1); \ - } while (0) - -static const sph_u32 A_init_256[] = { - C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191), - C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C), - C32(0x14CE5A45), C32(0x22AF50DC), C32(0xEFFDBC6B), C32(0xEB21B74A) -}; - -static const sph_u32 B_init_256[] = { - C32(0xB555C6EE), C32(0x3E710596), C32(0xA72A652F), C32(0x9301515F), - C32(0xDA28C1FA), C32(0x696FD868), C32(0x9CB6BF72), C32(0x0AFE4002), - C32(0xA6E03615), C32(0x5138C1D4), C32(0xBE216306), C32(0xB38B8890), - C32(0x3EA8B96B), C32(0x3299ACE4), C32(0x30924DD4), C32(0x55CB34A5) -}; - -static const sph_u32 C_init_256[] = { - C32(0xB405F031), C32(0xC4233EBA), C32(0xB3733979), C32(0xC0DD9D55), - C32(0xC51C28AE), C32(0xA327B8E1), C32(0x56C56167), C32(0xED614433), - C32(0x88B59D60), C32(0x60E2CEBA), C32(0x758B4B8B), C32(0x83E82A7F), - C32(0xBC968828), C32(0xE6E00BF7), C32(0xBA839E55), C32(0x9B491C60) -}; - -static const sph_u32 A_init_512[] = { - C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632), - C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), 
C32(0xFAD06B8B), - C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F) -}; - -static const sph_u32 B_init_512[] = { - C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640), - C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08), - C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E), - C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B) -}; - -static const sph_u32 C_init_512[] = { - C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359), - C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780), - C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A), - C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969) -}; - -static void -shabal_4way_init( void *cc, unsigned size ) -{ - shabal_4way_context *sc = (shabal_4way_context*)cc; - int i; - - if ( size == 512 ) - { - for ( i = 0; i < 12; i++ ) - sc->A[i] = _mm_set1_epi32( A_init_512[i] ); - for ( i = 0; i < 16; i++ ) - { - sc->B[i] = _mm_set1_epi32( B_init_512[i] ); - sc->C[i] = _mm_set1_epi32( C_init_512[i] ); - } - } - else - { - for ( i = 0; i < 12; i++ ) - sc->A[i] = _mm_set1_epi32( A_init_256[i] ); - for ( i = 0; i < 16; i++ ) - { - sc->B[i] = _mm_set1_epi32( B_init_256[i] ); - sc->C[i] = _mm_set1_epi32( C_init_256[i] ); - } - } - sc->Wlow = 1; - sc->Whigh = 0; - sc->ptr = 0; -} - -static void -shabal_4way_core( void *cc, const unsigned char *data, size_t len ) -{ - shabal_4way_context *sc = (shabal_4way_context*)cc; - __m128i *buf; - __m128i *vdata = (__m128i*)data; - const int buf_size = 64; - size_t ptr; - DECL_STATE - - buf = sc->buf; - ptr = sc->ptr; - - if ( len < (buf_size - ptr ) ) - { - memcpy_128( buf + (ptr>>2), vdata, len>>2 ); - ptr += len; - sc->ptr = ptr; - return; - } - READ_STATE(sc); - - while ( len > 0 ) - { - size_t clen; - clen = buf_size - ptr; - if ( clen > len ) - clen = len; - memcpy_128( buf + (ptr>>2), vdata, clen>>2 ); - - ptr += clen; - vdata += clen>>2; - len -= clen; 
- if ( ptr == buf_size ) - { - DECODE_BLOCK; - INPUT_BLOCK_ADD; - XOR_W; - APPLY_P; - INPUT_BLOCK_SUB; - SWAP_BC; - INCR_W; - ptr = 0; - } - } - WRITE_STATE(sc); - sc->ptr = ptr; -} - -static void -shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst, - unsigned size_words ) -{ - shabal_4way_context *sc = (shabal_4way_context*)cc; - __m128i *buf; - const int buf_size = 64; - size_t ptr; - int i; - unsigned z, zz; - DECL_STATE - - buf = sc->buf; - ptr = sc->ptr; - z = 0x80 >> n; - zz = ((ub & -z) | z) & 0xFF; - buf[ptr>>2] = _mm_set1_epi32( zz ); - memset_zero_128( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 ); - READ_STATE(sc); - DECODE_BLOCK; - INPUT_BLOCK_ADD; - XOR_W; - APPLY_P; - - for ( i = 0; i < 3; i ++ ) - { - SWAP_BC; - XOR_W; - APPLY_P; - } - - __m128i *d = (__m128i*)dst; - if ( size_words == 16 ) // 512 - { - d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3; - d[ 4] = B4; d[ 5] = B5; d[ 6] = B6; d[ 7] = B7; - d[ 8] = B8; d[ 9] = B9; d[10] = BA; d[11] = BB; - d[12] = BC; d[13] = BD; d[14] = BE; d[15] = BF; - } - else // 256 - { - d[ 0] = B8; d[ 1] = B9; d[ 2] = BA; d[ 3] = BB; - d[ 4] = BC; d[ 5] = BD; d[ 6] = BE; d[ 7] = BF; - } -} - -void -shabal256_4way_init( void *cc ) -{ - shabal_4way_init(cc, 256); -} - -void -shabal256_4way( void *cc, const void *data, size_t len ) -{ - shabal_4way_core( cc, data, len ); -} - -void -shabal256_4way_close( void *cc, void *dst ) -{ - shabal_4way_close(cc, 0, 0, dst, 8); -} - -void -shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, - void *dst ) -{ - shabal_4way_close(cc, ub, n, dst, 8); -} - -void -shabal512_4way_init(void *cc) -{ - shabal_4way_init(cc, 512); -} - -void -shabal512_4way(void *cc, const void *data, size_t len) -{ - shabal_4way_core(cc, data, len); -} - -void -shabal512_4way_close(void *cc, void *dst) -{ - shabal_4way_close(cc, 0, 0, dst, 16); -} - -void -shabal512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - shabal_4way_close(cc, ub, n, dst, 
16); -} -#ifdef __cplusplus -} -#endif - -#endif diff --git a/algo/shabal/shabal-hash-4way.h b/algo/shabal/shabal-hash-4way.h deleted file mode 100644 index dbdfe2b..0000000 --- a/algo/shabal/shabal-hash-4way.h +++ /dev/null @@ -1,82 +0,0 @@ -/* $Id: sph_shabal.h 175 2010-05-07 16:03:20Z tp $ */ -/** - * Shabal interface. Shabal is a family of functions which differ by - * their output size; this implementation defines Shabal for output - * sizes 192, 224, 256, 384 and 512 bits. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @file sph_shabal.h - * @author Thomas Pornin - */ - -#ifndef SHABAL_HASH_4WAY_H__ -#define SHABAL_HASH_4WAY_H__ 1 - -#ifdef __AVX2__ - -#include -#include "algo/sha/sph_types.h" -#include "simd-utils.h" - -#ifdef __cplusplus -extern "C"{ -#endif - -#define SPH_SIZE_shabal256 256 - -#define SPH_SIZE_shabal512 512 - -typedef struct { - __m128i buf[16] __attribute__ ((aligned (64))); - __m128i A[12], B[16], C[16]; - sph_u32 Whigh, Wlow; - size_t ptr; -} shabal_4way_context; - -typedef shabal_4way_context shabal256_4way_context; -typedef shabal_4way_context shabal512_4way_context; - -void shabal256_4way_init( void *cc ); -void shabal256_4way( void *cc, const void *data, size_t len ); -void shabal256_4way_close( void *cc, void *dst ); -void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, - void *dst ); - -void shabal512_4way_init( void *cc ); -void shabal512_4way( void *cc, const void *data, size_t len ); -void shabal512_4way_close( void *cc, void *dst ); -void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, - void *dst ); - -#ifdef __cplusplus -} -#endif - -#endif - -#endif - diff --git a/algo/shavite/shavite-hash-2way.c b/algo/shavite/shavite-hash-2way.c deleted file mode 100644 index 1a4a685..0000000 --- a/algo/shavite/shavite-hash-2way.c +++ /dev/null @@ -1,411 +0,0 @@ -#include "shavite-hash-2way.h" -#include "algo/sha/sph_types.h" - -#include - -#if defined(__AVX2__) - -static const uint32_t IV512[] = -{ - 0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC, - 0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC, - 0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47, - 0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A -}; - -#define mm256_ror2x256hi_1x32( a, b ) \ - _mm256_blend_epi32( mm256_ror1x32_128( a ), \ - mm256_ror1x32_128( b ), 0x88 ) - -static void -c512_2way( shavite512_2way_context *ctx, const void *msg ) -{ - const __m128i zero = 
_mm_setzero_si128(); - __m256i p0, p1, p2, p3, x; - __m256i k00, k01, k02, k03, k10, k11, k12, k13; - __m256i *m = (__m256i*)msg; - __m256i *h = (__m256i*)ctx->h; - int r; - - p0 = h[0]; - p1 = h[1]; - p2 = h[2]; - p3 = h[3]; - - // round - k00 = m[0]; - x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ), zero ); - k01 = m[1]; - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = m[2]; - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = m[3]; - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); - - p0 = _mm256_xor_si256( p0, x ); - - k10 = m[4]; - x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ), zero ); - k11 = m[5]; - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - k12 = m[6]; - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = m[7]; - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); - - p2 = _mm256_xor_si256( p2, x ); - - for ( r = 0; r < 3; r ++ ) - { - // round 1, 5, 9 - - k00 = _mm256_xor_si256( k13, mm256_ror1x32_128( - mm256_aesenc_2x128( k00, zero ) ) ); - - if ( r == 0 ) - k00 = _mm256_xor_si256( k00, _mm256_set_epi32( - ~ctx->count3, ctx->count2, ctx->count1, ctx->count0, - ~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) ); - - x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero ); - k01 = _mm256_xor_si256( k00, - mm256_ror1x32_128( mm256_aesenc_2x128( k01, zero ) ) ); - - if ( r == 1 ) - k01 = _mm256_xor_si256( k01, _mm256_set_epi32( - ~ctx->count0, ctx->count1, ctx->count2, ctx->count3, - ~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) ); - - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = _mm256_xor_si256( k01, - mm256_ror1x32_128( mm256_aesenc_2x128( k02, zero ) ) ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = _mm256_xor_si256( k02, - mm256_ror1x32_128( mm256_aesenc_2x128( k03, zero ) ) ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); - - p3 = _mm256_xor_si256( p3, x ); - - k10 = 
_mm256_xor_si256( k03, - mm256_ror1x32_128( mm256_aesenc_2x128( k10, zero ) ) ); - x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero ); - k11 = _mm256_xor_si256( k10, - mm256_ror1x32_128( mm256_aesenc_2x128( k11, zero ) ) ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - k12 = _mm256_xor_si256( k11, - mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) ) ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = _mm256_xor_si256( k12, - mm256_ror1x32_128( mm256_aesenc_2x128( k13, zero ) ) ); - - if ( r == 2 ) - k13 = _mm256_xor_si256( k13, _mm256_set_epi32( - ~ctx->count1, ctx->count0, ctx->count3, ctx->count2, - ~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) ); - - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); - p1 = _mm256_xor_si256( p1, x ); - - // round 2, 6, 10 - - k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) ); - x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k00 ), zero ); - k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); - - p2 = _mm256_xor_si256( p2, x ); - - k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) ); - x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k10 ), zero ); - k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); - - p0 = _mm256_xor_si256( p0, x ); - - // round 3, 7, 11 - - k00 = 
_mm256_xor_si256( mm256_ror1x32_128( - mm256_aesenc_2x128( k00, zero ) ), k13 ); - x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero ); - k01 = _mm256_xor_si256( mm256_ror1x32_128( - mm256_aesenc_2x128( k01, zero ) ), k00 ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = _mm256_xor_si256( mm256_ror1x32_128( - mm256_aesenc_2x128( k02, zero ) ), k01 ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = _mm256_xor_si256( mm256_ror1x32_128( - mm256_aesenc_2x128( k03, zero ) ), k02 ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); - - p1 = _mm256_xor_si256( p1, x ); - - k10 = _mm256_xor_si256( mm256_ror1x32_128( - mm256_aesenc_2x128( k10, zero ) ), k03 ); - x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero ); - k11 = _mm256_xor_si256( mm256_ror1x32_128( - mm256_aesenc_2x128( k11, zero ) ), k10 ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - k12 = _mm256_xor_si256( mm256_ror1x32_128( - mm256_aesenc_2x128( k12, zero ) ), k11 ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = _mm256_xor_si256( mm256_ror1x32_128( - mm256_aesenc_2x128( k13, zero ) ), k12 ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); - - p3 = _mm256_xor_si256( p3, x ); - - // round 4, 8, 12 - - k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) ); - x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ), zero ); - k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); - - p0 = _mm256_xor_si256( p0, x ); - - k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) ); - x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ), zero ); - k11 = 
_mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); - - p2 = _mm256_xor_si256( p2, x ); - - } - - // round 13 - - k00 = _mm256_xor_si256( mm256_ror1x32_128( - mm256_aesenc_2x128( k00, zero ) ), k13 ); - x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero ); - k01 = _mm256_xor_si256( mm256_ror1x32_128( - mm256_aesenc_2x128( k01, zero ) ), k00 ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = _mm256_xor_si256( mm256_ror1x32_128( - mm256_aesenc_2x128( k02, zero ) ), k01 ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = _mm256_xor_si256( mm256_ror1x32_128( - mm256_aesenc_2x128( k03, zero ) ), k02 ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); - - p3 = _mm256_xor_si256( p3, x ); - - k10 = _mm256_xor_si256( mm256_ror1x32_128( - mm256_aesenc_2x128( k10, zero ) ), k03 ); - x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero ); - k11 = _mm256_xor_si256( mm256_ror1x32_128( - mm256_aesenc_2x128( k11, zero ) ), k10 ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - - k12 = mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) ); - k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32( - ~ctx->count2, ctx->count3, ctx->count0, ctx->count1, - ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) ); - - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = _mm256_xor_si256( mm256_ror1x32_128( - mm256_aesenc_2x128( k13, zero ) ), k12 ); - x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); - - p1 = _mm256_xor_si256( p1, x ); - - h[0] = _mm256_xor_si256( h[0], p2 ); - h[1] = _mm256_xor_si256( h[1], p3 ); - h[2] = _mm256_xor_si256( 
h[2], p0 ); - h[3] = _mm256_xor_si256( h[3], p1 ); -} - -void shavite512_2way_init( shavite512_2way_context *ctx ) -{ - casti_m256i( ctx->h, 0 ) = - _mm256_set_epi32( IV512[ 3], IV512[ 2], IV512[ 1], IV512[ 0], - IV512[ 3], IV512[ 2], IV512[ 1], IV512[ 0] ); - casti_m256i( ctx->h, 1 ) = - _mm256_set_epi32( IV512[ 7], IV512[ 6], IV512[ 5], IV512[ 4], - IV512[ 7], IV512[ 6], IV512[ 5], IV512[ 4] ); - casti_m256i( ctx->h, 2 ) = - _mm256_set_epi32( IV512[11], IV512[10], IV512[ 9], IV512[ 8], - IV512[11], IV512[10], IV512[ 9], IV512[ 8] ); - casti_m256i( ctx->h, 3 ) = - _mm256_set_epi32( IV512[15], IV512[14], IV512[13], IV512[12], - IV512[15], IV512[14], IV512[13], IV512[12] ); - ctx->ptr = 0; - ctx->count0 = 0; - ctx->count1 = 0; - ctx->count2 = 0; - ctx->count3 = 0; -} - -void shavite512_2way_update( shavite512_2way_context *ctx, const void *data, - size_t len ) -{ - unsigned char *buf = ctx->buf; - size_t ptr = ctx->ptr; - - while ( len > 0 ) - { - size_t clen; - - clen = (sizeof ctx->buf) - ptr; - if ( clen > len << 1 ) - clen = len << 1; - memcpy( buf + ptr, data, clen ); - data = (const unsigned char *)data + clen; - ptr += clen; - len -= clen >> 1; - if ( ptr == sizeof ctx->buf ) - { - if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 ) - { - ctx->count1 = ctx->count1 + 1; - if ( ctx->count1 == 0 ) - { - ctx->count2 = ctx->count2 + 1; - if ( ctx->count2 == 0 ) - ctx->count3 = ctx->count3 + 1; - } - } - c512_2way( ctx, buf ); - ptr = 0; - } - } - ctx->ptr = ptr; -} - -void shavite512_2way_close( shavite512_2way_context *ctx, void *dst ) -{ - unsigned char *buf; - union - { - uint32_t u32[4]; - uint16_t u16[8]; - } count; - - buf = ctx->buf; - uint32_t vp = ctx->ptr>>5; - - // Terminating byte then zero pad - casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ); - - // Zero pad full vectors up to count - for ( ; vp < 6; vp++ ) - casti_m256i( buf, vp ) = m256_zero; - - // Count = { 0, 16, 64, 80 }. 
Outsize = 16 u32 = 512 bits = 0x0200 - // Count is misaligned to 16 bits and straddles a vector. - // Use u32 overlay to stage then u16 to load buf. - count.u32[0] = ctx->count0 += (ctx->ptr << 2); // ptr/2 * 8 - count.u32[1] = ctx->count1; - count.u32[2] = ctx->count2; - count.u32[3] = ctx->count3; - - casti_m256i( buf, 6 ) = _mm256_set_epi16( count.u16[0], 0,0,0,0,0,0,0, - count.u16[0], 0,0,0,0,0,0,0 ); - casti_m256i( buf, 7 ) = _mm256_set_epi16( - 0x0200 , count.u16[7], count.u16[6], count.u16[5], - count.u16[4], count.u16[3], count.u16[2], count.u16[1], - 0x0200 , count.u16[7], count.u16[6], count.u16[5], - count.u16[4], count.u16[3], count.u16[2], count.u16[1] ); - - c512_2way( ctx, buf); - - casti_m256i( dst, 0 ) = casti_m256i( ctx->h, 0 ); - casti_m256i( dst, 1 ) = casti_m256i( ctx->h, 1 ); - casti_m256i( dst, 2 ) = casti_m256i( ctx->h, 2 ); - casti_m256i( dst, 3 ) = casti_m256i( ctx->h, 3 ); -} - -void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst, - const void *data, size_t len ) -{ - unsigned char *buf = ctx->buf; - size_t ptr = ctx->ptr; - - // process full blocks and load buf with remainder. - while ( len > 0 ) - { - size_t clen; - - clen = (sizeof ctx->buf) - ptr; - if ( clen > len << 1 ) - clen = len << 1; - memcpy( buf + ptr, data, clen ); - data = (const unsigned char *)data + clen; - ptr += clen; - len -= (clen >> 1); - if ( ptr == sizeof ctx->buf ) - { - if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 ) - { - ctx->count1 = ctx->count1 + 1; - if ( ctx->count1 == 0 ) - { - ctx->count2 = ctx->count2 + 1; - if ( ctx->count2 == 0 ) - ctx->count3 = ctx->count3 + 1; - } - } - c512_2way( ctx, buf ); - ptr = 0; - } - } - - uint32_t vp = ptr>>5; - // Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200 - // Count is misaligned to 16 bits and straddles 2 vectors. - // Use u32 overlay to stage then u16 to load buf. 
- union - { - uint32_t u32[4]; - uint16_t u16[8]; - } count; - - count.u32[0] = ctx->count0 += (ptr << 2); // ptr/2 * 8 - count.u32[1] = ctx->count1; - count.u32[2] = ctx->count2; - count.u32[3] = ctx->count3; - - if ( vp == 0 ) // empty buf, xevan. - { - casti_m256i( buf, 0 ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ); - memset_zero_256( (__m256i*)buf + 1, 5 ); - ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0; - } - else // half full buf, everyone else. - { - casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ); - memset_zero_256( (__m256i*)buf + vp, 6 - vp ); - } - - casti_m256i( buf, 6 ) = _mm256_set_epi16( count.u16[0], 0,0,0,0,0,0,0, - count.u16[0], 0,0,0,0,0,0,0 ); - casti_m256i( buf, 7 ) = _mm256_set_epi16( - 0x0200 , count.u16[7], count.u16[6], count.u16[5], - count.u16[4], count.u16[3], count.u16[2], count.u16[1], - 0x0200 , count.u16[7], count.u16[6], count.u16[5], - count.u16[4], count.u16[3], count.u16[2], count.u16[1] ); - - c512_2way( ctx, buf); - - casti_m256i( dst, 0 ) = casti_m256i( ctx->h, 0 ); - casti_m256i( dst, 1 ) = casti_m256i( ctx->h, 1 ); - casti_m256i( dst, 2 ) = casti_m256i( ctx->h, 2 ); - casti_m256i( dst, 3 ) = casti_m256i( ctx->h, 3 ); -} - -#endif // AVX2 diff --git a/algo/shavite/shavite-hash-2way.h b/algo/shavite/shavite-hash-2way.h deleted file mode 100644 index 869bf4a..0000000 --- a/algo/shavite/shavite-hash-2way.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef SHAVITE_HASH_2WAY_H__ -#define SHAVITE_HASH_2WAY_H__ - -#if defined(__AVX2__) - -#include "simd-utils.h" - -typedef struct { - unsigned char buf[128<<1]; - uint32_t h[16<<1]; - size_t ptr; - uint32_t count0, count1, count2, count3; -} shavite512_2way_context __attribute__ ((aligned (64))); - -void shavite512_2way_init( shavite512_2way_context *ctx ); -void shavite512_2way_update( shavite512_2way_context *ctx, const void *data, - size_t len ); -void shavite512_2way_close( shavite512_2way_context *ctx, void *dst ); -void shavite512_2way_update_close( 
shavite512_2way_context *ctx, void *dst, - const void *data, size_t len ); - -#endif // AVX2 - -#endif // SHAVITE_HASH_2WAY_H__ - diff --git a/algo/shavite/shavite.c b/algo/shavite/shavite.c deleted file mode 100644 index 9ad9844..0000000 --- a/algo/shavite/shavite.c +++ /dev/null @@ -1,159 +0,0 @@ -#include "miner.h" -#include "algo-gate-api.h" -#include -#include - -#include "sph_shavite.h" - -extern void inkhash(void *state, const void *input) -{ - sph_shavite512_context ctx_shavite; - uint32_t hash[16]; - - sph_shavite512_init(&ctx_shavite); - sph_shavite512 (&ctx_shavite, (const void*) input, 80); - sph_shavite512_close(&ctx_shavite, (void*) hash); - - sph_shavite512_init(&ctx_shavite); - sph_shavite512(&ctx_shavite, (const void*) hash, 64); - sph_shavite512_close(&ctx_shavite, (void*) hash); - - memcpy(state, hash, 32); - -/* - int ii; - printf("result: "); - for (ii=0; ii < 32; ii++) - { - printf ("%.2x",((uint8_t*)state)[ii]); - }; - printf ("\n"); -*/ -} - -int scanhash_ink( struct work *work, - uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - int thr_id = mythr->id; - - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - //const uint32_t Htarg = ptarget[7]; - - uint32_t _ALIGN(32) hash64[8]; - uint32_t endiandata[32]; - - //char testdata[] = {"\x70\x00\x00\x00\x5d\x38\x5b\xa1\x14\xd0\x79\x97\x0b\x29\xa9\x41\x8f\xd0\x54\x9e\x7d\x68\xa9\x5c\x7f\x16\x86\x21\xa3\x14\x20\x10\x00\x00\x00\x00\x57\x85\x86\xd1\x49\xfd\x07\xb2\x2f\x3a\x8a\x34\x7c\x51\x6d\xe7\x05\x2f\x03\x4d\x2b\x76\xff\x68\xe0\xd6\xec\xff\x9b\x77\xa4\x54\x89\xe3\xfd\x51\x17\x32\x01\x1d\xf0\x73\x10\x00"}; - - //we need bigendian data... - //lessons learned: do NOT endianchange directly in pdata, this will all proof-of-works be considered as stale from minerd.... 
- int kk=0; - for (; kk < 32; kk++) - { - be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]); - }; - -// if (opt_debug) -// { -// applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce); -// } - - /* I'm to lazy to put the loop in an inline function... so dirty copy'n'paste.... */ - /* i know that i could set a variable, but i don't know how the compiler will optimize it, not that then the cpu needs to load the value *everytime* in a register */ - if (ptarget[7]==0) { - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - inkhash(hash64, endiandata); - if (((hash64[7]&0xFFFFFFFF)==0) && - fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } while (n < max_nonce && !work_restart[thr_id].restart); - } - else if (ptarget[7]<=0xF) - { - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - inkhash(hash64, endiandata); - if (((hash64[7]&0xFFFFFFF0)==0) && - fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } while (n < max_nonce && !work_restart[thr_id].restart); - } - else if (ptarget[7]<=0xFF) - { - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - inkhash(hash64, endiandata); - if (((hash64[7]&0xFFFFFF00)==0) && - fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } while (n < max_nonce && !work_restart[thr_id].restart); - } - else if (ptarget[7]<=0xFFF) - { - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - inkhash(hash64, endiandata); - if (((hash64[7]&0xFFFFF000)==0) && - fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - } - else if (ptarget[7]<=0xFFFF) - { - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - inkhash(hash64, endiandata); - if (((hash64[7]&0xFFFF0000)==0) && - fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } 
while (n < max_nonce && !work_restart[thr_id].restart); - - } - else - { - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - inkhash(hash64, endiandata); - if (fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } while (n < max_nonce && !work_restart[thr_id].restart); - } - - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - -bool register_shavite_algo( algo_gate_t* gate ) -{ - algo_not_implemented(); - return false; - -// gate->scanhash = (void*)&scanhash_ink; -// gate->hash = (void*)&inkhash; -// return true; -}; - diff --git a/algo/shavite/sph-shavite-aesni.c b/algo/shavite/sph-shavite-aesni.c index b60a3b8..e047d77 100644 --- a/algo/shavite/sph-shavite-aesni.c +++ b/algo/shavite/sph-shavite-aesni.c @@ -33,7 +33,7 @@ #include #include -#ifdef __AES__ +#if defined(__AES__) #include "sph_shavite.h" #include "simd-utils.h" @@ -100,9 +100,20 @@ c512( sph_shavite_big_context *sc, const void *msg ) p3 = h[3]; // round + +// working proof of concept +/* + __m512i K = m512_const1_128( m[0] ); + __m512i X = _mm512_xor_si512( m512_const1_128( p1 ), K ); + X = _mm512_aesenc_epi128( X, m512_zero ); + k00 = _mm512_castsi512_si128( K ); + x = _mm512_castsi512_si128( X ); +*/ + k00 = m[0]; x = _mm_xor_si128( p1, k00 ); x = _mm_aesenc_si128( x, zero ); + k01 = m[1]; x = _mm_xor_si128( x, k01 ); x = _mm_aesenc_si128( x, zero ); diff --git a/algo/shavite/sph_shavite.c b/algo/shavite/sph_shavite.c index ba4384b..41988f9 100644 --- a/algo/shavite/sph_shavite.c +++ b/algo/shavite/sph_shavite.c @@ -35,6 +35,8 @@ #include "sph_shavite.h" +#if !defined(__AES__) + #ifdef __cplusplus extern "C"{ #endif @@ -1762,3 +1764,6 @@ sph_shavite512_sw_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst #ifdef __cplusplus } #endif + +#endif // !AES + diff --git a/algo/shavite/sph_shavite.h b/algo/shavite/sph_shavite.h index ed06ca6..cca5972 100644 --- a/algo/shavite/sph_shavite.h +++ b/algo/shavite/sph_shavite.h @@ -262,15 
+262,9 @@ void sph_shavite384_close(void *cc, void *dst); void sph_shavite384_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); -// Always define sw but only define aesni when available -// Define fptrs for aesni or sw, not both. -void sph_shavite512_sw_init(void *cc); -void sph_shavite512_sw(void *cc, const void *data, size_t len); -void sph_shavite512_sw_close(void *cc, void *dst); -void sph_shavite512_sw_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - +//Don't call these directly from application code, use the macros below. #ifdef __AES__ + void sph_shavite512_aesni_init(void *cc); void sph_shavite512_aesni(void *cc, const void *data, size_t len); void sph_shavite512_aesni_close(void *cc, void *dst); @@ -285,6 +279,13 @@ void sph_shavite512_aesni_addbits_and_close( #else +void sph_shavite512_sw_init(void *cc); +void sph_shavite512_sw(void *cc, const void *data, size_t len); +void sph_shavite512_sw_close(void *cc, void *dst); +void sph_shavite512_sw_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + + #define sph_shavite512_init sph_shavite512_sw_init #define sph_shavite512 sph_shavite512_sw #define sph_shavite512_close sph_shavite512_sw_close @@ -293,6 +294,20 @@ void sph_shavite512_aesni_addbits_and_close( #endif +// Use these macros from application code. 
+#define shavite512_context sph_shavite512_context + +#define shavite512_init sph_shavite512_init +#define shavite512_update sph_shavite512 +#define shavite512_close sph_shavite512_close + +#define shavite512_full( cc, dst, data, len ) \ +do{ \ + shavite512_init( cc ); \ + shavite512_update( cc, data, len ); \ + shavite512_close( cc, dst ); \ +}while(0) + #ifdef __cplusplus } #endif diff --git a/algo/simd/nist.c b/algo/simd/nist.c index 73b5131..fbd4e71 100644 --- a/algo/simd/nist.c +++ b/algo/simd/nist.c @@ -83,13 +83,14 @@ HashReturn init_sd(hashState_sd *state, int hashbitlen) { char *init; #ifndef NO_PRECOMPUTED_IV - if (hashbitlen == 224) - r=InitIV(state, hashbitlen, IV_224); - else if (hashbitlen == 256) - r=InitIV(state, hashbitlen, IV_256); - else if (hashbitlen == 384) - r=InitIV(state, hashbitlen, IV_384); - else if (hashbitlen == 512) +// if (hashbitlen == 224) +// r=InitIV(state, hashbitlen, IV_224); +// else if (hashbitlen == 256) +// r=InitIV(state, hashbitlen, IV_256); +// else if (hashbitlen == 384) +// r=InitIV(state, hashbitlen, IV_384); +// else + if (hashbitlen == 512) r=InitIV(state, hashbitlen, IV_512); else #endif @@ -359,18 +360,116 @@ HashReturn update_final_sd( hashState_sd *state, BitSequence *hashval, return SUCCESS; } +int simd_full( hashState_sd *state, BitSequence *hashval, + const BitSequence *data, DataLength databitlen ) +{ + -/*HashReturn Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, - BitSequence *hashval) { - hashState_sd s; - HashReturn r; - r = Init(&s, hashbitlen); - if (r != SUCCESS) - return r; - r = Update(&s, data, databitlen); - if (r != SUCCESS) - return r; - r = Final(&s, hashval); - return r; + InitIV( state, 512, IV_512 ); + + int current, i; + unsigned int bs = state->blocksize; + static int align = -1; + BitSequence out[64]; + int isshort = 1; + u64 l; + + if (align == -1) + align = RequiredAlignment(); + +#ifdef HAS_64 + current = state->count & (bs - 1); +#else + current = state->count_low 
& (bs - 1); +#endif + + if ( current & 7 ) + { + // The number of hashed bits is not a multiple of 8. + // Very painfull to implement and not required by the NIST API. + return FAIL; + } + + while ( databitlen > 0 ) + { + if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs ) + { + // We can hash the data directly from the input buffer. + SIMD_Compress(state, data, 0); + databitlen -= bs; + data += bs/8; + IncreaseCounter(state, bs); + } + else + { + // Copy a chunk of data to the buffer + unsigned int len = bs - current; + if ( databitlen < len ) + { + memcpy( state->buffer+current/8, data, (databitlen+7)/8 ); + IncreaseCounter( state, databitlen ); + break; + } + else + { + memcpy( state->buffer+current/8, data, len/8 ); + IncreaseCounter( state,len ); + databitlen -= len; + data += len/8; + current = 0; + SIMD_Compress( state, state->buffer, 0 ); + } + } + } + + current = state->count & (state->blocksize - 1); + + // If there is still some data in the buffer, hash it + if ( current ) + { + // We first need to zero out the end of the buffer. 
+ if ( current & 7 ) + { + BitSequence mask = 0xff >> ( current & 7 ); + state->buffer[current/8] &= ~mask; + } + current = ( current+7 ) / 8; + memset( state->buffer+current, 0, state->blocksize/8 - current ); + SIMD_Compress( state, state->buffer, 0 ); + } + + //* Input the message length as the last block + memset( state->buffer, 0, state->blocksize / 8 ); + l = state->count; + for ( i=0; i<8; i++ ) + { + state->buffer[i] = l & 0xff; + l >>= 8; + } + if ( state->count < 16384 ) + isshort = 2; + + SIMD_Compress( state, state->buffer, isshort ); + + // Decode the 32-bit words into a BitSequence + for ( i=0; i < 2*state->n_feistels; i++ ) + { + u32 x = state->A[i]; + out[4*i ] = x & 0xff; + x >>= 8; + out[4*i+1] = x & 0xff; + x >>= 8; + out[4*i+2] = x & 0xff; + x >>= 8; + out[4*i+3] = x & 0xff; + } + + memcpy( hashval, out, state->hashbitlen / 8 ); + if ( state->hashbitlen % 8 ) + { + BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) ); + hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask; + } + return SUCCESS; } -*/ + diff --git a/algo/simd/nist.h b/algo/simd/nist.h index f4b017d..b4737ff 100644 --- a/algo/simd/nist.h +++ b/algo/simd/nist.h @@ -47,8 +47,8 @@ HashReturn final_sd(hashState_sd *state, BitSequence *hashval); HashReturn update_final_sd( hashState_sd *state, BitSequence *hashval, const BitSequence *data, DataLength databitlen ); -//HashReturn Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, -// BitSequence *hashval); +int simd_full( hashState_sd *state, BitSequence *hashval, + const BitSequence *data, DataLength databitlen ); /* * Internal API diff --git a/algo/simd/simd-hash-2way.c b/algo/simd/simd-hash-2way.c index 41f4723..f2652f3 100644 --- a/algo/simd/simd-hash-2way.c +++ b/algo/simd/simd-hash-2way.c @@ -23,6 +23,7 @@ uint32_t SIMD_IV_512[] = { 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 }; +// targetted 
/* Twiddle tables */ static const m256_v16 FFT64_Twiddle[] = @@ -99,6 +100,7 @@ static const m256_v16 FFT256_Twiddle[] = -30, 55, -58, -65, -95, -40, -98, 94 }} }; +// generic #define SHUFXOR_1 0xb1 /* 0b10110001 */ #define SHUFXOR_2 0x4e /* 0b01001110 */ #define SHUFXOR_3 0x1b /* 0b00011011 */ @@ -106,28 +108,1161 @@ static const m256_v16 FFT256_Twiddle[] = #define CAT(x, y) x##y #define XCAT(x,y) CAT(x,y) -#define shufxor(x,s) _mm256_shuffle_epi32( x, XCAT( SHUFXOR_, s )) +#define SUM7_00 0 +#define SUM7_01 1 +#define SUM7_02 2 +#define SUM7_03 3 +#define SUM7_04 4 +#define SUM7_05 5 +#define SUM7_06 6 + +#define SUM7_10 1 +#define SUM7_11 2 +#define SUM7_12 3 +#define SUM7_13 4 +#define SUM7_14 5 +#define SUM7_15 6 +#define SUM7_16 0 + +#define SUM7_20 2 +#define SUM7_21 3 +#define SUM7_22 4 +#define SUM7_23 5 +#define SUM7_24 6 +#define SUM7_25 0 +#define SUM7_26 1 + +#define SUM7_30 3 +#define SUM7_31 4 +#define SUM7_32 5 +#define SUM7_33 6 +#define SUM7_34 0 +#define SUM7_35 1 +#define SUM7_36 2 + +#define SUM7_40 4 +#define SUM7_41 5 +#define SUM7_42 6 +#define SUM7_43 0 +#define SUM7_44 1 +#define SUM7_45 2 +#define SUM7_46 3 + +#define SUM7_50 5 +#define SUM7_51 6 +#define SUM7_52 0 +#define SUM7_53 1 +#define SUM7_54 2 +#define SUM7_55 3 +#define SUM7_56 4 + +#define SUM7_60 6 +#define SUM7_61 0 +#define SUM7_62 1 +#define SUM7_63 2 +#define SUM7_64 3 +#define SUM7_65 4 +#define SUM7_66 5 + +#define PERM(z,d,a,shufxor) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a,shufxor) + +#define PERM_0(d,a,shufxor) /* XOR 1 */ \ +do { \ + d##l = shufxor( a##l, 1 ); \ + d##h = shufxor( a##h, 1 ); \ + } while(0) + +#define PERM_1(d,a,shufxor) /* XOR 6 */ \ +do { \ + d##l = shufxor( a##h, 2 ); \ + d##h = shufxor( a##l, 2 ); \ +} while(0) + +#define PERM_2(d,a,shufxor) /* XOR 2 */ \ +do { \ + d##l = shufxor( a##l, 2 ); \ + d##h = shufxor( a##h, 2 ); \ +} while(0) + +#define PERM_3(d,a,shufxor) /* XOR 3 */ \ +do { \ + d##l = shufxor( a##l, 3 ); \ + d##h = shufxor( a##h, 3 ); 
\ +} while(0) + +#define PERM_4(d,a,shufxor) /* XOR 5 */ \ +do { \ + d##l = shufxor( a##h, 1 ); \ + d##h = shufxor( a##l, 1 ); \ +} while(0) + +#define PERM_5(d,a,shufxor) /* XOR 7 */ \ +do { \ + d##l = shufxor( a##h, 3 ); \ + d##h = shufxor( a##l, 3 ); \ +} while(0) + +#define PERM_6(d,a,shufxor) /* XOR 4 */ \ +do { \ + d##l = a##h; \ + d##h = a##l; \ +} while(0) + + +// targetted +#define shufxor2w(x,s) _mm256_shuffle_epi32( x, XCAT( SHUFXOR_, s )) + +#define REDUCE(x) \ + _mm256_sub_epi16( _mm256_and_si256( x, m256_const1_64( \ + 0x00ff00ff00ff00ff ) ), _mm256_srai_epi16( x, 8 ) ) + +#define EXTRA_REDUCE_S(x)\ + _mm256_sub_epi16( x, _mm256_and_si256( \ + m256_const1_64( 0x0101010101010101 ), \ + _mm256_cmpgt_epi16( x, m256_const1_64( 0x0080008000800080 ) ) ) ) + +#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) ) + +//#define DO_REDUCE( i ) X(i) = REDUCE( X(i) ) + +#define DO_REDUCE_FULL_S(i) \ +do { \ + X(i) = REDUCE( X(i) ); \ + X(i) = EXTRA_REDUCE_S( X(i) ); \ +} while(0) + + + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +//////////////////////////////////// +// +// SIMD 4 way AVX512 + +union _m512_v16 { + uint16_t u16[32]; + __m512i v512; +}; +typedef union _m512_v16 m512_v16; + +static const m512_v16 FFT64_Twiddle4w[] = +{ + {{ 1, 2, 4, 8, 16, 32, 64, 128, + 1, 2, 4, 8, 16, 32, 64, 128, + 1, 2, 4, 8, 16, 32, 64, 128, + 1, 2, 4, 8, 16, 32, 64, 128 }}, + {{ 1, 60, 2, 120, 4, -17, 8, -34, + 1, 60, 2, 120, 4, -17, 8, -34, + 1, 60, 2, 120, 4, -17, 8, -34, + 1, 60, 2, 120, 4, -17, 8, -34 }}, + {{ 1, 120, 8, -68, 64, -30, -2, 17, + 1, 120, 8, -68, 64, -30, -2, 17, + 1, 120, 8, -68, 64, -30, -2, 17, + 1, 120, 8, -68, 64, -30, -2, 17 }}, + {{ 1, 46, 60, -67, 2, 92, 120, 123, + 1, 46, 60, -67, 2, 92, 120, 123, + 1, 46, 60, -67, 2, 92, 120, 123, + 1, 46, 60, -67, 2, 92, 120, 123 }}, + {{ 1, 92, -17, -22, 32, 117, -30, 67, + 1, 92, -17, -22, 32, 117, -30, 67, + 1, 92, -17, -22, 32, 117, -30, 67, + 1, 92, 
-17, -22, 32, 117, -30, 67 }}, + {{ 1, -67, 120, -73, 8, -22, -68, -70, + 1, -67, 120, -73, 8, -22, -68, -70, + 1, -67, 120, -73, 8, -22, -68, -70, + 1, -67, 120, -73, 8, -22, -68, -70 }}, + {{ 1, 123, -34, -70, 128, 67, 17, 35, + 1, 123, -34, -70, 128, 67, 17, 35, + 1, 123, -34, -70, 128, 67, 17, 35, + 1, 123, -34, -70, 128, 67, 17, 35 }}, +}; + +static const m512_v16 FFT128_Twiddle4w[] = +{ + {{ 1, -118, 46, -31, 60, 116, -67, -61, + 1, -118, 46, -31, 60, 116, -67, -61, + 1, -118, 46, -31, 60, 116, -67, -61, + 1, -118, 46, -31, 60, 116, -67, -61 }}, + {{ 2, 21, 92, -62, 120, -25, 123, -122, + 2, 21, 92, -62, 120, -25, 123, -122, + 2, 21, 92, -62, 120, -25, 123, -122, + 2, 21, 92, -62, 120, -25, 123, -122 }}, + {{ 4, 42, -73, -124, -17, -50, -11, 13, + 4, 42, -73, -124, -17, -50, -11, 13, + 4, 42, -73, -124, -17, -50, -11, 13, + 4, 42, -73, -124, -17, -50, -11, 13 }}, + {{ 8, 84, 111, 9, -34, -100, -22, 26, + 8, 84, 111, 9, -34, -100, -22, 26, + 8, 84, 111, 9, -34, -100, -22, 26, + 8, 84, 111, 9, -34, -100, -22, 26 }}, + {{ 16, -89, -35, 18, -68, 57, -44, 52, + 16, -89, -35, 18, -68, 57, -44, 52, + 16, -89, -35, 18, -68, 57, -44, 52, + 16, -89, -35, 18, -68, 57, -44, 52 }}, + {{ 32, 79, -70, 36, 121, 114, -88, 104, + 32, 79, -70, 36, 121, 114, -88, 104, + 32, 79, -70, 36, 121, 114, -88, 104, + 32, 79, -70, 36, 121, 114, -88, 104 }}, + {{ 64, -99, 117, 72, -15, -29, 81, -49, + 64, -99, 117, 72, -15, -29, 81, -49, + 64, -99, 117, 72, -15, -29, 81, -49, + 64, -99, 117, 72, -15, -29, 81, -49 }}, + {{ 128, 59, -23, -113, -30, -58, -95, -98, + 128, 59, -23, -113, -30, -58, -95, -98, + 128, 59, -23, -113, -30, -58, -95, -98, + 128, 59, -23, -113, -30, -58, -95, -98 }}, +}; + +static const m512_v16 FFT256_Twiddle4w[] = +{ + {{ 1, 41, -118, 45, 46, 87, -31, 14, + 1, 41, -118, 45, 46, 87, -31, 14, + 1, 41, -118, 45, 46, 87, -31, 14, + 1, 41, -118, 45, 46, 87, -31, 14 }}, + {{ 60, -110, 116, -127, -67, 80, -61, 69, + 60, -110, 116, -127, -67, 80, -61, 69, + 60, -110, 116, 
-127, -67, 80, -61, 69, + 60, -110, 116, -127, -67, 80, -61, 69 }}, + {{ 2, 82, 21, 90, 92, -83, -62, 28, + 2, 82, 21, 90, 92, -83, -62, 28, + 2, 82, 21, 90, 92, -83, -62, 28, + 2, 82, 21, 90, 92, -83, -62, 28 }}, + {{ 120, 37, -25, 3, 123, -97, -122, -119, + 120, 37, -25, 3, 123, -97, -122, -119, + 120, 37, -25, 3, 123, -97, -122, -119, + 120, 37, -25, 3, 123, -97, -122, -119 }}, + {{ 4, -93, 42, -77, -73, 91, -124, 56, + 4, -93, 42, -77, -73, 91, -124, 56, + 4, -93, 42, -77, -73, 91, -124, 56, + 4, -93, 42, -77, -73, 91, -124, 56 }}, + {{ -17, 74, -50, 6, -11, 63, 13, 19, + -17, 74, -50, 6, -11, 63, 13, 19, + -17, 74, -50, 6, -11, 63, 13, 19, + -17, 74, -50, 6, -11, 63, 13, 19 }}, + {{ 8, 71, 84, 103, 111, -75, 9, 112, + 8, 71, 84, 103, 111, -75, 9, 112, + 8, 71, 84, 103, 111, -75, 9, 112, + 8, 71, 84, 103, 111, -75, 9, 112 }}, + {{ -34, -109, -100, 12, -22, 126, 26, 38, + -34, -109, -100, 12, -22, 126, 26, 38, + -34, -109, -100, 12, -22, 126, 26, 38, + -34, -109, -100, 12, -22, 126, 26, 38 }}, + {{ 16, -115, -89, -51, -35, 107, 18, -33, + 16, -115, -89, -51, -35, 107, 18, -33, + 16, -115, -89, -51, -35, 107, 18, -33, + 16, -115, -89, -51, -35, 107, 18, -33 }}, + {{ -68, 39, 57, 24, -44, -5, 52, 76, + -68, 39, 57, 24, -44, -5, 52, 76, + -68, 39, 57, 24, -44, -5, 52, 76, + -68, 39, 57, 24, -44, -5, 52, 76 }}, + {{ 32, 27, 79, -102, -70, -43, 36, -66, + 32, 27, 79, -102, -70, -43, 36, -66, + 32, 27, 79, -102, -70, -43, 36, -66, + 32, 27, 79, -102, -70, -43, 36, -66 }}, + {{ 121, 78, 114, 48, -88, -10, 104, -105, + 121, 78, 114, 48, -88, -10, 104, -105, + 121, 78, 114, 48, -88, -10, 104, -105, + 121, 78, 114, 48, -88, -10, 104, -105 }}, + {{ 64, 54, -99, 53, 117, -86, 72, 125, + 64, 54, -99, 53, 117, -86, 72, 125, + 64, 54, -99, 53, 117, -86, 72, 125, + 64, 54, -99, 53, 117, -86, 72, 125 }}, + {{ -15, -101, -29, 96, 81, -20, -49, 47, + -15, -101, -29, 96, 81, -20, -49, 47, + -15, -101, -29, 96, 81, -20, -49, 47, + -15, -101, -29, 96, 81, -20, -49, 47 }}, + {{ 128, 
108, 59, 106, -23, 85, -113, -7, + 128, 108, 59, 106, -23, 85, -113, -7, + 128, 108, 59, 106, -23, 85, -113, -7, + 128, 108, 59, 106, -23, 85, -113, -7 }}, + {{ -30, 55, -58, -65, -95, -40, -98, 94, + -30, 55, -58, -65, -95, -40, -98, 94, + -30, 55, -58, -65, -95, -40, -98, 94, + -30, 55, -58, -65, -95, -40, -98, 94 }} +}; + +#define shufxor4w(x,s) _mm512_shuffle_epi32( x, XCAT( SHUFXOR_, s )) + +#define REDUCE4w(x) \ + _mm512_sub_epi16( _mm512_and_si512( x, m512_const1_64( \ + 0x00ff00ff00ff00ff ) ), _mm512_srai_epi16( x, 8 ) ) + +#define EXTRA_REDUCE_S4w(x)\ + _mm512_sub_epi16( x, _mm512_and_si512( \ + m512_const1_64( 0x0101010101010101 ), \ + _mm512_movm_epi16( _mm512_cmpgt_epi16_mask( \ + x, m512_const1_64( 0x0080008000800080 ) ) ) ) ) + +// generic, except it calls targetted macros +#define REDUCE_FULL_S4w( x ) EXTRA_REDUCE_S4w( REDUCE4w (x ) ) + +//#define DO_REDUCE4w( i ) X(i) = REDUCE4w( X(i) ) + +#define DO_REDUCE_FULL_S4w(i) \ +do { \ + X(i) = REDUCE4w( X(i) ); \ + X(i) = EXTRA_REDUCE_S4w( X(i) ); \ +} while(0) + + +// targetted +void fft64_4way( void *a ) +{ + __m512i* const A = a; + register __m512i X0, X1, X2, X3, X4, X5, X6, X7; + +// generic +#define X(i) X##i + + X0 = A[0]; + X1 = A[1]; + X2 = A[2]; + X3 = A[3]; + X4 = A[4]; + X5 = A[5]; + X6 = A[6]; + X7 = A[7]; + +#define DO_REDUCE(i) X(i) = REDUCE4w( X(i) ) + + // Begin with 8 parallels DIF FFT_8 + // + // FFT_8 using w=4 as 8th root of unity + // Unrolled decimation in frequency (DIF) radix-2 NTT. + // Output data is in revbin_permuted order. 
+ + static const int w[] = {0, 2, 4, 6}; +// __m256i *Twiddle = (__m256i*)FFT64_Twiddle; + + +// targetted +#define BUTTERFLY_0( i,j ) \ +do { \ + __m512i v = X(j); \ + X(j) = _mm512_add_epi16( X(i), X(j) ); \ + X(i) = _mm512_sub_epi16( X(i), v ); \ +} while(0) + +#define BUTTERFLY_N( i,j,n ) \ +do { \ + __m512i v = X(j); \ + X(j) = _mm512_add_epi16( X(i), X(j) ); \ + X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w[n] ); \ +} while(0) + + BUTTERFLY_0( 0, 4 ); + BUTTERFLY_N( 1, 5, 1 ); + BUTTERFLY_N( 2, 6, 2 ); + BUTTERFLY_N( 3, 7, 3 ); + + DO_REDUCE( 2 ); + DO_REDUCE( 3 ); + + BUTTERFLY_0( 0, 2 ); + BUTTERFLY_0( 4, 6 ); + BUTTERFLY_N( 1, 3, 2 ); + BUTTERFLY_N( 5, 7, 2 ); + + DO_REDUCE( 1 ); + + BUTTERFLY_0( 0, 1 ); + BUTTERFLY_0( 2, 3 ); + BUTTERFLY_0( 4, 5 ); + BUTTERFLY_0( 6, 7 ); + + /* We don't need to reduce X(7) */ + DO_REDUCE_FULL_S4w( 0 ); + DO_REDUCE_FULL_S4w( 1 ); + DO_REDUCE_FULL_S4w( 2 ); + DO_REDUCE_FULL_S4w( 3 ); + DO_REDUCE_FULL_S4w( 4 ); + DO_REDUCE_FULL_S4w( 5 ); + DO_REDUCE_FULL_S4w( 6 ); + +#undef BUTTERFLY_0 +#undef BUTTERFLY_N + +// twiddle is hard coded T[0] = m512_const2_64( {128,64,32,16}, {8,4,2,1} ) + // Multiply by twiddle factors +// X(6) = _mm512_mullo_epi16( X(6), m512_const2_64( 0x0080004000200010, +// 0x0008000400020001 ); +// X(5) = _mm512_mullo_epi16( X(5), m512_const2_64( 0xffdc0008ffef0004, +// 0x00780002003c0001 ); + + + X(6) = _mm512_mullo_epi16( X(6), FFT64_Twiddle4w[0].v512 ); + X(5) = _mm512_mullo_epi16( X(5), FFT64_Twiddle4w[1].v512 ); + X(4) = _mm512_mullo_epi16( X(4), FFT64_Twiddle4w[2].v512 ); + X(3) = _mm512_mullo_epi16( X(3), FFT64_Twiddle4w[3].v512 ); + X(2) = _mm512_mullo_epi16( X(2), FFT64_Twiddle4w[4].v512 ); + X(1) = _mm512_mullo_epi16( X(1), FFT64_Twiddle4w[5].v512 ); + X(0) = _mm512_mullo_epi16( X(0), FFT64_Twiddle4w[6].v512 ); + + // Transpose the FFT state with a revbin order permutation + // on the rows and the column. + // This will make the full FFT_64 in order. 
+#define INTERLEAVE(i,j) \ + do { \ + __m512i t1= X(i); \ + __m512i t2= X(j); \ + X(i) = _mm512_unpacklo_epi16( t1, t2 ); \ + X(j) = _mm512_unpackhi_epi16( t1, t2 ); \ + } while(0) + + INTERLEAVE( 1, 0 ); + INTERLEAVE( 3, 2 ); + INTERLEAVE( 5, 4 ); + INTERLEAVE( 7, 6 ); + + INTERLEAVE( 2, 0 ); + INTERLEAVE( 3, 1 ); + INTERLEAVE( 6, 4 ); + INTERLEAVE( 7, 5 ); + + INTERLEAVE( 4, 0 ); + INTERLEAVE( 5, 1 ); + INTERLEAVE( 6, 2 ); + INTERLEAVE( 7, 3 ); + +#undef INTERLEAVE + +#define BUTTERFLY_0( i,j ) \ +do { \ + __m512i u = X(j); \ + X(j) = _mm512_sub_epi16( X(j), X(i) ); \ + X(i) = _mm512_add_epi16( u, X(i) ); \ +} while(0) + + +#define BUTTERFLY_N( i,j,n ) \ +do { \ + __m512i u = X(j); \ + X(i) = _mm512_slli_epi16( X(i), w[n] ); \ + X(j) = _mm512_sub_epi16( X(j), X(i) ); \ + X(i) = _mm512_add_epi16( u, X(i) ); \ +} while(0) + + DO_REDUCE( 0 ); + DO_REDUCE( 1 ); + DO_REDUCE( 2 ); + DO_REDUCE( 3 ); + DO_REDUCE( 4 ); + DO_REDUCE( 5 ); + DO_REDUCE( 6 ); + DO_REDUCE( 7 ); + + BUTTERFLY_0( 0, 1 ); + BUTTERFLY_0( 2, 3 ); + BUTTERFLY_0( 4, 5 ); + BUTTERFLY_0( 6, 7 ); + + BUTTERFLY_0( 0, 2 ); + BUTTERFLY_0( 4, 6 ); + BUTTERFLY_N( 1, 3, 2 ); + BUTTERFLY_N( 5, 7, 2 ); + + DO_REDUCE( 3 ); + + BUTTERFLY_0( 0, 4 ); + BUTTERFLY_N( 1, 5, 1 ); + BUTTERFLY_N( 2, 6, 2 ); + BUTTERFLY_N( 3, 7, 3 ); + + DO_REDUCE_FULL_S4w( 0 ); + DO_REDUCE_FULL_S4w( 1 ); + DO_REDUCE_FULL_S4w( 2 ); + DO_REDUCE_FULL_S4w( 3 ); + DO_REDUCE_FULL_S4w( 4 ); + DO_REDUCE_FULL_S4w( 5 ); + DO_REDUCE_FULL_S4w( 6 ); + DO_REDUCE_FULL_S4w( 7 ); + +#undef BUTTERFLY_0 +#undef BUTTERFLY_N +#undef DO_REDUCE + + A[0] = X0; + A[1] = X1; + A[2] = X2; + A[3] = X3; + A[4] = X4; + A[5] = X5; + A[6] = X6; + A[7] = X7; + +#undef X +} + +void fft128_4way( void *a ) +{ + int i; + // Temp space to help for interleaving in the end + __m512i B[8]; + __m512i *A = (__m512i*) a; +// __m256i *Twiddle = (__m256i*)FFT128_Twiddle; + + /* Size-2 butterflies */ + for ( i = 0; i<8; i++ ) + { + B[ i ] = _mm512_add_epi16( A[ i ], A[ i+8 ] ); + B[ i 
] = REDUCE_FULL_S4w( B[ i ] ); + A[ i+8 ] = _mm512_sub_epi16( A[ i ], A[ i+8 ] ); + A[ i+8 ] = REDUCE_FULL_S4w( A[ i+8 ] ); + A[ i+8 ] = _mm512_mullo_epi16( A[ i+8 ], FFT128_Twiddle4w[i].v512 ); + A[ i+8 ] = REDUCE_FULL_S4w( A[ i+8 ] ); + } + + fft64_4way( B ); + fft64_4way( A+8 ); + + /* Transposi (i.e. interleave) */ + for ( i = 0; i < 8; i++ ) + { + A[ 2*i ] = _mm512_unpacklo_epi16( B[ i ], A[ i+8 ] ); + A[ 2*i+1 ] = _mm512_unpackhi_epi16( B[ i ], A[ i+8 ] ); + } +} + +void fft128_4way_msg( uint16_t *a, const uint8_t *x, int final ) +{ + const __m512i zero = _mm512_setzero_si512(); + static const m512_v16 Tweak = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, + 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1 }}; + static const m512_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, + 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1 }}; + + __m512i *X = (__m512i*)x; + __m512i *A = (__m512i*)a; +// __m256i *Twiddle = (__m256i*)FFT128_Twiddle; + +#define UNPACK( i ) \ +do { \ + __m512i t = X[i]; \ + A[2*i] = _mm512_unpacklo_epi8( t, zero ); \ + A[2*i+8] = _mm512_mullo_epi16( A[2*i], FFT128_Twiddle4w[2*i].v512 ); \ + A[2*i+8] = REDUCE4w(A[2*i+8]); \ + A[2*i+1] = _mm512_unpackhi_epi8( t, zero ); \ + A[2*i+9] = _mm512_mullo_epi16(A[2*i+1], FFT128_Twiddle4w[2*i+1].v512 ); \ + A[2*i+9] = REDUCE4w(A[2*i+9]); \ +} while(0) + + // This allows to tweak the last butterflies to introduce X^127 +#define UNPACK_TWEAK( i,tw ) \ +do { \ + __m512i t = X[i]; \ + __m512i tmp; \ + A[2*i] = _mm512_unpacklo_epi8( t, zero ); \ + A[2*i+8] = _mm512_mullo_epi16( A[ 2*i ], FFT128_Twiddle4w[ 2*i ].v512 ); \ + A[2*i+8] = REDUCE4w( A[ 2*i+8 ] ); \ + tmp = _mm512_unpackhi_epi8( t, zero ); \ + A[2*i+1] = _mm512_add_epi16( tmp, tw ); \ + A[2*i+9] = _mm512_mullo_epi16( _mm512_sub_epi16( tmp, tw ), \ + FFT128_Twiddle4w[ 2*i+1 ].v512 );\ + A[2*i+9] = REDUCE4w( A[ 2*i+9 ] ); \ +} while(0) + + UNPACK( 0 ); + UNPACK( 1 ); + UNPACK( 2 ); + if ( final ) + UNPACK_TWEAK( 3, FinalTweak.v512 ); + else + UNPACK_TWEAK( 3, Tweak.v512 ); + +#undef 
UNPACK +#undef UNPACK_TWEAK + + fft64_4way( a ); + fft64_4way( a+256 ); +} + +void fft256_4way_msg( uint16_t *a, const uint8_t *x, int final ) +{ + const __m512i zero = _mm512_setzero_si512(); + static const m512_v16 Tweak = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, + 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1 }}; + static const m512_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, + 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1 }}; + + __m512i *X = (__m512i*)x; + __m512i *A = (__m512i*)a; +// __m256i *Twiddle = (__m256i*)FFT256_Twiddle; + +#define UNPACK( i ) \ +do { \ + __m512i t = X[i]; \ + A[ 2*i ] = _mm512_unpacklo_epi8( t, zero ); \ + A[ 2*i + 16 ] = _mm512_mullo_epi16( A[ 2*i ], \ + FFT256_Twiddle4w[ 2*i ].v512 ); \ + A[ 2*i + 16 ] = REDUCE4w( A[ 2*i + 16 ] ); \ + A[ 2*i + 1 ] = _mm512_unpackhi_epi8( t, zero ); \ + A[ 2*i + 17 ] = _mm512_mullo_epi16( A[ 2*i + 1 ], \ + FFT256_Twiddle4w[ 2*i + 1 ].v512 ); \ + A[ 2*i + 17 ] = REDUCE4w( A[ 2*i + 17 ] ); \ +} while(0) + + // This allows to tweak the last butterflies to introduce X^127 +#define UNPACK_TWEAK( i,tw ) \ +do { \ + __m512i t = X[i]; \ + __m512i tmp; \ + A[ 2*i ] = _mm512_unpacklo_epi8( t, zero ); \ + A[ 2*i + 16 ] = _mm512_mullo_epi16( A[ 2*i ], \ + FFT256_Twiddle4w[ 2*i ].v512 ); \ + A[ 2*i + 16 ] = REDUCE4w( A[ 2*i + 16 ] ); \ + tmp = _mm512_unpackhi_epi8( t, zero ); \ + A[ 2*i + 1 ] = _mm512_add_epi16( tmp, tw ); \ + A[ 2*i + 17 ] = _mm512_mullo_epi16( _mm512_sub_epi16( tmp, tw ), \ + FFT256_Twiddle4w[ 2*i + 1 ].v512 ); \ + } while(0) + + UNPACK( 0 ); + UNPACK( 1 ); + UNPACK( 2 ); + UNPACK( 3 ); + UNPACK( 4 ); + UNPACK( 5 ); + UNPACK( 6 ); + if ( final ) + UNPACK_TWEAK( 7, FinalTweak.v512 ); + else + UNPACK_TWEAK( 7, Tweak.v512 ); + +#undef UNPACK +#undef UNPACK_TWEAK + + fft128_4way( a ); + fft128_4way( a+512 ); +} + +#define c1_16_512( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }} + +void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft ) +{ + register __m512i S0l, 
S1l, S2l, S3l; + register __m512i S0h, S1h, S2h, S3h; + __m512i *S = (__m512i*) state; + __m512i *M = (__m512i*) msg; + __m512i *W = (__m512i*) fft; + + static const m512_v16 code[] = { c1_16_512(185), c1_16_512(233) }; + + +// static const m512_v16 code[] = { c1_16(185), c1_16(233), +// c1_16(185), c1_16(233) }; + + + S0l = _mm512_xor_si512( S[0], M[0] ); + S0h = _mm512_xor_si512( S[1], M[1] ); + S1l = _mm512_xor_si512( S[2], M[2] ); + S1h = _mm512_xor_si512( S[3], M[3] ); + S2l = _mm512_xor_si512( S[4], M[4] ); + S2h = _mm512_xor_si512( S[5], M[5] ); + S3l = _mm512_xor_si512( S[6], M[6] ); + S3h = _mm512_xor_si512( S[7], M[7] ); + +// targetted, local macros don't need a unique name +#define S(i) S##i + +#define F_0(B, C, D) \ + _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( C,D ), B ), D ) +#define F_1(B, C, D) \ + _mm512_or_si512( _mm512_and_si512( D, C ),\ + _mm512_and_si512( _mm512_or_si512( D,C ), B ) ) + +#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l) +#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h) + + // We split the round function in two halfes + // so as to insert some independent computations in between + +// generic +#if 0 +#define SUM7_00 0 +#define SUM7_01 1 +#define SUM7_02 2 +#define SUM7_03 3 +#define SUM7_04 4 +#define SUM7_05 5 +#define SUM7_06 6 + +#define SUM7_10 1 +#define SUM7_11 2 +#define SUM7_12 3 +#define SUM7_13 4 +#define SUM7_14 5 +#define SUM7_15 6 +#define SUM7_16 0 + +#define SUM7_20 2 +#define SUM7_21 3 +#define SUM7_22 4 +#define SUM7_23 5 +#define SUM7_24 6 +#define SUM7_25 0 +#define SUM7_26 1 + +#define SUM7_30 3 +#define SUM7_31 4 +#define SUM7_32 5 +#define SUM7_33 6 +#define SUM7_34 0 +#define SUM7_35 1 +#define SUM7_36 2 + +#define SUM7_40 4 +#define SUM7_41 5 +#define SUM7_42 6 +#define SUM7_43 0 +#define SUM7_44 1 +#define SUM7_45 2 +#define SUM7_46 3 + +#define SUM7_50 5 +#define SUM7_51 6 +#define SUM7_52 0 +#define SUM7_53 1 +#define SUM7_54 2 +#define SUM7_55 3 +#define SUM7_56 4 + +#define SUM7_60 6 +#define 
SUM7_61 0 +#define SUM7_62 1 +#define SUM7_63 2 +#define SUM7_64 3 +#define SUM7_65 4 +#define SUM7_66 5 + +#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a) + +#define PERM_0(d,a) /* XOR 1 */ \ +do { \ + d##l = shufxor( a##l, 1 ); \ + d##h = shufxor( a##h, 1 ); \ + } while(0) + +#define PERM_1(d,a) /* XOR 6 */ \ +do { \ + d##l = shufxor( a##h, 2 ); \ + d##h = shufxor( a##l, 2 ); \ +} while(0) + +#define PERM_2(d,a) /* XOR 2 */ \ +do { \ + d##l = shufxor( a##l, 2 ); \ + d##h = shufxor( a##h, 2 ); \ +} while(0) + +#define PERM_3(d,a) /* XOR 3 */ \ +do { \ + d##l = shufxor( a##l, 3 ); \ + d##h = shufxor( a##h, 3 ); \ +} while(0) + +#define PERM_4(d,a) /* XOR 5 */ \ +do { \ + d##l = shufxor( a##h, 1 ); \ + d##h = shufxor( a##l, 1 ); \ +} while(0) + +#define PERM_5(d,a) /* XOR 7 */ \ +do { \ + d##l = shufxor( a##h, 3 ); \ + d##h = shufxor( a##l, 3 ); \ +} while(0) + +#define PERM_6(d,a) /* XOR 4 */ \ +do { \ + d##l = a##h; \ + d##h = a##l; \ +} while(0) +#endif + +// targetted + +#define STEP_1_(a,b,c,d,w,fun,r,s,z) \ +do { \ + TTl = Fl( a,b,c,fun ); \ + TTh = Fh( a,b,c,fun ); \ + a##l = mm512_rol_32( a##l, r ); \ + a##h = mm512_rol_32( a##h, r ); \ + w##l = _mm512_add_epi32( w##l, d##l ); \ + w##h = _mm512_add_epi32( w##h, d##h ); \ + TTl = _mm512_add_epi32( TTl, w##l ); \ + TTh = _mm512_add_epi32( TTh, w##h ); \ + TTl = mm512_rol_32( TTl, s ); \ + TTh = mm512_rol_32( TTh, s ); \ + PERM( z,d,a, shufxor4w ); \ +} while(0) + +#define STEP_1( a,b,c,d,w,fun,r,s,z ) STEP_1_( a,b,c,d,w,fun,r,s,z ) + +#define STEP_2_( a,b,c,d,w,fun,r,s ) \ +do { \ + d##l = _mm512_add_epi32( d##l, TTl ); \ + d##h = _mm512_add_epi32( d##h, TTh ); \ +} while(0) + +#define STEP_2( a,b,c,d,w,fun,r,s ) STEP_2_( a,b,c,d,w,fun,r,s ) + +#define STEP( a,b,c,d,w1,w2,fun,r,s,z ) \ +do { \ + register __m512i TTl, TTh, Wl=w1, Wh=w2; \ + STEP_1( a,b,c,d,W,fun,r,s,z ); \ + STEP_2( a,b,c,d,W,fun,r,s ); \ +} while(0); + +#define MSG_l(x) (2*(x)) +#define MSG_h(x) (2*(x)+1) + +#define MSG( 
w,hh,ll,u,z ) \ +do { \ + int a = MSG_##u(hh); \ + int b = MSG_##u(ll); \ + w##l = _mm512_unpacklo_epi16( W[a], W[b] ); \ + w##l = _mm512_mullo_epi16( w##l, code[z].v512 ); \ + w##h = _mm512_unpackhi_epi16( W[a], W[b]) ; \ + w##h = _mm512_mullo_epi16( w##h, code[z].v512 ); \ +} while(0) + +#define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z ) \ +do { \ + register __m512i W0l, W1l, W2l, W3l, TTl; \ + register __m512i W0h, W1h, W2h, W3h, TTh; \ + MSG( W0, h0, l0, u0, z ); \ + STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0 ); \ + MSG( W1, h1, l1, u1, z ); \ + STEP_2( S(0), S(1), S(2), S(3), W0, fun, r, s ); \ + STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1 ); \ + MSG( W2,h2,l2,u2,z ); \ + STEP_2( S(3), S(0), S(1), S(2), W1, fun, s, t ); \ + STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2 ); \ + MSG( W3,h3,l3,u3,z ); \ + STEP_2( S(2), S(3), S(0), S(1), W2, fun, t, u ); \ + STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3 ); \ + STEP_2( S(1), S(2), S(3), S(0), W3, fun, u, r ); \ +} while(0) + + // 4 rounds with code 185 +#define PERM_START 0 + ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0); +#undef PERM_START +#define PERM_START 4 + ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0); +#undef PERM_START +#define PERM_START 1 + ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0); +#undef PERM_START +#define PERM_START 5 + ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0); +#undef PERM_START + + // 4 rounds with code 233 +#define PERM_START 2 + ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1); +#undef PERM_START +#define PERM_START 6 + ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1); +#undef PERM_START +#define PERM_START 3 + ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1); +#undef PERM_START +#define PERM_START 0 + ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1); +#undef PERM_START + + // 1 round as feed-forward 
+#define PERM_START 4 + STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0 ); + STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1 ); + STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2 ); + STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3 ); + + S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h; + S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h; + +#undef PERM_START +#undef STEP_1 +#undef STEP_1_ +#undef STEP_2 +#undef STEP_2_ +#undef STEP +#undef ROUND +#undef S +#undef F_0 +#undef F_1 +#undef Fl +#undef Fh +#undef MSG_l +#undef MSG_h +#undef MSG +} + +void SIMD_4way_Compress( simd_4way_context *state, const void *m, int final ) +{ + m512_v16 Y[32]; + uint16_t *y = (uint16_t*) Y[0].u16; + + fft256_4way_msg( y, m, final ); + + rounds512_4way( state->A, m, y ); +} + +// imported from nist.c + +int simd_4way_init( simd_4way_context *state, int hashbitlen ) +{ + __m512i *A = (__m512i*)state->A; + int n = 8; + + state->hashbitlen = hashbitlen; + state->n_feistels = n; + state->blocksize = 128*8; + state->count = 0; + + for ( int i = 0; i < 8; i++ ) + A[i] = _mm512_set4_epi32( SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2], + SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0] ); + return 0; +} + +int simd_4way_update( simd_4way_context *state, const void *data, + int databitlen ) +{ + int bs = state->blocksize; + int current = state->count & (bs - 1); + + while ( databitlen > 0 ) + { + if ( ( current == 0 ) && ( databitlen >= bs ) ) + { + // We can hash the data directly from the input buffer. 
+ SIMD_4way_Compress( state, data, 0 ); + databitlen -= bs; + data += 4*(bs/8); + state->count += bs; + } + else + { + // Copy a chunk of data to the buffer + int len = bs - current; + if ( databitlen < len ) + { + memcpy( state->buffer + 4 * (current/8), data, 4 * (databitlen/8) ); + state->count += databitlen; + return 0; + } + else + { + memcpy( state->buffer + 4 * (current / 8), data, 4 * (len / 8) ); + state->count += len; + databitlen -= len; + data += 4*(len/8); + current = 0; + SIMD_4way_Compress( state, state->buffer, 0 ); + } + } + } + return 0; +} -// imported from vector.c +int simd_4way_close( simd_4way_context *state, void *hashval ) +{ + uint64_t l; + int current = state->count & (state->blocksize - 1); + int i; + int isshort = 1; -#define REDUCE(x) \ - _mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi16( 255 ) ), \ - _mm256_srai_epi16( x, 8 ) ) + // If there is still some data in the buffer, hash it + if ( current ) + { + current = ( current+7 ) / 8; + memset( state->buffer + 4*current, 0, 4*( state->blocksize/8 - current ) ); + SIMD_4way_Compress( state, state->buffer, 0 ); + } -#define EXTRA_REDUCE_S(x)\ - _mm256_sub_epi16( x, \ - _mm256_and_si256( _mm256_set1_epi16( 257 ), \ - _mm256_cmpgt_epi16( x, _mm256_set1_epi16( 128 ) ) ) ) + //* Input the message length as the last block + memset( state->buffer, 0, 4*(state->blocksize / 8) ); + l = state->count; + for ( i = 0; i < 8; i++ ) + { + state->buffer[ i ] = l & 0xff; + state->buffer[ i+16 ] = l & 0xff; + state->buffer[ i+32 ] = l & 0xff; + state->buffer[ i+48 ] = l & 0xff; + l >>= 8; + } + if ( state->count < 16384 ) + isshort = 2; -#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) ) + SIMD_4way_Compress( state, state->buffer, isshort ); + memcpy( hashval, state->A, 4*(state->hashbitlen / 8) ); -#define DO_REDUCE( i ) X(i) = REDUCE( X(i) ) + return 0; +} -#define DO_REDUCE_FULL_S(i) \ -do { \ - X(i) = REDUCE( X(i) ); \ - X(i) = EXTRA_REDUCE_S( X(i) ); \ -} while(0) +int 
simd_4way_update_close( simd_4way_context *state, void *hashval, + const void *data, int databitlen ) +{ + int current, i; + int bs = state->blocksize; // bits in one lane + int isshort = 1; + uint64_t l; + + current = state->count & (bs - 1); + + while ( databitlen > 0 ) + { + if ( current == 0 && databitlen >= bs ) + { + // We can hash the data directly from the input buffer. + SIMD_4way_Compress( state, data, 0 ); + databitlen -= bs; + data += 4*( bs/8 ); + state->count += bs; + } + else + { + // Copy a chunk of data to the buffer + int len = bs - current; + if ( databitlen < len ) + { + memcpy( state->buffer + 4*( current/8 ), data, 4*( (databitlen)/8 ) ); + state->count += databitlen; + break; + } + else + { + memcpy( state->buffer + 4*(current/8), data, 4*(len/8) ); + state->count += len; + databitlen -= len; + data += 4*( len/8 ); + current = 0; + SIMD_4way_Compress( state, state->buffer, 0 ); + } + } + } + + current = state->count & (state->blocksize - 1); + + // If there is still some data in the buffer, hash it + if ( current ) + { + current = current / 8; + memset( state->buffer + 4*current, 0, 4*( state->blocksize/8 - current) ); + SIMD_4way_Compress( state, state->buffer, 0 ); + } + + //* Input the message length as the last block + memset( state->buffer, 0, 4*( state->blocksize/8 ) ); + l = state->count; + for ( i = 0; i < 8; i++ ) + { + state->buffer[ i ] = l & 0xff; + state->buffer[ i+16 ] = l & 0xff; + state->buffer[ i+32 ] = l & 0xff; + state->buffer[ i+48 ] = l & 0xff; + l >>= 8; + } + if ( state->count < 16384 ) + isshort = 2; + + SIMD_4way_Compress( state, state->buffer, isshort ); + memcpy( hashval, state->A, 4*( state->hashbitlen / 8 ) ); + return 0; +} + +int simd512_4way_full( simd_4way_context *state, void *hashval, + const void *data, int datalen ) +{ + __m512i *A = (__m512i*)state->A; + + state->hashbitlen = 512; + state->n_feistels = 8; + state->blocksize = 128*8; + state->count = 0; + + for ( int i = 0; i < 8; i++ ) + A[i] = 
_mm512_set4_epi32( SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2], + SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0] ); + + int current, i; + int bs = state->blocksize; // bits in one lane + int isshort = 1; + uint64_t l; + int databitlen = datalen * 8; + + current = state->count & (bs - 1); + + while ( databitlen > 0 ) + { + if ( current == 0 && databitlen >= bs ) + { + // We can hash the data directly from the input buffer. + SIMD_4way_Compress( state, data, 0 ); + databitlen -= bs; + data += 4*( bs/8 ); + state->count += bs; + } + else + { + // Copy a chunk of data to the buffer + int len = bs - current; + if ( databitlen < len ) + { + memcpy( state->buffer + 4*( current/8 ), data, 4*( (databitlen)/8 ) ); + state->count += databitlen; + break; + } + else + { + memcpy( state->buffer + 4*(current/8), data, 4*(len/8) ); + state->count += len; + databitlen -= len; + data += 4*( len/8 ); + current = 0; + SIMD_4way_Compress( state, state->buffer, 0 ); + } + } + } + + current = state->count & (state->blocksize - 1); + + // If there is still some data in the buffer, hash it + if ( current ) + { + current = current / 8; + memset( state->buffer + 4*current, 0, 4*( state->blocksize/8 - current) ); + SIMD_4way_Compress( state, state->buffer, 0 ); + } + + //* Input the message length as the last block + memset( state->buffer, 0, 4*( state->blocksize/8 ) ); + l = state->count; + for ( i = 0; i < 8; i++ ) + { + state->buffer[ i ] = l & 0xff; + state->buffer[ i+16 ] = l & 0xff; + state->buffer[ i+32 ] = l & 0xff; + state->buffer[ i+48 ] = l & 0xff; + l >>= 8; + } + if ( state->count < 16384 ) + isshort = 2; + + SIMD_4way_Compress( state, state->buffer, isshort ); + memcpy( hashval, state->A, 4*( state->hashbitlen / 8 ) ); + return 0; +} + + + +#endif // AVX512 + +//////////////////////////////////// +// +// SIMD 2 way AVX2 void fft64_2way( void *a ) { @@ -296,7 +1431,9 @@ do { \ DO_REDUCE_FULL_S( 6 ); DO_REDUCE_FULL_S( 7 ); -#undef BUTTERFLY +#undef BUTTERFLY_0 +#undef BUTTERFLY_N +#undef 
DO_REDUCE A[0] = X0; A[1] = X1; @@ -446,6 +1583,7 @@ do { \ fft128_2way( a ); fft128_2way( a+256 ); + } #define c1_16( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }} @@ -482,7 +1620,7 @@ void rounds512_2way( uint32_t *state, const uint8_t *msg, uint16_t *fft ) // We split the round function in two halfes // so as to insert some independent computations in between - +#if 0 #define SUM7_00 0 #define SUM7_01 1 #define SUM7_02 2 @@ -582,6 +1720,7 @@ do { \ d##l = a##h; \ d##h = a##l; \ } while(0) +#endif #define STEP_1_(a,b,c,d,w,fun,r,s,z) \ do { \ @@ -595,7 +1734,7 @@ do { \ TTh = _mm256_add_epi32( TTh, w##h ); \ TTl = mm256_rol_32( TTl, s ); \ TTh = mm256_rol_32( TTh, s ); \ - PERM( z,d,a ); \ + PERM( z,d,a, shufxor2w ); \ } while(0) #define STEP_1( a,b,c,d,w,fun,r,s,z ) STEP_1_( a,b,c,d,w,fun,r,s,z ) @@ -686,16 +1825,28 @@ do { \ #undef PERM_START #undef STEP_1 +#undef STEP_1_ #undef STEP_2 +#undef STEP_2_ #undef STEP #undef ROUND +#undef S +#undef F_0 +#undef F_1 +#undef Fl +#undef Fh +#undef MSG_l +#undef MSG_h +#undef MSG } void SIMD_2way_Compress( simd_2way_context *state, const void *m, int final ) { m256_v16 Y[32]; uint16_t *y = (uint16_t*) Y[0].u16; + fft256_2way_msg( y, m, final ); + rounds512_2way( state->A, m, y ); } @@ -808,6 +1959,92 @@ int simd_2way_update_close( simd_2way_context *state, void *hashval, { // We can hash the data directly from the input buffer. 
SIMD_2way_Compress( state, data, 0 ); + + databitlen -= bs; + data += 2*( bs/8 ); + state->count += bs; + } + else + { + // Copy a chunk of data to the buffer + int len = bs - current; + if ( databitlen < len ) + { + + memcpy( state->buffer + 2*( current/8 ), data, 2*( (databitlen+7)/8 ) ); + state->count += databitlen; + break; + } + else + { + memcpy( state->buffer + 2*(current/8), data, 2*(len/8) ); + state->count += len; + databitlen -= len; + data += 2*( len/8 ); + current = 0; + SIMD_2way_Compress( state, state->buffer, 0 ); + } + } + } + + current = state->count & (state->blocksize - 1); + + // If there is still some data in the buffer, hash it + if ( current ) + { + current = ( current+7 ) / 8; + memset( state->buffer + 2*current, 0, 2*( state->blocksize/8 - current) ); + SIMD_2way_Compress( state, state->buffer, 0 ); + } + + //* Input the message length as the last block + memset( state->buffer, 0, 2*( state->blocksize/8 ) ); + l = state->count; + for ( i = 0; i < 8; i++ ) + { + state->buffer[ i ] = l & 0xff; + state->buffer[ i+16 ] = l & 0xff; + l >>= 8; + } + if ( state->count < 16384 ) + isshort = 2; + + SIMD_2way_Compress( state, state->buffer, isshort ); + memcpy( hashval, state->A, 2*( state->hashbitlen / 8 ) ); + return 0; +} + +int simd512_2way_full( simd_2way_context *state, void *hashval, + const void *data, int datalen ) +{ + __m256i *A = (__m256i*)state->A; + + state->hashbitlen = 512; + state->n_feistels = 8; + state->blocksize = 128*8; + state->count = 0; + + for ( int i = 0; i < 8; i++ ) + A[i] = _mm256_set_epi32( SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2], + SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0], + SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2], + SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0] ); + + int current, i; + int bs = state->blocksize; // bits in one lane + int isshort = 1; + uint64_t l; + int databitlen = datalen * 8; + + current = state->count & (bs - 1); + + while ( databitlen > 0 ) + { + if ( current == 0 && databitlen >= bs ) + { + // We can 
hash the data directly from the input buffer. + SIMD_2way_Compress( state, data, 0 ); + databitlen -= bs; data += 2*( bs/8 ); state->count += bs; @@ -818,7 +2055,8 @@ int simd_2way_update_close( simd_2way_context *state, void *hashval, int len = bs - current; if ( databitlen < len ) { - memcpy( state->buffer + 2*( current/8 ), data, 2*( (databitlen+7)/8 ) ); + + memcpy( state->buffer + 2*( current/8 ), data, 2*( (databitlen+7)/8 ) ); state->count += databitlen; break; } @@ -861,4 +2099,5 @@ int simd_2way_update_close( simd_2way_context *state, void *hashval, return 0; } + #endif diff --git a/algo/simd/simd-hash-2way.h b/algo/simd/simd-hash-2way.h index 2b99965..9aad145 100644 --- a/algo/simd/simd-hash-2way.h +++ b/algo/simd/simd-hash-2way.h @@ -7,15 +7,39 @@ #include "simd-utils.h" + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + typedef struct { - uint32_t A[ 32*2 ] __attribute__((aligned(64))); - uint8_t buffer[ 128*2 ] __attribute__((aligned(64))); + uint32_t A[ 32*4 ]; + uint8_t buffer[ 128*4 ]; + uint64_t count; + unsigned int hashbitlen; + unsigned int blocksize; + unsigned int n_feistels; + +} simd_4way_context __attribute__((aligned(128))); + +int simd_4way_init( simd_4way_context *state, int hashbitlen ); +int simd_4way_update( simd_4way_context *state, const void *data, + int databitlen ); +int simd_4way_close( simd_4way_context *state, void *hashval ); +int simd_4way_update_close( simd_4way_context *state, void *hashval, + const void *data, int databitlen ); +int simd512_4way_full( simd_4way_context *state, void *hashval, + const void *data, int datalen ); + +#endif + +typedef struct { + uint32_t A[ 32*2 ]; + uint8_t buffer[ 128*2 ]; uint64_t count; unsigned int hashbitlen; unsigned int blocksize; unsigned int n_feistels; -} simd_2way_context; +} simd_2way_context __attribute__((aligned(128))); int simd_2way_init( simd_2way_context *state, int hashbitlen ); int simd_2way_update( simd_2way_context 
*state, const void *data, @@ -23,5 +47,8 @@ int simd_2way_update( simd_2way_context *state, const void *data, int simd_2way_close( simd_2way_context *state, void *hashval ); int simd_2way_update_close( simd_2way_context *state, void *hashval, const void *data, int databitlen ); +int simd512_2way_full( simd_2way_context *state, void *hashval, + const void *data, int datalen ); + #endif #endif diff --git a/algo/skein/skein-4way.c b/algo/skein/skein-4way.c deleted file mode 100644 index d600e60..0000000 --- a/algo/skein/skein-4way.c +++ /dev/null @@ -1,101 +0,0 @@ -#include "skein-gate.h" -#include -#include -#include "skein-hash-4way.h" -#if defined(__SHA__) - #include -#else - #include "algo/sha/sha2-hash-4way.h" -#endif - -#if defined (SKEIN_4WAY) - -void skeinhash_4way( void *state, const void *input ) -{ - uint64_t vhash64[16*4] __attribute__ ((aligned (64))); - skein512_4way_context ctx_skein; -#if defined(__SHA__) - uint32_t hash0[16] __attribute__ ((aligned (64))); - uint32_t hash1[16] __attribute__ ((aligned (64))); - uint32_t hash2[16] __attribute__ ((aligned (64))); - uint32_t hash3[16] __attribute__ ((aligned (64))); - SHA256_CTX ctx_sha256; -#else - uint32_t vhash32[16*4] __attribute__ ((aligned (64))); - sha256_4way_context ctx_sha256; -#endif - - skein512_4way_init( &ctx_skein ); - skein512_4way( &ctx_skein, input, 80 ); - skein512_4way_close( &ctx_skein, vhash64 ); - -#if defined(__SHA__) - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 512 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 ); - SHA256_Final( (unsigned char*)hash0, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 ); - SHA256_Final( (unsigned char*)hash1, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 ); - SHA256_Final( (unsigned char*)hash2, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned 
char*)hash3, 64 ); - SHA256_Final( (unsigned char*)hash3, &ctx_sha256 ); - - intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 ); -#else - rintrlv_4x64_4x32( vhash32, vhash64, 512 ); - - sha256_4way_init( &ctx_sha256 ); - sha256_4way( &ctx_sha256, vhash32, 64 ); - sha256_4way_close( &ctx_sha256, state ); -#endif -} - -int scanhash_skein_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t hash[16*4] __attribute__ ((aligned (64))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *hash7 = &(hash[7<<2]); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - skeinhash_4way( hash, vdata ); - - for ( int lane = 0; lane < 4; lane++ ) - if ( hash7[ lane ] <= Htarg ) - { - extr_lane_4x32( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) ) - { - pdata[19] = n + lane; - submit_lane_solution( work, lane_hash, mythr, lane ); - } - } - n += 4; - } while ( (n < max_nonce) && !work_restart[thr_id].restart ); - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/skein/skein-gate.c b/algo/skein/skein-gate.c deleted file mode 100644 index f41c874..0000000 --- a/algo/skein/skein-gate.c +++ /dev/null @@ -1,20 +0,0 @@ -#include "skein-gate.h" -#include "sph_skein.h" -#include "skein-hash-4way.h" - -int64_t skein_get_max64() { return 0x7ffffLL; } - -bool register_skein_algo( algo_gate_t* gate ) -{ - gate->optimizations = AVX2_OPT | SHA_OPT; -#if defined (SKEIN_4WAY) - gate->scanhash = (void*)&scanhash_skein_4way; 
- gate->hash = (void*)&skeinhash_4way; -#else - gate->scanhash = (void*)&scanhash_skein; - gate->hash = (void*)&skeinhash; -#endif - gate->get_max64 = (void*)&skein_get_max64; - return true; -}; - diff --git a/algo/skein/skein-gate.h b/algo/skein/skein-gate.h deleted file mode 100644 index ac7f281..0000000 --- a/algo/skein/skein-gate.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef __SKEIN_GATE_H__ -#define __SKEIN_GATE_H__ -#include -#include "algo-gate-api.h" - -#if defined(__AVX2__) - #define SKEIN_4WAY -#endif - -#if defined(SKEIN_4WAY) - -void skeinhash_4way( void *output, const void *input ); - -int scanhash_skein_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -#endif - -void skeinhash( void *output, const void *input ); - -int scanhash_skein( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -#endif diff --git a/algo/skein/skein-hash-4way.c b/algo/skein/skein-hash-4way.c deleted file mode 100644 index 358ecd8..0000000 --- a/algo/skein/skein-hash-4way.c +++ /dev/null @@ -1,582 +0,0 @@ -/* $Id: skein.c 254 2011-06-07 19:38:58Z tp $ */ -/* - * Skein implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#if defined (__AVX2__) - -#include -#include -#include "skein-hash-4way.h" - - -#ifdef __cplusplus -extern "C"{ -#endif - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -/* - * M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7). - */ - -#define M9_0_0 0 -#define M9_0_1 1 -#define M9_0_2 2 -#define M9_0_3 3 -#define M9_0_4 4 -#define M9_0_5 5 -#define M9_0_6 6 -#define M9_0_7 7 - -#define M9_1_0 1 -#define M9_1_1 2 -#define M9_1_2 3 -#define M9_1_3 4 -#define M9_1_4 5 -#define M9_1_5 6 -#define M9_1_6 7 -#define M9_1_7 8 - -#define M9_2_0 2 -#define M9_2_1 3 -#define M9_2_2 4 -#define M9_2_3 5 -#define M9_2_4 6 -#define M9_2_5 7 -#define M9_2_6 8 -#define M9_2_7 0 - -#define M9_3_0 3 -#define M9_3_1 4 -#define M9_3_2 5 -#define M9_3_3 6 -#define M9_3_4 7 -#define M9_3_5 8 -#define M9_3_6 0 -#define M9_3_7 1 - -#define M9_4_0 4 -#define M9_4_1 5 -#define M9_4_2 6 -#define M9_4_3 7 -#define M9_4_4 8 -#define M9_4_5 0 -#define M9_4_6 1 -#define M9_4_7 2 - -#define M9_5_0 5 -#define M9_5_1 6 -#define M9_5_2 7 -#define M9_5_3 8 -#define M9_5_4 0 -#define M9_5_5 1 -#define M9_5_6 2 -#define M9_5_7 3 - -#define M9_6_0 6 -#define M9_6_1 7 -#define M9_6_2 8 -#define M9_6_3 0 -#define M9_6_4 1 -#define M9_6_5 2 -#define M9_6_6 3 -#define M9_6_7 4 - -#define M9_7_0 7 -#define M9_7_1 8 -#define M9_7_2 0 -#define M9_7_3 1 -#define M9_7_4 2 -#define 
M9_7_5 3 -#define M9_7_6 4 -#define M9_7_7 5 - -#define M9_8_0 8 -#define M9_8_1 0 -#define M9_8_2 1 -#define M9_8_3 2 -#define M9_8_4 3 -#define M9_8_5 4 -#define M9_8_6 5 -#define M9_8_7 6 - -#define M9_9_0 0 -#define M9_9_1 1 -#define M9_9_2 2 -#define M9_9_3 3 -#define M9_9_4 4 -#define M9_9_5 5 -#define M9_9_6 6 -#define M9_9_7 7 - -#define M9_10_0 1 -#define M9_10_1 2 -#define M9_10_2 3 -#define M9_10_3 4 -#define M9_10_4 5 -#define M9_10_5 6 -#define M9_10_6 7 -#define M9_10_7 8 - -#define M9_11_0 2 -#define M9_11_1 3 -#define M9_11_2 4 -#define M9_11_3 5 -#define M9_11_4 6 -#define M9_11_5 7 -#define M9_11_6 8 -#define M9_11_7 0 - -#define M9_12_0 3 -#define M9_12_1 4 -#define M9_12_2 5 -#define M9_12_3 6 -#define M9_12_4 7 -#define M9_12_5 8 -#define M9_12_6 0 -#define M9_12_7 1 - -#define M9_13_0 4 -#define M9_13_1 5 -#define M9_13_2 6 -#define M9_13_3 7 -#define M9_13_4 8 -#define M9_13_5 0 -#define M9_13_6 1 -#define M9_13_7 2 - -#define M9_14_0 5 -#define M9_14_1 6 -#define M9_14_2 7 -#define M9_14_3 8 -#define M9_14_4 0 -#define M9_14_5 1 -#define M9_14_6 2 -#define M9_14_7 3 - -#define M9_15_0 6 -#define M9_15_1 7 -#define M9_15_2 8 -#define M9_15_3 0 -#define M9_15_4 1 -#define M9_15_5 2 -#define M9_15_6 3 -#define M9_15_7 4 - -#define M9_16_0 7 -#define M9_16_1 8 -#define M9_16_2 0 -#define M9_16_3 1 -#define M9_16_4 2 -#define M9_16_5 3 -#define M9_16_6 4 -#define M9_16_7 5 - -#define M9_17_0 8 -#define M9_17_1 0 -#define M9_17_2 1 -#define M9_17_3 2 -#define M9_17_4 3 -#define M9_17_5 4 -#define M9_17_6 5 -#define M9_17_7 6 - -#define M9_18_0 0 -#define M9_18_1 1 -#define M9_18_2 2 -#define M9_18_3 3 -#define M9_18_4 4 -#define M9_18_5 5 -#define M9_18_6 6 -#define M9_18_7 7 - -/* - * M3_ ## s ## _ ## i evaluates to s+i mod 3 (0 <= s <= 18, 0 <= i <= 1). 
- */ - -#define M3_0_0 0 -#define M3_0_1 1 -#define M3_1_0 1 -#define M3_1_1 2 -#define M3_2_0 2 -#define M3_2_1 0 -#define M3_3_0 0 -#define M3_3_1 1 -#define M3_4_0 1 -#define M3_4_1 2 -#define M3_5_0 2 -#define M3_5_1 0 -#define M3_6_0 0 -#define M3_6_1 1 -#define M3_7_0 1 -#define M3_7_1 2 -#define M3_8_0 2 -#define M3_8_1 0 -#define M3_9_0 0 -#define M3_9_1 1 -#define M3_10_0 1 -#define M3_10_1 2 -#define M3_11_0 2 -#define M3_11_1 0 -#define M3_12_0 0 -#define M3_12_1 1 -#define M3_13_0 1 -#define M3_13_1 2 -#define M3_14_0 2 -#define M3_14_1 0 -#define M3_15_0 0 -#define M3_15_1 1 -#define M3_16_0 1 -#define M3_16_1 2 -#define M3_17_0 2 -#define M3_17_1 0 -#define M3_18_0 0 -#define M3_18_1 1 - -#define XCAT(x, y) XCAT_(x, y) -#define XCAT_(x, y) x ## y - - -#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i)) -#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) - -// AVX2 all scalar vars are now vectors representing 4 nonces in parallel - -#define TFBIG_KINIT_4WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \ -do { \ - k8 = _mm256_xor_si256( _mm256_xor_si256( \ - _mm256_xor_si256( _mm256_xor_si256( k0, k1 ), \ - _mm256_xor_si256( k2, k3 ) ), \ - _mm256_xor_si256( _mm256_xor_si256( k4, k5 ), \ - _mm256_xor_si256( k6, k7 ) ) ), \ - m256_const1_64( 0x1BD11BDAA9FC1A22) ); \ - t2 = t0 ^ t1; \ -} while (0) - -#define TFBIG_ADDKEY_4WAY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) \ -do { \ - w0 = _mm256_add_epi64( w0, SKBI(k,s,0) ); \ - w1 = _mm256_add_epi64( w1, SKBI(k,s,1) ); \ - w2 = _mm256_add_epi64( w2, SKBI(k,s,2) ); \ - w3 = _mm256_add_epi64( w3, SKBI(k,s,3) ); \ - w4 = _mm256_add_epi64( w4, SKBI(k,s,4) ); \ - w5 = _mm256_add_epi64( w5, _mm256_add_epi64( SKBI(k,s,5), \ - m256_const1_64( SKBT(t,s,0) ) ) ); \ - w6 = _mm256_add_epi64( w6, _mm256_add_epi64( SKBI(k,s,6), \ - m256_const1_64( SKBT(t,s,1) ) ) ); \ - w7 = _mm256_add_epi64( w7, _mm256_add_epi64( SKBI(k,s,7), \ - m256_const1_64( s ) ) ); \ -} while (0) - - -#define 
TFBIG_MIX_4WAY(x0, x1, rc) \ -do { \ - x0 = _mm256_add_epi64( x0, x1 ); \ - x1 = _mm256_xor_si256( mm256_rol_64( x1, rc ), x0 ); \ -} while (0) - - -// typeless -#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \ - TFBIG_MIX_4WAY(w0, w1, rc0); \ - TFBIG_MIX_4WAY(w2, w3, rc1); \ - TFBIG_MIX_4WAY(w4, w5, rc2); \ - TFBIG_MIX_4WAY(w6, w7, rc3); \ - } while (0) - - -#define TFBIG_4e(s) do { \ - TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \ - TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \ - TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \ - TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \ - TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \ - } while (0) - -#define TFBIG_4o(s) do { \ - TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \ - TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \ - TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \ - TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \ - TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \ - } while (0) - - -// scale buf offset by 4 -#define UBI_BIG_4WAY(etype, extra) \ -do { \ - sph_u64 t0, t1, t2; \ - __m256i h8; \ - __m256i m0 = buf[0]; \ - __m256i m1 = buf[1]; \ - __m256i m2 = buf[2]; \ - __m256i m3 = buf[3]; \ - __m256i m4 = buf[4]; \ - __m256i m5 = buf[5]; \ - __m256i m6 = buf[6]; \ - __m256i m7 = buf[7]; \ -\ - __m256i p0 = m0; \ - __m256i p1 = m1; \ - __m256i p2 = m2; \ - __m256i p3 = m3; \ - __m256i p4 = m4; \ - __m256i p5 = m5; \ - __m256i p6 = m6; \ - __m256i p7 = m7; \ - t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \ - t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \ - TFBIG_KINIT_4WAY(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \ - TFBIG_4e(0); \ - TFBIG_4o(1); \ - TFBIG_4e(2); \ - TFBIG_4o(3); \ - TFBIG_4e(4); \ - TFBIG_4o(5); \ - TFBIG_4e(6); \ - TFBIG_4o(7); \ - TFBIG_4e(8); \ - TFBIG_4o(9); \ - TFBIG_4e(10); \ - TFBIG_4o(11); \ - TFBIG_4e(12); \ - 
TFBIG_4o(13); \ - TFBIG_4e(14); \ - TFBIG_4o(15); \ - TFBIG_4e(16); \ - TFBIG_4o(17); \ - TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); \ - h0 = _mm256_xor_si256( m0, p0 );\ - h1 = _mm256_xor_si256( m1, p1 );\ - h2 = _mm256_xor_si256( m2, p2 );\ - h3 = _mm256_xor_si256( m3, p3 );\ - h4 = _mm256_xor_si256( m4, p4 );\ - h5 = _mm256_xor_si256( m5, p5 );\ - h6 = _mm256_xor_si256( m6, p6 );\ - h7 = _mm256_xor_si256( m7, p7 );\ -} while (0) - - -#define DECL_STATE_BIG_4WAY \ - __m256i h0, h1, h2, h3, h4, h5, h6, h7; \ - sph_u64 bcount; - -#define READ_STATE_BIG(sc) do { \ - h0 = (sc)->h0; \ - h1 = (sc)->h1; \ - h2 = (sc)->h2; \ - h3 = (sc)->h3; \ - h4 = (sc)->h4; \ - h5 = (sc)->h5; \ - h6 = (sc)->h6; \ - h7 = (sc)->h7; \ - bcount = sc->bcount; \ - } while (0) - -#define WRITE_STATE_BIG(sc) do { \ - (sc)->h0 = h0; \ - (sc)->h1 = h1; \ - (sc)->h2 = h2; \ - (sc)->h3 = h3; \ - (sc)->h4 = h4; \ - (sc)->h5 = h5; \ - (sc)->h6 = h6; \ - (sc)->h7 = h7; \ - sc->bcount = bcount; \ - } while (0) - - -static void -skein_big_init_4way( skein512_4way_context *sc, const sph_u64 *iv ) -{ - sc->h0 = _mm256_set_epi64x( iv[0], iv[0],iv[0],iv[0] ); - sc->h1 = _mm256_set_epi64x( iv[1], iv[1],iv[1],iv[1] ); - sc->h2 = _mm256_set_epi64x( iv[2], iv[2],iv[2],iv[2] ); - sc->h3 = _mm256_set_epi64x( iv[3], iv[3],iv[3],iv[3] ); - sc->h4 = _mm256_set_epi64x( iv[4], iv[4],iv[4],iv[4] ); - sc->h5 = _mm256_set_epi64x( iv[5], iv[5],iv[5],iv[5] ); - sc->h6 = _mm256_set_epi64x( iv[6], iv[6],iv[6],iv[6] ); - sc->h7 = _mm256_set_epi64x( iv[7], iv[7],iv[7],iv[7] ); - sc->bcount = 0; - sc->ptr = 0; -} - -static void -skein_big_core_4way( skein512_4way_context *sc, const void *data, - size_t len ) -{ - __m256i *vdata = (__m256i*)data; - __m256i *buf; - size_t ptr; - unsigned first; - DECL_STATE_BIG_4WAY - - buf = sc->buf; - ptr = sc->ptr; - const int buf_size = 64; // 64 * _m256i - - if ( len <= buf_size - ptr ) - { - memcpy_256( buf + (ptr>>3), vdata, len>>3 ); - sc->ptr = ptr + len; - return; - 
} - - READ_STATE_BIG( sc ); - first = ( bcount == 0 ) << 7; - do { - size_t clen; - - if ( ptr == buf_size ) - { - bcount ++; - UBI_BIG_4WAY( 96 + first, 0 ); - first = 0; - ptr = 0; - } - clen = buf_size - ptr; - if ( clen > len ) - clen = len; - memcpy_256( buf + (ptr>>3), vdata, clen>>3 ); - ptr += clen; - vdata += (clen>>3); - len -= clen; - } while ( len > 0 ); - WRITE_STATE_BIG( sc ); - sc->ptr = ptr; -} - -static void -skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n, - void *dst, size_t out_len ) -{ - __m256i *buf; - size_t ptr; - unsigned et; - DECL_STATE_BIG_4WAY - - buf = sc->buf; - ptr = sc->ptr; - const int buf_size = 64; - - /* - * At that point, if ptr == 0, then the message was empty; - * otherwise, there is between 1 and 64 bytes (inclusive) which - * are yet to be processed. Either way, we complete the buffer - * to a full block with zeros (the Skein specification mandates - * that an empty message is padded so that there is at least - * one block to process). - * - * Once this block has been processed, we do it again, with - * a block full of zeros, for the output (that block contains - * the encoding of "0", over 8 bytes, then padded with zeros). 
- */ - - READ_STATE_BIG(sc); - - memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 ); - et = 352 + ((bcount == 0) << 7); - UBI_BIG_4WAY( et, ptr ); - - memset_zero_256( buf, buf_size >> 3 ); - bcount = 0; - UBI_BIG_4WAY( 510, 8 ); - - buf[0] = h0; - buf[1] = h1; - buf[2] = h2; - buf[3] = h3; - buf[4] = h4; - buf[5] = h5; - buf[6] = h6; - buf[7] = h7; - - memcpy_256( dst, buf, out_len >> 3 ); -} - -static const sph_u64 IV256[] = { - SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB), - SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB), - SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251), - SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13) -}; - -static const sph_u64 IV512[] = { - SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03), - SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1), - SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4), - SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33) -}; - - -void -skein256_4way_init(void *cc) -{ - skein_big_init_4way(cc, IV256); -} - -void -skein256_4way(void *cc, const void *data, size_t len) -{ - skein_big_core_4way(cc, data, len); -} - -void -skein256_4way_close(void *cc, void *dst) -{ - skein_big_close_4way(cc, 0, 0, dst, 32); -} - -void -skein512_4way_init(void *cc) -{ - skein_big_init_4way(cc, IV512); -} - -void -skein512_4way(void *cc, const void *data, size_t len) -{ - skein_big_core_4way(cc, data, len); -} - -void -skein512_4way_close(void *cc, void *dst) -{ - skein_big_close_4way(cc, 0, 0, dst, 64); -} - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/algo/skein/skein-hash-4way.h b/algo/skein/skein-hash-4way.h deleted file mode 100644 index 106daeb..0000000 --- a/algo/skein/skein-hash-4way.h +++ /dev/null @@ -1,85 +0,0 @@ -/* $Id: sph_skein.h 253 2011-06-07 18:33:10Z tp $ */ -/** - * Skein interface. 
The Skein specification defines three main - * functions, called Skein-256, Skein-512 and Skein-1024, which can be - * further parameterized with an output length. For the SHA-3 - * competition, Skein-512 is used for output sizes of 224, 256, 384 and - * 512 bits; this is what this code implements. Thus, we hereafter call - * Skein-224, Skein-256, Skein-384 and Skein-512 what the Skein - * specification defines as Skein-512-224, Skein-512-256, Skein-512-384 - * and Skein-512-512, respectively. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @file sph_skein.h - * @author Thomas Pornin - */ - -#ifndef __SKEIN_HASH_4WAY_H__ -#define __SKEIN_HASH_4WAY_H__ 1 - -#ifdef __AVX2__ - -#ifdef __cplusplus -extern "C"{ -#endif - -#include -#include "algo/sha/sph_types.h" -#include "simd-utils.h" - -// Output size in bits -#define SPH_SIZE_skein256 256 -#define SPH_SIZE_skein512 512 - -typedef struct { - __m256i buf[8] __attribute__ ((aligned (32))); - __m256i h0, h1, h2, h3, h4, h5, h6, h7; - size_t ptr; - sph_u64 bcount; -} sph_skein_4way_big_context; - -typedef sph_skein_4way_big_context skein512_4way_context; -typedef sph_skein_4way_big_context skein256_4way_context; - -void skein512_4way_init(void *cc); -void skein512_4way(void *cc, const void *data, size_t len); -void skein512_4way_close(void *cc, void *dst); -//void sph_skein512_addbits_and_close( -// void *cc, unsigned ub, unsigned n, void *dst); - -void skein256_4way_init(void *cc); -void skein256_4way(void *cc, const void *data, size_t len); -void skein256_4way_close(void *cc, void *dst); -//void sph_skein256_addbits_and_close( -// void *cc, unsigned ub, unsigned n, void *dst); - - -#ifdef __cplusplus -} -#endif -#endif -#endif diff --git a/algo/skein/skein.c b/algo/skein/skein.c deleted file mode 100644 index c493406..0000000 --- a/algo/skein/skein.c +++ /dev/null @@ -1,55 +0,0 @@ -#include "algo-gate-api.h" -#include -#include -#include "sph_skein.h" -#include - -void skeinhash(void *state, const void *input) -{ - uint32_t hash[16] __attribute__ ((aligned (64))); - sph_skein512_context ctx_skein; - SHA256_CTX ctx_sha256; - - sph_skein512_init( &ctx_skein ); - sph_skein512( &ctx_skein, input, 80 ); - sph_skein512_close( &ctx_skein, hash ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash, 64 ); - SHA256_Final( (unsigned char*) hash, &ctx_sha256 ); - - memcpy(state, hash, 32); -} - -int scanhash_skein( struct work *work, uint32_t 
max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t hash64[8] __attribute__ ((aligned (64))); - uint32_t endiandata[20] __attribute__ ((aligned (64))); - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - - swab32_array( endiandata, pdata, 20 ); - - do { - be32enc(&endiandata[19], n); - skeinhash(hash64, endiandata); - if (hash64[7] < Htarg && fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return true; - } - n++; - - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - - return 0; -} - diff --git a/algo/skein/skein2-4way.c b/algo/skein/skein2-4way.c deleted file mode 100644 index b67fa78..0000000 --- a/algo/skein/skein2-4way.c +++ /dev/null @@ -1,62 +0,0 @@ -#include "skein2-gate.h" -#include -#include -#include "skein-hash-4way.h" - -#if defined(SKEIN2_4WAY) - -void skein2hash_4way( void *output, const void *input ) -{ - skein512_4way_context ctx; - uint64_t hash[16*4] __attribute__ ((aligned (64))); - - skein512_4way_init( &ctx ); - skein512_4way( &ctx, input, 80 ); - skein512_4way_close( &ctx, hash ); - - skein512_4way_init( &ctx ); - skein512_4way( &ctx, hash, 64 ); - skein512_4way_close( &ctx, output ); -} - -int scanhash_skein2_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[16*4] __attribute__ ((aligned (64))); - uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[25]); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = 
mythr->id; // thr_id arg is deprecated - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - skein2hash_4way( hash, vdata ); - - for ( int lane = 0; lane < 4; lane++ ) - if ( hash7[ lane<<1 ] <= Htarg ) - { - extr_lane_4x64( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_lane_solution( work, lane_hash, mythr, lane ); - } - } - n += 4; - } while ( (n < max_nonce) && !work_restart[thr_id].restart ); - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/skein/skein2-gate.c b/algo/skein/skein2-gate.c deleted file mode 100644 index 34483b2..0000000 --- a/algo/skein/skein2-gate.c +++ /dev/null @@ -1,23 +0,0 @@ -#include "skein2-gate.h" -#include -#include "sph_skein.h" - -int64_t skein2_get_max64 () -{ - return 0x7ffffLL; -} - -bool register_skein2_algo( algo_gate_t* gate ) -{ - gate->optimizations = AVX2_OPT; -#if defined (SKEIN2_4WAY) - gate->scanhash = (void*)&scanhash_skein2_4way; - gate->hash = (void*)&skein2hash_4way; -#else - gate->scanhash = (void*)&scanhash_skein2; - gate->hash = (void*)&skein2hash; -#endif - gate->get_max64 = (void*)&skein2_get_max64; - return true; -}; - diff --git a/algo/skein/skein2-gate.h b/algo/skein/skein2-gate.h deleted file mode 100644 index 5f3759b..0000000 --- a/algo/skein/skein2-gate.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef __SKEIN2GATE_H__ -#define __SKEIN2_GATE_H__ -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) - #define SKEIN2_4WAY -#endif - -#if defined(SKEIN2_4WAY) -void skein2hash_4way( void *output, const void *input ); -int scanhash_skein2_4way( struct work *work, uint32_t max_nonce, - uint64_t* hashes_done, struct thr_info *mythr ); -#endif - -void skein2hash( void *output, const void *input ); -int scanhash_skein2( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, 
struct thr_info *mythr ); -#endif - diff --git a/algo/skein/skein2.c b/algo/skein/skein2.c deleted file mode 100644 index 93f3c07..0000000 --- a/algo/skein/skein2.c +++ /dev/null @@ -1,69 +0,0 @@ -#include "algo-gate-api.h" -#include -#include - -#include "sph_skein.h" - -// ctx caching seems slower with this algo -//typedef struct { -// sph_skein512_context skein; -//} skein2_ctx_holder; - -//skein2_ctx_holder skein2_ctx; - -//void init_skein2_ctx() -//{ -// sph_skein512_init(&skein2_ctx.skein); -//} - -void skein2hash(void *output, const void *input) -{ - sph_skein512_context ctx_skein; - - uint32_t hash[16] __attribute__ ((aligned (64))); - - sph_skein512_init(&ctx_skein); - sph_skein512(&ctx_skein, input, 80); - sph_skein512_close(&ctx_skein, hash); - - sph_skein512_init(&ctx_skein); - sph_skein512(&ctx_skein, hash, 64); - sph_skein512_close(&ctx_skein, hash); - - memcpy(output, hash, 32); - -} - -int scanhash_skein2( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t hash64[8] __attribute__ ((aligned (64))); - uint32_t endiandata[20] __attribute__ ((aligned (64))); - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - - swab32_array( endiandata, pdata, 20 ); - - do { - be32enc(&endiandata[19], n); - skein2hash(hash64, endiandata); - if (hash64[7] < Htarg && fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return true; - } - n++; - - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - - return 0; -} - - diff --git a/algo/skein/sse2/skein.c b/algo/skein/sse2/skein.c deleted file mode 100644 index e4d9199..0000000 --- a/algo/skein/sse2/skein.c +++ /dev/null @@ -1,482 +0,0 @@ -/* $Id: skein.c 254 2011-06-07 19:38:58Z tp $ */ -/* - * Skein 
implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include -#include - -#include "../sph_skein.h" - -#ifdef __cplusplus -extern "C"{ -#endif - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - - - -/* - * M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7). 
- */ - -#define M9_0_0 0 -#define M9_0_1 1 -#define M9_0_2 2 -#define M9_0_3 3 -#define M9_0_4 4 -#define M9_0_5 5 -#define M9_0_6 6 -#define M9_0_7 7 - -#define M9_1_0 1 -#define M9_1_1 2 -#define M9_1_2 3 -#define M9_1_3 4 -#define M9_1_4 5 -#define M9_1_5 6 -#define M9_1_6 7 -#define M9_1_7 8 - -#define M9_2_0 2 -#define M9_2_1 3 -#define M9_2_2 4 -#define M9_2_3 5 -#define M9_2_4 6 -#define M9_2_5 7 -#define M9_2_6 8 -#define M9_2_7 0 - -#define M9_3_0 3 -#define M9_3_1 4 -#define M9_3_2 5 -#define M9_3_3 6 -#define M9_3_4 7 -#define M9_3_5 8 -#define M9_3_6 0 -#define M9_3_7 1 - -#define M9_4_0 4 -#define M9_4_1 5 -#define M9_4_2 6 -#define M9_4_3 7 -#define M9_4_4 8 -#define M9_4_5 0 -#define M9_4_6 1 -#define M9_4_7 2 - -#define M9_5_0 5 -#define M9_5_1 6 -#define M9_5_2 7 -#define M9_5_3 8 -#define M9_5_4 0 -#define M9_5_5 1 -#define M9_5_6 2 -#define M9_5_7 3 - -#define M9_6_0 6 -#define M9_6_1 7 -#define M9_6_2 8 -#define M9_6_3 0 -#define M9_6_4 1 -#define M9_6_5 2 -#define M9_6_6 3 -#define M9_6_7 4 - -#define M9_7_0 7 -#define M9_7_1 8 -#define M9_7_2 0 -#define M9_7_3 1 -#define M9_7_4 2 -#define M9_7_5 3 -#define M9_7_6 4 -#define M9_7_7 5 - -#define M9_8_0 8 -#define M9_8_1 0 -#define M9_8_2 1 -#define M9_8_3 2 -#define M9_8_4 3 -#define M9_8_5 4 -#define M9_8_6 5 -#define M9_8_7 6 - -#define M9_9_0 0 -#define M9_9_1 1 -#define M9_9_2 2 -#define M9_9_3 3 -#define M9_9_4 4 -#define M9_9_5 5 -#define M9_9_6 6 -#define M9_9_7 7 - -#define M9_10_0 1 -#define M9_10_1 2 -#define M9_10_2 3 -#define M9_10_3 4 -#define M9_10_4 5 -#define M9_10_5 6 -#define M9_10_6 7 -#define M9_10_7 8 - -#define M9_11_0 2 -#define M9_11_1 3 -#define M9_11_2 4 -#define M9_11_3 5 -#define M9_11_4 6 -#define M9_11_5 7 -#define M9_11_6 8 -#define M9_11_7 0 - -#define M9_12_0 3 -#define M9_12_1 4 -#define M9_12_2 5 -#define M9_12_3 6 -#define M9_12_4 7 -#define M9_12_5 8 -#define M9_12_6 0 -#define M9_12_7 1 - -#define M9_13_0 4 -#define M9_13_1 5 -#define M9_13_2 6 -#define 
M9_13_3 7 -#define M9_13_4 8 -#define M9_13_5 0 -#define M9_13_6 1 -#define M9_13_7 2 - -#define M9_14_0 5 -#define M9_14_1 6 -#define M9_14_2 7 -#define M9_14_3 8 -#define M9_14_4 0 -#define M9_14_5 1 -#define M9_14_6 2 -#define M9_14_7 3 - -#define M9_15_0 6 -#define M9_15_1 7 -#define M9_15_2 8 -#define M9_15_3 0 -#define M9_15_4 1 -#define M9_15_5 2 -#define M9_15_6 3 -#define M9_15_7 4 - -#define M9_16_0 7 -#define M9_16_1 8 -#define M9_16_2 0 -#define M9_16_3 1 -#define M9_16_4 2 -#define M9_16_5 3 -#define M9_16_6 4 -#define M9_16_7 5 - -#define M9_17_0 8 -#define M9_17_1 0 -#define M9_17_2 1 -#define M9_17_3 2 -#define M9_17_4 3 -#define M9_17_5 4 -#define M9_17_6 5 -#define M9_17_7 6 - -#define M9_18_0 0 -#define M9_18_1 1 -#define M9_18_2 2 -#define M9_18_3 3 -#define M9_18_4 4 -#define M9_18_5 5 -#define M9_18_6 6 -#define M9_18_7 7 - -/* - * M3_ ## s ## _ ## i evaluates to s+i mod 3 (0 <= s <= 18, 0 <= i <= 1). - */ - -#define M3_0_0 0 -#define M3_0_1 1 -#define M3_1_0 1 -#define M3_1_1 2 -#define M3_2_0 2 -#define M3_2_1 0 -#define M3_3_0 0 -#define M3_3_1 1 -#define M3_4_0 1 -#define M3_4_1 2 -#define M3_5_0 2 -#define M3_5_1 0 -#define M3_6_0 0 -#define M3_6_1 1 -#define M3_7_0 1 -#define M3_7_1 2 -#define M3_8_0 2 -#define M3_8_1 0 -#define M3_9_0 0 -#define M3_9_1 1 -#define M3_10_0 1 -#define M3_10_1 2 -#define M3_11_0 2 -#define M3_11_1 0 -#define M3_12_0 0 -#define M3_12_1 1 -#define M3_13_0 1 -#define M3_13_1 2 -#define M3_14_0 2 -#define M3_14_1 0 -#define M3_15_0 0 -#define M3_15_1 1 -#define M3_16_0 1 -#define M3_16_1 2 -#define M3_17_0 2 -#define M3_17_1 0 -#define M3_18_0 0 -#define M3_18_1 1 - -#define XCAT(x, y) XCAT_(x, y) -#define XCAT_(x, y) x ## y - -#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i)) -#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) - -#define TFBIG_KINIT(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) do { \ - k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \ - ^ 
SPH_C64(0x1BD11BDAA9FC1A22); \ - t2 = t0 ^ t1; \ - } while (0) - -#define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) do { \ - w0 = SPH_T64(w0 + SKBI(k, s, 0)); \ - w1 = SPH_T64(w1 + SKBI(k, s, 1)); \ - w2 = SPH_T64(w2 + SKBI(k, s, 2)); \ - w3 = SPH_T64(w3 + SKBI(k, s, 3)); \ - w4 = SPH_T64(w4 + SKBI(k, s, 4)); \ - w5 = SPH_T64(w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ - w6 = SPH_T64(w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ - w7 = SPH_T64(w7 + SKBI(k, s, 7) + (sph_u64)s); \ - } while (0) - - -#define TFBIG_MIX(x0, x1, rc) do { \ - x0 = SPH_T64(x0 + x1); \ - x1 = SPH_ROTL64(x1, rc) ^ x0; \ - } while (0) - -#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \ - TFBIG_MIX(w0, w1, rc0); \ - TFBIG_MIX(w2, w3, rc1); \ - TFBIG_MIX(w4, w5, rc2); \ - TFBIG_MIX(w6, w7, rc3); \ - } while (0) - -#define TFBIG_4e(s) do { \ - TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, sknh, t, s); \ - TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \ - TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \ - TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \ - TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \ - } while (0) - -#define TFBIG_4o(s) do { \ - TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, sknh, t, s); \ - TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \ - TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \ - TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \ - TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \ - } while (0) - -#define UBI_BIG(etype, extra) do { \ - sph_u64 sknh8, t0, t1, t2; \ - sph_u64 m0 = sph_dec64le_aligned(buf + 0); \ - sph_u64 m1 = sph_dec64le_aligned(buf + 8); \ - sph_u64 m2 = sph_dec64le_aligned(buf + 16); \ - sph_u64 m3 = sph_dec64le_aligned(buf + 24); \ - sph_u64 m4 = sph_dec64le_aligned(buf + 32); \ - sph_u64 m5 = sph_dec64le_aligned(buf + 40); \ - sph_u64 m6 = sph_dec64le_aligned(buf + 48); \ - sph_u64 m7 = sph_dec64le_aligned(buf + 56); \ - 
sph_u64 p0 = m0; \ - sph_u64 p1 = m1; \ - sph_u64 p2 = m2; \ - sph_u64 p3 = m3; \ - sph_u64 p4 = m4; \ - sph_u64 p5 = m5; \ - sph_u64 p6 = m6; \ - sph_u64 p7 = m7; \ - t0 = SPH_T64(hashctA << 6) + (sph_u64)(extra); \ - t1 = (hashctA >> 58) + ((sph_u64)(etype) << 55); \ - TFBIG_KINIT(sknh0, sknh1, sknh2, sknh3, sknh4, sknh5, sknh6, sknh7, sknh8, t0, t1, t2); \ - TFBIG_4e(0); \ - TFBIG_4o(1); \ - TFBIG_4e(2); \ - TFBIG_4o(3); \ - TFBIG_4e(4); \ - TFBIG_4o(5); \ - TFBIG_4e(6); \ - TFBIG_4o(7); \ - TFBIG_4e(8); \ - TFBIG_4o(9); \ - TFBIG_4e(10); \ - TFBIG_4o(11); \ - TFBIG_4e(12); \ - TFBIG_4o(13); \ - TFBIG_4e(14); \ - TFBIG_4o(15); \ - TFBIG_4e(16); \ - TFBIG_4o(17); \ - TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, sknh, t, 18); \ - sknh0 = m0 ^ p0; \ - sknh1 = m1 ^ p1; \ - sknh2 = m2 ^ p2; \ - sknh3 = m3 ^ p3; \ - sknh4 = m4 ^ p4; \ - sknh5 = m5 ^ p5; \ - sknh6 = m6 ^ p6; \ - sknh7 = m7 ^ p7; \ - } while (0) - - -#define sknDECL_STATE_BIG \ - sph_u64 sknh0, sknh1, sknh2, sknh3, sknh4, sknh5, sknh6, sknh7; \ - -#define DECL_SKN \ - sph_u64 sknh0, sknh1, sknh2, sknh3, sknh4, sknh5, sknh6, sknh7; \ - -#define sknREAD_STATE_BIG(sc) do { \ - sknh0 = (sc)->sknh0; \ - sknh1 = (sc)->sknh1; \ - sknh2 = (sc)->sknh2; \ - sknh3 = (sc)->sknh3; \ - sknh4 = (sc)->sknh4; \ - sknh5 = (sc)->sknh5; \ - sknh6 = (sc)->sknh6; \ - sknh7 = (sc)->sknh7; \ - } while (0) - -#define sknWRITE_STATE_BIG(sc) do { \ - (sc)->sknh0 = sknh0; \ - (sc)->sknh1 = sknh1; \ - (sc)->sknh2 = sknh2; \ - (sc)->sknh3 = sknh3; \ - (sc)->sknh4 = sknh4; \ - (sc)->sknh5 = sknh5; \ - (sc)->sknh6 = sknh6; \ - (sc)->sknh7 = sknh7; \ - } while (0) - - -/* not used */ -#define SKN_H \ -do { \ - sph_skein512_init(&ctx_skein); \ - skein_big_core(&ctx_skein, hash,64); \ - sph_skein512_close(&ctx_skein, hash); \ -} while (0) - -/* load initial constants */ -#define SKN_I \ -do { \ - sknh0 = sknIV512[0]; \ - sknh1 = sknIV512[1]; \ - sknh2 = sknIV512[2]; \ - sknh3 = sknIV512[3]; \ - sknh4 = sknIV512[4]; \ - sknh5 = 
sknIV512[5]; \ - sknh6 = sknIV512[6]; \ - sknh7 = sknIV512[7]; \ - hashctA = 0; \ - hashptr = 0; \ -} while (0) - -/* load hash for loop */ -#define SKN_U \ -do { \ - unsigned char *buf; \ - size_t ptr; \ - size_t len = 64; \ - const void *data = hash; \ - buf = hashbuf; \ - ptr = hashptr; \ - memcpy(buf + ptr, data, len); \ - ptr += len; \ - hashptr = ptr; \ -} while (0) - -/* skein512 hash loaded */ -/* hash = skein512(loaded) */ -#define SKN_C \ -do { \ - unsigned char *buf; \ - size_t ptr; \ - unsigned et; \ - \ - buf = hashbuf; \ - ptr = hashptr; \ - \ - memset(buf + ptr, 0, (sizeof(char)*64) - ptr); \ - /* for break loop */ \ - /* one copy of inline UBI_BIG */ \ - et = 352 + ((hashctA == 0) << 7) + (0 != 0); \ - for (;;) { \ - UBI_BIG(et, ptr); \ - /* et gets changed for 2nd run */ \ - if (et == 510) break; \ - memset(buf, 0, (sizeof(char)*64)); \ - hashctA = 0; \ - et = 510; \ - ptr = 8; \ - } \ - \ - sph_enc64le_aligned(buf + 0, sknh0); \ - sph_enc64le_aligned(buf + 8, sknh1); \ - sph_enc64le_aligned(buf + 16, sknh2); \ - sph_enc64le_aligned(buf + 24, sknh3); \ - sph_enc64le_aligned(buf + 32, sknh4); \ - sph_enc64le_aligned(buf + 40, sknh5); \ - sph_enc64le_aligned(buf + 48, sknh6); \ - sph_enc64le_aligned(buf + 56, sknh7); \ - memcpy(hash, buf, 64); \ - \ -} while (0) - -static const sph_u64 sknIV512[] = { - SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03), - SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1), - SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4), - SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33) -}; - - -#ifdef __cplusplus -} -#endif diff --git a/algo/skein/sse2/sph_skein.h b/algo/skein/sse2/sph_skein.h deleted file mode 100644 index adac1ee..0000000 --- a/algo/skein/sse2/sph_skein.h +++ /dev/null @@ -1,66 +0,0 @@ -/* $Id: sph_skein.h 253 2011-06-07 18:33:10Z tp $ */ -/** - * Skein interface. 
The Skein specification defines three main - * functions, called Skein-256, Skein-512 and Skein-1024, which can be - * further parameterized with an output length. For the SHA-3 - * competition, Skein-512 is used for output sizes of 224, 256, 384 and - * 512 bits; this is what this code implements. Thus, we hereafter call - * Skein-224, Skein-256, Skein-384 and Skein-512 what the Skein - * specification defines as Skein-512-224, Skein-512-256, Skein-512-384 - * and Skein-512-512, respectively. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @file sph_skein.h - * @author Thomas Pornin - */ - -#ifndef SPH_SKEIN_H__ -#define SPH_SKEIN_H__ - -#ifdef __cplusplus -extern "C"{ -#endif - -#include -#include "sph_types.h" - -#define SPH_SIZE_skein512 512 - -typedef struct { -#ifndef DOXYGEN_IGNORE - sph_u64 sknh0, sknh1, sknh2, sknh3, sknh4, sknh5, sknh6, sknh7; -#endif -} sph_skein_big_context; - -typedef sph_skein_big_context sph_skein512_context; - - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/algo/sm3/sm3-hash-4way.c b/algo/sm3/sm3-hash-4way.c deleted file mode 100644 index 1396d23..0000000 --- a/algo/sm3/sm3-hash-4way.c +++ /dev/null @@ -1,231 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2014 - 2017 The GmSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the GmSSL Project. - * (http://gmssl.org/)" - * - * 4. The name "GmSSL Project" must not be used to endorse or promote - * products derived from this software without prior written - * permission. For written permission, please contact - * guanzhi1980@gmail.com. - * - * 5. Products derived from this software may not be called "GmSSL" - * nor may "GmSSL" appear in their names without prior written - * permission of the GmSSL Project. 
- * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the GmSSL Project - * (http://gmssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - */ - -#include -#include "sm3-hash-4way.h" - -#ifdef __SSE4_2__ - -void sm3_4way_init( sm3_4way_ctx_t *ctx ) -{ - ctx->digest[0] = _mm_set1_epi32( 0x7380166F ); - ctx->digest[1] = _mm_set1_epi32( 0x4914B2B9 ); - ctx->digest[2] = _mm_set1_epi32( 0x172442D7 ); - ctx->digest[3] = _mm_set1_epi32( 0xDA8A0600 ); - ctx->digest[4] = _mm_set1_epi32( 0xA96F30BC ); - ctx->digest[5] = _mm_set1_epi32( 0x163138AA ); - ctx->digest[6] = _mm_set1_epi32( 0xE38DEE4D ); - ctx->digest[7] = _mm_set1_epi32( 0xB0FB0E4E ); - ctx->nblocks = 0; - ctx->num = 0; -} - -void sm3_4way( void *cc, const void *data, size_t len ) -{ - sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc; - __m128i *block = (__m128i*)ctx->block; - __m128i *vdata = (__m128i*)data; - - if ( ctx->num ) - { - unsigned int left = SM3_BLOCK_SIZE - ctx->num; - if ( len < left ) - { - memcpy_128( block + (ctx->num >> 2), vdata , len>>2 ); - ctx->num += len; - return; - } - else - { - memcpy_128( block 
+ (ctx->num >> 2), vdata , left>>2 ); - sm3_4way_compress( ctx->digest, block ); - ctx->nblocks++; - vdata += left>>2; - len -= left; - } - } - while ( len >= SM3_BLOCK_SIZE ) - { - sm3_4way_compress( ctx->digest, vdata ); - ctx->nblocks++; - vdata += SM3_BLOCK_SIZE>>2; - len -= SM3_BLOCK_SIZE; - } - ctx->num = len; - if ( len ) - memcpy_128( block, vdata, len>>2 ); -} - -void sm3_4way_close( void *cc, void *dst ) -{ - sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc; - __m128i *hash = (__m128i*)dst; - __m128i *count = (__m128i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) ); - __m128i *block = (__m128i*)ctx->block; - int i; - - block[ctx->num] = _mm_set1_epi32( 0x80 ); - - if ( ctx->num + 8 <= SM3_BLOCK_SIZE ) - { - memset_zero_128( block + (ctx->num >> 2) + 1, - ( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 ); - } - else - { - memset_zero_128( block + (ctx->num >> 2) + 1, - ( SM3_BLOCK_SIZE - (ctx->num >> 2) - 1 ) ); - sm3_4way_compress( ctx->digest, block ); - memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 ); - } - - count[0] = mm128_bswap_32( - _mm_set1_epi32( ctx->nblocks >> 23 ) ); - count[1] = mm128_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) + - ( ctx->num << 3 ) ) ); - sm3_4way_compress( ctx->digest, block ); - - for ( i = 0; i < 8 ; i++ ) - hash[i] = mm128_bswap_32( ctx->digest[i] ); -} - -#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 9 ), \ - mm128_rol_32( x, 17 ) ) ) -#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 15 ), \ - mm128_rol_32( x, 23 ) ) ) - -#define FF0(x,y,z) _mm_xor_si128( x, _mm_xor_si128( y, z ) ) -#define FF1(x,y,z) _mm_or_si128( _mm_or_si128( _mm_and_si128( x, y ), \ - _mm_and_si128( x, z ) ), \ - _mm_and_si128( y, z ) ) - -#define GG0(x,y,z) FF0(x,y,z) -#define GG1(x,y,z) _mm_or_si128( _mm_and_si128( x, y ), \ - _mm_andnot_si128( x, z ) ) - - -void sm3_4way_compress( __m128i *digest, __m128i *block ) -{ - __m128i W[68], W1[64]; - __m128i A = digest[ 0 ]; - __m128i B = digest[ 1 ]; - __m128i C = digest[ 2 ]; 
- __m128i D = digest[ 3 ]; - __m128i E = digest[ 4 ]; - __m128i F = digest[ 5 ]; - __m128i G = digest[ 6 ]; - __m128i H = digest[ 7 ]; - __m128i SS1, SS2, TT1, TT2, T; - int j; - - for ( j = 0; j < 16; j++ ) - W[j] = mm128_bswap_32( block[j] ); - - for ( j = 16; j < 68; j++ ) - W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ], - W[ j-9 ] ), - mm128_rol_32( W[ j-3 ], 15 ) ) ), - _mm_xor_si128( mm128_rol_32( W[ j-13 ], 7 ), - W[ j-6 ] ) ); - - for( j = 0; j < 64; j++ ) - W1[j] = _mm_xor_si128( W[j], W[j+4] ); - - T = _mm_set1_epi32( 0x79CC4519UL ); - for( j =0; j < 16; j++ ) - { - SS1 = mm128_rol_32( _mm_add_epi32( _mm_add_epi32( mm128_rol_32(A,12), E ), - mm128_rol_32( T, j ) ), 7 ); - SS2 = _mm_xor_si128( SS1, mm128_rol_32( A, 12 ) ); - TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF0( A, B, C ), D ), - SS2 ), W1[j] ); - TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG0( E, F, G ), H ), - SS1 ), W[j] ); - D = C; - C = mm128_rol_32( B, 9 ); - B = A; - A = TT1; - H = G; - G = mm128_rol_32( F, 19 ); - F = E; - E = P0( TT2 ); - } - - T = _mm_set1_epi32( 0x7A879D8AUL ); - for( j =16; j < 64; j++ ) - { - SS1 = mm128_rol_32( _mm_add_epi32( _mm_add_epi32( mm128_rol_32(A,12), E ), - mm128_rol_32( T, j&31 ) ), 7 ); - SS2 = _mm_xor_si128( SS1, mm128_rol_32( A, 12 ) ); - TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF1( A, B, C ), D ), - SS2 ), W1[j] ); - TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG1( E, F, G ), H ), - SS1 ), W[j] ); - D = C; - C = mm128_rol_32( B, 9 ); - B = A; - A = TT1; - H = G; - G = mm128_rol_32( F, 19 ); - F = E; - E = P0( TT2 ); - } - - digest[0] = _mm_xor_si128( digest[0], A ); - digest[1] = _mm_xor_si128( digest[1], B ); - digest[2] = _mm_xor_si128( digest[2], C ); - digest[3] = _mm_xor_si128( digest[3], D ); - digest[4] = _mm_xor_si128( digest[4], E ); - digest[5] = _mm_xor_si128( digest[5], F ); - digest[6] = _mm_xor_si128( digest[6], G ); - digest[7] = _mm_xor_si128( digest[7], H ); -} - -#endif - diff 
--git a/algo/sm3/sm3-hash-4way.h b/algo/sm3/sm3-hash-4way.h deleted file mode 100644 index 06159d8..0000000 --- a/algo/sm3/sm3-hash-4way.h +++ /dev/null @@ -1,89 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2014 - 2016 The GmSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the GmSSL Project. - * (http://gmssl.org/)" - * - * 4. The name "GmSSL Project" must not be used to endorse or promote - * products derived from this software without prior written - * permission. For written permission, please contact - * guanzhi1980@gmail.com. - * - * 5. Products derived from this software may not be called "GmSSL" - * nor may "GmSSL" appear in their names without prior written - * permission of the GmSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the GmSSL Project - * (http://gmssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE GmSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - */ - -#ifndef SPH_SM3_HASH_4WAY_H -#define SPH_SM3_HASH_4WAY_H - -#define SM3_DIGEST_LENGTH 32 -#define SM3_BLOCK_SIZE 64 -#define SM3_CBLOCK (SM3_BLOCK_SIZE) -#define SM3_HMAC_SIZE (SM3_DIGEST_LENGTH) - - -#include -#include -#include -#include "simd-utils.h" - -#ifdef __cplusplus -extern "C" { -#endif - - -typedef struct { - __m128i block[16] __attribute__ ((aligned (64))); - __m128i digest[8]; - uint32_t nblocks; - uint32_t num; -} sm3_4way_ctx_t; - -void sm3_4way_init( sm3_4way_ctx_t *ctx ); -//void sm3_4way_update( sm3_4way_ctx_t *ctx, const unsigned char* data, -// size_t data_len ); -//void sm3_4way_final( sm3_4way_ctx_t *ctx, -// unsigned char digest[SM3_DIGEST_LENGTH] ); -void sm3_4way_compress( __m128i *digest, __m128i *block ); - -void sm3_4way(void *cc, const void *data, size_t len); -void sm3_4way_close(void *cc, void *dst); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/algo/sm3/sm3.c b/algo/sm3/sm3.c deleted file mode 100644 index aea56cb..0000000 --- a/algo/sm3/sm3.c +++ /dev/null @@ -1,226 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2014 - 2017 The GmSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the GmSSL Project. - * (http://gmssl.org/)" - * - * 4. The name "GmSSL Project" must not be used to endorse or promote - * products derived from this software without prior written - * permission. For written permission, please contact - * guanzhi1980@gmail.com. - * - * 5. Products derived from this software may not be called "GmSSL" - * nor may "GmSSL" appear in their names without prior written - * permission of the GmSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the GmSSL Project - * (http://gmssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. 
- * ==================================================================== - */ - -#include -#include "sph_sm3.h" - -void sm3_init(sm3_ctx_t *ctx) -{ - ctx->digest[0] = 0x7380166F; - ctx->digest[1] = 0x4914B2B9; - ctx->digest[2] = 0x172442D7; - ctx->digest[3] = 0xDA8A0600; - ctx->digest[4] = 0xA96F30BC; - ctx->digest[5] = 0x163138AA; - ctx->digest[6] = 0xE38DEE4D; - ctx->digest[7] = 0xB0FB0E4E; - - ctx->nblocks = 0; - ctx->num = 0; -} - -void -sph_sm3(void *cc, const void *data, size_t len) -{ - sm3_update(cc, data, len); -} - -void sm3_update(sm3_ctx_t *ctx, const unsigned char* data, size_t data_len) -{ - if (ctx->num) { - unsigned int left = SM3_BLOCK_SIZE - ctx->num; - if (data_len < left) { - memcpy(ctx->block + ctx->num, data, data_len); - ctx->num += data_len; - return; - } else { - memcpy(ctx->block + ctx->num, data, left); - sm3_compress(ctx->digest, ctx->block); - ctx->nblocks++; - data += left; - data_len -= left; - } - } - while (data_len >= SM3_BLOCK_SIZE) { - sm3_compress(ctx->digest, data); - ctx->nblocks++; - data += SM3_BLOCK_SIZE; - data_len -= SM3_BLOCK_SIZE; - } - ctx->num = data_len; - if (data_len) { - memcpy(ctx->block, data, data_len); - } -} - -void -sph_sm3_close(void *cc, void *dst) -{ - sm3_final(cc, dst); - memset(cc, 0, sizeof(sm3_ctx_t)); -} - -void sm3_final(sm3_ctx_t *ctx, unsigned char *digest) -{ - int i; - uint32_t *pdigest = (uint32_t *)digest; - uint32_t *count = (uint32_t *)(ctx->block + SM3_BLOCK_SIZE - 8); - - ctx->block[ctx->num] = 0x80; - - if (ctx->num + 9 <= SM3_BLOCK_SIZE) { - memset(ctx->block + ctx->num + 1, 0, SM3_BLOCK_SIZE - ctx->num - 9); - } else { - memset(ctx->block + ctx->num + 1, 0, SM3_BLOCK_SIZE - ctx->num - 1); - sm3_compress(ctx->digest, ctx->block); - memset(ctx->block, 0, SM3_BLOCK_SIZE - 8); - } - - count[0] = cpu_to_be32((ctx->nblocks) >> 23); - count[1] = cpu_to_be32((ctx->nblocks << 9) + (ctx->num << 3)); - - sm3_compress(ctx->digest, ctx->block); - for (i = 0; i < 
sizeof(ctx->digest)/sizeof(ctx->digest[0]); i++) { - pdigest[i] = cpu_to_be32(ctx->digest[i]); - } -} - -#define ROTATELEFT(X,n) (((X)<<(n)) | ((X)>>(32-(n)))) - -#define P0(x) ((x) ^ ROTATELEFT((x),9) ^ ROTATELEFT((x),17)) -#define P1(x) ((x) ^ ROTATELEFT((x),15) ^ ROTATELEFT((x),23)) - -#define FF0(x,y,z) ( (x) ^ (y) ^ (z)) -#define FF1(x,y,z) (((x) & (y)) | ( (x) & (z)) | ( (y) & (z))) - -#define GG0(x,y,z) ( (x) ^ (y) ^ (z)) -#define GG1(x,y,z) (((x) & (y)) | ( (~(x)) & (z)) ) - - -void sm3_compress(uint32_t digest[8], const unsigned char block[64]) -{ - int j; - uint32_t W[68], W1[64]; - const uint32_t *pblock = (const uint32_t *)block; - - uint32_t A = digest[0]; - uint32_t B = digest[1]; - uint32_t C = digest[2]; - uint32_t D = digest[3]; - uint32_t E = digest[4]; - uint32_t F = digest[5]; - uint32_t G = digest[6]; - uint32_t H = digest[7]; - uint32_t SS1,SS2,TT1,TT2,T[64]; - - for (j = 0; j < 16; j++) { - W[j] = cpu_to_be32(pblock[j]); - } - for (j = 16; j < 68; j++) { - W[j] = P1( W[j-16] ^ W[j-9] ^ ROTATELEFT(W[j-3],15)) ^ ROTATELEFT(W[j - 13],7 ) ^ W[j-6];; - } - for( j = 0; j < 64; j++) { - W1[j] = W[j] ^ W[j+4]; - } - - for(j =0; j < 16; j++) { - - T[j] = 0x79CC4519; - SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j)), 7); - SS2 = SS1 ^ ROTATELEFT(A,12); - TT1 = FF0(A,B,C) + D + SS2 + W1[j]; - TT2 = GG0(E,F,G) + H + SS1 + W[j]; - D = C; - C = ROTATELEFT(B,9); - B = A; - A = TT1; - H = G; - G = ROTATELEFT(F,19); - F = E; - E = P0(TT2); - } - - for(j =16; j < 64; j++) { - - T[j] = 0x7A879D8A; - SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j&31)), 7); - SS2 = SS1 ^ ROTATELEFT(A,12); - TT1 = FF1(A,B,C) + D + SS2 + W1[j]; - TT2 = GG1(E,F,G) + H + SS1 + W[j]; - D = C; - C = ROTATELEFT(B,9); - B = A; - A = TT1; - H = G; - G = ROTATELEFT(F,19); - F = E; - E = P0(TT2); - } - - digest[0] ^= A; - digest[1] ^= B; - digest[2] ^= C; - digest[3] ^= D; - digest[4] ^= E; - digest[5] ^= F; - digest[6] ^= G; - digest[7] ^= H; -} - -void sm3(const 
unsigned char *msg, size_t msglen, - unsigned char dgst[SM3_DIGEST_LENGTH]) -{ - sm3_ctx_t ctx; - - sm3_init(&ctx); - sm3_update(&ctx, msg, msglen); - sm3_final(&ctx, dgst); - - memset(&ctx, 0, sizeof(sm3_ctx_t)); -} diff --git a/algo/sm3/sph_sm3.h b/algo/sm3/sph_sm3.h deleted file mode 100644 index eab61d3..0000000 --- a/algo/sm3/sph_sm3.h +++ /dev/null @@ -1,120 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2014 - 2016 The GmSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the GmSSL Project. - * (http://gmssl.org/)" - * - * 4. The name "GmSSL Project" must not be used to endorse or promote - * products derived from this software without prior written - * permission. For written permission, please contact - * guanzhi1980@gmail.com. - * - * 5. Products derived from this software may not be called "GmSSL" - * nor may "GmSSL" appear in their names without prior written - * permission of the GmSSL Project. - * - * 6. 
Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the GmSSL Project - * (http://gmssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - */ - -#ifndef SPH_SM3_H -#define SPH_SM3_H - -#define SM3_DIGEST_LENGTH 32 -#define SM3_BLOCK_SIZE 64 -#define SM3_CBLOCK (SM3_BLOCK_SIZE) -#define SM3_HMAC_SIZE (SM3_DIGEST_LENGTH) - - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - - -typedef struct { - uint32_t digest[8]; - int nblocks; - unsigned char block[64]; - int num; -} sm3_ctx_t; - -void sm3_init(sm3_ctx_t *ctx); -void sm3_update(sm3_ctx_t *ctx, const unsigned char* data, size_t data_len); -void sm3_final(sm3_ctx_t *ctx, unsigned char digest[SM3_DIGEST_LENGTH]); -void sm3_compress(uint32_t digest[8], const unsigned char block[SM3_BLOCK_SIZE]); -void sm3(const unsigned char *data, size_t datalen, - unsigned char digest[SM3_DIGEST_LENGTH]); - -void sph_sm3(void *cc, const void *data, size_t len); -void sph_sm3_close(void *cc, void *dst); - -typedef struct { - sm3_ctx_t sm3_ctx; - unsigned char key[SM3_BLOCK_SIZE]; -} sm3_hmac_ctx_t; - -void 
sm3_hmac_init(sm3_hmac_ctx_t *ctx, const unsigned char *key, size_t key_len); -void sm3_hmac_update(sm3_hmac_ctx_t *ctx, const unsigned char *data, size_t data_len); -void sm3_hmac_final(sm3_hmac_ctx_t *ctx, unsigned char mac[SM3_HMAC_SIZE]); -void sm3_hmac(const unsigned char *data, size_t data_len, - const unsigned char *key, size_t key_len, unsigned char mac[SM3_HMAC_SIZE]); - -#ifdef CPU_BIGENDIAN - -#define cpu_to_be16(v) (v) -#define cpu_to_be32(v) (v) -#define be16_to_cpu(v) (v) -#define be32_to_cpu(v) (v) - -#else - -#define cpu_to_le16(v) (v) -#define cpu_to_le32(v) (v) -#define le16_to_cpu(v) (v) -#define le32_to_cpu(v) (v) - -#define cpu_to_be16(v) (((v)<< 8) | ((v)>>8)) -#define cpu_to_be32(v) (((v)>>24) | (((v)>>8)&0xff00) | (((v)<<8)&0xff0000) | ((v)<<24)) -#define be16_to_cpu(v) cpu_to_be16(v) -#define be32_to_cpu(v) cpu_to_be32(v) - -#endif - -#ifdef __cplusplus -} -#endif -#endif diff --git a/algo/tiger/sph_tiger.c b/algo/tiger/sph_tiger.c deleted file mode 100644 index aa563ad..0000000 --- a/algo/tiger/sph_tiger.c +++ /dev/null @@ -1,698 +0,0 @@ -/* $Id: tiger.c 216 2010-06-08 09:46:57Z tp $ */ -/* - * Tiger / Tiger2 implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include -#include - -#include "sph_tiger.h" - -#if SPH_64 - -static const sph_u64 T1[256] = { - SPH_C64(0x02AAB17CF7E90C5E), SPH_C64(0xAC424B03E243A8EC), - SPH_C64(0x72CD5BE30DD5FCD3), SPH_C64(0x6D019B93F6F97F3A), - SPH_C64(0xCD9978FFD21F9193), SPH_C64(0x7573A1C9708029E2), - SPH_C64(0xB164326B922A83C3), SPH_C64(0x46883EEE04915870), - SPH_C64(0xEAACE3057103ECE6), SPH_C64(0xC54169B808A3535C), - SPH_C64(0x4CE754918DDEC47C), SPH_C64(0x0AA2F4DFDC0DF40C), - SPH_C64(0x10B76F18A74DBEFA), SPH_C64(0xC6CCB6235AD1AB6A), - SPH_C64(0x13726121572FE2FF), SPH_C64(0x1A488C6F199D921E), - SPH_C64(0x4BC9F9F4DA0007CA), SPH_C64(0x26F5E6F6E85241C7), - SPH_C64(0x859079DBEA5947B6), SPH_C64(0x4F1885C5C99E8C92), - SPH_C64(0xD78E761EA96F864B), SPH_C64(0x8E36428C52B5C17D), - SPH_C64(0x69CF6827373063C1), SPH_C64(0xB607C93D9BB4C56E), - SPH_C64(0x7D820E760E76B5EA), SPH_C64(0x645C9CC6F07FDC42), - SPH_C64(0xBF38A078243342E0), SPH_C64(0x5F6B343C9D2E7D04), - SPH_C64(0xF2C28AEB600B0EC6), SPH_C64(0x6C0ED85F7254BCAC), - SPH_C64(0x71592281A4DB4FE5), SPH_C64(0x1967FA69CE0FED9F), - SPH_C64(0xFD5293F8B96545DB), SPH_C64(0xC879E9D7F2A7600B), - SPH_C64(0x860248920193194E), SPH_C64(0xA4F9533B2D9CC0B3), - SPH_C64(0x9053836C15957613), SPH_C64(0xDB6DCF8AFC357BF1), - SPH_C64(0x18BEEA7A7A370F57), SPH_C64(0x037117CA50B99066), - SPH_C64(0x6AB30A9774424A35), SPH_C64(0xF4E92F02E325249B), - SPH_C64(0x7739DB07061CCAE1), 
SPH_C64(0xD8F3B49CECA42A05), - SPH_C64(0xBD56BE3F51382F73), SPH_C64(0x45FAED5843B0BB28), - SPH_C64(0x1C813D5C11BF1F83), SPH_C64(0x8AF0E4B6D75FA169), - SPH_C64(0x33EE18A487AD9999), SPH_C64(0x3C26E8EAB1C94410), - SPH_C64(0xB510102BC0A822F9), SPH_C64(0x141EEF310CE6123B), - SPH_C64(0xFC65B90059DDB154), SPH_C64(0xE0158640C5E0E607), - SPH_C64(0x884E079826C3A3CF), SPH_C64(0x930D0D9523C535FD), - SPH_C64(0x35638D754E9A2B00), SPH_C64(0x4085FCCF40469DD5), - SPH_C64(0xC4B17AD28BE23A4C), SPH_C64(0xCAB2F0FC6A3E6A2E), - SPH_C64(0x2860971A6B943FCD), SPH_C64(0x3DDE6EE212E30446), - SPH_C64(0x6222F32AE01765AE), SPH_C64(0x5D550BB5478308FE), - SPH_C64(0xA9EFA98DA0EDA22A), SPH_C64(0xC351A71686C40DA7), - SPH_C64(0x1105586D9C867C84), SPH_C64(0xDCFFEE85FDA22853), - SPH_C64(0xCCFBD0262C5EEF76), SPH_C64(0xBAF294CB8990D201), - SPH_C64(0xE69464F52AFAD975), SPH_C64(0x94B013AFDF133E14), - SPH_C64(0x06A7D1A32823C958), SPH_C64(0x6F95FE5130F61119), - SPH_C64(0xD92AB34E462C06C0), SPH_C64(0xED7BDE33887C71D2), - SPH_C64(0x79746D6E6518393E), SPH_C64(0x5BA419385D713329), - SPH_C64(0x7C1BA6B948A97564), SPH_C64(0x31987C197BFDAC67), - SPH_C64(0xDE6C23C44B053D02), SPH_C64(0x581C49FED002D64D), - SPH_C64(0xDD474D6338261571), SPH_C64(0xAA4546C3E473D062), - SPH_C64(0x928FCE349455F860), SPH_C64(0x48161BBACAAB94D9), - SPH_C64(0x63912430770E6F68), SPH_C64(0x6EC8A5E602C6641C), - SPH_C64(0x87282515337DDD2B), SPH_C64(0x2CDA6B42034B701B), - SPH_C64(0xB03D37C181CB096D), SPH_C64(0xE108438266C71C6F), - SPH_C64(0x2B3180C7EB51B255), SPH_C64(0xDF92B82F96C08BBC), - SPH_C64(0x5C68C8C0A632F3BA), SPH_C64(0x5504CC861C3D0556), - SPH_C64(0xABBFA4E55FB26B8F), SPH_C64(0x41848B0AB3BACEB4), - SPH_C64(0xB334A273AA445D32), SPH_C64(0xBCA696F0A85AD881), - SPH_C64(0x24F6EC65B528D56C), SPH_C64(0x0CE1512E90F4524A), - SPH_C64(0x4E9DD79D5506D35A), SPH_C64(0x258905FAC6CE9779), - SPH_C64(0x2019295B3E109B33), SPH_C64(0xF8A9478B73A054CC), - SPH_C64(0x2924F2F934417EB0), SPH_C64(0x3993357D536D1BC4), - SPH_C64(0x38A81AC21DB6FF8B), 
SPH_C64(0x47C4FBF17D6016BF), - SPH_C64(0x1E0FAADD7667E3F5), SPH_C64(0x7ABCFF62938BEB96), - SPH_C64(0xA78DAD948FC179C9), SPH_C64(0x8F1F98B72911E50D), - SPH_C64(0x61E48EAE27121A91), SPH_C64(0x4D62F7AD31859808), - SPH_C64(0xECEBA345EF5CEAEB), SPH_C64(0xF5CEB25EBC9684CE), - SPH_C64(0xF633E20CB7F76221), SPH_C64(0xA32CDF06AB8293E4), - SPH_C64(0x985A202CA5EE2CA4), SPH_C64(0xCF0B8447CC8A8FB1), - SPH_C64(0x9F765244979859A3), SPH_C64(0xA8D516B1A1240017), - SPH_C64(0x0BD7BA3EBB5DC726), SPH_C64(0xE54BCA55B86ADB39), - SPH_C64(0x1D7A3AFD6C478063), SPH_C64(0x519EC608E7669EDD), - SPH_C64(0x0E5715A2D149AA23), SPH_C64(0x177D4571848FF194), - SPH_C64(0xEEB55F3241014C22), SPH_C64(0x0F5E5CA13A6E2EC2), - SPH_C64(0x8029927B75F5C361), SPH_C64(0xAD139FABC3D6E436), - SPH_C64(0x0D5DF1A94CCF402F), SPH_C64(0x3E8BD948BEA5DFC8), - SPH_C64(0xA5A0D357BD3FF77E), SPH_C64(0xA2D12E251F74F645), - SPH_C64(0x66FD9E525E81A082), SPH_C64(0x2E0C90CE7F687A49), - SPH_C64(0xC2E8BCBEBA973BC5), SPH_C64(0x000001BCE509745F), - SPH_C64(0x423777BBE6DAB3D6), SPH_C64(0xD1661C7EAEF06EB5), - SPH_C64(0xA1781F354DAACFD8), SPH_C64(0x2D11284A2B16AFFC), - SPH_C64(0xF1FC4F67FA891D1F), SPH_C64(0x73ECC25DCB920ADA), - SPH_C64(0xAE610C22C2A12651), SPH_C64(0x96E0A810D356B78A), - SPH_C64(0x5A9A381F2FE7870F), SPH_C64(0xD5AD62EDE94E5530), - SPH_C64(0xD225E5E8368D1427), SPH_C64(0x65977B70C7AF4631), - SPH_C64(0x99F889B2DE39D74F), SPH_C64(0x233F30BF54E1D143), - SPH_C64(0x9A9675D3D9A63C97), SPH_C64(0x5470554FF334F9A8), - SPH_C64(0x166ACB744A4F5688), SPH_C64(0x70C74CAAB2E4AEAD), - SPH_C64(0xF0D091646F294D12), SPH_C64(0x57B82A89684031D1), - SPH_C64(0xEFD95A5A61BE0B6B), SPH_C64(0x2FBD12E969F2F29A), - SPH_C64(0x9BD37013FEFF9FE8), SPH_C64(0x3F9B0404D6085A06), - SPH_C64(0x4940C1F3166CFE15), SPH_C64(0x09542C4DCDF3DEFB), - SPH_C64(0xB4C5218385CD5CE3), SPH_C64(0xC935B7DC4462A641), - SPH_C64(0x3417F8A68ED3B63F), SPH_C64(0xB80959295B215B40), - SPH_C64(0xF99CDAEF3B8C8572), SPH_C64(0x018C0614F8FCB95D), - SPH_C64(0x1B14ACCD1A3ACDF3), 
SPH_C64(0x84D471F200BB732D), - SPH_C64(0xC1A3110E95E8DA16), SPH_C64(0x430A7220BF1A82B8), - SPH_C64(0xB77E090D39DF210E), SPH_C64(0x5EF4BD9F3CD05E9D), - SPH_C64(0x9D4FF6DA7E57A444), SPH_C64(0xDA1D60E183D4A5F8), - SPH_C64(0xB287C38417998E47), SPH_C64(0xFE3EDC121BB31886), - SPH_C64(0xC7FE3CCC980CCBEF), SPH_C64(0xE46FB590189BFD03), - SPH_C64(0x3732FD469A4C57DC), SPH_C64(0x7EF700A07CF1AD65), - SPH_C64(0x59C64468A31D8859), SPH_C64(0x762FB0B4D45B61F6), - SPH_C64(0x155BAED099047718), SPH_C64(0x68755E4C3D50BAA6), - SPH_C64(0xE9214E7F22D8B4DF), SPH_C64(0x2ADDBF532EAC95F4), - SPH_C64(0x32AE3909B4BD0109), SPH_C64(0x834DF537B08E3450), - SPH_C64(0xFA209DA84220728D), SPH_C64(0x9E691D9B9EFE23F7), - SPH_C64(0x0446D288C4AE8D7F), SPH_C64(0x7B4CC524E169785B), - SPH_C64(0x21D87F0135CA1385), SPH_C64(0xCEBB400F137B8AA5), - SPH_C64(0x272E2B66580796BE), SPH_C64(0x3612264125C2B0DE), - SPH_C64(0x057702BDAD1EFBB2), SPH_C64(0xD4BABB8EACF84BE9), - SPH_C64(0x91583139641BC67B), SPH_C64(0x8BDC2DE08036E024), - SPH_C64(0x603C8156F49F68ED), SPH_C64(0xF7D236F7DBEF5111), - SPH_C64(0x9727C4598AD21E80), SPH_C64(0xA08A0896670A5FD7), - SPH_C64(0xCB4A8F4309EBA9CB), SPH_C64(0x81AF564B0F7036A1), - SPH_C64(0xC0B99AA778199ABD), SPH_C64(0x959F1EC83FC8E952), - SPH_C64(0x8C505077794A81B9), SPH_C64(0x3ACAAF8F056338F0), - SPH_C64(0x07B43F50627A6778), SPH_C64(0x4A44AB49F5ECCC77), - SPH_C64(0x3BC3D6E4B679EE98), SPH_C64(0x9CC0D4D1CF14108C), - SPH_C64(0x4406C00B206BC8A0), SPH_C64(0x82A18854C8D72D89), - SPH_C64(0x67E366B35C3C432C), SPH_C64(0xB923DD61102B37F2), - SPH_C64(0x56AB2779D884271D), SPH_C64(0xBE83E1B0FF1525AF), - SPH_C64(0xFB7C65D4217E49A9), SPH_C64(0x6BDBE0E76D48E7D4), - SPH_C64(0x08DF828745D9179E), SPH_C64(0x22EA6A9ADD53BD34), - SPH_C64(0xE36E141C5622200A), SPH_C64(0x7F805D1B8CB750EE), - SPH_C64(0xAFE5C7A59F58E837), SPH_C64(0xE27F996A4FB1C23C), - SPH_C64(0xD3867DFB0775F0D0), SPH_C64(0xD0E673DE6E88891A), - SPH_C64(0x123AEB9EAFB86C25), SPH_C64(0x30F1D5D5C145B895), - SPH_C64(0xBB434A2DEE7269E7), 
SPH_C64(0x78CB67ECF931FA38), - SPH_C64(0xF33B0372323BBF9C), SPH_C64(0x52D66336FB279C74), - SPH_C64(0x505F33AC0AFB4EAA), SPH_C64(0xE8A5CD99A2CCE187), - SPH_C64(0x534974801E2D30BB), SPH_C64(0x8D2D5711D5876D90), - SPH_C64(0x1F1A412891BC038E), SPH_C64(0xD6E2E71D82E56648), - SPH_C64(0x74036C3A497732B7), SPH_C64(0x89B67ED96361F5AB), - SPH_C64(0xFFED95D8F1EA02A2), SPH_C64(0xE72B3BD61464D43D), - SPH_C64(0xA6300F170BDC4820), SPH_C64(0xEBC18760ED78A77A), -}; - -static const sph_u64 T2[256] = { - SPH_C64(0xE6A6BE5A05A12138), SPH_C64(0xB5A122A5B4F87C98), - SPH_C64(0x563C6089140B6990), SPH_C64(0x4C46CB2E391F5DD5), - SPH_C64(0xD932ADDBC9B79434), SPH_C64(0x08EA70E42015AFF5), - SPH_C64(0xD765A6673E478CF1), SPH_C64(0xC4FB757EAB278D99), - SPH_C64(0xDF11C6862D6E0692), SPH_C64(0xDDEB84F10D7F3B16), - SPH_C64(0x6F2EF604A665EA04), SPH_C64(0x4A8E0F0FF0E0DFB3), - SPH_C64(0xA5EDEEF83DBCBA51), SPH_C64(0xFC4F0A2A0EA4371E), - SPH_C64(0xE83E1DA85CB38429), SPH_C64(0xDC8FF882BA1B1CE2), - SPH_C64(0xCD45505E8353E80D), SPH_C64(0x18D19A00D4DB0717), - SPH_C64(0x34A0CFEDA5F38101), SPH_C64(0x0BE77E518887CAF2), - SPH_C64(0x1E341438B3C45136), SPH_C64(0xE05797F49089CCF9), - SPH_C64(0xFFD23F9DF2591D14), SPH_C64(0x543DDA228595C5CD), - SPH_C64(0x661F81FD99052A33), SPH_C64(0x8736E641DB0F7B76), - SPH_C64(0x15227725418E5307), SPH_C64(0xE25F7F46162EB2FA), - SPH_C64(0x48A8B2126C13D9FE), SPH_C64(0xAFDC541792E76EEA), - SPH_C64(0x03D912BFC6D1898F), SPH_C64(0x31B1AAFA1B83F51B), - SPH_C64(0xF1AC2796E42AB7D9), SPH_C64(0x40A3A7D7FCD2EBAC), - SPH_C64(0x1056136D0AFBBCC5), SPH_C64(0x7889E1DD9A6D0C85), - SPH_C64(0xD33525782A7974AA), SPH_C64(0xA7E25D09078AC09B), - SPH_C64(0xBD4138B3EAC6EDD0), SPH_C64(0x920ABFBE71EB9E70), - SPH_C64(0xA2A5D0F54FC2625C), SPH_C64(0xC054E36B0B1290A3), - SPH_C64(0xF6DD59FF62FE932B), SPH_C64(0x3537354511A8AC7D), - SPH_C64(0xCA845E9172FADCD4), SPH_C64(0x84F82B60329D20DC), - SPH_C64(0x79C62CE1CD672F18), SPH_C64(0x8B09A2ADD124642C), - SPH_C64(0xD0C1E96A19D9E726), SPH_C64(0x5A786A9B4BA9500C), - 
SPH_C64(0x0E020336634C43F3), SPH_C64(0xC17B474AEB66D822), - SPH_C64(0x6A731AE3EC9BAAC2), SPH_C64(0x8226667AE0840258), - SPH_C64(0x67D4567691CAECA5), SPH_C64(0x1D94155C4875ADB5), - SPH_C64(0x6D00FD985B813FDF), SPH_C64(0x51286EFCB774CD06), - SPH_C64(0x5E8834471FA744AF), SPH_C64(0xF72CA0AEE761AE2E), - SPH_C64(0xBE40E4CDAEE8E09A), SPH_C64(0xE9970BBB5118F665), - SPH_C64(0x726E4BEB33DF1964), SPH_C64(0x703B000729199762), - SPH_C64(0x4631D816F5EF30A7), SPH_C64(0xB880B5B51504A6BE), - SPH_C64(0x641793C37ED84B6C), SPH_C64(0x7B21ED77F6E97D96), - SPH_C64(0x776306312EF96B73), SPH_C64(0xAE528948E86FF3F4), - SPH_C64(0x53DBD7F286A3F8F8), SPH_C64(0x16CADCE74CFC1063), - SPH_C64(0x005C19BDFA52C6DD), SPH_C64(0x68868F5D64D46AD3), - SPH_C64(0x3A9D512CCF1E186A), SPH_C64(0x367E62C2385660AE), - SPH_C64(0xE359E7EA77DCB1D7), SPH_C64(0x526C0773749ABE6E), - SPH_C64(0x735AE5F9D09F734B), SPH_C64(0x493FC7CC8A558BA8), - SPH_C64(0xB0B9C1533041AB45), SPH_C64(0x321958BA470A59BD), - SPH_C64(0x852DB00B5F46C393), SPH_C64(0x91209B2BD336B0E5), - SPH_C64(0x6E604F7D659EF19F), SPH_C64(0xB99A8AE2782CCB24), - SPH_C64(0xCCF52AB6C814C4C7), SPH_C64(0x4727D9AFBE11727B), - SPH_C64(0x7E950D0C0121B34D), SPH_C64(0x756F435670AD471F), - SPH_C64(0xF5ADD442615A6849), SPH_C64(0x4E87E09980B9957A), - SPH_C64(0x2ACFA1DF50AEE355), SPH_C64(0xD898263AFD2FD556), - SPH_C64(0xC8F4924DD80C8FD6), SPH_C64(0xCF99CA3D754A173A), - SPH_C64(0xFE477BACAF91BF3C), SPH_C64(0xED5371F6D690C12D), - SPH_C64(0x831A5C285E687094), SPH_C64(0xC5D3C90A3708A0A4), - SPH_C64(0x0F7F903717D06580), SPH_C64(0x19F9BB13B8FDF27F), - SPH_C64(0xB1BD6F1B4D502843), SPH_C64(0x1C761BA38FFF4012), - SPH_C64(0x0D1530C4E2E21F3B), SPH_C64(0x8943CE69A7372C8A), - SPH_C64(0xE5184E11FEB5CE66), SPH_C64(0x618BDB80BD736621), - SPH_C64(0x7D29BAD68B574D0B), SPH_C64(0x81BB613E25E6FE5B), - SPH_C64(0x071C9C10BC07913F), SPH_C64(0xC7BEEB7909AC2D97), - SPH_C64(0xC3E58D353BC5D757), SPH_C64(0xEB017892F38F61E8), - SPH_C64(0xD4EFFB9C9B1CC21A), SPH_C64(0x99727D26F494F7AB), - 
SPH_C64(0xA3E063A2956B3E03), SPH_C64(0x9D4A8B9A4AA09C30), - SPH_C64(0x3F6AB7D500090FB4), SPH_C64(0x9CC0F2A057268AC0), - SPH_C64(0x3DEE9D2DEDBF42D1), SPH_C64(0x330F49C87960A972), - SPH_C64(0xC6B2720287421B41), SPH_C64(0x0AC59EC07C00369C), - SPH_C64(0xEF4EAC49CB353425), SPH_C64(0xF450244EEF0129D8), - SPH_C64(0x8ACC46E5CAF4DEB6), SPH_C64(0x2FFEAB63989263F7), - SPH_C64(0x8F7CB9FE5D7A4578), SPH_C64(0x5BD8F7644E634635), - SPH_C64(0x427A7315BF2DC900), SPH_C64(0x17D0C4AA2125261C), - SPH_C64(0x3992486C93518E50), SPH_C64(0xB4CBFEE0A2D7D4C3), - SPH_C64(0x7C75D6202C5DDD8D), SPH_C64(0xDBC295D8E35B6C61), - SPH_C64(0x60B369D302032B19), SPH_C64(0xCE42685FDCE44132), - SPH_C64(0x06F3DDB9DDF65610), SPH_C64(0x8EA4D21DB5E148F0), - SPH_C64(0x20B0FCE62FCD496F), SPH_C64(0x2C1B912358B0EE31), - SPH_C64(0xB28317B818F5A308), SPH_C64(0xA89C1E189CA6D2CF), - SPH_C64(0x0C6B18576AAADBC8), SPH_C64(0xB65DEAA91299FAE3), - SPH_C64(0xFB2B794B7F1027E7), SPH_C64(0x04E4317F443B5BEB), - SPH_C64(0x4B852D325939D0A6), SPH_C64(0xD5AE6BEEFB207FFC), - SPH_C64(0x309682B281C7D374), SPH_C64(0xBAE309A194C3B475), - SPH_C64(0x8CC3F97B13B49F05), SPH_C64(0x98A9422FF8293967), - SPH_C64(0x244B16B01076FF7C), SPH_C64(0xF8BF571C663D67EE), - SPH_C64(0x1F0D6758EEE30DA1), SPH_C64(0xC9B611D97ADEB9B7), - SPH_C64(0xB7AFD5887B6C57A2), SPH_C64(0x6290AE846B984FE1), - SPH_C64(0x94DF4CDEACC1A5FD), SPH_C64(0x058A5BD1C5483AFF), - SPH_C64(0x63166CC142BA3C37), SPH_C64(0x8DB8526EB2F76F40), - SPH_C64(0xE10880036F0D6D4E), SPH_C64(0x9E0523C9971D311D), - SPH_C64(0x45EC2824CC7CD691), SPH_C64(0x575B8359E62382C9), - SPH_C64(0xFA9E400DC4889995), SPH_C64(0xD1823ECB45721568), - SPH_C64(0xDAFD983B8206082F), SPH_C64(0xAA7D29082386A8CB), - SPH_C64(0x269FCD4403B87588), SPH_C64(0x1B91F5F728BDD1E0), - SPH_C64(0xE4669F39040201F6), SPH_C64(0x7A1D7C218CF04ADE), - SPH_C64(0x65623C29D79CE5CE), SPH_C64(0x2368449096C00BB1), - SPH_C64(0xAB9BF1879DA503BA), SPH_C64(0xBC23ECB1A458058E), - SPH_C64(0x9A58DF01BB401ECC), SPH_C64(0xA070E868A85F143D), - 
SPH_C64(0x4FF188307DF2239E), SPH_C64(0x14D565B41A641183), - SPH_C64(0xEE13337452701602), SPH_C64(0x950E3DCF3F285E09), - SPH_C64(0x59930254B9C80953), SPH_C64(0x3BF299408930DA6D), - SPH_C64(0xA955943F53691387), SPH_C64(0xA15EDECAA9CB8784), - SPH_C64(0x29142127352BE9A0), SPH_C64(0x76F0371FFF4E7AFB), - SPH_C64(0x0239F450274F2228), SPH_C64(0xBB073AF01D5E868B), - SPH_C64(0xBFC80571C10E96C1), SPH_C64(0xD267088568222E23), - SPH_C64(0x9671A3D48E80B5B0), SPH_C64(0x55B5D38AE193BB81), - SPH_C64(0x693AE2D0A18B04B8), SPH_C64(0x5C48B4ECADD5335F), - SPH_C64(0xFD743B194916A1CA), SPH_C64(0x2577018134BE98C4), - SPH_C64(0xE77987E83C54A4AD), SPH_C64(0x28E11014DA33E1B9), - SPH_C64(0x270CC59E226AA213), SPH_C64(0x71495F756D1A5F60), - SPH_C64(0x9BE853FB60AFEF77), SPH_C64(0xADC786A7F7443DBF), - SPH_C64(0x0904456173B29A82), SPH_C64(0x58BC7A66C232BD5E), - SPH_C64(0xF306558C673AC8B2), SPH_C64(0x41F639C6B6C9772A), - SPH_C64(0x216DEFE99FDA35DA), SPH_C64(0x11640CC71C7BE615), - SPH_C64(0x93C43694565C5527), SPH_C64(0xEA038E6246777839), - SPH_C64(0xF9ABF3CE5A3E2469), SPH_C64(0x741E768D0FD312D2), - SPH_C64(0x0144B883CED652C6), SPH_C64(0xC20B5A5BA33F8552), - SPH_C64(0x1AE69633C3435A9D), SPH_C64(0x97A28CA4088CFDEC), - SPH_C64(0x8824A43C1E96F420), SPH_C64(0x37612FA66EEEA746), - SPH_C64(0x6B4CB165F9CF0E5A), SPH_C64(0x43AA1C06A0ABFB4A), - SPH_C64(0x7F4DC26FF162796B), SPH_C64(0x6CBACC8E54ED9B0F), - SPH_C64(0xA6B7FFEFD2BB253E), SPH_C64(0x2E25BC95B0A29D4F), - SPH_C64(0x86D6A58BDEF1388C), SPH_C64(0xDED74AC576B6F054), - SPH_C64(0x8030BDBC2B45805D), SPH_C64(0x3C81AF70E94D9289), - SPH_C64(0x3EFF6DDA9E3100DB), SPH_C64(0xB38DC39FDFCC8847), - SPH_C64(0x123885528D17B87E), SPH_C64(0xF2DA0ED240B1B642), - SPH_C64(0x44CEFADCD54BF9A9), SPH_C64(0x1312200E433C7EE6), - SPH_C64(0x9FFCC84F3A78C748), SPH_C64(0xF0CD1F72248576BB), - SPH_C64(0xEC6974053638CFE4), SPH_C64(0x2BA7B67C0CEC4E4C), - SPH_C64(0xAC2F4DF3E5CE32ED), SPH_C64(0xCB33D14326EA4C11), - SPH_C64(0xA4E9044CC77E58BC), SPH_C64(0x5F513293D934FCEF), - 
SPH_C64(0x5DC9645506E55444), SPH_C64(0x50DE418F317DE40A), - SPH_C64(0x388CB31A69DDE259), SPH_C64(0x2DB4A83455820A86), - SPH_C64(0x9010A91E84711AE9), SPH_C64(0x4DF7F0B7B1498371), - SPH_C64(0xD62A2EABC0977179), SPH_C64(0x22FAC097AA8D5C0E), -}; - -static const sph_u64 T3[256] = { - SPH_C64(0xF49FCC2FF1DAF39B), SPH_C64(0x487FD5C66FF29281), - SPH_C64(0xE8A30667FCDCA83F), SPH_C64(0x2C9B4BE3D2FCCE63), - SPH_C64(0xDA3FF74B93FBBBC2), SPH_C64(0x2FA165D2FE70BA66), - SPH_C64(0xA103E279970E93D4), SPH_C64(0xBECDEC77B0E45E71), - SPH_C64(0xCFB41E723985E497), SPH_C64(0xB70AAA025EF75017), - SPH_C64(0xD42309F03840B8E0), SPH_C64(0x8EFC1AD035898579), - SPH_C64(0x96C6920BE2B2ABC5), SPH_C64(0x66AF4163375A9172), - SPH_C64(0x2174ABDCCA7127FB), SPH_C64(0xB33CCEA64A72FF41), - SPH_C64(0xF04A4933083066A5), SPH_C64(0x8D970ACDD7289AF5), - SPH_C64(0x8F96E8E031C8C25E), SPH_C64(0xF3FEC02276875D47), - SPH_C64(0xEC7BF310056190DD), SPH_C64(0xF5ADB0AEBB0F1491), - SPH_C64(0x9B50F8850FD58892), SPH_C64(0x4975488358B74DE8), - SPH_C64(0xA3354FF691531C61), SPH_C64(0x0702BBE481D2C6EE), - SPH_C64(0x89FB24057DEDED98), SPH_C64(0xAC3075138596E902), - SPH_C64(0x1D2D3580172772ED), SPH_C64(0xEB738FC28E6BC30D), - SPH_C64(0x5854EF8F63044326), SPH_C64(0x9E5C52325ADD3BBE), - SPH_C64(0x90AA53CF325C4623), SPH_C64(0xC1D24D51349DD067), - SPH_C64(0x2051CFEEA69EA624), SPH_C64(0x13220F0A862E7E4F), - SPH_C64(0xCE39399404E04864), SPH_C64(0xD9C42CA47086FCB7), - SPH_C64(0x685AD2238A03E7CC), SPH_C64(0x066484B2AB2FF1DB), - SPH_C64(0xFE9D5D70EFBF79EC), SPH_C64(0x5B13B9DD9C481854), - SPH_C64(0x15F0D475ED1509AD), SPH_C64(0x0BEBCD060EC79851), - SPH_C64(0xD58C6791183AB7F8), SPH_C64(0xD1187C5052F3EEE4), - SPH_C64(0xC95D1192E54E82FF), SPH_C64(0x86EEA14CB9AC6CA2), - SPH_C64(0x3485BEB153677D5D), SPH_C64(0xDD191D781F8C492A), - SPH_C64(0xF60866BAA784EBF9), SPH_C64(0x518F643BA2D08C74), - SPH_C64(0x8852E956E1087C22), SPH_C64(0xA768CB8DC410AE8D), - SPH_C64(0x38047726BFEC8E1A), SPH_C64(0xA67738B4CD3B45AA), - SPH_C64(0xAD16691CEC0DDE19), 
SPH_C64(0xC6D4319380462E07), - SPH_C64(0xC5A5876D0BA61938), SPH_C64(0x16B9FA1FA58FD840), - SPH_C64(0x188AB1173CA74F18), SPH_C64(0xABDA2F98C99C021F), - SPH_C64(0x3E0580AB134AE816), SPH_C64(0x5F3B05B773645ABB), - SPH_C64(0x2501A2BE5575F2F6), SPH_C64(0x1B2F74004E7E8BA9), - SPH_C64(0x1CD7580371E8D953), SPH_C64(0x7F6ED89562764E30), - SPH_C64(0xB15926FF596F003D), SPH_C64(0x9F65293DA8C5D6B9), - SPH_C64(0x6ECEF04DD690F84C), SPH_C64(0x4782275FFF33AF88), - SPH_C64(0xE41433083F820801), SPH_C64(0xFD0DFE409A1AF9B5), - SPH_C64(0x4325A3342CDB396B), SPH_C64(0x8AE77E62B301B252), - SPH_C64(0xC36F9E9F6655615A), SPH_C64(0x85455A2D92D32C09), - SPH_C64(0xF2C7DEA949477485), SPH_C64(0x63CFB4C133A39EBA), - SPH_C64(0x83B040CC6EBC5462), SPH_C64(0x3B9454C8FDB326B0), - SPH_C64(0x56F56A9E87FFD78C), SPH_C64(0x2DC2940D99F42BC6), - SPH_C64(0x98F7DF096B096E2D), SPH_C64(0x19A6E01E3AD852BF), - SPH_C64(0x42A99CCBDBD4B40B), SPH_C64(0xA59998AF45E9C559), - SPH_C64(0x366295E807D93186), SPH_C64(0x6B48181BFAA1F773), - SPH_C64(0x1FEC57E2157A0A1D), SPH_C64(0x4667446AF6201AD5), - SPH_C64(0xE615EBCACFB0F075), SPH_C64(0xB8F31F4F68290778), - SPH_C64(0x22713ED6CE22D11E), SPH_C64(0x3057C1A72EC3C93B), - SPH_C64(0xCB46ACC37C3F1F2F), SPH_C64(0xDBB893FD02AAF50E), - SPH_C64(0x331FD92E600B9FCF), SPH_C64(0xA498F96148EA3AD6), - SPH_C64(0xA8D8426E8B6A83EA), SPH_C64(0xA089B274B7735CDC), - SPH_C64(0x87F6B3731E524A11), SPH_C64(0x118808E5CBC96749), - SPH_C64(0x9906E4C7B19BD394), SPH_C64(0xAFED7F7E9B24A20C), - SPH_C64(0x6509EADEEB3644A7), SPH_C64(0x6C1EF1D3E8EF0EDE), - SPH_C64(0xB9C97D43E9798FB4), SPH_C64(0xA2F2D784740C28A3), - SPH_C64(0x7B8496476197566F), SPH_C64(0x7A5BE3E6B65F069D), - SPH_C64(0xF96330ED78BE6F10), SPH_C64(0xEEE60DE77A076A15), - SPH_C64(0x2B4BEE4AA08B9BD0), SPH_C64(0x6A56A63EC7B8894E), - SPH_C64(0x02121359BA34FEF4), SPH_C64(0x4CBF99F8283703FC), - SPH_C64(0x398071350CAF30C8), SPH_C64(0xD0A77A89F017687A), - SPH_C64(0xF1C1A9EB9E423569), SPH_C64(0x8C7976282DEE8199), - SPH_C64(0x5D1737A5DD1F7ABD), 
SPH_C64(0x4F53433C09A9FA80), - SPH_C64(0xFA8B0C53DF7CA1D9), SPH_C64(0x3FD9DCBC886CCB77), - SPH_C64(0xC040917CA91B4720), SPH_C64(0x7DD00142F9D1DCDF), - SPH_C64(0x8476FC1D4F387B58), SPH_C64(0x23F8E7C5F3316503), - SPH_C64(0x032A2244E7E37339), SPH_C64(0x5C87A5D750F5A74B), - SPH_C64(0x082B4CC43698992E), SPH_C64(0xDF917BECB858F63C), - SPH_C64(0x3270B8FC5BF86DDA), SPH_C64(0x10AE72BB29B5DD76), - SPH_C64(0x576AC94E7700362B), SPH_C64(0x1AD112DAC61EFB8F), - SPH_C64(0x691BC30EC5FAA427), SPH_C64(0xFF246311CC327143), - SPH_C64(0x3142368E30E53206), SPH_C64(0x71380E31E02CA396), - SPH_C64(0x958D5C960AAD76F1), SPH_C64(0xF8D6F430C16DA536), - SPH_C64(0xC8FFD13F1BE7E1D2), SPH_C64(0x7578AE66004DDBE1), - SPH_C64(0x05833F01067BE646), SPH_C64(0xBB34B5AD3BFE586D), - SPH_C64(0x095F34C9A12B97F0), SPH_C64(0x247AB64525D60CA8), - SPH_C64(0xDCDBC6F3017477D1), SPH_C64(0x4A2E14D4DECAD24D), - SPH_C64(0xBDB5E6D9BE0A1EEB), SPH_C64(0x2A7E70F7794301AB), - SPH_C64(0xDEF42D8A270540FD), SPH_C64(0x01078EC0A34C22C1), - SPH_C64(0xE5DE511AF4C16387), SPH_C64(0x7EBB3A52BD9A330A), - SPH_C64(0x77697857AA7D6435), SPH_C64(0x004E831603AE4C32), - SPH_C64(0xE7A21020AD78E312), SPH_C64(0x9D41A70C6AB420F2), - SPH_C64(0x28E06C18EA1141E6), SPH_C64(0xD2B28CBD984F6B28), - SPH_C64(0x26B75F6C446E9D83), SPH_C64(0xBA47568C4D418D7F), - SPH_C64(0xD80BADBFE6183D8E), SPH_C64(0x0E206D7F5F166044), - SPH_C64(0xE258A43911CBCA3E), SPH_C64(0x723A1746B21DC0BC), - SPH_C64(0xC7CAA854F5D7CDD3), SPH_C64(0x7CAC32883D261D9C), - SPH_C64(0x7690C26423BA942C), SPH_C64(0x17E55524478042B8), - SPH_C64(0xE0BE477656A2389F), SPH_C64(0x4D289B5E67AB2DA0), - SPH_C64(0x44862B9C8FBBFD31), SPH_C64(0xB47CC8049D141365), - SPH_C64(0x822C1B362B91C793), SPH_C64(0x4EB14655FB13DFD8), - SPH_C64(0x1ECBBA0714E2A97B), SPH_C64(0x6143459D5CDE5F14), - SPH_C64(0x53A8FBF1D5F0AC89), SPH_C64(0x97EA04D81C5E5B00), - SPH_C64(0x622181A8D4FDB3F3), SPH_C64(0xE9BCD341572A1208), - SPH_C64(0x1411258643CCE58A), SPH_C64(0x9144C5FEA4C6E0A4), - SPH_C64(0x0D33D06565CF620F), 
SPH_C64(0x54A48D489F219CA1), - SPH_C64(0xC43E5EAC6D63C821), SPH_C64(0xA9728B3A72770DAF), - SPH_C64(0xD7934E7B20DF87EF), SPH_C64(0xE35503B61A3E86E5), - SPH_C64(0xCAE321FBC819D504), SPH_C64(0x129A50B3AC60BFA6), - SPH_C64(0xCD5E68EA7E9FB6C3), SPH_C64(0xB01C90199483B1C7), - SPH_C64(0x3DE93CD5C295376C), SPH_C64(0xAED52EDF2AB9AD13), - SPH_C64(0x2E60F512C0A07884), SPH_C64(0xBC3D86A3E36210C9), - SPH_C64(0x35269D9B163951CE), SPH_C64(0x0C7D6E2AD0CDB5FA), - SPH_C64(0x59E86297D87F5733), SPH_C64(0x298EF221898DB0E7), - SPH_C64(0x55000029D1A5AA7E), SPH_C64(0x8BC08AE1B5061B45), - SPH_C64(0xC2C31C2B6C92703A), SPH_C64(0x94CC596BAF25EF42), - SPH_C64(0x0A1D73DB22540456), SPH_C64(0x04B6A0F9D9C4179A), - SPH_C64(0xEFFDAFA2AE3D3C60), SPH_C64(0xF7C8075BB49496C4), - SPH_C64(0x9CC5C7141D1CD4E3), SPH_C64(0x78BD1638218E5534), - SPH_C64(0xB2F11568F850246A), SPH_C64(0xEDFABCFA9502BC29), - SPH_C64(0x796CE5F2DA23051B), SPH_C64(0xAAE128B0DC93537C), - SPH_C64(0x3A493DA0EE4B29AE), SPH_C64(0xB5DF6B2C416895D7), - SPH_C64(0xFCABBD25122D7F37), SPH_C64(0x70810B58105DC4B1), - SPH_C64(0xE10FDD37F7882A90), SPH_C64(0x524DCAB5518A3F5C), - SPH_C64(0x3C9E85878451255B), SPH_C64(0x4029828119BD34E2), - SPH_C64(0x74A05B6F5D3CECCB), SPH_C64(0xB610021542E13ECA), - SPH_C64(0x0FF979D12F59E2AC), SPH_C64(0x6037DA27E4F9CC50), - SPH_C64(0x5E92975A0DF1847D), SPH_C64(0xD66DE190D3E623FE), - SPH_C64(0x5032D6B87B568048), SPH_C64(0x9A36B7CE8235216E), - SPH_C64(0x80272A7A24F64B4A), SPH_C64(0x93EFED8B8C6916F7), - SPH_C64(0x37DDBFF44CCE1555), SPH_C64(0x4B95DB5D4B99BD25), - SPH_C64(0x92D3FDA169812FC0), SPH_C64(0xFB1A4A9A90660BB6), - SPH_C64(0x730C196946A4B9B2), SPH_C64(0x81E289AA7F49DA68), - SPH_C64(0x64669A0F83B1A05F), SPH_C64(0x27B3FF7D9644F48B), - SPH_C64(0xCC6B615C8DB675B3), SPH_C64(0x674F20B9BCEBBE95), - SPH_C64(0x6F31238275655982), SPH_C64(0x5AE488713E45CF05), - SPH_C64(0xBF619F9954C21157), SPH_C64(0xEABAC46040A8EAE9), - SPH_C64(0x454C6FE9F2C0C1CD), SPH_C64(0x419CF6496412691C), - SPH_C64(0xD3DC3BEF265B0F70), 
SPH_C64(0x6D0E60F5C3578A9E), -}; - -static const sph_u64 T4[256] = { - SPH_C64(0x5B0E608526323C55), SPH_C64(0x1A46C1A9FA1B59F5), - SPH_C64(0xA9E245A17C4C8FFA), SPH_C64(0x65CA5159DB2955D7), - SPH_C64(0x05DB0A76CE35AFC2), SPH_C64(0x81EAC77EA9113D45), - SPH_C64(0x528EF88AB6AC0A0D), SPH_C64(0xA09EA253597BE3FF), - SPH_C64(0x430DDFB3AC48CD56), SPH_C64(0xC4B3A67AF45CE46F), - SPH_C64(0x4ECECFD8FBE2D05E), SPH_C64(0x3EF56F10B39935F0), - SPH_C64(0x0B22D6829CD619C6), SPH_C64(0x17FD460A74DF2069), - SPH_C64(0x6CF8CC8E8510ED40), SPH_C64(0xD6C824BF3A6ECAA7), - SPH_C64(0x61243D581A817049), SPH_C64(0x048BACB6BBC163A2), - SPH_C64(0xD9A38AC27D44CC32), SPH_C64(0x7FDDFF5BAAF410AB), - SPH_C64(0xAD6D495AA804824B), SPH_C64(0xE1A6A74F2D8C9F94), - SPH_C64(0xD4F7851235DEE8E3), SPH_C64(0xFD4B7F886540D893), - SPH_C64(0x247C20042AA4BFDA), SPH_C64(0x096EA1C517D1327C), - SPH_C64(0xD56966B4361A6685), SPH_C64(0x277DA5C31221057D), - SPH_C64(0x94D59893A43ACFF7), SPH_C64(0x64F0C51CCDC02281), - SPH_C64(0x3D33BCC4FF6189DB), SPH_C64(0xE005CB184CE66AF1), - SPH_C64(0xFF5CCD1D1DB99BEA), SPH_C64(0xB0B854A7FE42980F), - SPH_C64(0x7BD46A6A718D4B9F), SPH_C64(0xD10FA8CC22A5FD8C), - SPH_C64(0xD31484952BE4BD31), SPH_C64(0xC7FA975FCB243847), - SPH_C64(0x4886ED1E5846C407), SPH_C64(0x28CDDB791EB70B04), - SPH_C64(0xC2B00BE2F573417F), SPH_C64(0x5C9590452180F877), - SPH_C64(0x7A6BDDFFF370EB00), SPH_C64(0xCE509E38D6D9D6A4), - SPH_C64(0xEBEB0F00647FA702), SPH_C64(0x1DCC06CF76606F06), - SPH_C64(0xE4D9F28BA286FF0A), SPH_C64(0xD85A305DC918C262), - SPH_C64(0x475B1D8732225F54), SPH_C64(0x2D4FB51668CCB5FE), - SPH_C64(0xA679B9D9D72BBA20), SPH_C64(0x53841C0D912D43A5), - SPH_C64(0x3B7EAA48BF12A4E8), SPH_C64(0x781E0E47F22F1DDF), - SPH_C64(0xEFF20CE60AB50973), SPH_C64(0x20D261D19DFFB742), - SPH_C64(0x16A12B03062A2E39), SPH_C64(0x1960EB2239650495), - SPH_C64(0x251C16FED50EB8B8), SPH_C64(0x9AC0C330F826016E), - SPH_C64(0xED152665953E7671), SPH_C64(0x02D63194A6369570), - SPH_C64(0x5074F08394B1C987), SPH_C64(0x70BA598C90B25CE1), - 
SPH_C64(0x794A15810B9742F6), SPH_C64(0x0D5925E9FCAF8C6C), - SPH_C64(0x3067716CD868744E), SPH_C64(0x910AB077E8D7731B), - SPH_C64(0x6A61BBDB5AC42F61), SPH_C64(0x93513EFBF0851567), - SPH_C64(0xF494724B9E83E9D5), SPH_C64(0xE887E1985C09648D), - SPH_C64(0x34B1D3C675370CFD), SPH_C64(0xDC35E433BC0D255D), - SPH_C64(0xD0AAB84234131BE0), SPH_C64(0x08042A50B48B7EAF), - SPH_C64(0x9997C4EE44A3AB35), SPH_C64(0x829A7B49201799D0), - SPH_C64(0x263B8307B7C54441), SPH_C64(0x752F95F4FD6A6CA6), - SPH_C64(0x927217402C08C6E5), SPH_C64(0x2A8AB754A795D9EE), - SPH_C64(0xA442F7552F72943D), SPH_C64(0x2C31334E19781208), - SPH_C64(0x4FA98D7CEAEE6291), SPH_C64(0x55C3862F665DB309), - SPH_C64(0xBD0610175D53B1F3), SPH_C64(0x46FE6CB840413F27), - SPH_C64(0x3FE03792DF0CFA59), SPH_C64(0xCFE700372EB85E8F), - SPH_C64(0xA7BE29E7ADBCE118), SPH_C64(0xE544EE5CDE8431DD), - SPH_C64(0x8A781B1B41F1873E), SPH_C64(0xA5C94C78A0D2F0E7), - SPH_C64(0x39412E2877B60728), SPH_C64(0xA1265EF3AFC9A62C), - SPH_C64(0xBCC2770C6A2506C5), SPH_C64(0x3AB66DD5DCE1CE12), - SPH_C64(0xE65499D04A675B37), SPH_C64(0x7D8F523481BFD216), - SPH_C64(0x0F6F64FCEC15F389), SPH_C64(0x74EFBE618B5B13C8), - SPH_C64(0xACDC82B714273E1D), SPH_C64(0xDD40BFE003199D17), - SPH_C64(0x37E99257E7E061F8), SPH_C64(0xFA52626904775AAA), - SPH_C64(0x8BBBF63A463D56F9), SPH_C64(0xF0013F1543A26E64), - SPH_C64(0xA8307E9F879EC898), SPH_C64(0xCC4C27A4150177CC), - SPH_C64(0x1B432F2CCA1D3348), SPH_C64(0xDE1D1F8F9F6FA013), - SPH_C64(0x606602A047A7DDD6), SPH_C64(0xD237AB64CC1CB2C7), - SPH_C64(0x9B938E7225FCD1D3), SPH_C64(0xEC4E03708E0FF476), - SPH_C64(0xFEB2FBDA3D03C12D), SPH_C64(0xAE0BCED2EE43889A), - SPH_C64(0x22CB8923EBFB4F43), SPH_C64(0x69360D013CF7396D), - SPH_C64(0x855E3602D2D4E022), SPH_C64(0x073805BAD01F784C), - SPH_C64(0x33E17A133852F546), SPH_C64(0xDF4874058AC7B638), - SPH_C64(0xBA92B29C678AA14A), SPH_C64(0x0CE89FC76CFAADCD), - SPH_C64(0x5F9D4E0908339E34), SPH_C64(0xF1AFE9291F5923B9), - SPH_C64(0x6E3480F60F4A265F), SPH_C64(0xEEBF3A2AB29B841C), - 
SPH_C64(0xE21938A88F91B4AD), SPH_C64(0x57DFEFF845C6D3C3), - SPH_C64(0x2F006B0BF62CAAF2), SPH_C64(0x62F479EF6F75EE78), - SPH_C64(0x11A55AD41C8916A9), SPH_C64(0xF229D29084FED453), - SPH_C64(0x42F1C27B16B000E6), SPH_C64(0x2B1F76749823C074), - SPH_C64(0x4B76ECA3C2745360), SPH_C64(0x8C98F463B91691BD), - SPH_C64(0x14BCC93CF1ADE66A), SPH_C64(0x8885213E6D458397), - SPH_C64(0x8E177DF0274D4711), SPH_C64(0xB49B73B5503F2951), - SPH_C64(0x10168168C3F96B6B), SPH_C64(0x0E3D963B63CAB0AE), - SPH_C64(0x8DFC4B5655A1DB14), SPH_C64(0xF789F1356E14DE5C), - SPH_C64(0x683E68AF4E51DAC1), SPH_C64(0xC9A84F9D8D4B0FD9), - SPH_C64(0x3691E03F52A0F9D1), SPH_C64(0x5ED86E46E1878E80), - SPH_C64(0x3C711A0E99D07150), SPH_C64(0x5A0865B20C4E9310), - SPH_C64(0x56FBFC1FE4F0682E), SPH_C64(0xEA8D5DE3105EDF9B), - SPH_C64(0x71ABFDB12379187A), SPH_C64(0x2EB99DE1BEE77B9C), - SPH_C64(0x21ECC0EA33CF4523), SPH_C64(0x59A4D7521805C7A1), - SPH_C64(0x3896F5EB56AE7C72), SPH_C64(0xAA638F3DB18F75DC), - SPH_C64(0x9F39358DABE9808E), SPH_C64(0xB7DEFA91C00B72AC), - SPH_C64(0x6B5541FD62492D92), SPH_C64(0x6DC6DEE8F92E4D5B), - SPH_C64(0x353F57ABC4BEEA7E), SPH_C64(0x735769D6DA5690CE), - SPH_C64(0x0A234AA642391484), SPH_C64(0xF6F9508028F80D9D), - SPH_C64(0xB8E319A27AB3F215), SPH_C64(0x31AD9C1151341A4D), - SPH_C64(0x773C22A57BEF5805), SPH_C64(0x45C7561A07968633), - SPH_C64(0xF913DA9E249DBE36), SPH_C64(0xDA652D9B78A64C68), - SPH_C64(0x4C27A97F3BC334EF), SPH_C64(0x76621220E66B17F4), - SPH_C64(0x967743899ACD7D0B), SPH_C64(0xF3EE5BCAE0ED6782), - SPH_C64(0x409F753600C879FC), SPH_C64(0x06D09A39B5926DB6), - SPH_C64(0x6F83AEB0317AC588), SPH_C64(0x01E6CA4A86381F21), - SPH_C64(0x66FF3462D19F3025), SPH_C64(0x72207C24DDFD3BFB), - SPH_C64(0x4AF6B6D3E2ECE2EB), SPH_C64(0x9C994DBEC7EA08DE), - SPH_C64(0x49ACE597B09A8BC4), SPH_C64(0xB38C4766CF0797BA), - SPH_C64(0x131B9373C57C2A75), SPH_C64(0xB1822CCE61931E58), - SPH_C64(0x9D7555B909BA1C0C), SPH_C64(0x127FAFDD937D11D2), - SPH_C64(0x29DA3BADC66D92E4), SPH_C64(0xA2C1D57154C2ECBC), - 
SPH_C64(0x58C5134D82F6FE24), SPH_C64(0x1C3AE3515B62274F), - SPH_C64(0xE907C82E01CB8126), SPH_C64(0xF8ED091913E37FCB), - SPH_C64(0x3249D8F9C80046C9), SPH_C64(0x80CF9BEDE388FB63), - SPH_C64(0x1881539A116CF19E), SPH_C64(0x5103F3F76BD52457), - SPH_C64(0x15B7E6F5AE47F7A8), SPH_C64(0xDBD7C6DED47E9CCF), - SPH_C64(0x44E55C410228BB1A), SPH_C64(0xB647D4255EDB4E99), - SPH_C64(0x5D11882BB8AAFC30), SPH_C64(0xF5098BBB29D3212A), - SPH_C64(0x8FB5EA14E90296B3), SPH_C64(0x677B942157DD025A), - SPH_C64(0xFB58E7C0A390ACB5), SPH_C64(0x89D3674C83BD4A01), - SPH_C64(0x9E2DA4DF4BF3B93B), SPH_C64(0xFCC41E328CAB4829), - SPH_C64(0x03F38C96BA582C52), SPH_C64(0xCAD1BDBD7FD85DB2), - SPH_C64(0xBBB442C16082AE83), SPH_C64(0xB95FE86BA5DA9AB0), - SPH_C64(0xB22E04673771A93F), SPH_C64(0x845358C9493152D8), - SPH_C64(0xBE2A488697B4541E), SPH_C64(0x95A2DC2DD38E6966), - SPH_C64(0xC02C11AC923C852B), SPH_C64(0x2388B1990DF2A87B), - SPH_C64(0x7C8008FA1B4F37BE), SPH_C64(0x1F70D0C84D54E503), - SPH_C64(0x5490ADEC7ECE57D4), SPH_C64(0x002B3C27D9063A3A), - SPH_C64(0x7EAEA3848030A2BF), SPH_C64(0xC602326DED2003C0), - SPH_C64(0x83A7287D69A94086), SPH_C64(0xC57A5FCB30F57A8A), - SPH_C64(0xB56844E479EBE779), SPH_C64(0xA373B40F05DCBCE9), - SPH_C64(0xD71A786E88570EE2), SPH_C64(0x879CBACDBDE8F6A0), - SPH_C64(0x976AD1BCC164A32F), SPH_C64(0xAB21E25E9666D78B), - SPH_C64(0x901063AAE5E5C33C), SPH_C64(0x9818B34448698D90), - SPH_C64(0xE36487AE3E1E8ABB), SPH_C64(0xAFBDF931893BDCB4), - SPH_C64(0x6345A0DC5FBBD519), SPH_C64(0x8628FE269B9465CA), - SPH_C64(0x1E5D01603F9C51EC), SPH_C64(0x4DE44006A15049B7), - SPH_C64(0xBF6C70E5F776CBB1), SPH_C64(0x411218F2EF552BED), - SPH_C64(0xCB0C0708705A36A3), SPH_C64(0xE74D14754F986044), - SPH_C64(0xCD56D9430EA8280E), SPH_C64(0xC12591D7535F5065), - SPH_C64(0xC83223F1720AEF96), SPH_C64(0xC3A0396F7363A51F), -}; - -#define PASS(a, b, c, mul) do { \ - ROUND(a, b, c, X0, mul); \ - ROUND(b, c, a, X1, mul); \ - ROUND(c, a, b, X2, mul); \ - ROUND(a, b, c, X3, mul); \ - ROUND(b, c, a, X4, mul); \ - ROUND(c, a, 
b, X5, mul); \ - ROUND(a, b, c, X6, mul); \ - ROUND(b, c, a, X7, mul); \ - } while (0) - -#define ROUND(a, b, c, x, mul) do { \ - c ^= x; \ - a = SPH_T64(a - (T1[c & 0xFF] ^ T2[(c >> 16) & 0xFF] \ - ^ T3[(c >> 32) & 0xFF] ^ T4[(c >> 48) & 0xFF])); \ - b = SPH_T64(b + (T4[(c >> 8) & 0xFF] ^ T3[(c >> 24) & 0xFF] \ - ^ T2[(c >> 40) & 0xFF] ^ T1[(c >> 56) & 0xFF])); \ - b = mul(b); \ - } while (0) - -#define MUL5(x) SPH_T64((x) * SPH_C64(5)) -#define MUL7(x) SPH_T64((x) * SPH_C64(7)) -#define MUL9(x) SPH_T64((x) * SPH_C64(9)) - -#define KSCHED do { \ - X0 = SPH_T64(X0 - (X7 ^ SPH_C64(0xA5A5A5A5A5A5A5A5))); \ - X1 ^= X0; \ - X2 = SPH_T64(X2 + X1); \ - X3 = SPH_T64(X3 - (X2 ^ (~X1 << 19))); \ - X4 ^= X3; \ - X5 = SPH_T64(X5 + X4); \ - X6 = SPH_T64(X6 - (X5 ^ (~X4 >> 23))); \ - X7 ^= X6; \ - X0 = SPH_T64(X0 + X7); \ - X1 = SPH_T64(X1 - (X0 ^ (~X7 << 19))); \ - X2 ^= X1; \ - X3 = SPH_T64(X3 + X2); \ - X4 = SPH_T64(X4 - (X3 ^ (~X2 >> 23))); \ - X5 ^= X4; \ - X6 = SPH_T64(X6 + X5); \ - X7 = SPH_T64(X7 - (X6 ^ SPH_C64(0x0123456789ABCDEF))); \ - } while (0) - -#define TIGER_ROUND_BODY(in, r) do { \ - sph_u64 A, B, C; \ - sph_u64 X0, X1, X2, X3, X4, X5, X6, X7; \ - \ - A = (r)[0]; \ - B = (r)[1]; \ - C = (r)[2]; \ - \ - X0 = (in(0)); \ - X1 = (in(1)); \ - X2 = (in(2)); \ - X3 = (in(3)); \ - X4 = (in(4)); \ - X5 = (in(5)); \ - X6 = (in(6)); \ - X7 = (in(7)); \ - PASS(A, B, C, MUL5); \ - KSCHED; \ - PASS(C, A, B, MUL7); \ - KSCHED; \ - PASS(B, C, A, MUL9); \ - \ - (r)[0] ^= A; \ - (r)[1] = SPH_T64(B - (r)[1]); \ - (r)[2] = SPH_T64(C + (r)[2]); \ - } while (0) - -/* - * One round of Tiger. The data must be aligned for 64-bit access. 
- */ -static void -tiger_round(const unsigned char *data, sph_u64 r[3]) -{ -#define TIGER_IN(i) sph_dec64le_aligned(data + 8 * (i)) - TIGER_ROUND_BODY(TIGER_IN, r); -#undef TIGER_IN -} - -/* see sph_tiger.h */ -void -sph_tiger_init(void *cc) -{ - sph_tiger_context *sc; - - sc = (sph_tiger_context*)cc; - sc->val[0] = SPH_C64(0x0123456789ABCDEF); - sc->val[1] = SPH_C64(0xFEDCBA9876543210); - sc->val[2] = SPH_C64(0xF096A5B4C3B2E187); - sc->count = 0; -} - -#define RFUN tiger_round -#define HASH tiger -#define LE64 1 -#define BLEN 64U -#define PW01 1 -#define PLW1 1 -#include "algo/sha/md_helper.c" - -/* see sph_tiger.h */ -void -sph_tiger_close(void *cc, void *dst) -{ - tiger_close(cc, dst, 3); -// sph_tiger_init(cc); -} - -/* see sph_tiger.h */ -void -sph_tiger_comp(const sph_u64 msg[8], sph_u64 val[3]) -{ -#define TIGER_IN(i) msg[i] - TIGER_ROUND_BODY(TIGER_IN, val); -#undef TIGER_IN -} - -#undef HASH -#define HASH tiger2 -#undef PW01 -#define CLOSE_ONLY 1 -#include "algo/sha/md_helper.c" - -/* see sph_tiger.h */ -void -sph_tiger2_close(void *cc, void *dst) -{ - tiger2_close(cc, dst, 3); -// sph_tiger2_init(cc); -} - -#endif - diff --git a/algo/tiger/sph_tiger.h b/algo/tiger/sph_tiger.h deleted file mode 100644 index 6461b47..0000000 --- a/algo/tiger/sph_tiger.h +++ /dev/null @@ -1,192 +0,0 @@ -/* $Id: sph_tiger.h 216 2010-06-08 09:46:57Z tp $ */ -/** - * Tiger / Tiger-2 interface. - * - * Tiger has been published in: R. Anderson, E. Biham, "Tiger: A Fast - * New Hash Function", Fast Software Encryption - FSE'96, LNCS 1039, - * Springer (1996), pp. 89--97. - * - * Tiger2 has never been formally published, but it was described as - * identical to Tiger, except for the padding which is the same in - * Tiger2 as it is in MD4. Fortunately, an implementation of Tiger2 - * was submitted to NESSIE, which produced test vectors; the sphlib - * implementation of Tiger2 is compatible with the NESSIE test vectors. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_tiger.h - * @author Thomas Pornin - */ - -#ifndef SPH_TIGER_H__ -#define SPH_TIGER_H__ - -#include -#include "algo/sha/sph_types.h" - -#if SPH_64 - -/** - * Output size (in bits) for Tiger. - */ -#define SPH_SIZE_tiger 192 - -/** - * Output size (in bits) for Tiger2. - */ -#define SPH_SIZE_tiger2 192 - -/** - * This structure is a context for Tiger computations: it contains the - * intermediate values and some data from the last entered block. Once - * a Tiger computation has been performed, the context can be reused for - * another computation. - * - * The contents of this structure are private. 
A running Tiger computation - * can be cloned by copying the context (e.g. with a simple - * memcpy()). - */ -typedef struct { -#ifndef DOXYGEN_IGNORE - unsigned char buf[64]; /* first field, for alignment */ - sph_u64 val[3]; - sph_u64 count; -#endif -} sph_tiger_context; - -/** - * Initialize a Tiger context. This process performs no memory allocation. - * - * @param cc the Tiger context (pointer to - * a sph_tiger_context) - */ -void sph_tiger_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). - * - * @param cc the Tiger context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_tiger(void *cc, const void *data, size_t len); - -/** - * Terminate the current Tiger computation and output the result into the - * provided buffer. The destination buffer must be wide enough to - * accomodate the result (24 bytes). The context is automatically - * reinitialized. - * - * @param cc the Tiger context - * @param dst the destination buffer - */ -void sph_tiger_close(void *cc, void *dst); - -/** - * Apply the Tiger compression function on the provided data. The - * msg parameter contains the 8 64-bit input blocks, - * as numerical values (hence after the little-endian decoding). The - * val parameter contains the 3 64-bit input blocks for - * the compression function; the output is written in place in this - * array. - * - * @param msg the message block (8 values) - * @param val the function 192-bit input and output - */ -void sph_tiger_comp(const sph_u64 msg[8], sph_u64 val[3]); - -/** - * This structure is a context for Tiger2 computations. It is identical - * to the Tiger context, and they may be freely exchanged, since the - * difference between Tiger and Tiger2 resides solely in the padding, which - * is computed only in the last computation step. 
- */ -typedef sph_tiger_context sph_tiger2_context; - -#ifdef DOXYGEN_IGNORE -/** - * Initialize a Tiger2 context. This function is identical to - * sph_tiger_init(). - * - * @param cc the Tiger2 context (pointer to - * a sph_tiger2_context) - */ -void sph_tiger2_init(void *cc); -#endif - -#ifndef DOXYGEN_IGNORE -#define sph_tiger2_init sph_tiger_init -#endif - -#ifdef DOXYGEN_IGNORE -/** - * Process some data bytes. This function is identical to - * sph_tiger(). - * - * @param cc the Tiger2 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_tiger2(void *cc, const void *data, size_t len); -#endif - -#ifndef DOXYGEN_IGNORE -#define sph_tiger2 sph_tiger -#endif - -/** - * Terminate the current Tiger2 computation and output the result into the - * provided buffer. The destination buffer must be wide enough to - * accomodate the result (24 bytes). The context is automatically - * reinitialized. Note that this function is NOT identical to - * sph_tiger2_close(): this is the exact and unique point - * where Tiger and Tiger2 differ. - * - * @param cc the Tiger context - * @param dst the destination buffer - */ -void sph_tiger2_close(void *cc, void *dst); - -#ifdef DOXYGEN_IGNORE -/** - * Apply the Tiger2 compression function, which is identical to the Tiger - * compression function. 
- * - * @param msg the message block (8 values) - * @param val the function 192-bit input and output - */ -void sph_tiger2_comp(const sph_u64 msg[8], sph_u64 val[3]); -#endif - -#ifndef DOXYGEN_IGNORE -#define sph_tiger2_comp sph_tiger_comp -#endif - -#endif - -#endif - diff --git a/algo/whirlpool/md_helper.c b/algo/whirlpool/md_helper.c index a9f11db..5048806 100644 --- a/algo/whirlpool/md_helper.c +++ b/algo/whirlpool/md_helper.c @@ -252,12 +252,6 @@ SPH_XCAT(HASH, _addbits_and_close)(void *cc, current = (unsigned)sc->count_low & (SPH_BLEN - 1U); #endif -//uint64_t *b= (uint64_t*)sc->buf; -//uint64_t *s= (uint64_t*)sc->state; -// printf("Sptr 1= %u\n",current); -// printf("SBuf %016llx %016llx %016llx %016llx\n", b[0], b[1], b[2], b[3] ); -// printf("SBuf %016llx %016llx %016llx %016llx\n", b[4], b[5], b[6], b[7] ); - #ifdef PW01 sc->buf[current ++] = (0x100 | (ub & 0xFF)) >> (8 - n); #else @@ -269,10 +263,6 @@ SPH_XCAT(HASH, _addbits_and_close)(void *cc, } #endif -// printf("Sptr 2= %u\n",current); -// printf("SBuf %016llx %016llx %016llx %016llx\n", b[0], b[1], b[2], b[3] ); -// printf("SBuf %016llx %016llx %016llx %016llx\n", b[4], b[5], b[6], b[7] ); - if (current > SPH_MAXPAD) { memset(sc->buf + current, 0, SPH_BLEN - current); RFUN(sc->buf, SPH_VAL); @@ -333,16 +323,8 @@ SPH_XCAT(HASH, _addbits_and_close)(void *cc, #endif #endif -// printf("Sptr 3= %u\n",current); -// printf("SBuf %016llx %016llx %016llx %016llx\n", b[0], b[1], b[2], b[3] ); -// printf("SBuf %016llx %016llx %016llx %016llx\n", b[4], b[5], b[6], b[7] ); - RFUN(sc->buf, SPH_VAL); -// printf("Sptr after= %u\n",current); -// printf("SState %016llx %016llx %016llx %016llx\n", s[0], s[1], s[2], s[3] ); -// printf("SState %016llx %016llx %016llx %016llx\n", s[4], s[5], s[6], s[7] ); - #ifdef SPH_NO_OUTPUT (void)dst; (void)rnum; diff --git a/algo/whirlpool/sph_whirlpool.h b/algo/whirlpool/sph_whirlpool.h index 70dc7fa..801a9f9 100644 --- a/algo/whirlpool/sph_whirlpool.h +++ 
b/algo/whirlpool/sph_whirlpool.h @@ -120,6 +120,13 @@ void sph_whirlpool(void *cc, const void *data, size_t len); */ void sph_whirlpool_close(void *cc, void *dst); +#define sph_whirlpool512_full( cc, dst, data, len ) \ +do{ \ + sph_whirlpool_init( cc ); \ + sph_whirlpool( cc, data, len ); \ + sph_whirlpool_close( cc, dst ); \ +}while(0) + /** * WHIRLPOOL-0 uses the same structure than plain WHIRLPOOL. */ diff --git a/algo/whirlpool/whirlpool-gate.c b/algo/whirlpool/whirlpool-gate.c deleted file mode 100644 index 8cf33e3..0000000 --- a/algo/whirlpool/whirlpool-gate.c +++ /dev/null @@ -1,17 +0,0 @@ -#include "whirlpool-gate.h" - -bool register_whirlpool_algo( algo_gate_t* gate ) -{ -#if defined (WHIRLPOOL_4WAY) - four_way_not_tested(); - gate->optimizations = AVX2_OPT; - gate->scanhash = (void*)&scanhash_whirlpool_4way; - gate->hash = (void*)&whirlpool_hash_4way; -#else - gate->scanhash = (void*)&scanhash_whirlpool; - gate->hash = (void*)&whirlpool_hash; - init_whirlpool_ctx(); -#endif - return true; -}; - diff --git a/algo/whirlpool/whirlpool-gate.h b/algo/whirlpool/whirlpool-gate.h deleted file mode 100644 index f82e179..0000000 --- a/algo/whirlpool/whirlpool-gate.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef WHIRLPOOL_GATE_H__ -#define WHIRLPOOL_GATE_H__ - -#include "algo-gate-api.h" -#include - -/* -#if defined(FOUR_WAY) && defined(__AVX2__) - #define WHIRLPOOL_4WAY -#endif -*/ - -#if defined (WHIRLPOOL_4WAY) - -void whirlpool_hash_4way(void *state, const void *input); - -int scanhash_whirlpool_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -#else - -void whirlpool_hash( void *state, const void *input ); - -int scanhash_whirlpool( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_whirlpool_ctx(); -#endif - -#endif diff --git a/algo/whirlpool/whirlpool-hash-4way.c b/algo/whirlpool/whirlpool-hash-4way.c deleted file mode 100644 index 3958aef..0000000 --- 
a/algo/whirlpool/whirlpool-hash-4way.c +++ /dev/null @@ -1,3567 +0,0 @@ -/* $Id: whirlpool.c 227 2010-06-16 17:28:38Z tp $ */ -/* - * WHIRLPOOL implementation. - * - * Internally, we use little-endian convention, on the assumption that - * architectures which favour big-endian encoding are: - * 1. rarer - * 2. in decreasing numbers - * 3. able to decode little-endian data efficiently anyway - * - * The most common big-endian architecture is Sparc, and Ultrasparc CPU - * include special opcodes to perform little-endian accesses, which we use - * (see sph_types.h). Most modern CPU designs can work with both endianness - * and architecture designer now favour little-endian (basically, x86 has - * won the endianness war). - * - * TODO: implement a 32-bit version. Not only such a version would be handy - * for non-64-bit-able architectures, but it may also use smaller tables, - * at the expense of more lookups and XORs. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#ifdef __AVX2__ - -#include -#include -#include "whirlpool-hash-4way.h" - -#ifdef __cplusplus -extern "C"{ -#endif - -#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_WHIRLPOOL -#define SPH_SMALL_FOOTPRINT_WHIRLPOOL 1 -#endif - -// reduce warnings -#undef SPH_C64 -#define SPH_C64(x) ((const long long int)(x)) - -/* ====================================================================== */ -/* - * Constants for plain WHIRLPOOL (current version). - */ - -static const long long int plain_T0[256] = { - SPH_C64(0xD83078C018601818), SPH_C64(0x2646AF05238C2323), - SPH_C64(0xB891F97EC63FC6C6), SPH_C64(0xFBCD6F13E887E8E8), - SPH_C64(0xCB13A14C87268787), SPH_C64(0x116D62A9B8DAB8B8), - SPH_C64(0x0902050801040101), SPH_C64(0x0D9E6E424F214F4F), - SPH_C64(0x9B6CEEAD36D83636), SPH_C64(0xFF510459A6A2A6A6), - SPH_C64(0x0CB9BDDED26FD2D2), SPH_C64(0x0EF706FBF5F3F5F5), - SPH_C64(0x96F280EF79F97979), SPH_C64(0x30DECE5F6FA16F6F), - SPH_C64(0x6D3FEFFC917E9191), SPH_C64(0xF8A407AA52555252), - SPH_C64(0x47C0FD27609D6060), SPH_C64(0x35657689BCCABCBC), - SPH_C64(0x372BCDAC9B569B9B), SPH_C64(0x8A018C048E028E8E), - SPH_C64(0xD25B1571A3B6A3A3), SPH_C64(0x6C183C600C300C0C), - SPH_C64(0x84F68AFF7BF17B7B), SPH_C64(0x806AE1B535D43535), - SPH_C64(0xF53A69E81D741D1D), SPH_C64(0xB3DD4753E0A7E0E0), - SPH_C64(0x21B3ACF6D77BD7D7), SPH_C64(0x9C99ED5EC22FC2C2), - SPH_C64(0x435C966D2EB82E2E), SPH_C64(0x29967A624B314B4B), - SPH_C64(0x5DE121A3FEDFFEFE), SPH_C64(0xD5AE168257415757), - SPH_C64(0xBD2A41A815541515), SPH_C64(0xE8EEB69F77C17777), - SPH_C64(0x926EEBA537DC3737), SPH_C64(0x9ED7567BE5B3E5E5), - 
SPH_C64(0x1323D98C9F469F9F), SPH_C64(0x23FD17D3F0E7F0F0), - SPH_C64(0x20947F6A4A354A4A), SPH_C64(0x44A9959EDA4FDADA), - SPH_C64(0xA2B025FA587D5858), SPH_C64(0xCF8FCA06C903C9C9), - SPH_C64(0x7C528D5529A42929), SPH_C64(0x5A1422500A280A0A), - SPH_C64(0x507F4FE1B1FEB1B1), SPH_C64(0xC95D1A69A0BAA0A0), - SPH_C64(0x14D6DA7F6BB16B6B), SPH_C64(0xD917AB5C852E8585), - SPH_C64(0x3C677381BDCEBDBD), SPH_C64(0x8FBA34D25D695D5D), - SPH_C64(0x9020508010401010), SPH_C64(0x07F503F3F4F7F4F4), - SPH_C64(0xDD8BC016CB0BCBCB), SPH_C64(0xD37CC6ED3EF83E3E), - SPH_C64(0x2D0A112805140505), SPH_C64(0x78CEE61F67816767), - SPH_C64(0x97D55373E4B7E4E4), SPH_C64(0x024EBB25279C2727), - SPH_C64(0x7382583241194141), SPH_C64(0xA70B9D2C8B168B8B), - SPH_C64(0xF6530151A7A6A7A7), SPH_C64(0xB2FA94CF7DE97D7D), - SPH_C64(0x4937FBDC956E9595), SPH_C64(0x56AD9F8ED847D8D8), - SPH_C64(0x70EB308BFBCBFBFB), SPH_C64(0xCDC17123EE9FEEEE), - SPH_C64(0xBBF891C77CED7C7C), SPH_C64(0x71CCE31766856666), - SPH_C64(0x7BA78EA6DD53DDDD), SPH_C64(0xAF2E4BB8175C1717), - SPH_C64(0x458E460247014747), SPH_C64(0x1A21DC849E429E9E), - SPH_C64(0xD489C51ECA0FCACA), SPH_C64(0x585A99752DB42D2D), - SPH_C64(0x2E637991BFC6BFBF), SPH_C64(0x3F0E1B38071C0707), - SPH_C64(0xAC472301AD8EADAD), SPH_C64(0xB0B42FEA5A755A5A), - SPH_C64(0xEF1BB56C83368383), SPH_C64(0xB666FF8533CC3333), - SPH_C64(0x5CC6F23F63916363), SPH_C64(0x12040A1002080202), - SPH_C64(0x93493839AA92AAAA), SPH_C64(0xDEE2A8AF71D97171), - SPH_C64(0xC68DCF0EC807C8C8), SPH_C64(0xD1327DC819641919), - SPH_C64(0x3B92707249394949), SPH_C64(0x5FAF9A86D943D9D9), - SPH_C64(0x31F91DC3F2EFF2F2), SPH_C64(0xA8DB484BE3ABE3E3), - SPH_C64(0xB9B62AE25B715B5B), SPH_C64(0xBC0D9234881A8888), - SPH_C64(0x3E29C8A49A529A9A), SPH_C64(0x0B4CBE2D26982626), - SPH_C64(0xBF64FA8D32C83232), SPH_C64(0x597D4AE9B0FAB0B0), - SPH_C64(0xF2CF6A1BE983E9E9), SPH_C64(0x771E33780F3C0F0F), - SPH_C64(0x33B7A6E6D573D5D5), SPH_C64(0xF41DBA74803A8080), - SPH_C64(0x27617C99BEC2BEBE), SPH_C64(0xEB87DE26CD13CDCD), - 
SPH_C64(0x8968E4BD34D03434), SPH_C64(0x3290757A483D4848), - SPH_C64(0x54E324ABFFDBFFFF), SPH_C64(0x8DF48FF77AF57A7A), - SPH_C64(0x643DEAF4907A9090), SPH_C64(0x9DBE3EC25F615F5F), - SPH_C64(0x3D40A01D20802020), SPH_C64(0x0FD0D56768BD6868), - SPH_C64(0xCA3472D01A681A1A), SPH_C64(0xB7412C19AE82AEAE), - SPH_C64(0x7D755EC9B4EAB4B4), SPH_C64(0xCEA8199A544D5454), - SPH_C64(0x7F3BE5EC93769393), SPH_C64(0x2F44AA0D22882222), - SPH_C64(0x63C8E907648D6464), SPH_C64(0x2AFF12DBF1E3F1F1), - SPH_C64(0xCCE6A2BF73D17373), SPH_C64(0x82245A9012481212), - SPH_C64(0x7A805D3A401D4040), SPH_C64(0x4810284008200808), - SPH_C64(0x959BE856C32BC3C3), SPH_C64(0xDFC57B33EC97ECEC), - SPH_C64(0x4DAB9096DB4BDBDB), SPH_C64(0xC05F1F61A1BEA1A1), - SPH_C64(0x9107831C8D0E8D8D), SPH_C64(0xC87AC9F53DF43D3D), - SPH_C64(0x5B33F1CC97669797), SPH_C64(0x0000000000000000), - SPH_C64(0xF983D436CF1BCFCF), SPH_C64(0x6E5687452BAC2B2B), - SPH_C64(0xE1ECB39776C57676), SPH_C64(0xE619B06482328282), - SPH_C64(0x28B1A9FED67FD6D6), SPH_C64(0xC33677D81B6C1B1B), - SPH_C64(0x74775BC1B5EEB5B5), SPH_C64(0xBE432911AF86AFAF), - SPH_C64(0x1DD4DF776AB56A6A), SPH_C64(0xEAA00DBA505D5050), - SPH_C64(0x578A4C1245094545), SPH_C64(0x38FB18CBF3EBF3F3), - SPH_C64(0xAD60F09D30C03030), SPH_C64(0xC4C3742BEF9BEFEF), - SPH_C64(0xDA7EC3E53FFC3F3F), SPH_C64(0xC7AA1C9255495555), - SPH_C64(0xDB591079A2B2A2A2), SPH_C64(0xE9C96503EA8FEAEA), - SPH_C64(0x6ACAEC0F65896565), SPH_C64(0x036968B9BAD2BABA), - SPH_C64(0x4A5E93652FBC2F2F), SPH_C64(0x8E9DE74EC027C0C0), - SPH_C64(0x60A181BEDE5FDEDE), SPH_C64(0xFC386CE01C701C1C), - SPH_C64(0x46E72EBBFDD3FDFD), SPH_C64(0x1F9A64524D294D4D), - SPH_C64(0x7639E0E492729292), SPH_C64(0xFAEABC8F75C97575), - SPH_C64(0x360C1E3006180606), SPH_C64(0xAE0998248A128A8A), - SPH_C64(0x4B7940F9B2F2B2B2), SPH_C64(0x85D15963E6BFE6E6), - SPH_C64(0x7E1C36700E380E0E), SPH_C64(0xE73E63F81F7C1F1F), - SPH_C64(0x55C4F73762956262), SPH_C64(0x3AB5A3EED477D4D4), - SPH_C64(0x814D3229A89AA8A8), SPH_C64(0x5231F4C496629696), - 
SPH_C64(0x62EF3A9BF9C3F9F9), SPH_C64(0xA397F666C533C5C5), - SPH_C64(0x104AB13525942525), SPH_C64(0xABB220F259795959), - SPH_C64(0xD015AE54842A8484), SPH_C64(0xC5E4A7B772D57272), - SPH_C64(0xEC72DDD539E43939), SPH_C64(0x1698615A4C2D4C4C), - SPH_C64(0x94BC3BCA5E655E5E), SPH_C64(0x9FF085E778FD7878), - SPH_C64(0xE570D8DD38E03838), SPH_C64(0x980586148C0A8C8C), - SPH_C64(0x17BFB2C6D163D1D1), SPH_C64(0xE4570B41A5AEA5A5), - SPH_C64(0xA1D94D43E2AFE2E2), SPH_C64(0x4EC2F82F61996161), - SPH_C64(0x427B45F1B3F6B3B3), SPH_C64(0x3442A51521842121), - SPH_C64(0x0825D6949C4A9C9C), SPH_C64(0xEE3C66F01E781E1E), - SPH_C64(0x6186522243114343), SPH_C64(0xB193FC76C73BC7C7), - SPH_C64(0x4FE52BB3FCD7FCFC), SPH_C64(0x2408142004100404), - SPH_C64(0xE3A208B251595151), SPH_C64(0x252FC7BC995E9999), - SPH_C64(0x22DAC44F6DA96D6D), SPH_C64(0x651A39680D340D0D), - SPH_C64(0x79E93583FACFFAFA), SPH_C64(0x69A384B6DF5BDFDF), - SPH_C64(0xA9FC9BD77EE57E7E), SPH_C64(0x1948B43D24902424), - SPH_C64(0xFE76D7C53BEC3B3B), SPH_C64(0x9A4B3D31AB96ABAB), - SPH_C64(0xF081D13ECE1FCECE), SPH_C64(0x9922558811441111), - SPH_C64(0x8303890C8F068F8F), SPH_C64(0x049C6B4A4E254E4E), - SPH_C64(0x667351D1B7E6B7B7), SPH_C64(0xE0CB600BEB8BEBEB), - SPH_C64(0xC178CCFD3CF03C3C), SPH_C64(0xFD1FBF7C813E8181), - SPH_C64(0x4035FED4946A9494), SPH_C64(0x1CF30CEBF7FBF7F7), - SPH_C64(0x186F67A1B9DEB9B9), SPH_C64(0x8B265F98134C1313), - SPH_C64(0x51589C7D2CB02C2C), SPH_C64(0x05BBB8D6D36BD3D3), - SPH_C64(0x8CD35C6BE7BBE7E7), SPH_C64(0x39DCCB576EA56E6E), - SPH_C64(0xAA95F36EC437C4C4), SPH_C64(0x1B060F18030C0303), - SPH_C64(0xDCAC138A56455656), SPH_C64(0x5E88491A440D4444), - SPH_C64(0xA0FE9EDF7FE17F7F), SPH_C64(0x884F3721A99EA9A9), - SPH_C64(0x6754824D2AA82A2A), SPH_C64(0x0A6B6DB1BBD6BBBB), - SPH_C64(0x879FE246C123C1C1), SPH_C64(0xF1A602A253515353), - SPH_C64(0x72A58BAEDC57DCDC), SPH_C64(0x531627580B2C0B0B), - SPH_C64(0x0127D39C9D4E9D9D), SPH_C64(0x2BD8C1476CAD6C6C), - SPH_C64(0xA462F59531C43131), SPH_C64(0xF3E8B98774CD7474), - 
SPH_C64(0x15F109E3F6FFF6F6), SPH_C64(0x4C8C430A46054646), - SPH_C64(0xA5452609AC8AACAC), SPH_C64(0xB50F973C891E8989), - SPH_C64(0xB42844A014501414), SPH_C64(0xBADF425BE1A3E1E1), - SPH_C64(0xA62C4EB016581616), SPH_C64(0xF774D2CD3AE83A3A), - SPH_C64(0x06D2D06F69B96969), SPH_C64(0x41122D4809240909), - SPH_C64(0xD7E0ADA770DD7070), SPH_C64(0x6F7154D9B6E2B6B6), - SPH_C64(0x1EBDB7CED067D0D0), SPH_C64(0xD6C77E3BED93EDED), - SPH_C64(0xE285DB2ECC17CCCC), SPH_C64(0x6884572A42154242), - SPH_C64(0x2C2DC2B4985A9898), SPH_C64(0xED550E49A4AAA4A4), - SPH_C64(0x7550885D28A02828), SPH_C64(0x86B831DA5C6D5C5C), - SPH_C64(0x6BED3F93F8C7F8F8), SPH_C64(0xC211A44486228686) -}; - -#if !SPH_SMALL_FOOTPRINT_WHIRLPOOL - -static const long long int plain_T1[256] = { - SPH_C64(0x3078C018601818D8), SPH_C64(0x46AF05238C232326), - SPH_C64(0x91F97EC63FC6C6B8), SPH_C64(0xCD6F13E887E8E8FB), - SPH_C64(0x13A14C87268787CB), SPH_C64(0x6D62A9B8DAB8B811), - SPH_C64(0x0205080104010109), SPH_C64(0x9E6E424F214F4F0D), - SPH_C64(0x6CEEAD36D836369B), SPH_C64(0x510459A6A2A6A6FF), - SPH_C64(0xB9BDDED26FD2D20C), SPH_C64(0xF706FBF5F3F5F50E), - SPH_C64(0xF280EF79F9797996), SPH_C64(0xDECE5F6FA16F6F30), - SPH_C64(0x3FEFFC917E91916D), SPH_C64(0xA407AA52555252F8), - SPH_C64(0xC0FD27609D606047), SPH_C64(0x657689BCCABCBC35), - SPH_C64(0x2BCDAC9B569B9B37), SPH_C64(0x018C048E028E8E8A), - SPH_C64(0x5B1571A3B6A3A3D2), SPH_C64(0x183C600C300C0C6C), - SPH_C64(0xF68AFF7BF17B7B84), SPH_C64(0x6AE1B535D4353580), - SPH_C64(0x3A69E81D741D1DF5), SPH_C64(0xDD4753E0A7E0E0B3), - SPH_C64(0xB3ACF6D77BD7D721), SPH_C64(0x99ED5EC22FC2C29C), - SPH_C64(0x5C966D2EB82E2E43), SPH_C64(0x967A624B314B4B29), - SPH_C64(0xE121A3FEDFFEFE5D), SPH_C64(0xAE168257415757D5), - SPH_C64(0x2A41A815541515BD), SPH_C64(0xEEB69F77C17777E8), - SPH_C64(0x6EEBA537DC373792), SPH_C64(0xD7567BE5B3E5E59E), - SPH_C64(0x23D98C9F469F9F13), SPH_C64(0xFD17D3F0E7F0F023), - SPH_C64(0x947F6A4A354A4A20), SPH_C64(0xA9959EDA4FDADA44), - SPH_C64(0xB025FA587D5858A2), 
SPH_C64(0x8FCA06C903C9C9CF), - SPH_C64(0x528D5529A429297C), SPH_C64(0x1422500A280A0A5A), - SPH_C64(0x7F4FE1B1FEB1B150), SPH_C64(0x5D1A69A0BAA0A0C9), - SPH_C64(0xD6DA7F6BB16B6B14), SPH_C64(0x17AB5C852E8585D9), - SPH_C64(0x677381BDCEBDBD3C), SPH_C64(0xBA34D25D695D5D8F), - SPH_C64(0x2050801040101090), SPH_C64(0xF503F3F4F7F4F407), - SPH_C64(0x8BC016CB0BCBCBDD), SPH_C64(0x7CC6ED3EF83E3ED3), - SPH_C64(0x0A1128051405052D), SPH_C64(0xCEE61F6781676778), - SPH_C64(0xD55373E4B7E4E497), SPH_C64(0x4EBB25279C272702), - SPH_C64(0x8258324119414173), SPH_C64(0x0B9D2C8B168B8BA7), - SPH_C64(0x530151A7A6A7A7F6), SPH_C64(0xFA94CF7DE97D7DB2), - SPH_C64(0x37FBDC956E959549), SPH_C64(0xAD9F8ED847D8D856), - SPH_C64(0xEB308BFBCBFBFB70), SPH_C64(0xC17123EE9FEEEECD), - SPH_C64(0xF891C77CED7C7CBB), SPH_C64(0xCCE3176685666671), - SPH_C64(0xA78EA6DD53DDDD7B), SPH_C64(0x2E4BB8175C1717AF), - SPH_C64(0x8E46024701474745), SPH_C64(0x21DC849E429E9E1A), - SPH_C64(0x89C51ECA0FCACAD4), SPH_C64(0x5A99752DB42D2D58), - SPH_C64(0x637991BFC6BFBF2E), SPH_C64(0x0E1B38071C07073F), - SPH_C64(0x472301AD8EADADAC), SPH_C64(0xB42FEA5A755A5AB0), - SPH_C64(0x1BB56C83368383EF), SPH_C64(0x66FF8533CC3333B6), - SPH_C64(0xC6F23F639163635C), SPH_C64(0x040A100208020212), - SPH_C64(0x493839AA92AAAA93), SPH_C64(0xE2A8AF71D97171DE), - SPH_C64(0x8DCF0EC807C8C8C6), SPH_C64(0x327DC819641919D1), - SPH_C64(0x927072493949493B), SPH_C64(0xAF9A86D943D9D95F), - SPH_C64(0xF91DC3F2EFF2F231), SPH_C64(0xDB484BE3ABE3E3A8), - SPH_C64(0xB62AE25B715B5BB9), SPH_C64(0x0D9234881A8888BC), - SPH_C64(0x29C8A49A529A9A3E), SPH_C64(0x4CBE2D269826260B), - SPH_C64(0x64FA8D32C83232BF), SPH_C64(0x7D4AE9B0FAB0B059), - SPH_C64(0xCF6A1BE983E9E9F2), SPH_C64(0x1E33780F3C0F0F77), - SPH_C64(0xB7A6E6D573D5D533), SPH_C64(0x1DBA74803A8080F4), - SPH_C64(0x617C99BEC2BEBE27), SPH_C64(0x87DE26CD13CDCDEB), - SPH_C64(0x68E4BD34D0343489), SPH_C64(0x90757A483D484832), - SPH_C64(0xE324ABFFDBFFFF54), SPH_C64(0xF48FF77AF57A7A8D), - SPH_C64(0x3DEAF4907A909064), 
SPH_C64(0xBE3EC25F615F5F9D), - SPH_C64(0x40A01D208020203D), SPH_C64(0xD0D56768BD68680F), - SPH_C64(0x3472D01A681A1ACA), SPH_C64(0x412C19AE82AEAEB7), - SPH_C64(0x755EC9B4EAB4B47D), SPH_C64(0xA8199A544D5454CE), - SPH_C64(0x3BE5EC937693937F), SPH_C64(0x44AA0D228822222F), - SPH_C64(0xC8E907648D646463), SPH_C64(0xFF12DBF1E3F1F12A), - SPH_C64(0xE6A2BF73D17373CC), SPH_C64(0x245A901248121282), - SPH_C64(0x805D3A401D40407A), SPH_C64(0x1028400820080848), - SPH_C64(0x9BE856C32BC3C395), SPH_C64(0xC57B33EC97ECECDF), - SPH_C64(0xAB9096DB4BDBDB4D), SPH_C64(0x5F1F61A1BEA1A1C0), - SPH_C64(0x07831C8D0E8D8D91), SPH_C64(0x7AC9F53DF43D3DC8), - SPH_C64(0x33F1CC976697975B), SPH_C64(0x0000000000000000), - SPH_C64(0x83D436CF1BCFCFF9), SPH_C64(0x5687452BAC2B2B6E), - SPH_C64(0xECB39776C57676E1), SPH_C64(0x19B06482328282E6), - SPH_C64(0xB1A9FED67FD6D628), SPH_C64(0x3677D81B6C1B1BC3), - SPH_C64(0x775BC1B5EEB5B574), SPH_C64(0x432911AF86AFAFBE), - SPH_C64(0xD4DF776AB56A6A1D), SPH_C64(0xA00DBA505D5050EA), - SPH_C64(0x8A4C124509454557), SPH_C64(0xFB18CBF3EBF3F338), - SPH_C64(0x60F09D30C03030AD), SPH_C64(0xC3742BEF9BEFEFC4), - SPH_C64(0x7EC3E53FFC3F3FDA), SPH_C64(0xAA1C9255495555C7), - SPH_C64(0x591079A2B2A2A2DB), SPH_C64(0xC96503EA8FEAEAE9), - SPH_C64(0xCAEC0F658965656A), SPH_C64(0x6968B9BAD2BABA03), - SPH_C64(0x5E93652FBC2F2F4A), SPH_C64(0x9DE74EC027C0C08E), - SPH_C64(0xA181BEDE5FDEDE60), SPH_C64(0x386CE01C701C1CFC), - SPH_C64(0xE72EBBFDD3FDFD46), SPH_C64(0x9A64524D294D4D1F), - SPH_C64(0x39E0E49272929276), SPH_C64(0xEABC8F75C97575FA), - SPH_C64(0x0C1E300618060636), SPH_C64(0x0998248A128A8AAE), - SPH_C64(0x7940F9B2F2B2B24B), SPH_C64(0xD15963E6BFE6E685), - SPH_C64(0x1C36700E380E0E7E), SPH_C64(0x3E63F81F7C1F1FE7), - SPH_C64(0xC4F7376295626255), SPH_C64(0xB5A3EED477D4D43A), - SPH_C64(0x4D3229A89AA8A881), SPH_C64(0x31F4C49662969652), - SPH_C64(0xEF3A9BF9C3F9F962), SPH_C64(0x97F666C533C5C5A3), - SPH_C64(0x4AB1352594252510), SPH_C64(0xB220F259795959AB), - SPH_C64(0x15AE54842A8484D0), 
SPH_C64(0xE4A7B772D57272C5), - SPH_C64(0x72DDD539E43939EC), SPH_C64(0x98615A4C2D4C4C16), - SPH_C64(0xBC3BCA5E655E5E94), SPH_C64(0xF085E778FD78789F), - SPH_C64(0x70D8DD38E03838E5), SPH_C64(0x0586148C0A8C8C98), - SPH_C64(0xBFB2C6D163D1D117), SPH_C64(0x570B41A5AEA5A5E4), - SPH_C64(0xD94D43E2AFE2E2A1), SPH_C64(0xC2F82F619961614E), - SPH_C64(0x7B45F1B3F6B3B342), SPH_C64(0x42A5152184212134), - SPH_C64(0x25D6949C4A9C9C08), SPH_C64(0x3C66F01E781E1EEE), - SPH_C64(0x8652224311434361), SPH_C64(0x93FC76C73BC7C7B1), - SPH_C64(0xE52BB3FCD7FCFC4F), SPH_C64(0x0814200410040424), - SPH_C64(0xA208B251595151E3), SPH_C64(0x2FC7BC995E999925), - SPH_C64(0xDAC44F6DA96D6D22), SPH_C64(0x1A39680D340D0D65), - SPH_C64(0xE93583FACFFAFA79), SPH_C64(0xA384B6DF5BDFDF69), - SPH_C64(0xFC9BD77EE57E7EA9), SPH_C64(0x48B43D2490242419), - SPH_C64(0x76D7C53BEC3B3BFE), SPH_C64(0x4B3D31AB96ABAB9A), - SPH_C64(0x81D13ECE1FCECEF0), SPH_C64(0x2255881144111199), - SPH_C64(0x03890C8F068F8F83), SPH_C64(0x9C6B4A4E254E4E04), - SPH_C64(0x7351D1B7E6B7B766), SPH_C64(0xCB600BEB8BEBEBE0), - SPH_C64(0x78CCFD3CF03C3CC1), SPH_C64(0x1FBF7C813E8181FD), - SPH_C64(0x35FED4946A949440), SPH_C64(0xF30CEBF7FBF7F71C), - SPH_C64(0x6F67A1B9DEB9B918), SPH_C64(0x265F98134C13138B), - SPH_C64(0x589C7D2CB02C2C51), SPH_C64(0xBBB8D6D36BD3D305), - SPH_C64(0xD35C6BE7BBE7E78C), SPH_C64(0xDCCB576EA56E6E39), - SPH_C64(0x95F36EC437C4C4AA), SPH_C64(0x060F18030C03031B), - SPH_C64(0xAC138A56455656DC), SPH_C64(0x88491A440D44445E), - SPH_C64(0xFE9EDF7FE17F7FA0), SPH_C64(0x4F3721A99EA9A988), - SPH_C64(0x54824D2AA82A2A67), SPH_C64(0x6B6DB1BBD6BBBB0A), - SPH_C64(0x9FE246C123C1C187), SPH_C64(0xA602A253515353F1), - SPH_C64(0xA58BAEDC57DCDC72), SPH_C64(0x1627580B2C0B0B53), - SPH_C64(0x27D39C9D4E9D9D01), SPH_C64(0xD8C1476CAD6C6C2B), - SPH_C64(0x62F59531C43131A4), SPH_C64(0xE8B98774CD7474F3), - SPH_C64(0xF109E3F6FFF6F615), SPH_C64(0x8C430A460546464C), - SPH_C64(0x452609AC8AACACA5), SPH_C64(0x0F973C891E8989B5), - SPH_C64(0x2844A014501414B4), 
SPH_C64(0xDF425BE1A3E1E1BA), - SPH_C64(0x2C4EB016581616A6), SPH_C64(0x74D2CD3AE83A3AF7), - SPH_C64(0xD2D06F69B9696906), SPH_C64(0x122D480924090941), - SPH_C64(0xE0ADA770DD7070D7), SPH_C64(0x7154D9B6E2B6B66F), - SPH_C64(0xBDB7CED067D0D01E), SPH_C64(0xC77E3BED93EDEDD6), - SPH_C64(0x85DB2ECC17CCCCE2), SPH_C64(0x84572A4215424268), - SPH_C64(0x2DC2B4985A98982C), SPH_C64(0x550E49A4AAA4A4ED), - SPH_C64(0x50885D28A0282875), SPH_C64(0xB831DA5C6D5C5C86), - SPH_C64(0xED3F93F8C7F8F86B), SPH_C64(0x11A44486228686C2) -}; - -static const long long int plain_T2[256] = { - SPH_C64(0x78C018601818D830), SPH_C64(0xAF05238C23232646), - SPH_C64(0xF97EC63FC6C6B891), SPH_C64(0x6F13E887E8E8FBCD), - SPH_C64(0xA14C87268787CB13), SPH_C64(0x62A9B8DAB8B8116D), - SPH_C64(0x0508010401010902), SPH_C64(0x6E424F214F4F0D9E), - SPH_C64(0xEEAD36D836369B6C), SPH_C64(0x0459A6A2A6A6FF51), - SPH_C64(0xBDDED26FD2D20CB9), SPH_C64(0x06FBF5F3F5F50EF7), - SPH_C64(0x80EF79F9797996F2), SPH_C64(0xCE5F6FA16F6F30DE), - SPH_C64(0xEFFC917E91916D3F), SPH_C64(0x07AA52555252F8A4), - SPH_C64(0xFD27609D606047C0), SPH_C64(0x7689BCCABCBC3565), - SPH_C64(0xCDAC9B569B9B372B), SPH_C64(0x8C048E028E8E8A01), - SPH_C64(0x1571A3B6A3A3D25B), SPH_C64(0x3C600C300C0C6C18), - SPH_C64(0x8AFF7BF17B7B84F6), SPH_C64(0xE1B535D43535806A), - SPH_C64(0x69E81D741D1DF53A), SPH_C64(0x4753E0A7E0E0B3DD), - SPH_C64(0xACF6D77BD7D721B3), SPH_C64(0xED5EC22FC2C29C99), - SPH_C64(0x966D2EB82E2E435C), SPH_C64(0x7A624B314B4B2996), - SPH_C64(0x21A3FEDFFEFE5DE1), SPH_C64(0x168257415757D5AE), - SPH_C64(0x41A815541515BD2A), SPH_C64(0xB69F77C17777E8EE), - SPH_C64(0xEBA537DC3737926E), SPH_C64(0x567BE5B3E5E59ED7), - SPH_C64(0xD98C9F469F9F1323), SPH_C64(0x17D3F0E7F0F023FD), - SPH_C64(0x7F6A4A354A4A2094), SPH_C64(0x959EDA4FDADA44A9), - SPH_C64(0x25FA587D5858A2B0), SPH_C64(0xCA06C903C9C9CF8F), - SPH_C64(0x8D5529A429297C52), SPH_C64(0x22500A280A0A5A14), - SPH_C64(0x4FE1B1FEB1B1507F), SPH_C64(0x1A69A0BAA0A0C95D), - SPH_C64(0xDA7F6BB16B6B14D6), SPH_C64(0xAB5C852E8585D917), 
- SPH_C64(0x7381BDCEBDBD3C67), SPH_C64(0x34D25D695D5D8FBA), - SPH_C64(0x5080104010109020), SPH_C64(0x03F3F4F7F4F407F5), - SPH_C64(0xC016CB0BCBCBDD8B), SPH_C64(0xC6ED3EF83E3ED37C), - SPH_C64(0x1128051405052D0A), SPH_C64(0xE61F6781676778CE), - SPH_C64(0x5373E4B7E4E497D5), SPH_C64(0xBB25279C2727024E), - SPH_C64(0x5832411941417382), SPH_C64(0x9D2C8B168B8BA70B), - SPH_C64(0x0151A7A6A7A7F653), SPH_C64(0x94CF7DE97D7DB2FA), - SPH_C64(0xFBDC956E95954937), SPH_C64(0x9F8ED847D8D856AD), - SPH_C64(0x308BFBCBFBFB70EB), SPH_C64(0x7123EE9FEEEECDC1), - SPH_C64(0x91C77CED7C7CBBF8), SPH_C64(0xE3176685666671CC), - SPH_C64(0x8EA6DD53DDDD7BA7), SPH_C64(0x4BB8175C1717AF2E), - SPH_C64(0x460247014747458E), SPH_C64(0xDC849E429E9E1A21), - SPH_C64(0xC51ECA0FCACAD489), SPH_C64(0x99752DB42D2D585A), - SPH_C64(0x7991BFC6BFBF2E63), SPH_C64(0x1B38071C07073F0E), - SPH_C64(0x2301AD8EADADAC47), SPH_C64(0x2FEA5A755A5AB0B4), - SPH_C64(0xB56C83368383EF1B), SPH_C64(0xFF8533CC3333B666), - SPH_C64(0xF23F639163635CC6), SPH_C64(0x0A10020802021204), - SPH_C64(0x3839AA92AAAA9349), SPH_C64(0xA8AF71D97171DEE2), - SPH_C64(0xCF0EC807C8C8C68D), SPH_C64(0x7DC819641919D132), - SPH_C64(0x7072493949493B92), SPH_C64(0x9A86D943D9D95FAF), - SPH_C64(0x1DC3F2EFF2F231F9), SPH_C64(0x484BE3ABE3E3A8DB), - SPH_C64(0x2AE25B715B5BB9B6), SPH_C64(0x9234881A8888BC0D), - SPH_C64(0xC8A49A529A9A3E29), SPH_C64(0xBE2D269826260B4C), - SPH_C64(0xFA8D32C83232BF64), SPH_C64(0x4AE9B0FAB0B0597D), - SPH_C64(0x6A1BE983E9E9F2CF), SPH_C64(0x33780F3C0F0F771E), - SPH_C64(0xA6E6D573D5D533B7), SPH_C64(0xBA74803A8080F41D), - SPH_C64(0x7C99BEC2BEBE2761), SPH_C64(0xDE26CD13CDCDEB87), - SPH_C64(0xE4BD34D034348968), SPH_C64(0x757A483D48483290), - SPH_C64(0x24ABFFDBFFFF54E3), SPH_C64(0x8FF77AF57A7A8DF4), - SPH_C64(0xEAF4907A9090643D), SPH_C64(0x3EC25F615F5F9DBE), - SPH_C64(0xA01D208020203D40), SPH_C64(0xD56768BD68680FD0), - SPH_C64(0x72D01A681A1ACA34), SPH_C64(0x2C19AE82AEAEB741), - SPH_C64(0x5EC9B4EAB4B47D75), SPH_C64(0x199A544D5454CEA8), - 
SPH_C64(0xE5EC937693937F3B), SPH_C64(0xAA0D228822222F44), - SPH_C64(0xE907648D646463C8), SPH_C64(0x12DBF1E3F1F12AFF), - SPH_C64(0xA2BF73D17373CCE6), SPH_C64(0x5A90124812128224), - SPH_C64(0x5D3A401D40407A80), SPH_C64(0x2840082008084810), - SPH_C64(0xE856C32BC3C3959B), SPH_C64(0x7B33EC97ECECDFC5), - SPH_C64(0x9096DB4BDBDB4DAB), SPH_C64(0x1F61A1BEA1A1C05F), - SPH_C64(0x831C8D0E8D8D9107), SPH_C64(0xC9F53DF43D3DC87A), - SPH_C64(0xF1CC976697975B33), SPH_C64(0x0000000000000000), - SPH_C64(0xD436CF1BCFCFF983), SPH_C64(0x87452BAC2B2B6E56), - SPH_C64(0xB39776C57676E1EC), SPH_C64(0xB06482328282E619), - SPH_C64(0xA9FED67FD6D628B1), SPH_C64(0x77D81B6C1B1BC336), - SPH_C64(0x5BC1B5EEB5B57477), SPH_C64(0x2911AF86AFAFBE43), - SPH_C64(0xDF776AB56A6A1DD4), SPH_C64(0x0DBA505D5050EAA0), - SPH_C64(0x4C1245094545578A), SPH_C64(0x18CBF3EBF3F338FB), - SPH_C64(0xF09D30C03030AD60), SPH_C64(0x742BEF9BEFEFC4C3), - SPH_C64(0xC3E53FFC3F3FDA7E), SPH_C64(0x1C9255495555C7AA), - SPH_C64(0x1079A2B2A2A2DB59), SPH_C64(0x6503EA8FEAEAE9C9), - SPH_C64(0xEC0F658965656ACA), SPH_C64(0x68B9BAD2BABA0369), - SPH_C64(0x93652FBC2F2F4A5E), SPH_C64(0xE74EC027C0C08E9D), - SPH_C64(0x81BEDE5FDEDE60A1), SPH_C64(0x6CE01C701C1CFC38), - SPH_C64(0x2EBBFDD3FDFD46E7), SPH_C64(0x64524D294D4D1F9A), - SPH_C64(0xE0E4927292927639), SPH_C64(0xBC8F75C97575FAEA), - SPH_C64(0x1E3006180606360C), SPH_C64(0x98248A128A8AAE09), - SPH_C64(0x40F9B2F2B2B24B79), SPH_C64(0x5963E6BFE6E685D1), - SPH_C64(0x36700E380E0E7E1C), SPH_C64(0x63F81F7C1F1FE73E), - SPH_C64(0xF7376295626255C4), SPH_C64(0xA3EED477D4D43AB5), - SPH_C64(0x3229A89AA8A8814D), SPH_C64(0xF4C4966296965231), - SPH_C64(0x3A9BF9C3F9F962EF), SPH_C64(0xF666C533C5C5A397), - SPH_C64(0xB13525942525104A), SPH_C64(0x20F259795959ABB2), - SPH_C64(0xAE54842A8484D015), SPH_C64(0xA7B772D57272C5E4), - SPH_C64(0xDDD539E43939EC72), SPH_C64(0x615A4C2D4C4C1698), - SPH_C64(0x3BCA5E655E5E94BC), SPH_C64(0x85E778FD78789FF0), - SPH_C64(0xD8DD38E03838E570), SPH_C64(0x86148C0A8C8C9805), - 
SPH_C64(0xB2C6D163D1D117BF), SPH_C64(0x0B41A5AEA5A5E457), - SPH_C64(0x4D43E2AFE2E2A1D9), SPH_C64(0xF82F619961614EC2), - SPH_C64(0x45F1B3F6B3B3427B), SPH_C64(0xA515218421213442), - SPH_C64(0xD6949C4A9C9C0825), SPH_C64(0x66F01E781E1EEE3C), - SPH_C64(0x5222431143436186), SPH_C64(0xFC76C73BC7C7B193), - SPH_C64(0x2BB3FCD7FCFC4FE5), SPH_C64(0x1420041004042408), - SPH_C64(0x08B251595151E3A2), SPH_C64(0xC7BC995E9999252F), - SPH_C64(0xC44F6DA96D6D22DA), SPH_C64(0x39680D340D0D651A), - SPH_C64(0x3583FACFFAFA79E9), SPH_C64(0x84B6DF5BDFDF69A3), - SPH_C64(0x9BD77EE57E7EA9FC), SPH_C64(0xB43D249024241948), - SPH_C64(0xD7C53BEC3B3BFE76), SPH_C64(0x3D31AB96ABAB9A4B), - SPH_C64(0xD13ECE1FCECEF081), SPH_C64(0x5588114411119922), - SPH_C64(0x890C8F068F8F8303), SPH_C64(0x6B4A4E254E4E049C), - SPH_C64(0x51D1B7E6B7B76673), SPH_C64(0x600BEB8BEBEBE0CB), - SPH_C64(0xCCFD3CF03C3CC178), SPH_C64(0xBF7C813E8181FD1F), - SPH_C64(0xFED4946A94944035), SPH_C64(0x0CEBF7FBF7F71CF3), - SPH_C64(0x67A1B9DEB9B9186F), SPH_C64(0x5F98134C13138B26), - SPH_C64(0x9C7D2CB02C2C5158), SPH_C64(0xB8D6D36BD3D305BB), - SPH_C64(0x5C6BE7BBE7E78CD3), SPH_C64(0xCB576EA56E6E39DC), - SPH_C64(0xF36EC437C4C4AA95), SPH_C64(0x0F18030C03031B06), - SPH_C64(0x138A56455656DCAC), SPH_C64(0x491A440D44445E88), - SPH_C64(0x9EDF7FE17F7FA0FE), SPH_C64(0x3721A99EA9A9884F), - SPH_C64(0x824D2AA82A2A6754), SPH_C64(0x6DB1BBD6BBBB0A6B), - SPH_C64(0xE246C123C1C1879F), SPH_C64(0x02A253515353F1A6), - SPH_C64(0x8BAEDC57DCDC72A5), SPH_C64(0x27580B2C0B0B5316), - SPH_C64(0xD39C9D4E9D9D0127), SPH_C64(0xC1476CAD6C6C2BD8), - SPH_C64(0xF59531C43131A462), SPH_C64(0xB98774CD7474F3E8), - SPH_C64(0x09E3F6FFF6F615F1), SPH_C64(0x430A460546464C8C), - SPH_C64(0x2609AC8AACACA545), SPH_C64(0x973C891E8989B50F), - SPH_C64(0x44A014501414B428), SPH_C64(0x425BE1A3E1E1BADF), - SPH_C64(0x4EB016581616A62C), SPH_C64(0xD2CD3AE83A3AF774), - SPH_C64(0xD06F69B9696906D2), SPH_C64(0x2D48092409094112), - SPH_C64(0xADA770DD7070D7E0), SPH_C64(0x54D9B6E2B6B66F71), - 
SPH_C64(0xB7CED067D0D01EBD), SPH_C64(0x7E3BED93EDEDD6C7), - SPH_C64(0xDB2ECC17CCCCE285), SPH_C64(0x572A421542426884), - SPH_C64(0xC2B4985A98982C2D), SPH_C64(0x0E49A4AAA4A4ED55), - SPH_C64(0x885D28A028287550), SPH_C64(0x31DA5C6D5C5C86B8), - SPH_C64(0x3F93F8C7F8F86BED), SPH_C64(0xA44486228686C211) -}; - -static const long long int plain_T3[256] = { - SPH_C64(0xC018601818D83078), SPH_C64(0x05238C23232646AF), - SPH_C64(0x7EC63FC6C6B891F9), SPH_C64(0x13E887E8E8FBCD6F), - SPH_C64(0x4C87268787CB13A1), SPH_C64(0xA9B8DAB8B8116D62), - SPH_C64(0x0801040101090205), SPH_C64(0x424F214F4F0D9E6E), - SPH_C64(0xAD36D836369B6CEE), SPH_C64(0x59A6A2A6A6FF5104), - SPH_C64(0xDED26FD2D20CB9BD), SPH_C64(0xFBF5F3F5F50EF706), - SPH_C64(0xEF79F9797996F280), SPH_C64(0x5F6FA16F6F30DECE), - SPH_C64(0xFC917E91916D3FEF), SPH_C64(0xAA52555252F8A407), - SPH_C64(0x27609D606047C0FD), SPH_C64(0x89BCCABCBC356576), - SPH_C64(0xAC9B569B9B372BCD), SPH_C64(0x048E028E8E8A018C), - SPH_C64(0x71A3B6A3A3D25B15), SPH_C64(0x600C300C0C6C183C), - SPH_C64(0xFF7BF17B7B84F68A), SPH_C64(0xB535D43535806AE1), - SPH_C64(0xE81D741D1DF53A69), SPH_C64(0x53E0A7E0E0B3DD47), - SPH_C64(0xF6D77BD7D721B3AC), SPH_C64(0x5EC22FC2C29C99ED), - SPH_C64(0x6D2EB82E2E435C96), SPH_C64(0x624B314B4B29967A), - SPH_C64(0xA3FEDFFEFE5DE121), SPH_C64(0x8257415757D5AE16), - SPH_C64(0xA815541515BD2A41), SPH_C64(0x9F77C17777E8EEB6), - SPH_C64(0xA537DC3737926EEB), SPH_C64(0x7BE5B3E5E59ED756), - SPH_C64(0x8C9F469F9F1323D9), SPH_C64(0xD3F0E7F0F023FD17), - SPH_C64(0x6A4A354A4A20947F), SPH_C64(0x9EDA4FDADA44A995), - SPH_C64(0xFA587D5858A2B025), SPH_C64(0x06C903C9C9CF8FCA), - SPH_C64(0x5529A429297C528D), SPH_C64(0x500A280A0A5A1422), - SPH_C64(0xE1B1FEB1B1507F4F), SPH_C64(0x69A0BAA0A0C95D1A), - SPH_C64(0x7F6BB16B6B14D6DA), SPH_C64(0x5C852E8585D917AB), - SPH_C64(0x81BDCEBDBD3C6773), SPH_C64(0xD25D695D5D8FBA34), - SPH_C64(0x8010401010902050), SPH_C64(0xF3F4F7F4F407F503), - SPH_C64(0x16CB0BCBCBDD8BC0), SPH_C64(0xED3EF83E3ED37CC6), - SPH_C64(0x28051405052D0A11), 
SPH_C64(0x1F6781676778CEE6), - SPH_C64(0x73E4B7E4E497D553), SPH_C64(0x25279C2727024EBB), - SPH_C64(0x3241194141738258), SPH_C64(0x2C8B168B8BA70B9D), - SPH_C64(0x51A7A6A7A7F65301), SPH_C64(0xCF7DE97D7DB2FA94), - SPH_C64(0xDC956E95954937FB), SPH_C64(0x8ED847D8D856AD9F), - SPH_C64(0x8BFBCBFBFB70EB30), SPH_C64(0x23EE9FEEEECDC171), - SPH_C64(0xC77CED7C7CBBF891), SPH_C64(0x176685666671CCE3), - SPH_C64(0xA6DD53DDDD7BA78E), SPH_C64(0xB8175C1717AF2E4B), - SPH_C64(0x0247014747458E46), SPH_C64(0x849E429E9E1A21DC), - SPH_C64(0x1ECA0FCACAD489C5), SPH_C64(0x752DB42D2D585A99), - SPH_C64(0x91BFC6BFBF2E6379), SPH_C64(0x38071C07073F0E1B), - SPH_C64(0x01AD8EADADAC4723), SPH_C64(0xEA5A755A5AB0B42F), - SPH_C64(0x6C83368383EF1BB5), SPH_C64(0x8533CC3333B666FF), - SPH_C64(0x3F639163635CC6F2), SPH_C64(0x100208020212040A), - SPH_C64(0x39AA92AAAA934938), SPH_C64(0xAF71D97171DEE2A8), - SPH_C64(0x0EC807C8C8C68DCF), SPH_C64(0xC819641919D1327D), - SPH_C64(0x72493949493B9270), SPH_C64(0x86D943D9D95FAF9A), - SPH_C64(0xC3F2EFF2F231F91D), SPH_C64(0x4BE3ABE3E3A8DB48), - SPH_C64(0xE25B715B5BB9B62A), SPH_C64(0x34881A8888BC0D92), - SPH_C64(0xA49A529A9A3E29C8), SPH_C64(0x2D269826260B4CBE), - SPH_C64(0x8D32C83232BF64FA), SPH_C64(0xE9B0FAB0B0597D4A), - SPH_C64(0x1BE983E9E9F2CF6A), SPH_C64(0x780F3C0F0F771E33), - SPH_C64(0xE6D573D5D533B7A6), SPH_C64(0x74803A8080F41DBA), - SPH_C64(0x99BEC2BEBE27617C), SPH_C64(0x26CD13CDCDEB87DE), - SPH_C64(0xBD34D034348968E4), SPH_C64(0x7A483D4848329075), - SPH_C64(0xABFFDBFFFF54E324), SPH_C64(0xF77AF57A7A8DF48F), - SPH_C64(0xF4907A9090643DEA), SPH_C64(0xC25F615F5F9DBE3E), - SPH_C64(0x1D208020203D40A0), SPH_C64(0x6768BD68680FD0D5), - SPH_C64(0xD01A681A1ACA3472), SPH_C64(0x19AE82AEAEB7412C), - SPH_C64(0xC9B4EAB4B47D755E), SPH_C64(0x9A544D5454CEA819), - SPH_C64(0xEC937693937F3BE5), SPH_C64(0x0D228822222F44AA), - SPH_C64(0x07648D646463C8E9), SPH_C64(0xDBF1E3F1F12AFF12), - SPH_C64(0xBF73D17373CCE6A2), SPH_C64(0x901248121282245A), - SPH_C64(0x3A401D40407A805D), 
SPH_C64(0x4008200808481028), - SPH_C64(0x56C32BC3C3959BE8), SPH_C64(0x33EC97ECECDFC57B), - SPH_C64(0x96DB4BDBDB4DAB90), SPH_C64(0x61A1BEA1A1C05F1F), - SPH_C64(0x1C8D0E8D8D910783), SPH_C64(0xF53DF43D3DC87AC9), - SPH_C64(0xCC976697975B33F1), SPH_C64(0x0000000000000000), - SPH_C64(0x36CF1BCFCFF983D4), SPH_C64(0x452BAC2B2B6E5687), - SPH_C64(0x9776C57676E1ECB3), SPH_C64(0x6482328282E619B0), - SPH_C64(0xFED67FD6D628B1A9), SPH_C64(0xD81B6C1B1BC33677), - SPH_C64(0xC1B5EEB5B574775B), SPH_C64(0x11AF86AFAFBE4329), - SPH_C64(0x776AB56A6A1DD4DF), SPH_C64(0xBA505D5050EAA00D), - SPH_C64(0x1245094545578A4C), SPH_C64(0xCBF3EBF3F338FB18), - SPH_C64(0x9D30C03030AD60F0), SPH_C64(0x2BEF9BEFEFC4C374), - SPH_C64(0xE53FFC3F3FDA7EC3), SPH_C64(0x9255495555C7AA1C), - SPH_C64(0x79A2B2A2A2DB5910), SPH_C64(0x03EA8FEAEAE9C965), - SPH_C64(0x0F658965656ACAEC), SPH_C64(0xB9BAD2BABA036968), - SPH_C64(0x652FBC2F2F4A5E93), SPH_C64(0x4EC027C0C08E9DE7), - SPH_C64(0xBEDE5FDEDE60A181), SPH_C64(0xE01C701C1CFC386C), - SPH_C64(0xBBFDD3FDFD46E72E), SPH_C64(0x524D294D4D1F9A64), - SPH_C64(0xE4927292927639E0), SPH_C64(0x8F75C97575FAEABC), - SPH_C64(0x3006180606360C1E), SPH_C64(0x248A128A8AAE0998), - SPH_C64(0xF9B2F2B2B24B7940), SPH_C64(0x63E6BFE6E685D159), - SPH_C64(0x700E380E0E7E1C36), SPH_C64(0xF81F7C1F1FE73E63), - SPH_C64(0x376295626255C4F7), SPH_C64(0xEED477D4D43AB5A3), - SPH_C64(0x29A89AA8A8814D32), SPH_C64(0xC4966296965231F4), - SPH_C64(0x9BF9C3F9F962EF3A), SPH_C64(0x66C533C5C5A397F6), - SPH_C64(0x3525942525104AB1), SPH_C64(0xF259795959ABB220), - SPH_C64(0x54842A8484D015AE), SPH_C64(0xB772D57272C5E4A7), - SPH_C64(0xD539E43939EC72DD), SPH_C64(0x5A4C2D4C4C169861), - SPH_C64(0xCA5E655E5E94BC3B), SPH_C64(0xE778FD78789FF085), - SPH_C64(0xDD38E03838E570D8), SPH_C64(0x148C0A8C8C980586), - SPH_C64(0xC6D163D1D117BFB2), SPH_C64(0x41A5AEA5A5E4570B), - SPH_C64(0x43E2AFE2E2A1D94D), SPH_C64(0x2F619961614EC2F8), - SPH_C64(0xF1B3F6B3B3427B45), SPH_C64(0x15218421213442A5), - SPH_C64(0x949C4A9C9C0825D6), 
SPH_C64(0xF01E781E1EEE3C66), - SPH_C64(0x2243114343618652), SPH_C64(0x76C73BC7C7B193FC), - SPH_C64(0xB3FCD7FCFC4FE52B), SPH_C64(0x2004100404240814), - SPH_C64(0xB251595151E3A208), SPH_C64(0xBC995E9999252FC7), - SPH_C64(0x4F6DA96D6D22DAC4), SPH_C64(0x680D340D0D651A39), - SPH_C64(0x83FACFFAFA79E935), SPH_C64(0xB6DF5BDFDF69A384), - SPH_C64(0xD77EE57E7EA9FC9B), SPH_C64(0x3D249024241948B4), - SPH_C64(0xC53BEC3B3BFE76D7), SPH_C64(0x31AB96ABAB9A4B3D), - SPH_C64(0x3ECE1FCECEF081D1), SPH_C64(0x8811441111992255), - SPH_C64(0x0C8F068F8F830389), SPH_C64(0x4A4E254E4E049C6B), - SPH_C64(0xD1B7E6B7B7667351), SPH_C64(0x0BEB8BEBEBE0CB60), - SPH_C64(0xFD3CF03C3CC178CC), SPH_C64(0x7C813E8181FD1FBF), - SPH_C64(0xD4946A94944035FE), SPH_C64(0xEBF7FBF7F71CF30C), - SPH_C64(0xA1B9DEB9B9186F67), SPH_C64(0x98134C13138B265F), - SPH_C64(0x7D2CB02C2C51589C), SPH_C64(0xD6D36BD3D305BBB8), - SPH_C64(0x6BE7BBE7E78CD35C), SPH_C64(0x576EA56E6E39DCCB), - SPH_C64(0x6EC437C4C4AA95F3), SPH_C64(0x18030C03031B060F), - SPH_C64(0x8A56455656DCAC13), SPH_C64(0x1A440D44445E8849), - SPH_C64(0xDF7FE17F7FA0FE9E), SPH_C64(0x21A99EA9A9884F37), - SPH_C64(0x4D2AA82A2A675482), SPH_C64(0xB1BBD6BBBB0A6B6D), - SPH_C64(0x46C123C1C1879FE2), SPH_C64(0xA253515353F1A602), - SPH_C64(0xAEDC57DCDC72A58B), SPH_C64(0x580B2C0B0B531627), - SPH_C64(0x9C9D4E9D9D0127D3), SPH_C64(0x476CAD6C6C2BD8C1), - SPH_C64(0x9531C43131A462F5), SPH_C64(0x8774CD7474F3E8B9), - SPH_C64(0xE3F6FFF6F615F109), SPH_C64(0x0A460546464C8C43), - SPH_C64(0x09AC8AACACA54526), SPH_C64(0x3C891E8989B50F97), - SPH_C64(0xA014501414B42844), SPH_C64(0x5BE1A3E1E1BADF42), - SPH_C64(0xB016581616A62C4E), SPH_C64(0xCD3AE83A3AF774D2), - SPH_C64(0x6F69B9696906D2D0), SPH_C64(0x480924090941122D), - SPH_C64(0xA770DD7070D7E0AD), SPH_C64(0xD9B6E2B6B66F7154), - SPH_C64(0xCED067D0D01EBDB7), SPH_C64(0x3BED93EDEDD6C77E), - SPH_C64(0x2ECC17CCCCE285DB), SPH_C64(0x2A42154242688457), - SPH_C64(0xB4985A98982C2DC2), SPH_C64(0x49A4AAA4A4ED550E), - SPH_C64(0x5D28A02828755088), 
SPH_C64(0xDA5C6D5C5C86B831), - SPH_C64(0x93F8C7F8F86BED3F), SPH_C64(0x4486228686C211A4) -}; - -static const long long int plain_T4[256] = { - SPH_C64(0x18601818D83078C0), SPH_C64(0x238C23232646AF05), - SPH_C64(0xC63FC6C6B891F97E), SPH_C64(0xE887E8E8FBCD6F13), - SPH_C64(0x87268787CB13A14C), SPH_C64(0xB8DAB8B8116D62A9), - SPH_C64(0x0104010109020508), SPH_C64(0x4F214F4F0D9E6E42), - SPH_C64(0x36D836369B6CEEAD), SPH_C64(0xA6A2A6A6FF510459), - SPH_C64(0xD26FD2D20CB9BDDE), SPH_C64(0xF5F3F5F50EF706FB), - SPH_C64(0x79F9797996F280EF), SPH_C64(0x6FA16F6F30DECE5F), - SPH_C64(0x917E91916D3FEFFC), SPH_C64(0x52555252F8A407AA), - SPH_C64(0x609D606047C0FD27), SPH_C64(0xBCCABCBC35657689), - SPH_C64(0x9B569B9B372BCDAC), SPH_C64(0x8E028E8E8A018C04), - SPH_C64(0xA3B6A3A3D25B1571), SPH_C64(0x0C300C0C6C183C60), - SPH_C64(0x7BF17B7B84F68AFF), SPH_C64(0x35D43535806AE1B5), - SPH_C64(0x1D741D1DF53A69E8), SPH_C64(0xE0A7E0E0B3DD4753), - SPH_C64(0xD77BD7D721B3ACF6), SPH_C64(0xC22FC2C29C99ED5E), - SPH_C64(0x2EB82E2E435C966D), SPH_C64(0x4B314B4B29967A62), - SPH_C64(0xFEDFFEFE5DE121A3), SPH_C64(0x57415757D5AE1682), - SPH_C64(0x15541515BD2A41A8), SPH_C64(0x77C17777E8EEB69F), - SPH_C64(0x37DC3737926EEBA5), SPH_C64(0xE5B3E5E59ED7567B), - SPH_C64(0x9F469F9F1323D98C), SPH_C64(0xF0E7F0F023FD17D3), - SPH_C64(0x4A354A4A20947F6A), SPH_C64(0xDA4FDADA44A9959E), - SPH_C64(0x587D5858A2B025FA), SPH_C64(0xC903C9C9CF8FCA06), - SPH_C64(0x29A429297C528D55), SPH_C64(0x0A280A0A5A142250), - SPH_C64(0xB1FEB1B1507F4FE1), SPH_C64(0xA0BAA0A0C95D1A69), - SPH_C64(0x6BB16B6B14D6DA7F), SPH_C64(0x852E8585D917AB5C), - SPH_C64(0xBDCEBDBD3C677381), SPH_C64(0x5D695D5D8FBA34D2), - SPH_C64(0x1040101090205080), SPH_C64(0xF4F7F4F407F503F3), - SPH_C64(0xCB0BCBCBDD8BC016), SPH_C64(0x3EF83E3ED37CC6ED), - SPH_C64(0x051405052D0A1128), SPH_C64(0x6781676778CEE61F), - SPH_C64(0xE4B7E4E497D55373), SPH_C64(0x279C2727024EBB25), - SPH_C64(0x4119414173825832), SPH_C64(0x8B168B8BA70B9D2C), - SPH_C64(0xA7A6A7A7F6530151), SPH_C64(0x7DE97D7DB2FA94CF), 
- SPH_C64(0x956E95954937FBDC), SPH_C64(0xD847D8D856AD9F8E), - SPH_C64(0xFBCBFBFB70EB308B), SPH_C64(0xEE9FEEEECDC17123), - SPH_C64(0x7CED7C7CBBF891C7), SPH_C64(0x6685666671CCE317), - SPH_C64(0xDD53DDDD7BA78EA6), SPH_C64(0x175C1717AF2E4BB8), - SPH_C64(0x47014747458E4602), SPH_C64(0x9E429E9E1A21DC84), - SPH_C64(0xCA0FCACAD489C51E), SPH_C64(0x2DB42D2D585A9975), - SPH_C64(0xBFC6BFBF2E637991), SPH_C64(0x071C07073F0E1B38), - SPH_C64(0xAD8EADADAC472301), SPH_C64(0x5A755A5AB0B42FEA), - SPH_C64(0x83368383EF1BB56C), SPH_C64(0x33CC3333B666FF85), - SPH_C64(0x639163635CC6F23F), SPH_C64(0x0208020212040A10), - SPH_C64(0xAA92AAAA93493839), SPH_C64(0x71D97171DEE2A8AF), - SPH_C64(0xC807C8C8C68DCF0E), SPH_C64(0x19641919D1327DC8), - SPH_C64(0x493949493B927072), SPH_C64(0xD943D9D95FAF9A86), - SPH_C64(0xF2EFF2F231F91DC3), SPH_C64(0xE3ABE3E3A8DB484B), - SPH_C64(0x5B715B5BB9B62AE2), SPH_C64(0x881A8888BC0D9234), - SPH_C64(0x9A529A9A3E29C8A4), SPH_C64(0x269826260B4CBE2D), - SPH_C64(0x32C83232BF64FA8D), SPH_C64(0xB0FAB0B0597D4AE9), - SPH_C64(0xE983E9E9F2CF6A1B), SPH_C64(0x0F3C0F0F771E3378), - SPH_C64(0xD573D5D533B7A6E6), SPH_C64(0x803A8080F41DBA74), - SPH_C64(0xBEC2BEBE27617C99), SPH_C64(0xCD13CDCDEB87DE26), - SPH_C64(0x34D034348968E4BD), SPH_C64(0x483D48483290757A), - SPH_C64(0xFFDBFFFF54E324AB), SPH_C64(0x7AF57A7A8DF48FF7), - SPH_C64(0x907A9090643DEAF4), SPH_C64(0x5F615F5F9DBE3EC2), - SPH_C64(0x208020203D40A01D), SPH_C64(0x68BD68680FD0D567), - SPH_C64(0x1A681A1ACA3472D0), SPH_C64(0xAE82AEAEB7412C19), - SPH_C64(0xB4EAB4B47D755EC9), SPH_C64(0x544D5454CEA8199A), - SPH_C64(0x937693937F3BE5EC), SPH_C64(0x228822222F44AA0D), - SPH_C64(0x648D646463C8E907), SPH_C64(0xF1E3F1F12AFF12DB), - SPH_C64(0x73D17373CCE6A2BF), SPH_C64(0x1248121282245A90), - SPH_C64(0x401D40407A805D3A), SPH_C64(0x0820080848102840), - SPH_C64(0xC32BC3C3959BE856), SPH_C64(0xEC97ECECDFC57B33), - SPH_C64(0xDB4BDBDB4DAB9096), SPH_C64(0xA1BEA1A1C05F1F61), - SPH_C64(0x8D0E8D8D9107831C), SPH_C64(0x3DF43D3DC87AC9F5), - 
SPH_C64(0x976697975B33F1CC), SPH_C64(0x0000000000000000), - SPH_C64(0xCF1BCFCFF983D436), SPH_C64(0x2BAC2B2B6E568745), - SPH_C64(0x76C57676E1ECB397), SPH_C64(0x82328282E619B064), - SPH_C64(0xD67FD6D628B1A9FE), SPH_C64(0x1B6C1B1BC33677D8), - SPH_C64(0xB5EEB5B574775BC1), SPH_C64(0xAF86AFAFBE432911), - SPH_C64(0x6AB56A6A1DD4DF77), SPH_C64(0x505D5050EAA00DBA), - SPH_C64(0x45094545578A4C12), SPH_C64(0xF3EBF3F338FB18CB), - SPH_C64(0x30C03030AD60F09D), SPH_C64(0xEF9BEFEFC4C3742B), - SPH_C64(0x3FFC3F3FDA7EC3E5), SPH_C64(0x55495555C7AA1C92), - SPH_C64(0xA2B2A2A2DB591079), SPH_C64(0xEA8FEAEAE9C96503), - SPH_C64(0x658965656ACAEC0F), SPH_C64(0xBAD2BABA036968B9), - SPH_C64(0x2FBC2F2F4A5E9365), SPH_C64(0xC027C0C08E9DE74E), - SPH_C64(0xDE5FDEDE60A181BE), SPH_C64(0x1C701C1CFC386CE0), - SPH_C64(0xFDD3FDFD46E72EBB), SPH_C64(0x4D294D4D1F9A6452), - SPH_C64(0x927292927639E0E4), SPH_C64(0x75C97575FAEABC8F), - SPH_C64(0x06180606360C1E30), SPH_C64(0x8A128A8AAE099824), - SPH_C64(0xB2F2B2B24B7940F9), SPH_C64(0xE6BFE6E685D15963), - SPH_C64(0x0E380E0E7E1C3670), SPH_C64(0x1F7C1F1FE73E63F8), - SPH_C64(0x6295626255C4F737), SPH_C64(0xD477D4D43AB5A3EE), - SPH_C64(0xA89AA8A8814D3229), SPH_C64(0x966296965231F4C4), - SPH_C64(0xF9C3F9F962EF3A9B), SPH_C64(0xC533C5C5A397F666), - SPH_C64(0x25942525104AB135), SPH_C64(0x59795959ABB220F2), - SPH_C64(0x842A8484D015AE54), SPH_C64(0x72D57272C5E4A7B7), - SPH_C64(0x39E43939EC72DDD5), SPH_C64(0x4C2D4C4C1698615A), - SPH_C64(0x5E655E5E94BC3BCA), SPH_C64(0x78FD78789FF085E7), - SPH_C64(0x38E03838E570D8DD), SPH_C64(0x8C0A8C8C98058614), - SPH_C64(0xD163D1D117BFB2C6), SPH_C64(0xA5AEA5A5E4570B41), - SPH_C64(0xE2AFE2E2A1D94D43), SPH_C64(0x619961614EC2F82F), - SPH_C64(0xB3F6B3B3427B45F1), SPH_C64(0x218421213442A515), - SPH_C64(0x9C4A9C9C0825D694), SPH_C64(0x1E781E1EEE3C66F0), - SPH_C64(0x4311434361865222), SPH_C64(0xC73BC7C7B193FC76), - SPH_C64(0xFCD7FCFC4FE52BB3), SPH_C64(0x0410040424081420), - SPH_C64(0x51595151E3A208B2), SPH_C64(0x995E9999252FC7BC), - 
SPH_C64(0x6DA96D6D22DAC44F), SPH_C64(0x0D340D0D651A3968), - SPH_C64(0xFACFFAFA79E93583), SPH_C64(0xDF5BDFDF69A384B6), - SPH_C64(0x7EE57E7EA9FC9BD7), SPH_C64(0x249024241948B43D), - SPH_C64(0x3BEC3B3BFE76D7C5), SPH_C64(0xAB96ABAB9A4B3D31), - SPH_C64(0xCE1FCECEF081D13E), SPH_C64(0x1144111199225588), - SPH_C64(0x8F068F8F8303890C), SPH_C64(0x4E254E4E049C6B4A), - SPH_C64(0xB7E6B7B7667351D1), SPH_C64(0xEB8BEBEBE0CB600B), - SPH_C64(0x3CF03C3CC178CCFD), SPH_C64(0x813E8181FD1FBF7C), - SPH_C64(0x946A94944035FED4), SPH_C64(0xF7FBF7F71CF30CEB), - SPH_C64(0xB9DEB9B9186F67A1), SPH_C64(0x134C13138B265F98), - SPH_C64(0x2CB02C2C51589C7D), SPH_C64(0xD36BD3D305BBB8D6), - SPH_C64(0xE7BBE7E78CD35C6B), SPH_C64(0x6EA56E6E39DCCB57), - SPH_C64(0xC437C4C4AA95F36E), SPH_C64(0x030C03031B060F18), - SPH_C64(0x56455656DCAC138A), SPH_C64(0x440D44445E88491A), - SPH_C64(0x7FE17F7FA0FE9EDF), SPH_C64(0xA99EA9A9884F3721), - SPH_C64(0x2AA82A2A6754824D), SPH_C64(0xBBD6BBBB0A6B6DB1), - SPH_C64(0xC123C1C1879FE246), SPH_C64(0x53515353F1A602A2), - SPH_C64(0xDC57DCDC72A58BAE), SPH_C64(0x0B2C0B0B53162758), - SPH_C64(0x9D4E9D9D0127D39C), SPH_C64(0x6CAD6C6C2BD8C147), - SPH_C64(0x31C43131A462F595), SPH_C64(0x74CD7474F3E8B987), - SPH_C64(0xF6FFF6F615F109E3), SPH_C64(0x460546464C8C430A), - SPH_C64(0xAC8AACACA5452609), SPH_C64(0x891E8989B50F973C), - SPH_C64(0x14501414B42844A0), SPH_C64(0xE1A3E1E1BADF425B), - SPH_C64(0x16581616A62C4EB0), SPH_C64(0x3AE83A3AF774D2CD), - SPH_C64(0x69B9696906D2D06F), SPH_C64(0x0924090941122D48), - SPH_C64(0x70DD7070D7E0ADA7), SPH_C64(0xB6E2B6B66F7154D9), - SPH_C64(0xD067D0D01EBDB7CE), SPH_C64(0xED93EDEDD6C77E3B), - SPH_C64(0xCC17CCCCE285DB2E), SPH_C64(0x421542426884572A), - SPH_C64(0x985A98982C2DC2B4), SPH_C64(0xA4AAA4A4ED550E49), - SPH_C64(0x28A028287550885D), SPH_C64(0x5C6D5C5C86B831DA), - SPH_C64(0xF8C7F8F86BED3F93), SPH_C64(0x86228686C211A444) -}; - -static const long long int plain_T5[256] = { - SPH_C64(0x601818D83078C018), SPH_C64(0x8C23232646AF0523), - SPH_C64(0x3FC6C6B891F97EC6), 
SPH_C64(0x87E8E8FBCD6F13E8), - SPH_C64(0x268787CB13A14C87), SPH_C64(0xDAB8B8116D62A9B8), - SPH_C64(0x0401010902050801), SPH_C64(0x214F4F0D9E6E424F), - SPH_C64(0xD836369B6CEEAD36), SPH_C64(0xA2A6A6FF510459A6), - SPH_C64(0x6FD2D20CB9BDDED2), SPH_C64(0xF3F5F50EF706FBF5), - SPH_C64(0xF9797996F280EF79), SPH_C64(0xA16F6F30DECE5F6F), - SPH_C64(0x7E91916D3FEFFC91), SPH_C64(0x555252F8A407AA52), - SPH_C64(0x9D606047C0FD2760), SPH_C64(0xCABCBC35657689BC), - SPH_C64(0x569B9B372BCDAC9B), SPH_C64(0x028E8E8A018C048E), - SPH_C64(0xB6A3A3D25B1571A3), SPH_C64(0x300C0C6C183C600C), - SPH_C64(0xF17B7B84F68AFF7B), SPH_C64(0xD43535806AE1B535), - SPH_C64(0x741D1DF53A69E81D), SPH_C64(0xA7E0E0B3DD4753E0), - SPH_C64(0x7BD7D721B3ACF6D7), SPH_C64(0x2FC2C29C99ED5EC2), - SPH_C64(0xB82E2E435C966D2E), SPH_C64(0x314B4B29967A624B), - SPH_C64(0xDFFEFE5DE121A3FE), SPH_C64(0x415757D5AE168257), - SPH_C64(0x541515BD2A41A815), SPH_C64(0xC17777E8EEB69F77), - SPH_C64(0xDC3737926EEBA537), SPH_C64(0xB3E5E59ED7567BE5), - SPH_C64(0x469F9F1323D98C9F), SPH_C64(0xE7F0F023FD17D3F0), - SPH_C64(0x354A4A20947F6A4A), SPH_C64(0x4FDADA44A9959EDA), - SPH_C64(0x7D5858A2B025FA58), SPH_C64(0x03C9C9CF8FCA06C9), - SPH_C64(0xA429297C528D5529), SPH_C64(0x280A0A5A1422500A), - SPH_C64(0xFEB1B1507F4FE1B1), SPH_C64(0xBAA0A0C95D1A69A0), - SPH_C64(0xB16B6B14D6DA7F6B), SPH_C64(0x2E8585D917AB5C85), - SPH_C64(0xCEBDBD3C677381BD), SPH_C64(0x695D5D8FBA34D25D), - SPH_C64(0x4010109020508010), SPH_C64(0xF7F4F407F503F3F4), - SPH_C64(0x0BCBCBDD8BC016CB), SPH_C64(0xF83E3ED37CC6ED3E), - SPH_C64(0x1405052D0A112805), SPH_C64(0x81676778CEE61F67), - SPH_C64(0xB7E4E497D55373E4), SPH_C64(0x9C2727024EBB2527), - SPH_C64(0x1941417382583241), SPH_C64(0x168B8BA70B9D2C8B), - SPH_C64(0xA6A7A7F6530151A7), SPH_C64(0xE97D7DB2FA94CF7D), - SPH_C64(0x6E95954937FBDC95), SPH_C64(0x47D8D856AD9F8ED8), - SPH_C64(0xCBFBFB70EB308BFB), SPH_C64(0x9FEEEECDC17123EE), - SPH_C64(0xED7C7CBBF891C77C), SPH_C64(0x85666671CCE31766), - SPH_C64(0x53DDDD7BA78EA6DD), 
SPH_C64(0x5C1717AF2E4BB817), - SPH_C64(0x014747458E460247), SPH_C64(0x429E9E1A21DC849E), - SPH_C64(0x0FCACAD489C51ECA), SPH_C64(0xB42D2D585A99752D), - SPH_C64(0xC6BFBF2E637991BF), SPH_C64(0x1C07073F0E1B3807), - SPH_C64(0x8EADADAC472301AD), SPH_C64(0x755A5AB0B42FEA5A), - SPH_C64(0x368383EF1BB56C83), SPH_C64(0xCC3333B666FF8533), - SPH_C64(0x9163635CC6F23F63), SPH_C64(0x08020212040A1002), - SPH_C64(0x92AAAA93493839AA), SPH_C64(0xD97171DEE2A8AF71), - SPH_C64(0x07C8C8C68DCF0EC8), SPH_C64(0x641919D1327DC819), - SPH_C64(0x3949493B92707249), SPH_C64(0x43D9D95FAF9A86D9), - SPH_C64(0xEFF2F231F91DC3F2), SPH_C64(0xABE3E3A8DB484BE3), - SPH_C64(0x715B5BB9B62AE25B), SPH_C64(0x1A8888BC0D923488), - SPH_C64(0x529A9A3E29C8A49A), SPH_C64(0x9826260B4CBE2D26), - SPH_C64(0xC83232BF64FA8D32), SPH_C64(0xFAB0B0597D4AE9B0), - SPH_C64(0x83E9E9F2CF6A1BE9), SPH_C64(0x3C0F0F771E33780F), - SPH_C64(0x73D5D533B7A6E6D5), SPH_C64(0x3A8080F41DBA7480), - SPH_C64(0xC2BEBE27617C99BE), SPH_C64(0x13CDCDEB87DE26CD), - SPH_C64(0xD034348968E4BD34), SPH_C64(0x3D48483290757A48), - SPH_C64(0xDBFFFF54E324ABFF), SPH_C64(0xF57A7A8DF48FF77A), - SPH_C64(0x7A9090643DEAF490), SPH_C64(0x615F5F9DBE3EC25F), - SPH_C64(0x8020203D40A01D20), SPH_C64(0xBD68680FD0D56768), - SPH_C64(0x681A1ACA3472D01A), SPH_C64(0x82AEAEB7412C19AE), - SPH_C64(0xEAB4B47D755EC9B4), SPH_C64(0x4D5454CEA8199A54), - SPH_C64(0x7693937F3BE5EC93), SPH_C64(0x8822222F44AA0D22), - SPH_C64(0x8D646463C8E90764), SPH_C64(0xE3F1F12AFF12DBF1), - SPH_C64(0xD17373CCE6A2BF73), SPH_C64(0x48121282245A9012), - SPH_C64(0x1D40407A805D3A40), SPH_C64(0x2008084810284008), - SPH_C64(0x2BC3C3959BE856C3), SPH_C64(0x97ECECDFC57B33EC), - SPH_C64(0x4BDBDB4DAB9096DB), SPH_C64(0xBEA1A1C05F1F61A1), - SPH_C64(0x0E8D8D9107831C8D), SPH_C64(0xF43D3DC87AC9F53D), - SPH_C64(0x6697975B33F1CC97), SPH_C64(0x0000000000000000), - SPH_C64(0x1BCFCFF983D436CF), SPH_C64(0xAC2B2B6E5687452B), - SPH_C64(0xC57676E1ECB39776), SPH_C64(0x328282E619B06482), - SPH_C64(0x7FD6D628B1A9FED6), 
SPH_C64(0x6C1B1BC33677D81B), - SPH_C64(0xEEB5B574775BC1B5), SPH_C64(0x86AFAFBE432911AF), - SPH_C64(0xB56A6A1DD4DF776A), SPH_C64(0x5D5050EAA00DBA50), - SPH_C64(0x094545578A4C1245), SPH_C64(0xEBF3F338FB18CBF3), - SPH_C64(0xC03030AD60F09D30), SPH_C64(0x9BEFEFC4C3742BEF), - SPH_C64(0xFC3F3FDA7EC3E53F), SPH_C64(0x495555C7AA1C9255), - SPH_C64(0xB2A2A2DB591079A2), SPH_C64(0x8FEAEAE9C96503EA), - SPH_C64(0x8965656ACAEC0F65), SPH_C64(0xD2BABA036968B9BA), - SPH_C64(0xBC2F2F4A5E93652F), SPH_C64(0x27C0C08E9DE74EC0), - SPH_C64(0x5FDEDE60A181BEDE), SPH_C64(0x701C1CFC386CE01C), - SPH_C64(0xD3FDFD46E72EBBFD), SPH_C64(0x294D4D1F9A64524D), - SPH_C64(0x7292927639E0E492), SPH_C64(0xC97575FAEABC8F75), - SPH_C64(0x180606360C1E3006), SPH_C64(0x128A8AAE0998248A), - SPH_C64(0xF2B2B24B7940F9B2), SPH_C64(0xBFE6E685D15963E6), - SPH_C64(0x380E0E7E1C36700E), SPH_C64(0x7C1F1FE73E63F81F), - SPH_C64(0x95626255C4F73762), SPH_C64(0x77D4D43AB5A3EED4), - SPH_C64(0x9AA8A8814D3229A8), SPH_C64(0x6296965231F4C496), - SPH_C64(0xC3F9F962EF3A9BF9), SPH_C64(0x33C5C5A397F666C5), - SPH_C64(0x942525104AB13525), SPH_C64(0x795959ABB220F259), - SPH_C64(0x2A8484D015AE5484), SPH_C64(0xD57272C5E4A7B772), - SPH_C64(0xE43939EC72DDD539), SPH_C64(0x2D4C4C1698615A4C), - SPH_C64(0x655E5E94BC3BCA5E), SPH_C64(0xFD78789FF085E778), - SPH_C64(0xE03838E570D8DD38), SPH_C64(0x0A8C8C980586148C), - SPH_C64(0x63D1D117BFB2C6D1), SPH_C64(0xAEA5A5E4570B41A5), - SPH_C64(0xAFE2E2A1D94D43E2), SPH_C64(0x9961614EC2F82F61), - SPH_C64(0xF6B3B3427B45F1B3), SPH_C64(0x8421213442A51521), - SPH_C64(0x4A9C9C0825D6949C), SPH_C64(0x781E1EEE3C66F01E), - SPH_C64(0x1143436186522243), SPH_C64(0x3BC7C7B193FC76C7), - SPH_C64(0xD7FCFC4FE52BB3FC), SPH_C64(0x1004042408142004), - SPH_C64(0x595151E3A208B251), SPH_C64(0x5E9999252FC7BC99), - SPH_C64(0xA96D6D22DAC44F6D), SPH_C64(0x340D0D651A39680D), - SPH_C64(0xCFFAFA79E93583FA), SPH_C64(0x5BDFDF69A384B6DF), - SPH_C64(0xE57E7EA9FC9BD77E), SPH_C64(0x9024241948B43D24), - SPH_C64(0xEC3B3BFE76D7C53B), 
SPH_C64(0x96ABAB9A4B3D31AB), - SPH_C64(0x1FCECEF081D13ECE), SPH_C64(0x4411119922558811), - SPH_C64(0x068F8F8303890C8F), SPH_C64(0x254E4E049C6B4A4E), - SPH_C64(0xE6B7B7667351D1B7), SPH_C64(0x8BEBEBE0CB600BEB), - SPH_C64(0xF03C3CC178CCFD3C), SPH_C64(0x3E8181FD1FBF7C81), - SPH_C64(0x6A94944035FED494), SPH_C64(0xFBF7F71CF30CEBF7), - SPH_C64(0xDEB9B9186F67A1B9), SPH_C64(0x4C13138B265F9813), - SPH_C64(0xB02C2C51589C7D2C), SPH_C64(0x6BD3D305BBB8D6D3), - SPH_C64(0xBBE7E78CD35C6BE7), SPH_C64(0xA56E6E39DCCB576E), - SPH_C64(0x37C4C4AA95F36EC4), SPH_C64(0x0C03031B060F1803), - SPH_C64(0x455656DCAC138A56), SPH_C64(0x0D44445E88491A44), - SPH_C64(0xE17F7FA0FE9EDF7F), SPH_C64(0x9EA9A9884F3721A9), - SPH_C64(0xA82A2A6754824D2A), SPH_C64(0xD6BBBB0A6B6DB1BB), - SPH_C64(0x23C1C1879FE246C1), SPH_C64(0x515353F1A602A253), - SPH_C64(0x57DCDC72A58BAEDC), SPH_C64(0x2C0B0B531627580B), - SPH_C64(0x4E9D9D0127D39C9D), SPH_C64(0xAD6C6C2BD8C1476C), - SPH_C64(0xC43131A462F59531), SPH_C64(0xCD7474F3E8B98774), - SPH_C64(0xFFF6F615F109E3F6), SPH_C64(0x0546464C8C430A46), - SPH_C64(0x8AACACA5452609AC), SPH_C64(0x1E8989B50F973C89), - SPH_C64(0x501414B42844A014), SPH_C64(0xA3E1E1BADF425BE1), - SPH_C64(0x581616A62C4EB016), SPH_C64(0xE83A3AF774D2CD3A), - SPH_C64(0xB9696906D2D06F69), SPH_C64(0x24090941122D4809), - SPH_C64(0xDD7070D7E0ADA770), SPH_C64(0xE2B6B66F7154D9B6), - SPH_C64(0x67D0D01EBDB7CED0), SPH_C64(0x93EDEDD6C77E3BED), - SPH_C64(0x17CCCCE285DB2ECC), SPH_C64(0x1542426884572A42), - SPH_C64(0x5A98982C2DC2B498), SPH_C64(0xAAA4A4ED550E49A4), - SPH_C64(0xA028287550885D28), SPH_C64(0x6D5C5C86B831DA5C), - SPH_C64(0xC7F8F86BED3F93F8), SPH_C64(0x228686C211A44486) -}; - -static const long long int plain_T6[256] = { - SPH_C64(0x1818D83078C01860), SPH_C64(0x23232646AF05238C), - SPH_C64(0xC6C6B891F97EC63F), SPH_C64(0xE8E8FBCD6F13E887), - SPH_C64(0x8787CB13A14C8726), SPH_C64(0xB8B8116D62A9B8DA), - SPH_C64(0x0101090205080104), SPH_C64(0x4F4F0D9E6E424F21), - SPH_C64(0x36369B6CEEAD36D8), SPH_C64(0xA6A6FF510459A6A2), 
- SPH_C64(0xD2D20CB9BDDED26F), SPH_C64(0xF5F50EF706FBF5F3), - SPH_C64(0x797996F280EF79F9), SPH_C64(0x6F6F30DECE5F6FA1), - SPH_C64(0x91916D3FEFFC917E), SPH_C64(0x5252F8A407AA5255), - SPH_C64(0x606047C0FD27609D), SPH_C64(0xBCBC35657689BCCA), - SPH_C64(0x9B9B372BCDAC9B56), SPH_C64(0x8E8E8A018C048E02), - SPH_C64(0xA3A3D25B1571A3B6), SPH_C64(0x0C0C6C183C600C30), - SPH_C64(0x7B7B84F68AFF7BF1), SPH_C64(0x3535806AE1B535D4), - SPH_C64(0x1D1DF53A69E81D74), SPH_C64(0xE0E0B3DD4753E0A7), - SPH_C64(0xD7D721B3ACF6D77B), SPH_C64(0xC2C29C99ED5EC22F), - SPH_C64(0x2E2E435C966D2EB8), SPH_C64(0x4B4B29967A624B31), - SPH_C64(0xFEFE5DE121A3FEDF), SPH_C64(0x5757D5AE16825741), - SPH_C64(0x1515BD2A41A81554), SPH_C64(0x7777E8EEB69F77C1), - SPH_C64(0x3737926EEBA537DC), SPH_C64(0xE5E59ED7567BE5B3), - SPH_C64(0x9F9F1323D98C9F46), SPH_C64(0xF0F023FD17D3F0E7), - SPH_C64(0x4A4A20947F6A4A35), SPH_C64(0xDADA44A9959EDA4F), - SPH_C64(0x5858A2B025FA587D), SPH_C64(0xC9C9CF8FCA06C903), - SPH_C64(0x29297C528D5529A4), SPH_C64(0x0A0A5A1422500A28), - SPH_C64(0xB1B1507F4FE1B1FE), SPH_C64(0xA0A0C95D1A69A0BA), - SPH_C64(0x6B6B14D6DA7F6BB1), SPH_C64(0x8585D917AB5C852E), - SPH_C64(0xBDBD3C677381BDCE), SPH_C64(0x5D5D8FBA34D25D69), - SPH_C64(0x1010902050801040), SPH_C64(0xF4F407F503F3F4F7), - SPH_C64(0xCBCBDD8BC016CB0B), SPH_C64(0x3E3ED37CC6ED3EF8), - SPH_C64(0x05052D0A11280514), SPH_C64(0x676778CEE61F6781), - SPH_C64(0xE4E497D55373E4B7), SPH_C64(0x2727024EBB25279C), - SPH_C64(0x4141738258324119), SPH_C64(0x8B8BA70B9D2C8B16), - SPH_C64(0xA7A7F6530151A7A6), SPH_C64(0x7D7DB2FA94CF7DE9), - SPH_C64(0x95954937FBDC956E), SPH_C64(0xD8D856AD9F8ED847), - SPH_C64(0xFBFB70EB308BFBCB), SPH_C64(0xEEEECDC17123EE9F), - SPH_C64(0x7C7CBBF891C77CED), SPH_C64(0x666671CCE3176685), - SPH_C64(0xDDDD7BA78EA6DD53), SPH_C64(0x1717AF2E4BB8175C), - SPH_C64(0x4747458E46024701), SPH_C64(0x9E9E1A21DC849E42), - SPH_C64(0xCACAD489C51ECA0F), SPH_C64(0x2D2D585A99752DB4), - SPH_C64(0xBFBF2E637991BFC6), SPH_C64(0x07073F0E1B38071C), - 
SPH_C64(0xADADAC472301AD8E), SPH_C64(0x5A5AB0B42FEA5A75), - SPH_C64(0x8383EF1BB56C8336), SPH_C64(0x3333B666FF8533CC), - SPH_C64(0x63635CC6F23F6391), SPH_C64(0x020212040A100208), - SPH_C64(0xAAAA93493839AA92), SPH_C64(0x7171DEE2A8AF71D9), - SPH_C64(0xC8C8C68DCF0EC807), SPH_C64(0x1919D1327DC81964), - SPH_C64(0x49493B9270724939), SPH_C64(0xD9D95FAF9A86D943), - SPH_C64(0xF2F231F91DC3F2EF), SPH_C64(0xE3E3A8DB484BE3AB), - SPH_C64(0x5B5BB9B62AE25B71), SPH_C64(0x8888BC0D9234881A), - SPH_C64(0x9A9A3E29C8A49A52), SPH_C64(0x26260B4CBE2D2698), - SPH_C64(0x3232BF64FA8D32C8), SPH_C64(0xB0B0597D4AE9B0FA), - SPH_C64(0xE9E9F2CF6A1BE983), SPH_C64(0x0F0F771E33780F3C), - SPH_C64(0xD5D533B7A6E6D573), SPH_C64(0x8080F41DBA74803A), - SPH_C64(0xBEBE27617C99BEC2), SPH_C64(0xCDCDEB87DE26CD13), - SPH_C64(0x34348968E4BD34D0), SPH_C64(0x48483290757A483D), - SPH_C64(0xFFFF54E324ABFFDB), SPH_C64(0x7A7A8DF48FF77AF5), - SPH_C64(0x9090643DEAF4907A), SPH_C64(0x5F5F9DBE3EC25F61), - SPH_C64(0x20203D40A01D2080), SPH_C64(0x68680FD0D56768BD), - SPH_C64(0x1A1ACA3472D01A68), SPH_C64(0xAEAEB7412C19AE82), - SPH_C64(0xB4B47D755EC9B4EA), SPH_C64(0x5454CEA8199A544D), - SPH_C64(0x93937F3BE5EC9376), SPH_C64(0x22222F44AA0D2288), - SPH_C64(0x646463C8E907648D), SPH_C64(0xF1F12AFF12DBF1E3), - SPH_C64(0x7373CCE6A2BF73D1), SPH_C64(0x121282245A901248), - SPH_C64(0x40407A805D3A401D), SPH_C64(0x0808481028400820), - SPH_C64(0xC3C3959BE856C32B), SPH_C64(0xECECDFC57B33EC97), - SPH_C64(0xDBDB4DAB9096DB4B), SPH_C64(0xA1A1C05F1F61A1BE), - SPH_C64(0x8D8D9107831C8D0E), SPH_C64(0x3D3DC87AC9F53DF4), - SPH_C64(0x97975B33F1CC9766), SPH_C64(0x0000000000000000), - SPH_C64(0xCFCFF983D436CF1B), SPH_C64(0x2B2B6E5687452BAC), - SPH_C64(0x7676E1ECB39776C5), SPH_C64(0x8282E619B0648232), - SPH_C64(0xD6D628B1A9FED67F), SPH_C64(0x1B1BC33677D81B6C), - SPH_C64(0xB5B574775BC1B5EE), SPH_C64(0xAFAFBE432911AF86), - SPH_C64(0x6A6A1DD4DF776AB5), SPH_C64(0x5050EAA00DBA505D), - SPH_C64(0x4545578A4C124509), SPH_C64(0xF3F338FB18CBF3EB), - 
SPH_C64(0x3030AD60F09D30C0), SPH_C64(0xEFEFC4C3742BEF9B), - SPH_C64(0x3F3FDA7EC3E53FFC), SPH_C64(0x5555C7AA1C925549), - SPH_C64(0xA2A2DB591079A2B2), SPH_C64(0xEAEAE9C96503EA8F), - SPH_C64(0x65656ACAEC0F6589), SPH_C64(0xBABA036968B9BAD2), - SPH_C64(0x2F2F4A5E93652FBC), SPH_C64(0xC0C08E9DE74EC027), - SPH_C64(0xDEDE60A181BEDE5F), SPH_C64(0x1C1CFC386CE01C70), - SPH_C64(0xFDFD46E72EBBFDD3), SPH_C64(0x4D4D1F9A64524D29), - SPH_C64(0x92927639E0E49272), SPH_C64(0x7575FAEABC8F75C9), - SPH_C64(0x0606360C1E300618), SPH_C64(0x8A8AAE0998248A12), - SPH_C64(0xB2B24B7940F9B2F2), SPH_C64(0xE6E685D15963E6BF), - SPH_C64(0x0E0E7E1C36700E38), SPH_C64(0x1F1FE73E63F81F7C), - SPH_C64(0x626255C4F7376295), SPH_C64(0xD4D43AB5A3EED477), - SPH_C64(0xA8A8814D3229A89A), SPH_C64(0x96965231F4C49662), - SPH_C64(0xF9F962EF3A9BF9C3), SPH_C64(0xC5C5A397F666C533), - SPH_C64(0x2525104AB1352594), SPH_C64(0x5959ABB220F25979), - SPH_C64(0x8484D015AE54842A), SPH_C64(0x7272C5E4A7B772D5), - SPH_C64(0x3939EC72DDD539E4), SPH_C64(0x4C4C1698615A4C2D), - SPH_C64(0x5E5E94BC3BCA5E65), SPH_C64(0x78789FF085E778FD), - SPH_C64(0x3838E570D8DD38E0), SPH_C64(0x8C8C980586148C0A), - SPH_C64(0xD1D117BFB2C6D163), SPH_C64(0xA5A5E4570B41A5AE), - SPH_C64(0xE2E2A1D94D43E2AF), SPH_C64(0x61614EC2F82F6199), - SPH_C64(0xB3B3427B45F1B3F6), SPH_C64(0x21213442A5152184), - SPH_C64(0x9C9C0825D6949C4A), SPH_C64(0x1E1EEE3C66F01E78), - SPH_C64(0x4343618652224311), SPH_C64(0xC7C7B193FC76C73B), - SPH_C64(0xFCFC4FE52BB3FCD7), SPH_C64(0x0404240814200410), - SPH_C64(0x5151E3A208B25159), SPH_C64(0x9999252FC7BC995E), - SPH_C64(0x6D6D22DAC44F6DA9), SPH_C64(0x0D0D651A39680D34), - SPH_C64(0xFAFA79E93583FACF), SPH_C64(0xDFDF69A384B6DF5B), - SPH_C64(0x7E7EA9FC9BD77EE5), SPH_C64(0x24241948B43D2490), - SPH_C64(0x3B3BFE76D7C53BEC), SPH_C64(0xABAB9A4B3D31AB96), - SPH_C64(0xCECEF081D13ECE1F), SPH_C64(0x1111992255881144), - SPH_C64(0x8F8F8303890C8F06), SPH_C64(0x4E4E049C6B4A4E25), - SPH_C64(0xB7B7667351D1B7E6), SPH_C64(0xEBEBE0CB600BEB8B), - 
SPH_C64(0x3C3CC178CCFD3CF0), SPH_C64(0x8181FD1FBF7C813E), - SPH_C64(0x94944035FED4946A), SPH_C64(0xF7F71CF30CEBF7FB), - SPH_C64(0xB9B9186F67A1B9DE), SPH_C64(0x13138B265F98134C), - SPH_C64(0x2C2C51589C7D2CB0), SPH_C64(0xD3D305BBB8D6D36B), - SPH_C64(0xE7E78CD35C6BE7BB), SPH_C64(0x6E6E39DCCB576EA5), - SPH_C64(0xC4C4AA95F36EC437), SPH_C64(0x03031B060F18030C), - SPH_C64(0x5656DCAC138A5645), SPH_C64(0x44445E88491A440D), - SPH_C64(0x7F7FA0FE9EDF7FE1), SPH_C64(0xA9A9884F3721A99E), - SPH_C64(0x2A2A6754824D2AA8), SPH_C64(0xBBBB0A6B6DB1BBD6), - SPH_C64(0xC1C1879FE246C123), SPH_C64(0x5353F1A602A25351), - SPH_C64(0xDCDC72A58BAEDC57), SPH_C64(0x0B0B531627580B2C), - SPH_C64(0x9D9D0127D39C9D4E), SPH_C64(0x6C6C2BD8C1476CAD), - SPH_C64(0x3131A462F59531C4), SPH_C64(0x7474F3E8B98774CD), - SPH_C64(0xF6F615F109E3F6FF), SPH_C64(0x46464C8C430A4605), - SPH_C64(0xACACA5452609AC8A), SPH_C64(0x8989B50F973C891E), - SPH_C64(0x1414B42844A01450), SPH_C64(0xE1E1BADF425BE1A3), - SPH_C64(0x1616A62C4EB01658), SPH_C64(0x3A3AF774D2CD3AE8), - SPH_C64(0x696906D2D06F69B9), SPH_C64(0x090941122D480924), - SPH_C64(0x7070D7E0ADA770DD), SPH_C64(0xB6B66F7154D9B6E2), - SPH_C64(0xD0D01EBDB7CED067), SPH_C64(0xEDEDD6C77E3BED93), - SPH_C64(0xCCCCE285DB2ECC17), SPH_C64(0x42426884572A4215), - SPH_C64(0x98982C2DC2B4985A), SPH_C64(0xA4A4ED550E49A4AA), - SPH_C64(0x28287550885D28A0), SPH_C64(0x5C5C86B831DA5C6D), - SPH_C64(0xF8F86BED3F93F8C7), SPH_C64(0x8686C211A4448622) -}; - -static const long long int plain_T7[256] = { - SPH_C64(0x18D83078C0186018), SPH_C64(0x232646AF05238C23), - SPH_C64(0xC6B891F97EC63FC6), SPH_C64(0xE8FBCD6F13E887E8), - SPH_C64(0x87CB13A14C872687), SPH_C64(0xB8116D62A9B8DAB8), - SPH_C64(0x0109020508010401), SPH_C64(0x4F0D9E6E424F214F), - SPH_C64(0x369B6CEEAD36D836), SPH_C64(0xA6FF510459A6A2A6), - SPH_C64(0xD20CB9BDDED26FD2), SPH_C64(0xF50EF706FBF5F3F5), - SPH_C64(0x7996F280EF79F979), SPH_C64(0x6F30DECE5F6FA16F), - SPH_C64(0x916D3FEFFC917E91), SPH_C64(0x52F8A407AA525552), - SPH_C64(0x6047C0FD27609D60), 
SPH_C64(0xBC35657689BCCABC), - SPH_C64(0x9B372BCDAC9B569B), SPH_C64(0x8E8A018C048E028E), - SPH_C64(0xA3D25B1571A3B6A3), SPH_C64(0x0C6C183C600C300C), - SPH_C64(0x7B84F68AFF7BF17B), SPH_C64(0x35806AE1B535D435), - SPH_C64(0x1DF53A69E81D741D), SPH_C64(0xE0B3DD4753E0A7E0), - SPH_C64(0xD721B3ACF6D77BD7), SPH_C64(0xC29C99ED5EC22FC2), - SPH_C64(0x2E435C966D2EB82E), SPH_C64(0x4B29967A624B314B), - SPH_C64(0xFE5DE121A3FEDFFE), SPH_C64(0x57D5AE1682574157), - SPH_C64(0x15BD2A41A8155415), SPH_C64(0x77E8EEB69F77C177), - SPH_C64(0x37926EEBA537DC37), SPH_C64(0xE59ED7567BE5B3E5), - SPH_C64(0x9F1323D98C9F469F), SPH_C64(0xF023FD17D3F0E7F0), - SPH_C64(0x4A20947F6A4A354A), SPH_C64(0xDA44A9959EDA4FDA), - SPH_C64(0x58A2B025FA587D58), SPH_C64(0xC9CF8FCA06C903C9), - SPH_C64(0x297C528D5529A429), SPH_C64(0x0A5A1422500A280A), - SPH_C64(0xB1507F4FE1B1FEB1), SPH_C64(0xA0C95D1A69A0BAA0), - SPH_C64(0x6B14D6DA7F6BB16B), SPH_C64(0x85D917AB5C852E85), - SPH_C64(0xBD3C677381BDCEBD), SPH_C64(0x5D8FBA34D25D695D), - SPH_C64(0x1090205080104010), SPH_C64(0xF407F503F3F4F7F4), - SPH_C64(0xCBDD8BC016CB0BCB), SPH_C64(0x3ED37CC6ED3EF83E), - SPH_C64(0x052D0A1128051405), SPH_C64(0x6778CEE61F678167), - SPH_C64(0xE497D55373E4B7E4), SPH_C64(0x27024EBB25279C27), - SPH_C64(0x4173825832411941), SPH_C64(0x8BA70B9D2C8B168B), - SPH_C64(0xA7F6530151A7A6A7), SPH_C64(0x7DB2FA94CF7DE97D), - SPH_C64(0x954937FBDC956E95), SPH_C64(0xD856AD9F8ED847D8), - SPH_C64(0xFB70EB308BFBCBFB), SPH_C64(0xEECDC17123EE9FEE), - SPH_C64(0x7CBBF891C77CED7C), SPH_C64(0x6671CCE317668566), - SPH_C64(0xDD7BA78EA6DD53DD), SPH_C64(0x17AF2E4BB8175C17), - SPH_C64(0x47458E4602470147), SPH_C64(0x9E1A21DC849E429E), - SPH_C64(0xCAD489C51ECA0FCA), SPH_C64(0x2D585A99752DB42D), - SPH_C64(0xBF2E637991BFC6BF), SPH_C64(0x073F0E1B38071C07), - SPH_C64(0xADAC472301AD8EAD), SPH_C64(0x5AB0B42FEA5A755A), - SPH_C64(0x83EF1BB56C833683), SPH_C64(0x33B666FF8533CC33), - SPH_C64(0x635CC6F23F639163), SPH_C64(0x0212040A10020802), - SPH_C64(0xAA93493839AA92AA), 
SPH_C64(0x71DEE2A8AF71D971), - SPH_C64(0xC8C68DCF0EC807C8), SPH_C64(0x19D1327DC8196419), - SPH_C64(0x493B927072493949), SPH_C64(0xD95FAF9A86D943D9), - SPH_C64(0xF231F91DC3F2EFF2), SPH_C64(0xE3A8DB484BE3ABE3), - SPH_C64(0x5BB9B62AE25B715B), SPH_C64(0x88BC0D9234881A88), - SPH_C64(0x9A3E29C8A49A529A), SPH_C64(0x260B4CBE2D269826), - SPH_C64(0x32BF64FA8D32C832), SPH_C64(0xB0597D4AE9B0FAB0), - SPH_C64(0xE9F2CF6A1BE983E9), SPH_C64(0x0F771E33780F3C0F), - SPH_C64(0xD533B7A6E6D573D5), SPH_C64(0x80F41DBA74803A80), - SPH_C64(0xBE27617C99BEC2BE), SPH_C64(0xCDEB87DE26CD13CD), - SPH_C64(0x348968E4BD34D034), SPH_C64(0x483290757A483D48), - SPH_C64(0xFF54E324ABFFDBFF), SPH_C64(0x7A8DF48FF77AF57A), - SPH_C64(0x90643DEAF4907A90), SPH_C64(0x5F9DBE3EC25F615F), - SPH_C64(0x203D40A01D208020), SPH_C64(0x680FD0D56768BD68), - SPH_C64(0x1ACA3472D01A681A), SPH_C64(0xAEB7412C19AE82AE), - SPH_C64(0xB47D755EC9B4EAB4), SPH_C64(0x54CEA8199A544D54), - SPH_C64(0x937F3BE5EC937693), SPH_C64(0x222F44AA0D228822), - SPH_C64(0x6463C8E907648D64), SPH_C64(0xF12AFF12DBF1E3F1), - SPH_C64(0x73CCE6A2BF73D173), SPH_C64(0x1282245A90124812), - SPH_C64(0x407A805D3A401D40), SPH_C64(0x0848102840082008), - SPH_C64(0xC3959BE856C32BC3), SPH_C64(0xECDFC57B33EC97EC), - SPH_C64(0xDB4DAB9096DB4BDB), SPH_C64(0xA1C05F1F61A1BEA1), - SPH_C64(0x8D9107831C8D0E8D), SPH_C64(0x3DC87AC9F53DF43D), - SPH_C64(0x975B33F1CC976697), SPH_C64(0x0000000000000000), - SPH_C64(0xCFF983D436CF1BCF), SPH_C64(0x2B6E5687452BAC2B), - SPH_C64(0x76E1ECB39776C576), SPH_C64(0x82E619B064823282), - SPH_C64(0xD628B1A9FED67FD6), SPH_C64(0x1BC33677D81B6C1B), - SPH_C64(0xB574775BC1B5EEB5), SPH_C64(0xAFBE432911AF86AF), - SPH_C64(0x6A1DD4DF776AB56A), SPH_C64(0x50EAA00DBA505D50), - SPH_C64(0x45578A4C12450945), SPH_C64(0xF338FB18CBF3EBF3), - SPH_C64(0x30AD60F09D30C030), SPH_C64(0xEFC4C3742BEF9BEF), - SPH_C64(0x3FDA7EC3E53FFC3F), SPH_C64(0x55C7AA1C92554955), - SPH_C64(0xA2DB591079A2B2A2), SPH_C64(0xEAE9C96503EA8FEA), - SPH_C64(0x656ACAEC0F658965), 
SPH_C64(0xBA036968B9BAD2BA), - SPH_C64(0x2F4A5E93652FBC2F), SPH_C64(0xC08E9DE74EC027C0), - SPH_C64(0xDE60A181BEDE5FDE), SPH_C64(0x1CFC386CE01C701C), - SPH_C64(0xFD46E72EBBFDD3FD), SPH_C64(0x4D1F9A64524D294D), - SPH_C64(0x927639E0E4927292), SPH_C64(0x75FAEABC8F75C975), - SPH_C64(0x06360C1E30061806), SPH_C64(0x8AAE0998248A128A), - SPH_C64(0xB24B7940F9B2F2B2), SPH_C64(0xE685D15963E6BFE6), - SPH_C64(0x0E7E1C36700E380E), SPH_C64(0x1FE73E63F81F7C1F), - SPH_C64(0x6255C4F737629562), SPH_C64(0xD43AB5A3EED477D4), - SPH_C64(0xA8814D3229A89AA8), SPH_C64(0x965231F4C4966296), - SPH_C64(0xF962EF3A9BF9C3F9), SPH_C64(0xC5A397F666C533C5), - SPH_C64(0x25104AB135259425), SPH_C64(0x59ABB220F2597959), - SPH_C64(0x84D015AE54842A84), SPH_C64(0x72C5E4A7B772D572), - SPH_C64(0x39EC72DDD539E439), SPH_C64(0x4C1698615A4C2D4C), - SPH_C64(0x5E94BC3BCA5E655E), SPH_C64(0x789FF085E778FD78), - SPH_C64(0x38E570D8DD38E038), SPH_C64(0x8C980586148C0A8C), - SPH_C64(0xD117BFB2C6D163D1), SPH_C64(0xA5E4570B41A5AEA5), - SPH_C64(0xE2A1D94D43E2AFE2), SPH_C64(0x614EC2F82F619961), - SPH_C64(0xB3427B45F1B3F6B3), SPH_C64(0x213442A515218421), - SPH_C64(0x9C0825D6949C4A9C), SPH_C64(0x1EEE3C66F01E781E), - SPH_C64(0x4361865222431143), SPH_C64(0xC7B193FC76C73BC7), - SPH_C64(0xFC4FE52BB3FCD7FC), SPH_C64(0x0424081420041004), - SPH_C64(0x51E3A208B2515951), SPH_C64(0x99252FC7BC995E99), - SPH_C64(0x6D22DAC44F6DA96D), SPH_C64(0x0D651A39680D340D), - SPH_C64(0xFA79E93583FACFFA), SPH_C64(0xDF69A384B6DF5BDF), - SPH_C64(0x7EA9FC9BD77EE57E), SPH_C64(0x241948B43D249024), - SPH_C64(0x3BFE76D7C53BEC3B), SPH_C64(0xAB9A4B3D31AB96AB), - SPH_C64(0xCEF081D13ECE1FCE), SPH_C64(0x1199225588114411), - SPH_C64(0x8F8303890C8F068F), SPH_C64(0x4E049C6B4A4E254E), - SPH_C64(0xB7667351D1B7E6B7), SPH_C64(0xEBE0CB600BEB8BEB), - SPH_C64(0x3CC178CCFD3CF03C), SPH_C64(0x81FD1FBF7C813E81), - SPH_C64(0x944035FED4946A94), SPH_C64(0xF71CF30CEBF7FBF7), - SPH_C64(0xB9186F67A1B9DEB9), SPH_C64(0x138B265F98134C13), - SPH_C64(0x2C51589C7D2CB02C), 
SPH_C64(0xD305BBB8D6D36BD3), - SPH_C64(0xE78CD35C6BE7BBE7), SPH_C64(0x6E39DCCB576EA56E), - SPH_C64(0xC4AA95F36EC437C4), SPH_C64(0x031B060F18030C03), - SPH_C64(0x56DCAC138A564556), SPH_C64(0x445E88491A440D44), - SPH_C64(0x7FA0FE9EDF7FE17F), SPH_C64(0xA9884F3721A99EA9), - SPH_C64(0x2A6754824D2AA82A), SPH_C64(0xBB0A6B6DB1BBD6BB), - SPH_C64(0xC1879FE246C123C1), SPH_C64(0x53F1A602A2535153), - SPH_C64(0xDC72A58BAEDC57DC), SPH_C64(0x0B531627580B2C0B), - SPH_C64(0x9D0127D39C9D4E9D), SPH_C64(0x6C2BD8C1476CAD6C), - SPH_C64(0x31A462F59531C431), SPH_C64(0x74F3E8B98774CD74), - SPH_C64(0xF615F109E3F6FFF6), SPH_C64(0x464C8C430A460546), - SPH_C64(0xACA5452609AC8AAC), SPH_C64(0x89B50F973C891E89), - SPH_C64(0x14B42844A0145014), SPH_C64(0xE1BADF425BE1A3E1), - SPH_C64(0x16A62C4EB0165816), SPH_C64(0x3AF774D2CD3AE83A), - SPH_C64(0x6906D2D06F69B969), SPH_C64(0x0941122D48092409), - SPH_C64(0x70D7E0ADA770DD70), SPH_C64(0xB66F7154D9B6E2B6), - SPH_C64(0xD01EBDB7CED067D0), SPH_C64(0xEDD6C77E3BED93ED), - SPH_C64(0xCCE285DB2ECC17CC), SPH_C64(0x426884572A421542), - SPH_C64(0x982C2DC2B4985A98), SPH_C64(0xA4ED550E49A4AAA4), - SPH_C64(0x287550885D28A028), SPH_C64(0x5C86B831DA5C6D5C), - SPH_C64(0xF86BED3F93F8C7F8), SPH_C64(0x86C211A444862286) -}; - -#endif - -/* - * Round constants. - */ -static const long long int plain_RC[10] = { - SPH_C64(0x4F01B887E8C62318), - SPH_C64(0x52916F79F5D2A636), - SPH_C64(0x357B0CA38E9BBC60), - SPH_C64(0x57FE4B2EC2D7E01D), - SPH_C64(0xDA4AF09FE5377715), - SPH_C64(0x856BA0B10A29C958), - SPH_C64(0x67053ECBF4105DBD), - SPH_C64(0xD8957DA78B4127E4), - SPH_C64(0x9E4717DD667CEEFB), - SPH_C64(0x33835AAD07BF2DCA) -}; - -/* ====================================================================== */ -/* - * Constants for plain WHIRLPOOL-0 (first version). 
- */ - -static const long long int old0_T0[256] = { - SPH_C64(0xD50F67D568B86868), SPH_C64(0xB71ECEB7D06DD0D0), - SPH_C64(0x60E00B60EB20EBEB), SPH_C64(0x876E45872B7D2B2B), - SPH_C64(0x75327A7548D84848), SPH_C64(0xD3019CD39DBA9D9D), - SPH_C64(0xDF1D77DF6ABE6A6A), SPH_C64(0x53977353E431E4E4), - SPH_C64(0x48A84B48E338E3E3), SPH_C64(0x15D27115A3F8A3A3), - SPH_C64(0x13DC8A1356FA5656), SPH_C64(0xBFFD7CBF819E8181), - SPH_C64(0x94B2CF947D877D7D), SPH_C64(0x122ADB12F10EF1F1), - SPH_C64(0xABD95CAB85928585), SPH_C64(0xDC1A84DC9EBF9E9E), - SPH_C64(0x9C517D9C2C742C2C), SPH_C64(0x8C8A048C8E8F8E8E), - SPH_C64(0x859FE78578887878), SPH_C64(0xC5D41EC5CA43CACA), - SPH_C64(0x4BAFB84B17391717), SPH_C64(0x37882137A9E6A9A9), - SPH_C64(0xF84E2FF861A36161), SPH_C64(0xA633E6A6D562D5D5), - SPH_C64(0x348FD2345DE75D5D), SPH_C64(0x275358270B1D0B0B), - SPH_C64(0x869814868C898C8C), SPH_C64(0xCCC1FDCC3C443C3C), - SPH_C64(0xB6E89FB677997777), SPH_C64(0x08E3B20851F35151), - SPH_C64(0xAA2F0DAA22662222), SPH_C64(0x57682A5742C64242), - SPH_C64(0xC3DAE5C33F413F3F), SPH_C64(0x19CE9A1954FC5454), - SPH_C64(0x5873325841C34141), SPH_C64(0xBAF474BA809D8080), - SPH_C64(0xDBE22EDBCC49CCCC), SPH_C64(0xA4C244A486978686), - SPH_C64(0x4542F145B3C8B3B3), SPH_C64(0x78D8C07818281818), - SPH_C64(0x96436D962E722E2E), SPH_C64(0x16D5821657F95757), - SPH_C64(0x1E36301E060A0606), SPH_C64(0xF75537F762A66262), - SPH_C64(0x0307F303F401F4F4), SPH_C64(0xEE9BADEE365A3636), - SPH_C64(0xB217C6B2D16ED1D1), SPH_C64(0xDA147FDA6BBD6B6B), - SPH_C64(0x77C3D8771B2D1B1B), SPH_C64(0xEC6A0FEC65AF6565), - SPH_C64(0xBCFA8FBC759F7575), SPH_C64(0x5090805010301010), - SPH_C64(0x95449E95DA73DADA), SPH_C64(0x703B727049DB4949), - SPH_C64(0xBE0B2DBE266A2626), SPH_C64(0x3A629B3AF916F9F9), - SPH_C64(0xC0DD16C0CB40CBCB), SPH_C64(0xE37117E366AA6666), - SPH_C64(0x5C8C6B5CE734E7E7), SPH_C64(0x6803B968BAD3BABA), - SPH_C64(0x2CB7192CAEEFAEAE), SPH_C64(0x0DEABA0D50F05050), - SPH_C64(0x07F8AA0752F65252), SPH_C64(0x3D9A313DABE0ABAB), - 
SPH_C64(0x112D2811050F0505), SPH_C64(0x1723D317F00DF0F0), - SPH_C64(0x396568390D170D0D), SPH_C64(0xA2CCBFA273957373), - SPH_C64(0xD7FEC5D73B4D3B3B), SPH_C64(0x14242014040C0404), - SPH_C64(0xA03D1DA020602020), SPH_C64(0x215DA321FE1FFEFE), - SPH_C64(0x8E7BA68EDD7ADDDD), SPH_C64(0x060EFB06F502F5F5), - SPH_C64(0x5E7DC95EB4C1B4B4), SPH_C64(0x3E9DC23E5FE15F5F), - SPH_C64(0x225A50220A1E0A0A), SPH_C64(0x5B74C15BB5C2B5B5), - SPH_C64(0xE78E4EE7C05DC0C0), SPH_C64(0x1AC9691AA0FDA0A0), - SPH_C64(0xA8DEAFA871937171), SPH_C64(0x0BE4410BA5F2A5A5), - SPH_C64(0x995875992D772D2D), SPH_C64(0xFD4727FD60A06060), - SPH_C64(0xA7C5B7A772967272), SPH_C64(0xE57FECE593A89393), - SPH_C64(0xDDECD5DD394B3939), SPH_C64(0x2848402808180808), - SPH_C64(0xB5EF6CB583988383), SPH_C64(0xA53415A521632121), - SPH_C64(0x3186DA315CE45C5C), SPH_C64(0xA1CB4CA187948787), - SPH_C64(0x4F50E14FB1CEB1B1), SPH_C64(0x47B35347E03DE0E0), - SPH_C64(0x0000000000000000), SPH_C64(0xE89556E8C358C3C3), - SPH_C64(0x5A82905A12361212), SPH_C64(0xEF6DFCEF91AE9191), - SPH_C64(0x98AE24988A838A8A), SPH_C64(0x0A12100A02060202), - SPH_C64(0x6CFCE06C1C241C1C), SPH_C64(0x59856359E637E6E6), - SPH_C64(0x4C57124C45CF4545), SPH_C64(0xED9C5EEDC25BC2C2), - SPH_C64(0xF3AA6EF3C451C4C4), SPH_C64(0x2E46BB2EFD1AFDFD), - SPH_C64(0x792E9179BFDCBFBF), SPH_C64(0x495E1A4944CC4444), - SPH_C64(0x1FC0611FA1FEA1A1), SPH_C64(0x61165A614CD44C4C), - SPH_C64(0xFFB685FF33553333), SPH_C64(0xF6A366F6C552C5C5), - SPH_C64(0xAED054AE84918484), SPH_C64(0xAF2605AF23652323), - SPH_C64(0x91BBC7917C847C7C), SPH_C64(0x4A59E94AB0CDB0B0), - SPH_C64(0xB11035B1256F2525), SPH_C64(0x41BDA841153F1515), - SPH_C64(0xE180B5E1355F3535), SPH_C64(0xD0066FD069BB6969), - SPH_C64(0x2454AB24FF1CFFFF), SPH_C64(0xFE40D4FE94A19494), - SPH_C64(0x641F52644DD74D4D), SPH_C64(0xADD7A7AD70907070), - SPH_C64(0x10DB7910A2FBA2A2), SPH_C64(0x29BE1129AFECAFAF), - SPH_C64(0xDEEB26DECD4ACDCD), SPH_C64(0xA928FEA9D667D6D6), - SPH_C64(0xC12B47C16CB46C6C), SPH_C64(0x5166D151B7C4B7B7), - 
SPH_C64(0x3F6B933FF815F8F8), SPH_C64(0x2D41482D091B0909), - SPH_C64(0x1838CB18F308F3F3), SPH_C64(0xE6781FE667A96767), - SPH_C64(0x0EED490EA4F1A4A4), SPH_C64(0x65E90365EA23EAEA), - SPH_C64(0x7BDF337BEC29ECEC), SPH_C64(0x546FD954B6C7B6B6), - SPH_C64(0xA33AEEA3D461D4D4), SPH_C64(0xBD0CDEBDD26BD2D2), - SPH_C64(0x44B4A044143C1414), SPH_C64(0x66EEF0661E221E1E), - SPH_C64(0x42BA5B42E13EE1E1), SPH_C64(0xB4193DB4246C2424), - SPH_C64(0xD8E5DDD838483838), SPH_C64(0xF9B87EF9C657C6C6), - SPH_C64(0x904D9690DB70DBDB), SPH_C64(0x7A29627A4BDD4B4B), - SPH_C64(0x8F8DF78F7A8E7A7A), SPH_C64(0xD2F7CDD23A4E3A3A), - SPH_C64(0x8160BE81DE7FDEDE), SPH_C64(0x3B94CA3B5EE25E5E), - SPH_C64(0x8469B684DF7CDFDF), SPH_C64(0xFB49DCFB95A29595), - SPH_C64(0x2B4FB32BFC19FCFC), SPH_C64(0x38933938AAE3AAAA), - SPH_C64(0xAC21F6ACD764D7D7), SPH_C64(0xD1F03ED1CE4FCECE), - SPH_C64(0x1B3F381B07090707), SPH_C64(0x337778330F110F0F), - SPH_C64(0xC9C8F5C93D473D3D), SPH_C64(0x25A2FA2558E85858), - SPH_C64(0xC83EA4C89AB39A9A), SPH_C64(0xC22CB4C298B59898), - SPH_C64(0xD60894D69CB99C9C), SPH_C64(0x1D31C31DF20BF2F2), - SPH_C64(0x01F65101A7F4A7A7), SPH_C64(0x5599885511331111), - SPH_C64(0x9BA9D79B7E827E7E), SPH_C64(0x9DA72C9D8B808B8B), - SPH_C64(0x5261225243C54343), SPH_C64(0x0F1B180F03050303), - SPH_C64(0x4DA1434DE23BE2E2), SPH_C64(0x8B72AE8BDC79DCDC), - SPH_C64(0x569E7B56E532E5E5), SPH_C64(0x404BF940B2CBB2B2), - SPH_C64(0x6B044A6B4ED24E4E), SPH_C64(0xFCB176FCC754C7C7), - SPH_C64(0xC4224FC46DB76D6D), SPH_C64(0x6AF21B6AE926E9E9), - SPH_C64(0xBB0225BB27692727), SPH_C64(0x5D7A3A5D40C04040), - SPH_C64(0x9F568E9FD875D8D8), SPH_C64(0xEB92A5EB37593737), - SPH_C64(0xE076E4E092AB9292), SPH_C64(0x89830C898F8C8F8F), - SPH_C64(0x0509080501030101), SPH_C64(0x69F5E8691D271D1D), - SPH_C64(0x02F1A20253F55353), SPH_C64(0xC6D3EDC63E423E3E), - SPH_C64(0x20ABF22059EB5959), SPH_C64(0xE28746E2C15EC1C1), - SPH_C64(0x6E0D426E4FD14F4F), SPH_C64(0xFABF8DFA32563232), - SPH_C64(0x4EA6B04E163A1616), SPH_C64(0x35798335FA13FAFA), - 
SPH_C64(0xB9F387B9749C7474), SPH_C64(0x30708B30FB10FBFB), - SPH_C64(0xF25C3FF263A56363), SPH_C64(0xD9138CD99FBC9F9F), - SPH_C64(0xE489BDE4345C3434), SPH_C64(0x72CAD0721A2E1A1A), - SPH_C64(0x82674D822A7E2A2A), SPH_C64(0x2FB0EA2F5AEE5A5A), - SPH_C64(0x83911C838D8A8D8D), SPH_C64(0xCACF06CAC946C9C9), - SPH_C64(0xD4F936D4CF4CCFCF), SPH_C64(0x0915E309F607F6F6), - SPH_C64(0xEA64F4EA90AD9090), SPH_C64(0x88755D8828782828), - SPH_C64(0x92BC349288858888), SPH_C64(0xCD37ACCD9BB09B9B), - SPH_C64(0xF5A495F531533131), SPH_C64(0x367E70360E120E0E), - SPH_C64(0x733C8173BDDABDBD), SPH_C64(0x7F206A7F4ADE4A4A), - SPH_C64(0x6FFB136FE825E8E8), SPH_C64(0xF452C4F496A79696), - SPH_C64(0x04FF5904A6F7A6A6), SPH_C64(0x3C6C603C0C140C0C), - SPH_C64(0xCFC60ECFC845C8C8), SPH_C64(0x8096EF80798B7979), - SPH_C64(0x76358976BCD9BCBC), SPH_C64(0x7C27997CBEDFBEBE), - SPH_C64(0x74C42B74EF2CEFEF), SPH_C64(0xCB3957CB6EB26E6E), - SPH_C64(0x434C0A4346CA4646), SPH_C64(0xF15BCCF197A49797), - SPH_C64(0x2AB9E22A5BED5B5B), SPH_C64(0x7ED63B7EED2AEDED), - SPH_C64(0x7DD1C87D192B1919), SPH_C64(0x9A5F869AD976D9D9), - SPH_C64(0x26A50926ACE9ACAC), SPH_C64(0xC725BCC799B69999), - SPH_C64(0x32812932A8E5A8A8), SPH_C64(0x8D7C558D297B2929), - SPH_C64(0xE96307E964AC6464), SPH_C64(0x63E7F8631F211F1F), - SPH_C64(0x23AC0123ADEAADAD), SPH_C64(0x1CC7921C55FF5555), - SPH_C64(0x5F8B985F13351313), SPH_C64(0x6D0AB16DBBD0BBBB), - SPH_C64(0x0C1CEB0CF704F7F7), SPH_C64(0xCE305FCE6FB16F6F), - SPH_C64(0x6718A167B9D6B9B9), SPH_C64(0x4645024647C94747), - SPH_C64(0x934A65932F712F2F), SPH_C64(0x71CD2371EE2FEEEE), - SPH_C64(0x6211A962B8D5B8B8), SPH_C64(0x8A84FF8A7B8D7B7B), - SPH_C64(0x97B53C9789868989), SPH_C64(0xF0AD9DF030503030), - SPH_C64(0xB805D6B8D368D3D3), SPH_C64(0x9EA0DF9E7F817F7F), - SPH_C64(0xB3E197B3769A7676), SPH_C64(0xB0E664B0829B8282) -}; - -#if !SPH_SMALL_FOOTPRINT_WHIRLPOOL - -static const long long int old0_T1[256] = { - SPH_C64(0x0F67D568B86868D5), SPH_C64(0x1ECEB7D06DD0D0B7), - SPH_C64(0xE00B60EB20EBEB60), 
SPH_C64(0x6E45872B7D2B2B87), - SPH_C64(0x327A7548D8484875), SPH_C64(0x019CD39DBA9D9DD3), - SPH_C64(0x1D77DF6ABE6A6ADF), SPH_C64(0x977353E431E4E453), - SPH_C64(0xA84B48E338E3E348), SPH_C64(0xD27115A3F8A3A315), - SPH_C64(0xDC8A1356FA565613), SPH_C64(0xFD7CBF819E8181BF), - SPH_C64(0xB2CF947D877D7D94), SPH_C64(0x2ADB12F10EF1F112), - SPH_C64(0xD95CAB85928585AB), SPH_C64(0x1A84DC9EBF9E9EDC), - SPH_C64(0x517D9C2C742C2C9C), SPH_C64(0x8A048C8E8F8E8E8C), - SPH_C64(0x9FE7857888787885), SPH_C64(0xD41EC5CA43CACAC5), - SPH_C64(0xAFB84B173917174B), SPH_C64(0x882137A9E6A9A937), - SPH_C64(0x4E2FF861A36161F8), SPH_C64(0x33E6A6D562D5D5A6), - SPH_C64(0x8FD2345DE75D5D34), SPH_C64(0x5358270B1D0B0B27), - SPH_C64(0x9814868C898C8C86), SPH_C64(0xC1FDCC3C443C3CCC), - SPH_C64(0xE89FB677997777B6), SPH_C64(0xE3B20851F3515108), - SPH_C64(0x2F0DAA22662222AA), SPH_C64(0x682A5742C6424257), - SPH_C64(0xDAE5C33F413F3FC3), SPH_C64(0xCE9A1954FC545419), - SPH_C64(0x73325841C3414158), SPH_C64(0xF474BA809D8080BA), - SPH_C64(0xE22EDBCC49CCCCDB), SPH_C64(0xC244A486978686A4), - SPH_C64(0x42F145B3C8B3B345), SPH_C64(0xD8C0781828181878), - SPH_C64(0x436D962E722E2E96), SPH_C64(0xD5821657F9575716), - SPH_C64(0x36301E060A06061E), SPH_C64(0x5537F762A66262F7), - SPH_C64(0x07F303F401F4F403), SPH_C64(0x9BADEE365A3636EE), - SPH_C64(0x17C6B2D16ED1D1B2), SPH_C64(0x147FDA6BBD6B6BDA), - SPH_C64(0xC3D8771B2D1B1B77), SPH_C64(0x6A0FEC65AF6565EC), - SPH_C64(0xFA8FBC759F7575BC), SPH_C64(0x9080501030101050), - SPH_C64(0x449E95DA73DADA95), SPH_C64(0x3B727049DB494970), - SPH_C64(0x0B2DBE266A2626BE), SPH_C64(0x629B3AF916F9F93A), - SPH_C64(0xDD16C0CB40CBCBC0), SPH_C64(0x7117E366AA6666E3), - SPH_C64(0x8C6B5CE734E7E75C), SPH_C64(0x03B968BAD3BABA68), - SPH_C64(0xB7192CAEEFAEAE2C), SPH_C64(0xEABA0D50F050500D), - SPH_C64(0xF8AA0752F6525207), SPH_C64(0x9A313DABE0ABAB3D), - SPH_C64(0x2D2811050F050511), SPH_C64(0x23D317F00DF0F017), - SPH_C64(0x6568390D170D0D39), SPH_C64(0xCCBFA273957373A2), - SPH_C64(0xFEC5D73B4D3B3BD7), 
SPH_C64(0x242014040C040414), - SPH_C64(0x3D1DA020602020A0), SPH_C64(0x5DA321FE1FFEFE21), - SPH_C64(0x7BA68EDD7ADDDD8E), SPH_C64(0x0EFB06F502F5F506), - SPH_C64(0x7DC95EB4C1B4B45E), SPH_C64(0x9DC23E5FE15F5F3E), - SPH_C64(0x5A50220A1E0A0A22), SPH_C64(0x74C15BB5C2B5B55B), - SPH_C64(0x8E4EE7C05DC0C0E7), SPH_C64(0xC9691AA0FDA0A01A), - SPH_C64(0xDEAFA871937171A8), SPH_C64(0xE4410BA5F2A5A50B), - SPH_C64(0x5875992D772D2D99), SPH_C64(0x4727FD60A06060FD), - SPH_C64(0xC5B7A772967272A7), SPH_C64(0x7FECE593A89393E5), - SPH_C64(0xECD5DD394B3939DD), SPH_C64(0x4840280818080828), - SPH_C64(0xEF6CB583988383B5), SPH_C64(0x3415A521632121A5), - SPH_C64(0x86DA315CE45C5C31), SPH_C64(0xCB4CA187948787A1), - SPH_C64(0x50E14FB1CEB1B14F), SPH_C64(0xB35347E03DE0E047), - SPH_C64(0x0000000000000000), SPH_C64(0x9556E8C358C3C3E8), - SPH_C64(0x82905A123612125A), SPH_C64(0x6DFCEF91AE9191EF), - SPH_C64(0xAE24988A838A8A98), SPH_C64(0x12100A020602020A), - SPH_C64(0xFCE06C1C241C1C6C), SPH_C64(0x856359E637E6E659), - SPH_C64(0x57124C45CF45454C), SPH_C64(0x9C5EEDC25BC2C2ED), - SPH_C64(0xAA6EF3C451C4C4F3), SPH_C64(0x46BB2EFD1AFDFD2E), - SPH_C64(0x2E9179BFDCBFBF79), SPH_C64(0x5E1A4944CC444449), - SPH_C64(0xC0611FA1FEA1A11F), SPH_C64(0x165A614CD44C4C61), - SPH_C64(0xB685FF33553333FF), SPH_C64(0xA366F6C552C5C5F6), - SPH_C64(0xD054AE84918484AE), SPH_C64(0x2605AF23652323AF), - SPH_C64(0xBBC7917C847C7C91), SPH_C64(0x59E94AB0CDB0B04A), - SPH_C64(0x1035B1256F2525B1), SPH_C64(0xBDA841153F151541), - SPH_C64(0x80B5E1355F3535E1), SPH_C64(0x066FD069BB6969D0), - SPH_C64(0x54AB24FF1CFFFF24), SPH_C64(0x40D4FE94A19494FE), - SPH_C64(0x1F52644DD74D4D64), SPH_C64(0xD7A7AD70907070AD), - SPH_C64(0xDB7910A2FBA2A210), SPH_C64(0xBE1129AFECAFAF29), - SPH_C64(0xEB26DECD4ACDCDDE), SPH_C64(0x28FEA9D667D6D6A9), - SPH_C64(0x2B47C16CB46C6CC1), SPH_C64(0x66D151B7C4B7B751), - SPH_C64(0x6B933FF815F8F83F), SPH_C64(0x41482D091B09092D), - SPH_C64(0x38CB18F308F3F318), SPH_C64(0x781FE667A96767E6), - SPH_C64(0xED490EA4F1A4A40E), 
SPH_C64(0xE90365EA23EAEA65), - SPH_C64(0xDF337BEC29ECEC7B), SPH_C64(0x6FD954B6C7B6B654), - SPH_C64(0x3AEEA3D461D4D4A3), SPH_C64(0x0CDEBDD26BD2D2BD), - SPH_C64(0xB4A044143C141444), SPH_C64(0xEEF0661E221E1E66), - SPH_C64(0xBA5B42E13EE1E142), SPH_C64(0x193DB4246C2424B4), - SPH_C64(0xE5DDD838483838D8), SPH_C64(0xB87EF9C657C6C6F9), - SPH_C64(0x4D9690DB70DBDB90), SPH_C64(0x29627A4BDD4B4B7A), - SPH_C64(0x8DF78F7A8E7A7A8F), SPH_C64(0xF7CDD23A4E3A3AD2), - SPH_C64(0x60BE81DE7FDEDE81), SPH_C64(0x94CA3B5EE25E5E3B), - SPH_C64(0x69B684DF7CDFDF84), SPH_C64(0x49DCFB95A29595FB), - SPH_C64(0x4FB32BFC19FCFC2B), SPH_C64(0x933938AAE3AAAA38), - SPH_C64(0x21F6ACD764D7D7AC), SPH_C64(0xF03ED1CE4FCECED1), - SPH_C64(0x3F381B070907071B), SPH_C64(0x7778330F110F0F33), - SPH_C64(0xC8F5C93D473D3DC9), SPH_C64(0xA2FA2558E8585825), - SPH_C64(0x3EA4C89AB39A9AC8), SPH_C64(0x2CB4C298B59898C2), - SPH_C64(0x0894D69CB99C9CD6), SPH_C64(0x31C31DF20BF2F21D), - SPH_C64(0xF65101A7F4A7A701), SPH_C64(0x9988551133111155), - SPH_C64(0xA9D79B7E827E7E9B), SPH_C64(0xA72C9D8B808B8B9D), - SPH_C64(0x61225243C5434352), SPH_C64(0x1B180F030503030F), - SPH_C64(0xA1434DE23BE2E24D), SPH_C64(0x72AE8BDC79DCDC8B), - SPH_C64(0x9E7B56E532E5E556), SPH_C64(0x4BF940B2CBB2B240), - SPH_C64(0x044A6B4ED24E4E6B), SPH_C64(0xB176FCC754C7C7FC), - SPH_C64(0x224FC46DB76D6DC4), SPH_C64(0xF21B6AE926E9E96A), - SPH_C64(0x0225BB27692727BB), SPH_C64(0x7A3A5D40C040405D), - SPH_C64(0x568E9FD875D8D89F), SPH_C64(0x92A5EB37593737EB), - SPH_C64(0x76E4E092AB9292E0), SPH_C64(0x830C898F8C8F8F89), - SPH_C64(0x0908050103010105), SPH_C64(0xF5E8691D271D1D69), - SPH_C64(0xF1A20253F5535302), SPH_C64(0xD3EDC63E423E3EC6), - SPH_C64(0xABF22059EB595920), SPH_C64(0x8746E2C15EC1C1E2), - SPH_C64(0x0D426E4FD14F4F6E), SPH_C64(0xBF8DFA32563232FA), - SPH_C64(0xA6B04E163A16164E), SPH_C64(0x798335FA13FAFA35), - SPH_C64(0xF387B9749C7474B9), SPH_C64(0x708B30FB10FBFB30), - SPH_C64(0x5C3FF263A56363F2), SPH_C64(0x138CD99FBC9F9FD9), - SPH_C64(0x89BDE4345C3434E4), 
SPH_C64(0xCAD0721A2E1A1A72), - SPH_C64(0x674D822A7E2A2A82), SPH_C64(0xB0EA2F5AEE5A5A2F), - SPH_C64(0x911C838D8A8D8D83), SPH_C64(0xCF06CAC946C9C9CA), - SPH_C64(0xF936D4CF4CCFCFD4), SPH_C64(0x15E309F607F6F609), - SPH_C64(0x64F4EA90AD9090EA), SPH_C64(0x755D882878282888), - SPH_C64(0xBC34928885888892), SPH_C64(0x37ACCD9BB09B9BCD), - SPH_C64(0xA495F531533131F5), SPH_C64(0x7E70360E120E0E36), - SPH_C64(0x3C8173BDDABDBD73), SPH_C64(0x206A7F4ADE4A4A7F), - SPH_C64(0xFB136FE825E8E86F), SPH_C64(0x52C4F496A79696F4), - SPH_C64(0xFF5904A6F7A6A604), SPH_C64(0x6C603C0C140C0C3C), - SPH_C64(0xC60ECFC845C8C8CF), SPH_C64(0x96EF80798B797980), - SPH_C64(0x358976BCD9BCBC76), SPH_C64(0x27997CBEDFBEBE7C), - SPH_C64(0xC42B74EF2CEFEF74), SPH_C64(0x3957CB6EB26E6ECB), - SPH_C64(0x4C0A4346CA464643), SPH_C64(0x5BCCF197A49797F1), - SPH_C64(0xB9E22A5BED5B5B2A), SPH_C64(0xD63B7EED2AEDED7E), - SPH_C64(0xD1C87D192B19197D), SPH_C64(0x5F869AD976D9D99A), - SPH_C64(0xA50926ACE9ACAC26), SPH_C64(0x25BCC799B69999C7), - SPH_C64(0x812932A8E5A8A832), SPH_C64(0x7C558D297B29298D), - SPH_C64(0x6307E964AC6464E9), SPH_C64(0xE7F8631F211F1F63), - SPH_C64(0xAC0123ADEAADAD23), SPH_C64(0xC7921C55FF55551C), - SPH_C64(0x8B985F133513135F), SPH_C64(0x0AB16DBBD0BBBB6D), - SPH_C64(0x1CEB0CF704F7F70C), SPH_C64(0x305FCE6FB16F6FCE), - SPH_C64(0x18A167B9D6B9B967), SPH_C64(0x45024647C9474746), - SPH_C64(0x4A65932F712F2F93), SPH_C64(0xCD2371EE2FEEEE71), - SPH_C64(0x11A962B8D5B8B862), SPH_C64(0x84FF8A7B8D7B7B8A), - SPH_C64(0xB53C978986898997), SPH_C64(0xAD9DF030503030F0), - SPH_C64(0x05D6B8D368D3D3B8), SPH_C64(0xA0DF9E7F817F7F9E), - SPH_C64(0xE197B3769A7676B3), SPH_C64(0xE664B0829B8282B0) -}; - -static const long long int old0_T2[256] = { - SPH_C64(0x67D568B86868D50F), SPH_C64(0xCEB7D06DD0D0B71E), - SPH_C64(0x0B60EB20EBEB60E0), SPH_C64(0x45872B7D2B2B876E), - SPH_C64(0x7A7548D848487532), SPH_C64(0x9CD39DBA9D9DD301), - SPH_C64(0x77DF6ABE6A6ADF1D), SPH_C64(0x7353E431E4E45397), - SPH_C64(0x4B48E338E3E348A8), SPH_C64(0x7115A3F8A3A315D2), 
- SPH_C64(0x8A1356FA565613DC), SPH_C64(0x7CBF819E8181BFFD), - SPH_C64(0xCF947D877D7D94B2), SPH_C64(0xDB12F10EF1F1122A), - SPH_C64(0x5CAB85928585ABD9), SPH_C64(0x84DC9EBF9E9EDC1A), - SPH_C64(0x7D9C2C742C2C9C51), SPH_C64(0x048C8E8F8E8E8C8A), - SPH_C64(0xE78578887878859F), SPH_C64(0x1EC5CA43CACAC5D4), - SPH_C64(0xB84B173917174BAF), SPH_C64(0x2137A9E6A9A93788), - SPH_C64(0x2FF861A36161F84E), SPH_C64(0xE6A6D562D5D5A633), - SPH_C64(0xD2345DE75D5D348F), SPH_C64(0x58270B1D0B0B2753), - SPH_C64(0x14868C898C8C8698), SPH_C64(0xFDCC3C443C3CCCC1), - SPH_C64(0x9FB677997777B6E8), SPH_C64(0xB20851F3515108E3), - SPH_C64(0x0DAA22662222AA2F), SPH_C64(0x2A5742C642425768), - SPH_C64(0xE5C33F413F3FC3DA), SPH_C64(0x9A1954FC545419CE), - SPH_C64(0x325841C341415873), SPH_C64(0x74BA809D8080BAF4), - SPH_C64(0x2EDBCC49CCCCDBE2), SPH_C64(0x44A486978686A4C2), - SPH_C64(0xF145B3C8B3B34542), SPH_C64(0xC0781828181878D8), - SPH_C64(0x6D962E722E2E9643), SPH_C64(0x821657F9575716D5), - SPH_C64(0x301E060A06061E36), SPH_C64(0x37F762A66262F755), - SPH_C64(0xF303F401F4F40307), SPH_C64(0xADEE365A3636EE9B), - SPH_C64(0xC6B2D16ED1D1B217), SPH_C64(0x7FDA6BBD6B6BDA14), - SPH_C64(0xD8771B2D1B1B77C3), SPH_C64(0x0FEC65AF6565EC6A), - SPH_C64(0x8FBC759F7575BCFA), SPH_C64(0x8050103010105090), - SPH_C64(0x9E95DA73DADA9544), SPH_C64(0x727049DB4949703B), - SPH_C64(0x2DBE266A2626BE0B), SPH_C64(0x9B3AF916F9F93A62), - SPH_C64(0x16C0CB40CBCBC0DD), SPH_C64(0x17E366AA6666E371), - SPH_C64(0x6B5CE734E7E75C8C), SPH_C64(0xB968BAD3BABA6803), - SPH_C64(0x192CAEEFAEAE2CB7), SPH_C64(0xBA0D50F050500DEA), - SPH_C64(0xAA0752F6525207F8), SPH_C64(0x313DABE0ABAB3D9A), - SPH_C64(0x2811050F0505112D), SPH_C64(0xD317F00DF0F01723), - SPH_C64(0x68390D170D0D3965), SPH_C64(0xBFA273957373A2CC), - SPH_C64(0xC5D73B4D3B3BD7FE), SPH_C64(0x2014040C04041424), - SPH_C64(0x1DA020602020A03D), SPH_C64(0xA321FE1FFEFE215D), - SPH_C64(0xA68EDD7ADDDD8E7B), SPH_C64(0xFB06F502F5F5060E), - SPH_C64(0xC95EB4C1B4B45E7D), SPH_C64(0xC23E5FE15F5F3E9D), - 
SPH_C64(0x50220A1E0A0A225A), SPH_C64(0xC15BB5C2B5B55B74), - SPH_C64(0x4EE7C05DC0C0E78E), SPH_C64(0x691AA0FDA0A01AC9), - SPH_C64(0xAFA871937171A8DE), SPH_C64(0x410BA5F2A5A50BE4), - SPH_C64(0x75992D772D2D9958), SPH_C64(0x27FD60A06060FD47), - SPH_C64(0xB7A772967272A7C5), SPH_C64(0xECE593A89393E57F), - SPH_C64(0xD5DD394B3939DDEC), SPH_C64(0x4028081808082848), - SPH_C64(0x6CB583988383B5EF), SPH_C64(0x15A521632121A534), - SPH_C64(0xDA315CE45C5C3186), SPH_C64(0x4CA187948787A1CB), - SPH_C64(0xE14FB1CEB1B14F50), SPH_C64(0x5347E03DE0E047B3), - SPH_C64(0x0000000000000000), SPH_C64(0x56E8C358C3C3E895), - SPH_C64(0x905A123612125A82), SPH_C64(0xFCEF91AE9191EF6D), - SPH_C64(0x24988A838A8A98AE), SPH_C64(0x100A020602020A12), - SPH_C64(0xE06C1C241C1C6CFC), SPH_C64(0x6359E637E6E65985), - SPH_C64(0x124C45CF45454C57), SPH_C64(0x5EEDC25BC2C2ED9C), - SPH_C64(0x6EF3C451C4C4F3AA), SPH_C64(0xBB2EFD1AFDFD2E46), - SPH_C64(0x9179BFDCBFBF792E), SPH_C64(0x1A4944CC4444495E), - SPH_C64(0x611FA1FEA1A11FC0), SPH_C64(0x5A614CD44C4C6116), - SPH_C64(0x85FF33553333FFB6), SPH_C64(0x66F6C552C5C5F6A3), - SPH_C64(0x54AE84918484AED0), SPH_C64(0x05AF23652323AF26), - SPH_C64(0xC7917C847C7C91BB), SPH_C64(0xE94AB0CDB0B04A59), - SPH_C64(0x35B1256F2525B110), SPH_C64(0xA841153F151541BD), - SPH_C64(0xB5E1355F3535E180), SPH_C64(0x6FD069BB6969D006), - SPH_C64(0xAB24FF1CFFFF2454), SPH_C64(0xD4FE94A19494FE40), - SPH_C64(0x52644DD74D4D641F), SPH_C64(0xA7AD70907070ADD7), - SPH_C64(0x7910A2FBA2A210DB), SPH_C64(0x1129AFECAFAF29BE), - SPH_C64(0x26DECD4ACDCDDEEB), SPH_C64(0xFEA9D667D6D6A928), - SPH_C64(0x47C16CB46C6CC12B), SPH_C64(0xD151B7C4B7B75166), - SPH_C64(0x933FF815F8F83F6B), SPH_C64(0x482D091B09092D41), - SPH_C64(0xCB18F308F3F31838), SPH_C64(0x1FE667A96767E678), - SPH_C64(0x490EA4F1A4A40EED), SPH_C64(0x0365EA23EAEA65E9), - SPH_C64(0x337BEC29ECEC7BDF), SPH_C64(0xD954B6C7B6B6546F), - SPH_C64(0xEEA3D461D4D4A33A), SPH_C64(0xDEBDD26BD2D2BD0C), - SPH_C64(0xA044143C141444B4), SPH_C64(0xF0661E221E1E66EE), - 
SPH_C64(0x5B42E13EE1E142BA), SPH_C64(0x3DB4246C2424B419), - SPH_C64(0xDDD838483838D8E5), SPH_C64(0x7EF9C657C6C6F9B8), - SPH_C64(0x9690DB70DBDB904D), SPH_C64(0x627A4BDD4B4B7A29), - SPH_C64(0xF78F7A8E7A7A8F8D), SPH_C64(0xCDD23A4E3A3AD2F7), - SPH_C64(0xBE81DE7FDEDE8160), SPH_C64(0xCA3B5EE25E5E3B94), - SPH_C64(0xB684DF7CDFDF8469), SPH_C64(0xDCFB95A29595FB49), - SPH_C64(0xB32BFC19FCFC2B4F), SPH_C64(0x3938AAE3AAAA3893), - SPH_C64(0xF6ACD764D7D7AC21), SPH_C64(0x3ED1CE4FCECED1F0), - SPH_C64(0x381B070907071B3F), SPH_C64(0x78330F110F0F3377), - SPH_C64(0xF5C93D473D3DC9C8), SPH_C64(0xFA2558E8585825A2), - SPH_C64(0xA4C89AB39A9AC83E), SPH_C64(0xB4C298B59898C22C), - SPH_C64(0x94D69CB99C9CD608), SPH_C64(0xC31DF20BF2F21D31), - SPH_C64(0x5101A7F4A7A701F6), SPH_C64(0x8855113311115599), - SPH_C64(0xD79B7E827E7E9BA9), SPH_C64(0x2C9D8B808B8B9DA7), - SPH_C64(0x225243C543435261), SPH_C64(0x180F030503030F1B), - SPH_C64(0x434DE23BE2E24DA1), SPH_C64(0xAE8BDC79DCDC8B72), - SPH_C64(0x7B56E532E5E5569E), SPH_C64(0xF940B2CBB2B2404B), - SPH_C64(0x4A6B4ED24E4E6B04), SPH_C64(0x76FCC754C7C7FCB1), - SPH_C64(0x4FC46DB76D6DC422), SPH_C64(0x1B6AE926E9E96AF2), - SPH_C64(0x25BB27692727BB02), SPH_C64(0x3A5D40C040405D7A), - SPH_C64(0x8E9FD875D8D89F56), SPH_C64(0xA5EB37593737EB92), - SPH_C64(0xE4E092AB9292E076), SPH_C64(0x0C898F8C8F8F8983), - SPH_C64(0x0805010301010509), SPH_C64(0xE8691D271D1D69F5), - SPH_C64(0xA20253F5535302F1), SPH_C64(0xEDC63E423E3EC6D3), - SPH_C64(0xF22059EB595920AB), SPH_C64(0x46E2C15EC1C1E287), - SPH_C64(0x426E4FD14F4F6E0D), SPH_C64(0x8DFA32563232FABF), - SPH_C64(0xB04E163A16164EA6), SPH_C64(0x8335FA13FAFA3579), - SPH_C64(0x87B9749C7474B9F3), SPH_C64(0x8B30FB10FBFB3070), - SPH_C64(0x3FF263A56363F25C), SPH_C64(0x8CD99FBC9F9FD913), - SPH_C64(0xBDE4345C3434E489), SPH_C64(0xD0721A2E1A1A72CA), - SPH_C64(0x4D822A7E2A2A8267), SPH_C64(0xEA2F5AEE5A5A2FB0), - SPH_C64(0x1C838D8A8D8D8391), SPH_C64(0x06CAC946C9C9CACF), - SPH_C64(0x36D4CF4CCFCFD4F9), SPH_C64(0xE309F607F6F60915), - 
SPH_C64(0xF4EA90AD9090EA64), SPH_C64(0x5D88287828288875), - SPH_C64(0x34928885888892BC), SPH_C64(0xACCD9BB09B9BCD37), - SPH_C64(0x95F531533131F5A4), SPH_C64(0x70360E120E0E367E), - SPH_C64(0x8173BDDABDBD733C), SPH_C64(0x6A7F4ADE4A4A7F20), - SPH_C64(0x136FE825E8E86FFB), SPH_C64(0xC4F496A79696F452), - SPH_C64(0x5904A6F7A6A604FF), SPH_C64(0x603C0C140C0C3C6C), - SPH_C64(0x0ECFC845C8C8CFC6), SPH_C64(0xEF80798B79798096), - SPH_C64(0x8976BCD9BCBC7635), SPH_C64(0x997CBEDFBEBE7C27), - SPH_C64(0x2B74EF2CEFEF74C4), SPH_C64(0x57CB6EB26E6ECB39), - SPH_C64(0x0A4346CA4646434C), SPH_C64(0xCCF197A49797F15B), - SPH_C64(0xE22A5BED5B5B2AB9), SPH_C64(0x3B7EED2AEDED7ED6), - SPH_C64(0xC87D192B19197DD1), SPH_C64(0x869AD976D9D99A5F), - SPH_C64(0x0926ACE9ACAC26A5), SPH_C64(0xBCC799B69999C725), - SPH_C64(0x2932A8E5A8A83281), SPH_C64(0x558D297B29298D7C), - SPH_C64(0x07E964AC6464E963), SPH_C64(0xF8631F211F1F63E7), - SPH_C64(0x0123ADEAADAD23AC), SPH_C64(0x921C55FF55551CC7), - SPH_C64(0x985F133513135F8B), SPH_C64(0xB16DBBD0BBBB6D0A), - SPH_C64(0xEB0CF704F7F70C1C), SPH_C64(0x5FCE6FB16F6FCE30), - SPH_C64(0xA167B9D6B9B96718), SPH_C64(0x024647C947474645), - SPH_C64(0x65932F712F2F934A), SPH_C64(0x2371EE2FEEEE71CD), - SPH_C64(0xA962B8D5B8B86211), SPH_C64(0xFF8A7B8D7B7B8A84), - SPH_C64(0x3C978986898997B5), SPH_C64(0x9DF030503030F0AD), - SPH_C64(0xD6B8D368D3D3B805), SPH_C64(0xDF9E7F817F7F9EA0), - SPH_C64(0x97B3769A7676B3E1), SPH_C64(0x64B0829B8282B0E6) -}; - -static const long long int old0_T3[256] = { - SPH_C64(0xD568B86868D50F67), SPH_C64(0xB7D06DD0D0B71ECE), - SPH_C64(0x60EB20EBEB60E00B), SPH_C64(0x872B7D2B2B876E45), - SPH_C64(0x7548D8484875327A), SPH_C64(0xD39DBA9D9DD3019C), - SPH_C64(0xDF6ABE6A6ADF1D77), SPH_C64(0x53E431E4E4539773), - SPH_C64(0x48E338E3E348A84B), SPH_C64(0x15A3F8A3A315D271), - SPH_C64(0x1356FA565613DC8A), SPH_C64(0xBF819E8181BFFD7C), - SPH_C64(0x947D877D7D94B2CF), SPH_C64(0x12F10EF1F1122ADB), - SPH_C64(0xAB85928585ABD95C), SPH_C64(0xDC9EBF9E9EDC1A84), - SPH_C64(0x9C2C742C2C9C517D), 
SPH_C64(0x8C8E8F8E8E8C8A04), - SPH_C64(0x8578887878859FE7), SPH_C64(0xC5CA43CACAC5D41E), - SPH_C64(0x4B173917174BAFB8), SPH_C64(0x37A9E6A9A9378821), - SPH_C64(0xF861A36161F84E2F), SPH_C64(0xA6D562D5D5A633E6), - SPH_C64(0x345DE75D5D348FD2), SPH_C64(0x270B1D0B0B275358), - SPH_C64(0x868C898C8C869814), SPH_C64(0xCC3C443C3CCCC1FD), - SPH_C64(0xB677997777B6E89F), SPH_C64(0x0851F3515108E3B2), - SPH_C64(0xAA22662222AA2F0D), SPH_C64(0x5742C6424257682A), - SPH_C64(0xC33F413F3FC3DAE5), SPH_C64(0x1954FC545419CE9A), - SPH_C64(0x5841C34141587332), SPH_C64(0xBA809D8080BAF474), - SPH_C64(0xDBCC49CCCCDBE22E), SPH_C64(0xA486978686A4C244), - SPH_C64(0x45B3C8B3B34542F1), SPH_C64(0x781828181878D8C0), - SPH_C64(0x962E722E2E96436D), SPH_C64(0x1657F9575716D582), - SPH_C64(0x1E060A06061E3630), SPH_C64(0xF762A66262F75537), - SPH_C64(0x03F401F4F40307F3), SPH_C64(0xEE365A3636EE9BAD), - SPH_C64(0xB2D16ED1D1B217C6), SPH_C64(0xDA6BBD6B6BDA147F), - SPH_C64(0x771B2D1B1B77C3D8), SPH_C64(0xEC65AF6565EC6A0F), - SPH_C64(0xBC759F7575BCFA8F), SPH_C64(0x5010301010509080), - SPH_C64(0x95DA73DADA95449E), SPH_C64(0x7049DB4949703B72), - SPH_C64(0xBE266A2626BE0B2D), SPH_C64(0x3AF916F9F93A629B), - SPH_C64(0xC0CB40CBCBC0DD16), SPH_C64(0xE366AA6666E37117), - SPH_C64(0x5CE734E7E75C8C6B), SPH_C64(0x68BAD3BABA6803B9), - SPH_C64(0x2CAEEFAEAE2CB719), SPH_C64(0x0D50F050500DEABA), - SPH_C64(0x0752F6525207F8AA), SPH_C64(0x3DABE0ABAB3D9A31), - SPH_C64(0x11050F0505112D28), SPH_C64(0x17F00DF0F01723D3), - SPH_C64(0x390D170D0D396568), SPH_C64(0xA273957373A2CCBF), - SPH_C64(0xD73B4D3B3BD7FEC5), SPH_C64(0x14040C0404142420), - SPH_C64(0xA020602020A03D1D), SPH_C64(0x21FE1FFEFE215DA3), - SPH_C64(0x8EDD7ADDDD8E7BA6), SPH_C64(0x06F502F5F5060EFB), - SPH_C64(0x5EB4C1B4B45E7DC9), SPH_C64(0x3E5FE15F5F3E9DC2), - SPH_C64(0x220A1E0A0A225A50), SPH_C64(0x5BB5C2B5B55B74C1), - SPH_C64(0xE7C05DC0C0E78E4E), SPH_C64(0x1AA0FDA0A01AC969), - SPH_C64(0xA871937171A8DEAF), SPH_C64(0x0BA5F2A5A50BE441), - SPH_C64(0x992D772D2D995875), 
SPH_C64(0xFD60A06060FD4727), - SPH_C64(0xA772967272A7C5B7), SPH_C64(0xE593A89393E57FEC), - SPH_C64(0xDD394B3939DDECD5), SPH_C64(0x2808180808284840), - SPH_C64(0xB583988383B5EF6C), SPH_C64(0xA521632121A53415), - SPH_C64(0x315CE45C5C3186DA), SPH_C64(0xA187948787A1CB4C), - SPH_C64(0x4FB1CEB1B14F50E1), SPH_C64(0x47E03DE0E047B353), - SPH_C64(0x0000000000000000), SPH_C64(0xE8C358C3C3E89556), - SPH_C64(0x5A123612125A8290), SPH_C64(0xEF91AE9191EF6DFC), - SPH_C64(0x988A838A8A98AE24), SPH_C64(0x0A020602020A1210), - SPH_C64(0x6C1C241C1C6CFCE0), SPH_C64(0x59E637E6E6598563), - SPH_C64(0x4C45CF45454C5712), SPH_C64(0xEDC25BC2C2ED9C5E), - SPH_C64(0xF3C451C4C4F3AA6E), SPH_C64(0x2EFD1AFDFD2E46BB), - SPH_C64(0x79BFDCBFBF792E91), SPH_C64(0x4944CC4444495E1A), - SPH_C64(0x1FA1FEA1A11FC061), SPH_C64(0x614CD44C4C61165A), - SPH_C64(0xFF33553333FFB685), SPH_C64(0xF6C552C5C5F6A366), - SPH_C64(0xAE84918484AED054), SPH_C64(0xAF23652323AF2605), - SPH_C64(0x917C847C7C91BBC7), SPH_C64(0x4AB0CDB0B04A59E9), - SPH_C64(0xB1256F2525B11035), SPH_C64(0x41153F151541BDA8), - SPH_C64(0xE1355F3535E180B5), SPH_C64(0xD069BB6969D0066F), - SPH_C64(0x24FF1CFFFF2454AB), SPH_C64(0xFE94A19494FE40D4), - SPH_C64(0x644DD74D4D641F52), SPH_C64(0xAD70907070ADD7A7), - SPH_C64(0x10A2FBA2A210DB79), SPH_C64(0x29AFECAFAF29BE11), - SPH_C64(0xDECD4ACDCDDEEB26), SPH_C64(0xA9D667D6D6A928FE), - SPH_C64(0xC16CB46C6CC12B47), SPH_C64(0x51B7C4B7B75166D1), - SPH_C64(0x3FF815F8F83F6B93), SPH_C64(0x2D091B09092D4148), - SPH_C64(0x18F308F3F31838CB), SPH_C64(0xE667A96767E6781F), - SPH_C64(0x0EA4F1A4A40EED49), SPH_C64(0x65EA23EAEA65E903), - SPH_C64(0x7BEC29ECEC7BDF33), SPH_C64(0x54B6C7B6B6546FD9), - SPH_C64(0xA3D461D4D4A33AEE), SPH_C64(0xBDD26BD2D2BD0CDE), - SPH_C64(0x44143C141444B4A0), SPH_C64(0x661E221E1E66EEF0), - SPH_C64(0x42E13EE1E142BA5B), SPH_C64(0xB4246C2424B4193D), - SPH_C64(0xD838483838D8E5DD), SPH_C64(0xF9C657C6C6F9B87E), - SPH_C64(0x90DB70DBDB904D96), SPH_C64(0x7A4BDD4B4B7A2962), - SPH_C64(0x8F7A8E7A7A8F8DF7), 
SPH_C64(0xD23A4E3A3AD2F7CD), - SPH_C64(0x81DE7FDEDE8160BE), SPH_C64(0x3B5EE25E5E3B94CA), - SPH_C64(0x84DF7CDFDF8469B6), SPH_C64(0xFB95A29595FB49DC), - SPH_C64(0x2BFC19FCFC2B4FB3), SPH_C64(0x38AAE3AAAA389339), - SPH_C64(0xACD764D7D7AC21F6), SPH_C64(0xD1CE4FCECED1F03E), - SPH_C64(0x1B070907071B3F38), SPH_C64(0x330F110F0F337778), - SPH_C64(0xC93D473D3DC9C8F5), SPH_C64(0x2558E8585825A2FA), - SPH_C64(0xC89AB39A9AC83EA4), SPH_C64(0xC298B59898C22CB4), - SPH_C64(0xD69CB99C9CD60894), SPH_C64(0x1DF20BF2F21D31C3), - SPH_C64(0x01A7F4A7A701F651), SPH_C64(0x5511331111559988), - SPH_C64(0x9B7E827E7E9BA9D7), SPH_C64(0x9D8B808B8B9DA72C), - SPH_C64(0x5243C54343526122), SPH_C64(0x0F030503030F1B18), - SPH_C64(0x4DE23BE2E24DA143), SPH_C64(0x8BDC79DCDC8B72AE), - SPH_C64(0x56E532E5E5569E7B), SPH_C64(0x40B2CBB2B2404BF9), - SPH_C64(0x6B4ED24E4E6B044A), SPH_C64(0xFCC754C7C7FCB176), - SPH_C64(0xC46DB76D6DC4224F), SPH_C64(0x6AE926E9E96AF21B), - SPH_C64(0xBB27692727BB0225), SPH_C64(0x5D40C040405D7A3A), - SPH_C64(0x9FD875D8D89F568E), SPH_C64(0xEB37593737EB92A5), - SPH_C64(0xE092AB9292E076E4), SPH_C64(0x898F8C8F8F89830C), - SPH_C64(0x0501030101050908), SPH_C64(0x691D271D1D69F5E8), - SPH_C64(0x0253F5535302F1A2), SPH_C64(0xC63E423E3EC6D3ED), - SPH_C64(0x2059EB595920ABF2), SPH_C64(0xE2C15EC1C1E28746), - SPH_C64(0x6E4FD14F4F6E0D42), SPH_C64(0xFA32563232FABF8D), - SPH_C64(0x4E163A16164EA6B0), SPH_C64(0x35FA13FAFA357983), - SPH_C64(0xB9749C7474B9F387), SPH_C64(0x30FB10FBFB30708B), - SPH_C64(0xF263A56363F25C3F), SPH_C64(0xD99FBC9F9FD9138C), - SPH_C64(0xE4345C3434E489BD), SPH_C64(0x721A2E1A1A72CAD0), - SPH_C64(0x822A7E2A2A82674D), SPH_C64(0x2F5AEE5A5A2FB0EA), - SPH_C64(0x838D8A8D8D83911C), SPH_C64(0xCAC946C9C9CACF06), - SPH_C64(0xD4CF4CCFCFD4F936), SPH_C64(0x09F607F6F60915E3), - SPH_C64(0xEA90AD9090EA64F4), SPH_C64(0x882878282888755D), - SPH_C64(0x928885888892BC34), SPH_C64(0xCD9BB09B9BCD37AC), - SPH_C64(0xF531533131F5A495), SPH_C64(0x360E120E0E367E70), - SPH_C64(0x73BDDABDBD733C81), 
SPH_C64(0x7F4ADE4A4A7F206A), - SPH_C64(0x6FE825E8E86FFB13), SPH_C64(0xF496A79696F452C4), - SPH_C64(0x04A6F7A6A604FF59), SPH_C64(0x3C0C140C0C3C6C60), - SPH_C64(0xCFC845C8C8CFC60E), SPH_C64(0x80798B79798096EF), - SPH_C64(0x76BCD9BCBC763589), SPH_C64(0x7CBEDFBEBE7C2799), - SPH_C64(0x74EF2CEFEF74C42B), SPH_C64(0xCB6EB26E6ECB3957), - SPH_C64(0x4346CA4646434C0A), SPH_C64(0xF197A49797F15BCC), - SPH_C64(0x2A5BED5B5B2AB9E2), SPH_C64(0x7EED2AEDED7ED63B), - SPH_C64(0x7D192B19197DD1C8), SPH_C64(0x9AD976D9D99A5F86), - SPH_C64(0x26ACE9ACAC26A509), SPH_C64(0xC799B69999C725BC), - SPH_C64(0x32A8E5A8A8328129), SPH_C64(0x8D297B29298D7C55), - SPH_C64(0xE964AC6464E96307), SPH_C64(0x631F211F1F63E7F8), - SPH_C64(0x23ADEAADAD23AC01), SPH_C64(0x1C55FF55551CC792), - SPH_C64(0x5F133513135F8B98), SPH_C64(0x6DBBD0BBBB6D0AB1), - SPH_C64(0x0CF704F7F70C1CEB), SPH_C64(0xCE6FB16F6FCE305F), - SPH_C64(0x67B9D6B9B96718A1), SPH_C64(0x4647C94747464502), - SPH_C64(0x932F712F2F934A65), SPH_C64(0x71EE2FEEEE71CD23), - SPH_C64(0x62B8D5B8B86211A9), SPH_C64(0x8A7B8D7B7B8A84FF), - SPH_C64(0x978986898997B53C), SPH_C64(0xF030503030F0AD9D), - SPH_C64(0xB8D368D3D3B805D6), SPH_C64(0x9E7F817F7F9EA0DF), - SPH_C64(0xB3769A7676B3E197), SPH_C64(0xB0829B8282B0E664) -}; - -static const long long int old0_T4[256] = { - SPH_C64(0x68B86868D50F67D5), SPH_C64(0xD06DD0D0B71ECEB7), - SPH_C64(0xEB20EBEB60E00B60), SPH_C64(0x2B7D2B2B876E4587), - SPH_C64(0x48D8484875327A75), SPH_C64(0x9DBA9D9DD3019CD3), - SPH_C64(0x6ABE6A6ADF1D77DF), SPH_C64(0xE431E4E453977353), - SPH_C64(0xE338E3E348A84B48), SPH_C64(0xA3F8A3A315D27115), - SPH_C64(0x56FA565613DC8A13), SPH_C64(0x819E8181BFFD7CBF), - SPH_C64(0x7D877D7D94B2CF94), SPH_C64(0xF10EF1F1122ADB12), - SPH_C64(0x85928585ABD95CAB), SPH_C64(0x9EBF9E9EDC1A84DC), - SPH_C64(0x2C742C2C9C517D9C), SPH_C64(0x8E8F8E8E8C8A048C), - SPH_C64(0x78887878859FE785), SPH_C64(0xCA43CACAC5D41EC5), - SPH_C64(0x173917174BAFB84B), SPH_C64(0xA9E6A9A937882137), - SPH_C64(0x61A36161F84E2FF8), SPH_C64(0xD562D5D5A633E6A6), 
- SPH_C64(0x5DE75D5D348FD234), SPH_C64(0x0B1D0B0B27535827), - SPH_C64(0x8C898C8C86981486), SPH_C64(0x3C443C3CCCC1FDCC), - SPH_C64(0x77997777B6E89FB6), SPH_C64(0x51F3515108E3B208), - SPH_C64(0x22662222AA2F0DAA), SPH_C64(0x42C6424257682A57), - SPH_C64(0x3F413F3FC3DAE5C3), SPH_C64(0x54FC545419CE9A19), - SPH_C64(0x41C3414158733258), SPH_C64(0x809D8080BAF474BA), - SPH_C64(0xCC49CCCCDBE22EDB), SPH_C64(0x86978686A4C244A4), - SPH_C64(0xB3C8B3B34542F145), SPH_C64(0x1828181878D8C078), - SPH_C64(0x2E722E2E96436D96), SPH_C64(0x57F9575716D58216), - SPH_C64(0x060A06061E36301E), SPH_C64(0x62A66262F75537F7), - SPH_C64(0xF401F4F40307F303), SPH_C64(0x365A3636EE9BADEE), - SPH_C64(0xD16ED1D1B217C6B2), SPH_C64(0x6BBD6B6BDA147FDA), - SPH_C64(0x1B2D1B1B77C3D877), SPH_C64(0x65AF6565EC6A0FEC), - SPH_C64(0x759F7575BCFA8FBC), SPH_C64(0x1030101050908050), - SPH_C64(0xDA73DADA95449E95), SPH_C64(0x49DB4949703B7270), - SPH_C64(0x266A2626BE0B2DBE), SPH_C64(0xF916F9F93A629B3A), - SPH_C64(0xCB40CBCBC0DD16C0), SPH_C64(0x66AA6666E37117E3), - SPH_C64(0xE734E7E75C8C6B5C), SPH_C64(0xBAD3BABA6803B968), - SPH_C64(0xAEEFAEAE2CB7192C), SPH_C64(0x50F050500DEABA0D), - SPH_C64(0x52F6525207F8AA07), SPH_C64(0xABE0ABAB3D9A313D), - SPH_C64(0x050F0505112D2811), SPH_C64(0xF00DF0F01723D317), - SPH_C64(0x0D170D0D39656839), SPH_C64(0x73957373A2CCBFA2), - SPH_C64(0x3B4D3B3BD7FEC5D7), SPH_C64(0x040C040414242014), - SPH_C64(0x20602020A03D1DA0), SPH_C64(0xFE1FFEFE215DA321), - SPH_C64(0xDD7ADDDD8E7BA68E), SPH_C64(0xF502F5F5060EFB06), - SPH_C64(0xB4C1B4B45E7DC95E), SPH_C64(0x5FE15F5F3E9DC23E), - SPH_C64(0x0A1E0A0A225A5022), SPH_C64(0xB5C2B5B55B74C15B), - SPH_C64(0xC05DC0C0E78E4EE7), SPH_C64(0xA0FDA0A01AC9691A), - SPH_C64(0x71937171A8DEAFA8), SPH_C64(0xA5F2A5A50BE4410B), - SPH_C64(0x2D772D2D99587599), SPH_C64(0x60A06060FD4727FD), - SPH_C64(0x72967272A7C5B7A7), SPH_C64(0x93A89393E57FECE5), - SPH_C64(0x394B3939DDECD5DD), SPH_C64(0x0818080828484028), - SPH_C64(0x83988383B5EF6CB5), SPH_C64(0x21632121A53415A5), - 
SPH_C64(0x5CE45C5C3186DA31), SPH_C64(0x87948787A1CB4CA1), - SPH_C64(0xB1CEB1B14F50E14F), SPH_C64(0xE03DE0E047B35347), - SPH_C64(0x0000000000000000), SPH_C64(0xC358C3C3E89556E8), - SPH_C64(0x123612125A82905A), SPH_C64(0x91AE9191EF6DFCEF), - SPH_C64(0x8A838A8A98AE2498), SPH_C64(0x020602020A12100A), - SPH_C64(0x1C241C1C6CFCE06C), SPH_C64(0xE637E6E659856359), - SPH_C64(0x45CF45454C57124C), SPH_C64(0xC25BC2C2ED9C5EED), - SPH_C64(0xC451C4C4F3AA6EF3), SPH_C64(0xFD1AFDFD2E46BB2E), - SPH_C64(0xBFDCBFBF792E9179), SPH_C64(0x44CC4444495E1A49), - SPH_C64(0xA1FEA1A11FC0611F), SPH_C64(0x4CD44C4C61165A61), - SPH_C64(0x33553333FFB685FF), SPH_C64(0xC552C5C5F6A366F6), - SPH_C64(0x84918484AED054AE), SPH_C64(0x23652323AF2605AF), - SPH_C64(0x7C847C7C91BBC791), SPH_C64(0xB0CDB0B04A59E94A), - SPH_C64(0x256F2525B11035B1), SPH_C64(0x153F151541BDA841), - SPH_C64(0x355F3535E180B5E1), SPH_C64(0x69BB6969D0066FD0), - SPH_C64(0xFF1CFFFF2454AB24), SPH_C64(0x94A19494FE40D4FE), - SPH_C64(0x4DD74D4D641F5264), SPH_C64(0x70907070ADD7A7AD), - SPH_C64(0xA2FBA2A210DB7910), SPH_C64(0xAFECAFAF29BE1129), - SPH_C64(0xCD4ACDCDDEEB26DE), SPH_C64(0xD667D6D6A928FEA9), - SPH_C64(0x6CB46C6CC12B47C1), SPH_C64(0xB7C4B7B75166D151), - SPH_C64(0xF815F8F83F6B933F), SPH_C64(0x091B09092D41482D), - SPH_C64(0xF308F3F31838CB18), SPH_C64(0x67A96767E6781FE6), - SPH_C64(0xA4F1A4A40EED490E), SPH_C64(0xEA23EAEA65E90365), - SPH_C64(0xEC29ECEC7BDF337B), SPH_C64(0xB6C7B6B6546FD954), - SPH_C64(0xD461D4D4A33AEEA3), SPH_C64(0xD26BD2D2BD0CDEBD), - SPH_C64(0x143C141444B4A044), SPH_C64(0x1E221E1E66EEF066), - SPH_C64(0xE13EE1E142BA5B42), SPH_C64(0x246C2424B4193DB4), - SPH_C64(0x38483838D8E5DDD8), SPH_C64(0xC657C6C6F9B87EF9), - SPH_C64(0xDB70DBDB904D9690), SPH_C64(0x4BDD4B4B7A29627A), - SPH_C64(0x7A8E7A7A8F8DF78F), SPH_C64(0x3A4E3A3AD2F7CDD2), - SPH_C64(0xDE7FDEDE8160BE81), SPH_C64(0x5EE25E5E3B94CA3B), - SPH_C64(0xDF7CDFDF8469B684), SPH_C64(0x95A29595FB49DCFB), - SPH_C64(0xFC19FCFC2B4FB32B), SPH_C64(0xAAE3AAAA38933938), - 
SPH_C64(0xD764D7D7AC21F6AC), SPH_C64(0xCE4FCECED1F03ED1), - SPH_C64(0x070907071B3F381B), SPH_C64(0x0F110F0F33777833), - SPH_C64(0x3D473D3DC9C8F5C9), SPH_C64(0x58E8585825A2FA25), - SPH_C64(0x9AB39A9AC83EA4C8), SPH_C64(0x98B59898C22CB4C2), - SPH_C64(0x9CB99C9CD60894D6), SPH_C64(0xF20BF2F21D31C31D), - SPH_C64(0xA7F4A7A701F65101), SPH_C64(0x1133111155998855), - SPH_C64(0x7E827E7E9BA9D79B), SPH_C64(0x8B808B8B9DA72C9D), - SPH_C64(0x43C5434352612252), SPH_C64(0x030503030F1B180F), - SPH_C64(0xE23BE2E24DA1434D), SPH_C64(0xDC79DCDC8B72AE8B), - SPH_C64(0xE532E5E5569E7B56), SPH_C64(0xB2CBB2B2404BF940), - SPH_C64(0x4ED24E4E6B044A6B), SPH_C64(0xC754C7C7FCB176FC), - SPH_C64(0x6DB76D6DC4224FC4), SPH_C64(0xE926E9E96AF21B6A), - SPH_C64(0x27692727BB0225BB), SPH_C64(0x40C040405D7A3A5D), - SPH_C64(0xD875D8D89F568E9F), SPH_C64(0x37593737EB92A5EB), - SPH_C64(0x92AB9292E076E4E0), SPH_C64(0x8F8C8F8F89830C89), - SPH_C64(0x0103010105090805), SPH_C64(0x1D271D1D69F5E869), - SPH_C64(0x53F5535302F1A202), SPH_C64(0x3E423E3EC6D3EDC6), - SPH_C64(0x59EB595920ABF220), SPH_C64(0xC15EC1C1E28746E2), - SPH_C64(0x4FD14F4F6E0D426E), SPH_C64(0x32563232FABF8DFA), - SPH_C64(0x163A16164EA6B04E), SPH_C64(0xFA13FAFA35798335), - SPH_C64(0x749C7474B9F387B9), SPH_C64(0xFB10FBFB30708B30), - SPH_C64(0x63A56363F25C3FF2), SPH_C64(0x9FBC9F9FD9138CD9), - SPH_C64(0x345C3434E489BDE4), SPH_C64(0x1A2E1A1A72CAD072), - SPH_C64(0x2A7E2A2A82674D82), SPH_C64(0x5AEE5A5A2FB0EA2F), - SPH_C64(0x8D8A8D8D83911C83), SPH_C64(0xC946C9C9CACF06CA), - SPH_C64(0xCF4CCFCFD4F936D4), SPH_C64(0xF607F6F60915E309), - SPH_C64(0x90AD9090EA64F4EA), SPH_C64(0x2878282888755D88), - SPH_C64(0x8885888892BC3492), SPH_C64(0x9BB09B9BCD37ACCD), - SPH_C64(0x31533131F5A495F5), SPH_C64(0x0E120E0E367E7036), - SPH_C64(0xBDDABDBD733C8173), SPH_C64(0x4ADE4A4A7F206A7F), - SPH_C64(0xE825E8E86FFB136F), SPH_C64(0x96A79696F452C4F4), - SPH_C64(0xA6F7A6A604FF5904), SPH_C64(0x0C140C0C3C6C603C), - SPH_C64(0xC845C8C8CFC60ECF), SPH_C64(0x798B79798096EF80), - 
SPH_C64(0xBCD9BCBC76358976), SPH_C64(0xBEDFBEBE7C27997C), - SPH_C64(0xEF2CEFEF74C42B74), SPH_C64(0x6EB26E6ECB3957CB), - SPH_C64(0x46CA4646434C0A43), SPH_C64(0x97A49797F15BCCF1), - SPH_C64(0x5BED5B5B2AB9E22A), SPH_C64(0xED2AEDED7ED63B7E), - SPH_C64(0x192B19197DD1C87D), SPH_C64(0xD976D9D99A5F869A), - SPH_C64(0xACE9ACAC26A50926), SPH_C64(0x99B69999C725BCC7), - SPH_C64(0xA8E5A8A832812932), SPH_C64(0x297B29298D7C558D), - SPH_C64(0x64AC6464E96307E9), SPH_C64(0x1F211F1F63E7F863), - SPH_C64(0xADEAADAD23AC0123), SPH_C64(0x55FF55551CC7921C), - SPH_C64(0x133513135F8B985F), SPH_C64(0xBBD0BBBB6D0AB16D), - SPH_C64(0xF704F7F70C1CEB0C), SPH_C64(0x6FB16F6FCE305FCE), - SPH_C64(0xB9D6B9B96718A167), SPH_C64(0x47C9474746450246), - SPH_C64(0x2F712F2F934A6593), SPH_C64(0xEE2FEEEE71CD2371), - SPH_C64(0xB8D5B8B86211A962), SPH_C64(0x7B8D7B7B8A84FF8A), - SPH_C64(0x8986898997B53C97), SPH_C64(0x30503030F0AD9DF0), - SPH_C64(0xD368D3D3B805D6B8), SPH_C64(0x7F817F7F9EA0DF9E), - SPH_C64(0x769A7676B3E197B3), SPH_C64(0x829B8282B0E664B0) -}; - -static const long long int old0_T5[256] = { - SPH_C64(0xB86868D50F67D568), SPH_C64(0x6DD0D0B71ECEB7D0), - SPH_C64(0x20EBEB60E00B60EB), SPH_C64(0x7D2B2B876E45872B), - SPH_C64(0xD8484875327A7548), SPH_C64(0xBA9D9DD3019CD39D), - SPH_C64(0xBE6A6ADF1D77DF6A), SPH_C64(0x31E4E453977353E4), - SPH_C64(0x38E3E348A84B48E3), SPH_C64(0xF8A3A315D27115A3), - SPH_C64(0xFA565613DC8A1356), SPH_C64(0x9E8181BFFD7CBF81), - SPH_C64(0x877D7D94B2CF947D), SPH_C64(0x0EF1F1122ADB12F1), - SPH_C64(0x928585ABD95CAB85), SPH_C64(0xBF9E9EDC1A84DC9E), - SPH_C64(0x742C2C9C517D9C2C), SPH_C64(0x8F8E8E8C8A048C8E), - SPH_C64(0x887878859FE78578), SPH_C64(0x43CACAC5D41EC5CA), - SPH_C64(0x3917174BAFB84B17), SPH_C64(0xE6A9A937882137A9), - SPH_C64(0xA36161F84E2FF861), SPH_C64(0x62D5D5A633E6A6D5), - SPH_C64(0xE75D5D348FD2345D), SPH_C64(0x1D0B0B275358270B), - SPH_C64(0x898C8C869814868C), SPH_C64(0x443C3CCCC1FDCC3C), - SPH_C64(0x997777B6E89FB677), SPH_C64(0xF3515108E3B20851), - SPH_C64(0x662222AA2F0DAA22), 
SPH_C64(0xC6424257682A5742), - SPH_C64(0x413F3FC3DAE5C33F), SPH_C64(0xFC545419CE9A1954), - SPH_C64(0xC341415873325841), SPH_C64(0x9D8080BAF474BA80), - SPH_C64(0x49CCCCDBE22EDBCC), SPH_C64(0x978686A4C244A486), - SPH_C64(0xC8B3B34542F145B3), SPH_C64(0x28181878D8C07818), - SPH_C64(0x722E2E96436D962E), SPH_C64(0xF9575716D5821657), - SPH_C64(0x0A06061E36301E06), SPH_C64(0xA66262F75537F762), - SPH_C64(0x01F4F40307F303F4), SPH_C64(0x5A3636EE9BADEE36), - SPH_C64(0x6ED1D1B217C6B2D1), SPH_C64(0xBD6B6BDA147FDA6B), - SPH_C64(0x2D1B1B77C3D8771B), SPH_C64(0xAF6565EC6A0FEC65), - SPH_C64(0x9F7575BCFA8FBC75), SPH_C64(0x3010105090805010), - SPH_C64(0x73DADA95449E95DA), SPH_C64(0xDB4949703B727049), - SPH_C64(0x6A2626BE0B2DBE26), SPH_C64(0x16F9F93A629B3AF9), - SPH_C64(0x40CBCBC0DD16C0CB), SPH_C64(0xAA6666E37117E366), - SPH_C64(0x34E7E75C8C6B5CE7), SPH_C64(0xD3BABA6803B968BA), - SPH_C64(0xEFAEAE2CB7192CAE), SPH_C64(0xF050500DEABA0D50), - SPH_C64(0xF6525207F8AA0752), SPH_C64(0xE0ABAB3D9A313DAB), - SPH_C64(0x0F0505112D281105), SPH_C64(0x0DF0F01723D317F0), - SPH_C64(0x170D0D396568390D), SPH_C64(0x957373A2CCBFA273), - SPH_C64(0x4D3B3BD7FEC5D73B), SPH_C64(0x0C04041424201404), - SPH_C64(0x602020A03D1DA020), SPH_C64(0x1FFEFE215DA321FE), - SPH_C64(0x7ADDDD8E7BA68EDD), SPH_C64(0x02F5F5060EFB06F5), - SPH_C64(0xC1B4B45E7DC95EB4), SPH_C64(0xE15F5F3E9DC23E5F), - SPH_C64(0x1E0A0A225A50220A), SPH_C64(0xC2B5B55B74C15BB5), - SPH_C64(0x5DC0C0E78E4EE7C0), SPH_C64(0xFDA0A01AC9691AA0), - SPH_C64(0x937171A8DEAFA871), SPH_C64(0xF2A5A50BE4410BA5), - SPH_C64(0x772D2D995875992D), SPH_C64(0xA06060FD4727FD60), - SPH_C64(0x967272A7C5B7A772), SPH_C64(0xA89393E57FECE593), - SPH_C64(0x4B3939DDECD5DD39), SPH_C64(0x1808082848402808), - SPH_C64(0x988383B5EF6CB583), SPH_C64(0x632121A53415A521), - SPH_C64(0xE45C5C3186DA315C), SPH_C64(0x948787A1CB4CA187), - SPH_C64(0xCEB1B14F50E14FB1), SPH_C64(0x3DE0E047B35347E0), - SPH_C64(0x0000000000000000), SPH_C64(0x58C3C3E89556E8C3), - SPH_C64(0x3612125A82905A12), 
SPH_C64(0xAE9191EF6DFCEF91), - SPH_C64(0x838A8A98AE24988A), SPH_C64(0x0602020A12100A02), - SPH_C64(0x241C1C6CFCE06C1C), SPH_C64(0x37E6E659856359E6), - SPH_C64(0xCF45454C57124C45), SPH_C64(0x5BC2C2ED9C5EEDC2), - SPH_C64(0x51C4C4F3AA6EF3C4), SPH_C64(0x1AFDFD2E46BB2EFD), - SPH_C64(0xDCBFBF792E9179BF), SPH_C64(0xCC4444495E1A4944), - SPH_C64(0xFEA1A11FC0611FA1), SPH_C64(0xD44C4C61165A614C), - SPH_C64(0x553333FFB685FF33), SPH_C64(0x52C5C5F6A366F6C5), - SPH_C64(0x918484AED054AE84), SPH_C64(0x652323AF2605AF23), - SPH_C64(0x847C7C91BBC7917C), SPH_C64(0xCDB0B04A59E94AB0), - SPH_C64(0x6F2525B11035B125), SPH_C64(0x3F151541BDA84115), - SPH_C64(0x5F3535E180B5E135), SPH_C64(0xBB6969D0066FD069), - SPH_C64(0x1CFFFF2454AB24FF), SPH_C64(0xA19494FE40D4FE94), - SPH_C64(0xD74D4D641F52644D), SPH_C64(0x907070ADD7A7AD70), - SPH_C64(0xFBA2A210DB7910A2), SPH_C64(0xECAFAF29BE1129AF), - SPH_C64(0x4ACDCDDEEB26DECD), SPH_C64(0x67D6D6A928FEA9D6), - SPH_C64(0xB46C6CC12B47C16C), SPH_C64(0xC4B7B75166D151B7), - SPH_C64(0x15F8F83F6B933FF8), SPH_C64(0x1B09092D41482D09), - SPH_C64(0x08F3F31838CB18F3), SPH_C64(0xA96767E6781FE667), - SPH_C64(0xF1A4A40EED490EA4), SPH_C64(0x23EAEA65E90365EA), - SPH_C64(0x29ECEC7BDF337BEC), SPH_C64(0xC7B6B6546FD954B6), - SPH_C64(0x61D4D4A33AEEA3D4), SPH_C64(0x6BD2D2BD0CDEBDD2), - SPH_C64(0x3C141444B4A04414), SPH_C64(0x221E1E66EEF0661E), - SPH_C64(0x3EE1E142BA5B42E1), SPH_C64(0x6C2424B4193DB424), - SPH_C64(0x483838D8E5DDD838), SPH_C64(0x57C6C6F9B87EF9C6), - SPH_C64(0x70DBDB904D9690DB), SPH_C64(0xDD4B4B7A29627A4B), - SPH_C64(0x8E7A7A8F8DF78F7A), SPH_C64(0x4E3A3AD2F7CDD23A), - SPH_C64(0x7FDEDE8160BE81DE), SPH_C64(0xE25E5E3B94CA3B5E), - SPH_C64(0x7CDFDF8469B684DF), SPH_C64(0xA29595FB49DCFB95), - SPH_C64(0x19FCFC2B4FB32BFC), SPH_C64(0xE3AAAA38933938AA), - SPH_C64(0x64D7D7AC21F6ACD7), SPH_C64(0x4FCECED1F03ED1CE), - SPH_C64(0x0907071B3F381B07), SPH_C64(0x110F0F337778330F), - SPH_C64(0x473D3DC9C8F5C93D), SPH_C64(0xE8585825A2FA2558), - SPH_C64(0xB39A9AC83EA4C89A), 
SPH_C64(0xB59898C22CB4C298), - SPH_C64(0xB99C9CD60894D69C), SPH_C64(0x0BF2F21D31C31DF2), - SPH_C64(0xF4A7A701F65101A7), SPH_C64(0x3311115599885511), - SPH_C64(0x827E7E9BA9D79B7E), SPH_C64(0x808B8B9DA72C9D8B), - SPH_C64(0xC543435261225243), SPH_C64(0x0503030F1B180F03), - SPH_C64(0x3BE2E24DA1434DE2), SPH_C64(0x79DCDC8B72AE8BDC), - SPH_C64(0x32E5E5569E7B56E5), SPH_C64(0xCBB2B2404BF940B2), - SPH_C64(0xD24E4E6B044A6B4E), SPH_C64(0x54C7C7FCB176FCC7), - SPH_C64(0xB76D6DC4224FC46D), SPH_C64(0x26E9E96AF21B6AE9), - SPH_C64(0x692727BB0225BB27), SPH_C64(0xC040405D7A3A5D40), - SPH_C64(0x75D8D89F568E9FD8), SPH_C64(0x593737EB92A5EB37), - SPH_C64(0xAB9292E076E4E092), SPH_C64(0x8C8F8F89830C898F), - SPH_C64(0x0301010509080501), SPH_C64(0x271D1D69F5E8691D), - SPH_C64(0xF5535302F1A20253), SPH_C64(0x423E3EC6D3EDC63E), - SPH_C64(0xEB595920ABF22059), SPH_C64(0x5EC1C1E28746E2C1), - SPH_C64(0xD14F4F6E0D426E4F), SPH_C64(0x563232FABF8DFA32), - SPH_C64(0x3A16164EA6B04E16), SPH_C64(0x13FAFA35798335FA), - SPH_C64(0x9C7474B9F387B974), SPH_C64(0x10FBFB30708B30FB), - SPH_C64(0xA56363F25C3FF263), SPH_C64(0xBC9F9FD9138CD99F), - SPH_C64(0x5C3434E489BDE434), SPH_C64(0x2E1A1A72CAD0721A), - SPH_C64(0x7E2A2A82674D822A), SPH_C64(0xEE5A5A2FB0EA2F5A), - SPH_C64(0x8A8D8D83911C838D), SPH_C64(0x46C9C9CACF06CAC9), - SPH_C64(0x4CCFCFD4F936D4CF), SPH_C64(0x07F6F60915E309F6), - SPH_C64(0xAD9090EA64F4EA90), SPH_C64(0x78282888755D8828), - SPH_C64(0x85888892BC349288), SPH_C64(0xB09B9BCD37ACCD9B), - SPH_C64(0x533131F5A495F531), SPH_C64(0x120E0E367E70360E), - SPH_C64(0xDABDBD733C8173BD), SPH_C64(0xDE4A4A7F206A7F4A), - SPH_C64(0x25E8E86FFB136FE8), SPH_C64(0xA79696F452C4F496), - SPH_C64(0xF7A6A604FF5904A6), SPH_C64(0x140C0C3C6C603C0C), - SPH_C64(0x45C8C8CFC60ECFC8), SPH_C64(0x8B79798096EF8079), - SPH_C64(0xD9BCBC76358976BC), SPH_C64(0xDFBEBE7C27997CBE), - SPH_C64(0x2CEFEF74C42B74EF), SPH_C64(0xB26E6ECB3957CB6E), - SPH_C64(0xCA4646434C0A4346), SPH_C64(0xA49797F15BCCF197), - SPH_C64(0xED5B5B2AB9E22A5B), 
SPH_C64(0x2AEDED7ED63B7EED), - SPH_C64(0x2B19197DD1C87D19), SPH_C64(0x76D9D99A5F869AD9), - SPH_C64(0xE9ACAC26A50926AC), SPH_C64(0xB69999C725BCC799), - SPH_C64(0xE5A8A832812932A8), SPH_C64(0x7B29298D7C558D29), - SPH_C64(0xAC6464E96307E964), SPH_C64(0x211F1F63E7F8631F), - SPH_C64(0xEAADAD23AC0123AD), SPH_C64(0xFF55551CC7921C55), - SPH_C64(0x3513135F8B985F13), SPH_C64(0xD0BBBB6D0AB16DBB), - SPH_C64(0x04F7F70C1CEB0CF7), SPH_C64(0xB16F6FCE305FCE6F), - SPH_C64(0xD6B9B96718A167B9), SPH_C64(0xC947474645024647), - SPH_C64(0x712F2F934A65932F), SPH_C64(0x2FEEEE71CD2371EE), - SPH_C64(0xD5B8B86211A962B8), SPH_C64(0x8D7B7B8A84FF8A7B), - SPH_C64(0x86898997B53C9789), SPH_C64(0x503030F0AD9DF030), - SPH_C64(0x68D3D3B805D6B8D3), SPH_C64(0x817F7F9EA0DF9E7F), - SPH_C64(0x9A7676B3E197B376), SPH_C64(0x9B8282B0E664B082) -}; - -static const long long int old0_T6[256] = { - SPH_C64(0x6868D50F67D568B8), SPH_C64(0xD0D0B71ECEB7D06D), - SPH_C64(0xEBEB60E00B60EB20), SPH_C64(0x2B2B876E45872B7D), - SPH_C64(0x484875327A7548D8), SPH_C64(0x9D9DD3019CD39DBA), - SPH_C64(0x6A6ADF1D77DF6ABE), SPH_C64(0xE4E453977353E431), - SPH_C64(0xE3E348A84B48E338), SPH_C64(0xA3A315D27115A3F8), - SPH_C64(0x565613DC8A1356FA), SPH_C64(0x8181BFFD7CBF819E), - SPH_C64(0x7D7D94B2CF947D87), SPH_C64(0xF1F1122ADB12F10E), - SPH_C64(0x8585ABD95CAB8592), SPH_C64(0x9E9EDC1A84DC9EBF), - SPH_C64(0x2C2C9C517D9C2C74), SPH_C64(0x8E8E8C8A048C8E8F), - SPH_C64(0x7878859FE7857888), SPH_C64(0xCACAC5D41EC5CA43), - SPH_C64(0x17174BAFB84B1739), SPH_C64(0xA9A937882137A9E6), - SPH_C64(0x6161F84E2FF861A3), SPH_C64(0xD5D5A633E6A6D562), - SPH_C64(0x5D5D348FD2345DE7), SPH_C64(0x0B0B275358270B1D), - SPH_C64(0x8C8C869814868C89), SPH_C64(0x3C3CCCC1FDCC3C44), - SPH_C64(0x7777B6E89FB67799), SPH_C64(0x515108E3B20851F3), - SPH_C64(0x2222AA2F0DAA2266), SPH_C64(0x424257682A5742C6), - SPH_C64(0x3F3FC3DAE5C33F41), SPH_C64(0x545419CE9A1954FC), - SPH_C64(0x41415873325841C3), SPH_C64(0x8080BAF474BA809D), - SPH_C64(0xCCCCDBE22EDBCC49), SPH_C64(0x8686A4C244A48697), 
- SPH_C64(0xB3B34542F145B3C8), SPH_C64(0x181878D8C0781828), - SPH_C64(0x2E2E96436D962E72), SPH_C64(0x575716D5821657F9), - SPH_C64(0x06061E36301E060A), SPH_C64(0x6262F75537F762A6), - SPH_C64(0xF4F40307F303F401), SPH_C64(0x3636EE9BADEE365A), - SPH_C64(0xD1D1B217C6B2D16E), SPH_C64(0x6B6BDA147FDA6BBD), - SPH_C64(0x1B1B77C3D8771B2D), SPH_C64(0x6565EC6A0FEC65AF), - SPH_C64(0x7575BCFA8FBC759F), SPH_C64(0x1010509080501030), - SPH_C64(0xDADA95449E95DA73), SPH_C64(0x4949703B727049DB), - SPH_C64(0x2626BE0B2DBE266A), SPH_C64(0xF9F93A629B3AF916), - SPH_C64(0xCBCBC0DD16C0CB40), SPH_C64(0x6666E37117E366AA), - SPH_C64(0xE7E75C8C6B5CE734), SPH_C64(0xBABA6803B968BAD3), - SPH_C64(0xAEAE2CB7192CAEEF), SPH_C64(0x50500DEABA0D50F0), - SPH_C64(0x525207F8AA0752F6), SPH_C64(0xABAB3D9A313DABE0), - SPH_C64(0x0505112D2811050F), SPH_C64(0xF0F01723D317F00D), - SPH_C64(0x0D0D396568390D17), SPH_C64(0x7373A2CCBFA27395), - SPH_C64(0x3B3BD7FEC5D73B4D), SPH_C64(0x040414242014040C), - SPH_C64(0x2020A03D1DA02060), SPH_C64(0xFEFE215DA321FE1F), - SPH_C64(0xDDDD8E7BA68EDD7A), SPH_C64(0xF5F5060EFB06F502), - SPH_C64(0xB4B45E7DC95EB4C1), SPH_C64(0x5F5F3E9DC23E5FE1), - SPH_C64(0x0A0A225A50220A1E), SPH_C64(0xB5B55B74C15BB5C2), - SPH_C64(0xC0C0E78E4EE7C05D), SPH_C64(0xA0A01AC9691AA0FD), - SPH_C64(0x7171A8DEAFA87193), SPH_C64(0xA5A50BE4410BA5F2), - SPH_C64(0x2D2D995875992D77), SPH_C64(0x6060FD4727FD60A0), - SPH_C64(0x7272A7C5B7A77296), SPH_C64(0x9393E57FECE593A8), - SPH_C64(0x3939DDECD5DD394B), SPH_C64(0x0808284840280818), - SPH_C64(0x8383B5EF6CB58398), SPH_C64(0x2121A53415A52163), - SPH_C64(0x5C5C3186DA315CE4), SPH_C64(0x8787A1CB4CA18794), - SPH_C64(0xB1B14F50E14FB1CE), SPH_C64(0xE0E047B35347E03D), - SPH_C64(0x0000000000000000), SPH_C64(0xC3C3E89556E8C358), - SPH_C64(0x12125A82905A1236), SPH_C64(0x9191EF6DFCEF91AE), - SPH_C64(0x8A8A98AE24988A83), SPH_C64(0x02020A12100A0206), - SPH_C64(0x1C1C6CFCE06C1C24), SPH_C64(0xE6E659856359E637), - SPH_C64(0x45454C57124C45CF), SPH_C64(0xC2C2ED9C5EEDC25B), - 
SPH_C64(0xC4C4F3AA6EF3C451), SPH_C64(0xFDFD2E46BB2EFD1A), - SPH_C64(0xBFBF792E9179BFDC), SPH_C64(0x4444495E1A4944CC), - SPH_C64(0xA1A11FC0611FA1FE), SPH_C64(0x4C4C61165A614CD4), - SPH_C64(0x3333FFB685FF3355), SPH_C64(0xC5C5F6A366F6C552), - SPH_C64(0x8484AED054AE8491), SPH_C64(0x2323AF2605AF2365), - SPH_C64(0x7C7C91BBC7917C84), SPH_C64(0xB0B04A59E94AB0CD), - SPH_C64(0x2525B11035B1256F), SPH_C64(0x151541BDA841153F), - SPH_C64(0x3535E180B5E1355F), SPH_C64(0x6969D0066FD069BB), - SPH_C64(0xFFFF2454AB24FF1C), SPH_C64(0x9494FE40D4FE94A1), - SPH_C64(0x4D4D641F52644DD7), SPH_C64(0x7070ADD7A7AD7090), - SPH_C64(0xA2A210DB7910A2FB), SPH_C64(0xAFAF29BE1129AFEC), - SPH_C64(0xCDCDDEEB26DECD4A), SPH_C64(0xD6D6A928FEA9D667), - SPH_C64(0x6C6CC12B47C16CB4), SPH_C64(0xB7B75166D151B7C4), - SPH_C64(0xF8F83F6B933FF815), SPH_C64(0x09092D41482D091B), - SPH_C64(0xF3F31838CB18F308), SPH_C64(0x6767E6781FE667A9), - SPH_C64(0xA4A40EED490EA4F1), SPH_C64(0xEAEA65E90365EA23), - SPH_C64(0xECEC7BDF337BEC29), SPH_C64(0xB6B6546FD954B6C7), - SPH_C64(0xD4D4A33AEEA3D461), SPH_C64(0xD2D2BD0CDEBDD26B), - SPH_C64(0x141444B4A044143C), SPH_C64(0x1E1E66EEF0661E22), - SPH_C64(0xE1E142BA5B42E13E), SPH_C64(0x2424B4193DB4246C), - SPH_C64(0x3838D8E5DDD83848), SPH_C64(0xC6C6F9B87EF9C657), - SPH_C64(0xDBDB904D9690DB70), SPH_C64(0x4B4B7A29627A4BDD), - SPH_C64(0x7A7A8F8DF78F7A8E), SPH_C64(0x3A3AD2F7CDD23A4E), - SPH_C64(0xDEDE8160BE81DE7F), SPH_C64(0x5E5E3B94CA3B5EE2), - SPH_C64(0xDFDF8469B684DF7C), SPH_C64(0x9595FB49DCFB95A2), - SPH_C64(0xFCFC2B4FB32BFC19), SPH_C64(0xAAAA38933938AAE3), - SPH_C64(0xD7D7AC21F6ACD764), SPH_C64(0xCECED1F03ED1CE4F), - SPH_C64(0x07071B3F381B0709), SPH_C64(0x0F0F337778330F11), - SPH_C64(0x3D3DC9C8F5C93D47), SPH_C64(0x585825A2FA2558E8), - SPH_C64(0x9A9AC83EA4C89AB3), SPH_C64(0x9898C22CB4C298B5), - SPH_C64(0x9C9CD60894D69CB9), SPH_C64(0xF2F21D31C31DF20B), - SPH_C64(0xA7A701F65101A7F4), SPH_C64(0x1111559988551133), - SPH_C64(0x7E7E9BA9D79B7E82), SPH_C64(0x8B8B9DA72C9D8B80), - 
SPH_C64(0x43435261225243C5), SPH_C64(0x03030F1B180F0305), - SPH_C64(0xE2E24DA1434DE23B), SPH_C64(0xDCDC8B72AE8BDC79), - SPH_C64(0xE5E5569E7B56E532), SPH_C64(0xB2B2404BF940B2CB), - SPH_C64(0x4E4E6B044A6B4ED2), SPH_C64(0xC7C7FCB176FCC754), - SPH_C64(0x6D6DC4224FC46DB7), SPH_C64(0xE9E96AF21B6AE926), - SPH_C64(0x2727BB0225BB2769), SPH_C64(0x40405D7A3A5D40C0), - SPH_C64(0xD8D89F568E9FD875), SPH_C64(0x3737EB92A5EB3759), - SPH_C64(0x9292E076E4E092AB), SPH_C64(0x8F8F89830C898F8C), - SPH_C64(0x0101050908050103), SPH_C64(0x1D1D69F5E8691D27), - SPH_C64(0x535302F1A20253F5), SPH_C64(0x3E3EC6D3EDC63E42), - SPH_C64(0x595920ABF22059EB), SPH_C64(0xC1C1E28746E2C15E), - SPH_C64(0x4F4F6E0D426E4FD1), SPH_C64(0x3232FABF8DFA3256), - SPH_C64(0x16164EA6B04E163A), SPH_C64(0xFAFA35798335FA13), - SPH_C64(0x7474B9F387B9749C), SPH_C64(0xFBFB30708B30FB10), - SPH_C64(0x6363F25C3FF263A5), SPH_C64(0x9F9FD9138CD99FBC), - SPH_C64(0x3434E489BDE4345C), SPH_C64(0x1A1A72CAD0721A2E), - SPH_C64(0x2A2A82674D822A7E), SPH_C64(0x5A5A2FB0EA2F5AEE), - SPH_C64(0x8D8D83911C838D8A), SPH_C64(0xC9C9CACF06CAC946), - SPH_C64(0xCFCFD4F936D4CF4C), SPH_C64(0xF6F60915E309F607), - SPH_C64(0x9090EA64F4EA90AD), SPH_C64(0x282888755D882878), - SPH_C64(0x888892BC34928885), SPH_C64(0x9B9BCD37ACCD9BB0), - SPH_C64(0x3131F5A495F53153), SPH_C64(0x0E0E367E70360E12), - SPH_C64(0xBDBD733C8173BDDA), SPH_C64(0x4A4A7F206A7F4ADE), - SPH_C64(0xE8E86FFB136FE825), SPH_C64(0x9696F452C4F496A7), - SPH_C64(0xA6A604FF5904A6F7), SPH_C64(0x0C0C3C6C603C0C14), - SPH_C64(0xC8C8CFC60ECFC845), SPH_C64(0x79798096EF80798B), - SPH_C64(0xBCBC76358976BCD9), SPH_C64(0xBEBE7C27997CBEDF), - SPH_C64(0xEFEF74C42B74EF2C), SPH_C64(0x6E6ECB3957CB6EB2), - SPH_C64(0x4646434C0A4346CA), SPH_C64(0x9797F15BCCF197A4), - SPH_C64(0x5B5B2AB9E22A5BED), SPH_C64(0xEDED7ED63B7EED2A), - SPH_C64(0x19197DD1C87D192B), SPH_C64(0xD9D99A5F869AD976), - SPH_C64(0xACAC26A50926ACE9), SPH_C64(0x9999C725BCC799B6), - SPH_C64(0xA8A832812932A8E5), SPH_C64(0x29298D7C558D297B), - 
SPH_C64(0x6464E96307E964AC), SPH_C64(0x1F1F63E7F8631F21), - SPH_C64(0xADAD23AC0123ADEA), SPH_C64(0x55551CC7921C55FF), - SPH_C64(0x13135F8B985F1335), SPH_C64(0xBBBB6D0AB16DBBD0), - SPH_C64(0xF7F70C1CEB0CF704), SPH_C64(0x6F6FCE305FCE6FB1), - SPH_C64(0xB9B96718A167B9D6), SPH_C64(0x47474645024647C9), - SPH_C64(0x2F2F934A65932F71), SPH_C64(0xEEEE71CD2371EE2F), - SPH_C64(0xB8B86211A962B8D5), SPH_C64(0x7B7B8A84FF8A7B8D), - SPH_C64(0x898997B53C978986), SPH_C64(0x3030F0AD9DF03050), - SPH_C64(0xD3D3B805D6B8D368), SPH_C64(0x7F7F9EA0DF9E7F81), - SPH_C64(0x7676B3E197B3769A), SPH_C64(0x8282B0E664B0829B) -}; - -static const long long int old0_T7[256] = { - SPH_C64(0x68D50F67D568B868), SPH_C64(0xD0B71ECEB7D06DD0), - SPH_C64(0xEB60E00B60EB20EB), SPH_C64(0x2B876E45872B7D2B), - SPH_C64(0x4875327A7548D848), SPH_C64(0x9DD3019CD39DBA9D), - SPH_C64(0x6ADF1D77DF6ABE6A), SPH_C64(0xE453977353E431E4), - SPH_C64(0xE348A84B48E338E3), SPH_C64(0xA315D27115A3F8A3), - SPH_C64(0x5613DC8A1356FA56), SPH_C64(0x81BFFD7CBF819E81), - SPH_C64(0x7D94B2CF947D877D), SPH_C64(0xF1122ADB12F10EF1), - SPH_C64(0x85ABD95CAB859285), SPH_C64(0x9EDC1A84DC9EBF9E), - SPH_C64(0x2C9C517D9C2C742C), SPH_C64(0x8E8C8A048C8E8F8E), - SPH_C64(0x78859FE785788878), SPH_C64(0xCAC5D41EC5CA43CA), - SPH_C64(0x174BAFB84B173917), SPH_C64(0xA937882137A9E6A9), - SPH_C64(0x61F84E2FF861A361), SPH_C64(0xD5A633E6A6D562D5), - SPH_C64(0x5D348FD2345DE75D), SPH_C64(0x0B275358270B1D0B), - SPH_C64(0x8C869814868C898C), SPH_C64(0x3CCCC1FDCC3C443C), - SPH_C64(0x77B6E89FB6779977), SPH_C64(0x5108E3B20851F351), - SPH_C64(0x22AA2F0DAA226622), SPH_C64(0x4257682A5742C642), - SPH_C64(0x3FC3DAE5C33F413F), SPH_C64(0x5419CE9A1954FC54), - SPH_C64(0x415873325841C341), SPH_C64(0x80BAF474BA809D80), - SPH_C64(0xCCDBE22EDBCC49CC), SPH_C64(0x86A4C244A4869786), - SPH_C64(0xB34542F145B3C8B3), SPH_C64(0x1878D8C078182818), - SPH_C64(0x2E96436D962E722E), SPH_C64(0x5716D5821657F957), - SPH_C64(0x061E36301E060A06), SPH_C64(0x62F75537F762A662), - SPH_C64(0xF40307F303F401F4), 
SPH_C64(0x36EE9BADEE365A36), - SPH_C64(0xD1B217C6B2D16ED1), SPH_C64(0x6BDA147FDA6BBD6B), - SPH_C64(0x1B77C3D8771B2D1B), SPH_C64(0x65EC6A0FEC65AF65), - SPH_C64(0x75BCFA8FBC759F75), SPH_C64(0x1050908050103010), - SPH_C64(0xDA95449E95DA73DA), SPH_C64(0x49703B727049DB49), - SPH_C64(0x26BE0B2DBE266A26), SPH_C64(0xF93A629B3AF916F9), - SPH_C64(0xCBC0DD16C0CB40CB), SPH_C64(0x66E37117E366AA66), - SPH_C64(0xE75C8C6B5CE734E7), SPH_C64(0xBA6803B968BAD3BA), - SPH_C64(0xAE2CB7192CAEEFAE), SPH_C64(0x500DEABA0D50F050), - SPH_C64(0x5207F8AA0752F652), SPH_C64(0xAB3D9A313DABE0AB), - SPH_C64(0x05112D2811050F05), SPH_C64(0xF01723D317F00DF0), - SPH_C64(0x0D396568390D170D), SPH_C64(0x73A2CCBFA2739573), - SPH_C64(0x3BD7FEC5D73B4D3B), SPH_C64(0x0414242014040C04), - SPH_C64(0x20A03D1DA0206020), SPH_C64(0xFE215DA321FE1FFE), - SPH_C64(0xDD8E7BA68EDD7ADD), SPH_C64(0xF5060EFB06F502F5), - SPH_C64(0xB45E7DC95EB4C1B4), SPH_C64(0x5F3E9DC23E5FE15F), - SPH_C64(0x0A225A50220A1E0A), SPH_C64(0xB55B74C15BB5C2B5), - SPH_C64(0xC0E78E4EE7C05DC0), SPH_C64(0xA01AC9691AA0FDA0), - SPH_C64(0x71A8DEAFA8719371), SPH_C64(0xA50BE4410BA5F2A5), - SPH_C64(0x2D995875992D772D), SPH_C64(0x60FD4727FD60A060), - SPH_C64(0x72A7C5B7A7729672), SPH_C64(0x93E57FECE593A893), - SPH_C64(0x39DDECD5DD394B39), SPH_C64(0x0828484028081808), - SPH_C64(0x83B5EF6CB5839883), SPH_C64(0x21A53415A5216321), - SPH_C64(0x5C3186DA315CE45C), SPH_C64(0x87A1CB4CA1879487), - SPH_C64(0xB14F50E14FB1CEB1), SPH_C64(0xE047B35347E03DE0), - SPH_C64(0x0000000000000000), SPH_C64(0xC3E89556E8C358C3), - SPH_C64(0x125A82905A123612), SPH_C64(0x91EF6DFCEF91AE91), - SPH_C64(0x8A98AE24988A838A), SPH_C64(0x020A12100A020602), - SPH_C64(0x1C6CFCE06C1C241C), SPH_C64(0xE659856359E637E6), - SPH_C64(0x454C57124C45CF45), SPH_C64(0xC2ED9C5EEDC25BC2), - SPH_C64(0xC4F3AA6EF3C451C4), SPH_C64(0xFD2E46BB2EFD1AFD), - SPH_C64(0xBF792E9179BFDCBF), SPH_C64(0x44495E1A4944CC44), - SPH_C64(0xA11FC0611FA1FEA1), SPH_C64(0x4C61165A614CD44C), - SPH_C64(0x33FFB685FF335533), 
SPH_C64(0xC5F6A366F6C552C5), - SPH_C64(0x84AED054AE849184), SPH_C64(0x23AF2605AF236523), - SPH_C64(0x7C91BBC7917C847C), SPH_C64(0xB04A59E94AB0CDB0), - SPH_C64(0x25B11035B1256F25), SPH_C64(0x1541BDA841153F15), - SPH_C64(0x35E180B5E1355F35), SPH_C64(0x69D0066FD069BB69), - SPH_C64(0xFF2454AB24FF1CFF), SPH_C64(0x94FE40D4FE94A194), - SPH_C64(0x4D641F52644DD74D), SPH_C64(0x70ADD7A7AD709070), - SPH_C64(0xA210DB7910A2FBA2), SPH_C64(0xAF29BE1129AFECAF), - SPH_C64(0xCDDEEB26DECD4ACD), SPH_C64(0xD6A928FEA9D667D6), - SPH_C64(0x6CC12B47C16CB46C), SPH_C64(0xB75166D151B7C4B7), - SPH_C64(0xF83F6B933FF815F8), SPH_C64(0x092D41482D091B09), - SPH_C64(0xF31838CB18F308F3), SPH_C64(0x67E6781FE667A967), - SPH_C64(0xA40EED490EA4F1A4), SPH_C64(0xEA65E90365EA23EA), - SPH_C64(0xEC7BDF337BEC29EC), SPH_C64(0xB6546FD954B6C7B6), - SPH_C64(0xD4A33AEEA3D461D4), SPH_C64(0xD2BD0CDEBDD26BD2), - SPH_C64(0x1444B4A044143C14), SPH_C64(0x1E66EEF0661E221E), - SPH_C64(0xE142BA5B42E13EE1), SPH_C64(0x24B4193DB4246C24), - SPH_C64(0x38D8E5DDD8384838), SPH_C64(0xC6F9B87EF9C657C6), - SPH_C64(0xDB904D9690DB70DB), SPH_C64(0x4B7A29627A4BDD4B), - SPH_C64(0x7A8F8DF78F7A8E7A), SPH_C64(0x3AD2F7CDD23A4E3A), - SPH_C64(0xDE8160BE81DE7FDE), SPH_C64(0x5E3B94CA3B5EE25E), - SPH_C64(0xDF8469B684DF7CDF), SPH_C64(0x95FB49DCFB95A295), - SPH_C64(0xFC2B4FB32BFC19FC), SPH_C64(0xAA38933938AAE3AA), - SPH_C64(0xD7AC21F6ACD764D7), SPH_C64(0xCED1F03ED1CE4FCE), - SPH_C64(0x071B3F381B070907), SPH_C64(0x0F337778330F110F), - SPH_C64(0x3DC9C8F5C93D473D), SPH_C64(0x5825A2FA2558E858), - SPH_C64(0x9AC83EA4C89AB39A), SPH_C64(0x98C22CB4C298B598), - SPH_C64(0x9CD60894D69CB99C), SPH_C64(0xF21D31C31DF20BF2), - SPH_C64(0xA701F65101A7F4A7), SPH_C64(0x1155998855113311), - SPH_C64(0x7E9BA9D79B7E827E), SPH_C64(0x8B9DA72C9D8B808B), - SPH_C64(0x435261225243C543), SPH_C64(0x030F1B180F030503), - SPH_C64(0xE24DA1434DE23BE2), SPH_C64(0xDC8B72AE8BDC79DC), - SPH_C64(0xE5569E7B56E532E5), SPH_C64(0xB2404BF940B2CBB2), - SPH_C64(0x4E6B044A6B4ED24E), 
SPH_C64(0xC7FCB176FCC754C7), - SPH_C64(0x6DC4224FC46DB76D), SPH_C64(0xE96AF21B6AE926E9), - SPH_C64(0x27BB0225BB276927), SPH_C64(0x405D7A3A5D40C040), - SPH_C64(0xD89F568E9FD875D8), SPH_C64(0x37EB92A5EB375937), - SPH_C64(0x92E076E4E092AB92), SPH_C64(0x8F89830C898F8C8F), - SPH_C64(0x0105090805010301), SPH_C64(0x1D69F5E8691D271D), - SPH_C64(0x5302F1A20253F553), SPH_C64(0x3EC6D3EDC63E423E), - SPH_C64(0x5920ABF22059EB59), SPH_C64(0xC1E28746E2C15EC1), - SPH_C64(0x4F6E0D426E4FD14F), SPH_C64(0x32FABF8DFA325632), - SPH_C64(0x164EA6B04E163A16), SPH_C64(0xFA35798335FA13FA), - SPH_C64(0x74B9F387B9749C74), SPH_C64(0xFB30708B30FB10FB), - SPH_C64(0x63F25C3FF263A563), SPH_C64(0x9FD9138CD99FBC9F), - SPH_C64(0x34E489BDE4345C34), SPH_C64(0x1A72CAD0721A2E1A), - SPH_C64(0x2A82674D822A7E2A), SPH_C64(0x5A2FB0EA2F5AEE5A), - SPH_C64(0x8D83911C838D8A8D), SPH_C64(0xC9CACF06CAC946C9), - SPH_C64(0xCFD4F936D4CF4CCF), SPH_C64(0xF60915E309F607F6), - SPH_C64(0x90EA64F4EA90AD90), SPH_C64(0x2888755D88287828), - SPH_C64(0x8892BC3492888588), SPH_C64(0x9BCD37ACCD9BB09B), - SPH_C64(0x31F5A495F5315331), SPH_C64(0x0E367E70360E120E), - SPH_C64(0xBD733C8173BDDABD), SPH_C64(0x4A7F206A7F4ADE4A), - SPH_C64(0xE86FFB136FE825E8), SPH_C64(0x96F452C4F496A796), - SPH_C64(0xA604FF5904A6F7A6), SPH_C64(0x0C3C6C603C0C140C), - SPH_C64(0xC8CFC60ECFC845C8), SPH_C64(0x798096EF80798B79), - SPH_C64(0xBC76358976BCD9BC), SPH_C64(0xBE7C27997CBEDFBE), - SPH_C64(0xEF74C42B74EF2CEF), SPH_C64(0x6ECB3957CB6EB26E), - SPH_C64(0x46434C0A4346CA46), SPH_C64(0x97F15BCCF197A497), - SPH_C64(0x5B2AB9E22A5BED5B), SPH_C64(0xED7ED63B7EED2AED), - SPH_C64(0x197DD1C87D192B19), SPH_C64(0xD99A5F869AD976D9), - SPH_C64(0xAC26A50926ACE9AC), SPH_C64(0x99C725BCC799B699), - SPH_C64(0xA832812932A8E5A8), SPH_C64(0x298D7C558D297B29), - SPH_C64(0x64E96307E964AC64), SPH_C64(0x1F63E7F8631F211F), - SPH_C64(0xAD23AC0123ADEAAD), SPH_C64(0x551CC7921C55FF55), - SPH_C64(0x135F8B985F133513), SPH_C64(0xBB6D0AB16DBBD0BB), - SPH_C64(0xF70C1CEB0CF704F7), 
SPH_C64(0x6FCE305FCE6FB16F), - SPH_C64(0xB96718A167B9D6B9), SPH_C64(0x474645024647C947), - SPH_C64(0x2F934A65932F712F), SPH_C64(0xEE71CD2371EE2FEE), - SPH_C64(0xB86211A962B8D5B8), SPH_C64(0x7B8A84FF8A7B8D7B), - SPH_C64(0x8997B53C97898689), SPH_C64(0x30F0AD9DF0305030), - SPH_C64(0xD3B805D6B8D368D3), SPH_C64(0x7F9EA0DF9E7F817F), - SPH_C64(0x76B3E197B3769A76), SPH_C64(0x82B0E664B0829B82) -}; - -#endif - -static const long long int old0_RC[10] = { - SPH_C64(0xE46A9D482BEBD068), - SPH_C64(0x9E85F17D8156A3E3), - SPH_C64(0xD561A917CA788E2C), - SPH_C64(0x422251773C8C0B5D), - SPH_C64(0x18B386CC8041543F), - SPH_C64(0x6BD136F46206572E), - SPH_C64(0xF92649DA1075651B), - SPH_C64(0xAB5250AEBAE766CB), - SPH_C64(0xFE20043B730DF005), - SPH_C64(0xA0C0B50A5FB4F5DD) -}; - -/* ====================================================================== */ -/* - * Constants for plain WHIRLPOOL-1 (second version). - */ - -static const long long int old1_T0[256] = { - SPH_C64(0x78D8C07818281818), SPH_C64(0xAF2605AF23652323), - SPH_C64(0xF9B87EF9C657C6C6), SPH_C64(0x6FFB136FE825E8E8), - SPH_C64(0xA1CB4CA187948787), SPH_C64(0x6211A962B8D5B8B8), - SPH_C64(0x0509080501030101), SPH_C64(0x6E0D426E4FD14F4F), - SPH_C64(0xEE9BADEE365A3636), SPH_C64(0x04FF5904A6F7A6A6), - SPH_C64(0xBD0CDEBDD26BD2D2), SPH_C64(0x060EFB06F502F5F5), - SPH_C64(0x8096EF80798B7979), SPH_C64(0xCE305FCE6FB16F6F), - SPH_C64(0xEF6DFCEF91AE9191), SPH_C64(0x07F8AA0752F65252), - SPH_C64(0xFD4727FD60A06060), SPH_C64(0x76358976BCD9BCBC), - SPH_C64(0xCD37ACCD9BB09B9B), SPH_C64(0x8C8A048C8E8F8E8E), - SPH_C64(0x15D27115A3F8A3A3), SPH_C64(0x3C6C603C0C140C0C), - SPH_C64(0x8A84FF8A7B8D7B7B), SPH_C64(0xE180B5E1355F3535), - SPH_C64(0x69F5E8691D271D1D), SPH_C64(0x47B35347E03DE0E0), - SPH_C64(0xAC21F6ACD764D7D7), SPH_C64(0xED9C5EEDC25BC2C2), - SPH_C64(0x96436D962E722E2E), SPH_C64(0x7A29627A4BDD4B4B), - SPH_C64(0x215DA321FE1FFEFE), SPH_C64(0x16D5821657F95757), - SPH_C64(0x41BDA841153F1515), SPH_C64(0xB6E89FB677997777), - 
SPH_C64(0xEB92A5EB37593737), SPH_C64(0x569E7B56E532E5E5), - SPH_C64(0xD9138CD99FBC9F9F), SPH_C64(0x1723D317F00DF0F0), - SPH_C64(0x7F206A7F4ADE4A4A), SPH_C64(0x95449E95DA73DADA), - SPH_C64(0x25A2FA2558E85858), SPH_C64(0xCACF06CAC946C9C9), - SPH_C64(0x8D7C558D297B2929), SPH_C64(0x225A50220A1E0A0A), - SPH_C64(0x4F50E14FB1CEB1B1), SPH_C64(0x1AC9691AA0FDA0A0), - SPH_C64(0xDA147FDA6BBD6B6B), SPH_C64(0xABD95CAB85928585), - SPH_C64(0x733C8173BDDABDBD), SPH_C64(0x348FD2345DE75D5D), - SPH_C64(0x5090805010301010), SPH_C64(0x0307F303F401F4F4), - SPH_C64(0xC0DD16C0CB40CBCB), SPH_C64(0xC6D3EDC63E423E3E), - SPH_C64(0x112D2811050F0505), SPH_C64(0xE6781FE667A96767), - SPH_C64(0x53977353E431E4E4), SPH_C64(0xBB0225BB27692727), - SPH_C64(0x5873325841C34141), SPH_C64(0x9DA72C9D8B808B8B), - SPH_C64(0x01F65101A7F4A7A7), SPH_C64(0x94B2CF947D877D7D), - SPH_C64(0xFB49DCFB95A29595), SPH_C64(0x9F568E9FD875D8D8), - SPH_C64(0x30708B30FB10FBFB), SPH_C64(0x71CD2371EE2FEEEE), - SPH_C64(0x91BBC7917C847C7C), SPH_C64(0xE37117E366AA6666), - SPH_C64(0x8E7BA68EDD7ADDDD), SPH_C64(0x4BAFB84B17391717), - SPH_C64(0x4645024647C94747), SPH_C64(0xDC1A84DC9EBF9E9E), - SPH_C64(0xC5D41EC5CA43CACA), SPH_C64(0x995875992D772D2D), - SPH_C64(0x792E9179BFDCBFBF), SPH_C64(0x1B3F381B07090707), - SPH_C64(0x23AC0123ADEAADAD), SPH_C64(0x2FB0EA2F5AEE5A5A), - SPH_C64(0xB5EF6CB583988383), SPH_C64(0xFFB685FF33553333), - SPH_C64(0xF25C3FF263A56363), SPH_C64(0x0A12100A02060202), - SPH_C64(0x38933938AAE3AAAA), SPH_C64(0xA8DEAFA871937171), - SPH_C64(0xCFC60ECFC845C8C8), SPH_C64(0x7DD1C87D192B1919), - SPH_C64(0x703B727049DB4949), SPH_C64(0x9A5F869AD976D9D9), - SPH_C64(0x1D31C31DF20BF2F2), SPH_C64(0x48A84B48E338E3E3), - SPH_C64(0x2AB9E22A5BED5B5B), SPH_C64(0x92BC349288858888), - SPH_C64(0xC83EA4C89AB39A9A), SPH_C64(0xBE0B2DBE266A2626), - SPH_C64(0xFABF8DFA32563232), SPH_C64(0x4A59E94AB0CDB0B0), - SPH_C64(0x6AF21B6AE926E9E9), SPH_C64(0x337778330F110F0F), - SPH_C64(0xA633E6A6D562D5D5), SPH_C64(0xBAF474BA809D8080), - 
SPH_C64(0x7C27997CBEDFBEBE), SPH_C64(0xDEEB26DECD4ACDCD), - SPH_C64(0xE489BDE4345C3434), SPH_C64(0x75327A7548D84848), - SPH_C64(0x2454AB24FF1CFFFF), SPH_C64(0x8F8DF78F7A8E7A7A), - SPH_C64(0xEA64F4EA90AD9090), SPH_C64(0x3E9DC23E5FE15F5F), - SPH_C64(0xA03D1DA020602020), SPH_C64(0xD50F67D568B86868), - SPH_C64(0x72CAD0721A2E1A1A), SPH_C64(0x2CB7192CAEEFAEAE), - SPH_C64(0x5E7DC95EB4C1B4B4), SPH_C64(0x19CE9A1954FC5454), - SPH_C64(0xE57FECE593A89393), SPH_C64(0xAA2F0DAA22662222), - SPH_C64(0xE96307E964AC6464), SPH_C64(0x122ADB12F10EF1F1), - SPH_C64(0xA2CCBFA273957373), SPH_C64(0x5A82905A12361212), - SPH_C64(0x5D7A3A5D40C04040), SPH_C64(0x2848402808180808), - SPH_C64(0xE89556E8C358C3C3), SPH_C64(0x7BDF337BEC29ECEC), - SPH_C64(0x904D9690DB70DBDB), SPH_C64(0x1FC0611FA1FEA1A1), - SPH_C64(0x83911C838D8A8D8D), SPH_C64(0xC9C8F5C93D473D3D), - SPH_C64(0xF15BCCF197A49797), SPH_C64(0x0000000000000000), - SPH_C64(0xD4F936D4CF4CCFCF), SPH_C64(0x876E45872B7D2B2B), - SPH_C64(0xB3E197B3769A7676), SPH_C64(0xB0E664B0829B8282), - SPH_C64(0xA928FEA9D667D6D6), SPH_C64(0x77C3D8771B2D1B1B), - SPH_C64(0x5B74C15BB5C2B5B5), SPH_C64(0x29BE1129AFECAFAF), - SPH_C64(0xDF1D77DF6ABE6A6A), SPH_C64(0x0DEABA0D50F05050), - SPH_C64(0x4C57124C45CF4545), SPH_C64(0x1838CB18F308F3F3), - SPH_C64(0xF0AD9DF030503030), SPH_C64(0x74C42B74EF2CEFEF), - SPH_C64(0xC3DAE5C33F413F3F), SPH_C64(0x1CC7921C55FF5555), - SPH_C64(0x10DB7910A2FBA2A2), SPH_C64(0x65E90365EA23EAEA), - SPH_C64(0xEC6A0FEC65AF6565), SPH_C64(0x6803B968BAD3BABA), - SPH_C64(0x934A65932F712F2F), SPH_C64(0xE78E4EE7C05DC0C0), - SPH_C64(0x8160BE81DE7FDEDE), SPH_C64(0x6CFCE06C1C241C1C), - SPH_C64(0x2E46BB2EFD1AFDFD), SPH_C64(0x641F52644DD74D4D), - SPH_C64(0xE076E4E092AB9292), SPH_C64(0xBCFA8FBC759F7575), - SPH_C64(0x1E36301E060A0606), SPH_C64(0x98AE24988A838A8A), - SPH_C64(0x404BF940B2CBB2B2), SPH_C64(0x59856359E637E6E6), - SPH_C64(0x367E70360E120E0E), SPH_C64(0x63E7F8631F211F1F), - SPH_C64(0xF75537F762A66262), SPH_C64(0xA33AEEA3D461D4D4), - 
SPH_C64(0x32812932A8E5A8A8), SPH_C64(0xF452C4F496A79696), - SPH_C64(0x3A629B3AF916F9F9), SPH_C64(0xF6A366F6C552C5C5), - SPH_C64(0xB11035B1256F2525), SPH_C64(0x20ABF22059EB5959), - SPH_C64(0xAED054AE84918484), SPH_C64(0xA7C5B7A772967272), - SPH_C64(0xDDECD5DD394B3939), SPH_C64(0x61165A614CD44C4C), - SPH_C64(0x3B94CA3B5EE25E5E), SPH_C64(0x859FE78578887878), - SPH_C64(0xD8E5DDD838483838), SPH_C64(0x869814868C898C8C), - SPH_C64(0xB217C6B2D16ED1D1), SPH_C64(0x0BE4410BA5F2A5A5), - SPH_C64(0x4DA1434DE23BE2E2), SPH_C64(0xF84E2FF861A36161), - SPH_C64(0x4542F145B3C8B3B3), SPH_C64(0xA53415A521632121), - SPH_C64(0xD60894D69CB99C9C), SPH_C64(0x66EEF0661E221E1E), - SPH_C64(0x5261225243C54343), SPH_C64(0xFCB176FCC754C7C7), - SPH_C64(0x2B4FB32BFC19FCFC), SPH_C64(0x14242014040C0404), - SPH_C64(0x08E3B20851F35151), SPH_C64(0xC725BCC799B69999), - SPH_C64(0xC4224FC46DB76D6D), SPH_C64(0x396568390D170D0D), - SPH_C64(0x35798335FA13FAFA), SPH_C64(0x8469B684DF7CDFDF), - SPH_C64(0x9BA9D79B7E827E7E), SPH_C64(0xB4193DB4246C2424), - SPH_C64(0xD7FEC5D73B4D3B3B), SPH_C64(0x3D9A313DABE0ABAB), - SPH_C64(0xD1F03ED1CE4FCECE), SPH_C64(0x5599885511331111), - SPH_C64(0x89830C898F8C8F8F), SPH_C64(0x6B044A6B4ED24E4E), - SPH_C64(0x5166D151B7C4B7B7), SPH_C64(0x60E00B60EB20EBEB), - SPH_C64(0xCCC1FDCC3C443C3C), SPH_C64(0xBFFD7CBF819E8181), - SPH_C64(0xFE40D4FE94A19494), SPH_C64(0x0C1CEB0CF704F7F7), - SPH_C64(0x6718A167B9D6B9B9), SPH_C64(0x5F8B985F13351313), - SPH_C64(0x9C517D9C2C742C2C), SPH_C64(0xB805D6B8D368D3D3), - SPH_C64(0x5C8C6B5CE734E7E7), SPH_C64(0xCB3957CB6EB26E6E), - SPH_C64(0xF3AA6EF3C451C4C4), SPH_C64(0x0F1B180F03050303), - SPH_C64(0x13DC8A1356FA5656), SPH_C64(0x495E1A4944CC4444), - SPH_C64(0x9EA0DF9E7F817F7F), SPH_C64(0x37882137A9E6A9A9), - SPH_C64(0x82674D822A7E2A2A), SPH_C64(0x6D0AB16DBBD0BBBB), - SPH_C64(0xE28746E2C15EC1C1), SPH_C64(0x02F1A20253F55353), - SPH_C64(0x8B72AE8BDC79DCDC), SPH_C64(0x275358270B1D0B0B), - SPH_C64(0xD3019CD39DBA9D9D), SPH_C64(0xC12B47C16CB46C6C), - 
SPH_C64(0xF5A495F531533131), SPH_C64(0xB9F387B9749C7474), - SPH_C64(0x0915E309F607F6F6), SPH_C64(0x434C0A4346CA4646), - SPH_C64(0x26A50926ACE9ACAC), SPH_C64(0x97B53C9789868989), - SPH_C64(0x44B4A044143C1414), SPH_C64(0x42BA5B42E13EE1E1), - SPH_C64(0x4EA6B04E163A1616), SPH_C64(0xD2F7CDD23A4E3A3A), - SPH_C64(0xD0066FD069BB6969), SPH_C64(0x2D41482D091B0909), - SPH_C64(0xADD7A7AD70907070), SPH_C64(0x546FD954B6C7B6B6), - SPH_C64(0xB71ECEB7D06DD0D0), SPH_C64(0x7ED63B7EED2AEDED), - SPH_C64(0xDBE22EDBCC49CCCC), SPH_C64(0x57682A5742C64242), - SPH_C64(0xC22CB4C298B59898), SPH_C64(0x0EED490EA4F1A4A4), - SPH_C64(0x88755D8828782828), SPH_C64(0x3186DA315CE45C5C), - SPH_C64(0x3F6B933FF815F8F8), SPH_C64(0xA4C244A486978686) -}; - -#if !SPH_SMALL_FOOTPRINT_WHIRLPOOL - -static const long long int old1_T1[256] = { - SPH_C64(0xD8C0781828181878), SPH_C64(0x2605AF23652323AF), - SPH_C64(0xB87EF9C657C6C6F9), SPH_C64(0xFB136FE825E8E86F), - SPH_C64(0xCB4CA187948787A1), SPH_C64(0x11A962B8D5B8B862), - SPH_C64(0x0908050103010105), SPH_C64(0x0D426E4FD14F4F6E), - SPH_C64(0x9BADEE365A3636EE), SPH_C64(0xFF5904A6F7A6A604), - SPH_C64(0x0CDEBDD26BD2D2BD), SPH_C64(0x0EFB06F502F5F506), - SPH_C64(0x96EF80798B797980), SPH_C64(0x305FCE6FB16F6FCE), - SPH_C64(0x6DFCEF91AE9191EF), SPH_C64(0xF8AA0752F6525207), - SPH_C64(0x4727FD60A06060FD), SPH_C64(0x358976BCD9BCBC76), - SPH_C64(0x37ACCD9BB09B9BCD), SPH_C64(0x8A048C8E8F8E8E8C), - SPH_C64(0xD27115A3F8A3A315), SPH_C64(0x6C603C0C140C0C3C), - SPH_C64(0x84FF8A7B8D7B7B8A), SPH_C64(0x80B5E1355F3535E1), - SPH_C64(0xF5E8691D271D1D69), SPH_C64(0xB35347E03DE0E047), - SPH_C64(0x21F6ACD764D7D7AC), SPH_C64(0x9C5EEDC25BC2C2ED), - SPH_C64(0x436D962E722E2E96), SPH_C64(0x29627A4BDD4B4B7A), - SPH_C64(0x5DA321FE1FFEFE21), SPH_C64(0xD5821657F9575716), - SPH_C64(0xBDA841153F151541), SPH_C64(0xE89FB677997777B6), - SPH_C64(0x92A5EB37593737EB), SPH_C64(0x9E7B56E532E5E556), - SPH_C64(0x138CD99FBC9F9FD9), SPH_C64(0x23D317F00DF0F017), - SPH_C64(0x206A7F4ADE4A4A7F), 
SPH_C64(0x449E95DA73DADA95), - SPH_C64(0xA2FA2558E8585825), SPH_C64(0xCF06CAC946C9C9CA), - SPH_C64(0x7C558D297B29298D), SPH_C64(0x5A50220A1E0A0A22), - SPH_C64(0x50E14FB1CEB1B14F), SPH_C64(0xC9691AA0FDA0A01A), - SPH_C64(0x147FDA6BBD6B6BDA), SPH_C64(0xD95CAB85928585AB), - SPH_C64(0x3C8173BDDABDBD73), SPH_C64(0x8FD2345DE75D5D34), - SPH_C64(0x9080501030101050), SPH_C64(0x07F303F401F4F403), - SPH_C64(0xDD16C0CB40CBCBC0), SPH_C64(0xD3EDC63E423E3EC6), - SPH_C64(0x2D2811050F050511), SPH_C64(0x781FE667A96767E6), - SPH_C64(0x977353E431E4E453), SPH_C64(0x0225BB27692727BB), - SPH_C64(0x73325841C3414158), SPH_C64(0xA72C9D8B808B8B9D), - SPH_C64(0xF65101A7F4A7A701), SPH_C64(0xB2CF947D877D7D94), - SPH_C64(0x49DCFB95A29595FB), SPH_C64(0x568E9FD875D8D89F), - SPH_C64(0x708B30FB10FBFB30), SPH_C64(0xCD2371EE2FEEEE71), - SPH_C64(0xBBC7917C847C7C91), SPH_C64(0x7117E366AA6666E3), - SPH_C64(0x7BA68EDD7ADDDD8E), SPH_C64(0xAFB84B173917174B), - SPH_C64(0x45024647C9474746), SPH_C64(0x1A84DC9EBF9E9EDC), - SPH_C64(0xD41EC5CA43CACAC5), SPH_C64(0x5875992D772D2D99), - SPH_C64(0x2E9179BFDCBFBF79), SPH_C64(0x3F381B070907071B), - SPH_C64(0xAC0123ADEAADAD23), SPH_C64(0xB0EA2F5AEE5A5A2F), - SPH_C64(0xEF6CB583988383B5), SPH_C64(0xB685FF33553333FF), - SPH_C64(0x5C3FF263A56363F2), SPH_C64(0x12100A020602020A), - SPH_C64(0x933938AAE3AAAA38), SPH_C64(0xDEAFA871937171A8), - SPH_C64(0xC60ECFC845C8C8CF), SPH_C64(0xD1C87D192B19197D), - SPH_C64(0x3B727049DB494970), SPH_C64(0x5F869AD976D9D99A), - SPH_C64(0x31C31DF20BF2F21D), SPH_C64(0xA84B48E338E3E348), - SPH_C64(0xB9E22A5BED5B5B2A), SPH_C64(0xBC34928885888892), - SPH_C64(0x3EA4C89AB39A9AC8), SPH_C64(0x0B2DBE266A2626BE), - SPH_C64(0xBF8DFA32563232FA), SPH_C64(0x59E94AB0CDB0B04A), - SPH_C64(0xF21B6AE926E9E96A), SPH_C64(0x7778330F110F0F33), - SPH_C64(0x33E6A6D562D5D5A6), SPH_C64(0xF474BA809D8080BA), - SPH_C64(0x27997CBEDFBEBE7C), SPH_C64(0xEB26DECD4ACDCDDE), - SPH_C64(0x89BDE4345C3434E4), SPH_C64(0x327A7548D8484875), - SPH_C64(0x54AB24FF1CFFFF24), 
SPH_C64(0x8DF78F7A8E7A7A8F), - SPH_C64(0x64F4EA90AD9090EA), SPH_C64(0x9DC23E5FE15F5F3E), - SPH_C64(0x3D1DA020602020A0), SPH_C64(0x0F67D568B86868D5), - SPH_C64(0xCAD0721A2E1A1A72), SPH_C64(0xB7192CAEEFAEAE2C), - SPH_C64(0x7DC95EB4C1B4B45E), SPH_C64(0xCE9A1954FC545419), - SPH_C64(0x7FECE593A89393E5), SPH_C64(0x2F0DAA22662222AA), - SPH_C64(0x6307E964AC6464E9), SPH_C64(0x2ADB12F10EF1F112), - SPH_C64(0xCCBFA273957373A2), SPH_C64(0x82905A123612125A), - SPH_C64(0x7A3A5D40C040405D), SPH_C64(0x4840280818080828), - SPH_C64(0x9556E8C358C3C3E8), SPH_C64(0xDF337BEC29ECEC7B), - SPH_C64(0x4D9690DB70DBDB90), SPH_C64(0xC0611FA1FEA1A11F), - SPH_C64(0x911C838D8A8D8D83), SPH_C64(0xC8F5C93D473D3DC9), - SPH_C64(0x5BCCF197A49797F1), SPH_C64(0x0000000000000000), - SPH_C64(0xF936D4CF4CCFCFD4), SPH_C64(0x6E45872B7D2B2B87), - SPH_C64(0xE197B3769A7676B3), SPH_C64(0xE664B0829B8282B0), - SPH_C64(0x28FEA9D667D6D6A9), SPH_C64(0xC3D8771B2D1B1B77), - SPH_C64(0x74C15BB5C2B5B55B), SPH_C64(0xBE1129AFECAFAF29), - SPH_C64(0x1D77DF6ABE6A6ADF), SPH_C64(0xEABA0D50F050500D), - SPH_C64(0x57124C45CF45454C), SPH_C64(0x38CB18F308F3F318), - SPH_C64(0xAD9DF030503030F0), SPH_C64(0xC42B74EF2CEFEF74), - SPH_C64(0xDAE5C33F413F3FC3), SPH_C64(0xC7921C55FF55551C), - SPH_C64(0xDB7910A2FBA2A210), SPH_C64(0xE90365EA23EAEA65), - SPH_C64(0x6A0FEC65AF6565EC), SPH_C64(0x03B968BAD3BABA68), - SPH_C64(0x4A65932F712F2F93), SPH_C64(0x8E4EE7C05DC0C0E7), - SPH_C64(0x60BE81DE7FDEDE81), SPH_C64(0xFCE06C1C241C1C6C), - SPH_C64(0x46BB2EFD1AFDFD2E), SPH_C64(0x1F52644DD74D4D64), - SPH_C64(0x76E4E092AB9292E0), SPH_C64(0xFA8FBC759F7575BC), - SPH_C64(0x36301E060A06061E), SPH_C64(0xAE24988A838A8A98), - SPH_C64(0x4BF940B2CBB2B240), SPH_C64(0x856359E637E6E659), - SPH_C64(0x7E70360E120E0E36), SPH_C64(0xE7F8631F211F1F63), - SPH_C64(0x5537F762A66262F7), SPH_C64(0x3AEEA3D461D4D4A3), - SPH_C64(0x812932A8E5A8A832), SPH_C64(0x52C4F496A79696F4), - SPH_C64(0x629B3AF916F9F93A), SPH_C64(0xA366F6C552C5C5F6), - SPH_C64(0x1035B1256F2525B1), 
SPH_C64(0xABF22059EB595920), - SPH_C64(0xD054AE84918484AE), SPH_C64(0xC5B7A772967272A7), - SPH_C64(0xECD5DD394B3939DD), SPH_C64(0x165A614CD44C4C61), - SPH_C64(0x94CA3B5EE25E5E3B), SPH_C64(0x9FE7857888787885), - SPH_C64(0xE5DDD838483838D8), SPH_C64(0x9814868C898C8C86), - SPH_C64(0x17C6B2D16ED1D1B2), SPH_C64(0xE4410BA5F2A5A50B), - SPH_C64(0xA1434DE23BE2E24D), SPH_C64(0x4E2FF861A36161F8), - SPH_C64(0x42F145B3C8B3B345), SPH_C64(0x3415A521632121A5), - SPH_C64(0x0894D69CB99C9CD6), SPH_C64(0xEEF0661E221E1E66), - SPH_C64(0x61225243C5434352), SPH_C64(0xB176FCC754C7C7FC), - SPH_C64(0x4FB32BFC19FCFC2B), SPH_C64(0x242014040C040414), - SPH_C64(0xE3B20851F3515108), SPH_C64(0x25BCC799B69999C7), - SPH_C64(0x224FC46DB76D6DC4), SPH_C64(0x6568390D170D0D39), - SPH_C64(0x798335FA13FAFA35), SPH_C64(0x69B684DF7CDFDF84), - SPH_C64(0xA9D79B7E827E7E9B), SPH_C64(0x193DB4246C2424B4), - SPH_C64(0xFEC5D73B4D3B3BD7), SPH_C64(0x9A313DABE0ABAB3D), - SPH_C64(0xF03ED1CE4FCECED1), SPH_C64(0x9988551133111155), - SPH_C64(0x830C898F8C8F8F89), SPH_C64(0x044A6B4ED24E4E6B), - SPH_C64(0x66D151B7C4B7B751), SPH_C64(0xE00B60EB20EBEB60), - SPH_C64(0xC1FDCC3C443C3CCC), SPH_C64(0xFD7CBF819E8181BF), - SPH_C64(0x40D4FE94A19494FE), SPH_C64(0x1CEB0CF704F7F70C), - SPH_C64(0x18A167B9D6B9B967), SPH_C64(0x8B985F133513135F), - SPH_C64(0x517D9C2C742C2C9C), SPH_C64(0x05D6B8D368D3D3B8), - SPH_C64(0x8C6B5CE734E7E75C), SPH_C64(0x3957CB6EB26E6ECB), - SPH_C64(0xAA6EF3C451C4C4F3), SPH_C64(0x1B180F030503030F), - SPH_C64(0xDC8A1356FA565613), SPH_C64(0x5E1A4944CC444449), - SPH_C64(0xA0DF9E7F817F7F9E), SPH_C64(0x882137A9E6A9A937), - SPH_C64(0x674D822A7E2A2A82), SPH_C64(0x0AB16DBBD0BBBB6D), - SPH_C64(0x8746E2C15EC1C1E2), SPH_C64(0xF1A20253F5535302), - SPH_C64(0x72AE8BDC79DCDC8B), SPH_C64(0x5358270B1D0B0B27), - SPH_C64(0x019CD39DBA9D9DD3), SPH_C64(0x2B47C16CB46C6CC1), - SPH_C64(0xA495F531533131F5), SPH_C64(0xF387B9749C7474B9), - SPH_C64(0x15E309F607F6F609), SPH_C64(0x4C0A4346CA464643), - SPH_C64(0xA50926ACE9ACAC26), 
SPH_C64(0xB53C978986898997), - SPH_C64(0xB4A044143C141444), SPH_C64(0xBA5B42E13EE1E142), - SPH_C64(0xA6B04E163A16164E), SPH_C64(0xF7CDD23A4E3A3AD2), - SPH_C64(0x066FD069BB6969D0), SPH_C64(0x41482D091B09092D), - SPH_C64(0xD7A7AD70907070AD), SPH_C64(0x6FD954B6C7B6B654), - SPH_C64(0x1ECEB7D06DD0D0B7), SPH_C64(0xD63B7EED2AEDED7E), - SPH_C64(0xE22EDBCC49CCCCDB), SPH_C64(0x682A5742C6424257), - SPH_C64(0x2CB4C298B59898C2), SPH_C64(0xED490EA4F1A4A40E), - SPH_C64(0x755D882878282888), SPH_C64(0x86DA315CE45C5C31), - SPH_C64(0x6B933FF815F8F83F), SPH_C64(0xC244A486978686A4) -}; - -static const long long int old1_T2[256] = { - SPH_C64(0xC0781828181878D8), SPH_C64(0x05AF23652323AF26), - SPH_C64(0x7EF9C657C6C6F9B8), SPH_C64(0x136FE825E8E86FFB), - SPH_C64(0x4CA187948787A1CB), SPH_C64(0xA962B8D5B8B86211), - SPH_C64(0x0805010301010509), SPH_C64(0x426E4FD14F4F6E0D), - SPH_C64(0xADEE365A3636EE9B), SPH_C64(0x5904A6F7A6A604FF), - SPH_C64(0xDEBDD26BD2D2BD0C), SPH_C64(0xFB06F502F5F5060E), - SPH_C64(0xEF80798B79798096), SPH_C64(0x5FCE6FB16F6FCE30), - SPH_C64(0xFCEF91AE9191EF6D), SPH_C64(0xAA0752F6525207F8), - SPH_C64(0x27FD60A06060FD47), SPH_C64(0x8976BCD9BCBC7635), - SPH_C64(0xACCD9BB09B9BCD37), SPH_C64(0x048C8E8F8E8E8C8A), - SPH_C64(0x7115A3F8A3A315D2), SPH_C64(0x603C0C140C0C3C6C), - SPH_C64(0xFF8A7B8D7B7B8A84), SPH_C64(0xB5E1355F3535E180), - SPH_C64(0xE8691D271D1D69F5), SPH_C64(0x5347E03DE0E047B3), - SPH_C64(0xF6ACD764D7D7AC21), SPH_C64(0x5EEDC25BC2C2ED9C), - SPH_C64(0x6D962E722E2E9643), SPH_C64(0x627A4BDD4B4B7A29), - SPH_C64(0xA321FE1FFEFE215D), SPH_C64(0x821657F9575716D5), - SPH_C64(0xA841153F151541BD), SPH_C64(0x9FB677997777B6E8), - SPH_C64(0xA5EB37593737EB92), SPH_C64(0x7B56E532E5E5569E), - SPH_C64(0x8CD99FBC9F9FD913), SPH_C64(0xD317F00DF0F01723), - SPH_C64(0x6A7F4ADE4A4A7F20), SPH_C64(0x9E95DA73DADA9544), - SPH_C64(0xFA2558E8585825A2), SPH_C64(0x06CAC946C9C9CACF), - SPH_C64(0x558D297B29298D7C), SPH_C64(0x50220A1E0A0A225A), - SPH_C64(0xE14FB1CEB1B14F50), SPH_C64(0x691AA0FDA0A01AC9), 
- SPH_C64(0x7FDA6BBD6B6BDA14), SPH_C64(0x5CAB85928585ABD9), - SPH_C64(0x8173BDDABDBD733C), SPH_C64(0xD2345DE75D5D348F), - SPH_C64(0x8050103010105090), SPH_C64(0xF303F401F4F40307), - SPH_C64(0x16C0CB40CBCBC0DD), SPH_C64(0xEDC63E423E3EC6D3), - SPH_C64(0x2811050F0505112D), SPH_C64(0x1FE667A96767E678), - SPH_C64(0x7353E431E4E45397), SPH_C64(0x25BB27692727BB02), - SPH_C64(0x325841C341415873), SPH_C64(0x2C9D8B808B8B9DA7), - SPH_C64(0x5101A7F4A7A701F6), SPH_C64(0xCF947D877D7D94B2), - SPH_C64(0xDCFB95A29595FB49), SPH_C64(0x8E9FD875D8D89F56), - SPH_C64(0x8B30FB10FBFB3070), SPH_C64(0x2371EE2FEEEE71CD), - SPH_C64(0xC7917C847C7C91BB), SPH_C64(0x17E366AA6666E371), - SPH_C64(0xA68EDD7ADDDD8E7B), SPH_C64(0xB84B173917174BAF), - SPH_C64(0x024647C947474645), SPH_C64(0x84DC9EBF9E9EDC1A), - SPH_C64(0x1EC5CA43CACAC5D4), SPH_C64(0x75992D772D2D9958), - SPH_C64(0x9179BFDCBFBF792E), SPH_C64(0x381B070907071B3F), - SPH_C64(0x0123ADEAADAD23AC), SPH_C64(0xEA2F5AEE5A5A2FB0), - SPH_C64(0x6CB583988383B5EF), SPH_C64(0x85FF33553333FFB6), - SPH_C64(0x3FF263A56363F25C), SPH_C64(0x100A020602020A12), - SPH_C64(0x3938AAE3AAAA3893), SPH_C64(0xAFA871937171A8DE), - SPH_C64(0x0ECFC845C8C8CFC6), SPH_C64(0xC87D192B19197DD1), - SPH_C64(0x727049DB4949703B), SPH_C64(0x869AD976D9D99A5F), - SPH_C64(0xC31DF20BF2F21D31), SPH_C64(0x4B48E338E3E348A8), - SPH_C64(0xE22A5BED5B5B2AB9), SPH_C64(0x34928885888892BC), - SPH_C64(0xA4C89AB39A9AC83E), SPH_C64(0x2DBE266A2626BE0B), - SPH_C64(0x8DFA32563232FABF), SPH_C64(0xE94AB0CDB0B04A59), - SPH_C64(0x1B6AE926E9E96AF2), SPH_C64(0x78330F110F0F3377), - SPH_C64(0xE6A6D562D5D5A633), SPH_C64(0x74BA809D8080BAF4), - SPH_C64(0x997CBEDFBEBE7C27), SPH_C64(0x26DECD4ACDCDDEEB), - SPH_C64(0xBDE4345C3434E489), SPH_C64(0x7A7548D848487532), - SPH_C64(0xAB24FF1CFFFF2454), SPH_C64(0xF78F7A8E7A7A8F8D), - SPH_C64(0xF4EA90AD9090EA64), SPH_C64(0xC23E5FE15F5F3E9D), - SPH_C64(0x1DA020602020A03D), SPH_C64(0x67D568B86868D50F), - SPH_C64(0xD0721A2E1A1A72CA), SPH_C64(0x192CAEEFAEAE2CB7), - 
SPH_C64(0xC95EB4C1B4B45E7D), SPH_C64(0x9A1954FC545419CE), - SPH_C64(0xECE593A89393E57F), SPH_C64(0x0DAA22662222AA2F), - SPH_C64(0x07E964AC6464E963), SPH_C64(0xDB12F10EF1F1122A), - SPH_C64(0xBFA273957373A2CC), SPH_C64(0x905A123612125A82), - SPH_C64(0x3A5D40C040405D7A), SPH_C64(0x4028081808082848), - SPH_C64(0x56E8C358C3C3E895), SPH_C64(0x337BEC29ECEC7BDF), - SPH_C64(0x9690DB70DBDB904D), SPH_C64(0x611FA1FEA1A11FC0), - SPH_C64(0x1C838D8A8D8D8391), SPH_C64(0xF5C93D473D3DC9C8), - SPH_C64(0xCCF197A49797F15B), SPH_C64(0x0000000000000000), - SPH_C64(0x36D4CF4CCFCFD4F9), SPH_C64(0x45872B7D2B2B876E), - SPH_C64(0x97B3769A7676B3E1), SPH_C64(0x64B0829B8282B0E6), - SPH_C64(0xFEA9D667D6D6A928), SPH_C64(0xD8771B2D1B1B77C3), - SPH_C64(0xC15BB5C2B5B55B74), SPH_C64(0x1129AFECAFAF29BE), - SPH_C64(0x77DF6ABE6A6ADF1D), SPH_C64(0xBA0D50F050500DEA), - SPH_C64(0x124C45CF45454C57), SPH_C64(0xCB18F308F3F31838), - SPH_C64(0x9DF030503030F0AD), SPH_C64(0x2B74EF2CEFEF74C4), - SPH_C64(0xE5C33F413F3FC3DA), SPH_C64(0x921C55FF55551CC7), - SPH_C64(0x7910A2FBA2A210DB), SPH_C64(0x0365EA23EAEA65E9), - SPH_C64(0x0FEC65AF6565EC6A), SPH_C64(0xB968BAD3BABA6803), - SPH_C64(0x65932F712F2F934A), SPH_C64(0x4EE7C05DC0C0E78E), - SPH_C64(0xBE81DE7FDEDE8160), SPH_C64(0xE06C1C241C1C6CFC), - SPH_C64(0xBB2EFD1AFDFD2E46), SPH_C64(0x52644DD74D4D641F), - SPH_C64(0xE4E092AB9292E076), SPH_C64(0x8FBC759F7575BCFA), - SPH_C64(0x301E060A06061E36), SPH_C64(0x24988A838A8A98AE), - SPH_C64(0xF940B2CBB2B2404B), SPH_C64(0x6359E637E6E65985), - SPH_C64(0x70360E120E0E367E), SPH_C64(0xF8631F211F1F63E7), - SPH_C64(0x37F762A66262F755), SPH_C64(0xEEA3D461D4D4A33A), - SPH_C64(0x2932A8E5A8A83281), SPH_C64(0xC4F496A79696F452), - SPH_C64(0x9B3AF916F9F93A62), SPH_C64(0x66F6C552C5C5F6A3), - SPH_C64(0x35B1256F2525B110), SPH_C64(0xF22059EB595920AB), - SPH_C64(0x54AE84918484AED0), SPH_C64(0xB7A772967272A7C5), - SPH_C64(0xD5DD394B3939DDEC), SPH_C64(0x5A614CD44C4C6116), - SPH_C64(0xCA3B5EE25E5E3B94), SPH_C64(0xE78578887878859F), - 
SPH_C64(0xDDD838483838D8E5), SPH_C64(0x14868C898C8C8698), - SPH_C64(0xC6B2D16ED1D1B217), SPH_C64(0x410BA5F2A5A50BE4), - SPH_C64(0x434DE23BE2E24DA1), SPH_C64(0x2FF861A36161F84E), - SPH_C64(0xF145B3C8B3B34542), SPH_C64(0x15A521632121A534), - SPH_C64(0x94D69CB99C9CD608), SPH_C64(0xF0661E221E1E66EE), - SPH_C64(0x225243C543435261), SPH_C64(0x76FCC754C7C7FCB1), - SPH_C64(0xB32BFC19FCFC2B4F), SPH_C64(0x2014040C04041424), - SPH_C64(0xB20851F3515108E3), SPH_C64(0xBCC799B69999C725), - SPH_C64(0x4FC46DB76D6DC422), SPH_C64(0x68390D170D0D3965), - SPH_C64(0x8335FA13FAFA3579), SPH_C64(0xB684DF7CDFDF8469), - SPH_C64(0xD79B7E827E7E9BA9), SPH_C64(0x3DB4246C2424B419), - SPH_C64(0xC5D73B4D3B3BD7FE), SPH_C64(0x313DABE0ABAB3D9A), - SPH_C64(0x3ED1CE4FCECED1F0), SPH_C64(0x8855113311115599), - SPH_C64(0x0C898F8C8F8F8983), SPH_C64(0x4A6B4ED24E4E6B04), - SPH_C64(0xD151B7C4B7B75166), SPH_C64(0x0B60EB20EBEB60E0), - SPH_C64(0xFDCC3C443C3CCCC1), SPH_C64(0x7CBF819E8181BFFD), - SPH_C64(0xD4FE94A19494FE40), SPH_C64(0xEB0CF704F7F70C1C), - SPH_C64(0xA167B9D6B9B96718), SPH_C64(0x985F133513135F8B), - SPH_C64(0x7D9C2C742C2C9C51), SPH_C64(0xD6B8D368D3D3B805), - SPH_C64(0x6B5CE734E7E75C8C), SPH_C64(0x57CB6EB26E6ECB39), - SPH_C64(0x6EF3C451C4C4F3AA), SPH_C64(0x180F030503030F1B), - SPH_C64(0x8A1356FA565613DC), SPH_C64(0x1A4944CC4444495E), - SPH_C64(0xDF9E7F817F7F9EA0), SPH_C64(0x2137A9E6A9A93788), - SPH_C64(0x4D822A7E2A2A8267), SPH_C64(0xB16DBBD0BBBB6D0A), - SPH_C64(0x46E2C15EC1C1E287), SPH_C64(0xA20253F5535302F1), - SPH_C64(0xAE8BDC79DCDC8B72), SPH_C64(0x58270B1D0B0B2753), - SPH_C64(0x9CD39DBA9D9DD301), SPH_C64(0x47C16CB46C6CC12B), - SPH_C64(0x95F531533131F5A4), SPH_C64(0x87B9749C7474B9F3), - SPH_C64(0xE309F607F6F60915), SPH_C64(0x0A4346CA4646434C), - SPH_C64(0x0926ACE9ACAC26A5), SPH_C64(0x3C978986898997B5), - SPH_C64(0xA044143C141444B4), SPH_C64(0x5B42E13EE1E142BA), - SPH_C64(0xB04E163A16164EA6), SPH_C64(0xCDD23A4E3A3AD2F7), - SPH_C64(0x6FD069BB6969D006), SPH_C64(0x482D091B09092D41), - 
SPH_C64(0xA7AD70907070ADD7), SPH_C64(0xD954B6C7B6B6546F), - SPH_C64(0xCEB7D06DD0D0B71E), SPH_C64(0x3B7EED2AEDED7ED6), - SPH_C64(0x2EDBCC49CCCCDBE2), SPH_C64(0x2A5742C642425768), - SPH_C64(0xB4C298B59898C22C), SPH_C64(0x490EA4F1A4A40EED), - SPH_C64(0x5D88287828288875), SPH_C64(0xDA315CE45C5C3186), - SPH_C64(0x933FF815F8F83F6B), SPH_C64(0x44A486978686A4C2) -}; - -static const long long int old1_T3[256] = { - SPH_C64(0x781828181878D8C0), SPH_C64(0xAF23652323AF2605), - SPH_C64(0xF9C657C6C6F9B87E), SPH_C64(0x6FE825E8E86FFB13), - SPH_C64(0xA187948787A1CB4C), SPH_C64(0x62B8D5B8B86211A9), - SPH_C64(0x0501030101050908), SPH_C64(0x6E4FD14F4F6E0D42), - SPH_C64(0xEE365A3636EE9BAD), SPH_C64(0x04A6F7A6A604FF59), - SPH_C64(0xBDD26BD2D2BD0CDE), SPH_C64(0x06F502F5F5060EFB), - SPH_C64(0x80798B79798096EF), SPH_C64(0xCE6FB16F6FCE305F), - SPH_C64(0xEF91AE9191EF6DFC), SPH_C64(0x0752F6525207F8AA), - SPH_C64(0xFD60A06060FD4727), SPH_C64(0x76BCD9BCBC763589), - SPH_C64(0xCD9BB09B9BCD37AC), SPH_C64(0x8C8E8F8E8E8C8A04), - SPH_C64(0x15A3F8A3A315D271), SPH_C64(0x3C0C140C0C3C6C60), - SPH_C64(0x8A7B8D7B7B8A84FF), SPH_C64(0xE1355F3535E180B5), - SPH_C64(0x691D271D1D69F5E8), SPH_C64(0x47E03DE0E047B353), - SPH_C64(0xACD764D7D7AC21F6), SPH_C64(0xEDC25BC2C2ED9C5E), - SPH_C64(0x962E722E2E96436D), SPH_C64(0x7A4BDD4B4B7A2962), - SPH_C64(0x21FE1FFEFE215DA3), SPH_C64(0x1657F9575716D582), - SPH_C64(0x41153F151541BDA8), SPH_C64(0xB677997777B6E89F), - SPH_C64(0xEB37593737EB92A5), SPH_C64(0x56E532E5E5569E7B), - SPH_C64(0xD99FBC9F9FD9138C), SPH_C64(0x17F00DF0F01723D3), - SPH_C64(0x7F4ADE4A4A7F206A), SPH_C64(0x95DA73DADA95449E), - SPH_C64(0x2558E8585825A2FA), SPH_C64(0xCAC946C9C9CACF06), - SPH_C64(0x8D297B29298D7C55), SPH_C64(0x220A1E0A0A225A50), - SPH_C64(0x4FB1CEB1B14F50E1), SPH_C64(0x1AA0FDA0A01AC969), - SPH_C64(0xDA6BBD6B6BDA147F), SPH_C64(0xAB85928585ABD95C), - SPH_C64(0x73BDDABDBD733C81), SPH_C64(0x345DE75D5D348FD2), - SPH_C64(0x5010301010509080), SPH_C64(0x03F401F4F40307F3), - SPH_C64(0xC0CB40CBCBC0DD16), 
SPH_C64(0xC63E423E3EC6D3ED), - SPH_C64(0x11050F0505112D28), SPH_C64(0xE667A96767E6781F), - SPH_C64(0x53E431E4E4539773), SPH_C64(0xBB27692727BB0225), - SPH_C64(0x5841C34141587332), SPH_C64(0x9D8B808B8B9DA72C), - SPH_C64(0x01A7F4A7A701F651), SPH_C64(0x947D877D7D94B2CF), - SPH_C64(0xFB95A29595FB49DC), SPH_C64(0x9FD875D8D89F568E), - SPH_C64(0x30FB10FBFB30708B), SPH_C64(0x71EE2FEEEE71CD23), - SPH_C64(0x917C847C7C91BBC7), SPH_C64(0xE366AA6666E37117), - SPH_C64(0x8EDD7ADDDD8E7BA6), SPH_C64(0x4B173917174BAFB8), - SPH_C64(0x4647C94747464502), SPH_C64(0xDC9EBF9E9EDC1A84), - SPH_C64(0xC5CA43CACAC5D41E), SPH_C64(0x992D772D2D995875), - SPH_C64(0x79BFDCBFBF792E91), SPH_C64(0x1B070907071B3F38), - SPH_C64(0x23ADEAADAD23AC01), SPH_C64(0x2F5AEE5A5A2FB0EA), - SPH_C64(0xB583988383B5EF6C), SPH_C64(0xFF33553333FFB685), - SPH_C64(0xF263A56363F25C3F), SPH_C64(0x0A020602020A1210), - SPH_C64(0x38AAE3AAAA389339), SPH_C64(0xA871937171A8DEAF), - SPH_C64(0xCFC845C8C8CFC60E), SPH_C64(0x7D192B19197DD1C8), - SPH_C64(0x7049DB4949703B72), SPH_C64(0x9AD976D9D99A5F86), - SPH_C64(0x1DF20BF2F21D31C3), SPH_C64(0x48E338E3E348A84B), - SPH_C64(0x2A5BED5B5B2AB9E2), SPH_C64(0x928885888892BC34), - SPH_C64(0xC89AB39A9AC83EA4), SPH_C64(0xBE266A2626BE0B2D), - SPH_C64(0xFA32563232FABF8D), SPH_C64(0x4AB0CDB0B04A59E9), - SPH_C64(0x6AE926E9E96AF21B), SPH_C64(0x330F110F0F337778), - SPH_C64(0xA6D562D5D5A633E6), SPH_C64(0xBA809D8080BAF474), - SPH_C64(0x7CBEDFBEBE7C2799), SPH_C64(0xDECD4ACDCDDEEB26), - SPH_C64(0xE4345C3434E489BD), SPH_C64(0x7548D8484875327A), - SPH_C64(0x24FF1CFFFF2454AB), SPH_C64(0x8F7A8E7A7A8F8DF7), - SPH_C64(0xEA90AD9090EA64F4), SPH_C64(0x3E5FE15F5F3E9DC2), - SPH_C64(0xA020602020A03D1D), SPH_C64(0xD568B86868D50F67), - SPH_C64(0x721A2E1A1A72CAD0), SPH_C64(0x2CAEEFAEAE2CB719), - SPH_C64(0x5EB4C1B4B45E7DC9), SPH_C64(0x1954FC545419CE9A), - SPH_C64(0xE593A89393E57FEC), SPH_C64(0xAA22662222AA2F0D), - SPH_C64(0xE964AC6464E96307), SPH_C64(0x12F10EF1F1122ADB), - SPH_C64(0xA273957373A2CCBF), 
SPH_C64(0x5A123612125A8290), - SPH_C64(0x5D40C040405D7A3A), SPH_C64(0x2808180808284840), - SPH_C64(0xE8C358C3C3E89556), SPH_C64(0x7BEC29ECEC7BDF33), - SPH_C64(0x90DB70DBDB904D96), SPH_C64(0x1FA1FEA1A11FC061), - SPH_C64(0x838D8A8D8D83911C), SPH_C64(0xC93D473D3DC9C8F5), - SPH_C64(0xF197A49797F15BCC), SPH_C64(0x0000000000000000), - SPH_C64(0xD4CF4CCFCFD4F936), SPH_C64(0x872B7D2B2B876E45), - SPH_C64(0xB3769A7676B3E197), SPH_C64(0xB0829B8282B0E664), - SPH_C64(0xA9D667D6D6A928FE), SPH_C64(0x771B2D1B1B77C3D8), - SPH_C64(0x5BB5C2B5B55B74C1), SPH_C64(0x29AFECAFAF29BE11), - SPH_C64(0xDF6ABE6A6ADF1D77), SPH_C64(0x0D50F050500DEABA), - SPH_C64(0x4C45CF45454C5712), SPH_C64(0x18F308F3F31838CB), - SPH_C64(0xF030503030F0AD9D), SPH_C64(0x74EF2CEFEF74C42B), - SPH_C64(0xC33F413F3FC3DAE5), SPH_C64(0x1C55FF55551CC792), - SPH_C64(0x10A2FBA2A210DB79), SPH_C64(0x65EA23EAEA65E903), - SPH_C64(0xEC65AF6565EC6A0F), SPH_C64(0x68BAD3BABA6803B9), - SPH_C64(0x932F712F2F934A65), SPH_C64(0xE7C05DC0C0E78E4E), - SPH_C64(0x81DE7FDEDE8160BE), SPH_C64(0x6C1C241C1C6CFCE0), - SPH_C64(0x2EFD1AFDFD2E46BB), SPH_C64(0x644DD74D4D641F52), - SPH_C64(0xE092AB9292E076E4), SPH_C64(0xBC759F7575BCFA8F), - SPH_C64(0x1E060A06061E3630), SPH_C64(0x988A838A8A98AE24), - SPH_C64(0x40B2CBB2B2404BF9), SPH_C64(0x59E637E6E6598563), - SPH_C64(0x360E120E0E367E70), SPH_C64(0x631F211F1F63E7F8), - SPH_C64(0xF762A66262F75537), SPH_C64(0xA3D461D4D4A33AEE), - SPH_C64(0x32A8E5A8A8328129), SPH_C64(0xF496A79696F452C4), - SPH_C64(0x3AF916F9F93A629B), SPH_C64(0xF6C552C5C5F6A366), - SPH_C64(0xB1256F2525B11035), SPH_C64(0x2059EB595920ABF2), - SPH_C64(0xAE84918484AED054), SPH_C64(0xA772967272A7C5B7), - SPH_C64(0xDD394B3939DDECD5), SPH_C64(0x614CD44C4C61165A), - SPH_C64(0x3B5EE25E5E3B94CA), SPH_C64(0x8578887878859FE7), - SPH_C64(0xD838483838D8E5DD), SPH_C64(0x868C898C8C869814), - SPH_C64(0xB2D16ED1D1B217C6), SPH_C64(0x0BA5F2A5A50BE441), - SPH_C64(0x4DE23BE2E24DA143), SPH_C64(0xF861A36161F84E2F), - SPH_C64(0x45B3C8B3B34542F1), 
SPH_C64(0xA521632121A53415), - SPH_C64(0xD69CB99C9CD60894), SPH_C64(0x661E221E1E66EEF0), - SPH_C64(0x5243C54343526122), SPH_C64(0xFCC754C7C7FCB176), - SPH_C64(0x2BFC19FCFC2B4FB3), SPH_C64(0x14040C0404142420), - SPH_C64(0x0851F3515108E3B2), SPH_C64(0xC799B69999C725BC), - SPH_C64(0xC46DB76D6DC4224F), SPH_C64(0x390D170D0D396568), - SPH_C64(0x35FA13FAFA357983), SPH_C64(0x84DF7CDFDF8469B6), - SPH_C64(0x9B7E827E7E9BA9D7), SPH_C64(0xB4246C2424B4193D), - SPH_C64(0xD73B4D3B3BD7FEC5), SPH_C64(0x3DABE0ABAB3D9A31), - SPH_C64(0xD1CE4FCECED1F03E), SPH_C64(0x5511331111559988), - SPH_C64(0x898F8C8F8F89830C), SPH_C64(0x6B4ED24E4E6B044A), - SPH_C64(0x51B7C4B7B75166D1), SPH_C64(0x60EB20EBEB60E00B), - SPH_C64(0xCC3C443C3CCCC1FD), SPH_C64(0xBF819E8181BFFD7C), - SPH_C64(0xFE94A19494FE40D4), SPH_C64(0x0CF704F7F70C1CEB), - SPH_C64(0x67B9D6B9B96718A1), SPH_C64(0x5F133513135F8B98), - SPH_C64(0x9C2C742C2C9C517D), SPH_C64(0xB8D368D3D3B805D6), - SPH_C64(0x5CE734E7E75C8C6B), SPH_C64(0xCB6EB26E6ECB3957), - SPH_C64(0xF3C451C4C4F3AA6E), SPH_C64(0x0F030503030F1B18), - SPH_C64(0x1356FA565613DC8A), SPH_C64(0x4944CC4444495E1A), - SPH_C64(0x9E7F817F7F9EA0DF), SPH_C64(0x37A9E6A9A9378821), - SPH_C64(0x822A7E2A2A82674D), SPH_C64(0x6DBBD0BBBB6D0AB1), - SPH_C64(0xE2C15EC1C1E28746), SPH_C64(0x0253F5535302F1A2), - SPH_C64(0x8BDC79DCDC8B72AE), SPH_C64(0x270B1D0B0B275358), - SPH_C64(0xD39DBA9D9DD3019C), SPH_C64(0xC16CB46C6CC12B47), - SPH_C64(0xF531533131F5A495), SPH_C64(0xB9749C7474B9F387), - SPH_C64(0x09F607F6F60915E3), SPH_C64(0x4346CA4646434C0A), - SPH_C64(0x26ACE9ACAC26A509), SPH_C64(0x978986898997B53C), - SPH_C64(0x44143C141444B4A0), SPH_C64(0x42E13EE1E142BA5B), - SPH_C64(0x4E163A16164EA6B0), SPH_C64(0xD23A4E3A3AD2F7CD), - SPH_C64(0xD069BB6969D0066F), SPH_C64(0x2D091B09092D4148), - SPH_C64(0xAD70907070ADD7A7), SPH_C64(0x54B6C7B6B6546FD9), - SPH_C64(0xB7D06DD0D0B71ECE), SPH_C64(0x7EED2AEDED7ED63B), - SPH_C64(0xDBCC49CCCCDBE22E), SPH_C64(0x5742C6424257682A), - SPH_C64(0xC298B59898C22CB4), 
SPH_C64(0x0EA4F1A4A40EED49), - SPH_C64(0x882878282888755D), SPH_C64(0x315CE45C5C3186DA), - SPH_C64(0x3FF815F8F83F6B93), SPH_C64(0xA486978686A4C244) -}; - -static const long long int old1_T4[256] = { - SPH_C64(0x1828181878D8C078), SPH_C64(0x23652323AF2605AF), - SPH_C64(0xC657C6C6F9B87EF9), SPH_C64(0xE825E8E86FFB136F), - SPH_C64(0x87948787A1CB4CA1), SPH_C64(0xB8D5B8B86211A962), - SPH_C64(0x0103010105090805), SPH_C64(0x4FD14F4F6E0D426E), - SPH_C64(0x365A3636EE9BADEE), SPH_C64(0xA6F7A6A604FF5904), - SPH_C64(0xD26BD2D2BD0CDEBD), SPH_C64(0xF502F5F5060EFB06), - SPH_C64(0x798B79798096EF80), SPH_C64(0x6FB16F6FCE305FCE), - SPH_C64(0x91AE9191EF6DFCEF), SPH_C64(0x52F6525207F8AA07), - SPH_C64(0x60A06060FD4727FD), SPH_C64(0xBCD9BCBC76358976), - SPH_C64(0x9BB09B9BCD37ACCD), SPH_C64(0x8E8F8E8E8C8A048C), - SPH_C64(0xA3F8A3A315D27115), SPH_C64(0x0C140C0C3C6C603C), - SPH_C64(0x7B8D7B7B8A84FF8A), SPH_C64(0x355F3535E180B5E1), - SPH_C64(0x1D271D1D69F5E869), SPH_C64(0xE03DE0E047B35347), - SPH_C64(0xD764D7D7AC21F6AC), SPH_C64(0xC25BC2C2ED9C5EED), - SPH_C64(0x2E722E2E96436D96), SPH_C64(0x4BDD4B4B7A29627A), - SPH_C64(0xFE1FFEFE215DA321), SPH_C64(0x57F9575716D58216), - SPH_C64(0x153F151541BDA841), SPH_C64(0x77997777B6E89FB6), - SPH_C64(0x37593737EB92A5EB), SPH_C64(0xE532E5E5569E7B56), - SPH_C64(0x9FBC9F9FD9138CD9), SPH_C64(0xF00DF0F01723D317), - SPH_C64(0x4ADE4A4A7F206A7F), SPH_C64(0xDA73DADA95449E95), - SPH_C64(0x58E8585825A2FA25), SPH_C64(0xC946C9C9CACF06CA), - SPH_C64(0x297B29298D7C558D), SPH_C64(0x0A1E0A0A225A5022), - SPH_C64(0xB1CEB1B14F50E14F), SPH_C64(0xA0FDA0A01AC9691A), - SPH_C64(0x6BBD6B6BDA147FDA), SPH_C64(0x85928585ABD95CAB), - SPH_C64(0xBDDABDBD733C8173), SPH_C64(0x5DE75D5D348FD234), - SPH_C64(0x1030101050908050), SPH_C64(0xF401F4F40307F303), - SPH_C64(0xCB40CBCBC0DD16C0), SPH_C64(0x3E423E3EC6D3EDC6), - SPH_C64(0x050F0505112D2811), SPH_C64(0x67A96767E6781FE6), - SPH_C64(0xE431E4E453977353), SPH_C64(0x27692727BB0225BB), - SPH_C64(0x41C3414158733258), SPH_C64(0x8B808B8B9DA72C9D), 
- SPH_C64(0xA7F4A7A701F65101), SPH_C64(0x7D877D7D94B2CF94), - SPH_C64(0x95A29595FB49DCFB), SPH_C64(0xD875D8D89F568E9F), - SPH_C64(0xFB10FBFB30708B30), SPH_C64(0xEE2FEEEE71CD2371), - SPH_C64(0x7C847C7C91BBC791), SPH_C64(0x66AA6666E37117E3), - SPH_C64(0xDD7ADDDD8E7BA68E), SPH_C64(0x173917174BAFB84B), - SPH_C64(0x47C9474746450246), SPH_C64(0x9EBF9E9EDC1A84DC), - SPH_C64(0xCA43CACAC5D41EC5), SPH_C64(0x2D772D2D99587599), - SPH_C64(0xBFDCBFBF792E9179), SPH_C64(0x070907071B3F381B), - SPH_C64(0xADEAADAD23AC0123), SPH_C64(0x5AEE5A5A2FB0EA2F), - SPH_C64(0x83988383B5EF6CB5), SPH_C64(0x33553333FFB685FF), - SPH_C64(0x63A56363F25C3FF2), SPH_C64(0x020602020A12100A), - SPH_C64(0xAAE3AAAA38933938), SPH_C64(0x71937171A8DEAFA8), - SPH_C64(0xC845C8C8CFC60ECF), SPH_C64(0x192B19197DD1C87D), - SPH_C64(0x49DB4949703B7270), SPH_C64(0xD976D9D99A5F869A), - SPH_C64(0xF20BF2F21D31C31D), SPH_C64(0xE338E3E348A84B48), - SPH_C64(0x5BED5B5B2AB9E22A), SPH_C64(0x8885888892BC3492), - SPH_C64(0x9AB39A9AC83EA4C8), SPH_C64(0x266A2626BE0B2DBE), - SPH_C64(0x32563232FABF8DFA), SPH_C64(0xB0CDB0B04A59E94A), - SPH_C64(0xE926E9E96AF21B6A), SPH_C64(0x0F110F0F33777833), - SPH_C64(0xD562D5D5A633E6A6), SPH_C64(0x809D8080BAF474BA), - SPH_C64(0xBEDFBEBE7C27997C), SPH_C64(0xCD4ACDCDDEEB26DE), - SPH_C64(0x345C3434E489BDE4), SPH_C64(0x48D8484875327A75), - SPH_C64(0xFF1CFFFF2454AB24), SPH_C64(0x7A8E7A7A8F8DF78F), - SPH_C64(0x90AD9090EA64F4EA), SPH_C64(0x5FE15F5F3E9DC23E), - SPH_C64(0x20602020A03D1DA0), SPH_C64(0x68B86868D50F67D5), - SPH_C64(0x1A2E1A1A72CAD072), SPH_C64(0xAEEFAEAE2CB7192C), - SPH_C64(0xB4C1B4B45E7DC95E), SPH_C64(0x54FC545419CE9A19), - SPH_C64(0x93A89393E57FECE5), SPH_C64(0x22662222AA2F0DAA), - SPH_C64(0x64AC6464E96307E9), SPH_C64(0xF10EF1F1122ADB12), - SPH_C64(0x73957373A2CCBFA2), SPH_C64(0x123612125A82905A), - SPH_C64(0x40C040405D7A3A5D), SPH_C64(0x0818080828484028), - SPH_C64(0xC358C3C3E89556E8), SPH_C64(0xEC29ECEC7BDF337B), - SPH_C64(0xDB70DBDB904D9690), SPH_C64(0xA1FEA1A11FC0611F), - 
SPH_C64(0x8D8A8D8D83911C83), SPH_C64(0x3D473D3DC9C8F5C9), - SPH_C64(0x97A49797F15BCCF1), SPH_C64(0x0000000000000000), - SPH_C64(0xCF4CCFCFD4F936D4), SPH_C64(0x2B7D2B2B876E4587), - SPH_C64(0x769A7676B3E197B3), SPH_C64(0x829B8282B0E664B0), - SPH_C64(0xD667D6D6A928FEA9), SPH_C64(0x1B2D1B1B77C3D877), - SPH_C64(0xB5C2B5B55B74C15B), SPH_C64(0xAFECAFAF29BE1129), - SPH_C64(0x6ABE6A6ADF1D77DF), SPH_C64(0x50F050500DEABA0D), - SPH_C64(0x45CF45454C57124C), SPH_C64(0xF308F3F31838CB18), - SPH_C64(0x30503030F0AD9DF0), SPH_C64(0xEF2CEFEF74C42B74), - SPH_C64(0x3F413F3FC3DAE5C3), SPH_C64(0x55FF55551CC7921C), - SPH_C64(0xA2FBA2A210DB7910), SPH_C64(0xEA23EAEA65E90365), - SPH_C64(0x65AF6565EC6A0FEC), SPH_C64(0xBAD3BABA6803B968), - SPH_C64(0x2F712F2F934A6593), SPH_C64(0xC05DC0C0E78E4EE7), - SPH_C64(0xDE7FDEDE8160BE81), SPH_C64(0x1C241C1C6CFCE06C), - SPH_C64(0xFD1AFDFD2E46BB2E), SPH_C64(0x4DD74D4D641F5264), - SPH_C64(0x92AB9292E076E4E0), SPH_C64(0x759F7575BCFA8FBC), - SPH_C64(0x060A06061E36301E), SPH_C64(0x8A838A8A98AE2498), - SPH_C64(0xB2CBB2B2404BF940), SPH_C64(0xE637E6E659856359), - SPH_C64(0x0E120E0E367E7036), SPH_C64(0x1F211F1F63E7F863), - SPH_C64(0x62A66262F75537F7), SPH_C64(0xD461D4D4A33AEEA3), - SPH_C64(0xA8E5A8A832812932), SPH_C64(0x96A79696F452C4F4), - SPH_C64(0xF916F9F93A629B3A), SPH_C64(0xC552C5C5F6A366F6), - SPH_C64(0x256F2525B11035B1), SPH_C64(0x59EB595920ABF220), - SPH_C64(0x84918484AED054AE), SPH_C64(0x72967272A7C5B7A7), - SPH_C64(0x394B3939DDECD5DD), SPH_C64(0x4CD44C4C61165A61), - SPH_C64(0x5EE25E5E3B94CA3B), SPH_C64(0x78887878859FE785), - SPH_C64(0x38483838D8E5DDD8), SPH_C64(0x8C898C8C86981486), - SPH_C64(0xD16ED1D1B217C6B2), SPH_C64(0xA5F2A5A50BE4410B), - SPH_C64(0xE23BE2E24DA1434D), SPH_C64(0x61A36161F84E2FF8), - SPH_C64(0xB3C8B3B34542F145), SPH_C64(0x21632121A53415A5), - SPH_C64(0x9CB99C9CD60894D6), SPH_C64(0x1E221E1E66EEF066), - SPH_C64(0x43C5434352612252), SPH_C64(0xC754C7C7FCB176FC), - SPH_C64(0xFC19FCFC2B4FB32B), SPH_C64(0x040C040414242014), - 
SPH_C64(0x51F3515108E3B208), SPH_C64(0x99B69999C725BCC7), - SPH_C64(0x6DB76D6DC4224FC4), SPH_C64(0x0D170D0D39656839), - SPH_C64(0xFA13FAFA35798335), SPH_C64(0xDF7CDFDF8469B684), - SPH_C64(0x7E827E7E9BA9D79B), SPH_C64(0x246C2424B4193DB4), - SPH_C64(0x3B4D3B3BD7FEC5D7), SPH_C64(0xABE0ABAB3D9A313D), - SPH_C64(0xCE4FCECED1F03ED1), SPH_C64(0x1133111155998855), - SPH_C64(0x8F8C8F8F89830C89), SPH_C64(0x4ED24E4E6B044A6B), - SPH_C64(0xB7C4B7B75166D151), SPH_C64(0xEB20EBEB60E00B60), - SPH_C64(0x3C443C3CCCC1FDCC), SPH_C64(0x819E8181BFFD7CBF), - SPH_C64(0x94A19494FE40D4FE), SPH_C64(0xF704F7F70C1CEB0C), - SPH_C64(0xB9D6B9B96718A167), SPH_C64(0x133513135F8B985F), - SPH_C64(0x2C742C2C9C517D9C), SPH_C64(0xD368D3D3B805D6B8), - SPH_C64(0xE734E7E75C8C6B5C), SPH_C64(0x6EB26E6ECB3957CB), - SPH_C64(0xC451C4C4F3AA6EF3), SPH_C64(0x030503030F1B180F), - SPH_C64(0x56FA565613DC8A13), SPH_C64(0x44CC4444495E1A49), - SPH_C64(0x7F817F7F9EA0DF9E), SPH_C64(0xA9E6A9A937882137), - SPH_C64(0x2A7E2A2A82674D82), SPH_C64(0xBBD0BBBB6D0AB16D), - SPH_C64(0xC15EC1C1E28746E2), SPH_C64(0x53F5535302F1A202), - SPH_C64(0xDC79DCDC8B72AE8B), SPH_C64(0x0B1D0B0B27535827), - SPH_C64(0x9DBA9D9DD3019CD3), SPH_C64(0x6CB46C6CC12B47C1), - SPH_C64(0x31533131F5A495F5), SPH_C64(0x749C7474B9F387B9), - SPH_C64(0xF607F6F60915E309), SPH_C64(0x46CA4646434C0A43), - SPH_C64(0xACE9ACAC26A50926), SPH_C64(0x8986898997B53C97), - SPH_C64(0x143C141444B4A044), SPH_C64(0xE13EE1E142BA5B42), - SPH_C64(0x163A16164EA6B04E), SPH_C64(0x3A4E3A3AD2F7CDD2), - SPH_C64(0x69BB6969D0066FD0), SPH_C64(0x091B09092D41482D), - SPH_C64(0x70907070ADD7A7AD), SPH_C64(0xB6C7B6B6546FD954), - SPH_C64(0xD06DD0D0B71ECEB7), SPH_C64(0xED2AEDED7ED63B7E), - SPH_C64(0xCC49CCCCDBE22EDB), SPH_C64(0x42C6424257682A57), - SPH_C64(0x98B59898C22CB4C2), SPH_C64(0xA4F1A4A40EED490E), - SPH_C64(0x2878282888755D88), SPH_C64(0x5CE45C5C3186DA31), - SPH_C64(0xF815F8F83F6B933F), SPH_C64(0x86978686A4C244A4) -}; - -static const long long int old1_T5[256] = { - SPH_C64(0x28181878D8C07818), 
SPH_C64(0x652323AF2605AF23), - SPH_C64(0x57C6C6F9B87EF9C6), SPH_C64(0x25E8E86FFB136FE8), - SPH_C64(0x948787A1CB4CA187), SPH_C64(0xD5B8B86211A962B8), - SPH_C64(0x0301010509080501), SPH_C64(0xD14F4F6E0D426E4F), - SPH_C64(0x5A3636EE9BADEE36), SPH_C64(0xF7A6A604FF5904A6), - SPH_C64(0x6BD2D2BD0CDEBDD2), SPH_C64(0x02F5F5060EFB06F5), - SPH_C64(0x8B79798096EF8079), SPH_C64(0xB16F6FCE305FCE6F), - SPH_C64(0xAE9191EF6DFCEF91), SPH_C64(0xF6525207F8AA0752), - SPH_C64(0xA06060FD4727FD60), SPH_C64(0xD9BCBC76358976BC), - SPH_C64(0xB09B9BCD37ACCD9B), SPH_C64(0x8F8E8E8C8A048C8E), - SPH_C64(0xF8A3A315D27115A3), SPH_C64(0x140C0C3C6C603C0C), - SPH_C64(0x8D7B7B8A84FF8A7B), SPH_C64(0x5F3535E180B5E135), - SPH_C64(0x271D1D69F5E8691D), SPH_C64(0x3DE0E047B35347E0), - SPH_C64(0x64D7D7AC21F6ACD7), SPH_C64(0x5BC2C2ED9C5EEDC2), - SPH_C64(0x722E2E96436D962E), SPH_C64(0xDD4B4B7A29627A4B), - SPH_C64(0x1FFEFE215DA321FE), SPH_C64(0xF9575716D5821657), - SPH_C64(0x3F151541BDA84115), SPH_C64(0x997777B6E89FB677), - SPH_C64(0x593737EB92A5EB37), SPH_C64(0x32E5E5569E7B56E5), - SPH_C64(0xBC9F9FD9138CD99F), SPH_C64(0x0DF0F01723D317F0), - SPH_C64(0xDE4A4A7F206A7F4A), SPH_C64(0x73DADA95449E95DA), - SPH_C64(0xE8585825A2FA2558), SPH_C64(0x46C9C9CACF06CAC9), - SPH_C64(0x7B29298D7C558D29), SPH_C64(0x1E0A0A225A50220A), - SPH_C64(0xCEB1B14F50E14FB1), SPH_C64(0xFDA0A01AC9691AA0), - SPH_C64(0xBD6B6BDA147FDA6B), SPH_C64(0x928585ABD95CAB85), - SPH_C64(0xDABDBD733C8173BD), SPH_C64(0xE75D5D348FD2345D), - SPH_C64(0x3010105090805010), SPH_C64(0x01F4F40307F303F4), - SPH_C64(0x40CBCBC0DD16C0CB), SPH_C64(0x423E3EC6D3EDC63E), - SPH_C64(0x0F0505112D281105), SPH_C64(0xA96767E6781FE667), - SPH_C64(0x31E4E453977353E4), SPH_C64(0x692727BB0225BB27), - SPH_C64(0xC341415873325841), SPH_C64(0x808B8B9DA72C9D8B), - SPH_C64(0xF4A7A701F65101A7), SPH_C64(0x877D7D94B2CF947D), - SPH_C64(0xA29595FB49DCFB95), SPH_C64(0x75D8D89F568E9FD8), - SPH_C64(0x10FBFB30708B30FB), SPH_C64(0x2FEEEE71CD2371EE), - SPH_C64(0x847C7C91BBC7917C), 
SPH_C64(0xAA6666E37117E366), - SPH_C64(0x7ADDDD8E7BA68EDD), SPH_C64(0x3917174BAFB84B17), - SPH_C64(0xC947474645024647), SPH_C64(0xBF9E9EDC1A84DC9E), - SPH_C64(0x43CACAC5D41EC5CA), SPH_C64(0x772D2D995875992D), - SPH_C64(0xDCBFBF792E9179BF), SPH_C64(0x0907071B3F381B07), - SPH_C64(0xEAADAD23AC0123AD), SPH_C64(0xEE5A5A2FB0EA2F5A), - SPH_C64(0x988383B5EF6CB583), SPH_C64(0x553333FFB685FF33), - SPH_C64(0xA56363F25C3FF263), SPH_C64(0x0602020A12100A02), - SPH_C64(0xE3AAAA38933938AA), SPH_C64(0x937171A8DEAFA871), - SPH_C64(0x45C8C8CFC60ECFC8), SPH_C64(0x2B19197DD1C87D19), - SPH_C64(0xDB4949703B727049), SPH_C64(0x76D9D99A5F869AD9), - SPH_C64(0x0BF2F21D31C31DF2), SPH_C64(0x38E3E348A84B48E3), - SPH_C64(0xED5B5B2AB9E22A5B), SPH_C64(0x85888892BC349288), - SPH_C64(0xB39A9AC83EA4C89A), SPH_C64(0x6A2626BE0B2DBE26), - SPH_C64(0x563232FABF8DFA32), SPH_C64(0xCDB0B04A59E94AB0), - SPH_C64(0x26E9E96AF21B6AE9), SPH_C64(0x110F0F337778330F), - SPH_C64(0x62D5D5A633E6A6D5), SPH_C64(0x9D8080BAF474BA80), - SPH_C64(0xDFBEBE7C27997CBE), SPH_C64(0x4ACDCDDEEB26DECD), - SPH_C64(0x5C3434E489BDE434), SPH_C64(0xD8484875327A7548), - SPH_C64(0x1CFFFF2454AB24FF), SPH_C64(0x8E7A7A8F8DF78F7A), - SPH_C64(0xAD9090EA64F4EA90), SPH_C64(0xE15F5F3E9DC23E5F), - SPH_C64(0x602020A03D1DA020), SPH_C64(0xB86868D50F67D568), - SPH_C64(0x2E1A1A72CAD0721A), SPH_C64(0xEFAEAE2CB7192CAE), - SPH_C64(0xC1B4B45E7DC95EB4), SPH_C64(0xFC545419CE9A1954), - SPH_C64(0xA89393E57FECE593), SPH_C64(0x662222AA2F0DAA22), - SPH_C64(0xAC6464E96307E964), SPH_C64(0x0EF1F1122ADB12F1), - SPH_C64(0x957373A2CCBFA273), SPH_C64(0x3612125A82905A12), - SPH_C64(0xC040405D7A3A5D40), SPH_C64(0x1808082848402808), - SPH_C64(0x58C3C3E89556E8C3), SPH_C64(0x29ECEC7BDF337BEC), - SPH_C64(0x70DBDB904D9690DB), SPH_C64(0xFEA1A11FC0611FA1), - SPH_C64(0x8A8D8D83911C838D), SPH_C64(0x473D3DC9C8F5C93D), - SPH_C64(0xA49797F15BCCF197), SPH_C64(0x0000000000000000), - SPH_C64(0x4CCFCFD4F936D4CF), SPH_C64(0x7D2B2B876E45872B), - SPH_C64(0x9A7676B3E197B376), 
SPH_C64(0x9B8282B0E664B082), - SPH_C64(0x67D6D6A928FEA9D6), SPH_C64(0x2D1B1B77C3D8771B), - SPH_C64(0xC2B5B55B74C15BB5), SPH_C64(0xECAFAF29BE1129AF), - SPH_C64(0xBE6A6ADF1D77DF6A), SPH_C64(0xF050500DEABA0D50), - SPH_C64(0xCF45454C57124C45), SPH_C64(0x08F3F31838CB18F3), - SPH_C64(0x503030F0AD9DF030), SPH_C64(0x2CEFEF74C42B74EF), - SPH_C64(0x413F3FC3DAE5C33F), SPH_C64(0xFF55551CC7921C55), - SPH_C64(0xFBA2A210DB7910A2), SPH_C64(0x23EAEA65E90365EA), - SPH_C64(0xAF6565EC6A0FEC65), SPH_C64(0xD3BABA6803B968BA), - SPH_C64(0x712F2F934A65932F), SPH_C64(0x5DC0C0E78E4EE7C0), - SPH_C64(0x7FDEDE8160BE81DE), SPH_C64(0x241C1C6CFCE06C1C), - SPH_C64(0x1AFDFD2E46BB2EFD), SPH_C64(0xD74D4D641F52644D), - SPH_C64(0xAB9292E076E4E092), SPH_C64(0x9F7575BCFA8FBC75), - SPH_C64(0x0A06061E36301E06), SPH_C64(0x838A8A98AE24988A), - SPH_C64(0xCBB2B2404BF940B2), SPH_C64(0x37E6E659856359E6), - SPH_C64(0x120E0E367E70360E), SPH_C64(0x211F1F63E7F8631F), - SPH_C64(0xA66262F75537F762), SPH_C64(0x61D4D4A33AEEA3D4), - SPH_C64(0xE5A8A832812932A8), SPH_C64(0xA79696F452C4F496), - SPH_C64(0x16F9F93A629B3AF9), SPH_C64(0x52C5C5F6A366F6C5), - SPH_C64(0x6F2525B11035B125), SPH_C64(0xEB595920ABF22059), - SPH_C64(0x918484AED054AE84), SPH_C64(0x967272A7C5B7A772), - SPH_C64(0x4B3939DDECD5DD39), SPH_C64(0xD44C4C61165A614C), - SPH_C64(0xE25E5E3B94CA3B5E), SPH_C64(0x887878859FE78578), - SPH_C64(0x483838D8E5DDD838), SPH_C64(0x898C8C869814868C), - SPH_C64(0x6ED1D1B217C6B2D1), SPH_C64(0xF2A5A50BE4410BA5), - SPH_C64(0x3BE2E24DA1434DE2), SPH_C64(0xA36161F84E2FF861), - SPH_C64(0xC8B3B34542F145B3), SPH_C64(0x632121A53415A521), - SPH_C64(0xB99C9CD60894D69C), SPH_C64(0x221E1E66EEF0661E), - SPH_C64(0xC543435261225243), SPH_C64(0x54C7C7FCB176FCC7), - SPH_C64(0x19FCFC2B4FB32BFC), SPH_C64(0x0C04041424201404), - SPH_C64(0xF3515108E3B20851), SPH_C64(0xB69999C725BCC799), - SPH_C64(0xB76D6DC4224FC46D), SPH_C64(0x170D0D396568390D), - SPH_C64(0x13FAFA35798335FA), SPH_C64(0x7CDFDF8469B684DF), - SPH_C64(0x827E7E9BA9D79B7E), 
SPH_C64(0x6C2424B4193DB424), - SPH_C64(0x4D3B3BD7FEC5D73B), SPH_C64(0xE0ABAB3D9A313DAB), - SPH_C64(0x4FCECED1F03ED1CE), SPH_C64(0x3311115599885511), - SPH_C64(0x8C8F8F89830C898F), SPH_C64(0xD24E4E6B044A6B4E), - SPH_C64(0xC4B7B75166D151B7), SPH_C64(0x20EBEB60E00B60EB), - SPH_C64(0x443C3CCCC1FDCC3C), SPH_C64(0x9E8181BFFD7CBF81), - SPH_C64(0xA19494FE40D4FE94), SPH_C64(0x04F7F70C1CEB0CF7), - SPH_C64(0xD6B9B96718A167B9), SPH_C64(0x3513135F8B985F13), - SPH_C64(0x742C2C9C517D9C2C), SPH_C64(0x68D3D3B805D6B8D3), - SPH_C64(0x34E7E75C8C6B5CE7), SPH_C64(0xB26E6ECB3957CB6E), - SPH_C64(0x51C4C4F3AA6EF3C4), SPH_C64(0x0503030F1B180F03), - SPH_C64(0xFA565613DC8A1356), SPH_C64(0xCC4444495E1A4944), - SPH_C64(0x817F7F9EA0DF9E7F), SPH_C64(0xE6A9A937882137A9), - SPH_C64(0x7E2A2A82674D822A), SPH_C64(0xD0BBBB6D0AB16DBB), - SPH_C64(0x5EC1C1E28746E2C1), SPH_C64(0xF5535302F1A20253), - SPH_C64(0x79DCDC8B72AE8BDC), SPH_C64(0x1D0B0B275358270B), - SPH_C64(0xBA9D9DD3019CD39D), SPH_C64(0xB46C6CC12B47C16C), - SPH_C64(0x533131F5A495F531), SPH_C64(0x9C7474B9F387B974), - SPH_C64(0x07F6F60915E309F6), SPH_C64(0xCA4646434C0A4346), - SPH_C64(0xE9ACAC26A50926AC), SPH_C64(0x86898997B53C9789), - SPH_C64(0x3C141444B4A04414), SPH_C64(0x3EE1E142BA5B42E1), - SPH_C64(0x3A16164EA6B04E16), SPH_C64(0x4E3A3AD2F7CDD23A), - SPH_C64(0xBB6969D0066FD069), SPH_C64(0x1B09092D41482D09), - SPH_C64(0x907070ADD7A7AD70), SPH_C64(0xC7B6B6546FD954B6), - SPH_C64(0x6DD0D0B71ECEB7D0), SPH_C64(0x2AEDED7ED63B7EED), - SPH_C64(0x49CCCCDBE22EDBCC), SPH_C64(0xC6424257682A5742), - SPH_C64(0xB59898C22CB4C298), SPH_C64(0xF1A4A40EED490EA4), - SPH_C64(0x78282888755D8828), SPH_C64(0xE45C5C3186DA315C), - SPH_C64(0x15F8F83F6B933FF8), SPH_C64(0x978686A4C244A486) -}; - -static const long long int old1_T6[256] = { - SPH_C64(0x181878D8C0781828), SPH_C64(0x2323AF2605AF2365), - SPH_C64(0xC6C6F9B87EF9C657), SPH_C64(0xE8E86FFB136FE825), - SPH_C64(0x8787A1CB4CA18794), SPH_C64(0xB8B86211A962B8D5), - SPH_C64(0x0101050908050103), SPH_C64(0x4F4F6E0D426E4FD1), 
- SPH_C64(0x3636EE9BADEE365A), SPH_C64(0xA6A604FF5904A6F7), - SPH_C64(0xD2D2BD0CDEBDD26B), SPH_C64(0xF5F5060EFB06F502), - SPH_C64(0x79798096EF80798B), SPH_C64(0x6F6FCE305FCE6FB1), - SPH_C64(0x9191EF6DFCEF91AE), SPH_C64(0x525207F8AA0752F6), - SPH_C64(0x6060FD4727FD60A0), SPH_C64(0xBCBC76358976BCD9), - SPH_C64(0x9B9BCD37ACCD9BB0), SPH_C64(0x8E8E8C8A048C8E8F), - SPH_C64(0xA3A315D27115A3F8), SPH_C64(0x0C0C3C6C603C0C14), - SPH_C64(0x7B7B8A84FF8A7B8D), SPH_C64(0x3535E180B5E1355F), - SPH_C64(0x1D1D69F5E8691D27), SPH_C64(0xE0E047B35347E03D), - SPH_C64(0xD7D7AC21F6ACD764), SPH_C64(0xC2C2ED9C5EEDC25B), - SPH_C64(0x2E2E96436D962E72), SPH_C64(0x4B4B7A29627A4BDD), - SPH_C64(0xFEFE215DA321FE1F), SPH_C64(0x575716D5821657F9), - SPH_C64(0x151541BDA841153F), SPH_C64(0x7777B6E89FB67799), - SPH_C64(0x3737EB92A5EB3759), SPH_C64(0xE5E5569E7B56E532), - SPH_C64(0x9F9FD9138CD99FBC), SPH_C64(0xF0F01723D317F00D), - SPH_C64(0x4A4A7F206A7F4ADE), SPH_C64(0xDADA95449E95DA73), - SPH_C64(0x585825A2FA2558E8), SPH_C64(0xC9C9CACF06CAC946), - SPH_C64(0x29298D7C558D297B), SPH_C64(0x0A0A225A50220A1E), - SPH_C64(0xB1B14F50E14FB1CE), SPH_C64(0xA0A01AC9691AA0FD), - SPH_C64(0x6B6BDA147FDA6BBD), SPH_C64(0x8585ABD95CAB8592), - SPH_C64(0xBDBD733C8173BDDA), SPH_C64(0x5D5D348FD2345DE7), - SPH_C64(0x1010509080501030), SPH_C64(0xF4F40307F303F401), - SPH_C64(0xCBCBC0DD16C0CB40), SPH_C64(0x3E3EC6D3EDC63E42), - SPH_C64(0x0505112D2811050F), SPH_C64(0x6767E6781FE667A9), - SPH_C64(0xE4E453977353E431), SPH_C64(0x2727BB0225BB2769), - SPH_C64(0x41415873325841C3), SPH_C64(0x8B8B9DA72C9D8B80), - SPH_C64(0xA7A701F65101A7F4), SPH_C64(0x7D7D94B2CF947D87), - SPH_C64(0x9595FB49DCFB95A2), SPH_C64(0xD8D89F568E9FD875), - SPH_C64(0xFBFB30708B30FB10), SPH_C64(0xEEEE71CD2371EE2F), - SPH_C64(0x7C7C91BBC7917C84), SPH_C64(0x6666E37117E366AA), - SPH_C64(0xDDDD8E7BA68EDD7A), SPH_C64(0x17174BAFB84B1739), - SPH_C64(0x47474645024647C9), SPH_C64(0x9E9EDC1A84DC9EBF), - SPH_C64(0xCACAC5D41EC5CA43), SPH_C64(0x2D2D995875992D77), - 
SPH_C64(0xBFBF792E9179BFDC), SPH_C64(0x07071B3F381B0709), - SPH_C64(0xADAD23AC0123ADEA), SPH_C64(0x5A5A2FB0EA2F5AEE), - SPH_C64(0x8383B5EF6CB58398), SPH_C64(0x3333FFB685FF3355), - SPH_C64(0x6363F25C3FF263A5), SPH_C64(0x02020A12100A0206), - SPH_C64(0xAAAA38933938AAE3), SPH_C64(0x7171A8DEAFA87193), - SPH_C64(0xC8C8CFC60ECFC845), SPH_C64(0x19197DD1C87D192B), - SPH_C64(0x4949703B727049DB), SPH_C64(0xD9D99A5F869AD976), - SPH_C64(0xF2F21D31C31DF20B), SPH_C64(0xE3E348A84B48E338), - SPH_C64(0x5B5B2AB9E22A5BED), SPH_C64(0x888892BC34928885), - SPH_C64(0x9A9AC83EA4C89AB3), SPH_C64(0x2626BE0B2DBE266A), - SPH_C64(0x3232FABF8DFA3256), SPH_C64(0xB0B04A59E94AB0CD), - SPH_C64(0xE9E96AF21B6AE926), SPH_C64(0x0F0F337778330F11), - SPH_C64(0xD5D5A633E6A6D562), SPH_C64(0x8080BAF474BA809D), - SPH_C64(0xBEBE7C27997CBEDF), SPH_C64(0xCDCDDEEB26DECD4A), - SPH_C64(0x3434E489BDE4345C), SPH_C64(0x484875327A7548D8), - SPH_C64(0xFFFF2454AB24FF1C), SPH_C64(0x7A7A8F8DF78F7A8E), - SPH_C64(0x9090EA64F4EA90AD), SPH_C64(0x5F5F3E9DC23E5FE1), - SPH_C64(0x2020A03D1DA02060), SPH_C64(0x6868D50F67D568B8), - SPH_C64(0x1A1A72CAD0721A2E), SPH_C64(0xAEAE2CB7192CAEEF), - SPH_C64(0xB4B45E7DC95EB4C1), SPH_C64(0x545419CE9A1954FC), - SPH_C64(0x9393E57FECE593A8), SPH_C64(0x2222AA2F0DAA2266), - SPH_C64(0x6464E96307E964AC), SPH_C64(0xF1F1122ADB12F10E), - SPH_C64(0x7373A2CCBFA27395), SPH_C64(0x12125A82905A1236), - SPH_C64(0x40405D7A3A5D40C0), SPH_C64(0x0808284840280818), - SPH_C64(0xC3C3E89556E8C358), SPH_C64(0xECEC7BDF337BEC29), - SPH_C64(0xDBDB904D9690DB70), SPH_C64(0xA1A11FC0611FA1FE), - SPH_C64(0x8D8D83911C838D8A), SPH_C64(0x3D3DC9C8F5C93D47), - SPH_C64(0x9797F15BCCF197A4), SPH_C64(0x0000000000000000), - SPH_C64(0xCFCFD4F936D4CF4C), SPH_C64(0x2B2B876E45872B7D), - SPH_C64(0x7676B3E197B3769A), SPH_C64(0x8282B0E664B0829B), - SPH_C64(0xD6D6A928FEA9D667), SPH_C64(0x1B1B77C3D8771B2D), - SPH_C64(0xB5B55B74C15BB5C2), SPH_C64(0xAFAF29BE1129AFEC), - SPH_C64(0x6A6ADF1D77DF6ABE), SPH_C64(0x50500DEABA0D50F0), - 
SPH_C64(0x45454C57124C45CF), SPH_C64(0xF3F31838CB18F308), - SPH_C64(0x3030F0AD9DF03050), SPH_C64(0xEFEF74C42B74EF2C), - SPH_C64(0x3F3FC3DAE5C33F41), SPH_C64(0x55551CC7921C55FF), - SPH_C64(0xA2A210DB7910A2FB), SPH_C64(0xEAEA65E90365EA23), - SPH_C64(0x6565EC6A0FEC65AF), SPH_C64(0xBABA6803B968BAD3), - SPH_C64(0x2F2F934A65932F71), SPH_C64(0xC0C0E78E4EE7C05D), - SPH_C64(0xDEDE8160BE81DE7F), SPH_C64(0x1C1C6CFCE06C1C24), - SPH_C64(0xFDFD2E46BB2EFD1A), SPH_C64(0x4D4D641F52644DD7), - SPH_C64(0x9292E076E4E092AB), SPH_C64(0x7575BCFA8FBC759F), - SPH_C64(0x06061E36301E060A), SPH_C64(0x8A8A98AE24988A83), - SPH_C64(0xB2B2404BF940B2CB), SPH_C64(0xE6E659856359E637), - SPH_C64(0x0E0E367E70360E12), SPH_C64(0x1F1F63E7F8631F21), - SPH_C64(0x6262F75537F762A6), SPH_C64(0xD4D4A33AEEA3D461), - SPH_C64(0xA8A832812932A8E5), SPH_C64(0x9696F452C4F496A7), - SPH_C64(0xF9F93A629B3AF916), SPH_C64(0xC5C5F6A366F6C552), - SPH_C64(0x2525B11035B1256F), SPH_C64(0x595920ABF22059EB), - SPH_C64(0x8484AED054AE8491), SPH_C64(0x7272A7C5B7A77296), - SPH_C64(0x3939DDECD5DD394B), SPH_C64(0x4C4C61165A614CD4), - SPH_C64(0x5E5E3B94CA3B5EE2), SPH_C64(0x7878859FE7857888), - SPH_C64(0x3838D8E5DDD83848), SPH_C64(0x8C8C869814868C89), - SPH_C64(0xD1D1B217C6B2D16E), SPH_C64(0xA5A50BE4410BA5F2), - SPH_C64(0xE2E24DA1434DE23B), SPH_C64(0x6161F84E2FF861A3), - SPH_C64(0xB3B34542F145B3C8), SPH_C64(0x2121A53415A52163), - SPH_C64(0x9C9CD60894D69CB9), SPH_C64(0x1E1E66EEF0661E22), - SPH_C64(0x43435261225243C5), SPH_C64(0xC7C7FCB176FCC754), - SPH_C64(0xFCFC2B4FB32BFC19), SPH_C64(0x040414242014040C), - SPH_C64(0x515108E3B20851F3), SPH_C64(0x9999C725BCC799B6), - SPH_C64(0x6D6DC4224FC46DB7), SPH_C64(0x0D0D396568390D17), - SPH_C64(0xFAFA35798335FA13), SPH_C64(0xDFDF8469B684DF7C), - SPH_C64(0x7E7E9BA9D79B7E82), SPH_C64(0x2424B4193DB4246C), - SPH_C64(0x3B3BD7FEC5D73B4D), SPH_C64(0xABAB3D9A313DABE0), - SPH_C64(0xCECED1F03ED1CE4F), SPH_C64(0x1111559988551133), - SPH_C64(0x8F8F89830C898F8C), SPH_C64(0x4E4E6B044A6B4ED2), - 
SPH_C64(0xB7B75166D151B7C4), SPH_C64(0xEBEB60E00B60EB20), - SPH_C64(0x3C3CCCC1FDCC3C44), SPH_C64(0x8181BFFD7CBF819E), - SPH_C64(0x9494FE40D4FE94A1), SPH_C64(0xF7F70C1CEB0CF704), - SPH_C64(0xB9B96718A167B9D6), SPH_C64(0x13135F8B985F1335), - SPH_C64(0x2C2C9C517D9C2C74), SPH_C64(0xD3D3B805D6B8D368), - SPH_C64(0xE7E75C8C6B5CE734), SPH_C64(0x6E6ECB3957CB6EB2), - SPH_C64(0xC4C4F3AA6EF3C451), SPH_C64(0x03030F1B180F0305), - SPH_C64(0x565613DC8A1356FA), SPH_C64(0x4444495E1A4944CC), - SPH_C64(0x7F7F9EA0DF9E7F81), SPH_C64(0xA9A937882137A9E6), - SPH_C64(0x2A2A82674D822A7E), SPH_C64(0xBBBB6D0AB16DBBD0), - SPH_C64(0xC1C1E28746E2C15E), SPH_C64(0x535302F1A20253F5), - SPH_C64(0xDCDC8B72AE8BDC79), SPH_C64(0x0B0B275358270B1D), - SPH_C64(0x9D9DD3019CD39DBA), SPH_C64(0x6C6CC12B47C16CB4), - SPH_C64(0x3131F5A495F53153), SPH_C64(0x7474B9F387B9749C), - SPH_C64(0xF6F60915E309F607), SPH_C64(0x4646434C0A4346CA), - SPH_C64(0xACAC26A50926ACE9), SPH_C64(0x898997B53C978986), - SPH_C64(0x141444B4A044143C), SPH_C64(0xE1E142BA5B42E13E), - SPH_C64(0x16164EA6B04E163A), SPH_C64(0x3A3AD2F7CDD23A4E), - SPH_C64(0x6969D0066FD069BB), SPH_C64(0x09092D41482D091B), - SPH_C64(0x7070ADD7A7AD7090), SPH_C64(0xB6B6546FD954B6C7), - SPH_C64(0xD0D0B71ECEB7D06D), SPH_C64(0xEDED7ED63B7EED2A), - SPH_C64(0xCCCCDBE22EDBCC49), SPH_C64(0x424257682A5742C6), - SPH_C64(0x9898C22CB4C298B5), SPH_C64(0xA4A40EED490EA4F1), - SPH_C64(0x282888755D882878), SPH_C64(0x5C5C3186DA315CE4), - SPH_C64(0xF8F83F6B933FF815), SPH_C64(0x8686A4C244A48697) -}; - -static const long long int old1_T7[256] = { - SPH_C64(0x1878D8C078182818), SPH_C64(0x23AF2605AF236523), - SPH_C64(0xC6F9B87EF9C657C6), SPH_C64(0xE86FFB136FE825E8), - SPH_C64(0x87A1CB4CA1879487), SPH_C64(0xB86211A962B8D5B8), - SPH_C64(0x0105090805010301), SPH_C64(0x4F6E0D426E4FD14F), - SPH_C64(0x36EE9BADEE365A36), SPH_C64(0xA604FF5904A6F7A6), - SPH_C64(0xD2BD0CDEBDD26BD2), SPH_C64(0xF5060EFB06F502F5), - SPH_C64(0x798096EF80798B79), SPH_C64(0x6FCE305FCE6FB16F), - SPH_C64(0x91EF6DFCEF91AE91), 
SPH_C64(0x5207F8AA0752F652), - SPH_C64(0x60FD4727FD60A060), SPH_C64(0xBC76358976BCD9BC), - SPH_C64(0x9BCD37ACCD9BB09B), SPH_C64(0x8E8C8A048C8E8F8E), - SPH_C64(0xA315D27115A3F8A3), SPH_C64(0x0C3C6C603C0C140C), - SPH_C64(0x7B8A84FF8A7B8D7B), SPH_C64(0x35E180B5E1355F35), - SPH_C64(0x1D69F5E8691D271D), SPH_C64(0xE047B35347E03DE0), - SPH_C64(0xD7AC21F6ACD764D7), SPH_C64(0xC2ED9C5EEDC25BC2), - SPH_C64(0x2E96436D962E722E), SPH_C64(0x4B7A29627A4BDD4B), - SPH_C64(0xFE215DA321FE1FFE), SPH_C64(0x5716D5821657F957), - SPH_C64(0x1541BDA841153F15), SPH_C64(0x77B6E89FB6779977), - SPH_C64(0x37EB92A5EB375937), SPH_C64(0xE5569E7B56E532E5), - SPH_C64(0x9FD9138CD99FBC9F), SPH_C64(0xF01723D317F00DF0), - SPH_C64(0x4A7F206A7F4ADE4A), SPH_C64(0xDA95449E95DA73DA), - SPH_C64(0x5825A2FA2558E858), SPH_C64(0xC9CACF06CAC946C9), - SPH_C64(0x298D7C558D297B29), SPH_C64(0x0A225A50220A1E0A), - SPH_C64(0xB14F50E14FB1CEB1), SPH_C64(0xA01AC9691AA0FDA0), - SPH_C64(0x6BDA147FDA6BBD6B), SPH_C64(0x85ABD95CAB859285), - SPH_C64(0xBD733C8173BDDABD), SPH_C64(0x5D348FD2345DE75D), - SPH_C64(0x1050908050103010), SPH_C64(0xF40307F303F401F4), - SPH_C64(0xCBC0DD16C0CB40CB), SPH_C64(0x3EC6D3EDC63E423E), - SPH_C64(0x05112D2811050F05), SPH_C64(0x67E6781FE667A967), - SPH_C64(0xE453977353E431E4), SPH_C64(0x27BB0225BB276927), - SPH_C64(0x415873325841C341), SPH_C64(0x8B9DA72C9D8B808B), - SPH_C64(0xA701F65101A7F4A7), SPH_C64(0x7D94B2CF947D877D), - SPH_C64(0x95FB49DCFB95A295), SPH_C64(0xD89F568E9FD875D8), - SPH_C64(0xFB30708B30FB10FB), SPH_C64(0xEE71CD2371EE2FEE), - SPH_C64(0x7C91BBC7917C847C), SPH_C64(0x66E37117E366AA66), - SPH_C64(0xDD8E7BA68EDD7ADD), SPH_C64(0x174BAFB84B173917), - SPH_C64(0x474645024647C947), SPH_C64(0x9EDC1A84DC9EBF9E), - SPH_C64(0xCAC5D41EC5CA43CA), SPH_C64(0x2D995875992D772D), - SPH_C64(0xBF792E9179BFDCBF), SPH_C64(0x071B3F381B070907), - SPH_C64(0xAD23AC0123ADEAAD), SPH_C64(0x5A2FB0EA2F5AEE5A), - SPH_C64(0x83B5EF6CB5839883), SPH_C64(0x33FFB685FF335533), - SPH_C64(0x63F25C3FF263A563), 
SPH_C64(0x020A12100A020602), - SPH_C64(0xAA38933938AAE3AA), SPH_C64(0x71A8DEAFA8719371), - SPH_C64(0xC8CFC60ECFC845C8), SPH_C64(0x197DD1C87D192B19), - SPH_C64(0x49703B727049DB49), SPH_C64(0xD99A5F869AD976D9), - SPH_C64(0xF21D31C31DF20BF2), SPH_C64(0xE348A84B48E338E3), - SPH_C64(0x5B2AB9E22A5BED5B), SPH_C64(0x8892BC3492888588), - SPH_C64(0x9AC83EA4C89AB39A), SPH_C64(0x26BE0B2DBE266A26), - SPH_C64(0x32FABF8DFA325632), SPH_C64(0xB04A59E94AB0CDB0), - SPH_C64(0xE96AF21B6AE926E9), SPH_C64(0x0F337778330F110F), - SPH_C64(0xD5A633E6A6D562D5), SPH_C64(0x80BAF474BA809D80), - SPH_C64(0xBE7C27997CBEDFBE), SPH_C64(0xCDDEEB26DECD4ACD), - SPH_C64(0x34E489BDE4345C34), SPH_C64(0x4875327A7548D848), - SPH_C64(0xFF2454AB24FF1CFF), SPH_C64(0x7A8F8DF78F7A8E7A), - SPH_C64(0x90EA64F4EA90AD90), SPH_C64(0x5F3E9DC23E5FE15F), - SPH_C64(0x20A03D1DA0206020), SPH_C64(0x68D50F67D568B868), - SPH_C64(0x1A72CAD0721A2E1A), SPH_C64(0xAE2CB7192CAEEFAE), - SPH_C64(0xB45E7DC95EB4C1B4), SPH_C64(0x5419CE9A1954FC54), - SPH_C64(0x93E57FECE593A893), SPH_C64(0x22AA2F0DAA226622), - SPH_C64(0x64E96307E964AC64), SPH_C64(0xF1122ADB12F10EF1), - SPH_C64(0x73A2CCBFA2739573), SPH_C64(0x125A82905A123612), - SPH_C64(0x405D7A3A5D40C040), SPH_C64(0x0828484028081808), - SPH_C64(0xC3E89556E8C358C3), SPH_C64(0xEC7BDF337BEC29EC), - SPH_C64(0xDB904D9690DB70DB), SPH_C64(0xA11FC0611FA1FEA1), - SPH_C64(0x8D83911C838D8A8D), SPH_C64(0x3DC9C8F5C93D473D), - SPH_C64(0x97F15BCCF197A497), SPH_C64(0x0000000000000000), - SPH_C64(0xCFD4F936D4CF4CCF), SPH_C64(0x2B876E45872B7D2B), - SPH_C64(0x76B3E197B3769A76), SPH_C64(0x82B0E664B0829B82), - SPH_C64(0xD6A928FEA9D667D6), SPH_C64(0x1B77C3D8771B2D1B), - SPH_C64(0xB55B74C15BB5C2B5), SPH_C64(0xAF29BE1129AFECAF), - SPH_C64(0x6ADF1D77DF6ABE6A), SPH_C64(0x500DEABA0D50F050), - SPH_C64(0x454C57124C45CF45), SPH_C64(0xF31838CB18F308F3), - SPH_C64(0x30F0AD9DF0305030), SPH_C64(0xEF74C42B74EF2CEF), - SPH_C64(0x3FC3DAE5C33F413F), SPH_C64(0x551CC7921C55FF55), - SPH_C64(0xA210DB7910A2FBA2), 
SPH_C64(0xEA65E90365EA23EA), - SPH_C64(0x65EC6A0FEC65AF65), SPH_C64(0xBA6803B968BAD3BA), - SPH_C64(0x2F934A65932F712F), SPH_C64(0xC0E78E4EE7C05DC0), - SPH_C64(0xDE8160BE81DE7FDE), SPH_C64(0x1C6CFCE06C1C241C), - SPH_C64(0xFD2E46BB2EFD1AFD), SPH_C64(0x4D641F52644DD74D), - SPH_C64(0x92E076E4E092AB92), SPH_C64(0x75BCFA8FBC759F75), - SPH_C64(0x061E36301E060A06), SPH_C64(0x8A98AE24988A838A), - SPH_C64(0xB2404BF940B2CBB2), SPH_C64(0xE659856359E637E6), - SPH_C64(0x0E367E70360E120E), SPH_C64(0x1F63E7F8631F211F), - SPH_C64(0x62F75537F762A662), SPH_C64(0xD4A33AEEA3D461D4), - SPH_C64(0xA832812932A8E5A8), SPH_C64(0x96F452C4F496A796), - SPH_C64(0xF93A629B3AF916F9), SPH_C64(0xC5F6A366F6C552C5), - SPH_C64(0x25B11035B1256F25), SPH_C64(0x5920ABF22059EB59), - SPH_C64(0x84AED054AE849184), SPH_C64(0x72A7C5B7A7729672), - SPH_C64(0x39DDECD5DD394B39), SPH_C64(0x4C61165A614CD44C), - SPH_C64(0x5E3B94CA3B5EE25E), SPH_C64(0x78859FE785788878), - SPH_C64(0x38D8E5DDD8384838), SPH_C64(0x8C869814868C898C), - SPH_C64(0xD1B217C6B2D16ED1), SPH_C64(0xA50BE4410BA5F2A5), - SPH_C64(0xE24DA1434DE23BE2), SPH_C64(0x61F84E2FF861A361), - SPH_C64(0xB34542F145B3C8B3), SPH_C64(0x21A53415A5216321), - SPH_C64(0x9CD60894D69CB99C), SPH_C64(0x1E66EEF0661E221E), - SPH_C64(0x435261225243C543), SPH_C64(0xC7FCB176FCC754C7), - SPH_C64(0xFC2B4FB32BFC19FC), SPH_C64(0x0414242014040C04), - SPH_C64(0x5108E3B20851F351), SPH_C64(0x99C725BCC799B699), - SPH_C64(0x6DC4224FC46DB76D), SPH_C64(0x0D396568390D170D), - SPH_C64(0xFA35798335FA13FA), SPH_C64(0xDF8469B684DF7CDF), - SPH_C64(0x7E9BA9D79B7E827E), SPH_C64(0x24B4193DB4246C24), - SPH_C64(0x3BD7FEC5D73B4D3B), SPH_C64(0xAB3D9A313DABE0AB), - SPH_C64(0xCED1F03ED1CE4FCE), SPH_C64(0x1155998855113311), - SPH_C64(0x8F89830C898F8C8F), SPH_C64(0x4E6B044A6B4ED24E), - SPH_C64(0xB75166D151B7C4B7), SPH_C64(0xEB60E00B60EB20EB), - SPH_C64(0x3CCCC1FDCC3C443C), SPH_C64(0x81BFFD7CBF819E81), - SPH_C64(0x94FE40D4FE94A194), SPH_C64(0xF70C1CEB0CF704F7), - SPH_C64(0xB96718A167B9D6B9), 
SPH_C64(0x135F8B985F133513), - SPH_C64(0x2C9C517D9C2C742C), SPH_C64(0xD3B805D6B8D368D3), - SPH_C64(0xE75C8C6B5CE734E7), SPH_C64(0x6ECB3957CB6EB26E), - SPH_C64(0xC4F3AA6EF3C451C4), SPH_C64(0x030F1B180F030503), - SPH_C64(0x5613DC8A1356FA56), SPH_C64(0x44495E1A4944CC44), - SPH_C64(0x7F9EA0DF9E7F817F), SPH_C64(0xA937882137A9E6A9), - SPH_C64(0x2A82674D822A7E2A), SPH_C64(0xBB6D0AB16DBBD0BB), - SPH_C64(0xC1E28746E2C15EC1), SPH_C64(0x5302F1A20253F553), - SPH_C64(0xDC8B72AE8BDC79DC), SPH_C64(0x0B275358270B1D0B), - SPH_C64(0x9DD3019CD39DBA9D), SPH_C64(0x6CC12B47C16CB46C), - SPH_C64(0x31F5A495F5315331), SPH_C64(0x74B9F387B9749C74), - SPH_C64(0xF60915E309F607F6), SPH_C64(0x46434C0A4346CA46), - SPH_C64(0xAC26A50926ACE9AC), SPH_C64(0x8997B53C97898689), - SPH_C64(0x1444B4A044143C14), SPH_C64(0xE142BA5B42E13EE1), - SPH_C64(0x164EA6B04E163A16), SPH_C64(0x3AD2F7CDD23A4E3A), - SPH_C64(0x69D0066FD069BB69), SPH_C64(0x092D41482D091B09), - SPH_C64(0x70ADD7A7AD709070), SPH_C64(0xB6546FD954B6C7B6), - SPH_C64(0xD0B71ECEB7D06DD0), SPH_C64(0xED7ED63B7EED2AED), - SPH_C64(0xCCDBE22EDBCC49CC), SPH_C64(0x4257682A5742C642), - SPH_C64(0x98C22CB4C298B598), SPH_C64(0xA40EED490EA4F1A4), - SPH_C64(0x2888755D88287828), SPH_C64(0x5C3186DA315CE45C), - SPH_C64(0xF83F6B933FF815F8), SPH_C64(0x86A4C244A4869786) -}; - -#endif - -//static const sph_u64 old1_RC[10] = { -static const long long int old1_RC[10] = { - SPH_C64(0x4F01B887E8C62318), - SPH_C64(0x52916F79F5D2A636), - SPH_C64(0x357B0CA38E9BBC60), - SPH_C64(0x57FE4B2EC2D7E01D), - SPH_C64(0xDA4AF09FE5377715), - SPH_C64(0x856BA0B10A29C958), - SPH_C64(0x67053ECBF4105DBD), - SPH_C64(0xD8957DA78B4127E4), - SPH_C64(0x9E4717DD667CEEFB), - SPH_C64(0x33835AAD07BF2DCA) -}; - -/* ====================================================================== */ - -#define DECL8(z) \ - __m256i z ## 0, z ## 1, z ## 2, z ## 3, \ - z ## 4, z ## 5, z ## 6, z ## 7 - -#if SPH_LITTLE_FAST - -#define READ_DATA_W(x) \ -do { \ - n ## x = ((__m256i*)src)[x]; \ -} while (0) - -#define 
UPDATE_STATE_W(x) \ -do { \ - state[x] = _mm256_xor_si256( state[x], \ - _mm256_xor_si256( n ## x, ((__m256i*)src)[x] ) ); \ -} while (0) - -#define LVARS DECL8(n); DECL8(h); - -#else - -#define READ_DATA_W(x) \ -do { \ - sn ## x = n ## x = (__m256i*)src + (x); \ -} while (0) - -#define UPDATE_STATE_W(x) \ -do { \ - state[x] = _mm256_xor_si256( state[x], \ - _mm256_xor_si256( n ## x, sn ## x ) ); \ -} while (0) - - -#define LVARS DECL8(n); DECL8(sn); DECL8(h); -#endif - -#define READ_STATE_W(x) \ -do { \ - h ## x = state[x]; \ -} while (0) - -#define MUL8(FUN) \ -do { \ - FUN(0); \ - FUN(1); \ - FUN(2); \ - FUN(3); \ - FUN(4); \ - FUN(5); \ - FUN(6); \ - FUN(7); \ -} while (0) - -/* - * First operation: XOR the input data with the first round key. - */ -#define ROUND0_W(x) \ -do { \ - n ## x = _mm256_xor_si256( n ## x, h ## x ); \ -} while (0) - -#define READ_DATA MUL8(READ_DATA_W) -#define READ_STATE MUL8(READ_STATE_W) -#define ROUND0 MUL8(ROUND0_W) -#define UPDATE_STATE MUL8(UPDATE_STATE_W) -/* -#define BYTE(x, n) \ - _mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) ) -*/ -#define BYTE(x, n) ((unsigned)((x) >> (8 * (n))) & 0xFF) - - -// A very complex, but structured, expression with a mix of scalar -// and vector operations to retrieve specific 64 bit constants from -// a scalar array referenced by "table". The constants are retrieved -// based on indexes provided by vector "in" and placed in a vector -// to be returned. The entire operation is broken down into three macro. - -// Extract 64 bit vector elements from "in" representing offsets. Unmask the -// low byte of each and scale for use as vector indexes. -// Pack the data in a vector and return it. - -/* -#define t_row( inv, row ) \ - _mm256_and_si256( \ - _mm256_srli_epi64( inv, row << 3 ), _mm256_set1_epi64x( 0xFF ) ) -*/ - -// Build a vector from elements of non-contiguous 64 bit data extracted from -// scalar "table". 
-// reference scalar version 1480 kH/s -/* -// version 1, extract with gather -// 955 kH/s -#define t_lane( inv, row, lane ) \ - BYTE( _mm256_extract_epi64( inv, lane ), row ) \ - - -#define t_vec( table, inv, row ) \ - _mm256_i32gather_epi64( table, _mm_set_epi32( t_lane( inv, row, 3 ), \ - t_lane( inv, row, 2 ), t_lane( inv, row, 1 ), \ - t_lane( inv, row, 0) ), 1 ) -*/ -/* -// version 2, extract with set -// 1100 kH/s -#define t_lane( table, inv, row, lane ) \ - table[ BYTE( _mm256_extract_epi64( inv, lane ), row ) ] \ - -#define t_vec( table, inv, row ) \ - _mm256_set_epi64x( t_lane( table, inv, row, 3 ), \ - t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \ - t_lane( table, inv, row, 0 ) ) -*/ - -// version 3, vector indexing with set -// 1105 kH/s -#define t_lane( table, inv, row, lane ) \ - table[ BYTE( inv[ lane ], row ) ] \ - -#define t_vec( table, inv, row ) \ - _mm256_set_epi64x( t_lane( table, inv, row, 3 ), \ - t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \ - t_lane( table, inv, row, 0 ) ) - - - -#if SPH_SMALL_FOOTPRINT_WHIRLPOOL - -static inline __m256i -table_skew( __m256i val, int num ) -{ - return mm256_rol_64( val, 8*num ); -} - -#define ROUND_ELT( table, in, i0, i1, i2, i3, i4, i5, i6, i7 ) \ - (_mm256_xor_si256( t_vec( table ## 0, in ## i0, 0 ), \ - _mm256_xor_si256( table_skew( t_vec( table ## 0, in ## i1, 1 ), 1 ), \ - _mm256_xor_si256( table_skew( t_vec( table ## 0, in ## i2, 2 ), 2 ), \ - _mm256_xor_si256( table_skew( t_vec( table ## 0, in ## i3, 3 ), 3 ), \ - _mm256_xor_si256( table_skew( t_vec( table ## 0, in ## i4, 4 ), 4 ), \ - _mm256_xor_si256( table_skew( t_vec( table ## 0, in ## i5, 5 ), 5 ), \ - _mm256_xor_si256( table_skew( t_vec( table ## 0, in ## i6, 6 ), 6 ), \ - table_skew( t_vec( table ## 0, in ## i7, 7 ), 7 ) \ - )))))))) - -#else - -#define ROUND_ELT(table, in, i0, i1, i2, i3, i4, i5, i6, i7) \ - (_mm256_xor_si256( t_vec( table ## 0, in ## i0, 0 ), \ - _mm256_xor_si256( t_vec( table ## 1, in ## i1, 
1 ), \ - _mm256_xor_si256( t_vec( table ## 2, in ## i2, 2 ), \ - _mm256_xor_si256( t_vec( table ## 3, in ## i3, 3 ), \ - _mm256_xor_si256( t_vec( table ## 4, in ## i4, 4 ), \ - _mm256_xor_si256( t_vec( table ## 5, in ## i5, 5 ), \ - _mm256_xor_si256( t_vec( table ## 6, in ## i6, 6 ), \ - t_vec( table ## 7, in ## i7, 7 ) )))))))) - -#endif - - -#define ROUND(table, in, out, c0, c1, c2, c3, c4, c5, c6, c7) \ -do { \ - out ## 0 = _mm256_xor_si256( \ - ROUND_ELT(table, in, 0, 7, 6, 5, 4, 3, 2, 1), c0 ); \ - out ## 1 = _mm256_xor_si256( \ - ROUND_ELT(table, in, 1, 0, 7, 6, 5, 4, 3, 2), c1 ); \ - out ## 2 = _mm256_xor_si256( \ - ROUND_ELT(table, in, 2, 1, 0, 7, 6, 5, 4, 3), c2 ); \ - out ## 3 = _mm256_xor_si256( \ - ROUND_ELT(table, in, 3, 2, 1, 0, 7, 6, 5, 4), c3 ); \ - out ## 4 = _mm256_xor_si256( \ - ROUND_ELT(table, in, 4, 3, 2, 1, 0, 7, 6, 5), c4 ); \ - out ## 5 = _mm256_xor_si256( \ - ROUND_ELT(table, in, 5, 4, 3, 2, 1, 0, 7, 6), c5 ); \ - out ## 6 = _mm256_xor_si256( \ - ROUND_ELT(table, in, 6, 5, 4, 3, 2, 1, 0, 7), c6 ); \ - out ## 7 = _mm256_xor_si256( \ - ROUND_ELT(table, in, 7, 6, 5, 4, 3, 2, 1, 0), c7 ); \ -} while (0) - -#define ROUND_KSCHED(table, in, out, c) \ - ROUND(table, in, out, c, _mm256_setzero_si256(), _mm256_setzero_si256(), \ - _mm256_setzero_si256(), _mm256_setzero_si256(), \ - _mm256_setzero_si256(), _mm256_setzero_si256(), \ - _mm256_setzero_si256() ) - - -#define ROUND_WENC(table, in, key, out) \ - ROUND(table, in, out, key ## 0, key ## 1, key ## 2, key ## 3, \ - key ## 4, key ## 5, key ## 6, key ## 7 ) - -#define TRANSFER(dst, src) \ -do { \ - dst ## 0 = src ## 0; \ - dst ## 1 = src ## 1; \ - dst ## 2 = src ## 2; \ - dst ## 3 = src ## 3; \ - dst ## 4 = src ## 4; \ - dst ## 5 = src ## 5; \ - dst ## 6 = src ## 6; \ - dst ## 7 = src ## 7; \ -} while (0) - -void -whirlpool_4way_init(void *cc) -{ - whirlpool_4way_context *sc = cc;; - memset_zero_256( sc->state, 8 ); - sc->count = 0; -} - - -#define ROUND_FUN(name, type) \ -static void \ -name ## 
_round( const void *src, __m256i *state ) \ -{ \ - LVARS \ - int r; \ - READ_DATA; \ - READ_STATE; \ - ROUND0; \ - for (r = 0; r < 10; r ++) { \ - DECL8(tmp); \ - ROUND_KSCHED( type ## _T, h, tmp, _mm256_set1_epi64x( type ## _RC[r] ) ); \ - TRANSFER( h, tmp ); \ - ROUND_WENC( type ## _T, n, h, tmp ); \ - TRANSFER( n, tmp ); \ - } \ - UPDATE_STATE; \ -} - -ROUND_FUN(whirlpool_4way, plain) -ROUND_FUN(whirlpool0_4way, old0) -ROUND_FUN(whirlpool1_4way, old1) - -/* - * We want big-endian encoding of the message length, over 256 bits. BE64 - * triggers that. However, our block length is 512 bits, not 1024 bits. - * Internally, our encoding/decoding is little-endian, which is not a - * problem here since we also deactivate output in md_helper.c. - */ -#define BE64 1 -#define SVAL sc->state -#define BLEN 64U -#define PLW4 1 - -#define RFUN whirlpool_4way_round -#define HASH whirlpool_4way -#include "md-helper-4way.c" -#undef RFUN -#undef HASH - -#define RFUN whirlpool0_4way_round -#define HASH whirlpool0_4way -#include "md-helper-4way.c" -#undef RFUN -#undef HASH - -#define RFUN whirlpool1_4way_round -#define HASH whirlpool1_4way -#include "md-helper-4way.c" -#undef RFUN -#undef HASH - -#define MAKE_CLOSE(name) \ -void \ -name ## _close(void *cc, void *dst) \ -{ \ - name ## _context *sc; \ - int i; \ - name ## _mdclose(cc, dst, 0); \ - sc = cc; \ - for (i = 0; i < 8; i ++) \ - ((__m256i*)dst)[i] = sc->state[i]; \ -} - -MAKE_CLOSE(whirlpool_4way) -MAKE_CLOSE(whirlpool0_4way) -MAKE_CLOSE(whirlpool1_4way) - -#ifdef __cplusplus -} -#endif -#endif diff --git a/algo/whirlpool/whirlpool-hash-4way.h b/algo/whirlpool/whirlpool-hash-4way.h deleted file mode 100644 index 2c2fb3b..0000000 --- a/algo/whirlpool/whirlpool-hash-4way.h +++ /dev/null @@ -1,108 +0,0 @@ -/* $Id: sph_whirlpool.h 216 2010-06-08 09:46:57Z tp $ */ -/** - * WHIRLPOOL interface. 
- * - * WHIRLPOOL knows three variants, dubbed "WHIRLPOOL-0" (original - * version, published in 2000, studied by NESSIE), "WHIRLPOOL-1" - * (first revision, 2001, with a new S-box) and "WHIRLPOOL" (current - * version, 2003, with a new diffusion matrix, also described as "plain - * WHIRLPOOL"). All three variants are implemented here. - * - * The original WHIRLPOOL (i.e. WHIRLPOOL-0) was published in: P. S. L. - * M. Barreto, V. Rijmen, "The Whirlpool Hashing Function", First open - * NESSIE Workshop, Leuven, Belgium, November 13--14, 2000. - * - * The current WHIRLPOOL specification and a reference implementation - * can be found on the WHIRLPOOL web page: - * http://paginas.terra.com.br/informatica/paulobarreto/WhirlpoolPage.html - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @file sph_whirlpool.h - * @author Thomas Pornin - */ - -#ifndef WHIRLPOOL_HASH_4WAY_H__ -#define WHIRLPOOL_HASH_4WAY_H__ - -#ifdef __AVX2__ - -#include -#include "algo/sha/sph_types.h" -#include "simd-utils.h" - -/** - * Output size (in bits) for WHIRLPOOL. - */ -#define SPH_SIZE_whirlpool 512 - -/** - * Output size (in bits) for WHIRLPOOL-0. - */ -#define SPH_SIZE_whirlpool0 512 - -/** - * Output size (in bits) for WHIRLPOOL-1. - */ -#define SPH_SIZE_whirlpool1 512 - -typedef struct { - __m256i buf[8] __attribute__ ((aligned (64))); - __m256i state[8]; - sph_u64 count; -} whirlpool_4way_context; - -void whirlpool_4way_init( void *cc ); - -void whirlpool_4way( void *cc, const void *data, size_t len ); - -void whirlpool_4way_close( void *cc, void *dst ); - -/** - * WHIRLPOOL-0 uses the same structure than plain WHIRLPOOL. - */ -typedef whirlpool_4way_context whirlpool0_4way_context; - -#define whirlpool0_4way_init whirlpool_4way_init - -void whirlpool0_4way( void *cc, const void *data, size_t len ); - -void whirlpool0_4way_close( void *cc, void *dst ); - -/** - * WHIRLPOOL-1 uses the same structure than plain WHIRLPOOL. 
- */ -typedef whirlpool_4way_context whirlpool1_4way_context; - -#define whirlpool1_4way_init whirlpool_4way_init - -void whirlpool1_4way(void *cc, const void *data, size_t len); - -void whirlpool1_4way_close(void *cc, void *dst); - -#endif - -#endif diff --git a/algo/whirlpool/whirlpool.c b/algo/whirlpool/whirlpool.c deleted file mode 100644 index 18f38c4..0000000 --- a/algo/whirlpool/whirlpool.c +++ /dev/null @@ -1,98 +0,0 @@ -#include "whirlpool-gate.h" -#include -#include -#include -#include -#include "sph_whirlpool.h" - -typedef struct { - sph_whirlpool_context whirl1; - sph_whirlpool_context whirl2; - sph_whirlpool_context whirl3; - sph_whirlpool_context whirl4; -} whirlpool_ctx_holder; - -static whirlpool_ctx_holder whirl_ctx; -static __thread sph_whirlpool_context whirl1_mid_ctx; - -void init_whirlpool_ctx() -{ - sph_whirlpool1_init( &whirl_ctx.whirl1 ); - sph_whirlpool1_init( &whirl_ctx.whirl2 ); - sph_whirlpool1_init( &whirl_ctx.whirl3 ); - sph_whirlpool1_init( &whirl_ctx.whirl4 ); -} - -void whirlpool_hash(void *state, const void *input) -{ - whirlpool_ctx_holder ctx; - memcpy( &ctx, &whirl_ctx, sizeof(whirl_ctx) ); - - const int midlen = 64; - const int tail = 80 - midlen; - unsigned char hash[128]; // uint32_t hashA[16], hashB[16]; - #define hashB hash+64 - - // copy cached midstate - memcpy( &ctx.whirl1, &whirl1_mid_ctx, sizeof whirl1_mid_ctx ); - sph_whirlpool1( &ctx.whirl1, input + midlen, tail ); - sph_whirlpool1_close(&ctx.whirl1, hash); - - sph_whirlpool1(&ctx.whirl2, hash, 64); - sph_whirlpool1_close(&ctx.whirl2, hashB); - - sph_whirlpool1(&ctx.whirl3, hashB, 64); - sph_whirlpool1_close(&ctx.whirl3, hash); - - sph_whirlpool1(&ctx.whirl4, hash, 64); - sph_whirlpool1_close(&ctx.whirl4, hash); - - memcpy(state, hash, 32); -} - -void whirlpool_midstate( const void* input ) -{ - memcpy( &whirl1_mid_ctx, &whirl_ctx.whirl1, sizeof whirl1_mid_ctx ); - sph_whirlpool1( &whirl1_mid_ctx, input, 64 ); -} - - -int scanhash_whirlpool( struct work* work, 
uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(128) endiandata[20]; - uint32_t* pdata = work->data; - uint32_t* ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce - 1; - int thr_id = mythr->id; // thr_id arg is deprecated - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; - - for (int i=0; i < 19; i++) - be32enc(&endiandata[i], pdata[i]); - - whirlpool_midstate( endiandata ); - - do { - const uint32_t Htarg = ptarget[7]; - uint32_t vhash[8]; - pdata[19] = ++n; - be32enc(&endiandata[19], n ); - whirlpool_hash(vhash, endiandata); - - if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) - { - work_set_target_ratio(work, vhash); - *hashes_done = n - first_nonce + 1; - return true; - } - - } while ( n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - diff --git a/algo/whirlpool/whirlpoolx.c b/algo/whirlpool/whirlpoolx.c deleted file mode 100644 index 5c82acd..0000000 --- a/algo/whirlpool/whirlpoolx.c +++ /dev/null @@ -1,71 +0,0 @@ -#include "algo-gate-api.h" - -#include -#include -#include -#include -#include "sph_whirlpool.h" - -void whirlpoolx_hash(void *state, const void *input) -{ - sph_whirlpool_context ctx_whirlpool; - - unsigned char hash[64]; -// unsigned char hash_xored[32]; - - sph_whirlpool1_init(&ctx_whirlpool); - sph_whirlpool1(&ctx_whirlpool, input, 80); - sph_whirlpool1_close(&ctx_whirlpool, hash); - - // compress the 48 first bytes of the hash to 32 -// for (int i = 0; i < 32; i++) -// hash_xored[i] = hash[i] ^ hash[i + 16]; - - memcpy(state, hash, 32); -} - -int scanhash_whirlpoolx( struct work* work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(128) endiandata[20]; - uint32_t* pdata = work->data; - uint32_t* ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce - 1; - int thr_id = mythr->id; // thr_id arg 
is deprecated - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; - - for (int i=0; i < 19; i++) - be32enc(&endiandata[i], pdata[i]); - - do { - const uint32_t Htarg = ptarget[7]; - uint32_t vhash[8]; - pdata[19] = ++n; - be32enc(&endiandata[19], n ); - whirlpoolx_hash(vhash, endiandata); - - if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) - { - work_set_target_ratio(work, vhash); - *hashes_done = n - first_nonce + 1; - return true; - } - - } while ( n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - -bool register_whirlpoolx_algo( algo_gate_t* gate ) -{ - algo_not_tested(); - gate->scanhash = (void*)&scanhash_whirlpoolx; - gate->hash = (void*)&whirlpoolx_hash; - return true; -}; - diff --git a/algo/x11/c11-4way.c b/algo/x11/c11-4way.c deleted file mode 100644 index e2922bc..0000000 --- a/algo/x11/c11-4way.c +++ /dev/null @@ -1,210 +0,0 @@ -#include "cpuminer-config.h" -#include "c11-gate.h" - -#if defined (C11_4WAY) - -#include -#include - -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/luffa-hash-2way.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/simd/simd-hash-2way.h" -#include "algo/echo/aes_ni/hash_api.h" - -typedef struct { - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_groestl groestl; - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; - luffa_2way_context luffa; - cubehashParam cube; - sph_shavite512_context shavite; - simd_2way_context simd; - hashState_echo echo; -} c11_4way_ctx_holder; - -c11_4way_ctx_holder c11_4way_ctx; - -void init_c11_4way_ctx() -{ - blake512_4way_init( &c11_4way_ctx.blake ); - bmw512_4way_init( &c11_4way_ctx.bmw ); - 
init_groestl( &c11_4way_ctx.groestl, 64 ); - skein512_4way_init( &c11_4way_ctx.skein ); - jh512_4way_init( &c11_4way_ctx.jh ); - keccak512_4way_init( &c11_4way_ctx.keccak ); - luffa_2way_init( &c11_4way_ctx.luffa, 512 ); - cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &c11_4way_ctx.shavite ); - simd_2way_init( &c11_4way_ctx.simd, 512 ); - init_echo( &c11_4way_ctx.echo, 512 ); -} - -void c11_4way_hash( void *state, const void *input ) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - uint64_t vhashB[8*2] __attribute__ ((aligned (64))); - c11_4way_ctx_holder ctx; - memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) ); - - // 1 Blake 4way - blake512_4way( &ctx.blake, input, 80 ); - blake512_4way_close( &ctx.blake, vhash ); - - // 2 Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - // Serial - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // 3 Groestl - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - - // 4way - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - // 4 JH - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - // 5 Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - - // 6 Skein - skein512_4way( &ctx.skein, vhash, 
64 ); - skein512_4way_close( &ctx.skein, vhash ); - - // Serial - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // 7 Luffa - intrlv_2x128( vhash, hash0, hash1, 512 ); - intrlv_2x128( vhashB, hash2, hash3, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - dintrlv_2x128( hash2, hash3, vhashB, 512 ); - - // 8 Cubehash - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); - memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); - memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 ); - memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); - - // 9 Shavite - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - memcpy( &ctx.shavite, &c11_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - memcpy( &ctx.shavite, &c11_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - memcpy( &ctx.shavite, &c11_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - - // 10 Simd - intrlv_2x128( vhash, hash0, hash1, 512 ); - intrlv_2x128( vhashB, hash2, hash3, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - dintrlv_2x128( hash2, hash3, vhashB, 512 ); - - // 11 Echo - 
update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - - memcpy( state, hash0, 32 ); - memcpy( state+32, hash1, 32 ); - memcpy( state+64, hash2, 32 ); - memcpy( state+96, hash3, 32 ); -} - -int scanhash_c11_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - __m256i *noncev = (__m256i*)vdata + 9; // aligned - const uint32_t Htarg = ptarget[7]; - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - - for (int m=0; m < 6; m++) - if (Htarg <= htmax[m]) - { - uint32_t mask = masks[m]; - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - c11_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( ( ( (hash+(i<<3))[7] & mask ) == 0 ) - && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } - - *hashes_done 
= n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/x11/c11-gate.c b/algo/x11/c11-gate.c deleted file mode 100644 index 30c719b..0000000 --- a/algo/x11/c11-gate.c +++ /dev/null @@ -1,18 +0,0 @@ -#include "c11-gate.h" - -bool register_c11_algo( algo_gate_t* gate ) -{ -#if defined (C11_4WAY) - init_c11_4way_ctx(); - gate->scanhash = (void*)&scanhash_c11_4way; - gate->hash = (void*)&c11_4way_hash; -#else - init_c11_ctx(); - gate->scanhash = (void*)&scanhash_c11; - gate->hash = (void*)&c11_hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -}; - diff --git a/algo/x11/c11-gate.h b/algo/x11/c11-gate.h deleted file mode 100644 index e4f88a5..0000000 --- a/algo/x11/c11-gate.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef C11_GATE_H__ -#define C11_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define C11_4WAY -#endif - -bool register_c11_algo( algo_gate_t* gate ); - -#if defined(C11_4WAY) - -void c11_4way_hash( void *state, const void *input ); - -int scanhash_c11_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void init_c11_4way_ctx(); - -#endif - -void c11_hash( void *state, const void *input ); - -int scanhash_c11( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void init_c11_ctx(); - -#endif - diff --git a/algo/x11/c11.c b/algo/x11/c11.c deleted file mode 100644 index c51f567..0000000 --- a/algo/x11/c11.c +++ /dev/null @@ -1,175 +0,0 @@ -#include "c11-gate.h" - -#include -#include -#include -#include - -#include "algo/blake/sph_blake.h" -#include "algo/bmw/sph_bmw.h" -#include "algo/groestl/sph_groestl.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" -#include "algo/luffa/sph_luffa.h" -#include "algo/cubehash/sph_cubehash.h" -#include "algo/shavite/sph_shavite.h" -#include 
"algo/simd/sph_simd.h" -#include "algo/echo/sph_echo.h" - -#ifndef NO_AES_NI - #include "algo/groestl/aes_ni/hash-groestl.h" - #include "algo/echo/aes_ni/hash_api.h" -#endif - -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" - - -typedef struct { - sph_shavite512_context shavite; - sph_skein512_context skein; -#ifdef NO_AES_NI - sph_groestl512_context groestl; - sph_echo512_context echo; -#else - hashState_echo echo; - hashState_groestl groestl; -#endif - hashState_luffa luffa; - cubehashParam cube; - hashState_sd simd; -} c11_ctx_holder; - -c11_ctx_holder c11_ctx __attribute__ ((aligned (64))); - -void init_c11_ctx() -{ - init_luffa( &c11_ctx.luffa, 512 ); - cubehashInit( &c11_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &c11_ctx.shavite ); - init_sd( &c11_ctx.simd, 512 ); -#ifdef NO_AES_NI - sph_groestl512_init( &c11_ctx.groestl ); - sph_echo512_init( &c11_ctx.echo ); -#else - init_echo( &c11_ctx.echo, 512 ); - init_groestl( &c11_ctx.groestl, 64 ); -#endif -} - -void c11_hash( void *output, const void *input ) -{ - unsigned char hash[128] _ALIGN(64); // uint32_t hashA[16], hashB[16]; -// uint32_t _ALIGN(64) hash[16]; - - c11_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &c11_ctx, sizeof(c11_ctx) ); - - size_t hashptr; - unsigned char hashbuf[128]; - sph_u64 hashctA; - sph_u64 hashctB; - - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; - - DECL_BMW; - BMW_I; - BMW_U; - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - BMW_C; - #undef M - #undef H - #undef dH - -#ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); -#else - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); -#endif - - 
DECL_JH; - JH_H; - - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; - - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; - - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash+64, - (const BitSequence*)hash, 64 ); - - cubehashUpdateDigest( &ctx.cube, (byte*)hash, - (const byte*)hash+64, 64 ); - - sph_shavite512( &ctx.shavite, hash, 64); - sph_shavite512_close( &ctx.shavite, hash+64); - - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash+64, 512 ); - -#ifdef NO_AES_NI - sph_echo512 (&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hash+64); -#else - update_final_echo ( &ctx.echo, (BitSequence *)hash+64, - (const BitSequence *)hash, 512 ); -#endif - - memcpy(output, hash+64, 32); -} - -int scanhash_c11( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t hash[8] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - uint32_t nonce = first_nonce; - int thr_id = mythr->id; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0cff; - - swab32_array( endiandata, pdata, 20 ); - - do - { - be32enc( &endiandata[19], nonce ); - c11_hash( hash, endiandata ); - if ( hash[7] <= Htarg && fulltest(hash, ptarget) ) - { - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce; - work_set_target_ratio( work, hash ); - return 1; - } - nonce++; - } while ( nonce < max_nonce && !(*restart) ); - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} - diff --git a/algo/x11/fresh.c b/algo/x11/fresh.c deleted file mode 100644 index 79491c6..0000000 --- a/algo/x11/fresh.c +++ /dev/null @@ -1,138 +0,0 @@ -#include "algo-gate-api.h" - -#include -#include -#include -#include - -#include "algo/shavite/sph_shavite.h" -#include "algo/simd/sph_simd.h" -#include 
"algo/echo/sph_echo.h" - -//#define DEBUG_ALGO - -extern void freshhash(void* output, const void* input, uint32_t len) -{ - unsigned char hash[128]; // uint32_t hashA[16], hashB[16]; - #define hashA hash - #define hashB hash+64 - - sph_shavite512_context ctx_shavite; - sph_simd512_context ctx_simd; - sph_echo512_context ctx_echo; - - sph_shavite512_init(&ctx_shavite); - sph_shavite512(&ctx_shavite, input, len); - sph_shavite512_close(&ctx_shavite, hashA); - - sph_simd512_init(&ctx_simd); - sph_simd512(&ctx_simd, hashA, 64); - sph_simd512_close(&ctx_simd, hashB); - - sph_shavite512_init(&ctx_shavite); - sph_shavite512(&ctx_shavite, hashB, 64); - sph_shavite512_close(&ctx_shavite, hashA); - - sph_simd512_init(&ctx_simd); - sph_simd512(&ctx_simd, hashA, 64); - sph_simd512_close(&ctx_simd, hashB); - - sph_echo512_init(&ctx_echo); - sph_echo512(&ctx_echo, hashB, 64); - sph_echo512_close(&ctx_echo, hashA); - - memcpy(output, hash, 32); -} - -int scanhash_fresh( struct work *work, - uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t len = 80; - int thr_id = mythr->id; // thr_id arg is deprecated - - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; -#ifdef _MSC_VER - uint32_t __declspec(align(32)) hash64[8]; -#else - uint32_t hash64[8] __attribute__((aligned(32))); -#endif - uint32_t endiandata[32]; - - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - // we need bigendian data... 
- for (int k = 0; k < 19; k++) - be32enc(&endiandata[k], pdata[k]); - -#ifdef DEBUG_ALGO - if (Htarg != 0) - printf("[%d] Htarg=%X\n", thr_id, Htarg); -#endif - for (int m=0; m < 6; m++) { - if (Htarg <= htmax[m]) { - uint32_t mask = masks[m]; - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - freshhash(hash64, endiandata, len); -#ifndef DEBUG_ALGO - if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } -#else - if (!(n % 0x1000) && !thr_id) printf("."); - if (!(hash64[7] & mask)) { - printf("[%d]",thr_id); - if (fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } -#endif - } while (n < max_nonce && !work_restart[thr_id].restart); - // see blake.c if else to understand the loop on htmax => mask - break; - } - } - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - -void fresh_set_target( struct work* work, double job_diff ) -{ - work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); -} - - -bool register_fresh_algo( algo_gate_t* gate ) -{ - algo_not_tested(); - gate->scanhash = (void*)&scanhash_fresh; - gate->hash = (void*)&freshhash; - gate->set_target = (void*)&fresh_set_target; - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -}; - diff --git a/algo/x11/timetravel-4way.c b/algo/x11/timetravel-4way.c deleted file mode 100644 index ba3199c..0000000 --- a/algo/x11/timetravel-4way.c +++ /dev/null @@ -1,233 +0,0 @@ -#include "timetravel-gate.h" - -#if defined(TIMETRAVEL_4WAY) - -#include -#include -#include -#include -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/luffa-hash-2way.h" -#include "algo/cubehash/cubehash_sse2.h" - -static __thread uint32_t s_ntime = UINT32_MAX; -static __thread int 
permutation[TT8_FUNC_COUNT] = { 0 }; - -typedef struct { - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_groestl groestl; - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; - luffa_2way_context luffa; - cubehashParam cube; -} tt8_4way_ctx_holder; - -tt8_4way_ctx_holder tt8_4way_ctx __attribute__ ((aligned (64))); - -void init_tt8_4way_ctx() -{ - blake512_4way_init( &tt8_4way_ctx.blake ); - bmw512_4way_init( &tt8_4way_ctx.bmw ); - init_groestl( &tt8_4way_ctx.groestl, 64 ); - skein512_4way_init( &tt8_4way_ctx.skein ); - jh512_4way_init( &tt8_4way_ctx.jh ); - keccak512_4way_init( &tt8_4way_ctx.keccak ); - luffa_2way_init( &tt8_4way_ctx.luffa, 512 ); - cubehashInit( &tt8_4way_ctx.cube, 512, 16, 32 ); -}; - -void timetravel_4way_hash(void *output, const void *input) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhashX[8*4] __attribute__ ((aligned (64))); - uint64_t vhashY[8*4] __attribute__ ((aligned (64))); - uint64_t *vhashA, *vhashB; - tt8_4way_ctx_holder ctx __attribute__ ((aligned (64))); - uint32_t dataLen = 64; - int i; - - memcpy( &ctx, &tt8_4way_ctx, sizeof(tt8_4way_ctx) ); - - for ( i = 0; i < TT8_FUNC_COUNT; i++ ) - { - if (i == 0) - { - dataLen = 80; - vhashA = (uint64_t*)input; - vhashB = vhashX; - } - else - { - dataLen = 64; - if ( i % 2 == 0 ) - { - vhashA = vhashY; - vhashB = vhashX; - } - else - { - vhashA = vhashX; - vhashB = vhashY; - } - } - - switch ( permutation[i] ) - { - case 0: - blake512_4way( &ctx.blake, vhashA, dataLen ); - blake512_4way_close( &ctx.blake, vhashB ); - if ( i == 7 ) - dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); - break; - case 1: - bmw512_4way( &ctx.bmw, vhashA, dataLen ); - bmw512_4way_close( &ctx.bmw, vhashB ); - if ( i == 7 ) - dintrlv_4x64( hash0, hash1, hash2, hash3, 
vhashB, dataLen<<3 ); - break; - case 2: - dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (char*)hash0, dataLen<<3 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (char*)hash1, dataLen<<3 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (char*)hash2, dataLen<<3 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (char*)hash3, dataLen<<3 ); - if ( i != 7 ) - intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); - break; - case 3: - skein512_4way( &ctx.skein, vhashA, dataLen ); - skein512_4way_close( &ctx.skein, vhashB ); - if ( i == 7 ) - dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); - break; - case 4: - jh512_4way( &ctx.jh, vhashA, dataLen ); - jh512_4way_close( &ctx.jh, vhashB ); - if ( i == 7 ) - dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); - break; - case 5: - keccak512_4way( &ctx.keccak, vhashA, dataLen ); - keccak512_4way_close( &ctx.keccak, vhashB ); - if ( i == 7 ) - dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); - break; - case 6: - dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); - intrlv_2x128( vhashA, hash0, hash1, dataLen<<3 ); - luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); - dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 ); - intrlv_2x128( vhashA, hash2, hash3, dataLen<<3 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); - dintrlv_2x128( hash2, hash3, vhashA, dataLen<<3 ); - if ( i != 7 ) - intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); - break; - case 7: - dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, - (const byte*)hash0, dataLen ); - memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) ); - 
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, - (const byte*)hash1, dataLen ); - memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, - (const byte*)hash2, dataLen ); - memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, - (const byte*)hash3, dataLen ); - if ( i != 7 ) - intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); - break; - default: - applog(LOG_ERR,"SWERR: timetravel invalid permutation"); - break; - } - } - - memcpy( output, hash0, 32 ); - memcpy( output+32, hash1, 32 ); - memcpy( output+64, hash2, 32 ); - memcpy( output+96, hash3, 32 ); -} - -int scanhash_timetravel_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - uint32_t *noncep = vdata + 73; // 9*8 + 1 - const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - volatile uint8_t *restart = &(work_restart[thr_id].restart); - int i; - - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - for ( int k = 0; k < 19; k++ ) - be32enc( &endiandata[k], pdata[k] ); - - const uint32_t timestamp = endiandata[17]; - if ( timestamp != s_ntime ) - { - const int steps = ( timestamp - TT8_FUNC_BASE_TIMESTAMP ) - % TT8_FUNC_COUNT_PERMUTATIONS; - for ( i = 0; i < TT8_FUNC_COUNT; i++ ) - permutation[i] = i; - for ( i = 0; i < steps; i++ ) - tt8_next_permutation( permutation, permutation + TT8_FUNC_COUNT ); - s_ntime = timestamp; - } - - uint64_t *edata = (uint64_t*)endiandata; - intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); - - do - { - be32enc( noncep, n ); - be32enc( noncep+2, n+1 ); - be32enc( 
noncep+4, n+2 ); - be32enc( noncep+6, n+3 ); - - timetravel_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) - && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !(*restart) ); - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/x11/timetravel-gate.c b/algo/x11/timetravel-gate.c deleted file mode 100644 index f2371ca..0000000 --- a/algo/x11/timetravel-gate.c +++ /dev/null @@ -1,78 +0,0 @@ -#include "timetravel-gate.h" - -void tt8_set_target( struct work* work, double job_diff ) -{ - work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); -} - -bool register_timetravel_algo( algo_gate_t* gate ) -{ -#ifdef TIMETRAVEL_4WAY - init_tt8_4way_ctx(); - gate->scanhash = (void*)&scanhash_timetravel_4way; - gate->hash = (void*)&timetravel_4way_hash; -#else - init_tt8_ctx(); - gate->scanhash = (void*)&scanhash_timetravel; - gate->hash = (void*)&timetravel_hash; -#endif - gate->set_target = (void*)&tt8_set_target; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->get_max64 = (void*)&get_max64_0xffffLL; - return true; -}; - -inline void tt_swap( int *a, int *b ) -{ - int c = *a; - *a = *b; - *b = c; -} - -inline void reverse( int *pbegin, int *pend ) -{ - while ( (pbegin != pend) && (pbegin != --pend) ) - { - tt_swap( pbegin, pend ); - pbegin++; - } -} - -void tt8_next_permutation( int *pbegin, int *pend ) -{ - if ( pbegin == pend ) - return; - - int *i = pbegin; - ++i; - if ( i == pend ) - return; - - i = pend; - --i; - - while (1) - { - int *j = i; - --i; - - if ( *i < *j ) - { - int *k = pend; - - while ( !(*i < *--k) ) /* do nothing */ ; - - tt_swap( i, k ); - reverse(j, pend); - return; // true - } - - if ( i == pbegin ) - { - reverse(pbegin, pend); - return; // false - } - // else? 
- } -} - diff --git a/algo/x11/timetravel-gate.h b/algo/x11/timetravel-gate.h deleted file mode 100644 index e9c1ae0..0000000 --- a/algo/x11/timetravel-gate.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef TIMETRAVEL_GATE_H__ -#define TIMETRAVEL_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define TIMETRAVEL_4WAY -#endif - -// Machinecoin Genesis Timestamp -#define TT8_FUNC_BASE_TIMESTAMP 1389040865 - -#define TT8_FUNC_COUNT 8 -#define TT8_FUNC_COUNT_PERMUTATIONS 40320 - -void tt8_next_permutation( int *pbegin, int *pend ); - -bool register_timetravel_algo( algo_gate_t* gate ); - -#if defined(TIMETRAVEL_4WAY) - -void timetravel_4way_hash( void *state, const void *input ); - -int scanhash_timetravel_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void init_tt8_4way_ctx(); - -#endif - -void timetravel_hash( void *state, const void *input ); - -int scanhash_timetravel( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void init_tt8_ctx(); - -#endif - diff --git a/algo/x11/timetravel.c b/algo/x11/timetravel.c deleted file mode 100644 index 02ad5ec..0000000 --- a/algo/x11/timetravel.c +++ /dev/null @@ -1,311 +0,0 @@ -#include "timetravel-gate.h" - -#include -#include -#include -#include -#include "algo/blake/sph_blake.h" -#include "algo/bmw/sph_bmw.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#ifdef NO_AES_NI - #include "algo/groestl/sph_groestl.h" -#else - #include "algo/groestl/aes_ni/hash-groestl.h" -#endif - -static __thread uint32_t s_ntime = UINT32_MAX; -static __thread int permutation[TT8_FUNC_COUNT] = { 0 }; - -typedef struct { - sph_blake512_context blake; - sph_bmw512_context bmw; - sph_skein512_context skein; - sph_jh512_context jh; - sph_keccak512_context keccak; - hashState_luffa 
luffa; - cubehashParam cube; -#ifdef NO_AES_NI - sph_groestl512_context groestl; -#else - hashState_groestl groestl; -#endif -} tt_ctx_holder; - -tt_ctx_holder tt_ctx __attribute__ ((aligned (64))); -__thread tt_ctx_holder tt_mid __attribute__ ((aligned (64))); - -void init_tt8_ctx() -{ - sph_blake512_init( &tt_ctx.blake ); - sph_bmw512_init( &tt_ctx.bmw ); - sph_skein512_init( &tt_ctx.skein ); - sph_jh512_init( &tt_ctx.jh ); - sph_keccak512_init( &tt_ctx.keccak ); - init_luffa( &tt_ctx.luffa, 512 ); - cubehashInit( &tt_ctx.cube, 512, 16, 32 ); -#ifdef NO_AES_NI - sph_groestl512_init( &tt_ctx.groestl ); -#else - init_groestl( &tt_ctx.groestl, 64 ); -#endif -}; - -void timetravel_hash(void *output, const void *input) -{ - uint32_t hash[ 16 * TT8_FUNC_COUNT ] __attribute__ ((aligned (64))); - uint32_t *hashA, *hashB; - tt_ctx_holder ctx __attribute__ ((aligned (64))); - uint32_t dataLen = 64; - uint32_t *work_data = (uint32_t *)input; - int i; - const int midlen = 64; // bytes - const int tail = 80 - midlen; // 16 - - memcpy( &ctx, &tt_ctx, sizeof(tt_ctx) ); - - for ( i = 0; i < TT8_FUNC_COUNT; i++ ) - { - if (i == 0) - { - dataLen = 80; - hashA = work_data; - } - else - { - dataLen = 64; - hashA = &hash[16 * (i - 1)]; - } - hashB = &hash[16 * i]; - - switch ( permutation[i] ) - { - case 0: - if ( i == 0 ) - { - memcpy( &ctx.blake, &tt_mid.blake, sizeof tt_mid.blake ); - sph_blake512( &ctx.blake, input + midlen, tail ); - sph_blake512_close( &ctx.blake, hashB ); - } - else - { - sph_blake512( &ctx.blake, hashA, dataLen ); - sph_blake512_close( &ctx.blake, hashB ); - } - break; - case 1: - if ( i == 0 ) - { - memcpy( &ctx.bmw, &tt_mid.bmw, sizeof tt_mid.bmw ); - sph_bmw512( &ctx.bmw, input + midlen, tail ); - sph_bmw512_close( &ctx.bmw, hashB ); - } - else - { - sph_bmw512( &ctx.bmw, hashA, dataLen ); - sph_bmw512_close( &ctx.bmw, hashB ); - } - break; - case 2: -#ifdef NO_AES_NI - if ( i == 0 ) - { - memcpy( &ctx.groestl, &tt_mid.groestl, sizeof tt_mid.groestl ); - 
sph_groestl512( &ctx.groestl, input + midlen, tail ); - sph_groestl512_close( &ctx.groestl, hashB ); - } - else - { - sph_groestl512( &ctx.groestl, hashA, dataLen ); - sph_groestl512_close( &ctx.groestl, hashB ); - } -#else -// groestl midstate is slower -// if ( i == 0 ) -// { -// memcpy( &ctx.groestl, &tt_mid.groestl, sizeof tt_mid.groestl ); -// update_and_final_groestl( &ctx.groestl, (char*)hashB, -// (char*)input + midlen, tail*8 ); -// } -// else -// { - update_and_final_groestl( &ctx.groestl, (char*)hashB, - (char*)hashA, dataLen*8 ); -// } -#endif - break; - case 3: - if ( i == 0 ) - { - memcpy( &ctx.skein, &tt_mid.skein, sizeof tt_mid.skein ); - sph_skein512( &ctx.skein, input + midlen, tail ); - sph_skein512_close( &ctx.skein, hashB ); - } - else - { - sph_skein512( &ctx.skein, hashA, dataLen ); - sph_skein512_close( &ctx.skein, hashB ); - } - break; - case 4: - if ( i == 0 ) - { - memcpy( &ctx.jh, &tt_mid.jh, sizeof tt_mid.jh ); - sph_jh512( &ctx.jh, input + midlen, tail ); - sph_jh512_close( &ctx.jh, hashB ); - } - else - { - sph_jh512( &ctx.jh, hashA, dataLen ); - sph_jh512_close( &ctx.jh, hashB); - } - break; - case 5: - if ( i == 0 ) - { - memcpy( &ctx.keccak, &tt_mid.keccak, sizeof tt_mid.keccak ); - sph_keccak512( &ctx.keccak, input + midlen, tail ); - sph_keccak512_close( &ctx.keccak, hashB ); - } - else - { - sph_keccak512( &ctx.keccak, hashA, dataLen ); - sph_keccak512_close( &ctx.keccak, hashB ); - } - break; - case 6: - if ( i == 0 ) - { - memcpy( &ctx.luffa, &tt_mid.luffa, sizeof tt_mid.luffa ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, - (const BitSequence *)input + 64, 16 ); - } - else - { - update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, - (const BitSequence *)hashA, dataLen ); - } - break; - case 7: - if ( i == 0 ) - { - memcpy( &ctx.cube, &tt_mid.cube, sizeof tt_mid.cube ); - cubehashUpdateDigest( &ctx.cube, (byte*)hashB, - (const byte*)input + midlen, tail ); - } - else - { - cubehashUpdateDigest( &ctx.cube, 
(byte*)hashB, (const byte*)hashA, - dataLen ); - } - break; - default: - break; - } - } - - memcpy(output, &hash[16 * (TT8_FUNC_COUNT - 1)], 32); -} - -int scanhash_timetravel( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(64) hash[8]; - uint32_t _ALIGN(64) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - int i; - - if (opt_benchmark) - ptarget[7] = 0x0cff; - - for (int k=0; k < 19; k++) - be32enc(&endiandata[k], pdata[k]); - - const uint32_t timestamp = endiandata[17]; - if ( timestamp != s_ntime ) - { - const int steps = ( timestamp - TT8_FUNC_BASE_TIMESTAMP ) - % TT8_FUNC_COUNT_PERMUTATIONS; - for ( i = 0; i < TT8_FUNC_COUNT; i++ ) - permutation[i] = i; - for ( i = 0; i < steps; i++ ) - tt8_next_permutation( permutation, permutation + TT8_FUNC_COUNT ); - s_ntime = timestamp; - - // do midstate precalc for first function - switch ( permutation[0] ) - { - case 0: - memcpy( &tt_mid.blake, &tt_ctx.blake, sizeof(tt_mid.blake) ); - sph_blake512( &tt_mid.blake, endiandata, 64 ); - break; - case 1: - memcpy( &tt_mid.bmw, &tt_ctx.bmw, sizeof(tt_mid.bmw) ); - sph_bmw512( &tt_mid.bmw, endiandata, 64 ); - break; - case 2: -#ifdef NO_AES_NI - memcpy( &tt_mid.groestl, &tt_ctx.groestl, sizeof(tt_mid.groestl ) ); - sph_groestl512( &tt_mid.groestl, endiandata, 64 ); -#else -// groestl midstate is slower -// memcpy( &tt_mid.groestl, &tt_ctx.groestl, sizeof(tt_mid.groestl ) ); -// update_groestl( &tt_mid.groestl, (char*)endiandata, 64*8 ); -#endif - break; - case 3: - memcpy( &tt_mid.skein, &tt_ctx.skein, sizeof(tt_mid.skein ) ); - sph_skein512( &tt_mid.skein, endiandata, 64 ); - break; - case 4: - memcpy( &tt_mid.jh, &tt_ctx.jh, sizeof(tt_mid.jh ) ); - 
sph_jh512( &tt_mid.jh, endiandata, 64 ); - break; - case 5: - memcpy( &tt_mid.keccak, &tt_ctx.keccak, sizeof(tt_mid.keccak ) ); - sph_keccak512( &tt_mid.keccak, endiandata, 64 ); - break; - case 6: - memcpy( &tt_mid.luffa, &tt_ctx.luffa, sizeof(tt_mid.luffa ) ); - update_luffa( &tt_mid.luffa, (const BitSequence*)endiandata, 64 ); - break; - case 7: - memcpy( &tt_mid.cube, &tt_ctx.cube, sizeof(tt_mid.cube ) ); - cubehashUpdate( &tt_mid.cube, (const byte*)endiandata, 64 ); - break; - default: - break; - } - } - - do { - be32enc( &endiandata[19], nonce ); - timetravel_hash( hash, endiandata ); - - if ( hash[7] <= Htarg && fulltest( hash, ptarget) ) - { - work_set_target_ratio( work, hash ); - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce; - work_set_target_ratio( work, hash ); - return 1; - } - nonce++; - - } while (nonce < max_nonce && !(*restart)); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} - - diff --git a/algo/x11/timetravel10-4way.c b/algo/x11/timetravel10-4way.c deleted file mode 100644 index 5dab3c8..0000000 --- a/algo/x11/timetravel10-4way.c +++ /dev/null @@ -1,267 +0,0 @@ -#include "timetravel10-gate.h" - -#if defined(TIMETRAVEL10_4WAY) - -#include -#include -#include -#include -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/luffa-hash-2way.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/simd/simd-hash-2way.h" - -static __thread uint32_t s_ntime = UINT32_MAX; -static __thread int permutation[TT10_FUNC_COUNT] = { 0 }; - -typedef struct { - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_groestl groestl; - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; - luffa_2way_context luffa; - 
cubehashParam cube; - sph_shavite512_context shavite; - simd_2way_context simd; -} tt10_4way_ctx_holder; - -tt10_4way_ctx_holder tt10_4way_ctx __attribute__ ((aligned (64))); - -void init_tt10_4way_ctx() -{ - blake512_4way_init( &tt10_4way_ctx.blake ); - bmw512_4way_init( &tt10_4way_ctx.bmw ); - init_groestl( &tt10_4way_ctx.groestl, 64 ); - skein512_4way_init( &tt10_4way_ctx.skein ); - jh512_4way_init( &tt10_4way_ctx.jh ); - keccak512_4way_init( &tt10_4way_ctx.keccak ); - luffa_2way_init( &tt10_4way_ctx.luffa, 512 ); - cubehashInit( &tt10_4way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &tt10_4way_ctx.shavite ); - simd_2way_init( &tt10_4way_ctx.simd, 512 ); -}; - -void timetravel10_4way_hash(void *output, const void *input) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhashX[8*4] __attribute__ ((aligned (64))); - uint64_t vhashY[8*4] __attribute__ ((aligned (64))); - uint64_t *vhashA, *vhashB; - tt10_4way_ctx_holder ctx __attribute__ ((aligned (64))); - uint32_t dataLen = 64; - int i; - - memcpy( &ctx, &tt10_4way_ctx, sizeof(tt10_4way_ctx) ); - - for ( i = 0; i < TT10_FUNC_COUNT; i++ ) - { - if (i == 0) - { - dataLen = 80; - vhashA = (uint64_t*)input; - vhashB = vhashX; - } - else - { - dataLen = 64; - if ( i % 2 == 0 ) - { - vhashA = vhashY; - vhashB = vhashX; - } - else - { - vhashA = vhashX; - vhashB = vhashY; - } - } - - switch ( permutation[i] ) - { - case 0: - blake512_4way( &ctx.blake, vhashA, dataLen ); - blake512_4way_close( &ctx.blake, vhashB ); - if ( i == 9 ) - dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); - break; - case 1: - bmw512_4way( &ctx.bmw, vhashA, dataLen ); - bmw512_4way_close( &ctx.bmw, vhashB ); - if ( i == 9 ) - dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); - break; - case 2: - dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 
dataLen<<3 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (char*)hash0, dataLen<<3 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (char*)hash1, dataLen<<3 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (char*)hash2, dataLen<<3 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (char*)hash3, dataLen<<3 ); - if ( i != 9 ) - intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); - break; - case 3: - skein512_4way( &ctx.skein, vhashA, dataLen ); - skein512_4way_close( &ctx.skein, vhashB ); - if ( i == 9 ) - dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); - break; - case 4: - jh512_4way( &ctx.jh, vhashA, dataLen ); - jh512_4way_close( &ctx.jh, vhashB ); - if ( i == 9 ) - dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); - break; - case 5: - keccak512_4way( &ctx.keccak, vhashA, dataLen ); - keccak512_4way_close( &ctx.keccak, vhashB ); - if ( i == 9 ) - dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); - break; - case 6: - dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); - intrlv_2x128( vhashA, hash0, hash1, dataLen<<3 ); - luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); - dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 ); - intrlv_2x128( vhashA, hash2, hash3, dataLen<<3 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); - dintrlv_2x128( hash2, hash3, vhashA, dataLen<<3 ); - if ( i != 9 ) - intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); - break; - case 7: - dintrlv_4x64( hash0, hash1, hash2, hash3, - vhashA, dataLen<<3 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, - (const byte*)hash0, dataLen ); - memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, - (const byte*)hash1, dataLen ); - memcpy( &ctx.cube, 
&tt10_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, - (const byte*)hash2, dataLen ); - memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, - (const byte*)hash3, dataLen ); - if ( i != 9 ) - intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); - break; - case 8: - dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); - sph_shavite512( &ctx.shavite, hash0, dataLen ); - sph_shavite512_close( &ctx.shavite, hash0 ); - memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, dataLen ); - sph_shavite512_close( &ctx.shavite, hash1 ); - memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, dataLen ); - sph_shavite512_close( &ctx.shavite, hash2 ); - memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, dataLen ); - sph_shavite512_close( &ctx.shavite, hash3 ); - if ( i != 9 ) - intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); - break; - case 9: - dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); - intrlv_2x128( vhashA, hash0, hash1, dataLen<<3 ); - simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 ); - dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 ); - intrlv_2x128( vhashA, hash2, hash3, dataLen<<3 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 ); - dintrlv_2x128( hash2, hash3, vhashA, dataLen<<3 ); - if ( i != 9 ) - intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); - break; - default: - applog(LOG_ERR,"SWERR: timetravel invalid permutation"); - break; - } - } - - memcpy( output, hash0, 32 ); - memcpy( output+32, hash1, 32 ); - memcpy( output+64, hash2, 32 ); - memcpy( output+96, hash3, 32 ); -} - -int scanhash_timetravel10_4way( struct work *work, - uint32_t max_nonce, uint64_t 
*hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - uint32_t *noncep = vdata + 73; // 9*8 + 1 - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - int i; - - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - for ( int k = 0; k < 19; k++ ) - be32enc( &endiandata[k], pdata[k] ); - - const uint32_t timestamp = endiandata[17]; - if ( timestamp != s_ntime ) - { - const int steps = ( timestamp - TT10_FUNC_BASE_TIMESTAMP ) - % TT10_FUNC_COUNT_PERMUTATIONS; - for ( i = 0; i < TT10_FUNC_COUNT; i++ ) - permutation[i] = i; - for ( i = 0; i < steps; i++ ) - tt10_next_permutation( permutation, permutation + TT10_FUNC_COUNT ); - s_ntime = timestamp; - } - - uint64_t *edata = (uint64_t*)endiandata; - intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); - - do - { - be32enc( noncep, n ); - be32enc( noncep+2, n+1 ); - be32enc( noncep+4, n+2 ); - be32enc( noncep+6, n+3 ); - - timetravel10_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) - && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !(*restart) ); - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/x11/timetravel10-gate.c b/algo/x11/timetravel10-gate.c deleted file mode 100644 index 0d8fcc4..0000000 --- a/algo/x11/timetravel10-gate.c +++ /dev/null @@ -1,78 +0,0 @@ -#include "timetravel10-gate.h" - -void tt10_set_target( struct work* work, double job_diff ) -{ - work_set_target( work, job_diff / (256.0 * 
opt_diff_factor) ); -} - -bool register_timetravel10_algo( algo_gate_t* gate ) -{ -#ifdef TIMETRAVEL10_4WAY - init_tt10_4way_ctx(); - gate->scanhash = (void*)&scanhash_timetravel10_4way; - gate->hash = (void*)&timetravel10_4way_hash; -#else - init_tt10_ctx(); - gate->scanhash = (void*)&scanhash_timetravel10; - gate->hash = (void*)&timetravel10_hash; -#endif - gate->set_target = (void*)&tt10_set_target; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->get_max64 = (void*)&get_max64_0xffffLL; - return true; -}; - -inline void tt10_swap( int *a, int *b ) -{ - int c = *a; - *a = *b; - *b = c; -} - -inline void reverse( int *pbegin, int *pend ) -{ - while ( (pbegin != pend) && (pbegin != --pend) ) - { - tt10_swap( pbegin, pend ); - pbegin++; - } -} - -void tt10_next_permutation( int *pbegin, int *pend ) -{ - if ( pbegin == pend ) - return; - - int *i = pbegin; - ++i; - if ( i == pend ) - return; - - i = pend; - --i; - - while (1) - { - int *j = i; - --i; - - if ( *i < *j ) - { - int *k = pend; - - while ( !(*i < *--k) ) /* do nothing */ ; - - tt10_swap( i, k ); - reverse(j, pend); - return; // true - } - - if ( i == pbegin ) - { - reverse(pbegin, pend); - return; // false - } - // else? 
- } -} - diff --git a/algo/x11/timetravel10-gate.h b/algo/x11/timetravel10-gate.h deleted file mode 100644 index 35328f5..0000000 --- a/algo/x11/timetravel10-gate.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef TIMETRAVEL10_GATE_H__ -#define TIMETRAVEL10_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define TIMETRAVEL10_4WAY -#endif - -// BitCore Genesis Timestamp -#define TT10_FUNC_BASE_TIMESTAMP 1492973331U -#define TT10_FUNC_COUNT 10 -#define TT10_FUNC_COUNT_PERMUTATIONS 40320 - -void tt10_next_permutation( int *pbegin, int *pend ); - -bool register_timetravel10_algo( algo_gate_t* gate ); - -#if defined(TIMETRAVEL10_4WAY) - -void timetravel10_4way_hash( void *state, const void *input ); - -int scanhash_timetravel10_4way( struct work *work, - uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); - -void init_tt10_4way_ctx(); - -#endif - -void timetravel10_hash( void *state, const void *input ); - -int scanhash_timetravel10( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void init_tt10_ctx(); - -#endif - diff --git a/algo/x11/timetravel10.c b/algo/x11/timetravel10.c deleted file mode 100644 index 0fefba5..0000000 --- a/algo/x11/timetravel10.c +++ /dev/null @@ -1,350 +0,0 @@ -#include "timetravel10-gate.h" -#include -#include -#include -#include -#include "algo/blake/sph_blake.h" -#include "algo/bmw/sph_bmw.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/simd/nist.h" - -#ifdef NO_AES_NI - #include "algo/groestl/sph_groestl.h" -#else - #include "algo/groestl/aes_ni/hash-groestl.h" -#endif - -static __thread uint32_t s_ntime = UINT32_MAX; -static __thread int permutation[TT10_FUNC_COUNT] = { 0 }; - -typedef struct { - sph_blake512_context blake; - sph_bmw512_context bmw; - 
sph_skein512_context skein; - sph_jh512_context jh; - sph_keccak512_context keccak; - hashState_luffa luffa; - cubehashParam cube; - sph_shavite512_context shavite; - hashState_sd simd; -#ifdef NO_AES_NI - sph_groestl512_context groestl; -#else - hashState_groestl groestl; -#endif -} tt10_ctx_holder; - -tt10_ctx_holder tt10_ctx __attribute__ ((aligned (64))); -__thread tt10_ctx_holder tt10_mid __attribute__ ((aligned (64))); - -void init_tt10_ctx() -{ - sph_blake512_init( &tt10_ctx.blake ); - sph_bmw512_init( &tt10_ctx.bmw ); - sph_skein512_init( &tt10_ctx.skein ); - sph_jh512_init( &tt10_ctx.jh ); - sph_keccak512_init( &tt10_ctx.keccak ); - init_luffa( &tt10_ctx.luffa, 512 ); - cubehashInit( &tt10_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &tt10_ctx.shavite ); - init_sd( &tt10_ctx.simd, 512 ); -#ifdef NO_AES_NI - sph_groestl512_init( &tt10_ctx.groestl ); -#else - init_groestl( &tt10_ctx.groestl, 64 ); -#endif -}; - -void timetravel10_hash(void *output, const void *input) -{ - uint32_t hash[ 16 * TT10_FUNC_COUNT ] __attribute__ ((aligned (64))); - uint32_t *hashA, *hashB; - tt10_ctx_holder ctx __attribute__ ((aligned (64))); - uint32_t dataLen = 64; - uint32_t *work_data = (uint32_t *)input; - int i; - const int midlen = 64; // bytes - const int tail = 80 - midlen; // 16 - - memcpy( &ctx, &tt10_ctx, sizeof(tt10_ctx) ); - - for ( i = 0; i < TT10_FUNC_COUNT; i++ ) - { - if (i == 0) - { - dataLen = 80; - hashA = work_data; - } - else - { - dataLen = 64; - hashA = &hash[16 * (i - 1)]; - } - hashB = &hash[16 * i]; - - switch ( permutation[i] ) - { - case 0: - if ( i == 0 ) - { - memcpy( &ctx.blake, &tt10_mid.blake, sizeof tt10_mid.blake ); - sph_blake512( &ctx.blake, input + midlen, tail ); - sph_blake512_close( &ctx.blake, hashB ); - } - else - { - sph_blake512( &ctx.blake, hashA, dataLen ); - sph_blake512_close( &ctx.blake, hashB ); - } - break; - case 1: - if ( i == 0 ) - { - memcpy( &ctx.bmw, &tt10_mid.bmw, sizeof tt10_mid.bmw ); - sph_bmw512( &ctx.bmw, input + 
midlen, tail ); - sph_bmw512_close( &ctx.bmw, hashB ); - } - else - { - sph_bmw512( &ctx.bmw, hashA, dataLen ); - sph_bmw512_close( &ctx.bmw, hashB ); - } - break; - case 2: -#ifdef NO_AES_NI - if ( i == 0 ) - { - memcpy( &ctx.groestl, &tt10_mid.groestl, sizeof tt10_mid.groestl ); - sph_groestl512( &ctx.groestl, input + midlen, tail ); - sph_groestl512_close( &ctx.groestl, hashB ); - } - else - { - sph_groestl512( &ctx.groestl, hashA, dataLen ); - sph_groestl512_close( &ctx.groestl, hashB ); - } -#else -// groestl midstate is slower -// if ( i == 0 ) -// { -// memcpy( &ctx.groestl, &tt10_mid.groestl, sizeof tt10_mid.groestl ); -// update_and_final_groestl( &ctx.groestl, (char*)hashB, -// (char*)input + midlen, tail*8 ); -// } -// else -// { - update_and_final_groestl( &ctx.groestl, (char*)hashB, - (char*)hashA, dataLen*8 ); -// } -#endif - break; - case 3: - if ( i == 0 ) - { - memcpy( &ctx.skein, &tt10_mid.skein, sizeof tt10_mid.skein ); - sph_skein512( &ctx.skein, input + midlen, tail ); - sph_skein512_close( &ctx.skein, hashB ); - } - else - { - sph_skein512( &ctx.skein, hashA, dataLen ); - sph_skein512_close( &ctx.skein, hashB ); - } - break; - case 4: - if ( i == 0 ) - { - memcpy( &ctx.jh, &tt10_mid.jh, sizeof tt10_mid.jh ); - sph_jh512( &ctx.jh, input + midlen, tail ); - sph_jh512_close( &ctx.jh, hashB ); - } - else - { - sph_jh512( &ctx.jh, hashA, dataLen ); - sph_jh512_close( &ctx.jh, hashB); - } - break; - case 5: - if ( i == 0 ) - { - memcpy( &ctx.keccak, &tt10_mid.keccak, sizeof tt10_mid.keccak ); - sph_keccak512( &ctx.keccak, input + midlen, tail ); - sph_keccak512_close( &ctx.keccak, hashB ); - } - else - { - sph_keccak512( &ctx.keccak, hashA, dataLen ); - sph_keccak512_close( &ctx.keccak, hashB ); - } - break; - case 6: - if ( i == 0 ) - { - memcpy( &ctx.luffa, &tt10_mid.luffa, sizeof tt10_mid.luffa ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, - (const BitSequence *)input + 64, 16 ); - } - else - { - update_and_final_luffa( 
&ctx.luffa, (BitSequence*)hashB, - (const BitSequence *)hashA, dataLen ); - } - break; - case 7: - if ( i == 0 ) - { - memcpy( &ctx.cube, &tt10_mid.cube, sizeof tt10_mid.cube ); - cubehashUpdateDigest( &ctx.cube, (byte*)hashB, - (const byte*)input + midlen, tail ); - } - else - { - cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)hashA, - dataLen ); - } - break; - case 8: - if ( i == 0 ) - { - memcpy( &ctx.shavite, &tt10_mid.shavite, sizeof tt10_mid.shavite ); - sph_shavite512( &ctx.shavite, input + midlen, tail*8 ); - sph_shavite512_close( &ctx.shavite, hashB ); - } - else - { - sph_shavite512( &ctx.shavite, hashA, dataLen ); - sph_shavite512_close( &ctx.shavite, hashB ); - } - break; - case 9: - if ( i == 0 ) - { - memcpy( &ctx.simd, &tt10_mid.simd, sizeof tt10_mid.simd ); - update_final_sd( &ctx.simd, (BitSequence *)hashB, - (const BitSequence *)input + midlen, tail*8 ); - } - else - { - update_final_sd( &ctx.simd, (BitSequence *)hashB, - (const BitSequence *)hashA, dataLen*8 ); - } - break; - default: - break; - } - } - - memcpy(output, &hash[16 * (TT10_FUNC_COUNT - 1)], 32); -} - -int scanhash_timetravel10( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(64) hash[8]; - uint32_t _ALIGN(64) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - int thr_id = mythr->id; // thr_id arg is deprecated - - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - int i; - - if (opt_benchmark) - ptarget[7] = 0x0cff; - - for (int k=0; k < 19; k++) - be32enc(&endiandata[k], pdata[k]); - - const uint32_t timestamp = endiandata[17]; - if ( timestamp != s_ntime ) - { - const int steps = ( timestamp - TT10_FUNC_BASE_TIMESTAMP ) - % TT10_FUNC_COUNT_PERMUTATIONS; - for ( i = 0; i < TT10_FUNC_COUNT; i++ ) - permutation[i] = i; - for ( i = 0; i < steps; i++ ) - 
tt10_next_permutation( permutation, permutation + TT10_FUNC_COUNT ); - s_ntime = timestamp; - - // do midstate precalc for first function - switch ( permutation[0] ) - { - case 0: - memcpy( &tt10_mid.blake, &tt10_ctx.blake, sizeof(tt10_mid.blake) ); - sph_blake512( &tt10_mid.blake, endiandata, 64 ); - break; - case 1: - memcpy( &tt10_mid.bmw, &tt10_ctx.bmw, sizeof(tt10_mid.bmw) ); - sph_bmw512( &tt10_mid.bmw, endiandata, 64 ); - break; - case 2: -#ifdef NO_AES_NI - memcpy( &tt10_mid.groestl, &tt10_ctx.groestl, sizeof(tt10_mid.groestl ) ); - sph_groestl512( &tt10_mid.groestl, endiandata, 64 ); -#else -// groestl midstate is slower -// memcpy( &tt10_mid.groestl, &tt10_ctx.groestl, sizeof(tt10_mid.groestl ) ); -// update_groestl( &tt10_mid.groestl, (char*)endiandata, 64*8 ); -#endif - break; - case 3: - memcpy( &tt10_mid.skein, &tt10_ctx.skein, sizeof(tt10_mid.skein ) ); - sph_skein512( &tt10_mid.skein, endiandata, 64 ); - break; - case 4: - memcpy( &tt10_mid.jh, &tt10_ctx.jh, sizeof(tt10_mid.jh ) ); - sph_jh512( &tt10_mid.jh, endiandata, 64 ); - break; - case 5: - memcpy( &tt10_mid.keccak, &tt10_ctx.keccak, sizeof(tt10_mid.keccak ) ); - sph_keccak512( &tt10_mid.keccak, endiandata, 64 ); - break; - case 6: - memcpy( &tt10_mid.luffa, &tt10_ctx.luffa, sizeof(tt10_mid.luffa ) ); - update_luffa( &tt10_mid.luffa, (const BitSequence*)endiandata, 64 ); - break; - case 7: - memcpy( &tt10_mid.cube, &tt10_ctx.cube, sizeof(tt10_mid.cube ) ); - cubehashUpdate( &tt10_mid.cube, (const byte*)endiandata, 64 ); - break; - case 8: - memcpy( &tt10_mid.shavite, &tt10_ctx.shavite, sizeof(tt10_mid.shavite ) ); - sph_shavite512( &tt10_mid.shavite, endiandata, 64 ); - break; - case 9: - memcpy( &tt10_mid.simd, &tt10_ctx.simd, sizeof(tt10_mid.simd ) ); - update_sd( &tt10_mid.simd, (const BitSequence *)endiandata, 512 ); - break; - default: - break; - } - } - - do { - be32enc( &endiandata[19], nonce ); - timetravel10_hash( hash, endiandata ); - - if ( hash[7] <= Htarg && fulltest( hash, 
ptarget) ) - { - work_set_target_ratio( work, hash ); - pdata[19] = nonce; - work_set_target_ratio( work, hash ); - *hashes_done = pdata[19] - first_nonce; - return 1; - } - nonce++; - - } while (nonce < max_nonce && !(*restart)); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} diff --git a/algo/x11/tribus-4way.c b/algo/x11/tribus-4way.c deleted file mode 100644 index d7f6194..0000000 --- a/algo/x11/tribus-4way.c +++ /dev/null @@ -1,127 +0,0 @@ -#include "tribus-gate.h" -#include -#include -#include -#include - -#if defined(TRIBUS_4WAY) - -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/echo/aes_ni/hash_api.h" - -//hashState_echo tribus_4way_ctx __attribute__ ((aligned (64))); -static __thread jh512_4way_context ctx_mid; -/* -void init_tribus_4way_ctx() -{ - init_echo( &tribus_4way_ctx, 512 ); -} -*/ -void tribus_hash_4way(void *state, const void *input) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - jh512_4way_context ctx_jh; - keccak512_4way_context ctx_keccak; - hashState_echo ctx_echo; - - memcpy( &ctx_jh, &ctx_mid, sizeof(ctx_mid) ); - jh512_4way( &ctx_jh, input + (64<<2), 16 ); - jh512_4way_close( &ctx_jh, vhash ); - - keccak512_4way_init( &ctx_keccak ); - keccak512_4way( &ctx_keccak, vhash, 64 ); - keccak512_4way_close( &ctx_keccak, vhash ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // hash echo serially - init_echo( &ctx_echo, 512 ); - update_final_echo( &ctx_echo, (BitSequence *) hash0, - (const BitSequence *) hash0, 512 ); - init_echo( &ctx_echo, 512 ); - update_final_echo( &ctx_echo, (BitSequence *) hash1, - (const BitSequence *) hash1, 512 ); - init_echo( &ctx_echo, 512 ); - update_final_echo( &ctx_echo, (BitSequence *) hash2, - (const 
BitSequence *) hash2, 512 ); - init_echo( &ctx_echo, 512 ); - update_final_echo( &ctx_echo, (BitSequence *) hash3, - (const BitSequence *) hash3, 512 ); - - memcpy( state, hash0, 32 ); - memcpy( state+32, hash1, 32 ); - memcpy( state+64, hash2, 32 ); - memcpy( state+96, hash3, 32 ); -} - -int scanhash_tribus_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - uint32_t n = pdata[19]; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - - uint64_t htmax[] = { 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 }; - - uint32_t masks[] = { 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 }; - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - - // precalc midstate - // doing it one way then then interleaving would be faster but too - // complicated tto interleave context. 
- jh512_4way_init( &ctx_mid ); - jh512_4way( &ctx_mid, vdata, 64 ); - - for ( int m = 0; m < 6; m++ ) - { - if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - tribus_hash_4way( hash, vdata ); - - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( ( !( (hash+(i<<3))[7] & mask ) ) - && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart); - break; - } - } - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/x11/tribus-gate.c b/algo/x11/tribus-gate.c deleted file mode 100644 index f30d65e..0000000 --- a/algo/x11/tribus-gate.c +++ /dev/null @@ -1,18 +0,0 @@ -#include "tribus-gate.h" - -bool register_tribus_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->get_max64 = (void*)&get_max64_0x1ffff; -#if defined (TRIBUS_4WAY) -// init_tribus_4way_ctx(); - gate->scanhash = (void*)&scanhash_tribus_4way; - gate->hash = (void*)&tribus_hash_4way; -#else - gate->miner_thread_init = (void*)&tribus_thread_init; - gate->scanhash = (void*)&scanhash_tribus; - gate->hash = (void*)&tribus_hash; -#endif - return true; -}; - diff --git a/algo/x11/tribus-gate.h b/algo/x11/tribus-gate.h deleted file mode 100644 index dca51b4..0000000 --- a/algo/x11/tribus-gate.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef TRIBUS_GATE_H__ -#define TRIBUS_GATE_H__ - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define TRIBUS_4WAY -#endif - -#if defined(TRIBUS_4WAY) - -//void init_tribus_4way_ctx(); - -void tribus_hash_4way( void *state, const void *input ); - -int scanhash_tribus_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -#else - -void tribus_hash( void *state, 
const void *input ); - -int scanhash_tribus( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -bool tribus_thread_init(); - -#endif - -#endif diff --git a/algo/x11/tribus.c b/algo/x11/tribus.c deleted file mode 100644 index 2d346fd..0000000 --- a/algo/x11/tribus.c +++ /dev/null @@ -1,139 +0,0 @@ -#include "tribus-gate.h" -#include -#include -#include -#include - -#include "algo/jh//sph_jh.h" -#include "algo/keccak/sph_keccak.h" - -#ifdef NO_AES_NI - #include "algo/echo/sph_echo.h" -#else - #include "algo/echo/aes_ni/hash_api.h" -#endif - -typedef struct { - sph_jh512_context jh; - sph_keccak512_context keccak; -#ifdef NO_AES_NI - sph_echo512_context echo; -#else - hashState_echo echo; -#endif -} tribus_ctx_holder; - -static __thread tribus_ctx_holder tribus_ctx; - -bool tribus_thread_init() -{ - sph_jh512_init( &tribus_ctx.jh ); - sph_keccak512_init( &tribus_ctx.keccak ); -#ifdef NO_AES_NI - sph_echo512_init( &tribus_ctx.echo ); -#else - init_echo( &tribus_ctx.echo, 512 ); -#endif - return true; -} - -void tribus_hash(void *state, const void *input) -{ - unsigned char hash[128] __attribute__ ((aligned (32))); - tribus_ctx_holder ctx; - memcpy( &ctx, &tribus_ctx, sizeof(tribus_ctx) ); - - sph_jh512( &ctx.jh, input+64, 16 ); - sph_jh512_close( &ctx.jh, (void*) hash ); - - sph_keccak512( &ctx.keccak, (const void*) hash, 64 ); - sph_keccak512_close( &ctx.keccak, (void*) hash ); - -#ifdef NO_AES_NI - sph_echo512( &ctx.echo, hash, 64 ); - sph_echo512_close (&ctx.echo, hash ); -#else - update_final_echo( &ctx.echo, (BitSequence *) hash, - (const BitSequence *) hash, 512 ); -#endif - - memcpy(state, hash, 32); -} - -int scanhash_tribus( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(128) hash32[8]; - uint32_t _ALIGN(128) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = 
ptarget[7]; - uint32_t n = pdata[19] - 1; - int thr_id = mythr->id; // thr_id arg is deprecated - - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - // we need bigendian data... - for (int i=0; i < 19; i++) { - be32enc(&endiandata[i], pdata[i]); - } - - // precalc midstate - sph_jh512_init( &tribus_ctx.jh ); - sph_jh512( &tribus_ctx.jh, endiandata, 64 ); - -#ifdef DEBUG_ALGO - printf("[%d] Htarg=%X\n", thr_id, Htarg); -#endif - for (int m=0; m < 6; m++) { - if (Htarg <= htmax[m]) { - uint32_t mask = masks[m]; - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - tribus_hash(hash32, endiandata); -#ifndef DEBUG_ALGO - if ((!(hash32[7] & mask)) && fulltest(hash32, ptarget)) { - work_set_target_ratio(work, hash32); - *hashes_done = n - first_nonce + 1; - return 1; - } -#else - if (!(n % 0x1000) && !thr_id) printf("."); - if (!(hash32[7] & mask)) { - printf("[%d]",thr_id); - if (fulltest(hash32, ptarget)) { - work_set_target_ratio(work, hash32); - *hashes_done = n - first_nonce + 1; - return 1; - } - } -#endif - } while (n < max_nonce && !work_restart[thr_id].restart); - // see blake.c if else to understand the loop on htmax => mask - break; - } - } - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - - diff --git a/algo/x11/x11-4way.c b/algo/x11/x11-4way.c deleted file mode 100644 index 52a3c1b..0000000 --- a/algo/x11/x11-4way.c +++ /dev/null @@ -1,209 +0,0 @@ -#include "cpuminer-config.h" -#include "x11-gate.h" - -#if defined (X11_4WAY) - -#include -#include -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/luffa-hash-2way.h" -#include "algo/cubehash/cubehash_sse2.h" -#include 
"algo/shavite/sph_shavite.h" -#include "algo/simd/simd-hash-2way.h" -#include "algo/echo/aes_ni/hash_api.h" - -typedef struct { - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_groestl groestl; - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; - luffa_2way_context luffa; - cubehashParam cube; - sph_shavite512_context shavite; - simd_2way_context simd; - hashState_echo echo; -} x11_4way_ctx_holder; - -x11_4way_ctx_holder x11_4way_ctx; - -void init_x11_4way_ctx() -{ - blake512_4way_init( &x11_4way_ctx.blake ); - bmw512_4way_init( &x11_4way_ctx.bmw ); - init_groestl( &x11_4way_ctx.groestl, 64 ); - skein512_4way_init( &x11_4way_ctx.skein ); - jh512_4way_init( &x11_4way_ctx.jh ); - keccak512_4way_init( &x11_4way_ctx.keccak ); - luffa_2way_init( &x11_4way_ctx.luffa, 512 ); - cubehashInit( &x11_4way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x11_4way_ctx.shavite ); - simd_2way_init( &x11_4way_ctx.simd, 512 ); - init_echo( &x11_4way_ctx.echo, 512 ); -} - -void x11_4way_hash( void *state, const void *input ) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - uint64_t vhashB[8*2] __attribute__ ((aligned (64))); - - x11_4way_ctx_holder ctx; - memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) ); - - // 1 Blake 4way - blake512_4way( &ctx.blake, input, 80 ); - blake512_4way_close( &ctx.blake, vhash ); - - // 2 Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - // Serial - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // 3 Groestl - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 
); - memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - - // 4way - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - // 4 Skein - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - - // 5 JH - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - // 6 Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // 7 Luffa parallel 2 way 128 bit - intrlv_2x128( vhash, hash0, hash1, 512 ); - intrlv_2x128( vhashB, hash2, hash3, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - dintrlv_2x128( hash2, hash3, vhashB, 512 ); - - // 8 Cubehash - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); - memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); - memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 ); - memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); - - // 9 Shavite - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - memcpy( &ctx.shavite, &x11_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - memcpy( &ctx.shavite, &x11_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - 
sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - memcpy( &ctx.shavite, &x11_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - - // 10 Simd - intrlv_2x128( vhash, hash0, hash1, 512 ); - intrlv_2x128( vhashB, hash2, hash3, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - dintrlv_2x128( hash2, hash3, vhashB, 512 ); - - // 11 Echo - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - - memcpy( state, hash0, 32 ); - memcpy( state+32, hash1, 32 ); - memcpy( state+64, hash2, 32 ); - memcpy( state+96, hash3, 32 ); -} - -int scanhash_x11_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - __m256i *noncev = (__m256i*)vdata + 9; // aligned - const uint32_t Htarg = ptarget[7]; - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; - - 
mm256_bswap32_intrlv80_4x64( vdata, pdata ); - - for (int m=0; m < 6; m++) - if (Htarg <= htmax[m]) - { - uint32_t mask = masks[m]; - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - x11_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( ( ( (hash+(i<<3))[7] & mask ) == 0 ) - && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/x11/x11-gate.c b/algo/x11/x11-gate.c deleted file mode 100644 index adad370..0000000 --- a/algo/x11/x11-gate.c +++ /dev/null @@ -1,18 +0,0 @@ -#include "x11-gate.h" - -bool register_x11_algo( algo_gate_t* gate ) -{ -#if defined (X11_4WAY) - init_x11_4way_ctx(); - gate->scanhash = (void*)&scanhash_x11_4way; - gate->hash = (void*)&x11_4way_hash; -#else - init_x11_ctx(); - gate->scanhash = (void*)&scanhash_x11; - gate->hash = (void*)&x11_hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -}; - diff --git a/algo/x11/x11-gate.h b/algo/x11/x11-gate.h deleted file mode 100644 index 702dcaa..0000000 --- a/algo/x11/x11-gate.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef X11_GATE_H__ -#define X11_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define X11_4WAY -#endif - -bool register_x11_algo( algo_gate_t* gate ); - -#if defined(X11_4WAY) - -void x11_4way_hash( void *state, const void *input ); - -int scanhash_x11_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void init_x11_4way_ctx(); - -#endif - -void x11_hash( void *state, const void *input ); - -int scanhash_x11( struct work *work, uint32_t max_nonce, - uint64_t 
*hashes_done, struct thr_info *mythr ); - -void init_x11_ctx(); - -#endif - diff --git a/algo/x11/x11.c b/algo/x11/x11.c deleted file mode 100644 index fb641a3..0000000 --- a/algo/x11/x11.c +++ /dev/null @@ -1,191 +0,0 @@ -#include "cpuminer-config.h" -#include "x11-gate.h" - -#include -#include - -#include "algo/blake/sph_blake.h" -#include "algo/bmw/sph_bmw.h" -#include "algo/groestl/sph_groestl.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" -#include "algo/cubehash/sph_cubehash.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/echo/sph_echo.h" - -#ifndef NO_AES_NI - #include "algo/groestl/aes_ni/hash-groestl.h" - #include "algo/echo/aes_ni/hash_api.h" -#endif - -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" - -typedef struct { - hashState_luffa luffa; - cubehashParam cube; - hashState_sd simd; - sph_shavite512_context shavite; -#ifdef NO_AES_NI - sph_groestl512_context groestl; - sph_echo512_context echo; -#else - hashState_echo echo; - hashState_groestl groestl; -#endif -} x11_ctx_holder; - -x11_ctx_holder x11_ctx; - -void init_x11_ctx() -{ - init_luffa( &x11_ctx.luffa, 512 ); - cubehashInit( &x11_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x11_ctx.shavite ); - init_sd( &x11_ctx.simd, 512 ); -#ifdef NO_AES_NI - sph_groestl512_init( &x11_ctx.groestl ); - sph_echo512_init( &x11_ctx.echo ); -#else - init_echo( &x11_ctx.echo, 512 ); - init_groestl( &x11_ctx.groestl, 64 ); -#endif -} - -void x11_hash( void *state, const void *input ) -{ - unsigned char hash[128] __attribute__ ((aligned (32))); - unsigned char hashbuf[128] __attribute__ ((aligned (16))); - sph_u64 hashctA; - sph_u64 hashctB; - x11_ctx_holder ctx; - memcpy( &ctx, &x11_ctx, sizeof(x11_ctx) 
); - size_t hashptr; - - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; - - DECL_BMW; - BMW_I; - BMW_U; - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - BMW_C; - #undef M - #undef H - #undef dH - -#ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); -#else - update_and_final_groestl( &ctx.groestl, (char*)hash, (char*)hash, 512 ); -// update_groestl( &ctx.groestl, (char*)hash, 512 ); -// final_groestl( &ctx.groestl, (char*)hash ); -#endif - - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; - - DECL_JH; - JH_H; - - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; - -// asm volatile ("emms"); - - update_luffa( &ctx.luffa, (const BitSequence*)hash, 64 ); - final_luffa( &ctx.luffa, (BitSequence*)hash+64 ); - - cubehashUpdate( &ctx.cube, (const byte*) hash+64, 64 ); - cubehashDigest( &ctx.cube, (byte*)hash ); - - sph_shavite512( &ctx.shavite, hash, 64 ); - sph_shavite512_close( &ctx.shavite, hash+64 ); - - update_sd( &ctx.simd, (const BitSequence *)hash+64, 512 ); - final_sd( &ctx.simd, (BitSequence *)hash ); - -#ifdef NO_AES_NI - sph_echo512 (&ctx.echo, hash, 64 ); - sph_echo512_close(&ctx.echo, hash+64 ); -#else - update_echo ( &ctx.echo, (const BitSequence *) hash, 512 ); - final_echo( &ctx.echo, (BitSequence *) hash+64 ); -#endif - -// asm volatile ("emms"); - memcpy( state, hash+64, 32 ); -} - -int scanhash_x11( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t hash64[8] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; - const uint32_t Htarg = ptarget[7]; - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - // big 
endian encode 0..18 uint32_t, 64 bits at a time - swab32_array( endiandata, pdata, 20 ); - - for (int m=0; m < 6; m++) - if (Htarg <= htmax[m]) - { - uint32_t mask = masks[m]; - do - { - pdata[19] = ++n; - be32enc( &endiandata[19], n ); - x11_hash( hash64, &endiandata ); - if ( ( hash64[7] & mask ) == 0 ) - { - if ( fulltest( hash64, ptarget ) ) - { - *hashes_done = n - first_nonce + 1; - work_set_target_ratio( work, hash64 ); - return true; - } - } - } while ( n < max_nonce && !work_restart[thr_id].restart ); - } - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} diff --git a/algo/x11/x11evo-4way.c b/algo/x11/x11evo-4way.c deleted file mode 100644 index 8fe1512..0000000 --- a/algo/x11/x11evo-4way.c +++ /dev/null @@ -1,281 +0,0 @@ -#include "cpuminer-config.h" -#include "x11evo-gate.h" - -#if defined(X11EVO_4WAY) - -#include -#include -#include -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/echo/aes_ni/hash_api.h" -#include "algo/luffa/luffa-hash-2way.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/simd-hash-2way.h" - -typedef struct { - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_groestl groestl; - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; - luffa_2way_context luffa; - cubehashParam cube; - sph_shavite512_context shavite; - simd_2way_context simd; - hashState_echo echo; -} x11evo_4way_ctx_holder; - -static x11evo_4way_ctx_holder x11evo_4way_ctx __attribute__ ((aligned (64))); - -void init_x11evo_4way_ctx() -{ - blake512_4way_init( &x11evo_4way_ctx.blake ); - bmw512_4way_init( &x11evo_4way_ctx.bmw ); - init_groestl( &x11evo_4way_ctx.groestl, 64 ); - skein512_4way_init( &x11evo_4way_ctx.skein ); - 
jh512_4way_init( &x11evo_4way_ctx.jh ); - keccak512_4way_init( &x11evo_4way_ctx.keccak ); - luffa_2way_init( &x11evo_4way_ctx.luffa, 512 ); - cubehashInit( &x11evo_4way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x11evo_4way_ctx.shavite ); - simd_2way_init( &x11evo_4way_ctx.simd, 512 ); - init_echo( &x11evo_4way_ctx.echo, 512 ); -} - -static char hashOrder[X11EVO_FUNC_COUNT + 1] = { 0 }; -static __thread uint32_t s_ntime = UINT32_MAX; - -void x11evo_4way_hash( void *state, const void *input ) -{ - uint32_t hash0[16] __attribute__ ((aligned (64))); - uint32_t hash1[16] __attribute__ ((aligned (64))); - uint32_t hash2[16] __attribute__ ((aligned (64))); - uint32_t hash3[16] __attribute__ ((aligned (64))); - uint32_t vhash[16*4] __attribute__ ((aligned (64))); - x11evo_4way_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &x11evo_4way_ctx, sizeof(x11evo_4way_ctx) ); - - if ( s_seq == -1 ) - { - uint32_t *data = (uint32_t*) input; - const uint32_t ntime = data[17]; - evo_twisted_code( ntime, hashOrder ); - } - - int i; - int len = strlen( hashOrder ); - for ( i = 0; i < len; i++ ) - { - char elem = hashOrder[i]; - uint8_t idx; - if ( elem >= 'A' ) - idx = elem - 'A' + 10; - else - idx = elem - '0'; - -// int size = 64; - - switch ( idx ) - { - case 0: - blake512_4way( &ctx.blake, input, 80 ); - blake512_4way_close( &ctx.blake, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); - break; - case 1: - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - if ( i >= len-1 ) - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); - break; - case 2: - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (char*)hash0, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (char*)hash1, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (char*)hash2, 512 ); - 
reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (char*)hash3, 512 ); - if ( i < len-1 ) - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 64<<3 ); - break; - case 3: - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - if ( i >= len-1 ) - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); - break; - case 4: - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - if ( i >= len-1 ) - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); - break; - case 5: - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - if ( i >= len-1 ) - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); - break; - case 6: - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); - intrlv_2x128( vhash, hash0, hash1, 64<<3 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - dintrlv_2x128( hash0, hash1, vhash, 64<<3 ); - intrlv_2x128( vhash, hash2, hash3, 64<<3 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - dintrlv_2x128( hash2, hash3, vhash, 64<<3 ); - if ( i < len-1 ) - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 64<<3 ); - break; - case 7: - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, - (const byte*) hash0, 64 ); - memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, - (const byte*) hash1, 64 ); - memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, - (const byte*) hash2, 64 ); - memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, - (const byte*) hash3, 64 ); - if ( i < len-1 ) - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 64<<3 ); - break; - case 8: - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); - 
sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - if ( i < len-1 ) - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 64<<3 ); - break; - case 9: - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); - intrlv_2x128( vhash, hash0, hash1, 64<<3 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, 64<<3 ); - dintrlv_2x128( hash0, hash1, vhash, 64<<3 ); - intrlv_2x128( vhash, hash2, hash3, 64<<3 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, 64<<3 ); - dintrlv_2x128( hash2, hash3, vhash, 64<<3 ); - if ( i < len-1 ) - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 64<<3 ); - break; - case 10: - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - if ( i < len-1 ) - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 64<<3 ); - break; - } - } - - memcpy( state, hash0, 32 ); - memcpy( 
state+32, hash1, 32 ); - memcpy( state+64, hash2, 32 ); - memcpy( state+96, hash3, 32 ); -} - -//static const uint32_t diff1targ = 0x0000ffff; - -int scanhash_x11evo_4way( struct work* work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - uint32_t *noncep = vdata + 73; // 9*8 + 1 - const uint32_t Htarg = ptarget[7]; - - swab32_array( endiandata, pdata, 20 ); - - int ntime = endiandata[17]; - if ( ntime != s_ntime || s_seq == -1 ) - { - evo_twisted_code( ntime, hashOrder ); - s_ntime = ntime; - } - - uint32_t hmask = 0xFFFFFFFF; - if ( Htarg > 0 ) - { - if ( Htarg <= 0xF ) - hmask = 0xFFFFFFF0; - else if ( Htarg <= 0xFF ) - hmask = 0xFFFFFF00; - else if ( Htarg <= 0xFFF ) - hmask = 0xFFFF000; - else if ( Htarg <= 0xFFFF ) - hmask = 0xFFFF000; - } - - uint64_t *edata = (uint64_t*)endiandata; - intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); - - do - { - be32enc( noncep, n ); - be32enc( noncep+2, n+1 ); - be32enc( noncep+4, n+2 ); - be32enc( noncep+6, n+3 ); - - x11evo_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( ( ( (hash+(i<<3))[7] & hmask ) == 0 ) - && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/x11/x11evo-gate.c b/algo/x11/x11evo-gate.c deleted file mode 100644 index ccb0d71..0000000 --- a/algo/x11/x11evo-gate.c +++ /dev/null @@ -1,97 +0,0 @@ -#include "x11evo-gate.h" - -int s_seq = -1; - 
-static inline int getCurrentAlgoSeq( uint32_t current_time ) -{ - // change once per day - return (int) (current_time - X11EVO_INITIAL_DATE) / (60 * 60 * 24); -} - -// swap_vars doesn't work here -void evo_swap( uint8_t *a, uint8_t *b ) -{ - uint8_t __tmp = *a; - *a = *b; - *b = __tmp; -} - -void initPerm( uint8_t n[], uint8_t count ) -{ - int i; - for ( i = 0; i0 && n[i - 1] >= n[i]; i-- ); - tail = i; - - if ( tail > 0 ) - { - for ( j = count - 1; j>tail && n[j] <= n[tail - 1]; j-- ); - evo_swap( &n[tail - 1], &n[j] ); - } - - for ( i = tail, j = count - 1; i= 10 ) - sprintf( sptr, "%c", 'A' + (algoList[j] - 10) ); - else - sprintf( sptr, "%u", algoList[j] ); - sptr++; - } - *sptr = 0; - - //applog(LOG_DEBUG, "nextPerm %s", str); -} - -void evo_twisted_code( uint32_t ntime, char *permstr ) -{ - int seq = getCurrentAlgoSeq( ntime ); - if ( s_seq != seq ) - { - getAlgoString( permstr, seq ); - s_seq = seq; - } -} - -bool register_x11evo_algo( algo_gate_t* gate ) -{ -#if defined (X11EVO_4WAY) - init_x11evo_4way_ctx(); - gate->scanhash = (void*)&scanhash_x11evo_4way; - gate->hash = (void*)&x11evo_4way_hash; -#else - init_x11evo_ctx(); - gate->scanhash = (void*)&scanhash_x11evo; - gate->hash = (void*)&x11evo_hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - return true; -}; - diff --git a/algo/x11/x11evo-gate.h b/algo/x11/x11evo-gate.h deleted file mode 100644 index 515f1b3..0000000 --- a/algo/x11/x11evo-gate.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef X11EVO_GATE_H__ -#define X11EVO_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define X11EVO_4WAY -#endif - -#define X11EVO_INITIAL_DATE 1462060800 -#define X11EVO_FUNC_COUNT 11 - -extern int s_seq; - -bool register_x11evo_algo( algo_gate_t* gate ); - -#if defined(X11EVO_4WAY) - -void x11evo_4way_hash( void *state, const void *input ); - -int scanhash_x11evo_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr 
); - -void init_x11evo_4way_ctx(); - -#endif - -void x11evo_hash( void *state, const void *input ); - -int scanhash_x11evo( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void init_x11evo_ctx(); - -void evo_twisted_code( uint32_t ntime, char *permstr ); - -#endif - diff --git a/algo/x11/x11evo.c b/algo/x11/x11evo.c deleted file mode 100644 index d58f124..0000000 --- a/algo/x11/x11evo.c +++ /dev/null @@ -1,213 +0,0 @@ -#include "cpuminer-config.h" -#include "x11evo-gate.h" - -#include -#include -#include - -#include "algo/blake/sph_blake.h" -#include "algo/bmw/sph_bmw.h" -#include "algo/groestl/sph_groestl.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" -#include "algo/luffa/sph_luffa.h" -#include "algo/cubehash/sph_cubehash.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/simd/sph_simd.h" -#include "algo/echo/sph_echo.h" - -#ifndef NO_AES_NI - #include "algo/groestl/aes_ni/hash-groestl.h" - #include "algo/echo/aes_ni/hash_api.h" -#endif - -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/nist.h" - -typedef struct { -#ifdef NO_AES_NI - sph_groestl512_context groestl; - sph_echo512_context echo; -#else - hashState_echo echo; - hashState_groestl groestl; -#endif - hashState_luffa luffa; - cubehashParam cube; - hashState_sd simd; - sph_blake512_context blake; - sph_bmw512_context bmw; - sph_skein512_context skein; - sph_jh512_context jh; - sph_keccak512_context keccak; - sph_shavite512_context shavite; -} x11evo_ctx_holder; - -static x11evo_ctx_holder x11evo_ctx __attribute__ ((aligned (64))); - -void init_x11evo_ctx() -{ -#ifdef NO_AES_NI - sph_groestl512_init( &x11evo_ctx.groestl ); - sph_echo512_init( &x11evo_ctx.echo ); -#else - init_echo( &x11evo_ctx.echo, 512 ); - init_groestl( &x11evo_ctx.groestl, 64 ); -#endif - init_luffa( &x11evo_ctx.luffa, 512 ); - cubehashInit( &x11evo_ctx.cube, 512, 16, 32 ); - 
init_sd( &x11evo_ctx.simd, 512 ); - sph_blake512_init( &x11evo_ctx.blake ); - sph_bmw512_init( &x11evo_ctx.bmw ); - sph_skein512_init( &x11evo_ctx.skein ); - sph_jh512_init( &x11evo_ctx.jh ); - sph_keccak512_init( &x11evo_ctx.keccak ); - sph_shavite512_init( &x11evo_ctx.shavite ); -} - -static char hashOrder[X11EVO_FUNC_COUNT + 1] = { 0 }; -static __thread uint32_t s_ntime = UINT32_MAX; - -void x11evo_hash( void *state, const void *input ) -{ - uint32_t hash[16] __attribute__ ((aligned (64))); - x11evo_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &x11evo_ctx, sizeof(x11evo_ctx) ); - - if ( s_seq == -1 ) - { - uint32_t *data = (uint32_t*) input; - const uint32_t ntime = data[17]; - evo_twisted_code(ntime, hashOrder); - } - - int i; - for ( i = 0; i < strlen(hashOrder); i++ ) - { - char elem = hashOrder[i]; - uint8_t idx; - if (elem >= 'A') - idx = elem - 'A' + 10; - else - idx = elem - '0'; - - int size = 64; - - switch (idx) - { - case 0: - sph_blake512( &ctx.blake, (char*)input, 80 ); - sph_blake512_close( &ctx.blake, (char*)hash ); - break; - case 1: - sph_bmw512( &ctx.bmw, (char*)hash, size ); - sph_bmw512_close( &ctx.bmw, (char*)hash ); - break; - case 2: -#ifdef NO_AES_NI - sph_groestl512( &ctx.groestl, (char*)hash, size ); - sph_groestl512_close( &ctx.groestl, (char*)hash ); -#else - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); -#endif - break; - case 3: - sph_skein512( &ctx.skein, (char*)hash, size ); - sph_skein512_close( &ctx.skein, (char*)hash ); - break; - case 4: - sph_jh512( &ctx.jh, (char*)hash, size ); - sph_jh512_close( &ctx.jh, (char*)hash ); - break; - case 5: - sph_keccak512( &ctx.keccak, (char*)hash, size ); - sph_keccak512_close( &ctx.keccak, (char*)hash ); - break; - case 6: - update_and_final_luffa( &ctx.luffa, (char*)hash, - (const char*)hash, 64 ); - break; - case 7: - cubehashUpdateDigest( &ctx.cube, (char*)hash, - (const char*)hash, 64 ); - break; - case 8: - sph_shavite512( 
&ctx.shavite, (char*)hash, size ); - sph_shavite512_close( &ctx.shavite, (char*)hash ); - break; - case 9: - update_final_sd( &ctx.simd, (char*)hash, (const char*)hash, 512 ); - break; - case 10: -#ifdef NO_AES_NI - sph_echo512( &ctx.echo, (char*)hash, size ); - sph_echo512_close( &ctx.echo, (char*)hash ); -#else - update_final_echo( &ctx.echo, (char*)hash, - (const char*)hash, 512 ); -#endif - break; - } - } - memcpy( state, hash, 32 ); -} - -//static const uint32_t diff1targ = 0x0000ffff; - -int scanhash_x11evo( struct work* work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t hash64[8] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - - swab32_array( endiandata, pdata, 20 ); - - int ntime = endiandata[17]; - if ( ntime != s_ntime || s_seq == -1 ) - { - evo_twisted_code( ntime, hashOrder ); - s_ntime = ntime; - } - - uint32_t hmask = 0xFFFFFFFF; - if ( Htarg > 0 ) - { - if ( Htarg <= 0xF ) - hmask = 0xFFFFFFF0; - else if ( Htarg <= 0xFF ) - hmask = 0xFFFFFF00; - else if ( Htarg <= 0xFFF ) - hmask = 0xFFFF000; - else if ( Htarg <= 0xFFFF ) - hmask = 0xFFFF000; - } - - do - { - pdata[19] = ++n; - be32enc( &endiandata[19], n ); - x11evo_hash( hash64, endiandata ); - if ( ( hash64[7] & hmask ) == 0 ) - { - if ( fulltest( hash64, ptarget ) ) - { - *hashes_done = n - first_nonce + 1; - work_set_target_ratio( work, hash64 ); - return true; - } - } - } while ( n < max_nonce && !work_restart[thr_id].restart ); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} diff --git a/algo/x11/x11gost-4way.c b/algo/x11/x11gost-4way.c deleted file mode 100644 index 90b1ebd..0000000 --- a/algo/x11/x11gost-4way.c +++ /dev/null @@ -1,216 +0,0 @@ -#include 
"cpuminer-config.h" -#include "x11gost-gate.h" - -#if defined (X11GOST_4WAY) - -#include -#include - -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/gost/sph_gost.h" -#include "algo/luffa/luffa-hash-2way.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/simd/simd-hash-2way.h" -#include "algo/echo/aes_ni/hash_api.h" - -typedef struct { - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_groestl groestl; - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; - sph_gost512_context gost; - luffa_2way_context luffa; - cubehashParam cube; - sph_shavite512_context shavite; - simd_2way_context simd; - hashState_echo echo; -} x11gost_4way_ctx_holder; - -x11gost_4way_ctx_holder x11gost_4way_ctx; - -void init_x11gost_4way_ctx() -{ - blake512_4way_init( &x11gost_4way_ctx.blake ); - bmw512_4way_init( &x11gost_4way_ctx.bmw ); - init_groestl( &x11gost_4way_ctx.groestl, 64 ); - skein512_4way_init( &x11gost_4way_ctx.skein ); - jh512_4way_init( &x11gost_4way_ctx.jh ); - keccak512_4way_init( &x11gost_4way_ctx.keccak ); - sph_gost512_init( &x11gost_4way_ctx.gost ); - luffa_2way_init( &x11gost_4way_ctx.luffa, 512 ); - cubehashInit( &x11gost_4way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x11gost_4way_ctx.shavite ); - simd_2way_init( &x11gost_4way_ctx.simd, 512 ); - init_echo( &x11gost_4way_ctx.echo, 512 ); -} - -void x11gost_4way_hash( void *state, const void *input ) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - - x11gost_4way_ctx_holder ctx; - memcpy( 
&ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) ); - - blake512_4way( &ctx.blake, input, 80 ); - blake512_4way_close( &ctx.blake, vhash ); - - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - // Serial - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl, - sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl, - sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl, - sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - - // 4way - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - - // Serial - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - sph_gost512( &ctx.gost, hash0, 64 ); - sph_gost512_close( &ctx.gost, hash0 ); - memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) ); - sph_gost512( &ctx.gost, hash1, 64 ); - sph_gost512_close( &ctx.gost, hash1 ); - memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) ); - sph_gost512( &ctx.gost, hash2, 64 ); - sph_gost512_close( &ctx.gost, hash2 ); - memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) ); - sph_gost512( &ctx.gost, hash3, 64 ); - sph_gost512_close( &ctx.gost, hash3 ); - - intrlv_2x128( vhash, hash0, hash1, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, hash2, hash3, 512 ); - 
luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - dintrlv_2x128( hash2, hash3, vhash, 512 ); - - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); - memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); - memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 ); - memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); - - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - - intrlv_2x128( vhash, hash0, hash1, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, hash2, hash3, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_2x128( hash2, hash3, vhash, 512 ); - - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 
); - memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - - memcpy( state, hash0, 32 ); - memcpy( state+32, hash1, 32 ); - memcpy( state+64, hash2, 32 ); - memcpy( state+96, hash3, 32 ); -} - -int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - __m256i *noncev = (__m256i*)vdata + 9; // aligned - const uint32_t Htarg = ptarget[7]; - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - - for (int m=0; m < 6; m++) - if (Htarg <= htmax[m]) - { - uint32_t mask = masks[m]; - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - x11gost_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( ( ( (hash+(i<<3))[7] & mask ) == 0 ) - && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/x11/x11gost-gate.c b/algo/x11/x11gost-gate.c deleted file mode 100644 index 0d53551..0000000 --- a/algo/x11/x11gost-gate.c +++ /dev/null @@ -1,18 +0,0 @@ -#include "x11gost-gate.h" - -bool register_x11gost_algo( algo_gate_t* gate ) -{ -#if defined (X11GOST_4WAY) - init_x11gost_4way_ctx(); - gate->scanhash = 
(void*)&scanhash_x11gost_4way; - gate->hash = (void*)&x11gost_4way_hash; -#else - init_x11gost_ctx(); - gate->scanhash = (void*)&scanhash_x11gost; - gate->hash = (void*)&x11gost_hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -}; - diff --git a/algo/x11/x11gost-gate.h b/algo/x11/x11gost-gate.h deleted file mode 100644 index e090104..0000000 --- a/algo/x11/x11gost-gate.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef X11GOST_GATE_H__ -#define X11GOST_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define X11GOST_4WAY -#endif - -bool register_x11gost_algo( algo_gate_t* gate ); - -#if defined(X11GOST_4WAY) - -void x11gost_4way_hash( void *state, const void *input ); - -int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void init_x11gost_4way_ctx(); - -#endif - -void x11gost_hash( void *state, const void *input ); - -int scanhash_x11gost( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void init_x11gost_ctx(); - -#endif - diff --git a/algo/x11/x11gost.c b/algo/x11/x11gost.c deleted file mode 100644 index dd6964d..0000000 --- a/algo/x11/x11gost.c +++ /dev/null @@ -1,176 +0,0 @@ -#include "x11gost-gate.h" - -#include -#include -#include -#include - -#include "algo/groestl/sph_groestl.h" -#include "algo/gost/sph_gost.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/echo/sph_echo.h" - -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" - -#ifndef NO_AES_NI - #include "algo/groestl/aes_ni/hash-groestl.h" - #include "algo/echo/aes_ni/hash_api.h" -#endif - -typedef struct { - 
sph_gost512_context gost; - sph_shavite512_context shavite; - hashState_luffa luffa; - cubehashParam cube; - hashState_sd simd; -#ifdef NO_AES_NI - sph_groestl512_context groestl; - sph_echo512_context echo; -#else - hashState_echo echo; - hashState_groestl groestl; -#endif -} x11gost_ctx_holder; - -x11gost_ctx_holder x11gost_ctx; - -void init_x11gost_ctx() -{ - sph_gost512_init( &x11gost_ctx.gost ); - sph_shavite512_init( &x11gost_ctx.shavite ); - init_luffa( &x11gost_ctx.luffa, 512 ); - cubehashInit( &x11gost_ctx.cube, 512, 16, 32 ); - init_sd( &x11gost_ctx.simd, 512 ); -#ifdef NO_AES_NI - sph_groestl512_init( &x11gost_ctx.groestl ); - sph_echo512_init( &x11gost_ctx.echo ); -#else - init_echo( &x11gost_ctx.echo, 512 ); - init_groestl( &x11gost_ctx.groestl, 64 ); -#endif - -} - -void x11gost_hash(void *output, const void *input) -{ - unsigned char hash[128] __attribute__ ((aligned (64))); - #define hashA hash - #define hashB hash+64 - - size_t hashptr; - unsigned char hashbuf[128]; - sph_u64 hashctA; - sph_u64 hashctB; - - x11gost_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &x11gost_ctx, sizeof(x11gost_ctx) ); - - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; - - DECL_BMW; - BMW_I; - BMW_U; - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - BMW_C; - #undef M - #undef H - #undef dH - -#ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); -#else - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); -#endif - - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; - - DECL_JH; - JH_H; - - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; - - sph_gost512(&ctx.gost, hashA, 64); - sph_gost512_close(&ctx.gost, hashB); - - update_and_final_luffa( &ctx.luffa, (BitSequence*)hashA, - (const BitSequence*)hashB, 64 ); - - cubehashUpdateDigest( &ctx.cube, (byte*) hashB, - (const byte*)hashA, 64 ); - - sph_shavite512(&ctx.shavite, hashB, 64); - 
sph_shavite512_close(&ctx.shavite, hashA); - - update_final_sd( &ctx.simd, (BitSequence *)hashB, - (const BitSequence *)hashA, 512 ); - -#ifdef NO_AES_NI - sph_echo512(&ctx.echo, hashB, 64); - sph_echo512_close(&ctx.echo, hashA); -#else - update_final_echo ( &ctx.echo, (BitSequence *)hashA, - (const BitSequence *)hashB, 512 ); -#endif - - memcpy(output, hashA, 32); -} - -int scanhash_x11gost( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - - const uint32_t first_nonce = pdata[19]; - uint32_t _ALIGN(64) endiandata[20]; - int thr_id = mythr->id; // thr_id arg is deprecated - uint32_t nonce = first_nonce; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0cff; - - for (int k = 0; k < 19; k++) - be32enc(&endiandata[k], pdata[k]); - - const uint32_t Htarg = ptarget[7]; - do { - uint32_t hash[8]; - be32enc(&endiandata[19], nonce); - x11gost_hash(hash, endiandata); - - if (hash[7] <= Htarg && fulltest(hash, ptarget)) { - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce; - work_set_target_ratio( work, hash ); - return 1; - } - nonce++; - - } while (nonce < max_nonce && !(*restart)); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} - diff --git a/algo/x12/x12-4way.c b/algo/x12/x12-4way.c deleted file mode 100644 index 80fae6d..0000000 --- a/algo/x12/x12-4way.c +++ /dev/null @@ -1,222 +0,0 @@ -#include "x12-gate.h" - -#if defined(X12_4WAY) - -#include -#include -#include -#include -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/luffa-hash-2way.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/shavite/sph_shavite.h" -#include 
"algo/simd/simd-hash-2way.h" -#include "algo/echo/aes_ni/hash_api.h" -#include "algo/hamsi/hamsi-hash-4way.h" -//#include "algo/fugue/sph_fugue.h" - -typedef struct { - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_groestl groestl; - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; - luffa_2way_context luffa; - cubehashParam cube; - sph_shavite512_context shavite; - simd_2way_context simd; - hashState_echo echo; - hamsi512_4way_context hamsi; -} x12_4way_ctx_holder; - -x12_4way_ctx_holder x12_4way_ctx __attribute__ ((aligned (64))); - -void init_x12_4way_ctx() -{ - blake512_4way_init( &x12_4way_ctx.blake ); - bmw512_4way_init( &x12_4way_ctx.bmw ); - init_groestl( &x12_4way_ctx.groestl, 64 ); - skein512_4way_init( &x12_4way_ctx.skein ); - jh512_4way_init( &x12_4way_ctx.jh ); - keccak512_4way_init( &x12_4way_ctx.keccak ); - luffa_2way_init( &x12_4way_ctx.luffa, 512 ); - cubehashInit( &x12_4way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x12_4way_ctx.shavite ); - simd_2way_init( &x12_4way_ctx.simd, 512 ); - init_echo( &x12_4way_ctx.echo, 512 ); - hamsi512_4way_init( &x12_4way_ctx.hamsi ); -}; - -void x12_4way_hash( void *state, const void *input ) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - x12_4way_ctx_holder ctx; - memcpy( &ctx, &x12_4way_ctx, sizeof(x12_4way_ctx) ); - - // 1 Blake - blake512_4way( &ctx.blake, input, 80 ); - blake512_4way_close( &ctx.blake, vhash ); - - // 2 Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - // Serial - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // 3 Groestl - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - memcpy( &ctx.groestl, &x12_4way_ctx.groestl, 
sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - - // Parallel 4way 64 bit - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - // 4 Skein - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - - // 5 JH - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - // 6 Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - - // Serial - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // 7 Luffa - intrlv_2x128( vhash, hash0, hash1, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, hash2, hash3, 512 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - intrlv_2x128( hash2, hash3, vhash, 512 ); - - // 8 Cubehash - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); - - // 9 Shavite - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - memcpy( &ctx.shavite, &x12_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - memcpy( &ctx.shavite, &x12_4way_ctx.shavite, - 
sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - memcpy( &ctx.shavite, &x12_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - - // 10 Simd - intrlv_2x128( vhash, hash0, hash1, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, hash2, hash3, 512 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_2x128( hash2, hash3, vhash, 512 ); - - // 11 Echo - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - - // 12 Hamsi parallel 4way 32 bit - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - - dintrlv_4x64( state, state+32, state+64, state+96, vhash, 256 ); -} - -int scanhash_x12_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - uint32_t *noncep = vdata + 73; // 9*8 + 1 - int thr_id = mythr->id; // thr_id arg is deprecated - 
const uint32_t Htarg = ptarget[7]; - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; - - // big endian encode 0..18 uint32_t, 64 bits at a time - swab32_array( endiandata, pdata, 20 ); - - uint64_t *edata = (uint64_t*)endiandata; - intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); - - for ( int m=0; m < 6; m++ ) - if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - be32enc( noncep, n ); - be32enc( noncep+2, n+1 ); - be32enc( noncep+4, n+2 ); - be32enc( noncep+6, n+3 ); - - x12_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( ( (hash+(i<<3))[7] & mask ) == 0 ) - if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/x12/x12-gate.c b/algo/x12/x12-gate.c deleted file mode 100644 index 5220c10..0000000 --- a/algo/x12/x12-gate.c +++ /dev/null @@ -1,18 +0,0 @@ -#include "x12-gate.h" - -bool register_x12_algo( algo_gate_t* gate ) -{ -#if defined (X12_4WAY) - init_x12_4way_ctx(); - gate->scanhash = (void*)&scanhash_x12_4way; - gate->hash = (void*)&x12_4way_hash; -#else - init_x12_ctx(); - gate->scanhash = (void*)&scanhash_x12; - gate->hash = (void*)&x12hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -}; - diff --git a/algo/x12/x12-gate.h b/algo/x12/x12-gate.h deleted file mode 100644 index e26956e..0000000 --- a/algo/x12/x12-gate.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef X12_GATE_H__ -#define X12_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define X12_4WAY -#endif - -bool register_x12_algo( algo_gate_t* gate ); - 
-#if defined(X12_4WAY) - -void x12_4way_hash( void *state, const void *input ); - -int scanhash_x12_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void init_x12_4way_ctx(); - -#endif - -void x12hash( void *state, const void *input ); - -int scanhash_x12( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void init_x12_ctx(); - -#endif - diff --git a/algo/x12/x12.c b/algo/x12/x12.c deleted file mode 100644 index 87a4fa6..0000000 --- a/algo/x12/x12.c +++ /dev/null @@ -1,243 +0,0 @@ -#include "x12-gate.h" - -#include -#include -#include -#include - -#include "algo/groestl/sph_groestl.h" -#include "algo/blake/sph_blake.h" -#include "algo/bmw/sph_bmw.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/luffa/sph_luffa.h" -#include "algo/cubehash/sph_cubehash.h" -#include "algo/simd/sph_simd.h" -#include "algo/echo/sph_echo.h" -#include "algo/hamsi/sph_hamsi.h" -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" -#if defined(__AES__) - #include "algo/groestl/aes_ni/hash-groestl.h" - #include "algo/echo/aes_ni/hash_api.h" -#endif - -typedef struct { -#if defined(__AES__) - hashState_groestl groestl; - hashState_echo echo; -#else - sph_groestl512_context groestl; - sph_echo512_context echo; -#endif - hashState_luffa luffa; - cubehashParam cubehash; - sph_shavite512_context shavite; - hashState_sd simd; - sph_hamsi512_context hamsi; -} x12_ctx_holder; - -x12_ctx_holder x12_ctx; - -void init_x12_ctx() -{ -#if defined(__AES__) - init_echo( &x12_ctx.echo, 512 ); - init_groestl (&x12_ctx.groestl, 64 ); -#else - 
sph_groestl512_init(&x12_ctx.groestl); - sph_echo512_init(&x12_ctx.echo); -#endif - init_luffa( &x12_ctx.luffa, 512 ); - cubehashInit( &x12_ctx.cubehash, 512, 16, 32 ); - sph_shavite512_init( &x12_ctx.shavite ); - init_sd( &x12_ctx.simd, 512 ); - sph_hamsi512_init( &x12_ctx.hamsi ); -}; - -void x12hash(void *output, const void *input) -{ - unsigned char hash[128] __attribute__ ((aligned (32))); - #define hashB hash+64 - - x12_ctx_holder ctx; - memcpy( &ctx, &x12_ctx, sizeof(x12_ctx) ); - - // X11 algos - - unsigned char hashbuf[128]; - size_t hashptr; - sph_u64 hashctA; - sph_u64 hashctB; - - //---blake1--- - - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; - - //---bmw2--- - - DECL_BMW; - BMW_I; - BMW_U; - - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - - BMW_C; - - #undef M - #undef H - #undef dH - - //---groetl---- - -#if defined(__AES__) - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); -#else - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); -#endif - - //---skein4--- - - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; - - //---jh5------ - - DECL_JH; - JH_H; - - //---keccak6--- - - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; - - //--- luffa7 - update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, - (const BitSequence*)hash, 64 ); - - // 8 Cube - cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, - (const byte*)hashB, 64 ); - - // 9 Shavite - sph_shavite512( &ctx.shavite, hash, 64); - sph_shavite512_close( &ctx.shavite, hashB); - - // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hashB, 512 ); - - //11---echo--- - -#if defined(__AES__) - update_final_echo ( &ctx.echo, (BitSequence *)hashB, - (const BitSequence *)hash, 512 ); -#else - sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hashB); -#endif - - // 12 Hamsi - sph_hamsi512(&ctx.hamsi, hashB, 64); - sph_hamsi512_close(&ctx.hamsi, hash); - - asm volatile 
("emms"); - memcpy(output, hashB, 32); -} - -int scanhash_x12( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t hash64[8] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - // we need bigendian data... - swab32_array( endiandata, pdata, 20 ); - -#ifdef DEBUG_ALGO - printf("[%d] Htarg=%X\n", thr_id, Htarg); -#endif - for (int m=0; m < 6; m++) { - if (Htarg <= htmax[m]) { - uint32_t mask = masks[m]; - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - x12hash(hash64, endiandata); -#ifndef DEBUG_ALGO - if (!(hash64[7] & mask)) - { - if ( fulltest(hash64, ptarget) ) - { - *hashes_done = n - first_nonce + 1; - return true; - } -// else -// { -// applog(LOG_INFO, "Result does not validate on CPU!"); -// } - } - -#else - if (!(n % 0x1000) && !thr_id) printf("."); - if (!(hash64[7] & mask)) { - printf("[%d]",thr_id); - if (fulltest(hash64, ptarget)) { - work_set_target_ratio( work, hash ); - *hashes_done = n - first_nonce + 1; - return true; - } - } -#endif - } while (n < max_nonce && !work_restart[thr_id].restart); - // see blake.c if else to understand the loop on htmax => mask - break; - } - } - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} diff --git a/algo/x13/drop.c b/algo/x13/drop.c deleted file mode 100644 index 3990c3f..0000000 --- a/algo/x13/drop.c +++ /dev/null @@ -1,262 +0,0 @@ -/** - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2015 kernels10, tpruvot - * - * Permission is hereby 
granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @file drop.c - * @author kernels10 - * @author tpruvot - */ - -#define POK_BOOL_MASK 0x00008000 -#define POK_DATA_MASK 0xFFFF0000 - -#include "algo-gate-api.h" - -#include - -#include "algo/blake/sph_blake.h" -#include "algo/groestl/sph_groestl.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" -#include "algo/cubehash/sph_cubehash.h" -#include "algo/echo/sph_echo.h" -#include "algo/fugue//sph_fugue.h" -#include "algo/luffa/sph_luffa.h" -#include "algo/simd/sph_simd.h" -#include "algo/shavite/sph_shavite.h" - -static void shiftr_lp(const uint32_t *input, uint32_t *output, unsigned int shift) -{ - if(!shift) { - memcpy(output, input, 64); - return; - } - - memset(output, 0, 64); - for(int i = 0; i < 15; ++i) { - output[i + 1] |= (input[i] >> (32 - shift)); - output[i] |= (input[i] << shift); - } - - output[15] |= (input[15] << shift); - return; -} - -static void switchHash(const void *input, void *output, int id) -{ -/* - sph_keccak512_context ctx_keccak; - sph_blake512_context ctx_blake; - sph_groestl512_context ctx_groestl; - sph_skein512_context ctx_skein; - sph_luffa512_context ctx_luffa; - sph_echo512_context ctx_echo; - sph_simd512_context ctx_simd; - sph_cubehash512_context ctx_cubehash; - sph_fugue512_context ctx_fugue; - sph_shavite512_context ctx_shavite; - - switch(id) { - case 0: - sph_keccak512_init(&ctx_keccak); sph_keccak512(&ctx_keccak, input, 64); sph_keccak512_close(&ctx_keccak, output); - break; - case 1: - sph_blake512_init(&ctx_blake); sph_blake512(&ctx_blake, input, 64); sph_blake512_close(&ctx_blake, output); - break; - case 2: - sph_groestl512_init(&ctx_groestl); sph_groestl512(&ctx_groestl, input, 64); sph_groestl512_close(&ctx_groestl, output); - break; - case 3: - sph_skein512_init(&ctx_skein); sph_skein512(&ctx_skein, input, 64); sph_skein512_close(&ctx_skein, output); - break; - case 4: - 
sph_luffa512_init(&ctx_luffa); sph_luffa512(&ctx_luffa, input, 64); sph_luffa512_close(&ctx_luffa, output); - break; - case 5: - sph_echo512_init(&ctx_echo); sph_echo512(&ctx_echo, input, 64); sph_echo512_close(&ctx_echo, output); - break; - case 6: - sph_shavite512_init(&ctx_shavite); sph_shavite512(&ctx_shavite, input, 64); sph_shavite512_close(&ctx_shavite, output); - break; - case 7: - sph_fugue512_init(&ctx_fugue); sph_fugue512(&ctx_fugue, input, 64); sph_fugue512_close(&ctx_fugue, output); - break; - case 8: - sph_simd512_init(&ctx_simd); sph_simd512(&ctx_simd, input, 64); sph_simd512_close(&ctx_simd, output); - break; - case 9: - sph_cubehash512_init(&ctx_cubehash); sph_cubehash512(&ctx_cubehash, input, 64); sph_cubehash512_close(&ctx_cubehash, output); - break; - default: - break; - } -*/ -} - -void droplp_hash(void *state, const void *input) -{ - uint32_t _ALIGN(64) hash[2][16]; - sph_jh512_context ctx_jh; - uint32_t *hashA = hash[0]; - uint32_t *hashB = hash[1]; - - sph_jh512_init(&ctx_jh); - sph_jh512(&ctx_jh, input, 80); - sph_jh512_close(&ctx_jh, (void*)(hashA)); - - unsigned int startPosition = hashA[0] % 31; - unsigned int i = 0; - int j = 0; - int start = 0; - - for (i = startPosition; i < 31; i+=9) { - start = i % 10; - for (j = start; j < 10; j++) { - shiftr_lp(hashA, hashB, (i & 3)); - switchHash((const void*)hashB, (void*)hashA, j); - } - for (j = 0; j < start; j++) { - shiftr_lp(hashA, hashB, (i & 3)); - switchHash((const void*)hashB, (void*)hashA, j); - } - } - for (i = 0; i < startPosition; i += 9) { - start = i % 10; - for (j = start; j < 10; j++) { - shiftr_lp(hashA, hashB, (i & 3)); - switchHash((const void*)hashB, (void*)hashA, j); - } - for (j = 0; j < start; j++) { - shiftr_lp(hashA, hashB, (i & 3)); - switchHash((const void*)hashB, (void*)hashA, j); - } - } - - memcpy(state, hashA, 32); -} - -static void droplp_hash_pok(void *output, uint32_t *pdata, const uint32_t version) -{ - uint32_t _ALIGN(64) hash[8]; - uint32_t pok; - - pdata[0] 
= version; - droplp_hash(hash, pdata); - - // fill PoK - pok = version | (hash[0] & POK_DATA_MASK); - if (pdata[0] != pok) { - pdata[0] = pok; - droplp_hash(hash, pdata); - } - memcpy(output, hash, 32); -} - -int scanhash_drop( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(64) hash[16]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t version = pdata[0] & (~POK_DATA_MASK); - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - #define tmpdata pdata - - if (opt_benchmark) - ptarget[7] = 0x07ff; - - const uint32_t htarg = ptarget[7]; - - do { - tmpdata[19] = nonce; - droplp_hash_pok(hash, tmpdata, version); - - if (hash[7] <= htarg && fulltest(hash, ptarget)) { - pdata[0] = tmpdata[0]; - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - if (opt_debug) - applog(LOG_INFO, "found nonce %x", nonce); - return 1; - } - nonce++; - - } while (nonce < max_nonce && !work_restart[thr_id].restart); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} - -void drop_get_new_work( struct work* work, struct work* g_work, int thr_id, - uint32_t* end_nonce_ptr, bool clean_job ) -{ - // ignore POK in first word -// const int nonce_i = 19; - const int wkcmp_sz = 72; // (19-1) * sizeof(uint32_t) - uint32_t *nonceptr = algo_gate.get_nonceptr( work->data ); - if ( memcmp( &work->data[1], &g_work->data[1], wkcmp_sz ) - && ( clean_job || ( *nonceptr >= *end_nonce_ptr ) ) ) - { - work_free( work ); - work_copy( work, g_work ); - *nonceptr = ( 0xffffffffU / opt_n_threads ) * thr_id; - if ( opt_randomize ) - *nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads; - *end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20; - } - else - ++(*nonceptr); -} - -void drop_display_pok( struct work* work ) -{ - if ( work->data[0] & 0x00008000 ) - applog(LOG_BLUE, "POK 
received: %08xx", work->data[0] ); -} - -int drop_get_work_data_size() { return 80; } - -// Need to fix POK offset problems like zr5 -bool register_drop_algo( algo_gate_t* gate ) -{ - algo_not_tested(); - gate->scanhash = (void*)&scanhash_drop; - gate->hash = (void*)&droplp_hash_pok; - gate->get_new_work = (void*)&drop_get_new_work; - gate->set_target = (void*)&scrypt_set_target; - gate->build_stratum_request = (void*)&std_be_build_stratum_request; - gate->work_decode = (void*)&std_be_work_decode; - gate->submit_getwork_result = (void*)&std_be_submit_getwork_result; - gate->set_work_data_endian = (void*)&set_work_data_big_endian; - gate->decode_extra_data = (void*)&drop_display_pok; - gate->get_work_data_size = (void*)&drop_get_work_data_size; - gate->work_cmp_size = 72; - return true; -}; - diff --git a/algo/x13/phi1612-4way.c b/algo/x13/phi1612-4way.c deleted file mode 100644 index a62b6f3..0000000 --- a/algo/x13/phi1612-4way.c +++ /dev/null @@ -1,149 +0,0 @@ -#include "phi1612-gate.h" - -#if defined(PHI1612_4WAY) - -#include -#include -#include -#include -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/gost/sph_gost.h" -#include "algo/echo/aes_ni/hash_api.h" - -typedef struct { - skein512_4way_context skein; - jh512_4way_context jh; - cubehashParam cube; - sph_fugue512_context fugue; - sph_gost512_context gost; - hashState_echo echo; -} phi1612_4way_ctx_holder; - -phi1612_4way_ctx_holder phi1612_4way_ctx __attribute__ ((aligned (64))); - -void init_phi1612_4way_ctx() -{ - skein512_4way_init( &phi1612_4way_ctx.skein ); - jh512_4way_init( &phi1612_4way_ctx.jh ); - cubehashInit( &phi1612_4way_ctx.cube, 512, 16, 32 ); - sph_fugue512_init( &phi1612_4way_ctx.fugue ); - sph_gost512_init( &phi1612_4way_ctx.gost ); - init_echo( &phi1612_4way_ctx.echo, 512 ); -}; - -void phi1612_4way_hash( void *state, const void *input ) -{ - uint64_t hash0[8] 
__attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - phi1612_4way_ctx_holder ctx; - memcpy( &ctx, &phi1612_4way_ctx, sizeof(phi1612_4way_ctx) ); - - // Skein parallel 4way - skein512_4way( &ctx.skein, input, 80 ); - skein512_4way_close( &ctx.skein, vhash ); - - // JH - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - // Serial to the end - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // Cubehash - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); - memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); - memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 ); - memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); - - // Fugue - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - - // Gost - sph_gost512( &ctx.gost, hash0, 64 ); - sph_gost512_close( &ctx.gost, hash0 ); - sph_gost512_init( &ctx.gost ); - sph_gost512( &ctx.gost, hash1, 64 ); - sph_gost512_close( &ctx.gost, hash1 ); - sph_gost512_init( &ctx.gost ); - sph_gost512( &ctx.gost, hash2, 64 ); - sph_gost512_close( &ctx.gost, hash2 ); - sph_gost512_init( &ctx.gost ); - sph_gost512( &ctx.gost, hash3, 64 ); - 
sph_gost512_close( &ctx.gost, hash3 ); - - // Echo - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - - memcpy( state, hash0, 32 ); - memcpy( state+32, hash1, 32 ); - memcpy( state+64, hash2, 32 ); - memcpy( state+96, hash3, 32 ); -} - -int scanhash_phi1612_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - - if ( opt_benchmark ) - ( (uint32_t*)ptarget )[7] = 0x0cff; - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - - do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - phi1612_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( (hash+(i<<3))[7] <= Htarg ) - if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/x13/phi1612-gate.c b/algo/x13/phi1612-gate.c deleted file mode 100644 index 9a9d871..0000000 --- a/algo/x13/phi1612-gate.c +++ /dev/null @@ -1,18 +0,0 @@ -#include "phi1612-gate.h" - 
-bool register_phi1612_algo( algo_gate_t* gate ) -{ -#if defined(PHI1612_4WAY) - init_phi1612_4way_ctx(); - gate->scanhash = (void*)&scanhash_phi1612_4way; - gate->hash = (void*)&phi1612_4way_hash; -#else - init_phi1612_ctx(); - gate->scanhash = (void*)&scanhash_phi1612; - gate->hash = (void*)&phi1612_hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -}; - diff --git a/algo/x13/phi1612-gate.h b/algo/x13/phi1612-gate.h deleted file mode 100644 index a1f6518..0000000 --- a/algo/x13/phi1612-gate.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef PHI1612_GATE_H__ -#define PHI1612_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define PHI1612_4WAY -#endif - -bool register_phi1612_algo( algo_gate_t* gate ); - -#if defined(PHI1612_4WAY) - -void phi1612_4way_hash( void *state, const void *input ); - -int scanhash_phi1612_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void init_phi1612_4way_ctx(); - -#endif - -void phi1612_hash( void *state, const void *input ); - -int scanhash_phi1612( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void init_phi1612_ctx(); - -#endif - diff --git a/algo/x13/phi1612.c b/algo/x13/phi1612.c deleted file mode 100644 index 1ea2032..0000000 --- a/algo/x13/phi1612.c +++ /dev/null @@ -1,132 +0,0 @@ -#include "phi1612-gate.h" - -#include -#include -#include -#include - -#include "algo/gost/sph_gost.h" -#include "algo/echo/sph_echo.h" -#include "algo/fugue//sph_fugue.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sph_jh.h" - -#ifndef NO_AES_NI - #include "algo/echo/aes_ni/hash_api.h" -#endif - -typedef struct { - sph_skein512_context skein; - sph_jh512_context jh; - cubehashParam cube; - sph_fugue512_context fugue; - sph_gost512_context gost; -#ifdef NO_AES_NI - sph_echo512_context 
echo; -#else - hashState_echo echo; -#endif -} phi_ctx_holder; - -phi_ctx_holder phi_ctx; -static __thread sph_skein512_context phi_skein_mid - __attribute__ ((aligned (64))); - -void init_phi1612_ctx() -{ - sph_skein512_init( &phi_ctx.skein ); - sph_jh512_init( &phi_ctx.jh ); - cubehashInit( &phi_ctx.cube, 512, 16, 32 ); - sph_fugue512_init( &phi_ctx.fugue ); - sph_gost512_init( &phi_ctx.gost ); -#ifdef NO_AES_NI - sph_echo512_init( &phi_ctx.echo ); -#else - init_echo( &phi_ctx.echo, 512 ); -#endif -} - -void phi_skein_midstate( const void* input ) -{ - memcpy( &phi_skein_mid, &phi_ctx.skein, sizeof phi_skein_mid ); - sph_skein512( &phi_skein_mid, input, 64 ); -} - -void phi1612_hash(void *output, const void *input) -{ - unsigned char hash[128] __attribute__ ((aligned (64))); - phi_ctx_holder ctx __attribute__ ((aligned (64))); - - memcpy( &ctx, &phi_ctx, sizeof(phi_ctx) ); - - memcpy( &ctx.skein, &phi_skein_mid, sizeof phi_skein_mid ); - sph_skein512( &ctx.skein, input + 64, 16 ); - sph_skein512_close( &ctx.skein, hash ); - -// sph_skein512( &ctx.skein, input, 80 ); -// sph_skein512_close( &ctx.skein, (void*)hash ); - - sph_jh512( &ctx.jh, (const void*)hash, 64 ); - sph_jh512_close( &ctx.jh, (void*)hash ); - - cubehashUpdateDigest( &ctx.cube, (byte*) hash, (const byte*)hash, 64 ); - - sph_fugue512( &ctx.fugue, (const void*)hash, 64 ); - sph_fugue512_close( &ctx.fugue, (void*)hash ); - - sph_gost512( &ctx.gost, hash, 64 ); - sph_gost512_close( &ctx.gost, hash ); - -#ifdef NO_AES_NI - sph_echo512( &ctx.echo, hash, 64 ); - sph_echo512_close( &ctx.echo, hash ); -#else - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#endif - - memcpy(output, hash, 32); -} - -int scanhash_phi1612( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - - const uint32_t first_nonce = pdata[19]; - uint32_t _ALIGN(64) endiandata[20]; - 
uint32_t nonce = first_nonce; - int thr_id = mythr->id; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0cff; - - for (int k = 0; k < 19; k++) - be32enc(&endiandata[k], pdata[k]); - - phi_skein_midstate( endiandata ); - - const uint32_t Htarg = ptarget[7]; - do { - uint32_t hash[8]; - be32enc(&endiandata[19], nonce); - phi1612_hash(hash, endiandata); - - if (hash[7] <= Htarg && fulltest(hash, ptarget)) { - pdata[19] = nonce; - work_set_target_ratio( work, hash ); - *hashes_done = pdata[19] - first_nonce; - return 1; - } - nonce++; - - } while (nonce < max_nonce && !(*restart)); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} - diff --git a/algo/x13/skunk-4way.c b/algo/x13/skunk-4way.c deleted file mode 100644 index 165047e..0000000 --- a/algo/x13/skunk-4way.c +++ /dev/null @@ -1,125 +0,0 @@ -#include "skunk-gate.h" - -#if defined(SKUNK_4WAY) - -#include -#include -#include -#include -#include "algo/skein/skein-hash-4way.h" -#include "algo/gost/sph_gost.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/cubehash/cubehash_sse2.h" - -typedef struct { - skein512_4way_context skein; - cubehashParam cube; - sph_fugue512_context fugue; - sph_gost512_context gost; -} skunk_4way_ctx_holder; - -static __thread skunk_4way_ctx_holder skunk_4way_ctx; - -void skunk_4way_hash( void *output, const void *input ) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - - skunk_4way_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &skunk_4way_ctx, sizeof(skunk_4way_ctx) ); - - skein512_4way( &ctx.skein, input, 80 ); - skein512_4way_close( &ctx.skein, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - cubehashUpdateDigest( &ctx.cube, 
(byte*) hash0, (const byte*)hash0, 64 ); - memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); - memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 ); - memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); - - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - - sph_gost512( &ctx.gost, hash0, 64 ); - sph_gost512_close( &ctx.gost, hash0 ); - sph_gost512_init( &ctx.gost ); - sph_gost512( &ctx.gost, hash1, 64 ); - sph_gost512_close( &ctx.gost, hash1 ); - sph_gost512_init( &ctx.gost ); - sph_gost512( &ctx.gost, hash2, 64 ); - sph_gost512_close( &ctx.gost, hash2 ); - sph_gost512_init( &ctx.gost ); - sph_gost512( &ctx.gost, hash3, 64 ); - sph_gost512_close( &ctx.gost, hash3 ); - - memcpy( output, hash0, 32 ); - memcpy( output+32, hash1, 32 ); - memcpy( output+64, hash2, 32 ); - memcpy( output+96, hash3, 32 ); -} - -int scanhash_skunk_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - volatile 
uint8_t *restart = &(work_restart[thr_id].restart); - - if ( opt_benchmark ) - ((uint32_t*)ptarget)[7] = 0x0cff; - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - skunk_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( (hash+(i<<3))[7] <= Htarg ) - if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n +=4; - } while ( ( n < max_nonce ) && !(*restart) ); - - *hashes_done = n - first_nonce + 1; - return 0; -} - -bool skunk_4way_thread_init() -{ - skein512_4way_init( &skunk_4way_ctx.skein ); - cubehashInit( &skunk_4way_ctx.cube, 512, 16, 32 ); - sph_fugue512_init( &skunk_4way_ctx.fugue ); - sph_gost512_init( &skunk_4way_ctx.gost ); - return true; -} - -#endif diff --git a/algo/x13/skunk-gate.c b/algo/x13/skunk-gate.c deleted file mode 100644 index d4b3ae8..0000000 --- a/algo/x13/skunk-gate.c +++ /dev/null @@ -1,18 +0,0 @@ -#include "skunk-gate.h" - -bool register_skunk_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AVX2_OPT; -#if defined (SKUNK_4WAY) - gate->miner_thread_init = (void*)&skunk_4way_thread_init; - gate->scanhash = (void*)&scanhash_skunk_4way; - gate->hash = (void*)&skunk_4way_hash; -// init_skunk_4way_ctx(); -#else - gate->miner_thread_init = (void*)&skunk_thread_init; - gate->scanhash = (void*)&scanhash_skunk; - gate->hash = (void*)&skunkhash; -#endif - return true; -} - diff --git a/algo/x13/skunk-gate.h b/algo/x13/skunk-gate.h deleted file mode 100644 index a389f2e..0000000 --- a/algo/x13/skunk-gate.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef SKUNK_GATE_H__ -#define SKUNK_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) - #define SKUNK_4WAY -#endif - -bool register_skunk_algo( algo_gate_t* gate ); - -#if defined(SKUNK_4WAY) - -void skunk_4way_hash( void *state, 
const void *input ); - -int scanhash_skunk_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -bool skunk_4way_thread_init(); -//void init_skunk_4way_ctx(); - -#endif - -void skunkhash( void *state, const void *input ); - -int scanhash_skunk( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -bool skunk_thread_init(); - -#endif - diff --git a/algo/x13/skunk.c b/algo/x13/skunk.c deleted file mode 100644 index 80358e9..0000000 --- a/algo/x13/skunk.c +++ /dev/null @@ -1,92 +0,0 @@ -#include "skunk-gate.h" -#include -#include -#include -#include -#include "algo/gost/sph_gost.h" -#include "algo/skein/sph_skein.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/cubehash/cubehash_sse2.h" - -typedef struct { - sph_skein512_context skein; - cubehashParam cube; - sph_fugue512_context fugue; - sph_gost512_context gost; -} skunk_ctx_holder; - -static __thread skunk_ctx_holder skunk_ctx; - -void skunkhash( void *output, const void *input ) -{ - unsigned char hash[128] __attribute__ ((aligned (64))); - - skunk_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &skunk_ctx, sizeof(skunk_ctx) ); - - sph_skein512( &ctx.skein, input+64, 16 ); - sph_skein512_close( &ctx.skein, (void*) hash ); - - cubehashUpdateDigest( &ctx.cube, (byte*) hash, (const byte*)hash, 64 ); - - sph_fugue512( &ctx.fugue, hash, 64 ); - sph_fugue512_close( &ctx.fugue, hash ); - - sph_gost512( &ctx.gost, hash, 64 ); - sph_gost512_close( &ctx.gost, hash ); - - memcpy(output, hash, 32); -} - -int scanhash_skunk( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - - const uint32_t first_nonce = pdata[19]; - uint32_t _ALIGN(64) endiandata[20]; - uint32_t nonce = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - volatile uint8_t *restart = &(work_restart[thr_id].restart); - - if ( 
opt_benchmark ) - ((uint32_t*)ptarget)[7] = 0x0cff; - - for ( int k = 0; k < 19; k++ ) - be32enc( &endiandata[k], pdata[k] ); - - // precalc midstate - sph_skein512_init( &skunk_ctx.skein ); - sph_skein512( &skunk_ctx.skein, endiandata, 64 ); - - const uint32_t Htarg = ptarget[7]; - do - { - uint32_t hash[8]; - be32enc( &endiandata[19], nonce ); - skunkhash( hash, endiandata ); - - if ( hash[7] <= Htarg && fulltest( hash, ptarget ) ) - { - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce; - work_set_target_ratio( work, hash ); - return 1; - } - nonce++; - } while ( nonce < max_nonce && !(*restart) ); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} - -bool skunk_thread_init() -{ - sph_skein512_init( &skunk_ctx.skein ); - cubehashInit( &skunk_ctx.cube, 512, 16, 32 ); - sph_fugue512_init( &skunk_ctx.fugue ); - sph_gost512_init( &skunk_ctx.gost ); - return true; -} diff --git a/algo/x13/x13-4way.c b/algo/x13/x13-4way.c deleted file mode 100644 index 970f8e3..0000000 --- a/algo/x13/x13-4way.c +++ /dev/null @@ -1,234 +0,0 @@ -#include "x13-gate.h" - -#if defined(X13_4WAY) - -#include -#include -#include -#include -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/luffa-hash-2way.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/simd/simd-hash-2way.h" -#include "algo/echo/aes_ni/hash_api.h" -#include "algo/hamsi/hamsi-hash-4way.h" -#include "algo/fugue/sph_fugue.h" - -typedef struct { - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_groestl groestl; - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; - luffa_2way_context luffa; - cubehashParam cube; - sph_shavite512_context shavite; - simd_2way_context simd; 
- hashState_echo echo; - hamsi512_4way_context hamsi; - sph_fugue512_context fugue; -} x13_4way_ctx_holder; - -x13_4way_ctx_holder x13_4way_ctx __attribute__ ((aligned (64))); - -void init_x13_4way_ctx() -{ - blake512_4way_init( &x13_4way_ctx.blake ); - bmw512_4way_init( &x13_4way_ctx.bmw ); - init_groestl( &x13_4way_ctx.groestl, 64 ); - skein512_4way_init( &x13_4way_ctx.skein ); - jh512_4way_init( &x13_4way_ctx.jh ); - keccak512_4way_init( &x13_4way_ctx.keccak ); - luffa_2way_init( &x13_4way_ctx.luffa, 512 ); - cubehashInit( &x13_4way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x13_4way_ctx.shavite ); - simd_2way_init( &x13_4way_ctx.simd, 512 ); - init_echo( &x13_4way_ctx.echo, 512 ); - hamsi512_4way_init( &x13_4way_ctx.hamsi ); - sph_fugue512_init( &x13_4way_ctx.fugue ); -}; - -void x13_4way_hash( void *state, const void *input ) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - x13_4way_ctx_holder ctx; - memcpy( &ctx, &x13_4way_ctx, sizeof(x13_4way_ctx) ); - - // 1 Blake - blake512_4way( &ctx.blake, input, 80 ); - blake512_4way_close( &ctx.blake, vhash ); - - // 2 Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - // Serial - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // 3 Groestl - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, 
(char*)hash3, 512 ); - - // Parallel 4way 64 bit - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - // 4 Skein - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - - // 5 JH - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - // 6 Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - - // Serial - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // 7 Luffa - intrlv_2x128( vhash, hash0, hash1, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, hash2, hash3, 512 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - dintrlv_2x128( hash2, hash3, vhash, 512 ); - - // 8 Cubehash - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); - memcpy( &ctx.cube, &x13_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); - memcpy( &ctx.cube, &x13_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 ); - memcpy( &ctx.cube, &x13_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); - - // 9 Shavite - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - memcpy( &ctx.shavite, &x13_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - memcpy( &ctx.shavite, &x13_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - memcpy( &ctx.shavite, &x13_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - - // 10 Simd - 
intrlv_2x128( vhash, hash0, hash1, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, hash2, hash3, 512 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_2x128( hash2, hash3, vhash, 512 ); - - // 11 Echo - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - memcpy( &ctx.echo, &x13_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - memcpy( &ctx.echo, &x13_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - memcpy( &ctx.echo, &x13_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - - // 12 Hamsi parallel 4way 32 bit - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // 13 Fugue serial - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - - memcpy( state, hash0, 32 ); - memcpy( state+32, hash1, 32 ); - memcpy( state+64, hash2, 32 ); - memcpy( state+96, hash3, 32 ); -} - -int scanhash_x13_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t 
hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - - for ( int m=0; m < 6; m++ ) - if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - x13_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( ( ( (hash+(i<<3))[7] & mask ) == 0 ) ) - if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/x13/x13-gate.c b/algo/x13/x13-gate.c deleted file mode 100644 index ce3e640..0000000 --- a/algo/x13/x13-gate.c +++ /dev/null @@ -1,18 +0,0 @@ -#include "x13-gate.h" - -bool register_x13_algo( algo_gate_t* gate ) -{ -#if defined (X13_4WAY) - init_x13_4way_ctx(); - gate->scanhash = (void*)&scanhash_x13_4way; - gate->hash = (void*)&x13_4way_hash; -#else - init_x13_ctx(); - gate->scanhash = (void*)&scanhash_x13; - gate->hash = (void*)&x13hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -}; - diff --git a/algo/x13/x13-gate.h b/algo/x13/x13-gate.h deleted file mode 100644 index c61d7d6..0000000 --- a/algo/x13/x13-gate.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef 
X13_GATE_H__ -#define X13_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define X13_4WAY -#endif - -bool register_x13_algo( algo_gate_t* gate ); - -#if defined(X13_4WAY) - -void x13_4way_hash( void *state, const void *input ); - -int scanhash_x13_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void init_x13_4way_ctx(); - -#endif - -void x13hash( void *state, const void *input ); - -int scanhash_x13( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void init_x13_ctx(); - -#endif - diff --git a/algo/x13/x13.c b/algo/x13/x13.c deleted file mode 100644 index a55cb9a..0000000 --- a/algo/x13/x13.c +++ /dev/null @@ -1,253 +0,0 @@ -#include "x13-gate.h" - -#include -#include -#include -#include - -#include "algo/groestl/sph_groestl.h" -#include "algo/blake/sph_blake.h" -#include "algo/bmw/sph_bmw.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/luffa/sph_luffa.h" -#include "algo/cubehash/sph_cubehash.h" -#include "algo/simd/sph_simd.h" -#include "algo/echo/sph_echo.h" -#include "algo/hamsi/sph_hamsi.h" -#include "algo/fugue/sph_fugue.h" - -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" - -#ifndef NO_AES_NI - #include "algo/groestl/aes_ni/hash-groestl.h" - #include "algo/echo/aes_ni/hash_api.h" -#endif - -typedef struct { -#ifdef NO_AES_NI - sph_groestl512_context groestl; - sph_echo512_context echo; -#else - hashState_groestl groestl; - hashState_echo echo; -#endif - hashState_luffa luffa; - cubehashParam cubehash; - sph_shavite512_context shavite; - hashState_sd simd; - 
sph_hamsi512_context hamsi; - sph_fugue512_context fugue; -} x13_ctx_holder; - -x13_ctx_holder x13_ctx; - -void init_x13_ctx() -{ -#ifdef NO_AES_NI - sph_groestl512_init(&x13_ctx.groestl); - sph_echo512_init(&x13_ctx.echo); -#else - init_echo( &x13_ctx.echo, 512 ); - init_groestl (&x13_ctx.groestl, 64 ); -#endif - init_luffa( &x13_ctx.luffa, 512 ); - cubehashInit( &x13_ctx.cubehash, 512, 16, 32 ); - sph_shavite512_init( &x13_ctx.shavite ); - init_sd( &x13_ctx.simd, 512 ); - sph_hamsi512_init( &x13_ctx.hamsi ); - sph_fugue512_init( &x13_ctx.fugue ); -}; - -void x13hash(void *output, const void *input) -{ - unsigned char hash[128] __attribute__ ((aligned (32))); - #define hashB hash+64 - - x13_ctx_holder ctx; - memcpy( &ctx, &x13_ctx, sizeof(x13_ctx) ); - - // X11 algos - - unsigned char hashbuf[128]; - size_t hashptr; - sph_u64 hashctA; - sph_u64 hashctB; - - //---blake1--- - - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; - - //---bmw2--- - - DECL_BMW; - BMW_I; - BMW_U; - - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - - BMW_C; - - #undef M - #undef H - #undef dH - - //---groetl---- - -#ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); -#else - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); -#endif - - //---skein4--- - - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; - - //---jh5------ - - DECL_JH; - JH_H; - - //---keccak6--- - - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; - - //--- luffa7 - update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, - (const BitSequence*)hash, 64 ); - - // 8 Cube - cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, - (const byte*)hashB, 64 ); - - // 9 Shavite - sph_shavite512( &ctx.shavite, hash, 64); - sph_shavite512_close( &ctx.shavite, hashB); - - // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hashB, 512 ); - - //11---echo--- - -#ifdef NO_AES_NI - sph_echo512(&ctx.echo, hash, 64); - 
sph_echo512_close(&ctx.echo, hashB); -#else - update_final_echo ( &ctx.echo, (BitSequence *)hashB, - (const BitSequence *)hash, 512 ); -#endif - - // X13 algos - // 12 Hamsi - sph_hamsi512(&ctx.hamsi, hashB, 64); - sph_hamsi512_close(&ctx.hamsi, hash); - - // 13 Fugue - sph_fugue512(&ctx.fugue, hash, 64); - sph_fugue512_close(&ctx.fugue, hashB); - - asm volatile ("emms"); - memcpy(output, hashB, 32); -} - -int scanhash_x13( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t hash64[8] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - // we need bigendian data... 
- swab32_array( endiandata, pdata, 20 ); - -#ifdef DEBUG_ALGO - printf("[%d] Htarg=%X\n", thr_id, Htarg); -#endif - for (int m=0; m < 6; m++) { - if (Htarg <= htmax[m]) { - uint32_t mask = masks[m]; - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - x13hash(hash64, endiandata); -#ifndef DEBUG_ALGO - if (!(hash64[7] & mask)) - { - if ( fulltest(hash64, ptarget) ) - { - *hashes_done = n - first_nonce + 1; - return true; - } -// else -// { -// applog(LOG_INFO, "Result does not validate on CPU!"); -// } - } - -#else - if (!(n % 0x1000) && !thr_id) printf("."); - if (!(hash64[7] & mask)) { - printf("[%d]",thr_id); - if (fulltest(hash64, ptarget)) { - work_set_target_ratio( work, hash ); - *hashes_done = n - first_nonce + 1; - return true; - } - } -#endif - } while (n < max_nonce && !work_restart[thr_id].restart); - // see blake.c if else to understand the loop on htmax => mask - break; - } - } - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} diff --git a/algo/x13/x13bcd-4way.c b/algo/x13/x13bcd-4way.c deleted file mode 100644 index e43e922..0000000 --- a/algo/x13/x13bcd-4way.c +++ /dev/null @@ -1,283 +0,0 @@ -#include "x13sm3-gate.h" - -#if defined(X13SM3_4WAY) - -#include -#include -#include -#include -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -//#include "algo/luffa/luffa-hash-2way.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/simd/simd-hash-2way.h" -#include "algo/echo/aes_ni/hash_api.h" -#include "algo/sm3/sm3-hash-4way.h" -#include "algo/hamsi/hamsi-hash-4way.h" -#include "algo/fugue/sph_fugue.h" - -typedef struct { - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_groestl groestl; - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context 
keccak; -// luffa_2way_context luffa; - cubehashParam cube; - sph_shavite512_context shavite; - simd_2way_context simd; - hashState_echo echo; - sm3_4way_ctx_t sm3; - hamsi512_4way_context hamsi; - sph_fugue512_context fugue; -} x13bcd_4way_ctx_holder; - -x13bcd_4way_ctx_holder x13bcd_4way_ctx __attribute__ ((aligned (64))); -static __thread blake512_4way_context x13bcd_ctx_mid; - -void init_x13bcd_4way_ctx() -{ - blake512_4way_init( &x13bcd_4way_ctx.blake ); - bmw512_4way_init( &x13bcd_4way_ctx.bmw ); - init_groestl( &x13bcd_4way_ctx.groestl, 64 ); - skein512_4way_init( &x13bcd_4way_ctx.skein ); - jh512_4way_init( &x13bcd_4way_ctx.jh ); - keccak512_4way_init( &x13bcd_4way_ctx.keccak ); -// luffa_2way_init( &x13bcd_4way_ctx.luffa, 512 ); - cubehashInit( &x13bcd_4way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x13bcd_4way_ctx.shavite ); - simd_2way_init( &x13bcd_4way_ctx.simd, 512 ); - init_echo( &x13bcd_4way_ctx.echo, 512 ); - sm3_4way_init( &x13bcd_4way_ctx.sm3 ); - hamsi512_4way_init( &x13bcd_4way_ctx.hamsi ); - sph_fugue512_init( &x13bcd_4way_ctx.fugue ); -}; - -void x13bcd_4way_hash( void *state, const void *input ) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - x13bcd_4way_ctx_holder ctx; - memcpy( &ctx, &x13bcd_4way_ctx, sizeof(x13bcd_4way_ctx) ); - - // Blake - memcpy( &ctx.blake, &x13bcd_ctx_mid, sizeof(x13bcd_ctx_mid) ); - blake512_4way( &ctx.blake, input + (64<<2), 16 ); - -// blake512_4way( &ctx.blake, input, 80 ); - blake512_4way_close( &ctx.blake, vhash ); - - // Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - // Serial - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // Groestl - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - reinit_groestl( 
&ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - - // Parallel 4way - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - // Skein - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - - // JH - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - // Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); - - // SM3 parallel 32 bit - uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64))); - memset( sm3_vhash, 0, sizeof sm3_vhash ); - uint32_t sm3_hash0[32] __attribute__ ((aligned (32))); - memset( sm3_hash0, 0, sizeof sm3_hash0 ); - uint32_t sm3_hash1[32] __attribute__ ((aligned (32))); - memset( sm3_hash1, 0, sizeof sm3_hash1 ); - uint32_t sm3_hash2[32] __attribute__ ((aligned (32))); - memset( sm3_hash2, 0, sizeof sm3_hash2 ); - uint32_t sm3_hash3[32] __attribute__ ((aligned (32))); - memset( sm3_hash3, 0, sizeof sm3_hash3 ); - - sm3_4way( &ctx.sm3, vhash, 64 ); - sm3_4way_close( &ctx.sm3, sm3_vhash ); - dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 ); - -/* - // Luffa - intrlv_2x128( vhash, hash0, hash1, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, hash2, hash3, 512 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - dintrlv_2x128( hash2, hash3, vhash, 512 ); -*/ - - // Cubehash - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); - memcpy( &ctx.cube, &x13bcd_4way_ctx.cube, sizeof(cubehashParam) ); - 
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); - memcpy( &ctx.cube, &x13bcd_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 ); - memcpy( &ctx.cube, &x13bcd_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); - - // Shavite - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - memcpy( &ctx.shavite, &x13bcd_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - memcpy( &ctx.shavite, &x13bcd_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - memcpy( &ctx.shavite, &x13bcd_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - - // Simd - intrlv_2x128( vhash, hash0, hash1, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, hash2, hash3, 512 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_2x128( hash2, hash3, vhash, 512 ); - - // Echo - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - memcpy( &ctx.echo, &x13bcd_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - memcpy( &ctx.echo, &x13bcd_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - memcpy( &ctx.echo, &x13bcd_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - -/* - intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); 
- - // SM3 parallel 32 bit - uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64))); - memset( sm3_vhash, 0, sizeof sm3_vhash ); - uint32_t sm3_hash0[32] __attribute__ ((aligned (32))); - memset( sm3_hash0, 0, sizeof sm3_hash0 ); - uint32_t sm3_hash1[32] __attribute__ ((aligned (32))); - memset( sm3_hash1, 0, sizeof sm3_hash1 ); - uint32_t sm3_hash2[32] __attribute__ ((aligned (32))); - memset( sm3_hash2, 0, sizeof sm3_hash2 ); - uint32_t sm3_hash3[32] __attribute__ ((aligned (32))); - memset( sm3_hash3, 0, sizeof sm3_hash3 ); - - sm3_4way( &ctx.sm3, vhash, 64 ); - sm3_4way_close( &ctx.sm3, sm3_vhash ); - dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 ); -*/ - - // Hamsi parallel 4x32x2 - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // Fugue serial - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - - memcpy( state, hash0, 32 ); - memcpy( state+32, hash1, 32 ); - memcpy( state+64, hash2, 32 ); - memcpy( state+96, hash3, 32 ); -} - -int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - 
__m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - - blake512_4way_init( &x13bcd_ctx_mid ); - blake512_4way( &x13bcd_ctx_mid, vdata, 64 ); - - for ( int m=0; m < 6; m++ ) - if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - x13bcd_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( ( ( (hash+(i<<3))[7] & mask ) == 0 ) ) - if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/x13/x13bcd.c b/algo/x13/x13bcd.c deleted file mode 100644 index bf4a8cd..0000000 --- a/algo/x13/x13bcd.c +++ /dev/null @@ -1,258 +0,0 @@ -#include "x13sm3-gate.h" - -#include -#include -#include -#include - -#include "algo/groestl/sph_groestl.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/luffa/sph_luffa.h" -#include "algo/cubehash/sph_cubehash.h" -#include "algo/simd/sph_simd.h" -#include "algo/echo/sph_echo.h" -#include "algo/hamsi/sph_hamsi.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/sm3/sph_sm3.h" - -//#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" - -#ifndef NO_AES_NI - #include "algo/groestl/aes_ni/hash-groestl.h" - 
#include "algo/echo/aes_ni/hash_api.h" -#endif - -typedef struct { -#ifdef NO_AES_NI - sph_groestl512_context groestl; - sph_echo512_context echo; -#else - hashState_echo echo; - hashState_groestl groestl; -#endif -// hashState_luffa luffa; - cubehashParam cube; - sph_shavite512_context shavite; - hashState_sd simd; - sm3_ctx_t sm3; - sph_hamsi512_context hamsi; - sph_fugue512_context fugue; -} x13bcd_ctx_holder; - -x13bcd_ctx_holder x13bcd_ctx; - -void init_x13bcd_ctx() -{ -#ifdef NO_AES_NI - sph_groestl512_init(&x13bcd_ctx.groestl); - sph_echo512_init(&x13bcd_ctx.echo); -#else - init_echo(&x13bcd_ctx.echo, 512); - init_groestl(&x13bcd_ctx.groestl, 64 ); -#endif -// init_luffa(&x13bcd_ctx.luffa,512); - cubehashInit(&x13bcd_ctx.cube,512,16,32); - sph_shavite512_init(&x13bcd_ctx.shavite); - init_sd(&x13bcd_ctx.simd,512); - sm3_init( &x13bcd_ctx.sm3 ); - sph_hamsi512_init(&x13bcd_ctx.hamsi); - sph_fugue512_init(&x13bcd_ctx.fugue); -}; - -void x13bcd_hash(void *output, const void *input) -{ - unsigned char hash[128] __attribute__ ((aligned (32))); - - x13bcd_ctx_holder ctx; - memcpy(&ctx, &x13bcd_ctx, sizeof(x13bcd_ctx)); - - unsigned char hashbuf[128]; - size_t hashptr; - sph_u64 hashctA; - sph_u64 hashctB; - - //---blake1--- - - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; - - //---bmw2--- - - DECL_BMW; - BMW_I; - BMW_U; - - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - - BMW_C; - - #undef M - #undef H - #undef dH - - //---groestl---- - -#ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); -#else - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); -#endif - - //---skein4--- - - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; - - //---jh5------ - - DECL_JH; - JH_H; - - //---keccak6--- - - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; - - uint32_t sm3_hash[32] __attribute__ ((aligned (32))); - memset(sm3_hash, 0, sizeof sm3_hash); - - sph_sm3(&ctx.sm3, hash, 64); - 
sph_sm3_close(&ctx.sm3, sm3_hash); - - cubehashUpdateDigest( &ctx.cube, (byte*) hash, - (const byte*)sm3_hash, 64 ); - -/* - //--- luffa7 - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)hash, 64 ); - - // 8 Cube - cubehashUpdateDigest( &ctx.cube, (byte*) hash, - (const byte*)hash, 64 ); -*/ - - // 9 Shavite - sph_shavite512( &ctx.shavite, hash, 64); - sph_shavite512_close( &ctx.shavite, hash); - - // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); - - //11---echo--- -#ifdef NO_AES_NI - sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hash); -#else - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#endif - - /* - uint32_t sm3_hash[32] __attribute__ ((aligned (32))); - memset(sm3_hash, 0, sizeof sm3_hash); - - sph_sm3(&ctx.sm3, hash, 64); - sph_sm3_close(&ctx.sm3, sm3_hash); - - sph_hamsi512(&ctx.hamsi, sm3_hash, 64); -*/ - - sph_hamsi512(&ctx.hamsi, hash, 64); - sph_hamsi512_close(&ctx.hamsi, hash); - - sph_fugue512(&ctx.fugue, hash, 64); - sph_fugue512_close(&ctx.fugue, hash); - - asm volatile ("emms"); - memcpy(output, hash, 32); -} - -int scanhash_x13bcd( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr) -{ - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t hash64[8] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - // we need bigendian data... 
- swab32_array( endiandata, pdata, 20 ); - -#ifdef DEBUG_ALGO - if (Htarg != 0) - printf("[%d] Htarg=%X\n", thr_id, Htarg); -#endif - for (int m=0; m < 6; m++) { - if (Htarg <= htmax[m]) { - uint32_t mask = masks[m]; - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - x13bcd_hash(hash64, endiandata); -#ifndef DEBUG_ALGO - if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } -#else - if (!(n % 0x1000) && !thr_id) printf("."); - if (!(hash64[7] & mask)) { - printf("[%d]",thr_id); - if (fulltest(hash64, ptarget)) { - work_set_target_ratio( work, hash64 ); - *hashes_done = n - first_nonce + 1; - return true; - } - } -#endif - } while (n < max_nonce && !work_restart[thr_id].restart); - // see blake.c if else to understand the loop on htmax => mask - break; - } - } - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - diff --git a/algo/x13/x13sm3-4way.c b/algo/x13/x13sm3-4way.c deleted file mode 100644 index a107627..0000000 --- a/algo/x13/x13sm3-4way.c +++ /dev/null @@ -1,262 +0,0 @@ -#include "x13sm3-gate.h" - -#if defined(X13SM3_4WAY) - -#include -#include -#include -#include -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/luffa-hash-2way.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/simd/simd-hash-2way.h" -#include "algo/echo/aes_ni/hash_api.h" -#include "algo/sm3/sm3-hash-4way.h" -#include "algo/hamsi/hamsi-hash-4way.h" -#include "algo/fugue/sph_fugue.h" - -typedef struct { - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_groestl groestl; - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; - luffa_2way_context luffa; - cubehashParam cube; - 
sph_shavite512_context shavite; - simd_2way_context simd; - hashState_echo echo; - sm3_4way_ctx_t sm3; - hamsi512_4way_context hamsi; - sph_fugue512_context fugue; -} x13sm3_4way_ctx_holder; - -x13sm3_4way_ctx_holder x13sm3_4way_ctx __attribute__ ((aligned (64))); -static __thread blake512_4way_context x13sm3_ctx_mid; - -void init_x13sm3_4way_ctx() -{ - blake512_4way_init( &x13sm3_4way_ctx.blake ); - bmw512_4way_init( &x13sm3_4way_ctx.bmw ); - init_groestl( &x13sm3_4way_ctx.groestl, 64 ); - skein512_4way_init( &x13sm3_4way_ctx.skein ); - jh512_4way_init( &x13sm3_4way_ctx.jh ); - keccak512_4way_init( &x13sm3_4way_ctx.keccak ); - luffa_2way_init( &x13sm3_4way_ctx.luffa, 512 ); - cubehashInit( &x13sm3_4way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x13sm3_4way_ctx.shavite ); - simd_2way_init( &x13sm3_4way_ctx.simd, 512 ); - init_echo( &x13sm3_4way_ctx.echo, 512 ); - sm3_4way_init( &x13sm3_4way_ctx.sm3 ); - hamsi512_4way_init( &x13sm3_4way_ctx.hamsi ); - sph_fugue512_init( &x13sm3_4way_ctx.fugue ); -}; - -void x13sm3_4way_hash( void *state, const void *input ) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - x13sm3_4way_ctx_holder ctx; - memcpy( &ctx, &x13sm3_4way_ctx, sizeof(x13sm3_4way_ctx) ); - - // Blake - memcpy( &ctx.blake, &x13sm3_ctx_mid, sizeof(x13sm3_ctx_mid) ); - blake512_4way( &ctx.blake, input + (64<<2), 16 ); - -// blake512_4way( &ctx.blake, input, 80 ); - blake512_4way_close( &ctx.blake, vhash ); - - // Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - // Serial - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // Groestl - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, 
(char*)hash1, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - - // Parallel 4way - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - // Skein - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - - // JH - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - // Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - - // Serial to the end - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // Luffa - intrlv_2x128( vhash, hash0, hash1, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, hash2, hash3, 512 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - dintrlv_2x128( hash2, hash3, vhash, 512 ); - - // Cubehash - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); - memcpy( &ctx.cube, &x13sm3_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); - memcpy( &ctx.cube, &x13sm3_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 ); - memcpy( &ctx.cube, &x13sm3_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); - - // Shavite - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - memcpy( &ctx.shavite, &x13sm3_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - memcpy( &ctx.shavite, &x13sm3_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - 
sph_shavite512_close( &ctx.shavite, hash2 ); - memcpy( &ctx.shavite, &x13sm3_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - - // Simd - intrlv_2x128( vhash, hash0, hash1, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, hash2, hash3, 512 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_2x128( hash2, hash3, vhash, 512 ); - - // Echo - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - memcpy( &ctx.echo, &x13sm3_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - memcpy( &ctx.echo, &x13sm3_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - memcpy( &ctx.echo, &x13sm3_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - - intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); - - // SM3 parallel 32 bit - uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64))); - memset( sm3_vhash, 0, sizeof sm3_vhash ); - uint32_t sm3_hash0[32] __attribute__ ((aligned (32))); - memset( sm3_hash0, 0, sizeof sm3_hash0 ); - uint32_t sm3_hash1[32] __attribute__ ((aligned (32))); - memset( sm3_hash1, 0, sizeof sm3_hash1 ); - uint32_t sm3_hash2[32] __attribute__ ((aligned (32))); - memset( sm3_hash2, 0, sizeof sm3_hash2 ); - uint32_t sm3_hash3[32] __attribute__ ((aligned (32))); - memset( sm3_hash3, 0, sizeof sm3_hash3 ); - - sm3_4way( &ctx.sm3, vhash, 64 ); - sm3_4way_close( &ctx.sm3, sm3_vhash ); - dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 ); - - // Hamsi parallel 4x32x2 - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - hamsi512_4way( 
&ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // Fugue serial - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - memcpy( &ctx.fugue, &x13sm3_4way_ctx.fugue, sizeof(sph_fugue512_context) ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - memcpy( &ctx.fugue, &x13sm3_4way_ctx.fugue, sizeof(sph_fugue512_context) ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - memcpy( &ctx.fugue, &x13sm3_4way_ctx.fugue, sizeof(sph_fugue512_context) ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - - memcpy( state, hash0, 32 ); - memcpy( state+32, hash1, 32 ); - memcpy( state+64, hash2, 32 ); - memcpy( state+96, hash3, 32 ); -} - -int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - - blake512_4way_init( &x13sm3_ctx_mid ); - blake512_4way( &x13sm3_ctx_mid, vdata, 64 ); - - for ( int m=0; m < 6; m++ ) - if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - x13sm3_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( ( ( (hash+(i<<3))[7] & mask ) == 
0 ) ) - if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/x13/x13sm3-gate.c b/algo/x13/x13sm3-gate.c deleted file mode 100644 index c4c348b..0000000 --- a/algo/x13/x13sm3-gate.c +++ /dev/null @@ -1,34 +0,0 @@ -#include "x13sm3-gate.h" - -bool register_x13sm3_algo( algo_gate_t* gate ) -{ -#if defined (X13SM3_4WAY) - init_x13sm3_4way_ctx(); - gate->scanhash = (void*)&scanhash_x13sm3_4way; - gate->hash = (void*)&x13sm3_4way_hash; -#else - init_x13sm3_ctx(); - gate->scanhash = (void*)&scanhash_x13sm3; - gate->hash = (void*)&x13sm3_hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -}; - -bool register_x13bcd_algo( algo_gate_t* gate ) -{ -#if defined (X13SM3_4WAY) - init_x13bcd_4way_ctx(); - gate->scanhash = (void*)&scanhash_x13bcd_4way; - gate->hash = (void*)&x13bcd_4way_hash; -#else - init_x13bcd_ctx(); - gate->scanhash = (void*)&scanhash_x13bcd; - gate->hash = (void*)&x13bcd_hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -}; - diff --git a/algo/x13/x13sm3-gate.h b/algo/x13/x13sm3-gate.h deleted file mode 100644 index f0047bf..0000000 --- a/algo/x13/x13sm3-gate.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef X13SM3_GATE_H__ -#define X13SM3_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define X13SM3_4WAY -#endif - -bool register_x13sm3_algo( algo_gate_t* gate ); - -bool register_x13bcd_algo( algo_gate_t* gate ); - -#if defined(X13SM3_4WAY) - -void x13sm3_4way_hash( void *state, const void *input ); -int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info 
*mythr ); -void init_x13sm3_4way_ctx(); - -void x13bcd_4way_hash( void *state, const void *input ); -int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_x13bcd_4way_ctx(); - -#endif - -void x13sm3_hash( void *state, const void *input ); -int scanhash_x13sm3( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_x13sm3_ctx(); - -void x13bcd_hash( void *state, const void *input ); -int scanhash_x13bcd( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_x13bcd_ctx(); - -#endif - diff --git a/algo/x13/x13sm3.c b/algo/x13/x13sm3.c deleted file mode 100644 index 8c495d0..0000000 --- a/algo/x13/x13sm3.c +++ /dev/null @@ -1,243 +0,0 @@ -#include "x13sm3-gate.h" - -#include -#include -#include -#include - -#include "algo/groestl/sph_groestl.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/luffa/sph_luffa.h" -#include "algo/cubehash/sph_cubehash.h" -#include "algo/simd/sph_simd.h" -#include "algo/echo/sph_echo.h" -#include "algo/hamsi/sph_hamsi.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/sm3/sph_sm3.h" - -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" - -#ifndef NO_AES_NI - #include "algo/groestl/aes_ni/hash-groestl.h" - #include "algo/echo/aes_ni/hash_api.h" -#endif - -typedef struct { -#ifdef NO_AES_NI - sph_groestl512_context groestl; - sph_echo512_context echo; -#else - hashState_echo echo; - hashState_groestl groestl; -#endif - hashState_luffa luffa; - cubehashParam cube; - sph_shavite512_context shavite; - hashState_sd simd; - sm3_ctx_t sm3; - sph_hamsi512_context hamsi; - sph_fugue512_context fugue; -} hsr_ctx_holder; - -hsr_ctx_holder 
hsr_ctx; - -void init_x13sm3_ctx() -{ -#ifdef NO_AES_NI - sph_groestl512_init(&hsr_ctx.groestl); - sph_echo512_init(&hsr_ctx.echo); -#else - init_echo(&hsr_ctx.echo, 512); - init_groestl(&hsr_ctx.groestl, 64 ); -#endif - init_luffa(&hsr_ctx.luffa,512); - cubehashInit(&hsr_ctx.cube,512,16,32); - sph_shavite512_init(&hsr_ctx.shavite); - init_sd(&hsr_ctx.simd,512); - sm3_init( &hsr_ctx.sm3 ); - sph_hamsi512_init(&hsr_ctx.hamsi); - sph_fugue512_init(&hsr_ctx.fugue); -}; - -void x13sm3_hash(void *output, const void *input) -{ - unsigned char hash[128] __attribute__ ((aligned (32))); - - hsr_ctx_holder ctx; - memcpy(&ctx, &hsr_ctx, sizeof(hsr_ctx)); - - unsigned char hashbuf[128]; - size_t hashptr; - sph_u64 hashctA; - sph_u64 hashctB; - - //---blake1--- - - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; - - //---bmw2--- - - DECL_BMW; - BMW_I; - BMW_U; - - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - - BMW_C; - - #undef M - #undef H - #undef dH - - //---groestl---- - -#ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); -#else - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); -#endif - - //---skein4--- - - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; - - //---jh5------ - - DECL_JH; - JH_H; - - //---keccak6--- - - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; - - //--- luffa7 - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)hash, 64 ); - - // 8 Cube - cubehashUpdateDigest( &ctx.cube, (byte*) hash, - (const byte*)hash, 64 ); - - // 9 Shavite - sph_shavite512( &ctx.shavite, hash, 64); - sph_shavite512_close( &ctx.shavite, hash); - - // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); - - //11---echo--- -#ifdef NO_AES_NI - sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hash); -#else - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); 
-#endif - - uint32_t sm3_hash[32] __attribute__ ((aligned (32))); - memset(sm3_hash, 0, sizeof sm3_hash); - - sph_sm3(&ctx.sm3, hash, 64); - sph_sm3_close(&ctx.sm3, sm3_hash); - - sph_hamsi512(&ctx.hamsi, sm3_hash, 64); - sph_hamsi512_close(&ctx.hamsi, hash); - - sph_fugue512(&ctx.fugue, hash, 64); - sph_fugue512_close(&ctx.fugue, hash); - - asm volatile ("emms"); - memcpy(output, hash, 32); -} - -int scanhash_x13sm3( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr) -{ - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t hash64[8] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - // we need bigendian data... 
- swab32_array( endiandata, pdata, 20 ); - -#ifdef DEBUG_ALGO - if (Htarg != 0) - printf("[%d] Htarg=%X\n", thr_id, Htarg); -#endif - for (int m=0; m < 6; m++) { - if (Htarg <= htmax[m]) { - uint32_t mask = masks[m]; - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - x13sm3_hash(hash64, endiandata); -#ifndef DEBUG_ALGO - if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } -#else - if (!(n % 0x1000) && !thr_id) printf("."); - if (!(hash64[7] & mask)) { - printf("[%d]",thr_id); - if (fulltest(hash64, ptarget)) { - work_set_target_ratio( work, hash64 ); - *hashes_done = n - first_nonce + 1; - return true; - } - } -#endif - } while (n < max_nonce && !work_restart[thr_id].restart); - // see blake.c if else to understand the loop on htmax => mask - break; - } - } - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - diff --git a/algo/x14/axiom.c b/algo/x14/axiom.c deleted file mode 100644 index 7c0b70a..0000000 --- a/algo/x14/axiom.c +++ /dev/null @@ -1,88 +0,0 @@ -#include "algo-gate-api.h" - -#include -#include - -#include "algo/shabal/sph_shabal.h" - -static __thread uint32_t _ALIGN(64) M[65536][8]; - -void axiomhash(void *output, const void *input) -{ - sph_shabal256_context ctx __attribute__ ((aligned (64))); - const int N = 65536; - - sph_shabal256_init(&ctx); - sph_shabal256(&ctx, input, 80); - sph_shabal256_close(&ctx, M[0]); - - for(int i = 1; i < N; i++) { - sph_shabal256_init(&ctx); - sph_shabal256(&ctx, M[i-1], 32); - sph_shabal256_close(&ctx, M[i]); - } - - for(int b = 0; b < N; b++) - { - const int p = b > 0 ? 
b - 1 : 0xFFFF; - const int q = M[p][0] % 0xFFFF; - const int j = (b + q) % N; - - sph_shabal256_init(&ctx); -#if 0 - sph_shabal256(&ctx, M[p], 32); - sph_shabal256(&ctx, M[j], 32); -#else - uint8_t _ALIGN(64) hash[64]; - memcpy(hash, M[p], 32); - memcpy(&hash[32], M[j], 32); - sph_shabal256(&ctx, hash, 64); -#endif - sph_shabal256_close(&ctx, M[b]); - } - memcpy(output, M[N-1], 32); -} - -int scanhash_axiom( struct work *work, - uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t _ALIGN(64) hash64[8]; - uint32_t _ALIGN(64) endiandata[20]; - int thr_id = mythr->id; // thr_id arg is deprecated - - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - - uint32_t n = first_nonce; - - for (int k = 0; k < 19; k++) - be32enc(&endiandata[k], pdata[k]); - - do { - be32enc(&endiandata[19], n); - axiomhash(hash64, endiandata); - if (hash64[7] < Htarg && fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - work_set_target_ratio( work, hash64 ); - return true; - } - n++; - - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - - return 0; -} - -bool register_axiom_algo( algo_gate_t* gate ) -{ - gate->scanhash = (void*)&scanhash_axiom; - gate->hash = (void*)&axiomhash; - gate->get_max64 = (void*)&get_max64_0x40LL; - return true; -} diff --git a/algo/x14/polytimos-4way.c b/algo/x14/polytimos-4way.c deleted file mode 100644 index 3e1cc69..0000000 --- a/algo/x14/polytimos-4way.c +++ /dev/null @@ -1,142 +0,0 @@ -#include "polytimos-gate.h" - -#if defined(POLYTIMOS_4WAY) - -#include -#include -#include -#include -#include "algo/skein/skein-hash-4way.h" -#include "algo/shabal/shabal-hash-4way.h" -#include "algo/fugue//sph_fugue.h" -#include "algo/luffa/luffa-hash-2way.h" -#include "algo/gost/sph_gost.h" -#include "algo/echo/aes_ni/hash_api.h" - -union 
_poly_4way_context_overlay -{ - skein512_4way_context skein; - shabal512_4way_context shabal; - hashState_echo echo; - luffa_2way_context luffa; - sph_fugue512_context fugue; - sph_gost512_context gost; -}; -typedef union _poly_4way_context_overlay poly_4way_context_overlay; - -void polytimos_4way_hash( void *output, const void *input ) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - poly_4way_context_overlay ctx; - - skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, input, 80 ); - skein512_4way_close( &ctx.skein, vhash ); - - // Need to convert from 64 bit interleaved to 32 bit interleaved. - uint32_t vhash32[16*4]; - rintrlv_4x64_4x32( vhash32, vhash, 512 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash32, 64 ); - shabal512_4way_close( &ctx.shabal, vhash32 ); - dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 ); - - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - - intrlv_2x128( vhash, hash0, hash1, 512 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, hash2, hash3, 512 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - dintrlv_2x128( hash2, hash3, vhash, 512 ); 
- - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - - sph_gost512_init( &ctx.gost ); - sph_gost512( &ctx.gost, hash0, 64 ); - sph_gost512_close( &ctx.gost, hash0 ); - sph_gost512_init( &ctx.gost ); - sph_gost512( &ctx.gost, hash1, 64 ); - sph_gost512_close( &ctx.gost, hash1 ); - sph_gost512_init( &ctx.gost ); - sph_gost512( &ctx.gost, hash2, 64 ); - sph_gost512_close( &ctx.gost, hash2 ); - sph_gost512_init( &ctx.gost ); - sph_gost512( &ctx.gost, hash3, 64 ); - sph_gost512_close( &ctx.gost, hash3 ); - - memcpy( output, hash0, 32 ); - memcpy( output+32, hash1, 32 ); - memcpy( output+64, hash2, 32 ); - memcpy( output+96, hash3, 32 ); -} - -int scanhash_polytimos_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - volatile uint8_t *restart = &(work_restart[thr_id].restart); - - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - polytimos_4way_hash(hash, vdata); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg ) - if( 
fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - - } while ( ( n < max_nonce-4 ) && !(*restart)); - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/x14/polytimos-gate.c b/algo/x14/polytimos-gate.c deleted file mode 100644 index aa54060..0000000 --- a/algo/x14/polytimos-gate.c +++ /dev/null @@ -1,17 +0,0 @@ -#include "polytimos-gate.h" - -bool register_polytimos_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; -#ifdef POLYTIMOS_4WAY - gate->scanhash = (void*)&scanhash_polytimos_4way; - gate->hash = (void*)&polytimos_4way_hash; -#else - init_polytimos_ctx(); - gate->scanhash = (void*)&scanhash_polytimos; - gate->hash = (void*)&polytimos_hash; -#endif - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -}; - diff --git a/algo/x14/polytimos-gate.h b/algo/x14/polytimos-gate.h deleted file mode 100644 index 080d9dc..0000000 --- a/algo/x14/polytimos-gate.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef POLYTIMOS_GATE_H__ -#define POLYTIMOS_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define POLYTIMOS_4WAY -#endif - -bool register_polytimos_algo( algo_gate_t* gate ); - -#if defined(POLYTIMOS_4WAY) - -void polytimos_4way_hash( void *state, const void *input ); -int scanhash_polytimos_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -#endif - -void polytimos_hash( void *state, const void *input ); -int scanhash_polytimos( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_polytimos_ctx(); - -#endif - diff --git a/algo/x14/polytimos.c b/algo/x14/polytimos.c deleted file mode 100644 index b5a3de7..0000000 --- a/algo/x14/polytimos.c +++ /dev/null @@ -1,116 +0,0 @@ -#include "polytimos-gate.h" - -#include -#include -#include -#include - -#include 
"algo/skein/sph_skein.h" -#include "algo/echo/sph_echo.h" -#include "algo/fugue//sph_fugue.h" -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/shabal/sph_shabal.h" -#include "algo/gost/sph_gost.h" -#ifndef NO_AES_NI - #include "algo/echo/aes_ni/hash_api.h" -#endif - -typedef struct { - sph_skein512_context skein; - sph_shabal512_context shabal; -#ifdef NO_AES_NI - sph_echo512_context echo; -#else - hashState_echo echo; -#endif - hashState_luffa luffa; - sph_fugue512_context fugue; - sph_gost512_context gost; -} poly_ctx_holder; - -poly_ctx_holder poly_ctx; - -void init_polytimos_ctx() -{ - sph_skein512_init(&poly_ctx.skein); - sph_shabal512_init(&poly_ctx.shabal); -#ifdef NO_AES_NI - sph_echo512_init(&poly_ctx.echo); -#else - init_echo( &poly_ctx.echo, 512 ); -#endif - init_luffa( &poly_ctx.luffa, 512 ); - sph_fugue512_init(&poly_ctx.fugue); - sph_gost512_init(&poly_ctx.gost); -} - -void polytimos_hash(void *output, const void *input) -{ - uint32_t hashA[16] __attribute__ ((aligned (64))); - poly_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &poly_ctx, sizeof(poly_ctx) ); - - sph_skein512(&ctx.skein, input, 80); - sph_skein512_close(&ctx.skein, hashA); - - sph_shabal512(&ctx.shabal, hashA, 64); - sph_shabal512_close(&ctx.shabal, hashA); - -#ifdef NO_AES_NI - sph_echo512(&ctx.echo, hashA, 64); - sph_echo512_close(&ctx.echo, hashA); -#else - update_final_echo ( &ctx.echo, (BitSequence *)hashA, - (const BitSequence *)hashA, 512 ); -#endif - - update_and_final_luffa( &ctx.luffa, (BitSequence*)hashA, - (const BitSequence*)hashA, 64 ); - - sph_fugue512(&ctx.fugue, hashA, 64); - sph_fugue512_close(&ctx.fugue, hashA); - - sph_gost512(&ctx.gost, hashA, 64); - sph_gost512_close(&ctx.gost, hashA); - - memcpy(output, hashA, 32); -} - -int scanhash_polytimos( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(128) hash[8]; - uint32_t _ALIGN(128) endiandata[20]; - uint32_t *pdata = work->data; - 
uint32_t *ptarget = work->target; - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - - if (opt_benchmark) - ptarget[7] = 0x0cff; - - // we need bigendian data... - for (int i=0; i < 19; i++) { - be32enc(&endiandata[i], pdata[i]); - } - do { - be32enc(&endiandata[19], nonce); - polytimos_hash(hash, endiandata); - - if (hash[7] <= Htarg && fulltest(hash, ptarget)) { - work_set_target_ratio(work, hash); - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce; - return 1; - } - nonce++; - - } while (nonce < max_nonce && !(*restart)); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} diff --git a/algo/x14/veltor-4way.c b/algo/x14/veltor-4way.c deleted file mode 100644 index 4f35161..0000000 --- a/algo/x14/veltor-4way.c +++ /dev/null @@ -1,119 +0,0 @@ -#include "veltor-gate.h" -#include -#include -#include -#include - -#if defined(__AVX2__) && defined(__AES__) - -#include "algo/skein/skein-hash-4way.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/shabal/shabal-hash-4way.h" -#include "algo/gost/sph_gost.h" - -typedef struct { - skein512_4way_context skein; - sph_shavite512_context shavite; - shabal512_4way_context shabal; - sph_gost512_context gost; -} veltor_4way_ctx_holder; - -veltor_4way_ctx_holder veltor_4way_ctx __attribute__ ((aligned (64))); - -void init_veltor_4way_ctx() -{ - skein512_4way_init( &veltor_4way_ctx.skein ); - sph_shavite512_init( &veltor_4way_ctx.shavite ); - shabal512_4way_init( &veltor_4way_ctx.shabal ); - sph_gost512_init( &veltor_4way_ctx.gost ); -} - -void veltor_4way_hash( void *output, const void *input ) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - 
uint64_t vhash[8*4] __attribute__ ((aligned (64))); - veltor_4way_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &veltor_4way_ctx, sizeof(veltor_4way_ctx) ); - - skein512_4way( &ctx.skein, input, 80 ); - skein512_4way_close( &ctx.skein, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - - intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); - shabal512_4way( &ctx.shabal, vhash, 64 ); - shabal512_4way_close( &ctx.shabal, vhash ); - dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); - - sph_gost512( &ctx.gost, hash0, 64 ); - sph_gost512_close( &ctx.gost, hash0 ); - sph_gost512_init( &ctx.gost ); - sph_gost512( &ctx.gost, hash1, 64 ); - sph_gost512_close( &ctx.gost, hash1 ); - sph_gost512_init( &ctx.gost ); - sph_gost512( &ctx.gost, hash2, 64 ); - sph_gost512_close( &ctx.gost, hash2 ); - sph_gost512_init( &ctx.gost ); - sph_gost512( &ctx.gost, hash3, 64 ); - sph_gost512_close( &ctx.gost, hash3 ); - - memcpy( output, hash0, 32 ); - memcpy( output+32, hash1, 32 ); - memcpy( output+64, hash2, 32 ); - memcpy( output+96, hash3, 32 ); -} - -int scanhash_veltor_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - __m256i *noncev = 
(__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - volatile uint8_t *restart = &(work_restart[thr_id].restart); - - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - veltor_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !(*restart) ); - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/x14/veltor-gate.c b/algo/x14/veltor-gate.c deleted file mode 100644 index 5e7e74b..0000000 --- a/algo/x14/veltor-gate.c +++ /dev/null @@ -1,18 +0,0 @@ -#include "veltor-gate.h" - -bool register_veltor_algo( algo_gate_t* gate ) -{ -#if defined (VELTOR_4WAY) - init_veltor_4way_ctx(); - gate->scanhash = (void*)&scanhash_veltor_4way; - gate->hash = (void*)&veltor_4way_hash; -#else - init_veltor_ctx(); - gate->scanhash = (void*)&scanhash_veltor; - gate->hash = (void*)&veltor_hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -}; - diff --git a/algo/x14/veltor-gate.h b/algo/x14/veltor-gate.h deleted file mode 100644 index 7f97663..0000000 --- a/algo/x14/veltor-gate.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef VELTOR_GATE_H__ -#define VELTOR_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define VELTOR_4WAY -#endif - -bool register_veltor_algo( algo_gate_t* gate ); - -#if defined(VELTOR_4WAY) - -void veltor_4way_hash( void *state, const void *input ); - -int scanhash_veltor_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void init_veltor_4way_ctx(); - 
-#endif - -void veltor_hash( void *state, const void *input ); - -int scanhash_veltor( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void init_veltor_ctx(); - -#endif - diff --git a/algo/x14/veltor.c b/algo/x14/veltor.c deleted file mode 100644 index 8e95af4..0000000 --- a/algo/x14/veltor.c +++ /dev/null @@ -1,105 +0,0 @@ -#include "veltor-gate.h" -#include -#include -#include -#include - -#include "algo/skein/sph_skein.h" -#include "algo/gost/sph_gost.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/shabal/sph_shabal.h" - -typedef struct { - sph_gost512_context gost; - sph_shavite512_context shavite; - sph_skein512_context skein; - sph_shabal512_context shabal; -} veltor_ctx_holder; - -veltor_ctx_holder veltor_ctx __attribute__ ((aligned (64))); -static __thread sph_skein512_context veltor_skein_mid - __attribute__ ((aligned (64))); - -void init_veltor_ctx() -{ - sph_gost512_init( &veltor_ctx.gost ); - sph_shavite512_init( &veltor_ctx.shavite ); - sph_skein512_init( &veltor_ctx.skein); - sph_shabal512_init( &veltor_ctx.shabal); -} - -void veltor_skein512_midstate( const void* input ) -{ - memcpy( &veltor_skein_mid, &veltor_ctx.skein, sizeof veltor_skein_mid ); - sph_skein512( &veltor_skein_mid, input, 64 ); -} - -void veltor_hash(void *output, const void *input) -{ - uint32_t _ALIGN(64) hashA[16], hashB[16]; - - veltor_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &veltor_ctx, sizeof(veltor_ctx) ); - - const int midlen = 64; // bytes - const int tail = 80 - midlen; // 16 - - memcpy( &ctx.skein, &veltor_skein_mid, sizeof veltor_skein_mid ); - sph_skein512( &ctx.skein, input + midlen, tail ); - - sph_skein512_close(&ctx.skein, hashA); - - sph_shavite512(&ctx.shavite, hashA, 64); - sph_shavite512_close(&ctx.shavite, hashB); - - sph_shabal512(&ctx.shabal, hashB, 64); - sph_shabal512_close(&ctx.shabal, hashA); - - sph_gost512(&ctx.gost, hashA, 64); - sph_gost512_close(&ctx.gost, hashB); - - 
memcpy(output, hashB, 32); -} - -int scanhash_veltor( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(128) hash[8]; - uint32_t _ALIGN(128) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - int thr_id = mythr->id; // thr_id arg is deprecated - - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - - if (opt_benchmark) - ptarget[7] = 0x0cff; - - // we need bigendian data... - for (int i=0; i < 19; i++) { - be32enc(&endiandata[i], pdata[i]); - } - - veltor_skein512_midstate( endiandata ); - - do { - be32enc(&endiandata[19], nonce); - veltor_hash(hash, endiandata); - - if (hash[7] <= Htarg && fulltest(hash, ptarget)) { - work_set_target_ratio(work, hash); - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce; - return 1; - } - nonce++; - - } while (nonce < max_nonce && !(*restart)); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} diff --git a/algo/x14/x14-4way.c b/algo/x14/x14-4way.c deleted file mode 100644 index 5267d78..0000000 --- a/algo/x14/x14-4way.c +++ /dev/null @@ -1,245 +0,0 @@ -#include "x14-gate.h" - -#if defined(X14_4WAY) - -#include -#include -#include -#include -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/luffa-hash-2way.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/simd/simd-hash-2way.h" -#include "algo/echo/aes_ni/hash_api.h" -#include "algo/echo/sph_echo.h" -#include "algo/hamsi/hamsi-hash-4way.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/shabal/shabal-hash-4way.h" - -typedef struct { - blake512_4way_context blake; 
- bmw512_4way_context bmw; - hashState_groestl groestl; - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; - luffa_2way_context luffa; - cubehashParam cube; - sph_shavite512_context shavite; - simd_2way_context simd; - hashState_echo echo; - hamsi512_4way_context hamsi; - sph_fugue512_context fugue; - shabal512_4way_context shabal; -} x14_4way_ctx_holder; - -x14_4way_ctx_holder x14_4way_ctx __attribute__ ((aligned (64))); - -void init_x14_4way_ctx() -{ - blake512_4way_init( &x14_4way_ctx.blake ); - bmw512_4way_init( &x14_4way_ctx.bmw ); - init_groestl( &x14_4way_ctx.groestl, 64 ); - skein512_4way_init( &x14_4way_ctx.skein ); - jh512_4way_init( &x14_4way_ctx.jh ); - keccak512_4way_init( &x14_4way_ctx.keccak ); - luffa_2way_init( &x14_4way_ctx.luffa, 512 ); - cubehashInit( &x14_4way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x14_4way_ctx.shavite ); - simd_2way_init( &x14_4way_ctx.simd, 512 ); - init_echo( &x14_4way_ctx.echo, 512 ); - hamsi512_4way_init( &x14_4way_ctx.hamsi ); - sph_fugue512_init( &x14_4way_ctx.fugue ); - shabal512_4way_init( &x14_4way_ctx.shabal ); -}; - -void x14_4way_hash( void *state, const void *input ) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - x14_4way_ctx_holder ctx; - memcpy( &ctx, &x14_4way_ctx, sizeof(x14_4way_ctx) ); - - // 1 Blake - blake512_4way( &ctx.blake, input, 80 ); - blake512_4way_close( &ctx.blake, vhash ); - - // 2 Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - // Serial - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // 3 Groestl - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - memcpy( &ctx.groestl, &x14_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( 
&ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - memcpy( &ctx.groestl, &x14_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - memcpy( &ctx.groestl, &x14_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - - // Parallel 4way - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - // 4 Skein - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - - // 5 JH - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - // 6 Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - - // Serial - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // 7 Luffa - intrlv_2x128( vhash, hash0, hash1, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, hash2, hash3, 512 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - dintrlv_2x128( hash2, hash3, vhash, 512 ); - - // 8 Cubehash - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); - memcpy( &ctx.cube, &x14_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); - memcpy( &ctx.cube, &x14_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 ); - memcpy( &ctx.cube, &x14_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); - - // 9 Shavite - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - memcpy( &ctx.shavite, &x14_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - memcpy( &ctx.shavite, &x14_4way_ctx.shavite, - 
sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - memcpy( &ctx.shavite, &x14_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - - // 10 Simd - intrlv_2x128( vhash, hash0, hash1, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, hash2, hash3, 512 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_2x128( hash2, hash3, vhash, 512 ); - - // 11 Echo - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - memcpy( &ctx.echo, &x14_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - memcpy( &ctx.echo, &x14_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - memcpy( &ctx.echo, &x14_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - - // 12 Hamsi parallel 4way 32 bit - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // 13 Fugue serial - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - memcpy( &ctx.fugue, &x14_4way_ctx.fugue, sizeof(sph_fugue512_context) ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - memcpy( &ctx.fugue, &x14_4way_ctx.fugue, sizeof(sph_fugue512_context) ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - memcpy( &ctx.fugue, &x14_4way_ctx.fugue, sizeof(sph_fugue512_context) ); - sph_fugue512( &ctx.fugue, 
hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - - // 14 Shabal, parallel 32 bit - intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); - shabal512_4way( &ctx.shabal, vhash, 64 ); - shabal512_4way_close( &ctx.shabal, state ); -} - -int scanhash_x14_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*16] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - - for ( int m=0; m < 6; m++ ) - if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - x14_4way_hash( hash, vdata ); - pdata[19] = n; - - uint32_t *hash7 = &(hash[7<<2]); - - for ( int lane = 0; lane < 4; lane++ ) - if ( ( hash7[ lane ] & mask ) == 0 ) - { - // deinterleave hash for lane - uint32_t lane_hash[8]; - extr_lane_4x32( lane_hash, hash, lane, 256 ); - - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_lane_solution( work, lane_hash, mythr, lane ); - } - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/x14/x14-gate.c b/algo/x14/x14-gate.c deleted file mode 100644 index d02c305..0000000 --- a/algo/x14/x14-gate.c +++ /dev/null @@ -1,18 +0,0 @@ -#include "x14-gate.h" - -bool register_x14_algo( algo_gate_t* gate ) -{ -#if defined 
(X14_4WAY) - init_x14_4way_ctx(); - gate->scanhash = (void*)&scanhash_x14_4way; - gate->hash = (void*)&x14_4way_hash; -#else - init_x14_ctx(); - gate->scanhash = (void*)&scanhash_x14; - gate->hash = (void*)&x14hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -}; - diff --git a/algo/x14/x14-gate.h b/algo/x14/x14-gate.h deleted file mode 100644 index 9df974f..0000000 --- a/algo/x14/x14-gate.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef X14_GATE_H__ -#define X14_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define X14_4WAY -#endif - -bool register_x14_algo( algo_gate_t* gate ); - -#if defined(X14_4WAY) - -void x14_4way_hash( void *state, const void *input ); -int scanhash_x14_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_x14_4way_ctx(); - -#endif - -void x14hash( void *state, const void *input ); -int scanhash_x14( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_x14_ctx(); - -#endif - diff --git a/algo/x14/x14.c b/algo/x14/x14.c deleted file mode 100644 index 771805c..0000000 --- a/algo/x14/x14.c +++ /dev/null @@ -1,251 +0,0 @@ -#include "x14-gate.h" - -#include -#include -#include -#include - -#include "algo/blake/sph_blake.h" -#include "algo/bmw/sph_bmw.h" -#include "algo/groestl/sph_groestl.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/luffa/sph_luffa.h" -#include "algo/cubehash/sph_cubehash.h" -#include "algo/simd/sph_simd.h" -#include "algo/echo/sph_echo.h" -#include "algo/hamsi/sph_hamsi.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/shabal/sph_shabal.h" - -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" 
-#include "algo/bmw/sse2/bmw.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" - -#ifndef NO_AES_NI - #include "algo/groestl/aes_ni/hash-groestl.h" - #include "algo/echo/aes_ni/hash_api.h" -#endif - -typedef struct { -#ifdef NO_AES_NI - sph_groestl512_context groestl; - sph_echo512_context echo; -#else - hashState_echo echo; - hashState_groestl groestl; -#endif - hashState_luffa luffa; - cubehashParam cube; - sph_shavite512_context shavite; - hashState_sd simd; - sph_hamsi512_context hamsi; - sph_fugue512_context fugue; - sph_shabal512_context shabal; -} x14_ctx_holder; - -x14_ctx_holder x14_ctx; - -void init_x14_ctx() -{ -#ifdef NO_AES_NI - sph_groestl512_init(&x14_ctx.groestl); - sph_echo512_init(&x14_ctx.echo); -#else - init_echo(&x14_ctx.echo, 512); - init_groestl(&x14_ctx.groestl, 64 ); -#endif - init_luffa(&x14_ctx.luffa,512); - cubehashInit(&x14_ctx.cube,512,16,32); - sph_shavite512_init(&x14_ctx.shavite); - init_sd(&x14_ctx.simd,512); - sph_hamsi512_init(&x14_ctx.hamsi); - sph_fugue512_init(&x14_ctx.fugue); - sph_shabal512_init(&x14_ctx.shabal); -}; - -void x14hash(void *output, const void *input) -{ - unsigned char hash[128] __attribute__ ((aligned (32))); - #define hashB hash+64 - - x14_ctx_holder ctx; - memcpy(&ctx, &x14_ctx, sizeof(x14_ctx)); - - unsigned char hashbuf[128]; - size_t hashptr; - sph_u64 hashctA; - sph_u64 hashctB; - - //---blake1--- - - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; - - //---bmw2--- - - DECL_BMW; - BMW_I; - BMW_U; - - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - - BMW_C; - - #undef M - #undef H - #undef dH - - //---groestl---- - -#ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); -#else - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); -#endif - - //---skein4--- - - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; - - //---jh5------ - - 
DECL_JH; - JH_H; - - //---keccak6--- - - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; - - //--- luffa7 - update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, - (const BitSequence*)hash, 64 ); - - // 8 Cube - cubehashUpdateDigest( &ctx.cube, (byte*) hash, - (const byte*)hashB, 64 ); - - // 9 Shavite - sph_shavite512( &ctx.shavite, hash, 64); - sph_shavite512_close( &ctx.shavite, hashB); - - // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hashB, 512 ); - - //11---echo--- -#ifdef NO_AES_NI - sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hashB); -#else - update_final_echo ( &ctx.echo, (BitSequence *)hashB, - (const BitSequence *)hash, 512 ); -#endif - - // X13 algos - - // 12 Hamsi - sph_hamsi512(&ctx.hamsi, hashB, 64); - sph_hamsi512_close(&ctx.hamsi, hash); - - // 13 Fugue - sph_fugue512(&ctx.fugue, hash, 64); - sph_fugue512_close(&ctx.fugue, hashB); - - // X14 Shabal - sph_shabal512(&ctx.shabal, hashB, 64); - sph_shabal512_close(&ctx.shabal, hash); - - - asm volatile ("emms"); - memcpy(output, hash, 32); -} - -int scanhash_x14( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t hash64[8] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - // we need bigendian data... 
- swab32_array( endiandata, pdata, 20 ); - -#ifdef DEBUG_ALGO - if (Htarg != 0) - printf("[%d] Htarg=%X\n", thr_id, Htarg); -#endif - for (int m=0; m < 6; m++) { - if (Htarg <= htmax[m]) { - uint32_t mask = masks[m]; - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - x14hash(hash64, endiandata); -#ifndef DEBUG_ALGO - if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } -#else - if (!(n % 0x1000) && !thr_id) printf("."); - if (!(hash64[7] & mask)) { - printf("[%d]",thr_id); - if (fulltest(hash64, ptarget)) { - work_set_target_ratio( work, hash64 ); - *hashes_done = n - first_nonce + 1; - return true; - } - } -#endif - } while (n < max_nonce && !work_restart[thr_id].restart); - // see blake.c if else to understand the loop on htmax => mask - break; - } - } - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} diff --git a/algo/x15/x15-4way.c b/algo/x15/x15-4way.c deleted file mode 100644 index 87fe361..0000000 --- a/algo/x15/x15-4way.c +++ /dev/null @@ -1,264 +0,0 @@ -#include "x15-gate.h" - -#if defined(X15_4WAY) - -#include -#include -#include -#include -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/luffa-hash-2way.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/simd/simd-hash-2way.h" -#include "algo/echo/aes_ni/hash_api.h" -#include "algo/echo/sph_echo.h" -#include "algo/hamsi/hamsi-hash-4way.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/shabal/shabal-hash-4way.h" -#include "algo/whirlpool/sph_whirlpool.h" - -typedef struct { - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_groestl groestl; - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; - 
luffa_2way_context luffa; - cubehashParam cube; - sph_shavite512_context shavite; - simd_2way_context simd; - hashState_echo echo; - hamsi512_4way_context hamsi; - sph_fugue512_context fugue; - shabal512_4way_context shabal; - sph_whirlpool_context whirlpool; -} x15_4way_ctx_holder; - -x15_4way_ctx_holder x15_4way_ctx __attribute__ ((aligned (64))); - -void init_x15_4way_ctx() -{ - blake512_4way_init( &x15_4way_ctx.blake ); - bmw512_4way_init( &x15_4way_ctx.bmw ); - init_groestl( &x15_4way_ctx.groestl, 64 ); - skein512_4way_init( &x15_4way_ctx.skein ); - jh512_4way_init( &x15_4way_ctx.jh ); - keccak512_4way_init( &x15_4way_ctx.keccak ); - luffa_2way_init( &x15_4way_ctx.luffa, 512 ); - cubehashInit( &x15_4way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x15_4way_ctx.shavite ); - simd_2way_init( &x15_4way_ctx.simd, 512 ); - init_echo( &x15_4way_ctx.echo, 512 ); - hamsi512_4way_init( &x15_4way_ctx.hamsi ); - sph_fugue512_init( &x15_4way_ctx.fugue ); - shabal512_4way_init( &x15_4way_ctx.shabal ); - sph_whirlpool_init( &x15_4way_ctx.whirlpool ); -}; - -void x15_4way_hash( void *state, const void *input ) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - x15_4way_ctx_holder ctx; - memcpy( &ctx, &x15_4way_ctx, sizeof(x15_4way_ctx) ); - - // 1 Blake - blake512_4way( &ctx.blake, input, 80 ); - blake512_4way_close( &ctx.blake, vhash ); - - // 2 Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - // Serial - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // 3 Groestl - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - memcpy( &ctx.groestl, &x15_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - memcpy( 
&ctx.groestl, &x15_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - memcpy( &ctx.groestl, &x15_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - - // Parallel 4way - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - // 4 Skein - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - - // 5 JH - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - // 6 Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - - // Serial to the end - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // 7 Luffa - intrlv_2x128( vhash, hash0, hash1, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, hash2, hash3, 512 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - dintrlv_2x128( hash2, hash3, vhash, 512 ); - - // 8 Cubehash - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); - memcpy( &ctx.cube, &x15_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); - memcpy( &ctx.cube, &x15_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 ); - memcpy( &ctx.cube, &x15_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); - - // 9 Shavite - sph_shavite512( &ctx.shavite, hash0, 64 ); - sph_shavite512_close( &ctx.shavite, hash0 ); - memcpy( &ctx.shavite, &x15_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash1, 64 ); - sph_shavite512_close( &ctx.shavite, hash1 ); - memcpy( &ctx.shavite, &x15_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - 
sph_shavite512( &ctx.shavite, hash2, 64 ); - sph_shavite512_close( &ctx.shavite, hash2 ); - memcpy( &ctx.shavite, &x15_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash3, 64 ); - sph_shavite512_close( &ctx.shavite, hash3 ); - - // 10 Simd - intrlv_2x128( vhash, hash0, hash1, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, hash2, hash3, 512 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_2x128( hash2, hash3, vhash, 512 ); - - // 11 Echo - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - memcpy( &ctx.echo, &x15_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - memcpy( &ctx.echo, &x15_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - memcpy( &ctx.echo, &x15_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - - // 12 Hamsi parallel 4way 32 bit - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // 13 Fugue - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - memcpy( &ctx.fugue, &x15_4way_ctx.fugue, sizeof(sph_fugue512_context) ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - memcpy( &ctx.fugue, &x15_4way_ctx.fugue, sizeof(sph_fugue512_context) ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - memcpy( &ctx.fugue, &x15_4way_ctx.fugue, sizeof(sph_fugue512_context) ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, 
hash3 ); - - // 14 Shabal, parallel 32 bit - intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); - shabal512_4way( &ctx.shabal, vhash, 64 ); - shabal512_4way_close( &ctx.shabal, vhash ); - dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); - - // 15 Whirlpool - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - memcpy( &ctx.whirlpool, &x15_4way_ctx.whirlpool, - sizeof(sph_whirlpool_context) ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - memcpy( &ctx.whirlpool, &x15_4way_ctx.whirlpool, - sizeof(sph_whirlpool_context) ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - memcpy( &ctx.whirlpool, &x15_4way_ctx.whirlpool, - sizeof(sph_whirlpool_context) ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - - memcpy( state, hash0, 32 ); - memcpy( state+32, hash1, 32 ); - memcpy( state+64, hash2, 32 ); - memcpy( state+96, hash3, 32 ); -} - -int scanhash_x15_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; - - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - - for ( int m=0; m < 6; m++ ) - if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - x15_4way_hash( hash, vdata ); 
- pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( ( (hash+(i<<3))[7] & mask ) == 0 ) - if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash, mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/x15/x15-gate.c b/algo/x15/x15-gate.c deleted file mode 100644 index da33192..0000000 --- a/algo/x15/x15-gate.c +++ /dev/null @@ -1,17 +0,0 @@ -#include "x15-gate.h" - -bool register_x15_algo( algo_gate_t* gate ) -{ -#if defined (X15_4WAY) - init_x15_4way_ctx(); - gate->scanhash = (void*)&scanhash_x15_4way; - gate->hash = (void*)&x15_4way_hash; -#else - init_x15_ctx(); - gate->scanhash = (void*)&scanhash_x15; - gate->hash = (void*)&x15hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - return true; -}; - diff --git a/algo/x15/x15-gate.h b/algo/x15/x15-gate.h deleted file mode 100644 index 8224fe2..0000000 --- a/algo/x15/x15-gate.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef X15_GATE_H__ -#define X15_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define X15_4WAY -#endif - -bool register_x15_algo( algo_gate_t* gate ); - -#if defined(X15_4WAY) - -void x15_4way_hash( void *state, const void *input ); -int scanhash_x15_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_x15_4way_ctx(); - -#endif - -void x15hash( void *state, const void *input ); -int scanhash_x15( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_x15_ctx(); - -#endif - diff --git a/algo/x15/x15.c b/algo/x15/x15.c deleted file mode 100644 index 29baafe..0000000 --- a/algo/x15/x15.c +++ /dev/null @@ -1,262 +0,0 @@ -#include "x15-gate.h" - -#include -#include -#include -#include - -#include "algo/blake/sph_blake.h" -#include "algo/bmw/sph_bmw.h" 
-#include "algo/groestl/sph_groestl.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/luffa/sph_luffa.h" -#include "algo/cubehash/sph_cubehash.h" -#include "algo/simd/sph_simd.h" -#include "algo/echo/sph_echo.h" -#include "algo/hamsi/sph_hamsi.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/shabal/sph_shabal.h" -#include "algo/whirlpool/sph_whirlpool.h" - -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" - -#ifndef NO_AES_NI - #include "algo/echo/aes_ni/hash_api.h" - #include "algo/groestl/aes_ni/hash-groestl.h" -#endif - -typedef struct { -#ifdef NO_AES_NI - sph_groestl512_context groestl; - sph_echo512_context echo; -#else - hashState_echo echo; - hashState_groestl groestl; -#endif - hashState_luffa luffa; - cubehashParam cubehash; - sph_shavite512_context shavite; - hashState_sd simd; - sph_hamsi512_context hamsi; - sph_fugue512_context fugue; - sph_shabal512_context shabal; - sph_whirlpool_context whirlpool; -} x15_ctx_holder; - -x15_ctx_holder x15_ctx; - -void init_x15_ctx() -{ -#ifdef NO_AES_NI - sph_groestl512_init(&x15_ctx.groestl); - sph_echo512_init(&x15_ctx.echo); -#else - init_echo( &x15_ctx.echo, 512 ); - init_groestl( &x15_ctx.groestl, 64 ); -#endif - init_luffa( &x15_ctx.luffa, 512 ); - cubehashInit( &x15_ctx.cubehash, 512, 16, 32 ); - sph_shavite512_init( &x15_ctx.shavite ); - init_sd( &x15_ctx.simd, 512 ); - sph_hamsi512_init( &x15_ctx.hamsi ); - sph_fugue512_init( &x15_ctx.fugue ); - sph_shabal512_init( &x15_ctx.shabal ); - sph_whirlpool_init( &x15_ctx.whirlpool ); -}; - -void x15hash(void *output, const void *input) -{ - unsigned char hash[128] __attribute__ ((aligned (32))); - #define hashB 
hash+64 - - x15_ctx_holder ctx; - memcpy( &ctx, &x15_ctx, sizeof(x15_ctx) ); - - unsigned char hashbuf[128]; - size_t hashptr; - sph_u64 hashctA; - sph_u64 hashctB; - - //---blake1--- - - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; - - //---bmw2--- - DECL_BMW; - BMW_I; - BMW_U; - - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - - BMW_C; - - #undef M - #undef H - #undef dH - - //---groestl---- - -#ifdef NO_AES_NI - sph_groestl512(&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); -#else - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); -#endif - - //---skein4--- - - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; - - //---jh5------ - - DECL_JH; - JH_H; - - //---keccak6--- - - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; - - //--- luffa7 - update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, - (const BitSequence*)hash, 64 ); - - // 8 Cube - cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, - (const byte*)hashB, 64 ); - - // 9 Shavite - sph_shavite512( &ctx.shavite, hash, 64); - sph_shavite512_close( &ctx.shavite, hashB); - - // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hashB, 512 ); - - //11---echo--- - -#ifdef NO_AES_NI - sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hashB); -#else - update_final_echo ( &ctx.echo, (BitSequence *)hashB, - (const BitSequence *)hash, 512 ); -#endif - - // X13 algos - // 12 Hamsi - sph_hamsi512(&ctx.hamsi, hashB, 64); - sph_hamsi512_close(&ctx.hamsi, hash); - - // 13 Fugue - sph_fugue512(&ctx.fugue, hash, 64); - sph_fugue512_close(&ctx.fugue, hashB); - - // X14 Shabal - sph_shabal512(&ctx.shabal, hashB, 64); - sph_shabal512_close(&ctx.shabal, hash); - - // X15 Whirlpool - sph_whirlpool(&ctx.whirlpool, hash, 64); - sph_whirlpool_close(&ctx.whirlpool, hashB); - - - asm volatile ("emms"); - memcpy(output, hashB, 32); -} - -int scanhash_x15( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, 
struct thr_info *mythr ) -{ - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t hash64[8] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - // we need bigendian data... - swab32_array( endiandata, pdata, 20 ); - -#ifdef DEBUG_ALGO - if (Htarg != 0) - printf("[%d] Htarg=%X\n", thr_id, Htarg); -#endif - for (int m=0; m < 6; m++) { - if (Htarg <= htmax[m]) { - uint32_t mask = masks[m]; - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - x15hash(hash64, endiandata); -#ifndef DEBUG_ALGO - if (!(hash64[7] & mask)) - { - if ( fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } -// else -// { -// applog(LOG_INFO, "Result does not validate on CPU!"); -// } - } -#else - if (!(n % 0x1000) && !thr_id) printf("."); - if (!(hash64[7] & mask)) { - printf("[%d]",thr_id); - if (fulltest(hash64, ptarget)) { - submit_solution( work, hash64, mythr ); - } - } -#endif - } while (n < max_nonce && !work_restart[thr_id].restart); - // see blake.c if else to understand the loop on htmax => mask - break; - } - } - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} diff --git a/algo/x16/hex.c b/algo/x16/hex.c deleted file mode 100644 index 631e428..0000000 --- a/algo/x16/hex.c +++ /dev/null @@ -1,247 +0,0 @@ -/** - * x16r algo implementation - * - * Implementation by tpruvot@github Jan 2018 - * Optimized by JayDDee@github Jan 2018 - */ -#include "x16r-gate.h" - -#include -#include -#include -#include "algo/blake/sph_blake.h" -#include "algo/bmw/sph_bmw.h" -#include "algo/groestl/sph_groestl.h" -#include "algo/jh/sph_jh.h" 
-#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/nist.h" -#include "algo/echo/sph_echo.h" -#include "algo/hamsi/sph_hamsi.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/shabal/sph_shabal.h" -#include "algo/whirlpool/sph_whirlpool.h" -#include -#if defined(__AES__) - #include "algo/echo/aes_ni/hash_api.h" - #include "algo/groestl/aes_ni/hash-groestl.h" -#endif - -static __thread uint32_t s_ntime = UINT32_MAX; -static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; - -static void hex_getAlgoString(const uint32_t* prevblock, char *output) -{ - char *sptr = output; - uint8_t* data = (uint8_t*)prevblock; - - for (uint8_t j = 0; j < X16R_HASH_FUNC_COUNT; j++) { - uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed - uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4; - if (algoDigit >= 10) - sprintf(sptr, "%c", 'A' + (algoDigit - 10)); - else - sprintf(sptr, "%u", (uint32_t) algoDigit); - sptr++; - } - *sptr = '\0'; -} - -union _hex_context_overlay -{ -#if defined(__AES__) - hashState_echo echo; - hashState_groestl groestl; -#else - sph_groestl512_context groestl; - sph_echo512_context echo; -#endif - sph_blake512_context blake; - sph_bmw512_context bmw; - sph_skein512_context skein; - sph_jh512_context jh; - sph_keccak512_context keccak; - hashState_luffa luffa; - cubehashParam cube; - sph_shavite512_context shavite; - hashState_sd simd; - sph_hamsi512_context hamsi; - sph_fugue512_context fugue; - sph_shabal512_context shabal; - sph_whirlpool_context whirlpool; - SHA512_CTX sha512; -}; -typedef union _hex_context_overlay hex_context_overlay; - -void hex_hash( void* output, const void* input ) -{ - uint32_t _ALIGN(128) hash[16]; - hex_context_overlay ctx; - void *in = (void*) input; - int size = 80; -/* - if ( s_ntime == UINT32_MAX ) - { - const uint8_t* in8 = (uint8_t*) 
input; - x16_r_s_getAlgoString( &in8[4], hashOrder ); - } -*/ - - char elem = hashOrder[0]; - uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; - - for ( int i = 0; i < 16; i++ ) - { - switch ( algo ) - { - case BLAKE: - sph_blake512_init( &ctx.blake ); - sph_blake512( &ctx.blake, in, size ); - sph_blake512_close( &ctx.blake, hash ); - break; - case BMW: - sph_bmw512_init( &ctx.bmw ); - sph_bmw512(&ctx.bmw, in, size); - sph_bmw512_close(&ctx.bmw, hash); - break; - case GROESTL: -#if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)in, size<<3 ); -#else - sph_groestl512_init( &ctx.groestl ); - sph_groestl512( &ctx.groestl, in, size ); - sph_groestl512_close(&ctx.groestl, hash); -#endif - break; - case SKEIN: - sph_skein512_init( &ctx.skein ); - sph_skein512( &ctx.skein, in, size ); - sph_skein512_close( &ctx.skein, hash ); - break; - case JH: - sph_jh512_init( &ctx.jh ); - sph_jh512(&ctx.jh, in, size ); - sph_jh512_close(&ctx.jh, hash ); - break; - case KECCAK: - sph_keccak512_init( &ctx.keccak ); - sph_keccak512( &ctx.keccak, in, size ); - sph_keccak512_close( &ctx.keccak, hash ); - break; - case LUFFA: - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)in, size ); - break; - case CUBEHASH: - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash, - (const byte*)in, size ); - break; - case SHAVITE: - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in, size ); - sph_shavite512_close( &ctx.shavite, hash ); - break; - case SIMD: - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence*)in, size<<3 ); - break; - case ECHO: -#if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence*)in, size<<3 ); -#else - sph_echo512_init( &ctx.echo ); - sph_echo512( &ctx.echo, 
in, size ); - sph_echo512_close( &ctx.echo, hash ); -#endif - break; - case HAMSI: - sph_hamsi512_init( &ctx.hamsi ); - sph_hamsi512( &ctx.hamsi, in, size ); - sph_hamsi512_close( &ctx.hamsi, hash ); - break; - case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in, size ); - sph_fugue512_close( &ctx.fugue, hash ); - break; - case SHABAL: - sph_shabal512_init( &ctx.shabal ); - sph_shabal512( &ctx.shabal, in, size ); - sph_shabal512_close( &ctx.shabal, hash ); - break; - case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in, size ); - sph_whirlpool_close( &ctx.whirlpool, hash ); - break; - case SHA_512: - SHA512_Init( &ctx.sha512 ); - SHA512_Update( &ctx.sha512, in, size ); - SHA512_Final( (unsigned char*) hash, &ctx.sha512 ); - break; - } - algo = (uint8_t)hash[0] % X16R_HASH_FUNC_COUNT; - in = (void*) hash; - size = 64; - } - memcpy(output, hash, 32); -} - -int scanhash_hex( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(128) hash32[8]; - uint32_t _ALIGN(128) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - uint32_t nonce = first_nonce; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - - casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - - uint32_t ntime = swab32(pdata[17]); - if ( s_ntime != ntime ) - { - hex_getAlgoString( (const uint32_t*) (&endiandata[1]), hashOrder ); - s_ntime = ntime; - if ( opt_debug && !thr_id ) - applog( 
LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime ); - } - - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - do - { - be32enc( &endiandata[19], nonce ); - hex_hash( hash32, endiandata ); - - if ( hash32[7] <= Htarg ) - if (fulltest( hash32, ptarget ) && !opt_benchmark ) - { - pdata[19] = nonce; - submit_solution( work, hash32, mythr ); - } - nonce++; - } while ( nonce < max_nonce && !(*restart) ); - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c deleted file mode 100644 index 1338f8b..0000000 --- a/algo/x16/x16r-4way.c +++ /dev/null @@ -1,342 +0,0 @@ -/** - * x16r algo implementation - * - * Implementation by tpruvot@github Jan 2018 - * Optimized by JayDDee@github Jan 2018 - */ -#include "x16r-gate.h" - -#if defined (X16R_4WAY) - -#include -#include -#include -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/luffa/luffa-hash-2way.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/simd-hash-2way.h" -#include "algo/echo/aes_ni/hash_api.h" -#include "algo/hamsi/hamsi-hash-4way.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/shabal/shabal-hash-4way.h" -#include "algo/whirlpool/sph_whirlpool.h" -#include "algo/sha/sha2-hash-4way.h" - -static __thread uint32_t s_ntime = UINT32_MAX; -static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; - -union _x16r_4way_context_overlay -{ - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_echo echo; - hashState_groestl groestl; - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; - luffa_2way_context luffa; - cubehashParam cube; - sph_shavite512_context shavite; - 
simd_2way_context simd; - hamsi512_4way_context hamsi; - sph_fugue512_context fugue; - shabal512_4way_context shabal; - sph_whirlpool_context whirlpool; - sha512_4way_context sha512; -}; -typedef union _x16r_4way_context_overlay x16r_4way_context_overlay; - -void x16r_4way_hash( void* output, const void* input ) -{ - uint32_t hash0[24] __attribute__ ((aligned (64))); - uint32_t hash1[24] __attribute__ ((aligned (64))); - uint32_t hash2[24] __attribute__ ((aligned (64))); - uint32_t hash3[24] __attribute__ ((aligned (64))); - uint32_t vhash[24*4] __attribute__ ((aligned (64))); - x16r_4way_context_overlay ctx; - void *in0 = (void*) hash0; - void *in1 = (void*) hash1; - void *in2 = (void*) hash2; - void *in3 = (void*) hash3; - int size = 80; - - dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 ); -/* - if ( s_ntime == UINT32_MAX ) - { - const uint8_t* tmp = (uint8_t*) in0; - x16_r_s_getAlgoString( &tmp[4], hashOrder ); - } -*/ - // Input data is both 64 bit interleaved (input) - // and deinterleaved in inp0-3. - // If First function uses 64 bit data it is not required to interleave inp - // first. It may use the inerleaved data dmost convenient, ie 4way 64 bit. - // All other functions assume data is deinterleaved in hash0-3 - // All functions must exit with data deinterleaved in hash0-3. - // Alias in0-3 points to either inp0-3 or hash0-3 according to - // its hashOrder position. Size is also set accordingly. - for ( int i = 0; i < 16; i++ ) - { - const char elem = hashOrder[i]; - const uint8_t algo = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; - - switch ( algo ) - { - case BLAKE: - blake512_4way_init( &ctx.blake ); - if ( i == 0 ) - blake512_4way( &ctx.blake, input, size ); - else - { - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - blake512_4way( &ctx.blake, vhash, size ); - } - blake512_4way_close( &ctx.blake, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case BMW: - bmw512_4way_init( &ctx.bmw ); - if ( i == 0 ) - bmw512_4way( &ctx.bmw, input, size ); - else - { - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - bmw512_4way( &ctx.bmw, vhash, size ); - } - bmw512_4way_close( &ctx.bmw, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case GROESTL: - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (const char*)in0, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (const char*)in1, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (const char*)in2, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (const char*)in3, size<<3 ); - break; - case SKEIN: - skein512_4way_init( &ctx.skein ); - if ( i == 0 ) - skein512_4way( &ctx.skein, input, size ); - else - { - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - skein512_4way( &ctx.skein, vhash, size ); - } - skein512_4way_close( &ctx.skein, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case JH: - jh512_4way_init( &ctx.jh ); - if ( i == 0 ) - jh512_4way( &ctx.jh, input, size ); - else - { - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - jh512_4way( &ctx.jh, vhash, size ); - } - jh512_4way_close( &ctx.jh, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case KECCAK: - keccak512_4way_init( &ctx.keccak ); - if ( i == 0 ) - keccak512_4way( &ctx.keccak, input, size ); - else - { - intrlv_4x64( 
vhash, in0, in1, in2, in3, size<<3 ); - keccak512_4way( &ctx.keccak, vhash, size ); - } - keccak512_4way_close( &ctx.keccak, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case LUFFA: - intrlv_2x128( vhash, in0, in1, size<<3 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, size ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, in2, in3, size<<3 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, size); - dintrlv_2x128( hash2, hash3, vhash, 512 ); - break; - case CUBEHASH: - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, - (const byte*)in0, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, - (const byte*)in2, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3, size ); - break; - case SHAVITE: - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in0, size ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in1, size ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in2, size ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in3, size ); - sph_shavite512_close( &ctx.shavite, hash3 ); - break; - case SIMD: - intrlv_2x128( vhash, in0, in1, size<<3 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, in2, in3, size<<3 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); - 
dintrlv_2x128( hash2, hash3, vhash, 512 ); - break; - case ECHO: - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); - break; - case HAMSI: - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, size ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in0, size ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in1, size ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in2, size ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in3, size ); - sph_fugue512_close( &ctx.fugue, hash3 ); - break; - case SHABAL: - intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, size ); - shabal512_4way_close( &ctx.shabal, vhash ); - dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( 
&ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - break; - case SHA_512: - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, size ); - sha512_4way_close( &ctx.sha512, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - } - size = 64; - } - memcpy( output, hash0, 32 ); - memcpy( output+32, hash1, 32 ); - memcpy( output+64, hash2, 32 ); - memcpy( output+96, hash3, 32 ); -} - -int scanhash_x16r_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr) -{ - uint32_t hash[4*16] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - __m256i *noncev = (__m256i*)vdata + 9; // aligned - volatile uint8_t *restart = &(work_restart[thr_id].restart); - - casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) ); - casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) ); - casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - - if ( s_ntime != endiandata[17] ) - { - uint32_t ntime = swab32(pdata[17]); - x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder ); - s_ntime = ntime; - if ( opt_debug && !thr_id ) - applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime ); - } - - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - uint64_t *edata = (uint64_t*)endiandata; - intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); - - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - 
- x16r_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg ) - if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !(*restart) ); - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/x16/x16r-gate.c b/algo/x16/x16r-gate.c deleted file mode 100644 index db2b20c..0000000 --- a/algo/x16/x16r-gate.c +++ /dev/null @@ -1,251 +0,0 @@ -#include "x16r-gate.h" - -void x16r_getAlgoString( const uint8_t* prevblock, char *output ) -{ - char *sptr = output; - for ( int j = 0; j < X16R_HASH_FUNC_COUNT; j++ ) - { - uint8_t b = (15 - j) >> 1; // 16 first ascii hex chars (lsb in uint256) - uint8_t algoDigit = (j & 1) ? prevblock[b] & 0xF : prevblock[b] >> 4; - if (algoDigit >= 10) - sprintf(sptr, "%c", 'A' + (algoDigit - 10)); - else - sprintf(sptr, "%u", (uint32_t) algoDigit); - sptr++; - } - *sptr = '\0'; -} - -void x16s_getAlgoString( const uint8_t* prevblock, char *output ) -{ - strcpy( output, "0123456789ABCDEF" ); - for ( int i = 0; i < 16; i++ ) - { - uint8_t b = (15 - i) >> 1; // 16 ascii hex chars, reversed - uint8_t algoDigit = (i & 1) ? 
prevblock[b] & 0xF : prevblock[b] >> 4; - int offset = algoDigit; - // insert the nth character at the front - char oldVal = output[offset]; - for( int j = offset; j-- > 0; ) - output[j+1] = output[j]; - output[0] = oldVal; - } -} - -bool register_x16r_algo( algo_gate_t* gate ) -{ -#if defined (X16R_4WAY) - gate->scanhash = (void*)&scanhash_x16r_4way; - gate->hash = (void*)&x16r_4way_hash; -#else - gate->scanhash = (void*)&scanhash_x16r; - gate->hash = (void*)&x16r_hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->set_target = (void*)&alt_set_target; - x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; - return true; -}; - -bool register_x16s_algo( algo_gate_t* gate ) -{ -#if defined (X16R_4WAY) - gate->scanhash = (void*)&scanhash_x16r_4way; - gate->hash = (void*)&x16r_4way_hash; -#else - gate->scanhash = (void*)&scanhash_x16r; - gate->hash = (void*)&x16r_hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->set_target = (void*)&alt_set_target; - x16_r_s_getAlgoString = (void*)&x16s_getAlgoString; - return true; -}; - -//////////////// -// -// X16RT - - -void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash ) -{ - int32_t maskedTime = timeStamp & 0xffffff80; - sha256d( (unsigned char*)timeHash, (const unsigned char*)( &maskedTime ), - sizeof( maskedTime ) ); -} - -void x16rt_getAlgoString( const uint32_t *timeHash, char *output) -{ - char *sptr = output; - uint8_t* data = (uint8_t*)timeHash; - - for (uint8_t j = 0; j < X16R_HASH_FUNC_COUNT; j++) { - uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed - uint8_t algoDigit = (j & 1) ? 
data[b] & 0xF : data[b] >> 4; - - if (algoDigit >= 10) - sprintf(sptr, "%c", 'A' + (algoDigit - 10)); - else - sprintf(sptr, "%u", (uint32_t) algoDigit); - sptr++; - } - *sptr = '\0'; -} - -void x16rt_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) -{ - uchar merkle_tree[64] = { 0 }; - size_t t; - - algo_gate.gen_merkle_root( merkle_tree, sctx ); - // Increment extranonce2 - for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); - - // Assemble block header -// algo_gate.build_block_header( g_work, le32dec( sctx->job.version ), -// (uint32_t*) sctx->job.prevhash, (uint32_t*) merkle_tree, -// le32dec( sctx->job.ntime ), le32dec(sctx->job.nbits) ); - int i; - - memset( g_work->data, 0, sizeof(g_work->data) ); - g_work->data[0] = le32dec( sctx->job.version ); - - if ( have_stratum ) - for ( i = 0; i < 8; i++ ) - g_work->data[ 1+i ] = le32dec( (uint32_t*)sctx->job.prevhash + i ); - else - for (i = 0; i < 8; i++) - g_work->data[ 8-i ] = le32dec( (uint32_t*)sctx->job.prevhash + i ); - - g_work->data[ algo_gate.ntime_index ] = le32dec( sctx->job.ntime ); - g_work->data[ algo_gate.nbits_index ] = le32dec( sctx->job.nbits ); - g_work->data[20] = 0x80000000; - g_work->data[31] = 0x00000280; - - for ( i = 0; i < 8; i++ ) - g_work->merkleroothash[7 - i] = be32dec((uint32_t *)merkle_tree + i); - for ( i = 0; i < 8; i++ ) - g_work->witmerkleroothash[7 - i] = be32dec((uint32_t *)merkle_tree + i); - for ( i = 0; i < 8; i++ ) - g_work->denom10[i] = le32dec((uint32_t *)sctx->job.denom10 + i); - for ( i = 0; i < 8; i++ ) - g_work->denom100[i] = le32dec((uint32_t *)sctx->job.denom100 + i); - for ( i = 0; i < 8; i++ ) - g_work->denom1000[i] = le32dec((uint32_t *)sctx->job.denom1000 + i); - for ( i = 0; i < 8; i++ ) - g_work->denom10000[i] = le32dec((uint32_t *)sctx->job.denom10000 + i); - - uint32_t pofnhash[8]; - memset(pofnhash, 0x00, 32); - - char denom10_str [ 2 * sizeof( g_work->denom10 ) + 1 ]; - char denom100_str [ 2 * sizeof( 
g_work->denom100 ) + 1 ]; - char denom1000_str [ 2 * sizeof( g_work->denom1000 ) + 1 ]; - char denom10000_str [ 2 * sizeof( g_work->denom10000 ) + 1 ]; - char merkleroot_str [ 2 * sizeof( g_work->merkleroothash ) + 1 ]; - char witmerkleroot_str[ 2 * sizeof( g_work->witmerkleroothash ) + 1 ]; - char pofn_str [ 2 * sizeof( pofnhash ) + 1 ]; - - cbin2hex( denom10_str, (char*) g_work->denom10, 32 ); - cbin2hex( denom100_str, (char*) g_work->denom100, 32 ); - cbin2hex( denom1000_str, (char*) g_work->denom1000, 32 ); - cbin2hex( denom10000_str, (char*) g_work->denom10000, 32 ); - cbin2hex( merkleroot_str, (char*) g_work->merkleroothash, 32 ); - cbin2hex( witmerkleroot_str, (char*) g_work->witmerkleroothash, 32 ); - cbin2hex( pofn_str, (char*) pofnhash, 32 ); - - if ( true ) - { - char* data; - data = (char*)malloc( 2 + strlen( denom10_str ) * 4 + 16 * 4 - + strlen( merkleroot_str ) * 3 ); - // Build the block header veildatahash in hex - sprintf( data, "%s%s%s%s%s%s%s%s%s%s%s%s", - merkleroot_str, witmerkleroot_str, "04", - "0a00000000000000", denom10_str, - "6400000000000000", denom100_str, - "e803000000000000", denom1000_str, - "1027000000000000", denom10000_str, pofn_str ); - // Covert the hex to binary - uint32_t test[100]; - hex2bin( (unsigned char*)(&test), data, 257); - // Compute the sha256d of the binary - uint32_t _ALIGN(64) hash[8]; - sha256d( (unsigned char*)hash, (unsigned char*)&(test), 257); - // assign the veildatahash in the blockheader - for ( i = 0; i < 8; i++ ) - g_work->data[16 - i] = le32dec(hash + i); - free(data); - } -} - -bool register_x16rt_algo( algo_gate_t* gate ) -{ -#if defined (X16R_4WAY) - gate->scanhash = (void*)&scanhash_x16rt_4way; - gate->hash = (void*)&x16rt_4way_hash; -#else - gate->scanhash = (void*)&scanhash_x16rt; - gate->hash = (void*)&x16rt_hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->set_target = (void*)&alt_set_target; - return true; -}; - -bool register_x16rt_veil_algo( algo_gate_t* gate ) -{ 
-#if defined (X16R_4WAY) - gate->scanhash = (void*)&scanhash_x16rt_4way; - gate->hash = (void*)&x16rt_4way_hash; -#else - gate->scanhash = (void*)&scanhash_x16rt; - gate->hash = (void*)&x16rt_hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->set_target = (void*)&alt_set_target; - gate->build_extraheader = (void*)&x16rt_build_extraheader; - return true; -}; - -//////////////////// -// -// HEX - - -void hex_set_target( struct work* work, double job_diff ) -{ - work_set_target( work, job_diff / (128.0 * opt_diff_factor) ); -} - -bool register_hex_algo( algo_gate_t* gate ) -{ - gate->scanhash = (void*)&scanhash_hex; - gate->hash = (void*)&hex_hash; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; - gate->set_target = (void*)&hex_set_target; - return true; -}; - -/////////////////////////////// -// -// X21S - -bool register_x21s_algo( algo_gate_t* gate ) -{ -#if defined (X16R_4WAY) - gate->scanhash = (void*)&scanhash_x21s_4way; - gate->hash = (void*)&x21s_4way_hash; - gate->miner_thread_init = (void*)&x21s_4way_thread_init; -#else - gate->scanhash = (void*)&scanhash_x21s; - gate->hash = (void*)&x21s_hash; - gate->miner_thread_init = (void*)&x21s_thread_init; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT; - gate->set_target = (void*)&alt_set_target; - x16_r_s_getAlgoString = (void*)&x16s_getAlgoString; - return true; -}; - diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h deleted file mode 100644 index 23d975e..0000000 --- a/algo/x16/x16r-gate.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef X16R_GATE_H__ -#define X16R_GATE_H__ 1 - -#include "algo-gate-api.h" -#include "simd-utils.h" -#include -#include - -#if defined(__AVX2__) && defined(__AES__) - #define X16R_4WAY -#endif - -enum x16r_Algo { - BLAKE = 0, - BMW, - GROESTL, - JH, - KECCAK, - SKEIN, - LUFFA, - CUBEHASH, - SHAVITE, - SIMD, - ECHO, - HAMSI, - FUGUE, - SHABAL, - WHIRLPOOL, - SHA_512, - 
X16R_HASH_FUNC_COUNT -}; - -void (*x16_r_s_getAlgoString) ( const uint8_t*, char* ); -void x16r_getAlgoString( const uint8_t *prevblock, char *output ); -void x16s_getAlgoString( const uint8_t *prevblock, char *output ); -void x16rt_getAlgoString( const uint32_t *timeHash, char *output ); - -void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash ); - -bool register_x16r_algo( algo_gate_t* gate ); -bool register_x16s_algo( algo_gate_t* gate ); -bool register_x16rt_algo( algo_gate_t* gate ); -bool register_hex__algo( algo_gate_t* gate ); -bool register_x21s__algo( algo_gate_t* gate ); - -#if defined(X16R_4WAY) - -void x16r_4way_hash( void *state, const void *input ); -int scanhash_x16r_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void x16rt_4way_hash( void *state, const void *input ); -int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void x21s_4way_hash( void *state, const void *input ); -int scanhash_x21s_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -bool x21s_4way_thread_init(); - -#endif - -void x16r_hash( void *state, const void *input ); -int scanhash_x16r( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void x16rt_hash( void *state, const void *input ); -int scanhash_x16rt( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void hex_hash( void *state, const void *input ); -int scanhash_hex( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void x21s_hash( void *state, const void *input ); -int scanhash_x21s( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -bool x21s_thread_init(); - -#endif - diff --git a/algo/x16/x16r.c b/algo/x16/x16r.c deleted file mode 100644 index 6de195a..0000000 --- a/algo/x16/x16r.c +++ 
/dev/null @@ -1,228 +0,0 @@ -/** - * x16r algo implementation - * - * Implementation by tpruvot@github Jan 2018 - * Optimized by JayDDee@github Jan 2018 - */ -#include "x16r-gate.h" - -#include -#include -#include -#include "algo/blake/sph_blake.h" -#include "algo/bmw/sph_bmw.h" -#include "algo/groestl/sph_groestl.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/nist.h" -#include "algo/echo/sph_echo.h" -#include "algo/hamsi/sph_hamsi.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/shabal/sph_shabal.h" -#include "algo/whirlpool/sph_whirlpool.h" -#include -#if defined(__AES__) - #include "algo/echo/aes_ni/hash_api.h" - #include "algo/groestl/aes_ni/hash-groestl.h" -#endif - -static __thread uint32_t s_ntime = UINT32_MAX; -static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; - -union _x16r_context_overlay -{ -#if defined(__AES__) - hashState_echo echo; - hashState_groestl groestl; -#else - sph_groestl512_context groestl; - sph_echo512_context echo; -#endif - sph_blake512_context blake; - sph_bmw512_context bmw; - sph_skein512_context skein; - sph_jh512_context jh; - sph_keccak512_context keccak; - hashState_luffa luffa; - cubehashParam cube; - sph_shavite512_context shavite; - hashState_sd simd; - sph_hamsi512_context hamsi; - sph_fugue512_context fugue; - sph_shabal512_context shabal; - sph_whirlpool_context whirlpool; - SHA512_CTX sha512; -}; -typedef union _x16r_context_overlay x16r_context_overlay; - -void x16r_hash( void* output, const void* input ) -{ - uint32_t _ALIGN(128) hash[16]; - x16r_context_overlay ctx; - void *in = (void*) input; - int size = 80; -/* - if ( s_ntime == UINT32_MAX ) - { - const uint8_t* in8 = (uint8_t*) input; - x16_r_s_getAlgoString( &in8[4], hashOrder ); - } -*/ - for ( int i = 0; i < 16; i++ ) - { - const char elem = 
hashOrder[i]; - const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; - - switch ( algo ) - { - case BLAKE: - sph_blake512_init( &ctx.blake ); - sph_blake512( &ctx.blake, in, size ); - sph_blake512_close( &ctx.blake, hash ); - break; - case BMW: - sph_bmw512_init( &ctx.bmw ); - sph_bmw512(&ctx.bmw, in, size); - sph_bmw512_close(&ctx.bmw, hash); - break; - case GROESTL: -#if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)in, size<<3 ); -#else - sph_groestl512_init( &ctx.groestl ); - sph_groestl512( &ctx.groestl, in, size ); - sph_groestl512_close(&ctx.groestl, hash); -#endif - break; - case SKEIN: - sph_skein512_init( &ctx.skein ); - sph_skein512( &ctx.skein, in, size ); - sph_skein512_close( &ctx.skein, hash ); - break; - case JH: - sph_jh512_init( &ctx.jh ); - sph_jh512(&ctx.jh, in, size ); - sph_jh512_close(&ctx.jh, hash ); - break; - case KECCAK: - sph_keccak512_init( &ctx.keccak ); - sph_keccak512( &ctx.keccak, in, size ); - sph_keccak512_close( &ctx.keccak, hash ); - break; - case LUFFA: - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)in, size ); - break; - case CUBEHASH: - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash, - (const byte*)in, size ); - break; - case SHAVITE: - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in, size ); - sph_shavite512_close( &ctx.shavite, hash ); - break; - case SIMD: - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence*)in, size<<3 ); - break; - case ECHO: -#if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence*)in, size<<3 ); -#else - sph_echo512_init( &ctx.echo ); - sph_echo512( &ctx.echo, in, size ); - sph_echo512_close( &ctx.echo, hash ); -#endif - break; - case HAMSI: - sph_hamsi512_init( 
&ctx.hamsi ); - sph_hamsi512( &ctx.hamsi, in, size ); - sph_hamsi512_close( &ctx.hamsi, hash ); - break; - case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in, size ); - sph_fugue512_close( &ctx.fugue, hash ); - break; - case SHABAL: - sph_shabal512_init( &ctx.shabal ); - sph_shabal512( &ctx.shabal, in, size ); - sph_shabal512_close( &ctx.shabal, hash ); - break; - case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in, size ); - sph_whirlpool_close( &ctx.whirlpool, hash ); - break; - case SHA_512: - SHA512_Init( &ctx.sha512 ); - SHA512_Update( &ctx.sha512, in, size ); - SHA512_Final( (unsigned char*) hash, &ctx.sha512 ); - break; - } - in = (void*) hash; - size = 64; - } - memcpy(output, hash, 32); -} - -int scanhash_x16r( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(128) hash32[8]; - uint32_t _ALIGN(128) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - uint32_t nonce = first_nonce; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - - casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - - if ( s_ntime != pdata[17] ) - { - uint32_t ntime = swab32(pdata[17]); - x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder ); - s_ntime = ntime; - if ( opt_debug && !thr_id ) - applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime ); - } - - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - do - { - be32enc( &endiandata[19], nonce 
); - x16r_hash( hash32, endiandata ); - - if ( hash32[7] <= Htarg ) - if (fulltest( hash32, ptarget ) && !opt_benchmark ) - { - pdata[19] = nonce; - submit_solution( work, hash32, mythr ); - } - nonce++; - } while ( nonce < max_nonce && !(*restart) ); - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} diff --git a/algo/x16/x16rt-4way.c b/algo/x16/x16rt-4way.c deleted file mode 100644 index a0941da..0000000 --- a/algo/x16/x16rt-4way.c +++ /dev/null @@ -1,353 +0,0 @@ -#include "x16r-gate.h" - -#if defined (X16R_4WAY) - -#include -#include -#include -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/luffa/luffa-hash-2way.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/simd-hash-2way.h" -#include "algo/echo/aes_ni/hash_api.h" -#include "algo/hamsi/hamsi-hash-4way.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/shabal/shabal-hash-4way.h" -#include "algo/whirlpool/sph_whirlpool.h" -#include "algo/sha/sha2-hash-4way.h" - -static __thread uint32_t s_ntime = UINT32_MAX; -static __thread bool s_implemented = false; -static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; - -union _x16rt_4way_context_overlay -{ - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_echo echo; - hashState_groestl groestl; - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; - luffa_2way_context luffa; - cubehashParam cube; - sph_shavite512_context shavite; - simd_2way_context simd; - hamsi512_4way_context hamsi; - sph_fugue512_context fugue; - shabal512_4way_context shabal; - sph_whirlpool_context whirlpool; - sha512_4way_context sha512; -}; -typedef union _x16rt_4way_context_overlay 
x16rt_4way_context_overlay; - -void x16rt_4way_hash( void* output, const void* input ) -{ - uint32_t hash0[24] __attribute__ ((aligned (64))); - uint32_t hash1[24] __attribute__ ((aligned (64))); - uint32_t hash2[24] __attribute__ ((aligned (64))); - uint32_t hash3[24] __attribute__ ((aligned (64))); - uint32_t vhash[24*4] __attribute__ ((aligned (64))); - x16rt_4way_context_overlay ctx; - void *in0 = (void*) hash0; - void *in1 = (void*) hash1; - void *in2 = (void*) hash2; - void *in3 = (void*) hash3; - int size = 80; - - dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 ); - -/* - void *in = (void*) input; - uint32_t *in32 = (uint32_t*) hash0; - uint32_t ntime = in32[17]; - if ( s_ntime == UINT32_MAX ) - { - uint32_t _ALIGN(64) timeHash[8]; - x16rt_getTimeHash(ntime, &timeHash); - x16rt_getAlgoString(&timeHash[0], hashOrder); - } -*/ - - // Input data is both 64 bit interleaved (input) - // and deinterleaved in inp0-3. - // If First function uses 64 bit data it is not required to interleave inp - // first. It may use the inerleaved data dmost convenient, ie 4way 64 bit. - // All other functions assume data is deinterleaved in hash0-3 - // All functions must exit with data deinterleaved in hash0-3. - // Alias in0-3 points to either inp0-3 or hash0-3 according to - // its hashOrder position. Size is also set accordingly. - for ( int i = 0; i < 16; i++ ) - { - const char elem = hashOrder[i]; - const uint8_t algo = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; - - switch ( algo ) - { - case BLAKE: - blake512_4way_init( &ctx.blake ); - if ( i == 0 ) - blake512_4way( &ctx.blake, input, size ); - else - { - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - blake512_4way( &ctx.blake, vhash, size ); - } - blake512_4way_close( &ctx.blake, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case BMW: - bmw512_4way_init( &ctx.bmw ); - if ( i == 0 ) - bmw512_4way( &ctx.bmw, input, size ); - else - { - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - bmw512_4way( &ctx.bmw, vhash, size ); - } - bmw512_4way_close( &ctx.bmw, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case GROESTL: - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (const char*)in0, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (const char*)in1, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (const char*)in2, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (const char*)in3, size<<3 ); - break; - case SKEIN: - skein512_4way_init( &ctx.skein ); - if ( i == 0 ) - skein512_4way( &ctx.skein, input, size ); - else - { - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - skein512_4way( &ctx.skein, vhash, size ); - } - skein512_4way_close( &ctx.skein, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case JH: - jh512_4way_init( &ctx.jh ); - if ( i == 0 ) - jh512_4way( &ctx.jh, input, size ); - else - { - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - jh512_4way( &ctx.jh, vhash, size ); - } - jh512_4way_close( &ctx.jh, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case KECCAK: - keccak512_4way_init( &ctx.keccak ); - if ( i == 0 ) - keccak512_4way( &ctx.keccak, input, size ); - else - { - intrlv_4x64( 
vhash, in0, in1, in2, in3, size<<3 ); - keccak512_4way( &ctx.keccak, vhash, size ); - } - keccak512_4way_close( &ctx.keccak, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case LUFFA: - intrlv_2x128( vhash, in0, in1, size<<3 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, size ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, in2, in3, size<<3 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, size); - dintrlv_2x128( hash2, hash3, vhash, 512 ); - break; - case CUBEHASH: - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, - (const byte*)in0, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, - (const byte*)in2, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3, size ); - break; - case SHAVITE: - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in0, size ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in1, size ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in2, size ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in3, size ); - sph_shavite512_close( &ctx.shavite, hash3 ); - break; - case SIMD: - intrlv_2x128( vhash, in0, in1, size<<3 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, in2, in3, size<<3 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); - 
dintrlv_2x128( hash2, hash3, vhash, 512 ); - break; - case ECHO: - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); - break; - case HAMSI: - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, size ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in0, size ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in1, size ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in2, size ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in3, size ); - sph_fugue512_close( &ctx.fugue, hash3 ); - break; - case SHABAL: - intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, size ); - shabal512_4way_close( &ctx.shabal, vhash ); - dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( 
&ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - break; - case SHA_512: - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, size ); - sha512_4way_close( &ctx.sha512, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - } - size = 64; - } - memcpy( output, hash0, 32 ); - memcpy( output+32, hash1, 32 ); - memcpy( output+64, hash2, 32 ); - memcpy( output+96, hash3, 32 ); -} - -int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr) -{ - uint32_t hash[4*16] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t _ALIGN(64) timeHash[4*8]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - __m256i *noncev = (__m256i*)vdata + 9; // aligned - volatile uint8_t *restart = &(work_restart[thr_id].restart); - - casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) ); - casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) ); - casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - - uint32_t ntime = swab32( pdata[17] ); - if ( s_ntime != ntime ) - { - x16rt_getTimeHash( ntime, &timeHash ); - x16rt_getAlgoString( &timeHash[0], hashOrder ); - s_ntime = ntime; - s_implemented = true; - if ( opt_debug && !thr_id ) - applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)", - hashOrder, ntime, timeHash ); - } - if ( !s_implemented ) - { - applog( LOG_WARNING, "s not implemented"); - sleep(1); - return 0; - } - - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - uint64_t *edata = 
(uint64_t*)endiandata; - intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); - - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - x16rt_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg ) - if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !(*restart) ); - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/x16/x16rt.c b/algo/x16/x16rt.c deleted file mode 100644 index 1c19cd6..0000000 --- a/algo/x16/x16rt.c +++ /dev/null @@ -1,239 +0,0 @@ -#include "x16r-gate.h" - -#include -#include -#include -#include "algo/blake/sph_blake.h" -#include "algo/bmw/sph_bmw.h" -#include "algo/groestl/sph_groestl.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/nist.h" -#include "algo/echo/sph_echo.h" -#include "algo/hamsi/sph_hamsi.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/shabal/sph_shabal.h" -#include "algo/whirlpool/sph_whirlpool.h" -#include -#if defined(__AES__) - #include "algo/echo/aes_ni/hash_api.h" - #include "algo/groestl/aes_ni/hash-groestl.h" -#endif - -static __thread uint32_t s_ntime = UINT32_MAX; -static __thread bool s_implemented = false; -static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; - -union _x16rt_context_overlay -{ -#if defined(__AES__) - hashState_echo echo; - hashState_groestl groestl; -#else - sph_groestl512_context groestl; - sph_echo512_context echo; -#endif - sph_blake512_context blake; - sph_bmw512_context bmw; - sph_skein512_context skein; - sph_jh512_context jh; - sph_keccak512_context keccak; - hashState_luffa luffa; - 
cubehashParam cube; - sph_shavite512_context shavite; - hashState_sd simd; - sph_hamsi512_context hamsi; - sph_fugue512_context fugue; - sph_shabal512_context shabal; - sph_whirlpool_context whirlpool; - SHA512_CTX sha512; -}; -typedef union _x16rt_context_overlay x16rt_context_overlay; - -void x16rt_hash( void* output, const void* input ) -{ - uint32_t _ALIGN(128) hash[16]; - x16rt_context_overlay ctx; - int size = 80; - void *in = (void*) input; - -/* - void *in = (void*) input; - uint32_t *in32 = (uint32_t*) in; - uint32_t ntime = in32[17]; - if ( s_ntime == UINT32_MAX ) - { - uint32_t _ALIGN(64) timeHash[8]; - x16rt_getTimeHash(ntime, &timeHash); - x16rt_getAlgoString(&timeHash[0], hashOrder); - } -*/ - - for ( int i = 0; i < 16; i++ ) - { - const char elem = hashOrder[i]; - const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; - - switch ( algo ) - { - case BLAKE: - sph_blake512_init( &ctx.blake ); - sph_blake512( &ctx.blake, in, size ); - sph_blake512_close( &ctx.blake, hash ); - break; - case BMW: - sph_bmw512_init( &ctx.bmw ); - sph_bmw512(&ctx.bmw, in, size); - sph_bmw512_close(&ctx.bmw, hash); - break; - case GROESTL: -#if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)in, size<<3 ); -#else - sph_groestl512_init( &ctx.groestl ); - sph_groestl512( &ctx.groestl, in, size ); - sph_groestl512_close(&ctx.groestl, hash); -#endif - break; - case SKEIN: - sph_skein512_init( &ctx.skein ); - sph_skein512( &ctx.skein, in, size ); - sph_skein512_close( &ctx.skein, hash ); - break; - case JH: - sph_jh512_init( &ctx.jh ); - sph_jh512(&ctx.jh, in, size ); - sph_jh512_close(&ctx.jh, hash ); - break; - case KECCAK: - sph_keccak512_init( &ctx.keccak ); - sph_keccak512( &ctx.keccak, in, size ); - sph_keccak512_close( &ctx.keccak, hash ); - break; - case LUFFA: - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)in, size ); - 
break; - case CUBEHASH: - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash, - (const byte*)in, size ); - break; - case SHAVITE: - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in, size ); - sph_shavite512_close( &ctx.shavite, hash ); - break; - case SIMD: - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence*)in, size<<3 ); - break; - case ECHO: -#if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence*)in, size<<3 ); -#else - sph_echo512_init( &ctx.echo ); - sph_echo512( &ctx.echo, in, size ); - sph_echo512_close( &ctx.echo, hash ); -#endif - break; - case HAMSI: - sph_hamsi512_init( &ctx.hamsi ); - sph_hamsi512( &ctx.hamsi, in, size ); - sph_hamsi512_close( &ctx.hamsi, hash ); - break; - case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in, size ); - sph_fugue512_close( &ctx.fugue, hash ); - break; - case SHABAL: - sph_shabal512_init( &ctx.shabal ); - sph_shabal512( &ctx.shabal, in, size ); - sph_shabal512_close( &ctx.shabal, hash ); - break; - case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in, size ); - sph_whirlpool_close( &ctx.whirlpool, hash ); - break; - case SHA_512: - SHA512_Init( &ctx.sha512 ); - SHA512_Update( &ctx.sha512, in, size ); - SHA512_Final( (unsigned char*) hash, &ctx.sha512 ); - break; - } - in = (void*) hash; - size = 64; - } - memcpy(output, hash, 32); -} - -int scanhash_x16rt( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(128) hash32[8]; - uint32_t _ALIGN(128) endiandata[20]; - uint32_t _ALIGN(64) timeHash[8]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - uint32_t nonce = 
first_nonce; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - - casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - - uint32_t ntime = swab32( pdata[17] ); - if ( s_ntime != ntime ) - { - x16rt_getTimeHash( ntime, &timeHash ); - x16rt_getAlgoString( &timeHash[0], hashOrder ); - s_ntime = ntime; - s_implemented = true; - if ( opt_debug && !thr_id ) - applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)", - hashOrder, ntime, timeHash ); - } - if ( !s_implemented ) - { - applog( LOG_WARNING, "s not implemented"); - sleep(1); - return 0; - } - - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - do - { - be32enc( &endiandata[19], nonce ); - x16rt_hash( hash32, endiandata ); - - if ( hash32[7] <= Htarg ) - if (fulltest( hash32, ptarget ) && !opt_benchmark ) - { - pdata[19] = nonce; - submit_solution( work, hash32, mythr ); - } - nonce++; - } while ( nonce < max_nonce && !(*restart) ); - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} diff --git a/algo/x16/x21s-4way.c b/algo/x16/x21s-4way.c deleted file mode 100644 index 31e3f27..0000000 --- a/algo/x16/x21s-4way.c +++ /dev/null @@ -1,431 +0,0 @@ -/** - * x16r algo implementation - * - * Implementation by tpruvot@github Jan 2018 - * Optimized by JayDDee@github Jan 2018 - */ -#include "x16r-gate.h" - -#if defined (X16R_4WAY) - -#include -#include -#include -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" 
-#include "algo/shavite/sph_shavite.h" -#include "algo/luffa/luffa-hash-2way.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/simd-hash-2way.h" -#include "algo/echo/aes_ni/hash_api.h" -#include "algo/hamsi/hamsi-hash-4way.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/shabal/shabal-hash-4way.h" -#include "algo/whirlpool/sph_whirlpool.h" -#include "algo/sha/sha2-hash-4way.h" -#include "algo/haval/haval-hash-4way.h" -#include "algo/tiger/sph_tiger.h" -#include "algo/gost/sph_gost.h" -#include "algo/lyra2/lyra2.h" -#if defined(__SHA__) - #include -#endif - -static __thread uint32_t s_ntime = UINT32_MAX; -static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; -static __thread uint64_t* x21s_4way_matrix; - -union _x21s_4way_context_overlay -{ - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_echo echo; - hashState_groestl groestl; - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; - luffa_2way_context luffa; - cubehashParam cube; - sph_shavite512_context shavite; - simd_2way_context simd; - hamsi512_4way_context hamsi; - sph_fugue512_context fugue; - shabal512_4way_context shabal; - sph_whirlpool_context whirlpool; - sha512_4way_context sha512; - haval256_5_4way_context haval; - sph_tiger_context tiger; - sph_gost512_context gost; -#if defined(__SHA__) - SHA256_CTX sha256; -#else - sha256_4way_context sha256; -#endif -}; -typedef union _x21s_4way_context_overlay x21s_4way_context_overlay; - -void x21s_4way_hash( void* output, const void* input ) -{ - uint32_t hash0[24] __attribute__ ((aligned (64))); - uint32_t hash1[24] __attribute__ ((aligned (64))); - uint32_t hash2[24] __attribute__ ((aligned (64))); - uint32_t hash3[24] __attribute__ ((aligned (64))); - uint32_t vhash[24*4] __attribute__ ((aligned (64))); - x21s_4way_context_overlay ctx; - void *in0 = (void*) hash0; - void *in1 = (void*) hash1; - void *in2 = (void*) hash2; - void *in3 = (void*) hash3; - int size = 80; - - 
dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 ); - - // Input data is both 64 bit interleaved (input) - // and deinterleaved in inp0-3. - // If First function uses 64 bit data it is not required to interleave inp - // first. It may use the inerleaved data dmost convenient, ie 4way 64 bit. - // All other functions assume data is deinterleaved in hash0-3 - // All functions must exit with data deinterleaved in hash0-3. - // Alias in0-3 points to either inp0-3 or hash0-3 according to - // its hashOrder position. Size is also set accordingly. - for ( int i = 0; i < 16; i++ ) - { - const char elem = hashOrder[i]; - const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; - - switch ( algo ) - { - case BLAKE: - blake512_4way_init( &ctx.blake ); - if ( i == 0 ) - blake512_4way( &ctx.blake, input, size ); - else - { - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - blake512_4way( &ctx.blake, vhash, size ); - } - blake512_4way_close( &ctx.blake, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case BMW: - bmw512_4way_init( &ctx.bmw ); - if ( i == 0 ) - bmw512_4way( &ctx.bmw, input, size ); - else - { - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - bmw512_4way( &ctx.bmw, vhash, size ); - } - bmw512_4way_close( &ctx.bmw, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case GROESTL: - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (const char*)in0, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (const char*)in1, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (const char*)in2, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (const char*)in3, size<<3 ); - break; - case SKEIN: - skein512_4way_init( &ctx.skein ); - if ( i == 0 ) - skein512_4way( &ctx.skein, input, size ); - else - { 
- intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - skein512_4way( &ctx.skein, vhash, size ); - } - skein512_4way_close( &ctx.skein, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case JH: - jh512_4way_init( &ctx.jh ); - if ( i == 0 ) - jh512_4way( &ctx.jh, input, size ); - else - { - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - jh512_4way( &ctx.jh, vhash, size ); - } - jh512_4way_close( &ctx.jh, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case KECCAK: - keccak512_4way_init( &ctx.keccak ); - if ( i == 0 ) - keccak512_4way( &ctx.keccak, input, size ); - else - { - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - keccak512_4way( &ctx.keccak, vhash, size ); - } - keccak512_4way_close( &ctx.keccak, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case LUFFA: - intrlv_2x128( vhash, in0, in1, size<<3 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, size ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, in2, in3, size<<3 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, size); - dintrlv_2x128( hash2, hash3, vhash, 512 ); - break; - case CUBEHASH: - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, - (const byte*)in0, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, - (const byte*)in2, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3, size ); - break; - case SHAVITE: - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in0, size ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in1, size ); - 
sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in2, size ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in3, size ); - sph_shavite512_close( &ctx.shavite, hash3 ); - break; - case SIMD: - intrlv_2x128( vhash, in0, in1, size<<3 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, in2, in3, size<<3 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); - dintrlv_2x128( hash2, hash3, vhash, 512 ); - break; - case ECHO: - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); - break; - case HAMSI: - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, size ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in0, size ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in1, size ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in2, size ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in3, size ); - sph_fugue512_close( &ctx.fugue, hash3 ); - break; - case 
SHABAL: - intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, size ); - shabal512_4way_close( &ctx.shabal, vhash ); - dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - break; - case SHA_512: - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, size ); - sha512_4way_close( &ctx.sha512, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - } - size = 64; - } - - intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); - - haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhash, 64 ); - haval256_5_4way_close( &ctx.haval, vhash ); - - dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); - - sph_tiger_init( &ctx.tiger ); - sph_tiger ( &ctx.tiger, (const void*) hash0, 64 ); - sph_tiger_close( &ctx.tiger, (void*) hash0 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger ( &ctx.tiger, (const void*) hash1, 64 ); - sph_tiger_close( &ctx.tiger, (void*) hash1 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger ( &ctx.tiger, (const void*) hash2, 64 ); - sph_tiger_close( &ctx.tiger, (void*) hash2 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger ( &ctx.tiger, (const void*) hash3, 64 ); - sph_tiger_close( &ctx.tiger, (void*) hash3 ); - - LYRA2REV2( x21s_4way_matrix, (void*) hash0, 32, (const void*) hash0, 32, - (const void*) hash0, 32, 1, 4, 4 ); - 
LYRA2REV2( x21s_4way_matrix, (void*) hash1, 32, (const void*) hash1, 32, - (const void*) hash1, 32, 1, 4, 4 ); - LYRA2REV2( x21s_4way_matrix, (void*) hash2, 32, (const void*) hash2, 32, - (const void*) hash2, 32, 1, 4, 4 ); - LYRA2REV2( x21s_4way_matrix, (void*) hash3, 32, (const void*) hash3, 32, - (const void*) hash3, 32, 1, 4, 4 ); - - sph_gost512_init( &ctx.gost ); - sph_gost512 ( &ctx.gost, (const void*) hash0, 64 ); - sph_gost512_close( &ctx.gost, (void*) hash0 ); - sph_gost512_init( &ctx.gost ); - sph_gost512 ( &ctx.gost, (const void*) hash1, 64 ); - sph_gost512_close( &ctx.gost, (void*) hash1 ); - sph_gost512_init( &ctx.gost ); - sph_gost512 ( &ctx.gost, (const void*) hash2, 64 ); - sph_gost512_close( &ctx.gost, (void*) hash2 ); - sph_gost512_init( &ctx.gost ); - sph_gost512 ( &ctx.gost, (const void*) hash3, 64 ); - sph_gost512_close( &ctx.gost, (void*) hash3 ); - -#if defined(__SHA__) - - SHA256_Init( &ctx.sha256 ); - SHA256_Update( &ctx.sha256, hash0, 64 ); - SHA256_Final( (unsigned char*)hash0, &ctx.sha256 ); - SHA256_Init( &ctx.sha256 ); - SHA256_Update( &ctx.sha256, hash1, 64 ); - SHA256_Final( (unsigned char*)hash1, &ctx.sha256 ); - SHA256_Init( &ctx.sha256 ); - SHA256_Update( &ctx.sha256, hash2, 64 ); - SHA256_Final( (unsigned char*)hash2, &ctx.sha256 ); - SHA256_Init( &ctx.sha256 ); - SHA256_Update( &ctx.sha256, hash3, 64 ); - SHA256_Final( (unsigned char*)hash3, &ctx.sha256 ); - - memcpy( output, hash0, 32 ); - memcpy( output+32, hash1, 32 ); - memcpy( output+64, hash2, 32 ); - memcpy( output+96, hash3, 32 ); - -#else - - intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); - sha256_4way_init( &ctx.sha256 ); - sha256_4way( &ctx.sha256, vhash, 64 ); - sha256_4way_close( &ctx.sha256, vhash ); - dintrlv_4x32( output, output+32, output+64,output+96, vhash, 256 ); - -#endif -} - -int scanhash_x21s_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr) -{ - uint32_t hash[4*16] __attribute__ ((aligned (64))); - 
uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - int thr_id = mythr->id; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - volatile uint8_t *restart = &(work_restart[thr_id].restart); - - casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) ); - casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) ); - casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - - if ( s_ntime != endiandata[17] ) - { - uint32_t ntime = swab32(pdata[17]); - x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder ); - s_ntime = ntime; - if ( opt_debug && !thr_id ) - applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime ); - } - - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - uint64_t *edata = (uint64_t*)endiandata; - intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); - - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - - x21s_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg ) - if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !(*restart) ); - - *hashes_done = n - first_nonce + 1; - return 0; -} - -bool x21s_4way_thread_init() -{ - const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols - const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; - - const int size = (int64_t)ROW_LEN_BYTES * 4; // nRows; - x21s_4way_matrix = _mm_malloc( size, 64 ); - return x21s_4way_matrix; -} - -#endif diff --git a/algo/x16/x21s.c b/algo/x16/x21s.c deleted file mode 100644 index 8f290af..0000000 --- a/algo/x16/x21s.c +++ 
/dev/null @@ -1,263 +0,0 @@ -/** - * x16r algo implementation - * - * Implementation by tpruvot@github Jan 2018 - * Optimized by JayDDee@github Jan 2018 - */ -#include "x16r-gate.h" - -#include -#include -#include -#include "algo/blake/sph_blake.h" -#include "algo/bmw/sph_bmw.h" -#include "algo/groestl/sph_groestl.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/nist.h" -#include "algo/echo/sph_echo.h" -#include "algo/hamsi/sph_hamsi.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/shabal/sph_shabal.h" -#include "algo/whirlpool/sph_whirlpool.h" -#include -#if defined(__AES__) - #include "algo/echo/aes_ni/hash_api.h" - #include "algo/groestl/aes_ni/hash-groestl.h" -#endif -#include "algo/haval/sph-haval.h" -#include "algo/tiger/sph_tiger.h" -#include "algo/gost/sph_gost.h" -#include "algo/lyra2/lyra2.h" - -static __thread uint32_t s_ntime = UINT32_MAX; -static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; - -static __thread uint64_t* x21s_matrix; - -union _x21s_context_overlay -{ -#if defined(__AES__) - hashState_echo echo; - hashState_groestl groestl; -#else - sph_groestl512_context groestl; - sph_echo512_context echo; -#endif - sph_blake512_context blake; - sph_bmw512_context bmw; - sph_skein512_context skein; - sph_jh512_context jh; - sph_keccak512_context keccak; - hashState_luffa luffa; - cubehashParam cube; - sph_shavite512_context shavite; - hashState_sd simd; - sph_hamsi512_context hamsi; - sph_fugue512_context fugue; - sph_shabal512_context shabal; - sph_whirlpool_context whirlpool; - SHA512_CTX sha512; - sph_haval256_5_context haval; - sph_tiger_context tiger; - sph_gost512_context gost; - SHA256_CTX sha256; -}; -typedef union _x21s_context_overlay x21s_context_overlay; - -void x21s_hash( void* output, const void* input ) -{ - uint32_t 
_ALIGN(128) hash[16]; - x21s_context_overlay ctx; - void *in = (void*) input; - int size = 80; - - for ( int i = 0; i < 16; i++ ) - { - const char elem = hashOrder[i]; - const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; - - switch ( algo ) - { - case BLAKE: - sph_blake512_init( &ctx.blake ); - sph_blake512( &ctx.blake, in, size ); - sph_blake512_close( &ctx.blake, hash ); - break; - case BMW: - sph_bmw512_init( &ctx.bmw ); - sph_bmw512(&ctx.bmw, in, size); - sph_bmw512_close(&ctx.bmw, hash); - break; - case GROESTL: -#if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)in, size<<3 ); -#else - sph_groestl512_init( &ctx.groestl ); - sph_groestl512( &ctx.groestl, in, size ); - sph_groestl512_close(&ctx.groestl, hash); -#endif - break; - case SKEIN: - sph_skein512_init( &ctx.skein ); - sph_skein512( &ctx.skein, in, size ); - sph_skein512_close( &ctx.skein, hash ); - break; - case JH: - sph_jh512_init( &ctx.jh ); - sph_jh512(&ctx.jh, in, size ); - sph_jh512_close(&ctx.jh, hash ); - break; - case KECCAK: - sph_keccak512_init( &ctx.keccak ); - sph_keccak512( &ctx.keccak, in, size ); - sph_keccak512_close( &ctx.keccak, hash ); - break; - case LUFFA: - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)in, size ); - break; - case CUBEHASH: - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash, - (const byte*)in, size ); - break; - case SHAVITE: - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in, size ); - sph_shavite512_close( &ctx.shavite, hash ); - break; - case SIMD: - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence*)in, size<<3 ); - break; - case ECHO: -#if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence*)in, size<<3 ); -#else - 
sph_echo512_init( &ctx.echo ); - sph_echo512( &ctx.echo, in, size ); - sph_echo512_close( &ctx.echo, hash ); -#endif - break; - case HAMSI: - sph_hamsi512_init( &ctx.hamsi ); - sph_hamsi512( &ctx.hamsi, in, size ); - sph_hamsi512_close( &ctx.hamsi, hash ); - break; - case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in, size ); - sph_fugue512_close( &ctx.fugue, hash ); - break; - case SHABAL: - sph_shabal512_init( &ctx.shabal ); - sph_shabal512( &ctx.shabal, in, size ); - sph_shabal512_close( &ctx.shabal, hash ); - break; - case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in, size ); - sph_whirlpool_close( &ctx.whirlpool, hash ); - break; - case SHA_512: - SHA512_Init( &ctx.sha512 ); - SHA512_Update( &ctx.sha512, in, size ); - SHA512_Final( (unsigned char*) hash, &ctx.sha512 ); - break; - } - in = (void*) hash; - size = 64; - } - - sph_haval256_5_init( &ctx.haval ); - sph_haval256_5( &ctx.haval, (const void*) hash, 64) ; - sph_haval256_5_close( &ctx.haval, hash ); - - sph_tiger_init( &ctx.tiger ); - sph_tiger ( &ctx.tiger, (const void*) hash, 64 ); - sph_tiger_close( &ctx.tiger, (void*) hash ); - - LYRA2REV2( x21s_matrix, (void*) hash, 32, (const void*) hash, 32, - (const void*) hash, 32, 1, 4, 4); - - sph_gost512_init( &ctx.gost ); - sph_gost512 ( &ctx.gost, (const void*) hash, 64 ); - sph_gost512_close( &ctx.gost, (void*) hash ); - - SHA256_Init( &ctx.sha256 ); - SHA256_Update( &ctx.sha256, hash, 64 ); - SHA256_Final( (unsigned char*)hash, &ctx.sha256 ); - - memcpy( output, hash, 32 ); -} - -int scanhash_x21s( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(128) hash32[8]; - uint32_t _ALIGN(128) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated - uint32_t nonce = 
first_nonce; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - - casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - - if ( s_ntime != pdata[17] ) - { - uint32_t ntime = swab32(pdata[17]); - x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder ); - s_ntime = ntime; - if ( opt_debug && !thr_id ) - applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime ); - } - - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - do - { - be32enc( &endiandata[19], nonce ); - x21s_hash( hash32, endiandata ); - - if ( hash32[7] <= Htarg ) - if (fulltest( hash32, ptarget ) && !opt_benchmark ) - { - pdata[19] = nonce; - submit_solution( work, hash32, mythr ); - } - nonce++; - } while ( nonce < max_nonce && !(*restart) ); - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} - -bool x21s_thread_init() -{ - const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols - const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; - - const int size = (int64_t)ROW_LEN_BYTES * 4; // nRows; - x21s_matrix = _mm_malloc( size, 64 ); - return x21s_matrix; -} - diff --git a/algo/x17/sonoa-4way.c b/algo/x17/sonoa-4way.c deleted file mode 100644 index 39a037a..0000000 --- a/algo/x17/sonoa-4way.c +++ /dev/null @@ -1,856 +0,0 @@ -#include "sonoa-gate.h" - -#if defined(SONOA_4WAY) - -#include -#include -#include -#include -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/luffa-hash-2way.h" -#include 
"algo/cubehash/cube-hash-2way.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/shavite/shavite-hash-2way.h" -#include "algo/simd/simd-hash-2way.h" -#include "algo/echo/aes_ni/hash_api.h" -#include "algo/hamsi/hamsi-hash-4way.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/shabal/shabal-hash-4way.h" -#include "algo/whirlpool/sph_whirlpool.h" -#include "algo/haval/haval-hash-4way.h" -#include "algo/sha/sha2-hash-4way.h" - -union _sonoa_4way_context_overlay -{ - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_groestl groestl; - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; - luffa_2way_context luffa; - cube_2way_context cube; - shavite512_2way_context shavite; - simd_2way_context simd; - hashState_echo echo; - hamsi512_4way_context hamsi; - sph_fugue512_context fugue; - shabal512_4way_context shabal; - sph_whirlpool_context whirlpool; - sha512_4way_context sha512; - haval256_5_4way_context haval; -}; - -typedef union _sonoa_4way_context_overlay sonoa_4way_context_overlay; - -void sonoa_4way_hash( void *state, const void *input ) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - uint64_t vhashA[8*4] __attribute__ ((aligned (64))); - uint64_t vhashB[8*4] __attribute__ ((aligned (64))); - sonoa_4way_context_overlay ctx; - -// 1 - - blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, input, 80 ); - blake512_4way_close( &ctx.blake, vhash ); - - bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - init_groestl( &ctx.groestl, 64 ); - 
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - - jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - - rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); - - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 ); - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 ); - - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); - - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 ); - - dintrlv_2x128_512( hash0, hash1, vhashA ); - dintrlv_2x128_512( hash2, hash3, vhashB ); - - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, 
(BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - -// 2 - - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - - jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - - rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); - - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 ); - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 ); - - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); - - simd_2way_init( 
&ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 ); - - dintrlv_2x128_512( hash0, hash1, vhashA ); - dintrlv_2x128_512( hash2, hash3, vhashB ); - - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - -// 3 - - bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - - jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - - 
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); - - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 ); - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 ); - - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); - - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 ); - - dintrlv_2x128_512( hash0, hash1, vhashA ); - dintrlv_2x128_512( hash2, hash3, vhashB ); - - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, 
hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - -// 4 - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - - jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - - rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); - - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 ); - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 ); - - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); - - simd_2way_init( &ctx.simd, 512 ); 
- simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 ); - - dintrlv_2x128_512( hash0, hash1, vhashA ); - dintrlv_2x128_512( hash2, hash3, vhashB ); - - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - - intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); - - shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); - shabal512_4way_close( &ctx.shabal, vhash ); - - rintrlv_4x32_4x64( vhashB, vhash, 512 ); - - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhashB, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 
); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - - intrlv_2x128_512( vhashA, hash0, hash1 ); - intrlv_2x128_512( vhashB, hash2, hash3 ); - - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); - -// 5 - rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); - - bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - rintrlv_4x64_4x32( vhashB, vhash, 512 ); - - shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhashB, 64 ); - shabal512_4way_close( &ctx.shabal, vhash ); - - dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); - - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - - jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - - rintrlv_4x64_2x128( vhashA, vhashB, 
vhash, 512 ); - - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 ); - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 ); - - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); - - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 ); - - dintrlv_2x128_512( hash0, hash1, vhashA ); - dintrlv_2x128_512( hash2, hash3, vhashB ); - - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( 
&ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - - intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); - - shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); - shabal512_4way_close( &ctx.shabal, vhash ); - - dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); - - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - -// 6 - - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - - jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); - 
keccak512_4way_close( &ctx.keccak, vhash ); - - rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); - - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 ); - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 ); - - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); - - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 ); - - dintrlv_2x128_512( hash0, hash1, vhashA ); - dintrlv_2x128_512( hash2, hash3, vhashB ); - - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - 
sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - - intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); - - shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); - shabal512_4way_close( &ctx.shabal, vhash ); - - dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); - - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, 64 ); - sha512_4way_close( &ctx.sha512, vhash ); - - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - -// 7 - - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - dintrlv_4x64_512( 
hash0, hash1, hash2, hash3, vhash ); - - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - - jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - - rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); - - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 ); - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 ); - - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); - - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 ); - - dintrlv_2x128_512( hash0, hash1, vhashA ); - dintrlv_2x128_512( hash2, hash3, vhashB ); - - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); 
- init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - - intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); - - shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); - shabal512_4way_close( &ctx.shabal, vhash ); - - dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); - - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, 
vhash, 64 ); - sha512_4way_close( &ctx.sha512, vhash ); - - rintrlv_4x64_4x32( vhashB, vhash, 512 ); - - haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhashB, 64 ); - haval256_5_4way_close( &ctx.haval, state ); -} - -int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*16] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *hash7 = &(hash[7<<2]); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; - - // Need big endian data - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ), - *noncev ); - sonoa_4way_hash( hash, vdata ); - - for ( int lane = 0; lane < 4; lane++ ) - if ( ( ( hash7[ lane ] & mask ) == 0 ) ) - { - extr_lane_4x32( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_lane_solution( work, lane_hash, mythr, lane ); - } - } - n += 4; - } while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart ); - break; - } - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/x17/sonoa-gate.c b/algo/x17/sonoa-gate.c deleted file mode 100644 index b420564..0000000 --- a/algo/x17/sonoa-gate.c +++ /dev/null @@ -1,18 +0,0 @@ -#include "sonoa-gate.h" - -bool register_sonoa_algo( algo_gate_t* gate ) -{ -#if defined (SONOA_4WAY) 
-// init_sonoa_4way_ctx(); - gate->scanhash = (void*)&scanhash_sonoa_4way; - gate->hash = (void*)&sonoa_4way_hash; -#else - init_sonoa_ctx(); - gate->scanhash = (void*)&scanhash_sonoa; - gate->hash = (void*)&sonoa_hash; -#endif - gate->get_max64 = (void*)&get_max64_0x1ffff; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - return true; -}; - diff --git a/algo/x17/sonoa-gate.h b/algo/x17/sonoa-gate.h deleted file mode 100644 index c97a375..0000000 --- a/algo/x17/sonoa-gate.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef SONOA_GATE_H__ -#define SONOA_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define SONOA_4WAY -#endif - -bool register_sonoa_algo( algo_gate_t* gate ); - -#if defined(SONOA_4WAY) - -void sonoa_4way_hash( void *state, const void *input ); - -int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -//void init_sonoa_4way_ctx(); - -#endif - -void sonoa_hash( void *state, const void *input ); - -int scanhash_sonoa( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void init_sonoa_ctx(); - -#endif - diff --git a/algo/x17/sonoa.c b/algo/x17/sonoa.c deleted file mode 100644 index ce1c0fc..0000000 --- a/algo/x17/sonoa.c +++ /dev/null @@ -1,623 +0,0 @@ -#include "sonoa-gate.h" -#include -#include -#include -#include -#include "algo/blake/sph_blake.h" -#include "algo/bmw/sph_bmw.h" -#include "algo/groestl/sph_groestl.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/hamsi/sph_hamsi.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/shabal/sph_shabal.h" -#include "algo/whirlpool/sph_whirlpool.h" -#include "algo/haval/sph-haval.h" -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include 
"algo/bmw/sse2/bmw.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" -#include -#if defined(__AES__) - #include "algo/echo/aes_ni/hash_api.h" - #include "algo/groestl/aes_ni/hash-groestl.h" -#else - #include "algo/groestl/sph_groestl.h" - #include "algo/echo/sph_echo.h" -#endif - -typedef struct { - sph_blake512_context blake; - sph_bmw512_context bmw; -#if defined(__AES__) - hashState_echo echo; - hashState_groestl groestl; -#else - sph_groestl512_context groestl; - sph_echo512_context echo; -#endif - sph_jh512_context jh; - sph_keccak512_context keccak; - sph_skein512_context skein; - hashState_luffa luffa; - cubehashParam cubehash; - sph_shavite512_context shavite; - hashState_sd simd; - sph_hamsi512_context hamsi; - sph_fugue512_context fugue; - sph_shabal512_context shabal; - sph_whirlpool_context whirlpool; - SHA512_CTX sha512; - sph_haval256_5_context haval; -} sonoa_ctx_holder; - -sonoa_ctx_holder sonoa_ctx __attribute__ ((aligned (64))); - -void init_sonoa_ctx() -{ - sph_blake512_init( &sonoa_ctx.blake); - sph_bmw512_init( &sonoa_ctx.bmw); -#if defined(__AES__) - init_echo( &sonoa_ctx.echo, 512 ); - init_groestl( &sonoa_ctx.groestl, 64 ); -#else - sph_groestl512_init(&sonoa_ctx.groestl ); - sph_echo512_init( &sonoa_ctx.echo ); -#endif - sph_skein512_init( &sonoa_ctx.skein); - sph_jh512_init( &sonoa_ctx.jh); - sph_keccak512_init( &sonoa_ctx.keccak ); - init_luffa( &sonoa_ctx.luffa, 512 ); - cubehashInit( &sonoa_ctx.cubehash, 512, 16, 32 ); - sph_shavite512_init( &sonoa_ctx.shavite ); - init_sd( &sonoa_ctx.simd, 512 ); - sph_hamsi512_init( &sonoa_ctx.hamsi ); - sph_fugue512_init( &sonoa_ctx.fugue ); - sph_shabal512_init( &sonoa_ctx.shabal ); - sph_whirlpool_init( &sonoa_ctx.whirlpool ); - SHA512_Init( &sonoa_ctx.sha512 ); - sph_haval256_5_init(&sonoa_ctx.haval); -}; - -void sonoa_hash( void *state, const void *input ) -{ - uint8_t hash[128] __attribute__ ((aligned (64))); - sonoa_ctx_holder 
ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &sonoa_ctx, sizeof(sonoa_ctx) ); - - sph_blake512(&ctx.blake, input, 80); - sph_blake512_close(&ctx.blake, hash); - - sph_bmw512(&ctx.bmw, hash, 64); - sph_bmw512_close(&ctx.bmw, hash); - -#if defined(__AES__) - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); -#else - sph_groestl512(&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); -#endif - - sph_skein512(&ctx.skein, hash, 64); - sph_skein512_close(&ctx.skein, hash); - - sph_jh512(&ctx.jh, hash, 64); - sph_jh512_close(&ctx.jh, hash); - - sph_keccak512(&ctx.keccak, hash, 64); - sph_keccak512_close(&ctx.keccak, hash); - - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)hash, 64 ); - - cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, - (const byte*)hash, 64 ); - - sph_shavite512(&ctx.shavite, hash, 64); - sph_shavite512_close(&ctx.shavite, hash); - - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); - -#if defined(__AES__) - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#else - sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hash); -#endif - -// - - sph_bmw512_init( &ctx.bmw); - sph_bmw512(&ctx.bmw, hash, 64); - sph_bmw512_close(&ctx.bmw, hash); - -#if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); -#else - sph_groestl512_init(&ctx.groestl ); - sph_groestl512(&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); -#endif - - sph_skein512_init( &ctx.skein); - sph_skein512(&ctx.skein, hash, 64); - sph_skein512_close(&ctx.skein, hash); - - sph_jh512_init( &ctx.jh); - sph_jh512(&ctx.jh, hash, 64); - sph_jh512_close(&ctx.jh, hash); - - sph_keccak512_init( &ctx.keccak ); - sph_keccak512(&ctx.keccak, hash, 64); - sph_keccak512_close(&ctx.keccak, hash); - - init_luffa( &ctx.luffa, 512 ); - 
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)hash, 64 ); - - cubehashInit( &ctx.cubehash, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, - (const byte*)hash, 64 ); - - sph_shavite512_init( &ctx.shavite ); - sph_shavite512(&ctx.shavite, hash, 64); - sph_shavite512_close(&ctx.shavite, hash); - - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); - -#if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#else - sph_echo512_init( &ctx.echo ); - sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hash); -#endif - - sph_hamsi512(&ctx.hamsi, hash, 64); - sph_hamsi512_close(&ctx.hamsi, hash); - -// - - sph_bmw512_init( &ctx.bmw); - sph_bmw512(&ctx.bmw, hash, 64); - sph_bmw512_close(&ctx.bmw, hash); - -#if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); -#else - sph_groestl512_init(&ctx.groestl ); - sph_groestl512(&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); -#endif - - sph_skein512_init( &ctx.skein); - sph_skein512(&ctx.skein, hash, 64); - sph_skein512_close(&ctx.skein, hash); - - sph_jh512_init( &ctx.jh); - sph_jh512(&ctx.jh, hash, 64); - sph_jh512_close(&ctx.jh, hash); - - sph_keccak512_init( &ctx.keccak ); - sph_keccak512(&ctx.keccak, hash, 64); - sph_keccak512_close(&ctx.keccak, hash); - - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)hash, 64 ); - - cubehashInit( &ctx.cubehash, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, - (const byte*)hash, 64 ); - - sph_shavite512_init( &ctx.shavite ); - sph_shavite512(&ctx.shavite, hash, 64); - sph_shavite512_close(&ctx.shavite, hash); - - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash, - 
(const BitSequence *)hash, 512 ); - -#if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#else - sph_echo512_init( &ctx.echo ); - sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hash); -#endif - - sph_hamsi512_init( &ctx.hamsi ); - sph_hamsi512(&ctx.hamsi, hash, 64); - sph_hamsi512_close(&ctx.hamsi, hash); - - sph_fugue512(&ctx.fugue, hash, 64); - sph_fugue512_close(&ctx.fugue, hash); - -// - - sph_bmw512_init( &ctx.bmw); - sph_bmw512(&ctx.bmw, hash, 64); - sph_bmw512_close(&ctx.bmw, hash); - -#if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); -#else - sph_groestl512_init(&ctx.groestl ); - sph_groestl512(&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); -#endif - - sph_skein512_init( &ctx.skein); - sph_skein512(&ctx.skein, hash, 64); - sph_skein512_close(&ctx.skein, hash); - - sph_jh512_init( &ctx.jh); - sph_jh512(&ctx.jh, hash, 64); - sph_jh512_close(&ctx.jh, hash); - - sph_keccak512_init( &ctx.keccak ); - sph_keccak512(&ctx.keccak, hash, 64); - sph_keccak512_close(&ctx.keccak, hash); - - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)hash, 64 ); - - cubehashInit( &ctx.cubehash, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, - (const byte*)hash, 64 ); - - sph_shavite512_init( &ctx.shavite ); - sph_shavite512(&ctx.shavite, hash, 64); - sph_shavite512_close(&ctx.shavite, hash); - - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); - -#if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#else - sph_echo512_init( &ctx.echo ); - sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hash); -#endif - - 
sph_hamsi512_init( &ctx.hamsi ); - sph_hamsi512(&ctx.hamsi, hash, 64); - sph_hamsi512_close(&ctx.hamsi, hash); - - sph_fugue512_init( &ctx.fugue ); - sph_fugue512(&ctx.fugue, hash, 64); - sph_fugue512_close(&ctx.fugue, hash); - - sph_shabal512(&ctx.shabal, hash, 64); - sph_shabal512_close(&ctx.shabal, hash); - - sph_hamsi512_init( &ctx.hamsi ); - sph_hamsi512(&ctx.hamsi, hash, 64); - sph_hamsi512_close(&ctx.hamsi, hash); - -#if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#else - sph_echo512_init( &ctx.echo ); - sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hash); -#endif - - sph_shavite512_init( &ctx.shavite ); - sph_shavite512(&ctx.shavite, hash, 64); - sph_shavite512_close(&ctx.shavite, hash); - -// - - sph_bmw512_init( &ctx.bmw); - sph_bmw512(&ctx.bmw, hash, 64); - sph_bmw512_close(&ctx.bmw, hash); - - sph_shabal512_init( &ctx.shabal ); - sph_shabal512(&ctx.shabal, hash, 64); - sph_shabal512_close(&ctx.shabal, hash); - -#if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); -#else - sph_groestl512_init(&ctx.groestl ); - sph_groestl512(&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); -#endif - - sph_skein512_init( &ctx.skein); - sph_skein512(&ctx.skein, hash, 64); - sph_skein512_close(&ctx.skein, hash); - - sph_jh512_init( &ctx.jh); - sph_jh512(&ctx.jh, hash, 64); - sph_jh512_close(&ctx.jh, hash); - - sph_keccak512_init( &ctx.keccak ); - sph_keccak512(&ctx.keccak, hash, 64); - sph_keccak512_close(&ctx.keccak, hash); - - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)hash, 64 ); - - cubehashInit( &ctx.cubehash, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, - (const byte*)hash, 64 ); - - sph_shavite512_init( &ctx.shavite ); - sph_shavite512(&ctx.shavite, hash, 64); 
- sph_shavite512_close(&ctx.shavite, hash); - - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); - -#if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#else - sph_echo512_init( &ctx.echo ); - sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hash); -#endif - - sph_hamsi512_init( &ctx.hamsi ); - sph_hamsi512(&ctx.hamsi, hash, 64); - sph_hamsi512_close(&ctx.hamsi, hash); - - sph_fugue512_init( &ctx.fugue ); - sph_fugue512(&ctx.fugue, hash, 64); - sph_fugue512_close(&ctx.fugue, hash); - - sph_shabal512_init( &ctx.shabal ); - sph_shabal512(&ctx.shabal, hash, 64); - sph_shabal512_close(&ctx.shabal, hash); - - sph_whirlpool(&ctx.whirlpool, hash, 64); - sph_whirlpool_close(&ctx.whirlpool, hash); - -// - sph_bmw512_init( &ctx.bmw); - sph_bmw512(&ctx.bmw, hash, 64); - sph_bmw512_close(&ctx.bmw, hash); - -#if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); -#else - sph_groestl512_init(&ctx.groestl ); - sph_groestl512(&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); -#endif - - sph_skein512_init( &ctx.skein); - sph_skein512(&ctx.skein, hash, 64); - sph_skein512_close(&ctx.skein, hash); - - sph_jh512_init( &ctx.jh); - sph_jh512(&ctx.jh, hash, 64); - sph_jh512_close(&ctx.jh, hash); - - sph_keccak512_init( &ctx.keccak ); - sph_keccak512(&ctx.keccak, hash, 64); - sph_keccak512_close(&ctx.keccak, hash); - - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)hash, 64 ); - - cubehashInit( &ctx.cubehash, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, - (const byte*)hash, 64 ); - - sph_shavite512_init( &ctx.shavite ); - sph_shavite512(&ctx.shavite, hash, 64); - sph_shavite512_close(&ctx.shavite, hash); - - init_sd( &ctx.simd, 512 
); - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); - -#if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#else - sph_echo512_init( &ctx.echo ); - sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hash); -#endif - - sph_hamsi512_init( &ctx.hamsi ); - sph_hamsi512(&ctx.hamsi, hash, 64); - sph_hamsi512_close(&ctx.hamsi, hash); - - sph_fugue512_init( &ctx.fugue ); - sph_fugue512(&ctx.fugue, hash, 64); - sph_fugue512_close(&ctx.fugue, hash); - - sph_shabal512_init( &ctx.shabal ); - sph_shabal512(&ctx.shabal, hash, 64); - sph_shabal512_close(&ctx.shabal, hash); - - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool(&ctx.whirlpool, hash, 64); - sph_whirlpool_close(&ctx.whirlpool, hash); - - SHA512_Update( &ctx.sha512, hash, 64 ); - SHA512_Final( (unsigned char*) hash, &ctx.sha512 ); - - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool(&ctx.whirlpool, hash, 64); - sph_whirlpool_close(&ctx.whirlpool, hash); - -// - - sph_bmw512_init( &ctx.bmw); - sph_bmw512(&ctx.bmw, hash, 64); - sph_bmw512_close(&ctx.bmw, hash); - -#if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); -#else - sph_groestl512_init(&ctx.groestl ); - sph_groestl512(&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); -#endif - - sph_skein512_init( &ctx.skein); - sph_skein512(&ctx.skein, hash, 64); - sph_skein512_close(&ctx.skein, hash); - - sph_jh512_init( &ctx.jh); - sph_jh512(&ctx.jh, hash, 64); - sph_jh512_close(&ctx.jh, hash); - - sph_keccak512_init( &ctx.keccak ); - sph_keccak512(&ctx.keccak, hash, 64); - sph_keccak512_close(&ctx.keccak, hash); - - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)hash, 64 ); - - cubehashInit( &ctx.cubehash, 512, 16, 32 ); - cubehashUpdateDigest( 
&ctx.cubehash, (byte*) hash, - (const byte*)hash, 64 ); - - sph_shavite512_init( &ctx.shavite ); - sph_shavite512(&ctx.shavite, hash, 64); - sph_shavite512_close(&ctx.shavite, hash); - - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); - -#if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#else - sph_echo512_init( &ctx.echo ); - sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hash); -#endif - - sph_hamsi512_init( &ctx.hamsi ); - sph_hamsi512(&ctx.hamsi, hash, 64); - sph_hamsi512_close(&ctx.hamsi, hash); - - sph_fugue512_init( &ctx.fugue ); - sph_fugue512(&ctx.fugue, hash, 64); - sph_fugue512_close(&ctx.fugue, hash); - - sph_shabal512_init( &ctx.shabal ); - sph_shabal512(&ctx.shabal, hash, 64); - sph_shabal512_close(&ctx.shabal, hash); - - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool(&ctx.whirlpool, hash, 64); - sph_whirlpool_close(&ctx.whirlpool, hash); - - SHA512_Init( &ctx.sha512 ); - SHA512_Update( &ctx.sha512, hash, 64 ); - SHA512_Final( (unsigned char*) hash, &ctx.sha512 ); - - sph_haval256_5(&ctx.haval,(const void*) hash, 64); - sph_haval256_5_close(&ctx.haval, hash); - - memcpy(state, hash, 32); -} - -int scanhash_sonoa( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(128) hash32[8]; - uint32_t _ALIGN(128) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - uint32_t n = pdata[19] - 1; - int thr_id = mythr->id; // thr_id arg is deprecated - - uint64_t htmax[] = - { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = - { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - - // we need bigendian data... 
- casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - - for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - sonoa_hash(hash32, endiandata); - if ( !( hash32[7] & mask ) ) - if ( fulltest( hash32, ptarget ) && !opt_benchmark ) - submit_solution( work, hash32, mythr ); - } while (n < max_nonce && !work_restart[thr_id].restart); - break; - } - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c deleted file mode 100644 index f34f7ee..0000000 --- a/algo/x17/x17-4way.c +++ /dev/null @@ -1,254 +0,0 @@ -#include "x17-gate.h" - -#if defined(X17_4WAY) - -#include -#include -#include -#include -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/luffa-hash-2way.h" -#include "algo/cubehash/cube-hash-2way.h" -#include "algo/shavite/shavite-hash-2way.h" -#include "algo/simd/simd-hash-2way.h" -#include "algo/echo/aes_ni/hash_api.h" -#include "algo/hamsi/hamsi-hash-4way.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/shabal/shabal-hash-4way.h" -#include "algo/whirlpool/sph_whirlpool.h" -#include "algo/haval/haval-hash-4way.h" -#include "algo/sha/sha2-hash-4way.h" - -union _x17_4way_context_overlay -{ - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_groestl groestl; - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; - 
luffa_2way_context luffa; - cube_2way_context cube; - shavite512_2way_context shavite; - simd_2way_context simd; - hashState_echo echo; - hamsi512_4way_context hamsi; - sph_fugue512_context fugue; - shabal512_4way_context shabal; - sph_whirlpool_context whirlpool; - sha512_4way_context sha512; - haval256_5_4way_context haval; -}; -typedef union _x17_4way_context_overlay x17_4way_context_overlay; - -void x17_4way_hash( void *state, const void *input ) -{ - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - uint64_t vhashA[8*4] __attribute__ ((aligned (64))); - uint64_t vhashB[8*4] __attribute__ ((aligned (64))); - x17_4way_context_overlay ctx; - - // 1 Blake parallel 4 way 64 bit - blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, input, 80 ); - blake512_4way_close( &ctx.blake, vhash ); - - // 2 Bmw - bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); - - // Serialize - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - - // 3 Groestl - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - - // Parallellize - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - // 4 Skein parallel 4 way 64 bit - skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - - // 5 JH - jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); 
- jh512_4way_close( &ctx.jh, vhash ); - - // 6 Keccak - keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - - // 7 Luffa parallel 2 way 128 bit - rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); - - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - - // 8 Cubehash - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 ); - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 ); - - // 9 Shavite - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); - - // 10 Simd - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 ); - - dintrlv_2x128_512( hash0, hash1, vhashA ); - dintrlv_2x128_512( hash2, hash3, vhashB ); - - // 11 Echo serial - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - - // 12 Hamsi parallel 4 way 64 bit - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - - dintrlv_4x64_512( hash0, hash1, 
hash2, hash3, vhash ); - - // 13 Fugue serial - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, 64 ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, 64 ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, 64 ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, 64 ); - sph_fugue512_close( &ctx.fugue, hash3 ); - - // 14 Shabal, parallel 4 way 32 bit - intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); - - shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); - shabal512_4way_close( &ctx.shabal, vhash ); - - dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); - - // 15 Whirlpool serial - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - - // 16 SHA512 parallel 64 bit - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - - sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, 64 ); - sha512_4way_close( &ctx.sha512, vhash ); - - // 17 Haval parallel 32 bit - rintrlv_4x64_4x32( vhashB, vhash, 512 ); - - haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhashB, 64 ); - haval256_5_4way_close( &ctx.haval, state ); -} - -int scanhash_x17_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*16] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ 
((aligned (64))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *hash7 = &(hash[7<<2]); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; - const uint32_t first_nonce = pdata[19]; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; - - // Need big endian data - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[ m ]; - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - x17_4way_hash( hash, vdata ); - - for ( int lane = 0; lane < 4; lane++ ) - if ( ( hash7[ lane ] & mask ) == 0 ) - { - extr_lane_4x32( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_lane_solution( work, lane_hash, mythr, lane ); - } - } - n += 4; - } while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart ); - break; - } - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/x17/x17-gate.c b/algo/x17/x17-gate.c deleted file mode 100644 index 69d28f6..0000000 --- a/algo/x17/x17-gate.c +++ /dev/null @@ -1,15 +0,0 @@ -#include "x17-gate.h" - -bool register_x17_algo( algo_gate_t* gate ) -{ -#if defined (X17_4WAY) - gate->scanhash = (void*)&scanhash_x17_4way; - gate->hash = (void*)&x17_4way_hash; -#else - gate->scanhash = (void*)&scanhash_x17; - gate->hash = (void*)&x17_hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - return true; -}; - diff --git a/algo/x17/x17-gate.h b/algo/x17/x17-gate.h deleted file mode 100644 index 9a40b34..0000000 --- a/algo/x17/x17-gate.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef X17_GATE_H__ 
-#define X17_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define X17_4WAY -#endif - -bool register_x17_algo( algo_gate_t* gate ); - -#if defined(X17_4WAY) - -void x17_4way_hash( void *state, const void *input ); -int scanhash_x17_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -#endif - -void x17_hash( void *state, const void *input ); -int scanhash_x17( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -#endif - diff --git a/algo/x17/x17.c b/algo/x17/x17.c deleted file mode 100644 index c4ddc76..0000000 --- a/algo/x17/x17.c +++ /dev/null @@ -1,242 +0,0 @@ -#include "x17-gate.h" -#include -#include -#include -#include -#include "algo/blake/sph_blake.h" -#include "algo/bmw/sph_bmw.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/luffa/sph_luffa.h" -#include "algo/cubehash/sph_cubehash.h" -#include "algo/simd/sph_simd.h" -#include "algo/hamsi/sph_hamsi.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/shabal/sph_shabal.h" -#include "algo/whirlpool/sph_whirlpool.h" -#include "algo/haval/sph-haval.h" -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" -#include -#if defined(__AES__) - #include "algo/echo/aes_ni/hash_api.h" - #include "algo/groestl/aes_ni/hash-groestl.h" -#else - #include "algo/groestl/sph_groestl.h" - #include "algo/echo/sph_echo.h" -#endif - -union _x17_context_overlay -{ -#if defined(__AES__) - hashState_groestl groestl; - hashState_echo echo; -#else - sph_groestl512_context groestl; - sph_echo512_context echo; -#endif - hashState_luffa luffa; - 
cubehashParam cube; - sph_shavite512_context shavite; - hashState_sd simd; - sph_hamsi512_context hamsi; - sph_fugue512_context fugue; - sph_shabal512_context shabal; - sph_whirlpool_context whirlpool; - SHA512_CTX sha512; - sph_haval256_5_context haval; -}; -typedef union _x17_context_overlay x17_context_overlay; - -void x17_hash(void *output, const void *input) -{ - unsigned char hash[128] __attribute__ ((aligned (64))); - #define hashB hash+64 - x17_context_overlay ctx; - - unsigned char hashbuf[128]; - size_t hashptr; - sph_u64 hashctA; - sph_u64 hashctB; - - //---blake1--- - - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; - - //---bmw2--- - DECL_BMW; - BMW_I; - BMW_U; - - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - - BMW_C; - - #undef M - #undef H - #undef dH - - //---groestl---- - -#if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); -#else - sph_groestl512_init( &ctx.groestl ); - sph_groestl512( &ctx.groestl, hash, 64 ); - sph_groestl512_close( &ctx.groestl, hash ); -#endif - - //---skein4--- - - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; - - //---jh5------ - - DECL_JH; - JH_H; - - //---keccak6--- - - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; - - //--- luffa7 - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)hash, 64 ); - - // 8 Cube - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash, - (const byte*)hash, 64 ); - - // 9 Shavite - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash, 64); - sph_shavite512_close( &ctx.shavite, hash); - - // 10 Simd - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence*)hash, - (const BitSequence*)hash, 512 ); - - //11---echo--- -#if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence*)hash, - (const BitSequence*)hash, 512 ); 
-#else - sph_echo512_init( &ctx.echo ); - sph_echo512( &ctx.echo, hash, 64 ); - sph_echo512_close( &ctx.echo, hash ); -#endif - - // X13 algos - // 12 Hamsi - sph_hamsi512_init( &ctx.hamsi ); - sph_hamsi512( &ctx.hamsi, hash, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash ); - - // 13 Fugue - sph_fugue512_init( &ctx.fugue ); - sph_fugue512(&ctx.fugue, hash, 64 ); - sph_fugue512_close(&ctx.fugue, hash ); - - // X14 Shabal - sph_shabal512_init( &ctx.shabal ); - sph_shabal512(&ctx.shabal, hash, 64); - sph_shabal512_close( &ctx.shabal, hash ); - - // X15 Whirlpool - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash ); - - SHA512_Init( &ctx.sha512 ); - SHA512_Update( &ctx.sha512, hash, 64 ); - SHA512_Final( (unsigned char*)hash, &ctx.sha512 ); - - sph_haval256_5_init(&ctx.haval); - sph_haval256_5( &ctx.haval, (const void*)hash, 64 ); - sph_haval256_5_close( &ctx.haval, output ); -} - -int scanhash_x17( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr) -{ - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t hash64[8] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - - uint64_t htmax[] = - { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = - { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - // we need bigendian data... 
- casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - - for ( int m = 0; m < 6; m++ ) - { - if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - pdata[19] = ++n; - be32enc( &endiandata[19], n ); - x17_hash( hash64, endiandata ); - if ( !( hash64[7] & mask ) ) - if ( fulltest( hash64, ptarget ) && !opt_benchmark ) - submit_solution( work, hash64, mythr ); - } while ( n < max_nonce && !work_restart[thr_id].restart); - break; - } - } - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} diff --git a/algo/x17/xevan-4way.c b/algo/x17/xevan-4way.c deleted file mode 100644 index aad5b27..0000000 --- a/algo/x17/xevan-4way.c +++ /dev/null @@ -1,373 +0,0 @@ -#include "xevan-gate.h" - -#if defined(XEVAN_4WAY) - -#include -#include -#include -#include -#include "algo/blake/blake-hash-4way.h" -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/luffa/luffa-hash-2way.h" -#include "algo/cubehash/cube-hash-2way.h" -#include "algo/shavite/shavite-hash-2way.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/simd-hash-2way.h" -#include "algo/echo/aes_ni/hash_api.h" -#include "algo/hamsi/hamsi-hash-4way.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/shabal/shabal-hash-4way.h" -#include "algo/whirlpool/sph_whirlpool.h" -#include "algo/sha/sha2-hash-4way.h" -#include "algo/haval/haval-hash-4way.h" - -union _xevan_4way_context_overlay -{ - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_groestl groestl; - skein512_4way_context 
skein; - jh512_4way_context jh; - keccak512_4way_context keccak; - luffa_2way_context luffa; - cube_2way_context cube; - shavite512_2way_context shavite; - simd_2way_context simd; - hashState_echo echo; - hamsi512_4way_context hamsi; - sph_fugue512_context fugue; - shabal512_4way_context shabal; - sph_whirlpool_context whirlpool; - sha512_4way_context sha512; - haval256_5_4way_context haval; -}; -typedef union _xevan_4way_context_overlay xevan_4way_context_overlay; - -void xevan_4way_hash( void *output, const void *input ) -{ - uint64_t hash0[16] __attribute__ ((aligned (64))); - uint64_t hash1[16] __attribute__ ((aligned (64))); - uint64_t hash2[16] __attribute__ ((aligned (64))); - uint64_t hash3[16] __attribute__ ((aligned (64))); - uint64_t vhash[16<<2] __attribute__ ((aligned (64))); - uint64_t vhashA[16<<2] __attribute__ ((aligned (64))); - uint64_t vhashB[16<<2] __attribute__ ((aligned (64))); - const int dataLen = 128; - xevan_4way_context_overlay ctx __attribute__ ((aligned (64))); - - // parallel 4 way - - blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, input, 80 ); - blake512_4way_close(&ctx.blake, vhash); - memset( &vhash[8<<2], 0, 64<<2 ); - - bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, dataLen ); - bmw512_4way_close( &ctx.bmw, vhash ); - - // Serial - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); - - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, - dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, - dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, - dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, - dataLen<<3 ); - - // Parallel 4way - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); - - skein512_4way_init( &ctx.skein ); - 
skein512_4way( &ctx.skein, vhash, dataLen ); - skein512_4way_close( &ctx.skein, vhash ); - - jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, dataLen ); - jh512_4way_close( &ctx.jh, vhash ); - - keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, dataLen ); - keccak512_4way_close( &ctx.keccak, vhash ); - - rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 ); - - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, dataLen ); - - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashA, vhashA, dataLen ); - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashB, vhashB, dataLen ); - - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, dataLen ); - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, dataLen ); - - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 ); - - dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 ); - dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 ); - - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, dataLen<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, dataLen<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, dataLen<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, dataLen<<3 ); - // Parallel - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); - - hamsi512_4way_init( 
&ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, dataLen ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); - - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, dataLen ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, dataLen ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, dataLen ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, dataLen ); - sph_fugue512_close( &ctx.fugue, hash3 ); - - // Parallel 4way 32 bit - intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); - - shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, dataLen ); - shabal512_4way_close( &ctx.shabal, vhash ); - - dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); - - // Serial - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); - - sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, dataLen ); - sha512_4way_close( &ctx.sha512, vhash ); - - rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 ); - - haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhashA, dataLen ); - haval256_5_4way_close( &ctx.haval, vhashA ); - - rintrlv_4x32_4x64( vhash, vhashA, dataLen<<3 ); - - memset( &vhash[ 4<<2 ], 0, 
(dataLen-32) << 2 ); - - blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, vhash, dataLen ); - blake512_4way_close(&ctx.blake, vhash); - - bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, dataLen ); - bmw512_4way_close( &ctx.bmw, vhash ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); - - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, - dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, - dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, - dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, - dataLen<<3 ); - - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); - - skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, dataLen ); - skein512_4way_close( &ctx.skein, vhash ); - - jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, dataLen ); - jh512_4way_close( &ctx.jh, vhash ); - - keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, dataLen ); - keccak512_4way_close( &ctx.keccak, vhash ); - - rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 ); - - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, dataLen ); - - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashA, vhashA, dataLen ); - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashB, vhashB, dataLen ); - - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, dataLen ); - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, dataLen ); - - simd_2way_init( 
&ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 ); - - dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 ); - dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 ); - - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, dataLen<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, dataLen<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, dataLen<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, dataLen<<3 ); - - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); - - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, dataLen ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); - - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash0, dataLen ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash1, dataLen ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash2, dataLen ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, hash3, dataLen ); - sph_fugue512_close( &ctx.fugue, hash3 ); - - intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); - - shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, dataLen ); - shabal512_4way_close( &ctx.shabal, vhash ); - - dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); - - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash0, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - 
sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash1, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash2, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash3, dataLen ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); - - sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, dataLen ); - sha512_4way_close( &ctx.sha512, vhash ); - - rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 ); - - haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhashA, dataLen ); - haval256_5_4way_close( &ctx.haval, output ); -} - -int scanhash_xevan_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[4*16] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *hash7 = &(hash[7<<2]); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - int thr_id = mythr->id; // thr_id arg is deprecated - __m256i *noncev = (__m256i*)vdata + 9; // aligned - - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ), *noncev ); - - xevan_4way_hash( hash, vdata ); - for ( int lane = 0; lane < 4; lane++ ) - if ( hash7[ lane ] <= Htarg ) - { - extr_lane_4x32( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_lane_solution( work, lane_hash, mythr, lane ); - } - } - n += 4; - } while ( ( n < max_nonce-4 ) && 
!work_restart[thr_id].restart ); - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif diff --git a/algo/x17/xevan-gate.c b/algo/x17/xevan-gate.c deleted file mode 100644 index 1f9cd8e..0000000 --- a/algo/x17/xevan-gate.c +++ /dev/null @@ -1,24 +0,0 @@ -#include "xevan-gate.h" - -void xevan_set_target( struct work* work, double job_diff ) -{ - work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); -} - -bool register_xevan_algo( algo_gate_t* gate ) -{ -#if defined (XEVAN_4WAY) -// init_xevan_4way_ctx(); - gate->scanhash = (void*)&scanhash_xevan_4way; - gate->hash = (void*)&xevan_4way_hash; -#else - init_xevan_ctx(); - gate->scanhash = (void*)&scanhash_xevan; - gate->hash = (void*)&xevan_hash; -#endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - gate->set_target = (void*)&xevan_set_target; - gate->get_max64 = (void*)&get_max64_0xffffLL; - return true; -}; - diff --git a/algo/x17/xevan-gate.h b/algo/x17/xevan-gate.h deleted file mode 100644 index c614c0b..0000000 --- a/algo/x17/xevan-gate.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef XEVAN_GATE_H__ -#define XEVAN_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) && defined(__AES__) - #define XEVAN_4WAY -#endif - -bool register_xevan_algo( algo_gate_t* gate ); - -#if defined(XEVAN_4WAY) - -void xevan_4way_hash( void *state, const void *input ); - -int scanhash_xevan_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -//void init_xevan_4way_ctx(); - -#endif - -void xevan_hash( void *state, const void *input ); - -int scanhash_xevan( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void init_xevan_ctx(); - -#endif - diff --git a/algo/x17/xevan.c b/algo/x17/xevan.c deleted file mode 100644 index b351eb3..0000000 --- a/algo/x17/xevan.c +++ /dev/null @@ -1,270 +0,0 @@ -#include "xevan-gate.h" - -#include -#include -#include -#include - -#include "algo/blake/sph_blake.h" -#include 
"algo/bmw/sph_bmw.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/hamsi/sph_hamsi.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/shabal/sph_shabal.h" -#include "algo/whirlpool/sph_whirlpool.h" -#include "algo/haval/sph-haval.h" -#include "algo/simd/nist.h" -#include "algo/cubehash/cubehash_sse2.h" -#include -#if defined(__AES__) - #include "algo/groestl/aes_ni/hash-groestl.h" - #include "algo/echo/aes_ni/hash_api.h" -#else - #include "algo/groestl/sph_groestl.h" - #include "algo/echo/sph_echo.h" -#endif - -typedef struct { - sph_blake512_context blake; - sph_bmw512_context bmw; - sph_skein512_context skein; - sph_jh512_context jh; - sph_keccak512_context keccak; - hashState_luffa luffa; - cubehashParam cubehash; - sph_shavite512_context shavite; - hashState_sd simd; - sph_hamsi512_context hamsi; - sph_fugue512_context fugue; - sph_shabal512_context shabal; - sph_whirlpool_context whirlpool; - SHA512_CTX sha512; - sph_haval256_5_context haval; -#if defined(__AES__) - hashState_echo echo; - hashState_groestl groestl; -#else - sph_groestl512_context groestl; - sph_echo512_context echo; -#endif -} xevan_ctx_holder; - -xevan_ctx_holder xevan_ctx __attribute__ ((aligned (64))); -static __thread sph_blake512_context xevan_blake_mid - __attribute__ ((aligned (64))); - -void init_xevan_ctx() -{ - sph_blake512_init(&xevan_ctx.blake); - sph_bmw512_init(&xevan_ctx.bmw); - sph_skein512_init(&xevan_ctx.skein); - sph_jh512_init(&xevan_ctx.jh); - sph_keccak512_init(&xevan_ctx.keccak); - init_luffa( &xevan_ctx.luffa, 512 ); - cubehashInit( &xevan_ctx.cubehash, 512, 16, 32 ); - sph_shavite512_init( &xevan_ctx.shavite ); - init_sd( &xevan_ctx.simd, 512 ); - sph_hamsi512_init( &xevan_ctx.hamsi ); - sph_fugue512_init( &xevan_ctx.fugue ); - sph_shabal512_init( &xevan_ctx.shabal ); - sph_whirlpool_init( &xevan_ctx.whirlpool ); - 
SHA512_Init( &xevan_ctx.sha512 ); - sph_haval256_5_init(&xevan_ctx.haval); -#if defined(__AES__) - init_groestl( &xevan_ctx.groestl, 64 ); - init_echo( &xevan_ctx.echo, 512 ); -#else - sph_groestl512_init( &xevan_ctx.groestl ); - sph_echo512_init( &xevan_ctx.echo ); -#endif -}; - -void xevan_blake512_midstate( const void* input ) -{ - memcpy( &xevan_blake_mid, &xevan_ctx.blake, sizeof xevan_blake_mid ); - sph_blake512( &xevan_blake_mid, input, 64 ); -} - -void xevan_hash(void *output, const void *input) -{ - uint32_t _ALIGN(64) hash[32]; // 128 bytes required - const int dataLen = 128; - xevan_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &xevan_ctx, sizeof(xevan_ctx) ); - - const int midlen = 64; // bytes - const int tail = 80 - midlen; // 16 - - memcpy( &ctx.blake, &xevan_blake_mid, sizeof xevan_blake_mid ); - sph_blake512( &ctx.blake, input + midlen, tail ); - sph_blake512_close(&ctx.blake, hash); - - memset(&hash[16], 0, 64); - - sph_bmw512(&ctx.bmw, hash, dataLen); - sph_bmw512_close(&ctx.bmw, hash); - -#if defined(__AES__) - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, dataLen*8 ); -#else - sph_groestl512(&ctx.groestl, hash, dataLen); - sph_groestl512_close(&ctx.groestl, hash); -#endif - - sph_skein512(&ctx.skein, hash, dataLen); - sph_skein512_close(&ctx.skein, hash); - - sph_jh512(&ctx.jh, hash, dataLen); - sph_jh512_close(&ctx.jh, hash); - - sph_keccak512(&ctx.keccak, hash, dataLen); - sph_keccak512_close(&ctx.keccak, hash); - - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)hash, dataLen ); - - cubehashUpdateDigest( &ctx.cubehash, (byte*)hash, - (const byte*) hash, dataLen ); - - sph_shavite512(&ctx.shavite, hash, dataLen); - sph_shavite512_close(&ctx.shavite, hash); - - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, dataLen*8 ); - -#if defined(__AES__) - update_final_echo( &ctx.echo, (BitSequence *) hash, - (const BitSequence *) hash, dataLen*8 ); 
-#else - sph_echo512(&ctx.echo, hash, dataLen); - sph_echo512_close(&ctx.echo, hash); -#endif - - sph_hamsi512(&ctx.hamsi, hash, dataLen); - sph_hamsi512_close(&ctx.hamsi, hash); - - sph_fugue512(&ctx.fugue, hash, dataLen); - sph_fugue512_close(&ctx.fugue, hash); - - sph_shabal512(&ctx.shabal, hash, dataLen); - sph_shabal512_close(&ctx.shabal, hash); - - sph_whirlpool(&ctx.whirlpool, hash, dataLen); - sph_whirlpool_close(&ctx.whirlpool, hash); - - SHA512_Update( &ctx.sha512, hash, dataLen ); - SHA512_Final( (unsigned char*) hash, &ctx.sha512 ); - - sph_haval256_5(&ctx.haval,(const void*) hash, dataLen); - sph_haval256_5_close(&ctx.haval, hash); - - memset(&hash[8], 0, dataLen - 32); - - memcpy( &ctx, &xevan_ctx, sizeof(xevan_ctx) ); - - sph_blake512(&ctx.blake, hash, dataLen); - sph_blake512_close(&ctx.blake, hash); - - sph_bmw512(&ctx.bmw, hash, dataLen); - sph_bmw512_close(&ctx.bmw, hash); - -#if defined(__AES__) - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const BitSequence*)hash, dataLen*8 ); -#else - sph_groestl512(&ctx.groestl, hash, dataLen); - sph_groestl512_close(&ctx.groestl, hash); -#endif - - sph_skein512(&ctx.skein, hash, dataLen); - sph_skein512_close(&ctx.skein, hash); - - sph_jh512(&ctx.jh, hash, dataLen); - sph_jh512_close(&ctx.jh, hash); - - sph_keccak512(&ctx.keccak, hash, dataLen); - sph_keccak512_close(&ctx.keccak, hash); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)hash, dataLen ); - - cubehashUpdateDigest( &ctx.cubehash, (byte*)hash, - (const byte*) hash, dataLen ); - - sph_shavite512(&ctx.shavite, hash, dataLen); - sph_shavite512_close(&ctx.shavite, hash); - - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, dataLen*8 ); - -#if defined(__AES__) - update_final_echo( &ctx.echo, (BitSequence *) hash, - (const BitSequence *) hash, dataLen*8 ); -#else - sph_echo512(&ctx.echo, hash, dataLen); - sph_echo512_close(&ctx.echo, hash); -#endif - - sph_hamsi512(&ctx.hamsi, hash, 
dataLen); - sph_hamsi512_close(&ctx.hamsi, hash); - - sph_fugue512(&ctx.fugue, hash, dataLen); - sph_fugue512_close(&ctx.fugue, hash); - - sph_shabal512(&ctx.shabal, hash, dataLen); - sph_shabal512_close(&ctx.shabal, hash); - - sph_whirlpool(&ctx.whirlpool, hash, dataLen); - sph_whirlpool_close(&ctx.whirlpool, hash); - - SHA512_Update( &ctx.sha512, hash, dataLen ); - SHA512_Final( (unsigned char*) hash, &ctx.sha512 ); - - sph_haval256_5(&ctx.haval,(const void*) hash, dataLen); - sph_haval256_5_close(&ctx.haval, hash); - - memcpy(output, hash, 32); -} - -int scanhash_xevan( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(64) hash[8]; - uint32_t _ALIGN(64) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - - if (opt_benchmark) - ptarget[7] = 0x0cff; - - for (int k=0; k < 19; k++) - be32enc(&endiandata[k], pdata[k]); - - xevan_blake512_midstate( endiandata ); - do { - be32enc(&endiandata[19], nonce); - xevan_hash(hash, endiandata); - - if (hash[7] <= Htarg ) - if ( fulltest( hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = nonce; - submit_solution( work, hash, mythr ); - } - nonce++; - } while ( nonce < max_nonce && !(*restart) ); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} - diff --git a/algo/x20/x20r-gate.c b/algo/x20/x20r-gate.c deleted file mode 100644 index 36113b7..0000000 --- a/algo/x20/x20r-gate.c +++ /dev/null @@ -1,34 +0,0 @@ -#include "x20r-gate.h" - -void getAlgoString( const uint8_t* prevblock, char *output ) -{ - char *sptr = outpuit; - - for ( int j = 0; j < X20R_HASH_FUNC_COUNT; j++ ) - { - char b = (19 - j) >> 1; // 16 ascii hex chars, reversed - uint8_t algoDigit = (j & 1) ? 
prevblock[b] & 0xF : prevblock[b] >> 4; - if (algoDigit >= 10) - sprintf(sptr, "%c", 'A' + (algoDigit - 10)); - else - sprintf(sptr, "%u", (uint32_t) algoDigit); - sptr++; - } - *sptr = '\0'; -} - -bool register_x20r_algo( algo_gate_t* gate ) -{ -#if defined (X20R_4WAY) - gate->scanhash = (void*)&scanhash_x20r_4way; - gate->hash = (void*)&x20r_4way_hash; -#else - gate->scanhash = (void*)&scanhash_x20r; - gate->hash = (void*)&x20r_hash; -#endif - gate->set_target = (void*)&alt_set_target; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; - x20_r_s_getAlgoString = (void*)&x20r_getAlgoString; - return true; -}; - diff --git a/algo/x20/x20r-gate.h b/algo/x20/x20r-gate.h deleted file mode 100644 index 359b6d4..0000000 --- a/algo/x20/x20r-gate.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef X20R_GATE_H__ -#define X20R_GATE_H__ 1 - -#include "algo-gate-api.h" -#include - -/* -#if defined(__AVX2__) && defined(__AES__) - #define X20R_4WAY -#endif -*/ - -enum x20r_Algo { - BLAKE = 0, - BMW, - GROESTL, - JH, - KECCAK, - SKEIN, - LUFFA, - CUBEHASH, - SHAVITE, - SIMD, - ECHO, - HAMSI, - FUGUE, - SHABAL, - WHIRLPOOL, - SHA_512, - HAVAL, // 256-bits output - GOST, - RADIOGATUN, // 256-bits output - PANAMA, // 256-bits output - X20R_HASH_FUNC_COUNT -}; - -void (*x20_r_s_getAlgoString) ( const uint8_t*, char* ); - -void x20r_getAlgoString( const uint8_t* prevblock, char *output ); - -bool register_xi20r_algo( algo_gate_t* gate ); - -#if defined(X20R_4WAY) - -void x20r_4way_hash( void *state, const void *input ); - -int scanhash_x20r_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); - -#endif - -void x20rhash( void *state, const void *input ); - -int scanhash_x20r( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); - -#endif - diff --git a/algo/x20/x20r.c b/algo/x20/x20r.c deleted file mode 100644 index 7b98990..0000000 --- a/algo/x20/x20r.c +++ /dev/null @@ -1,275 +0,0 @@ -#include "x20r-gate.h" - -#include -#include 
-#include - -#include "algo/blake/sph_blake.h" -#include "algo/bmw/sph_bmw.h" -#include "algo/jh/sph_jh.h" -#include "algo/keccak/sph_keccak.h" -#include "algo/skein/sph_skein.h" -#include "algo/shavite/sph_shavite.h" -#include "algo/hamsi/sph_hamsi.h" -#include "algo/fugue/sph_fugue.h" -#include "algo/shabal/sph_shabal.h" -#include "algo/whirlpool/sph_whirlpool.h" -#include "algo/haval/sph-haval.h" -#include "algo/radiogatun/sph_radiogatun.h" -#include "algo/panama/sph_panama.h" -#include "algo/gost/sph_gost.h" -#include -#if defined(__AES__) - #include "algo/echo/aes_ni/hash_api.h" - #include "algo/groestl/aes_ni/hash-groestl.h" -#else - #include "algo/groestl/sph_groestl.h" - #include "algo/echo/sph_echo.h" -#endif -#include "algo/luffa/luffa_for_sse2.h" -#include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/nist.h" - - -static __thread uint32_t s_ntime = UINT32_MAX; -static __thread char hashOrder[X20R_HASH_FUNC_COUNT + 1] = { 0 }; - -union _x20r_context_overlay -{ - sph_blake512_context blake; - sph_bmw512_context bmw; -#if defined(__AES__) - hashState_groestl groestl; - hashState_echo echo; -#else - sph_groestl512_context groestl; - sph_echo512_context echo; -#endif - sph_skein512_context skein; - sph_jh512_context jh; - sph_keccak512_context keccak; - hashState_luffa luffa; - cubehashParam cube; - hashState_sd simd; - sph_shavite512_context shavite; - sph_hamsi512_context hamsi; - sph_fugue512_context fugue; - sph_shabal512_context shabal; - sph_whirlpool_context whirlpool; - SHA512_CTX sha512; - sph_haval256_5_context haval; - sph_gost512_context gost; - sph_radiogatun64_context radiogatun; - sph_panama_context panama; -}; -typedef union _x20r_context_overlay x20r_context_overlay; - -void x20r_hash(void* output, const void* input) -{ - uint32_t _ALIGN(128) hash[64/4]; - x20r_context_overlay ctx; -/* - sph_blake512_context ctx_blake; - sph_bmw512_context ctx_bmw; - sph_groestl512_context ctx_groestl; - sph_skein512_context ctx_skein; - 
sph_jh512_context ctx_jh; - sph_keccak512_context ctx_keccak; - sph_luffa512_context ctx_luffa; - sph_cubehash512_context ctx_cubehash; - sph_shavite512_context ctx_shavite; - sph_simd512_context ctx_simd; - sph_echo512_context ctx_echo; - sph_hamsi512_context ctx_hamsi; - sph_fugue512_context ctx_fugue; - sph_shabal512_context ctx_shabal; - sph_whirlpool_context ctx_whirlpool; - sph_sha512_context ctx_sha512; - sph_haval256_5_context ctx_haval; - sph_gost512_context ctx_gost; - sph_radiogatun64_context ctx_radiogatun; - sph_panama_context ctx_panama; -*/ - void *in = (void*) input; - int size = 80; - - if ( s_ntime == UINT32_MAX ) - { - const uint8_t* in8 = (uint8_t*) input; - x20_r_s_getAlgoString(&in8[4], hashOrder); - } - - for (int i = 0; i < 20; i++) - { - const char elem = hashOrder[i]; - const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; - - switch ( algo ) - { - case BLAKE: - sph_blake512_init(&ctx.blake); - sph_blake512(&ctx.blake, in, size); - sph_blake512_close(&ctx.blake, hash); - break; - case BMW: - sph_bmw512_init(&ctx.bmw); - sph_bmw512(&ctx.bmw, in, size); - sph_bmw512_close(&ctx.bmw, hash); - break; - case GROESTL: -#if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)in, size<<3 ); -#else - sph_groestl512_init(&ctx.groestl); - sph_groestl512(&ctx.groestl, in, size); - sph_groestl512_close(&ctx.groestl, hash); -#endif - break; - case SKEIN: - sph_skein512_init(&ctx.skein); - sph_skein512(&ctx.skein, in, size); - sph_skein512_close(&ctx.skein, hash); - break; - case JH: - sph_jh512_init(&ctx.jh); - sph_jh512(&ctx.jh, in, size); - sph_jh512_close(&ctx.jh, hash); - break; - case KECCAK: - sph_keccak512_init(&ctx.keccak); - sph_keccak512(&ctx.keccak, in, size); - sph_keccak512_close(&ctx.keccak, hash); - break; - case LUFFA: - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)in, size ); - break; - case 
CUBEHASH: - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash, - (const byte*)in, size ); - break; - case SHAVITE: - sph_shavite512_init(&ctx.shavite); - sph_shavite512(&ctx.shavite, in, size); - sph_shavite512_close(&ctx.shavite, hash); - break; - case SIMD: - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)in, size<<3 ); - break; - case ECHO: -#if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence *)in, size<<3 ); -#else - sph_echo512_init(&ctx.echo); - sph_echo512(&ctx.echo, in, size); - sph_echo512_close(&ctx.echo, hash); -#endif - break; - case HAMSI: - sph_hamsi512_init(&ctx.hamsi); - sph_hamsi512(&ctx.hamsi, in, size); - sph_hamsi512_close(&ctx.hamsi, hash); - break; - case FUGUE: - sph_fugue512_init(&ctx.fugue); - sph_fugue512(&ctx.fugue, in, size); - sph_fugue512_close(&ctx.fugue, hash); - break; - case SHABAL: - sph_shabal512_init(&ctx.shabal); - sph_shabal512(&ctx.shabal, in, size); - sph_shabal512_close(&ctx.shabal, hash); - break; - case WHIRLPOOL: - sph_whirlpool_init(&ctx.whirlpool); - sph_whirlpool(&ctx.whirlpool, in, size); - sph_whirlpool_close(&ctx.whirlpool, hash); - break; - case SHA_512: - SHA512_Init( &ctx.sha512 ); - SHA512_Update( &ctx.sha512, in, size ); - SHA512_Final( (unsigned char*) hash, &ctx.sha512 ); - break; - case HAVAL: - sph_haval256_5_init(&ctx.haval); - sph_haval256_5(&ctx.haval, in, size); - sph_haval256_5_close(&ctx.haval, hash); - memset(&hash[8], 0, 32); - break; - case GOST: - sph_gost512_init(&ctx.gost); - sph_gost512(&ctx.gost, in, size); - sph_gost512_close(&ctx.gost, hash); - break; - case RADIOGATUN: - sph_radiogatun64_init(&ctx.radiogatun); - sph_radiogatun64(&ctx.radiogatun, in, size); - sph_radiogatun64_close(&ctx.radiogatun, hash); - memset(&hash[8], 0, 32); - break; - case PANAMA: - sph_panama_init(&ctx.panama); - sph_panama(&ctx.panama, in, size); - 
sph_panama_close(&ctx.panama, hash); - memset(&hash[8], 0, 32); - break; - } - in = (void*) hash; - size = 64; - } - memcpy(output, hash, 32); -} - -int scanhash_x20r( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ) -{ - uint32_t _ALIGN(128) hash32[8]; - uint32_t _ALIGN(128) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - - for (int k=0; k < 19; k++) - be32enc( &endiandata[k], pdata[k] ); - - if ( s_ntime != pdata[17] ) - { - uint32_t ntime = swab32(pdata[17]); - x20_r_s_getAlgoString( (const char*) (&endiandata[1]), hashOrder ); - s_ntime = ntime; - if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime); - } - - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - do { - be32enc( &endiandata[19], nonce ); - x20r_hash( hash32, endiandata ); - - if ( hash32[7] <= Htarg && fulltest( hash32, ptarget ) ) - { - work_set_target_ratio( work, hash32 ); - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce; - return 1; - } - nonce++; - - } while (nonce < max_nonce && !(*restart)); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} diff --git a/algo/yescrypt/sha256_Y.c b/algo/yescrypt/sha256_Y.c deleted file mode 100644 index 7b778ed..0000000 --- a/algo/yescrypt/sha256_Y.c +++ /dev/null @@ -1,409 +0,0 @@ -/*- - * Copyright 2005,2007,2009 Colin Percival - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include - -#include -#include - -#include "sysendian.h" - -#include "sha256_Y.h" -#include "compat.h" - -/* - * Encode a length len/4 vector of (uint32_t) into a length len vector of - * (unsigned char) in big-endian form. Assumes len is a multiple of 4. - */ -static void -be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len) -{ - size_t i; - - for (i = 0; i < len / 4; i++) - be32enc(dst + i * 4, src[i]); -} - -/* - * Decode a big-endian length len vector of (unsigned char) into a length - * len/4 vector of (uint32_t). Assumes len is a multiple of 4. 
- */ -static void -be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len) -{ - size_t i; - - for (i = 0; i < len / 4; i++) - dst[i] = be32dec(src + i * 4); -} - -/* Elementary functions used by SHA256 */ -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define SHR(x, n) (x >> n) -#define ROTR(x, n) ((x >> n) | (x << (32 - n))) -#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) -#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) -#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) -#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) - -/* SHA256 round function */ -#define RND(a, b, c, d, e, f, g, h, k) \ - t0 = h + S1(e) + Ch(e, f, g) + k; \ - t1 = S0(a) + Maj(a, b, c); \ - d += t0; \ - h = t0 + t1; - -/* Adjusted round function for rotating state */ -#define RNDr(S, W, i, k) \ - RND(S[(64 - i) % 8], S[(65 - i) % 8], \ - S[(66 - i) % 8], S[(67 - i) % 8], \ - S[(68 - i) % 8], S[(69 - i) % 8], \ - S[(70 - i) % 8], S[(71 - i) % 8], \ - W[i] + k) - -/* - * SHA256 block compression function. The 256-bit state is transformed via - * the 512-bit input block to produce a new state. - */ -static void -SHA256_Transform_Y(uint32_t * state, const unsigned char block[64]) -{ - uint32_t _ALIGN(128) W[64], S[8]; - uint32_t t0, t1; - int i; - - /* 1. Prepare message schedule W. */ - be32dec_vect(W, block, 64); - for (i = 16; i < 64; i++) - W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; - - /* 2. Initialize working variables. */ - memcpy(S, state, 32); - - /* 3. Mix. 
*/ - RNDr(S, W, 0, 0x428a2f98); - RNDr(S, W, 1, 0x71374491); - RNDr(S, W, 2, 0xb5c0fbcf); - RNDr(S, W, 3, 0xe9b5dba5); - RNDr(S, W, 4, 0x3956c25b); - RNDr(S, W, 5, 0x59f111f1); - RNDr(S, W, 6, 0x923f82a4); - RNDr(S, W, 7, 0xab1c5ed5); - RNDr(S, W, 8, 0xd807aa98); - RNDr(S, W, 9, 0x12835b01); - RNDr(S, W, 10, 0x243185be); - RNDr(S, W, 11, 0x550c7dc3); - RNDr(S, W, 12, 0x72be5d74); - RNDr(S, W, 13, 0x80deb1fe); - RNDr(S, W, 14, 0x9bdc06a7); - RNDr(S, W, 15, 0xc19bf174); - RNDr(S, W, 16, 0xe49b69c1); - RNDr(S, W, 17, 0xefbe4786); - RNDr(S, W, 18, 0x0fc19dc6); - RNDr(S, W, 19, 0x240ca1cc); - RNDr(S, W, 20, 0x2de92c6f); - RNDr(S, W, 21, 0x4a7484aa); - RNDr(S, W, 22, 0x5cb0a9dc); - RNDr(S, W, 23, 0x76f988da); - RNDr(S, W, 24, 0x983e5152); - RNDr(S, W, 25, 0xa831c66d); - RNDr(S, W, 26, 0xb00327c8); - RNDr(S, W, 27, 0xbf597fc7); - RNDr(S, W, 28, 0xc6e00bf3); - RNDr(S, W, 29, 0xd5a79147); - RNDr(S, W, 30, 0x06ca6351); - RNDr(S, W, 31, 0x14292967); - RNDr(S, W, 32, 0x27b70a85); - RNDr(S, W, 33, 0x2e1b2138); - RNDr(S, W, 34, 0x4d2c6dfc); - RNDr(S, W, 35, 0x53380d13); - RNDr(S, W, 36, 0x650a7354); - RNDr(S, W, 37, 0x766a0abb); - RNDr(S, W, 38, 0x81c2c92e); - RNDr(S, W, 39, 0x92722c85); - RNDr(S, W, 40, 0xa2bfe8a1); - RNDr(S, W, 41, 0xa81a664b); - RNDr(S, W, 42, 0xc24b8b70); - RNDr(S, W, 43, 0xc76c51a3); - RNDr(S, W, 44, 0xd192e819); - RNDr(S, W, 45, 0xd6990624); - RNDr(S, W, 46, 0xf40e3585); - RNDr(S, W, 47, 0x106aa070); - RNDr(S, W, 48, 0x19a4c116); - RNDr(S, W, 49, 0x1e376c08); - RNDr(S, W, 50, 0x2748774c); - RNDr(S, W, 51, 0x34b0bcb5); - RNDr(S, W, 52, 0x391c0cb3); - RNDr(S, W, 53, 0x4ed8aa4a); - RNDr(S, W, 54, 0x5b9cca4f); - RNDr(S, W, 55, 0x682e6ff3); - RNDr(S, W, 56, 0x748f82ee); - RNDr(S, W, 57, 0x78a5636f); - RNDr(S, W, 58, 0x84c87814); - RNDr(S, W, 59, 0x8cc70208); - RNDr(S, W, 60, 0x90befffa); - RNDr(S, W, 61, 0xa4506ceb); - RNDr(S, W, 62, 0xbef9a3f7); - RNDr(S, W, 63, 0xc67178f2); - - /* 4. 
Mix local working variables into global state */ - for (i = 0; i < 8; i++) - state[i] += S[i]; -#if 0 - /* Clean the stack. */ - memset(W, 0, 256); - memset(S, 0, 32); - t0 = t1 = 0; -#endif -} - -static unsigned char PAD[64] = { - 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -/* Add padding and terminating bit-count. */ -static void -SHA256_Pad_Y(SHA256_CTX_Y * ctx) -{ - unsigned char len[8]; - uint32_t r, plen; - - /* - * Convert length to a vector of bytes -- we do this now rather - * than later because the length will change after we pad. - */ - be32enc_vect(len, ctx->count, 8); - - /* Add 1--64 bytes so that the resulting length is 56 mod 64 */ - r = (ctx->count[1] >> 3) & 0x3f; - plen = (r < 56) ? (56 - r) : (120 - r); - SHA256_Update_Y(ctx, PAD, (size_t)plen); - - /* Add the terminating bit-count */ - SHA256_Update_Y(ctx, len, 8); -} - -/* SHA-256 initialization. Begins a SHA-256 operation. 
*/ -void -SHA256_Init_Y(SHA256_CTX_Y * ctx) -{ - /* Zero bits processed so far */ - ctx->count[0] = ctx->count[1] = 0; - - /* Magic initialization constants */ - ctx->state[0] = 0x6A09E667; - ctx->state[1] = 0xBB67AE85; - ctx->state[2] = 0x3C6EF372; - ctx->state[3] = 0xA54FF53A; - ctx->state[4] = 0x510E527F; - ctx->state[5] = 0x9B05688C; - ctx->state[6] = 0x1F83D9AB; - ctx->state[7] = 0x5BE0CD19; -} - -/* Add bytes into the hash */ -void -SHA256_Update_Y(SHA256_CTX_Y * ctx, const void *in, size_t len) -{ - uint32_t bitlen[2]; - uint32_t r; - const unsigned char *src = in; - - /* Number of bytes left in the buffer from previous updates */ - r = (ctx->count[1] >> 3) & 0x3f; - - /* Convert the length into a number of bits */ - bitlen[1] = ((uint32_t)len) << 3; - bitlen[0] = (uint32_t)(len >> 29); - - /* Update number of bits */ - if ((ctx->count[1] += bitlen[1]) < bitlen[1]) - ctx->count[0]++; - ctx->count[0] += bitlen[0]; - - /* Handle the case where we don't need to perform any transforms */ - if (len < 64 - r) { - memcpy(&ctx->buf[r], src, len); - return; - } - - /* Finish the current block */ - memcpy(&ctx->buf[r], src, 64 - r); - SHA256_Transform_Y(ctx->state, ctx->buf); - src += 64 - r; - len -= 64 - r; - - /* Perform complete blocks */ - while (len >= 64) { - SHA256_Transform_Y(ctx->state, src); - src += 64; - len -= 64; - } - - /* Copy left over data into buffer */ - memcpy(ctx->buf, src, len); -} - -/* - * SHA-256 finalization. Pads the input data, exports the hash value, - * and clears the context state. - */ -void -SHA256_Final_Y(unsigned char digest[32], SHA256_CTX_Y * ctx) -{ - /* Add padding */ - SHA256_Pad_Y(ctx); - - /* Write the hash */ - be32enc_vect(digest, ctx->state, 32); - - /* Clear the context state */ - memset((void *)ctx, 0, sizeof(*ctx)); -} - -/* Initialize an HMAC-SHA256 operation with the given key. 
*/ -void -HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y * ctx, const void * _K, size_t Klen) -{ - unsigned char pad[64]; - unsigned char khash[32]; - const unsigned char * K = _K; - size_t i; - - /* If Klen > 64, the key is really SHA256(K). */ - if (Klen > 64) { - SHA256_Init(&ctx->ictx); - SHA256_Update(&ctx->ictx, K, Klen); - SHA256_Final(khash, &ctx->ictx); - K = khash; - Klen = 32; - } - - /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ - SHA256_Init(&ctx->ictx); - memset(pad, 0x36, 64); - for (i = 0; i < Klen; i++) - pad[i] ^= K[i]; - SHA256_Update(&ctx->ictx, pad, 64); - - /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ - SHA256_Init(&ctx->octx); - memset(pad, 0x5c, 64); - for (i = 0; i < Klen; i++) - pad[i] ^= K[i]; - SHA256_Update(&ctx->octx, pad, 64); - - /* Clean the stack. */ - //memset(khash, 0, 32); -} - -/* Add bytes to the HMAC-SHA256 operation. */ -void -HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y * ctx, const void *in, size_t len) -{ - - /* Feed data to the inner SHA256 operation. */ - SHA256_Update(&ctx->ictx, in, len); -} - -/* Finish an HMAC-SHA256 operation. */ -void -HMAC_SHA256_Final_Y(unsigned char digest[32], HMAC_SHA256_CTX_Y * ctx) -{ - unsigned char ihash[32]; - - /* Finish the inner SHA256 operation. */ - SHA256_Final(ihash, &ctx->ictx); - - /* Feed the inner hash to the outer SHA256 operation. */ - SHA256_Update(&ctx->octx, ihash, 32); - - /* Finish the outer SHA256 operation. */ - SHA256_Final(digest, &ctx->octx); - - /* Clean the stack. */ - //memset(ihash, 0, 32); -} - -/** - * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): - * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and - * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). 
- */ -void -PBKDF2_SHA256_Y(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, - size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen) -{ - HMAC_SHA256_CTX_Y PShctx, hctx; - uint8_t _ALIGN(128) T[32]; - uint8_t _ALIGN(128) U[32]; - uint8_t ivec[4]; - size_t i, clen; - uint64_t j; - int k; - - /* Compute HMAC state after processing P and S. */ - HMAC_SHA256_Init_Y(&PShctx, passwd, passwdlen); - HMAC_SHA256_Update_Y(&PShctx, salt, saltlen); - - /* Iterate through the blocks. */ - for (i = 0; i * 32 < dkLen; i++) { - /* Generate INT(i + 1). */ - be32enc(ivec, (uint32_t)(i + 1)); - - /* Compute U_1 = PRF(P, S || INT(i)). */ - memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX_Y)); - HMAC_SHA256_Update_Y(&hctx, ivec, 4); - HMAC_SHA256_Final_Y(U, &hctx); - - /* T_i = U_1 ... */ - memcpy(T, U, 32); - - for (j = 2; j <= c; j++) { - /* Compute U_j. */ - HMAC_SHA256_Init_Y(&hctx, passwd, passwdlen); - HMAC_SHA256_Update_Y(&hctx, U, 32); - HMAC_SHA256_Final_Y(U, &hctx); - - /* ... xor U_j ... */ - for (k = 0; k < 32; k++) - T[k] ^= U[k]; - } - - /* Copy as many bytes as necessary into buf. */ - clen = dkLen - i * 32; - if (clen > 32) - clen = 32; - memcpy(&buf[i * 32], T, clen); - } - - /* Clean PShctx, since we never called _Final on it. */ - //memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX_Y)); -} diff --git a/algo/yescrypt/sha256_Y.h b/algo/yescrypt/sha256_Y.h deleted file mode 100644 index 703d059..0000000 --- a/algo/yescrypt/sha256_Y.h +++ /dev/null @@ -1,69 +0,0 @@ -/*- - * Copyright 2005,2007,2009 Colin Percival - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD: src/lib/libmd/sha256_Y.h,v 1.2 2006/01/17 15:35:56 phk Exp $ - */ - -#ifndef _SHA256_H_ -#define _SHA256_H_ - -#include -#include -#include - -typedef struct SHA256Context { - uint32_t state[8]; - uint32_t count[2]; - unsigned char buf[64]; -} SHA256_CTX_Y; - -/* -typedef struct HMAC_SHA256Context { - SHA256_CTX_Y ictx; - SHA256_CTX_Y octx; -} HMAC_SHA256_CTX_Y; -*/ - -typedef struct HMAC_SHA256Context { - SHA256_CTX ictx; - SHA256_CTX octx; -} HMAC_SHA256_CTX_Y; - -void SHA256_Init_Y(SHA256_CTX_Y *); -void SHA256_Update_Y(SHA256_CTX_Y *, const void *, size_t); -void SHA256_Final_Y(unsigned char [32], SHA256_CTX_Y *); -void HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y *, const void *, size_t); -void HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y *, const void *, size_t); -void HMAC_SHA256_Final_Y(unsigned char [32], HMAC_SHA256_CTX_Y *); - -/** - * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): - * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and 
- * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). - */ -void PBKDF2_SHA256_Y(const uint8_t *, size_t, const uint8_t *, size_t, - uint64_t, uint8_t *, size_t); - -#endif /* !_SHA256_H_ */ diff --git a/algo/yescrypt/sysendian.h b/algo/yescrypt/sysendian.h deleted file mode 100644 index 29933d4..0000000 --- a/algo/yescrypt/sysendian.h +++ /dev/null @@ -1,124 +0,0 @@ -/*- - * Copyright 2007-2009 Colin Percival - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - */ -#ifndef _SYSENDIAN_H_ -#define _SYSENDIAN_H_ - -/* If we don't have be64enc, the we have isn't usable. 
*/ -#if !HAVE_DECL_BE64ENC -#undef HAVE_SYS_ENDIAN_H -#endif - -#ifdef HAVE_SYS_ENDIAN_H - -#include - -#else - -#include - - - -static __inline uint64_t -be64dec(const void *pp) -{ - const uint8_t *p = (uint8_t const *)pp; - - return ((uint64_t)(p[7]) + ((uint64_t)(p[6]) << 8) + - ((uint64_t)(p[5]) << 16) + ((uint64_t)(p[4]) << 24) + - ((uint64_t)(p[3]) << 32) + ((uint64_t)(p[2]) << 40) + - ((uint64_t)(p[1]) << 48) + ((uint64_t)(p[0]) << 56)); -} - -static __inline void -be64enc(void *pp, uint64_t x) -{ - uint8_t * p = (uint8_t *)pp; - - p[7] = x & 0xff; - p[6] = (x >> 8) & 0xff; - p[5] = (x >> 16) & 0xff; - p[4] = (x >> 24) & 0xff; - p[3] = (x >> 32) & 0xff; - p[2] = (x >> 40) & 0xff; - p[1] = (x >> 48) & 0xff; - p[0] = (x >> 56) & 0xff; -} - - - -static __inline uint64_t -le64dec(const void *pp) -{ - const uint8_t *p = (uint8_t const *)pp; - - return ((uint64_t)(p[0]) + ((uint64_t)(p[1]) << 8) + - ((uint64_t)(p[2]) << 16) + ((uint64_t)(p[3]) << 24) + - ((uint64_t)(p[4]) << 32) + ((uint64_t)(p[5]) << 40) + - ((uint64_t)(p[6]) << 48) + ((uint64_t)(p[7]) << 56)); -} - -static __inline void -le64enc(void *pp, uint64_t x) -{ - uint8_t * p = (uint8_t *)pp; - - p[0] = x & 0xff; - p[1] = (x >> 8) & 0xff; - p[2] = (x >> 16) & 0xff; - p[3] = (x >> 24) & 0xff; - p[4] = (x >> 32) & 0xff; - p[5] = (x >> 40) & 0xff; - p[6] = (x >> 48) & 0xff; - p[7] = (x >> 56) & 0xff; -} - - -static __inline uint32_t -be32dec(const void *pp) -{ - const uint8_t *p = (uint8_t const *)pp; - - return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + - ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); -} - -static __inline void -be32enc(void *pp, uint32_t x) -{ - uint8_t * p = (uint8_t *)pp; - - p[3] = x & 0xff; - p[2] = (x >> 8) & 0xff; - p[1] = (x >> 16) & 0xff; - p[0] = (x >> 24) & 0xff; -} - -#endif /* !HAVE_SYS_ENDIAN_H */ - -#endif /* !_SYSENDIAN_H_ */ diff --git a/algo/yescrypt/yescrypt-best.c b/algo/yescrypt/yescrypt-best.c deleted file mode 100644 index 4e83621..0000000 --- 
a/algo/yescrypt/yescrypt-best.c +++ /dev/null @@ -1,5 +0,0 @@ -#ifdef __SSE2__ -#include "yescrypt-simd.c" -#else -#include "yescrypt-opt.c" -#endif diff --git a/algo/yescrypt/yescrypt-platform.h b/algo/yescrypt/yescrypt-platform.h deleted file mode 100644 index a80640c..0000000 --- a/algo/yescrypt/yescrypt-platform.h +++ /dev/null @@ -1,211 +0,0 @@ -/*- - * Copyright 2013,2014 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#ifdef MAP_ANON -#include -#endif - -#include "yescrypt.h" -#define HUGEPAGE_THRESHOLD (12 * 1024 * 1024) - -#ifdef __x86_64__ -#define HUGEPAGE_SIZE (2 * 1024 * 1024) -#else -#undef HUGEPAGE_SIZE -#endif - -static __inline uint32_t -le32dec(const void *pp) -{ - const uint8_t *p = (uint8_t const *)pp; - - return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + - ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); -} - -static __inline void -le32enc(void *pp, uint32_t x) -{ - uint8_t * p = (uint8_t *)pp; - - p[0] = x & 0xff; - p[1] = (x >> 8) & 0xff; - p[2] = (x >> 16) & 0xff; - p[3] = (x >> 24) & 0xff; -} - -static void * -alloc_region(yescrypt_region_t * region, size_t size) -{ - size_t base_size = size; - uint8_t * base, * aligned; -#ifdef MAP_ANON - int flags = -#ifdef MAP_NOCORE - MAP_NOCORE | -#endif - MAP_ANON | MAP_PRIVATE; -#if defined(MAP_HUGETLB) && defined(HUGEPAGE_SIZE) - size_t new_size = size; - const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1; - if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) { - flags |= MAP_HUGETLB; -/* - * Linux's munmap() fails on MAP_HUGETLB mappings if size is not a multiple of - * huge page size, so let's round up to huge page size here. 
- */ - new_size = size + hugepage_mask; - new_size &= ~hugepage_mask; - } - base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0); - if (base != MAP_FAILED) { - base_size = new_size; - } else - if (flags & MAP_HUGETLB) { - flags &= ~MAP_HUGETLB; - base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); - } - -#else - base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); -#endif - if (base == MAP_FAILED) - base = NULL; - aligned = base; -#elif defined(HAVE_POSIX_MEMALIGN) - if ((errno = posix_memalign((void **)&base, 64, size)) != 0) - base = NULL; - aligned = base; -#else - base = aligned = NULL; - if (size + 63 < size) { - errno = ENOMEM; - } else if ((base = malloc(size + 63)) != NULL) { - aligned = base + 63; - aligned -= (uintptr_t)aligned & 63; - } -#endif - region->base = base; - region->aligned = aligned; - region->base_size = base ? base_size : 0; - region->aligned_size = base ? size : 0; - return aligned; -} - -static __inline void -init_region(yescrypt_region_t * region) -{ - region->base = region->aligned = NULL; - region->base_size = region->aligned_size = 0; -} - -static int -free_region(yescrypt_region_t * region) -{ - if (region->base) { -#ifdef MAP_ANON - if (munmap(region->base, region->base_size)) - return -1; -#else - free(region->base); -#endif - } - init_region(region); - return 0; -} - -int yescrypt_init_shared(yescrypt_shared_t * shared, const uint8_t * param, size_t paramlen, - uint64_t N, uint32_t r, uint32_t p, yescrypt_init_shared_flags_t flags, uint32_t mask, - uint8_t * buf, size_t buflen) -{ - yescrypt_shared1_t* shared1 = &shared->shared1; - yescrypt_shared_t dummy, half1, half2; - uint8_t salt[32]; - - if (flags & YESCRYPT_SHARED_PREALLOCATED) { - if (!shared1->aligned || !shared1->aligned_size) - return -1; - } else { - init_region(shared1); - } - shared->mask1 = 1; - if (!param && !paramlen && !N && !r && !p && !buf && !buflen) - return 0; - - init_region(&dummy.shared1); - dummy.mask1 = 1; - if 
(yescrypt_kdf(&dummy, shared1, - param, paramlen, NULL, 0, N, r, p, 0, - YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, - salt, sizeof(salt))) - goto out; - - half1 = half2 = *shared; - half1.shared1.aligned_size /= 2; - half2.shared1.aligned = (void*) ((size_t)half2.shared1.aligned + half1.shared1.aligned_size); - half2.shared1.aligned_size = half1.shared1.aligned_size; - N /= 2; - - if (p > 1 && yescrypt_kdf(&half1, &half2.shared1, - param, paramlen, salt, sizeof(salt), N, r, p, 0, - YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_2, - salt, sizeof(salt))) - goto out; - - if (yescrypt_kdf(&half2, &half1.shared1, - param, paramlen, salt, sizeof(salt), N, r, p, 0, - YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, - salt, sizeof(salt))) - goto out; - - if (yescrypt_kdf(&half1, &half2.shared1, - param, paramlen, salt, sizeof(salt), N, r, p, 0, - YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, - buf, buflen)) - goto out; - - shared->mask1 = mask; - - return 0; - -out: - if (!(flags & YESCRYPT_SHARED_PREALLOCATED)) - free_region(shared1); - return -1; -} - -int -yescrypt_free_shared(yescrypt_shared_t * shared) -{ - return free_region(&shared->shared1); -} - -int -yescrypt_init_local(yescrypt_local_t * local) -{ - init_region(local); - return 0; -} - -int -yescrypt_free_local(yescrypt_local_t * local) -{ - return free_region(local); -} diff --git a/algo/yescrypt/yescrypt-simd.c b/algo/yescrypt/yescrypt-simd.c deleted file mode 100644 index e70c37e..0000000 --- a/algo/yescrypt/yescrypt-simd.c +++ /dev/null @@ -1,1380 +0,0 @@ -/*- - * Copyright 2009 Colin Percival - * Copyright 2012-2014 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - */ - -/* - * On 64-bit, enabling SSE4.1 helps our pwxform code indirectly, via avoiding - * gcc bug 54349 (fixed for gcc 4.9+). On 32-bit, it's of direct help. AVX - * and XOP are of further help either way. 
- */ -/* -#ifndef __SSE4_1__ -#warning "Consider enabling SSE4.1, AVX, or XOP in the C compiler for significantly better performance" -#endif -*/ - -#include -#ifdef __XOP__ -#include -#endif - -#include -#include -#include -#include -#include "sha256_Y.h" -#include "sysendian.h" - -#include "yescrypt.h" -#include "yescrypt-platform.h" - -#include "compat.h" - -#if __STDC_VERSION__ >= 199901L -/* have restrict */ -#elif defined(__GNUC__) -#define restrict __restrict -#else -#define restrict -#endif - -#define PREFETCH(x, hint) _mm_prefetch((const char *)(x), (hint)); -#define PREFETCH_OUT(x, hint) /* disabled */ - -#ifdef __XOP__ -#define ARX(out, in1, in2, s) \ - out = _mm_xor_si128(out, _mm_roti_epi32(_mm_add_epi32(in1, in2), s)); -#else -#define ARX(out, in1, in2, s) \ - { \ - __m128i T = _mm_add_epi32(in1, in2); \ - out = _mm_xor_si128(out, _mm_slli_epi32(T, s)); \ - out = _mm_xor_si128(out, _mm_srli_epi32(T, 32-s)); \ - } -#endif - -#define SALSA20_2ROUNDS \ - /* Operate on "columns" */ \ - ARX(X1, X0, X3, 7) \ - ARX(X2, X1, X0, 9) \ - ARX(X3, X2, X1, 13) \ - ARX(X0, X3, X2, 18) \ -\ - /* Rearrange data */ \ - X1 = _mm_shuffle_epi32(X1, 0x93); \ - X2 = _mm_shuffle_epi32(X2, 0x4E); \ - X3 = _mm_shuffle_epi32(X3, 0x39); \ -\ - /* Operate on "rows" */ \ - ARX(X3, X0, X1, 7) \ - ARX(X2, X3, X0, 9) \ - ARX(X1, X2, X3, 13) \ - ARX(X0, X1, X2, 18) \ -\ - /* Rearrange data */ \ - X1 = _mm_shuffle_epi32(X1, 0x39); \ - X2 = _mm_shuffle_epi32(X2, 0x4E); \ - X3 = _mm_shuffle_epi32(X3, 0x93); - -/** - * Apply the salsa20/8 core to the block provided in (X0 ... X3). 
- */ -#define SALSA20_8_BASE(maybe_decl, out) \ - { \ - maybe_decl Y0 = X0; \ - maybe_decl Y1 = X1; \ - maybe_decl Y2 = X2; \ - maybe_decl Y3 = X3; \ - SALSA20_2ROUNDS \ - SALSA20_2ROUNDS \ - SALSA20_2ROUNDS \ - SALSA20_2ROUNDS \ - (out)[0] = X0 = _mm_add_epi32(X0, Y0); \ - (out)[1] = X1 = _mm_add_epi32(X1, Y1); \ - (out)[2] = X2 = _mm_add_epi32(X2, Y2); \ - (out)[3] = X3 = _mm_add_epi32(X3, Y3); \ - } -#define SALSA20_8(out) \ - SALSA20_8_BASE(__m128i, out) - -/** - * Apply the salsa20/8 core to the block provided in (X0 ... X3) ^ (Z0 ... Z3). - */ -#define SALSA20_8_XOR_ANY(maybe_decl, Z0, Z1, Z2, Z3, out) \ - X0 = _mm_xor_si128(X0, Z0); \ - X1 = _mm_xor_si128(X1, Z1); \ - X2 = _mm_xor_si128(X2, Z2); \ - X3 = _mm_xor_si128(X3, Z3); \ - SALSA20_8_BASE(maybe_decl, out) - -#define SALSA20_8_XOR_MEM(in, out) \ - SALSA20_8_XOR_ANY(__m128i, (in)[0], (in)[1], (in)[2], (in)[3], out) - -#define SALSA20_8_XOR_REG(out) \ - SALSA20_8_XOR_ANY(/* empty */, Y0, Y1, Y2, Y3, out) - -typedef union { - uint32_t w[16]; - __m128i q[4]; -} salsa20_blk_t; - -/** - * blockmix_salsa8(Bin, Bout, r): - * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r - * bytes in length; the output Bout must also be the same size. - */ -static inline void -blockmix_salsa8(const salsa20_blk_t *restrict Bin, - salsa20_blk_t *restrict Bout, size_t r) -{ - __m128i X0, X1, X2, X3; - size_t i; - - r--; - PREFETCH(&Bin[r * 2 + 1], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin[i * 2], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - PREFETCH(&Bin[i * 2 + 1], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) - } - PREFETCH(&Bin[r * 2], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0) - - /* 1: X <-- B_{2r - 1} */ - X0 = Bin[r * 2 + 1].q[0]; - X1 = Bin[r * 2 + 1].q[1]; - X2 = Bin[r * 2 + 1].q[2]; - X3 = Bin[r * 2 + 1].q[3]; - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... 
Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - SALSA20_8_XOR_MEM(Bin[0].q, Bout[0].q) - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < r;) { - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - SALSA20_8_XOR_MEM(Bin[i * 2 + 1].q, Bout[r + 1 + i].q) - - i++; - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - SALSA20_8_XOR_MEM(Bin[i * 2].q, Bout[i].q) - } - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - SALSA20_8_XOR_MEM(Bin[r * 2 + 1].q, Bout[r * 2 + 1].q) -} - -/* - * (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs - * starting with Sandy Bridge. Additionally, PSHUFD uses separate source and - * destination registers, whereas the shifts would require an extra move - * instruction for our code when building without AVX. Unfortunately, PSHUFD - * is much slower on Conroe (4 cycles latency vs. 1 cycle latency for PSRLQ) - * and somewhat slower on some non-Intel CPUs (luckily not including AMD - * Bulldozer and Piledriver). Since for many other CPUs using (V)PSHUFD is a - * win in terms of throughput or/and not needing a move instruction, we - * currently use it despite of the higher latency on some older CPUs. As an - * alternative, the #if below may be patched to only enable use of (V)PSHUFD - * when building with SSE4.1 or newer, which is not available on older CPUs - * where this instruction has higher latency. 
- */ -#if 1 -#define HI32(X) \ - _mm_shuffle_epi32((X), _MM_SHUFFLE(2,3,0,1)) -#elif 0 -#define HI32(X) \ - _mm_srli_si128((X), 4) -#else -#define HI32(X) \ - _mm_srli_epi64((X), 32) -#endif - -#if defined(__x86_64__) && (defined(__ICC) || defined(__llvm__)) -/* Intel's name, also supported by recent gcc */ -#define EXTRACT64(X) _mm_cvtsi128_si64(X) -#elif defined(__x86_64__) && !defined(_MSC_VER) && !defined(__OPEN64__) -/* gcc got the 'x' name earlier than non-'x', MSVC and Open64 had bugs */ -#define EXTRACT64(X) _mm_cvtsi128_si64x(X) -#elif defined(__x86_64__) && defined(__SSE4_1__) -/* No known bugs for this intrinsic */ -#include -#define EXTRACT64(X) _mm_extract_epi64((X), 0) -#elif defined(__SSE4_1__) -/* 32-bit */ -#include -#if 0 -/* This is currently unused by the code below, which instead uses these two - * intrinsics explicitly when (!defined(__x86_64__) && defined(__SSE4_1__)) */ -#define EXTRACT64(X) \ - ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ - ((uint64_t)(uint32_t)_mm_extract_epi32((X), 1) << 32)) -#endif -#else -/* 32-bit or compilers with known past bugs in _mm_cvtsi128_si64*() */ -#define EXTRACT64(X) \ - ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ - ((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32)) -#endif - -/* This is tunable */ -#define S_BITS 8 - -/* Not tunable in this implementation, hard-coded in a few places */ -#define S_SIMD 2 -#define S_P 4 - -/* Number of S-boxes. Not tunable by design, hard-coded in a few places. */ -#define S_N 2 - -/* Derived values. Not tunable except via S_BITS above. 
*/ -#define S_SIZE1 (1 << S_BITS) -#define S_MASK ((S_SIZE1 - 1) * S_SIMD * 8) -#define S_MASK2 (((uint64_t)S_MASK << 32) | S_MASK) -#define S_SIZE_ALL (S_N * S_SIZE1 * S_SIMD * 8) - -#if !defined(__x86_64__) && defined(__SSE4_1__) -/* 32-bit with SSE4.1 */ -#define PWXFORM_X_T __m128i -#define PWXFORM_SIMD(X, x, s0, s1) \ - x = _mm_and_si128(X, _mm_set1_epi64x(S_MASK2)); \ - s0 = *(const __m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x)); \ - s1 = *(const __m128i *)(S1 + (uint32_t)_mm_extract_epi32(x, 1)); \ - X = _mm_mul_epu32(HI32(X), X); \ - X = _mm_add_epi64(X, s0); \ - X = _mm_xor_si128(X, s1); -#else -/* 64-bit, or 32-bit without SSE4.1 */ -#define PWXFORM_X_T uint64_t -#define PWXFORM_SIMD(X, x, s0, s1) \ - x = EXTRACT64(X) & S_MASK2; \ - s0 = *(const __m128i *)(S0 + (uint32_t)x); \ - s1 = *(const __m128i *)(S1 + (x >> 32)); \ - X = _mm_mul_epu32(HI32(X), X); \ - X = _mm_add_epi64(X, s0); \ - X = _mm_xor_si128(X, s1); -#endif - -#define PWXFORM_ROUND \ - PWXFORM_SIMD(X0, x0, s00, s01) \ - PWXFORM_SIMD(X1, x1, s10, s11) \ - PWXFORM_SIMD(X2, x2, s20, s21) \ - PWXFORM_SIMD(X3, x3, s30, s31) - -#define PWXFORM \ - { \ - PWXFORM_X_T x0, x1, x2, x3; \ - __m128i s00, s01, s10, s11, s20, s21, s30, s31; \ - PWXFORM_ROUND PWXFORM_ROUND \ - PWXFORM_ROUND PWXFORM_ROUND \ - PWXFORM_ROUND PWXFORM_ROUND \ - } - -#define XOR4(in) \ - X0 = _mm_xor_si128(X0, (in)[0]); \ - X1 = _mm_xor_si128(X1, (in)[1]); \ - X2 = _mm_xor_si128(X2, (in)[2]); \ - X3 = _mm_xor_si128(X3, (in)[3]); - -#define XOUT(out) \ - (out)[0] = X0; \ - (out)[1] = X1; \ - (out)[2] = X2; \ - (out)[3] = X3; - -/** - * blockmix_pwxform(Bin, Bout, r, S): - * Compute Bout = BlockMix_pwxform{salsa20/8, r, S}(Bin). The input Bin must - * be 128r bytes in length; the output Bout must also be the same size. 
- */ -static void -blockmix(const salsa20_blk_t *restrict Bin, salsa20_blk_t *restrict Bout, - size_t r, const __m128i *restrict S) -{ - const uint8_t * S0, * S1; - __m128i X0, X1, X2, X3; - size_t i; - - if (!S) { - blockmix_salsa8(Bin, Bout, r); - return; - } - - S0 = (const uint8_t *)S; - S1 = (const uint8_t *)S + S_SIZE_ALL / 2; - - /* Convert 128-byte blocks to 64-byte blocks */ - r *= 2; - - r--; - PREFETCH(&Bin[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - } - PREFETCH_OUT(&Bout[r], _MM_HINT_T0) - - /* X <-- B_{r1 - 1} */ - X0 = Bin[r].q[0]; - X1 = Bin[r].q[1]; - X2 = Bin[r].q[2]; - X3 = Bin[r].q[3]; - - /* for i = 0 to r1 - 1 do */ - for (i = 0; i < r; i++) { - /* X <-- H'(X \xor B_i) */ - XOR4(Bin[i].q) - PWXFORM - /* B'_i <-- X */ - XOUT(Bout[i].q) - } - - /* Last iteration of the loop above */ - XOR4(Bin[i].q) - PWXFORM - - /* B'_i <-- H(B'_i) */ - SALSA20_8(Bout[i].q) -} - -#define XOR4_2(in1, in2) \ - X0 = _mm_xor_si128((in1)[0], (in2)[0]); \ - X1 = _mm_xor_si128((in1)[1], (in2)[1]); \ - X2 = _mm_xor_si128((in1)[2], (in2)[2]); \ - X3 = _mm_xor_si128((in1)[3], (in2)[3]); - -static inline uint32_t -blockmix_salsa8_xor(const salsa20_blk_t *restrict Bin1, - const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, - size_t r, int Bin2_in_ROM) -{ - __m128i X0, X1, X2, X3; - size_t i; - - r--; - if (Bin2_in_ROM) { - PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_NTA) - PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i * 2], _MM_HINT_NTA) - PREFETCH(&Bin1[i * 2], _MM_HINT_T0) - PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_NTA) - PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) - } - PREFETCH(&Bin2[r * 2], _MM_HINT_T0) - } else { - PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_T0) - PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i * 2], _MM_HINT_T0) - 
PREFETCH(&Bin1[i * 2], _MM_HINT_T0) - PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_T0) - PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) - } - PREFETCH(&Bin2[r * 2], _MM_HINT_T0) - } - PREFETCH(&Bin1[r * 2], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0) - - /* 1: X <-- B_{2r - 1} */ - XOR4_2(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[0].q) - SALSA20_8_XOR_MEM(Bin2[0].q, Bout[0].q) - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < r;) { - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[i * 2 + 1].q) - SALSA20_8_XOR_MEM(Bin2[i * 2 + 1].q, Bout[r + 1 + i].q) - - i++; - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[i * 2].q) - SALSA20_8_XOR_MEM(Bin2[i * 2].q, Bout[i].q) - } - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... 
Y_{2r-1}) */ - XOR4(Bin1[r * 2 + 1].q) - SALSA20_8_XOR_MEM(Bin2[r * 2 + 1].q, Bout[r * 2 + 1].q) - - return _mm_cvtsi128_si32(X0); -} - -static uint32_t -blockmix_xor(const salsa20_blk_t *restrict Bin1, - const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, - size_t r, int Bin2_in_ROM, const __m128i *restrict S) -{ - const uint8_t * S0, * S1; - __m128i X0, X1, X2, X3; - size_t i; - - if (!S) - return blockmix_salsa8_xor(Bin1, Bin2, Bout, r, Bin2_in_ROM); - - S0 = (const uint8_t *)S; - S1 = (const uint8_t *)S + S_SIZE_ALL / 2; - - /* Convert 128-byte blocks to 64-byte blocks */ - r *= 2; - - r--; - if (Bin2_in_ROM) { - PREFETCH(&Bin2[r], _MM_HINT_NTA) - PREFETCH(&Bin1[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i], _MM_HINT_NTA) - PREFETCH(&Bin1[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - } - } else { - PREFETCH(&Bin2[r], _MM_HINT_T0) - PREFETCH(&Bin1[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i], _MM_HINT_T0) - PREFETCH(&Bin1[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - } - } - PREFETCH_OUT(&Bout[r], _MM_HINT_T0); - - /* X <-- B_{r1 - 1} */ - XOR4_2(Bin1[r].q, Bin2[r].q) - - /* for i = 0 to r1 - 1 do */ - for (i = 0; i < r; i++) { - /* X <-- H'(X \xor B_i) */ - XOR4(Bin1[i].q) - XOR4(Bin2[i].q) - PWXFORM - /* B'_i <-- X */ - XOUT(Bout[i].q) - } - - /* Last iteration of the loop above */ - XOR4(Bin1[i].q) - XOR4(Bin2[i].q) - PWXFORM - - /* B'_i <-- H(B'_i) */ - SALSA20_8(Bout[i].q) - - return _mm_cvtsi128_si32(X0); -} - -#undef XOR4 -#define XOR4(in, out) \ - (out)[0] = Y0 = _mm_xor_si128((in)[0], (out)[0]); \ - (out)[1] = Y1 = _mm_xor_si128((in)[1], (out)[1]); \ - (out)[2] = Y2 = _mm_xor_si128((in)[2], (out)[2]); \ - (out)[3] = Y3 = _mm_xor_si128((in)[3], (out)[3]); - -static inline uint32_t -blockmix_salsa8_xor_save(const salsa20_blk_t *restrict Bin1, - salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, - size_t r) -{ - __m128i X0, X1, X2, X3, Y0, Y1, Y2, Y3; - size_t i; - - r--; - 
PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_T0) - PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i * 2], _MM_HINT_T0) - PREFETCH(&Bin1[i * 2], _MM_HINT_T0) - PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_T0) - PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) - } - PREFETCH(&Bin2[r * 2], _MM_HINT_T0) - PREFETCH(&Bin1[r * 2], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0) - - /* 1: X <-- B_{2r - 1} */ - XOR4_2(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[0].q, Bin2[0].q) - SALSA20_8_XOR_REG(Bout[0].q) - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < r;) { - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[i * 2 + 1].q, Bin2[i * 2 + 1].q) - SALSA20_8_XOR_REG(Bout[r + 1 + i].q) - - i++; - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[i * 2].q, Bin2[i * 2].q) - SALSA20_8_XOR_REG(Bout[i].q) - } - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... 
Y_{2r-1}) */ - XOR4(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) - SALSA20_8_XOR_REG(Bout[r * 2 + 1].q) - - return _mm_cvtsi128_si32(X0); -} - -#define XOR4_Y \ - X0 = _mm_xor_si128(X0, Y0); \ - X1 = _mm_xor_si128(X1, Y1); \ - X2 = _mm_xor_si128(X2, Y2); \ - X3 = _mm_xor_si128(X3, Y3); - -static uint32_t -blockmix_xor_save(const salsa20_blk_t *restrict Bin1, - salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, - size_t r, const __m128i *restrict S) -{ - const uint8_t * S0, * S1; - __m128i X0, X1, X2, X3, Y0, Y1, Y2, Y3; - size_t i; - - if (!S) - return blockmix_salsa8_xor_save(Bin1, Bin2, Bout, r); - - S0 = (const uint8_t *)S; - S1 = (const uint8_t *)S + S_SIZE_ALL / 2; - - /* Convert 128-byte blocks to 64-byte blocks */ - r *= 2; - - r--; - PREFETCH(&Bin2[r], _MM_HINT_T0) - PREFETCH(&Bin1[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i], _MM_HINT_T0) - PREFETCH(&Bin1[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - } - PREFETCH_OUT(&Bout[r], _MM_HINT_T0); - - /* X <-- B_{r1 - 1} */ - XOR4_2(Bin1[r].q, Bin2[r].q) - - /* for i = 0 to r1 - 1 do */ - for (i = 0; i < r; i++) { - XOR4(Bin1[i].q, Bin2[i].q) - /* X <-- H'(X \xor B_i) */ - XOR4_Y - PWXFORM - /* B'_i <-- X */ - XOUT(Bout[i].q) - } - - /* Last iteration of the loop above */ - XOR4(Bin1[i].q, Bin2[i].q) - XOR4_Y - PWXFORM - - /* B'_i <-- H(B'_i) */ - SALSA20_8(Bout[i].q) - - return _mm_cvtsi128_si32(X0); -} - -#undef ARX -#undef SALSA20_2ROUNDS -#undef SALSA20_8 -#undef SALSA20_8_XOR_ANY -#undef SALSA20_8_XOR_MEM -#undef SALSA20_8_XOR_REG -#undef PWXFORM_SIMD_1 -#undef PWXFORM_SIMD_2 -#undef PWXFORM_ROUND -#undef PWXFORM -#undef OUT -#undef XOR4 -#undef XOR4_2 -#undef XOR4_Y - -/** - * integerify(B, r): - * Return the result of parsing B_{2r-1} as a little-endian integer. - */ -static inline uint32_t -integerify(const salsa20_blk_t * B, size_t r) -{ - return B[2 * r - 1].w[0]; -} - -/** - * smix1(B, r, N, flags, V, NROM, shared, XY, S): - * Compute first loop of B = SMix_r(B, N). 
The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage XY must be 128r bytes in length. The value N must be even and no - * smaller than 2. The array V must be aligned to a multiple of 64 bytes, and - * arrays B and XY to a multiple of at least 16 bytes (aligning them to 64 - * bytes as well saves cache lines, but might result in cache bank conflicts). - */ -static void -smix1(uint8_t * B, size_t r, uint32_t N, yescrypt_flags_t flags, - salsa20_blk_t * V, uint32_t NROM, const yescrypt_shared_t * shared, - salsa20_blk_t * XY, void * S) -{ - const salsa20_blk_t * VROM = shared->shared1.aligned; - uint32_t VROM_mask = shared->mask1; - size_t s = 2 * r; - salsa20_blk_t * X = V, * Y; - uint32_t i, j; - size_t k; - - /* 1: X <-- B */ - /* 3: V_i <-- X */ - for (k = 0; k < 2 * r; k++) { - for (i = 0; i < 16; i++) { - X[k].w[i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); - } - } - - if (NROM && (VROM_mask & 1)) { - uint32_t n; - salsa20_blk_t * V_n; - const salsa20_blk_t * V_j; - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[s]; - blockmix(X, Y, r, S); - - X = &V[2 * s]; - if ((1 & VROM_mask) == 1) { - /* j <-- Integerify(X) mod NROM */ - j = integerify(Y, r) & (NROM - 1); - V_j = &VROM[j * s]; - - /* X <-- H(X \xor VROM_j) */ - j = blockmix_xor(Y, V_j, X, r, 1, S); - } else { - /* X <-- H(X) */ - blockmix(Y, X, r, S); - j = integerify(X, r); - } - - for (n = 2; n < N; n <<= 1) { - uint32_t m = (n < N / 2) ? 
n : (N - 1 - n); - - V_n = &V[n * s]; - - /* 2: for i = 0 to N - 1 do */ - for (i = 1; i < m; i += 2) { - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += i - 1; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V_n[i * s]; - j = blockmix_xor(X, V_j, Y, r, 0, S); - - if (((n + i) & VROM_mask) == 1) { - /* j <-- Integerify(X) mod NROM */ - j &= NROM - 1; - V_j = &VROM[j * s]; - } else { - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += i; - V_j = &V[j * s]; - } - - /* X <-- H(X \xor VROM_j) */ - X = &V_n[(i + 1) * s]; - j = blockmix_xor(Y, V_j, X, r, 1, S); - } - } - - n >>= 1; - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += N - 2 - n; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[(N - 1) * s]; - j = blockmix_xor(X, V_j, Y, r, 0, S); - - if (((N - 1) & VROM_mask) == 1) { - /* j <-- Integerify(X) mod NROM */ - j &= NROM - 1; - V_j = &VROM[j * s]; - } else { - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += N - 1 - n; - V_j = &V[j * s]; - } - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - X = XY; - blockmix_xor(Y, V_j, X, r, 1, S); - } else if (flags & YESCRYPT_RW) { - uint32_t n; - salsa20_blk_t * V_n, * V_j; - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[s]; - blockmix(X, Y, r, S); - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - X = &V[2 * s]; - blockmix(Y, X, r, S); - j = integerify(X, r); - - for (n = 2; n < N; n <<= 1) { - uint32_t m = (n < N / 2) ? 
n : (N - 1 - n); - - V_n = &V[n * s]; - - /* 2: for i = 0 to N - 1 do */ - for (i = 1; i < m; i += 2) { - Y = &V_n[i * s]; - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += i - 1; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - j = blockmix_xor(X, V_j, Y, r, 0, S); - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += i; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - X = &V_n[(i + 1) * s]; - j = blockmix_xor(Y, V_j, X, r, 0, S); - } - } - - n >>= 1; - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += N - 2 - n; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[(N - 1) * s]; - j = blockmix_xor(X, V_j, Y, r, 0, S); - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += N - 1 - n; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - X = XY; - blockmix_xor(Y, V_j, X, r, 0, S); - } else { - /* 2: for i = 0 to N - 1 do */ - for (i = 1; i < N - 1; i += 2) { - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[i * s]; - blockmix(X, Y, r, S); - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - X = &V[(i + 1) * s]; - blockmix(Y, X, r, S); - } - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[i * s]; - blockmix(X, Y, r, S); - - /* 4: X <-- H(X) */ - X = XY; - blockmix(Y, X, r, S); - } - - /* B' <-- X */ - for (k = 0; k < 2 * r; k++) { - for (i = 0; i < 16; i++) { - le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X[k].w[i]); - } - } -} - -/** - * smix2(B, r, N, Nloop, flags, V, NROM, shared, XY, S): - * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage XY must be 256r bytes in length. The value N must be a power of 2 - * greater than 1. The value Nloop must be even. 
The array V must be aligned - * to a multiple of 64 bytes, and arrays B and XY to a multiple of at least 16 - * bytes (aligning them to 64 bytes as well saves cache lines, but might result - * in cache bank conflicts). - */ -static void -smix2(uint8_t * B, size_t r, uint32_t N, uint64_t Nloop, - yescrypt_flags_t flags, salsa20_blk_t * V, uint32_t NROM, - const yescrypt_shared_t * shared, salsa20_blk_t * XY, void * S) -{ - const salsa20_blk_t * VROM = shared->shared1.aligned; - uint32_t VROM_mask = shared->mask1; - size_t s = 2 * r; - salsa20_blk_t * X = XY, * Y = &XY[s]; - uint64_t i; - uint32_t j; - size_t k; - - if (Nloop == 0) - return; - - /* X <-- B' */ - /* 3: V_i <-- X */ - for (k = 0; k < 2 * r; k++) { - for (i = 0; i < 16; i++) { - X[k].w[i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); - } - } - - i = Nloop / 2; - - /* 7: j <-- Integerify(X) mod N */ - j = integerify(X, r) & (N - 1); - -/* - * Normally, NROM implies YESCRYPT_RW, but we check for these separately - * because YESCRYPT_PARALLEL_SMIX resets YESCRYPT_RW for the smix2() calls - * operating on the entire V. 
- */ - if (NROM && (flags & YESCRYPT_RW)) { - /* 6: for i = 0 to N - 1 do */ - for (i = 0; i < Nloop; i += 2) { - salsa20_blk_t * V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* j <-- Integerify(X) mod NROM */ - j = blockmix_xor_save(X, V_j, Y, r, S); - - if (((i + 1) & VROM_mask) == 1) { - const salsa20_blk_t * VROM_j; - - j &= NROM - 1; - VROM_j = &VROM[j * s]; - - /* X <-- H(X \xor VROM_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor(Y, VROM_j, X, r, 1, S); - } else { - j &= N - 1; - V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* j <-- Integerify(X) mod NROM */ - j = blockmix_xor_save(Y, V_j, X, r, S); - } - j &= N - 1; - V_j = &V[j * s]; - } - } else if (NROM) { - /* 6: for i = 0 to N - 1 do */ - for (i = 0; i < Nloop; i += 2) { - const salsa20_blk_t * V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* j <-- Integerify(X) mod NROM */ - j = blockmix_xor(X, V_j, Y, r, 0, S); - - if (((i + 1) & VROM_mask) == 1) { - j &= NROM - 1; - V_j = &VROM[j * s]; - } else { - j &= N - 1; - V_j = &V[j * s]; - } - - /* X <-- H(X \xor VROM_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor(Y, V_j, X, r, 1, S); - j &= N - 1; - V_j = &V[j * s]; - } - } else if (flags & YESCRYPT_RW) { - /* 6: for i = 0 to N - 1 do */ - do { - salsa20_blk_t * V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor_save(X, V_j, Y, r, S); - j &= N - 1; - V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor_save(Y, V_j, X, r, S); - j &= N - 1; - } while (--i); - } else { - /* 6: for i = 0 to N - 1 do */ - do { - const salsa20_blk_t * V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor(X, V_j, Y, r, 0, S); - j &= N - 1; - V_j = &V[j * s]; - - /* 8: X <-- 
H(X \xor V_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor(Y, V_j, X, r, 0, S); - j &= N - 1; - } while (--i); - } - - /* 10: B' <-- X */ - for (k = 0; k < 2 * r; k++) { - for (i = 0; i < 16; i++) { - le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X[k].w[i]); - } - } -} - -/** - * p2floor(x): - * Largest power of 2 not greater than argument. - */ -static uint64_t -p2floor(uint64_t x) -{ - uint64_t y; - while ((y = x & (x - 1))) - x = y; - return x; -} - -/** - * smix(B, r, N, p, t, flags, V, NROM, shared, XY, S): - * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the - * temporary storage V must be 128rN bytes in length; the temporary storage XY - * must be 256r or 256rp bytes in length (the larger size is required with - * OpenMP-enabled builds). The value N must be a power of 2 greater than 1. - * The array V must be aligned to a multiple of 64 bytes, and arrays B and - * XY to a multiple of at least 16 bytes (aligning them to 64 bytes as well - * saves cache lines and helps avoid false sharing in OpenMP-enabled builds - * when p > 1, but it might also result in cache bank conflicts). 
- */ -static void -smix(uint8_t * B, size_t r, uint32_t N, uint32_t p, uint32_t t, - yescrypt_flags_t flags, - salsa20_blk_t * V, uint32_t NROM, const yescrypt_shared_t * shared, - salsa20_blk_t * XY, void * S) -{ - size_t s = 2 * r; - uint32_t Nchunk = N / p; - uint64_t Nloop_all, Nloop_rw; - uint32_t i; - - Nloop_all = Nchunk; - if (flags & YESCRYPT_RW) { - if (t <= 1) { - if (t) - Nloop_all *= 2; /* 2/3 */ - Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ - } else { - Nloop_all *= t - 1; - } - } else if (t) { - if (t == 1) - Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ - Nloop_all *= t; - } - - Nloop_rw = 0; - if (flags & __YESCRYPT_INIT_SHARED) - Nloop_rw = Nloop_all; - else if (flags & YESCRYPT_RW) - Nloop_rw = Nloop_all / p; - - Nchunk &= ~(uint32_t)1; /* round down to even */ - Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ - Nloop_rw &= ~(uint64_t)1; /* round down to even */ - -#ifdef _OPENMP -#pragma omp parallel if (p > 1) default(none) private(i) shared(B, r, N, p, flags, V, NROM, shared, XY, S, s, Nchunk, Nloop_all, Nloop_rw) - { -#pragma omp for -#endif - for (i = 0; i < p; i++) { - uint32_t Vchunk = i * Nchunk; - uint8_t * Bp = &B[128 * r * i]; - salsa20_blk_t * Vp = &V[Vchunk * s]; -#ifdef _OPENMP - salsa20_blk_t * XYp = &XY[i * (2 * s)]; -#else - salsa20_blk_t * XYp = XY; -#endif - uint32_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); - void * Sp = S ? ((uint8_t *)S + i * S_SIZE_ALL) : S; - if (Sp) - smix1(Bp, 1, S_SIZE_ALL / 128, - flags & ~YESCRYPT_PWXFORM, - Sp, NROM, shared, XYp, NULL); - if (!(flags & __YESCRYPT_INIT_SHARED_2)) - smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp); - smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, - NROM, shared, XYp, Sp); - } - - if (Nloop_all > Nloop_rw) { -#ifdef _OPENMP -#pragma omp for -#endif - for (i = 0; i < p; i++) { - uint8_t * Bp = &B[128 * r * i]; -#ifdef _OPENMP - salsa20_blk_t * XYp = &XY[i * (2 * s)]; -#else - salsa20_blk_t * XYp = XY; -#endif - void * Sp = S ? 
((uint8_t *)S + i * S_SIZE_ALL) : S; - smix2(Bp, r, N, Nloop_all - Nloop_rw, - flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp); - } - } -#ifdef _OPENMP - } -#endif -} - -/** - * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, - * N, r, p, t, flags, buf, buflen): - * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, - * p, buflen), or a revision of scrypt as requested by flags and shared, and - * write the result into buf. The parameters r, p, and buflen must satisfy - * r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power - * of 2 greater than 1. (This optimized implementation currently additionally - * limits N to the range from 8 to 2^31, but other implementation might not.) - * - * t controls computation time while not affecting peak memory usage. shared - * and flags may request special modes as described in yescrypt.h. local is - * the thread-local data structure, allowing to preserve and reuse a memory - * allocation across calls, thereby reducing its overhead. - * - * Return 0 on success; or -1 on error. - */ -int -yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, - const uint8_t * passwd, size_t passwdlen, - const uint8_t * salt, size_t saltlen, - uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, - uint8_t * buf, size_t buflen) -{ - uint8_t _ALIGN(128) sha256[32]; - yescrypt_region_t tmp; - uint64_t NROM; - size_t B_size, V_size, XY_size, need; - uint8_t * B, * S; - salsa20_blk_t * V, * XY; - - /* - * YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose, - * so don't let it have side-effects. Without this adjustment, it'd - * enable the SHA-256 password pre-hashing and output post-hashing, - * because any deviation from classic scrypt implies those. 
- */ - if (p == 1) - flags &= ~YESCRYPT_PARALLEL_SMIX; - - /* Sanity-check parameters */ - if (flags & ~YESCRYPT_KNOWN_FLAGS) { - errno = EINVAL; - return -1; - } -#if SIZE_MAX > UINT32_MAX - if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { - errno = EFBIG; - return -1; - } -#endif - if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) { - errno = EFBIG; - return -1; - } - if (N > UINT32_MAX) { - errno = EFBIG; - return -1; - } - if (((N & (N - 1)) != 0) || (N <= 7) || (r < 1) || (p < 1)) { - errno = EINVAL; - return -1; - } - if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 7)) { - errno = EINVAL; - return -1; - } - if ((r > SIZE_MAX / 256 / p) || - (N > SIZE_MAX / 128 / r)) { - errno = ENOMEM; - return -1; - } -#ifdef _OPENMP - if (!(flags & YESCRYPT_PARALLEL_SMIX) && - (N > SIZE_MAX / 128 / (r * p))) { - errno = ENOMEM; - return -1; - } -#endif - if ((flags & YESCRYPT_PWXFORM) && -#ifndef _OPENMP - (flags & YESCRYPT_PARALLEL_SMIX) && -#endif - p > SIZE_MAX / S_SIZE_ALL) { - errno = ENOMEM; - return -1; - } - - NROM = 0; - if (shared->shared1.aligned) { - NROM = shared->shared1.aligned_size / ((size_t)128 * r); - if (NROM > UINT32_MAX) { - errno = EFBIG; - return -1; - } - if (((NROM & (NROM - 1)) != 0) || (NROM <= 7) || - !(flags & YESCRYPT_RW)) { - errno = EINVAL; - return -1; - } - } - - /* Allocate memory */ - V = NULL; - V_size = (size_t)128 * r * N; -#ifdef _OPENMP - if (!(flags & YESCRYPT_PARALLEL_SMIX)) - V_size *= p; -#endif - need = V_size; - if (flags & __YESCRYPT_INIT_SHARED) { - if (local->aligned_size < need) { - if (local->base || local->aligned || - local->base_size || local->aligned_size) { - errno = EINVAL; - return -1; - } - if (!alloc_region(local, need)) - return -1; - } - V = (salsa20_blk_t *)local->aligned; - need = 0; - } - B_size = (size_t)128 * r * p; - need += B_size; - if (need < B_size) { - errno = ENOMEM; - return -1; - } - XY_size = (size_t)256 * r; -#ifdef _OPENMP - XY_size *= p; -#endif - need += XY_size; - if (need < XY_size) { - 
errno = ENOMEM; - return -1; - } - if (flags & YESCRYPT_PWXFORM) { - size_t S_size = S_SIZE_ALL; -#ifdef _OPENMP - S_size *= p; -#else - if (flags & YESCRYPT_PARALLEL_SMIX) - S_size *= p; -#endif - need += S_size; - if (need < S_size) { - errno = ENOMEM; - return -1; - } - } - if (flags & __YESCRYPT_INIT_SHARED) { - if (!alloc_region(&tmp, need)) - return -1; - B = (uint8_t *)tmp.aligned; - XY = (salsa20_blk_t *)((uint8_t *)B + B_size); - } else { - init_region(&tmp); - if (local->aligned_size < need) { - if (free_region(local)) - return -1; - if (!alloc_region(local, need)) - return -1; - } - B = (uint8_t *)local->aligned; - V = (salsa20_blk_t *)((uint8_t *)B + B_size); - XY = (salsa20_blk_t *)((uint8_t *)V + V_size); - } - S = NULL; - if (flags & YESCRYPT_PWXFORM) - S = (uint8_t *)XY + XY_size; - - if (t || flags) { - SHA256_CTX ctx; - SHA256_Init(&ctx); - SHA256_Update(&ctx, passwd, passwdlen); - SHA256_Final(sha256, &ctx); - passwd = sha256; - passwdlen = sizeof(sha256); - } - - /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ - PBKDF2_SHA256_Y(passwd, passwdlen, salt, saltlen, 1, B, B_size); - - if (t || flags) - memcpy(sha256, B, sizeof(sha256)); - - if (p == 1 || (flags & YESCRYPT_PARALLEL_SMIX)) { - smix(B, r, N, p, t, flags, V, NROM, shared, XY, S); - } else { - uint32_t i; - - /* 2: for i = 0 to p - 1 do */ -#ifdef _OPENMP -#pragma omp parallel for default(none) private(i) shared(B, r, N, p, t, flags, V, NROM, shared, XY, S) -#endif - for (i = 0; i < p; i++) { - /* 3: B_i <-- MF(B_i, N) */ -#ifdef _OPENMP - smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, - &V[(size_t)2 * r * i * N], - NROM, shared, - &XY[(size_t)4 * r * i], - S ? 
&S[S_SIZE_ALL * i] : S); -#else - smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, V, - NROM, shared, XY, S); -#endif - } - } - - /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ - PBKDF2_SHA256_Y(passwd, passwdlen, B, B_size, 1, buf, buflen); - - /* - * Except when computing classic scrypt, allow all computation so far - * to be performed on the client. The final steps below match those of - * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so - * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of - * SCRAM's use of SHA-1) would be usable with yescrypt hashes. - */ - if ((t || flags) && buflen == sizeof(sha256)) { - /* Compute ClientKey */ - { - HMAC_SHA256_CTX_Y ctx; - HMAC_SHA256_Init_Y(&ctx, buf, buflen); - if ( yescrypt_client_key ) - HMAC_SHA256_Update_Y( &ctx, (uint8_t*)yescrypt_client_key, - yescrypt_client_key_len ); - else - HMAC_SHA256_Update_Y( &ctx, salt, saltlen ); - HMAC_SHA256_Final_Y(sha256, &ctx); - } - /* Compute StoredKey */ - { - SHA256_CTX ctx; - SHA256_Init(&ctx); - SHA256_Update(&ctx, sha256, sizeof(sha256)); - SHA256_Final(buf, &ctx); - } - } - - if (free_region(&tmp)) - return -1; - - /* Success! */ - return 0; -} diff --git a/algo/yescrypt/yescrypt.c b/algo/yescrypt/yescrypt.c deleted file mode 100644 index 2665a1a..0000000 --- a/algo/yescrypt/yescrypt.c +++ /dev/null @@ -1,484 +0,0 @@ -/*- - * Copyright 2013,2014 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -#include -#include - -#include "compat.h" - -#include "yescrypt.h" -#include "sha256_Y.h" -#include "algo-gate-api.h" - -#define BYTES2CHARS(bytes) \ - ((((bytes) * 8) + 5) / 6) - -#define HASH_SIZE 32 /* bytes */ -#define HASH_LEN BYTES2CHARS(HASH_SIZE) /* base-64 chars */ -#define YESCRYPT_FLAGS (YESCRYPT_RW | YESCRYPT_PWXFORM) - -static const char * const itoa64 = - "./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - -static uint8_t* encode64_uint32(uint8_t* dst, size_t dstlen, uint32_t src, uint32_t srcbits) -{ - uint32_t bit; - - for (bit = 0; bit < srcbits; bit += 6) { - if (dstlen < 1) - return NULL; - *dst++ = itoa64[src & 0x3f]; - dstlen--; - src >>= 6; - } - - return dst; -} - -static uint8_t* encode64(uint8_t* dst, size_t dstlen, const uint8_t* src, size_t srclen) -{ - size_t i; - - for (i = 0; i < srclen; ) { - uint8_t * dnext; - uint32_t value = 0, bits = 0; - do { - value |= (uint32_t)src[i++] << bits; - bits += 8; - } while (bits < 24 && i < srclen); - dnext = encode64_uint32(dst, dstlen, value, bits); - if (!dnext) - return NULL; - dstlen -= dnext - dst; - dst = dnext; - } - - return dst; -} - -static int decode64_one(uint32_t* dst, uint8_t src) -{ - const char * ptr = strchr(itoa64, src); - if (ptr) { - *dst = (uint32_t) (ptr - itoa64); - return 0; - } - *dst = 0; - return -1; -} - -static const uint8_t* decode64_uint32(uint32_t* dst, uint32_t dstbits, const uint8_t* src) -{ - uint32_t bit; 
- uint32_t value; - - value = 0; - for (bit = 0; bit < dstbits; bit += 6) { - uint32_t one; - if (decode64_one(&one, *src)) { - *dst = 0; - return NULL; - } - src++; - value |= one << bit; - } - - *dst = value; - return src; -} - -uint8_t* yescrypt_r(const yescrypt_shared_t* shared, yescrypt_local_t* local, - const uint8_t* passwd, size_t passwdlen, const uint8_t* setting, uint8_t* buf, size_t buflen) -{ - uint8_t hash[HASH_SIZE]; - const uint8_t * src, * salt; - uint8_t * dst; - size_t prefixlen, saltlen, need; - uint8_t version; - uint64_t N; - uint32_t r, p; - yescrypt_flags_t flags = YESCRYPT_WORM; - - printf("pass1 ..."); - fflush(stdout); - - if (setting[0] != '$' || setting[1] != '7') { - printf("died$7 ..."); - fflush(stdout); - return NULL; - } - - printf("died80 ..."); - fflush(stdout); - - src = setting + 2; - - printf("hello '%p'\n", (char *)src); - fflush(stdout); - - switch ((version = *src)) { - case '$': - printf("died2 ..."); - fflush(stdout); - break; - case 'X': - src++; - flags = YESCRYPT_RW; - printf("died3 ..."); - fflush(stdout); - break; - default: - printf("died4 ..."); - fflush(stdout); - return NULL; - } - - printf("pass2 ..."); - fflush(stdout); - - if (*src != '$') { - uint32_t decoded_flags; - if (decode64_one(&decoded_flags, *src)) { - printf("died5 ..."); - fflush(stdout); - return NULL; - } - flags = decoded_flags; - if (*++src != '$') { - printf("died6 ..."); - fflush(stdout); - return NULL; - } - } - - src++; - - { - uint32_t N_log2; - if (decode64_one(&N_log2, *src)) { - printf("died7 ..."); - return NULL; - } - src++; - N = (uint64_t)1 << N_log2; - } - - src = decode64_uint32(&r, 30, src); - if (!src) { - printf("died6 ..."); - return NULL; - } - - src = decode64_uint32(&p, 30, src); - if (!src) { - printf("died7 ..."); - return NULL; - } - - prefixlen = src - setting; - - salt = src; - src = (uint8_t *)strrchr((char *)salt, '$'); - if (src) - saltlen = src - salt; - else - saltlen = strlen((char *)salt); - - need = prefixlen + 
saltlen + 1 + HASH_LEN + 1; - if (need > buflen || need < saltlen) { - printf("'%d %d %d'", (int) need, (int) buflen, (int) saltlen); - printf("died8killbuf ..."); - fflush(stdout); - return NULL; - } - - if (yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, N, r, p, 0, flags, hash, sizeof(hash))) { - printf("died10 ..."); - fflush(stdout); - return NULL; - } - - dst = buf; - memcpy(dst, setting, prefixlen + saltlen); - dst += prefixlen + saltlen; - *dst++ = '$'; - - dst = encode64(dst, buflen - (dst - buf), hash, sizeof(hash)); - /* Could zeroize hash[] here, but yescrypt_kdf() doesn't zeroize its - * memory allocations yet anyway. */ - if (!dst || dst >= buf + buflen) { /* Can't happen */ - printf("died11 ..."); - return NULL; - } - - *dst = 0; /* NUL termination */ - - printf("died12 ..."); - fflush(stdout); - - return buf; -} - -uint8_t* yescrypt(const uint8_t* passwd, const uint8_t* setting) -{ - static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1 + HASH_LEN + 1]; - yescrypt_shared_t shared; - yescrypt_local_t local; - uint8_t * retval; - - if (yescrypt_init_shared(&shared, NULL, 0, - 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0)) - return NULL; - if (yescrypt_init_local(&local)) { - yescrypt_free_shared(&shared); - return NULL; - } - retval = yescrypt_r(&shared, &local, - passwd, 80, setting, buf, sizeof(buf)); - //printf("hashse='%s'\n", (char *)retval); - if (yescrypt_free_local(&local)) { - yescrypt_free_shared(&shared); - return NULL; - } - if (yescrypt_free_shared(&shared)) - return NULL; - return retval; -} - -uint8_t* yescrypt_gensalt_r(uint32_t N_log2, uint32_t r, uint32_t p, yescrypt_flags_t flags, - const uint8_t* src, size_t srclen, uint8_t* buf, size_t buflen) -{ - uint8_t * dst; - size_t prefixlen = 3 + 1 + 5 + 5; - size_t saltlen = BYTES2CHARS(srclen); - size_t need; - - if (p == 1) - flags &= ~YESCRYPT_PARALLEL_SMIX; - - if (flags) { - if (flags & ~0x3f) - return NULL; - - prefixlen++; - if (flags != YESCRYPT_RW) - prefixlen++; 
- } - - need = prefixlen + saltlen + 1; - if (need > buflen || need < saltlen || saltlen < srclen) - return NULL; - - if (N_log2 > 63 || ((uint64_t)r * (uint64_t)p >= (1U << 30))) - return NULL; - - dst = buf; - *dst++ = '$'; - *dst++ = '7'; - if (flags) { - *dst++ = 'X'; /* eXperimental, subject to change */ - if (flags != YESCRYPT_RW) - *dst++ = itoa64[flags]; - } - *dst++ = '$'; - - *dst++ = itoa64[N_log2]; - - dst = encode64_uint32(dst, buflen - (dst - buf), r, 30); - if (!dst) /* Can't happen */ - return NULL; - - dst = encode64_uint32(dst, buflen - (dst - buf), p, 30); - if (!dst) /* Can't happen */ - return NULL; - - dst = encode64(dst, buflen - (dst - buf), src, srclen); - if (!dst || dst >= buf + buflen) /* Can't happen */ - return NULL; - - *dst = 0; /* NUL termination */ - - return buf; -} - -uint8_t* yescrypt_gensalt(uint32_t N_log2, uint32_t r, uint32_t p, yescrypt_flags_t flags, - const uint8_t * src, size_t srclen) -{ - static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1]; - return yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, - buf, sizeof(buf)); -} - -static int yescrypt_bsty(const uint8_t * passwd, size_t passwdlen, - const uint8_t * salt, size_t saltlen, uint64_t N, uint32_t r, uint32_t p, - uint8_t * buf, size_t buflen) -{ - static __thread int initialized = 0; - static __thread yescrypt_shared_t shared; - static __thread yescrypt_local_t local; - int retval; - if (!initialized) { -/* "shared" could in fact be shared, but it's simpler to keep it private - * along with "local". It's dummy and tiny anyway. 
*/ - if (yescrypt_init_shared(&shared, NULL, 0, - 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0)) - return -1; - if (yescrypt_init_local(&local)) { - yescrypt_free_shared(&shared); - return -1; - } - initialized = 1; - } - retval = yescrypt_kdf(&shared, &local, - passwd, passwdlen, salt, saltlen, N, r, p, 0, YESCRYPT_FLAGS, - buf, buflen); -#if 0 - if (yescrypt_free_local(&local)) { - yescrypt_free_shared(&shared); - return -1; - } - if (yescrypt_free_shared(&shared)) - return -1; - initialized = 0; -#endif - return retval; -} - -// scrypt parameters initialized at run time. -uint64_t YESCRYPT_N; -uint32_t YESCRYPT_R; -uint32_t YESCRYPT_P; -char *yescrypt_client_key = NULL; -int yescrypt_client_key_len = 0; - -/* main hash 80 bytes input */ -void yescrypt_hash( const char *input, char *output, uint32_t len ) -{ - yescrypt_bsty( (uint8_t*)input, len, (uint8_t*)input, len, YESCRYPT_N, - YESCRYPT_R, YESCRYPT_P, (uint8_t*)output, 32 ); -} - -/* for util.c test */ -void yescrypthash(void *output, const void *input) -{ - yescrypt_hash((char*) input, (char*) output, 80); -} - -int scanhash_yescrypt( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(64) vhash[8]; - uint32_t _ALIGN(64) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - - for (int k = 0; k < 19; k++) - be32enc(&endiandata[k], pdata[k]); - - do { - be32enc(&endiandata[19], n); - yescrypt_hash((char*) endiandata, (char*) vhash, 80); - if (vhash[7] < Htarg && fulltest(vhash, ptarget ) - && !opt_benchmark ) - { - pdata[19] = n; - submit_solution( work, vhash, mythr ); - } - n++; - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - - return 0; -} - -int64_t yescrypt_get_max64() -{ - return 
0x1ffLL; -} - -int64_t yescryptr16_get_max64() -{ - return 0xfffLL; -} - -void yescrypt_gate_base(algo_gate_t *gate ) -{ - gate->optimizations = SSE2_OPT | SHA_OPT; - gate->scanhash = (void*)&scanhash_yescrypt; - gate->hash = (void*)&yescrypt_hash; - gate->set_target = (void*)&scrypt_set_target; -} - -bool register_yescrypt_algo( algo_gate_t* gate ) -{ - yescrypt_gate_base( gate ); - gate->get_max64 = (void*)&yescrypt_get_max64; - yescrypt_client_key = NULL; - yescrypt_client_key_len = 0; - YESCRYPT_N = 2048; - YESCRYPT_R = 8; - YESCRYPT_P = 1; - return true; -} - -bool register_yescryptr8_algo( algo_gate_t* gate ) -{ - yescrypt_gate_base( gate ); - gate->get_max64 = (void*)&yescrypt_get_max64; - yescrypt_client_key = "Client Key"; - yescrypt_client_key_len = 10; - YESCRYPT_N = 2048; - YESCRYPT_R = 8; - YESCRYPT_P = 1; - return true; -} - -bool register_yescryptr16_algo( algo_gate_t* gate ) -{ - yescrypt_gate_base( gate ); - gate->get_max64 = (void*)&yescryptr16_get_max64; - yescrypt_client_key = "Client Key"; - yescrypt_client_key_len = 10; - YESCRYPT_N = 4096; - YESCRYPT_R = 16; - YESCRYPT_P = 1; - return true; -} - -bool register_yescryptr32_algo( algo_gate_t* gate ) -{ - yescrypt_gate_base( gate ); - gate->get_max64 = (void*)&yescryptr16_get_max64; - yescrypt_client_key = "WaviBanana"; - yescrypt_client_key_len = 10; - YESCRYPT_N = 4096; - YESCRYPT_R = 32; - YESCRYPT_P = 1; - return true; -} - diff --git a/algo/yescrypt/yescrypt.h b/algo/yescrypt/yescrypt.h deleted file mode 100644 index c33ba40..0000000 --- a/algo/yescrypt/yescrypt.h +++ /dev/null @@ -1,381 +0,0 @@ -/*- - * Copyright 2009 Colin Percival - * Copyright 2013,2014 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - */ - -#ifndef YESCRYPT_H -#define YESCRYPT_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include /* for size_t */ -#include - -//#define __SSE4_1__ - -void yescrypt_hash(const char* input, char* output, uint32_t len); - -void yescrypthash(void *output, const void *input); - -/** - * crypto_scrypt(passwd, passwdlen, salt, saltlen, N, r, p, buf, buflen): - * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, - * p, buflen) and write the result into buf. The parameters r, p, and buflen - * must satisfy r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N - * must be a power of 2 greater than 1. - * - * Return 0 on success; or -1 on error. - * - * MT-safe as long as buf is local to the thread. 
- */ -extern int crypto_scrypt(const uint8_t * __passwd, size_t __passwdlen, - const uint8_t * __salt, size_t __saltlen, - uint64_t __N, uint32_t __r, uint32_t __p, - uint8_t * __buf, size_t __buflen); - -/** - * Internal type used by the memory allocator. Please do not use it directly. - * Use yescrypt_shared_t and yescrypt_local_t as appropriate instead, since - * they might differ from each other in a future version. - */ -typedef struct { - void * base, * aligned; - size_t base_size, aligned_size; -} yescrypt_region_t; - -/** - * Types for shared (ROM) and thread-local (RAM) data structures. - */ -typedef yescrypt_region_t yescrypt_shared1_t; -typedef struct { - yescrypt_shared1_t shared1; - uint32_t mask1; -} yescrypt_shared_t; -typedef yescrypt_region_t yescrypt_local_t; - -/** - * Possible values for yescrypt_init_shared()'s flags argument. - */ -typedef enum { - YESCRYPT_SHARED_DEFAULTS = 0, - YESCRYPT_SHARED_PREALLOCATED = 0x100 -} yescrypt_init_shared_flags_t; - -/** - * Possible values for the flags argument of yescrypt_kdf(), - * yescrypt_gensalt_r(), yescrypt_gensalt(). These may be OR'ed together, - * except that YESCRYPT_WORM and YESCRYPT_RW are mutually exclusive. - * Please refer to the description of yescrypt_kdf() below for the meaning of - * these flags. - */ -typedef enum { -/* public */ - YESCRYPT_WORM = 0, - YESCRYPT_RW = 1, - YESCRYPT_PARALLEL_SMIX = 2, - YESCRYPT_PWXFORM = 4, -/* private */ - __YESCRYPT_INIT_SHARED_1 = 0x10000, - __YESCRYPT_INIT_SHARED_2 = 0x20000, - __YESCRYPT_INIT_SHARED = 0x30000 -} yescrypt_flags_t; - -extern char *yescrypt_client_key; -extern int yescrypt_client_key_len; - - -#define YESCRYPT_KNOWN_FLAGS \ - (YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | YESCRYPT_PWXFORM | \ - __YESCRYPT_INIT_SHARED) - -/** - * yescrypt_init_shared(shared, param, paramlen, N, r, p, flags, mask, - * buf, buflen): - * Optionally allocate memory for and initialize the shared (ROM) data - * structure. 
The parameters N, r, and p must satisfy the same conditions as - * with crypto_scrypt(). param and paramlen specify a local parameter with - * which the ROM is seeded. If buf is not NULL, then it is used to return - * buflen bytes of message digest for the initialized ROM (the caller may use - * this to verify that the ROM has been computed in the same way that it was on - * a previous run). - * - * Return 0 on success; or -1 on error. - * - * If bit YESCRYPT_SHARED_PREALLOCATED in flags is set, then memory for the - * ROM is assumed to have been preallocated by the caller, with - * shared->shared1.aligned being the start address of the ROM and - * shared->shared1.aligned_size being its size (which must be consistent with - * N, r, and p). This may be used e.g. when the ROM is to be placed in a SysV - * shared memory segment allocated by the caller. - * - * mask controls the frequency of ROM accesses by yescrypt_kdf(). Normally it - * should be set to 1, to interleave RAM and ROM accesses, which works well - * when both regions reside in the machine's RAM anyway. Other values may be - * used e.g. when the ROM is memory-mapped from a disk file. Recommended mask - * values are powers of 2 minus 1 or minus 2. Here's the effect of some mask - * values: - * mask value ROM accesses in SMix 1st loop ROM accesses in SMix 2nd loop - * 0 0 1/2 - * 1 1/2 1/2 - * 2 0 1/4 - * 3 1/4 1/4 - * 6 0 1/8 - * 7 1/8 1/8 - * 14 0 1/16 - * 15 1/16 1/16 - * 1022 0 1/1024 - * 1023 1/1024 1/1024 - * - * Actual computation of the ROM contents may be avoided, if you don't intend - * to use a ROM but need a dummy shared structure, by calling this function - * with NULL, 0, 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0 for the - * arguments starting with param and on. - * - * MT-safe as long as shared is local to the thread. 
- */ -extern int yescrypt_init_shared(yescrypt_shared_t * __shared, - const uint8_t * __param, size_t __paramlen, - uint64_t __N, uint32_t __r, uint32_t __p, - yescrypt_init_shared_flags_t __flags, uint32_t __mask, - uint8_t * __buf, size_t __buflen); - -/** - * yescrypt_free_shared(shared): - * Free memory that had been allocated with yescrypt_init_shared(). - * - * Return 0 on success; or -1 on error. - * - * MT-safe as long as shared is local to the thread. - */ -extern int yescrypt_free_shared(yescrypt_shared_t * __shared); - -/** - * yescrypt_init_local(local): - * Initialize the thread-local (RAM) data structure. Actual memory allocation - * is currently fully postponed until a call to yescrypt_kdf() or yescrypt_r(). - * - * Return 0 on success; or -1 on error. - * - * MT-safe as long as local is local to the thread. - */ -extern int yescrypt_init_local(yescrypt_local_t * __local); - -/** - * yescrypt_free_local(local): - * Free memory that may have been allocated for an initialized thread-local - * (RAM) data structure. - * - * Return 0 on success; or -1 on error. - * - * MT-safe as long as local is local to the thread. - */ -extern int yescrypt_free_local(yescrypt_local_t * __local); - -/** - * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, - * N, r, p, t, flags, buf, buflen): - * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, - * p, buflen), or a revision of scrypt as requested by flags and shared, and - * write the result into buf. The parameters N, r, p, and buflen must satisfy - * the same conditions as with crypto_scrypt(). t controls computation time - * while not affecting peak memory usage. shared and flags may request - * special modes as described below. local is the thread-local data - * structure, allowing to preserve and reuse a memory allocation across calls, - * thereby reducing its overhead. - * - * Return 0 on success; or -1 on error. - * - * t controls computation time. 
t = 0 is optimal in terms of achieving the - * highest area-time for ASIC attackers. Thus, higher computation time, if - * affordable, is best achieved by increasing N rather than by increasing t. - * However, if the higher memory usage (which goes along with higher N) is not - * affordable, or if fine-tuning of the time is needed (recall that N must be a - * power of 2), then t = 1 or above may be used to increase time while staying - * at the same peak memory usage. t = 1 increases the time by 25% and - * decreases the normalized area-time to 96% of optimal. (Of course, in - * absolute terms the area-time increases with higher t. It's just that it - * would increase slightly more with higher N*r rather than with higher t.) - * t = 2 increases the time by another 20% and decreases the normalized - * area-time to 89% of optimal. Thus, these two values are reasonable to use - * for fine-tuning. Values of t higher than 2 result in further increase in - * time while reducing the efficiency much further (e.g., down to around 50% of - * optimal for t = 5, which runs 3 to 4 times slower than t = 0, with exact - * numbers varying by the flags settings). - * - * Classic scrypt is available by setting t = 0 and flags to YESCRYPT_WORM and - * passing a dummy shared structure (see the description of - * yescrypt_init_shared() above for how to produce one). In this mode, the - * thread-local memory region (RAM) is first sequentially written to and then - * randomly read from. This algorithm is friendly towards time-memory - * tradeoffs (TMTO), available both to defenders (albeit not in this - * implementation) and to attackers. - * - * Setting YESCRYPT_RW adds extra random reads and writes to the thread-local - * memory region (RAM), which makes TMTO a lot less efficient. This may be - * used to slow down the kinds of attackers who would otherwise benefit from - * classic scrypt's efficient TMTO. 
Since classic scrypt's TMTO allows not - * only for the tradeoff, but also for a decrease of attacker's area-time (by - * up to a constant factor), setting YESCRYPT_RW substantially increases the - * cost of attacks in area-time terms as well. Yet another benefit of it is - * that optimal area-time is reached at an earlier time than with classic - * scrypt, and t = 0 actually corresponds to this earlier completion time, - * resulting in quicker hash computations (and thus in higher request rate - * capacity). Due to these properties, YESCRYPT_RW should almost always be - * set, except when compatibility with classic scrypt or TMTO-friendliness are - * desired. - * - * YESCRYPT_PARALLEL_SMIX moves parallelism that is present with p > 1 to a - * lower level as compared to where it is in classic scrypt. This reduces - * flexibility for efficient computation (for both attackers and defenders) by - * requiring that, short of resorting to TMTO, the full amount of memory be - * allocated as needed for the specified p, regardless of whether that - * parallelism is actually being fully made use of or not. (For comparison, a - * single instance of classic scrypt may be computed in less memory without any - * CPU time overhead, but in more real time, by not making full use of the - * parallelism.) This may be desirable when the defender has enough memory - * with sufficiently low latency and high bandwidth for efficient full parallel - * execution, yet the required memory size is high enough that some likely - * attackers might end up being forced to choose between using higher latency - * memory than they could use otherwise (waiting for data longer) or using TMTO - * (waiting for data more times per one hash computation). The area-time cost - * for other kinds of attackers (who would use the same memory type and TMTO - * factor or no TMTO either way) remains roughly the same, given the same - * running time for the defender. 
In the TMTO-friendly YESCRYPT_WORM mode, as - * long as the defender has enough memory that is just as fast as the smaller - * per-thread regions would be, doesn't expect to ever need greater - * flexibility (except possibly via TMTO), and doesn't need backwards - * compatibility with classic scrypt, there are no other serious drawbacks to - * this setting. In the YESCRYPT_RW mode, which is meant to discourage TMTO, - * this new approach to parallelization makes TMTO less inefficient. (This is - * an unfortunate side-effect of avoiding some random writes, as we have to in - * order to allow for parallel threads to access a common memory region without - * synchronization overhead.) Thus, in this mode this setting poses an extra - * tradeoff of its own (higher area-time cost for a subset of attackers vs. - * better TMTO resistance). Setting YESCRYPT_PARALLEL_SMIX also changes the - * way the running time is to be controlled from N*r*p (for classic scrypt) to - * N*r (in this modification). All of this applies only when p > 1. For - * p = 1, this setting is a no-op. - * - * Passing a real shared structure, with ROM contents previously computed by - * yescrypt_init_shared(), enables the use of ROM and requires YESCRYPT_RW for - * the thread-local RAM region. In order to allow for initialization of the - * ROM to be split into a separate program, the shared->shared1.aligned and - * shared->shared1.aligned_size fields may be set by the caller of - * yescrypt_kdf() manually rather than with yescrypt_init_shared(). - * - * local must be initialized with yescrypt_init_local(). - * - * MT-safe as long as local and buf are local to the thread. 
- */ -extern int yescrypt_kdf(const yescrypt_shared_t * __shared, - yescrypt_local_t * __local, - const uint8_t * __passwd, size_t __passwdlen, - const uint8_t * __salt, size_t __saltlen, - uint64_t __N, uint32_t __r, uint32_t __p, uint32_t __t, - yescrypt_flags_t __flags, - uint8_t * __buf, size_t __buflen); - -/** - * yescrypt_r(shared, local, passwd, passwdlen, setting, buf, buflen): - * Compute and encode an scrypt or enhanced scrypt hash of passwd given the - * parameters and salt value encoded in setting. If the shared structure is - * not dummy, a ROM is used and YESCRYPT_RW is required. Otherwise, whether to - * use the YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff - * discouraging modification) is determined by the setting string. shared and - * local must be initialized as described above for yescrypt_kdf(). buf must - * be large enough (as indicated by buflen) to hold the encoded hash string. - * - * Return the encoded hash string on success; or NULL on error. - * - * MT-safe as long as local and buf are local to the thread. - */ -extern uint8_t * yescrypt_r(const yescrypt_shared_t * __shared, - yescrypt_local_t * __local, - const uint8_t * __passwd, size_t __passwdlen, - const uint8_t * __setting, - uint8_t * __buf, size_t __buflen); - -/** - * yescrypt(passwd, setting): - * Compute and encode an scrypt or enhanced scrypt hash of passwd given the - * parameters and salt value encoded in setting. Whether to use the - * YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff - * discouraging modification) is determined by the setting string. - * - * Return the encoded hash string on success; or NULL on error. - * - * This is a crypt(3)-like interface, which is simpler to use than - * yescrypt_r(), but it is not MT-safe, it does not allow for the use of a ROM, - * and it is slower than yescrypt_r() for repeated calls because it allocates - * and frees memory on each call. - * - * MT-unsafe. 
- */ -extern uint8_t * yescrypt(const uint8_t * __passwd, const uint8_t * __setting); - -/** - * yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, buf, buflen): - * Generate a setting string for use with yescrypt_r() and yescrypt() by - * encoding into it the parameters N_log2 (which is to be set to base 2 - * logarithm of the desired value for N), r, p, flags, and a salt given by src - * (of srclen bytes). buf must be large enough (as indicated by buflen) to - * hold the setting string. - * - * Return the setting string on success; or NULL on error. - * - * MT-safe as long as buf is local to the thread. - */ -extern uint8_t * yescrypt_gensalt_r( - uint32_t __N_log2, uint32_t __r, uint32_t __p, - yescrypt_flags_t __flags, - const uint8_t * __src, size_t __srclen, - uint8_t * __buf, size_t __buflen); - -/** - * yescrypt_gensalt(N_log2, r, p, flags, src, srclen): - * Generate a setting string for use with yescrypt_r() and yescrypt(). This - * function is the same as yescrypt_gensalt_r() except that it uses a static - * buffer and thus is not MT-safe. - * - * Return the setting string on success; or NULL on error. - * - * MT-unsafe. - */ -extern uint8_t * yescrypt_gensalt( - uint32_t __N_log2, uint32_t __r, uint32_t __p, - yescrypt_flags_t __flags, - const uint8_t * __src, size_t __srclen); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/algo/yespower/PERFORMANCE b/algo/yespower/PERFORMANCE deleted file mode 100644 index 99cddd0..0000000 --- a/algo/yespower/PERFORMANCE +++ /dev/null @@ -1,95 +0,0 @@ -Included with yespower is the "benchmark" program, which is built by -simply invoking "make". When invoked without parameters, it tests -yespower 0.5 at N = 2048, r = 8, which appears to be the lowest setting -in use by existing cryptocurrencies. 
On an i7-4770K with 4x DDR3-1600 -(on two memory channels) running CentOS 7 for x86-64 (and built with -CentOS 7's default version of gcc) and with thread affinity set, this -reports between 3700 and 3800 hashes per second for both SSE2 and AVX -builds, e.g.: - -$ GOMP_CPU_AFFINITY=0-7 OMP_NUM_THREADS=4 ./benchmark -version=0.5 N=2048 r=8 -Will use 2048.00 KiB RAM -a5 9f ec 4c 4f dd a1 6e 3b 14 05 ad da 66 d5 25 b6 8e 7c ad fc fe 6a c0 66 c7 ad 11 8c d8 05 90 -Benchmarking 1 thread ... -1018 H/s real, 1018 H/s virtual (2047 hashes in 2.01 seconds) -Benchmarking 4 threads ... -3773 H/s real, 950 H/s virtual (8188 hashes in 2.17 seconds) -min 0.984 ms, avg 1.052 ms, max 1.074 ms - -Running 8 threads (to match the logical rather than the physical CPU -core count) results in very slightly worse performance on this system, -but this might be the other way around on another and/or with other -parameters. Upgrading to yespower 1.0, performance at these parameters -improves to almost 4000 hashes per second: - -$ GOMP_CPU_AFFINITY=0-7 OMP_NUM_THREADS=4 ./benchmark 10 -version=1.0 N=2048 r=8 -Will use 2048.00 KiB RAM -d0 78 cd d4 cf 3f 5a a8 4e 3c 4a 58 66 29 81 d8 2d 27 e5 67 36 37 c4 be 77 63 61 32 24 c1 8a 93 -Benchmarking 1 thread ... -1080 H/s real, 1080 H/s virtual (4095 hashes in 3.79 seconds) -Benchmarking 4 threads ... -3995 H/s real, 1011 H/s virtual (16380 hashes in 4.10 seconds) -min 0.923 ms, avg 0.989 ms, max 1.137 ms - -Running 8 threads results in substantial slowdown with this new version -(to between 3200 and 3400 hashes per second) because of cache thrashing. 
- -For higher settings such as those achieving 8 MiB instead of the 2 MiB -above, this system performs at around 800 hashes per second for yespower -0.5 and at around 830 hashes per second for yespower 1.0: - -$ GOMP_CPU_AFFINITY=0-7 OMP_NUM_THREADS=4 ./benchmark 5 2048 32 -version=0.5 N=2048 r=32 -Will use 8192.00 KiB RAM -56 0a 89 1b 5c a2 e1 c6 36 11 1a 9f f7 c8 94 a5 d0 a2 60 2f 43 fd cf a5 94 9b 95 e2 2f e4 46 1e -Benchmarking 1 thread ... -265 H/s real, 265 H/s virtual (1023 hashes in 3.85 seconds) -Benchmarking 4 threads ... -803 H/s real, 200 H/s virtual (4092 hashes in 5.09 seconds) -min 4.924 ms, avg 4.980 ms, max 5.074 ms - -$ GOMP_CPU_AFFINITY=0-7 OMP_NUM_THREADS=4 ./benchmark 10 2048 32 -version=1.0 N=2048 r=32 -Will use 8192.00 KiB RAM -f7 69 26 ae 4a dc 56 53 90 2f f0 22 78 ea aa 39 eb 99 84 11 ac 3e a6 24 2e 19 6d fb c4 3d 68 25 -Benchmarking 1 thread ... -275 H/s real, 275 H/s virtual (1023 hashes in 3.71 seconds) -Benchmarking 4 threads ... -831 H/s real, 209 H/s virtual (4092 hashes in 4.92 seconds) -min 3.614 ms, avg 4.769 ms, max 5.011 ms - -Again, running 8 threads results in a slowdown, albeit not as bad as can -be seen for lower settings. - -On x86(-64), the following code versions may reasonably be built: SSE2, -AVX, and XOP. (There's no reason to build for AVX2 and higher, which is -unsuitable for and thus unused by current yespower anyway. There's also -no reason to build yespower as-is for SSE4, although there's a disabled -by default 32-bit specific SSE4 code version that may be re-enabled and -given a try if someone is so inclined; it may perform slightly slower or -slightly faster across different systems.) 
- -yescrypt and especially yespower 1.0 have been designed to fit the SSE2 -instruction set almost perfectly, so there's very little benefit from -the AVX and XOP builds, yet even at yespower 1.0 there may be -performance differences between SSE2, AVX, and XOP builds within 2% or -so (and it is unclear which is the fastest on a given system until -tested, except that where XOP is supported it is almost always faster -than AVX). - -Proper setting of thread affinities to run exactly one thread per -physical CPU core is non-trivial. In the above examples, it so happened -that the first 4 logical CPU numbers corresponded to different physical -cores, but this won't always be the case. This can vary even between -apparently similar systems. On Linux, the mapping of logical CPUs to -physical cores may be obtained from /proc/cpuinfo (on x86[-64] and MIC) -or sysfs, which an optimized implementation of e.g. a cryptocurrency -miner could use. If you do not bother obtaining this information from -the operating system, you might be better off not setting thread -affinities at all (in order to avoid the risk of doing this incorrectly, -which would have a greater negative performance impact) and/or running -as many threads as there are logical CPUs. Also, there's no certainty -whether different and future CPUs will run yespower faster using one or -maybe more threads per physical core. diff --git a/algo/yespower/README b/algo/yespower/README deleted file mode 100644 index 59e565d..0000000 --- a/algo/yespower/README +++ /dev/null @@ -1,192 +0,0 @@ - What is yespower? - -yespower is a proof-of-work (PoW) focused fork of yescrypt. While -yescrypt is a password-based key derivation function (KDF) and password -hashing scheme, and thus is meant for processing passwords, yespower is -meant for processing trial inputs such as block headers (including -nonces) in PoW-based blockchains. - -On its own, yespower isn't a complete proof-of-work system. 
Rather, in -the blockchain use case, yespower's return value is meant to be checked -for being numerically no greater than the blockchain's current target -(which is related to mining difficulty) or else the proof attempt -(yespower invocation) is to be repeated (with a different nonce) until -the condition is finally met (allowing a new block to be mined). This -process isn't specific to yespower and isn't part of yespower itself -(rather, it is similar in many PoW-based blockchains and is to be -defined and implemented externally to yespower) and thus isn't described -in here any further. - - - Why or why not yespower? - -Different proof-of-work schemes in existence vary in many aspects, -including in friendliness to different types of hardware. There's -demand for all sorts of hardware (un)friendliness in those - for -different use cases and by different communities. - -yespower in particular is designed to be CPU-friendly, GPU-unfriendly, -and FPGA/ASIC-neutral. In other words, it's meant to be relatively -efficient to compute on current CPUs and relatively inefficient on -current GPUs. Unfortunately, being GPU-unfriendly also means that -eventual FPGA and ASIC implementations will only compete with CPUs, and -at least ASICs will win over the CPUs (FPGAs might not because of this -market's peculiarities - large FPGAs are even more "over-priced" than -large CPUs are), albeit by far not to the extent they did e.g. for -Bitcoin and Litecoin. - -There's a lot of talk about "ASIC resistance". What is (or should be) -meant by that is limiting the advantage of specialized ASICs. While -limiting the advantage at KDF to e.g. 10x and at password hashing to -e.g. 100x (talking orders of magnitude here, in whatever terms) may be -considered "ASIC resistant" (as compared to e.g. 
100,000x we'd have -without trying), similar improvement factors are practically not "ASIC -resistant" for cryptocurrency mining where they can make all the -difference between CPU mining being profitable and not. There might -also exist in-between PoW use cases where moderate ASIC advantage is OK, -such as with non-cryptocurrency and/or private/permissioned blockchains. - -Thus, current yespower may be considered either a short-term choice -(valid until one of its uses provides sufficient perceived incentive to -likely result in specialized ASICs) or a deliberate choice of a pro-CPU, -anti-GPU, moderately-pro-ASIC PoW scheme. It is also possible to -respond to known improvements in future GPUs/implementations and/or to -ASICs with new versions of yespower that users would need to switch to. - - - yespower versions. - -yespower includes optimized and specialized re-implementation of the -obsolete yescrypt 0.5 (based off its first submission to Password -Hashing Competition back in 2014) now re-released as yespower 0.5, and -brand new proof-of-work specific variation known as yespower 1.0. - -yespower 0.5 is intended as a compatible upgrade for cryptocurrencies -that already use yescrypt 0.5 (providing a few percent speedup), and -yespower 1.0 may be used as a further upgrade or a new choice of PoW by -those and other cryptocurrencies and other projects. - -There are many significant differences between yespower 0.5 and 1.0 -under the hood, but the main user visible difference is yespower 1.0 -greatly improving on GPU-unfriendliness in light of improvements seen in -modern GPUs (up to and including NVIDIA Volta) and GPU implementations -of yescrypt 0.5. This is achieved mostly through greater use of CPUs' -L2 cache. 
- -The version of algorithm to use is requested through parameters, -allowing for both algorithms to co-exist in client and miner -implementations (such as in preparation for a cryptocurrency hard-fork -and/or supporting multiple cryptocurrencies in one program). - - - Parameter selection. - -For new uses of yespower, set the requested version to the highest -supported, and set N*r to the highest you can reasonably afford in terms -of proof verification time (which might in turn be determined by desired -share rate per mining pool server), using one of the following options: - -1 MiB: N = 1024, r = 8 -2 MiB: N = 2048, r = 8 -4 MiB: N = 1024, r = 32 -8 MiB: N = 2048, r = 32 -16 MiB: N = 4096, r = 32 - -and so on for higher N keeping r=32. - -You may also set the personalization string to your liking, but that is -not required (you can set its pointer to NULL and its length to 0). Its -support is provided mostly for compatibility with existing modifications -of yescrypt 0.5. - - - Performance. - -Please refer to PERFORMANCE for some benchmarks and performance tuning. - - - How to test yespower for proper operation. - -On a Unix-like system, invoke "make check". This will build and run a -program called "tests", and check its output against the supplied file -TESTS-OK. If everything matches, the final line of output should be the -word "PASSED". - -We do most of our testing on Linux systems with gcc. The supplied -Makefile assumes that you use gcc. - - - Alternate code versions and make targets. - -Two implementations of yespower are included: reference and optimized. -By default, the optimized implementation is built. Internally, the -optimized implementation uses conditional compilation to choose between -usage of various SIMD instruction sets where supported and scalar code. - -The reference implementation is unoptimized and is very slow, but it has -simpler and shorter source code. 
Its purpose is to provide a simple -human- and machine-readable specification that implementations intended -for actual use should be tested against. It is deliberately mostly not -optimized, and it is not meant to be used in production. - -Similarly to "make check", there's "make check-ref" to build and test -the reference implementation. There's also "make ref" to build the -reference implementation and have the "benchmark" program use it. - -"make clean" may need to be run between making different builds. - - - How to integrate yespower in a program. - -Although yespower.h provides several functions, chances are that you -will only need to use yespower_tls(). Please see the comment on this -function in yespower.h and its example usage in tests.c and benchmark.c, -including parameter sets requesting yescrypt 0.5 as used by certain -existing cryptocurrencies. - -To integrate yespower in an altcoin based on Bitcoin Core, you might -invoke yespower_tls() from either a maybe-new (depending on where you -fork from) CBlockHeader::GetPoWHash() (and invoke that where PoW is -needed like e.g. Litecoin does for scrypt) or CBlockHeader::GetHash() -(and implement caching for its return value like e.g. YACoin does for -scrypt). Further detail on this (generating new genesis blocks, etc.) -is not yespower-specific and thus is not provided here. Just like (and -even more so than) yespower itself, this guidance is provided as-is and -without guarantee of being correct and safe to follow. You're supposed -to know what you're doing. - - - Credits. - -scrypt has been designed by Colin Percival. yescrypt and yespower have -been designed by Solar Designer building upon scrypt. 
- -The following other people and projects have also indirectly helped make -yespower what it is: - - - Bill Cox - - Rich Felker - - Anthony Ferrara - - Christian Forler - - Taylor Hornby - - Dmitry Khovratovich - - Samuel Neves - - Marcos Simplicio - - Ken T Takusagawa - - Jakob Wenzel - - Christian Winnerlein - - - DARPA Cyber Fast Track - - Password Hashing Competition - - - Contact info. - -First, please check the yespower homepage for new versions, etc.: - - http://www.openwall.com/yespower/ - -If you have anything valuable to add or a non-trivial question to ask, -you may contact the maintainer of yespower at: - - Solar Designer diff --git a/algo/yespower/TESTS-OK b/algo/yespower/TESTS-OK deleted file mode 100644 index 5413e85..0000000 --- a/algo/yespower/TESTS-OK +++ /dev/null @@ -1,16 +0,0 @@ -yespower(5, 2048, 8, "Client Key") = a5 9f ec 4c 4f dd a1 6e 3b 14 05 ad da 66 d5 25 b6 8e 7c ad fc fe 6a c0 66 c7 ad 11 8c d8 05 90 -yespower(5, 2048, 8, BSTY) = 5e a2 b2 95 6a 9e ac e3 0a 32 37 ff 1d 44 1e de e1 dc 25 aa b8 f0 ea 15 c1 21 65 f8 3a 7b c2 65 -yespower(5, 4096, 16, "Client Key") = 92 7e 72 d0 de d3 d8 04 75 47 3f 40 f1 74 3c 67 28 9d 45 3d 52 42 d4 f5 5a f4 e3 25 e0 66 99 c5 -yespower(5, 4096, 24, "Jagaricoin") = 0e 13 66 97 32 11 e7 fe a8 ad 9d 81 98 9c 84 a2 54 d9 68 c9 d3 33 dd 8f f0 99 32 4f 38 61 1e 04 -yespower(5, 4096, 32, "WaviBanana") = 3a e0 5a bb 3c 5c f6 f7 54 15 a9 25 54 c9 8d 50 e3 8e c9 55 2c fa 78 37 36 16 f4 80 b2 4e 55 9f -yespower(5, 2048, 32, "Client Key") = 56 0a 89 1b 5c a2 e1 c6 36 11 1a 9f f7 c8 94 a5 d0 a2 60 2f 43 fd cf a5 94 9b 95 e2 2f e4 46 1e -yespower(5, 1024, 32, "Client Key") = 2a 79 e5 3d 1b e6 66 9b c5 56 cc c4 17 bc e3 d2 2a 74 a2 32 f5 6b 8e 1d 39 b4 57 92 67 5d e1 08 -yespower(5, 2048, 8, NULL) = 5e cb d8 e8 d7 c9 0b ae d4 bb f8 91 6a 12 25 dc c3 c6 5f 5c 91 65 ba e8 1c dd e3 cf fa d1 28 e8 -yespower(10, 2048, 8, NULL) = 69 e0 e8 95 b3 df 7a ee b8 37 d7 1f e1 99 e9 d3 4f 7e c4 6e cb ca 7a 2c 43 08 e5 18 57 ae 9b 46 
-yespower(10, 4096, 16, NULL) = 33 fb 8f 06 38 24 a4 a0 20 f6 3d ca 53 5f 5c a6 6a b5 57 64 68 c7 5d 1c ca ac 75 42 f7 64 95 ac -yespower(10, 4096, 32, NULL) = 77 1a ee fd a8 fe 79 a0 82 5b c7 f2 ae e1 62 ab 55 78 57 46 39 ff c6 ca 37 23 cc 18 e5 e3 e2 85 -yespower(10, 2048, 32, NULL) = d5 ef b8 13 cd 26 3e 9b 34 54 01 30 23 3c bb c6 a9 21 fb ff 34 31 e5 ec 1a 1a bd e2 ae a6 ff 4d -yespower(10, 1024, 32, NULL) = 50 1b 79 2d b4 2e 38 8f 6e 7d 45 3c 95 d0 3a 12 a3 60 16 a5 15 4a 68 83 90 dd c6 09 a4 0c 67 99 -yespower(10, 1024, 32, "personality test") = 1f 02 69 ac f5 65 c4 9a dc 0e f9 b8 f2 6a b3 80 8c dc 38 39 4a 25 4f dd ee dc c3 aa cf f6 ad 9d -XOR of yespower(5, ...) = 44 6c 17 2d fd 2b 39 64 d9 34 08 a3 bf 55 9e c7 42 31 7a 2c f2 b2 48 7c ec 41 17 e8 3d 7d 96 b0 -XOR of yespower(10, ...) = a9 fe c9 66 53 bd 25 ea 18 24 f3 f9 cc 96 c5 98 5f 19 1d 1d ca ee d3 e9 d0 52 f8 ac 76 bc bb da diff --git a/algo/yespower/benchmark.c b/algo/yespower/benchmark.c deleted file mode 100644 index 4842dfa..0000000 --- a/algo/yespower/benchmark.c +++ /dev/null @@ -1,262 +0,0 @@ -/*- - * Copyright 2013-2018 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -#include /* for atoi() */ -#include -#include -#include -#include -#include - -#include "yespower.h" - -#ifdef _OPENMP -#include - -#define NSAVE 1000 - -static uint64_t time_us(void) -{ - struct timespec t; -#ifdef CLOCK_MONOTONIC_RAW - if (clock_gettime(CLOCK_MONOTONIC_RAW, &t)) - return 0; -#else - if (clock_gettime(CLOCK_MONOTONIC, &t)) - return 0; -#endif - return 1 + (uint64_t)t.tv_sec * 1000000 + t.tv_nsec / 1000; -} -#endif - -int main(int argc, const char * const *argv) -{ - yespower_params_t params = { - .version = YESPOWER_0_5, - .N = 2048, - .r = 8, - .pers = (const uint8_t *)"Client Key", - .perslen = 10 - }; - - if (argc > 1) - params.version = atoi(argv[1]); - if (argc > 2) - params.N = atoi(argv[2]); - if (argc > 3) - params.r = atoi(argv[3]); - - printf("version=%.1f N=%u r=%u\n", - params.version * 0.1, params.N, params.r); - - printf("Will use %.2f KiB RAM\n", 0.125 * params.N * params.r); - - static __thread union { - uint8_t u8[80]; - uint32_t u32[20]; - } src; - yespower_binary_t dst; - unsigned int i; - - for (i = 0; i < sizeof(src); i++) - src.u8[i] = i * 3; - - if (yespower_tls(src.u8, sizeof(src), ¶ms, &dst)) { - puts("FAILED"); - return 1; - } - - for (i = 0; i < sizeof(dst); i++) - printf("%02x%c", dst.uc[i], i < sizeof(dst) - 1 ? 
' ' : '\n'); - - puts("Benchmarking 1 thread ..."); - - clock_t clk_tck = sysconf(_SC_CLK_TCK); - struct tms start_tms, end_tms; - clock_t start = times(&start_tms), end; - unsigned int n; - unsigned long long count; -#ifdef _OPENMP - yespower_binary_t save[NSAVE]; - unsigned int nsave = 0; -#endif - uint32_t seed = start * 1812433253U; - - n = 1; - count = 0; - do { - for (i = 0; i < n; i++) { - yespower_binary_t *p = &dst; -#ifdef _OPENMP - if (nsave < NSAVE) - p = &save[nsave++]; -#endif - src.u32[19] = seed + (count + i); - if (yespower_tls(src.u8, sizeof(src), ¶ms, p)) { - puts("FAILED"); - return 1; - } - } - count += n; - - end = times(&end_tms); - n <<= 1; - } while (end - start < clk_tck * 2); - - clock_t start_v = start_tms.tms_utime + start_tms.tms_stime + - start_tms.tms_cutime + start_tms.tms_cstime; - clock_t end_v = end_tms.tms_utime + end_tms.tms_stime + - end_tms.tms_cutime + end_tms.tms_cstime; - - printf("%llu H/s real, %llu H/s virtual " - "(%llu hashes in %.2f seconds)\n", - count * clk_tck / (end - start), - count * clk_tck / (end_v - start_v), - count, (double)(end - start) / clk_tck); - - for (i = 0; i < nsave; i++) { - unsigned int j; - for (j = i + 1; j < nsave; j++) { - unsigned int k = 8; - if (!memcmp(&save[i], &save[j], k)) { - printf("%u-byte collision(s) detected\n", k); - i = nsave; break; - } - } - } - -#ifdef _OPENMP - unsigned int nt = omp_get_max_threads(); - - printf("Benchmarking %u thread%s ...\n", - nt, nt == 1 ? 
"" : "s"); - - typedef struct { - uint64_t min, max, total; - } thread_data_s; - union { - thread_data_s s; - uint8_t cachelines[2][64]; /* avoid false sharing */ - } thread_data[nt]; /* tricky to align this when on stack */ - - unsigned int t; - for (t = 0; t < nt; t++) { - thread_data_s *td = &thread_data[t].s; - td->min = ~(uint64_t)0; td->max = 0; td->total = 0; - } - - unsigned long long count1 = count, count_restart = 0; - - if (!geteuid()) { - puts("Running as root, so trying to set SCHED_RR"); -#pragma omp parallel - { - struct sched_param param = { .sched_priority = 1 }; - if (sched_setscheduler(getpid(), SCHED_RR, ¶m)) - perror("sched_setscheduler"); - } - } - - start = times(&start_tms); - - n = count * omp_get_max_threads(); - count = 0; - do { -#pragma omp parallel for default(none) copyin(src) private(i, dst) shared(n, thread_data, params, seed, count, save, nsave) - for (i = 0; i < n; i++) { - unsigned int j = count + i; - - src.u32[19] = seed + j; - - uint64_t start1 = time_us(); - - if (yespower_tls(src.u8, sizeof(src), ¶ms, &dst)) { -#pragma omp critical - puts("FAILED"); - } - - uint64_t end1 = time_us(); - if (end1 < start1) - end1 = start1; - uint64_t diff1 = end1 - start1; - - thread_data_s *td = &thread_data[omp_get_thread_num()].s; - td->total += diff1; - if (diff1 < td->min) - td->min = diff1; - if (diff1 > td->max) - td->max = diff1; - -#ifdef _OPENMP - if (j < nsave && memcmp(&save[j], &dst, sizeof(dst))) { -#pragma omp critical - printf("Mismatch at %u\n", j); - } -#endif - } - - count += n; - if ((count - n) < count1 && count >= count1) { -/* Disregard our repeat of single thread's results (could be partially cached - * by same core, but OTOH other cores not yet warmed up to full clock rate). 
*/ - start = times(&start_tms); - count_restart = count; - for (t = 0; t < nt; t++) { - thread_data_s *td = &thread_data[t].s; - td->min = ~(uint64_t)0; td->max = 0; td->total = 0; - } - } else { - n <<= 1; - } - - end = times(&end_tms); - } while (end - start < clk_tck); - - if (!count_restart) - puts("Didn't reach single-thread's hash count"); - count -= count_restart; - - start_v = start_tms.tms_utime + start_tms.tms_stime + - start_tms.tms_cutime + start_tms.tms_cstime; - end_v = end_tms.tms_utime + end_tms.tms_stime + - end_tms.tms_cutime + end_tms.tms_cstime; - - printf("%llu H/s real, %llu H/s virtual " - "(%llu hashes in %.2f seconds)\n", - count * clk_tck / (end - start), - count * clk_tck / (end_v - start_v), - count, (double)(end - start) / clk_tck); - - uint64_t min = ~(uint64_t)0, max = 0, total = 0; - for (t = 0; t < nt; t++) { - thread_data_s *td = &thread_data[t].s; - total += td->total; - if (td->min < min) - min = td->min; - if (td->max > max) - max = td->max; - } - printf("min %.3f ms, avg %.3f ms, max %.3f ms\n", - min / 1000.0, total / 1000.0 / count, max / 1000.0); -#endif - - return 0; -} diff --git a/algo/yespower/insecure_memzero.h b/algo/yespower/insecure_memzero.h deleted file mode 100644 index 5a0ba75..0000000 --- a/algo/yespower/insecure_memzero.h +++ /dev/null @@ -1 +0,0 @@ -#define insecure_memzero(buf, len) /* empty */ diff --git a/algo/yespower/sha256_p.c b/algo/yespower/sha256_p.c deleted file mode 100644 index 7201797..0000000 --- a/algo/yespower/sha256_p.c +++ /dev/null @@ -1,218 +0,0 @@ -/*- - * Copyright 2005,2007,2009 Colin Percival - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include - -#include -#include - -#include "sysendian.h" - -#include "sha256_p.h" -#include "compat.h" - - -/* Elementary functions used by SHA256 */ -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define SHR(x, n) (x >> n) -#define ROTR(x, n) ((x >> n) | (x << (32 - n))) -#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) -#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) -#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) -#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) - -/* SHA256 round function */ -#define RND(a, b, c, d, e, f, g, h, k) \ - t0 = h + S1(e) + Ch(e, f, g) + k; \ - t1 = S0(a) + Maj(a, b, c); \ - d += t0; \ - h = t0 + t1; - -/* Adjusted round function for rotating state */ -#define RNDr(S, W, i, k) \ - RND(S[(64 - i) % 8], S[(65 - i) % 8], \ - S[(66 - i) % 8], S[(67 - i) % 8], \ - S[(68 - i) % 8], S[(69 - i) % 8], \ - S[(70 - i) % 8], S[(71 - i) % 8], \ - W[i] + k) - -/* -static unsigned char PAD[64] = { 
- 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; -*/ -/** - * SHA256_Buf(in, len, digest): - * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}. - */ -void -SHA256_Buf( const void * in, size_t len, uint8_t digest[32] ) -{ - SHA256_CTX ctx; - SHA256_Init( &ctx ); - SHA256_Update( &ctx, in, len ); - SHA256_Final( digest, &ctx ); -} - -/** - * HMAC_SHA256_Buf(K, Klen, in, len, digest): - * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of - * length ${Klen}, and write the result to ${digest}. - */ -void -HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len, - uint8_t digest[32]) -{ - HMAC_SHA256_CTX ctx; - - HMAC_SHA256_Init( &ctx, K, Klen ); - HMAC_SHA256_Update( &ctx, in, len ); - HMAC_SHA256_Final( digest, &ctx ); -} - -/* Initialize an HMAC-SHA256 operation with the given key. */ -void -HMAC_SHA256_Init( HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen ) -{ - unsigned char pad[64]; - unsigned char khash[32]; - const unsigned char * K = _K; - size_t i; - - /* If Klen > 64, the key is really SHA256(K). */ - if (Klen > 64) { - SHA256_Init( &ctx->ictx ); - SHA256_Update( &ctx->ictx, K, Klen ); - SHA256_Final( khash, &ctx->ictx ); - K = khash; - Klen = 32; - } - - /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ - SHA256_Init( &ctx->ictx ); - memset( pad, 0x36, 64 ); - for ( i = 0; i < Klen; i++ ) - pad[i] ^= K[i]; - SHA256_Update( &ctx->ictx, pad, 64 ); - - /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ - SHA256_Init( &ctx->octx ); - memset(pad, 0x5c, 64); - for ( i = 0; i < Klen; i++ ) - pad[i] ^= K[i]; - SHA256_Update( &ctx->octx, pad, 64 ); - - /* Clean the stack. */ - //memset(khash, 0, 32); -} - -/* Add bytes to the HMAC-SHA256 operation. 
*/ -void -HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len) -{ - - /* Feed data to the inner SHA256 operation. */ - SHA256_Update( &ctx->ictx, in, len ); -} - -/* Finish an HMAC-SHA256 operation. */ -void -HMAC_SHA256_Final(unsigned char digest[32], HMAC_SHA256_CTX * ctx ) -{ - unsigned char ihash[32]; - - /* Finish the inner SHA256 operation. */ - SHA256_Final( ihash, &ctx->ictx ); - - /* Feed the inner hash to the outer SHA256 operation. */ - SHA256_Update( &ctx->octx, ihash, 32 ); - - /* Finish the outer SHA256 operation. */ - SHA256_Final( digest, &ctx->octx ); - - /* Clean the stack. */ - //memset(ihash, 0, 32); -} - -/** - * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): - * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and - * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). - */ -void -PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, - size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen) -{ - HMAC_SHA256_CTX PShctx, hctx; - uint8_t _ALIGN(128) T[32]; - uint8_t _ALIGN(128) U[32]; - uint8_t ivec[4]; - size_t i, clen; - uint64_t j; - int k; - - /* Compute HMAC state after processing P and S. */ - HMAC_SHA256_Init(&PShctx, passwd, passwdlen); - HMAC_SHA256_Update(&PShctx, salt, saltlen); - - /* Iterate through the blocks. */ - for (i = 0; i * 32 < dkLen; i++) { - /* Generate INT(i + 1). */ - be32enc(ivec, (uint32_t)(i + 1)); - - /* Compute U_1 = PRF(P, S || INT(i)). */ - memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX)); - HMAC_SHA256_Update(&hctx, ivec, 4); - HMAC_SHA256_Final(U, &hctx); - - /* T_i = U_1 ... */ - memcpy(T, U, 32); - - for (j = 2; j <= c; j++) { - /* Compute U_j. */ - HMAC_SHA256_Init(&hctx, passwd, passwdlen); - HMAC_SHA256_Update(&hctx, U, 32); - HMAC_SHA256_Final(U, &hctx); - - /* ... xor U_j ... */ - for (k = 0; k < 32; k++) - T[k] ^= U[k]; - } - - /* Copy as many bytes as necessary into buf. 
*/ - clen = dkLen - i * 32; - if (clen > 32) - clen = 32; - memcpy(&buf[i * 32], T, clen); - } - - /* Clean PShctx, since we never called _Final on it. */ - //memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX_Y)); -} diff --git a/algo/yespower/sha256_p.h b/algo/yespower/sha256_p.h deleted file mode 100644 index 2481caf..0000000 --- a/algo/yespower/sha256_p.h +++ /dev/null @@ -1,56 +0,0 @@ -/*- - * Copyright 2005,2007,2009 Colin Percival - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * $FreeBSD: src/lib/libmd/sha256_Y.h,v 1.2 2006/01/17 15:35:56 phk Exp $ - */ - -#ifndef _SHA256_H_ -#define _SHA256_H_ - -#include -#include -#include - -typedef struct HMAC_SHA256Context { - SHA256_CTX ictx; - SHA256_CTX octx; -} HMAC_SHA256_CTX; - -void SHA256_Buf( const void * in, size_t len, uint8_t digest[32] ); -void HMAC_SHA256_Init( HMAC_SHA256_CTX *, const void *, size_t ); -void HMAC_SHA256_Update( HMAC_SHA256_CTX *, const void *, size_t ); -void HMAC_SHA256_Final( unsigned char [32], HMAC_SHA256_CTX * ); -void HMAC_SHA256_Buf( const void * K, size_t Klen, const void * in, - size_t len, uint8_t digest[32] ); - -/** - * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): - * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and - * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). - */ -void PBKDF2_SHA256( const uint8_t *, size_t, const uint8_t *, size_t, - uint64_t, uint8_t *, size_t); - -#endif /* !_SHA256_H_ */ diff --git a/algo/yespower/sysendian.h b/algo/yespower/sysendian.h deleted file mode 100644 index 52c1fe7..0000000 --- a/algo/yespower/sysendian.h +++ /dev/null @@ -1,94 +0,0 @@ -/*- - * Copyright 2007-2014 Colin Percival - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#ifndef _SYSENDIAN_H_ -#define _SYSENDIAN_H_ - -#include - -/* Avoid namespace collisions with BSD . */ -#define be32dec libcperciva_be32dec -#define be32enc libcperciva_be32enc -#define be64enc libcperciva_be64enc -#define le32dec libcperciva_le32dec -#define le32enc libcperciva_le32enc - -static inline uint32_t -be32dec(const void * pp) -{ - const uint8_t * p = (uint8_t const *)pp; - - return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + - ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); -} - -static inline void -be32enc(void * pp, uint32_t x) -{ - uint8_t * p = (uint8_t *)pp; - - p[3] = x & 0xff; - p[2] = (x >> 8) & 0xff; - p[1] = (x >> 16) & 0xff; - p[0] = (x >> 24) & 0xff; -} - -static inline void -be64enc(void * pp, uint64_t x) -{ - uint8_t * p = (uint8_t *)pp; - - p[7] = x & 0xff; - p[6] = (x >> 8) & 0xff; - p[5] = (x >> 16) & 0xff; - p[4] = (x >> 24) & 0xff; - p[3] = (x >> 32) & 0xff; - p[2] = (x >> 40) & 0xff; - p[1] = (x >> 48) & 0xff; - p[0] = (x >> 56) & 0xff; -} - -static inline uint32_t -le32dec(const void * pp) -{ - const uint8_t * p = (uint8_t const *)pp; - - return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + - ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); 
-} - -static inline void -le32enc(void * pp, uint32_t x) -{ - uint8_t * p = (uint8_t *)pp; - - p[0] = x & 0xff; - p[1] = (x >> 8) & 0xff; - p[2] = (x >> 16) & 0xff; - p[3] = (x >> 24) & 0xff; -} - -#endif /* !_SYSENDIAN_H_ */ diff --git a/algo/yespower/tests.c b/algo/yespower/tests.c deleted file mode 100644 index 3c394a8..0000000 --- a/algo/yespower/tests.c +++ /dev/null @@ -1,182 +0,0 @@ -/*- - * Copyright 2013-2018 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#include -#include - -#include "yespower.h" - -#undef TEST_PBKDF2_SHA256 - -#ifdef TEST_PBKDF2_SHA256 -#include - -#include "sha256.h" - -static void print_PBKDF2_SHA256_raw(const char *passwd, size_t passwdlen, - const char *salt, size_t saltlen, uint64_t c, size_t dkLen) -{ - uint8_t dk[64]; - size_t i; - - assert(dkLen <= sizeof(dk)); - - /* XXX This prints the strings truncated at first NUL */ - printf("PBKDF2_SHA256(\"%s\", \"%s\", %llu, %llu) = ", - passwd, salt, (unsigned long long)c, (unsigned long long)dkLen); - - PBKDF2_SHA256((const uint8_t *) passwd, passwdlen, - (const uint8_t *) salt, saltlen, c, dk, dkLen); - - for (i = 0; i < dkLen; i++) - printf("%02x%c", dk[i], i < dkLen - 1 ? ' ' : '\n'); -} - -static void print_PBKDF2_SHA256(const char *passwd, - const char *salt, uint64_t c, size_t dkLen) -{ - print_PBKDF2_SHA256_raw(passwd, strlen(passwd), salt, strlen(salt), c, - dkLen); -} -#endif - -static const char *pers_bsty_magic = "BSTY"; - -static void print_yespower(yespower_version_t version, uint32_t N, uint32_t r, - const char *pers) -{ - yespower_params_t params = { - .version = version, - .N = N, - .r = r, - .pers = (const uint8_t *)pers, - .perslen = pers ? strlen(pers) : 0 - }; - uint8_t src[80]; - yespower_binary_t dst; - size_t i; - - const char *q = (pers && pers != pers_bsty_magic) ? "\"": ""; - printf("yespower(%u, %u, %u, %s%s%s) = ", (unsigned int)version, N, r, - q, pers ? pers : "NULL", q); - - for (i = 0; i < sizeof(src); i++) - src[i] = i * 3; - - if (pers == pers_bsty_magic) { - params.pers = src; - params.perslen = sizeof(src); - } - - if (yespower_tls(src, sizeof(src), ¶ms, &dst)) { - puts("FAILED"); - return; - } - - for (i = 0; i < sizeof(dst); i++) - printf("%02x%c", dst.uc[i], i < sizeof(dst) - 1 ? ' ' : '\n'); -} - -static void print_yespower_loop(yespower_version_t version, const char *pers) -{ - uint32_t N, r; - uint8_t src[80]; - yespower_binary_t dst, xor = {{0}}; - size_t i; - - printf("XOR of yespower(%u, ...) 
= ", (unsigned int)version); - - for (i = 0; i < sizeof(src); i++) - src[i] = i * 3; - - for (N = 1024; N <= 4096; N <<= 1) { - for (r = 8; r <= 32; r++) { - yespower_params_t params = { - .version = version, - .N = N, - .r = r, - .pers = (const uint8_t *)pers, - .perslen = pers ? strlen(pers) : 0 - }; - if (yespower_tls(src, sizeof(src), ¶ms, &dst)) { - puts("FAILED"); - return; - } - for (i = 0; i < sizeof(xor); i++) - xor.uc[i] ^= dst.uc[i]; - } - } - - for (i = 0; i < sizeof(xor); i++) - printf("%02x%c", xor.uc[i], i < sizeof(xor) - 1 ? ' ' : '\n'); -} - -int main(void) -{ - setvbuf(stdout, NULL, _IOLBF, 0); - -#ifdef TEST_PBKDF2_SHA256 - print_PBKDF2_SHA256("password", "salt", 1, 20); - print_PBKDF2_SHA256("password", "salt", 2, 20); - print_PBKDF2_SHA256("password", "salt", 4096, 20); - print_PBKDF2_SHA256("password", "salt", 16777216, 20); - print_PBKDF2_SHA256("passwordPASSWORDpassword", - "saltSALTsaltSALTsaltSALTsaltSALTsalt", 4096, 25); - print_PBKDF2_SHA256_raw("pass\0word", 9, "sa\0lt", 5, 4096, 16); -#if 0 - print_PBKDF2_SHA256("password", "salt", 1, 32); - print_PBKDF2_SHA256("password", "salt", 2, 32); - print_PBKDF2_SHA256("password", "salt", 4096, 32); - print_PBKDF2_SHA256("password", "salt", 16777216, 32); - print_PBKDF2_SHA256("passwordPASSWORDpassword", - "saltSALTsaltSALTsaltSALTsaltSALTsalt", 4096, 40); - print_PBKDF2_SHA256("password", "salt", 4096, 16); - print_PBKDF2_SHA256("password", "salt", 1, 20); - print_PBKDF2_SHA256("password", "salt", 2, 20); - print_PBKDF2_SHA256("password", "salt", 4096, 20); - print_PBKDF2_SHA256("password", "salt", 16777216, 20); - print_PBKDF2_SHA256("password", "salt", 4096, 25); - print_PBKDF2_SHA256("password", "salt", 4096, 16); -#endif -#endif - - print_yespower(YESPOWER_0_5, 2048, 8, "Client Key"); /* yescrypt 0.5 */ - print_yespower(YESPOWER_0_5, 2048, 8, pers_bsty_magic); /* BSTY */ - print_yespower(YESPOWER_0_5, 4096, 16, "Client Key"); /* Cryply */ - print_yespower(YESPOWER_0_5, 4096, 24, 
"Jagaricoin"); - print_yespower(YESPOWER_0_5, 4096, 32, "WaviBanana"); - print_yespower(YESPOWER_0_5, 2048, 32, "Client Key"); - print_yespower(YESPOWER_0_5, 1024, 32, "Client Key"); - - print_yespower(YESPOWER_0_5, 2048, 8, NULL); /* no personality */ - - print_yespower(YESPOWER_1_0, 2048, 8, NULL); - print_yespower(YESPOWER_1_0, 4096, 16, NULL); - print_yespower(YESPOWER_1_0, 4096, 32, NULL); - print_yespower(YESPOWER_1_0, 2048, 32, NULL); - print_yespower(YESPOWER_1_0, 1024, 32, NULL); - - print_yespower(YESPOWER_1_0, 1024, 32, "personality test"); - - print_yespower_loop(YESPOWER_0_5, "Client Key"); - print_yespower_loop(YESPOWER_1_0, NULL); - - return 0; -} diff --git a/algo/yespower/yespower-opt.c b/algo/yespower/yespower-opt.c deleted file mode 100644 index b6f76ec..0000000 --- a/algo/yespower/yespower-opt.c +++ /dev/null @@ -1,1149 +0,0 @@ -/*- - * Copyright 2009 Colin Percival - * Copyright 2012-2018 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - * - * This is a proof-of-work focused fork of yescrypt, including optimized and - * cut-down implementation of the obsolete yescrypt 0.5 (based off its first - * submission to PHC back in 2014) and a new proof-of-work specific variation - * known as yespower 1.0. The former is intended as an upgrade for - * cryptocurrencies that already use yescrypt 0.5 and the latter may be used - * as a further upgrade (hard fork) by those and other cryptocurrencies. The - * version of algorithm to use is requested through parameters, allowing for - * both algorithms to co-exist in client and miner implementations (such as in - * preparation for a hard-fork). - */ - -#ifndef _YESPOWER_OPT_C_PASS_ -#define _YESPOWER_OPT_C_PASS_ 1 -#endif - -#if _YESPOWER_OPT_C_PASS_ == 1 -/* - * AVX and especially XOP speed up Salsa20 a lot, but needlessly result in - * extra instruction prefixes for pwxform (which we make more use of). While - * no slowdown from the prefixes is generally observed on AMD CPUs supporting - * XOP, some slowdown is sometimes observed on Intel CPUs with AVX. - */ -/* -#ifdef __XOP__ -#warning "Note: XOP is enabled. That's great." -#elif defined(__AVX__) -#warning "Note: AVX is enabled. That's OK." -#elif defined(__SSE2__) -#warning "Note: AVX and XOP are not enabled. That's OK." 
-#elif defined(__x86_64__) || defined(__i386__) -#warning "SSE2 not enabled. Expect poor performance." -#else -#warning "Note: building generic code for non-x86. That's OK." -#endif -*/ - -/* - * The SSE4 code version has fewer instructions than the generic SSE2 version, - * but all of the instructions are SIMD, thereby wasting the scalar execution - * units. Thus, the generic SSE2 version below actually runs faster on some - * CPUs due to its balanced mix of SIMD and scalar instructions. - */ -#undef USE_SSE4_FOR_32BIT - -#ifdef __SSE2__ -/* - * GCC before 4.9 would by default unnecessarily use store/load (without - * SSE4.1) or (V)PEXTR (with SSE4.1 or AVX) instead of simply (V)MOV. - * This was tracked as GCC bug 54349. - * "-mtune=corei7" works around this, but is only supported for GCC 4.6+. - * We use inline asm for pre-4.6 GCC, further down this file. - */ -#if __GNUC__ == 4 && __GNUC_MINOR__ >= 6 && __GNUC_MINOR__ < 9 && \ - !defined(__clang__) && !defined(__ICC) -#pragma GCC target ("tune=corei7") -#endif -#include -#ifdef __XOP__ -#include -#endif -#elif defined(__SSE__) -#include -#endif - -#include -#include -#include -#include - -#include "insecure_memzero.h" -#include "sha256_p.h" -#include "sysendian.h" - -#include "yespower.h" - -#include "yespower-platform.c" - -#if __STDC_VERSION__ >= 199901L -/* Have restrict */ -#elif defined(__GNUC__) -#define restrict __restrict -#else -#define restrict -#endif - -#ifdef __GNUC__ -#define unlikely(exp) __builtin_expect(exp, 0) -#else -#define unlikely(exp) (exp) -#endif - -#ifdef __SSE__ -#define PREFETCH(x, hint) _mm_prefetch((const char *)(x), (hint)); -#else -#undef PREFETCH -#endif - -typedef union { - uint32_t w[16]; - uint64_t d[8]; -#ifdef __SSE2__ - __m128i q[4]; -#endif -} salsa20_blk_t; - -static inline void salsa20_simd_shuffle(const salsa20_blk_t *Bin, - salsa20_blk_t *Bout) -{ -#define COMBINE(out, in1, in2) \ - Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32); - COMBINE(0, 
0, 2) - COMBINE(1, 5, 7) - COMBINE(2, 2, 4) - COMBINE(3, 7, 1) - COMBINE(4, 4, 6) - COMBINE(5, 1, 3) - COMBINE(6, 6, 0) - COMBINE(7, 3, 5) -#undef COMBINE -} - -static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin, - salsa20_blk_t *Bout) -{ -#define UNCOMBINE(out, in1, in2) \ - Bout->w[out * 2] = Bin->d[in1]; \ - Bout->w[out * 2 + 1] = Bin->d[in2] >> 32; - UNCOMBINE(0, 0, 6) - UNCOMBINE(1, 5, 3) - UNCOMBINE(2, 2, 0) - UNCOMBINE(3, 7, 5) - UNCOMBINE(4, 4, 2) - UNCOMBINE(5, 1, 7) - UNCOMBINE(6, 6, 4) - UNCOMBINE(7, 3, 1) -#undef UNCOMBINE -} - -#ifdef __SSE2__ -#define DECL_X \ - __m128i X0, X1, X2, X3; -#define DECL_Y \ - __m128i Y0, Y1, Y2, Y3; -#define READ_X(in) \ - X0 = (in).q[0]; X1 = (in).q[1]; X2 = (in).q[2]; X3 = (in).q[3]; -#define WRITE_X(out) \ - (out).q[0] = X0; (out).q[1] = X1; (out).q[2] = X2; (out).q[3] = X3; - -#ifdef __XOP__ -#define ARX(out, in1, in2, s) \ - out = _mm_xor_si128(out, _mm_roti_epi32(_mm_add_epi32(in1, in2), s)); -#else -#define ARX(out, in1, in2, s) { \ - __m128i tmp = _mm_add_epi32(in1, in2); \ - out = _mm_xor_si128(out, _mm_slli_epi32(tmp, s)); \ - out = _mm_xor_si128(out, _mm_srli_epi32(tmp, 32 - s)); \ -} -#endif - -#define SALSA20_2ROUNDS \ - /* Operate on "columns" */ \ - ARX(X1, X0, X3, 7) \ - ARX(X2, X1, X0, 9) \ - ARX(X3, X2, X1, 13) \ - ARX(X0, X3, X2, 18) \ - /* Rearrange data */ \ - X1 = _mm_shuffle_epi32(X1, 0x93); \ - X2 = _mm_shuffle_epi32(X2, 0x4E); \ - X3 = _mm_shuffle_epi32(X3, 0x39); \ - /* Operate on "rows" */ \ - ARX(X3, X0, X1, 7) \ - ARX(X2, X3, X0, 9) \ - ARX(X1, X2, X3, 13) \ - ARX(X0, X1, X2, 18) \ - /* Rearrange data */ \ - X1 = _mm_shuffle_epi32(X1, 0x39); \ - X2 = _mm_shuffle_epi32(X2, 0x4E); \ - X3 = _mm_shuffle_epi32(X3, 0x93); - -/** - * Apply the Salsa20 core to the block provided in (X0 ... X3). 
- */ -#define SALSA20_wrapper(out, rounds) { \ - __m128i Z0 = X0, Z1 = X1, Z2 = X2, Z3 = X3; \ - rounds \ - (out).q[0] = X0 = _mm_add_epi32(X0, Z0); \ - (out).q[1] = X1 = _mm_add_epi32(X1, Z1); \ - (out).q[2] = X2 = _mm_add_epi32(X2, Z2); \ - (out).q[3] = X3 = _mm_add_epi32(X3, Z3); \ -} - -/** - * Apply the Salsa20/2 core to the block provided in X. - */ -#define SALSA20_2(out) \ - SALSA20_wrapper(out, SALSA20_2ROUNDS) - -#define SALSA20_8ROUNDS \ - SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS - -/** - * Apply the Salsa20/8 core to the block provided in X. - */ -#define SALSA20_8(out) \ - SALSA20_wrapper(out, SALSA20_8ROUNDS) - -#define XOR_X(in) \ - X0 = _mm_xor_si128(X0, (in).q[0]); \ - X1 = _mm_xor_si128(X1, (in).q[1]); \ - X2 = _mm_xor_si128(X2, (in).q[2]); \ - X3 = _mm_xor_si128(X3, (in).q[3]); - -#define XOR_X_2(in1, in2) \ - X0 = _mm_xor_si128((in1).q[0], (in2).q[0]); \ - X1 = _mm_xor_si128((in1).q[1], (in2).q[1]); \ - X2 = _mm_xor_si128((in1).q[2], (in2).q[2]); \ - X3 = _mm_xor_si128((in1).q[3], (in2).q[3]); - -#define XOR_X_WRITE_XOR_Y_2(out, in) \ - (out).q[0] = Y0 = _mm_xor_si128((out).q[0], (in).q[0]); \ - (out).q[1] = Y1 = _mm_xor_si128((out).q[1], (in).q[1]); \ - (out).q[2] = Y2 = _mm_xor_si128((out).q[2], (in).q[2]); \ - (out).q[3] = Y3 = _mm_xor_si128((out).q[3], (in).q[3]); \ - X0 = _mm_xor_si128(X0, Y0); \ - X1 = _mm_xor_si128(X1, Y1); \ - X2 = _mm_xor_si128(X2, Y2); \ - X3 = _mm_xor_si128(X3, Y3); - -#define INTEGERIFY _mm_cvtsi128_si32(X0) - -#else /* !defined(__SSE2__) */ - -#define DECL_X \ - salsa20_blk_t X; -#define DECL_Y \ - salsa20_blk_t Y; - -#define COPY(out, in) \ - (out).d[0] = (in).d[0]; \ - (out).d[1] = (in).d[1]; \ - (out).d[2] = (in).d[2]; \ - (out).d[3] = (in).d[3]; \ - (out).d[4] = (in).d[4]; \ - (out).d[5] = (in).d[5]; \ - (out).d[6] = (in).d[6]; \ - (out).d[7] = (in).d[7]; - -#define READ_X(in) COPY(X, in) -#define WRITE_X(out) COPY(out, X) - -/** - * salsa20(B): - * Apply the Salsa20 core to the provided 
block. - */ -static inline void salsa20(salsa20_blk_t *restrict B, - salsa20_blk_t *restrict Bout, uint32_t doublerounds) -{ - salsa20_blk_t X; -#define x X.w - - salsa20_simd_unshuffle(B, &X); - - do { -#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) - /* Operate on columns */ - x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); - x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); - - x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); - x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); - - x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); - x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); - - x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); - x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); - - /* Operate on rows */ - x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); - x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); - - x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); - x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); - - x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); - x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); - - x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); - x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18); -#undef R - } while (--doublerounds); -#undef x - - { - uint32_t i; - salsa20_simd_shuffle(&X, Bout); - for (i = 0; i < 16; i += 4) { - B->w[i] = Bout->w[i] += B->w[i]; - B->w[i + 1] = Bout->w[i + 1] += B->w[i + 1]; - B->w[i + 2] = Bout->w[i + 2] += B->w[i + 2]; - B->w[i + 3] = Bout->w[i + 3] += B->w[i + 3]; - } - } -} - -/** - * Apply the Salsa20/2 core to the block provided in X. - */ -#define SALSA20_2(out) \ - salsa20(&X, &out, 1); - -/** - * Apply the Salsa20/8 core to the block provided in X. 
- */ -#define SALSA20_8(out) \ - salsa20(&X, &out, 4); - -#define XOR(out, in1, in2) \ - (out).d[0] = (in1).d[0] ^ (in2).d[0]; \ - (out).d[1] = (in1).d[1] ^ (in2).d[1]; \ - (out).d[2] = (in1).d[2] ^ (in2).d[2]; \ - (out).d[3] = (in1).d[3] ^ (in2).d[3]; \ - (out).d[4] = (in1).d[4] ^ (in2).d[4]; \ - (out).d[5] = (in1).d[5] ^ (in2).d[5]; \ - (out).d[6] = (in1).d[6] ^ (in2).d[6]; \ - (out).d[7] = (in1).d[7] ^ (in2).d[7]; - -#define XOR_X(in) XOR(X, X, in) -#define XOR_X_2(in1, in2) XOR(X, in1, in2) -#define XOR_X_WRITE_XOR_Y_2(out, in) \ - XOR(Y, out, in) \ - COPY(out, Y) \ - XOR(X, X, Y) - -#define INTEGERIFY (uint32_t)X.d[0] -#endif - -/** - * Apply the Salsa20 core to the block provided in X ^ in. - */ -#define SALSA20_XOR_MEM(in, out) \ - XOR_X(in) \ - SALSA20(out) - -#define SALSA20 SALSA20_8 -#else /* pass 2 */ -#undef SALSA20 -#define SALSA20 SALSA20_2 -#endif - -/** - * blockmix_salsa(Bin, Bout): - * Compute Bout = BlockMix_{salsa20, 1}(Bin). The input Bin must be 128 - * bytes in length; the output Bout must also be the same size. - */ -static inline void blockmix_salsa(const salsa20_blk_t *restrict Bin, - salsa20_blk_t *restrict Bout) -{ - DECL_X - - READ_X(Bin[1]) - SALSA20_XOR_MEM(Bin[0], Bout[0]) - SALSA20_XOR_MEM(Bin[1], Bout[1]) -} - -static inline uint32_t blockmix_salsa_xor(const salsa20_blk_t *restrict Bin1, - const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout) -{ - DECL_X - - XOR_X_2(Bin1[1], Bin2[1]) - XOR_X(Bin1[0]) - SALSA20_XOR_MEM(Bin2[0], Bout[0]) - XOR_X(Bin1[1]) - SALSA20_XOR_MEM(Bin2[1], Bout[1]) - - return INTEGERIFY; -} - -#if _YESPOWER_OPT_C_PASS_ == 1 -/* This is tunable, but it is part of what defines a yespower version */ -/* Version 0.5 */ -#define Swidth_0_5 8 -/* Version 1.0 */ -#define Swidth_1_0 11 - -/* Not tunable in this implementation, hard-coded in a few places */ -#define PWXsimple 2 -#define PWXgather 4 - -/* Derived value. Not tunable on its own. 
*/ -#define PWXbytes (PWXgather * PWXsimple * 8) - -/* (Maybe-)runtime derived values. Not tunable on their own. */ -#define Swidth_to_Sbytes1(Swidth) ((1 << (Swidth)) * PWXsimple * 8) -#define Swidth_to_Smask(Swidth) (((1 << (Swidth)) - 1) * PWXsimple * 8) -#define Smask_to_Smask2(Smask) (((uint64_t)(Smask) << 32) | (Smask)) - -/* These should be compile-time derived */ -#define Smask2_0_5 Smask_to_Smask2(Swidth_to_Smask(Swidth_0_5)) -#define Smask2_1_0 Smask_to_Smask2(Swidth_to_Smask(Swidth_1_0)) - -typedef struct { - uint8_t *S0, *S1, *S2; - size_t w; - uint32_t Sbytes; -} pwxform_ctx_t; - -#define DECL_SMASK2REG /* empty */ -#define MAYBE_MEMORY_BARRIER /* empty */ - -#ifdef __SSE2__ -/* - * (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs - * starting with Sandy Bridge. Additionally, PSHUFD uses separate source and - * destination registers, whereas the shifts would require an extra move - * instruction for our code when building without AVX. Unfortunately, PSHUFD - * is much slower on Conroe (4 cycles latency vs. 1 cycle latency for PSRLQ) - * and somewhat slower on some non-Intel CPUs (luckily not including AMD - * Bulldozer and Piledriver). - */ -#ifdef __AVX__ -#define HI32(X) \ - _mm_srli_si128((X), 4) -#elif 1 /* As an option, check for __SSE4_1__ here not to hurt Conroe */ -#define HI32(X) \ - _mm_shuffle_epi32((X), _MM_SHUFFLE(2,3,0,1)) -#else -#define HI32(X) \ - _mm_srli_epi64((X), 32) -#endif - -#if defined(__x86_64__) && \ - __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__ICC) -#ifdef __AVX__ -#define MOVQ "vmovq" -#else -/* "movq" would be more correct, but "movd" is supported by older binutils - * due to an error in AMD's spec for x86-64. 
*/ -#define MOVQ "movd" -#endif -#define EXTRACT64(X) ({ \ - uint64_t result; \ - __asm__(MOVQ " %1, %0" : "=r" (result) : "x" (X)); \ - result; \ -}) -#elif defined(__x86_64__) && !defined(_MSC_VER) && !defined(__OPEN64__) -/* MSVC and Open64 had bugs */ -#define EXTRACT64(X) _mm_cvtsi128_si64(X) -#elif defined(__x86_64__) && defined(__SSE4_1__) -/* No known bugs for this intrinsic */ -#include -#define EXTRACT64(X) _mm_extract_epi64((X), 0) -#elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__) -/* 32-bit */ -#include -#if 0 -/* This is currently unused by the code below, which instead uses these two - * intrinsics explicitly when (!defined(__x86_64__) && defined(__SSE4_1__)) */ -#define EXTRACT64(X) \ - ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ - ((uint64_t)(uint32_t)_mm_extract_epi32((X), 1) << 32)) -#endif -#else -/* 32-bit or compilers with known past bugs in _mm_cvtsi128_si64() */ -#define EXTRACT64(X) \ - ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ - ((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32)) -#endif - -#if defined(__x86_64__) && (defined(__AVX__) || !defined(__GNUC__)) -/* 64-bit with AVX */ -/* Force use of 64-bit AND instead of two 32-bit ANDs */ -#undef DECL_SMASK2REG -#if defined(__GNUC__) && !defined(__ICC) -#define DECL_SMASK2REG uint64_t Smask2reg = Smask2; -/* Force use of lower-numbered registers to reduce number of prefixes, relying - * on out-of-order execution and register renaming. 
*/ -#define FORCE_REGALLOC_1 \ - __asm__("" : "=a" (x), "+d" (Smask2reg), "+S" (S0), "+D" (S1)); -#define FORCE_REGALLOC_2 \ - __asm__("" : : "c" (lo)); -#else -static volatile uint64_t Smask2var = Smask2; -#define DECL_SMASK2REG uint64_t Smask2reg = Smask2var; -#define FORCE_REGALLOC_1 /* empty */ -#define FORCE_REGALLOC_2 /* empty */ -#endif -#define PWXFORM_SIMD(X) { \ - uint64_t x; \ - FORCE_REGALLOC_1 \ - uint32_t lo = x = EXTRACT64(X) & Smask2reg; \ - FORCE_REGALLOC_2 \ - uint32_t hi = x >> 32; \ - X = _mm_mul_epu32(HI32(X), X); \ - X = _mm_add_epi64(X, *(__m128i *)(S0 + lo)); \ - X = _mm_xor_si128(X, *(__m128i *)(S1 + hi)); \ -} -#elif defined(__x86_64__) -/* 64-bit without AVX. This relies on out-of-order execution and register - * renaming. It may actually be fastest on CPUs with AVX(2) as well - e.g., - * it runs great on Haswell. */ -#warning "Note: using x86-64 inline assembly for pwxform. That's great." -#undef MAYBE_MEMORY_BARRIER -#define MAYBE_MEMORY_BARRIER \ - __asm__("" : : : "memory"); -#define PWXFORM_SIMD(X) { \ - __m128i H; \ - __asm__( \ - "movd %0, %%rax\n\t" \ - "pshufd $0xb1, %0, %1\n\t" \ - "andq %2, %%rax\n\t" \ - "pmuludq %1, %0\n\t" \ - "movl %%eax, %%ecx\n\t" \ - "shrq $0x20, %%rax\n\t" \ - "paddq (%3,%%rcx), %0\n\t" \ - "pxor (%4,%%rax), %0\n\t" \ - : "+x" (X), "=x" (H) \ - : "d" (Smask2), "S" (S0), "D" (S1) \ - : "cc", "ax", "cx"); \ -} -#elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__) -/* 32-bit with SSE4.1 */ -#define PWXFORM_SIMD(X) { \ - __m128i x = _mm_and_si128(X, _mm_set1_epi64x(Smask2)); \ - __m128i s0 = *(__m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x)); \ - __m128i s1 = *(__m128i *)(S1 + (uint32_t)_mm_extract_epi32(x, 1)); \ - X = _mm_mul_epu32(HI32(X), X); \ - X = _mm_add_epi64(X, s0); \ - X = _mm_xor_si128(X, s1); \ -} -#else -/* 32-bit without SSE4.1 */ -#define PWXFORM_SIMD(X) { \ - uint64_t x = EXTRACT64(X) & Smask2; \ - __m128i s0 = *(__m128i *)(S0 + (uint32_t)x); \ - __m128i s1 = *(__m128i *)(S1 + (x >> 
32)); \ - X = _mm_mul_epu32(HI32(X), X); \ - X = _mm_add_epi64(X, s0); \ - X = _mm_xor_si128(X, s1); \ -} -#endif - -#define PWXFORM_SIMD_WRITE(X, Sw) \ - PWXFORM_SIMD(X) \ - MAYBE_MEMORY_BARRIER \ - *(__m128i *)(Sw + w) = X; \ - MAYBE_MEMORY_BARRIER - -#define PWXFORM_ROUND \ - PWXFORM_SIMD(X0) \ - PWXFORM_SIMD(X1) \ - PWXFORM_SIMD(X2) \ - PWXFORM_SIMD(X3) - -#define PWXFORM_ROUND_WRITE4 \ - PWXFORM_SIMD_WRITE(X0, S0) \ - PWXFORM_SIMD_WRITE(X1, S1) \ - w += 16; \ - PWXFORM_SIMD_WRITE(X2, S0) \ - PWXFORM_SIMD_WRITE(X3, S1) \ - w += 16; - -#define PWXFORM_ROUND_WRITE2 \ - PWXFORM_SIMD_WRITE(X0, S0) \ - PWXFORM_SIMD_WRITE(X1, S1) \ - w += 16; \ - PWXFORM_SIMD(X2) \ - PWXFORM_SIMD(X3) - -#else /* !defined(__SSE2__) */ - -#define PWXFORM_SIMD(x0, x1) { \ - uint64_t x = x0 & Smask2; \ - uint64_t *p0 = (uint64_t *)(S0 + (uint32_t)x); \ - uint64_t *p1 = (uint64_t *)(S1 + (x >> 32)); \ - x0 = ((x0 >> 32) * (uint32_t)x0 + p0[0]) ^ p1[0]; \ - x1 = ((x1 >> 32) * (uint32_t)x1 + p0[1]) ^ p1[1]; \ -} - -#define PWXFORM_SIMD_WRITE(x0, x1, Sw) \ - PWXFORM_SIMD(x0, x1) \ - ((uint64_t *)(Sw + w))[0] = x0; \ - ((uint64_t *)(Sw + w))[1] = x1; - -#define PWXFORM_ROUND \ - PWXFORM_SIMD(X.d[0], X.d[1]) \ - PWXFORM_SIMD(X.d[2], X.d[3]) \ - PWXFORM_SIMD(X.d[4], X.d[5]) \ - PWXFORM_SIMD(X.d[6], X.d[7]) - -#define PWXFORM_ROUND_WRITE4 \ - PWXFORM_SIMD_WRITE(X.d[0], X.d[1], S0) \ - PWXFORM_SIMD_WRITE(X.d[2], X.d[3], S1) \ - w += 16; \ - PWXFORM_SIMD_WRITE(X.d[4], X.d[5], S0) \ - PWXFORM_SIMD_WRITE(X.d[6], X.d[7], S1) \ - w += 16; - -#define PWXFORM_ROUND_WRITE2 \ - PWXFORM_SIMD_WRITE(X.d[0], X.d[1], S0) \ - PWXFORM_SIMD_WRITE(X.d[2], X.d[3], S1) \ - w += 16; \ - PWXFORM_SIMD(X.d[4], X.d[5]) \ - PWXFORM_SIMD(X.d[6], X.d[7]) -#endif - -#define PWXFORM \ - PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND \ - PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND - -#define Smask2 Smask2_0_5 - -#else /* pass 2 */ - -#undef PWXFORM -#define PWXFORM \ - PWXFORM_ROUND_WRITE4 PWXFORM_ROUND_WRITE2 PWXFORM_ROUND_WRITE2 \ 
- w &= Smask2; \ - { \ - uint8_t *Stmp = S2; \ - S2 = S1; \ - S1 = S0; \ - S0 = Stmp; \ - } - -#undef Smask2 -#define Smask2 Smask2_1_0 - -#endif - -/** - * blockmix_pwxform(Bin, Bout, r, S): - * Compute Bout = BlockMix_pwxform{salsa20, r, S}(Bin). The input Bin must - * be 128r bytes in length; the output Bout must also be the same size. - */ -static void blockmix(const salsa20_blk_t *restrict Bin, - salsa20_blk_t *restrict Bout, size_t r, pwxform_ctx_t *restrict ctx) -{ - if (unlikely(!ctx)) { - blockmix_salsa(Bin, Bout); - return; - } - - uint8_t *S0 = ctx->S0, *S1 = ctx->S1; -#if _YESPOWER_OPT_C_PASS_ > 1 - uint8_t *S2 = ctx->S2; - size_t w = ctx->w; -#endif - size_t i; - DECL_X - - /* Convert count of 128-byte blocks to max index of 64-byte block */ - r = r * 2 - 1; - - READ_X(Bin[r]) - - DECL_SMASK2REG - - i = 0; - do { - XOR_X(Bin[i]) - PWXFORM - if (unlikely(i >= r)) - break; - WRITE_X(Bout[i]) - i++; - } while (1); - -#if _YESPOWER_OPT_C_PASS_ > 1 - ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; - ctx->w = w; -#endif - - SALSA20(Bout[i]) -} - -static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1, - const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, - size_t r, pwxform_ctx_t *restrict ctx) -{ - if (unlikely(!ctx)) - return blockmix_salsa_xor(Bin1, Bin2, Bout); - - uint8_t *S0 = ctx->S0, *S1 = ctx->S1; -#if _YESPOWER_OPT_C_PASS_ > 1 - uint8_t *S2 = ctx->S2; - size_t w = ctx->w; -#endif - size_t i; - DECL_X - - /* Convert count of 128-byte blocks to max index of 64-byte block */ - r = r * 2 - 1; - -#ifdef PREFETCH - PREFETCH(&Bin2[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i], _MM_HINT_T0) - } -#endif - - XOR_X_2(Bin1[r], Bin2[r]) - - DECL_SMASK2REG - - i = 0; - r--; - do { - XOR_X(Bin1[i]) - XOR_X(Bin2[i]) - PWXFORM - WRITE_X(Bout[i]) - - XOR_X(Bin1[i + 1]) - XOR_X(Bin2[i + 1]) - PWXFORM - - if (unlikely(i >= r)) - break; - - WRITE_X(Bout[i + 1]) - - i += 2; - } while (1); - i++; - -#if _YESPOWER_OPT_C_PASS_ > 1 - 
ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; - ctx->w = w; -#endif - - SALSA20(Bout[i]) - - return INTEGERIFY; -} - -static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out, - salsa20_blk_t *restrict Bin2, - size_t r, pwxform_ctx_t *restrict ctx) -{ - uint8_t *S0 = ctx->S0, *S1 = ctx->S1; -#if _YESPOWER_OPT_C_PASS_ > 1 - uint8_t *S2 = ctx->S2; - size_t w = ctx->w; -#endif - size_t i; - DECL_X - DECL_Y - - /* Convert count of 128-byte blocks to max index of 64-byte block */ - r = r * 2 - 1; - -#ifdef PREFETCH - PREFETCH(&Bin2[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i], _MM_HINT_T0) - } -#endif - - XOR_X_2(Bin1out[r], Bin2[r]) - - DECL_SMASK2REG - - i = 0; - r--; - do { - XOR_X_WRITE_XOR_Y_2(Bin2[i], Bin1out[i]) - PWXFORM - WRITE_X(Bin1out[i]) - - XOR_X_WRITE_XOR_Y_2(Bin2[i + 1], Bin1out[i + 1]) - PWXFORM - - if (unlikely(i >= r)) - break; - - WRITE_X(Bin1out[i + 1]) - - i += 2; - } while (1); - i++; - -#if _YESPOWER_OPT_C_PASS_ > 1 - ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; - ctx->w = w; -#endif - - SALSA20(Bin1out[i]) - - return INTEGERIFY; -} - -#if _YESPOWER_OPT_C_PASS_ == 1 -/** - * integerify(B, r): - * Return the result of parsing B_{2r-1} as a little-endian integer. - */ -static inline uint32_t integerify(const salsa20_blk_t *B, size_t r) -{ -/* - * Our 64-bit words are in host byte order, which is why we don't just read - * w[0] here (would be wrong on big-endian). Also, our 32-bit words are - * SIMD-shuffled, but we only care about the least significant 32 bits anyway. - */ - return (uint32_t)B[2 * r - 1].d[0]; -} -#endif - -/** - * smix1(B, r, N, V, XY, S): - * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage XY must be 128r+64 bytes in length. N must be even and at least 4. - * The array V must be aligned to a multiple of 64 bytes, and arrays B and XY - * to a multiple of at least 16 bytes. 
- */ -static void smix1(uint8_t *B, size_t r, uint32_t N, - salsa20_blk_t *V, salsa20_blk_t *XY, pwxform_ctx_t *ctx) -{ - size_t s = 2 * r; - salsa20_blk_t *X = V, *Y = &V[s], *V_j; - uint32_t i, j, n; - -#if _YESPOWER_OPT_C_PASS_ == 1 - for (i = 0; i < 2 * r; i++) { -#else - for (i = 0; i < 2; i++) { -#endif - const salsa20_blk_t *src = (salsa20_blk_t *)&B[i * 64]; - salsa20_blk_t *tmp = Y; - salsa20_blk_t *dst = &X[i]; - size_t k; - for (k = 0; k < 16; k++) - tmp->w[k] = le32dec(&src->w[k]); - salsa20_simd_shuffle(tmp, dst); - } - -#if _YESPOWER_OPT_C_PASS_ > 1 - for (i = 1; i < r; i++) - blockmix(&X[(i - 1) * 2], &X[i * 2], 1, ctx); -#endif - - blockmix(X, Y, r, ctx); - X = Y + s; - blockmix(Y, X, r, ctx); - j = integerify(X, r); - - for (n = 2; n < N; n <<= 1) { - uint32_t m = (n < N / 2) ? n : (N - 1 - n); - for (i = 1; i < m; i += 2) { - Y = X + s; - j &= n - 1; - j += i - 1; - V_j = &V[j * s]; - j = blockmix_xor(X, V_j, Y, r, ctx); - j &= n - 1; - j += i; - V_j = &V[j * s]; - X = Y + s; - j = blockmix_xor(Y, V_j, X, r, ctx); - } - } - n >>= 1; - - j &= n - 1; - j += N - 2 - n; - V_j = &V[j * s]; - Y = X + s; - j = blockmix_xor(X, V_j, Y, r, ctx); - j &= n - 1; - j += N - 1 - n; - V_j = &V[j * s]; - blockmix_xor(Y, V_j, XY, r, ctx); - - for (i = 0; i < 2 * r; i++) { - const salsa20_blk_t *src = &XY[i]; - salsa20_blk_t *tmp = &XY[s]; - salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64]; - size_t k; - for (k = 0; k < 16; k++) - le32enc(&tmp->w[k], src->w[k]); - salsa20_simd_unshuffle(tmp, dst); - } -} - -/** - * smix2(B, r, N, Nloop, V, XY, S): - * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage XY must be 256r bytes in length. N must be a power of 2 and at - * least 2. Nloop must be even. The array V must be aligned to a multiple of - * 64 bytes, and arrays B and XY to a multiple of at least 16 bytes. 
- */ -static void smix2(uint8_t *B, size_t r, uint32_t N, uint32_t Nloop, - salsa20_blk_t *V, salsa20_blk_t *XY, pwxform_ctx_t *ctx) -{ - size_t s = 2 * r; - salsa20_blk_t *X = XY, *Y = &XY[s]; - uint32_t i, j; - - for (i = 0; i < 2 * r; i++) { - const salsa20_blk_t *src = (salsa20_blk_t *)&B[i * 64]; - salsa20_blk_t *tmp = Y; - salsa20_blk_t *dst = &X[i]; - size_t k; - for (k = 0; k < 16; k++) - tmp->w[k] = le32dec(&src->w[k]); - salsa20_simd_shuffle(tmp, dst); - } - - j = integerify(X, r) & (N - 1); - -#if _YESPOWER_OPT_C_PASS_ == 1 - if (Nloop > 2) { -#endif - do { - salsa20_blk_t *V_j = &V[j * s]; - j = blockmix_xor_save(X, V_j, r, ctx) & (N - 1); - V_j = &V[j * s]; - j = blockmix_xor_save(X, V_j, r, ctx) & (N - 1); - } while (Nloop -= 2); -#if _YESPOWER_OPT_C_PASS_ == 1 - } else { - do { - const salsa20_blk_t * V_j = &V[j * s]; - j = blockmix_xor(X, V_j, Y, r, ctx) & (N - 1); - V_j = &V[j * s]; - j = blockmix_xor(Y, V_j, X, r, ctx) & (N - 1); - } while (Nloop -= 2); - } -#endif - - for (i = 0; i < 2 * r; i++) { - const salsa20_blk_t *src = &X[i]; - salsa20_blk_t *tmp = Y; - salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64]; - size_t k; - for (k = 0; k < 16; k++) - le32enc(&tmp->w[k], src->w[k]); - salsa20_simd_unshuffle(tmp, dst); - } -} - -/** - * smix(B, r, N, V, XY, S): - * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the - * temporary storage V must be 128rN bytes in length; the temporary storage - * XY must be 256r bytes in length. N must be a power of 2 and at least 16. - * The array V must be aligned to a multiple of 64 bytes, and arrays B and XY - * to a multiple of at least 16 bytes (aligning them to 64 bytes as well saves - * cache lines, but it might also result in cache bank conflicts). 
- */ -static void smix(uint8_t *B, size_t r, uint32_t N, - salsa20_blk_t *V, salsa20_blk_t *XY, pwxform_ctx_t *ctx) -{ -#if _YESPOWER_OPT_C_PASS_ == 1 - uint32_t Nloop_all = (N + 2) / 3; /* 1/3, round up */ - uint32_t Nloop_rw = Nloop_all; - - Nloop_all++; Nloop_all &= ~(uint32_t)1; /* round up to even */ - Nloop_rw &= ~(uint32_t)1; /* round down to even */ -#else - uint32_t Nloop_rw = (N + 2) / 3; /* 1/3, round up */ - Nloop_rw++; Nloop_rw &= ~(uint32_t)1; /* round up to even */ -#endif - - smix1(B, 1, ctx->Sbytes / 128, (salsa20_blk_t *)ctx->S0, XY, NULL); - smix1(B, r, N, V, XY, ctx); - smix2(B, r, N, Nloop_rw /* must be > 2 */, V, XY, ctx); -#if _YESPOWER_OPT_C_PASS_ == 1 - if (Nloop_all > Nloop_rw) - smix2(B, r, N, 2, V, XY, ctx); -#endif -} - -#if _YESPOWER_OPT_C_PASS_ == 1 -#undef _YESPOWER_OPT_C_PASS_ -#define _YESPOWER_OPT_C_PASS_ 2 -#define blockmix_salsa blockmix_salsa_1_0 -#define blockmix_salsa_xor blockmix_salsa_xor_1_0 -#define blockmix blockmix_1_0 -#define blockmix_xor blockmix_xor_1_0 -#define blockmix_xor_save blockmix_xor_save_1_0 -#define smix1 smix1_1_0 -#define smix2 smix2_1_0 -#define smix smix_1_0 -#include "yespower-opt.c" -#undef smix - -/** - * yespower(local, src, srclen, params, dst): - * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target". - * local is the thread-local data structure, allowing to preserve and reuse a - * memory allocation across calls, thereby reducing its overhead. - * - * Return 0 on success; or -1 on error. 
- */ -int yespower(yespower_local_t *local, - const uint8_t *src, size_t srclen, - const yespower_params_t *params, - yespower_binary_t *dst) -{ - yespower_version_t version = params->version; - uint32_t N = params->N; - uint32_t r = params->r; - const uint8_t *pers = params->pers; - size_t perslen = params->perslen; - uint32_t Swidth; - size_t B_size, V_size, XY_size, need; - uint8_t *B, *S; - salsa20_blk_t *V, *XY; - pwxform_ctx_t ctx; - uint8_t sha256[32]; - - /* Sanity-check parameters */ - if ((version != YESPOWER_0_5 && version != YESPOWER_1_0) || - N < 1024 || N > 512 * 1024 || r < 8 || r > 32 || - (N & (N - 1)) != 0 || - (!pers && perslen)) { - errno = EINVAL; - return -1; - } - - /* Allocate memory */ - B_size = (size_t)128 * r; - V_size = B_size * N; - if (version == YESPOWER_0_5) { - XY_size = B_size * 2; - Swidth = Swidth_0_5; - ctx.Sbytes = 2 * Swidth_to_Sbytes1(Swidth); - } else { - XY_size = B_size + 64; - Swidth = Swidth_1_0; - ctx.Sbytes = 3 * Swidth_to_Sbytes1(Swidth); - } - need = B_size + V_size + XY_size + ctx.Sbytes; - if (local->aligned_size < need) { - if (free_region(local)) - return -1; - if (!alloc_region(local, need)) - return -1; - } - B = (uint8_t *)local->aligned; - V = (salsa20_blk_t *)((uint8_t *)B + B_size); - XY = (salsa20_blk_t *)((uint8_t *)V + V_size); - S = (uint8_t *)XY + XY_size; - ctx.S0 = S; - ctx.S1 = S + Swidth_to_Sbytes1(Swidth); - - SHA256_Buf(src, srclen, sha256); - - if (version == YESPOWER_0_5) { - PBKDF2_SHA256(sha256, sizeof(sha256), src, srclen, 1, - B, B_size); - memcpy(sha256, B, sizeof(sha256)); - smix(B, r, N, V, XY, &ctx); - PBKDF2_SHA256(sha256, sizeof(sha256), B, B_size, 1, - (uint8_t *)dst, sizeof(*dst)); - - if (pers) { - HMAC_SHA256_Buf(dst, sizeof(*dst), pers, perslen, - sha256); - SHA256_Buf(sha256, sizeof(sha256), (uint8_t *)dst); - } - } else { - ctx.S2 = S + 2 * Swidth_to_Sbytes1(Swidth); - ctx.w = 0; - - if (pers) { - src = pers; - srclen = perslen; - } else { - srclen = 0; - } - - 
PBKDF2_SHA256(sha256, sizeof(sha256), src, srclen, 1, B, 128); - memcpy(sha256, B, sizeof(sha256)); - smix_1_0(B, r, N, V, XY, &ctx); - HMAC_SHA256_Buf(B + B_size - 64, 64, - sha256, sizeof(sha256), (uint8_t *)dst); - } - - /* Success! */ - return 0; -} - -/** - * yespower_tls(src, srclen, params, dst): - * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target". - * The memory allocation is maintained internally using thread-local storage. - * - * Return 0 on success; or -1 on error. - */ -int yespower_tls(const uint8_t *src, size_t srclen, - const yespower_params_t *params, yespower_binary_t *dst) -{ - static __thread int initialized = 0; - static __thread yespower_local_t local; - - if (!initialized) { - if (yespower_init_local(&local)) - return -1; - initialized = 1; - } - - return yespower(&local, src, srclen, params, dst); -} - -int yespower_init_local(yespower_local_t *local) -{ - init_region(local); - return 0; -} - -int yespower_free_local(yespower_local_t *local) -{ - return free_region(local); -} -#endif diff --git a/algo/yespower/yespower-platform.c b/algo/yespower/yespower-platform.c deleted file mode 100644 index 5985791..0000000 --- a/algo/yespower/yespower-platform.c +++ /dev/null @@ -1,108 +0,0 @@ -/*- - * Copyright 2013-2018 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#ifdef __unix__ -#include -#endif - -#define HUGEPAGE_THRESHOLD (12 * 1024 * 1024) - -#ifdef __x86_64__ -#define HUGEPAGE_SIZE (2 * 1024 * 1024) -#else -#undef HUGEPAGE_SIZE -#endif - -static void *alloc_region(yespower_region_t *region, size_t size) -{ - size_t base_size = size; - uint8_t *base, *aligned; -#ifdef MAP_ANON - int flags = -#ifdef MAP_NOCORE - MAP_NOCORE | -#endif - MAP_ANON | MAP_PRIVATE; -#if defined(MAP_HUGETLB) && defined(HUGEPAGE_SIZE) - size_t new_size = size; - const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1; - if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) { - flags |= MAP_HUGETLB; -/* - * Linux's munmap() fails on MAP_HUGETLB mappings if size is not a multiple of - * huge page size, so let's round up to huge page size here. 
- */ - new_size = size + hugepage_mask; - new_size &= ~hugepage_mask; - } - base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0); - if (base != MAP_FAILED) { - base_size = new_size; - - } else if (flags & MAP_HUGETLB) { - flags &= ~MAP_HUGETLB; - base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); - } - -#else - base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); -#endif - if (base == MAP_FAILED) - base = NULL; - aligned = base; -#elif defined(HAVE_POSIX_MEMALIGN) - if ((errno = posix_memalign((void **)&base, 64, size)) != 0) - base = NULL; - aligned = base; -#else - base = aligned = NULL; - if (size + 63 < size) { - errno = ENOMEM; - } else if ((base = malloc(size + 63)) != NULL) { - aligned = base + 63; - aligned -= (uintptr_t)aligned & 63; - } -#endif - region->base = base; - region->aligned = aligned; - region->base_size = base ? base_size : 0; - region->aligned_size = base ? size : 0; - return aligned; -} - -static inline void init_region(yespower_region_t *region) -{ - region->base = region->aligned = NULL; - region->base_size = region->aligned_size = 0; -} - -static int free_region(yespower_region_t *region) -{ - if (region->base) { -#ifdef MAP_ANON - if (munmap(region->base, region->base_size)) - return -1; -#else - free(region->base); -#endif - } - init_region(region); - return 0; -} diff --git a/algo/yespower/yespower-ref.c b/algo/yespower/yespower-ref.c deleted file mode 100644 index bec75c5..0000000 --- a/algo/yespower/yespower-ref.c +++ /dev/null @@ -1,581 +0,0 @@ -/*- - * Copyright 2009 Colin Percival - * Copyright 2013-2018 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - * - * This is a proof-of-work focused fork of yescrypt, including reference and - * cut-down implementation of the obsolete yescrypt 0.5 (based off its first - * submission to PHC back in 2014) and a new proof-of-work specific variation - * known as yespower 1.0. The former is intended as an upgrade for - * cryptocurrencies that already use yescrypt 0.5 and the latter may be used - * as a further upgrade (hard fork) by those and other cryptocurrencies. The - * version of algorithm to use is requested through parameters, allowing for - * both algorithms to co-exist in client and miner implementations (such as in - * preparation for a hard-fork). - * - * This is the reference implementation. Its purpose is to provide a simple - * human- and machine-readable specification that implementations intended - * for actual use should be tested against. 
It is deliberately mostly not - * optimized, and it is not meant to be used in production. Instead, use - * yespower-opt.c. - */ - -#warning "This reference implementation is deliberately mostly not optimized. Use yespower-opt.c instead unless you're testing (against) the reference implementation on purpose." - -#include -#include -#include -#include - -#include "sha256_p.h" -#include "sysendian.h" - -#include "yespower.h" - -static void blkcpy(uint32_t *dst, const uint32_t *src, size_t count) -{ - do { - *dst++ = *src++; - } while (--count); -} - -static void blkxor(uint32_t *dst, const uint32_t *src, size_t count) -{ - do { - *dst++ ^= *src++; - } while (--count); -} - -/** - * salsa20(B): - * Apply the Salsa20 core to the provided block. - */ -static void salsa20(uint32_t B[16], uint32_t rounds) -{ - uint32_t x[16]; - size_t i; - - /* SIMD unshuffle */ - for (i = 0; i < 16; i++) - x[i * 5 % 16] = B[i]; - - for (i = 0; i < rounds; i += 2) { -#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) - /* Operate on columns */ - x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); - x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); - - x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); - x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); - - x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); - x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); - - x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); - x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); - - /* Operate on rows */ - x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); - x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); - - x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); - x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); - - x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); - x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); - - x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); - x[14] ^= R(x[13]+x[12],13); x[15] ^= 
R(x[14]+x[13],18); -#undef R - } - - /* SIMD shuffle */ - for (i = 0; i < 16; i++) - B[i] += x[i * 5 % 16]; -} - -/** - * blockmix_salsa(B): - * Compute B = BlockMix_{salsa20, 1}(B). The input B must be 128 bytes in - * length. - */ -static void blockmix_salsa(uint32_t *B, uint32_t rounds) -{ - uint32_t X[16]; - size_t i; - - /* 1: X <-- B_{2r - 1} */ - blkcpy(X, &B[16], 16); - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < 2; i++) { - /* 3: X <-- H(X xor B_i) */ - blkxor(X, &B[i * 16], 16); - salsa20(X, rounds); - - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - blkcpy(&B[i * 16], X, 16); - } -} - -/* - * These are tunable, but they must meet certain constraints and are part of - * what defines a yespower version. - */ -#define PWXsimple 2 -#define PWXgather 4 -/* Version 0.5 */ -#define PWXrounds_0_5 6 -#define Swidth_0_5 8 -/* Version 1.0 */ -#define PWXrounds_1_0 3 -#define Swidth_1_0 11 - -/* Derived values. Not tunable on their own. */ -#define PWXbytes (PWXgather * PWXsimple * 8) -#define PWXwords (PWXbytes / sizeof(uint32_t)) -#define rmin ((PWXbytes + 127) / 128) - -/* Runtime derived values. Not tunable on their own. */ -#define Swidth_to_Sbytes1(Swidth) ((1 << Swidth) * PWXsimple * 8) -#define Swidth_to_Smask(Swidth) (((1 << Swidth) - 1) * PWXsimple * 8) - -typedef struct { - yespower_version_t version; - uint32_t salsa20_rounds; - uint32_t PWXrounds, Swidth, Sbytes, Smask; - uint32_t *S; - uint32_t (*S0)[2], (*S1)[2], (*S2)[2]; - size_t w; -} pwxform_ctx_t; - -/** - * pwxform(B): - * Transform the provided block using the provided S-boxes. 
- */ -static void pwxform(uint32_t *B, pwxform_ctx_t *ctx) -{ - uint32_t (*X)[PWXsimple][2] = (uint32_t (*)[PWXsimple][2])B; - uint32_t (*S0)[2] = ctx->S0, (*S1)[2] = ctx->S1, (*S2)[2] = ctx->S2; - uint32_t Smask = ctx->Smask; - size_t w = ctx->w; - size_t i, j, k; - - /* 1: for i = 0 to PWXrounds - 1 do */ - for (i = 0; i < ctx->PWXrounds; i++) { - /* 2: for j = 0 to PWXgather - 1 do */ - for (j = 0; j < PWXgather; j++) { - uint32_t xl = X[j][0][0]; - uint32_t xh = X[j][0][1]; - uint32_t (*p0)[2], (*p1)[2]; - - /* 3: p0 <-- (lo(B_{j,0}) & Smask) / (PWXsimple * 8) */ - p0 = S0 + (xl & Smask) / sizeof(*S0); - /* 4: p1 <-- (hi(B_{j,0}) & Smask) / (PWXsimple * 8) */ - p1 = S1 + (xh & Smask) / sizeof(*S1); - - /* 5: for k = 0 to PWXsimple - 1 do */ - for (k = 0; k < PWXsimple; k++) { - uint64_t x, s0, s1; - - /* 6: B_{j,k} <-- (hi(B_{j,k}) * lo(B_{j,k}) + S0_{p0,k}) xor S1_{p1,k} */ - s0 = ((uint64_t)p0[k][1] << 32) + p0[k][0]; - s1 = ((uint64_t)p1[k][1] << 32) + p1[k][0]; - - xl = X[j][k][0]; - xh = X[j][k][1]; - - x = (uint64_t)xh * xl; - x += s0; - x ^= s1; - - X[j][k][0] = x; - X[j][k][1] = x >> 32; - } - - if (ctx->version != YESPOWER_0_5 && - (i == 0 || j < PWXgather / 2)) { - if (j & 1) { - for (k = 0; k < PWXsimple; k++) { - S1[w][0] = X[j][k][0]; - S1[w][1] = X[j][k][1]; - w++; - } - } else { - for (k = 0; k < PWXsimple; k++) { - S0[w + k][0] = X[j][k][0]; - S0[w + k][1] = X[j][k][1]; - } - } - } - } - } - - if (ctx->version != YESPOWER_0_5) { - /* 14: (S0, S1, S2) <-- (S2, S0, S1) */ - ctx->S0 = S2; - ctx->S1 = S0; - ctx->S2 = S1; - /* 15: w <-- w mod 2^Swidth */ - ctx->w = w & ((1 << ctx->Swidth) * PWXsimple - 1); - } -} - -/** - * blockmix_pwxform(B, ctx, r): - * Compute B = BlockMix_pwxform{salsa20, ctx, r}(B). The input B must be - * 128r bytes in length. 
- */ -static void blockmix_pwxform(uint32_t *B, pwxform_ctx_t *ctx, size_t r) -{ - uint32_t X[PWXwords]; - size_t r1, i; - - /* Convert 128-byte blocks to PWXbytes blocks */ - /* 1: r_1 <-- 128r / PWXbytes */ - r1 = 128 * r / PWXbytes; - - /* 2: X <-- B'_{r_1 - 1} */ - blkcpy(X, &B[(r1 - 1) * PWXwords], PWXwords); - - /* 3: for i = 0 to r_1 - 1 do */ - for (i = 0; i < r1; i++) { - /* 4: if r_1 > 1 */ - if (r1 > 1) { - /* 5: X <-- X xor B'_i */ - blkxor(X, &B[i * PWXwords], PWXwords); - } - - /* 7: X <-- pwxform(X) */ - pwxform(X, ctx); - - /* 8: B'_i <-- X */ - blkcpy(&B[i * PWXwords], X, PWXwords); - } - - /* 10: i <-- floor((r_1 - 1) * PWXbytes / 64) */ - i = (r1 - 1) * PWXbytes / 64; - - /* 11: B_i <-- H(B_i) */ - salsa20(&B[i * 16], ctx->salsa20_rounds); - -#if 1 /* No-op with our current pwxform settings, but do it to make sure */ - /* 12: for i = i + 1 to 2r - 1 do */ - for (i++; i < 2 * r; i++) { - /* 13: B_i <-- H(B_i xor B_{i-1}) */ - blkxor(&B[i * 16], &B[(i - 1) * 16], 16); - salsa20(&B[i * 16], ctx->salsa20_rounds); - } -#endif -} - -/** - * integerify(B, r): - * Return the result of parsing B_{2r-1} as a little-endian integer. - */ -static uint32_t integerify(const uint32_t *B, size_t r) -{ -/* - * Our 32-bit words are in host byte order. Also, they are SIMD-shuffled, but - * we only care about the least significant 32 bits anyway. - */ - const uint32_t *X = &B[(2 * r - 1) * 16]; - return X[0]; -} - -/** - * p2floor(x): - * Largest power of 2 not greater than argument. - */ -static uint32_t p2floor(uint32_t x) -{ - uint32_t y; - while ((y = x & (x - 1))) - x = y; - return x; -} - -/** - * wrap(x, i): - * Wrap x to the range 0 to i-1. - */ -static uint32_t wrap(uint32_t x, uint32_t i) -{ - uint32_t n = p2floor(i); - return (x & (n - 1)) + (i - n); -} - -/** - * smix1(B, r, N, V, X, ctx): - * Compute first loop of B = SMix_r(B, N). 
The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage X must be 128r bytes in length. - */ -static void smix1(uint32_t *B, size_t r, uint32_t N, - uint32_t *V, uint32_t *X, pwxform_ctx_t *ctx) -{ - size_t s = 32 * r; - uint32_t i, j; - size_t k; - - /* 1: X <-- B */ - for (k = 0; k < 2 * r; k++) - for (i = 0; i < 16; i++) - X[k * 16 + i] = le32dec(&B[k * 16 + (i * 5 % 16)]); - - if (ctx->version != YESPOWER_0_5) { - for (k = 1; k < r; k++) { - blkcpy(&X[k * 32], &X[(k - 1) * 32], 32); - blockmix_pwxform(&X[k * 32], ctx, 1); - } - } - - /* 2: for i = 0 to N - 1 do */ - for (i = 0; i < N; i++) { - /* 3: V_i <-- X */ - blkcpy(&V[i * s], X, s); - - if (i > 1) { - /* j <-- Wrap(Integerify(X), i) */ - j = wrap(integerify(X, r), i); - - /* X <-- X xor V_j */ - blkxor(X, &V[j * s], s); - } - - /* 4: X <-- H(X) */ - if (V != ctx->S) - blockmix_pwxform(X, ctx, r); - else - blockmix_salsa(X, ctx->salsa20_rounds); - } - - /* B' <-- X */ - for (k = 0; k < 2 * r; k++) - for (i = 0; i < 16; i++) - le32enc(&B[k * 16 + (i * 5 % 16)], X[k * 16 + i]); -} - -/** - * smix2(B, r, N, Nloop, V, X, ctx): - * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage X must be 128r bytes in length. The value N must be a power of 2 - * greater than 1. 
- */ -static void smix2(uint32_t *B, size_t r, uint32_t N, uint32_t Nloop, - uint32_t *V, uint32_t *X, pwxform_ctx_t *ctx) -{ - size_t s = 32 * r; - uint32_t i, j; - size_t k; - - /* X <-- B */ - for (k = 0; k < 2 * r; k++) - for (i = 0; i < 16; i++) - X[k * 16 + i] = le32dec(&B[k * 16 + (i * 5 % 16)]); - - /* 6: for i = 0 to N - 1 do */ - for (i = 0; i < Nloop; i++) { - /* 7: j <-- Integerify(X) mod N */ - j = integerify(X, r) & (N - 1); - - /* 8.1: X <-- X xor V_j */ - blkxor(X, &V[j * s], s); - /* V_j <-- X */ - if (Nloop != 2) - blkcpy(&V[j * s], X, s); - - /* 8.2: X <-- H(X) */ - blockmix_pwxform(X, ctx, r); - } - - /* 10: B' <-- X */ - for (k = 0; k < 2 * r; k++) - for (i = 0; i < 16; i++) - le32enc(&B[k * 16 + (i * 5 % 16)], X[k * 16 + i]); -} - -/** - * smix(B, r, N, p, t, V, X, ctx): - * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the - * temporary storage V must be 128rN bytes in length; the temporary storage - * X must be 128r bytes in length. The value N must be a power of 2 and at - * least 16. - */ -static void smix(uint32_t *B, size_t r, uint32_t N, - uint32_t *V, uint32_t *X, pwxform_ctx_t *ctx) -{ - uint32_t Nloop_all = (N + 2) / 3; /* 1/3, round up */ - uint32_t Nloop_rw = Nloop_all; - - Nloop_all++; Nloop_all &= ~(uint32_t)1; /* round up to even */ - if (ctx->version == YESPOWER_0_5) { - Nloop_rw &= ~(uint32_t)1; /* round down to even */ - } else { - Nloop_rw++; Nloop_rw &= ~(uint32_t)1; /* round up to even */ - } - - smix1(B, 1, ctx->Sbytes / 128, ctx->S, X, ctx); - smix1(B, r, N, V, X, ctx); - smix2(B, r, N, Nloop_rw /* must be > 2 */, V, X, ctx); - smix2(B, r, N, Nloop_all - Nloop_rw /* 0 or 2 */, V, X, ctx); -} - -/** - * yespower(local, src, srclen, params, dst): - * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target". - * - * Return 0 on success; or -1 on error. 
- */ -int yespower(yespower_local_t *local, - const uint8_t *src, size_t srclen, - const yespower_params_t *params, yespower_binary_t *dst) -{ - yespower_version_t version = params->version; - uint32_t N = params->N; - uint32_t r = params->r; - const uint8_t *pers = params->pers; - size_t perslen = params->perslen; - int retval = -1; - size_t B_size, V_size; - uint32_t *B, *V, *X, *S; - pwxform_ctx_t ctx; - uint32_t sha256[8]; - - /* Sanity-check parameters */ - if ((version != YESPOWER_0_5 && version != YESPOWER_1_0) || - N < 1024 || N > 512 * 1024 || r < 8 || r > 32 || - (N & (N - 1)) != 0 || r < rmin || - (!pers && perslen)) { - errno = EINVAL; - return -1; - } - - /* Allocate memory */ - B_size = (size_t)128 * r; - V_size = B_size * N; - if ((V = malloc(V_size)) == NULL) - return -1; - if ((B = malloc(B_size)) == NULL) - goto free_V; - if ((X = malloc(B_size)) == NULL) - goto free_B; - ctx.version = version; - if (version == YESPOWER_0_5) { - ctx.salsa20_rounds = 8; - ctx.PWXrounds = PWXrounds_0_5; - ctx.Swidth = Swidth_0_5; - ctx.Sbytes = 2 * Swidth_to_Sbytes1(ctx.Swidth); - } else { - ctx.salsa20_rounds = 2; - ctx.PWXrounds = PWXrounds_1_0; - ctx.Swidth = Swidth_1_0; - ctx.Sbytes = 3 * Swidth_to_Sbytes1(ctx.Swidth); - } - if ((S = malloc(ctx.Sbytes)) == NULL) - goto free_X; - ctx.S = S; - ctx.S0 = (uint32_t (*)[2])S; - ctx.S1 = ctx.S0 + (1 << ctx.Swidth) * PWXsimple; - ctx.S2 = ctx.S1 + (1 << ctx.Swidth) * PWXsimple; - ctx.Smask = Swidth_to_Smask(ctx.Swidth); - ctx.w = 0; - - SHA256_Buf(src, srclen, (uint8_t *)sha256); - - if (version != YESPOWER_0_5) { - if (pers) { - src = pers; - srclen = perslen; - } else { - srclen = 0; - } - } - - /* 1: (B_0 ... 
B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ - PBKDF2_SHA256((uint8_t *)sha256, sizeof(sha256), - src, srclen, 1, (uint8_t *)B, B_size); - - blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0])); - - /* 3: B_i <-- MF(B_i, N) */ - smix(B, r, N, V, X, &ctx); - - if (version == YESPOWER_0_5) { - /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ - PBKDF2_SHA256((uint8_t *)sha256, sizeof(sha256), - (uint8_t *)B, B_size, 1, (uint8_t *)dst, sizeof(*dst)); - - if (pers) { - HMAC_SHA256_Buf(dst, sizeof(*dst), pers, perslen, - return true; - (uint8_t *)sha256); - SHA256_Buf(sha256, sizeof(sha256), (uint8_t *)dst); - } - } else { - HMAC_SHA256_Buf_P((uint8_t *)B + B_size - 64, 64, - sha256, sizeof(sha256), (uint8_t *)dst); - } - - /* Success! */ - retval = 0; - - /* Free memory */ - free(S); -free_X: - free(X); -free_B: - free(B); -free_V: - free(V); - - return retval; -} - -int yespower_tls(const uint8_t *src, size_t srclen, - const yespower_params_t *params, yespower_binary_t *dst) -{ -/* The reference implementation doesn't use thread-local storage */ - return yespower(NULL, src, srclen, params, dst); -} - -int yespower_init_local(yespower_local_t *local) -{ -/* The reference implementation doesn't use the local structure */ - local->base = local->aligned = NULL; - local->base_size = local->aligned_size = 0; - return 0; -} - -int yespower_free_local(yespower_local_t *local) -{ -/* The reference implementation frees its memory in yespower() */ - (void)local; /* unused */ - return 0; -} diff --git a/algo/yespower/yespower.c b/algo/yespower/yespower.c deleted file mode 100644 index d0bcc39..0000000 --- a/algo/yespower/yespower.c +++ /dev/null @@ -1,174 +0,0 @@ -/*- - * Copyright 2018 Cryply team - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Cryply team as part of the Cryply - * coin. 
- */ -#include "yespower.h" - -#include "algo-gate-api.h" - -static yespower_params_t yespower_params; - -void yespower_hash( const char *input, char *output, uint32_t len ) -{ - yespower_tls( input, len, &yespower_params, (yespower_binary_t*)output ); -} - -int scanhash_yespower( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(64) vhash[8]; - uint32_t _ALIGN(64) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - - for (int k = 0; k < 19; k++) - be32enc(&endiandata[k], pdata[k]); - do { - be32enc(&endiandata[19], n); - yespower_hash((char*) endiandata, (char*) vhash, 80); - if ( vhash[7] < Htarg && fulltest( vhash, ptarget ) - && !opt_benchmark ) - { - pdata[19] = n; - submit_solution( work, vhash, mythr ); - } - n++; - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - - return 0; -} - -int64_t yespower_get_max64() -{ - return 0xfffLL; -} - -bool register_yespower_algo( algo_gate_t* gate ) -{ - yespower_params.version = YESPOWER_1_0; - yespower_params.N = 2048; - yespower_params.r = 32; - yespower_params.pers = NULL; - yespower_params.perslen = 0; - gate->optimizations = SSE2_OPT; - gate->get_max64 = (void*)&yespower_get_max64; - gate->scanhash = (void*)&scanhash_yespower; - gate->hash = (void*)&yespower_hash; - gate->set_target = (void*)&scrypt_set_target; - return true; -}; - -bool register_yespowerr16_algo( algo_gate_t* gate ) -{ - yespower_params.version = YESPOWER_1_0; - yespower_params.N = 4096; - yespower_params.r = 16; - yespower_params.pers = NULL; - yespower_params.perslen = 0; - gate->optimizations = SSE2_OPT; - gate->get_max64 = (void*)&yespower_get_max64; - gate->scanhash = (void*)&scanhash_yespower; - gate->hash = 
(void*)&yespower_hash; - gate->set_target = (void*)&scrypt_set_target; - return true; - }; - - -int64_t yescrypt_05_get_max64() -{ - return 0x1ffLL; -} - -int64_t yescryptr16_05_get_max64() -{ - return 0xfffLL; -} - -bool register_yescrypt_05_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | SHA_OPT; - gate->scanhash = (void*)&scanhash_yespower; - gate->set_target = (void*)&scrypt_set_target; - gate->get_max64 = (void*)&yescrypt_05_get_max64; - yespower_params.version = YESPOWER_0_5; - yespower_params.N = 2048; - yespower_params.r = 8; - yespower_params.pers = NULL; - yespower_params.perslen = 0; - return true; -} - -bool register_yescryptr8_05_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | SHA_OPT; - gate->scanhash = (void*)&scanhash_yespower; - gate->set_target = (void*)&scrypt_set_target; - gate->get_max64 = (void*)&yescrypt_05_get_max64; - yespower_params.version = YESPOWER_0_5; - yespower_params.N = 2048; - yespower_params.r = 8; - yespower_params.pers = "Client Key"; - yespower_params.perslen = 10; - return true; -} - -bool register_yescryptr16_05_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | SHA_OPT; - gate->scanhash = (void*)&scanhash_yespower; - gate->set_target = (void*)&scrypt_set_target; - gate->get_max64 = (void*)&yescryptr16_05_get_max64; - yespower_params.version = YESPOWER_0_5; - yespower_params.N = 4096; - yespower_params.r = 16; - yespower_params.pers = NULL; - yespower_params.perslen = 0; - return true; -} - -bool register_yescryptr32_05_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | SHA_OPT; - gate->scanhash = (void*)&scanhash_yespower; - gate->set_target = (void*)&scrypt_set_target; - gate->get_max64 = (void*)&yescryptr16_05_get_max64; - yespower_params.version = YESPOWER_0_5; - yespower_params.N = 4096; - yespower_params.r = 32; - yespower_params.pers = "WaviBanana"; - yespower_params.perslen = 10; - return true; -} - diff --git a/algo/yespower/yespower.h 
b/algo/yespower/yespower.h deleted file mode 100644 index b388d72..0000000 --- a/algo/yespower/yespower.h +++ /dev/null @@ -1,130 +0,0 @@ -/*- - * Copyright 2009 Colin Percival - * Copyright 2013-2018 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - */ -#ifndef _YESPOWER_H_ -#define _YESPOWER_H_ - -#include -#include /* for size_t */ - -#ifdef __cplusplus -extern "C" { -#endif - -/** - * Internal type used by the memory allocator. Please do not use it directly. - * Use yespower_local_t instead. 
- */ -typedef struct { - void *base, *aligned; - size_t base_size, aligned_size; -} yespower_region_t; - -/** - * Type for thread-local (RAM) data structure. - */ -typedef yespower_region_t yespower_local_t; - -/* - * Type for yespower algorithm version numbers. - */ -typedef enum { YESPOWER_0_5 = 5, YESPOWER_1_0 = 10 } yespower_version_t; - -/** - * yespower parameters combined into one struct. - */ -typedef struct { - yespower_version_t version; - uint32_t N, r; - const uint8_t *pers; - size_t perslen; -} yespower_params_t; - -/** - * A 256-bit yespower hash. - */ -typedef struct { - unsigned char uc[32]; -} yespower_binary_t; - -/** - * yespower_init_local(local): - * Initialize the thread-local (RAM) data structure. Actual memory allocation - * is currently fully postponed until a call to yespower(). - * - * Return 0 on success; or -1 on error. - * - * MT-safe as long as local is local to the thread. - */ -extern int yespower_init_local(yespower_local_t *local); - -/** - * yespower_free_local(local): - * Free memory that may have been allocated for an initialized thread-local - * (RAM) data structure. - * - * Return 0 on success; or -1 on error. - * - * MT-safe as long as local is local to the thread. - */ -extern int yespower_free_local(yespower_local_t *local); - -/** - * yespower(local, src, srclen, params, dst): - * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target". - * local is the thread-local data structure, allowing to preserve and reuse a - * memory allocation across calls, thereby reducing processing overhead. - * - * Return 0 on success; or -1 on error. - * - * local must be initialized with yespower_init_local(). - * - * MT-safe as long as local and dst are local to the thread. - */ -extern int yespower(yespower_local_t *local, - const uint8_t *src, size_t srclen, - const yespower_params_t *params, yespower_binary_t *dst); - -/** - * yespower_tls(src, srclen, params, dst): - * Compute yespower(src[0 .. 
srclen - 1], N, r), to be checked for "< target". - * The memory allocation is maintained internally using thread-local storage. - * - * Return 0 on success; or -1 on error. - * - * MT-safe as long as dst is local to the thread. - */ -extern int yespower_tls(const uint8_t *src, size_t srclen, - const yespower_params_t *params, yespower_binary_t *dst); - -#ifdef __cplusplus -} -#endif - -#endif /* !_YESPOWER_H_ */ diff --git a/api.c b/api.c index 8999397..60855b4 100644 --- a/api.c +++ b/api.c @@ -32,7 +32,7 @@ #include #include "miner.h" - +#include "sysinfos.c" #ifndef WIN32 # include # include @@ -105,7 +105,7 @@ extern double global_hashrate; #define USE_MONITORING extern float cpu_temp(int); extern uint32_t cpu_clock(int); -extern int cpu_fanpercent(void); +//extern int cpu_fanpercent(void); /***************************************************************/ diff --git a/build-allarch.sh b/build-allarch.sh index 63e0e95..a280d89 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -1,86 +1,68 @@ #!/bin/bash # # This script is not intended for users, it is only used for compile testing -# during develpment. Howver the information contained my provide cimpilation +# during develpment. However the information contained may provide compilation # tips to users. 
+rm -r bin/unix 2>/dev/null +rm cpuminer 2>/dev/null +mkdir -p bin/{win,unix} 2>/dev/null + +DFLAGS="-Wall -fno-common -Wno-comment -Wno-maybe-uninitialized" + +# 1 - Architecture +# 2 - Output suffix +# 3 - Additional options +compile() { + make distclean || echo clean rm -f config.status ./autogen.sh || echo done -CFLAGS="-O3 -march=skylake-avx512 -Wall" ./configure --with-curl -make -j 16 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-avx512.exe +CFLAGS="-O3 -march=${1} ${3} ${DFLAGS}" ./configure --with-curl +make -j 8 strip -s cpuminer -mv cpuminer cpuminer-avx512 +mv cpuminer bin/unix/cpuminer-${2} -make clean || echo clean -rm -f config.status -CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl -make -j 16 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-avx2.exe -strip -s cpuminer -mv cpuminer cpuminer-avx2 +} -make clean || echo clean -rm -f config.status -CFLAGS="-O3 -march=corei7-avx -Wall" ./configure --with-curl -make -j 16 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-aes-avx.exe -strip -s cpuminer -mv cpuminer cpuminer-aes-avx +# Icelake AVX512 SHA VAES +compile "icelake-client" "avx512-sha-vaes" -make clean || echo clean -rm -f config.status -CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl -make -j 16 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-aes-sse42.exe -strip -s cpuminer -mv cpuminer cpuminer-aes-sse42 +# Rocketlake AVX512 SHA AES +compile "cascadelake" "avx512-sha" "-msha" -make clean || echo clean -rm -f config.status -CFLAGS="-O3 -march=corei7 -Wall" ./configure --with-curl -make -j 16 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-sse42.exe -strip -s cpuminer -mv cpuminer cpuminer-sse42 +# Slylake-X AVX512 AES +compile "skylake-avx512" "avx512" -make clean || echo clean -rm -f config.status -CFLAGS="-O3 -march=core2 -Wall" ./configure --with-curl -make -j 16 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-ssse3.exe -strip -s cpuminer -mv cpuminer cpuminer-ssse3 +# Haswell AVX2 AES +# GCC 9 doesn't include 
AES with core-avx2 +compile "core-avx2" "avx2" "-maes" -make clean || echo clean -rm -f config.status -CFLAGS="-O3 -msse2 -Wall" ./configure --with-curl -make -j 16 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-sse2.exe -strip -s cpuminer -mv cpuminer cpuminer-sse2 +# Sandybridge AVX AES +compile "corei7-avx" "avx" "-maes" -make clean || echo done -rm -f config.status -CFLAGS="-O3 -march=znver1 -Wall" ./configure --with-curl -make -j 16 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-zen.exe -strip -s cpuminer -mv cpuminer cpuminer-zen +# Westmere SSE4.2 AES +compile "westmere" "aes-sse42" -make clean || echo done -rm -f config.status -CFLAGS="-O3 -march=native -Wall" ./configure --with-curl -make -j 16 -strip -s cpuminer.exe -strip -s cpuminer +# Nehalem SSE4.2 +compile "corei7" "sse42" + +# Core2 SSSE3 +compile "core2" "ssse3" + +# Generic SSE2 +compile "x86-64" "sse2" "-msse2" + +# AMD Zen1 AVX2 SHA +compile "znver1" "zen" + +# AMD Zen3 AVX2 SHA VAES +compile "znver2" "zen3" "-mvaes" + +# Build native +./build.sh +ls -l bin/unix +if (( $(ls bin/unix/ | wc -l) != "11" )); then + echo "Some binaries did not compile?" 
+fi diff --git a/build.sh b/build.sh index bf713ea..8fcf694 100755 --- a/build.sh +++ b/build.sh @@ -1,27 +1,42 @@ #!/bin/bash -#if [ "$OS" = "Windows_NT" ]; then -# ./mingw64.sh -# exit 0 -#fi - -# Linux build - -make distclean || echo clean - -rm -f config.status -./autogen.sh || echo done - -# Ubuntu 10.04 (gcc 4.4) -# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16" - -# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+) -#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores" - -#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr -CFLAGS="-O3 -march=native -Wall" ./configure --with-curl -#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl - -make -j 4 - -strip -s cpuminer +rm -v build.log 2>/dev/null + +make distclean | tee build.log + +rm -f config.status | tee -a build.log +./autogen.sh | tee -a build.log + + +ARCH="" +MFPU="" + + +if [[ $(uname -m) =~ "armv7" ]]; then + if [[ $(uname -m) != "armv7l" ]]; then + echo "Detected unknown ARMv7 processor $(uname -m)" | tee -a build.log + fi + echo "Detected ARMv7 (arm) system" | tee -a build.log + ARCH="armv7-a" + if [[ ! -z "$(cat /proc/cpuinfo | grep "vfpv4")" ]]; then + echo "Detected vfpv4 instruction set. Changing to -mfpu=neon-vfpv4" | tee -a build.log + MFPU="-mfpu=neon-vfpv4" + else + echo $(cat /proc/cpuinfo | grep "vfpv4") | tee -a build.log + echo "Using default -mfpu=neon" | tee -a build.log + MFPU="-mfpu=neon" + fi +elif [[ $(uname -m) =~ "aarch64" ]]; then + echo "Detected ARMv8 (aarch64) system" | tee -a build.log + ARCH="armv8-a+simd" +else + echo "Architecture $(uname -m). 
Compile as native" | tee -a build.log + ARCH="native" + MFPU="" +fi + +CFLAGS="-O3 -march=${ARCH} ${MFPU} -mtune=native" CXXFLAGS="$CFLAGS -std=c++11" ./configure --with-curl | tee -a build.log + +make -j 4 | tee -a build.log + +strip -s cpuminer | tee -a build.log diff --git a/configure b/configure index 62cc85e..48b6788 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.6.1. +# Generated by GNU Autoconf 2.69 for cpuminer-opt-gr 3.9.6.1. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -575,10 +575,10 @@ MFLAGS= MAKEFLAGS= # Identity of this package. -PACKAGE_NAME='cpuminer-opt' -PACKAGE_TARNAME='cpuminer-opt' +PACKAGE_NAME='cpuminer-opt-gr' +PACKAGE_TARNAME='cpuminer-opt-gr' PACKAGE_VERSION='3.9.6.1' -PACKAGE_STRING='cpuminer-opt 3.9.6.1' +PACKAGE_STRING='cpuminer-opt-gr 3.9.6.1' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -667,7 +667,6 @@ am__nodep AMDEPBACKSLASH AMDEP_FALSE AMDEP_TRUE -am__quote am__include DEPDIR OBJEXT @@ -757,7 +756,8 @@ PACKAGE_VERSION PACKAGE_TARNAME PACKAGE_NAME PATH_SEPARATOR -SHELL' +SHELL +am__quote' ac_subst_files='' ac_user_opts=' enable_option_checking @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.9.6.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt-gr 3.9.6.1 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
@@ -1381,7 +1381,7 @@ Fine tuning of the installation directories: --infodir=DIR info documentation [DATAROOTDIR/info] --localedir=DIR locale-dependent data [DATAROOTDIR/locale] --mandir=DIR man documentation [DATAROOTDIR/man] - --docdir=DIR documentation root [DATAROOTDIR/doc/cpuminer-opt] + --docdir=DIR documentation root [DATAROOTDIR/doc/cpuminer-opt-gr] --htmldir=DIR html documentation [DOCDIR] --dvidir=DIR dvi documentation [DOCDIR] --pdfdir=DIR pdf documentation [DOCDIR] @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.9.6.1:";; + short | recursive ) echo "Configuration of cpuminer-opt-gr 3.9.6.1:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.9.6.1 +cpuminer-opt-gr configure 3.9.6.1 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.9.6.1, which was +It was created by cpuminer-opt-gr $as_me 3.9.6.1, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2507,7 +2507,7 @@ ac_config_headers="$ac_config_headers cpuminer-config.h" -am__api_version='1.15' +am__api_version='1.16' # Find a good install program. We prefer a C program (faster), # so one script is as good as another. 
But avoid the broken or @@ -2683,12 +2683,7 @@ program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"` am_aux_dir=`cd "$ac_aux_dir" && pwd` if test x"${MISSING+set}" != xset; then - case $am_aux_dir in - *\ * | *\ *) - MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;; - *) - MISSING="\${SHELL} $am_aux_dir/missing" ;; - esac + MISSING="\${SHELL} '$am_aux_dir/missing'" fi # Use eval to expand $SHELL if eval "$MISSING --is-lightweight"; then @@ -2992,7 +2987,7 @@ fi # Define the identity of the package. - PACKAGE='cpuminer-opt' + PACKAGE='cpuminer-opt-gr' VERSION='3.9.6.1' @@ -3023,8 +3018,8 @@ MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"} # For better backward compatibility. To be removed once Automake 1.9.x # dies out for good. For more background, see: -# -# +# +# mkdir_p='$(MKDIR_P)' # We need awk for the "check" target (and possibly the TAP driver). The @@ -3075,7 +3070,7 @@ END Aborting the configuration process, to ensure you take notice of the issue. You can download and install GNU coreutils to get an 'rm' implementation -that behaves properly: . +that behaves properly: . If you want to complete the configuration process using your problematic 'rm' anyway, export the environment variable ACCEPT_INFERIOR_RM_PROGRAM @@ -3115,45 +3110,45 @@ DEPDIR="${am__leading_dot}deps" ac_config_commands="$ac_config_commands depfiles" - -am_make=${MAKE-make} -cat > confinc << 'END' +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${MAKE-make} supports the include directive" >&5 +$as_echo_n "checking whether ${MAKE-make} supports the include directive... " >&6; } +cat > confinc.mk << 'END' am__doit: - @echo this is the am__doit target + @echo this is the am__doit target >confinc.out .PHONY: am__doit END -# If we don't find an include directive, just comment out the code. -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for style of include used by $am_make" >&5 -$as_echo_n "checking for style of include used by $am_make... 
" >&6; } am__include="#" am__quote= -_am_result=none -# First try GNU make style include. -echo "include confinc" > confmf -# Ignore all kinds of additional output from 'make'. -case `$am_make -s -f confmf 2> /dev/null` in #( -*the\ am__doit\ target*) - am__include=include - am__quote= - _am_result=GNU - ;; -esac -# Now try BSD make style include. -if test "$am__include" = "#"; then - echo '.include "confinc"' > confmf - case `$am_make -s -f confmf 2> /dev/null` in #( - *the\ am__doit\ target*) - am__include=.include - am__quote="\"" - _am_result=BSD +# BSD make does it like this. +echo '.include "confinc.mk" # ignored' > confmf.BSD +# Other make implementations (GNU, Solaris 10, AIX) do it like this. +echo 'include confinc.mk # ignored' > confmf.GNU +_am_result=no +for s in GNU BSD; do + { echo "$as_me:$LINENO: ${MAKE-make} -f confmf.$s && cat confinc.out" >&5 + (${MAKE-make} -f confmf.$s && cat confinc.out) >&5 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } + case $?:`cat confinc.out 2>/dev/null` in #( + '0:this is the am__doit target') : + case $s in #( + BSD) : + am__include='.include' am__quote='"' ;; #( + *) : + am__include='include' am__quote='' ;; +esac ;; #( + *) : ;; - esac -fi - - -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $_am_result" >&5 -$as_echo "$_am_result" >&6; } -rm -f confinc confmf +esac + if test "$am__include" != "#"; then + _am_result="yes ($s style)" + break + fi +done +rm -f confinc.* confmf.* +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${_am_result}" >&5 +$as_echo "${_am_result}" >&6; } # Check whether --enable-dependency-tracking was given. if test "${enable_dependency_tracking+set}" = set; then : @@ -6690,7 +6685,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. 
ac_log=" -This file was extended by cpuminer-opt $as_me 3.9.6.1, which was +This file was extended by cpuminer-opt-gr $as_me 3.9.6.1, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6751,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.9.6.1 +cpuminer-opt-gr config.status 3.9.6.1 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" @@ -6875,7 +6870,7 @@ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 # # INIT-COMMANDS # -AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir" +AMDEP_TRUE="$AMDEP_TRUE" MAKE="${MAKE-make}" _ACEOF @@ -7489,29 +7484,35 @@ $as_echo "$as_me: executing $ac_file commands" >&6;} # Older Autoconf quotes --file arguments for eval, but not when files # are listed without --file. Let's play safe and only enable the eval # if we detect the quoting. - case $CONFIG_FILES in - *\'*) eval set x "$CONFIG_FILES" ;; - *) set x $CONFIG_FILES ;; - esac + # TODO: see whether this extra hack can be removed once we start + # requiring Autoconf 2.70 or later. + case $CONFIG_FILES in #( + *\'*) : + eval set x "$CONFIG_FILES" ;; #( + *) : + set x $CONFIG_FILES ;; #( + *) : + ;; +esac shift - for mf + # Used to flag and report bootstrapping failures. + am_rc=0 + for am_mf do # Strip MF so we end up with the name of the file. - mf=`echo "$mf" | sed -e 's/:.*$//'` - # Check whether this is an Automake generated Makefile or not. - # We used to match only the files named 'Makefile.in', but - # some people rename them; so instead we look at the file content. - # Grep'ing the first line is not enough: some people post-process - # each Makefile.in and add a new line on top of each file to say so. 
- # Grep'ing the whole file is not good either: AIX grep has a line + am_mf=`$as_echo "$am_mf" | sed -e 's/:.*$//'` + # Check whether this is an Automake generated Makefile which includes + # dependency-tracking related rules and includes. + # Grep'ing the whole file directly is not great: AIX grep has a line # limit of 2048, but all sed's we know have understand at least 4000. - if sed -n 's,^#.*generated by automake.*,X,p' "$mf" | grep X >/dev/null 2>&1; then - dirpart=`$as_dirname -- "$mf" || -$as_expr X"$mf" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$mf" : 'X\(//\)[^/]' \| \ - X"$mf" : 'X\(//\)$' \| \ - X"$mf" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$mf" | + sed -n 's,^am--depfiles:.*,X,p' "$am_mf" | grep X >/dev/null 2>&1 \ + || continue + am_dirpart=`$as_dirname -- "$am_mf" || +$as_expr X"$am_mf" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$am_mf" : 'X\(//\)[^/]' \| \ + X"$am_mf" : 'X\(//\)$' \| \ + X"$am_mf" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$am_mf" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/ q @@ -7529,53 +7530,50 @@ $as_echo X"$mf" | q } s/.*/./; q'` - else - continue - fi - # Extract the definition of DEPDIR, am__include, and am__quote - # from the Makefile without running 'make'. - DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"` - test -z "$DEPDIR" && continue - am__include=`sed -n 's/^am__include = //p' < "$mf"` - test -z "$am__include" && continue - am__quote=`sed -n 's/^am__quote = //p' < "$mf"` - # Find all dependency output files, they are included files with - # $(DEPDIR) in their names. We invoke sed twice because it is the - # simplest approach to changing $(DEPDIR) to its actual value in the - # expansion. - for file in `sed -n " - s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \ - sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g'`; do - # Make sure the directory exists. 
- test -f "$dirpart/$file" && continue - fdir=`$as_dirname -- "$file" || -$as_expr X"$file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$file" : 'X\(//\)[^/]' \| \ - X"$file" : 'X\(//\)$' \| \ - X"$file" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$file" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ + am_filepart=`$as_basename -- "$am_mf" || +$as_expr X/"$am_mf" : '.*/\([^/][^/]*\)/*$' \| \ + X"$am_mf" : 'X\(//\)$' \| \ + X"$am_mf" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X/"$am_mf" | + sed '/^.*\/\([^/][^/]*\)\/*$/{ s//\1/ q } - /^X\(\/\/\)$/{ + /^X\/\(\/\/\)$/{ s//\1/ q } - /^X\(\/\).*/{ + /^X\/\(\/\).*/{ s//\1/ q } s/.*/./; q'` - as_dir=$dirpart/$fdir; as_fn_mkdir_p - # echo "creating $dirpart/$file" - echo '# dummy' > "$dirpart/$file" - done + { echo "$as_me:$LINENO: cd "$am_dirpart" \ + && sed -e '/# am--include-marker/d' "$am_filepart" \ + | $MAKE -f - am--depfiles" >&5 + (cd "$am_dirpart" \ + && sed -e '/# am--include-marker/d' "$am_filepart" \ + | $MAKE -f - am--depfiles) >&5 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } || am_rc=$? done + if test $am_rc -ne 0; then + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "Something went wrong bootstrapping makefile fragments + for automatic dependency tracking. If GNU make was not used, consider + re-running the configure script with MAKE=\"gmake\" (or whatever is + necessary). You can also try re-running configure with the + '--disable-dependency-tracking' option to at least be able to build + the package (albeit without support for automatic dependency tracking). 
+See \`config.log' for more details" "$LINENO" 5; } + fi + { am_dirpart=; unset am_dirpart;} + { am_filepart=; unset am_filepart;} + { am_mf=; unset am_mf;} + { am_rc=; unset am_rc;} + rm -f conftest-deps.mk } ;; diff --git a/cpu-miner.c b/cpu-miner.c index c2c88d9..dd93d78 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -2,8 +2,8 @@ * Copyright 2010 Jeff Garzik * Copyright 2012-2014 pooler * Copyright 2014 Lucas Jones - * Copyright 2014 Tanguy Pruvot - * Copyright 2016 Jay D Dee + * Copyright 2014-2016 Tanguy Pruvot + * Copyright 2016-2020 Jay D Dee * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -24,23 +24,25 @@ #include #define _GNU_SOURCE +#include "sysinfos.c" +#include +#include +#include +#include +#include +#include +#include +#include #include #include #include -#include -#include -#include #include #include -#include -#include -#include -#include -#include +#include #ifdef WIN32 -#include #include +#include #endif #ifdef _MSC_VER @@ -52,42 +54,47 @@ #if HAVE_SYS_PARAM_H #include #endif + +// GCC 9 warning sysctl.h is deprecated +#if (__GNUC__ < 9) #include #endif -#endif + +#endif // HAVE_SYS_SYSCTL_H +#endif // _MSC_VER ELSE #ifndef WIN32 #include #endif -#include "miner.h" #include "algo-gate-api.h" +#include "miner.h" #ifdef WIN32 #include "compat/winansi.h" -//BOOL WINAPI ConsoleHandler(DWORD); +// BOOL WINAPI ConsoleHandler(DWORD); #endif #ifdef _MSC_VER #include #pragma comment(lib, "winmm.lib") #endif -#define LP_SCANTIME 60 +#define LP_SCANTIME 60 algo_gate_t algo_gate; -bool opt_debug = true; +bool opt_debug = false; bool opt_debug_diff = false; bool opt_protocol = false; bool opt_benchmark = false; +bool opt_benchmark_extended = false; bool opt_redirect = true; -bool opt_showdiff = true; bool opt_extranonce = true; -bool want_longpoll = true; +bool want_longpoll = false; bool have_longpoll = false; bool have_gbt = true; bool allow_getwork = 
true; -bool want_stratum = true; +bool want_stratum = true; // pretty useless bool have_stratum = false; bool allow_mininginfo = true; bool use_syslog = false; @@ -100,28 +107,45 @@ static int opt_fail_pause = 10; static int opt_time_limit = 0; int opt_timeout = 300; static int opt_scantime = 5; -//static const bool opt_time = true; +const int min_scantime = 1; +// static const bool opt_time = true; enum algos opt_algo = ALGO_NULL; -int opt_scrypt_n = 0; -int opt_pluck_n = 128; +char *opt_param_key = NULL; +int opt_param_n = 0; +int opt_param_r = 0; int opt_n_threads = 0; +bool opt_sapling = false; + // Windows doesn't support 128 bit affinity mask. -#if defined(__linux) && defined(GCC_INT128) +// Need compile time and run time test. +#if defined(__linux) && defined(GCC_INT128) #define AFFINITY_USES_UINT128 1 -uint128_t opt_affinity = -1LL; +static uint128_t opt_affinity = -1; +static bool affinity_uses_uint128 = true; #else -uint64_t opt_affinity = -1LL; +static uint64_t opt_affinity = -1; +static bool affinity_uses_uint128 = false; #endif -int opt_priority = 0; + +int opt_priority = 0; // deprecated int num_cpus = 1; int num_cpugroups = 1; -char *rpc_url = NULL;; +char *rpc_url = NULL; +; char *rpc_userpass = NULL; char *rpc_user, *rpc_pass; char *short_url = NULL; -static unsigned char pk_script[25] = { 0 }; +char *coinbase_address; +char *opt_data_file = NULL; +bool opt_verify = false; + +// pk_buffer_size is used as a version selector by b58 code, therefore +// it must be set correctly to work. 
+const int pk_buffer_size_max = 26; +int pk_buffer_size = 25; +static unsigned char pk_script[26] = {0}; static size_t pk_script_size = 0; -static char coinbase_sig[101] = { 0 }; +static char coinbase_sig[101] = {0}; char *opt_cert; char *opt_proxy; long opt_proxy_type; @@ -129,3345 +153,3410 @@ struct thr_info *thr_info; int work_thr_id; int longpoll_thr_id = -1; int stratum_thr_id = -1; +int dev_stratum_thr_id = -1; int api_thr_id = -1; bool stratum_need_reset = false; +bool dev_stratum_need_reset = false; struct work_restart *work_restart = NULL; struct stratum_ctx stratum; -bool jsonrpc_2 = false; -char rpc2_id[64] = ""; -char *rpc2_blob = NULL; -size_t rpc2_bloblen = 0; -uint32_t rpc2_target = 0; -char *rpc2_job_id = NULL; +struct stratum_ctx dev_stratum; double opt_diff_factor = 1.0; +double opt_target_factor = 1.0; uint32_t zr5_pok = 0; bool opt_stratum_stats = false; bool opt_hash_meter = false; - -uint32_t accepted_share_count = 0ULL; -uint32_t rejected_share_count = 0ULL; -uint32_t solved_block_count = 0ULL; +uint32_t submitted_share_count = 0; +uint32_t accepted_share_count = 0; +uint32_t rejected_share_count = 0; +uint32_t stale_share_count = 0; +uint32_t solved_block_count = 0; double *thr_hashrates; -double *thr_hashcount; -double global_hashcount = 0; -double global_hashrate = 0; +double global_hashrate = 0.; double stratum_diff = 0.; double net_diff = 0.; double net_hashrate = 0.; uint64_t net_blocks = 0; -// conditional mining - bool conditional_state[MAX_CPUS] = { 0 }; - double opt_max_temp = 0.0; - double opt_max_diff = 0.0; - double opt_max_rate = 0.0; - - uint32_t opt_work_size = 0; - char *opt_api_allow = NULL; - int opt_api_remote = 0; - int opt_api_listen = 4048; - - pthread_mutex_t rpc2_job_lock; - pthread_mutex_t rpc2_login_lock; - pthread_mutex_t applog_lock; - pthread_mutex_t stats_lock; +uint32_t opt_work_size = 0; +double gr_bench_hashes = 0.; +double gr_bench_time = 0.; +// When should the first dev mining begin. 
+const struct timeval first_dev = {300, 0}; // First Dev mining after. +struct timeval dev_start; +// How often should it occur. +const struct timeval dev_interval = {3600, 0}; +// Dev fee - 1% of time. +const double dev_fee = 0.01; +bool dev_mining = false; +// conditional mining +bool conditional_state[MAX_CPUS] = {0}; +double opt_max_temp = 0.0; +double opt_max_diff = 0.0; +double opt_max_rate = 0.0; + +// Dev pool data. +const char *dev_address = "RQKcAZBtsSacMUiGNnbk3h3KJAN94tstvt"; +const char *dev_userpass = "RQKcAZBtsSacMUiGNnbk3h3KJAN94tstvt:x"; +// Dev pools. In case of no pools available user pool will be used. +const char *dev_pools[5] = {"stratum+tcp://rtm.suprnova.cc:6273", + "stratum+tcp://stratum.us-ny1.rtm.suprnova.cc:6273", + "stratum+tcp://stratum-eu.rplant.xyz:7056", + "stratum+tcp://stratum-na.rplant.xyz:7056", ""}; + +// API +static bool opt_api_enabled = false; +char *opt_api_allow = NULL; +int opt_api_listen = 0; +int opt_api_remote = 0; +char *default_api_allow = "127.0.0.1"; +int default_api_listen = 4048; + +pthread_mutex_t applog_lock; +pthread_mutex_t stats_lock; +pthread_cond_t sync_cond; + +static struct timeval session_start; +static struct timeval five_min_start; +static uint64_t session_first_block = 0; +static double latency_sum = 0.; +static uint64_t submit_sum = 0; +static uint64_t accept_sum = 0; +static uint64_t stale_sum = 0; +static uint64_t reject_sum = 0; +static uint64_t solved_sum = 0; +static double norm_diff_sum = 0.; +static uint32_t last_block_height = 0; +static double highest_share = 0; // highest accepted share diff +static double lowest_share = 9e99; // lowest accepted share diff +static double last_targetdiff = 0.; +#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32)) +static uint32_t hi_temp = 0; +static uint32_t prev_temp = 0; +#endif static char const short_options[] = #ifdef HAVE_SYSLOG_H - "S" + "S" #endif - "a:b:Bc:CDf:hm:n:p:Px:qr:R:s:t:T:o:u:O:V"; + 
"a:b:Bc:CDf:hK:m:n:N:p:Px:qr:R:s:t:T:o:u:O:V"; -static struct work g_work __attribute__ ((aligned (64))) = {{ 0 }}; -//static struct work tmp_work; +static struct work g_work __attribute__((aligned(64))) = {{0}}; time_t g_work_time = 0; -static pthread_mutex_t g_work_lock; -static bool submit_old = false; -char* lp_id; +pthread_rwlock_t g_work_lock; +static bool submit_old = false; +char *lp_id; + +static void workio_cmd_free(struct workio_cmd *wc); -static void workio_cmd_free(struct workio_cmd *wc); +static void format_affinity_map(char *map_str, uint64_t map) { + int n = num_cpus < 64 ? num_cpus : 64; + int i; + + for (i = 0; i < n; i++) { + if (map & 1) + map_str[i] = '!'; + else + map_str[i] = '.'; + map >>= 1; + } + memset(&map_str[i], 0, 64 - i); +} #ifdef __linux /* Linux specific policy and affinity management */ #include -static inline void drop_policy(void) -{ - struct sched_param param; - param.sched_priority = 0; +static inline void drop_policy(void) { + struct sched_param param; + param.sched_priority = 0; #ifdef SCHED_IDLE - if (unlikely(sched_setscheduler(0, SCHED_IDLE, ¶m) == -1)) + if (unlikely(sched_setscheduler(0, SCHED_IDLE, ¶m) == -1)) #endif #ifdef SCHED_BATCH - sched_setscheduler(0, SCHED_BATCH, ¶m); + sched_setscheduler(0, SCHED_BATCH, ¶m); #endif } #ifdef __BIONIC__ -#define pthread_setaffinity_np(tid,sz,s) {} /* only do process affinity */ +#define pthread_setaffinity_np(tid, sz, s) \ + {} /* only do process affinity */ #endif // Linux affinity can use int128. #if AFFINITY_USES_UINT128 -static void affine_to_cpu_mask( int id, unsigned __int128 mask ) +static void affine_to_cpu_mask(int id, uint128_t mask) #else -static void affine_to_cpu_mask( int id, unsigned long long mask ) +static void affine_to_cpu_mask(int id, uint64_t mask) #endif { - cpu_set_t set; - CPU_ZERO( &set ); - uint8_t ncpus = (num_cpus > 256) ? 256 : num_cpus; + cpu_set_t set; + CPU_ZERO(&set); + uint8_t ncpus = (num_cpus > 256) ? 
256 : num_cpus; - for ( uint8_t i = 0; i < ncpus; i++ ) - { - // cpu mask + for (uint8_t i = 0; i < ncpus; i++) { + // cpu mask #if AFFINITY_USES_UINT128 - if( ( mask & ( (unsigned __int128)1ULL << i ) ) ) CPU_SET( i, &set ); + if ((mask & ((uint128_t)1 << i))) + CPU_SET(i, &set); #else - if( (ncpus > 64) || ( mask & (1ULL << i) ) ) CPU_SET( i, &set ); + if ((ncpus > 64) || (mask & (1 << i))) + CPU_SET(i, &set); #endif - } - if ( id == -1 ) - { - // process affinity - sched_setaffinity(0, sizeof(&set), &set); - } - else - { - // thread only - pthread_setaffinity_np(thr_info[id].pth, sizeof(&set), &set); - } + } + if (id == -1) { + // process affinity + sched_setaffinity(0, sizeof(&set), &set); + } else { + // thread only + pthread_setaffinity_np(thr_info[id].pth, sizeof(&set), &set); + } } #elif defined(WIN32) /* Windows */ -static inline void drop_policy(void) { } +static inline void drop_policy(void) {} // Windows CPU groups to manage more than 64 CPUs. -static void affine_to_cpu_mask( int id, unsigned long mask ) -{ - bool success; - unsigned long last_error; -// BOOL success; -// DWORD last_error; +static void affine_to_cpu_mask(int id, uint64_t mask) { + bool success; + unsigned long last_error; + // BOOL success; + // DWORD last_error; + + if (id == -1) + success = SetProcessAffinityMask(GetCurrentProcess(), mask); + + // Are Windows CPU Groups supported? +#if _WIN32_WINNT == 0x0601 + else if (num_cpugroups == 1) + success = SetThreadAffinityMask(GetCurrentThread(), mask); + else { + // Find the correct cpu group + int cpu = id % num_cpus; + int group; + for (group = 0; group < num_cpugroups; group++) { + int cpus = GetActiveProcessorCount(group); + if (cpu < cpus) + break; + cpu -= cpus; + } - if ( id == -1 ) - success = SetProcessAffinityMask( GetCurrentProcess(), mask ); + if (opt_debug) + applog(LOG_DEBUG, "Binding thread %d to cpu %d on cpu group %d (mask %x)", + id, cpu, group, (1ULL << cpu)); -// Are Windows CPU Groups supported? 
-#if _WIN32_WINNT==0x0601 - else if ( num_cpugroups == 1 ) - success = SetThreadAffinityMask( GetCurrentThread(), mask ); - else - { - // Find the correct cpu group - int cpu = id % num_cpus; - int group; - for( group = 0; group < num_cpugroups; group++ ) - { - int cpus = GetActiveProcessorCount( group ); - if ( cpu < cpus ) - break; - - cpu -= cpus; - } - - if (opt_debug) - applog(LOG_DEBUG, "Binding thread %d to cpu %d on cpu group %d (mask %x)", id, cpu, group, (1ULL << cpu)); - - GROUP_AFFINITY affinity; - affinity.Group = group; - affinity.Mask = 1ULL << cpu; - success = SetThreadGroupAffinity( GetCurrentThread(), &affinity, NULL ); - } + GROUP_AFFINITY affinity; + affinity.Group = group; + affinity.Mask = 1ULL << cpu; + success = SetThreadGroupAffinity(GetCurrentThread(), &affinity, NULL); + } #else - else - success = SetThreadAffinityMask( GetCurrentThread(), mask ); + else + success = SetThreadAffinityMask(GetCurrentThread(), mask); #endif - if (!success) - { - last_error = GetLastError(); - applog(LOG_WARNING, "affine_to_cpu_mask for %u returned %x", id, last_error); - } - + if (!success) { + last_error = GetLastError(); + applog(LOG_WARNING, "affine_to_cpu_mask for %u returned %x", id, + last_error); + } } #else -static inline void drop_policy(void) { } -static void affine_to_cpu_mask(int id, unsigned long mask) { } +static inline void drop_policy(void) {} +static void affine_to_cpu_mask(int id, unsigned long mask) {} #endif // not very useful, just index the arrray directly. // but declaring this function in miner.h eliminates // an annoying compiler warning for not using a static. 
-const char* algo_name( enum algos a ) {return algo_names[a];} +const char *algo_name(enum algos a) { return algo_names[a]; } -void get_currentalgo(char* buf, int sz) -{ - snprintf(buf, sz, "%s", algo_names[opt_algo]); +void get_currentalgo(char *buf, int sz) { + snprintf(buf, sz, "%s", algo_names[opt_algo]); } -void proper_exit(int reason) -{ +void proper_exit(int reason) { #ifdef WIN32 - if (opt_background) { - HWND hcon = GetConsoleWindow(); - if (hcon) { - // unhide parent command line windows - ShowWindow(hcon, SW_SHOWMINNOACTIVE); - } - } + if (opt_background) { + HWND hcon = GetConsoleWindow(); + if (hcon) { + // unhide parent command line windows + ShowWindow(hcon, SW_SHOWMINNOACTIVE); + } + } #endif - exit(reason); + exit(reason); +} + +uint32_t *get_stratum_job_ntime() { return (uint32_t *)stratum.job.ntime; } + +void work_free(struct work *w) { + if (w->txs) + free(w->txs); + if (w->workid) + free(w->workid); + if (w->job_id) + free(w->job_id); + if (w->xnonce2) + free(w->xnonce2); +} + +void work_copy(struct work *dest, const struct work *src) { + memcpy(dest, src, sizeof(struct work)); + if (src->txs) + dest->txs = strdup(src->txs); + if (src->workid) + dest->workid = strdup(src->workid); + if (src->job_id) + dest->job_id = strdup(src->job_id); + if (src->xnonce2) { + dest->xnonce2 = (uchar *)malloc(src->xnonce2_len); + memcpy(dest->xnonce2, src->xnonce2, src->xnonce2_len); + } } -uint32_t* get_stratum_job_ntime() -{ - return (uint32_t*)stratum.job.ntime; -} +int std_get_work_data_size() { return STD_WORK_DATA_SIZE; } -void work_free(struct work *w) -{ - if (w->txs) free(w->txs); - if (w->workid) free(w->workid); - if (w->job_id) free(w->job_id); - if (w->xnonce2) free(w->xnonce2); +// Default +bool std_le_work_decode(struct work *work) { + int i; + const int adata_sz = algo_gate.get_work_data_size() / 4; + const int atarget_sz = ARRAY_SIZE(work->target); + + for (i = 0; i < adata_sz; i++) + work->data[i] = le32dec(work->data + i); + for (i = 0; i < 
atarget_sz; i++) + work->target[i] = le32dec(work->target + i); + return true; } -void work_copy(struct work *dest, const struct work *src) -{ - memcpy(dest, src, sizeof(struct work)); - if (src->txs) - dest->txs = strdup(src->txs); - if (src->workid) - dest->workid = strdup(src->workid); - if (src->job_id) - dest->job_id = strdup(src->job_id); - if (src->xnonce2) { - dest->xnonce2 = (uchar*) malloc(src->xnonce2_len); - memcpy(dest->xnonce2, src->xnonce2, src->xnonce2_len); - } +bool std_be_work_decode(struct work *work) { + int i; + const int adata_sz = algo_gate.get_work_data_size() / 4; + const int atarget_sz = ARRAY_SIZE(work->target); + + for (i = 0; i < adata_sz; i++) + work->data[i] = be32dec(work->data + i); + for (i = 0; i < atarget_sz; i++) + work->target[i] = le32dec(work->target + i); + return true; } -int std_get_work_data_size() { return STD_WORK_DATA_SIZE; } +static bool work_decode(const json_t *val, struct work *work) { + const int data_size = algo_gate.get_work_data_size(); + const int target_size = sizeof(work->target); + + if (unlikely(!jobj_binary(val, "data", work->data, data_size))) { + applog(LOG_ERR, "JSON invalid data"); + return false; + } + if (unlikely(!jobj_binary(val, "target", work->target, target_size))) { + applog(LOG_ERR, "JSON invalid target"); + return false; + } -bool jr2_work_decode( const json_t *val, struct work *work ) -{ return rpc2_job_decode( val, work ); } + if (unlikely(!algo_gate.work_decode(work))) + return false; -// Default -bool std_le_work_decode( const json_t *val, struct work *work ) -{ - int i; - const int data_size = algo_gate.get_work_data_size(); - const int target_size = sizeof(work->target); - const int adata_sz = data_size / 4; - const int atarget_sz = ARRAY_SIZE(work->target); + if (!allow_mininginfo) + net_diff = algo_gate.calc_network_diff(work); - if (unlikely( !jobj_binary(val, "data", work->data, data_size) )) - { - applog(LOG_ERR, "JSON invalid data"); - return false; - } - if (unlikely( 
!jobj_binary(val, "target", work->target, target_size) )) - { - applog(LOG_ERR, "JSON invalid target"); - return false; - } - for ( i = 0; i < adata_sz; i++ ) - work->data[i] = le32dec( work->data + i ); - for ( i = 0; i < atarget_sz; i++ ) - work->target[i] = le32dec( work->target + i ); - return true; + work->targetdiff = hash_to_diff(work->target); + stratum_diff = last_targetdiff = work->targetdiff; + work->sharediff = 0; + algo_gate.decode_extra_data(work, &net_blocks); + + return true; } -bool std_be_work_decode( const json_t *val, struct work *work ) -{ - int i; - const int data_size = algo_gate.get_work_data_size(); - const int target_size = sizeof(work->target); - const int adata_sz = data_size / 4; - const int atarget_sz = ARRAY_SIZE(work->target); +// good alternative for wallet mining, difficulty and net hashrate +static const char *info_req = + "{\"method\": \"getmininginfo\", \"params\": [], \"id\":8}\r\n"; - if (unlikely( !jobj_binary(val, "data", work->data, data_size) )) - { - applog(LOG_ERR, "JSON invalid data"); - return false; +static bool get_mininginfo(CURL *curl, struct work *work) { + if (have_stratum || !allow_mininginfo || !dev_mining) + return false; + + int curl_err = 0; + json_t *val = + json_rpc_call(curl, rpc_url, rpc_userpass, info_req, &curl_err, 0); + + if (!val && curl_err == -1) { + allow_mininginfo = false; + applog(LOG_NOTICE, + "\"getmininginfo\" not supported, some stats not available"); + return false; + } + + json_t *res = json_object_get(val, "result"); + // "blocks": 491493 (= current work height - 1) + // "difficulty": 0.99607860999999998 + // "networkhashps": 56475980 + if (res) { + json_t *key = json_object_get(res, "difficulty"); + if (key) { + if (json_is_object(key)) + key = json_object_get(key, "proof-of-work"); + if (json_is_real(key)) + net_diff = work->targetdiff = json_real_value(key); } - if (unlikely( !jobj_binary(val, "target", work->target, target_size) )) - { - applog(LOG_ERR, "JSON invalid target"); - 
return false; + + key = json_object_get(res, "networkhashps"); + if (key) { + if (json_is_integer(key)) + net_hashrate = (double)json_integer_value(key); + else if (json_is_real(key)) + net_hashrate = (double)json_real_value(key); } - for ( i = 0; i < adata_sz; i++ ) - work->data[i] = be32dec( work->data + i ); - for ( i = 0; i < atarget_sz; i++ ) - work->target[i] = le32dec( work->target + i ); - return true; -} -static bool work_decode( const json_t *val, struct work *work ) -{ - if ( !algo_gate.work_decode( val, work ) ) - return false; - if ( !allow_mininginfo ) - net_diff = algo_gate.calc_network_diff( work ); - work->targetdiff = target_to_diff( work->target ); - // for api stats, on longpoll pools - stratum_diff = work->targetdiff; - work->sharediff = 0; - algo_gate.decode_extra_data( work, &net_blocks ); - return true; -} + key = json_object_get(res, "blocks"); + if (key && json_is_integer(key)) + net_blocks = json_integer_value(key); -// good alternative for wallet mining, difficulty and net hashrate -static const char *info_req = -"{\"method\": \"getmininginfo\", \"params\": [], \"id\":8}\r\n"; + if (opt_debug) + applog(LOG_INFO, "Mining info: diff %.5g, net_hashrate %f, height %d", + net_diff, net_hashrate, net_blocks); -static bool get_mininginfo(CURL *curl, struct work *work) -{ - if (have_stratum || !allow_mininginfo) - return false; - - int curl_err = 0; - json_t *val = json_rpc_call(curl, rpc_url, rpc_userpass, info_req, &curl_err, 0); - - if (!val && curl_err == -1) { - allow_mininginfo = false; - if (opt_debug) { - applog(LOG_DEBUG, "getmininginfo not supported"); - } - return false; - } - else - { - json_t *res = json_object_get(val, "result"); - // "blocks": 491493 (= current work height - 1) - // "difficulty": 0.99607860999999998 - // "networkhashps": 56475980 - if (res) - { - json_t *key = json_object_get(res, "difficulty"); - if (key) { - if (json_is_object(key)) - key = json_object_get(key, "proof-of-work"); - if (json_is_real(key)) - 
net_diff = json_real_value(key); - } - key = json_object_get(res, "networkhashps"); - if (key && json_is_integer(key)) { - net_hashrate = (double) json_integer_value(key); - } - key = json_object_get(res, "blocks"); - if (key && json_is_integer(key)) { - net_blocks = json_integer_value(key); - } - if (!work->height) - { - // complete missing data from getwork - work->height = (uint32_t) net_blocks + 1; - if (work->height > g_work.height) - { - restart_threads(); - if (!opt_quiet) { - char netinfo[64] = { 0 }; - char srate[32] = { 0 }; - sprintf(netinfo, "diff %.2f", net_diff); - if (net_hashrate) { - format_hashrate(net_hashrate, srate); - strcat(netinfo, ", net "); - strcat(netinfo, srate); - } - applog(LOG_BLUE, "%s block %d, %s", - algo_names[opt_algo], work->height, netinfo); - } - } - } - } - } - json_decref(val); - return true; + if (!work->height) { + // complete missing data from getwork + work->height = (uint32_t)net_blocks + 1; + if (work->height > g_work.height) + restart_threads(); + } // res + } + json_decref(val); + return true; } // hodl needs 4 but leave it at 3 until gbt better understood //#define BLOCK_VERSION_CURRENT 3 #define BLOCK_VERSION_CURRENT 4 -static bool gbt_work_decode( const json_t *val, struct work *work ) -{ - int i, n; - uint32_t version, curtime, bits; - uint32_t prevhash[8]; - uint32_t target[8]; - int cbtx_size; - uchar *cbtx = NULL; - int tx_count, tx_size; - uchar txc_vi[9]; - uchar(*merkle_tree)[32] = NULL; - bool coinbase_append = false; - bool submit_coinbase = false; - bool version_force = false; - bool version_reduce = false; - json_t *tmp, *txa; - bool rc = false; - - tmp = json_object_get( val, "mutable" ); - if ( tmp && json_is_array( tmp ) ) - { - n = (int) json_array_size( tmp ); - for ( i = 0; i < n; i++ ) - { - const char *s = json_string_value( json_array_get( tmp, i ) ); - if ( !s ) - continue; - if ( !strcmp( s, "coinbase/append" ) ) coinbase_append = true; - else if ( !strcmp( s, "submit/coinbase" ) ) 
submit_coinbase = true; - else if ( !strcmp( s, "version/force" ) ) version_force = true; - else if ( !strcmp( s, "version/reduce" ) ) version_reduce = true; - } - } +static bool gbt_work_decode(const json_t *val, struct work *work) { + int i, n; + uint32_t version, curtime, bits; + uint32_t prevhash[8]; + uint32_t target[8]; + unsigned char final_sapling_hash[32]; + int cbtx_size; + uchar *cbtx = NULL; + int tx_count, tx_size; + uchar txc_vi[9]; + uchar(*merkle_tree)[32] = NULL; + bool coinbase_append = false; + bool submit_coinbase = false; + bool version_force = false; + bool version_reduce = false; + json_t *tmp, *txa; + bool rc = false; + + // Segwit BEGIN + bool segwit = false; + tmp = json_object_get(val, "rules"); + if (tmp && json_is_array(tmp)) { + n = json_array_size(tmp); + for (i = 0; i < n; i++) { + const char *s = json_string_value(json_array_get(tmp, i)); + if (!s) + continue; + if (!strcmp(s, "segwit") || !strcmp(s, "!segwit")) + segwit = true; + } + } + // Segwit END + + tmp = json_object_get(val, "mutable"); + if (tmp && json_is_array(tmp)) { + n = (int)json_array_size(tmp); + for (i = 0; i < n; i++) { + const char *s = json_string_value(json_array_get(tmp, i)); + if (!s) + continue; + if (!strcmp(s, "coinbase/append")) + coinbase_append = true; + else if (!strcmp(s, "submit/coinbase")) + submit_coinbase = true; + else if (!strcmp(s, "version/force")) + version_force = true; + else if (!strcmp(s, "version/reduce")) + version_reduce = true; + } + } - tmp = json_object_get( val, "height" ); - if ( !tmp || !json_is_integer( tmp ) ) - { - applog( LOG_ERR, "JSON invalid height" ); + tmp = json_object_get(val, "height"); + if (!tmp || !json_is_integer(tmp)) { + applog(LOG_ERR, "JSON invalid height"); + goto out; + } + work->height = (int)json_integer_value(tmp); + + tmp = json_object_get(val, "version"); + if (!tmp || !json_is_integer(tmp)) { + applog(LOG_ERR, "JSON invalid version"); + goto out; + } + version = (uint32_t)json_integer_value(tmp); + // 
yescryptr8g uses block version 5 and sapling. + if (opt_sapling) + work->sapling = true; + if ((version & 0xffU) > BLOCK_VERSION_CURRENT) { + if (version_reduce) + version = (version & ~0xffU) | BLOCK_VERSION_CURRENT; + else if (have_gbt && allow_getwork && !version_force) { + applog(LOG_DEBUG, "Switching to getwork, gbt version %d", version); + have_gbt = false; goto out; - } - work->height = (int) json_integer_value( tmp ); - applog( LOG_BLUE, "Current block is %d", work->height ); - - tmp = json_object_get(val, "version"); - if ( !tmp || !json_is_integer( tmp ) ) - { - applog( LOG_ERR, "JSON invalid version" ); + } else if (!version_force) { + applog(LOG_ERR, "Unrecognized block version: %u", version); goto out; - } - version = (uint32_t) json_integer_value( tmp ); - if ( (version & 0xffU) > BLOCK_VERSION_CURRENT ) - { - if ( version_reduce ) - { - version = ( version & ~0xffU ) | BLOCK_VERSION_CURRENT; - } - else if ( have_gbt && allow_getwork && !version_force ) - { - applog( LOG_DEBUG, "Switching to getwork, gbt version %d", version ); - have_gbt = false; - goto out; - } - else if ( !version_force ) - { - applog(LOG_ERR, "Unrecognized block version: %u", version); - goto out; - } - } + } + } - if ( unlikely( !jobj_binary(val, "previousblockhash", prevhash, - sizeof(prevhash)) ) ) - { - applog( LOG_ERR, "JSON invalid previousblockhash" ); - goto out; - } + if (unlikely( + !jobj_binary(val, "previousblockhash", prevhash, sizeof(prevhash)))) { + applog(LOG_ERR, "JSON invalid previousblockhash"); + goto out; + } + + tmp = json_object_get(val, "curtime"); + if (!tmp || !json_is_integer(tmp)) { + applog(LOG_ERR, "JSON invalid curtime"); + goto out; + } + curtime = (uint32_t)json_integer_value(tmp); + + if (unlikely(!jobj_binary(val, "bits", &bits, sizeof(bits)))) { + applog(LOG_ERR, "JSON invalid bits"); + goto out; + } - tmp = json_object_get( val, "curtime" ); - if ( !tmp || !json_is_integer( tmp ) ) - { - applog( LOG_ERR, "JSON invalid curtime" ); + if 
(work->sapling) { + if (unlikely(!jobj_binary(val, "finalsaplingroothash", final_sapling_hash, + sizeof(final_sapling_hash)))) { + applog(LOG_ERR, "JSON invalid finalsaplingroothash"); goto out; - } - curtime = (uint32_t) json_integer_value(tmp); + } + } - if ( unlikely( !jobj_binary( val, "bits", &bits, sizeof(bits) ) ) ) - { - applog(LOG_ERR, "JSON invalid bits"); + /* find count and size of transactions */ + txa = json_object_get(val, "transactions"); + if (!txa || !json_is_array(txa)) { + applog(LOG_ERR, "JSON invalid transactions"); + goto out; + } + tx_count = (int)json_array_size(txa); + tx_size = 0; + for (i = 0; i < tx_count; i++) { + const json_t *tx = json_array_get(txa, i); + const char *tx_hex = json_string_value(json_object_get(tx, "data")); + if (!tx_hex) { + applog(LOG_ERR, "JSON invalid transactions"); goto out; - } + } + tx_size += (int)(strlen(tx_hex) / 2); + } - /* find count and size of transactions */ - txa = json_object_get(val, "transactions" ); - if ( !txa || !json_is_array( txa ) ) - { - applog( LOG_ERR, "JSON invalid transactions" ); + /* build coinbase transaction */ + tmp = json_object_get(val, "coinbasetxn"); + if (tmp) { + const char *cbtx_hex = json_string_value(json_object_get(tmp, "data")); + cbtx_size = cbtx_hex ? 
(int)strlen(cbtx_hex) / 2 : 0; + cbtx = (uchar *)malloc(cbtx_size + 100); + if (cbtx_size < 60 || !hex2bin(cbtx, cbtx_hex, cbtx_size)) { + applog(LOG_ERR, "JSON invalid coinbasetxn"); goto out; - } - tx_count = (int) json_array_size( txa ); - tx_size = 0; - for ( i = 0; i < tx_count; i++ ) - { - const json_t *tx = json_array_get( txa, i ); - const char *tx_hex = json_string_value( json_object_get( tx, "data" ) ); - if ( !tx_hex ) - { - applog( LOG_ERR, "JSON invalid transactions" ); - goto out; - } - tx_size += (int) ( strlen( tx_hex ) / 2 ); - } - - /* build coinbase transaction */ - tmp = json_object_get( val, "coinbasetxn" ); - if ( tmp ) - { - const char *cbtx_hex = json_string_value( json_object_get( tmp, "data" )); - cbtx_size = cbtx_hex ? (int) strlen( cbtx_hex ) / 2 : 0; - cbtx = (uchar*) malloc( cbtx_size + 100 ); - if ( cbtx_size < 60 || !hex2bin( cbtx, cbtx_hex, cbtx_size ) ) - { - applog( LOG_ERR, "JSON invalid coinbasetxn" ); - goto out; - } - } - else - { - int64_t cbvalue; - if ( !pk_script_size ) - { - if ( allow_getwork ) - { - applog( LOG_INFO, "No payout address provided, switching to getwork"); - have_gbt = false; - } - else - applog( LOG_ERR, "No payout address provided" ); - goto out; - } - tmp = json_object_get( val, "coinbasevalue" ); - if ( !tmp || !json_is_number( tmp ) ) - { - applog( LOG_ERR, "JSON invalid coinbasevalue" ); - goto out; - } - cbvalue = (int64_t) ( json_is_integer( tmp ) ? json_integer_value( tmp ) - : json_number_value( tmp ) ); - cbtx = (uchar*) malloc(256); - le32enc( (uint32_t *)cbtx, 1 ); /* version */ - cbtx[4] = 1; /* in-counter */ - memset( cbtx+5, 0x00, 32 ); /* prev txout hash */ - le32enc( (uint32_t *)(cbtx+37), 0xffffffff ); /* prev txout index */ - cbtx_size = 43; - /* BIP 34: height in coinbase */ - for ( n = work->height; n; n >>= 8 ) - cbtx[cbtx_size++] = n & 0xff; - /* If the last byte pushed is >= 0x80, then we need to add - another zero byte to signal that the block height is a - positive number. 
*/ - if (cbtx[cbtx_size - 1] & 0x80) - cbtx[cbtx_size++] = 0; - cbtx[42] = cbtx_size - 43; - cbtx[41] = cbtx_size - 42; /* scriptsig length */ - le32enc( (uint32_t *)( cbtx+cbtx_size ), 0xffffffff ); /* sequence */ - cbtx_size += 4; - cbtx[ cbtx_size++ ] = 1; /* out-counter */ - le32enc( (uint32_t *)( cbtx+cbtx_size) , (uint32_t)cbvalue ); /* value */ - le32enc( (uint32_t *)( cbtx+cbtx_size+4 ), cbvalue >> 32 ); + } + } else { + int64_t cbvalue; + if (!pk_script_size) { + if (allow_getwork) { + applog(LOG_INFO, "No payout address provided, switching to getwork"); + have_gbt = false; + } else + applog(LOG_ERR, "No payout address provided"); + goto out; + } + tmp = json_object_get(val, "coinbasevalue"); + if (!tmp || !json_is_number(tmp)) { + applog(LOG_ERR, "JSON invalid coinbasevalue"); + goto out; + } + cbvalue = (int64_t)(json_is_integer(tmp) ? json_integer_value(tmp) + : json_number_value(tmp)); + cbtx = (uchar *)malloc(256); + le32enc((uint32_t *)cbtx, 1); /* version */ + cbtx[4] = 1; /* in-counter */ + memset(cbtx + 5, 0x00, 32); /* prev txout hash */ + le32enc((uint32_t *)(cbtx + 37), 0xffffffff); /* prev txout index */ + cbtx_size = 43; + /* BIP 34: height in coinbase */ + for (n = work->height; n; n >>= 8) + cbtx[cbtx_size++] = n & 0xff; + /* If the last byte pushed is >= 0x80, then we need to add + another zero byte to signal that the block height is a + positive number. */ + if (cbtx[cbtx_size - 1] & 0x80) + cbtx[cbtx_size++] = 0; + cbtx[42] = cbtx_size - 43; + cbtx[41] = cbtx_size - 42; /* scriptsig length */ + le32enc((uint32_t *)(cbtx + cbtx_size), 0xffffffff); /* sequence */ + cbtx_size += 4; + + // Segwit BEGIN + // cbtx[cbtx_size++] = 1; /* out-counter */ + cbtx[cbtx_size++] = segwit ? 
2 : 1; /* out-counter */ + // Segwit END + + le32enc((uint32_t *)(cbtx + cbtx_size), (uint32_t)cbvalue); /* value */ + le32enc((uint32_t *)(cbtx + cbtx_size + 4), cbvalue >> 32); + cbtx_size += 8; + cbtx[cbtx_size++] = (uint8_t)pk_script_size; /* txout-script length */ + memcpy(cbtx + cbtx_size, pk_script, pk_script_size); + cbtx_size += (int)pk_script_size; + + // Segwit BEGIN + if (segwit) { + unsigned char(*wtree)[32] = calloc(tx_count + 2, 32); + memset(cbtx + cbtx_size, 0, 8); /* value */ cbtx_size += 8; - cbtx[ cbtx_size++ ] = (uint8_t) pk_script_size; /* txout-script length */ - memcpy( cbtx+cbtx_size, pk_script, pk_script_size ); - cbtx_size += (int) pk_script_size; - le32enc( (uint32_t *)( cbtx+cbtx_size ), 0 ); /* lock time */ - cbtx_size += 4; - coinbase_append = true; - } - if ( coinbase_append ) - { - unsigned char xsig[100]; - int xsig_len = 0; - if ( *coinbase_sig ) - { - n = (int) strlen( coinbase_sig ); - if ( cbtx[41] + xsig_len + n <= 100 ) - { - memcpy( xsig+xsig_len, coinbase_sig, n ); - xsig_len += n; - } - else - { - applog( LOG_WARNING, - "Signature does not fit in coinbase, skipping" ); - } + cbtx[cbtx_size++] = 38; /* txout-script length */ + cbtx[cbtx_size++] = 0x6a; /* txout-script */ + cbtx[cbtx_size++] = 0x24; + cbtx[cbtx_size++] = 0xaa; + cbtx[cbtx_size++] = 0x21; + cbtx[cbtx_size++] = 0xa9; + cbtx[cbtx_size++] = 0xed; + for (i = 0; i < tx_count; i++) { + const json_t *tx = json_array_get(txa, i); + const json_t *hash = json_object_get(tx, "hash"); + if (!hash || !hex2bin(wtree[1 + i], json_string_value(hash), 32)) { + applog(LOG_ERR, "JSON invalid transaction hash"); + free(wtree); + goto out; + } + memrev(wtree[1 + i], 32); } - tmp = json_object_get( val, "coinbaseaux" ); - if ( tmp && json_is_object( tmp ) ) - { - void *iter = json_object_iter( tmp ); - while ( iter ) - { - unsigned char buf[100]; - const char *s = json_string_value( json_object_iter_value( iter ) ); - n = s ? 
(int) ( strlen(s) / 2 ) : 0; - if ( !s || n > 100 || !hex2bin( buf, s, n ) ) - { - applog(LOG_ERR, "JSON invalid coinbaseaux"); - break; - } - if ( cbtx[41] + xsig_len + n <= 100 ) - { - memcpy( xsig+xsig_len, buf, n ); - xsig_len += n; - } - iter = json_object_iter_next( tmp, iter ); - } + n = tx_count + 1; + while (n > 1) { + if (n % 2) + memcpy(wtree[n], wtree[n - 1], 32); + n = (n + 1) / 2; + for (i = 0; i < n; i++) + sha256d(wtree[i], wtree[2 * i], 64); } - if ( xsig_len ) - { - unsigned char *ssig_end = cbtx + 42 + cbtx[41]; - int push_len = cbtx[41] + xsig_len < 76 ? 1 : - cbtx[41] + 2 + xsig_len > 100 ? 0 : 2; - n = xsig_len + push_len; - memmove( ssig_end + n, ssig_end, cbtx_size - 42 - cbtx[41] ); - cbtx[41] += n; - if ( push_len == 2 ) - *(ssig_end++) = 0x4c; /* OP_PUSHDATA1 */ - if ( push_len ) - *(ssig_end++) = xsig_len; - memcpy( ssig_end, xsig, xsig_len ); - cbtx_size += n; + memset(wtree[1], 0, 32); /* witness reserved value = 0 */ + sha256d(cbtx + cbtx_size, wtree[0], 64); + cbtx_size += 32; + free(wtree); + } + // Segwit END + + le32enc((uint32_t *)(cbtx + cbtx_size), 0); /* lock time */ + cbtx_size += 4; + coinbase_append = true; + } + if (coinbase_append) { + unsigned char xsig[100]; + int xsig_len = 0; + if (*coinbase_sig) { + n = (int)strlen(coinbase_sig); + if (cbtx[41] + xsig_len + n <= 100) { + memcpy(xsig + xsig_len, coinbase_sig, n); + xsig_len += n; + } else { + applog(LOG_WARNING, "Signature does not fit in coinbase, skipping"); + } + } + tmp = json_object_get(val, "coinbaseaux"); + if (tmp && json_is_object(tmp)) { + void *iter = json_object_iter(tmp); + while (iter) { + unsigned char buf[100]; + const char *s = json_string_value(json_object_iter_value(iter)); + n = s ? 
(int)(strlen(s) / 2) : 0; + if (!s || n > 100 || !hex2bin(buf, s, n)) { + applog(LOG_ERR, "JSON invalid coinbaseaux"); + break; + } + if (cbtx[41] + xsig_len + n <= 100) { + memcpy(xsig + xsig_len, buf, n); + xsig_len += n; + } + iter = json_object_iter_next(tmp, iter); } - } - - n = varint_encode( txc_vi, 1 + tx_count ); - work->txs = (char*) malloc( 2 * ( n + cbtx_size + tx_size ) + 1 ); - bin2hex( work->txs, txc_vi, n ); - bin2hex( work->txs + 2*n, cbtx, cbtx_size ); - - /* generate merkle root */ - merkle_tree = (uchar(*)[32]) calloc(((1 + tx_count + 1) & ~1), 32); - sha256d(merkle_tree[0], cbtx, cbtx_size); - for ( i = 0; i < tx_count; i++ ) - { - tmp = json_array_get( txa, i ); - const char *tx_hex = json_string_value( json_object_get( tmp, "data" ) ); - const int tx_size = tx_hex ? (int) ( strlen( tx_hex ) / 2 ) : 0; - unsigned char *tx = (uchar*) malloc( tx_size ); - if ( !tx_hex || !hex2bin( tx, tx_hex, tx_size ) ) - { - applog( LOG_ERR, "JSON invalid transactions" ); - free( tx ); - goto out; + } + if (xsig_len) { + unsigned char *ssig_end = cbtx + 42 + cbtx[41]; + int push_len = cbtx[41] + xsig_len < 76 ? 1 + : cbtx[41] + 2 + xsig_len > 100 ? 0 + : 2; + n = xsig_len + push_len; + memmove(ssig_end + n, ssig_end, cbtx_size - 42 - cbtx[41]); + cbtx[41] += n; + if (push_len == 2) + *(ssig_end++) = 0x4c; /* OP_PUSHDATA1 */ + if (push_len) + *(ssig_end++) = xsig_len; + memcpy(ssig_end, xsig, xsig_len); + cbtx_size += n; + } + } + + n = varint_encode(txc_vi, 1 + tx_count); + work->txs = (char *)malloc(2 * (n + cbtx_size + tx_size) + 1); + bin2hex(work->txs, txc_vi, n); + bin2hex(work->txs + 2 * n, cbtx, cbtx_size); + + /* generate merkle root */ + merkle_tree = (uchar(*)[32])calloc(((1 + tx_count + 1) & ~1), 32); + sha256d(merkle_tree[0], cbtx, cbtx_size); + for (i = 0; i < tx_count; i++) { + tmp = json_array_get(txa, i); + const char *tx_hex = json_string_value(json_object_get(tmp, "data")); + const int tx_size = tx_hex ? 
(int)(strlen(tx_hex) / 2) : 0; + + // Segwit BEGIN + if (segwit) { + const char *txid = json_string_value(json_object_get(tmp, "txid")); + if (!txid || !hex2bin(merkle_tree[1 + i], txid, 32)) { + applog(LOG_ERR, "JSON invalid transaction txid"); + goto out; } - sha256d( merkle_tree[1 + i], tx, tx_size ); - if ( !submit_coinbase ) - strcat( work->txs, tx_hex ); - } - n = 1 + tx_count; - while ( n > 1 ) - { - if ( n % 2 ) - { - memcpy( merkle_tree[n], merkle_tree[n-1], 32 ); - ++n; + memrev(merkle_tree[1 + i], 32); + } else { + // Segwit END + + unsigned char *tx = (uchar *)malloc(tx_size); + if (!tx_hex || !hex2bin(tx, tx_hex, tx_size)) { + applog(LOG_ERR, "JSON invalid transactions"); + free(tx); + goto out; } - n /= 2; - for ( i = 0; i < n; i++ ) - sha256d( merkle_tree[i], merkle_tree[2*i], 64 ); - } - - /* assemble block header */ - algo_gate.build_block_header( work, swab32( version ), - (uint32_t*) prevhash, (uint32_t*) merkle_tree, - swab32( curtime ), le32dec( &bits ) ); - - if ( unlikely( !jobj_binary(val, "target", target, sizeof(target)) ) ) - { - applog( LOG_ERR, "JSON invalid target" ); - goto out; - } - for ( i = 0; i < ARRAY_SIZE( work->target ); i++ ) - work->target[7 - i] = be32dec( target + i ); - - tmp = json_object_get( val, "workid" ); - if ( tmp ) - { - if ( !json_is_string( tmp ) ) - { - applog( LOG_ERR, "JSON invalid workid" ); + sha256d(merkle_tree[1 + i], tx, tx_size); + free(tx); + + // Segwit BEGIN + } + // Segwit END + + if (!submit_coinbase) + strcat(work->txs, tx_hex); + } + n = 1 + tx_count; + while (n > 1) { + if (n % 2) { + memcpy(merkle_tree[n], merkle_tree[n - 1], 32); + ++n; + } + n /= 2; + for (i = 0; i < n; i++) + sha256d(merkle_tree[i], merkle_tree[2 * i], 64); + } + + /* assemble block header */ + algo_gate.build_block_header(work, swab32(version), (uint32_t *)prevhash, + (uint32_t *)merkle_tree, swab32(curtime), + le32dec(&bits), final_sapling_hash); + + if (unlikely(!jobj_binary(val, "target", target, sizeof(target)))) { + 
applog(LOG_ERR, "JSON invalid target"); + goto out; + } + for (i = 0; i < ARRAY_SIZE(work->target); i++) + work->target[7 - i] = be32dec(target + i); + + tmp = json_object_get(val, "workid"); + if (tmp) { + if (!json_is_string(tmp)) { + applog(LOG_ERR, "JSON invalid workid"); goto out; - } - work->workid = strdup( json_string_value( tmp ) ); - } + } + work->workid = strdup(json_string_value(tmp)); + } - rc = true; + rc = true; out: - /* Long polling */ - tmp = json_object_get( val, "longpollid" ); - if ( want_longpoll && json_is_string( tmp ) ) - { - free( lp_id ); - lp_id = strdup( json_string_value( tmp ) ); - if ( !have_longpoll ) - { - char *lp_uri; - tmp = json_object_get( val, "longpolluri" ); - lp_uri = json_is_string( tmp ) ? strdup( json_string_value( tmp ) ) - : rpc_url; - have_longpoll = true; - tq_push(thr_info[longpoll_thr_id].q, lp_uri); - } - } - - free( merkle_tree ); - free( cbtx ); - return rc; -} + /* Long polling */ + tmp = json_object_get(val, "longpollid"); + if (want_longpoll && json_is_string(tmp)) { + free(lp_id); + lp_id = strdup(json_string_value(tmp)); + if (!have_longpoll) { + char *lp_uri; + tmp = json_object_get(val, "longpolluri"); + lp_uri = json_is_string(tmp) ? 
strdup(json_string_value(tmp)) : rpc_url; + have_longpoll = true; + tq_push(thr_info[longpoll_thr_id].q, lp_uri); + } + } -void scale_hash_for_display ( double* hashrate, char* units ) -{ - if ( *hashrate < 1e4 ) // 0 H/s to 9999 H/s - *units = 0; - else if ( *hashrate < 1e7 ) // 10 kH/s to 9999 kH/s - { *units = 'k'; *hashrate /= 1e3; } - else if ( *hashrate < 1e10 ) // 10 Mh/s to 9999 Mh/s - { *units = 'M'; *hashrate /= 1e6; } - else if ( *hashrate < 1e13 ) // 10 Gh/s to 9999 Gh/s - { *units = 'G'; *hashrate /= 1e9; } - else if ( *hashrate < 1e16 ) // 10 Th/s to 9999 Th/s - { *units = 'T'; *hashrate /= 1e12; } - else // 10 Ph/s and higher - { *units = 'P'; *hashrate /= 1e15; } + free(merkle_tree); + free(cbtx); + return rc; +} + +// returns the unit prefix and the hashrate appropriately scaled. +void scale_hash_for_display(double *hashrate, char *prefix) { + if (*hashrate < 1e4) + *prefix = 0; + else if (*hashrate < 1e7) { + *prefix = 'k'; + *hashrate /= 1e3; + } else if (*hashrate < 1e10) { + *prefix = 'M'; + *hashrate /= 1e6; + } else if (*hashrate < 1e13) { + *prefix = 'G'; + *hashrate /= 1e9; + } else if (*hashrate < 1e16) { + *prefix = 'T'; + *hashrate /= 1e12; + } else if (*hashrate < 1e19) { + *prefix = 'P'; + *hashrate /= 1e15; + } else if (*hashrate < 1e22) { + *prefix = 'E'; + *hashrate /= 1e18; + } else if (*hashrate < 1e25) { + *prefix = 'Z'; + *hashrate /= 1e21; + } else { + *prefix = 'Y'; + *hashrate /= 1e24; + } } -// Bitcoin formula for converting a share's difficulty to an equivalent -// number of hashes. -// -// https://en.bitcoin.it/wiki/Difficulty -// -// H = D * 2**48 / 0xffff -// = D * 2**32 -// -// That formula doesn't seem to be accurate but an adjustment to the -// constant produces correct results. 
-// -// The formula used is: -// -// hash = sharediff * 2**48 / 0x3fff -// = sharediff * 2**30 -// = sharediff * diff2hash - -const uint64_t diff2hash = 0x40000000ULL; - -static struct timeval five_min_start; -static double shash_sum = 0.; -static double bhash_sum = 0.; -static double time_sum = 0.; -static double latency_sum = 0.; -static uint64_t submit_sum = 0; -static uint64_t reject_sum = 0; - -struct share_stats_t -{ - struct timeval submit_time; - double net_diff; - double share_diff; - char job_id[32]; +static inline void sprintf_et(char *str, int seconds) { + // sprintf doesn't like uint64_t, Linux thinks it's long, Windows long long. + unsigned int min = seconds / 60; + unsigned int sec = seconds % 60; + unsigned int hrs = min / 60; + if (unlikely(hrs)) { + unsigned int years = hrs / (24 * 365); + unsigned int days = hrs / 24; + if (years) + sprintf(str, "%uy%ud", years, years % 365); + else if (days) // 0d00h + sprintf(str, "%ud%02uh", days, hrs % 24); + else // 0h00m + sprintf(str, "%uh%02um", hrs, min % 60); + } else // 0m00s + sprintf(str, "%um%02us", min, sec); +} + +const long double exp32 = EXP32; // 2**32 +const long double exp48 = EXP32 * EXP16; // 2**48 +const long double exp64 = EXP32 * EXP32; // 2**64 +const long double exp96 = EXP32 * EXP32 * EXP32; // 2**96 +const long double exp128 = EXP32 * EXP32 * EXP32 * EXP32; // 2**128 +const long double exp160 = EXP32 * EXP32 * EXP32 * EXP32 * EXP16; // 2**160 + +struct share_stats_t { + int share_count; + struct timeval submit_time; + double net_diff; + double share_diff; + double stratum_diff; + double target_diff; + char job_id[32]; }; -// with more and more parallelism the chances of submitting multiple -// shares in a very short time grows. 
#define s_stats_size 8 -static struct share_stats_t share_stats[ s_stats_size ]; +static struct share_stats_t share_stats[s_stats_size] = {{0}}; static int s_get_ptr = 0, s_put_ptr = 0; static struct timeval last_submit_time = {0}; -static inline int stats_ptr_incr( int p ) -{ - return ++p < s_stats_size ? p : 0; +static inline int stats_ptr_incr(int p) { return ++p % s_stats_size; } + +void report_summary_log(bool force) { + struct timeval now, et, uptime, start_time; + + gettimeofday(&now, NULL); + timeval_subtract(&et, &now, &five_min_start); + +#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32)) + + // Display CPU temperature and clock rate. + int curr_temp = cpu_temp(0); + static struct timeval cpu_temp_time = {0}; + struct timeval diff; + + if (!opt_quiet || (curr_temp >= 80)) { + int wait_time = curr_temp >= 90 ? 5 + : curr_temp >= 80 ? 30 + : curr_temp >= 70 ? 60 + : 120; + timeval_subtract(&diff, &now, &cpu_temp_time); + if ((diff.tv_sec > wait_time) || + ((curr_temp > prev_temp) && (curr_temp >= 75))) { + char tempstr[32]; + float lo_freq = 0., hi_freq = 0.; + + memcpy(&cpu_temp_time, &now, sizeof(cpu_temp_time)); + linux_cpu_hilo_freq(&lo_freq, &hi_freq); + if (use_colors && (curr_temp >= 70)) { + if (curr_temp >= 80) + sprintf(tempstr, "%s%d C%s", CL_RED, curr_temp, CL_WHT); + else + sprintf(tempstr, "%s%d C%s", CL_YLW, curr_temp, CL_WHT); + } else + sprintf(tempstr, "%d C", curr_temp); + + applog(LOG_NOTICE, "CPU temp: curr %s max %d, Freq: %.3f/%.3f GHz", + tempstr, hi_temp, lo_freq / 1e6, hi_freq / 1e6); + if (curr_temp > hi_temp) + hi_temp = curr_temp; + prev_temp = curr_temp; + } + } + +#endif + + if (!(force && (submit_sum || (et.tv_sec > 5))) && (et.tv_sec < 300)) + return; + + // collect and reset periodic counters + pthread_mutex_lock(&stats_lock); + + uint64_t submits = submit_sum; + submit_sum = 0; + uint64_t accepts = accept_sum; + accept_sum = 0; + uint64_t rejects = reject_sum; + reject_sum = 0; + uint64_t stales = stale_sum; 
+ stale_sum = 0; + uint64_t solved = solved_sum; + solved_sum = 0; + memcpy(&start_time, &five_min_start, sizeof start_time); + memcpy(&five_min_start, &now, sizeof now); + + pthread_mutex_unlock(&stats_lock); + + timeval_subtract(&et, &now, &start_time); + timeval_subtract(&uptime, &now, &session_start); + + double share_time = (double)et.tv_sec + (double)et.tv_usec / 1e6; + double ghrate = global_hashrate; + double shrate = + safe_div(exp32 * last_targetdiff * (double)(accepts), share_time, 0.); + double sess_hrate = + safe_div(exp32 * norm_diff_sum, (double)uptime.tv_sec, 0.); + double submit_rate = safe_div((double)submits * 60., share_time, 0.); + char shr_units[4] = {0}; + char ghr_units[4] = {0}; + char sess_hr_units[4] = {0}; + char et_str[24]; + char upt_str[24]; + + scale_hash_for_display(&shrate, shr_units); + scale_hash_for_display(&ghrate, ghr_units); + scale_hash_for_display(&sess_hrate, sess_hr_units); + + sprintf_et(et_str, et.tv_sec); + sprintf_et(upt_str, uptime.tv_sec); + + applog(LOG_BLUE, "%s: %s", algo_names[opt_algo], short_url); + applog2(LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str); + applog2(LOG_INFO, "Share rate %.2f/min %.2f/min", submit_rate, + (double)submitted_share_count * 60. / + ((double)uptime.tv_sec + (double)uptime.tv_usec / 1e6)); + applog2(LOG_INFO, "Hash rate %7.2f%sh/s %7.2f%sh/s (%.2f%sh/s)", + shrate, shr_units, sess_hrate, sess_hr_units, ghrate, ghr_units); + + if (accepted_share_count < submitted_share_count) { + double ltd = exp32 * last_targetdiff; + double lost_ghrate = + uptime.tv_sec == 0 + ? 0. + : ltd * (double)(submitted_share_count - accepted_share_count) / + (double)uptime.tv_sec; + double lost_shrate = + share_time == 0. ? 0. 
: ltd * (double)(submits - accepts) / share_time; + char lshr_units[4] = {0}; + char lghr_units[4] = {0}; + scale_hash_for_display(&lost_shrate, lshr_units); + scale_hash_for_display(&lost_ghrate, lghr_units); + applog2(LOG_INFO, "Lost hash rate %7.2f%sh/s %7.2f%sh/s", lost_shrate, + lshr_units, lost_ghrate, lghr_units); + } + + applog2(LOG_INFO, "Submitted %7d %7d", submits, + submitted_share_count); + applog2(LOG_INFO, "Accepted %7d %7d %5.1f%%", accepts, + accepted_share_count, + 100. * safe_div((double)accepted_share_count, + (double)submitted_share_count, 0.)); + if (stale_share_count) + applog2(LOG_INFO, "Stale %7d %7d %5.1f%%", stales, + stale_share_count, + 100. * safe_div((double)stale_share_count, + (double)submitted_share_count, 0.)); + if (rejected_share_count) + applog2(LOG_INFO, "Rejected %7d %7d %5.1f%%", rejects, + rejected_share_count, + 100. * safe_div((double)rejected_share_count, + (double)submitted_share_count, 0.)); + if (solved_block_count) + applog2(LOG_INFO, "Blocks Solved %7d %7d", solved, + solved_block_count); + applog2(LOG_INFO, "Hi/Lo Share Diff %.5g / %.5g", highest_share, + lowest_share); + + int mismatch = + submitted_share_count - + (accepted_share_count + stale_share_count + rejected_share_count); + if (mismatch) { + if (mismatch != 1) + applog(LOG_WARNING, "Share count mismatch: %d, stats may be incorrect", + mismatch); + else + applog(LOG_INFO, + "Share count mismatch, submitted share may still be pending"); + } } -static int share_result( int result, struct work *null_work, - const char *reason ) -{ - double share_time = 0., share_hash = 0., block_hash = 0., share_size = 0.; - double hashcount = 0., hashrate = 0.; - uint64_t latency = 0; - struct share_stats_t my_stats = {0}; - struct timeval ack_time, latency_tv, et; - char hr[32]; - char hr_units[4] = {0}; - char shr[32]; - char shr_units[4] = {0}; - char diffstr[32]; - const char *sres = NULL; - bool solved = false; - - // Mutex while we grab asnapshot of the global 
counters. - pthread_mutex_lock( &stats_lock ); - - // When submit_work detects a buffer overflow it discards the stats for - // the new share. When we catch up we may get acks for shares with - // no stats. Leaving the get pointer un-incremented will resync with the - // put pointer. - if ( share_stats[ s_get_ptr ].submit_time.tv_sec ) - { - memcpy( &my_stats, &share_stats[ s_get_ptr], sizeof my_stats ); - memset( &share_stats[ s_get_ptr ], 0, sizeof my_stats ); - s_get_ptr = stats_ptr_incr( s_get_ptr ); - pthread_mutex_unlock( &stats_lock ); - } - else - { - pthread_mutex_unlock( &stats_lock ); - applog(LOG_WARNING,"Pending shares overflow, stats for share are lost."); - } - - for ( int i = 0; i < opt_n_threads; i++ ) - { - hashcount += thr_hashcount[i]; - hashrate += thr_hashrates[i]; - } - global_hashcount = hashcount; - global_hashrate = hashrate; - - // calculate latency and share time. - if ( my_stats.submit_time.tv_sec ) - { - gettimeofday( &ack_time, NULL ); - timeval_subtract( &latency_tv, &ack_time, &my_stats.submit_time ); - latency = ( latency_tv.tv_sec * 1000 + latency_tv.tv_usec / 1000 ); - timeval_subtract( &et, &my_stats.submit_time, &last_submit_time ); - share_time = (double)et.tv_sec + ( (double)et.tv_usec / 1000000. ); - memcpy( &last_submit_time, &my_stats.submit_time, - sizeof last_submit_time ); - } - - // calculate share hashrate and size - share_hash = my_stats.share_diff * diff2hash; - block_hash = my_stats.net_diff * diff2hash; - share_size = block_hash == 0. ? 0. : share_hash / block_hash * 100.; - - // check result - result ? accepted_share_count++ : rejected_share_count++; - solved = result && (my_stats.net_diff > 0.0 ) - && ( my_stats.share_diff >= net_diff ); - solved_block_count += solved ? 
1 : 0 ; - - // update counters for 5 minute summary report - pthread_mutex_lock( &stats_lock ); - - shash_sum += share_hash; - bhash_sum += block_hash; - time_sum += share_time; - submit_sum ++; - reject_sum += (uint64_t)!result; - latency_sum += latency; - - pthread_mutex_unlock( &stats_lock ); - - double share_hash_rate = share_time == 0. ? 0. : share_hash / share_time; - double scaled_shr; - - scaled_shr = share_hash_rate; - scale_hash_for_display ( &scaled_shr, shr_units ); - - if ( use_colors ) - { - sres = ( solved ? ( CL_MAG "BLOCK SOLVED" CL_WHT ) - : result ? ( CL_GRN "Accepted" CL_WHT ) - : ( CL_RED "Rejected" CL_WHT ) ); - - // colour code the share diff to highlight high value. - if ( solved ) - sprintf( diffstr, "%s%.3g%s", CL_MAG, my_stats.share_diff, CL_WHT ); - else if ( my_stats.share_diff > ( my_stats.net_diff * 0.1 ) ) - sprintf( diffstr, "%s%.3g%s", CL_GRN, my_stats.share_diff, CL_WHT ); - else if ( my_stats.share_diff > ( my_stats.net_diff * 0.01 ) ) - sprintf( diffstr, "%s%.3g%s", CL_CYN, my_stats.share_diff, CL_WHT ); +static int share_result(int result, struct work *work, const char *reason) { + double share_time = 0.; + double hashrate = 0.; + int latency = 0; + struct share_stats_t my_stats = {0}; + struct timeval ack_time, latency_tv, et; + char ares[48]; + char sres[48]; + char rres[48]; + char bres[48]; + bool solved = false; + bool stale = false; + char *acol = NULL, *bcol = NULL, *scol = NULL, *rcol = NULL; + + if (!dev_mining || opt_debug) { + pthread_mutex_lock(&stats_lock); + + if (likely(share_stats[s_get_ptr].submit_time.tv_sec)) { + memcpy(&my_stats, &share_stats[s_get_ptr], sizeof my_stats); + memset(&share_stats[s_get_ptr], 0, sizeof my_stats); + s_get_ptr = stats_ptr_incr(s_get_ptr); + pthread_mutex_unlock(&stats_lock); + } else { + // empty queue, it must have overflowed and stats were lost for a share. 
+ pthread_mutex_unlock(&stats_lock); + applog(LOG_WARNING, "Share stats not available."); + } + + // calculate latency and share time. + if likely (my_stats.submit_time.tv_sec) { + gettimeofday(&ack_time, NULL); + timeval_subtract(&latency_tv, &ack_time, &my_stats.submit_time); + latency = (latency_tv.tv_sec * 1e3 + latency_tv.tv_usec / 1e3); + timeval_subtract(&et, &my_stats.submit_time, &last_submit_time); + share_time = (double)et.tv_sec + ((double)et.tv_usec / 1e6); + memcpy(&last_submit_time, &my_stats.submit_time, sizeof last_submit_time); + } + + // check result + if (likely(result)) { + accepted_share_count++; + if ((my_stats.share_diff > 0.) && (my_stats.share_diff < lowest_share)) + lowest_share = my_stats.share_diff; + if (my_stats.share_diff > highest_share) + highest_share = my_stats.share_diff; + sprintf(sres, "S%d", stale_share_count); + sprintf(rres, "R%d", rejected_share_count); + if unlikely ((my_stats.net_diff > 0.) && + (my_stats.share_diff >= net_diff)) { + solved = true; + solved_block_count++; + sprintf(bres, "BLOCK SOLVED %d", solved_block_count); + sprintf(ares, "A%d", accepted_share_count); + } else { + sprintf(bres, "B%d", solved_block_count); + sprintf(ares, "Accepted %d", accepted_share_count); + } + } else { + sprintf(ares, "A%d", accepted_share_count); + sprintf(bres, "B%d", solved_block_count); + if (reason) + stale = strstr(reason, "job"); + else if (work) + stale = work->data[algo_gate.ntime_index] != + g_work.data[algo_gate.ntime_index]; + if (stale) { + stale_share_count++; + sprintf(sres, "Stale %d", stale_share_count); + sprintf(rres, "R%d", rejected_share_count); + } else { + rejected_share_count++; + sprintf(sres, "S%d", stale_share_count); + sprintf(rres, "Rejected %d", rejected_share_count); + } + } + } + + // update global counters for summary report + + if (opt_debug) { + pthread_mutex_lock(&stats_lock); + for (int i = 0; i < opt_n_threads; i++) + hashrate += thr_hashrates[i]; + global_hashrate = hashrate; + 
pthread_mutex_unlock(&stats_lock); + } + + if (!dev_mining || opt_debug) { + pthread_mutex_lock(&stats_lock); + + if (likely(result)) { + accept_sum++; + norm_diff_sum += my_stats.target_diff; + if (solved) + solved_sum++; + } else { + if (stale) + stale_sum++; else - sprintf( diffstr, "%.3g", my_stats.share_diff ); - - if ( hashrate ) // don't colour share hash rate without reference rate. - { - if ( share_hash_rate > 768. * hashrate ) - sprintf( shr, "%s%.2f %sH/s%s", CL_MAG, scaled_shr, shr_units, - CL_WHT ); - else if ( share_hash_rate > 32. * hashrate ) - sprintf( shr, "%s%.2f %sH/s%s", CL_GRN, scaled_shr, shr_units, - CL_WHT ); - else if ( share_hash_rate > 2.0 * hashrate ) - sprintf( shr, "%s%.2f %sH/s%s", CL_CYN, scaled_shr, shr_units, - CL_WHT ); - else if ( share_hash_rate > 0.5 * hashrate ) - sprintf( shr, "%.2f %sH/s", scaled_shr, shr_units ); - else - sprintf( shr, "%s%.2f %sH/s%s", CL_YLW, scaled_shr, shr_units, - CL_WHT ); - } - else - sprintf( shr, "%.2f %sH/s", scaled_shr, shr_units ); - } - else // monochrome - { - sres = ( solved ? "BLOCK SOLVED" : result ? "Accepted" : "Rejected" ); - sprintf( diffstr, "%.3g", my_stats.share_diff ); - sprintf( shr, "%.2f %sH/s", scaled_shr, shr_units ); - } - - scale_hash_for_display ( &hashrate, hr_units ); - if ( hashrate < 10. 
) - sprintf(hr, "%.4f", hashrate ); - else - sprintf(hr, "%.2f", hashrate ); - - applog( LOG_NOTICE, "%s, diff %s, %.3f secs, A/R/B: %d/%d/%d.", - sres, diffstr, share_time, accepted_share_count, - rejected_share_count, solved_block_count ); - - if ( have_stratum && result && !opt_quiet ) - { - applog( LOG_NOTICE, "Miner %s %sH/s, Share %s, Latency %d ms.", - hr, hr_units, shr, latency ); - applog( LOG_NOTICE, "Height %d, job %s, %.5f%% block share.", - stratum.bloc_height, my_stats.job_id, share_size ); - applog(LOG_INFO,"- - - - - - - - - - - - - - - - - - - - - - - - - - -"); - } - - if ( reason ) - applog( LOG_WARNING, "reject reason: %s.", reason ); - - return 1; -} + reject_sum++; + } + submit_sum++; + latency_sum += latency; + + pthread_mutex_unlock(&stats_lock); + + if (use_colors) { + bcol = acol = scol = rcol = CL_WHT; + if (likely(result)) { + acol = CL_WHT CL_GRN; + if (unlikely(solved)) + bcol = CL_WHT CL_MAG; + } else if (stale) + scol = CL_WHT CL_YL2; + else + rcol = CL_WHT CL_RED; + } -void std_le_build_stratum_request( char *req, struct work *work ) -{ - unsigned char *xnonce2str; - uint32_t ntime, nonce; - char ntimestr[9], noncestr[9]; - le32enc( &ntime, work->data[ algo_gate.ntime_index ] ); - le32enc( &nonce, work->data[ algo_gate.nonce_index ] ); - bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) ); - bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) ); - xnonce2str = abin2hex( work->xnonce2, work->xnonce2_len ); - snprintf( req, JSON_BUF_LEN, - "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", - rpc_user, work->job_id, xnonce2str, ntimestr, noncestr ); - free( xnonce2str ); -} + applog(LOG_NOTICE, "%d %s%s %s%s %s%s %s%s" CL_WHT ", %.3f sec (%dms)", + my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol, bres, + share_time, latency); -// le is default -void std_be_build_stratum_request( char *req, struct work *work ) -{ - unsigned char *xnonce2str; - uint32_t ntime, nonce; - 
char ntimestr[9], noncestr[9]; - be32enc( &ntime, work->data[ algo_gate.ntime_index ] ); - be32enc( &nonce, work->data[ algo_gate.nonce_index ] ); - bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) ); - bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) ); - xnonce2str = abin2hex( work->xnonce2, work->xnonce2_len ); - snprintf( req, JSON_BUF_LEN, - "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", - rpc_user, work->job_id, xnonce2str, ntimestr, noncestr ); - free( xnonce2str ); -} + if (unlikely(opt_debug || !result || solved)) { + if (have_stratum) + applog2(LOG_INFO, "Diff %.5g, Block %d, Job %s", my_stats.share_diff, + stratum.block_height, my_stats.job_id); + else + applog2(LOG_INFO, "Diff %.5g, Block %d", my_stats.share_diff, + work ? work->height : last_block_height); + } -void jr2_build_stratum_request( char *req, struct work *work ) -{ - uchar hash[32]; - char noncestr[9]; - bin2hex( noncestr, (char*) algo_gate.get_nonceptr( work->data ), - sizeof(uint32_t) ); - algo_gate.hash_suw( hash, work->data ); - char *hashhex = abin2hex(hash, 32); - snprintf( req, JSON_BUF_LEN, - "{\"method\": \"submit\", \"params\": {\"id\": \"%s\", \"job_id\": \"%s\", \"nonce\": \"%s\", \"result\": \"%s\"}, \"id\":4}", - rpc2_id, work->job_id, noncestr, hashhex ); - free( hashhex ); -} + if (unlikely(!(opt_quiet || result || stale))) { + uint32_t str[8]; + uint32_t *targ; -bool std_le_submit_getwork_result( CURL *curl, struct work *work ) -{ - char req[JSON_BUF_LEN]; - json_t *val, *res, *reason; - char* gw_str; - int data_size = algo_gate.get_work_data_size(); - - for ( int i = 0; i < data_size / sizeof(uint32_t); i++ ) - le32enc( &work->data[i], work->data[i] ); - gw_str = abin2hex( (uchar*)work->data, data_size ); - if ( unlikely(!gw_str) ) - { - applog(LOG_ERR, "submit_upstream_work OOM"); - return false; - } - // build JSON-RPC request - snprintf( req, JSON_BUF_LEN, - "{\"method\": \"getwork\", \"params\": [\"%s\"], 
\"id\":4}\r\n", gw_str ); - free( gw_str ); - // issue JSON-RPC request - val = json_rpc_call( curl, rpc_url, rpc_userpass, req, NULL, 0 ); - if ( unlikely(!val) ) - { - applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); - return false; - } - res = json_object_get( val, "result" ); - reason = json_object_get( val, "reject-reason" ); - share_result( json_is_true( res ), work, - reason ? json_string_value( reason ) : NULL ); - json_decref( val ); - return true; -} + if (reason) + applog(LOG_WARNING, "Reject reason: %s", reason); -bool std_be_submit_getwork_result( CURL *curl, struct work *work ) -{ - char req[JSON_BUF_LEN]; - json_t *val, *res, *reason; - char* gw_str; - int data_size = algo_gate.get_work_data_size(); - - for ( int i = 0; i < data_size / sizeof(uint32_t); i++ ) - be32enc( &work->data[i], work->data[i] ); - gw_str = abin2hex( (uchar*)work->data, data_size ); - if ( unlikely(!gw_str) ) - { - applog(LOG_ERR, "submit_upstream_work OOM"); - return false; - } - // build JSON-RPC request - snprintf( req, JSON_BUF_LEN, - "{\"method\": \"getwork\", \"params\": [\"%s\"], \"id\":4}\r\n", gw_str ); - free( gw_str ); - // issue JSON-RPC request - val = json_rpc_call( curl, rpc_url, rpc_userpass, req, NULL, 0 ); - if ( unlikely(!val) ) - { - applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); - return false; - } - res = json_object_get( val, "result" ); - reason = json_object_get( val, "reject-reason" ); - share_result( json_is_true( res ), work, - reason ? 
json_string_value( reason ) : NULL ); - json_decref( val ); - return true; -} + diff_to_hash(str, my_stats.share_diff); + applog2(LOG_INFO, "Hash: %08x%08x%08x%08x%08x%08x", str[7], str[6], + str[5], str[4], str[3], str[2], str[1], str[0]); + if (work) + targ = work->target; + else { + diff_to_hash(str, my_stats.target_diff); + targ = &str[0]; + } + applog2(LOG_INFO, "Target: %08x%08x%08x%08x%08x%08x", targ[7], targ[6], + targ[5], targ[4], targ[3], targ[2], targ[1], targ[0]); + } + } + return 1; +} + +static const char *json_submit_req = + "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", " + "\"%s\", \"%s\"], \"id\":4}"; + +void std_le_build_stratum_request(char *req, struct work *work) { + unsigned char *xnonce2str; + uint32_t ntime, nonce; + char ntimestr[9], noncestr[9]; + le32enc(&ntime, work->data[algo_gate.ntime_index]); + le32enc(&nonce, work->data[algo_gate.nonce_index]); + bin2hex(ntimestr, (char *)(&ntime), sizeof(uint32_t)); + bin2hex(noncestr, (char *)(&nonce), sizeof(uint32_t)); + xnonce2str = abin2hex(work->xnonce2, work->xnonce2_len); + if (dev_mining) { + snprintf(req, JSON_BUF_LEN, json_submit_req, dev_address, work->job_id, + xnonce2str, ntimestr, noncestr); + } else { + snprintf(req, JSON_BUF_LEN, json_submit_req, rpc_user, work->job_id, + xnonce2str, ntimestr, noncestr); + } + free(xnonce2str); +} -bool jr2_submit_getwork_result( CURL *curl, struct work *work ) -{ - json_t *val, *res; - char req[JSON_BUF_LEN]; - char noncestr[9]; - uchar hash[32]; - char *hashhex; - bin2hex( noncestr, (char*) algo_gate.get_nonceptr( work->data ), - sizeof(uint32_t) ); - algo_gate.hash_suw( hash, work->data ); - hashhex = abin2hex( &hash[0], 32 ); - snprintf( req, JSON_BUF_LEN, "{\"method\": \"submit\", \"params\": " - "{\"id\": \"%s\", \"job_id\": \"%s\", \"nonce\": \"%s\", \"result\": \"%s\"}," - "\"id\":4}\r\n", - rpc2_id, work->job_id, noncestr, hashhex ); - free( hashhex ); - // issue JSON-RPC request - val = json_rpc2_call( curl, 
rpc_url, rpc_userpass, req, NULL, 0 ); - if (unlikely( !val )) - { - applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); - return false; - } - res = json_object_get( val, "result" ); - json_t *status = json_object_get( res, "status" ); - bool valid = !strcmp( status ? json_string_value( status ) : "", "OK" ); - if (valid) - share_result( valid, work, NULL ); - else - { - json_t *err = json_object_get( res, "error" ); - const char *sreason = json_string_value( json_object_get( - err, "message" ) ); - share_result( valid, work, sreason ); - if ( !strcasecmp( "Invalid job id", sreason ) ) - { - work_free( work ); - work_copy( work, &g_work ); - g_work_time = 0; - restart_threads(); - } - } - json_decref(val); - return true; +// le is default +void std_be_build_stratum_request(char *req, struct work *work) { + unsigned char *xnonce2str; + uint32_t ntime, nonce; + char ntimestr[9], noncestr[9]; + be32enc(&ntime, work->data[algo_gate.ntime_index]); + be32enc(&nonce, work->data[algo_gate.nonce_index]); + bin2hex(ntimestr, (char *)(&ntime), sizeof(uint32_t)); + bin2hex(noncestr, (char *)(&nonce), sizeof(uint32_t)); + xnonce2str = abin2hex(work->xnonce2, work->xnonce2_len); + snprintf(req, JSON_BUF_LEN, json_submit_req, rpc_user, work->job_id, + xnonce2str, ntimestr, noncestr); + free(xnonce2str); +} + +static const char *json_getwork_req = + "{\"method\": \"getwork\", \"params\": [\"%s\"], \"id\":4}\r\n"; + +bool std_le_submit_getwork_result(CURL *curl, struct work *work) { + char req[JSON_BUF_LEN]; + json_t *val, *res, *reason; + char *gw_str; + int data_size = algo_gate.get_work_data_size(); + + for (int i = 0; i < data_size / sizeof(uint32_t); i++) + le32enc(&work->data[i], work->data[i]); + gw_str = abin2hex((uchar *)work->data, data_size); + if (unlikely(!gw_str)) { + applog(LOG_ERR, "submit_upstream_work OOM"); + return false; + } + // build JSON-RPC request + snprintf(req, JSON_BUF_LEN, json_getwork_req, gw_str); + free(gw_str); + // issue JSON-RPC request 
+ + if (dev_mining) { + val = json_rpc_call(curl, dev_stratum.url, dev_userpass, req, NULL, 0); + } else { + val = json_rpc_call(curl, rpc_url, rpc_userpass, req, NULL, 0); + } + if (unlikely(!val)) { + applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); + return false; + } + res = json_object_get(val, "result"); + reason = json_object_get(val, "reject-reason"); + share_result(json_is_true(res), work, + reason ? json_string_value(reason) : NULL); + json_decref(val); + return true; +} + +bool std_be_submit_getwork_result(CURL *curl, struct work *work) { + char req[JSON_BUF_LEN]; + json_t *val, *res, *reason; + char *gw_str; + int data_size = algo_gate.get_work_data_size(); + + for (int i = 0; i < data_size / sizeof(uint32_t); i++) + be32enc(&work->data[i], work->data[i]); + gw_str = abin2hex((uchar *)work->data, data_size); + if (unlikely(!gw_str)) { + applog(LOG_ERR, "submit_upstream_work OOM"); + return false; + } + // build JSON-RPC request + snprintf(req, JSON_BUF_LEN, json_getwork_req, gw_str); + free(gw_str); + // issue JSON-RPC request + if (dev_mining) { + val = json_rpc_call(curl, dev_stratum.url, dev_userpass, req, NULL, 0); + } else { + val = json_rpc_call(curl, rpc_url, rpc_userpass, req, NULL, 0); + } + if (unlikely(!val)) { + applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); + return false; + } + res = json_object_get(val, "result"); + reason = json_object_get(val, "reject-reason"); + share_result(json_is_true(res), work, + reason ? json_string_value(reason) : NULL); + json_decref(val); + return true; } -char* std_malloc_txs_request( struct work *work ) -{ +char *std_malloc_txs_request(struct work *work) { char *req; json_t *val; char data_str[2 * sizeof(work->data) + 1]; int i; + int datasize = work->sapling ? 
112 : 80; - for ( i = 0; i < ARRAY_SIZE(work->data); i++ ) - be32enc( work->data + i, work->data[i] ); - bin2hex( data_str, (unsigned char *)work->data, 80 ); - if ( work->workid ) - { + for (i = 0; i < ARRAY_SIZE(work->data); i++) + be32enc(work->data + i, work->data[i]); + bin2hex(data_str, (unsigned char *)work->data, datasize); + if (work->workid) { char *params; val = json_object(); - json_object_set_new( val, "workid", json_string( work->workid ) ); - params = json_dumps( val, 0 ); - json_decref( val ); - req = (char*) malloc( 128 + 2 * 80 + strlen( work->txs ) - + strlen( params ) ); - sprintf( req, - "{\"method\": \"submitblock\", \"params\": [\"%s%s\", %s], \"id\":4}\r\n", - data_str, work->txs, params ); - free( params ); - } - else - { - req = (char*) malloc( 128 + 2 * 80 + strlen( work->txs ) ); - sprintf( req, - "{\"method\": \"submitblock\", \"params\": [\"%s%s\"], \"id\":4}\r\n", - data_str, work->txs); + json_object_set_new(val, "workid", json_string(work->workid)); + params = json_dumps(val, 0); + json_decref(val); + req = + (char *)malloc(128 + 2 * datasize + strlen(work->txs) + strlen(params)); + sprintf(req, + "{\"method\": \"submitblock\", \"params\": [\"%s%s\", %s], " + "\"id\":4}\r\n", + data_str, work->txs, params); + free(params); + } else { + req = (char *)malloc(128 + 2 * datasize + strlen(work->txs)); + sprintf( + req, + "{\"method\": \"submitblock\", \"params\": [\"%s%s\"], \"id\":4}\r\n", + data_str, work->txs); } return req; -} +} -static bool submit_upstream_work( CURL *curl, struct work *work ) -{ - /* pass if the previous hash is not the current previous hash */ - if ( !submit_old && memcmp( &work->data[1], &g_work.data[1], 32 ) ) - { - if (opt_debug) - applog(LOG_DEBUG, "DEBUG: stale work detected, discarding"); - return true; - } - - if ( !have_stratum && allow_mininginfo ) - { - struct work wheight; - get_mininginfo( curl, &wheight ); - if ( work->height && work->height <= net_blocks ) - { - if (opt_debug) - applog(LOG_WARNING, 
"block %u was already solved", work->height); - return true; +static bool submit_upstream_work(CURL *curl, struct work *work) { + if (have_stratum) { + char req[JSON_BUF_LEN]; + + if (dev_mining) { + dev_stratum.sharediff = work->sharediff; + algo_gate.build_stratum_request(req, work, &dev_stratum); + if (unlikely(!stratum_send_line(&dev_stratum, req))) { + applog(LOG_ERR, "submit_upstream_work stratum_send_line failed"); + return false; } - } - - if ( have_stratum ) - { - char req[JSON_BUF_LEN]; - stratum.sharediff = work->sharediff; - algo_gate.build_stratum_request( req, work, &stratum ); - if ( unlikely( !stratum_send_line( &stratum, req ) ) ) - { - applog(LOG_ERR, "submit_upstream_work stratum_send_line failed"); - return false; - } - return true; - } - else if ( work->txs ) - { - char *req = NULL; - json_t *val, *res; - - req = algo_gate.malloc_txs_request( work ); - val = json_rpc_call( curl, rpc_url, rpc_userpass, req, NULL, 0 ); - free( req ); - - if ( unlikely( !val ) ) - { - applog( LOG_ERR, "submit_upstream_work json_rpc_call failed" ); - return false; + } else { + stratum.sharediff = work->sharediff; + algo_gate.build_stratum_request(req, work, &stratum); + if (unlikely(!stratum_send_line(&stratum, req))) { + applog(LOG_ERR, "submit_upstream_work stratum_send_line failed"); + return false; } - res = json_object_get( val, "result" ); - if ( json_is_object( res ) ) - { - char *res_str; - bool sumres = false; - void *iter = json_object_iter( res ); - while ( iter ) - { - if ( json_is_null( json_object_iter_value( iter ) ) ) - { - sumres = true; - break; - } - iter = json_object_iter_next( res, iter ); - } - res_str = json_dumps( res, 0 ); - share_result( sumres, work, res_str ); - free( res_str ); + } + return true; + } else if (work->txs) { + char *req = NULL; + json_t *val, *res; + + req = algo_gate.malloc_txs_request(work); + if (dev_mining) { + val = json_rpc_call(curl, dev_stratum.url, dev_userpass, req, NULL, 0); + } else { + val = 
json_rpc_call(curl, rpc_url, rpc_userpass, req, NULL, 0); + } + free(req); + + if (unlikely(!val)) { + applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); + return false; + } + res = json_object_get(val, "result"); + if (json_is_object(res)) { + char *res_str; + bool sumres = false; + void *iter = json_object_iter(res); + while (iter) { + if (json_is_null(json_object_iter_value(iter))) { + sumres = true; + break; + } + iter = json_object_iter_next(res, iter); } - else - share_result( json_is_null( res ), work, json_string_value( res ) ); - json_decref( val ); - return true; - } - else - return algo_gate.submit_getwork_result( curl, work ); + res_str = json_dumps(res, 0); + share_result(sumres, work, res_str); + free(res_str); + } else + share_result(json_is_null(res), work, json_string_value(res)); + json_decref(val); + return true; + } else + return algo_gate.submit_getwork_result(curl, work); } const char *getwork_req = - "{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n"; + "{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n"; -#define GBT_CAPABILITIES "[\"coinbasetxn\", \"coinbasevalue\", \"longpoll\", \"workid\"]" +#define GBT_CAPABILITIES \ + "[\"coinbasetxn\", \"coinbasevalue\", \"longpoll\", \"workid\"]" -static const char *gbt_req = - "{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": " - GBT_CAPABILITIES "}], \"id\":0}\r\n"; +// Segwit BEGIN +#define GBT_RULES "[\"segwit\"]" +static const char *gbt_req = "{\"method\": \"getblocktemplate\", \"params\": " + "[{\"capabilities\": " GBT_CAPABILITIES + ", \"rules\": " GBT_RULES "}], \"id\":0}\r\n"; const char *gbt_lp_req = - "{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": " - GBT_CAPABILITIES ", \"longpollid\": \"%s\"}], \"id\":0}\r\n"; + "{\"method\": \"getblocktemplate\", \"params\": " + "[{\"capabilities\": " GBT_CAPABILITIES ", \"rules\": " GBT_RULES + ", \"longpollid\": \"%s\"}], \"id\":0}\r\n"; -static bool get_upstream_work( CURL *curl, struct 
work *work ) -{ - json_t *val; - int err; - bool rc; - struct timeval tv_start, tv_end, diff; +/* + static const char *gbt_req = + "{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": " + GBT_CAPABILITIES "}], \"id\":0}\r\n"; + const char *gbt_lp_req = + "{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": " + GBT_CAPABILITIES ", \"longpollid\": \"%s\"}], \"id\":0}\r\n"; + */ +// Segwit END + +static bool get_upstream_work(CURL *curl, struct work *work) { + json_t *val; + int err; + bool rc; + struct timeval tv_start, tv_end, diff; start: - gettimeofday( &tv_start, NULL ); - - if ( jsonrpc_2 ) - { - char s[128]; - snprintf( s, 128, "{\"method\": \"getjob\", \"params\": {\"id\": \"%s\"}, \"id\":1}\r\n", rpc2_id ); - val = json_rpc2_call( curl, rpc_url, rpc_userpass, s, NULL, 0 ); - } - else - { - val = json_rpc_call( curl, rpc_url, rpc_userpass, - have_gbt ? gbt_req : getwork_req, &err, - have_gbt ? JSON_RPC_QUIET_404 : 0); - } - gettimeofday( &tv_end, NULL ); - - if ( have_stratum ) - { - if ( val ) - json_decref(val); - - return true; - } - - if ( !have_gbt && !allow_getwork ) - { - applog( LOG_ERR, "No usable protocol" ); - if ( val ) - json_decref( val ); - return false; - } + gettimeofday(&tv_start, NULL); - if ( have_gbt && allow_getwork && !val && err == CURLE_OK ) - { - applog( LOG_NOTICE, "getblocktemplate failed, falling back to getwork" ); - have_gbt = false; - goto start; - } + val = json_rpc_call(curl, rpc_url, rpc_userpass, + have_gbt ? gbt_req : getwork_req, &err, + have_gbt ? 
JSON_RPC_QUIET_404 : 0); - if ( !val ) - return false; + gettimeofday(&tv_end, NULL); - if ( have_gbt ) - { - rc = gbt_work_decode( json_object_get( val, "result" ), work ); - if ( !have_gbt ) - { - json_decref( val ); - goto start; - } - } - else - rc = work_decode( json_object_get( val, "result" ), work ); - - if ( opt_protocol && rc ) - { - timeval_subtract( &diff, &tv_end, &tv_start ); - applog( LOG_DEBUG, "got new work in %.2f ms", - ( 1000.0 * diff.tv_sec ) + ( 0.001 * diff.tv_usec ) ); - } - - json_decref( val ); - // store work height in solo - get_mininginfo(curl, work); - return rc; -} + if (have_stratum) { + if (val) + json_decref(val); -static void workio_cmd_free(struct workio_cmd *wc) -{ - if (!wc) - return; - - switch (wc->cmd) { - case WC_SUBMIT_WORK: - work_free(wc->u.work); - free(wc->u.work); - break; - default: /* do nothing */ - break; - } - - memset(wc, 0, sizeof(*wc)); /* poison */ - free(wc); -} + return true; + } -static bool workio_get_work(struct workio_cmd *wc, CURL *curl) -{ - struct work *ret_work; - int failures = 0; - - ret_work = (struct work*) calloc(1, sizeof(*ret_work)); - if (!ret_work) - return false; - - /* obtain new work from bitcoin via JSON-RPC */ - while (!get_upstream_work(curl, ret_work)) - { - if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) - { - applog(LOG_ERR, "json_rpc_call failed, terminating workio thread"); - free(ret_work); - return false; - } + if (!have_gbt && !allow_getwork) { + applog(LOG_ERR, "No usable protocol"); + if (val) + json_decref(val); + return false; + } - /* pause, then restart work-request loop */ - applog(LOG_ERR, "json_rpc_call failed, retry after %d seconds", - opt_fail_pause); - sleep(opt_fail_pause); - } + if (have_gbt && allow_getwork && !val && err == CURLE_OK) { + applog(LOG_NOTICE, "getblocktemplate failed, falling back to getwork"); + have_gbt = false; + goto start; + } - /* send work to requesting thread */ - if (!tq_push(wc->thr->q, ret_work)) - free(ret_work); + if 
(!val) + return false; - return true; -} + if (have_gbt) { + rc = gbt_work_decode(json_object_get(val, "result"), work); + if (!have_gbt) { + json_decref(val); + goto start; + } + } else + rc = work_decode(json_object_get(val, "result"), work); -static bool workio_submit_work(struct workio_cmd *wc, CURL *curl) -{ - int failures = 0; - - /* submit solution to bitcoin via JSON-RPC */ - while (!submit_upstream_work(curl, wc->u.work)) - { - if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) - { - applog(LOG_ERR, "...terminating workio thread"); - return false; - } - /* pause, then restart work-request loop */ - if (!opt_benchmark) - applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); - sleep(opt_fail_pause); - } - return true; -} + if (rc) { + json_decref(val); -bool rpc2_login(CURL *curl) -{ - json_t *val; - bool rc = false; - struct timeval tv_start, tv_end, diff; - char s[JSON_BUF_LEN]; - - if (!jsonrpc_2) - return false; - snprintf(s, JSON_BUF_LEN, "{\"method\": \"login\", \"params\": {" - "\"login\": \"%s\", \"pass\": \"%s\", \"agent\": \"%s\"}, \"id\": 1}", - rpc_user, rpc_pass, USER_AGENT); - gettimeofday(&tv_start, NULL); - val = json_rpc_call(curl, rpc_url, rpc_userpass, s, NULL, 0); - gettimeofday(&tv_end, NULL); - if (!val) - goto end; - rc = rpc2_login_decode(val); - json_t *result = json_object_get(val, "result"); - if (!result) - goto end; - json_t *job = json_object_get(result, "job"); - if (!rpc2_job_decode(job, &g_work)) - goto end; - if (opt_debug && rc) - { - timeval_subtract(&diff, &tv_end, &tv_start); - applog(LOG_DEBUG, "DEBUG: authenticated in %d ms", - diff.tv_sec * 1000 + diff.tv_usec / 1000); - } - json_decref(val); -end: - return rc; -} + get_mininginfo(curl, work); + report_summary_log(false); -bool rpc2_workio_login(CURL *curl) -{ - int failures = 0; - if (opt_benchmark) - return true; - /* submit solution to bitcoin via JSON-RPC */ - pthread_mutex_lock(&rpc2_login_lock); - while (!rpc2_login(curl)) - { - if 
(unlikely((opt_retries >= 0) && (++failures > opt_retries))) - { - applog(LOG_ERR, "...terminating workio thread"); - pthread_mutex_unlock(&rpc2_login_lock); - return false; + if (opt_protocol | opt_debug) { + timeval_subtract(&diff, &tv_end, &tv_start); + applog(LOG_INFO, "%s new work received in %.2f ms", + (have_gbt ? "GBT" : "GetWork"), + (1000.0 * diff.tv_sec) + (0.001 * diff.tv_usec)); + } + + if (work->height > last_block_height) { + last_block_height = work->height; + last_targetdiff = net_diff; + + applog(LOG_BLUE, "New Block %d, Net Diff %.5g, Ntime %08x", work->height, + net_diff, work->data[algo_gate.ntime_index]); + + if (!opt_quiet) { + double miner_hr = 0.; + double net_hr = net_hashrate; + double nd = net_diff * exp32; + char net_hr_units[4] = {0}; + char miner_hr_units[4] = {0}; + char net_ttf[32]; + char miner_ttf[32]; + + pthread_mutex_lock(&stats_lock); + + for (int i = 0; i < opt_n_threads; i++) + miner_hr += thr_hashrates[i]; + global_hashrate = miner_hr; + + pthread_mutex_unlock(&stats_lock); + + if (net_hr > 0.) + sprintf_et(net_ttf, nd / net_hr); + else + sprintf(net_ttf, "NA"); + if (miner_hr > 0.) 
+ sprintf_et(miner_ttf, nd / miner_hr); + else + sprintf(miner_ttf, "NA"); + + scale_hash_for_display(&miner_hr, miner_hr_units); + scale_hash_for_display(&net_hr, net_hr_units); + applog2(LOG_INFO, "Miner TTF @ %.2f %sh/s %s, Net TTF @ %.2f %sh/s %s", + miner_hr, miner_hr_units, miner_ttf, net_hr, net_hr_units, + net_ttf); } + } // work->height > last_block_height + else if (memcmp(&work->data[1], &g_work.data[1], 32)) + applog(LOG_BLUE, "New Work: Block %d, Net Diff %.5g, Ntime %08x", + work->height, net_diff, work->data[algo_gate.ntime_index]); + } // rc + + return rc; +} + +static void workio_cmd_free(struct workio_cmd *wc) { + if (!wc) + return; + + switch (wc->cmd) { + case WC_SUBMIT_WORK: + work_free(wc->u.work); + free(wc->u.work); + break; + default: /* do nothing */ + break; + } - /* pause, then restart work-request loop */ - if (!opt_benchmark) - applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); - sleep(opt_fail_pause); - pthread_mutex_unlock(&rpc2_login_lock); - pthread_mutex_lock(&rpc2_login_lock); - } - pthread_mutex_unlock(&rpc2_login_lock); - return true; + memset(wc, 0, sizeof(*wc)); /* poison */ + free(wc); } -static void *workio_thread(void *userdata) -{ - struct thr_info *mythr = (struct thr_info *) userdata; - CURL *curl; - bool ok = true; - - curl = curl_easy_init(); - if (unlikely(!curl)) - { - applog(LOG_ERR, "CURL initialization failed"); - return NULL; - } - if(jsonrpc_2 && !have_stratum) - ok = rpc2_workio_login(curl); - while (ok) - { - struct workio_cmd *wc; - - /* wait for workio_cmd sent to us, on our queue */ - wc = (struct workio_cmd *) tq_pop(mythr->q, NULL); - if (!wc) - { - ok = false; - break; - } +static bool workio_get_work(struct workio_cmd *wc, CURL *curl) { + struct work *ret_work; + int failures = 0; - /* process workio_cmd */ - switch (wc->cmd) - { - case WC_GET_WORK: - ok = workio_get_work(wc, curl); - break; - case WC_SUBMIT_WORK: - ok = workio_submit_work(wc, curl); - break; - - default: /* should never 
happen */ - ok = false; - break; - } - workio_cmd_free(wc); - } - tq_freeze(mythr->q); - curl_easy_cleanup(curl); - return NULL; -} + ret_work = (struct work *)calloc(1, sizeof(*ret_work)); + if (!ret_work) + return false; -static bool get_work(struct thr_info *thr, struct work *work) -{ - struct workio_cmd *wc; - struct work *work_heap; - - if (opt_benchmark) - { - uint32_t ts = (uint32_t) time(NULL); - - // why 74? std cmp_size is 76, std data is 128 - for ( int n = 0; n < 74; n++ ) ( (char*)work->data )[n] = n; - - work->data[algo_gate.ntime_index] = swab32(ts); // ntime - - // this overwrites much of the for loop init - memset( work->data + algo_gate.nonce_index, 0x00, 52); // nonce..nonce+52 - work->data[20] = 0x80000000; // extraheader not used for jr2 - work->data[31] = 0x00000280; // extraheader not used for jr2 - return true; - } - /* fill out work request message */ - wc = (struct workio_cmd *) calloc(1, sizeof(*wc)); - if (!wc) - return false; - wc->cmd = WC_GET_WORK; - wc->thr = thr; - /* send work request to workio thread */ - if (!tq_push(thr_info[work_thr_id].q, wc)) - { - workio_cmd_free(wc); - return false; - } - /* wait for response, a unit of work */ - work_heap = (struct work*) tq_pop(thr->q, NULL); - if (!work_heap) - return false; - /* copy returned work into storage provided by caller */ - memcpy(work, work_heap, sizeof(*work)); - free(work_heap); - return true; -} + /* obtain new work from bitcoin via JSON-RPC */ + while (!get_upstream_work(curl, ret_work)) { + if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { + applog(LOG_ERR, "json_rpc_call failed, terminating workio thread"); + free(ret_work); + return false; + } -bool submit_work(struct thr_info *thr, const struct work *work_in) -{ - struct workio_cmd *wc; - - // collect some share stats - pthread_mutex_lock( &stats_lock ); - - // if buffer full discard stats and don't increment pointer. - // We're on the clock so let share_result report it. 
- if ( share_stats[ s_put_ptr ].submit_time.tv_sec == 0 ) - { - gettimeofday( &share_stats[ s_put_ptr ].submit_time, NULL ); - share_stats[ s_put_ptr ].share_diff = work_in->sharediff; - share_stats[ s_put_ptr ].net_diff = net_diff; - strcpy( share_stats[ s_put_ptr ].job_id, work_in->job_id ); - s_put_ptr = stats_ptr_incr( s_put_ptr ); - } - - pthread_mutex_unlock( &stats_lock ); - - /* fill out work request message */ - wc = (struct workio_cmd *) calloc(1, sizeof(*wc)); - if (!wc) - return false; - wc->u.work = (struct work*) malloc(sizeof(*work_in)); - if (!wc->u.work) - goto err_out; - wc->cmd = WC_SUBMIT_WORK; - wc->thr = thr; - work_copy(wc->u.work, work_in); - - /* send solution to workio thread */ - if (!tq_push(thr_info[work_thr_id].q, wc)) - goto err_out; - return true; -err_out: - workio_cmd_free(wc); - return false; -} + /* pause, then restart work-request loop */ + applog(LOG_ERR, "json_rpc_call failed, retry after %d seconds", + opt_fail_pause); + sleep(opt_fail_pause); + } -bool rpc2_stratum_job( struct stratum_ctx *sctx, json_t *params ) -{ - bool ret = false; - pthread_mutex_lock(&sctx->work_lock); - ret = rpc2_job_decode(params, &sctx->work); - if (ret) - { - if (sctx->job.job_id) - free(sctx->job.job_id); - sctx->job.job_id = strdup(sctx->work.job_id); - } - - pthread_mutex_unlock(&sctx->work_lock); - return ret; + /* send work to requesting thread */ + if (!tq_push(wc->thr->q, ret_work)) + free(ret_work); + + return true; } -static bool wanna_mine(int thr_id) -{ - bool state = true; +static bool workio_submit_work(struct workio_cmd *wc, CURL *curl) { + int failures = 0; - if (opt_max_temp > 0.0) - { - float temp = cpu_temp(0); - if (temp > opt_max_temp) - { - if (!thr_id && !conditional_state[thr_id] && !opt_quiet) - applog(LOG_INFO, "temperature too high (%.0fC), waiting...", temp); - state = false; - } - } - if (opt_max_diff > 0.0 && net_diff > opt_max_diff) - { - if (!thr_id && !conditional_state[thr_id] && !opt_quiet) - applog(LOG_INFO, 
"network diff too high, waiting..."); - state = false; - } - if (opt_max_rate > 0.0 && net_hashrate > opt_max_rate) - { - if (!thr_id && !conditional_state[thr_id] && !opt_quiet) - { - char rate[32]; - format_hashrate(opt_max_rate, rate); - applog(LOG_INFO, "network hashrate too high, waiting %s...", rate); - } - state = false; - } - if (thr_id < MAX_CPUS) - conditional_state[thr_id] = (uint8_t) !state; - return state; -} + /* submit solution to bitcoin via JSON-RPC */ -void std_wait_for_diff() -{ - while ( time(NULL) >= g_work_time + 120 ) - sleep(1); + while (!submit_upstream_work(curl, wc->u.work)) { + if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { + applog(LOG_ERR, "submit_upstream_work WORKIO fail"); + applog(LOG_ERR, "...terminating workio thread"); + return false; + } + /* pause, then restart work-request loop */ + if (!opt_benchmark) + applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); + sleep(opt_fail_pause); + } + return true; } -// Common target functions, default usually listed first. 
+static void *workio_thread(void *userdata) { + struct thr_info *mythr = (struct thr_info *)userdata; + CURL *curl; + bool ok = true; -// pick your favorite or define your own -int64_t get_max64_0x1fffffLL() { return 0x1fffffLL; } // default -int64_t get_max64_0x40LL() { return 0x40LL; } -int64_t get_max64_0x3ffff() { return 0x3ffff; } -int64_t get_max64_0x3fffffLL() { return 0x3fffffLL; } -int64_t get_max64_0x1ffff() { return 0x1ffff; } -int64_t get_max64_0xffffLL() { return 0xffffLL; }; + curl = curl_easy_init(); + if (unlikely(!curl)) { + applog(LOG_ERR, "CURL initialization failed"); + return NULL; + } -// default -void sha256d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx ) -{ - sha256d(merkle_root, sctx->job.coinbase, (int) sctx->job.coinbase_size); - for ( int i = 0; i < sctx->job.merkle_count; i++ ) - { - memcpy( merkle_root + 32, sctx->job.merkle[i], 32 ); - sha256d( merkle_root, merkle_root, 64 ); + while (likely(ok)) { + struct workio_cmd *wc; + + /* wait for workio_cmd sent to us, on our queue */ + wc = (struct workio_cmd *)tq_pop(mythr->q, NULL); + if (!wc) { + ok = false; + break; + } + + /* process workio_cmd */ + switch (wc->cmd) { + case WC_GET_WORK: + ok = workio_get_work(wc, curl); + break; + case WC_SUBMIT_WORK: + ok = workio_submit_work(wc, curl); + break; + + default: /* should never happen */ + ok = false; + break; + } + workio_cmd_free(wc); } + tq_freeze(mythr->q); + curl_easy_cleanup(curl); + return NULL; } -void SHA256_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx ) -{ - SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root ); - for ( int i = 0; i < sctx->job.merkle_count; i++ ) - { - memcpy( merkle_root + 32, sctx->job.merkle[i], 32 ); - sha256d( merkle_root, merkle_root, 64 ); + +static bool get_work(struct thr_info *thr, struct work *work) { + struct workio_cmd *wc; + struct work *work_heap; + + if unlikely (opt_benchmark) { + uint32_t ts = (uint32_t)time(NULL); + + // why 74? 
std cmp_size is 76, std data is 128 + for (int n = 0; n < 74; n++) + ((char *)work->data)[n] = n; + + work->data[algo_gate.ntime_index] = swab32(ts); // ntime + + // this overwrites much of the for loop init + memset(work->data + algo_gate.nonce_index, 0x00, 52); // nonce..nonce+52 + work->data[20] = 0x80000000; + work->data[31] = 0x00000280; + return true; + } + /* fill out work request message */ + wc = (struct workio_cmd *)calloc(1, sizeof(*wc)); + if (!wc) + return false; + wc->cmd = WC_GET_WORK; + wc->thr = thr; + /* send work request to workio thread */ + if (!tq_push(thr_info[work_thr_id].q, wc)) { + workio_cmd_free(wc); + return false; + } + /* wait for response, a unit of work */ + work_heap = (struct work *)tq_pop(thr->q, NULL); + if (!work_heap) + return false; + /* copy returned work into storage provided by caller */ + memcpy(work, work_heap, sizeof(*work)); + free(work_heap); + return true; +} + +static bool submit_work(struct thr_info *thr, const struct work *work_in) { + struct workio_cmd *wc; + + /* fill out work request message */ + wc = (struct workio_cmd *)calloc(1, sizeof(*wc)); + if (!wc) + return false; + wc->u.work = (struct work *)malloc(sizeof(*work_in)); + if (!wc->u.work) + goto err_out; + wc->cmd = WC_SUBMIT_WORK; + wc->thr = thr; + work_copy(wc->u.work, work_in); + + /* send solution to workio thread */ + if (!tq_push(thr_info[work_thr_id].q, wc)) + goto err_out; + return true; +err_out: + workio_cmd_free(wc); + return false; +} + +static void update_submit_stats(struct work *work, const void *hash) { + if (!dev_mining) { + pthread_mutex_lock(&stats_lock); + + submitted_share_count++; + share_stats[s_put_ptr].share_count = submitted_share_count; + gettimeofday(&share_stats[s_put_ptr].submit_time, NULL); + share_stats[s_put_ptr].share_diff = work->sharediff; + share_stats[s_put_ptr].net_diff = net_diff; + share_stats[s_put_ptr].stratum_diff = stratum_diff; + share_stats[s_put_ptr].target_diff = work->targetdiff; + if (have_stratum) + 
strncpy(share_stats[s_put_ptr].job_id, work->job_id, 30); + s_put_ptr = stats_ptr_incr(s_put_ptr); + + pthread_mutex_unlock(&stats_lock); + } else if (opt_debug) { + applog(LOG_DEBUG, "Dev submitted a share"); } } -// default -void std_set_target( struct work* work, double job_diff ) -{ - work_set_target( work, job_diff / opt_diff_factor ); -} -// most scrypt based algos -void scrypt_set_target( struct work* work, double job_diff ) -{ - work_set_target( work, job_diff / (65536.0 * opt_diff_factor) ); -} -// another popular choice. -void alt_set_target( struct work* work, double job_diff ) -{ - work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); -} +bool submit_solution(struct work *work, const void *hash, + struct thr_info *thr) { + work->sharediff = hash_to_diff(hash); + if (likely(submit_work(thr, work))) { + update_submit_stats(work, hash); -// Default is do_nothing (assumed LE) -void set_work_data_big_endian( struct work *work ) -{ - int nonce_index = algo_gate.nonce_index; - for ( int i = 0; i < nonce_index; i++ ) - be32enc( work->data + i, work->data[i] ); -} + if unlikely (!have_stratum && + !have_longpoll) { // solo, block solved, force getwork + pthread_rwlock_wrlock(&g_work_lock); + g_work_time = 0; + pthread_rwlock_unlock(&g_work_lock); + restart_threads(); + } -double std_calc_network_diff( struct work* work ) -{ - // sample for diff 43.281 : 1c05ea29 - // todo: endian reversed on longpoll could be zr5 specific... - int nbits_index = algo_gate.nbits_index; - uint32_t nbits = have_longpoll ? 
work->data[ nbits_index] - : swab32( work->data[ nbits_index ] ); - uint32_t bits = ( nbits & 0xffffff ); - int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28 - int m; - double d = (double)0x0000ffff / (double)bits; - for ( m = shift; m < 29; m++ ) - d *= 256.0; - for ( m = 29; m < shift; m++ ) - d /= 256.0; - if ( opt_debug_diff ) - applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits); - return d; -} + if (!opt_quiet && (!dev_mining || opt_debug)) { + if (have_stratum) + applog(LOG_NOTICE, "%d Submitted Diff %.5g, Block %d, Job %s", + submitted_share_count, work->sharediff, work->height, + work->job_id); + else + applog(LOG_NOTICE, "%d Submitted Diff %.5g, Block %d, Ntime %08x", + submitted_share_count, work->sharediff, work->height, + work->data[algo_gate.ntime_index]); + } -uint32_t* std_get_nonceptr( uint32_t *work_data ) -{ - return work_data + algo_gate.nonce_index; + if (opt_debug) { + uint32_t *h = (uint32_t *)hash; + uint32_t *t = (uint32_t *)work->target; + uint32_t *d = (uint32_t *)work->data; + + unsigned char *xnonce2str = abin2hex(work->xnonce2, work->xnonce2_len); + applog(LOG_INFO, "Thread %d, Nonce %08x, Xnonce2 %s", thr->id, + work->data[algo_gate.nonce_index], xnonce2str); + free(xnonce2str); + applog(LOG_INFO, + "Data[0:19]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", + d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9]); + applog(LOG_INFO, + " : %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", + d[10], d[11], d[12], d[13], d[14], d[15], d[16], d[17], d[18], + d[19]); + applog(LOG_INFO, "Hash[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x", + h[7], h[6], h[5], h[4], h[3], h[2], h[1], h[0]); + applog(LOG_INFO, "Targ[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x", + t[7], t[6], t[5], t[4], t[3], t[2], t[1], t[0]); + } + return true; + } else + applog(LOG_WARNING, "%d failed to submit share", submitted_share_count); + return false; } -uint32_t* jr2_get_nonceptr( uint32_t *work_data ) -{ - // nonce is 
misaligned, use byte offset - return (uint32_t*) ( ((uint8_t*) work_data) + algo_gate.nonce_index ); +static bool wanna_mine(int thr_id) { + bool state = true; + + if (opt_max_temp > 0.0) { + float temp = cpu_temp(0); + if (temp > opt_max_temp) { + if (!thr_id && !conditional_state[thr_id] && !opt_quiet) + applog(LOG_INFO, "temperature too high (%.0fC), waiting...", temp); + state = false; + } + } + if (opt_max_diff > 0.0 && net_diff > opt_max_diff) { + if (!thr_id && !conditional_state[thr_id] && !opt_quiet) + applog(LOG_INFO, "network diff too high, waiting..."); + state = false; + } + if (opt_max_rate > 0.0 && net_hashrate > opt_max_rate) { + if (!thr_id && !conditional_state[thr_id] && !opt_quiet) { + char rate[32]; + format_hashrate(opt_max_rate, rate); + applog(LOG_INFO, "network hashrate too high, waiting %s...", rate); + } + state = false; + } + if (thr_id < MAX_CPUS) + conditional_state[thr_id] = (uint8_t)!state; + return state; } +// Common target functions, default usually listed first. -void std_get_new_work( struct work* work, struct work* g_work, int thr_id, - uint32_t *end_nonce_ptr, bool clean_job ) -{ - uint32_t *nonceptr = algo_gate.get_nonceptr( work->data ); - -// the job_id check doesn't work as intended, it's a char pointer! -// For stratum the pointers can be dereferenced and the strings compared, -// benchmark not, getwork & gbt unsure. -// || ( have_straum && strcmp( work->job_id, g_work->job_id ) ) ) ) -// or -// || ( !benchmark && strcmp( work->job_id, g_work->job_id ) ) ) ) -// For now leave it as is, it seems stable. -// strtoul seems to work. 
- if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size ) - && ( clean_job || ( *nonceptr >= *end_nonce_ptr ) - || strtoul( work->job_id, NULL, 16 ) - != strtoul( g_work->job_id, NULL, 16 ) ) ) - { - work_free( work ); - work_copy( work, g_work ); - *nonceptr = 0xffffffffU / opt_n_threads * thr_id; - if ( opt_randomize ) - *nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads; - *end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20; - } - else - ++(*nonceptr); +// default +void sha256d_gen_merkle_root(char *merkle_root, struct stratum_ctx *sctx) { + sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); + for (int i = 0; i < sctx->job.merkle_count; i++) { + memcpy(merkle_root + 32, sctx->job.merkle[i], 32); + sha256d(merkle_root, merkle_root, 64); + } } - -void jr2_get_new_work( struct work* work, struct work* g_work, int thr_id, - uint32_t *end_nonce_ptr ) -{ - uint32_t *nonceptr = algo_gate.get_nonceptr( work->data ); - - // byte data[ 0..38, 43..75 ], skip over misaligned nonce [39..42] - if ( memcmp( work->data, g_work->data, algo_gate.nonce_index ) - || memcmp( ((uint8_t*) work->data) + JR2_WORK_CMP_INDEX_2, - ((uint8_t*) g_work->data) + JR2_WORK_CMP_INDEX_2, - JR2_WORK_CMP_SIZE_2 ) ) - { - work_free( work ); - work_copy( work, g_work ); - *nonceptr = ( 0xffffffU / opt_n_threads ) * thr_id - + ( *nonceptr & 0xff000000U ); - *end_nonce_ptr = ( 0xffffffU / opt_n_threads ) * (thr_id+1) - + ( *nonceptr & 0xff000000U ) - 0x20; - } - else - ++(*nonceptr); +void SHA256_gen_merkle_root(char *merkle_root, struct stratum_ctx *sctx) { + SHA256(sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root); + for (int i = 0; i < sctx->job.merkle_count; i++) { + memcpy(merkle_root + 32, sctx->job.merkle[i], 32); + sha256d(merkle_root, merkle_root, 64); + } } -bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum, - int thr_id ) -{ - if ( have_stratum && !work->data[0] && !opt_benchmark ) - { - sleep(1); - 
return false; - } - return true; -} +// Default is do_nothing (assumed LE) +void set_work_data_big_endian(struct work *work) { + int nonce_index = algo_gate.nonce_index; + for (int i = 0; i < nonce_index; i++) + be32enc(work->data + i, work->data[i]); +} + +// calculate net diff from nbits. +double std_calc_network_diff(struct work *work) { + // sample for diff 43.281 : 1c05ea29 + // todo: endian reversed on longpoll could be zr5 specific... + int nbits_index = algo_gate.nbits_index; + uint32_t nbits = + have_longpoll ? work->data[nbits_index] : swab32(work->data[nbits_index]); + uint32_t bits = (nbits & 0xffffff); + int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28 + int m; + double d = (double)0x0000ffff / (double)bits; + for (m = shift; m < 29; m++) + d *= 256.0; + for (m = 29; m < shift; m++) + d /= 256.0; + if (opt_debug_diff) + applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits); + return d; +} + +void std_get_new_work(struct work *work, struct work *g_work, int thr_id, + uint32_t *end_nonce_ptr) { + uint32_t *nonceptr = work->data + algo_gate.nonce_index; + bool force_new_work = false; + + if (have_stratum) + force_new_work = work->job_id ? 
strtoul(work->job_id, NULL, 16) != + strtoul(g_work->job_id, NULL, 16) + : false; + + if (force_new_work || (*nonceptr >= *end_nonce_ptr) || + memcmp(work->data, g_work->data, algo_gate.work_cmp_size)) { + work_free(work); + work_copy(work, g_work); + *nonceptr = 0xffffffffU / opt_n_threads * thr_id; + *end_nonce_ptr = (0xffffffffU / opt_n_threads) * (thr_id + 1) - 0x20; + } else + ++(*nonceptr); +} + +bool std_ready_to_mine(struct work *work, struct stratum_ctx *stratum, + int thr_id) { + if (have_stratum && !work->data[0] && !opt_benchmark) { + sleep(1); + return false; + } + return true; +} + +static void stratum_gen_work(struct stratum_ctx *sctx, struct work *g_work, + bool dev) { + bool new_job; + + pthread_rwlock_wrlock(&g_work_lock); + pthread_mutex_lock(&sctx->work_lock); + + new_job = sctx->new_job; + sctx->new_job = false; + + free(g_work->job_id); + g_work->job_id = strdup(sctx->job.job_id); + g_work->xnonce2_len = sctx->xnonce2_size; + g_work->xnonce2 = (uchar *)realloc(g_work->xnonce2, sctx->xnonce2_size); + memcpy(g_work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size); + algo_gate.build_extraheader(g_work, sctx); + net_diff = algo_gate.calc_network_diff(g_work); + algo_gate.set_work_data_endian(g_work); + g_work->height = sctx->block_height; + g_work->targetdiff = sctx->job.diff / (opt_target_factor * opt_diff_factor); + diff_to_hash(g_work->target, g_work->targetdiff); + + // Increment extranonce2 + for (int t = 0; t < sctx->xnonce2_size && !(++sctx->job.xnonce2[t]); t++) + ; + + g_work_time = time(NULL); + restart_threads(); + + pthread_mutex_unlock(&sctx->work_lock); + pthread_rwlock_unlock(&g_work_lock); + + pthread_mutex_lock(&stats_lock); + + double hr = 0.; + for (int i = 0; i < opt_n_threads; i++) + hr += thr_hashrates[i]; + global_hashrate = hr; + + pthread_mutex_unlock(&stats_lock); + + if (!dev || opt_debug) { + if (stratum_diff != sctx->job.diff) + applog(LOG_BLUE, "New Stratum Diff %g, Block %d, Job %s", sctx->job.diff, + 
sctx->block_height, g_work->job_id); + else if (last_block_height != sctx->block_height) + applog(LOG_BLUE, "New Block %d, Job %s", sctx->block_height, + g_work->job_id); + else if (g_work->job_id && new_job) + applog(LOG_BLUE, "New Work: Block %d, Net diff %.5g, Job %s", + sctx->block_height, net_diff, g_work->job_id); + else if (!opt_quiet) { + unsigned char *xnonce2str = + abin2hex(g_work->xnonce2, g_work->xnonce2_len); + applog(LOG_INFO, "Extranonce2 %s, Block %d, Net Diff %.5g", xnonce2str, + sctx->block_height, net_diff); + free(xnonce2str); + } + } -static void *miner_thread( void *userdata ) -{ - struct work work __attribute__ ((aligned (64))) ; - struct thr_info *mythr = (struct thr_info *) userdata; - int thr_id = mythr->id; - uint32_t max_nonce; - struct timeval et; - struct timeval time_now; - - // end_nonce gets read before being set so it needs to be initialized - // what is an appropriate value that is completely neutral? - // zero seems to work. No, it breaks benchmark. -// uint32_t end_nonce = 0; - uint32_t end_nonce = opt_benchmark - ? ( 0xffffffffU / opt_n_threads ) * (thr_id + 1) - 0x20 - : 0; - time_t firstwork_time = 0; - int i; - memset( &work, 0, sizeof(work) ); - - /* Set worker threads to nice 19 and then preferentially to SCHED_IDLE - * and if that fails, then SCHED_BATCH. 
No need for this to be an - * error if it fails */ - if (!opt_benchmark && opt_priority == 0) - { - setpriority(PRIO_PROCESS, 0, 19); - drop_policy(); - } - else - { - int prio = 0; + if ((stratum_diff != sctx->job.diff) || + (last_block_height != sctx->block_height)) { + static bool multipool = false; + if (sctx->block_height < last_block_height) + multipool = true; + if (unlikely(!session_first_block)) + session_first_block = sctx->block_height; + last_block_height = sctx->block_height; + stratum_diff = sctx->job.diff; + last_targetdiff = g_work->targetdiff; + if (lowest_share < last_targetdiff) + lowest_share = 9e99; + + if (!dev) { + if (!opt_quiet) { + applog2(LOG_INFO, "Diff: Net %.5g, Stratum %.5g, Target %.5g", net_diff, + stratum_diff, g_work->targetdiff); + + if (likely(hr > 0.)) { + double nd = net_diff * exp32; + char hr_units[4] = {0}; + char block_ttf[32]; + char share_ttf[32]; + + sprintf_et(block_ttf, nd / hr); + sprintf_et(share_ttf, (g_work->targetdiff * exp32) / hr); + scale_hash_for_display(&hr, hr_units); + applog2(LOG_INFO, "TTF @ %.2f %sh/s: Block %s, Share %s", hr, + hr_units, block_ttf, share_ttf); + applog2(LOG_BLUE, "Stratum Diff %g, Block %d, Job %s", sctx->job.diff, + sctx->block_height, g_work->job_id); + + if (!multipool && last_block_height > session_first_block) { + struct timeval now, et; + gettimeofday(&now, NULL); + timeval_subtract(&et, &now, &session_start); + uint64_t net_ttf = + (last_block_height - session_first_block) == 0 + ? 
0 + : et.tv_sec / (last_block_height - session_first_block); + if (net_diff && net_ttf) { + double net_hr = nd / net_ttf; + char net_hr_units[4] = {0}; + + scale_hash_for_display(&net_hr, net_hr_units); + applog2(LOG_INFO, "Net hash rate (est) %.2f %sh/s", net_hr, + net_hr_units); + } + } + } // hr > 0 + } // !quiet + } // dev + } // new diff/block +} + +static void *miner_thread(void *userdata) { + struct work work __attribute__((aligned(64))); + struct thr_info *mythr = (struct thr_info *)userdata; + int thr_id = mythr->id; + uint32_t max_nonce; + uint32_t *nonceptr = work.data + algo_gate.nonce_index; + + // end_nonce gets read before being set so it needs to be initialized + // what is an appropriate value that is completely neutral? + // zero seems to work. No, it breaks benchmark. + // uint32_t end_nonce = 0; + // uint32_t end_nonce = opt_benchmark + // ? ( 0xffffffffU / opt_n_threads ) * (thr_id + 1) - + // 0x20 : 0; + uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20; + + time_t firstwork_time = 0; + int i; + memset(&work, 0, sizeof(work)); + + /* Set worker threads to nice 19 and then preferentially to SCHED_IDLE + * and if that fails, then SCHED_BATCH. 
No need for this to be an + * error if it fails */ + if (!opt_benchmark && opt_priority == 0) { + setpriority(PRIO_PROCESS, 0, 19); + if (!thr_id && !opt_quiet) + applog(LOG_INFO, "Miner thread priority %d (nice 19)", opt_priority); + drop_policy(); + } else { + int prio = 0; #ifndef WIN32 - prio = 18; - // note: different behavior on linux (-19 to 19) - switch (opt_priority) - { - case 1: - prio = 5; - break; - case 2: - prio = 0; - break; - case 3: - prio = -5; - break; - case 4: - prio = -10; - break; - case 5: - prio = -15; - } - if (opt_debug) - applog(LOG_DEBUG, "Thread %d priority %d (nice %d)", thr_id, - opt_priority, prio ); + prio = 18; + // note: different behavior on linux (-19 to 19) + switch (opt_priority) { + case 1: + prio = 5; + break; + case 2: + prio = 0; + break; + case 3: + prio = -5; + break; + case 4: + prio = -10; + break; + case 5: + prio = -15; + } + if (!(thr_id || opt_quiet)) + applog(LOG_INFO, "Miner thread priority %d (nice %d)", opt_priority, + prio); #endif - setpriority(PRIO_PROCESS, 0, prio); - if (opt_priority == 0) - drop_policy(); - } - // CPU thread affinity -/* - if ( num_cpus > 64 ) - { - // opt_affinity ignored with more than 64 cpus. 
- if (opt_debug) - applog( LOG_DEBUG, "Binding thread %d to cpu %d", - thr_id, thr_id % num_cpus ); - affine_to_cpu_mask( thr_id, -1 ); - } - else -*/ - - if ( num_cpus > 1 ) - { + setpriority(PRIO_PROCESS, 0, prio); + if (opt_priority == 0) + drop_policy(); + } + // CPU thread affinity + if (num_cpus > 1) { #if AFFINITY_USES_UINT128 - // Default affinity - if ( (opt_affinity == (uint128_t)(-1) ) && opt_n_threads > 1 ) - { - if ( opt_debug ) - applog( LOG_DEBUG, "Binding thread %d to cpu %d.", - thr_id, thr_id % num_cpus, - u128_hi64( (uint128_t)1 << (thr_id % num_cpus) ), - u128_lo64( (uint128_t)1 << (thr_id % num_cpus) ) ); - affine_to_cpu_mask( thr_id, (uint128_t)1 << (thr_id % num_cpus) ); - } + // Default affinity + if ((opt_affinity == (uint128_t)(-1)) && opt_n_threads > 1) { + affine_to_cpu_mask(thr_id, (uint128_t)1 << (thr_id % num_cpus)); + if (opt_debug) + applog(LOG_INFO, "Binding thread %d to cpu %d.", thr_id, + thr_id % num_cpus, + u128_hi64((uint128_t)1 << (thr_id % num_cpus)), + u128_lo64((uint128_t)1 << (thr_id % num_cpus))); + } #else - if ( (opt_affinity == -1LL) && opt_n_threads > 1 ) - { - if (opt_debug) - applog( LOG_DEBUG, "Binding thread %d to cpu %d.", - thr_id, thr_id % num_cpus, 1LL << (thr_id % num_cpus)) ; - affine_to_cpu_mask( thr_id, 1ULL << (thr_id % num_cpus) ); - } + if ((opt_affinity == -1) && (opt_n_threads > 1)) { + affine_to_cpu_mask(thr_id, 1 << (thr_id % num_cpus)); + if (opt_debug) + applog(LOG_DEBUG, "Binding thread %d to cpu %d.", thr_id, + thr_id % num_cpus, 1 << (thr_id % num_cpus)); + } #endif - else // Custom affinity - { + else // Custom affinity + { + affine_to_cpu_mask(thr_id, opt_affinity); + if (opt_debug) { #if AFFINITY_USES_UINT128 - if (opt_debug) - applog( LOG_DEBUG, "Binding thread %d to mask %016llx %016llx", - thr_id, u128_hi64( opt_affinity ), - u128_lo64( opt_affinity ) ); + if (num_cpus > 64) + applog(LOG_INFO, "Binding thread %d to mask %016llx %016llx", thr_id, + u128_hi64(opt_affinity), 
u128_lo64(opt_affinity)); + else + applog(LOG_INFO, "Binding thread %d to mask %016llx", thr_id, + opt_affinity); #else - if (opt_debug) - applog( LOG_DEBUG, "Binding thread %d to mask %016llx", - thr_id, opt_affinity ); + applog(LOG_INFO, "Binding thread %d to mask %016llx", thr_id, + opt_affinity); #endif - affine_to_cpu_mask( thr_id, opt_affinity ); } - } - - if ( !algo_gate.miner_thread_init( thr_id ) ) - { - applog( LOG_ERR, "FAIL: thread %u failed to initialize", thr_id ); - exit (1); - } - - while (1) - { - uint64_t hashes_done; - struct timeval tv_start, tv_end, diff; - int64_t max64; - int nonce_found = 0; - - if ( algo_gate.do_this_thread( thr_id ) ) - { - if ( have_stratum ) - { - algo_gate.wait_for_diff( &stratum ); - pthread_mutex_lock( &g_work_lock ); - if ( *algo_gate.get_nonceptr( work.data ) >= end_nonce ) - algo_gate.stratum_gen_work( &stratum, &g_work ); - algo_gate.get_new_work( &work, &g_work, thr_id, &end_nonce, - stratum.job.clean ); - pthread_mutex_unlock( &g_work_lock ); - } - else - { - int min_scantime = have_longpoll ? 
LP_SCANTIME : opt_scantime; - pthread_mutex_lock( &g_work_lock ); + } + } // num_cpus > 1 - if ( time(NULL) - g_work_time >= min_scantime - || *algo_gate.get_nonceptr( work.data ) >= end_nonce ) - { - if ( unlikely( !get_work( mythr, &g_work ) ) ) - { - applog( LOG_ERR, "work retrieval failed, exiting " - "mining thread %d", thr_id ); - pthread_mutex_unlock( &g_work_lock ); - goto out; - } - g_work_time = time(NULL); - } - algo_gate.get_new_work( &work, &g_work, thr_id, &end_nonce, true ); - - pthread_mutex_unlock( &g_work_lock ); - } - } // do_this_thread - algo_gate.resync_threads( &work ); + if (!algo_gate.miner_thread_init(thr_id)) { + applog(LOG_ERR, "FAIL: thread %u failed to initialize", thr_id); + exit(1); + } - if ( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) ) - continue; - // conditional mining - if (!wanna_mine(thr_id)) - { - sleep(5); - continue; - } - // adjust max_nonce to meet target scan time - if (have_stratum) - max64 = LP_SCANTIME; - else - max64 = g_work_time + ( have_longpoll ? 
LP_SCANTIME : opt_scantime ) - - time(NULL); - // time limit - if ( opt_time_limit && firstwork_time ) - { - int passed = (int)( time(NULL) - firstwork_time ); - int remain = (int)( opt_time_limit - passed ); - if ( remain < 0 ) - { - if ( thr_id != 0 ) - { - sleep(1); - continue; - } - if (opt_benchmark) - { - char rate[32]; - format_hashrate( global_hashrate, rate ); - applog( LOG_NOTICE, "Benchmark: %s", rate ); - fprintf(stderr, "%llu\n", (unsigned long long)global_hashrate); - } - else - applog( LOG_NOTICE, - "Mining timeout of %ds reached, exiting...", opt_time_limit); - proper_exit(0); - } - if (remain < max64) max64 = remain; - } - // max64 - uint32_t work_nonce = *( algo_gate.get_nonceptr( work.data ) ); - max64 *= thr_hashrates[thr_id]; - if ( max64 <= 0) - max64 = (int64_t)algo_gate.get_max64(); - if ( work_nonce + max64 > end_nonce ) - max_nonce = end_nonce; - else - max_nonce = work_nonce + (uint32_t)max64; - // init time - if ( firstwork_time == 0 ) - firstwork_time = time(NULL); - work_restart[thr_id].restart = 0; - hashes_done = 0; - gettimeofday( (struct timeval *) &tv_start, NULL ); - - // Scan for nonce - nonce_found = algo_gate.scanhash( &work, max_nonce, - &hashes_done, mythr ); - - // record scanhash elapsed time - gettimeofday( &tv_end, NULL ); - timeval_subtract( &diff, &tv_end, &tv_start ); - if ( diff.tv_usec || diff.tv_sec ) - { - pthread_mutex_lock( &stats_lock ); - thr_hashcount[thr_id] = hashes_done; - thr_hashrates[thr_id] = - hashes_done / ( diff.tv_sec + diff.tv_usec * 1e-6 ); - pthread_mutex_unlock( &stats_lock ); - } - // if nonce(s) found submit work - if ( nonce_found && !opt_benchmark ) - { - if ( !submit_work( mythr, &work ) ) - { - applog( LOG_WARNING, "Failed to submit share." 
); - break; + // wait for stratum to send first job + if (have_stratum) + while (unlikely(!g_work.job_id)) + sleep(1); + + while (1) { + uint64_t hashes_done; + struct timeval tv_start, tv_end, diff; + int64_t max64 = 1000; + int nonce_found = 0; + + if (likely(algo_gate.do_this_thread(thr_id))) { + if (have_stratum) { + if (*nonceptr >= end_nonce) { + if (dev_mining) { + stratum_gen_work(&dev_stratum, &g_work, dev_mining); + } else { + stratum_gen_work(&stratum, &g_work, dev_mining); } - if ( !opt_quiet ) - applog( LOG_BLUE, "Share %d submitted by thread %d, job %s.", - accepted_share_count + rejected_share_count + 1, - mythr->id, work.job_id ); - - // prevent stale work in solo - // we can't submit twice a block! - if ( !have_stratum && !have_longpoll ) - { - pthread_mutex_lock( &g_work_lock ); - // will force getwork - g_work_time = 0; - pthread_mutex_unlock( &g_work_lock ); + } + } else { + pthread_rwlock_wrlock(&g_work_lock); + + if (((time(NULL) - g_work_time) >= + (have_longpoll ? LP_SCANTIME : opt_scantime)) || + (*nonceptr >= end_nonce)) { + if (unlikely(!get_work(mythr, &g_work))) { + pthread_rwlock_unlock(&g_work_lock); + applog(LOG_ERR, + "work retrieval failed, exiting " + "mining thread %d", + thr_id); + goto out; } - } - - // Check for 5 minute summary report, mutex until global counters - // are read and reset. It's bad form to unlock inside a conditional - // block but more efficient. The logic is reversed to make the mutex - // issue obvious. 
- pthread_mutex_lock( &stats_lock ); - - gettimeofday( &time_now, NULL ); - timeval_subtract( &et, &time_now, &five_min_start ); - if ( et.tv_sec < 300 ) - pthread_mutex_unlock( &stats_lock ); - else - { - // collect and reset global counters - double hash = shash_sum; shash_sum = 0.; - double bhash = bhash_sum; bhash_sum = 0.; - double time = time_sum; time_sum = 0.; - uint64_t submits = submit_sum; submit_sum = 0; - uint64_t rejects = reject_sum; reject_sum = 0; - uint64_t latency = latency_sum; latency_sum = 0; - memcpy( &five_min_start, &time_now, sizeof time_now ); - - pthread_mutex_unlock( &stats_lock ); - - double ghrate = global_hashrate; - double scaled_ghrate = ghrate; - double shrate = time == 0. ? 0. : hash / time; - double scaled_shrate = shrate; - double avg_share = bhash == 0. ? 0. : hash / bhash * 100.; - uint64_t avg_latency = 0; - double latency_pc = 0.; - double rejects_pc = 0.; - double submit_rate = 0.; - char shr[32]; - char shr_units[4] = {0}; - char ghr[32]; - char ghr_units[4] = {0}; - int temp = cpu_temp(0); - char tempstr[32]; - - if ( submits ) - avg_latency = latency / submits; - - if ( time != 0. ) - { - submit_rate = (double)submits*60. 
/ time; - rejects_pc = (double)rejects / (time*10.); - latency_pc = (double)latency / ( time*10.); + g_work_time = time(NULL); + if (!opt_benchmark) { + restart_threads(); } - - scale_hash_for_display( &scaled_shrate, shr_units ); - scale_hash_for_display( &scaled_ghrate, ghr_units ); - sprintf( ghr, "%.2f %sH/s", scaled_ghrate, ghr_units ); + } - if ( use_colors ) - { - if ( shrate > (128.*ghrate) ) - sprintf( shr, "%s%.2f %sH/s%s", CL_MAG, scaled_shrate, - shr_units, CL_WHT ); - else if ( shrate > (16.*ghrate) ) - sprintf( shr, "%s%.2f %sH/s%s", CL_GRN, scaled_shrate, - shr_units, CL_WHT ); - else if ( shrate > 2.0*ghrate ) - sprintf( shr, "%s%.2f %sH/s%s", CL_CYN, scaled_shrate, - shr_units, CL_WHT ); - else if ( shrate > 0.5*ghrate ) - sprintf( shr, "%.2f %sH/s", scaled_shrate, shr_units ); - else - sprintf( shr, "%s%.2f %sH/s%s", CL_YLW, scaled_shrate, - shr_units, CL_WHT ); - - if ( temp >= 80 ) sprintf( tempstr, "%s%d C%s", - CL_RED, temp, CL_WHT ); - else if (temp >=70 ) sprintf( tempstr, "%s%d C%s", - CL_YLW, temp, CL_WHT ); - else sprintf( tempstr, "%d C", temp ); - } - else - { - sprintf( shr, "%.2f %sH/s", scaled_shrate, shr_units ); - sprintf( tempstr, "%d C", temp ); - } + pthread_rwlock_unlock(&g_work_lock); + } + pthread_rwlock_rdlock(&g_work_lock); - applog(LOG_NOTICE,"Submitted %d shares in %dm%02ds.", - (uint64_t)submits, et.tv_sec / 60, et.tv_sec % 60 ); - applog(LOG_NOTICE,"%d rejects (%.2f%%), %.5f%% block share.", - rejects, rejects_pc, avg_share ); - applog(LOG_NOTICE,"Avg hashrate: Miner %s, Share %s.", ghr, shr ); - -#if ((defined(_WIN64) || defined(__WINDOWS__))) - applog(LOG_NOTICE,"Shares/min: %.2f, latency %d ms (%.2f%%).", - submit_rate, avg_latency, latency_pc ); + algo_gate.get_new_work(&work, &g_work, thr_id, &end_nonce); + work_restart[thr_id].restart = 0; -#else - applog(LOG_NOTICE,"Shares/min: %.2f, latency %d ms (%.2f%%), temp: %s.", - submit_rate, avg_latency, latency_pc, tempstr ); -#endif + 
pthread_rwlock_unlock(&g_work_lock); -/* - applog(LOG_NOTICE,"Submitted %d shares in %dm%02ds, %.5f%% block share.", - (uint64_t)submits, et.tv_sec / 60, et.tv_sec % 60, avg_share ); + } // do_this_thread + algo_gate.resync_threads(thr_id, &work); -#if ((defined(_WIN64) || defined(__WINDOWS__))) - applog(LOG_NOTICE,"Share hashrate %s, latency %d ms (%.2f%%).", - shr, avg_latency, latency_pc ); -#else - applog(LOG_NOTICE,"Share hashrate %s, latency %d ms (%.2f%%), temp %s.", - shr, avg_latency, latency_pc, tempstr ); -#endif -*/ - applog(LOG_INFO,"- - - - - - - - - - - - - - - - - - - - - - - - - - -"); - } - - // display hashrate - if ( !opt_quiet ) - { - char hc[16]; - char hr[16]; - char hc_units[2] = {0,0}; - char hr_units[2] = {0,0}; - double hashcount; - double hashrate; - if ( opt_hash_meter ) - { - hashcount = thr_hashcount[thr_id]; - hashrate = thr_hashrates[thr_id]; - if ( hashcount != 0. ) - { - scale_hash_for_display( &hashcount, hc_units ); - scale_hash_for_display( &hashrate, hr_units ); - if ( hc_units[0] ) - sprintf( hc, "%.2f", hashcount ); - else // no fractions of a hash - sprintf( hc, "%.0f", hashcount ); - sprintf( hr, "%.2f", hashrate ); - applog( LOG_INFO, "CPU #%d: %s %sH, %s %sH/s", - thr_id, hc, hc_units, hr, hr_units ); - } - } - if ( thr_id == 0 ) - { - hashcount = 0.; - hashrate = 0.; - for ( i = 0; i < opt_n_threads; i++ ) - { - hashrate += thr_hashrates[i]; - hashcount += thr_hashcount[i]; - } - if ( hashcount != 0. ) - { - scale_hash_for_display( &hashcount, hc_units ); - scale_hash_for_display( &hashrate, hr_units ); - if ( hc_units[0] ) - sprintf( hc, "%.2f", hashcount ); - else // no fractions of a hash - sprintf( hc, "%.0f", hashcount ); - sprintf( hr, "%.2f", hashrate ); - applog( LOG_NOTICE, "Miner perf: %s %sH, %s %sH/s.", - hc, hc_units, hr, hr_units ); - } - } - } - - // Display benchmark total - // Update hashrate for API if no shares accepted yet. 
- if ( ( opt_benchmark || !accepted_share_count ) - && thr_id == opt_n_threads - 1 ) - { - double hashrate = 0.; - double hashcount = 0.; - for ( i = 0; i < opt_n_threads; i++ ) - { - hashrate += thr_hashrates[i]; - hashcount += thr_hashcount[i]; - } - if ( hashcount != 0. ) - { - global_hashcount = hashcount; - global_hashrate = hashrate; - if ( opt_benchmark ) - { - char hc[16]; - char hc_units[2] = {0,0}; - char hr[16]; - char hr_units[2] = {0,0}; - scale_hash_for_display( &hashcount, hc_units ); - scale_hash_for_display( &hashrate, hr_units ); - if ( hc_units[0] ) - sprintf( hc, "%.2f", hashcount ); - else // no fractions of a hash - sprintf( hc, "%.0f", hashcount ); - sprintf( hr, "%.2f", hashrate ); -#if ((defined(_WIN64) || defined(__WINDOWS__))) - applog( LOG_NOTICE, "Total: %s %sH, %s %sH/s", - hc, hc_units, hr, hr_units ); + if (dev_mining) { + if (unlikely(!algo_gate.ready_to_mine(&work, &dev_stratum, thr_id))) + continue; + } else { + if (unlikely(!algo_gate.ready_to_mine(&work, &stratum, thr_id))) + continue; + } + + // LP_SCANTIME overrides opt_scantime option, is this right? + + // adjust max_nonce to meet target scan time. Stratum and longpoll + // can go longer because they can rely on restart_threads to signal + // an early abort. 
get_work on the other hand can't rely on + // restart_threads so need a much shorter scantime + if (have_stratum) + max64 = 60 * thr_hashrates[thr_id]; + else if (have_longpoll) + max64 = LP_SCANTIME * thr_hashrates[thr_id]; + else // getwork inline + max64 = opt_scantime * thr_hashrates[thr_id]; + + // time limit + if (unlikely(opt_time_limit && firstwork_time)) { + int passed = (int)(time(NULL) - firstwork_time); + int remain = (int)(opt_time_limit - passed); + if (remain < 0) { + if (thr_id != 0) { + sleep(1); + continue; + } + if (opt_benchmark) { + char rate[32]; + format_hashrate(global_hashrate, rate); + applog(LOG_NOTICE, "Benchmark: %s", rate); + fprintf(stderr, "%llu\n", (unsigned long long)global_hashrate); + } else + applog(LOG_NOTICE, "Mining timeout of %ds reached, exiting...", + opt_time_limit); + proper_exit(0); + } + if (remain < max64) + max64 = remain; + } + + // Select nonce range based on max64, the estimated number of hashes + // to meet the desired scan time. + // Initial value arbitrarilly set to 1000 just to get + // a sample hashrate for the next time. + uint32_t work_nonce = *nonceptr; + if (max64 <= 0) + max64 = 1000; + if (work_nonce + max64 > end_nonce) + max_nonce = end_nonce; + else + max_nonce = work_nonce + (uint32_t)max64; + + // init time + if (firstwork_time == 0) + firstwork_time = time(NULL); + hashes_done = 0; + gettimeofday((struct timeval *)&tv_start, NULL); + + // Scan for nonce + nonce_found = algo_gate.scanhash(&work, max_nonce, &hashes_done, mythr); + + // record scanhash elapsed time + gettimeofday(&tv_end, NULL); + timeval_subtract(&diff, &tv_end, &tv_start); + if (diff.tv_usec || diff.tv_sec) { + pthread_mutex_lock(&stats_lock); + thr_hashrates[thr_id] = hashes_done / (diff.tv_sec + diff.tv_usec * 1e-6); + pthread_mutex_unlock(&stats_lock); + } + + // This code is deprecated, scanhash should never return true. + // This remains as a backup in case some old implementations still exist. 
+ // If unsubmiited nonce(s) found, submit now. + if (unlikely(nonce_found && !opt_benchmark)) { + // applog( LOG_WARNING, "BUG: See RELEASE_NOTES for reporting + // bugs. Algo = %s.", + // algo_names[ opt_algo ] ); + if (!submit_work(mythr, &work)) { + applog(LOG_WARNING, "Failed to submit share."); + break; + } + if (!opt_quiet) + applog(LOG_NOTICE, "%d: submitted by thread %d.", + accepted_share_count + rejected_share_count + 1, mythr->id); + + // prevent stale work in solo + // we can't submit twice a block! + if unlikely (!have_stratum && !have_longpoll) { + pthread_rwlock_wrlock(&g_work_lock); + // will force getwork + g_work_time = 0; + pthread_rwlock_unlock(&g_work_lock); + } + } + + // display hashrate + if (unlikely(opt_hash_meter)) { + char hr[16]; + char hr_units[2] = {0, 0}; + double hashrate; + + hashrate = thr_hashrates[thr_id]; + if (hashrate != 0.) { + scale_hash_for_display(&hashrate, hr_units); + sprintf(hr, "%.2f", hashrate); + applog(LOG_INFO, "CPU #%d: %s %sh/s", thr_id, hr, hr_units); + } + } + + // Display benchmark total + // Update hashrate for API if no shares accepted yet. 
+ if (unlikely((opt_benchmark || !accepted_share_count) && + thr_id == opt_n_threads - 1)) { + double hashrate = 0.; + + pthread_mutex_lock(&stats_lock); + for (i = 0; i < opt_n_threads; i++) + hashrate += thr_hashrates[i]; + global_hashrate = hashrate; + pthread_mutex_unlock(&stats_lock); + + if (opt_benchmark) { + char hr[16]; + char hr_units[2] = {0, 0}; + scale_hash_for_display(&hashrate, hr_units); + sprintf(hr, "%.2f", hashrate); +#if ((defined(_WIN64) || defined(__WINDOWS__)) || defined(_WIN32)) + applog(LOG_NOTICE, "Total: %s %sH/s", hr, hr_units); #else - applog( LOG_NOTICE, "Total: %s %sH, %s %sH/s, %dC", - hc, hc_units, hr, hr_units, (uint32_t)cpu_temp(0) ); + applog(LOG_NOTICE, "Total: %s %sH/s, CPU temp: %dC", hr, hr_units, + (uint32_t)cpu_temp(0)); #endif - } - } - } - } // miner_thread loop + } + } // benchmark -out: - tq_freeze(mythr->q); - return NULL; -} + // conditional mining + if (unlikely(!wanna_mine(thr_id))) { + sleep(5); + continue; + } -void restart_threads(void) -{ - for ( int i = 0; i < opt_n_threads; i++) - work_restart[i].restart = 1; -} + } // miner_thread loop -json_t *std_longpoll_rpc_call( CURL *curl, int *err, char* lp_url ) -{ - json_t *val; - char *req = NULL; - if (have_gbt) - { - req = (char*) malloc( strlen(gbt_lp_req) + strlen(lp_id) + 1 ); - sprintf( req, gbt_lp_req, lp_id ); - } - val = json_rpc_call( curl, rpc_url, rpc_userpass, getwork_req, err, - JSON_RPC_LONGPOLL ); - val = json_rpc_call( curl, lp_url, rpc_userpass, req ? 
req : getwork_req, - err, JSON_RPC_LONGPOLL); - free(req); - return val; +out: + tq_freeze(mythr->q); + return NULL; } -json_t *jr2_longpoll_rpc_call( CURL *curl, int *err ) -{ - json_t *val; - char req[128]; - - pthread_mutex_lock( &rpc2_login_lock ); - if ( !strlen(rpc2_id) ) - { - pthread_mutex_unlock( &rpc2_login_lock ); - sleep(1); - return NULL; - } - snprintf( req, 128, "{\"method\": \"getjob\", \"params\": {\"id\": \"%s\"}, \"id\":1}\r\n", rpc2_id ); - pthread_mutex_unlock( &rpc2_login_lock ); - val = json_rpc2_call( curl, rpc_url, rpc_userpass, req, err, - JSON_RPC_LONGPOLL ); - return val; +void restart_threads(void) { + for (int i = 0; i < opt_n_threads; i++) + work_restart[i].restart = 1; + if (opt_debug) + applog(LOG_INFO, "Threads restarted for new work."); } -static void *longpoll_thread(void *userdata) -{ - struct thr_info *mythr = (struct thr_info*) userdata; - CURL *curl = NULL; - char *copy_start, *hdr_path = NULL, *lp_url = NULL; - bool need_slash = false; - - curl = curl_easy_init(); - if (unlikely(!curl)) - { - applog(LOG_ERR, "CURL init failed"); - goto out; - } +json_t *std_longpoll_rpc_call(CURL *curl, int *err, char *lp_url) { + json_t *val; + char *req = NULL; + if (have_gbt) { + req = (char *)malloc(strlen(gbt_lp_req) + strlen(lp_id) + 1); + sprintf(req, gbt_lp_req, lp_id); + } + val = json_rpc_call(curl, rpc_url, rpc_userpass, getwork_req, err, + JSON_RPC_LONGPOLL); + val = json_rpc_call(curl, lp_url, rpc_userpass, req ? 
req : getwork_req, err, + JSON_RPC_LONGPOLL); + free(req); + return val; +} + +static void *longpoll_thread(void *userdata) { + struct thr_info *mythr = (struct thr_info *)userdata; + CURL *curl = NULL; + char *copy_start, *hdr_path = NULL, *lp_url = NULL; + bool need_slash = false; + + curl = curl_easy_init(); + if (unlikely(!curl)) { + applog(LOG_ERR, "CURL init failed"); + goto out; + } start: - hdr_path = (char*) tq_pop(mythr->q, NULL); - if (!hdr_path) - goto out; - - /* full URL */ - if (strstr(hdr_path, "://")) - { - lp_url = hdr_path; - hdr_path = NULL; - } - else - /* absolute path, on current server */ - { - copy_start = (*hdr_path == '/') ? (hdr_path + 1) : hdr_path; - if (rpc_url[strlen(rpc_url) - 1] != '/') - need_slash = true; - - lp_url = (char*) malloc(strlen(rpc_url) + strlen(copy_start) + 2); - if (!lp_url) - goto out; - - sprintf(lp_url, "%s%s%s", rpc_url, need_slash ? "/" : "", copy_start); - } - - if (!opt_quiet) - applog(LOG_BLUE, "Long-polling on %s", lp_url); - - while (1) - { - int err; - json_t *val; - val = (json_t*)algo_gate.longpoll_rpc_call( curl, &err, lp_url ); + hdr_path = (char *)tq_pop(mythr->q, NULL); + if (!hdr_path) + goto out; + + /* full URL */ + if (strstr(hdr_path, "://")) { + lp_url = hdr_path; + hdr_path = NULL; + } else + /* absolute path, on current server */ + { + copy_start = (*hdr_path == '/') ? (hdr_path + 1) : hdr_path; + if (rpc_url[strlen(rpc_url) - 1] != '/') + need_slash = true; - if (have_stratum) - { - if (val) - json_decref(val); - goto out; - } - if (likely( val )) - { - bool rc; - char *start_job_id; - double start_diff = 0.0; - json_t *res, *soval; - res = json_object_get(val, "result"); - if (!jsonrpc_2) - { - soval = json_object_get(res, "submitold"); - submit_old = soval ? json_is_true(soval) : false; - } - pthread_mutex_lock(&g_work_lock); - start_job_id = g_work.job_id ? 
strdup(g_work.job_id) : NULL; - if (have_gbt) - rc = gbt_work_decode(res, &g_work); - else - rc = work_decode(res, &g_work); - if (rc) - { - bool newblock = g_work.job_id && strcmp(start_job_id, g_work.job_id); - newblock |= (start_diff != net_diff); // the best is the height but... longpoll... - if (newblock) - { - start_diff = net_diff; - if (!opt_quiet) - { - char netinfo[64] = { 0 }; - if (net_diff > 0.) - { - sprintf(netinfo, ", diff %.3f", net_diff); - } - if (opt_showdiff) - sprintf( &netinfo[strlen(netinfo)], ", target %.3f", - g_work.targetdiff ); - applog(LOG_BLUE, "%s detected new block%s", short_url, netinfo); - } - time(&g_work_time); - restart_threads(); - } - } - free(start_job_id); - pthread_mutex_unlock(&g_work_lock); - json_decref(val); + lp_url = (char *)malloc(strlen(rpc_url) + strlen(copy_start) + 2); + if (!lp_url) + goto out; + + sprintf(lp_url, "%s%s%s", rpc_url, need_slash ? "/" : "", copy_start); + } + + if (!opt_quiet) + applog(LOG_BLUE, "Long-polling on %s", lp_url); + + while (1) { + int err; + json_t *val; + val = (json_t *)algo_gate.longpoll_rpc_call(curl, &err, lp_url); + + if (have_stratum) { + if (val) + json_decref(val); + goto out; + } + if (likely(val)) { + bool rc; + char *start_job_id; + double start_diff = 0.0; + json_t *res, *soval; + res = json_object_get(val, "result"); + soval = json_object_get(res, "submitold"); + submit_old = soval ? json_is_true(soval) : false; + + pthread_rwlock_wrlock(&g_work_lock); + + // This code has been here for a long time even though job_id isn't + // used. This needs to be changed eventually to test the block height + // properly using g_work.block_height . + start_job_id = g_work.job_id ? 
strdup(g_work.job_id) : NULL; + if (have_gbt) + rc = gbt_work_decode(res, &g_work); + else + rc = work_decode(res, &g_work); + if (rc) { + // purge job id from solo mining + bool newblock = g_work.job_id && strcmp(start_job_id, g_work.job_id); + newblock |= (start_diff != + net_diff); // the best is the height but... longpoll... + if (newblock) { + start_diff = net_diff; + if (!opt_quiet) { + char netinfo[64] = {0}; + if (net_diff > 0.) { + sprintf(netinfo, ", diff %.3f", net_diff); + } + sprintf(&netinfo[strlen(netinfo)], ", target %.3f", + g_work.targetdiff); + applog(LOG_BLUE, "%s detected new block%s", short_url, netinfo); + } + time(&g_work_time); + restart_threads(); + } } - else // !val - { - pthread_mutex_lock(&g_work_lock); - g_work_time -= LP_SCANTIME; - pthread_mutex_unlock(&g_work_lock); - if (err == CURLE_OPERATION_TIMEDOUT) - { - restart_threads(); - } - else - { - have_longpoll = false; - restart_threads(); - free(hdr_path); - free(lp_url); - lp_url = NULL; - sleep(opt_fail_pause); - goto start; - } + free(start_job_id); + + pthread_rwlock_unlock(&g_work_lock); + + json_decref(val); + } else // !val + { + pthread_rwlock_wrlock(&g_work_lock); + g_work_time -= LP_SCANTIME; + pthread_rwlock_unlock(&g_work_lock); + if (err == CURLE_OPERATION_TIMEDOUT) { + restart_threads(); + } else { + have_longpoll = false; + restart_threads(); + free(hdr_path); + free(lp_url); + lp_url = NULL; + sleep(opt_fail_pause); + goto start; } - } + } + } out: - free(hdr_path); - free(lp_url); - tq_freeze(mythr->q); - if (curl) - curl_easy_cleanup(curl); + free(hdr_path); + free(lp_url); + tq_freeze(mythr->q); + if (curl) + curl_easy_cleanup(curl); - return NULL; + return NULL; } -bool std_stratum_handle_response( json_t *val ) -{ - bool valid = false; - json_t *err_val, *res_val, *id_val; - res_val = json_object_get( val, "result" ); - err_val = json_object_get( val, "error" ); - id_val = json_object_get( val, "id" ); - - if ( !res_val || json_integer_value(id_val) < 4 ) - 
return false; - valid = json_is_true( res_val ); - share_result( valid, NULL, err_val ? - json_string_value( json_array_get(err_val, 1) ) : NULL ); - return true; -} +static bool stratum_handle_response(char *buf) { + json_t *val, *id_val, *res_val, *err_val; + json_error_t err; + bool ret = false; + bool share_accepted = false; -bool jr2_stratum_handle_response( json_t *val ) -{ - bool valid = false; - json_t *err_val, *res_val; - res_val = json_object_get( val, "result" ); - err_val = json_object_get( val, "error" ); + val = JSON_LOADS(buf, &err); + if (!val) { + applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text); + goto out; + } + res_val = json_object_get(val, "result"); + if (!res_val) { /* now what? */ + } - if ( !res_val && !err_val ) - return false; - json_t *status = json_object_get( res_val, "status" ); - if ( status ) - { - const char *s = json_string_value( status ); - valid = !strcmp( s, "OK" ) && json_is_null( err_val ); - } - else - valid = json_is_null( err_val ); - share_result( valid, NULL, err_val ? json_string_value(err_val) : NULL ); - return true; -} + id_val = json_object_get(val, "id"); + if (!id_val || json_is_null(id_val)) + goto out; -static bool stratum_handle_response( char *buf ) -{ - json_t *val, *id_val, *res_val; - json_error_t err; - bool ret = false; - - val = JSON_LOADS( buf, &err ); - if (!val) - { - applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text); - goto out; - } - res_val = json_object_get( val, "result" ); - if ( !res_val ) { /* now what? */ } - - id_val = json_object_get( val, "id" ); - if ( !id_val || json_is_null(id_val) ) - goto out; - if ( !algo_gate.stratum_handle_response( val ) ) - goto out; - ret = true; + err_val = json_object_get(val, "error"); + + if (!res_val || json_integer_value(id_val) < 4) + goto out; + share_accepted = json_is_true(res_val); + share_result(share_accepted, NULL, + err_val ? 
json_string_value(json_array_get(err_val, 1)) : NULL); + + ret = true; out: - if (val) - json_decref(val); - return ret; + if (val) + json_decref(val); + return ret; } // used by stratum and gbt -void std_build_block_header( struct work* g_work, uint32_t version, - uint32_t *prevhash, uint32_t *merkle_tree, - uint32_t ntime, uint32_t nbits ) -{ - int i; +void std_build_block_header(struct work *g_work, uint32_t version, + uint32_t *prevhash, uint32_t *merkle_tree, + uint32_t ntime, uint32_t nbits, + unsigned char *final_sapling_hash) { + int i; - memset( g_work->data, 0, sizeof(g_work->data) ); - g_work->data[0] = version; + memset(g_work->data, 0, sizeof(g_work->data)); + g_work->data[0] = version; + g_work->sapling = opt_sapling; - if ( have_stratum ) - for ( i = 0; i < 8; i++ ) - g_work->data[ 1+i ] = le32dec( prevhash + i ); - else + if (have_stratum) + for (i = 0; i < 8; i++) + g_work->data[1 + i] = le32dec(prevhash + i); + else + for (i = 0; i < 8; i++) + g_work->data[8 - i] = le32dec(prevhash + i); + for (i = 0; i < 8; i++) + g_work->data[9 + i] = be32dec(merkle_tree + i); + g_work->data[algo_gate.ntime_index] = ntime; + g_work->data[algo_gate.nbits_index] = nbits; + + if (g_work->sapling) { + if (have_stratum) for (i = 0; i < 8; i++) - g_work->data[ 8-i ] = le32dec( prevhash + i ); - - for ( i = 0; i < 8; i++ ) - g_work->data[ 9+i ] = be32dec( merkle_tree + i ); - - g_work->data[ algo_gate.ntime_index ] = ntime; - g_work->data[ algo_gate.nbits_index ] = nbits; - g_work->data[20] = 0x80000000; - g_work->data[31] = 0x00000280; + g_work->data[20 + i] = le32dec((uint32_t *)final_sapling_hash + i); + else { + for (i = 0; i < 8; i++) + g_work->data[27 - i] = le32dec((uint32_t *)final_sapling_hash + i); + g_work->data[19] = 0; + } + g_work->data[28] = 0x80000000; + g_work->data[29] = 0x00000000; + g_work->data[30] = 0x00000000; + g_work->data[31] = 0x00000380; + } else { + g_work->data[20] = 0x80000000; + g_work->data[31] = 0x00000280; + } } -void 
std_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) -{ - uchar merkle_tree[64] = { 0 }; - size_t t; - - algo_gate.gen_merkle_root( merkle_tree, sctx ); - // Increment extranonce2 - for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); - // Assemble block header - algo_gate.build_block_header( g_work, le32dec( sctx->job.version ), - (uint32_t*) sctx->job.prevhash, (uint32_t*) merkle_tree, - le32dec( sctx->job.ntime ), le32dec(sctx->job.nbits) ); -} +void std_build_extraheader(struct work *g_work, struct stratum_ctx *sctx) { + uchar merkle_tree[64] = {0}; + + algo_gate.gen_merkle_root(merkle_tree, sctx); + algo_gate.build_block_header( + g_work, le32dec(sctx->job.version), (uint32_t *)sctx->job.prevhash, + (uint32_t *)merkle_tree, le32dec(sctx->job.ntime), + le32dec(sctx->job.nbits), sctx->job.final_sapling_hash); +} + +static void *stratum_thread(void *userdata) { + struct thr_info *mythr = (struct thr_info *)userdata; + char *s = NULL; + + stratum.url = (char *)tq_pop(mythr->q, NULL); + if (!stratum.url) + goto out; + applog(LOG_BLUE, "Stratum connect %s", short_url); + + struct timeval now; + gettimeofday(&now, NULL); + while (1) { + int failures = 0; + if (unlikely(stratum_need_reset)) { + stratum_need_reset = false; + stratum_disconnect(&stratum); + if (strcmp(stratum.url, rpc_url)) { + free(stratum.url); + stratum.url = strdup(rpc_url); + applog(LOG_BLUE, "Connection changed to %s", short_url); + } else // if ( !opt_quiet ) + applog(LOG_WARNING, "Stratum connection reset"); + // reset stats queue as well + s_get_ptr = s_put_ptr = 0; + } -void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) -{ - pthread_mutex_lock( &sctx->work_lock ); - free( g_work->job_id ); - g_work->job_id = strdup( sctx->job.job_id ); - g_work->xnonce2_len = sctx->xnonce2_size; - g_work->xnonce2 = (uchar*) realloc( g_work->xnonce2, sctx->xnonce2_size ); - memcpy( g_work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size ); - - 
algo_gate.build_extraheader( g_work, sctx ); - - net_diff = algo_gate.calc_network_diff( g_work ); - algo_gate.set_work_data_endian( g_work ); - pthread_mutex_unlock( &sctx->work_lock ); - -// if ( !opt_quiet ) -// applog( LOG_BLUE,"New job %s.", g_work->job_id ); - - if ( opt_debug ) - { - unsigned char *xnonce2str = abin2hex( g_work->xnonce2, - g_work->xnonce2_len ); - applog( LOG_DEBUG, "DEBUG: job_id='%s' extranonce2=%s ntime=%08x", - g_work->job_id, xnonce2str, swab32( g_work->data[17] ) ); - free( xnonce2str ); - } - - algo_gate.set_target( g_work, sctx->job.diff ); - - if ( stratum_diff != sctx->job.diff ) - { -// char sdiff[32] = { 0 }; - // store for api stats - stratum_diff = sctx->job.diff; - if ( !opt_quiet && opt_showdiff && g_work->targetdiff != stratum_diff ) - { -// snprintf( sdiff, 32, " (%.5f)", g_work->targetdiff ); - applog( LOG_BLUE, "Stratum difficulty set to %g", stratum_diff ); -// sdiff ); - } - } -} + while (!stratum.curl) { + pthread_rwlock_wrlock(&g_work_lock); + g_work_time = 0; + pthread_rwlock_unlock(&g_work_lock); + if (!stratum_connect(&stratum, stratum.url) || + !stratum_subscribe(&stratum) || + !stratum_authorize(&stratum, rpc_user, rpc_pass)) { + stratum_disconnect(&stratum); + if (opt_retries >= 0 && ++failures > opt_retries) { + applog(LOG_ERR, "...terminating workio thread"); + tq_push(thr_info[work_thr_id].q, NULL); + goto out; + } + if (!opt_benchmark) + applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); + sleep(opt_fail_pause); + } else { + restart_threads(); + applog(LOG_BLUE, "Stratum connection established"); + } + } -void jr2_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) -{ - pthread_mutex_lock( &sctx->work_lock ); - work_free( g_work ); - work_copy( g_work, &sctx->work ); - pthread_mutex_unlock( &sctx->work_lock ); -} + // Still check if it was changed midway. 
+ if (dev_mining) { + // 1% of 1h == 3600s => 36s + struct timeval shift = {dev_start.tv_sec + + ceil(dev_interval.tv_sec * dev_fee), + dev_start.tv_usec}; + while (timercmp(&now, &shift, <)) { + usleep(250000); // Check once every 250ms. + gettimeofday(&now, NULL); + } + struct timeval shifted = {now.tv_sec + dev_interval.tv_sec, now.tv_usec}; + dev_start = shifted; + dev_mining = false; + applog(LOG_ERR, "Dev fee collected."); + } + report_summary_log((stratum_diff != stratum.job.diff) && + (stratum_diff != 0.)); + if (stratum.new_job) + stratum_gen_work(&stratum, &g_work, false); + + if (likely(stratum_socket_full(&stratum, opt_timeout))) { + if (likely(s = stratum_recv_line(&stratum))) { + if (likely(!stratum_handle_method(&stratum, s))) + stratum_handle_response(s); + free(s); + } else { + applog(LOG_WARNING, "Stratum connection interrupted"); + stratum_disconnect(&stratum); + } + } else { + applog(LOG_ERR, "Stratum connection timeout"); + stratum_disconnect(&stratum); + } -static void *stratum_thread(void *userdata ) -{ - struct thr_info *mythr = (struct thr_info *) userdata; - char *s; + } // loop +out: + return NULL; +} - stratum.url = (char*) tq_pop(mythr->q, NULL); - if (!stratum.url) - goto out; - applog(LOG_INFO, "Starting Stratum on %s", stratum.url); +static void *dev_stratum_thread(void *userdata) { + struct thr_info *mythr = (struct thr_info *)userdata; + char *s = NULL; + + dev_stratum.url = (char *)tq_pop(mythr->q, NULL); + if (!dev_stratum.url) + goto out; + if (opt_debug) + applog(LOG_BLUE, "Dev stratum connect %s", dev_stratum.url); + + dev_pools[4] = strdup(rpc_url); + int dev_pool_id = 0; + bool first = true; + while (1) { + int failures = 0; + if (unlikely(dev_stratum_need_reset)) { + dev_stratum_need_reset = false; + stratum_disconnect(&dev_stratum); + if (strcmp(dev_stratum.url, dev_pools[dev_pool_id])) { + free(dev_stratum.url); + dev_stratum.url = strdup(dev_pools[dev_pool_id]); + if (opt_debug) + applog(LOG_BLUE, "Dev connection changed 
to %s", dev_stratum.url); + } else // if ( !opt_quiet ) + if (opt_debug) + applog(LOG_WARNING, "Dev stratum connection reset"); + // reset stats queue as well + // s_get_ptr = s_put_ptr = 0; + } - while (1) - { - int failures = 0; + while (!dev_stratum.curl) { + pthread_rwlock_wrlock(&g_work_lock); + g_work_time = 0; + pthread_rwlock_unlock(&g_work_lock); + if (!stratum_connect(&dev_stratum, dev_stratum.url) || + !stratum_subscribe(&dev_stratum) || + !stratum_authorize( + &dev_stratum, "RQKcAZBtsSacMUiGNnbk3h3KJAN94tstvt.devfee", "x")) { + stratum_disconnect(&dev_stratum); + if (++failures > 3) { + failures = 0; + dev_stratum.url = dev_pools[dev_pool_id++]; + } + // No more pools. Just skip this round. + if (dev_pool_id == 5) { + break; + } + } else { + if (!first) { + restart_threads(); + } else { + first = false; + rpc_userpass = strdup(dev_userpass); + } - if ( stratum_need_reset ) - { - stratum_need_reset = false; - stratum_disconnect( &stratum ); - if ( strcmp( stratum.url, rpc_url ) ) - { - free( stratum.url ); - stratum.url = strdup( rpc_url ); - applog(LOG_BLUE, "Connection changed to %s", short_url); - } - else if ( !opt_quiet ) - applog(LOG_DEBUG, "Stratum connection reset"); + if (opt_debug) + applog(LOG_BLUE, "Dev stratum connection established"); } + } - while ( !stratum.curl ) - { - pthread_mutex_lock( &g_work_lock ); - g_work_time = 0; - pthread_mutex_unlock( &g_work_lock ); - restart_threads(); - if ( !stratum_connect( &stratum, stratum.url ) - || !stratum_subscribe( &stratum ) - || !stratum_authorize( &stratum, rpc_user, rpc_pass ) ) - { - stratum_disconnect( &stratum ); - if (opt_retries >= 0 && ++failures > opt_retries) - { - applog(LOG_ERR, "...terminating workio thread"); - tq_push(thr_info[work_thr_id].q, NULL); - goto out; - } - if (!opt_benchmark) - applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); - sleep(opt_fail_pause); - } - - if (jsonrpc_2) - { - work_free(&g_work); - work_copy(&g_work, &stratum.work); - } + // 
report_summary_log((stratum_diff != stratum.job.diff) && + // (stratum_diff != 0.)); + + // Still check if it was changed midway. + if (!dev_mining) { + struct timeval now; + gettimeofday(&now, NULL); + while (timercmp(&now, &dev_start, <)) { + usleep(1000000); // Check once every 1s. + gettimeofday(&now, NULL); } + dev_start = now; + dev_mining = true; + applog(LOG_ERR, "Dev fee started!"); + } + if (dev_stratum.new_job) + stratum_gen_work(&dev_stratum, &g_work, true); - if ( stratum.job.job_id - && ( !g_work_time || strcmp( stratum.job.job_id, g_work.job_id ) ) ) - { - pthread_mutex_lock(&g_work_lock); - algo_gate.stratum_gen_work( &stratum, &g_work ); - time(&g_work_time); - pthread_mutex_unlock(&g_work_lock); - restart_threads(); - - if ( stratum.job.clean || jsonrpc_2 ) - { - static uint32_t last_bloc_height; - if ( last_bloc_height != stratum.bloc_height ) - { - last_bloc_height = stratum.bloc_height; - if ( !opt_quiet ) - { - if ( net_diff > 0. ) - applog( LOG_BLUE, - "%s block %d, job %s, network diff %.4f", - algo_names[opt_algo], stratum.bloc_height, - g_work.job_id, net_diff); - else - applog( LOG_BLUE, "%s %s block %d, job %s", - short_url, algo_names[opt_algo], - stratum.bloc_height, g_work.job_id ); - } - } - else if ( !opt_quiet ) - applog( LOG_BLUE,"New job %s.", g_work.job_id ); - } - else if (opt_debug && !opt_quiet) - { - applog( LOG_BLUE, "%s asks job %d for block %d", short_url, - strtoul( stratum.job.job_id, NULL, 16 ), stratum.bloc_height ); - } - } // stratum.job.job_id - - if ( !stratum_socket_full( &stratum, opt_timeout ) ) - { - applog(LOG_ERR, "Stratum connection timeout"); - s = NULL; - } - else - s = stratum_recv_line(&stratum); - if ( !s ) - { - stratum_disconnect(&stratum); -// applog(LOG_WARNING, "Stratum connection interrupted"); - continue; - } - if (!stratum_handle_method(&stratum, s)) + if (likely(stratum_socket_full(&dev_stratum, opt_timeout))) { + if (likely(s = stratum_recv_line(&dev_stratum))) { + if 
(likely(!stratum_handle_method(&dev_stratum, s))) stratum_handle_response(s); - free(s); - } // loop + free(s); + } else { + if (opt_debug) + applog(LOG_WARNING, "Dev stratum connection interrupted"); + stratum_disconnect(&dev_stratum); + } + } else { + if (opt_debug) + applog(LOG_ERR, "Dev stratum connection timeout"); + stratum_disconnect(&dev_stratum); + } + } // loop out: return NULL; } -void show_version_and_exit(void) -{ - printf("\n built on " __DATE__ +static void show_credits() { + printf("\n ********** " PACKAGE_NAME " " PACKAGE_VERSION + " *********** \n"); + printf(" A CPU miner with multi algo support and optimized for CPUs\n"); + printf(" with AVX512, SHA and VAES extensions by JayDDee.\n"); + printf(" BTC donation address: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT\n\n"); +} + +#define check_cpu_capability() cpu_capability(false) +#define display_cpu_capability() cpu_capability(true) +static bool cpu_capability(bool display_only) { + char cpu_brand[0x40]; + bool cpu_is_arm = true; + bool cpu_has_sse2 = has_sse2(); + bool cpu_has_aes = has_aes_ni(); + bool cpu_has_sse42 = has_sse42(); + bool cpu_has_avx = has_avx(); + bool cpu_has_avx2 = has_avx2(); + bool cpu_has_sha = has_sha(); + bool cpu_has_avx512 = has_avx512(); + bool cpu_has_vaes = has_vaes(); + bool sw_has_aes = false; + bool sw_has_sse2 = false; + bool sw_has_sse42 = false; + bool sw_has_avx = false; + bool sw_has_avx2 = false; + bool sw_has_avx512 = false; + bool sw_has_sha = false; + bool sw_has_vaes = false; + set_t algo_features = algo_gate.optimizations; + bool algo_has_sse2 = set_incl(SSE2_OPT, algo_features); + bool algo_has_aes = set_incl(AES_OPT, algo_features); + bool algo_has_sse42 = set_incl(SSE42_OPT, algo_features); + bool algo_has_avx2 = set_incl(AVX2_OPT, algo_features); + bool algo_has_avx512 = set_incl(AVX512_OPT, algo_features); + bool algo_has_sha = set_incl(SHA_OPT, algo_features); + bool algo_has_vaes = set_incl(VAES_OPT, algo_features); + bool algo_has_vaes256 = 
set_incl(VAES256_OPT, algo_features); + bool use_aes; + bool use_sse2; + bool use_sse42; + bool use_avx2; + bool use_avx512; + bool use_sha; + bool use_vaes; + bool use_none; + +#ifdef __AES__ + sw_has_aes = true; +#endif +#ifdef __SSE2__ + sw_has_sse2 = true; +#endif +#ifdef __SSE4_2__ + sw_has_sse42 = true; +#endif +#ifdef __AVX__ + sw_has_avx = true; +#endif +#ifdef __AVX2__ + sw_has_avx2 = true; +#endif +#if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && \ + defined(__AVX512VL__)) + sw_has_avx512 = true; +#endif +#ifdef __SHA__ + sw_has_sha = true; +#endif +#ifdef __VAES__ + sw_has_vaes = true; +#endif + + // #if !((__AES__) || (__SSE2__)) + // printf("Neither __AES__ nor __SSE2__ defined.\n"); + // #endif + + cpu_brand_string(cpu_brand); + printf("CPU: %s\n", cpu_brand); + + printf("SW built on " __DATE__ +#ifdef _MSC_VER + " with VC++ 2013\n"); +#elif defined(__GNUC__) + " with GCC"); + printf(" %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__); +#else + printf("\n"); +#endif + + printf("CPU features: "); + if (cpu_has_avx512) + printf(" AVX512"); + else if (cpu_has_avx2) + printf(" AVX2 "); + else if (cpu_has_avx) + printf(" AVX "); + else if (cpu_has_sse42) + printf(" SSE4.2"); + else if (cpu_has_sse2) + printf(" SSE2 "); + if (cpu_has_vaes) + printf(" VAES"); + else if (cpu_has_aes) + printf(" AES"); + if (cpu_has_sha) + printf(" SHA"); + + printf("\nSW features: "); + if (sw_has_avx512) + printf(" AVX512"); + else if (sw_has_avx2) + printf(" AVX2 "); + else if (sw_has_avx) + printf(" AVX "); + else if (sw_has_sse42) + printf(" SSE4.2"); + else if (sw_has_sse2) + printf(" SSE2 "); + if (sw_has_vaes) + printf(" VAES"); + else if (sw_has_aes) + printf(" AES"); + if (sw_has_sha) + printf(" SHA"); + + printf("\nAlgo features:"); + if (algo_features == EMPTY_SET) + printf(" None"); + else { + if (algo_has_avx512) + printf(" AVX512"); + else if (algo_has_avx2) + printf(" AVX2 "); + else if (algo_has_sse42) + printf(" 
SSE4.2"); + else if (algo_has_sse2) + printf(" SSE2 "); + if (algo_has_vaes) + printf(" VAES"); + else if (algo_has_aes) + printf(" AES"); + if (algo_has_sha) + printf(" SHA"); + } + printf("\n"); + + if (display_only) + return true; + + // Check for CPU and build incompatibilities + if (!cpu_has_sse2 && !cpu_is_arm) { + printf("A CPU with SSE2 is required to use cpuminer-opt\n"); + return false; + } + if (sw_has_avx2 && !(cpu_has_avx2 && cpu_has_aes)) { + printf("The SW build requires a CPU with AES and AVX2!\n"); + return false; + } + if (sw_has_sse42 && !cpu_has_sse42) { + printf("The SW build requires a CPU with SSE4.2!\n"); + return false; + } + if (sw_has_aes && !cpu_has_aes) { + printf("The SW build requires a CPU with AES!\n"); + return false; + } + if (sw_has_sha && !cpu_has_sha) { + printf("The SW build requires a CPU with SHA!\n"); + return false; + } + + // Determine mining options + use_sse2 = cpu_has_sse2 && algo_has_sse2; + use_aes = cpu_has_aes && sw_has_aes && algo_has_aes; + use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42; + use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2; + use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; + use_sha = cpu_has_sha && sw_has_sha && algo_has_sha; + use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes && + (use_avx512 || algo_has_vaes256); + use_none = !(use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 || + use_sha || use_vaes); + + // Display best options + printf("\nStarting miner with"); + if (use_none) + printf(" no optimizations"); + else { + if (use_avx512) + printf(" AVX512"); + else if (use_avx2) + printf(" AVX2"); + else if (use_sse42) + printf(" SSE4.2"); + else if (use_sse2) + printf(" SSE2"); + if (use_vaes) + printf(" VAES"); + else if (use_aes) + printf(" AES"); + if (use_sha) + printf(" SHA"); + } + printf("...\n\n"); + + return true; +} + +void show_version_and_exit(void) { + printf("\n built on " __DATE__ #ifdef _MSC_VER " with VC++ 2013\n"); #elif 
defined(__GNUC__) " with GCC"); - printf(" %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__); + printf(" %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__); #endif - printf(" features:" + printf(" features:" #if defined(USE_ASM) && defined(__i386__) - " i386" + " i386" #endif #if defined(USE_ASM) && defined(__x86_64__) - " x86_64" + " x86_64" #endif #if defined(USE_ASM) && (defined(__i386__) || defined(__x86_64__)) - " SSE2" + " SSE2" #endif #if defined(__x86_64__) && defined(USE_AVX) - " AVX" + " AVX" #endif #if defined(__x86_64__) && defined(USE_AVX2) - " AVX2" + " AVX2" #endif #if defined(__x86_64__) && defined(USE_XOP) - " XOP" + " XOP" #endif #if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__) - " ARM" -#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ - defined(__ARM_ARCH_5TEJ__) || defined(__ARM_ARCH_6__) || \ - defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || \ - defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6T2__) || \ - defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \ - defined(__ARM_ARCH_7__) || \ - defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || \ - defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__) - " ARMv5E" + " ARM" +#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ + defined(__ARM_ARCH_5TEJ__) || defined(__ARM_ARCH_6__) || \ + defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || \ + defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6T2__) || \ + defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \ + defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ + defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || \ + defined(__ARM_ARCH_7EM__) + " ARMv5E" #endif #if defined(__ARM_NEON__) - " NEON" + " NEON" #endif #endif - "\n\n"); + "\n\n"); - /* dependencies versions */ - printf("%s\n", curl_version()); + printf("%s\n", curl_version()); #ifdef JANSSON_VERSION - printf("jansson/%s ", JANSSON_VERSION); + printf("jansson/%s ", JANSSON_VERSION); #endif #ifdef 
PTW32_VERSION - printf("pthreads/%d.%d.%d.%d ", PTW32_VERSION); + printf("pthreads/%d.%d.%d.%d ", PTW32_VERSION); #endif - printf("\n"); - exit(0); + printf("\n"); + exit(0); } +void show_usage_and_exit(int status) { + if (status) + fprintf(stderr, "Try `--help' for more information.\n"); + // fprintf(stderr, "Try `" PACKAGE_NAME " --help' for more + // information.\n"); + else + printf(usage); + exit(status); +} + +void strhide(char *s) { + if (*s) + *s++ = 'x'; + while (*s) + *s++ = '\0'; +} + +void parse_arg(int key, char *arg) { + char *p; + int v, i; + uint64_t ul; + double d; + + switch (key) { + case 'a': // algo + get_algo_alias(&arg); + for (i = 1; i < ALGO_COUNT; i++) { + v = (int)strlen(algo_names[i]); + if (v && !strncasecmp(arg, algo_names[i], v)) { + if (arg[v] == '\0') { + opt_algo = (enum algos)i; + break; + } + if (arg[v] == ':') { + char *ep; + v = strtol(arg + v + 1, &ep, 10); + if (*ep || v < 2) + continue; + opt_algo = (enum algos)i; + opt_param_n = v; + break; + } + } + } + if (i == ALGO_COUNT) { + applog(LOG_ERR, "Unknown algo: %s", arg); + show_usage_and_exit(1); + } + break; + + case 'b': // api-bind + opt_api_enabled = true; + p = strstr(arg, ":"); + if (p) { + /* ip:port */ + if (p - arg > 0) { + opt_api_allow = strdup(arg); + opt_api_allow[p - arg] = '\0'; + } + opt_api_listen = atoi(p + 1); + } else if (arg && strstr(arg, ".")) { + /* ip only */ + free(opt_api_allow); + opt_api_allow = strdup(arg); + opt_api_listen = default_api_listen; + } else if (arg) { + /* port or 0 to disable */ + opt_api_allow = default_api_allow; + opt_api_listen = atoi(arg); + } + break; + case 1030: // api-remote + opt_api_remote = 1; + break; + case 'B': // background + opt_background = true; + use_colors = false; + break; + case 'c': { // config + json_error_t err; + json_t *config; + + if (arg && strstr(arg, "://")) + config = json_load_url(arg, &err); + else + config = JSON_LOADF(arg, &err); + if (!json_is_object(config)) { + if (err.line < 0) + 
fprintf(stderr, "%s\n", err.text); + else + fprintf(stderr, "%s:%d: %s\n", arg, err.line, err.text); + } else { + parse_config(config, arg); + json_decref(config); + } + break; + } -void show_usage_and_exit(int status) -{ - if (status) - fprintf(stderr, "Try `--help' for more information.\n"); -// fprintf(stderr, "Try `" PACKAGE_NAME " --help' for more information.\n"); - else - printf(usage); - exit(status); -} - -void strhide(char *s) -{ - if (*s) *s++ = 'x'; - while (*s) *s++ = '\0'; -} - -void parse_arg(int key, char *arg ) -{ - char *p; - int v, i; - uint64_t ul; - double d; - - switch(key) - { - case 'a': - get_algo_alias( &arg ); - for (i = 1; i < ALGO_COUNT; i++) - { - v = (int) strlen(algo_names[i]); - if (v && !strncasecmp(arg, algo_names[i], v)) - { - if (arg[v] == '\0') - { - opt_algo = (enum algos) i; - break; - } - if (arg[v] == ':') - { - char *ep; - v = strtol(arg+v+1, &ep, 10); - if (*ep || v < 2) - continue; - opt_algo = (enum algos) i; - opt_scrypt_n = v; - break; - } - } - } - if (i == ALGO_COUNT) - { - applog(LOG_ERR,"Unknown algo: %s",arg); - show_usage_and_exit(1); - } - break; - - case 'b': - p = strstr(arg, ":"); - if (p) { - /* ip:port */ - if (p - arg > 0) { - free(opt_api_allow); - opt_api_allow = strdup(arg); - opt_api_allow[p - arg] = '\0'; - } - opt_api_listen = atoi(p + 1); - } - else if (arg && strstr(arg, ".")) { - /* ip only */ - free(opt_api_allow); - opt_api_allow = strdup(arg); - } - else if (arg) { - /* port or 0 to disable */ - opt_api_listen = atoi(arg); - } - break; - case 1030: /* --api-remote */ - opt_api_remote = 1; - break; - case 'B': - opt_background = true; - use_colors = false; - break; - case 'c': { - json_error_t err; - json_t *config; - - if (arg && strstr(arg, "://")) - config = json_load_url(arg, &err); - else - config = JSON_LOADF(arg, &err); - if (!json_is_object(config)) - { - if (err.line < 0) - fprintf(stderr, "%s\n", err.text); - else - fprintf(stderr, "%s:%d: %s\n", - arg, err.line, err.text); - } - else 
- { - parse_config(config, arg); - json_decref(config); - } - break; - } - case 'q': - opt_quiet = true; - break; - case 'D': - opt_debug = true; - break; - case 'p': - free(rpc_pass); - rpc_pass = strdup(arg); - strhide(arg); - break; - case 'P': - opt_protocol = true; - break; - case 'r': - v = atoi(arg); - if (v < -1 || v > 9999) /* sanity check */ - show_usage_and_exit(1); - opt_retries = v; - break; - case 'R': - v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ - show_usage_and_exit(1); - opt_fail_pause = v; - break; - case 's': - v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ - show_usage_and_exit(1); - opt_scantime = v; - break; - case 'T': - v = atoi(arg); - if (v < 1 || v > 99999) /* sanity check */ - show_usage_and_exit(1); - opt_timeout = v; - break; - case 't': - v = atoi(arg); - if (v < 0 || v > 9999) /* sanity check */ - show_usage_and_exit(1); - opt_n_threads = v; - break; - case 'u': - free(rpc_user); - rpc_user = strdup(arg); - break; - case 'o': { /* --url */ - char *ap, *hp; - ap = strstr(arg, "://"); - ap = ap ? 
ap + 3 : arg; - hp = strrchr(arg, '@'); - if (hp) { - *hp = '\0'; - p = strchr(ap, ':'); - if (p) { - free(rpc_userpass); - rpc_userpass = strdup(ap); - free(rpc_user); - rpc_user = (char*) calloc(p - ap + 1, 1); - strncpy(rpc_user, ap, p - ap); - free(rpc_pass); - rpc_pass = strdup(++p); - if (*p) *p++ = 'x'; - v = (int) strlen(hp + 1) + 1; - memmove(p + 1, hp + 1, v); - memset(p + v, 0, hp - p); - hp = p; - } else { - free(rpc_user); - rpc_user = strdup(ap); - } - *hp++ = '@'; - } else - hp = ap; - if (ap != arg) { - if (strncasecmp(arg, "http://", 7) && - strncasecmp(arg, "https://", 8) && - strncasecmp(arg, "stratum+tcp://", 14)) { - fprintf(stderr, "unknown protocol -- '%s'\n", arg); - show_usage_and_exit(1); - } - free(rpc_url); - rpc_url = strdup(arg); - strcpy(rpc_url + (ap - arg), hp); - short_url = &rpc_url[ap - arg]; - } else { - if (*hp == '\0' || *hp == '/') { - fprintf(stderr, "invalid URL -- '%s'\n", - arg); - show_usage_and_exit(1); - } - free(rpc_url); - rpc_url = (char*) malloc( strlen(hp) + 15 ); - sprintf( rpc_url, "stratum+tcp://%s", hp ); - short_url = &rpc_url[ sizeof("stratum+tcp://") - 1 ]; - } - have_stratum = !opt_benchmark && !strncasecmp(rpc_url, "stratum", 7); - break; - } - case 'O': /* --userpass */ - p = strchr(arg, ':'); - if (!p) { - fprintf(stderr, "invalid username:password pair -- '%s'\n", arg); - show_usage_and_exit(1); - } - free(rpc_userpass); - rpc_userpass = strdup(arg); - free(rpc_user); - rpc_user = (char*) calloc(p - arg + 1, 1); - strncpy(rpc_user, arg, p - arg); - free(rpc_pass); - rpc_pass = strdup(++p); - strhide(p); - break; - case 'x': /* --proxy */ - if (!strncasecmp(arg, "socks4://", 9)) - opt_proxy_type = CURLPROXY_SOCKS4; - else if (!strncasecmp(arg, "socks5://", 9)) - opt_proxy_type = CURLPROXY_SOCKS5; + // debug overrides quiet + case 'q': // quiet + if (!(opt_debug || opt_protocol)) + opt_quiet = true; + break; + case 'D': // debug + opt_debug = true; + opt_quiet = false; + break; + case 'p': // pass + 
free(rpc_pass); + rpc_pass = strdup(arg); + strhide(arg); + break; + case 'P': // protocol + opt_protocol = true; + opt_quiet = false; + break; + case 'r': // retries + v = atoi(arg); + if (v < -1 || v > 9999) /* sanity check */ + show_usage_and_exit(1); + opt_retries = v; + break; + case 1025: // retry-pause + v = atoi(arg); + if (v < 1 || v > 9999) /* sanity check */ + show_usage_and_exit(1); + opt_fail_pause = v; + break; + case 's': // scantime + v = atoi(arg); + if (v < 1 || v > 9999) /* sanity check */ + show_usage_and_exit(1); + opt_scantime = v; + break; + case 'T': // timeout + v = atoi(arg); + if (v < 1 || v > 99999) /* sanity check */ + show_usage_and_exit(1); + opt_timeout = v; + break; + case 't': // threads + v = atoi(arg); + if (v < 0 || v > 9999) /* sanity check */ + show_usage_and_exit(1); + opt_n_threads = v; + break; + case 'u': // user + free(rpc_user); + rpc_user = strdup(arg); + printf("rpc_user: %s\n", rpc_user); + break; + case 'o': // url + { + char *ap, *hp; + ap = strstr(arg, "://"); + ap = ap ? 
ap + 3 : arg; + hp = strrchr(arg, '@'); + if (hp) { + *hp = '\0'; + p = strchr(ap, ':'); + if (p) { + free(rpc_userpass); + rpc_userpass = strdup(ap); + free(rpc_user); + rpc_user = (char *)calloc(p - ap + 1, 1); + strncpy(rpc_user, ap, p - ap); + free(rpc_pass); + rpc_pass = strdup(++p); + if (*p) + *p++ = 'x'; + v = (int)strlen(hp + 1) + 1; + memmove(p + 1, hp + 1, v); + memset(p + v, 0, hp - p); + hp = p; + } else { + free(rpc_user); + rpc_user = strdup(ap); + } + *hp++ = '@'; + } else + hp = ap; + if (ap != arg) { + if (strncasecmp(arg, "http://", 7) && strncasecmp(arg, "https://", 8) && + strncasecmp(arg, "stratum+tcp://", 14) && + strncasecmp(arg, "stratum+tcps://", 15)) { + fprintf(stderr, "unknown protocol -- '%s'\n", arg); + show_usage_and_exit(1); + } + free(rpc_url); + rpc_url = strdup(arg); + strcpy(rpc_url + (ap - arg), hp); + short_url = &rpc_url[ap - arg]; + printf("rpc_url: %s\n", rpc_url); + printf("rpc_urlshort: %s\n", short_url); + } else { + if (*hp == '\0' || *hp == '/') { + fprintf(stderr, "invalid URL -- '%s'\n", arg); + show_usage_and_exit(1); + } + free(rpc_url); + rpc_url = (char *)malloc(strlen(hp) + 15); + printf("rpc_url: %s\n", rpc_url); + sprintf(rpc_url, "stratum+tcp://%s", hp); + short_url = &rpc_url[sizeof("stratum+tcp://") - 1]; + } + have_stratum = !opt_benchmark && !strncasecmp(rpc_url, "stratum", 7); + break; + } + case 'O': // userpass + p = strchr(arg, ':'); + if (!p) { + fprintf(stderr, "invalid username:password pair -- '%s'\n", arg); + show_usage_and_exit(1); + } + free(rpc_userpass); + rpc_userpass = strdup(arg); + free(rpc_user); + rpc_user = (char *)calloc(p - arg + 1, 1); + strncpy(rpc_user, arg, p - arg); + free(rpc_pass); + rpc_pass = strdup(++p); + printf("rpc_userpass: %s\n", rpc_userpass); + strhide(p); + break; + case 'x': // proxy + if (!strncasecmp(arg, "socks4://", 9)) + opt_proxy_type = CURLPROXY_SOCKS4; + else if (!strncasecmp(arg, "socks5://", 9)) + opt_proxy_type = CURLPROXY_SOCKS5; #if LIBCURL_VERSION_NUM 
>= 0x071200 - else if (!strncasecmp(arg, "socks4a://", 10)) - opt_proxy_type = CURLPROXY_SOCKS4A; - else if (!strncasecmp(arg, "socks5h://", 10)) - opt_proxy_type = CURLPROXY_SOCKS5_HOSTNAME; + else if (!strncasecmp(arg, "socks4a://", 10)) + opt_proxy_type = CURLPROXY_SOCKS4A; + else if (!strncasecmp(arg, "socks5h://", 10)) + opt_proxy_type = CURLPROXY_SOCKS5_HOSTNAME; #endif - else - opt_proxy_type = CURLPROXY_HTTP; - free(opt_proxy); - opt_proxy = strdup(arg); - break; - case 1001: - free(opt_cert); - opt_cert = strdup(arg); - break; - case 1002: - use_colors = false; - break; - case 1003: - want_longpoll = false; - break; - case 1005: - opt_benchmark = true; - want_longpoll = false; - want_stratum = false; - have_stratum = false; - break; - case 1006: -// print_hash_tests(); - exit(0); - case 1007: - want_stratum = false; - opt_extranonce = false; - break; - case 1008: - opt_time_limit = atoi(arg); - break; - case 1009: - opt_redirect = false; - break; - case 1010: - allow_getwork = false; - break; - case 1011: - have_gbt = false; - break; - case 1012: - opt_extranonce = false; - break; - case 1013: - opt_showdiff = false; - break; - case 1014: // hash-meter - opt_hash_meter = true; - break; - case 1016: /* --coinbase-addr */ - pk_script_size = address_to_script(pk_script, sizeof(pk_script), arg); - if (!pk_script_size) { - fprintf(stderr, "invalid address -- '%s'\n", arg); - show_usage_and_exit(1); - } - break; - case 1015: /* --coinbase-sig */ - if (strlen(arg) + 1 > sizeof(coinbase_sig)) { - fprintf(stderr, "coinbase signature too long\n"); - show_usage_and_exit(1); - } - strcpy(coinbase_sig, arg); - break; - case 'f': - d = atof(arg); - if (d == 0.) /* --diff-factor */ - show_usage_and_exit(1); - opt_diff_factor = d; - break; - case 'm': - d = atof(arg); - if (d == 0.) 
/* --diff-multiplier */ - show_usage_and_exit(1); - opt_diff_factor = 1.0/d; - break; - case 'S': - use_syslog = true; - use_colors = false; - break; - case 1020: - p = strstr(arg, "0x"); - if ( p ) - ul = strtoull( p, NULL, 16 ); - else - ul = atoll( arg ); -// if ( ul > ( 1ULL << num_cpus ) - 1ULL ) -// ul = -1LL; + else + opt_proxy_type = CURLPROXY_HTTP; + free(opt_proxy); + opt_proxy = strdup(arg); + break; + case 1001: // cert + free(opt_cert); + opt_cert = strdup(arg); + break; + case 1002: // no-color + use_colors = false; + break; + case 1003: // no-longpoll + want_longpoll = false; + break; + case 1005: // benchmark + opt_benchmark = true; + want_longpoll = false; + want_stratum = false; + have_stratum = false; + break; + case 1105: // benchmark + opt_benchmark = true; + opt_benchmark_extended = true; + want_longpoll = false; + want_stratum = false; + have_stratum = false; + break; + case 1006: // cputest + // print_hash_tests(); + exit(0); + case 1007: // no-stratum + want_stratum = false; + opt_extranonce = false; + break; + case 1008: // time-limit + opt_time_limit = atoi(arg); + break; + case 1009: // no-redirect + opt_redirect = false; + break; + case 1010: // no-getwork + allow_getwork = false; + break; + case 1011: // no-gbt + have_gbt = false; + break; + case 1012: // no-extranonce + opt_extranonce = false; + break; + case 1014: // hash-meter + opt_hash_meter = true; + break; + case 1016: /* --coinbase-addr */ + if (arg) + coinbase_address = strdup(arg); + break; + case 1015: /* --coinbase-sig */ + if (strlen(arg) + 1 > sizeof(coinbase_sig)) { + fprintf(stderr, "coinbase signature too long\n"); + show_usage_and_exit(1); + } + strcpy(coinbase_sig, arg); + break; + case 'f': + d = atof(arg); + if (d == 0.) /* --diff-factor */ + show_usage_and_exit(1); + opt_diff_factor = d; + break; + case 'm': + d = atof(arg); + if (d == 0.) 
/* --diff-multiplier */ + show_usage_and_exit(1); + opt_diff_factor = 1.0 / d; + break; +#ifdef HAVE_SYSLOG_H + case 'S': // syslog + use_syslog = true; + use_colors = false; + break; +#endif + case 1020: // cpu-affinity + p = strstr(arg, "0x"); + if (p) + ul = strtoull(p, NULL, 16); + else + ul = atoll(arg); #if AFFINITY_USES_UINT128 -// replicate the low 64 bits to make a full 128 bit mask if there are more -// than 64 CPUs, otherwise zero extend the upper half. - opt_affinity = (uint128_t)ul; - if ( num_cpus > 64 ) - opt_affinity = (opt_affinity << 64 ) | (uint128_t)ul; + // replicate the low 64 bits to make a full 128 bit mask if there are more + // than 64 CPUs, otherwise zero extend the upper half. + opt_affinity = (uint128_t)ul; + if (num_cpus > 64) + opt_affinity |= opt_affinity << 64; #else - opt_affinity = ul; + opt_affinity = ul; #endif - break; - case 1021: - v = atoi(arg); - if (v < 0 || v > 5) /* sanity check */ - show_usage_and_exit(1); - opt_priority = v; - break; - case 1060: // max-temp - d = atof(arg); - opt_max_temp = d; - break; - case 1061: // max-diff - d = atof(arg); - opt_max_diff = d; - break; - case 1062: // max-rate - d = atof(arg); - p = strstr(arg, "K"); - if (p) d *= 1e3; - p = strstr(arg, "M"); - if (p) d *= 1e6; - p = strstr(arg, "G"); - if (p) d *= 1e9; - opt_max_rate = d; - break; - case 1024: - opt_randomize = true; - break; - case 'V': - show_version_and_exit(); - case 'h': - show_usage_and_exit(0); - default: - show_usage_and_exit(1); - } + break; + case 1021: // cpu-priority + v = atoi(arg); + if (v < 0 || v > 5) /* sanity check */ + show_usage_and_exit(1); + // option is deprecated, show warning + applog(LOG_WARNING, + "High priority mining threads may cause system instability"); + opt_priority = v; + break; + case 'N': // N parameter for various scrypt algos + d = atoi(arg); + opt_param_n = d; + break; + case 'R': // R parameter for various scrypt algos + d = atoi(arg); + opt_param_r = d; + break; + case 'K': // Client key 
for various algos + free(opt_param_key); + opt_param_key = strdup(arg); + break; + case 1060: // max-temp + d = atof(arg); + opt_max_temp = d; + break; + case 1061: // max-diff + d = atof(arg); + opt_max_diff = d; + break; + case 1062: // max-rate + d = atof(arg); + p = strstr(arg, "K"); + if (p) + d *= 1e3; + p = strstr(arg, "M"); + if (p) + d *= 1e6; + p = strstr(arg, "G"); + if (p) + d *= 1e9; + opt_max_rate = d; + break; + case 1024: + opt_randomize = true; + break; + case 1027: // data-file + opt_data_file = strdup(arg); + break; + case 1028: // verify + opt_verify = true; + break; + case 'V': + display_cpu_capability(); + exit(0); + case 'h': + show_usage_and_exit(0); + + default: + show_usage_and_exit(1); + } } -void parse_config(json_t *config, char *ref) -{ - int i; - json_t *val; - - for (i = 0; i < ARRAY_SIZE(options); i++) { - if (!options[i].name) - break; - - val = json_object_get(config, options[i].name); - if (!val) - continue; - if (options[i].has_arg && json_is_string(val)) { - char *s = strdup(json_string_value(val)); - if (!s) - break; - parse_arg(options[i].val, s); - free(s); - } - else if (options[i].has_arg && json_is_integer(val)) { - char buf[16]; - sprintf(buf, "%d", (int)json_integer_value(val)); - parse_arg(options[i].val, buf); - } - else if (options[i].has_arg && json_is_real(val)) { - char buf[16]; - sprintf(buf, "%f", json_real_value(val)); - parse_arg(options[i].val, buf); - } - else if (!options[i].has_arg) { - if (json_is_true(val)) - parse_arg(options[i].val, ""); - } - else - applog(LOG_ERR, "JSON option %s invalid", - options[i].name); - } +void parse_config(json_t *config, char *ref) { + int i; + json_t *val; + + for (i = 0; i < ARRAY_SIZE(options); i++) { + if (!options[i].name) + break; + + val = json_object_get(config, options[i].name); + if (!val) + continue; + if (options[i].has_arg && json_is_string(val)) { + char *s = strdup(json_string_value(val)); + if (!s) + break; + parse_arg(options[i].val, s); + free(s); + } else 
if (options[i].has_arg && json_is_integer(val)) { + char buf[16]; + sprintf(buf, "%d", (int)json_integer_value(val)); + parse_arg(options[i].val, buf); + } else if (options[i].has_arg && json_is_real(val)) { + char buf[16]; + sprintf(buf, "%f", json_real_value(val)); + parse_arg(options[i].val, buf); + } else if (!options[i].has_arg) { + if (json_is_true(val)) + parse_arg(options[i].val, ""); + } else + applog(LOG_ERR, "JSON option %s invalid", options[i].name); + } } -static void parse_cmdline(int argc, char *argv[]) -{ - int key; +static void parse_cmdline(int argc, char *argv[]) { + int key; - while (1) - { + while (1) { #if HAVE_GETOPT_LONG - key = getopt_long(argc, argv, short_options, options, NULL); + key = getopt_long(argc, argv, short_options, options, NULL); #else - key = getopt(argc, argv, short_options); + key = getopt(argc, argv, short_options); #endif - if (key < 0) - break; - - parse_arg(key, optarg); - } - if (optind < argc) - { - fprintf(stderr, "%s: unsupported non-option argument -- '%s'\n", - argv[0], argv[optind]); - show_usage_and_exit(1); - } + if (key < 0) + break; + + parse_arg(key, optarg); + } + if (optind < argc) { + fprintf(stderr, "%s: unsupported non-option argument -- '%s'\n", argv[0], + argv[optind]); + show_usage_and_exit(1); + } } #ifndef WIN32 -static void signal_handler(int sig) -{ - switch (sig) { - case SIGHUP: - applog(LOG_INFO, "SIGHUP received"); - break; - case SIGINT: - applog(LOG_INFO, "SIGINT received, exiting"); - proper_exit(0); - break; - case SIGTERM: - applog(LOG_INFO, "SIGTERM received, exiting"); - proper_exit(0); - break; - } +static void signal_handler(int sig) { + switch (sig) { + case SIGHUP: + applog(LOG_INFO, "SIGHUP received"); + break; + case SIGINT: + applog(LOG_INFO, "SIGINT received, exiting"); + proper_exit(0); + break; + case SIGTERM: + applog(LOG_INFO, "SIGTERM received, exiting"); + proper_exit(0); + break; + } } #else -BOOL WINAPI ConsoleHandler(DWORD dwType) -{ - switch (dwType) { - case 
CTRL_C_EVENT: - applog(LOG_INFO, "CTRL_C_EVENT received, exiting"); - proper_exit(0); - break; - case CTRL_BREAK_EVENT: - applog(LOG_INFO, "CTRL_BREAK_EVENT received, exiting"); - proper_exit(0); - break; - default: - return false; - } - return true; +BOOL WINAPI ConsoleHandler(DWORD dwType) { + switch (dwType) { + case CTRL_C_EVENT: + applog(LOG_INFO, "CTRL_C_EVENT received, exiting"); + proper_exit(0); + break; + case CTRL_BREAK_EVENT: + applog(LOG_INFO, "CTRL_BREAK_EVENT received, exiting"); + proper_exit(0); + break; + default: + return false; + } + return true; } #endif -static int thread_create(struct thr_info *thr, void* func) -{ - int err = 0; - pthread_attr_init(&thr->attr); - err = pthread_create(&thr->pth, &thr->attr, func, thr); - pthread_attr_destroy(&thr->attr); - return err; -} - -static void show_credits() -{ - printf("\n ********** "PACKAGE_NAME" "PACKAGE_VERSION" *********** \n"); - printf(" A CPU miner with multi algo support and optimized for CPUs\n"); - printf(" with AES_NI and AVX2 and SHA extensions.\n"); - printf(" BTC donation address: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT\n\n"); -} - -bool check_cpu_capability () -{ - char cpu_brand[0x40]; - bool cpu_has_sse2 = has_sse2(); - bool cpu_has_aes = has_aes_ni(); - bool cpu_has_sse42 = has_sse42(); - bool cpu_has_avx = has_avx1(); - bool cpu_has_avx2 = has_avx2(); - bool cpu_has_sha = has_sha(); - bool cpu_has_avx512 = has_avx512f(); - bool sw_has_aes = false; - bool sw_has_sse42 = false; - bool sw_has_avx = false; - bool sw_has_avx2 = false; - bool sw_has_avx512 = false; - bool sw_has_sha = false; - set_t algo_features = algo_gate.optimizations; - bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features ); - bool algo_has_aes = set_incl( AES_OPT, algo_features ); - bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features ); - bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features ); - bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features ); - bool algo_has_sha = set_incl( SHA_OPT, algo_features 
); - bool use_aes; - bool use_sse2; - bool use_sse42; - bool use_avx2; - bool use_avx512; - bool use_sha; - bool use_none; - - #ifdef __AES__ - sw_has_aes = true; - #endif - #ifdef __SSE4_2__ - sw_has_sse42 = true; - #endif - #ifdef __AVX__ - sw_has_avx = true; - #endif - #ifdef __AVX2__ - sw_has_avx2 = true; - #endif - #ifdef __AVX512F__ - sw_has_avx512 = true; - #endif - #ifdef __SHA__ - sw_has_sha = true; - #endif - - #if !((__AES__) || (__SSE2__)) - printf("Neither __AES__ nor __SSE2__ defined.\n"); - #endif - - cpu_brand_string( cpu_brand ); - printf( "CPU: %s.\n", cpu_brand ); - - printf("SW built on " __DATE__ - #ifdef _MSC_VER - " with VC++ 2013\n"); - #elif defined(__GNUC__) - " with GCC"); - printf(" %d.%d.%d.\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__); - #else - printf(".\n"); - #endif - - printf("CPU features:"); - if ( cpu_has_sse2 ) printf( " SSE2" ); - if ( cpu_has_aes ) printf( " AES" ); - if ( cpu_has_sse42 ) printf( " SSE4.2" ); - if ( cpu_has_avx ) printf( " AVX" ); - if ( cpu_has_avx2 ) printf( " AVX2" ); - if ( cpu_has_avx512 ) printf( " AVX512" ); - if ( cpu_has_sha ) printf( " SHA" ); - - printf(".\nSW features: SSE2"); - if ( sw_has_aes ) printf( " AES" ); - if ( sw_has_sse42 ) printf( " SSE4.2" ); - if ( sw_has_avx ) printf( " AVX" ); - if ( sw_has_avx2 ) printf( " AVX2" ); - if ( sw_has_avx512 ) printf( " AVX512" ); - if ( sw_has_sha ) printf( " SHA" ); - - - printf(".\nAlgo features:"); - if ( algo_features == EMPTY_SET ) printf( " None" ); - else - { - if ( algo_has_sse2 ) printf( " SSE2" ); - if ( algo_has_aes ) printf( " AES" ); - if ( algo_has_sse42 ) printf( " SSE4.2" ); - if ( algo_has_avx2 ) printf( " AVX2" ); - if ( algo_has_avx512 ) printf( " AVX512" ); - if ( algo_has_sha ) printf( " SHA" ); - } - printf(".\n"); - - // Check for CPU and build incompatibilities - if ( !cpu_has_sse2 ) - { - printf( "A CPU with SSE2 is required to use cpuminer-opt\n" ); - return false; - } - if ( sw_has_avx2 && !( cpu_has_avx2 && 
cpu_has_aes ) ) - { - printf( "The SW build requires a CPU with AES and AVX2!\n" ); - return false; - } - if ( sw_has_sse42 && !cpu_has_sse42 ) - { - printf( "The SW build requires a CPU with SSE4.2!\n" ); - return false; - } - if ( sw_has_aes && !cpu_has_aes ) - { - printf( "The SW build requires a CPU with AES!\n" ); - return false; - } - if ( sw_has_sha && !cpu_has_sha ) - { - printf( "The SW build requires a CPU with SHA!\n" ); - return false; - } - - // Determine mining options - use_sse2 = cpu_has_sse2 && algo_has_sse2; - use_aes = cpu_has_aes && sw_has_aes && algo_has_aes; - use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42; - use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2; - use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; - use_sha = cpu_has_sha && sw_has_sha && algo_has_sha; - use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 || - use_sha ); - - // Display best options - printf( "Start mining with" ); - if ( use_none ) printf( " no optimizations" ); - else - { - if ( use_aes ) printf( " AES" ); - if ( use_avx512 ) printf( " AVX512" ); - else if ( use_avx2 ) printf( " AVX2" ); - else if ( use_sse42 ) printf( " SSE4.2" ); - else if ( use_sse2 ) printf( " SSE2" ); - if ( use_sha ) printf( " SHA" ); - } - printf( ".\n\n" ); - - return true; +static int thread_create(struct thr_info *thr, void *func) { + int err = 0; + pthread_attr_init(&thr->attr); + err = pthread_create(&thr->pth, &thr->attr, func, thr); + pthread_attr_destroy(&thr->attr); + return err; } void get_defconfig_path(char *out, size_t bufsize, char *argv0); -int main(int argc, char *argv[]) -{ - struct thr_info *thr; - long flags; - int i, err; +int main(int argc, char *argv[]) { + struct thr_info *thr; + long flags; + int i, err; - pthread_mutex_init(&applog_lock, NULL); + pthread_mutex_init(&applog_lock, NULL); - show_credits(); + show_credits(); - rpc_user = strdup(""); - rpc_pass = strdup(""); - opt_api_allow = strdup("127.0.0.1"); /* 
0.0.0.0 for all ips */ + rpc_user = strdup(""); + rpc_pass = strdup(""); - parse_cmdline(argc, argv); + parse_cmdline(argc, argv); #if defined(WIN32) // SYSTEM_INFO sysinfo; @@ -3476,292 +3565,342 @@ int main(int argc, char *argv[]) // What happens if GetActiveProcessorGroupCount called if groups not enabled? // Are Windows CPU Groups supported? -#if _WIN32_WINNT==0x0601 - num_cpus = 0; - num_cpugroups = GetActiveProcessorGroupCount(); - for( i = 0; i < num_cpugroups; i++ ) - { - int cpus = GetActiveProcessorCount(i); - num_cpus += cpus; - - if (opt_debug) - applog(LOG_DEBUG, "Found %d cpus on cpu group %d", cpus, i); - } +#if _WIN32_WINNT == 0x0601 + num_cpus = 0; + num_cpugroups = GetActiveProcessorGroupCount(); + for (i = 0; i < num_cpugroups; i++) { + int cpus = GetActiveProcessorCount(i); + num_cpus += cpus; + + if (opt_debug) + applog(LOG_DEBUG, "Found %d cpus on cpu group %d", cpus, i); + } #else - SYSTEM_INFO sysinfo; - GetSystemInfo(&sysinfo); - num_cpus = sysinfo.dwNumberOfProcessors; + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + num_cpus = sysinfo.dwNumberOfProcessors; #endif #elif defined(_SC_NPROCESSORS_CONF) - num_cpus = sysconf(_SC_NPROCESSORS_CONF); + num_cpus = sysconf(_SC_NPROCESSORS_CONF); #elif defined(CTL_HW) && defined(HW_NCPU) - int req[] = { CTL_HW, HW_NCPU }; - size_t len = sizeof(num_cpus); - sysctl(req, 2, &num_cpus, &len, NULL, 0); + int req[] = {CTL_HW, HW_NCPU}; + size_t len = sizeof(num_cpus); + sysctl(req, 2, &num_cpus, &len, NULL, 0); #else - num_cpus = 1; + num_cpus = 1; #endif - if (num_cpus < 1) - num_cpus = 1; + if (num_cpus < 1) + num_cpus = 1; + if (!opt_n_threads) + opt_n_threads = num_cpus; - if (!opt_n_threads) - opt_n_threads = num_cpus; + if (opt_algo == ALGO_NULL) { + fprintf(stderr, "%s: no algo supplied\n", argv[0]); + show_usage_and_exit(1); + } - if ( opt_algo == ALGO_NULL ) - { - fprintf(stderr, "%s: no algo supplied\n", argv[0]); - show_usage_and_exit(1); - } - if ( !opt_benchmark ) - { - if ( !short_url ) - 
{ - fprintf(stderr, "%s: no URL supplied\n", argv[0]); - show_usage_and_exit(1); - } -/* - if ( !rpc_url ) - { - // try default config file in binary folder - char defconfig[MAX_PATH] = { 0 }; - get_defconfig_path(defconfig, MAX_PATH, argv[0]); - if (strlen(defconfig)) + if (!register_algo_gate(opt_algo, &algo_gate)) + exit(1); + + if (!check_cpu_capability()) + exit(1); + + if (!opt_benchmark) { + if (!short_url) { + fprintf(stderr, "%s: no URL supplied\n", argv[0]); + show_usage_and_exit(1); + } + /* + if ( !rpc_url ) { - if (opt_debug) - applog(LOG_DEBUG, "Using config %s", defconfig); - parse_arg('c', defconfig); - parse_cmdline(argc, argv); - } - } - if ( !rpc_url ) - { - fprintf(stderr, "%s: no URL supplied\n", argv[0]); - show_usage_and_exit(1); - } -*/ - } - - if (!rpc_userpass) - { - rpc_userpass = (char*) malloc(strlen(rpc_user) + strlen(rpc_pass) + 2); - if (rpc_userpass) - sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass); - else - return 1; - } - - // All options must be set before starting the gate - if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1); - - // Initialize stats times and counters - memset( share_stats, 0, 2 * sizeof (struct share_stats_t) ); - gettimeofday( &last_submit_time, NULL ); - memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) ); - - if ( !check_cpu_capability() ) exit(1); - - pthread_mutex_init( &stats_lock, NULL ); - pthread_mutex_init( &g_work_lock, NULL ); - pthread_mutex_init( &rpc2_job_lock, NULL ); - pthread_mutex_init( &rpc2_login_lock, NULL ); - pthread_mutex_init( &stratum.sock_lock, NULL ); - pthread_mutex_init( &stratum.work_lock, NULL ); - - flags = !opt_benchmark && strncmp( rpc_url, "https:", 6 ) - ? 
( CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL ) - : CURL_GLOBAL_ALL; - if ( curl_global_init( flags ) ) - { - applog(LOG_ERR, "CURL initialization failed"); - return 1; - } + // try default config file in binary folder + char defconfig[MAX_PATH] = { 0 }; + get_defconfig_path(defconfig, MAX_PATH, argv[0]); + if (strlen(defconfig)) + { + if (opt_debug) + applog(LOG_DEBUG, "Using config %s", + defconfig); parse_arg('c', defconfig); parse_cmdline(argc, argv); + } + } + if ( !rpc_url ) + { + fprintf(stderr, "%s: no URL supplied\n", argv[0]); + show_usage_and_exit(1); + } + */ + } + + if (!rpc_userpass) { + rpc_userpass = (char *)malloc(strlen(rpc_user) + strlen(rpc_pass) + 2); + if (rpc_userpass) + sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass); + else + return 1; + } + + // All options must be set before starting the gate + // if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1); + + if (coinbase_address) { + pk_script_size = + address_to_script(pk_script, pk_buffer_size, coinbase_address); + if (!pk_script_size) { + applog(LOG_ERR, "Invalid coinbase address: '%s'", coinbase_address); + exit(0); + } + } + + // Initialize stats times and counters + memset(share_stats, 0, s_stats_size * sizeof(struct share_stats_t)); + gettimeofday(&last_submit_time, NULL); + memcpy(&five_min_start, &last_submit_time, sizeof(struct timeval)); + memcpy(&session_start, &last_submit_time, sizeof(struct timeval)); + + // if ( !check_cpu_capability() ) exit(1); + + pthread_mutex_init(&stats_lock, NULL); + pthread_rwlock_init(&g_work_lock, NULL); + pthread_mutex_init(&stratum.sock_lock, NULL); + pthread_mutex_init(&stratum.work_lock, NULL); + pthread_mutex_init(&dev_stratum.sock_lock, NULL); + pthread_mutex_init(&dev_stratum.work_lock, NULL); + pthread_cond_init(&sync_cond, NULL); + + flags = CURL_GLOBAL_ALL; + if (!opt_benchmark) + if (strncasecmp(rpc_url, "https:", 6) && + strncasecmp(rpc_url, "stratum+tcps://", 15)) + flags &= ~CURL_GLOBAL_SSL; + + if (curl_global_init(flags)) { + 
applog(LOG_ERR, "CURL initialization failed"); + return 1; + } #ifndef WIN32 - if (opt_background) - { - i = fork(); - if (i < 0) exit(1); - if (i > 0) exit(0); - i = setsid(); - if (i < 0) - applog(LOG_ERR, "setsid() failed (errno = %d)", errno); - i = chdir("/"); - if (i < 0) - applog(LOG_ERR, "chdir() failed (errno = %d)", errno); - signal(SIGHUP, signal_handler); - signal(SIGTERM, signal_handler); - } - /* Always catch Ctrl+C */ - signal(SIGINT, signal_handler); + if (opt_background) { + i = fork(); + if (i < 0) + exit(1); + if (i > 0) + exit(0); + i = setsid(); + if (i < 0) + applog(LOG_ERR, "setsid() failed (errno = %d)", errno); + i = chdir("/"); + if (i < 0) + applog(LOG_ERR, "chdir() failed (errno = %d)", errno); + signal(SIGHUP, signal_handler); + signal(SIGTERM, signal_handler); + } + /* Always catch Ctrl+C */ + signal(SIGINT, signal_handler); #else - SetConsoleCtrlHandler((PHANDLER_ROUTINE)ConsoleHandler, TRUE); - if (opt_background) - { - HWND hcon = GetConsoleWindow(); - if (hcon) { - // this method also hide parent command line window - ShowWindow(hcon, SW_HIDE); - } else { - HANDLE h = GetStdHandle(STD_OUTPUT_HANDLE); - CloseHandle(h); - FreeConsole(); - } - } - if (opt_priority > 0) - { - DWORD prio = NORMAL_PRIORITY_CLASS; - switch (opt_priority) { - case 1: - prio = BELOW_NORMAL_PRIORITY_CLASS; - break; - case 3: - prio = ABOVE_NORMAL_PRIORITY_CLASS; - break; - case 4: - prio = HIGH_PRIORITY_CLASS; - break; - case 5: - prio = REALTIME_PRIORITY_CLASS; - } - SetPriorityClass(GetCurrentProcess(), prio); - } + SetConsoleCtrlHandler((PHANDLER_ROUTINE)ConsoleHandler, TRUE); + if (opt_background) { + HWND hcon = GetConsoleWindow(); + if (hcon) { + // this method also hide parent command line window + ShowWindow(hcon, SW_HIDE); + } else { + HANDLE h = GetStdHandle(STD_OUTPUT_HANDLE); + CloseHandle(h); + FreeConsole(); + } + } + if (opt_priority > 0) { + DWORD prio = NORMAL_PRIORITY_CLASS; + switch (opt_priority) { + case 1: + prio = 
BELOW_NORMAL_PRIORITY_CLASS; + break; + case 3: + prio = ABOVE_NORMAL_PRIORITY_CLASS; + break; + case 4: + prio = HIGH_PRIORITY_CLASS; + break; + case 5: + prio = REALTIME_PRIORITY_CLASS; + } + SetPriorityClass(GetCurrentProcess(), prio); + } #endif - if ( num_cpus != opt_n_threads ) - applog( LOG_INFO,"%u CPU cores available, %u miner threads selected.", - num_cpus, opt_n_threads ); - -// To be reviewed - if ( opt_affinity != -1 ) - { - if ( num_cpus > 64 ) - { - applog(LOG_WARNING,"--cpu-affinity argument is not supported with more"); - applog(LOG_WARNING," than 64 CPUs, using default affinity."); - opt_affinity = -1; - } - else - { - if (!opt_quiet) - applog(LOG_DEBUG, "Binding process to cpu mask %x", opt_affinity); - affine_to_cpu_mask( -1, (unsigned long)opt_affinity ); - } - } + // To be confirmed with more than 64 cpus + if (opt_affinity != -1) { + if (!affinity_uses_uint128 && num_cpus > 64) { + applog(LOG_WARNING, + "Setting CPU affinity with more than 64 CPUs is only"); + applog(LOG_WARNING, "available on Linux. 
Using default affinity."); + opt_affinity = -1; + } + /* + else + { + affine_to_cpu_mask( -1, opt_affinity ); + if ( !opt_quiet ) + { + #if AFFINITY_USES_UINT128 + if ( num_cpus > 64 ) + applog(LOG_DEBUG, "Binding process to cpu mask %x", + u128_hi64( opt_affinity ), u128_lo64( opt_affinity ) + ); else applog(LOG_DEBUG, "Binding process to cpu mask %x", opt_affinity + ); #else applog(LOG_DEBUG, "Binding process to cpu mask %x", opt_affinity + ); #endif + } + } + */ + } + + if (!opt_quiet && (opt_n_threads < num_cpus)) { + char affinity_map[64]; + format_affinity_map(affinity_map, opt_affinity); + applog(LOG_INFO, "CPU affinity [%s]", affinity_map); + } #ifdef HAVE_SYSLOG_H - if (use_syslog) - openlog("cpuminer", LOG_PID, LOG_USER); + if (use_syslog) + openlog("cpuminer", LOG_PID, LOG_USER); #endif - work_restart = (struct work_restart*) calloc(opt_n_threads, sizeof(*work_restart)); - if (!work_restart) - return 1; - thr_info = (struct thr_info*) calloc(opt_n_threads + 4, sizeof(*thr)); - if (!thr_info) - return 1; - thr_hashrates = (double *) calloc(opt_n_threads, sizeof(double)); - if (!thr_hashrates) - return 1; - thr_hashcount = (double *) calloc(opt_n_threads, sizeof(double)); - if (!thr_hashcount) - return 1; - - /* init workio thread info */ - work_thr_id = opt_n_threads; - thr = &thr_info[work_thr_id]; - thr->id = work_thr_id; - thr->q = tq_new(); - if (!thr->q) - return 1; - - if ( rpc_pass && rpc_user ) - opt_stratum_stats = ( strstr( rpc_pass, "stats" ) != NULL ) - || ( strcmp( rpc_user, "benchmark" ) == 0 ); - - /* start work I/O thread */ - if (thread_create(thr, workio_thread)) - { - applog(LOG_ERR, "work thread create failed"); - return 1; - } - - /* ESET-NOD32 Detects these 2 thread_create... 
*/ - if (want_longpoll && !have_stratum) - { - /* init longpoll thread info */ - longpoll_thr_id = opt_n_threads + 1; - thr = &thr_info[longpoll_thr_id]; - thr->id = longpoll_thr_id; - thr->q = tq_new(); - if (!thr->q) - return 1; - /* start longpoll thread */ - err = thread_create(thr, longpoll_thread); - if (err) { - applog(LOG_ERR, "long poll thread create failed"); - return 1; - } - } - if (want_stratum) - { - /* init stratum thread info */ - stratum_thr_id = opt_n_threads + 2; - thr = &thr_info[stratum_thr_id]; - thr->id = stratum_thr_id; - thr->q = tq_new(); - if (!thr->q) - return 1; - /* start stratum thread */ - err = thread_create(thr, stratum_thread); - if (err) - { - applog(LOG_ERR, "stratum thread create failed"); - return 1; - } - if (have_stratum) - tq_push(thr_info[stratum_thr_id].q, strdup(rpc_url)); - } - - if (opt_api_listen) - { - /* api thread */ - api_thr_id = opt_n_threads + 3; - thr = &thr_info[api_thr_id]; - thr->id = api_thr_id; - thr->q = tq_new(); - if (!thr->q) - return 1; - err = thread_create(thr, api_thread); - if (err) { - applog(LOG_ERR, "api thread create failed"); - return 1; - } - } - - /* start mining threads */ - for (i = 0; i < opt_n_threads; i++) - { - thr = &thr_info[i]; - thr->id = i; - thr->q = tq_new(); - if (!thr->q) - return 1; - err = thread_create(thr, miner_thread); - if (err) { - applog(LOG_ERR, "thread %d create failed", i); - return 1; - } - } - - applog(LOG_INFO, "%d miner threads started, " - "using '%s' algorithm.", - opt_n_threads, - algo_names[opt_algo]); - - /* main loop - simply wait for workio thread to exit */ - pthread_join(thr_info[work_thr_id].pth, NULL); - applog(LOG_WARNING, "workio thread dead, exiting."); - return 0; + work_restart = + (struct work_restart *)calloc(opt_n_threads, sizeof(*work_restart)); + if (!work_restart) + return 1; + thr_info = (struct thr_info *)calloc(opt_n_threads + 4 + 1, sizeof(*thr)); + if (!thr_info) + return 1; + thr_hashrates = (double *)calloc(opt_n_threads, 
sizeof(double)); + if (!thr_hashrates) + return 1; + + /* init workio thread info */ + work_thr_id = opt_n_threads; + thr = &thr_info[work_thr_id]; + thr->id = work_thr_id; + thr->q = tq_new(); + if (!thr->q) + return 1; + + if (rpc_pass && rpc_user) + opt_stratum_stats = (strstr(rpc_pass, "stats") != NULL) || + (strcmp(rpc_user, "benchmark") == 0); + + /* start work I/O thread */ + if (thread_create(thr, workio_thread)) { + applog(LOG_ERR, "work thread create failed"); + return 1; + } + + /* ESET-NOD32 Detects these 2 thread_create... */ + if (want_longpoll && !have_stratum) { + if (opt_debug) + applog(LOG_INFO, "Creating long poll thread"); + + /* init longpoll thread info */ + longpoll_thr_id = opt_n_threads + 1; + thr = &thr_info[longpoll_thr_id]; + thr->id = longpoll_thr_id; + thr->q = tq_new(); + if (!thr->q) + return 1; + /* start longpoll thread */ + err = thread_create(thr, longpoll_thread); + if (err) { + applog(LOG_ERR, "Long poll thread create failed"); + return 1; + } + } + if (have_stratum) { + if (opt_debug) + applog(LOG_INFO, "Creating stratum thread"); + + /* init stratum thread info */ + stratum_thr_id = opt_n_threads + 2; + thr = &thr_info[stratum_thr_id]; + thr->id = stratum_thr_id; + thr->q = tq_new(); + stratum.dev = false; + if (!thr->q) + return 1; + /* start stratum thread */ + err = thread_create(thr, stratum_thread); + if (err) { + applog(LOG_ERR, "Stratum thread create failed"); + return 1; + } + if (have_stratum) + tq_push(thr_info[stratum_thr_id].q, strdup(rpc_url)); + + /* init dev stratum thread info */ + + struct timeval now; + gettimeofday(&now, NULL); + dev_start.tv_sec = now.tv_sec + first_dev.tv_sec; + dev_start.tv_usec = now.tv_usec + first_dev.tv_usec; + dev_stratum_thr_id = opt_n_threads + 4; + thr = &thr_info[dev_stratum_thr_id]; + thr->id = dev_stratum_thr_id; + thr->q = tq_new(); + dev_stratum.dev = true; + if (!thr->q) + return 1; + /* start stratum thread */ + err = thread_create(thr, dev_stratum_thread); + if (err) { + 
applog(LOG_ERR, "Stratum thread create failed"); + return 1; + } + if (have_stratum) { + tq_push(thr_info[dev_stratum_thr_id].q, strdup(rpc_url)); + } + } + + if (opt_api_enabled) { + if (opt_debug) + applog(LOG_INFO, "Creating API thread"); + + /* api thread */ + api_thr_id = opt_n_threads + 3; + thr = &thr_info[api_thr_id]; + thr->id = api_thr_id; + thr->q = tq_new(); + if (!thr->q) + return 1; + err = thread_create(thr, api_thread); + if (err) { + applog(LOG_ERR, "API thread create failed"); + return 1; + } + if (!opt_quiet) + applog(LOG_INFO, "API listnening to %s:%d", opt_api_allow, + opt_api_listen); + } + + /* start mining threads */ + for (i = 0; i < opt_n_threads; i++) { + usleep(5000); + thr = &thr_info[i]; + thr->id = i; + thr->q = tq_new(); + if (!thr->q) + return 1; + err = thread_create(thr, miner_thread); + if (err) { + applog(LOG_ERR, "Miner thread %d create failed", i); + return 1; + } + } + + applog(LOG_INFO, "%d of %d miner threads started using '%s' algorithm", + opt_n_threads, num_cpus, algo_names[opt_algo]); + + /* main loop - simply wait for workio thread to exit */ + pthread_join(thr_info[work_thr_id].pth, NULL); + applog(LOG_WARNING, "workio thread dead, exiting."); + return 0; } diff --git a/cpuminer.nsi b/cpuminer.nsi deleted file mode 100644 index d522bcc..0000000 --- a/cpuminer.nsi +++ /dev/null @@ -1,363 +0,0 @@ -; NSIS script (UTF-8) NSIS-3 Unicode -; Install - -; Unicode true -; SetCompressor lzma -RequestExecutionLevel Admin - -; -------------------- - -!include x64.nsh -!include FileFunc.nsh -!include WinMessages.nsh - -AllowRootDirInstall true - -; -------------------- -; LANG TABLES: 1 - -!define MINER_VERSION "1.1" -!define VERSION "${MINER_VERSION}.0.0" - -BrandingText "CPU Miner Install System" - -!define PROGRAM_NAME "CPU Miner" -!define PROGRAM_KEY "cpuminer" - -Name "cpuminer-multi v${MINER_VERSION}" -OutFile "${PROGRAM_KEY}-setup.exe" -Icon "res\setup.ico" -; Icon "res\${PROGRAM_KEY}.ico" -Caption "${PROGRAM_NAME}" - 
-VIProductVersion "${VERSION}" -VIAddVersionKey ProductName "${PROGRAM_NAME} - Setup" -VIAddVersionKey Comments "" -VIAddVersionKey CompanyName "Open Source" -VIAddVersionKey LegalCopyright "2015 - Open Source" -VIAddVersionKey FileDescription "${PROGRAM_NAME} - Setup" -VIAddVersionKey FileVersion "${MINER_VERSION}" -VIAddVersionKey ProductVersion "${MINER_VERSION}" -VIAddVersionKey InternalName "${PROGRAM_NAME}" -VIAddVersionKey LegalTrademarks "" -VIAddVersionKey OriginalFilename "${PROGRAM_KEY}.exe" - -!define NSIS_MAKENSIS64 -!ifdef NSIS_MAKENSIS64 - !define BITS 64 - InstallDir $PROGRAMFILES64\cpuminer-multi - !define RK_UNINSTALL "SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall\${PROGRAM_KEY}" -!else - !define BITS 32 - InstallDir $PROGRAMFILES32\cpuminer-multi - !define RK_UNINSTALL "SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall\${PROGRAM_KEY}" - ;!define RK_UNINSTALL "SOFTWARE\Wow6432Node\Microsoft\Windows\CurrentVersion\Uninstall\${PROGRAM_KEY}" -!endif - -# Test folders x86/x64 -# InstType "Custom" -# InstallDir $WINDIR\system32 - -; LANG: ${LANG_ENGLISH} -LangString LSTR_0 ${LANG_ENGLISH} "CPU Miner Install System" -LangString LSTR_3 ${LANG_ENGLISH} "Space available: " -LangString LSTR_4 ${LANG_ENGLISH} "Space required: " -LangString LSTR_5 ${LANG_ENGLISH} "Can't write: " -LangString LSTR_17 ${LANG_ENGLISH} "Error decompressing data! Corrupted installer?" -LangString LSTR_21 ${LANG_ENGLISH} "Extract: " -LangString LSTR_22 ${LANG_ENGLISH} "Extract: error writing to file " -LangString LSTR_25 ${LANG_ENGLISH} "Output folder: " -LangString LSTR_29 ${LANG_ENGLISH} "Skipped: " -LangString LSTR_30 ${LANG_ENGLISH} "Copy Details To Clipboard" -LangString LSTR_32 ${LANG_ENGLISH} "B" -LangString LSTR_33 ${LANG_ENGLISH} "K" -LangString LSTR_34 ${LANG_ENGLISH} "M" -LangString LSTR_35 ${LANG_ENGLISH} "G" -LangString LSTR_36 ${LANG_ENGLISH} "Choose Install Location" -LangString LSTR_37 ${LANG_ENGLISH} "Choose the folder in which to install ${PROGRAM_NAME}." 
-LangString LSTR_38 ${LANG_ENGLISH} "Installing" -LangString LSTR_39 ${LANG_ENGLISH} "Please wait while ${PROGRAM_NAME} is being installed." -LangString LSTR_40 ${LANG_ENGLISH} "Installation Complete" -LangString LSTR_41 ${LANG_ENGLISH} "Setup was completed successfully." -LangString LSTR_42 ${LANG_ENGLISH} "Installation Aborted" -LangString LSTR_43 ${LANG_ENGLISH} "Setup was not completed successfully." -LangString LSTR_44 ${LANG_ENGLISH} "MS Shell Dlg" -LangString LSTR_45 ${LANG_ENGLISH} "8" -LangString LSTR_46 ${LANG_ENGLISH} "Error opening file for writing: $\r$\n$\r$\n$0$\r$\n$\r$\nClick Abort to stop the installation,$\r$\nRetry to try again, or$\r$\nIgnore to skip this file." -LangString LSTR_48 ${LANG_ENGLISH} "Cancel" -LangString LSTR_49 ${LANG_ENGLISH} "Setup will install ${PROGRAM_NAME} in the following folder. To install in a different folder, click Browse and select another folder. $_CLICK" -LangString LSTR_50 ${LANG_ENGLISH} "Destination Folder" -LangString LSTR_51 ${LANG_ENGLISH} "B&rowse..." -LangString LSTR_52 ${LANG_ENGLISH} "Select the folder to install ${PROGRAM_NAME} in:" -LangString LSTR_53 ${LANG_ENGLISH} "< &Back" -LangString LSTR_54 ${LANG_ENGLISH} "&Install" -LangString LSTR_55 ${LANG_ENGLISH} "Click Install to start the installation." -LangString LSTR_56 ${LANG_ENGLISH} "Show &details" -LangString LSTR_57 ${LANG_ENGLISH} "Completed" -LangString LSTR_58 ${LANG_ENGLISH} "&Next >" -LangString LSTR_59 ${LANG_ENGLISH} "Click Next to continue." -LangString LSTR_60 ${LANG_ENGLISH} " " -LangString LSTR_61 ${LANG_ENGLISH} "&Close" - - -; -------------------- -; VARIABLES: - -Var _0_ -Var _1_ -Var _2_ -Var _3_ -Var _4_ -Var _5_ -Var _6_ - -Var DATADIR -Var REALINSTDIR - -; -------------------- -; PAGES: 3 - -; Page 0 -Page directory func_title_pre0 func_show0 func_leave0 /ENABLECANCEL -; DirVar $CMDLINE - DirText $(LSTR_49) $(LSTR_50) $(LSTR_51) $(LSTR_52) ; Setup will install ${PROGRAM_NAME} in the following folder.... 
- -; Page 1 -Page instfiles func_title_pre1 func_show1 func_leave1 - CompletedText $(LSTR_57) ; Completed - DetailsButtonText $(LSTR_56) ; Show &details - -/* -; Page 2 -Page COMPLETED -*/ - - -; -------------------- - -Function func_title_pre0 ; Page 0, Pre - SendMessage $_0_ ${WM_SETTEXT} 0 STR:$(LSTR_36) ; Choose Install Location - SendMessage $_2_ ${WM_SETTEXT} 0 STR:$(LSTR_37) ; Choose the folder in which to install ${PROGRAM_NAME}. -FunctionEnd - - -Function func_show0 ; Page 0, Show -; FindWindow $_12_ "#32770" "" $HWNDPARENT -; GetDlgItem $_13_ $_12_ 1006 -; GetDlgItem $_14_ $_12_ 1020 -; GetDlgItem $_15_ $_12_ 1019 -; GetDlgItem $_16_ $_12_ 1001 -; GetDlgItem $_17_ $_12_ 1023 -; GetDlgItem $_18_ $_12_ 1024 -FunctionEnd - - -Function func_leave0 ; Page 0, Leave -FunctionEnd - - -Function func_title_pre1 ; Page 1, Pre - SendMessage $_0_ ${WM_SETTEXT} 0 STR:$(LSTR_38) ; Installing - SendMessage $_2_ ${WM_SETTEXT} 0 STR:$(LSTR_39) ; Please wait while ${PROGRAM_NAME} is being installed. cpuminer-multi -FunctionEnd - - -Function func_show1 ; Page 1, Show -; FindWindow $_19_ "#32770" "" $HWNDPARENT -; GetDlgItem $_20_ $_19_ 1006 -; GetDlgItem $_21_ $_19_ 1004 -; GetDlgItem $_22_ $_19_ 1027 -; GetDlgItem $_23_ $_19_ 1016 -FunctionEnd - - -Function func_leave1 ; Page 1, Leave - IfAbort label_27 - SendMessage $_0_ ${WM_SETTEXT} 0 STR:$(LSTR_40) ; "Installation Complete" - SendMessage $_2_ ${WM_SETTEXT} 0 STR:$(LSTR_41) ; "Setup was completed successfully." - Goto label_29 -label_27: - SendMessage $_0_ ${WM_SETTEXT} 0 STR:$(LSTR_42) ; "Installation Aborted" - SendMessage $_2_ ${WM_SETTEXT} 0 STR:$(LSTR_43) ; "Setup was not completed successfully." 
-label_29: - IfAbort label_30 -label_30: -FunctionEnd - -Function .onInit - # `/SD IDYES' tells MessageBox to automatically choose IDYES if the installer is silent - # in this case, the installer can only be silent if the user used the /S switch or if - # you've uncommented line number 5 - # MessageBox MB_YESNO|MB_ICONQUESTION "Would you like the installer to be silent from now on?" \ - # /SD IDYES IDNO no IDYES yes - # yes: - # SetSilent silent - # Goto done - # no: - # SetSilent normal - - #SetSilent silent - - ReadRegStr $R0 HKLM ${RK_UNINSTALL} \ - "UninstallString" - StrCmp $R0 "" done - - DeleteRegKey HKLM ${RK_UNINSTALL} - ClearErrors - -done: - -FunctionEnd - - -Function .onGUIInit - GetDlgItem $_0_ $HWNDPARENT 1037 - CreateFont $_1_ $(LSTR_44) $(LSTR_45) 700 ; "MS Shell Dlg" 8 - SendMessage $_0_ ${WM_SETFONT} $_1_ 0 - GetDlgItem $_2_ $HWNDPARENT 1038 - SetCtlColors $_0_ "" 0xFFFFFF - SetCtlColors $_2_ "" 0xFFFFFF - GetDlgItem $_3_ $HWNDPARENT 1034 - SetCtlColors $_3_ "" 0xFFFFFF - GetDlgItem $_4_ $HWNDPARENT 1039 - SetCtlColors $_4_ "" 0xFFFFFF - GetDlgItem $_6_ $HWNDPARENT 1028 - SetCtlColors $_6_ /BRANDING "" - GetDlgItem $_5_ $HWNDPARENT 1256 - SetCtlColors $_5_ /BRANDING "" - SendMessage $_5_ ${WM_SETTEXT} 0 "STR:$(LSTR_0) " ; "CPU Miner Install System" -; GetDlgItem $_7_ $HWNDPARENT 1035 -; GetDlgItem $_8_ $HWNDPARENT 1045 -; GetDlgItem $_9_ $HWNDPARENT 1 -; GetDlgItem $_10_ $HWNDPARENT 2 -; GetDlgItem $_11_ $HWNDPARENT 3 -FunctionEnd - - -Function .onUserAbort -FunctionEnd - - -Section - - StrCpy $DATADIR "$APPDATA\${PROGRAM_KEY}" - StrCpy $REALINSTDIR "$INSTDIR" - ${If} ${RunningX64} - ${DisableX64FSRedirection} - ; StrCmp $INSTDIR "$WINDIR\system32" 0 +2 - ; StrCpy $INSTDIR "$WINDIR\sysnative" - SetRegView 64 - ${Else} - SetRegView 32 - ${Endif} - - SetOutPath "$INSTDIR" - - # call UserInfo plugin to get user info. 
The plugin puts the result in the stack - UserInfo::getAccountType - # pop the result from the stack into $0 - Pop $0 - - # If match, jump 3 lines down. - StrCmp $0 "Admin" +3 - MessageBox MB_OK "Installer requires admin rights: $0" - Return - - SetOverwrite on - File cpuminer-gw64.exe - File cpuminer-x64.exe - File cpuminer-conf.json - File /oname=LICENSE.txt LICENSE - File /oname=README.txt README.md - - SetOverwrite off - AllowSkipFiles on - File x64\Release\msvcr120.dll - - # Create the uninstaller - CreateDirectory "$DATADIR" - File "/oname=$DATADIR\cpuminer-conf.json" cpuminer-conf.json - WriteUninstaller "$DATADIR\cpuminer-uninst.exe" - - # Shortcuts (program + uninstaller) - CreateDirectory "$SMPROGRAMS\${PROGRAM_NAME}" - CreateShortCut /NoWorkingDir "$SMPROGRAMS\${PROGRAM_NAME}\${PROGRAM_NAME}.lnk" "$REALINSTDIR\cpuminer-gw64.exe" - CreateShortCut /NoWorkingDir "$SMPROGRAMS\${PROGRAM_NAME}\Config.lnk" "$SYSDIR\notepad.exe" "$DATADIR\cpuminer-conf.json" - CreateShortCut "$SMPROGRAMS\${PROGRAM_NAME}\Uninstall.lnk" "$DATADIR\cpuminer-uninst.exe" - CreateShortCut /NoWorkingDir "$SMPROGRAMS\${PROGRAM_NAME}\${PROGRAM_NAME}-bg.lnk" "$REALINSTDIR\cpuminer-gw64.exe" "-q -B" "" "" SW_SHOWMINIMIZED - - WriteRegStr HKLM ${RK_UNINSTALL} \ - "DisplayName" "${PROGRAM_NAME}" - - WriteRegStr HKLM ${RK_UNINSTALL} \ - "DisplayVersion" "${MINER_VERSION}" - - WriteRegStr HKLM ${RK_UNINSTALL} \ - "Publisher" "Open Source" - - WriteRegStr HKLM ${RK_UNINSTALL} \ - "DisplayIcon" "$REALINSTDIR\cpuminer-x64.exe" - - WriteRegStr HKLM ${RK_UNINSTALL} \ - "InstallLocation" "$REALINSTDIR" - - WriteRegStr HKLM ${RK_UNINSTALL} \ - "UninstallString" "$\"$DATADIR\cpuminer-uninst.exe$\"" - - ${GetSize} "$INSTDIR" "/M=cpuminer* /S=0K /G=0" $0 $1 $2 - IntFmt $0 "0x%08X" $0 - WriteRegDWORD HKLM "${RK_UNINSTALL}" \ - "EstimatedSize" "$0" - - # Add application to Windows Firewall exception list (to check) - ;liteFirewall::AddRule "$REALINSTDIR\cpuminer-gw64.exe" "CPU Miner (MinGW64)" - 
;liteFirewall::AddRule "$REALINSTDIR\cpuminer-x64.exe" "CPU Miner (x64)" - -SectionEnd - - -Section "uninstall" - - StrCpy $DATADIR "$APPDATA\${PROGRAM_KEY}" - ${If} ${RunningX64} - ;StrCmp $INSTDIR "$WINDIR\system32" 0 +2 - ;StrCpy $INSTDIR "$WINDIR\sysnative" - ${DisableX64FSRedirection} - SetRegView 64 - ${Else} - SetRegView 32 - ${Endif} - - ReadRegStr $INSTDIR HKLM ${RK_UNINSTALL} "InstallLocation" - StrCpy $REALINSTDIR "$INSTDIR" - - Delete "$INSTDIR\cpuminer-conf.json" - Delete "$INSTDIR\cpuminer-gw64.exe" - Delete "$INSTDIR\cpuminer-x64.exe" - Delete "$INSTDIR\LICENSE.txt" - Delete "$INSTDIR\README.txt" - - StrCmp $REALINSTDIR "$WINDIR/system32" +2 - Delete "$INSTDIR\msvcr120.dll" - RMDir "$INSTDIR" - - Delete "$DATADIR\cpuminer-uninst.exe" - RMDir "$DATADIR" - - ; Delete "$DATADIR\cpuminer-conf.json" - ; RMDir "$DATADIR" - - # second, remove the link from the start menu - Delete "$SMPROGRAMS\${PROGRAM_NAME}\${PROGRAM_NAME}.lnk" - Delete "$SMPROGRAMS\${PROGRAM_NAME}\${PROGRAM_NAME}-bg.lnk" - Delete "$SMPROGRAMS\${PROGRAM_NAME}\Config.lnk" - Delete "$SMPROGRAMS\${PROGRAM_NAME}\Uninstall.lnk" - RMDir "$SMPROGRAMS\${PROGRAM_NAME}" - - DeleteRegKey HKLM ${RK_UNINSTALL} - - # Remove application from Windows Firewall exception list - ;liteFirewall::RemoveRule "$REALINSTDIR\cpuminer-gw64.exe" "CPU Miner (MinGW64)" - ;liteFirewall::RemoveRule "$REALINSTDIR\cpuminer-x64.exe" "CPU Miner (x64)" - -SectionEnd diff --git a/cpuminer.sln b/cpuminer.sln deleted file mode 100644 index be2d7d2..0000000 --- a/cpuminer.sln +++ /dev/null @@ -1,27 +0,0 @@ -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 2013 -VisualStudioVersion = 12.0.30723.0 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cpuminer", "cpuminer.vcxproj", "{36DC07F9-A4A6-4877-A146-1B960083CF6F}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|x64 = Debug|x64 - Debug|Win32 = Debug|Win32 - 
Release|x64 = Release|x64 - Release|Win32 = Release|Win32 - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {36DC07F9-A4A6-4877-A146-1B960083CF6F}.Debug|x64.ActiveCfg = Debug|x64 - {36DC07F9-A4A6-4877-A146-1B960083CF6F}.Debug|Win32.ActiveCfg = Debug|Win32 - {36DC07F9-A4A6-4877-A146-1B960083CF6F}.Debug|x64.Build.0 = Debug|x64 - {36DC07F9-A4A6-4877-A146-1B960083CF6F}.Debug|Win32.Build.0 = Debug|Win32 - {36DC07F9-A4A6-4877-A146-1B960083CF6F}.Release|x64.ActiveCfg = Release|x64 - {36DC07F9-A4A6-4877-A146-1B960083CF6F}.Release|x64.Build.0 = Release|x64 - {36DC07F9-A4A6-4877-A146-1B960083CF6F}.Release|Win32.ActiveCfg = Release|Win32 - {36DC07F9-A4A6-4877-A146-1B960083CF6F}.Release|Win32.Build.0 = Release|Win32 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection -EndGlobal diff --git a/cpuminer.vcxproj b/cpuminer.vcxproj deleted file mode 100644 index 5ae9600..0000000 --- a/cpuminer.vcxproj +++ /dev/null @@ -1,396 +0,0 @@ - - - - - Release - Win32 - - - Debug - Win32 - - - Release - x64 - - - Debug - x64 - - - - {36DC07F9-A4A6-4877-A146-1B960083CF6F} - cpuminer - - - - Application - false - MultiByte - v120 - false - - - Application - true - MultiByte - v120 - false - - - Application - false - true - MultiByte - v120 - false - - - Application - true - MultiByte - v120 - false - - - - - - - - - - - - - - - - - true - - - true - - - false - - - false - - - - Level3 - Disabled - StreamingSIMDExtensions2 - WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;USE_AVX;USE_AVX2;USE_XOP;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) - .;compat;compat\curl-for-windows\curl\include;compat\jansson;compat\getopt;compat\pthreads;compat\curl-for-windows\openssl\openssl\include;compat\curl-for-windows\zlib;%(AdditionalIncludeDirectories) - true - - - true - Console - 
kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.x86.lib;openssl.x86.lib;zlib.x86.lib;ws2_32.lib;Wldap32.lib;%(AdditionalDependencies) - compat\pthreads\x86;compat\curl-for-windows\out\x86\Release\lib;%(AdditionalLibraryDirectories) - /NODEFAULTLIB:LIBCMT %(AdditionalOptions) - true - - - - - Level3 - Disabled - AdvancedVectorExtensions - WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;USE_AVX;USE_AVX2;USE_XOP;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) - .;compat;compat\curl-for-windows\curl\include;compat\jansson;compat\getopt;compat\pthreads;compat\curl-for-windows\openssl\openssl\include;compat\curl-for-windows\zlib;%(AdditionalIncludeDirectories) - true - - - true - Console - kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.x64.lib;openssl.x64.lib;zlib.x64.lib;ws2_32.lib;Wldap32.lib;%(AdditionalDependencies) - compat\pthreads\x64;compat\curl-for-windows\out\x64\Release\lib;%(AdditionalLibraryDirectories) - /NODEFAULTLIB:LIBCMT %(AdditionalOptions) - true - - - - - Level3 - MaxSpeed - MultiThreaded - Speed - StreamingSIMDExtensions2 - false - true - true - WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;USE_AVX;USE_AVX2;USE_XOP;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) - .;compat;compat\curl-for-windows\curl\include;compat\jansson;compat\getopt;compat\pthreads;compat\curl-for-windows\openssl\openssl\include;compat\curl-for-windows\zlib;%(AdditionalIncludeDirectories) - true - SyncCThrow - - - false - true - true - Console - 
kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.x86.lib;openssl.x86.lib;zlib.x86.lib;ws2_32.lib;Wldap32.lib;%(AdditionalDependencies) - compat\pthreads\x86;compat\curl-for-windows\out\x86\Release\lib;%(AdditionalLibraryDirectories) - /NODEFAULTLIB:LIBCMT %(AdditionalOptions) - false - false - - - - - Level3 - MaxSpeed - MultiThreaded - Speed - false - true - true - WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) - .;compat;compat\curl-for-windows\curl\include;compat\jansson;compat\getopt;compat\pthreads;compat\curl-for-windows\openssl\openssl\include;compat\curl-for-windows\zlib;%(AdditionalIncludeDirectories) - true - - - false - true - true - Console - kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.x64.lib;openssl.x64.lib;zlib.x64.lib;ws2_32.lib;Wldap32.lib;%(AdditionalDependencies) - compat\pthreads\x64;compat\curl-for-windows\out\x64\Release\lib;%(AdditionalLibraryDirectories) - /NODEFAULTLIB:LIBCMT %(AdditionalOptions) - false - - - - - - - - - - - - - - - - - - false - Full - - - - - - - - - - - - - - - - - - Full - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Full - - - - - - - - - - - - - - - - - - - true - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - true - - - - - true - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - true - - - true - - - true - - - true - - - true - - - true - - - - - - - - - - - - - - - - - - - diff --git a/cpuminer.vcxproj.filters b/cpuminer.vcxproj.filters deleted file mode 100644 index e5c6d4b..0000000 --- a/cpuminer.vcxproj.filters +++ /dev/null @@ -1,527 +0,0 @@ - - - - - - - - sph 
- - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - compat - - - jansson - - - jansson - - - jansson - - - jansson - - - jansson - - - compat - - - compat - - - jansson - - - crypto - - - crypto - - - crypto - - - crypto - - - crypto - - - crypto - - - crypto - - - crypto - - - crypto - - - sph - - - crypto - - - crypto - - - arch\x86 - - - arch\x64 - - - arch\x86 - - - arch\x64 - - - arch\x64 - - - arch\x86 - - - - - jansson - - - jansson - - - jansson - - - jansson - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - algo - - - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - sph - - - compat - - - compat - - - compat - - - compat - - - compat - - - compat - - - compat - - - compat - - - headers - - - headers - - - headers - - - jansson - - - jansson - - - crypto - - - crypto - - - crypto - - - crypto - - - crypto - - - crypto - - - sph - - - crypto - - - crypto - - - crypto - - - crypto - - - scrypt-jane - - - scrypt-jane - - - scrypt-jane - - - scrypt-jane - - - scrypt-jane - - - scrypt-jane - - - scrypt-jane - - - scrypt-jane - - - scrypt-jane - - - scrypt-jane - - - scrypt-jane - - - scrypt-jane - - - scrypt-jane - - - scrypt-jane - - - scrypt-jane - - - scrypt-jane - - - scrypt-jane - - - scrypt-jane - - - scrypt-jane - - - crypto - - - res - - - - - {822b216c-102a-48ca-b62b-7689df410249} - - - {f5e8fc70-27e7-41ab-a14d-2b0462350980} 
- - - {343fa430-44d1-4ff7-b15d-8915052fe036} - - - {88b3057f-0bb6-49d4-98a4-551c8bd8ce0d} - - - {d17f8918-3371-4acb-ba63-06be828562a1} - - - {04c96afe-01af-4f5c-9ee7-219481033018} - - - {774a7c76-80c0-4a15-983a-022d17e08420} - - - {e2f9ccb4-db0b-4e4f-9cfd-935e8e0ebb78} - - - {5a05483f-0f3f-43fe-bfd1-a14519e30e4d} - - - {5ed226e8-1512-481d-a88d-18644da87dcd} - - - {34f266e4-b48f-4721-ac34-474b65726498} - - - {5969f6d8-98c0-49c9-8e4c-51c54b5f0e16} - - - - - algo - - - algo - - - - - res - - - - - res - - - \ No newline at end of file diff --git a/miner.h b/miner.h index 0f906d4..1c2fedd 100644 --- a/miner.h +++ b/miner.h @@ -6,21 +6,9 @@ #define USER_AGENT PACKAGE_NAME "/" PACKAGE_VERSION #define MAX_CPUS 16 -//#ifndef NO_AES_NI - #ifndef __AES__ - #define NO_AES_NI - #endif -//#endif - -//#if defined(FOUR_WAY) && defined(__AVX2__) -// keep this until all algos remove reference to HASH_4WAY -//#if defined(__AVX2__) -// #define HASH_4WAY -//#endif - #ifdef _MSC_VER -#undef USE_ASM /* to fix */ +#undef USE_ASM /* to fix */ #ifdef NOASM #undef USE_ASM @@ -36,28 +24,28 @@ #endif /* _MSC_VER */ -#include #include +#include #include -#include -#include #include +#include +#include #ifdef STDC_HEADERS -# include -# include +#include +#include #else -# ifdef HAVE_STDLIB_H -# include -# endif +#ifdef HAVE_STDLIB_H +#include +#endif #endif /* #ifndef min #define min(a,b) (a>b ? 
(b) :(a)) #endif -#ifndef max +#ifndef max #define max(a,b) (a //#elif !defined alloca -# ifdef __GNUC__ +#ifdef __GNUC__ //# define alloca __builtin_alloca -# elif defined _AIX -# define alloca __alloca -# elif defined _MSC_VER -# include -# define alloca _alloca -# elif !defined HAVE_ALLOCA -# ifdef __cplusplus +#elif defined _AIX +#define alloca __alloca +#elif defined _MSC_VER +#include +#define alloca _alloca +#elif !defined HAVE_ALLOCA +#ifdef __cplusplus extern "C" -# endif -void *alloca (size_t); -# endif +#endif + void *alloca(size_t); +#endif //#endif #ifdef HAVE_SYSLOG_H @@ -85,25 +73,26 @@ void *alloca (size_t); #define LOG_BLUE 0x10 /* unique value */ #else enum { - LOG_ERR, - LOG_WARNING, - LOG_NOTICE, - LOG_INFO, - LOG_DEBUG, - /* custom notices */ - LOG_BLUE = 0x10, + LOG_ERR, + LOG_WARNING, + LOG_NOTICE, + LOG_INFO, + LOG_DEBUG, + /* custom notices */ + LOG_BLUE = 0x10, }; #endif -static inline bool is_windows(void) -{ +extern bool is_power_of_2(int n); + +static inline bool is_windows(void) { #ifdef WIN32 - return true; + return true; #else - return false; + return false; #endif } - + #include "compat.h" #ifndef ARRAY_SIZE @@ -113,21 +102,24 @@ static inline bool is_windows(void) #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) #define WANT_BUILTIN_BSWAP #else -#define bswap_32(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \ - | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) +#define bswap_32(x) \ + ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ + (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) #endif -static inline uint32_t swab32(uint32_t v) -{ +static inline uint32_t swab32(uint32_t v) { #ifdef WANT_BUILTIN_BSWAP - return __builtin_bswap32(v); + return __builtin_bswap32(v); #else - return bswap_32(v); + return bswap_32(v); #endif } // Swap any two variables of the same type without using a temp -#define swap_vars(a,b) a^=b; b^=a; a^=b; +#define swap_vars(a, b) \ + a ^= b; \ + b ^= 
a; \ + a ^= b; #ifdef HAVE_SYS_ENDIAN_H #include @@ -136,31 +128,28 @@ static inline uint32_t swab32(uint32_t v) typedef unsigned char uchar; #if !HAVE_DECL_BE32DEC -static inline uint32_t be32dec(const void *pp) -{ - const uint8_t *p = (uint8_t const *)pp; - return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + - ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); +static inline uint32_t be32dec(const void *pp) { + const uint8_t *p = (uint8_t const *)pp; + return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + + ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); } #endif #if !HAVE_DECL_LE32DEC -static inline uint32_t le32dec(const void *pp) -{ - const uint8_t *p = (uint8_t const *)pp; - return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + - ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); +static inline uint32_t le32dec(const void *pp) { + const uint8_t *p = (uint8_t const *)pp; + return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + + ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); } #endif #if !HAVE_DECL_BE32ENC -static inline void be32enc(void *pp, uint32_t x) -{ - uint8_t *p = (uint8_t *)pp; - p[3] = x & 0xff; - p[2] = (x >> 8) & 0xff; - p[1] = (x >> 16) & 0xff; - p[0] = (x >> 24) & 0xff; +static inline void be32enc(void *pp, uint32_t x) { + uint8_t *p = (uint8_t *)pp; + p[3] = x & 0xff; + p[2] = (x >> 8) & 0xff; + p[1] = (x >> 16) & 0xff; + p[0] = (x >> 24) & 0xff; } #endif @@ -169,48 +158,42 @@ static inline void be32enc(void *pp, uint32_t x) // This is a poorman's SIMD instruction, use 64 bit instruction to encode 2 // uint32_t. This function flips endian on two adjacent 32 bit quantities // aligned to 64 bits. If source is LE output is BE, and vice versa. 
-static inline void swab32_x2( uint64_t* dst, uint64_t src ) -{ - *dst = ( ( src & 0xff000000ff000000 ) >> 24 ) - | ( ( src & 0x00ff000000ff0000 ) >> 8 ) - | ( ( src & 0x0000ff000000ff00 ) << 8 ) - | ( ( src & 0x000000ff000000ff ) << 24 ); +static inline void swab32_x2(uint64_t *dst, uint64_t src) { + *dst = ((src & 0xff000000ff000000) >> 24) | + ((src & 0x00ff000000ff0000) >> 8) | ((src & 0x0000ff000000ff00) << 8) | + ((src & 0x000000ff000000ff) << 24); } -static inline void swab32_array( uint32_t* dst_p, uint32_t* src_p, int n ) -{ - // Assumes source is LE - for ( int i=0; i < n/2; i++ ) - swab32_x2( &((uint64_t*)dst_p)[i], ((uint64_t*)src_p)[i] ); -// if ( n % 2 ) -// be32enc( &dst_p[ n-1 ], src_p[ n-1 ] ); +static inline void swab32_array(uint32_t *dst_p, uint32_t *src_p, int n) { + // Assumes source is LE + for (int i = 0; i < n / 2; i++) + swab32_x2(&((uint64_t *)dst_p)[i], ((uint64_t *)src_p)[i]); + // if ( n % 2 ) + // be32enc( &dst_p[ n-1 ], src_p[ n-1 ] ); } #if !HAVE_DECL_LE32ENC -static inline void le32enc(void *pp, uint32_t x) -{ - uint8_t *p = (uint8_t *)pp; - p[0] = x & 0xff; - p[1] = (x >> 8) & 0xff; - p[2] = (x >> 16) & 0xff; - p[3] = (x >> 24) & 0xff; +static inline void le32enc(void *pp, uint32_t x) { + uint8_t *p = (uint8_t *)pp; + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; } #endif #if !HAVE_DECL_LE16DEC -static inline uint16_t le16dec(const void *pp) -{ - const uint8_t *p = (uint8_t const *)pp; - return ((uint16_t)(p[0]) + ((uint16_t)(p[1]) << 8)); +static inline uint16_t le16dec(const void *pp) { + const uint8_t *p = (uint8_t const *)pp; + return ((uint16_t)(p[0]) + ((uint16_t)(p[1]) << 8)); } #endif #if !HAVE_DECL_LE16ENC -static inline void le16enc(void *pp, uint16_t x) -{ - uint8_t *p = (uint8_t *)pp; - p[0] = x & 0xff; - p[1] = (x >> 8) & 0xff; +static inline void le16enc(void *pp, uint16_t x) { + uint8_t *p = (uint8_t *)pp; + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; } #endif @@ 
-222,7 +205,7 @@ static inline void le16enc(void *pp, uint16_t x) #define JSON_LOADF(path, err_ptr) json_load_file(path, err_ptr) #endif -json_t* json_load_url(char* cfg_url, json_error_t *err); +json_t *json_load_url(char *cfg_url, json_error_t *err); void sha256_init(uint32_t *state); void sha256_transform(uint32_t *state, const uint32_t *block, int swap); @@ -248,91 +231,125 @@ struct work; void work_free(struct work *w); void work_copy(struct work *dest, const struct work *src); - - /* api related */ void *api_thread(void *userdata); struct cpu_info { - int thr_id; - int accepted; - int rejected; - double khashes; - bool has_monitoring; - float cpu_temp; - int cpu_fan; - uint32_t cpu_clock; + int thr_id; + int accepted; + int rejected; + double khashes; + bool has_monitoring; + float cpu_temp; + int cpu_fan; + uint32_t cpu_clock; }; struct thr_api { - int id; - pthread_t pth; - struct thread_q *q; + int id; + pthread_t pth; + struct thread_q *q; }; /* end of api */ - -#define JSON_RPC_LONGPOLL (1 << 0) -#define JSON_RPC_QUIET_404 (1 << 1) -#define JSON_RPC_IGNOREERR (1 << 2) +#define JSON_RPC_LONGPOLL (1 << 0) +#define JSON_RPC_QUIET_404 (1 << 1) +#define JSON_RPC_IGNOREERR (1 << 2) #define JSON_BUF_LEN 512 -#define CL_N "\x1B[0m" -#define CL_RED "\x1B[31m" -#define CL_GRN "\x1B[32m" -#define CL_YLW "\x1B[33m" -#define CL_BLU "\x1B[34m" -#define CL_MAG "\x1B[35m" -#define CL_CYN "\x1B[36m" - -#define CL_BLK "\x1B[22;30m" /* black */ -#define CL_RD2 "\x1B[22;31m" /* red */ -#define CL_GR2 "\x1B[22;32m" /* green */ -#define CL_BRW "\x1B[22;33m" /* brown */ -#define CL_BL2 "\x1B[22;34m" /* blue */ -#define CL_MA2 "\x1B[22;35m" /* magenta */ -#define CL_CY2 "\x1B[22;36m" /* cyan */ -#define CL_SIL "\x1B[22;37m" /* gray */ +#define CL_N "\x1B[0m" +#define CL_RED "\x1B[31m" +#define CL_GRN "\x1B[32m" +#define CL_YLW "\x1B[33m" +#define CL_BLU "\x1B[34m" +#define CL_MAG "\x1B[35m" +#define CL_CYN "\x1B[36m" + +#define CL_BLK "\x1B[22;30m" /* black */ +#define CL_RD2 
"\x1B[22;31m" /* red */ +#define CL_GR2 "\x1B[22;32m" /* green */ +#define CL_BRW "\x1B[22;33m" /* brown */ +#define CL_BL2 "\x1B[22;34m" /* blue */ +#define CL_MA2 "\x1B[22;35m" /* magenta */ +#define CL_CY2 "\x1B[22;36m" /* cyan */ +#define CL_SIL "\x1B[22;37m" /* gray */ #ifdef WIN32 -#define CL_GRY "\x1B[01;30m" /* dark gray */ +#define CL_GRY "\x1B[01;30m" /* dark gray */ #else -#define CL_GRY "\x1B[90m" /* dark gray selectable in putty */ +#define CL_GRY "\x1B[90m" /* dark gray selectable in putty */ #endif -#define CL_LRD "\x1B[01;31m" /* light red */ -#define CL_LGR "\x1B[01;32m" /* light green */ -#define CL_YL2 "\x1B[01;33m" /* yellow */ -#define CL_LBL "\x1B[01;34m" /* light blue */ -#define CL_LMA "\x1B[01;35m" /* light magenta */ -#define CL_LCY "\x1B[01;36m" /* light cyan */ - -#define CL_WHT "\x1B[01;37m" /* white */ - -void applog(int prio, const char *fmt, ...); -void restart_threads(void); -extern json_t *json_rpc_call( CURL *curl, const char *url, const char *userpass, - const char *rpc_req, int *curl_err, int flags ); +#define CL_LRD "\x1B[01;31m" /* light red */ +#define CL_LGR "\x1B[01;32m" /* light green */ +#define CL_YL2 "\x1B[01;33m" /* yellow */ +#define CL_LBL "\x1B[01;34m" /* light blue */ +#define CL_LMA "\x1B[01;35m" /* light magenta */ +#define CL_LCY "\x1B[01;36m" /* light cyan */ + +#define CL_WHT "\x1B[01;37m" /* white */ + +void applog(int prio, const char *fmt, ...); +void applog2(int prio, const char *fmt, ...); +void restart_threads(void); +extern json_t *json_rpc_call(CURL *curl, const char *url, const char *userpass, + const char *rpc_req, int *curl_err, int flags); extern void cbin2hex(char *out, const char *in, size_t len); -void bin2hex( char *s, const unsigned char *p, size_t len ); -char *abin2hex( const unsigned char *p, size_t len ); -bool hex2bin( unsigned char *p, const char *hexstr, size_t len ); -bool jobj_binary( const json_t *obj, const char *key, void *buf, - size_t buflen ); -int varint_encode( unsigned char 
*p, uint64_t n ); -size_t address_to_script( unsigned char *out, size_t outsz, const char *addr ); -int timeval_subtract( struct timeval *result, struct timeval *x, - struct timeval *y); -bool fulltest( const uint32_t *hash, const uint32_t *target ); -void work_set_target( struct work* work, double diff ); -double target_to_diff( uint32_t* target ); -extern void diff_to_target(uint32_t *target, double diff); - -double hash_target_ratio( uint32_t* hash, uint32_t* target ); -void work_set_target_ratio( struct work* work, uint32_t* hash ); - - -void get_currentalgo( char* buf, int sz ); +void bin2hex(char *s, const unsigned char *p, size_t len); +char *abin2hex(const unsigned char *p, size_t len); +bool hex2bin(unsigned char *p, const char *hexstr, size_t len); +bool jobj_binary(const json_t *obj, const char *key, void *buf, size_t buflen); +int varint_encode(unsigned char *p, uint64_t n); +size_t address_to_script(unsigned char *out, size_t outsz, const char *addr); +int timeval_subtract(struct timeval *result, struct timeval *x, + struct timeval *y); + +// Segwit BEGIN +extern void memrev(unsigned char *p, size_t len); +// Segwit END + +// Bitcoin formula for converting difficulty to an equivalent +// number of hashes. +// +// https://en.bitcoin.it/wiki/Difficulty +// +// hash = diff * 2**32 +// +// diff_to_hash = 2**32 = 0x100000000 = 4294967296 = exp32; + +#define EXP16 65536. +#define EXP32 4294967296. 
+extern const long double exp32; // 2**32 +extern const long double exp48; // 2**48 +extern const long double exp64; // 2**64 +extern const long double exp96; // 2**96 +extern const long double exp128; // 2**128 +extern const long double exp160; // 2**160 + +bool fulltest(const uint32_t *hash, const uint32_t *target); +bool valid_hash(const void *, const void *); + +double hash_to_diff(const void *); +extern void diff_to_hash(uint32_t *, const double); + +double hash_target_ratio(uint32_t *hash, uint32_t *target); +void work_set_target_ratio(struct work *work, const void *hash); + +struct thr_info { + int id; + pthread_t pth; + pthread_attr_t attr; + struct thread_q *q; + struct cpu_info cpu; +}; + +// int test_hash_and_submit( struct work *work, const void *hash, +// struct thr_info *thr ); + +bool submit_solution(struct work *work, const void *hash, struct thr_info *thr); + +void get_currentalgo(char *buf, int sz); +/* bool has_sha(); bool has_aes_ni(); bool has_avx1(); @@ -349,79 +366,74 @@ void cpu_getmodelid(char *outbuf, size_t maxsz); void cpu_brand_string( char* s ); float cpu_temp( int core ); +*/ struct work { - uint32_t data[48] __attribute__ ((aligned (64))); - uint32_t target[8]; - - double targetdiff; - double shareratio; - double sharediff; - - int height; - char *txs; - char *workid; - - char *job_id; - size_t xnonce2_len; - unsigned char *xnonce2; - // x16rt - uint32_t merkleroothash[8]; - uint32_t witmerkleroothash[8]; - uint32_t denom10[8]; - uint32_t denom100[8]; - uint32_t denom1000[8]; - uint32_t denom10000[8]; - -} __attribute__ ((aligned (64))); + uint32_t target[8] __attribute__((aligned(64))); + uint32_t data[48] __attribute__((aligned(64))); + double targetdiff; + double sharediff; + double stratum_diff; + int height; + char *txs; + char *workid; + char *job_id; + size_t xnonce2_len; + unsigned char *xnonce2; + bool sapling; + bool stale; +} __attribute__((aligned(64))); struct stratum_job { - char *job_id; - unsigned char prevhash[32]; - 
size_t coinbase_size; - unsigned char *coinbase; - unsigned char *xnonce2; - int merkle_count; - unsigned char **merkle; - unsigned char version[4]; - unsigned char nbits[4]; - unsigned char ntime[4]; - double diff; - bool clean; - // for x16rt - unsigned char extra[64]; - unsigned char denom10[32]; - unsigned char denom100[32]; - unsigned char denom1000[32]; - unsigned char denom10000[32]; - unsigned char proofoffullnode[32]; - -} __attribute__ ((aligned (64))); + unsigned char prevhash[32]; + unsigned char final_sapling_hash[32]; + char *job_id; + size_t coinbase_size; + unsigned char *coinbase; + unsigned char *xnonce2; + int merkle_count; + unsigned char **merkle; + unsigned char version[4]; + unsigned char nbits[4]; + unsigned char ntime[4]; + double diff; + bool clean; + // for x16rt-veil + unsigned char extra[64]; + unsigned char denom10[32]; + unsigned char denom100[32]; + unsigned char denom1000[32]; + unsigned char denom10000[32]; + unsigned char proofoffullnode[32]; + +} __attribute__((aligned(64))); struct stratum_ctx { - char *url; - - CURL *curl; - char *curl_url; - char curl_err_str[CURL_ERROR_SIZE]; - curl_socket_t sock; - size_t sockbuf_size; - char *sockbuf; - pthread_mutex_t sock_lock; - - double next_diff; - double sharediff; - - char *session_id; - size_t xnonce1_size; - unsigned char *xnonce1; - size_t xnonce2_size; - struct stratum_job job; - struct work work __attribute__ ((aligned (64))); - pthread_mutex_t work_lock; - - int bloc_height; -} __attribute__ ((aligned (64))); + char *url; + + CURL *curl; + char *curl_url; + char curl_err_str[CURL_ERROR_SIZE]; + curl_socket_t sock; + size_t sockbuf_size; + char *sockbuf; + pthread_mutex_t sock_lock; + + double next_diff; + double sharediff; + + char *session_id; + size_t xnonce1_size; + unsigned char *xnonce1; + size_t xnonce2_size; + struct stratum_job job; + struct work work __attribute__((aligned(64))); + pthread_mutex_t work_lock; + + int block_height; + bool new_job; + bool dev; +} 
__attribute__((aligned(64))); bool stratum_socket_full(struct stratum_ctx *sctx, int timeout); bool stratum_send_line(struct stratum_ctx *sctx, char *s); @@ -429,29 +441,14 @@ char *stratum_recv_line(struct stratum_ctx *sctx); bool stratum_connect(struct stratum_ctx *sctx, const char *url); void stratum_disconnect(struct stratum_ctx *sctx); bool stratum_subscribe(struct stratum_ctx *sctx); -bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass); +bool stratum_authorize(struct stratum_ctx *sctx, const char *user, + const char *pass); bool stratum_handle_method(struct stratum_ctx *sctx, const char *s); -/* rpc 2.0 (xmr) */ - - -extern bool jsonrpc_2; extern bool aes_ni_supported; -extern char rpc2_id[64]; -extern char *rpc2_blob; -extern size_t rpc2_bloblen; -extern uint32_t rpc2_target; -extern char *rpc2_job_id; extern char *rpc_user; extern char *short_url; -json_t *json_rpc2_call(CURL *curl, const char *url, const char *userpass, const char *rpc_req, int *curl_err, int flags); -bool rpc2_login(CURL *curl); -bool rpc2_login_decode(const json_t *val); -bool rpc2_workio_login(CURL *curl); -bool rpc2_stratum_job(struct stratum_ctx *sctx, json_t *params); -bool rpc2_job_decode(const json_t *job, struct work *work); - struct thread_q; struct thread_q *tq_new(void); @@ -469,8 +466,11 @@ void applog_hash(void *hash); void format_hashrate(double hashrate, char *output); void print_hash_tests(void); -void scale_hash_for_display ( double* hashrate, char* units ); +void scale_hash_for_display(double *hashrate, char *units); +void report_summary_log(bool force); + +/* struct thr_info { int id; pthread_t pth; @@ -478,220 +478,39 @@ struct thr_info { struct thread_q *q; struct cpu_info cpu; }; +*/ struct work_restart { - volatile uint8_t restart; - char padding[128 - sizeof(uint8_t)]; + volatile uint8_t restart; + char padding[128 - sizeof(uint8_t)]; }; enum workio_commands { - WC_GET_WORK, - WC_SUBMIT_WORK, + WC_GET_WORK, + WC_SUBMIT_WORK, }; 
struct workio_cmd { - enum workio_commands cmd; - struct thr_info *thr; - union { - struct work *work; - } u; + enum workio_commands cmd; + struct thr_info *thr; + union { + struct work *work; + } u; }; -uint32_t* get_stratum_job_ntime(); - -enum algos { - ALGO_NULL, - ALGO_ALLIUM, - ALGO_ANIME, - ALGO_ARGON2, - ALGO_ARGON2D250, - ALGO_ARGON2D500, - ALGO_ARGON2D4096, - ALGO_AXIOM, - ALGO_BASTION, - ALGO_BLAKE, - ALGO_BLAKECOIN, -// ALGO_BLAKE2B, - ALGO_BLAKE2S, - ALGO_BMW, - ALGO_BMW512, - ALGO_C11, - ALGO_DECRED, - ALGO_DEEP, - ALGO_DMD_GR, - ALGO_DROP, - ALGO_FRESH, - ALGO_GROESTL, - ALGO_GR, - ALGO_HEAVY, - ALGO_HEX, - ALGO_HMQ1725, - ALGO_HODL, - ALGO_JHA, - ALGO_KECCAK, - ALGO_KECCAKC, - ALGO_LBRY, - ALGO_LUFFA, - ALGO_LYRA2H, - ALGO_LYRA2RE, - ALGO_LYRA2REV2, - ALGO_LYRA2REV3, - ALGO_LYRA2Z, - ALGO_LYRA2Z330, - ALGO_M7M, - ALGO_MYR_GR, - ALGO_NEOSCRYPT, - ALGO_NIST5, - ALGO_PENTABLAKE, - ALGO_PHI1612, - ALGO_PHI2, - ALGO_PLUCK, - ALGO_POLYTIMOS, - ALGO_QUARK, - ALGO_QUBIT, - ALGO_SCRYPT, - ALGO_SCRYPTJANE, - ALGO_SHA256D, - ALGO_SHA256Q, - ALGO_SHA256T, - ALGO_SHAVITE3, - ALGO_SKEIN, - ALGO_SKEIN2, - ALGO_SKUNK, - ALGO_SONOA, - ALGO_TIMETRAVEL, - ALGO_TIMETRAVEL10, - ALGO_TRIBUS, - ALGO_VANILLA, - ALGO_VELTOR, - ALGO_WHIRLPOOL, - ALGO_WHIRLPOOLX, - ALGO_X11, - ALGO_X11EVO, - ALGO_X11GOST, - ALGO_X12, - ALGO_X13, - ALGO_X13BCD, - ALGO_X13SM3, - ALGO_X14, - ALGO_X15, - ALGO_X16R, - ALGO_X16RT, - ALGO_X16RT_VEIL, - ALGO_X16S, - ALGO_X17, - ALGO_X21S, - ALGO_XEVAN, - ALGO_YESCRYPT, - ALGO_YESCRYPTR8, - ALGO_YESCRYPTR16, - ALGO_YESCRYPTR32, - ALGO_YESPOWER, - ALGO_YESPOWERR16, - ALGO_ZR5, - ALGO_COUNT -}; -static const char* const algo_names[] = { - NULL, - "allium", - "anime", - "argon2", - "argon2d250", - "argon2d500", - "argon2d4096", - "axiom", - "bastion", - "blake", - "blakecoin", -// "blake2b", - "blake2s", - "bmw", - "bmw512", - "c11", - "decred", - "deep", - "dmd-gr", - "drop", - "fresh", - "groestl", - "gr", - "heavy", - "hex", - "hmq1725", - "hodl", - 
"jha", - "keccak", - "keccakc", - "lbry", - "luffa", - "lyra2h", - "lyra2re", - "lyra2rev2", - "lyra2rev3", - "lyra2z", - "lyra2z330", - "m7m", - "myr-gr", - "neoscrypt", - "nist5", - "pentablake", - "phi1612", - "phi2", - "pluck", - "polytimos", - "quark", - "qubit", - "scrypt", - "scryptjane", - "sha256d", - "sha256q", - "sha256t", - "shavite3", - "skein", - "skein2", - "skunk", - "sonoa", - "timetravel", - "timetravel10", - "tribus", - "vanilla", - "veltor", - "whirlpool", - "whirlpoolx", - "x11", - "x11evo", - "x11gost", - "x12", - "x13", - "x13bcd", - "x13sm3", - "x14", - "x15", - "x16r", - "x16rt", - "x16rt-veil", - "x16s", - "x17", - "x21s", - "xevan", - "yescrypt", - "yescryptr8", - "yescryptr16", - "yescryptr32", - "yespower", - "yespowerr16", - "zr5", - "\0" -}; +uint32_t *get_stratum_job_ntime(); + +enum algos { ALGO_NULL, ALGO_GR, ALGO_COUNT }; +static const char *const algo_names[] = {NULL, "gr", "\0"}; -const char* algo_name( enum algos a ); +const char *algo_name(enum algos a); extern enum algos opt_algo; extern bool opt_debug; extern bool opt_debug_diff; extern bool opt_benchmark; +extern bool opt_benchmark_extended; extern bool opt_protocol; -extern bool opt_showdiff; extern bool opt_extranonce; extern bool opt_quiet; extern bool opt_redirect; @@ -699,7 +518,7 @@ extern int opt_timeout; extern bool want_longpoll; extern bool have_longpoll; extern bool have_gbt; -extern char* lp_id; +extern char *lp_id; extern char *rpc_userpass; extern const char *gbt_lp_req; extern const char *getwork_req; @@ -724,11 +543,14 @@ extern double global_hashrate; extern double stratum_diff; extern double net_diff; extern double net_hashrate; -extern int opt_pluck_n; -extern int opt_scrypt_n; +extern int opt_param_n; +extern int opt_param_r; +extern char *opt_param_key; extern double opt_diff_factor; +extern double opt_target_factor; extern bool opt_randomize; extern bool allow_mininginfo; +extern pthread_rwlock_t g_work_lock; extern time_t g_work_time; extern bool 
opt_stratum_stats; extern int num_cpus; @@ -738,104 +560,33 @@ extern bool opt_hash_meter; extern uint32_t accepted_share_count; extern uint32_t rejected_share_count; extern uint32_t solved_block_count; -extern pthread_mutex_t rpc2_job_lock; -extern pthread_mutex_t rpc2_login_lock; extern pthread_mutex_t applog_lock; extern pthread_mutex_t stats_lock; - +extern pthread_cond_t sync_cond; +extern bool opt_sapling; +extern const int pk_buffer_size_max; +extern int pk_buffer_size; +extern char *opt_data_file; +extern bool opt_verify; +extern double gr_bench_hashes; +extern double gr_bench_time; +extern const char *dev_address; +extern const char *dev_userpass; +extern const char *dev_pools[5]; +extern const struct timeval first_dev; +extern struct timeval dev_start; +extern const struct timeval dev_interval; +extern const double dev_fee; +extern bool dev_mining; static char const usage[] = "\ -Usage: " PACKAGE_NAME " [OPTIONS]\n\ +Usage: cpuminer [OPTIONS]\n\ Options:\n\ -a, --algo=ALGO specify the algorithm to use\n\ - allium Garlicoin (GRLC)\n\ - anime Animecoin (ANI)\n\ - argon2 Argon2 Coin (AR2)\n\ - argon2d250 argon2d-crds, Credits (CRDS)\n\ - argon2d500 argon2d-dyn, Dynamic (DYN)\n\ - argon2d4096 argon2d-uis, Unitus (UIS)\n\ - axiom Shabal-256 MemoHash\n\ - bastion\n\ - blake blake256r14 (SFR)\n\ - blakecoin blake256r8\n\ - blake2s Blake-2 S\n\ - bmw BMW 256\n\ - bmw512 BMW 512\n\ - c11 Chaincoin\n\ - decred Blake256r14dcr\n\ - deep Deepcoin (DCN)\n\ - dmd-gr Diamond\n\ - drop Dropcoin\n\ - fresh Fresh\n\ - groestl Groestl coin\n\ - gr Gr Hash\n\ - heavy Heavy\n\ - hex x16r-hex\n\ - hmq1725 Espers\n\ - hodl Hodlcoin\n\ - jha jackppot (Jackpotcoin)\n\ - keccak Maxcoin\n\ - keccakc Creative Coin\n\ - lbry LBC, LBRY Credits\n\ - luffa Luffa\n\ - lyra2h Hppcoin\n\ - lyra2re lyra2\n\ - lyra2rev2 lyrav2\n\ - lyra2rev3 lyrav2v3, Vertcoin\n\ - lyra2z Zcoin (XZC)\n\ - lyra2z330 Lyra2 330 rows, Zoin (ZOI)\n\ - m7m Magi (XMG)\n\ - myr-gr Myriad-Groestl\n\ - neoscrypt 
NeoScrypt(128, 2, 1)\n\ - nist5 Nist5\n\ - pentablake 5 x blake512\n\ - phi1612 phi, LUX coin (original algo)\n\ - phi2 LUX (new algo)\n\ - pluck Pluck:128 (Supcoin)\n\ - polytimos\n\ - quark Quark\n\ - qubit Qubit\n\ - scrypt scrypt(1024, 1, 1) (default)\n\ - scrypt:N scrypt(N, 1, 1)\n\ - scryptjane:nf\n\ - sha256d Double SHA-256\n\ - sha256q Quad SHA-256, Pyrite (PYE)\n\ - sha256t Triple SHA-256, Onecoin (OC)\n\ - shavite3 Shavite3\n\ - skein Skein+Sha (Skeincoin)\n\ - skein2 Double Skein (Woodcoin)\n\ - skunk Signatum (SIGT)\n\ - sonoa Sono\n\ - timetravel timeravel8, Machinecoin (MAC)\n\ - timetravel10 Bitcore (BTX)\n\ - tribus Denarius (DNR)\n\ - vanilla blake256r8vnl (VCash)\n\ - veltor\n\ - whirlpool\n\ - whirlpoolx\n\ - x11 Dash\n\ - x11evo Revolvercoin (XRE)\n\ - x11gost sib (SibCoin)\n\ - x12 Galaxie Cash (GCH)\n\ - x13 X13\n\ - x13bcd bcd \n\ - x13sm3 hsr (Hshare)\n\ - x14 X14\n\ - x15 X15\n\ - x16r Ravencoin (RVN)\n\ - x16rt Gincoin (GIN)\n\ - x16rt-veil Veil (VEIL)\n\ - x16s Pigeoncoin (PGN)\n\ - x17\n\ - x21s\n\ - xevan Bitsend (BSD)\n\ - yescrypt Globalboost-Y (BSTY)\n\ - yescryptr8 BitZeny (ZNY)\n\ - yescryptr16 Eli\n\ - yescryptr32 WAVI\n\ - yespower Cryply\n\ - yespowerr16 Yenten (YTN)\n\ - zr5 Ziftr\n\ + gr Gr Hash\n\ + -N, --param-n N parameter for scrypt based algos\n\ + -R, --param-r R parameter for scrypt based algos\n\ + -K, --param-key Key (pers) parameter for algos that use it\n\ -o, --url=URL URL of mining server\n\ -O, --userpass=U:P username:password pair for mining server\n\ -u, --user=USERNAME username for mining server\n\ @@ -845,7 +596,7 @@ Options:\n\ -t, --threads=N number of miner threads (default: number of processors)\n\ -r, --retries=N number of times to retry if a network call fails\n\ (default: retry indefinitely)\n\ - -R, --retry-pause=N time to pause between retries, in seconds (default: 30)\n\ + --retry-pause=N time to pause between retries, in seconds (default: 30)\n\ --time-limit=N maximum time [s] to mine before 
exiting the program.\n\ -T, --timeout=N timeout for long poll and stratum (default: 300 seconds)\n\ -s, --scantime=N upper bound on time spent scanning current work when\n\ @@ -854,7 +605,6 @@ Options:\n\ -f, --diff-factor Divide req. difficulty by this factor (std is 1.0)\n\ -m, --diff-multiplier Multiply difficulty by this factor (std is 1.0)\n\ --hash-meter Display thread hash rates\n\ - --hide-diff Do not display changes in difficulty\n\ --coinbase-addr=ADDR payout address for solo mining\n\ --coinbase-sig=TEXT data to insert in the coinbase when possible\n\ --no-longpoll disable long polling support\n\ @@ -868,13 +618,12 @@ Options:\n\ -D, --debug enable debug output\n\ -P, --protocol-dump verbose dump of protocol-level activities\n" #ifdef HAVE_SYSLOG_H -"\ + "\ -S, --syslog use system log for output messages\n" #endif -"\ + "\ -B, --background run the miner in the background\n\ --benchmark run in offline benchmark mode\n\ - --cputest debug hashes from cpu algorithms\n\ --cpu-affinity set process affinity to cpu core(s), mask 0x3 for cores 0 and 1\n\ --cpu-priority set process priority (default: 0 idle, 2 normal to 5 highest)\n\ -b, --api-bind IP/Port for the miner API (default: 127.0.0.1:4048)\n\ @@ -883,6 +632,8 @@ Options:\n\ --max-rate=N[KMG] Only mine if net hashrate is less than specified value\n\ --max-diff=N Only mine if net difficulty is less than specified value\n\ -c, --config=FILE load a JSON-format configuration file\n\ + --data-file path and name of data file\n\ + --verify enable additional time consuming start up tests\n\ -V, --version display version information and exit\n\ -h, --help display this help text and exit\n\ "; @@ -891,66 +642,68 @@ Options:\n\ #include #else struct option { - const char *name; - int has_arg; - int *flag; - int val; + const char *name; + int has_arg; + int *flag; + int val; }; #endif - static struct option const options[] = { - { "algo", 1, NULL, 'a' }, - { "api-bind", 1, NULL, 'b' }, - { "api-remote", 0, NULL, 1030 
}, - { "background", 0, NULL, 'B' }, - { "benchmark", 0, NULL, 1005 }, - { "cputest", 0, NULL, 1006 }, - { "cert", 1, NULL, 1001 }, - { "coinbase-addr", 1, NULL, 1016 }, - { "coinbase-sig", 1, NULL, 1015 }, - { "config", 1, NULL, 'c' }, - { "cpu-affinity", 1, NULL, 1020 }, - { "cpu-priority", 1, NULL, 1021 }, - { "no-color", 0, NULL, 1002 }, - { "debug", 0, NULL, 'D' }, - { "diff-factor", 1, NULL, 'f' }, - { "diff", 1, NULL, 'f' }, // deprecated (alias) - { "diff-multiplier", 1, NULL, 'm' }, - { "hash-meter", 0, NULL, 1014 }, - { "hide-diff", 0, NULL, 1013 }, - { "help", 0, NULL, 'h' }, - { "no-gbt", 0, NULL, 1011 }, - { "no-getwork", 0, NULL, 1010 }, - { "no-longpoll", 0, NULL, 1003 }, - { "no-redirect", 0, NULL, 1009 }, - { "no-stratum", 0, NULL, 1007 }, - { "no-extranonce", 0, NULL, 1012 }, - { "max-temp", 1, NULL, 1060 }, - { "max-diff", 1, NULL, 1061 }, - { "max-rate", 1, NULL, 1062 }, - { "pass", 1, NULL, 'p' }, - { "protocol", 0, NULL, 'P' }, - { "protocol-dump", 0, NULL, 'P' }, - { "proxy", 1, NULL, 'x' }, - { "quiet", 0, NULL, 'q' }, - { "retries", 1, NULL, 'r' }, - { "retry-pause", 1, NULL, 'R' }, - { "randomize", 0, NULL, 1024 }, - { "scantime", 1, NULL, 's' }, + {"algo", 1, NULL, 'a'}, + {"api-bind", 1, NULL, 'b'}, + {"api-remote", 0, NULL, 1030}, + {"background", 0, NULL, 'B'}, + {"benchmark", 0, NULL, 1005}, + {"benchmark-ext", 0, NULL, 1105}, + {"cputest", 0, NULL, 1006}, + {"cert", 1, NULL, 1001}, + {"coinbase-addr", 1, NULL, 1016}, + {"coinbase-sig", 1, NULL, 1015}, + {"config", 1, NULL, 'c'}, + {"cpu-affinity", 1, NULL, 1020}, + {"cpu-priority", 1, NULL, 1021}, + {"no-color", 0, NULL, 1002}, + {"debug", 0, NULL, 'D'}, + {"diff-factor", 1, NULL, 'f'}, + {"diff", 1, NULL, 'f'}, // deprecated (alias) + {"diff-multiplier", 1, NULL, 'm'}, + {"hash-meter", 0, NULL, 1014}, + {"help", 0, NULL, 'h'}, + {"key", 1, NULL, 'K'}, + {"no-gbt", 0, NULL, 1011}, + {"no-getwork", 0, NULL, 1010}, + {"no-longpoll", 0, NULL, 1003}, + {"no-redirect", 0, NULL, 1009}, + 
{"no-stratum", 0, NULL, 1007}, + {"no-extranonce", 0, NULL, 1012}, + {"max-temp", 1, NULL, 1060}, + {"max-diff", 1, NULL, 1061}, + {"max-rate", 1, NULL, 1062}, + {"param-key", 1, NULL, 'K'}, + {"param-n", 1, NULL, 'N'}, + {"param-r", 1, NULL, 'R'}, + {"pass", 1, NULL, 'p'}, + {"protocol", 0, NULL, 'P'}, + {"protocol-dump", 0, NULL, 'P'}, + {"proxy", 1, NULL, 'x'}, + {"quiet", 0, NULL, 'q'}, + {"retries", 1, NULL, 'r'}, + {"retry-pause", 1, NULL, 1025}, + {"randomize", 0, NULL, 1024}, + {"scantime", 1, NULL, 's'}, #ifdef HAVE_SYSLOG_H - { "syslog", 0, NULL, 'S' }, + {"syslog", 0, NULL, 'S'}, #endif - { "time-limit", 1, NULL, 1008 }, - { "threads", 1, NULL, 't' }, - { "timeout", 1, NULL, 'T' }, - { "url", 1, NULL, 'o' }, - { "user", 1, NULL, 'u' }, - { "userpass", 1, NULL, 'O' }, - { "version", 0, NULL, 'V' }, - { 0, 0, 0, 0 } -}; - + {"time-limit", 1, NULL, 1008}, + {"threads", 1, NULL, 't'}, + {"timeout", 1, NULL, 'T'}, + {"url", 1, NULL, 'o'}, + {"user", 1, NULL, 'u'}, + {"userpass", 1, NULL, 'O'}, + {"data-file", 1, NULL, 1027}, + {"verify", 0, NULL, 1028}, + {"version", 0, NULL, 'V'}, + {0, 0, 0, 0}}; #endif /* __MINER_H__ */ - diff --git a/simd-utils.h b/simd-utils.h index fb61eb9..653b0d7 100644 --- a/simd-utils.h +++ b/simd-utils.h @@ -33,10 +33,10 @@ // is no significant 64 bit vectorization therefore SSE2 is the practical // minimum for using this code. // -// MMX: 64 bit vectors +// MMX: 64 bit vectors // SSE2: 128 bit vectors (64 bit CPUs only, such as Intel Core2. // AVX2: 256 bit vectors (Starting with Intel Haswell and AMD Ryzen) -// AVX512: 512 bit vectors (still under development) +// AVX512: 512 bit vectors (Starting with SkylakeX) // // Most functions are avalaible at the stated levels but in rare cases // a higher level feature may be required with no compatible alternative. @@ -51,7 +51,7 @@ // bit rotation instructions for 128 and 256 bit vectors in addition to // its own 5a12 bit vectors. 
These will not be back ported to replace the // SW implementations for the smaller vectors. This policy may be reviewed -// in the future once AVX512 is established. +// in the future once AVX512 is established. // // Strict alignment of data is required: 16 bytes for 128 bit vectors, // 32 bytes for 256 bit vectors and 64 bytes for 512 bit vectors. 64 byte @@ -81,7 +81,7 @@ // - there is a subset of some functions for scalar data. They may have // no prefix nor vec-size, just one size, the size of the data. // - Some integer functions are also defined which use a similar notation. -// +// // Function names follow this pattern: // // prefix_op[esize]_[vsize] @@ -105,68 +105,47 @@ // Ex: mm256_ror1x64_128 rotates each 128 bit lane of a 256 bit vector // right by 64 bits. // -// Some random thoughts about macros and inline functions, the pros and -// cons, when to use them, etc: -// -// Macros are very convenient and efficient for statement functions. -// Macro args are passed by value and modifications are seen by the caller. -// Macros should not generally call regular functions unless it is for a -// special purpose such overloading a function name. -// Statement function macros that return a value should not end in ";" -// Statement function macros that return a value and don't modify input args -// may be used in function arguments and expressions. -// Macro args used in expressions should be protected ex: (x)+1 -// Macros force inlining, function inlining can be overridden by the compiler. -// Inline functions are preferred when multiple statements or local variables -// are needed. -// The compiler can't do any syntax checking or type checking of args making -// macros difficult to debug. -// Although it is technically posssible to access the callers data without -// they being passed as arguments it is good practice to always define -// arguments even if they have the same name. 
-// -// General guidelines for inline functions: -// -// Inline functions should not have loops, it defeats the purpose of inlining. -// Inline functions should be short, the benefit is lost and the memory cost -// increases if the function is referenced often. -// Inline functions may call other functions, inlined or not. It is convenient -// for wrapper functions whether or not the wrapped function is itself inlined. -// Care should be taken when unrolling loops that contain calls to inlined -// functions that may be large. -// Large code blocks used only once may use function inlining to -// improve high level code readability without the penalty of function -// overhead. -// -// A major restructuring is taking place shifting the focus from pointers -// to registers. Previously pointer casting used memory to provide transparency -// leaving it up to the compiler to manage everything and it does a very good -// job. The focus has shifted to register arguments for more control -// over the actual instructions assuming the data is in a register and the -// the compiler just needs to manage the registers. -// -// Rather than use pointers to provide type transparency -// specific instructions are used to access specific data as specific types. -// Previously pointers were cast and the compiler was left to find a way -// to get the data from wherever it happened to be to the correct registers. -// -// The utilities defined here make use features like register aliasing -// to optimize operations. Many operations have specialized versions as -// well as more generic versions. It is preferable to use a specialized -// version whenever possible a sthey can take advantage of certain -// optimizations not available to the generic version. Specically the generic -// version usually has a second argument used is some extra calculations. 
-// -/////////////////////////////////////////////////////// +// Vector constants +// +// Vector constants are a big problem because they technically don't exist. +// All vectors used as constants either reside in memory or must be genererated +// at run time at significant cost. The cost of generating a constant +// increases non-linearly with the number of vector elements. A 4 element +// vector costs between 7 and 11 clocks to generate, an 8 element vector +// is 15-25 clocks. There are also additional clock due to data dependency +// stalls. +// +// Vector constants are often used as control indexes for permute, blend, etc, +// where generating the index can be over 90% of the operation. This is +// where the problem occurs. An instruction that only requires one to 3 +// clocks needs may times more just to build the index argument. +// +// There is very little a programmer can do to avoid the worst case scenarios. +// Smaller integers can be merged to form 64 bit integers, and vectors with +// repeated elements can be generated more efficiently but they have limited +// benefit and limited application. +// +// If a vector constant is to be used repeatedly it is better to define a local +// variable to generate the constant only once. +// +// If a sequence of constants is to be used it can be more efficient to +// use arithmetic with already existing constants to generate new ones. +// +// ex: const __m512i one = m512_one_64; +// const __m512i two = _mm512_add_epi64( one, one ); +// +////////////////////////////////////////////////////////////////////////// #include -#include #include -#include #include +#include -// Various types and overlays -#include "simd-utils/simd-types.h" +#if defined(__arm__) || defined(__aarch64__) +#include "sse2neon.h" +#else +#include +#endif // 64 and 128 bit integers. 
#include "simd-utils/simd-int.h" @@ -175,7 +154,6 @@ // 64 bit vectors #include "simd-utils/simd-64.h" -//#include "simd-utils/intrlv-mmx.h" #if defined(__SSE2__) @@ -189,18 +167,21 @@ #if defined(__AVX2__) +// Utilities that require AVX2 are defined in simd-256.h. + // Skylake-X has all these -#if defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && \ + defined(__AVX512BW__) // 512 bit vectors #include "simd-utils/simd-512.h" -#endif // MMX -#endif // SSE2 -#endif // AVX -#endif // AVX2 -#endif // AVX512 +#endif // AVX512 +#endif // AVX2 +#endif // AVX +#endif // SSE2 +#endif // MMX #include "simd-utils/intrlv.h" -#endif // SIMD_UTILS_H__ +#endif // SIMD_UTILS_H__ diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index d359a87..5fc5cc5 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -1,45 +1,11 @@ #if !defined(INTERLEAVE_H__) #define INTERLEAVE_H__ 1 -// philosophical discussion -// -// transitions: -// -// int32 <-> int64 -// uint64_t = (uint64_t)int32_lo | ( (uint64_t)int32_hi << 32 ) -// Efficient transition and post processing, 32 bit granularity is lost. -// Not pratical. -// -// int32 <-> m64 -// More complex, 32 bit granularity maintained, limited number of mmx regs. -// int32 <-> int64 <-> m64 might be more efficient. -// -// int32 <-> m128 -// Expensive, current implementation. -// -// int32 <-> m256 -// Very expensive multi stage, current implementation. -// -// int64/m64 <-> m128 -// Efficient, agnostic to native element size. Common. -// -// m128 <-> m256 -// Expensive for a single instruction, unavoidable. Common. -// -// Multi stage options -// -// int32 <-> int64 -> m128 -// More efficient than insert32, granularity maintained. Common. -// -// int64 <-> m128 -> m256 -// Unavoidable, reasonably efficient. Common -// -// int32 <-> int64 -> m128 -> m256 -// Seems inevitable, most efficient despite number of stages. Common. 
-// -// It seems the best approach is to avoid transitions and use the native type -// of the data: 64 & 32 bit use integer, 128 bit use m128i. +////////////////////////////////////////////////////////////////////////// // +// Utilities to interleave and deinterleave multiple data for parallel +// processing using SIMD. Utilities are grouped by data size. +// //////////////////////////////// // @@ -48,7 +14,7 @@ // 2x32 static inline void intrlv_2x32( void *dst, const void *src0, - const void *src1, int bit_len ) + const void *src1, const int bit_len ) { uint32_t *d = (uint32_t*)dst;; const uint32_t *s0 = (const uint32_t*)src0; @@ -75,7 +41,7 @@ static inline void intrlv_2x32( void *dst, const void *src0, } static inline void dintrlv_2x32( void *dst0, void *dst1, - const void *src, int bit_len ) + const void *src, const int bit_len ) { uint32_t *d0 = (uint32_t*)dst0; uint32_t *d1 = (uint32_t*)dst1; @@ -119,9 +85,106 @@ static inline void extr_lane_2x32( void *dst, const void *src, } // 4x32 +/* +static inline void intrlv_4x32( void *dst, const void *src0, + const void *src1, const void *src2, const void *src3, int bit_len ) +{ + __m64 *d = (__m64*)dst; + const __m64 *s0 = (const __m64*)src0; + const __m64 *s1 = (const __m64*)src1; + const __m64 *s2 = (const __m64*)src2; + const __m64 *s3 = (const __m64*)src3; + + d[ 0] = _mm_unpacklo_pi32( s0[0], s1[0] ); + d[ 1] = _mm_unpacklo_pi32( s2[0], s3[0] ); + d[ 2] = _mm_unpackhi_pi32( s0[0], s1[0] ); + d[ 3] = _mm_unpackhi_pi32( s2[0], s3[0] ); + + d[ 4] = _mm_unpacklo_pi32( s0[1], s1[1] ); + d[ 5] = _mm_unpacklo_pi32( s2[1], s3[1] ); + d[ 6] = _mm_unpackhi_pi32( s0[1], s1[1] ); + d[ 7] = _mm_unpackhi_pi32( s2[1], s3[1] ); + + d[ 8] = _mm_unpacklo_pi32( s0[2], s1[2] ); + d[ 9] = _mm_unpacklo_pi32( s2[2], s3[2] ); + d[10] = _mm_unpackhi_pi32( s0[2], s1[2] ); + d[11] = _mm_unpackhi_pi32( s2[2], s3[2] ); + + d[12] = _mm_unpacklo_pi32( s0[3], s1[3] ); + d[13] = _mm_unpacklo_pi32( s2[3], s3[3] ); + d[14] = _mm_unpackhi_pi32( 
s0[3], s1[3] ); + d[15] = _mm_unpackhi_pi32( s2[3], s3[3] ); + + if ( bit_len <= 256 ) return; + + d[16] = _mm_unpacklo_pi32( s0[4], s1[4] ); + d[17] = _mm_unpacklo_pi32( s2[4], s3[4] ); + d[18] = _mm_unpackhi_pi32( s0[4], s1[4] ); + d[19] = _mm_unpackhi_pi32( s2[4], s3[4] ); + + d[20] = _mm_unpacklo_pi32( s0[5], s1[5] ); + d[21] = _mm_unpacklo_pi32( s2[5], s3[5] ); + d[22] = _mm_unpackhi_pi32( s0[5], s1[5] ); + d[23] = _mm_unpackhi_pi32( s2[5], s3[5] ); + + d[24] = _mm_unpacklo_pi32( s0[6], s1[6] ); + d[25] = _mm_unpacklo_pi32( s2[6], s3[6] ); + d[26] = _mm_unpackhi_pi32( s0[6], s1[6] ); + d[27] = _mm_unpackhi_pi32( s2[6], s3[6] ); + + d[28] = _mm_unpacklo_pi32( s0[7], s1[7] ); + d[29] = _mm_unpacklo_pi32( s2[7], s3[7] ); + d[30] = _mm_unpackhi_pi32( s0[7], s1[7] ); + d[31] = _mm_unpackhi_pi32( s2[7], s3[7] ); + + if ( bit_len <= 512 ) return; + + d[32] = _mm_unpacklo_pi32( s0[8], s1[8] ); + d[33] = _mm_unpacklo_pi32( s2[8], s3[8] ); + d[34] = _mm_unpackhi_pi32( s0[8], s1[8] ); + d[35] = _mm_unpackhi_pi32( s2[8], s3[8] ); + + d[36] = _mm_unpacklo_pi32( s0[9], s1[9] ); + d[37] = _mm_unpacklo_pi32( s2[9], s3[9] ); + d[38] = _mm_unpackhi_pi32( s0[9], s1[9] ); + d[39] = _mm_unpackhi_pi32( s2[9], s3[9] ); + + if ( bit_len <= 640 ) return; + + d[40] = _mm_unpacklo_pi32( s0[10], s1[10] ); + d[41] = _mm_unpacklo_pi32( s2[10], s3[10] ); + d[42] = _mm_unpackhi_pi32( s0[10], s1[10] ); + d[43] = _mm_unpackhi_pi32( s2[10], s3[10] ); + + d[44] = _mm_unpacklo_pi32( s0[11], s1[11] ); + d[45] = _mm_unpacklo_pi32( s2[11], s3[11] ); + d[46] = _mm_unpackhi_pi32( s0[11], s1[11] ); + d[47] = _mm_unpackhi_pi32( s2[11], s3[11] ); + + d[48] = _mm_unpacklo_pi32( s0[12], s1[12] ); + d[49] = _mm_unpacklo_pi32( s2[12], s3[12] ); + d[50] = _mm_unpackhi_pi32( s0[12], s1[12] ); + d[51] = _mm_unpackhi_pi32( s2[12], s3[12] ); + + d[52] = _mm_unpacklo_pi32( s0[13], s1[13] ); + d[53] = _mm_unpacklo_pi32( s2[13], s3[13] ); + d[54] = _mm_unpackhi_pi32( s0[13], s1[13] ); + d[55] = _mm_unpackhi_pi32( 
s2[13], s3[13] ); + + d[56] = _mm_unpacklo_pi32( s0[14], s1[14] ); + d[57] = _mm_unpacklo_pi32( s2[14], s3[14] ); + d[58] = _mm_unpackhi_pi32( s0[14], s1[14] ); + d[59] = _mm_unpackhi_pi32( s2[14], s3[14] ); + + d[60] = _mm_unpacklo_pi32( s0[15], s1[15] ); + d[61] = _mm_unpacklo_pi32( s2[15], s3[15] ); + d[62] = _mm_unpackhi_pi32( s0[15], s1[15] ); + d[63] = _mm_unpackhi_pi32( s2[15], s3[15] ); +} +*/ static inline void intrlv_4x32( void *dst, const void *src0, const void *src1, - const void *src2, const void *src3, int bit_len ) + const void *src2, const void *src3, const int bit_len ) { uint32_t *d = (uint32_t*)dst; const uint32_t *s0 = (const uint32_t*)src0; @@ -165,6 +228,58 @@ static inline void intrlv_4x32( void *dst, const void *src0, const void *src1, d[124] = s0[31]; d[125] = s1[31]; d[126] = s2[31]; d[127] = s3[31]; } +/* +static inline void intrlv_4x32_512( void *dst, const void *src0, + const void *src1, const void *src2, const void *src3 ) +{ + __m64 *d = (__m64*)dst; + const __m64 *s0 = (const __m64*)src0; + const __m64 *s1 = (const __m64*)src1; + const __m64 *s2 = (const __m64*)src2; + const __m64 *s3 = (const __m64*)src3; + + d[ 0] = _mm_unpacklo_pi32( s0[0], s1[0] ); + d[ 1] = _mm_unpacklo_pi32( s2[0], s3[0] ); + d[ 2] = _mm_unpackhi_pi32( s0[0], s1[0] ); + d[ 3] = _mm_unpackhi_pi32( s2[0], s3[0] ); + + d[ 4] = _mm_unpacklo_pi32( s0[1], s1[1] ); + d[ 5] = _mm_unpacklo_pi32( s2[1], s3[1] ); + d[ 6] = _mm_unpackhi_pi32( s0[1], s1[1] ); + d[ 7] = _mm_unpackhi_pi32( s2[1], s3[1] ); + + d[ 8] = _mm_unpacklo_pi32( s0[2], s1[2] ); + d[ 9] = _mm_unpacklo_pi32( s2[2], s3[2] ); + d[10] = _mm_unpackhi_pi32( s0[2], s1[2] ); + d[11] = _mm_unpackhi_pi32( s2[2], s3[2] ); + + d[12] = _mm_unpacklo_pi32( s0[3], s1[3] ); + d[13] = _mm_unpacklo_pi32( s2[3], s3[3] ); + d[14] = _mm_unpackhi_pi32( s0[3], s1[3] ); + d[15] = _mm_unpackhi_pi32( s2[3], s3[3] ); + + d[16] = _mm_unpacklo_pi32( s0[4], s1[4] ); + d[17] = _mm_unpacklo_pi32( s2[4], s3[4] ); + d[18] = 
_mm_unpackhi_pi32( s0[4], s1[4] ); + d[19] = _mm_unpackhi_pi32( s2[4], s3[4] ); + + d[20] = _mm_unpacklo_pi32( s0[5], s1[5] ); + d[21] = _mm_unpacklo_pi32( s2[5], s3[5] ); + d[22] = _mm_unpackhi_pi32( s0[5], s1[5] ); + d[23] = _mm_unpackhi_pi32( s2[5], s3[5] ); + + d[24] = _mm_unpacklo_pi32( s0[6], s1[6] ); + d[25] = _mm_unpacklo_pi32( s2[6], s3[6] ); + d[26] = _mm_unpackhi_pi32( s0[6], s1[6] ); + d[27] = _mm_unpackhi_pi32( s2[6], s3[6] ); + + d[28] = _mm_unpacklo_pi32( s0[7], s1[7] ); + d[29] = _mm_unpacklo_pi32( s2[7], s3[7] ); + d[30] = _mm_unpackhi_pi32( s0[7], s1[7] ); + d[31] = _mm_unpackhi_pi32( s2[7], s3[7] ); +} +*/ + static inline void intrlv_4x32_512( void *dst, const void *src0, const void *src1, const void *src2, const void *src3 ) { @@ -191,8 +306,105 @@ static inline void intrlv_4x32_512( void *dst, const void *src0, d[ 60] = s0[15]; d[ 61] = s1[15]; d[ 62] = s2[15]; d[ 63] = s3[15]; } +/* static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2, void *dst3, const void *src, int bit_len ) +{ + __m64 *d0 = (__m64*)dst0; + __m64 *d1 = (__m64*)dst1; + __m64 *d2 = (__m64*)dst2; + __m64 *d3 = (__m64*)dst3; + const __m64 *s = (const __m64*)src; + d0[0] = _mm_unpacklo_pi32( s[ 0], s[ 2] ); + d1[0] = _mm_unpackhi_pi32( s[ 0], s[ 2] ); + d2[0] = _mm_unpacklo_pi32( s[ 1], s[ 3] ); + d3[0] = _mm_unpackhi_pi32( s[ 1], s[ 3] ); + + d0[1] = _mm_unpacklo_pi32( s[ 4], s[ 6] ); + d1[1] = _mm_unpackhi_pi32( s[ 4], s[ 6] ); + d2[1] = _mm_unpacklo_pi32( s[ 5], s[ 7] ); + d3[1] = _mm_unpackhi_pi32( s[ 5], s[ 7] ); + + d0[2] = _mm_unpacklo_pi32( s[ 8], s[10] ); + d1[2] = _mm_unpackhi_pi32( s[ 8], s[10] ); + d2[2] = _mm_unpacklo_pi32( s[ 9], s[11] ); + d3[2] = _mm_unpackhi_pi32( s[ 9], s[11] ); + + d0[3] = _mm_unpacklo_pi32( s[12], s[14] ); + d1[3] = _mm_unpackhi_pi32( s[12], s[14] ); + d2[3] = _mm_unpacklo_pi32( s[13], s[15] ); + d3[3] = _mm_unpackhi_pi32( s[13], s[15] ); + + if ( bit_len <= 256 ) return; + + d0[4] = _mm_unpacklo_pi32( s[16], s[18] ); + d1[4] = 
_mm_unpackhi_pi32( s[16], s[18] ); + d2[4] = _mm_unpacklo_pi32( s[17], s[19] ); + d3[4] = _mm_unpackhi_pi32( s[17], s[19] ); + + d0[5] = _mm_unpacklo_pi32( s[20], s[22] ); + d1[5] = _mm_unpackhi_pi32( s[20], s[22] ); + d2[5] = _mm_unpacklo_pi32( s[21], s[23] ); + d3[5] = _mm_unpackhi_pi32( s[21], s[23] ); + + d0[6] = _mm_unpacklo_pi32( s[24], s[26] ); + d1[6] = _mm_unpackhi_pi32( s[24], s[26] ); + d2[6] = _mm_unpacklo_pi32( s[25], s[27] ); + d3[6] = _mm_unpackhi_pi32( s[25], s[27] ); + + d0[7] = _mm_unpacklo_pi32( s[28], s[30] ); + d1[7] = _mm_unpackhi_pi32( s[28], s[30] ); + d2[7] = _mm_unpacklo_pi32( s[29], s[31] ); + d3[7] = _mm_unpackhi_pi32( s[29], s[31] ); + + if ( bit_len <= 512 ) return; + + d0[8] = _mm_unpacklo_pi32( s[32], s[34] ); + d1[8] = _mm_unpackhi_pi32( s[32], s[34] ); + d2[8] = _mm_unpacklo_pi32( s[33], s[35] ); + d3[8] = _mm_unpackhi_pi32( s[33], s[35] ); + + d0[9] = _mm_unpacklo_pi32( s[36], s[38] ); + d1[9] = _mm_unpackhi_pi32( s[36], s[38] ); + d2[9] = _mm_unpacklo_pi32( s[37], s[39] ); + d3[9] = _mm_unpackhi_pi32( s[37], s[39] ); + + if ( bit_len <= 640 ) return; + + d0[10] = _mm_unpacklo_pi32( s[40], s[42] ); + d1[10] = _mm_unpackhi_pi32( s[40], s[42] ); + d2[10] = _mm_unpacklo_pi32( s[41], s[43] ); + d3[10] = _mm_unpackhi_pi32( s[41], s[43] ); + + d0[11] = _mm_unpacklo_pi32( s[44], s[46] ); + d1[11] = _mm_unpackhi_pi32( s[44], s[46] ); + d2[11] = _mm_unpacklo_pi32( s[45], s[47] ); + d3[11] = _mm_unpackhi_pi32( s[45], s[47] ); + + d0[12] = _mm_unpacklo_pi32( s[48], s[50] ); + d1[12] = _mm_unpackhi_pi32( s[48], s[50] ); + d2[12] = _mm_unpacklo_pi32( s[49], s[51] ); + d3[12] = _mm_unpackhi_pi32( s[49], s[51] ); + + d0[13] = _mm_unpacklo_pi32( s[52], s[54] ); + d1[13] = _mm_unpackhi_pi32( s[52], s[54] ); + d2[13] = _mm_unpacklo_pi32( s[53], s[55] ); + d3[13] = _mm_unpackhi_pi32( s[53], s[55] ); + + d0[14] = _mm_unpacklo_pi32( s[56], s[58] ); + d1[14] = _mm_unpackhi_pi32( s[56], s[58] ); + d2[14] = _mm_unpacklo_pi32( s[57], s[59] ); + d3[14] = 
_mm_unpackhi_pi32( s[57], s[59] ); + + d0[15] = _mm_unpacklo_pi32( s[60], s[62] ); + d1[15] = _mm_unpackhi_pi32( s[60], s[62] ); + d2[15] = _mm_unpacklo_pi32( s[61], s[62] ); + d3[15] = _mm_unpackhi_pi32( s[61], s[62] ); +} +*/ + +static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2, + void *dst3, const void *src, const int bit_len ) { uint32_t *d0 = (uint32_t*)dst0; uint32_t *d1 = (uint32_t*)dst1; @@ -236,6 +448,54 @@ static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2, d0[31] = s[124]; d1[31] = s[125]; d2[31] = s[126]; d3[31] = s[127]; } +/* +static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2, + void *dst3, const void *src ) +{ + __m64 *d0 = (__m64*)dst0; + __m64 *d1 = (__m64*)dst1; + __m64 *d2 = (__m64*)dst2; + __m64 *d3 = (__m64*)dst3; + const __m64 *s = (const __m64*)src; + + d0[0] = _mm_unpacklo_pi32( s[ 0], s[ 2] ); + d1[0] = _mm_unpackhi_pi32( s[ 0], s[ 2] ); + d2[0] = _mm_unpacklo_pi32( s[ 1], s[ 3] ); + d3[0] = _mm_unpackhi_pi32( s[ 1], s[ 3] ); + d0[1] = _mm_unpacklo_pi32( s[ 4], s[ 6] ); + d1[1] = _mm_unpackhi_pi32( s[ 4], s[ 6] ); + d2[1] = _mm_unpacklo_pi32( s[ 5], s[ 7] ); + d3[1] = _mm_unpackhi_pi32( s[ 5], s[ 7] ); + + d0[2] = _mm_unpacklo_pi32( s[ 8], s[10] ); + d1[2] = _mm_unpackhi_pi32( s[ 8], s[10] ); + d2[2] = _mm_unpacklo_pi32( s[ 9], s[11] ); + d3[2] = _mm_unpackhi_pi32( s[ 9], s[11] ); + d0[3] = _mm_unpacklo_pi32( s[12], s[14] ); + d1[3] = _mm_unpackhi_pi32( s[12], s[14] ); + d2[3] = _mm_unpacklo_pi32( s[13], s[15] ); + d3[3] = _mm_unpackhi_pi32( s[13], s[15] ); + + d0[4] = _mm_unpacklo_pi32( s[16], s[18] ); + d1[4] = _mm_unpackhi_pi32( s[16], s[18] ); + d2[4] = _mm_unpacklo_pi32( s[17], s[19] ); + d3[4] = _mm_unpackhi_pi32( s[17], s[19] ); + d0[5] = _mm_unpacklo_pi32( s[20], s[22] ); + d1[5] = _mm_unpackhi_pi32( s[20], s[22] ); + d2[5] = _mm_unpacklo_pi32( s[21], s[23] ); + d3[5] = _mm_unpackhi_pi32( s[21], s[23] ); + + d0[6] = _mm_unpacklo_pi32( s[24], s[26] ); + d1[6] = 
_mm_unpackhi_pi32( s[24], s[26] ); + d2[6] = _mm_unpacklo_pi32( s[25], s[27] ); + d3[6] = _mm_unpackhi_pi32( s[25], s[27] ); + d0[7] = _mm_unpacklo_pi32( s[28], s[30] ); + d1[7] = _mm_unpackhi_pi32( s[28], s[30] ); + d2[7] = _mm_unpacklo_pi32( s[29], s[31] ); + d3[7] = _mm_unpackhi_pi32( s[29], s[31] ); +} +*/ + static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2, void *dst3, const void *src ) { @@ -262,28 +522,27 @@ static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2, d0[15] = s[ 60]; d1[15] = s[ 61]; d2[15] = s[ 62]; d3[15] = s[ 63]; } -#undef DLEAVE_4x32 static inline void extr_lane_4x32( void *d, const void *s, - const int lane, const int bit_len ) -{ - ((uint32_t*)d)[ 0] = ((uint32_t*)s)[ lane ]; - ((uint32_t*)d)[ 1] = ((uint32_t*)s)[ lane+ 4 ]; - ((uint32_t*)d)[ 2] = ((uint32_t*)s)[ lane+ 8 ]; - ((uint32_t*)d)[ 3] = ((uint32_t*)s)[ lane+12 ]; - ((uint32_t*)d)[ 4] = ((uint32_t*)s)[ lane+16 ]; - ((uint32_t*)d)[ 5] = ((uint32_t*)s)[ lane+20 ]; - ((uint32_t*)d)[ 6] = ((uint32_t*)s)[ lane+24 ]; - ((uint32_t*)d)[ 7] = ((uint32_t*)s)[ lane+28 ]; + const int lane, const int bit_len ) +{ + ((uint32_t*)d)[ 0] = ((const uint32_t*)s)[ lane ]; + ((uint32_t*)d)[ 1] = ((const uint32_t*)s)[ lane+ 4 ]; + ((uint32_t*)d)[ 2] = ((const uint32_t*)s)[ lane+ 8 ]; + ((uint32_t*)d)[ 3] = ((const uint32_t*)s)[ lane+12 ]; + ((uint32_t*)d)[ 4] = ((const uint32_t*)s)[ lane+16 ]; + ((uint32_t*)d)[ 5] = ((const uint32_t*)s)[ lane+20 ]; + ((uint32_t*)d)[ 6] = ((const uint32_t*)s)[ lane+24 ]; + ((uint32_t*)d)[ 7] = ((const uint32_t*)s)[ lane+28 ]; if ( bit_len <= 256 ) return; - ((uint32_t*)d)[ 8] = ((uint32_t*)s)[ lane+32 ]; - ((uint32_t*)d)[ 9] = ((uint32_t*)s)[ lane+36 ]; - ((uint32_t*)d)[10] = ((uint32_t*)s)[ lane+40 ]; - ((uint32_t*)d)[11] = ((uint32_t*)s)[ lane+44 ]; - ((uint32_t*)d)[12] = ((uint32_t*)s)[ lane+48 ]; - ((uint32_t*)d)[13] = ((uint32_t*)s)[ lane+52 ]; - ((uint32_t*)d)[14] = ((uint32_t*)s)[ lane+56 ]; - ((uint32_t*)d)[15] = 
((uint32_t*)s)[ lane+60 ]; + ((uint32_t*)d)[ 8] = ((const uint32_t*)s)[ lane+32 ]; + ((uint32_t*)d)[ 9] = ((const uint32_t*)s)[ lane+36 ]; + ((uint32_t*)d)[10] = ((const uint32_t*)s)[ lane+40 ]; + ((uint32_t*)d)[11] = ((const uint32_t*)s)[ lane+44 ]; + ((uint32_t*)d)[12] = ((const uint32_t*)s)[ lane+48 ]; + ((uint32_t*)d)[13] = ((const uint32_t*)s)[ lane+52 ]; + ((uint32_t*)d)[14] = ((const uint32_t*)s)[ lane+56 ]; + ((uint32_t*)d)[15] = ((const uint32_t*)s)[ lane+60 ]; } @@ -291,7 +550,7 @@ static inline void extr_lane_4x32( void *d, const void *s, // Still used by decred due to odd data size: 180 bytes // bit_len must be multiple of 32 static inline void mm128_intrlv_4x32x( void *dst, void *src0, void *src1, - void *src2, void *src3, int bit_len ) + void *src2, void *src3, const int bit_len ) { uint32_t *d = (uint32_t*)dst; uint32_t *s0 = (uint32_t*)src0; @@ -308,35 +567,46 @@ static inline void mm128_intrlv_4x32x( void *dst, void *src0, void *src1, } } -static inline void mm128_bswap32_intrlv80_4x32( void *d, void *src ) -{ - __m128i sx = mm128_bswap_32( casti_m128i( src,0 ) ); - __m128i sy = mm128_bswap_32( casti_m128i( src,1 ) ); - casti_m128i( d, 0 ) = _mm_shuffle_epi32( sx, 0x00 ); - casti_m128i( d, 1 ) = _mm_shuffle_epi32( sx, 0x55 ); - casti_m128i( d, 2 ) = _mm_shuffle_epi32( sx, 0xaa ); - casti_m128i( d, 3 ) = _mm_shuffle_epi32( sx, 0xff ); - sx = mm128_bswap_32( casti_m128i( src,2 ) ); - casti_m128i( d, 4 ) = _mm_shuffle_epi32( sy, 0x00 ); - casti_m128i( d, 5 ) = _mm_shuffle_epi32( sy, 0x55 ); - casti_m128i( d, 6 ) = _mm_shuffle_epi32( sy, 0xaa ); - casti_m128i( d, 7 ) = _mm_shuffle_epi32( sy, 0xff ); - sy = mm128_bswap_32( casti_m128i( src,3 ) ); - casti_m128i( d, 8 ) = _mm_shuffle_epi32( sx, 0x00 ); - casti_m128i( d, 9 ) = _mm_shuffle_epi32( sx, 0x55 ); - casti_m128i( d,10 ) = _mm_shuffle_epi32( sx, 0xaa ); - casti_m128i( d,11 ) = _mm_shuffle_epi32( sx, 0xff ); - sx = mm128_bswap_32( casti_m128i( src,4 ) ); - casti_m128i( d,12 ) = _mm_shuffle_epi32( 
sy, 0x00 ); - casti_m128i( d,13 ) = _mm_shuffle_epi32( sy, 0x55 ); - casti_m128i( d,14 ) = _mm_shuffle_epi32( sy, 0xaa ); - casti_m128i( d,15 ) = _mm_shuffle_epi32( sy, 0xff ); - casti_m128i( d,16 ) = _mm_shuffle_epi32( sx, 0x00 ); - casti_m128i( d,17 ) = _mm_shuffle_epi32( sx, 0x55 ); - casti_m128i( d,18 ) = _mm_shuffle_epi32( sx, 0xaa ); - casti_m128i( d,19 ) = _mm_shuffle_epi32( sx, 0xff ); +#if defined(__SSSE3__) + +static inline void mm128_bswap32_80( void *d, void *s ) +{ + __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); + casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), bswap_shuf ); + casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), bswap_shuf ); + casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), bswap_shuf ); + casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), bswap_shuf ); + casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), bswap_shuf ); } +#else + +static inline void mm128_bswap32_80( void *d, void *s ) +{ + ( (uint32_t*)d )[ 0] = bswap_32( ( (uint32_t*)s )[ 0] ); + ( (uint32_t*)d )[ 1] = bswap_32( ( (uint32_t*)s )[ 1] ); + ( (uint32_t*)d )[ 2] = bswap_32( ( (uint32_t*)s )[ 2] ); + ( (uint32_t*)d )[ 3] = bswap_32( ( (uint32_t*)s )[ 3] ); + ( (uint32_t*)d )[ 4] = bswap_32( ( (uint32_t*)s )[ 4] ); + ( (uint32_t*)d )[ 5] = bswap_32( ( (uint32_t*)s )[ 5] ); + ( (uint32_t*)d )[ 6] = bswap_32( ( (uint32_t*)s )[ 6] ); + ( (uint32_t*)d )[ 7] = bswap_32( ( (uint32_t*)s )[ 7] ); + ( (uint32_t*)d )[ 8] = bswap_32( ( (uint32_t*)s )[ 8] ); + ( (uint32_t*)d )[ 9] = bswap_32( ( (uint32_t*)s )[ 9] ); + ( (uint32_t*)d )[10] = bswap_32( ( (uint32_t*)s )[10] ); + ( (uint32_t*)d )[11] = bswap_32( ( (uint32_t*)s )[11] ); + ( (uint32_t*)d )[12] = bswap_32( ( (uint32_t*)s )[12] ); + ( (uint32_t*)d )[13] = bswap_32( ( (uint32_t*)s )[13] ); + ( (uint32_t*)d )[14] = bswap_32( ( (uint32_t*)s )[14] ); + ( (uint32_t*)d )[15] = bswap_32( ( (uint32_t*)s )[15] ); + ( (uint32_t*)d )[16] = bswap_32( ( 
(uint32_t*)s )[16] ); + ( (uint32_t*)d )[17] = bswap_32( ( (uint32_t*)s )[17] ); + ( (uint32_t*)d )[18] = bswap_32( ( (uint32_t*)s )[18] ); + ( (uint32_t*)d )[19] = bswap_32( ( (uint32_t*)s )[19] ); +} + +#endif + // 8x32 #define ILEAVE_8x32( i ) do \ @@ -352,9 +622,17 @@ static inline void mm128_bswap32_intrlv80_4x32( void *d, void *src ) d[7] = *( (const uint32_t*)(s7) +(i) ); \ } while(0) +static inline void intrlv_8x32b( void *dst, const void *s0, const void *s1, + const void *s2, const void *s3, const void *s4, const void *s5, + const void *s6, const void *s7, const int bit_len ) +{ + for ( int i = 0; i < bit_len/32; i++ ) + ILEAVE_8x32( i ); +} + static inline void intrlv_8x32( void *dst, const void *s0, const void *s1, const void *s2, const void *s3, const void *s4, const void *s5, - const void *s6, const void *s7, int bit_len ) + const void *s6, const void *s7, const int bit_len ) { ILEAVE_8x32( 0 ); ILEAVE_8x32( 1 ); ILEAVE_8x32( 2 ); ILEAVE_8x32( 3 ); @@ -406,8 +684,17 @@ static inline void intrlv_8x32_512( void *dst, const void *s0, const void *s1, *( (uint32_t*)(d7) +(i) ) = s[7]; \ } while(0) +static inline void dintrlv_8x32b( void *d0, void *d1, void *d2, void *d3, + void *d4, void *d5, void *d6, void *d7, const void *src, + const int bit_len ) +{ + for ( int i = 0; i < bit_len/32; i++ ) + DLEAVE_8x32( i ); +} + static inline void dintrlv_8x32( void *d0, void *d1, void *d2, void *d3, - void *d4, void *d5, void *d6, void *d7, const void *src, int bit_len ) + void *d4, void *d5, void *d6, void *d7, const void *src, + const int bit_len ) { DLEAVE_8x32( 0 ); DLEAVE_8x32( 1 ); DLEAVE_8x32( 2 ); DLEAVE_8x32( 3 ); @@ -448,72 +735,87 @@ static inline void dintrlv_8x32_512( void *d0, void *d1, void *d2, void *d3, static inline void extr_lane_8x32( void *d, const void *s, const int lane, const int bit_len ) { - ((uint32_t*)d)[ 0] = ((uint32_t*)s)[ lane ]; - ((uint32_t*)d)[ 1] = ((uint32_t*)s)[ lane+ 8 ]; - ((uint32_t*)d)[ 2] = ((uint32_t*)s)[ lane+ 16 ]; - 
((uint32_t*)d)[ 3] = ((uint32_t*)s)[ lane+ 24 ]; - ((uint32_t*)d)[ 4] = ((uint32_t*)s)[ lane+ 32 ]; - ((uint32_t*)d)[ 5] = ((uint32_t*)s)[ lane+ 40 ]; - ((uint32_t*)d)[ 6] = ((uint32_t*)s)[ lane+ 48 ]; - ((uint32_t*)d)[ 7] = ((uint32_t*)s)[ lane+ 56 ]; + ((uint32_t*)d)[ 0] = ((const uint32_t*)s)[ lane ]; + ((uint32_t*)d)[ 1] = ((const uint32_t*)s)[ lane+ 8 ]; + ((uint32_t*)d)[ 2] = ((const uint32_t*)s)[ lane+ 16 ]; + ((uint32_t*)d)[ 3] = ((const uint32_t*)s)[ lane+ 24 ]; + ((uint32_t*)d)[ 4] = ((const uint32_t*)s)[ lane+ 32 ]; + ((uint32_t*)d)[ 5] = ((const uint32_t*)s)[ lane+ 40 ]; + ((uint32_t*)d)[ 6] = ((const uint32_t*)s)[ lane+ 48 ]; + ((uint32_t*)d)[ 7] = ((const uint32_t*)s)[ lane+ 56 ]; if ( bit_len <= 256 ) return; - ((uint32_t*)d)[ 8] = ((uint32_t*)s)[ lane+ 64 ]; - ((uint32_t*)d)[ 9] = ((uint32_t*)s)[ lane+ 72 ]; - ((uint32_t*)d)[10] = ((uint32_t*)s)[ lane+ 80 ]; - ((uint32_t*)d)[11] = ((uint32_t*)s)[ lane+ 88 ]; - ((uint32_t*)d)[12] = ((uint32_t*)s)[ lane+ 96 ]; - ((uint32_t*)d)[13] = ((uint32_t*)s)[ lane+104 ]; - ((uint32_t*)d)[14] = ((uint32_t*)s)[ lane+112 ]; - ((uint32_t*)d)[15] = ((uint32_t*)s)[ lane+120 ]; + ((uint32_t*)d)[ 8] = ((const uint32_t*)s)[ lane+ 64 ]; + ((uint32_t*)d)[ 9] = ((const uint32_t*)s)[ lane+ 72 ]; + ((uint32_t*)d)[10] = ((const uint32_t*)s)[ lane+ 80 ]; + ((uint32_t*)d)[11] = ((const uint32_t*)s)[ lane+ 88 ]; + ((uint32_t*)d)[12] = ((const uint32_t*)s)[ lane+ 96 ]; + ((uint32_t*)d)[13] = ((const uint32_t*)s)[ lane+104 ]; + ((uint32_t*)d)[14] = ((const uint32_t*)s)[ lane+112 ]; + ((uint32_t*)d)[15] = ((const uint32_t*)s)[ lane+120 ]; } #if defined(__AVX2__) -// There a alignment problems with the source buffer on Wwindows, -// can't use 256 bit bswap. 
- -static inline void mm256_bswap32_intrlv80_8x32( void *d, void *src ) -{ - __m256i s0 = mm256_bswap_32( casti_m256i( src,0 ) ); - __m256i s1 = mm256_bswap_32( casti_m256i( src,1 ) ); - __m128i s2 = mm128_bswap_32( casti_m128i( src,4 ) ); - const __m256i zero = m256_zero; - const __m256i one = m256_one_32; - const __m256i two = _mm256_add_epi32( one, one ); - const __m256i three = _mm256_add_epi32( two, one ); - const __m256i four = _mm256_add_epi32( two, two ); - - casti_m256i( d, 0 ) = _mm256_permutevar8x32_epi32( s0, zero ); - casti_m256i( d, 1 ) = _mm256_permutevar8x32_epi32( s0, one ); - casti_m256i( d, 2 ) = _mm256_permutevar8x32_epi32( s0, two ); - casti_m256i( d, 3 ) = _mm256_permutevar8x32_epi32( s0, three ); - casti_m256i( d, 4 ) = _mm256_permutevar8x32_epi32( s0, four ); - casti_m256i( d, 5 ) = _mm256_permutevar8x32_epi32( s0, - _mm256_add_epi32( four, one ) ); - casti_m256i( d, 6 ) = _mm256_permutevar8x32_epi32( s0, - _mm256_add_epi32( four, two ) ); - casti_m256i( d, 7 ) = _mm256_permutevar8x32_epi32( s0, - _mm256_add_epi32( four, three ) ); - casti_m256i( d, 8 ) = _mm256_permutevar8x32_epi32( s1, zero ); - casti_m256i( d, 9 ) = _mm256_permutevar8x32_epi32( s1, one ); - casti_m256i( d,10 ) = _mm256_permutevar8x32_epi32( s1, two ); - casti_m256i( d,11 ) = _mm256_permutevar8x32_epi32( s1, three ); - casti_m256i( d,12 ) = _mm256_permutevar8x32_epi32( s1, four ); - casti_m256i( d,13 ) = _mm256_permutevar8x32_epi32( s1, - _mm256_add_epi32( four, one ) ); - casti_m256i( d,14 ) = _mm256_permutevar8x32_epi32( s1, - _mm256_add_epi32( four, two ) ); - casti_m256i( d,15 ) = _mm256_permutevar8x32_epi32( s1, - _mm256_add_epi32( four, three ) ); - casti_m256i( d,16 ) = _mm256_permutevar8x32_epi32( - _mm256_castsi128_si256( s2 ), zero ); - casti_m256i( d,17 ) = _mm256_permutevar8x32_epi32( - _mm256_castsi128_si256( s2 ), one ); - casti_m256i( d,18 ) = _mm256_permutevar8x32_epi32( - _mm256_castsi128_si256( s2 ), two ); - casti_m256i( d,19 ) = 
_mm256_permutevar8x32_epi32( - _mm256_castsi128_si256( s2 ), three ); -} +static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) +{ + __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); + __m128i s0 = casti_m128i( src,0 ); + __m128i s1 = casti_m128i( src,1 ); + __m128i s2 = casti_m128i( src,2 ); + __m128i s3 = casti_m128i( src,3 ); + __m128i s4 = casti_m128i( src,4 ); + + s0 = _mm_shuffle_epi8( s0, bswap_shuf ); + s1 = _mm_shuffle_epi8( s1, bswap_shuf ); + s2 = _mm_shuffle_epi8( s2, bswap_shuf ); + s3 = _mm_shuffle_epi8( s3, bswap_shuf ); + s4 = _mm_shuffle_epi8( s4, bswap_shuf ); + + casti_m128i( d, 0 ) = + casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0 , 0x00 ); + casti_m128i( d, 2 ) = + casti_m128i( d, 3 ) = _mm_shuffle_epi32( s0 , 0x55 ); + casti_m128i( d, 4 ) = + casti_m128i( d, 5 ) = _mm_shuffle_epi32( s0 , 0xaa ); + casti_m128i( d, 6 ) = + casti_m128i( d, 7 ) = _mm_shuffle_epi32( s0 , 0xff ); + + casti_m128i( d, 8 ) = + casti_m128i( d, 9 ) = _mm_shuffle_epi32( s1 , 0x00 ); + casti_m128i( d,10 ) = + casti_m128i( d,11 ) = _mm_shuffle_epi32( s1 , 0x55 ); + casti_m128i( d,12 ) = + casti_m128i( d,13 ) = _mm_shuffle_epi32( s1 , 0xaa ); + casti_m128i( d,14 ) = + casti_m128i( d,15 ) = _mm_shuffle_epi32( s1 , 0xff ); + + casti_m128i( d,16 ) = + casti_m128i( d,17 ) = _mm_shuffle_epi32( s2 , 0x00 ); + casti_m128i( d,18 ) = + casti_m128i( d,19 ) = _mm_shuffle_epi32( s2 , 0x55 ); + casti_m128i( d,20 ) = + casti_m128i( d,21 ) = _mm_shuffle_epi32( s2 , 0xaa ); + casti_m128i( d,22 ) = + casti_m128i( d,23 ) = _mm_shuffle_epi32( s2 , 0xff ); + + casti_m128i( d,24 ) = + casti_m128i( d,25 ) = _mm_shuffle_epi32( s3 , 0x00 ); + casti_m128i( d,26 ) = + casti_m128i( d,27 ) = _mm_shuffle_epi32( s3 , 0x55 ); + casti_m128i( d,28 ) = + casti_m128i( d,29 ) = _mm_shuffle_epi32( s3 , 0xaa ); + casti_m128i( d,30 ) = + casti_m128i( d,31 ) = _mm_shuffle_epi32( s3 , 0xff ); + + casti_m128i( d,32 ) = + casti_m128i( d,33 ) = _mm_shuffle_epi32( s4 , 
0x00 ); + casti_m128i( d,34 ) = + casti_m128i( d,35 ) = _mm_shuffle_epi32( s4 , 0x55 ); + casti_m128i( d,36 ) = + casti_m128i( d,37 ) = _mm_shuffle_epi32( s4 , 0xaa ); + casti_m128i( d,38 ) = + casti_m128i( d,39 ) = _mm_shuffle_epi32( s4 , 0xff ); +} #endif // AVX2 @@ -544,7 +846,7 @@ static inline void intrlv_16x32( void *dst, const void *s00, const void *s01, const void *s02, const void *s03, const void *s04, const void *s05, const void *s06, const void *s07, const void *s08, const void *s09, const void *s10, const void *s11, const void *s12, - const void *s13, const void *s14, const void *s15, int bit_len ) + const void *s13, const void *s14, const void *s15, const int bit_len ) { ILEAVE_16x32( 0 ); ILEAVE_16x32( 1 ); ILEAVE_16x32( 2 ); ILEAVE_16x32( 3 ); @@ -597,7 +899,7 @@ static inline void intrlv_16x32_512( void *dst, const void *s00, *( (uint32_t*)(d06) +(i) ) = s[ 6]; \ *( (uint32_t*)(d07) +(i) ) = s[ 7]; \ *( (uint32_t*)(d08) +(i) ) = s[ 8]; \ - *( (uint32_t*)(d09) +(i) ) = s[ 0]; \ + *( (uint32_t*)(d09) +(i) ) = s[ 9]; \ *( (uint32_t*)(d10) +(i) ) = s[10]; \ *( (uint32_t*)(d11) +(i) ) = s[11]; \ *( (uint32_t*)(d12) +(i) ) = s[12]; \ @@ -609,7 +911,7 @@ static inline void intrlv_16x32_512( void *dst, const void *s00, static inline void dintrlv_16x32( void *d00, void *d01, void *d02, void *d03, void *d04, void *d05, void *d06, void *d07, void *d08, void *d09, void *d10, void *d11, void *d12, void *d13, void *d14, void *d15, - const void *src, int bit_len ) + const void *src, const int bit_len ) { DLEAVE_16x32( 0 ); DLEAVE_16x32( 1 ); DLEAVE_16x32( 2 ); DLEAVE_16x32( 3 ); @@ -650,74 +952,128 @@ static inline void dintrlv_16x32_512( void *d00, void *d01, void *d02, #undef DLEAVE_16x32 static inline void extr_lane_16x32( void *d, const void *s, - const int lane, const int bit_len ) + const int lane, const int bit_len ) { - ((uint32_t*)d)[ 0] = ((uint32_t*)s)[ lane ]; - ((uint32_t*)d)[ 1] = ((uint32_t*)s)[ lane+16 ]; - ((uint32_t*)d)[ 2] = ((uint32_t*)s)[ 
lane+32 ]; - ((uint32_t*)d)[ 3] = ((uint32_t*)s)[ lane+48 ]; - ((uint32_t*)d)[ 4] = ((uint32_t*)s)[ lane+64 ]; - ((uint32_t*)d)[ 5] = ((uint32_t*)s)[ lane+80 ]; - ((uint32_t*)d)[ 6] = ((uint32_t*)s)[ lane+96 ]; - ((uint32_t*)d)[ 7] = ((uint32_t*)s)[ lane+112 ]; + ((uint32_t*)d)[ 0] = ((const uint32_t*)s)[ lane ]; + ((uint32_t*)d)[ 1] = ((const uint32_t*)s)[ lane+16 ]; + ((uint32_t*)d)[ 2] = ((const uint32_t*)s)[ lane+32 ]; + ((uint32_t*)d)[ 3] = ((const uint32_t*)s)[ lane+48 ]; + ((uint32_t*)d)[ 4] = ((const uint32_t*)s)[ lane+64 ]; + ((uint32_t*)d)[ 5] = ((const uint32_t*)s)[ lane+80 ]; + ((uint32_t*)d)[ 6] = ((const uint32_t*)s)[ lane+96 ]; + ((uint32_t*)d)[ 7] = ((const uint32_t*)s)[ lane+112 ]; if ( bit_len <= 256 ) return; - ((uint32_t*)d)[ 8] = ((uint32_t*)s)[ lane+128 ]; - ((uint32_t*)d)[ 9] = ((uint32_t*)s)[ lane+144 ]; - ((uint32_t*)d)[10] = ((uint32_t*)s)[ lane+160 ]; - ((uint32_t*)d)[11] = ((uint32_t*)s)[ lane+176 ]; - ((uint32_t*)d)[12] = ((uint32_t*)s)[ lane+192 ]; - ((uint32_t*)d)[13] = ((uint32_t*)s)[ lane+208 ]; - ((uint32_t*)d)[14] = ((uint32_t*)s)[ lane+224 ]; - ((uint32_t*)d)[15] = ((uint32_t*)s)[ lane+240 ]; + ((uint32_t*)d)[ 8] = ((const uint32_t*)s)[ lane+128 ]; + ((uint32_t*)d)[ 9] = ((const uint32_t*)s)[ lane+144 ]; + ((uint32_t*)d)[10] = ((const uint32_t*)s)[ lane+160 ]; + ((uint32_t*)d)[11] = ((const uint32_t*)s)[ lane+176 ]; + ((uint32_t*)d)[12] = ((const uint32_t*)s)[ lane+192 ]; + ((uint32_t*)d)[13] = ((const uint32_t*)s)[ lane+208 ]; + ((uint32_t*)d)[14] = ((const uint32_t*)s)[ lane+224 ]; + ((uint32_t*)d)[15] = ((const uint32_t*)s)[ lane+240 ]; } #if defined(__AVX512F__) && defined(__AVX512VL__) -static inline void mm512_bswap32_intrlv80_16x32( void *d, void *src ) -{ - __m512i s0 = mm512_bswap_32( casti_m512i( src, 0 ) ); - __m128i s1 = mm128_bswap_32( casti_m128i( src, 4 ) ); - const __m512i zero = m512_zero; - const __m512i one = m512_one_32; - const __m512i two = _mm512_add_epi32( one, one ); - const __m512i three = 
_mm512_add_epi32( two, one ); - const __m512i four = _mm512_add_epi32( two, two ); - const __m512i eight = _mm512_add_epi32( four, four ); - const __m512i eleven = _mm512_add_epi32( eight, three ); - - casti_m512i( d, 0 ) = _mm512_permutexvar_epi32( s0, zero ); - casti_m512i( d, 1 ) = _mm512_permutexvar_epi32( s0, one ); - casti_m512i( d, 2 ) = _mm512_permutexvar_epi32( s0, two ); - casti_m512i( d, 3 ) = _mm512_permutexvar_epi32( s0, three ); - casti_m512i( d, 4 ) = _mm512_permutexvar_epi32( s0, four ); - casti_m512i( d, 5 ) = _mm512_permutexvar_epi32( s0, - _mm512_add_epi32( four, one ) ); - casti_m512i( d, 6 ) = _mm512_permutexvar_epi32( s0, - _mm512_add_epi32( four, two ) ); - casti_m512i( d, 7 ) = _mm512_permutexvar_epi32( s0, - _mm512_add_epi32( four, three ) ); - casti_m512i( d, 8 ) = _mm512_permutexvar_epi32( s0, eight ); - casti_m512i( d, 9 ) = _mm512_permutexvar_epi32( s0, - _mm512_add_epi32( eight, one ) ); - casti_m512i( d,10 ) = _mm512_permutexvar_epi32( s0, - _mm512_add_epi32( eight, two ) ); - casti_m512i( d,11 ) = _mm512_permutexvar_epi32( s0, eleven ); - casti_m512i( d,12 ) = _mm512_permutexvar_epi32( s0, - _mm512_add_epi32( eleven, one ) ); - casti_m512i( d,13 ) = _mm512_permutexvar_epi32( s0, - _mm512_add_epi32( eleven, two ) ); - casti_m512i( d,14 ) = _mm512_permutexvar_epi32( s0, - _mm512_add_epi32( eleven, three ) ); - casti_m512i( d,15 ) = _mm512_permutexvar_epi32( s0, - _mm512_add_epi32( eleven, four ) ); - casti_m512i( d,16 ) = _mm512_permutexvar_epi32( - _mm512_castsi128_si512( s1 ), zero ); - casti_m512i( d,17 ) = _mm512_permutexvar_epi32( - _mm512_castsi128_si512( s1 ), one ); - casti_m512i( d,18 ) = _mm512_permutexvar_epi32( - _mm512_castsi128_si512( s1 ), two ); - casti_m512i( d,19 ) = _mm512_permutexvar_epi32( - _mm512_castsi128_si512( s1 ), three ); +static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src ) +{ + __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); + __m128i s0 = casti_m128i( 
src,0 ); + __m128i s1 = casti_m128i( src,1 ); + __m128i s2 = casti_m128i( src,2 ); + __m128i s3 = casti_m128i( src,3 ); + __m128i s4 = casti_m128i( src,4 ); + + s0 = _mm_shuffle_epi8( s0, bswap_shuf ); + s1 = _mm_shuffle_epi8( s1, bswap_shuf ); + s2 = _mm_shuffle_epi8( s2, bswap_shuf ); + s3 = _mm_shuffle_epi8( s3, bswap_shuf ); + s4 = _mm_shuffle_epi8( s4, bswap_shuf ); + + casti_m128i( d, 0 ) = + casti_m128i( d, 1 ) = + casti_m128i( d, 2 ) = + casti_m128i( d, 3 ) = _mm_shuffle_epi32( s0 , 0x00 ); + casti_m128i( d, 4 ) = + casti_m128i( d, 5 ) = + casti_m128i( d, 6 ) = + casti_m128i( d, 7 ) = _mm_shuffle_epi32( s0 , 0x55 ); + casti_m128i( d, 8 ) = + casti_m128i( d, 9 ) = + casti_m128i( d,10 ) = + casti_m128i( d,11 ) = _mm_shuffle_epi32( s0 , 0xaa ); + casti_m128i( d,12 ) = + casti_m128i( d,13 ) = + casti_m128i( d,14 ) = + casti_m128i( d,15 ) = _mm_shuffle_epi32( s0 , 0xff ); + + casti_m128i( d,16 ) = + casti_m128i( d,17 ) = + casti_m128i( d,18 ) = + casti_m128i( d,19 ) = _mm_shuffle_epi32( s1 , 0x00 ); + casti_m128i( d,20 ) = + casti_m128i( d,21 ) = + casti_m128i( d,22 ) = + casti_m128i( d,23 ) = _mm_shuffle_epi32( s1 , 0x55 ); + casti_m128i( d,24 ) = + casti_m128i( d,25 ) = + casti_m128i( d,26 ) = + casti_m128i( d,27 ) = _mm_shuffle_epi32( s1 , 0xaa ); + casti_m128i( d,28 ) = + casti_m128i( d,29 ) = + casti_m128i( d,30 ) = + casti_m128i( d,31 ) = _mm_shuffle_epi32( s1 , 0xff ); + + casti_m128i( d,32 ) = + casti_m128i( d,33 ) = + casti_m128i( d,34 ) = + casti_m128i( d,35 ) = _mm_shuffle_epi32( s2 , 0x00 ); + casti_m128i( d,36 ) = + casti_m128i( d,37 ) = + casti_m128i( d,38 ) = + casti_m128i( d,39 ) = _mm_shuffle_epi32( s2 , 0x55 ); + casti_m128i( d,40 ) = + casti_m128i( d,41 ) = + casti_m128i( d,42 ) = + casti_m128i( d,43 ) = _mm_shuffle_epi32( s2 , 0xaa ); + casti_m128i( d,44 ) = + casti_m128i( d,45 ) = + casti_m128i( d,46 ) = + casti_m128i( d,47 ) = _mm_shuffle_epi32( s2 , 0xff ); + + casti_m128i( d,48 ) = + casti_m128i( d,49 ) = + casti_m128i( d,50 ) = + 
casti_m128i( d,51 ) = _mm_shuffle_epi32( s3 , 0x00 ); + casti_m128i( d,52 ) = + casti_m128i( d,53 ) = + casti_m128i( d,54 ) = + casti_m128i( d,55 ) = _mm_shuffle_epi32( s3 , 0x55 ); + casti_m128i( d,56 ) = + casti_m128i( d,57 ) = + casti_m128i( d,58 ) = + casti_m128i( d,59 ) = _mm_shuffle_epi32( s3 , 0xaa ); + casti_m128i( d,60 ) = + casti_m128i( d,61 ) = + casti_m128i( d,62 ) = + casti_m128i( d,63 ) = _mm_shuffle_epi32( s3 , 0xff ); + + casti_m128i( d,64 ) = + casti_m128i( d,65 ) = + casti_m128i( d,66 ) = + casti_m128i( d,67 ) = _mm_shuffle_epi32( s4 , 0x00 ); + casti_m128i( d,68 ) = + casti_m128i( d,69 ) = + casti_m128i( d,70 ) = + casti_m128i( d,71 ) = _mm_shuffle_epi32( s4 , 0x55 ); + casti_m128i( d,72 ) = + casti_m128i( d,73 ) = + casti_m128i( d,74 ) = + casti_m128i( d,75 ) = _mm_shuffle_epi32( s4 , 0xaa ); + casti_m128i( d,76 ) = + casti_m128i( d,77 ) = + casti_m128i( d,78 ) = + casti_m128i( d,79 ) = _mm_shuffle_epi32( s4 , 0xff ); } #endif // AVX512 @@ -729,7 +1085,7 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, void *src ) // 2x64 (SSE2) static inline void intrlv_2x64( void *dst, const void *src0, - const void *src1, int bit_len ) + const void *src1, const int bit_len ) { uint64_t *d = (uint64_t*)dst;; const uint64_t *s0 = (const uint64_t*)src0; @@ -748,7 +1104,7 @@ static inline void intrlv_2x64( void *dst, const void *src0, } static inline void dintrlv_2x64( void *dst0, void *dst1, - const void *src, int bit_len ) + const void *src, const int bit_len ) { uint64_t *d0 = (uint64_t*)dst0; uint64_t *d1 = (uint64_t*)dst1; @@ -770,13 +1126,60 @@ static inline void dintrlv_2x64( void *dst0, void *dst1, // 4x64 (AVX2) static inline void intrlv_4x64( void *dst, const void *src0, - const void *src1, const void *src2, const void *src3, int bit_len ) + const void *src1, const void *src2, const void *src3, + const int bit_len ) +{ + __m128i *d = (__m128i*)dst; + const __m128i *s0 = (const __m128i*)src0; + const __m128i *s1 = (const __m128i*)src1; + 
const __m128i *s2 = (const __m128i*)src2; + const __m128i *s3 = (const __m128i*)src3; + d[ 0] = _mm_unpacklo_epi64( s0[0], s1[0] ); + d[ 1] = _mm_unpacklo_epi64( s2[0], s3[0] ); + d[ 2] = _mm_unpackhi_epi64( s0[0], s1[0] ); + d[ 3] = _mm_unpackhi_epi64( s2[0], s3[0] ); + d[ 4] = _mm_unpacklo_epi64( s0[1], s1[1] ); + d[ 5] = _mm_unpacklo_epi64( s2[1], s3[1] ); + d[ 6] = _mm_unpackhi_epi64( s0[1], s1[1] ); + d[ 7] = _mm_unpackhi_epi64( s2[1], s3[1] ); + if ( bit_len <= 256 ) return; + d[ 8] = _mm_unpacklo_epi64( s0[2], s1[2] ); + d[ 9] = _mm_unpacklo_epi64( s2[2], s3[2] ); + d[10] = _mm_unpackhi_epi64( s0[2], s1[2] ); + d[11] = _mm_unpackhi_epi64( s2[2], s3[2] ); + d[12] = _mm_unpacklo_epi64( s0[3], s1[3] ); + d[13] = _mm_unpacklo_epi64( s2[3], s3[3] ); + d[14] = _mm_unpackhi_epi64( s0[3], s1[3] ); + d[15] = _mm_unpackhi_epi64( s2[3], s3[3] ); + if ( bit_len <= 512 ) return; + d[16] = _mm_unpacklo_epi64( s0[4], s1[4] ); + d[17] = _mm_unpacklo_epi64( s2[4], s3[4] ); + d[18] = _mm_unpackhi_epi64( s0[4], s1[4] ); + d[19] = _mm_unpackhi_epi64( s2[4], s3[4] ); + if ( bit_len <= 640 ) return; + d[20] = _mm_unpacklo_epi64( s0[5], s1[5] ); + d[21] = _mm_unpacklo_epi64( s2[5], s3[5] ); + d[22] = _mm_unpackhi_epi64( s0[5], s1[5] ); + d[23] = _mm_unpackhi_epi64( s2[5], s3[5] ); + d[24] = _mm_unpacklo_epi64( s0[6], s1[6] ); + d[25] = _mm_unpacklo_epi64( s2[6], s3[6] ); + d[26] = _mm_unpackhi_epi64( s0[6], s1[6] ); + d[27] = _mm_unpackhi_epi64( s2[6], s3[6] ); + d[28] = _mm_unpacklo_epi64( s0[7], s1[7] ); + d[29] = _mm_unpacklo_epi64( s2[7], s3[7] ); + d[30] = _mm_unpackhi_epi64( s0[7], s1[7] ); + d[31] = _mm_unpackhi_epi64( s2[7], s3[7] ); +} + +/* +static inline void intrlv_4x64( void *dst, void *src0, + void *src1, void *src2, void *src3, int bit_len ) { uint64_t *d = (uint64_t*)dst; - const uint64_t *s0 = (const uint64_t*)src0; - const uint64_t *s1 = (const uint64_t*)src1; - const uint64_t *s2 = (const uint64_t*)src2; - const uint64_t *s3 = (const uint64_t*)src3; + uint64_t 
*s0 = (uint64_t*)src0; + uint64_t *s1 = (uint64_t*)src1; + uint64_t *s2 = (uint64_t*)src2; + uint64_t *s3 = (uint64_t*)src3; d[ 0] = s0[ 0]; d[ 1] = s1[ 0]; d[ 2] = s2[ 0]; d[ 3] = s3[ 0]; d[ 4] = s0[ 1]; d[ 5] = s1[ 1]; d[ 6] = s2[ 1]; d[ 7] = s3[ 1]; d[ 8] = s0[ 2]; d[ 9] = s1[ 2]; d[ 10] = s2[ 2]; d[ 11] = s3[ 2]; @@ -797,7 +1200,35 @@ static inline void intrlv_4x64( void *dst, const void *src0, d[ 56] = s0[14]; d[ 57] = s1[14]; d[ 58] = s2[14]; d[ 59] = s3[14]; d[ 60] = s0[15]; d[ 61] = s1[15]; d[ 62] = s2[15]; d[ 63] = s3[15]; } +*/ +static inline void intrlv_4x64_512( void *dst, const void *src0, + const void *src1, const void *src2, const void *src3 ) +{ + __m128i *d = (__m128i*)dst; + const __m128i *s0 = (const __m128i*)src0; + const __m128i *s1 = (const __m128i*)src1; + const __m128i *s2 = (const __m128i*)src2; + const __m128i *s3 = (const __m128i*)src3; + d[ 0] = _mm_unpacklo_epi64( s0[0], s1[0] ); + d[ 1] = _mm_unpacklo_epi64( s2[0], s3[0] ); + d[ 2] = _mm_unpackhi_epi64( s0[0], s1[0] ); + d[ 3] = _mm_unpackhi_epi64( s2[0], s3[0] ); + d[ 4] = _mm_unpacklo_epi64( s0[1], s1[1] ); + d[ 5] = _mm_unpacklo_epi64( s2[1], s3[1] ); + d[ 6] = _mm_unpackhi_epi64( s0[1], s1[1] ); + d[ 7] = _mm_unpackhi_epi64( s2[1], s3[1] ); + d[ 8] = _mm_unpacklo_epi64( s0[2], s1[2] ); + d[ 9] = _mm_unpacklo_epi64( s2[2], s3[2] ); + d[10] = _mm_unpackhi_epi64( s0[2], s1[2] ); + d[11] = _mm_unpackhi_epi64( s2[2], s3[2] ); + d[12] = _mm_unpacklo_epi64( s0[3], s1[3] ); + d[13] = _mm_unpacklo_epi64( s2[3], s3[3] ); + d[14] = _mm_unpackhi_epi64( s0[3], s1[3] ); + d[15] = _mm_unpackhi_epi64( s2[3], s3[3] ); +} + +/* static inline void intrlv_4x64_512( void *dst, const void *src0, const void *src1, const void *src2, const void *src3 ) { @@ -815,7 +1246,55 @@ static inline void intrlv_4x64_512( void *dst, const void *src0, d[ 24] = s0[ 6]; d[ 25] = s1[ 6]; d[ 26] = s2[ 6]; d[ 27] = s3[ 6]; d[ 28] = s0[ 7]; d[ 29] = s1[ 7]; d[ 30] = s2[ 7]; d[ 31] = s3[ 7]; } +*/ + +static inline void 
dintrlv_4x64( void *dst0, void *dst1, void *dst2, + void *dst3, const void *src, const int bit_len ) +{ + __m128i *d0 = (__m128i*)dst0; + __m128i *d1 = (__m128i*)dst1; + __m128i *d2 = (__m128i*)dst2; + __m128i *d3 = (__m128i*)dst3; + const __m128i *s = (const __m128i*)src; + d0[0] = _mm_unpacklo_epi64( s[ 0], s[ 2] ); + d1[0] = _mm_unpackhi_epi64( s[ 0], s[ 2] ); + d2[0] = _mm_unpacklo_epi64( s[ 1], s[ 3] ); + d3[0] = _mm_unpackhi_epi64( s[ 1], s[ 3] ); + d0[1] = _mm_unpacklo_epi64( s[ 4], s[ 6] ); + d1[1] = _mm_unpackhi_epi64( s[ 4], s[ 6] ); + d2[1] = _mm_unpacklo_epi64( s[ 5], s[ 7] ); + d3[1] = _mm_unpackhi_epi64( s[ 5], s[ 7] ); + if ( bit_len <= 256 ) return; + d0[2] = _mm_unpacklo_epi64( s[ 8], s[10] ); + d1[2] = _mm_unpackhi_epi64( s[ 8], s[10] ); + d2[2] = _mm_unpacklo_epi64( s[ 9], s[11] ); + d3[2] = _mm_unpackhi_epi64( s[ 9], s[11] ); + d0[3] = _mm_unpacklo_epi64( s[12], s[14] ); + d1[3] = _mm_unpackhi_epi64( s[12], s[14] ); + d2[3] = _mm_unpacklo_epi64( s[13], s[15] ); + d3[3] = _mm_unpackhi_epi64( s[13], s[15] ); + if ( bit_len <= 512 ) return; + d0[4] = _mm_unpacklo_epi64( s[16], s[18] ); + d1[4] = _mm_unpackhi_epi64( s[16], s[18] ); + d2[4] = _mm_unpacklo_epi64( s[17], s[19] ); + d3[4] = _mm_unpackhi_epi64( s[17], s[19] ); + if ( bit_len <= 640 ) return; + d0[5] = _mm_unpacklo_epi64( s[20], s[22] ); + d1[5] = _mm_unpackhi_epi64( s[20], s[22] ); + d2[5] = _mm_unpacklo_epi64( s[21], s[23] ); + d3[5] = _mm_unpackhi_epi64( s[21], s[23] ); + d0[6] = _mm_unpacklo_epi64( s[24], s[26] ); + d1[6] = _mm_unpackhi_epi64( s[24], s[26] ); + d2[6] = _mm_unpacklo_epi64( s[25], s[27] ); + d3[6] = _mm_unpackhi_epi64( s[25], s[27] ); + d0[7] = _mm_unpacklo_epi64( s[28], s[30] ); + d1[7] = _mm_unpackhi_epi64( s[28], s[30] ); + d2[7] = _mm_unpacklo_epi64( s[29], s[31] ); + d3[7] = _mm_unpackhi_epi64( s[29], s[31] ); +} + +/* static inline void dintrlv_4x64( void *dst0, void *dst1, void *dst2, void *dst3, const void *src, int bit_len ) { @@ -844,7 +1323,35 @@ static 
inline void dintrlv_4x64( void *dst0, void *dst1, void *dst2, d0[14] = s[56]; d1[14] = s[57]; d2[14] = s[58]; d3[14] = s[59]; d0[15] = s[60]; d1[15] = s[61]; d2[15] = s[62]; d3[15] = s[63]; } +*/ +static inline void dintrlv_4x64_512( void *dst0, void *dst1, void *dst2, + void *dst3, const void *src ) +{ + __m128i *d0 = (__m128i*)dst0; + __m128i *d1 = (__m128i*)dst1; + __m128i *d2 = (__m128i*)dst2; + __m128i *d3 = (__m128i*)dst3; + const __m128i *s = (const __m128i*)src; + d0[0] = _mm_unpacklo_epi64( s[ 0], s[ 2] ); + d1[0] = _mm_unpackhi_epi64( s[ 0], s[ 2] ); + d2[0] = _mm_unpacklo_epi64( s[ 1], s[ 3] ); + d3[0] = _mm_unpackhi_epi64( s[ 1], s[ 3] ); + d0[1] = _mm_unpacklo_epi64( s[ 4], s[ 6] ); + d1[1] = _mm_unpackhi_epi64( s[ 4], s[ 6] ); + d2[1] = _mm_unpacklo_epi64( s[ 5], s[ 7] ); + d3[1] = _mm_unpackhi_epi64( s[ 5], s[ 7] ); + d0[2] = _mm_unpacklo_epi64( s[ 8], s[10] ); + d1[2] = _mm_unpackhi_epi64( s[ 8], s[10] ); + d2[2] = _mm_unpacklo_epi64( s[ 9], s[11] ); + d3[2] = _mm_unpackhi_epi64( s[ 9], s[11] ); + d0[3] = _mm_unpacklo_epi64( s[12], s[14] ); + d1[3] = _mm_unpackhi_epi64( s[12], s[14] ); + d2[3] = _mm_unpacklo_epi64( s[13], s[15] ); + d3[3] = _mm_unpackhi_epi64( s[13], s[15] ); +} + +/* static inline void dintrlv_4x64_512( void *dst0, void *dst1, void *dst2, void *dst3, const void *src ) { @@ -862,27 +1369,20 @@ static inline void dintrlv_4x64_512( void *dst0, void *dst1, void *dst2, d0[ 6] = s[24]; d1[ 6] = s[25]; d2[ 6] = s[26]; d3[ 6] = s[27]; d0[ 7] = s[28]; d1[ 7] = s[29]; d2[ 7] = s[30]; d3[ 7] = s[31]; } +*/ static inline void extr_lane_4x64( void *d, const void *s, const int lane, const int bit_len ) { - ((uint64_t*)d)[ 0] = ((uint64_t*)s)[ lane ]; - ((uint64_t*)d)[ 1] = ((uint64_t*)s)[ lane+ 4 ]; - ((uint64_t*)d)[ 2] = ((uint64_t*)s)[ lane+ 8 ]; - ((uint64_t*)d)[ 3] = ((uint64_t*)s)[ lane+12 ]; - ((uint64_t*)d)[ 4] = ((uint64_t*)s)[ lane+16 ]; - ((uint64_t*)d)[ 5] = ((uint64_t*)s)[ lane+20 ]; - ((uint64_t*)d)[ 6] = ((uint64_t*)s)[ lane+24 ]; 
- ((uint64_t*)d)[ 7] = ((uint64_t*)s)[ lane+28 ]; + ((uint64_t*)d)[ 0] = ((const uint64_t*)s)[ lane ]; + ((uint64_t*)d)[ 1] = ((const uint64_t*)s)[ lane+ 4 ]; + ((uint64_t*)d)[ 2] = ((const uint64_t*)s)[ lane+ 8 ]; + ((uint64_t*)d)[ 3] = ((const uint64_t*)s)[ lane+12 ]; if ( bit_len <= 256 ) return; - ((uint64_t*)d)[ 8] = ((uint64_t*)s)[ lane+32 ]; - ((uint64_t*)d)[ 9] = ((uint64_t*)s)[ lane+36 ]; - ((uint64_t*)d)[10] = ((uint64_t*)s)[ lane+40 ]; - ((uint64_t*)d)[11] = ((uint64_t*)s)[ lane+44 ]; - ((uint64_t*)d)[12] = ((uint64_t*)s)[ lane+48 ]; - ((uint64_t*)d)[13] = ((uint64_t*)s)[ lane+52 ]; - ((uint64_t*)d)[14] = ((uint64_t*)s)[ lane+56 ]; - ((uint64_t*)d)[15] = ((uint64_t*)s)[ lane+60 ]; + ((uint64_t*)d)[ 4] = ((const uint64_t*)s)[ lane+16 ]; + ((uint64_t*)d)[ 5] = ((const uint64_t*)s)[ lane+20 ]; + ((uint64_t*)d)[ 6] = ((const uint64_t*)s)[ lane+24 ]; + ((uint64_t*)d)[ 7] = ((const uint64_t*)s)[ lane+28 ]; } #if defined(__AVX2__) @@ -890,30 +1390,199 @@ static inline void extr_lane_4x64( void *d, const void *s, // There a alignment problems with the source buffer on Wwindows, // can't use 256 bit bswap. 
-static inline void mm256_bswap32_intrlv80_4x64( void *d, void *src ) +static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src ) { - __m256i s0 = mm256_bswap_32( casti_m256i( src, 0 ) ); - __m256i s1 = mm256_bswap_32( casti_m256i( src, 1 ) ); - __m128i s2 = mm128_bswap_32( casti_m128i( src, 4 ) ); + __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); + __m128i s0 = casti_m128i( src,0 ); + __m128i s1 = casti_m128i( src,1 ); + __m128i s2 = casti_m128i( src,2 ); + __m128i s3 = casti_m128i( src,3 ); + __m128i s4 = casti_m128i( src,4 ); + + s0 = _mm_shuffle_epi8( s0, bswap_shuf ); + s1 = _mm_shuffle_epi8( s1, bswap_shuf ); + s2 = _mm_shuffle_epi8( s2, bswap_shuf ); + s3 = _mm_shuffle_epi8( s3, bswap_shuf ); + s4 = _mm_shuffle_epi8( s4, bswap_shuf ); + + casti_m128i( d, 0 ) = + casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0, 0x44 ); + casti_m128i( d, 2 ) = + casti_m128i( d, 3 ) = _mm_shuffle_epi32( s0, 0xee ); + + casti_m128i( d, 4 ) = + casti_m128i( d, 5 ) = _mm_shuffle_epi32( s1, 0x44 ); + casti_m128i( d, 6 ) = + casti_m128i( d, 7 ) = _mm_shuffle_epi32( s1, 0xee ); + + casti_m128i( d, 8 ) = + casti_m128i( d, 9 ) = _mm_shuffle_epi32( s2, 0x44 ); + casti_m128i( d, 10 ) = + casti_m128i( d, 11 ) = _mm_shuffle_epi32( s2, 0xee ); + + casti_m128i( d, 12 ) = + casti_m128i( d, 13 ) = _mm_shuffle_epi32( s3, 0x44 ); + casti_m128i( d, 14 ) = + casti_m128i( d, 15 ) = _mm_shuffle_epi32( s3, 0xee ); + + casti_m128i( d, 16 ) = + casti_m128i( d, 17 ) = _mm_shuffle_epi32( s4, 0x44 ); + casti_m128i( d, 18 ) = + casti_m128i( d, 19 ) = _mm_shuffle_epi32( s4, 0xee ); - casti_m256i( d, 0 ) = _mm256_permute4x64_epi64( s0, 0x00 ); - casti_m256i( d, 1 ) = _mm256_permute4x64_epi64( s0, 0x55 ); - casti_m256i( d, 2 ) = _mm256_permute4x64_epi64( s0, 0xaa ); - casti_m256i( d, 3 ) = _mm256_permute4x64_epi64( s0, 0xff ); - casti_m256i( d, 4 ) = _mm256_permute4x64_epi64( s1, 0x00 ); - casti_m256i( d, 5 ) = _mm256_permute4x64_epi64( s1, 0x55 ); - casti_m256i( d, 6 
) = _mm256_permute4x64_epi64( s1, 0xaa ); - casti_m256i( d, 7 ) = _mm256_permute4x64_epi64( s1, 0xff ); - casti_m256i( d, 8 ) = _mm256_permute4x64_epi64( - _mm256_castsi128_si256( s2 ), 0x00 ); - casti_m256i( d, 9 ) = _mm256_permute4x64_epi64( - _mm256_castsi128_si256( s2 ), 0x55 ); } #endif // AVX2 // 8x64 (AVX512) +static inline void intrlv_8x64( void *dst, const void *src0, + const void *src1, const void *src2, const void *src3, + const void *src4, const void *src5, const void *src6, + const void *src7, const int bit_len ) +{ + __m128i *d = (__m128i*)dst; + const __m128i *s0 = (const __m128i*)src0; + const __m128i *s1 = (const __m128i*)src1; + const __m128i *s2 = (const __m128i*)src2; + const __m128i *s3 = (const __m128i*)src3; + const __m128i *s4 = (const __m128i*)src4; + const __m128i *s5 = (const __m128i*)src5; + const __m128i *s6 = (const __m128i*)src6; + const __m128i *s7 = (const __m128i*)src7; + + d[ 0] = _mm_unpacklo_epi64( s0[0], s1[0] ); + d[ 1] = _mm_unpacklo_epi64( s2[0], s3[0] ); + d[ 2] = _mm_unpacklo_epi64( s4[0], s5[0] ); + d[ 3] = _mm_unpacklo_epi64( s6[0], s7[0] ); + d[ 4] = _mm_unpackhi_epi64( s0[0], s1[0] ); + d[ 5] = _mm_unpackhi_epi64( s2[0], s3[0] ); + d[ 6] = _mm_unpackhi_epi64( s4[0], s5[0] ); + d[ 7] = _mm_unpackhi_epi64( s6[0], s7[0] ); + + d[ 8] = _mm_unpacklo_epi64( s0[1], s1[1] ); + d[ 9] = _mm_unpacklo_epi64( s2[1], s3[1] ); + d[10] = _mm_unpacklo_epi64( s4[1], s5[1] ); + d[11] = _mm_unpacklo_epi64( s6[1], s7[1] ); + d[12] = _mm_unpackhi_epi64( s0[1], s1[1] ); + d[13] = _mm_unpackhi_epi64( s2[1], s3[1] ); + d[14] = _mm_unpackhi_epi64( s4[1], s5[1] ); + d[15] = _mm_unpackhi_epi64( s6[1], s7[1] ); + + if ( bit_len <= 256 ) return; + + d[16] = _mm_unpacklo_epi64( s0[2], s1[2] ); + d[17] = _mm_unpacklo_epi64( s2[2], s3[2] ); + d[18] = _mm_unpacklo_epi64( s4[2], s5[2] ); + d[19] = _mm_unpacklo_epi64( s6[2], s7[2] ); + d[20] = _mm_unpackhi_epi64( s0[2], s1[2] ); + d[21] = _mm_unpackhi_epi64( s2[2], s3[2] ); + d[22] = _mm_unpackhi_epi64( 
s4[2], s5[2] ); + d[23] = _mm_unpackhi_epi64( s6[2], s7[2] ); + + d[24] = _mm_unpacklo_epi64( s0[3], s1[3] ); + d[25] = _mm_unpacklo_epi64( s2[3], s3[3] ); + d[26] = _mm_unpacklo_epi64( s4[3], s5[3] ); + d[27] = _mm_unpacklo_epi64( s6[3], s7[3] ); + d[28] = _mm_unpackhi_epi64( s0[3], s1[3] ); + d[29] = _mm_unpackhi_epi64( s2[3], s3[3] ); + d[30] = _mm_unpackhi_epi64( s4[3], s5[3] ); + d[31] = _mm_unpackhi_epi64( s6[3], s7[3] ); + + if ( bit_len <= 512 ) return; + + d[32] = _mm_unpacklo_epi64( s0[4], s1[4] ); + d[33] = _mm_unpacklo_epi64( s2[4], s3[4] ); + d[34] = _mm_unpacklo_epi64( s4[4], s5[4] ); + d[35] = _mm_unpacklo_epi64( s6[4], s7[4] ); + d[36] = _mm_unpackhi_epi64( s0[4], s1[4] ); + d[37] = _mm_unpackhi_epi64( s2[4], s3[4] ); + d[38] = _mm_unpackhi_epi64( s4[4], s5[4] ); + d[39] = _mm_unpackhi_epi64( s6[4], s7[4] ); + + if ( bit_len <= 640 ) return; + + d[40] = _mm_unpacklo_epi64( s0[5], s1[5] ); + d[41] = _mm_unpacklo_epi64( s2[5], s3[5] ); + d[42] = _mm_unpacklo_epi64( s4[5], s5[5] ); + d[43] = _mm_unpacklo_epi64( s6[5], s7[5] ); + d[44] = _mm_unpackhi_epi64( s0[5], s1[5] ); + d[45] = _mm_unpackhi_epi64( s2[5], s3[5] ); + d[46] = _mm_unpackhi_epi64( s4[5], s5[5] ); + d[47] = _mm_unpackhi_epi64( s6[5], s7[5] ); + + d[48] = _mm_unpacklo_epi64( s0[6], s1[6] ); + d[49] = _mm_unpacklo_epi64( s2[6], s3[6] ); + d[50] = _mm_unpacklo_epi64( s4[6], s5[6] ); + d[51] = _mm_unpacklo_epi64( s6[6], s7[6] ); + d[52] = _mm_unpackhi_epi64( s0[6], s1[6] ); + d[53] = _mm_unpackhi_epi64( s2[6], s3[6] ); + d[54] = _mm_unpackhi_epi64( s4[6], s5[6] ); + d[55] = _mm_unpackhi_epi64( s6[6], s7[6] ); + + d[56] = _mm_unpacklo_epi64( s0[7], s1[7] ); + d[57] = _mm_unpacklo_epi64( s2[7], s3[7] ); + d[58] = _mm_unpacklo_epi64( s4[7], s5[7] ); + d[59] = _mm_unpacklo_epi64( s6[7], s7[7] ); + d[60] = _mm_unpackhi_epi64( s0[7], s1[7] ); + d[61] = _mm_unpackhi_epi64( s2[7], s3[7] ); + d[62] = _mm_unpackhi_epi64( s4[7], s5[7] ); + d[63] = _mm_unpackhi_epi64( s6[7], s7[7] ); +} + +static inline 
void intrlv_8x64_512( void *dst, const void *src0, + const void *src1, const void *src2, const void *src3, + const void *src4, const void *src5, const void *src6, + const void *src7 ) +{ + __m128i *d = (__m128i*)dst; + const __m128i *s0 = (const __m128i*)src0; + const __m128i *s1 = (const __m128i*)src1; + const __m128i *s2 = (const __m128i*)src2; + const __m128i *s3 = (const __m128i*)src3; + const __m128i *s4 = (const __m128i*)src4; + const __m128i *s5 = (const __m128i*)src5; + const __m128i *s6 = (const __m128i*)src6; + const __m128i *s7 = (const __m128i*)src7; + + d[ 0] = _mm_unpacklo_epi64( s0[0], s1[0] ); + d[ 1] = _mm_unpacklo_epi64( s2[0], s3[0] ); + d[ 2] = _mm_unpacklo_epi64( s4[0], s5[0] ); + d[ 3] = _mm_unpacklo_epi64( s6[0], s7[0] ); + d[ 4] = _mm_unpackhi_epi64( s0[0], s1[0] ); + d[ 5] = _mm_unpackhi_epi64( s2[0], s3[0] ); + d[ 6] = _mm_unpackhi_epi64( s4[0], s5[0] ); + d[ 7] = _mm_unpackhi_epi64( s6[0], s7[0] ); + + d[ 8] = _mm_unpacklo_epi64( s0[1], s1[1] ); + d[ 9] = _mm_unpacklo_epi64( s2[1], s3[1] ); + d[10] = _mm_unpacklo_epi64( s4[1], s5[1] ); + d[11] = _mm_unpacklo_epi64( s6[1], s7[1] ); + d[12] = _mm_unpackhi_epi64( s0[1], s1[1] ); + d[13] = _mm_unpackhi_epi64( s2[1], s3[1] ); + d[14] = _mm_unpackhi_epi64( s4[1], s5[1] ); + d[15] = _mm_unpackhi_epi64( s6[1], s7[1] ); + + d[16] = _mm_unpacklo_epi64( s0[2], s1[2] ); + d[17] = _mm_unpacklo_epi64( s2[2], s3[2] ); + d[18] = _mm_unpacklo_epi64( s4[2], s5[2] ); + d[19] = _mm_unpacklo_epi64( s6[2], s7[2] ); + d[20] = _mm_unpackhi_epi64( s0[2], s1[2] ); + d[21] = _mm_unpackhi_epi64( s2[2], s3[2] ); + d[22] = _mm_unpackhi_epi64( s4[2], s5[2] ); + d[23] = _mm_unpackhi_epi64( s6[2], s7[2] ); + + d[24] = _mm_unpacklo_epi64( s0[3], s1[3] ); + d[25] = _mm_unpacklo_epi64( s2[3], s3[3] ); + d[26] = _mm_unpacklo_epi64( s4[3], s5[3] ); + d[27] = _mm_unpacklo_epi64( s6[3], s7[3] ); + d[28] = _mm_unpackhi_epi64( s0[3], s1[3] ); + d[29] = _mm_unpackhi_epi64( s2[3], s3[3] ); + d[30] = _mm_unpackhi_epi64( s4[3], s5[3] 
); + d[31] = _mm_unpackhi_epi64( s6[3], s7[3] ); +} + +/* #define ILEAVE_8x64( i ) do \ { \ uint64_t *d = (uint64_t*)(dst) + ( (i) << 3 ); \ @@ -945,7 +1614,154 @@ static inline void intrlv_8x64( void *dst, const void *s0, } #undef ILEAVE_8x64 +*/ + + +static inline void dintrlv_8x64( void *dst0, void *dst1, void *dst2, + void *dst3, void *dst4, void *dst5, void *dst6, void *dst7, + const void *src, const int bit_len ) +{ + __m128i *d0 = (__m128i*)dst0; + __m128i *d1 = (__m128i*)dst1; + __m128i *d2 = (__m128i*)dst2; + __m128i *d3 = (__m128i*)dst3; + __m128i *d4 = (__m128i*)dst4; + __m128i *d5 = (__m128i*)dst5; + __m128i *d6 = (__m128i*)dst6; + __m128i *d7 = (__m128i*)dst7; + const __m128i* s = (const __m128i*)src; + + d0[0] = _mm_unpacklo_epi64( s[ 0], s[ 4] ); + d1[0] = _mm_unpackhi_epi64( s[ 0], s[ 4] ); + d2[0] = _mm_unpacklo_epi64( s[ 1], s[ 5] ); + d3[0] = _mm_unpackhi_epi64( s[ 1], s[ 5] ); + d4[0] = _mm_unpacklo_epi64( s[ 2], s[ 6] ); + d5[0] = _mm_unpackhi_epi64( s[ 2], s[ 6] ); + d6[0] = _mm_unpacklo_epi64( s[ 3], s[ 7] ); + d7[0] = _mm_unpackhi_epi64( s[ 3], s[ 7] ); + + d0[1] = _mm_unpacklo_epi64( s[ 8], s[12] ); + d1[1] = _mm_unpackhi_epi64( s[ 8], s[12] ); + d2[1] = _mm_unpacklo_epi64( s[ 9], s[13] ); + d3[1] = _mm_unpackhi_epi64( s[ 9], s[13] ); + d4[1] = _mm_unpacklo_epi64( s[10], s[14] ); + d5[1] = _mm_unpackhi_epi64( s[10], s[14] ); + d6[1] = _mm_unpacklo_epi64( s[11], s[15] ); + d7[1] = _mm_unpackhi_epi64( s[11], s[15] ); + + if ( bit_len <= 256 ) return; + + d0[2] = _mm_unpacklo_epi64( s[16], s[20] ); + d1[2] = _mm_unpackhi_epi64( s[16], s[20] ); + d2[2] = _mm_unpacklo_epi64( s[17], s[21] ); + d3[2] = _mm_unpackhi_epi64( s[17], s[21] ); + d4[2] = _mm_unpacklo_epi64( s[18], s[22] ); + d5[2] = _mm_unpackhi_epi64( s[18], s[22] ); + d6[2] = _mm_unpacklo_epi64( s[19], s[23] ); + d7[2] = _mm_unpackhi_epi64( s[19], s[23] ); + + d0[3] = _mm_unpacklo_epi64( s[24], s[28] ); + d1[3] = _mm_unpackhi_epi64( s[24], s[28] ); + d2[3] = _mm_unpacklo_epi64( s[25], 
s[29] ); + d3[3] = _mm_unpackhi_epi64( s[25], s[29] ); + d4[3] = _mm_unpacklo_epi64( s[26], s[30] ); + d5[3] = _mm_unpackhi_epi64( s[26], s[30] ); + d6[3] = _mm_unpacklo_epi64( s[27], s[31] ); + d7[3] = _mm_unpackhi_epi64( s[27], s[31] ); + + if ( bit_len <= 512 ) return; + + d0[4] = _mm_unpacklo_epi64( s[32], s[36] ); + d1[4] = _mm_unpackhi_epi64( s[32], s[36] ); + d2[4] = _mm_unpacklo_epi64( s[33], s[37] ); + d3[4] = _mm_unpackhi_epi64( s[33], s[37] ); + d4[4] = _mm_unpacklo_epi64( s[34], s[38] ); + d5[4] = _mm_unpackhi_epi64( s[34], s[38] ); + d6[4] = _mm_unpacklo_epi64( s[35], s[39] ); + d7[4] = _mm_unpackhi_epi64( s[35], s[39] ); + + if ( bit_len <= 640 ) return; + + d0[5] = _mm_unpacklo_epi64( s[40], s[44] ); + d1[5] = _mm_unpackhi_epi64( s[40], s[44] ); + d2[5] = _mm_unpacklo_epi64( s[41], s[45] ); + d3[5] = _mm_unpackhi_epi64( s[41], s[45] ); + d4[5] = _mm_unpacklo_epi64( s[42], s[46] ); + d5[5] = _mm_unpackhi_epi64( s[42], s[46] ); + d6[5] = _mm_unpacklo_epi64( s[43], s[47] ); + d7[5] = _mm_unpackhi_epi64( s[43], s[47] ); + + d0[6] = _mm_unpacklo_epi64( s[48], s[52] ); + d1[6] = _mm_unpackhi_epi64( s[48], s[52] ); + d2[6] = _mm_unpacklo_epi64( s[49], s[53] ); + d3[6] = _mm_unpackhi_epi64( s[49], s[53] ); + d4[6] = _mm_unpacklo_epi64( s[50], s[54] ); + d5[6] = _mm_unpackhi_epi64( s[50], s[54] ); + d6[6] = _mm_unpacklo_epi64( s[51], s[55] ); + d7[6] = _mm_unpackhi_epi64( s[51], s[55] ); + + d0[7] = _mm_unpacklo_epi64( s[56], s[60] ); + d1[7] = _mm_unpackhi_epi64( s[56], s[60] ); + d2[7] = _mm_unpacklo_epi64( s[57], s[61] ); + d3[7] = _mm_unpackhi_epi64( s[57], s[61] ); + d4[7] = _mm_unpacklo_epi64( s[58], s[62] ); + d5[7] = _mm_unpackhi_epi64( s[58], s[62] ); + d6[7] = _mm_unpacklo_epi64( s[59], s[63] ); + d7[7] = _mm_unpackhi_epi64( s[59], s[63] ); +} + +static inline void dintrlv_8x64_512( void *dst0, void *dst1, void *dst2, + void *dst3, void *dst4, void *dst5, void *dst6, void *dst7, + const void *src ) +{ + __m128i *d0 = (__m128i*)dst0; + __m128i *d1 = 
(__m128i*)dst1; + __m128i *d2 = (__m128i*)dst2; + __m128i *d3 = (__m128i*)dst3; + __m128i *d4 = (__m128i*)dst4; + __m128i *d5 = (__m128i*)dst5; + __m128i *d6 = (__m128i*)dst6; + __m128i *d7 = (__m128i*)dst7; + const __m128i* s = (const __m128i*)src; + + d0[0] = _mm_unpacklo_epi64( s[ 0], s[ 4] ); + d1[0] = _mm_unpackhi_epi64( s[ 0], s[ 4] ); + d2[0] = _mm_unpacklo_epi64( s[ 1], s[ 5] ); + d3[0] = _mm_unpackhi_epi64( s[ 1], s[ 5] ); + d4[0] = _mm_unpacklo_epi64( s[ 2], s[ 6] ); + d5[0] = _mm_unpackhi_epi64( s[ 2], s[ 6] ); + d6[0] = _mm_unpacklo_epi64( s[ 3], s[ 7] ); + d7[0] = _mm_unpackhi_epi64( s[ 3], s[ 7] ); + + d0[1] = _mm_unpacklo_epi64( s[ 8], s[12] ); + d1[1] = _mm_unpackhi_epi64( s[ 8], s[12] ); + d2[1] = _mm_unpacklo_epi64( s[ 9], s[13] ); + d3[1] = _mm_unpackhi_epi64( s[ 9], s[13] ); + d4[1] = _mm_unpacklo_epi64( s[10], s[14] ); + d5[1] = _mm_unpackhi_epi64( s[10], s[14] ); + d6[1] = _mm_unpacklo_epi64( s[11], s[15] ); + d7[1] = _mm_unpackhi_epi64( s[11], s[15] ); + + d0[2] = _mm_unpacklo_epi64( s[16], s[20] ); + d1[2] = _mm_unpackhi_epi64( s[16], s[20] ); + d2[2] = _mm_unpacklo_epi64( s[17], s[21] ); + d3[2] = _mm_unpackhi_epi64( s[17], s[21] ); + d4[2] = _mm_unpacklo_epi64( s[18], s[22] ); + d5[2] = _mm_unpackhi_epi64( s[18], s[22] ); + d6[2] = _mm_unpacklo_epi64( s[19], s[23] ); + d7[2] = _mm_unpackhi_epi64( s[19], s[23] ); + + d0[3] = _mm_unpacklo_epi64( s[24], s[28] ); + d1[3] = _mm_unpackhi_epi64( s[24], s[28] ); + d2[3] = _mm_unpacklo_epi64( s[25], s[29] ); + d3[3] = _mm_unpackhi_epi64( s[25], s[29] ); + d4[3] = _mm_unpacklo_epi64( s[26], s[30] ); + d5[3] = _mm_unpackhi_epi64( s[26], s[30] ); + d6[3] = _mm_unpacklo_epi64( s[27], s[31] ); + d7[3] = _mm_unpackhi_epi64( s[27], s[31] ); +} +/* #define DLEAVE_8x64( i ) do \ { \ const uint64_t *s = (const uint64_t*)(src) + ( (i) << 3 ); \ @@ -976,54 +1792,83 @@ static inline void dintrlv_8x64( void *d0, void *d1, void *d2, void *d3, } #undef DLEAVE_8x64 +*/ static inline void extr_lane_8x64( void *d, 
const void *s, const int lane, const int bit_len ) { - ((uint64_t*)d)[ 0] = ((uint64_t*)s)[ lane ]; - ((uint64_t*)d)[ 1] = ((uint64_t*)s)[ lane+ 8 ]; - ((uint64_t*)d)[ 2] = ((uint64_t*)s)[ lane+ 16 ]; - ((uint64_t*)d)[ 3] = ((uint64_t*)s)[ lane+ 24 ]; - ((uint64_t*)d)[ 4] = ((uint64_t*)s)[ lane+ 32 ]; - ((uint64_t*)d)[ 5] = ((uint64_t*)s)[ lane+ 40 ]; - ((uint64_t*)d)[ 6] = ((uint64_t*)s)[ lane+ 48 ]; - ((uint64_t*)d)[ 7] = ((uint64_t*)s)[ lane+ 56 ]; + ((uint64_t*)d)[ 0] = ((const uint64_t*)s)[ lane ]; + ((uint64_t*)d)[ 1] = ((const uint64_t*)s)[ lane+ 8 ]; + ((uint64_t*)d)[ 2] = ((const uint64_t*)s)[ lane+ 16 ]; + ((uint64_t*)d)[ 3] = ((const uint64_t*)s)[ lane+ 24 ]; if ( bit_len <= 256 ) return; - ((uint64_t*)d)[ 8] = ((uint64_t*)s)[ lane+ 64 ]; - ((uint64_t*)d)[ 9] = ((uint64_t*)s)[ lane+ 72 ]; - ((uint64_t*)d)[10] = ((uint64_t*)s)[ lane+ 80 ]; - ((uint64_t*)d)[11] = ((uint64_t*)s)[ lane+ 88 ]; - ((uint64_t*)d)[12] = ((uint64_t*)s)[ lane+ 96 ]; - ((uint64_t*)d)[13] = ((uint64_t*)s)[ lane+104 ]; - ((uint64_t*)d)[14] = ((uint64_t*)s)[ lane+112 ]; - ((uint64_t*)d)[15] = ((uint64_t*)s)[ lane+120 ]; + ((uint64_t*)d)[ 4] = ((const uint64_t*)s)[ lane+ 32 ]; + ((uint64_t*)d)[ 5] = ((const uint64_t*)s)[ lane+ 40 ]; + ((uint64_t*)d)[ 6] = ((const uint64_t*)s)[ lane+ 48 ]; + ((uint64_t*)d)[ 7] = ((const uint64_t*)s)[ lane+ 56 ]; } #if defined(__AVX512F__) && defined(__AVX512VL__) -static inline void mm512_bswap32_intrlv80_8x64( void *dst, void *src ) -{ - __m512i *d = (__m512i*)dst; - __m512i s0 = mm512_bswap_32( casti_m512i(src, 0 ) ); - __m128i s1 = mm128_bswap_32( casti_m128i(src, 4 ) ); - const __m512i zero = m512_zero; - const __m512i one = m512_one_64; - const __m512i two = _mm512_add_epi64( one, one ); - const __m512i three = _mm512_add_epi64( two, one ); - const __m512i four = _mm512_add_epi64( two, two ); - - d[0] = _mm512_permutexvar_epi64( s0, zero ); - d[1] = _mm512_permutexvar_epi64( s0, one ); - d[2] = _mm512_permutexvar_epi64( s0, two ); - d[3] = 
_mm512_permutexvar_epi64( s0, three ); - d[4] = _mm512_permutexvar_epi64( s0, four ); - d[5] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, one ) ); - d[6] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, two ) ); - d[7] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, three ) ); - d[8] = _mm512_permutexvar_epi64( - _mm512_castsi128_si512( s1 ), zero ); - d[9] = _mm512_permutexvar_epi64( - _mm512_castsi128_si512( s1 ), one ); +static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src ) +{ + __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); + __m128i s0 = casti_m128i( src,0 ); + __m128i s1 = casti_m128i( src,1 ); + __m128i s2 = casti_m128i( src,2 ); + __m128i s3 = casti_m128i( src,3 ); + __m128i s4 = casti_m128i( src,4 ); + + s0 = _mm_shuffle_epi8( s0, bswap_shuf ); + s1 = _mm_shuffle_epi8( s1, bswap_shuf ); + s2 = _mm_shuffle_epi8( s2, bswap_shuf ); + s3 = _mm_shuffle_epi8( s3, bswap_shuf ); + s4 = _mm_shuffle_epi8( s4, bswap_shuf ); + + casti_m128i( d, 0 ) = + casti_m128i( d, 1 ) = + casti_m128i( d, 2 ) = + casti_m128i( d, 3 ) = _mm_shuffle_epi32( s0, 0x44 ); + casti_m128i( d, 4 ) = + casti_m128i( d, 5 ) = + casti_m128i( d, 6 ) = + casti_m128i( d, 7 ) = _mm_shuffle_epi32( s0, 0xee ); + + casti_m128i( d, 8 ) = + casti_m128i( d, 9 ) = + casti_m128i( d, 10 ) = + casti_m128i( d, 11 ) = _mm_shuffle_epi32( s1, 0x44 ); + casti_m128i( d, 12 ) = + casti_m128i( d, 13 ) = + casti_m128i( d, 14 ) = + casti_m128i( d, 15 ) = _mm_shuffle_epi32( s1, 0xee ); + + casti_m128i( d, 16 ) = + casti_m128i( d, 17 ) = + casti_m128i( d, 18 ) = + casti_m128i( d, 19 ) = _mm_shuffle_epi32( s2, 0x44 ); + casti_m128i( d, 20 ) = + casti_m128i( d, 21 ) = + casti_m128i( d, 22 ) = + casti_m128i( d, 23 ) = _mm_shuffle_epi32( s2, 0xee ); + + casti_m128i( d, 24 ) = + casti_m128i( d, 25 ) = + casti_m128i( d, 26 ) = + casti_m128i( d, 27 ) = _mm_shuffle_epi32( s3, 0x44 ); + casti_m128i( d, 28 ) = + casti_m128i( d, 29 ) = + casti_m128i( d, 30 
) = + casti_m128i( d, 31 ) = _mm_shuffle_epi32( s3, 0xee ); + + casti_m128i( d, 32 ) = + casti_m128i( d, 33 ) = + casti_m128i( d, 34 ) = + casti_m128i( d, 35 ) = _mm_shuffle_epi32( s4, 0x44 ); + casti_m128i( d, 36 ) = + casti_m128i( d, 37 ) = + casti_m128i( d, 38 ) = + casti_m128i( d, 39 ) = _mm_shuffle_epi32( s4, 0xee ); } #endif // AVX512 @@ -1035,7 +1880,7 @@ static inline void mm512_bswap32_intrlv80_8x64( void *dst, void *src ) // 2x128 (AVX2) static inline void intrlv_2x128( void *dst, const void *src0, - const void *src1, int bit_len ) + const void *src1, const int bit_len ) { __m128i *d = (__m128i*)dst; const __m128i *s0 = (const __m128i*)src0; @@ -1100,7 +1945,7 @@ static inline void dintrlv_2x128_512( void *dst0, void *dst1, const void *src ) // 4x128 (AVX512) static inline void intrlv_4x128( void *dst, const void *src0, - const void *src1, const void *src2, const void *src3, int bit_len ) + const void *src1, const void *src2, const void *src3, const int bit_len ) { __m128i *d = (__m128i*)dst; const __m128i *s0 = (const __m128i*)src0; @@ -1135,7 +1980,7 @@ static inline void intrlv_4x128_512( void *dst, const void *src0, } static inline void dintrlv_4x128( void *dst0, void *dst1, void *dst2, - void *dst3, const void *src, int bit_len ) + void *dst3, const void *src, const int bit_len ) { __m128i *d0 = (__m128i*)dst0; __m128i *d1 = (__m128i*)dst1; @@ -1170,6 +2015,70 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2, } +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +static inline void mm512_bswap32_intrlv80_4x128( void *d, void *src ) +{ + __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); + __m128i s0 = casti_m128i( src,0 ); + __m128i s1 = casti_m128i( src,1 ); + __m128i s2 = casti_m128i( src,2 ); + __m128i s3 = casti_m128i( src,3 ); + __m128i s4 = casti_m128i( src,4 ); + + s0 = _mm_shuffle_epi8( s0, bswap_shuf ); + s1 = _mm_shuffle_epi8( s1, 
bswap_shuf ); + s2 = _mm_shuffle_epi8( s2, bswap_shuf ); + s3 = _mm_shuffle_epi8( s3, bswap_shuf ); + s4 = _mm_shuffle_epi8( s4, bswap_shuf ); + + casti_m512i( d, 0 ) = _mm512_broadcast_i64x2( s0 ); + casti_m512i( d, 1 ) = _mm512_broadcast_i64x2( s1 ); + casti_m512i( d, 2 ) = _mm512_broadcast_i64x2( s2 ); + casti_m512i( d, 3 ) = _mm512_broadcast_i64x2( s3 ); + casti_m512i( d, 4 ) = _mm512_broadcast_i64x2( s4 ); +} + +#endif + +// 2x256 (AVX512) + +#if defined (__AVX__) + +static inline void intrlv_2x256( void *dst, const void *src0, + const void *src1, const int bit_len ) +{ + __m256i *d = (__m256i*)dst; + const __m256i *s0 = (const __m256i*)src0; + const __m256i *s1 = (const __m256i*)src1; + d[0] = s0[0]; d[1] = s1[0]; + if ( bit_len <= 256 ) return; + d[2] = s0[1]; d[3] = s1[1]; + if ( bit_len <= 512 ) return; + d[4] = s0[2]; + if ( bit_len <= 640 ) return; + d[5] = s1[2]; + d[6] = s0[3]; d[7] = s1[3]; +} + +// No 80 byte dintrlv +static inline void dintrlv_2x256( void *dst0, void *dst1, + const void *src, int bit_len ) +{ + __m256i *d0 = (__m256i*)dst0; + __m256i *d1 = (__m256i*)dst1; + const __m256i *s = (const __m256i*)src; + + d0[0] = s[0]; d1[0] = s[1]; + if ( bit_len <= 256 ) return; + d0[1] = s[2]; d1[1] = s[3]; + if ( bit_len <= 512 ) return; + d0[2] = s[4]; d1[2] = s[5]; + d0[3] = s[6]; d1[3] = s[7]; +} + +#endif // AVX + /////////////////////////// // // Re-intereleaving @@ -1189,8 +2098,8 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2, // Convert 4x64 byte (256 bit) vectors to 4x32 (128 bit) vectors for AVX // bit_len must be multiple of 64 -static inline void rintrlv_4x64_4x32( void *dst, void *src, - int bit_len ) +static inline void rintrlv_4x64_4x32( void *dst, const void *src, + const int bit_len ) { RLEAVE_4x64_4x32( 0 ); RLEAVE_4x64_4x32( 8 ); RLEAVE_4x64_4x32( 16 ); RLEAVE_4x64_4x32( 24 ); @@ -1199,6 +2108,7 @@ static inline void rintrlv_4x64_4x32( void *dst, void *src, RLEAVE_4x64_4x32( 48 ); RLEAVE_4x64_4x32( 56 
); if ( bit_len <= 512 ) return; RLEAVE_4x64_4x32( 64 ); RLEAVE_4x64_4x32( 72 ); + if ( bit_len <= 640 ) return; RLEAVE_4x64_4x32( 80 ); RLEAVE_4x64_4x32( 88 ); RLEAVE_4x64_4x32( 96 ); RLEAVE_4x64_4x32( 104 ); RLEAVE_4x64_4x32( 112 ); RLEAVE_4x64_4x32( 120 ); @@ -1206,9 +2116,225 @@ static inline void rintrlv_4x64_4x32( void *dst, void *src, #undef RLEAVE_4x64_4x32 +#define RLEAVE_8x64_8x32( i ) do \ +{ \ + uint32_t *d = (uint32_t*)dst + (i); \ + const uint32_t *s = (const uint32_t*)src + (i); \ + d[ 0] = s[ 0]; d[ 1] = s[ 2]; d[ 2] = s[ 4]; d[ 3] = s[ 6]; \ + d[ 4] = s[ 8]; d[ 5] = s[10]; d[ 6] = s[12]; d[ 7] = s[14]; \ + d[ 8] = s[ 1]; d[ 9] = s[ 3]; d[10] = s[ 5]; d[11] = s[ 7]; \ + d[12] = s[ 9]; d[13] = s[11]; d[14] = s[13]; d[15] = s[15]; \ +} while(0) + + +// 8x64 -> 8x32 + +static inline void rintrlv_8x64_8x32( void *dst, const void *src, + const int bit_len ) +{ + RLEAVE_8x64_8x32( 0 ); RLEAVE_8x64_8x32( 16 ); + RLEAVE_8x64_8x32( 32 ); RLEAVE_8x64_8x32( 48 ); + + if ( bit_len <= 256 ) return; + + RLEAVE_8x64_8x32( 64 ); RLEAVE_8x64_8x32( 80 ); + RLEAVE_8x64_8x32( 96 ); RLEAVE_8x64_8x32( 112 ); + + if ( bit_len <= 512 ) return; + + RLEAVE_8x64_8x32( 128 ); RLEAVE_8x64_8x32( 144 ); + + if ( bit_len <= 640 ) return; + + RLEAVE_8x64_8x32( 160 ); RLEAVE_8x64_8x32( 176 ); + RLEAVE_8x64_8x32( 192 ); RLEAVE_8x64_8x32( 208 ); + RLEAVE_8x64_8x32( 224 ); RLEAVE_8x64_8x32( 240 ); +} + +#undef RLEAVE_8x64_8x32 // 4x32 -> 4x64 +static inline void rintrlv_4x32_4x64( void *dst, + const void *src, const int bit_len ) +{ + __m128i *d = (__m128i*)dst; + const __m128i *s = (const __m128i*)src; + d[ 0] = _mm_unpacklo_epi32( s[ 0], s[ 1] ); + d[ 1] = _mm_unpackhi_epi32( s[ 0], s[ 1] ); + d[ 2] = _mm_unpacklo_epi32( s[ 2], s[ 3] ); + d[ 3] = _mm_unpackhi_epi32( s[ 2], s[ 3] ); + d[ 4] = _mm_unpacklo_epi32( s[ 4], s[ 5] ); + d[ 5] = _mm_unpackhi_epi32( s[ 4], s[ 5] ); + d[ 6] = _mm_unpacklo_epi32( s[ 6], s[ 7] ); + d[ 7] = _mm_unpackhi_epi32( s[ 6], s[ 7] ); + + if ( bit_len <= 
256 ) return; + + d[ 8] = _mm_unpacklo_epi32( s[ 8], s[ 9] ); + d[ 9] = _mm_unpackhi_epi32( s[ 8], s[ 9] ); + d[10] = _mm_unpacklo_epi32( s[10], s[11] ); + d[11] = _mm_unpackhi_epi32( s[10], s[11] ); + d[12] = _mm_unpacklo_epi32( s[12], s[13] ); + d[13] = _mm_unpackhi_epi32( s[12], s[13] ); + d[14] = _mm_unpacklo_epi32( s[14], s[15] ); + d[15] = _mm_unpackhi_epi32( s[14], s[15] ); + + if ( bit_len <= 512 ) return; + + d[16] = _mm_unpacklo_epi32( s[16], s[17] ); + d[17] = _mm_unpackhi_epi32( s[16], s[17] ); + d[18] = _mm_unpacklo_epi32( s[18], s[19] ); + d[19] = _mm_unpackhi_epi32( s[18], s[19] ); + + if ( bit_len <= 640 ) return; + + d[20] = _mm_unpacklo_epi32( s[20], s[21] ); + d[21] = _mm_unpackhi_epi32( s[20], s[21] ); + d[22] = _mm_unpacklo_epi32( s[22], s[23] ); + d[23] = _mm_unpackhi_epi32( s[22], s[23] ); + + d[24] = _mm_unpacklo_epi32( s[24], s[25] ); + d[25] = _mm_unpackhi_epi32( s[24], s[25] ); + d[26] = _mm_unpacklo_epi32( s[26], s[27] ); + d[27] = _mm_unpackhi_epi32( s[26], s[27] ); + d[28] = _mm_unpacklo_epi32( s[28], s[29] ); + d[29] = _mm_unpackhi_epi32( s[28], s[29] ); + d[30] = _mm_unpacklo_epi32( s[30], s[31] ); + d[31] = _mm_unpackhi_epi32( s[30], s[31] ); +} + +// 8x32 -> 8x64 + +static inline void rintrlv_8x32_8x64( void *dst, + const void *src, const int bit_len ) +{ + __m128i *d = (__m128i*)dst; + const __m128i *s = (const __m128i*)src; + + d[ 0] = _mm_unpacklo_epi32( s[ 0], s[ 2] ); + d[ 1] = _mm_unpackhi_epi32( s[ 0], s[ 2] ); + d[ 2] = _mm_unpacklo_epi32( s[ 1], s[ 3] ); + d[ 3] = _mm_unpackhi_epi32( s[ 1], s[ 3] ); + d[ 4] = _mm_unpacklo_epi32( s[ 4], s[ 6] ); + d[ 5] = _mm_unpackhi_epi32( s[ 4], s[ 6] ); + d[ 6] = _mm_unpacklo_epi32( s[ 5], s[ 7] ); + d[ 7] = _mm_unpackhi_epi32( s[ 5], s[ 7] ); + + d[ 8] = _mm_unpacklo_epi32( s[ 8], s[10] ); + d[ 9] = _mm_unpackhi_epi32( s[ 8], s[10] ); + d[10] = _mm_unpacklo_epi32( s[ 9], s[11] ); + d[11] = _mm_unpackhi_epi32( s[ 9], s[11] ); + d[12] = _mm_unpacklo_epi32( s[12], s[14] ); + d[13] = 
_mm_unpackhi_epi32( s[12], s[14] ); + d[14] = _mm_unpacklo_epi32( s[13], s[15] ); + d[15] = _mm_unpackhi_epi32( s[13], s[15] ); + + if ( bit_len <= 256 ) return; + + d[16] = _mm_unpacklo_epi32( s[16], s[18] ); + d[17] = _mm_unpackhi_epi32( s[16], s[18] ); + d[18] = _mm_unpacklo_epi32( s[17], s[19] ); + d[19] = _mm_unpackhi_epi32( s[17], s[19] ); + d[20] = _mm_unpacklo_epi32( s[20], s[22] ); + d[21] = _mm_unpackhi_epi32( s[20], s[22] ); + d[22] = _mm_unpacklo_epi32( s[21], s[23] ); + d[23] = _mm_unpackhi_epi32( s[21], s[23] ); + + d[24] = _mm_unpacklo_epi32( s[24], s[26] ); + d[25] = _mm_unpackhi_epi32( s[24], s[26] ); + d[26] = _mm_unpacklo_epi32( s[25], s[27] ); + d[27] = _mm_unpackhi_epi32( s[25], s[27] ); + d[28] = _mm_unpacklo_epi32( s[28], s[30] ); + d[29] = _mm_unpackhi_epi32( s[28], s[30] ); + d[30] = _mm_unpacklo_epi32( s[29], s[31] ); + d[31] = _mm_unpackhi_epi32( s[29], s[31] ); + + if ( bit_len <= 512 ) return; + + d[32] = _mm_unpacklo_epi32( s[32], s[34] ); + d[33] = _mm_unpackhi_epi32( s[32], s[34] ); + d[34] = _mm_unpacklo_epi32( s[33], s[35] ); + d[35] = _mm_unpackhi_epi32( s[33], s[35] ); + d[36] = _mm_unpacklo_epi32( s[36], s[38] ); + d[37] = _mm_unpackhi_epi32( s[36], s[38] ); + d[38] = _mm_unpacklo_epi32( s[37], s[39] ); + d[39] = _mm_unpackhi_epi32( s[37], s[39] ); + + if ( bit_len <= 640 ) return; + + d[40] = _mm_unpacklo_epi32( s[40], s[42] ); + d[41] = _mm_unpackhi_epi32( s[40], s[42] ); + d[42] = _mm_unpacklo_epi32( s[41], s[43] ); + d[43] = _mm_unpackhi_epi32( s[41], s[43] ); + d[44] = _mm_unpacklo_epi32( s[44], s[46] ); + d[45] = _mm_unpackhi_epi32( s[44], s[46] ); + d[46] = _mm_unpacklo_epi32( s[45], s[47] ); + d[47] = _mm_unpackhi_epi32( s[45], s[47] ); + + d[48] = _mm_unpacklo_epi32( s[48], s[50] ); + d[49] = _mm_unpackhi_epi32( s[48], s[50] ); + d[50] = _mm_unpacklo_epi32( s[49], s[51] ); + d[51] = _mm_unpackhi_epi32( s[49], s[51] ); + d[52] = _mm_unpacklo_epi32( s[52], s[54] ); + d[53] = _mm_unpackhi_epi32( s[52], s[54] ); + d[54] = 
_mm_unpacklo_epi32( s[53], s[55] ); + d[55] = _mm_unpackhi_epi32( s[53], s[55] ); + + d[56] = _mm_unpacklo_epi32( s[56], s[58] ); + d[57] = _mm_unpackhi_epi32( s[56], s[58] ); + d[58] = _mm_unpacklo_epi32( s[57], s[59] ); + d[59] = _mm_unpackhi_epi32( s[57], s[59] ); + d[60] = _mm_unpacklo_epi32( s[60], s[62] ); + d[61] = _mm_unpackhi_epi32( s[60], s[62] ); + d[62] = _mm_unpacklo_epi32( s[61], s[63] ); + d[63] = _mm_unpackhi_epi32( s[61], s[63] ); +} + +// 8x32 -> 4x128 + +// 16 bytes per lane +#define RLEAVE_8X32_4X128( i ) \ +do { \ + uint32_t *d0 = (uint32_t*)dst0 + (i); \ + uint32_t *d1 = (uint32_t*)dst1 + (i); \ + const uint32_t *s = (const uint32_t*)src + ((i)<<1); \ + d0[ 0] = s[ 0]; d1[ 0] = s[ 4]; \ + d0[ 1] = s[ 8]; d1[ 1] = s[12]; \ + d0[ 2] = s[16]; d1[ 2] = s[20]; \ + d0[ 3] = s[24]; d1[ 3] = s[28]; \ +\ + d0[ 4] = s[ 1]; d1[ 4] = s[ 5]; \ + d0[ 5] = s[ 9]; d1[ 5] = s[13]; \ + d0[ 6] = s[17]; d1[ 6] = s[21]; \ + d0[ 7] = s[25]; d1[ 7] = s[29]; \ +\ + d0[ 8] = s[ 2]; d1[ 8] = s[ 6]; \ + d0[ 9] = s[10]; d1[ 9] = s[14]; \ + d0[10] = s[18]; d1[10] = s[22]; \ + d0[11] = s[26]; d1[11] = s[30]; \ +\ + d0[12] = s[ 3]; d1[12] = s[ 7]; \ + d0[13] = s[11]; d1[13] = s[15]; \ + d0[14] = s[19]; d1[14] = s[23]; \ + d0[15] = s[27]; d1[15] = s[31]; \ +} while(0) + +static inline void rintrlv_8x32_4x128( void *dst0, void *dst1, + const void *src, const int bit_len ) +{ + RLEAVE_8X32_4X128( 0 ); RLEAVE_8X32_4X128( 16 ); + if ( bit_len <= 256 ) return; + RLEAVE_8X32_4X128( 32 ); RLEAVE_8X32_4X128( 48 ); + if ( bit_len <= 512 ) return; + RLEAVE_8X32_4X128( 64 ); + if ( bit_len <= 640 ) return; + RLEAVE_8X32_4X128( 80 ); + RLEAVE_8X32_4X128( 96 ); RLEAVE_8X32_4X128( 112 ); +} +#undef RLEAVE_8X32_4X128 + +/* #define RLEAVE_4x32_4x64(i) do \ { \ uint32_t *d = (uint32_t*)dst + (i); \ @@ -1235,10 +2361,54 @@ static inline void rintrlv_4x32_4x64( void *dst, } #undef RLEAVE_4x32_4x64 - +*/ // 2x128 -> 4x64 +static inline void rintrlv_2x128_4x64( void *dst, const void *src0, + 
const void *src1, const int bit_len ) +{ + __m128i *d = (__m128i*)dst; + const __m128i *s0 = (const __m128i*)src0; + const __m128i *s1 = (const __m128i*)src1; + d[ 0] = _mm_unpacklo_epi64( s0[ 0], s0[ 1] ); + d[ 1] = _mm_unpacklo_epi64( s1[ 0], s1[ 1] ); + d[ 2] = _mm_unpackhi_epi64( s0[ 0], s0[ 1] ); + d[ 3] = _mm_unpackhi_epi64( s1[ 0], s1[ 1] ); + d[ 4] = _mm_unpacklo_epi64( s0[ 2], s0[ 3] ); + d[ 5] = _mm_unpacklo_epi64( s1[ 2], s1[ 3] ); + d[ 6] = _mm_unpackhi_epi64( s0[ 2], s0[ 3] ); + d[ 7] = _mm_unpackhi_epi64( s1[ 2], s1[ 3] ); + if ( bit_len <= 256 ) return; + d[ 8] = _mm_unpacklo_epi64( s0[ 4], s0[ 5] ); + d[ 9] = _mm_unpacklo_epi64( s1[ 4], s1[ 5] ); + d[10] = _mm_unpackhi_epi64( s0[ 4], s0[ 5] ); + d[11] = _mm_unpackhi_epi64( s1[ 4], s1[ 5] ); + d[12] = _mm_unpacklo_epi64( s0[ 6], s0[ 7] ); + d[13] = _mm_unpacklo_epi64( s1[ 6], s1[ 7] ); + d[14] = _mm_unpackhi_epi64( s0[ 6], s0[ 7] ); + d[15] = _mm_unpackhi_epi64( s1[ 6], s1[ 7] ); + if ( bit_len <= 512 ) return; + d[16] = _mm_unpacklo_epi64( s0[ 8], s0[ 9] ); + d[17] = _mm_unpacklo_epi64( s1[ 8], s1[ 9] ); + d[18] = _mm_unpackhi_epi64( s0[ 8], s0[ 9] ); + d[19] = _mm_unpackhi_epi64( s1[ 8], s1[ 9] ); + if ( bit_len <= 640 ) return; + d[20] = _mm_unpacklo_epi64( s0[10], s0[11] ); + d[21] = _mm_unpacklo_epi64( s1[10], s1[11] ); + d[22] = _mm_unpackhi_epi64( s0[10], s0[11] ); + d[23] = _mm_unpackhi_epi64( s1[10], s1[11] ); + d[24] = _mm_unpacklo_epi64( s0[12], s0[13] ); + d[25] = _mm_unpacklo_epi64( s1[12], s1[13] ); + d[26] = _mm_unpackhi_epi64( s0[12], s0[13] ); + d[27] = _mm_unpackhi_epi64( s1[12], s1[13] ); + d[28] = _mm_unpacklo_epi64( s0[14], s0[15] ); + d[29] = _mm_unpacklo_epi64( s1[14], s1[15] ); + d[30] = _mm_unpackhi_epi64( s0[14], s0[15] ); + d[31] = _mm_unpackhi_epi64( s1[14], s1[15] ); +} + +/* #define RLEAVE_2x128_4x64( i ) do \ { \ uint64_t *d = (uint64_t*)dst + ((i)<<1); \ @@ -1262,10 +2432,54 @@ static inline void rintrlv_2x128_4x64( void *dst, const void *src0, } #undef 
RLEAVE_2x128_4x64 - +*/ // 4x64 -> 2x128 +static inline void rintrlv_4x64_2x128( void *dst0, void *dst1, + const void *src, const int bit_len ) +{ + __m128i *d0 = (__m128i*)dst0; + __m128i *d1 = (__m128i*)dst1; + const __m128i* s = (const __m128i*)src; + d0[ 0] = _mm_unpacklo_epi64( s[ 0], s[ 2] ); + d0[ 1] = _mm_unpackhi_epi64( s[ 0], s[ 2] ); + d1[ 0] = _mm_unpacklo_epi64( s[ 1], s[ 3] ); + d1[ 1] = _mm_unpackhi_epi64( s[ 1], s[ 3] ); + d0[ 2] = _mm_unpacklo_epi64( s[ 4], s[ 6] ); + d0[ 3] = _mm_unpackhi_epi64( s[ 4], s[ 6] ); + d1[ 2] = _mm_unpacklo_epi64( s[ 5], s[ 7] ); + d1[ 3] = _mm_unpackhi_epi64( s[ 5], s[ 7] ); + if ( bit_len <= 256 ) return; + d0[ 4] = _mm_unpacklo_epi64( s[ 8], s[10] ); + d0[ 5] = _mm_unpackhi_epi64( s[ 8], s[10] ); + d1[ 4] = _mm_unpacklo_epi64( s[ 9], s[11] ); + d1[ 5] = _mm_unpackhi_epi64( s[ 9], s[11] ); + d0[ 6] = _mm_unpacklo_epi64( s[12], s[14] ); + d0[ 7] = _mm_unpackhi_epi64( s[12], s[14] ); + d1[ 6] = _mm_unpacklo_epi64( s[13], s[15] ); + d1[ 7] = _mm_unpackhi_epi64( s[13], s[15] ); + if ( bit_len <= 512 ) return; + d0[ 8] = _mm_unpacklo_epi64( s[16], s[18] ); + d0[ 9] = _mm_unpackhi_epi64( s[16], s[18] ); + d1[ 8] = _mm_unpacklo_epi64( s[17], s[19] ); + d1[ 9] = _mm_unpackhi_epi64( s[17], s[19] ); + if ( bit_len <= 640 ) return; + d0[10] = _mm_unpacklo_epi64( s[20], s[22] ); + d0[11] = _mm_unpackhi_epi64( s[20], s[22] ); + d1[10] = _mm_unpacklo_epi64( s[21], s[23] ); + d1[11] = _mm_unpackhi_epi64( s[21], s[23] ); + d0[12] = _mm_unpacklo_epi64( s[24], s[26] ); + d0[13] = _mm_unpackhi_epi64( s[24], s[26] ); + d1[12] = _mm_unpacklo_epi64( s[25], s[27] ); + d1[13] = _mm_unpackhi_epi64( s[25], s[27] ); + d0[14] = _mm_unpacklo_epi64( s[28], s[30] ); + d0[15] = _mm_unpackhi_epi64( s[28], s[30] ); + d1[14] = _mm_unpacklo_epi64( s[29], s[31] ); + d1[15] = _mm_unpackhi_epi64( s[29], s[31] ); +} + +/* #define RLEAVE_4x64_2x128( i ) do \ { \ uint64_t *d0 = (uint64_t*)dst0 + (i); \ @@ -1287,7 +2501,363 @@ static inline void 
rintrlv_4x64_2x128( void *dst0, void *dst1, RLEAVE_4x64_2x128( 16 ); RLEAVE_4x64_2x128( 20 ); RLEAVE_4x64_2x128( 24 ); RLEAVE_4x64_2x128( 28 ); } +*/ + +// 2x128 -> 8x64 + +static inline void rintrlv_4x128_8x64( void *dst, const void *src0, + const void *src1, const int bit_len ) +{ + __m128i *d = (__m128i*)dst; + const __m128i *s0 = (const __m128i*)src0; + const __m128i *s1 = (const __m128i*)src1; + + d[ 0] = _mm_unpacklo_epi64( s0[ 0], s0[ 1] ); + d[ 1] = _mm_unpacklo_epi64( s0[ 2], s0[ 3] ); + d[ 2] = _mm_unpacklo_epi64( s1[ 0], s1[ 1] ); + d[ 3] = _mm_unpacklo_epi64( s1[ 2], s1[ 3] ); + d[ 4] = _mm_unpackhi_epi64( s0[ 0], s0[ 1] ); + d[ 5] = _mm_unpackhi_epi64( s0[ 2], s0[ 3] ); + d[ 6] = _mm_unpackhi_epi64( s1[ 0], s1[ 1] ); + d[ 7] = _mm_unpackhi_epi64( s1[ 2], s1[ 3] ); + + d[ 8] = _mm_unpacklo_epi64( s0[ 4], s0[ 5] ); + d[ 9] = _mm_unpacklo_epi64( s0[ 6], s0[ 7] ); + d[10] = _mm_unpacklo_epi64( s1[ 4], s1[ 5] ); + d[11] = _mm_unpacklo_epi64( s1[ 6], s1[ 7] ); + d[12] = _mm_unpackhi_epi64( s0[ 4], s0[ 5] ); + d[13] = _mm_unpackhi_epi64( s0[ 6], s0[ 7] ); + d[14] = _mm_unpackhi_epi64( s1[ 4], s1[ 5] ); + d[15] = _mm_unpackhi_epi64( s1[ 6], s1[ 7] ); + + if ( bit_len <= 256 ) return; + + d[16] = _mm_unpacklo_epi64( s0[ 8], s0[ 9] ); + d[17] = _mm_unpacklo_epi64( s0[10], s0[11] ); + d[18] = _mm_unpacklo_epi64( s1[ 8], s1[ 9] ); + d[19] = _mm_unpacklo_epi64( s1[10], s1[11] ); + d[20] = _mm_unpackhi_epi64( s0[ 8], s0[ 9] ); + d[21] = _mm_unpackhi_epi64( s0[10], s0[11] ); + d[22] = _mm_unpackhi_epi64( s1[ 8], s1[ 9] ); + d[23] = _mm_unpackhi_epi64( s1[10], s1[11] ); + + d[24] = _mm_unpacklo_epi64( s0[12], s0[13] ); + d[25] = _mm_unpacklo_epi64( s0[14], s0[15] ); + d[26] = _mm_unpacklo_epi64( s1[12], s1[13] ); + d[27] = _mm_unpacklo_epi64( s1[14], s1[15] ); + d[28] = _mm_unpackhi_epi64( s0[12], s0[13] ); + d[29] = _mm_unpackhi_epi64( s0[14], s0[15] ); + d[30] = _mm_unpackhi_epi64( s1[12], s1[13] ); + d[31] = _mm_unpackhi_epi64( s1[14], s1[15] ); + + if ( bit_len <= 
512 ) return; + + d[32] = _mm_unpacklo_epi64( s0[16], s0[17] ); + d[33] = _mm_unpacklo_epi64( s0[18], s0[19] ); + d[34] = _mm_unpacklo_epi64( s1[16], s1[17] ); + d[35] = _mm_unpacklo_epi64( s1[18], s1[19] ); + d[36] = _mm_unpackhi_epi64( s0[16], s0[17] ); + d[37] = _mm_unpackhi_epi64( s0[18], s0[19] ); + d[38] = _mm_unpackhi_epi64( s1[16], s1[17] ); + d[39] = _mm_unpackhi_epi64( s1[18], s1[19] ); + + if ( bit_len <= 640 ) return; + + d[40] = _mm_unpacklo_epi64( s0[20], s0[21] ); + d[41] = _mm_unpacklo_epi64( s0[22], s0[23] ); + d[42] = _mm_unpacklo_epi64( s1[20], s1[21] ); + d[43] = _mm_unpacklo_epi64( s1[22], s1[23] ); + d[44] = _mm_unpackhi_epi64( s0[20], s0[21] ); + d[45] = _mm_unpackhi_epi64( s0[22], s0[23] ); + d[46] = _mm_unpackhi_epi64( s1[20], s1[21] ); + d[47] = _mm_unpackhi_epi64( s1[22], s1[23] ); + + d[48] = _mm_unpacklo_epi64( s0[24], s0[25] ); + d[49] = _mm_unpacklo_epi64( s0[26], s0[27] ); + d[50] = _mm_unpacklo_epi64( s1[24], s1[25] ); + d[51] = _mm_unpacklo_epi64( s1[26], s1[27] ); + d[52] = _mm_unpackhi_epi64( s0[24], s0[25] ); + d[53] = _mm_unpackhi_epi64( s0[26], s0[27] ); + d[54] = _mm_unpackhi_epi64( s1[24], s1[25] ); + d[55] = _mm_unpackhi_epi64( s1[26], s1[27] ); + + d[56] = _mm_unpacklo_epi64( s0[28], s0[29] ); + d[57] = _mm_unpacklo_epi64( s0[30], s0[31] ); + d[58] = _mm_unpacklo_epi64( s1[28], s1[29] ); + d[59] = _mm_unpacklo_epi64( s1[30], s1[31] ); + d[60] = _mm_unpackhi_epi64( s0[28], s0[29] ); + d[61] = _mm_unpackhi_epi64( s0[30], s0[31] ); + d[62] = _mm_unpackhi_epi64( s1[28], s1[29] ); + d[63] = _mm_unpackhi_epi64( s1[30], s1[31] ); +} +// 8x64 -> 4x128 + +static inline void rintrlv_8x64_4x128( void *dst0, void *dst1, + const void *src, const int bit_len ) +{ + __m128i *d0 = (__m128i*)dst0; + __m128i *d1 = (__m128i*)dst1; + const __m128i* s = (const __m128i*)src; + + d0[ 0] = _mm_unpacklo_epi64( s[ 0], s[ 4] ); + d0[ 1] = _mm_unpackhi_epi64( s[ 0], s[ 4] ); + d1[ 0] = _mm_unpacklo_epi64( s[ 2], s[ 6] ); + d1[ 1] = 
_mm_unpackhi_epi64( s[ 2], s[ 6] ); + d0[ 2] = _mm_unpacklo_epi64( s[ 1], s[ 5] ); + d0[ 3] = _mm_unpackhi_epi64( s[ 1], s[ 5] ); + d1[ 2] = _mm_unpacklo_epi64( s[ 3], s[ 7] ); + d1[ 3] = _mm_unpackhi_epi64( s[ 3], s[ 7] ); + + d0[ 4] = _mm_unpacklo_epi64( s[ 8], s[12] ); + d0[ 5] = _mm_unpackhi_epi64( s[ 8], s[12] ); + d1[ 4] = _mm_unpacklo_epi64( s[10], s[14] ); + d1[ 5] = _mm_unpackhi_epi64( s[10], s[14] ); + d0[ 6] = _mm_unpacklo_epi64( s[ 9], s[13] ); + d0[ 7] = _mm_unpackhi_epi64( s[ 9], s[13] ); + d1[ 6] = _mm_unpacklo_epi64( s[11], s[15] ); + d1[ 7] = _mm_unpackhi_epi64( s[11], s[15] ); + + if ( bit_len <= 256 ) return; + + d0[ 8] = _mm_unpacklo_epi64( s[16], s[20] ); + d0[ 9] = _mm_unpackhi_epi64( s[16], s[20] ); + d1[ 8] = _mm_unpacklo_epi64( s[18], s[22] ); + d1[ 9] = _mm_unpackhi_epi64( s[18], s[22] ); + d0[10] = _mm_unpacklo_epi64( s[17], s[21] ); + d0[11] = _mm_unpackhi_epi64( s[17], s[21] ); + d1[10] = _mm_unpacklo_epi64( s[19], s[23] ); + d1[11] = _mm_unpackhi_epi64( s[19], s[23] ); + + d0[12] = _mm_unpacklo_epi64( s[24], s[28] ); + d0[13] = _mm_unpackhi_epi64( s[24], s[28] ); + d1[12] = _mm_unpacklo_epi64( s[26], s[30] ); + d1[13] = _mm_unpackhi_epi64( s[26], s[30] ); + d0[14] = _mm_unpacklo_epi64( s[25], s[29] ); + d0[15] = _mm_unpackhi_epi64( s[25], s[29] ); + d1[14] = _mm_unpacklo_epi64( s[27], s[31] ); + d1[15] = _mm_unpackhi_epi64( s[27], s[31] ); + + if ( bit_len <= 512 ) return; + + d0[16] = _mm_unpacklo_epi64( s[32], s[36] ); + d0[17] = _mm_unpackhi_epi64( s[32], s[36] ); + d1[16] = _mm_unpacklo_epi64( s[34], s[38] ); + d1[17] = _mm_unpackhi_epi64( s[34], s[38] ); + d0[18] = _mm_unpacklo_epi64( s[33], s[37] ); + d0[19] = _mm_unpackhi_epi64( s[33], s[37] ); + d1[18] = _mm_unpacklo_epi64( s[35], s[39] ); + d1[19] = _mm_unpackhi_epi64( s[35], s[39] ); + + if ( bit_len <= 640 ) return; + + d0[20] = _mm_unpacklo_epi64( s[40], s[44] ); + d0[21] = _mm_unpackhi_epi64( s[40], s[44] ); + d1[20] = _mm_unpacklo_epi64( s[42], s[46] ); + d1[21] = 
_mm_unpackhi_epi64( s[42], s[46] ); + d0[22] = _mm_unpacklo_epi64( s[41], s[45] ); + d0[23] = _mm_unpackhi_epi64( s[41], s[45] ); + d1[22] = _mm_unpacklo_epi64( s[43], s[47] ); + d1[23] = _mm_unpackhi_epi64( s[43], s[47] ); + + d0[24] = _mm_unpacklo_epi64( s[48], s[52] ); + d0[25] = _mm_unpackhi_epi64( s[48], s[52] ); + d1[24] = _mm_unpacklo_epi64( s[50], s[54] ); + d1[25] = _mm_unpackhi_epi64( s[50], s[54] ); + d0[26] = _mm_unpacklo_epi64( s[49], s[53] ); + d0[27] = _mm_unpackhi_epi64( s[49], s[53] ); + d1[26] = _mm_unpacklo_epi64( s[51], s[55] ); + d1[27] = _mm_unpackhi_epi64( s[51], s[55] ); + + d0[28] = _mm_unpacklo_epi64( s[56], s[60] ); + d0[29] = _mm_unpackhi_epi64( s[56], s[60] ); + d1[28] = _mm_unpacklo_epi64( s[58], s[62] ); + d1[29] = _mm_unpackhi_epi64( s[58], s[62] ); + d0[30] = _mm_unpacklo_epi64( s[57], s[61] ); + d0[31] = _mm_unpackhi_epi64( s[57], s[61] ); + d1[30] = _mm_unpacklo_epi64( s[59], s[63] ); + d1[31] = _mm_unpackhi_epi64( s[59], s[63] ); +} + +// 8x64 -> 2x256 + +static inline void rintrlv_8x64_2x256( void *dst0, void *dst1, void *dst2, + void *dst3, const void *src, const int bit_len ) +{ + __m128i *d0 = (__m128i*)dst0; + __m128i *d1 = (__m128i*)dst1; + __m128i *d2 = (__m128i*)dst2; + __m128i *d3 = (__m128i*)dst3; + const __m128i* s = (const __m128i*)src; + + d0[ 0] = _mm_unpacklo_epi64( s[ 0], s[ 4] ); + d1[ 0] = _mm_unpackhi_epi64( s[ 0], s[ 4] ); + d2[ 0] = _mm_unpacklo_epi64( s[ 1], s[ 5] ); + d3[ 0] = _mm_unpackhi_epi64( s[ 1], s[ 5] ); + d0[ 1] = _mm_unpacklo_epi64( s[ 2], s[ 6] ); + d1[ 1] = _mm_unpackhi_epi64( s[ 2], s[ 6] ); + d2[ 1] = _mm_unpacklo_epi64( s[ 3], s[ 7] ); + d3[ 1] = _mm_unpackhi_epi64( s[ 3], s[ 7] ); + + d0[ 2] = _mm_unpacklo_epi64( s[ 8], s[12] ); + d1[ 2] = _mm_unpackhi_epi64( s[ 8], s[12] ); + d2[ 2] = _mm_unpacklo_epi64( s[ 9], s[13] ); + d3[ 2] = _mm_unpackhi_epi64( s[ 9], s[13] ); + d0[ 3] = _mm_unpacklo_epi64( s[10], s[14] ); + d1[ 3] = _mm_unpackhi_epi64( s[10], s[14] ); + d2[ 3] = _mm_unpacklo_epi64( 
s[11], s[15] ); + d3[ 3] = _mm_unpackhi_epi64( s[11], s[15] ); + + if ( bit_len <= 256 ) return; + + d0[ 4] = _mm_unpacklo_epi64( s[16], s[20] ); + d1[ 4] = _mm_unpackhi_epi64( s[16], s[20] ); + d2[ 4] = _mm_unpacklo_epi64( s[17], s[21] ); + d3[ 4] = _mm_unpackhi_epi64( s[17], s[21] ); + d0[ 5] = _mm_unpacklo_epi64( s[18], s[22] ); + d1[ 5] = _mm_unpackhi_epi64( s[18], s[22] ); + d2[ 5] = _mm_unpacklo_epi64( s[19], s[23] ); + d3[ 5] = _mm_unpackhi_epi64( s[19], s[23] ); + + d0[ 6] = _mm_unpacklo_epi64( s[24], s[28] ); + d1[ 6] = _mm_unpackhi_epi64( s[24], s[28] ); + d2[ 6] = _mm_unpacklo_epi64( s[25], s[29] ); + d3[ 6] = _mm_unpackhi_epi64( s[25], s[29] ); + d0[ 7] = _mm_unpacklo_epi64( s[26], s[30] ); + d1[ 7] = _mm_unpackhi_epi64( s[26], s[30] ); + d2[ 7] = _mm_unpacklo_epi64( s[27], s[31] ); + d3[ 7] = _mm_unpackhi_epi64( s[27], s[31] ); + + if ( bit_len <= 512 ) return; + + d0[ 8] = _mm_unpacklo_epi64( s[32], s[36] ); + d1[ 8] = _mm_unpackhi_epi64( s[32], s[36] ); + d2[ 8] = _mm_unpacklo_epi64( s[33], s[37] ); + d3[ 8] = _mm_unpackhi_epi64( s[33], s[37] ); + d0[ 9] = _mm_unpacklo_epi64( s[34], s[38] ); + d1[ 9] = _mm_unpackhi_epi64( s[34], s[38] ); + d2[ 9] = _mm_unpacklo_epi64( s[35], s[39] ); + d3[ 9] = _mm_unpackhi_epi64( s[35], s[39] ); + + if ( bit_len <= 640 ) return; + + d0[10] = _mm_unpacklo_epi64( s[40], s[44] ); + d1[10] = _mm_unpackhi_epi64( s[40], s[44] ); + d2[10] = _mm_unpacklo_epi64( s[41], s[45] ); + d3[10] = _mm_unpackhi_epi64( s[41], s[45] ); + d0[11] = _mm_unpacklo_epi64( s[42], s[46] ); + d1[11] = _mm_unpackhi_epi64( s[42], s[46] ); + d2[11] = _mm_unpacklo_epi64( s[43], s[47] ); + d3[11] = _mm_unpackhi_epi64( s[43], s[47] ); + + d0[12] = _mm_unpacklo_epi64( s[48], s[52] ); + d1[12] = _mm_unpackhi_epi64( s[48], s[52] ); + d2[12] = _mm_unpacklo_epi64( s[49], s[53] ); + d3[12] = _mm_unpackhi_epi64( s[49], s[53] ); + d0[13] = _mm_unpacklo_epi64( s[50], s[54] ); + d1[13] = _mm_unpackhi_epi64( s[50], s[54] ); + d2[13] = _mm_unpacklo_epi64( s[51], 
s[55] ); + d3[13] = _mm_unpackhi_epi64( s[51], s[55] ); + + d0[14] = _mm_unpacklo_epi64( s[56], s[60] ); + d1[14] = _mm_unpackhi_epi64( s[56], s[60] ); + d2[14] = _mm_unpacklo_epi64( s[57], s[61] ); + d3[14] = _mm_unpackhi_epi64( s[57], s[61] ); + d0[15] = _mm_unpacklo_epi64( s[58], s[62] ); + d1[15] = _mm_unpackhi_epi64( s[58], s[62] ); + d2[15] = _mm_unpacklo_epi64( s[59], s[63] ); + d3[15] = _mm_unpackhi_epi64( s[59], s[63] ); +} + +// 4x128 -> 8x64 + +static inline void rintrlv_2x256_8x64( void *dst, const void *src0, + const void *src1, const void *src2, const void *src3, const int bit_len ) +{ + __m128i *d = (__m128i*)dst; + __m128i *s0 = (__m128i*)src0; + __m128i *s1 = (__m128i*)src1; + __m128i *s2 = (__m128i*)src2; + __m128i *s3 = (__m128i*)src3; + + d[ 0] = _mm_unpacklo_epi64( s0[0], s0[2] ); + d[ 1] = _mm_unpacklo_epi64( s1[0], s1[2] ); + d[ 2] = _mm_unpacklo_epi64( s2[0], s2[2] ); + d[ 3] = _mm_unpacklo_epi64( s3[0], s3[2] ); + d[ 4] = _mm_unpackhi_epi64( s0[0], s0[2] ); + d[ 5] = _mm_unpackhi_epi64( s1[0], s1[2] ); + d[ 6] = _mm_unpackhi_epi64( s2[0], s2[2] ); + d[ 7] = _mm_unpackhi_epi64( s3[0], s3[2] ); + + d[ 8] = _mm_unpacklo_epi64( s0[1], s0[3] ); + d[ 9] = _mm_unpacklo_epi64( s1[1], s1[3] ); + d[10] = _mm_unpacklo_epi64( s2[1], s2[3] ); + d[11] = _mm_unpacklo_epi64( s3[1], s3[3] ); + d[12] = _mm_unpackhi_epi64( s0[1], s0[3] ); + d[13] = _mm_unpackhi_epi64( s1[1], s1[3] ); + d[14] = _mm_unpackhi_epi64( s2[1], s2[3] ); + d[15] = _mm_unpackhi_epi64( s3[1], s3[3] ); + + if ( bit_len <= 256 ) return; + + d[16] = _mm_unpacklo_epi64( s0[4], s0[6] ); + d[17] = _mm_unpacklo_epi64( s1[4], s1[6] ); + d[18] = _mm_unpacklo_epi64( s2[4], s2[6] ); + d[19] = _mm_unpacklo_epi64( s3[4], s3[6] ); + d[20] = _mm_unpackhi_epi64( s0[4], s0[6] ); + d[21] = _mm_unpackhi_epi64( s1[4], s1[6] ); + d[22] = _mm_unpackhi_epi64( s2[4], s2[6] ); + d[23] = _mm_unpackhi_epi64( s3[4], s3[6] ); + + d[24] = _mm_unpacklo_epi64( s0[5], s0[7] ); + d[25] = _mm_unpacklo_epi64( s1[5], s1[7] 
); + d[26] = _mm_unpacklo_epi64( s2[5], s2[7] ); + d[27] = _mm_unpacklo_epi64( s3[5], s3[7] ); + d[28] = _mm_unpackhi_epi64( s0[5], s0[7] ); + d[29] = _mm_unpackhi_epi64( s1[5], s1[7] ); + d[30] = _mm_unpackhi_epi64( s2[5], s2[7] ); + d[31] = _mm_unpackhi_epi64( s3[5], s3[7] ); + + if ( bit_len <= 512 ) return; + + d[32] = _mm_unpacklo_epi64( s0[8], s0[10] ); + d[33] = _mm_unpacklo_epi64( s1[8], s1[10] ); + d[34] = _mm_unpacklo_epi64( s2[8], s2[10] ); + d[35] = _mm_unpacklo_epi64( s3[8], s3[10] ); + d[36] = _mm_unpackhi_epi64( s0[8], s0[10] ); + d[37] = _mm_unpackhi_epi64( s1[8], s1[10] ); + d[38] = _mm_unpackhi_epi64( s2[8], s2[10] ); + d[39] = _mm_unpackhi_epi64( s3[8], s3[10] ); + + if ( bit_len <= 640 ) return; + + d[40] = _mm_unpacklo_epi64( s0[9], s0[11] ); + d[41] = _mm_unpacklo_epi64( s1[9], s1[11] ); + d[42] = _mm_unpacklo_epi64( s2[9], s2[11] ); + d[43] = _mm_unpacklo_epi64( s3[9], s3[11] ); + d[44] = _mm_unpackhi_epi64( s0[9], s0[11] ); + d[45] = _mm_unpackhi_epi64( s1[9], s1[11] ); + d[46] = _mm_unpackhi_epi64( s2[9], s2[11] ); + d[47] = _mm_unpackhi_epi64( s3[9], s3[11] ); + + d[48] = _mm_unpacklo_epi64( s0[12], s0[14] ); + d[49] = _mm_unpacklo_epi64( s1[12], s1[14] ); + d[50] = _mm_unpacklo_epi64( s2[12], s2[14] ); + d[51] = _mm_unpacklo_epi64( s3[12], s3[14] ); + d[52] = _mm_unpackhi_epi64( s0[12], s0[14] ); + d[53] = _mm_unpackhi_epi64( s1[12], s1[14] ); + d[54] = _mm_unpackhi_epi64( s2[12], s2[14] ); + d[55] = _mm_unpackhi_epi64( s3[12], s3[14] ); + + d[56] = _mm_unpacklo_epi64( s0[13], s0[15] ); + d[57] = _mm_unpacklo_epi64( s1[13], s1[15] ); + d[58] = _mm_unpacklo_epi64( s2[13], s2[15] ); + d[59] = _mm_unpacklo_epi64( s3[13], s3[15] ); + d[60] = _mm_unpackhi_epi64( s0[13], s0[15] ); + d[61] = _mm_unpackhi_epi64( s1[13], s1[15] ); + d[62] = _mm_unpackhi_epi64( s2[13], s2[15] ); + d[63] = _mm_unpackhi_epi64( s3[13], s3[15] ); +} // // Some functions customized for mining. 
@@ -1296,17 +2866,19 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1, #if defined(__SSE4_1__) // No SSE2 implementation. -#define mm128_intrlv_blend_64( hi, lo ) _mm_blend_epi16( hi, lo, 0x0f ) -#define mm128_intrlv_blend_32( hi, lo ) _mm_blend_epi16( hi, lo, 0x33 ) +//#define mm128_intrlv_blend_64( hi, lo ) _mm_blend_epi16( hi, lo, 0x0f ) +//#define mm128_intrlv_blend_32( hi, lo ) _mm_blend_epi16( hi, lo, 0x33 ) #endif // SSE4_1 #if defined(__AVX2__) -#define mm256_intrlv_blend_128( hi, lo ) _mm256_blend_epi32( hi, lo, 0x0f ) -#define mm256_intrlv_blend_64( hi, lo ) _mm256_blend_epi32( hi, lo, 0x33 ) +//#define mm256_intrlv_blend_128( hi, lo ) _mm256_blend_epi32( hi, lo, 0x0f ) +//#define mm256_intrlv_blend_64( hi, lo ) _mm256_blend_epi32( hi, lo, 0x33 ) #define mm256_intrlv_blend_32( hi, lo ) _mm256_blend_epi32( hi, lo, 0x55 ) +// change to _mm256_blend_epi32 +// // Select lanes of 32 byte hash from 2 sources according to control mask. // macro due to 256 bit value arg. 
#define mm256_blend_hash_4x64( dst, a, b, mask ) \ @@ -1323,4 +2895,30 @@ do { \ #endif // AVX2 +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +/* +#define mm512_intrlv_blend_128( hi, lo ) \ + _mm512_mask_blend_epi32( 0x0f0f, hi, lo ) + +#define mm512_intrlv_blend_64( hi, lo ) \ + _mm512_mask_blend_epi32( 0x3333, hi, lo ) +*/ + +#define mm512_intrlv_blend_32( hi, lo ) \ + _mm512_mask_blend_epi32( 0x5555, hi, lo ) + +#define mm512_blend_hash_8x64( dst, a, b, mask ) \ +do { \ + dst[0] = _mm512_mask_blend_epi64( mask, a[0], b[0] ); \ + dst[1] = _mm512_mask_blend_epi64( mask, a[1], b[1] ); \ + dst[2] = _mm512_mask_blend_epi64( mask, a[2], b[2] ); \ + dst[3] = _mm512_mask_blend_epi64( mask, a[3], b[3] ); \ + dst[4] = _mm512_mask_blend_epi64( mask, a[4], b[4] ); \ + dst[5] = _mm512_mask_blend_epi64( mask, a[5], b[5] ); \ + dst[6] = _mm512_mask_blend_epi64( mask, a[6], b[6] ); \ + dst[7] = _mm512_mask_blend_epi64( mask, a[7], b[7] ); \ +} while(0) + +#endif // AVX512 #endif // INTERLEAVE_H__ diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index bad84b0..35be610 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -3,170 +3,176 @@ #if defined(__SSE2__) -////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////// // // 128 bit SSE vectors // -// SSE2 is generally required for full 128 bit support. Some functions -// are also optimized with SSSE3 or SSE4.1. -// -// Do not call _mm_extract directly, it isn't supported in SSE2. -// Use mm128_extr instead, it will select the appropriate implementation. -// -// 128 bit operations are enhanced with uint128 which adds 128 bit integer -// support for arithmetic and other operations. Casting to uint128_t is not -// efficient but is sometimes the only way for certain operations. +// SSE2 is required for 128 bit integer support. 
Some functions are also +// optimized with SSSE3, SSE4.1 or AVX. Some of these more optimized +// functions don't have SSE2 equivalents and their use would break SSE2 +// compatibility. // // Constants are an issue with simd. Simply put, immediate constants don't -// exist. All simd constants either reside in memory or a register. -// The distibction is made below with c128 being memory resident defined -// at compile time and m128 being register defined at run time. -// -// All run time constants must be generated using their components elements -// incurring significant overhead. The more elements the more overhead -// both in instructions and in GP register usage. Whenever possible use -// 64 bit constant elements regardless of the actual element size. +// exist. All simd constants either reside in memory or a register and +// must be loaded from memory or generated at run time. // -// Due to the cost of generating constants they should not be regenerated -// in the same function. Instead, define a local const. +// Due to the cost of generating constants it is more efficient to +// define a local const for repeated references to the same constant. // -// Some constant values can be generated using shortcuts. Zero for example -// is as simple as XORing any register with itself, and is implemented -// in the setzero instrinsic. These shortcuts must be implemented is asm -// due to doing things the compiler would complain about. Another single -// instruction constant is -1, defined below. Others may be added as the need -// arises. Even single instruction constants are less efficient than local -// register variables so the advice above stands. +// One common use for simd constants is as a control index for vector +// instructions like blend and shuffle. Although the ultimate instruction +// may execute in a single clock cycle, generating the control index adds +// several more cycles to the entire operation. 
// -// One common use for simd constants is as a control index for some simd -// instructions like blend and shuffle. The utilities below do not take this -// into account. Those that generate a simd constant should not be used -// repeatedly. It may be better for the application to reimplement the -// utility to better suit its usage. +// All of the utilities here assume all data is in registers except +// in rare cases where arguments are pointers. // -// More tips: +// Some constants are generated using a memory overlay on the stack. // -// Conversions from integer to vector should be avoided whenever possible. -// Extract, insert and set and set1 instructions should be avoided. -// In addition to the issues with constants set is also very inefficient with -// variables. -// Converting integer data to perform a couple of vector operations -// then converting back to integer should be avoided. Converting data in -// registers should also be avoided. Conversion should be limited to buffers -// in memory where the data is loaded directly to vector registers, bypassing -// the integer to vector conversion. +// Intrinsics automatically promote from REX to VEX when AVX is available +// but ASM needs to be done manually. // -// Pseudo constants. -// -// These can't be used for compile time initialization. -// These should be used for all simple vectors. -// Repeated usage of any simd pseudo-constant should use a locally defined -// const rather than recomputing it for every reference. - -#define m128_zero _mm_setzero_si128() +/////////////////////////////////////////////////////////////////////////// -// As suggested by Intel... -// Arg passing for simd registers is assumed to be first output arg, -// then input args, then locals. This is probably wrong, gcc likely picks -// whichever register is currently holding the variable, or whichever -// register is available to hold it. 
Nevertheless, all args are specified -// by their arg number and local variables use registers starting at -// last arg + 1, by type. -// Output args don't need to be listed as clobbered. +// Efficient and convenient moving between GP & low bits of XMM. +// Use VEX when available to give access to xmm8-15 and zero extend for +// larger vectors. -static inline __m128i m128_one_64_fn() +static inline __m128i mm128_mov64_128( const uint64_t n ) { __m128i a; - asm( "pxor %0, %0\n\t" - "pcmpeqd %%xmm1, %%xmm1\n\t" - "psubq %%xmm1, %0\n\t" - : "=x"(a) - : - : "xmm1" ); - return a; +#if defined(__AVX__) + asm( "vmovq %1, %0\n\t" : "=x"(a) : "r"(n) ); +#else + asm( "movq %1, %0\n\t" : "=x"(a) : "r"(n) ); +#endif + return a; } -#define m128_one_64 m128_one_64_fn() -static inline __m128i m128_one_32_fn() +static inline __m128i mm128_mov32_128( const uint32_t n ) { __m128i a; - asm( "pxor %0, %0\n\t" - "pcmpeqd %%xmm1, %%xmm1\n\t" - "psubd %%xmm1, %0\n\t" - : "=x"(a) - : - : "xmm1" ); - return a; +#if defined(__AVX__) + asm( "vmovd %1, %0\n\t" : "=x"(a) : "r"(n) ); +#else + asm( "movd %1, %0\n\t" : "=x"(a) : "r"(n) ); +#endif + return a; } -#define m128_one_32 m128_one_32_fn() -static inline __m128i m128_one_16_fn() +static inline uint64_t mm128_mov128_64( const __m128i a ) { - __m128i a; - asm( "pxor %0, %0\n\t" - "pcmpeqd %%xmm1, %%xmm1\n\t" - "psubw %%xmm1, %0\n\t" - : "=x"(a) - : - : "xmm1" ); - return a; + uint64_t n; +#if defined(__AVX__) + asm( "vmovq %1, %0\n\t" : "=r"(n) : "x"(a) ); +#else + asm( "movq %1, %0\n\t" : "=r"(n) : "x"(a) ); +#endif + return n; } -#define m128_one_16 m128_one_16_fn() -static inline __m128i m128_one_8_fn() +static inline uint32_t mm128_mov128_32( const __m128i a ) { - __m128i a; - asm( "pxor %0, %0\n\t" - "pcmpeqd %%xmm1, %%xmm1\n\t" - "psubb %%xmm1, %0\n\t" - : "=x"(a) - : - : "xmm1" ); - return a; + uint32_t n; +#if defined(__AVX__) + asm( "vmovd %1, %0\n\t" : "=r"(n) : "x"(a) ); +#else + asm( "movd %1, %0\n\t" : "=r"(n) : "x"(a) ); 
+#endif + return n; } -#define m128_one_8 m128_one_8_fn() -static inline __m128i m128_neg1_fn() -{ - __m128i a; - asm( "pcmpeqd %0, %0\n\t" - : "=x"(a) ); - return a; -} -#define m128_neg1 m128_neg1_fn() +// Equivalent of set1, broadcast integer to all elements. +#define m128_const_i128( i ) mm128_mov64_128( i ) +#define m128_const1_64( i ) _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 ) +#define m128_const1_32( i ) _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 ) + +#if defined(__SSE4_1__) + +// Assign 64 bit integers to respective elements: {hi, lo} +#define m128_const_64( hi, lo ) \ + _mm_insert_epi64( mm128_mov64_128( lo ), hi, 1 ) -#if defined(__SSE41__) +#else // No insert in SSE2 -static inline __m128i m128_one_128_fn() +#define m128_const_64 _mm_set_epi64x + +#endif + +// Pseudo constants + +#define m128_zero _mm_setzero_si128() +#define m128_one_128 mm128_mov64_128( 1 ) +#define m128_one_64 _mm_shuffle_epi32( mm128_mov64_128( 1 ), 0x44 ) +#define m128_one_32 _mm_shuffle_epi32( mm128_mov32_128( 1 ), 0x00 ) +#define m128_one_16 _mm_shuffle_epi32( \ + mm128_mov32_128( 0x00010001 ), 0x00 ) +#define m128_one_8 _mm_shuffle_epi32( \ + mm128_mov32_128( 0x01010101 ), 0x00 ) + +// ASM avoids the need to initialize return variable to avoid compiler warning. +// Macro abstracts function parentheses to look like an identifier. + +static inline __m128i mm128_neg1_fn() { __m128i a; - asm( "pinsrq $0, $1, %0\n\t" - "pinsrq $1, $0, %0\n\t" - : "=x"(a) ); +#if defined(__AVX__) + asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(a) ); +#else + asm( "pcmpeqq %0, %0\n\t" : "=x"(a) ); +#endif return a; } -#define m128_one_128 m128_one_128_fn() +#define m128_neg1 mm128_neg1_fn() -// alternative to _mm_set_epi64x, doesn't use mem, -// cost = 2 pinsrt, estimate 4 clocks. 
-static inline __m128i m128_const_64( uint64_t hi, uint64_t lo ) -{ - __m128i a; - asm( "pinsrq $0, %2, %0\n\t" - "pinsrq $1, %1, %0\n\t" - : "=x"(a) - : "r"(hi), "r"(lo) ); - return a; -} +#if defined(__SSE4_1__) -#else +///////////////////////////// +// +// _mm_insert_ps( _mm128i v1, __m128i v2, imm8 c ) +// +// Fast and powerful but very limited in its application. +// It requires SSE4.1 but only works with 128 bit vectors with 32 bit +// elements. There is no equivalent instruction for 256 bit or 512 bit vectors. +// There's no integer version. There's no 64 bit, 16 bit or byte element +// sizing. It's unique. +// +// It can: +// - zero 32 bit elements of a 128 bit vector. +// - extract any 32 bit element from one 128 bit vector and insert the +// data to any 32 bit element of another 128 bit vector, or the same vector. +// - do both simultaneously. +// +// It can be used as a more efficient replacement for _mm_insert_epi32 +// or _mm_extract_epi32. +// +// Control byte definition: +// c[3:0] zero mask +// c[5:4] destination element selector +// c[7:6] source element selector -#define m128_one_128 _mm_set_epi64x( 0ULL, 1ULL ) +// Convert type and abbreviate name: e"x"tract "i"nsert "m"ask +#define mm128_xim_32( v1, v2, c ) \ + _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \ + _mm_castsi128_ps( v2 ), c ) ) -#define m128_const_64 _mm_set_epi64x +// Some examples of simple operations: -#endif +// Insert 32 bit integer into v at element c and return modified v. +static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i, + const int c ) +{ return mm128_xim_32( v, mm128_mov32_128( i ), c<<4 ); } + +// Extract 32 bit element c from v and return as integer. +static inline uint32_t mm128_extract_32( const __m128i v, const int c ) +{ return mm128_mov128_32( mm128_xim_32( v, v, c<<6 ) ); } + +// Clear (zero) 32 bit elements based on bits set in 4 bit mask. 
+static inline __m128i mm128_mask_32( const __m128i v, const int m ) +{ return mm128_xim_32( v, v, m ); } + +#endif // SSE4_1 // // Basic operations without equivalent SIMD intrinsic @@ -174,11 +180,12 @@ static inline __m128i m128_const_64( uint64_t hi, uint64_t lo ) // Bitwise not (~v) #define mm128_not( v ) _mm_xor_si128( (v), m128_neg1 ) -// Unary negation of elements +// Unary negation of elements (-v) #define mm128_negate_64( v ) _mm_sub_epi64( m128_zero, v ) #define mm128_negate_32( v ) _mm_sub_epi32( m128_zero, v ) #define mm128_negate_16( v ) _mm_sub_epi16( m128_zero, v ) + // Add 4 values, fewer dependencies than sequential addition. #define mm128_add4_64( a, b, c, d ) \ _mm_add_epi64( _mm_add_epi64( a, b ), _mm_add_epi64( c, d ) ) @@ -195,35 +202,6 @@ static inline __m128i m128_const_64( uint64_t hi, uint64_t lo ) #define mm128_xor4( a, b, c, d ) \ _mm_xor_si128( _mm_xor_si128( a, b ), _mm_xor_si128( c, d ) ) -// This isn't cheap, not suitable for bulk usage. -#define mm128_extr_4x32( a0, a1, a2, a3, src ) \ -do { \ - a0 = _mm_extract_epi32( src, 0 ); \ - a1 = _mm_extract_epi32( src, 1 ); \ - a1 = _mm_extract_epi32( src, 2 ); \ - a3 = _mm_extract_epi32( src, 3 ); \ -} while(0) - -// Horizontal vector testing - -#if defined(__SSE41__) - -#define mm128_allbits0( a ) _mm_testz_si128( a, a ) -#define mm128_allbits1( a ) _mm_testc_si128( a, m128_neg1 ) -#define mm128_allbitsne( a ) _mm_testnzc_si128( a, m128_neg1 ) -#define mm128_anybits0 mm128_allbitsne -#define mm128_anybits1 mm128_allbitsne - -#else // SSE2 - -// Bit-wise test of entire vector, useful to test results of cmp. 
-#define mm128_anybits0( a ) (uint128_t)(a) -#define mm128_anybits1( a ) (((uint128_t)(a))+1) - -#define mm128_allbits0( a ) ( !mm128_anybits1(a) ) -#define mm128_allbits1( a ) ( !mm128_anybits0(a) ) - -#endif // SSE41 else SSE2 // // Vector pointer cast @@ -244,102 +222,63 @@ do { \ // returns pointer p+o #define casto_m128i(p,o) (((__m128i*)(p))+(o)) -// SSE2 doesn't implement extract -#if defined(__SSE4_1) - -#define mm128_extr_64(a,n) _mm_extract_epi64( a, n ) -#define mm128_extr_32(a,n) _mm_extract_epi32( a, n ) - -#else - -// Doesn't work with register variables. -#define mm128_extr_64(a,n) (((uint64_t*)&a)[n]) -#define mm128_extr_32(a,n) (((uint32_t*)&a)[n]) - -#endif - - -// Gather and scatter data. -// Surprise, they don't use vector instructions. Several reasons why. -// Since scalar data elements are being manipulated scalar instructions -// are most appropriate and can bypass vector registers. They are faster -// and more efficient on a per instruction basis due to the higher clock -// speed and greater avaiability of execution resources. It's good for -// interleaving data buffers for parallel processing. -// May suffer overhead if data is already in a vector register. This can -// usually be easilly avoided by the coder. Sometimes _mm_set is simply better. -// These macros are likely to be used when transposing matrices rather than -// conversions of a single vector. - -// Gather data elements into contiguous memory for vector use. -// Source args are appropriately sized value integers, destination arg is a -// type agnostic pointer. -// Vector alignment is not required, though likely. Appropriate integer -// alignment satisfies these macros. 
- -// rewrite using insert -#define mm128_gather_64( d, s0, s1 ) \ - ((uint64_t*)d)[0] = (uint64_t)s0; \ - ((uint64_t*)d)[1] = (uint64_t)s1; - -#define mm128_gather_32( d, s0, s1, s2, s3 ) \ - ((uint32_t*)d)[0] = (uint32_t)s0; \ - ((uint32_t*)d)[1] = (uint32_t)s1; \ - ((uint32_t*)d)[2] = (uint32_t)s2; \ - ((uint32_t*)d)[3] = (uint32_t)s3; - -// Scatter data from contiguous memory. -#define mm128_scatter_64( d0, d1, s ) \ - *( (uint64_t*)d0) = ((uint64_t*)s)[0]; \ - *( (uint64_t*)d1) = ((uint64_t*)s)[1]; - -#define mm128_scatter_32( d0, d1, d2, d3, s ) \ - *( (uint32_t*)d0) = ((uint32_t*)s)[0]; \ - *( (uint32_t*)d1) = ((uint32_t*)s)[1]; \ - *( (uint32_t*)d2) = ((uint32_t*)s)[2]; \ - *( (uint32_t*)d3) = ((uint32_t*)s)[3]; - -// Memory functions -// Mostly for convenience, avoids calculating bytes. -// Assumes data is alinged and integral. -// n = number of __m128i, bytes/16 // Memory functions // Mostly for convenience, avoids calculating bytes. // Assumes data is alinged and integral. // n = number of __m128i, bytes/16 -static inline void memset_zero_128( __m128i *dst, int n ) +static inline void memset_zero_128( __m128i *dst, const int n ) { for ( int i = 0; i < n; i++ ) dst[i] = m128_zero; } -static inline void memset_128( __m128i *dst, const __m128i a, int n ) +static inline void memset_128( __m128i *dst, const __m128i a, const int n ) { for ( int i = 0; i < n; i++ ) dst[i] = a; } -static inline void memcpy_128( __m128i *dst, const __m128i *src, int n ) +static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) { for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } // // Bit rotations -// AVX512 has implemented bit rotation for 128 bit vectors with +// AVX512VL has implemented bit rotation for 128 bit vectors with // 64 and 32 bit elements. -// -// Rotate each element of v by c bits +// compiler doesn't like when a variable is used for the last arg of +// _mm_rol_epi32, must be "8 bit immediate". 
Oddly _mm_slli has the same +// specification but works with a variable. Therefore use rol_var where +// necessary. +// sm3-hash-4way.c has one instance where mm128_rol_var_32 is required. -#define mm128_ror_64( v, c ) \ +#define mm128_ror_var_64( v, c ) \ _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) ) -#define mm128_rol_64( v, c ) \ +#define mm128_rol_var_64( v, c ) \ _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) ) -#define mm128_ror_32( v, c ) \ +#define mm128_ror_var_32( v, c ) \ _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) ) -#define mm128_rol_32( v, c ) \ +#define mm128_rol_var_32( v, c ) \ _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ) + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +#define mm128_ror_64 _mm_ror_epi64 +#define mm128_rol_64 _mm_rol_epi64 +#define mm128_ror_32 _mm_ror_epi32 +#define mm128_rol_32 _mm_rol_epi32 + +#else // SSE2 + +#define mm128_ror_64 mm128_ror_var_64 +#define mm128_rol_64 mm128_rol_var_64 +#define mm128_ror_32 mm128_ror_var_32 +#define mm128_rol_32 mm128_rol_var_32 + +#endif // AVX512 else SSE2 + #define mm128_ror_16( v, c ) \ _mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) ) @@ -350,54 +289,24 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, int n ) // Rotate vector elements accross all lanes #define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e ) - #define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 ) #define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 ) +//#define mm128_swap_64( v ) _mm_alignr_epi8( v, v, 8 ) +//#define mm128_ror_1x32( v ) _mm_alignr_epi8( v, v, 4 ) +//#define mm128_rol_1x32( v ) _mm_alignr_epi8( v, v, 12 ) -#if defined (__SSE3__) -// no SSE2 implementation, no current users - -#define mm128_ror_1x16( v ) \ - _mm_shuffle_epi8( v, m128_const_64( 0x01000f0e0d0c0b0a, \ - 0x0908070605040302 ) ) -#define mm128_rol_1x16( v ) \ - 
_mm_shuffle_epi8( v, m128_const_64( 0x0d0c0b0a09080706, \ - 0x0504030201000f0e ) ) -#define mm128_ror_1x8( v ) \ - _mm_shuffle_epi8( v, m128_const_64( 0x000f0e0d0c0b0a09, \ - 0x0807060504030201 ) ) -#define mm128_rol_1x8( v ) \ - _mm_shuffle_epi8( v, m128_const_64( 0x0e0d0c0b0a090807, \ - 0x060504030201000f ) ) -#endif // SSE3 - -// Rotate 16 byte (128 bit) vector by c bytes. -// Less efficient using shift but more versatile. Use only for odd number -// byte rotations. Use shuffle above whenever possible. -#define mm128_bror( v, c ) \ - _mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) ) - -#define mm128_brol( v, c ) \ - _mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) ) - -// -// Rotate elements within lanes. - -#define mm128_swap32_64( v ) _mm_shuffle_epi32( v, 0xb1 ) +// Swap 32 bit elements in 64 bit lanes +#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 ) -#define mm128_ror16_64( v ) _mm_shuffle_epi8( v, \ - m128_const_64( 0x09080f0e0d0c0b0a, 0x0100070605040302 ) -#define mm128_rol16_64( v ) _mm_shuffle_epi8( v, \ - m128_const_64( 0x0dc0b0a09080f0e, 0x0504030201000706 ) +#if defined(__SSSE3__) -#define mm128_swap16_32( v ) _mm_shuffle_epi8( v, \ - m128_const_64( 0x0d0c0f0e09080b0a, 0x0504070601000302 ) +// Rotate right by c bytes +static inline __m128i mm128_ror_x8( const __m128i v, const int c ) +{ return _mm_alignr_epi8( v, v, c ); } // // Endian byte swap. 
-#if defined(__SSSE3__) - #define mm128_bswap_64( v ) \ _mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \ 0x0001020304050607 ) ) @@ -406,8 +315,9 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, int n ) _mm_shuffle_epi8( v, m128_const_64( 0x0c0d0e0f08090a0b, \ 0x0405060700010203 ) ) -#define mm128_bswap_16( v ) _mm_shuffle_epi8( \ - m128_const_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) +#define mm128_bswap_16( v ) \ + _mm_shuffle_epi8( v, m128_const_64( 0x0e0f0c0d0a0b0809, \ + 0x0607040502030001 ) // 8 byte qword * 8 qwords * 2 lanes = 128 bytes #define mm128_block_bswap_64( d, s ) do \ @@ -439,7 +349,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, int n ) #else // SSE2 -// Use inline function instead of macro due to multiple statements. static inline __m128i mm128_bswap_64( __m128i v ) { v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) ); @@ -459,7 +368,7 @@ static inline __m128i mm128_bswap_16( __m128i v ) return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) ); } -static inline void mm128_block_bswap_64( __m128i *d, __m128i *s ) +static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s ) { d[0] = mm128_bswap_64( s[0] ); d[1] = mm128_bswap_64( s[1] ); @@ -471,7 +380,7 @@ static inline void mm128_block_bswap_64( __m128i *d, __m128i *s ) d[7] = mm128_bswap_64( s[7] ); } -static inline void mm128_block_bswap_32( __m128i *d, __m128i *s ) +static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s ) { d[0] = mm128_bswap_32( s[0] ); d[1] = mm128_bswap_32( s[1] ); @@ -490,64 +399,65 @@ static inline void mm128_block_bswap_32( __m128i *d, __m128i *s ) // Swap 128 bit vectorse. -#define mm128_swap128_256( v1, v2 ) \ +#define mm128_swap256_128( v1, v2 ) \ v1 = _mm_xor_si128( v1, v2 ); \ v2 = _mm_xor_si128( v1, v2 ); \ v1 = _mm_xor_si128( v1, v2 ); + // Concatenate v1 & v2 and rotate as one 256 bit vector. 
#if defined(__SSE4_1__) -#define mm128_ror1x64_256( v1, v2 ) \ +#define mm128_ror256_64( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 8 ); \ v1 = _mm_alignr_epi8( v2, v1, 8 ); \ v2 = t; \ } while(0) -#define mm128_rol1x64_256( v1, v2 ) \ +#define mm128_rol256_64( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 8 ); \ v2 = _mm_alignr_epi8( v2, v1, 8 ); \ v1 = t; \ } while(0) -#define mm128_ror1x32_256( v1, v2 ) \ +#define mm128_ror256_32( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 4 ); \ v1 = _mm_alignr_epi8( v2, v1, 4 ); \ v2 = t; \ } while(0) -#define mm128_rol1x32_256( v1, v2 ) \ +#define mm128_rol256_32( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 12 ); \ v2 = _mm_alignr_epi8( v2, v1, 12 ); \ v1 = t; \ } while(0) -#define mm128_ror1x16_256( v1, v2 ) \ +#define mm128_ror256_16( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 2 ); \ v1 = _mm_alignr_epi8( v2, v1, 2 ); \ v2 = t; \ } while(0) -#define mm128_rol1x16_256( v1, v2 ) \ +#define mm128_rol256_16( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 14 ); \ v2 = _mm_alignr_epi8( v2, v1, 14 ); \ v1 = t; \ } while(0) -#define mm128_ror1x8_256( v1, v2 ) \ +#define mm128_ror256_8( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 1 ); \ v1 = _mm_alignr_epi8( v2, v1, 1 ); \ v2 = t; \ } while(0) -#define mm128_rol1x8_256( v1, v2 ) \ +#define mm128_rol256_8( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 15 ); \ v2 = _mm_alignr_epi8( v2, v1, 15 ); \ @@ -556,59 +466,75 @@ do { \ #else // SSE2 -#define mm128_ror1x64_256( v1, v2 ) \ +#define mm128_ror256_64( v1, v2 ) \ do { \ - __m128i t = _mm_srli_si128( v1, 8 ) | _mm_slli_si128( v2, 8 ); \ - v2 = _mm_srli_si128( v2, 8 ) | _mm_slli_si128( v1, 8 ); \ + __m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \ + _mm_slli_si128( v2, 8 ) ); \ + v2 = _mm_or_si128( _mm_srli_si128( v2, 8 ), \ + _mm_slli_si128( v1, 8 ) ); \ v1 = t; \ } while(0) -#define mm128_rol1x64_256( v1, v2 ) \ +#define mm128_rol256_64( v1, v2 ) \ 
do { \ - __m128i t = _mm_slli_si128( v1, 8 ) | _mm_srli_si128( v2, 8 ); \ - v2 = _mm_slli_si128( v2, 8 ) | _mm_srli_si128( v1, 8 ); \ + __m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \ + _mm_srli_si128( v2, 8 ) ); \ + v2 = _mm_or_si128( _mm_slli_si128( v2, 8 ), \ + _mm_srli_si128( v1, 8 ) ); \ v1 = t; \ } while(0) -#define mm128_ror1x32_256( v1, v2 ) \ +#define mm128_ror256_32( v1, v2 ) \ do { \ - __m128i t = _mm_srli_si128( v1, 4 ) | _mm_slli_si128( v2, 12 ); \ - v2 = _mm_srli_si128( v2, 4 ) | _mm_slli_si128( v1, 12 ); \ + __m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \ + _mm_slli_si128( v2, 12 ) ); \ + v2 = _mm_or_si128( _mm_srli_si128( v2, 4 ), \ + _mm_slli_si128( v1, 12 ) ); \ v1 = t; \ } while(0) -#define mm128_rol1x32_256( v1, v2 ) \ +#define mm128_rol256_32( v1, v2 ) \ do { \ - __m128i t = _mm_slli_si128( v1, 4 ) | _mm_srli_si128( v2, 12 ); \ - v2 = _mm_slli_si128( v2, 4 ) | _mm_srli_si128( v1, 12 ); \ + __m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \ + _mm_srli_si128( v2, 12 ) ); \ + v2 = _mm_or_si128( _mm_slli_si128( v2, 4 ), \ + _mm_srli_si128( v1, 12 ) ); \ v1 = t; \ } while(0) -#define mm128_ror1x16_256( v1, v2 ) \ +#define mm128_ror256_16( v1, v2 ) \ do { \ - __m128i t = _mm_srli_si128( v1, 2 ) | _mm_slli_si128( v2, 14 ); \ - v2 = _mm_srli_si128( v2, 2 ) | _mm_slli_si128( v1, 14 ); \ + __m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \ + _mm_slli_si128( v2, 14 ) ); \ + v2 = _mm_or_si128( _mm_srli_si128( v2, 2 ), \ + _mm_slli_si128( v1, 14 ) ); \ v1 = t; \ } while(0) -#define mm128_rol1x16_256( v1, v2 ) \ +#define mm128_rol256_16( v1, v2 ) \ do { \ - __m128i t = _mm_slli_si128( v1, 2 ) | _mm_srli_si128( v2, 14 ); \ - v2 = _mm_slli_si128( v2, 2 ) | _mm_srli_si128( v1, 14 ); \ + __m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \ + _mm_srli_si128( v2, 14 ) ); \ + v2 = _mm_or_si128( _mm_slli_si128( v2, 2 ), \ + _mm_srli_si128( v1, 14 ) ); \ v1 = t; \ } while(0) -#define mm128_ror1x8_256( v1, v2 ) \ +#define mm128_ror256_8( v1, v2 ) \ do { 
\ - __m128i t = _mm_srli_si128( v1, 1 ) | _mm_slli_si128( v2, 15 ); \ - v2 = _mm_srli_si128( v2, 1 ) | _mm_slli_si128( v1, 15 ); \ + __m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \ + _mm_slli_si128( v2, 15 ) ); \ + v2 = _mm_or_si128( _mm_srli_si128( v2, 1 ), \ + _mm_slli_si128( v1, 15 ) ); \ v1 = t; \ } while(0) -#define mm128_rol1x8_256( v1, v2 ) \ +#define mm128_rol256_8( v1, v2 ) \ do { \ - __m128i t = _mm_slli_si128( v1, 1 ) | _mm_srli_si128( v2, 15 ); \ - v2 = _mm_slli_si128( v2, 1 ) | _mm_srli_si128( v1, 15 ); \ + __m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \ + _mm_srli_si128( v2, 15 ) ); \ + v2 = _mm_or_si128( _mm_slli_si128( v2, 1 ), \ + _mm_srli_si128( v1, 15 ) ); \ v1 = t; \ } while(0) diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 9f7a233..635eb4f 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -1,7 +1,7 @@ #if !defined(SIMD_256_H__) #define SIMD_256_H__ 1 -#if defined(__AVX__) +#if defined(__AVX2__) ///////////////////////////////////////////////////////////////////// // @@ -14,205 +14,67 @@ // is limited because 256 bit vectors are less likely to be used when 512 // is available. -// -// Pseudo constants. -// These can't be used for compile time initialization but are preferable -// for simple constant vectors at run time. For repeated use define a local -// constant to avoid multiple calls to the same macro. - -#define m256_zero _mm256_setzero_si256() - -#define m256_one_256 \ - _mm256_insertf128_si256( _mm256_castsi128_si256( m128_one_128 ), \ - m128_zero, 1 ) - -#define m256_one_128 \ - _mm256_insertf128_si256( _mm256_castsi128_si256( m128_one_128 ), \ - m128_one_128, 1 ) - -// set instructions load memory resident constants, this avoids mem. -// cost 4 pinsert + 1 vinsert, estimate 7 clocks. -// Avoid using, mm128_const_64 twice is still faster. 
-#define m256_const_64( i3, i2, i1, i0 ) \ - _mm256_insertf128_si256( _mm256_castsi128_si256( m128_const_64( i1, i0 ) ), \ - m128_const_64( i3, i2 ), 1 ) -#define m256_const1_64( i ) m256_const_64( i, i, i, i ) - -#if defined(__AVX2__) - -// These look like a lot of overhead but the compiler optimizes nicely -// and puts the asm inline in the calling function. Usage is like any -// variable expression. -// __m256i foo = m256_one_64; +// Move integer to low element of vector, other elements are set to zero. +#define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) ) +#define mm256_mov32_256( i ) _mm256_castsi128_si256( mm128_mov32_128( i ) ) -static inline __m256i m256_one_64_fn() -{ - __m256i a; - asm( "vpxor %0, %0, %0\n\t" - "vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t" - "vpsubq %%ymm1, %0, %0\n\t" - : "=x"(a) - : - : "ymm1" ); - return a; -} -#define m256_one_64 m256_one_64_fn() - -static inline __m256i m256_one_32_fn() -{ - __m256i a; - asm( "vpxor %0, %0, %0\n\t" - "vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t" - "vpsubd %%ymm1, %0, %0\n\t" - : "=x"(a) - : - : "ymm1" ); - return a; -} -#define m256_one_32 m256_one_32_fn() +// Move low element of vector to integer. 
+#define mm256_mov256_64( v ) mm128_mov128_64( _mm256_castsi256_si128( v ) ) +#define mm256_mov256_32( v ) mm128_mov128_32( _mm256_castsi256_si128( v ) ) -static inline __m256i m256_one_16_fn() -{ - __m256i a; - asm( "vpxor %0, %0, %0\n\t" - "vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t" - "vpsubw %%ymm1, %0, %0\n\t" - : "=x"(a) - : - : "ymm1" ); - return a; -} -#define m256_one_16 m256_one_16_fn() +// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo } +#define mm256_concat_128( hi, lo ) \ + _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 ) -static inline __m256i m256_one_8_fn() -{ - __m256i a; - asm( "vpxor %0, %0, %0\n\t" - "vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t" - "vpsubb %%ymm1, %0, %0\n\t" - : "=x"(a) - : - : "ymm1" ); - return a; -} -#define m256_one_8 m256_one_8_fn() -static inline __m256i m256_neg1_fn() +// Equivalent of set, move 64 bit integer constants to respective 64 bit +// elements. +static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2, + const uint64_t i1, const uint64_t i0 ) { - __m256i a; - asm( "vpcmpeqq %0, %0, %0\n\t" - : "=x"(a) ); - return a; + union { __m256i m256i; + uint64_t u64[4]; } v; + v.u64[0] = i0; v.u64[1] = i1; v.u64[2] = i2; v.u64[3] = i3; + return v.m256i; } -#define m256_neg1 m256_neg1_fn() -#else // AVX +// Equivalent of set1. 
+// 128 bit vector argument +#define m256_const1_128( v ) \ + _mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 ) +// 64 bit integer argument +#define m256_const1_i128( i ) m256_const1_128( mm128_mov64_128( i ) ) +#define m256_const1_64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) ) +#define m256_const1_32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) ) +#define m256_const1_16( i ) _mm256_broadcastw_epi16( mm128_mov32_128( i ) ) +#define m256_const1_8 ( i ) _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) ) -#define m256_one_64 _mm256_set1_epi64x( 1ULL ) -#define m256_one_32 _mm256_set1_epi64x( 0x0000000100000001ULL ) -#define m256_one_16 _mm256_set1_epi64x( 0x0001000100010001ULL ) -#define m256_one_8 _mm256_set1_epi64x( 0x0101010101010101ULL ) +#define m256_const2_64( i1, i0 ) \ + m256_const1_128( m128_const_64( i1, i0 ) ) -// AVX doesn't have inserti128 but insertf128 will do. -// Ideally this can be done with 2 instructions and no temporary variables. -static inline __m256i m256_neg1_fn() +// +// All SIMD constant macros are actually functions containing executable +// code and therefore can't be used as compile time initializers. + +#define m256_zero _mm256_setzero_si256() +#define m256_one_256 mm256_mov64_256( 1 ) +#define m256_one_128 m256_const1_i128( 1 ) +#define m256_one_64 _mm256_broadcastq_epi64( mm128_mov64_128( 1 ) ) +#define m256_one_32 _mm256_broadcastd_epi32( mm128_mov64_128( 1 ) ) +#define m256_one_16 _mm256_broadcastw_epi16( mm128_mov64_128( 1 ) ) +#define m256_one_8 _mm256_broadcastb_epi8 ( mm128_mov64_128( 1 ) ) + +static inline __m256i mm256_neg1_fn() { - __m128i a = m128_neg1; - return _mm256_insertf128_si256( _mm256_castsi128_si256( a ), a, 1 ); + __m256i v; + asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(v) ); + return v; } -#define m256_neg1 m256_neg1_fn() -//#define m256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL ) - -#endif // AVX2 else AVX -// -// Vector size conversion. 
-// -// Allows operations on either or both halves of a 256 bit vector serially. -// Handy for parallel AES. -// Caveats when writing: -// _mm256_castsi256_si128 is free and without side effects. -// _mm256_castsi128_si256 is also free but leaves the high half -// undefined. That's ok if the hi half will be subseqnently assigned. -// If assigning both, do lo first, If assigning only 1, use -// _mm256_inserti128_si256. -// -#define mm128_extr_lo128_256( a ) _mm256_castsi256_si128( a ) -#define mm128_extr_hi128_256( a ) _mm256_extracti128_si256( a, 1 ) - -// Extract 4 u64 from 256 bit vector. -#define mm256_extr_4x64( a0, a1, a2, a3, src ) \ -do { \ - __m128i hi = _mm256_extracti128_si256( src, 1 ); \ - a0 = _mm_extract_epi64( _mm256_castsi256_si128( src ), 0 ); \ - a1 = _mm_extract_epi64( _mm256_castsi256_si128( src ), 1 ); \ - a2 = _mm_extract_epi64( hi, 0 ); \ - a3 = _mm_extract_epi64( hi, 1 ); \ -} while(0) - -#define mm256_extr_8x32( a0, a1, a2, a3, a4, a5, a6, a7, src ) \ -do { \ - __m128i hi = _mm256_extracti128_si256( src, 1 ); \ - a0 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 0 ); \ - a1 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 1 ); \ - a2 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 2 ); \ - a3 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 3 ); \ - a4 = _mm_extract_epi32( hi, 0 ); \ - a5 = _mm_extract_epi32( hi, 1 ); \ - a6 = _mm_extract_epi32( hi, 2 ); \ - a7 = _mm_extract_epi32( hi, 3 ); \ -} while(0) - -// input __m128i, returns __m256i -// To build a 256 bit vector from 2 128 bit vectors lo must be done first. -// lo alone leaves hi undefined, hi alone leaves lo unchanged. -// Both cost one clock while preserving the other half.. -// Insert b into specified half of a leaving other half of a unchanged. 
-#define mm256_ins_lo128_256( a, b ) _mm256_inserti128_si256( a, b, 0 ) -#define mm256_ins_hi128_256( a, b ) _mm256_inserti128_si256( a, b, 1 ) - - -// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo } -#define mm256_concat_128( hi, lo ) \ - mm256_ins_hi128_256( _mm256_castsi128_si256( lo ), hi ) - -// Horizontal vector testing -#if defined(__AVX2__) - -#define mm256_allbits0( a ) _mm256_testz_si256( a, a ) -#define mm256_allbits1( a ) _mm256_testc_si256( a, m256_neg1 ) -#define mm256_allbitsne( a ) _mm256_testnzc_si256( a, m256_neg1 ) -#define mm256_anybits0 mm256_allbitsne -#define mm256_anybits1 mm256_allbitsne - -#else // AVX - -// Bit-wise test of entire vector, useful to test results of cmp. -#define mm256_anybits0( a ) \ - ( (uint128_t)mm128_extr_hi128_256( a ) \ - | (uint128_t)mm128_extr_lo128_256( a ) ) +#define m256_neg1 mm256_neg1_fn() -#define mm256_anybits1( a ) \ - ( ( (uint128_t)mm128_extr_hi128_256( a ) + 1 ) \ - | ( (uint128_t)mm128_extr_lo128_256( a ) + 1 ) ) - -#define mm256_allbits0_256( a ) ( !mm256_anybits1(a) ) -#define mm256_allbits1_256( a ) ( !mm256_anybits0(a) ) - -#endif // AVX2 else AVX - -// Parallel AES, for when x is expected to be in a 256 bit register. -// Use same 128 bit key. -#define mm256_aesenc_2x128( x, k ) \ - mm256_concat_128( _mm_aesenc_si128( mm128_extr_hi128_256( x ), k ), \ - _mm_aesenc_si128( mm128_extr_lo128_256( x ), k ) ) - -#define mm256_paesenc_2x128( y, x, k ) do \ -{ \ - __m128i *X = (__m128i*)x; \ - __m128i *Y = (__m128i*)y; \ - Y[0] = _mm_aesenc_si128( X[0], k ); \ - Y[1] = _mm_aesenc_si128( X[1], k ); \ -} while(0); +// Consistent naming for similar operations. 
+#define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v ) +#define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 ) // // Pointer casting @@ -234,74 +96,30 @@ do { \ #define casto_m256i(p,o) (((__m256i*)(p))+(o)) -// Gather scatter - -#define mm256_gather_64( d, s0, s1, s2, s3 ) \ - ((uint64_t*)(d))[0] = (uint64_t)(s0); \ - ((uint64_t*)(d))[1] = (uint64_t)(s1); \ - ((uint64_t*)(d))[2] = (uint64_t)(s2); \ - ((uint64_t*)(d))[3] = (uint64_t)(s3); - -#define mm256_gather_32( d, s0, s1, s2, s3, s4, s5, s6, s7 ) \ - ((uint32_t*)(d))[0] = (uint32_t)(s0); \ - ((uint32_t*)(d))[1] = (uint32_t)(s1); \ - ((uint32_t*)(d))[2] = (uint32_t)(s2); \ - ((uint32_t*)(d))[3] = (uint32_t)(s3); \ - ((uint32_t*)(d))[4] = (uint32_t)(s4); \ - ((uint32_t*)(d))[5] = (uint32_t)(s5); \ - ((uint32_t*)(d))[6] = (uint32_t)(s6); \ - ((uint32_t*)(d))[7] = (uint32_t)(s7); - - -// Scatter data from contiguous memory. -// All arguments are pointers -#define mm256_scatter_64( d0, d1, d2, d3, s ) \ - *((uint64_t*)(d0)) = ((uint64_t*)(s))[0]; \ - *((uint64_t*)(d1)) = ((uint64_t*)(s))[1]; \ - *((uint64_t*)(d2)) = ((uint64_t*)(s))[2]; \ - *((uint64_t*)(d3)) = ((uint64_t*)(s))[3]; - -#define mm256_scatter_32( d0, d1, d2, d3, d4, d5, d6, d7, s ) \ - *((uint32_t*)(d0)) = ((uint32_t*)(s))[0]; \ - *((uint32_t*)(d1)) = ((uint32_t*)(s))[1]; \ - *((uint32_t*)(d2)) = ((uint32_t*)(s))[2]; \ - *((uint32_t*)(d3)) = ((uint32_t*)(s))[3]; \ - *((uint32_t*)(d4)) = ((uint32_t*)(s))[4]; \ - *((uint32_t*)(d5)) = ((uint32_t*)(s))[5]; \ - *((uint32_t*)(d6)) = ((uint32_t*)(s))[6]; \ - *((uint32_t*)(d7)) = ((uint32_t*)(s))[7]; - - // // Memory functions // n = number of 256 bit (32 byte) vectors -static inline void memset_zero_256( __m256i *dst, int n ) +static inline void memset_zero_256( __m256i *dst, const int n ) { for ( int i = 0; i < n; i++ ) dst[i] = m256_zero; } -static inline void memset_256( __m256i *dst, const __m256i a, int n ) +static inline void memset_256( __m256i *dst, const __m256i a, const int n ) 
{ for ( int i = 0; i < n; i++ ) dst[i] = a; } -static inline void memcpy_256( __m256i *dst, const __m256i *src, int n ) +static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) { for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } -/////////////////////////////// -// -// AVX2 needed from now on. -// - -#if defined(__AVX2__) // // Basic operations without SIMD equivalent -// Bitwise not ( ~x ) -#define mm256_not( x ) _mm256_xor_si256( (x), m256_neg1 ) \ +// Bitwise not ( ~v ) +#define mm256_not( v ) _mm256_xor_si256( v, m256_neg1 ) \ -// Unary negation of each element ( -a ) -#define mm256_negate_64( a ) _mm256_sub_epi64( m256_zero, a ) -#define mm256_negate_32( a ) _mm256_sub_epi32( m256_zero, a ) -#define mm256_negate_16( a ) _mm256_sub_epi16( m256_zero, a ) +// Unary negation of each element ( -v ) +#define mm256_negate_64( v ) _mm256_sub_epi64( m256_zero, v ) +#define mm256_negate_32( v ) _mm256_sub_epi32( m256_zero, v ) +#define mm256_negate_16( v ) _mm256_sub_epi16( m256_zero, v ) // Add 4 values, fewer dependencies than sequential addition. @@ -327,26 +145,47 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n ) // The only bit shift for more than 64 bits is with __int128. // // AVX512 has bit rotate for 256 bit vectors with 64 or 32 bit elements -// but is of little value -// -// Rotate each element of v by c bits -#define mm256_ror_64( v, c ) \ + +// compiler doesn't like when a variable is used for the last arg of +// _mm_rol_epi32, must be "8 bit immediate". Therefore use rol_var where +// necessary. 
+ +#define mm256_ror_var_64( v, c ) \ _mm256_or_si256( _mm256_srli_epi64( v, c ), \ _mm256_slli_epi64( v, 64-(c) ) ) -#define mm256_rol_64( v, c ) \ +#define mm256_rol_var_64( v, c ) \ _mm256_or_si256( _mm256_slli_epi64( v, c ), \ _mm256_srli_epi64( v, 64-(c) ) ) -#define mm256_ror_32( v, c ) \ +#define mm256_ror_var_32( v, c ) \ _mm256_or_si256( _mm256_srli_epi32( v, c ), \ _mm256_slli_epi32( v, 32-(c) ) ) -#define mm256_rol_32( v, c ) \ +#define mm256_rol_var_32( v, c ) \ _mm256_or_si256( _mm256_slli_epi32( v, c ), \ _mm256_srli_epi32( v, 32-(c) ) ) + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// AVX512, control must be 8 bit immediate. + +#define mm256_ror_64 _mm256_ror_epi64 +#define mm256_rol_64 _mm256_rol_epi64 +#define mm256_ror_32 _mm256_ror_epi32 +#define mm256_rol_32 _mm256_rol_epi32 + +#else // AVX2 + +#define mm256_ror_64 mm256_ror_var_64 +#define mm256_rol_64 mm256_rol_var_64 +#define mm256_ror_32 mm256_ror_var_32 +#define mm256_rol_32 mm256_rol_var_32 + +#endif // AVX512 else AVX2 + #define mm256_ror_16( v, c ) \ _mm256_or_si256( _mm256_srli_epi16( v, c ), \ _mm256_slli_epi16( v, 16-(c) ) ) @@ -355,39 +194,39 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n ) _mm256_or_si256( _mm256_slli_epi16( v, c ), \ _mm256_srli_epi16( v, 16-(c) ) ) -// Rotate bits in each element of v by the amount in corresponding element of -// index vector c -#define mm256_rorv_64( v, c ) \ - _mm256_or_si256( \ - _mm256_srlv_epi64( v, c ), \ - _mm256_sllv_epi64( v, _mm256_sub_epi64( \ - _mm256_set1_epi64x( 64 ), c ) ) ) - -#define mm256_rolv_64( v, c ) \ - _mm256_or_si256( \ - _mm256_sllv_epi64( v, c ), \ - _mm256_srlv_epi64( v, _mm256_sub_epi64( \ - _mm256_set1_epi64x( 64 ), c ) ) ) - -#define mm256_rorv_32( v, c ) \ - _mm256_or_si256( \ - _mm256_srlv_epi32( v, c ), \ - _mm256_sllv_epi32( v, _mm256_sub_epi32( \ - _mm256_set1_epi32( 32 ), c ) ) ) - -#define mm256_rolv_32( v, c ) \ - 
_mm256_or_si256( \ - _mm256_sllv_epi32( v, c ), \ - _mm256_srlv_epi32( v, _mm256_sub_epi32( \ - _mm256_set1_epi32( 32 ), c ) ) ) - -// AVX512 can do 16 bit elements. // // Rotate elements accross all lanes. // // AVX2 has no full vector permute for elements less than 32 bits. // AVX512 has finer granularity full vector permutes. +// AVX512 has full vector alignr which might be faster, especially for 32 bit + + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +static inline __m256i mm256_swap_128( const __m256i v ) +{ return _mm256_alignr_epi64( v, v, 2 ); } + +static inline __m256i mm256_ror_1x64( const __m256i v ) +{ return _mm256_alignr_epi64( v, v, 1 ); } + +static inline __m256i mm256_rol_1x64( const __m256i v ) +{ return _mm256_alignr_epi64( v, v, 3 ); } + +static inline __m256i mm256_ror_1x32( const __m256i v ) +{ return _mm256_alignr_epi32( v, v, 1 ); } + +static inline __m256i mm256_rol_1x32( const __m256i v ) +{ return _mm256_alignr_epi32( v, v, 7 ); } + +static inline __m256i mm256_ror_3x32( const __m256i v ) +{ return _mm256_alignr_epi32( v, v, 3 ); } + +static inline __m256i mm256_rol_3x32( const __m256i v ) +{ return _mm256_alignr_epi32( v, v, 5 ); } + +#else // AVX2 // Swap 128 bit elements in 256 bit vector. #define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e ) @@ -396,8 +235,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n ) #define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 ) #define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 ) -// A little faster with avx512 -// Rotate 256 bit vector by one 32 bit element. Use 64 bit set, it's faster. +// Rotate 256 bit vector by one 32 bit element. 
#define mm256_ror_1x32( v ) \ _mm256_permutevar8x32_epi32( v, \ m256_const_64( 0x0000000000000007, 0x0000000600000005, \ @@ -419,106 +257,44 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n ) m256_const_64( 0x0000000400000003, 0x0000000200000001, \ 0x0000000000000007, 0x0000000600000005 ) -// AVX512 can do 16 & 8 bit elements. -#if defined(__AVX512VL__) +#endif // AVX512 else AVX2 -// Rotate 256 bit vector by one 16 bit element. -#define mm256_ror_1x16( v ) \ - _mm256_permutexvar_epi16( m256_const_64( \ - 0x0000000f000e000d, 0x000c000b000a0009, \ - 0x0008000700060005, 0x0004000300020001 ), v ) - -#define mm256_rol_1x16( v ) \ - _mm256_permutexvar_epi16( m256_const_64( \ - 0x000e000d000c000b, 0x000a000900080007, \ - 0x0006000500040003, 0x000200010000000f ), v ) +// +// Rotate elements within each 128 bit lane of 256 bit vector. -// Rotate 256 bit vector by one byte. -#define mm256_ror_1x8( v ) m256_const_64( \ - 0x001f1e1d1c1b1a19, 0x1817161514131211, \ - 0x100f0e0d0c0b0a09, 0x0807060504030201 ) +#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e ) +#define mm256_ror128_32( v ) _mm256_shuffle_epi32( v, 0x39 ) +#define mm256_rol128_32( v ) _mm256_shuffle_epi32( v, 0x93 ) -#define mm256_rol_1x8( v ) m256_const_64( \ - 0x1e1d1c1b1a191817, 0x161514131211100f, \ - 0x0e0d0c0b0a090807, 0x060504030201001f ) +static inline __m256i mm256_ror128_x8( const __m256i v, const int c ) +{ return _mm256_alignr_epi8( v, v, c ); } -#endif // AVX512 - -// -// Rotate elements within lanes of 256 bit vector. - -// Swap 64 bit elements in each 128 bit lane. -#define mm256_swap64_128( v ) _mm256_shuffle_epi32( v, 0x4e ) - -// Rotate each 128 bit lane by one 32 bit element. -#define mm256_ror1x32_128( v ) _mm256_shuffle_epi32( v, 0x39 ) -#define mm256_rol1x32_128( v ) _mm256_shuffle_epi32( v, 0x93 ) - -// Rotate each 128 bit lane by one 16 bit element. 
-#define mm256_rol1x16_128( v ) \ - _mm256_shuffle_epi8( v, _mm256_set_epi16( 6,5,4,3,2,1,0,7, \ - 6,5,4,3,2,1,0,7 ) ) -#define mm256_ror1x16_128( v ) \ - _mm256_shuffle_epi8( v, _mm256_set_epi16( 0,7,6,5,4,3,2,1, \ - 0,7,6,5,4,3,2,1 ) ) - -// Rotate each 128 bit lane by one byte -#define mm256_rol1x8_128( v ) \ - _mm256_shuffle_epi8( v, _mm256_set_epi8(14,13,12,11,10, 9, 8, 7, \ - 6, 5, 4, 3, 2, 1, 0,15, \ - 14,13,12,11,10, 9, 8, 7, \ - 6, 5, 4, 3, 2, 1, 0,15 ) ) -#define mm256_ror1x8_128( v ) \ - _mm256_shuffle_epi8( v, _mm256_set_epi8( 0,15,14,13,12,11,10, 9, \ - 8, 7, 6, 5, 4, 3, 2, 1, \ - 0,15,14,13,12,11,10, 9, \ - 8, 7, 6, 5, 4, 3, 2, 1 ) ) - -// Rotate each 128 bit lane by c bytes. -#define mm256_bror_128( v, c ) \ - _mm256_or_si256( _mm256_bsrli_epi128( v, c ), \ - _mm256_bslli_epi128( v, 16-(c) ) ) -#define mm256_brol_128( v, c ) \ - _mm256_or_si256( _mm256_bslli_epi128( v, c ), \ - _mm256_bsrli_epi128( v, 16-(c) ) ) - -// Swap 32 bit elements in each 64 bit lane -#define mm256_swap32_64( v ) _mm256_shuffle_epi32( v, 0xb1 ) - -#define mm256_ror16_64( v ) \ - _mm256_shuffle_epi8( v, _mm256_set_epi16( 4,7,6,5,0,3,2,1, \ - 4,7,6,5,0,3,2,1 ) ) -#define mm256_rol16_64( v ) \ - _mm256_shuffle_epi8( v, _mm256_set_epi16( 6,5,4,7,2,1,0,3, \ - 6,5,4,7,2,1,0,3 ) ) - - -// Swap 16 bit elements in each 32 bit lane -#define mm256_swap16_32( v ) \ - _mm256_shuffle_epi8( v, _mm256_set_epi16( 6,7,4,5,2,3,0,1, \ - 6,7,4,5,2,3,0,1 ) ) +// Swap 32 bit elements in each 64 bit lane. +#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 ) // // Swap bytes in vector elements, endian bswap. 
 #define mm256_bswap_64( v ) \
-   _mm256_shuffle_epi8( v, m256_const_64( 0x08090a0b0c0d0e0f, \
-                       0x0001020304050607, 0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
+   _mm256_shuffle_epi8( v, \
+                  m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
+                                 0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
 
 #define mm256_bswap_32( v ) \
-   _mm256_shuffle_epi8( v, m256_const_64( 0x0c0d0e0f08090a0b, \
-                     0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203 ) )
+   _mm256_shuffle_epi8( v, \
+                  m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
+                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ) )
 
 #define mm256_bswap_16( v ) \
-   _mm256_shuffle_epi8( v, _mm256_set_epi8( 14,15, 12,13, 10,11, 8, 9, \
-                                             6, 7, 4, 5, 2, 3, 0, 1, \
-                                            14,15, 12,13, 10,11, 8, 9, \
-                                             6, 7, 4, 5, 2, 3, 0, 1 ) )
+   _mm256_shuffle_epi8( v, \
+                  m256_const_64( 0x1e1f1c1d1a1b1819, 0x1617141512131011, \
+                                 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) )
 
+// Source and destination are pointers, may point to same memory.
 // 8 byte qword * 8 qwords * 4 lanes = 256 bytes
 #define mm256_block_bswap_64( d, s ) do \
 { \
-   __m256i ctl = m256_const_64( 0x08090a0b0c0d0e0f, 0x0001020304050607, \
-                                0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
+   __m256i ctl = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
+                                0x08090a0b0c0d0e0f, 0x0001020304050607 ) ; \
    casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
    casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
    casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
@@ -532,7 +308,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
 // 4 byte dword * 8 dwords * 8 lanes = 256 bytes
 #define mm256_block_bswap_32( d, s ) do \
 { \
-   __m256i ctl = m256_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203, \
+   __m256i ctl = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
    casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
    casti_m256i( d, 1 ) =
_mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \ @@ -551,82 +327,28 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n ) // Some of these can use permute but appears to be slower. Maybe a Ryzen // issue -#define mm256_swap256_512 (v1, v2) \ - v1 = _mm256_xor_si256(v1, v2); \ - v2 = _mm256_xor_si256(v1, v2); \ - v1 = _mm256_xor_si256(v1, v2); - -#define mm256_ror1x128_512( v1, v2 ) \ -do { \ - __m256i t = _mm256_alignr_epi8( v1, v2, 16 ); \ - v1 = _mm256_alignr_epi8( v2, v1, 16 ); \ - v2 = t; \ -} while(0) +// _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also +// makes these macros unnecessary. -#define mm256_rol1x128_512( v1, v2 ) \ -do { \ - __m256i t = _mm256_alignr_epi8( v1, v2, 16 ); \ - v2 = _mm256_alignr_epi8( v2, v1, 16 ); \ - v1 = t; \ -} while(0) - -#define mm256_ror1x64_512( v1, v2 ) \ -do { \ - __m256i t = _mm256_alignr_epi8( v1, v2, 8 ); \ - v1 = _mm256_alignr_epi8( v2, v1, 8 ); \ - v2 = t; \ -} while(0) - -#define mm256_rol1x64_512( v1, v2 ) \ -do { \ - __m256i t = _mm256_alignr_epi8( v1, v2, 24 ); \ - v2 = _mm256_alignr_epi8( v2, v1, 24 ); \ - v1 = t; \ -} while(0) - -#define mm256_ror1x32_512( v1, v2 ) \ -do { \ - __m256i t = _mm256_alignr_epi8( v1, v2, 4 ); \ - v1 = _mm256_alignr_epi8( v2, v1, 4 ); \ - v2 = t; \ -} while(0) - -#define mm256_rol1x32_512( v1, v2 ) \ -do { \ - __m256i t = _mm256_alignr_epi8( v1, v2, 28 ); \ - v2 = _mm256_alignr_epi8( v2, v1, 28 ); \ - v1 = t; \ -} while(0) - -#define mm256_ror1x16_512( v1, v2 ) \ -do { \ - __m256i t = _mm256_alignr_epi8( v1, v2, 2 ); \ - v1 = _mm256_alignr_epi8( v2, v1, 2 ); \ - v2 = t; \ -} while(0) - -#define mm256_rol1x16_512( v1, v2 ) \ -do { \ - __m256i t = _mm256_alignr_epi8( v1, v2, 30 ); \ - v2 = _mm256_alignr_epi8( v2, v1, 30 ); \ - v1 = t; \ -} while(0) +#define mm256_swap512_256( v1, v2 ) \ + v1 = _mm256_xor_si256( v1, v2 ); \ + v2 = _mm256_xor_si256( v1, v2 ); \ + v1 = _mm256_xor_si256( v1, v2 ); -#define mm256_ror1x8_512( v1, v2 ) \ +#define 
mm256_ror512_128( v1, v2 ) \ do { \ - __m256i t = _mm256_alignr_epi8( v1, v2, 1 ); \ - v1 = _mm256_alignr_epi8( v2, v1, 1 ); \ + __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \ + v1 = _mm256_permute2x128( v2, v1, 0x21 ); \ v2 = t; \ } while(0) -#define mm256_rol1x8_512( v1, v2 ) \ +#define mm256_rol512_128( v1, v2 ) \ do { \ - __m256i t = _mm256_alignr_epi8( v1, v2, 31 ); \ - v2 = _mm256_alignr_epi8( v2, v1, 31 ); \ + __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \ + v2 = _mm256_permute2x128( v2, v1, 0x21 ); \ v1 = t; \ } while(0) #endif // __AVX2__ -#endif // __AVX__ #endif // SIMD_256_H__ diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index fd98b80..22c5331 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -1,35 +1,70 @@ #if !defined(SIMD_512_H__) #define SIMD_512_H__ 1 -#if defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +//////////////////////////////////////////////////////////////////////// +// +// AVX-512 +// +// The baseline for these utilities is AVX512F, AVX512DQ, AVX512BW +// and AVX512VL, first available in quantity in Skylake-X. +// Some utilities may require additional features available in subsequent +// architectures and are noted. -//////////////////////////////////////////////////////// +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// AVX512 intrinsics have a few changes from previous conventions. +// +// cmp instruction now returns a bitmask isnstead of a vector mask. +// This eliminates the need for the blendv instruction. +// +// The new rotate instructions require the count to be an 8 bit +// immediate value only. Compilation fails if a variable is used. +// The documentation is the same as for shift and it works with +// variables. +// +// _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute +// usually shuffles accross all lanes. +// +// permutexvar has args reversed, index is first arg. 
Previously all +// permutes and shuffles have the index last. +// +// _mm512_permutexvar_epi8 requires AVX512-VBMI, larger elements don't. +// It also performs the same op as _mm512_shuffle_epi8. +// +// shuffle_epi8 shuffles accross entire 512 bits. Shuffle usually +// doesn't cross 128 bit lane boundaries but is consistent with AVX2 +// where shuffle_epi8 spans the entire vector. +// +// There are 2 areas where overhead is aconcern: constants and +// permutations. +// +// Constants need to be composed at run time by assembling individual +// elements, very expensive. The cost is proportional to the number of +// different elements therefore use the largest element size possible, +// merge smaller integer elements to 64 bits, and group repeated elements. +// +// Constants with repeating patterns can be optimized with the smaller +// patterns repeated more frequently being more efficient. +// +// Some specific constants can be very efficient. Zero is very efficient, +// 1 and -1 slightly less so. +// +// If an expensive constant is to be reused in the same function it should +// be declared as a local variable defined once and reused. +// +// Permutations can be very expensive if they use a vector control index, +// even if the permutation itself is quite efficient. +// The index is essentially a constant with all the baggage that brings. +// The same rules apply, if an index is to be reused it should be defined +// as a local. This applies specifically to bswap operations. // -// Some extentsions in AVX512 supporting operations on -// smaller elements in 256 bit vectors. - -// Variable rotate, each element rotates by corresponding index. 
-#define mm256_rorv_16( v, c ) \ - _mm256_or_si256( \ - _mm256_srlv_epi16( v, _mm256_set1_epi16( c ) ), \ - _mm256_sllv_epi16( v, _mm256_set1_epi16( 16-(c) ) ) ) - -#define mm256_rolv_16( v, c ) \ - _mm256_or_si256( \ - _mm256_sllv_epi16( v, _mm256_set1_epi16( c ) ), \ - _mm256_srlv_epi16( v, _mm256_set1_epi16( 16-(c) ) ) ) - -// Invert vector: {7,6,5,4,3,2,1,0} -> {0,1,2,3,4,5,6,7} -#define mm256_invert_16 ( v ) \ - _mm256_permutex_epi16( v, _mm256_set_epi16( 0, 1, 2, 3, 4, 5, 6, 7, \ - 8, 9,10,11,12,13,14,15 ) ) - -#define mm256_invert_8( v ) \ - _mm256_permutex_epi8( v, _mm256_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, \ - 8, 9,10,11,12,13,14,15, \ - 16,17,18,19,20,21,22,23, \ - 24,25,26,27,28,29,30,31 ) ) +// Additionally, permutations using smaller vectors can be more efficient +// if the permutation doesn't cross lane boundaries, typically 128 bits, +// and the smnaller vector can use an imm comtrol. +// +// If the permutation doesn't cross lane boundaries a shuffle instructions +// can be used with imm control instead of permute. ////////////////////////////////////////////////////////////// // @@ -38,112 +73,107 @@ // Other AVX512 extensions that may be required for some functions. // __AVX512VBMI__ __AVX512VAES__ // -// Experimental, not fully tested. +// Move integer to/from element 0 of vector. -// -// Pseudo constants. +#define mm512_mov64_512( n ) _mm512_castsi128_si512( mm128_mov64_128( n ) ) +#define mm512_mov32_512( n ) _mm512_castsi128_si512( mm128_mov32_128( n ) ) -// _mm512_setzero_si512 uses xor instruction. If needed frequently -// in a function is it better to define a register variable (const?) -// initialized to zero. -// It isn't clear to me yet how set or set1 actually work. 
+#define mm512_mov256_64( a ) mm128_mov128_64( _mm512_castsi512_si128( a ) )
+#define mm512_mov256_32( a ) mm128_mov128_32( _mm512_castsi512_si128( a ) )
-#define m512_zero _mm512_setzero_si512()
-#define m512_one_512 _mm512_set_epi64( 0ULL, 0ULL, 0ULL, 0ULL, \
-                                        0ULL, 0ULL, 0ULL, 1ULL )
-#define m512_one_256 _mm512_set4_epi64( 0ULL, 0ULL, 0ULL, 1ULL )
-#define m512_one_128 _mm512_set4_epi64( 0ULL, 1ULL, 0ULL, 1ULL )
-//#define m512_one_64 _mm512_set1_epi64( 1ULL )
-//#define m512_one_32 _mm512_set1_epi32( 1UL )
-//#define m512_one_16 _mm512_set1_epi16( 1U )
-//#define m512_one_8 _mm512_set1_epi8( 1U )
-//#define m512_neg1 _mm512_set1_epi64( 0xFFFFFFFFFFFFFFFFULL )
-
-#define mi512_const_64( i7, i6, i5, i4, i3, i2, i1, i0 ) \
-  _mm512_inserti64x4( _mm512_castsi512_si256( m256_const_64( i3.i2,i1,i0 ) ), \
-                       m256_const_64( i7,i6,i5,i4 ), 1 )
-#define m512_const1_64( i ) m256_const_64( i, i, i, i, i, i, i, i )
-
-static inline __m512i m512_one_64_fn()
-{
- __m512i a;
- asm( "vpxorq %0, %0, %0\n\t"
-      "vpcmpeqd %%zmm1, %%zmm1, %%zmm1\n\t"
-      "vpsubq %%zmm1, %0, %0\n\t"
-      :"=x"(a)
-      :
-      : "zmm1" );
- return a;
-}
-#define m512_one_64 m512_one_64_fn()
+// A simple 128 bit permute, using function instead of macro avoids
+// problems if the v arg passed as an expression.
+static inline __m512i mm512_perm_128( const __m512i v, const int c )
+{ return _mm512_shuffle_i64x2( v, v, c ); }
-static inline __m512i m512_one_32_fn()
-{
- __m512i a;
- asm( "vpxord %0, %0, %0\n\t"
-      "vpcmpeqd %%zmm1, %%zmm1, %%zmm1\n\t"
-      "vpsubd %%zmm1, %0, %0\n\t"
-      :"=x"(a)
-      :
-      : "zmm1" );
- return a;
-}
-#define m512_one_32 m512_one_32_fn()
+// Concatenate two 256 bit vectors into one 512 bit vector {hi, lo}
+#define mm512_concat_256( hi, lo ) \
+   _mm512_inserti64x4( _mm512_castsi256_si512( lo ), hi, 1 )
-static inline __m512i m512_one_16_fn()
+// Equivalent of set, assign 64 bit integers to respective 64 bit elements.
+// Use stack memory overlay +static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6, + const uint64_t i5, const uint64_t i4, + const uint64_t i3, const uint64_t i2, + const uint64_t i1, const uint64_t i0 ) { - __m512i a; - asm( "vpxord %0, %0, %0\n\t" - "vpcmpeqd %%zmm1, %%zmm1, %%zmm1\n\t" - "vpsubw %%zmm1, %0, %0\n\t" - :"=x"(a) - : - : "zmm1" ); - return a; + union { __m512i m512i; + uint64_t u64[8]; } v; + v.u64[0] = i0; v.u64[1] = i1; + v.u64[2] = i2; v.u64[3] = i3; + v.u64[4] = i4; v.u64[5] = i5; + v.u64[6] = i6; v.u64[7] = i7; + return v.m512i; } -#define m512_one_16 m512_one_16_fn() -static inline __m512i m512_one_8_fn() -{ - __m512i a; - asm( "vpxord %0, %0, %0\n\t" - "vpcmpeqd %%zmm1, %%zmm1, %%zmm1\n\t" - "vpsubb %%zmm1, %0, %0\n\t" - :"=x"(a) - : - : "zmm1" ); - return a; -} -#define m512_one_8 m512_one_8_fn() +// Equivalent of set1, broadcast lo element all elements. +static inline __m512i m512_const1_256( const __m256i v ) +{ return _mm512_inserti64x4( _mm512_castsi256_si512( v ), v, 1 ); } + +#define m512_const1_128( v ) \ + mm512_perm_128( _mm512_castsi128_si512( v ), 0 ) +// Integer input argument up to 64 bits +#define m512_const1_i128( i ) \ + mm512_perm_128( _mm512_castsi128_si512( mm128_mov64_128( i ) ), 0 ) + +//#define m512_const1_256( v ) _mm512_broadcast_i64x4( v ) +//#define m512_const1_128( v ) _mm512_broadcast_i64x2( v ) +#define m512_const1_64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) ) +#define m512_const1_32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) ) +#define m512_const1_16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) ) +#define m512_const1_8( i ) _mm512_broadcastb_epi8 ( mm128_mov32_128( i ) ) -static inline __m512i m512_neg1_fn() +#define m512_const2_128( v1, v0 ) \ + m512_const1_256( _mm512_inserti64x2( _mm512_castsi128_si512( v0 ), v1, 1 ) ) + +#define m512_const2_64( i1, i0 ) \ + m512_const1_128( m128_const_64( i1, i0 ) ) + + +static inline __m512i m512_const4_64( const uint64_t i3, 
const uint64_t i2, + const uint64_t i1, const uint64_t i0 ) { - __m512i a; - asm( "vpcmpeqq %0, %0, %0\n\t" - :"=x"(a) ); - return a; + union { __m512i m512i; + uint64_t u64[8]; } v; + v.u64[0] = v.u64[4] = i0; + v.u64[1] = v.u64[5] = i1; + v.u64[2] = v.u64[6] = i2; + v.u64[3] = v.u64[7] = i3; + return v.m512i; } -#define m512_neg1 m512_neg1_fn() +// +// Pseudo constants. + +// _mm512_setzero_si512 uses xor instruction. If needed frequently +// in a function is it better to define a register variable (const?) +// initialized to zero. + +#define m512_zero _mm512_setzero_si512() +#define m512_one_512 mm512_mov64_512( 1 ) +#define m512_one_256 _mm512_inserti64x4( m512_one_512, m256_one_256, 1 ) +#define m512_one_128 m512_const1_i128( 1 ) +#define m512_one_64 m512_const1_64( 1 ) +#define m512_one_32 m512_const1_32( 1 ) +#define m512_one_16 m512_const1_16( 1 ) +#define m512_one_8 m512_const1_8( 1 ) + +//#define m512_neg1 m512_const1_64( 0xffffffffffffffff ) +#define m512_neg1 _mm512_movm_epi64( 0xff ) // // Basic operations without SIMD equivalent +// ~x #define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 ) + +// -x #define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x ) #define mm512_negate_32( x ) _mm512_sub_epi32( m512_zero, x ) #define mm512_negate_16( x ) _mm512_sub_epi16( m512_zero, x ) - -#define mm256_extr_lo256_512( a ) _mm512_castsi512_si256( a ) -#define mm256_extr_hi256_512( a ) _mm512_extracti64x4_epi64( a, 1 ) - -#define mm128_extr_lo128_512( a ) _mm512_castsi512_si256( a ) - - - // // Pointer casting @@ -163,72 +193,21 @@ static inline __m512i m512_neg1_fn() // returns p+o as pointer to vector #define casto_m512i(p,o) (((__m512i*)(p))+(o)) -// Gather scatter - -#define mm512_gather_64( d, s0, s1, s2, s3, s4, s5, s6, s7 ) \ - ((uint64_t*)(d))[0] = (uint64_t)(s0); \ - ((uint64_t*)(d))[1] = (uint64_t)(s1); \ - ((uint64_t*)(d))[2] = (uint64_t)(s2); \ - ((uint64_t*)(d))[3] = (uint64_t)(s3); \ - ((uint64_t*)(d))[4] = (uint64_t)(s4); \ - 
((uint64_t*)(d))[5] = (uint64_t)(s5); \ - ((uint64_t*)(d))[6] = (uint64_t)(s6); \ - ((uint64_t*)(d))[7] = (uint64_t)(s7); - - -#define mm512_gather_32( d, s00, s01, s02, s03, s04, s05, s06, s07, \ - s08, s09, s10, s11, s12, s13, s14, s15 ) \ - ((uint32_t*)(d))[ 0] = (uint32_t)(s00); \ - ((uint32_t*)(d))[ 1] = (uint32_t)(s01); \ - ((uint32_t*)(d))[ 2] = (uint32_t)(s02); \ - ((uint32_t*)(d))[ 3] = (uint32_t)(s03); \ - ((uint32_t*)(d))[ 4] = (uint32_t)(s04); \ - ((uint32_t*)(d))[ 5] = (uint32_t)(s05); \ - ((uint32_t*)(d))[ 6] = (uint32_t)(s06); \ - ((uint32_t*)(d))[ 7] = (uint32_t)(s07); \ - ((uint32_t*)(d))[ 8] = (uint32_t)(s08); \ - ((uint32_t*)(d))[ 9] = (uint32_t)(s09); \ - ((uint32_t*)(d))[10] = (uint32_t)(s10); \ - ((uint32_t*)(d))[11] = (uint32_t)(s11); \ - ((uint32_t*)(d))[12] = (uint32_t)(s12); \ - ((uint32_t*)(d))[13] = (uint32_t)(s13); \ - ((uint32_t*)(d))[13] = (uint32_t)(s14); \ - ((uint32_t*)(d))[15] = (uint32_t)(s15); - -// Scatter data from contiguous memory. -// All arguments are pointers -#define mm512_scatter_64( d0, d1, d2, d3, d4, d5, d6, d7, s ) \ - *((uint64_t*)(d0)) = ((uint64_t*)(s))[0]; \ - *((uint64_t*)(d1)) = ((uint64_t*)(s))[1]; \ - *((uint64_t*)(d2)) = ((uint64_t*)(s))[2]; \ - *((uint64_t*)(d3)) = ((uint64_t*)(s))[3]; \ - *((uint64_t*)(d4)) = ((uint64_t*)(s))[4]; \ - *((uint64_t*)(d5)) = ((uint64_t*)(s))[5]; \ - *((uint64_t*)(d6)) = ((uint64_t*)(s))[6]; \ - *((uint64_t*)(d7)) = ((uint64_t*)(s))[7]; - - -#define mm512_scatter_32( d00, d01, d02, d03, d04, d05, d06, d07, \ - d08, d09, d10, d11, d12, d13, d14, d15, s ) \ - *((uint32_t*)(d00)) = ((uint32_t*)(s))[ 0]; \ - *((uint32_t*)(d01)) = ((uint32_t*)(s))[ 1]; \ - *((uint32_t*)(d02)) = ((uint32_t*)(s))[ 2]; \ - *((uint32_t*)(d03)) = ((uint32_t*)(s))[ 3]; \ - *((uint32_t*)(d04)) = ((uint32_t*)(s))[ 4]; \ - *((uint32_t*)(d05)) = ((uint32_t*)(s))[ 5]; \ - *((uint32_t*)(d06)) = ((uint32_t*)(s))[ 6]; \ - *((uint32_t*)(d07)) = ((uint32_t*)(s))[ 7]; \ - *((uint32_t*)(d00)) = ((uint32_t*)(s))[ 8]; 
\ - *((uint32_t*)(d01)) = ((uint32_t*)(s))[ 9]; \ - *((uint32_t*)(d02)) = ((uint32_t*)(s))[10]; \ - *((uint32_t*)(d03)) = ((uint32_t*)(s))[11]; \ - *((uint32_t*)(d04)) = ((uint32_t*)(s))[12]; \ - *((uint32_t*)(d05)) = ((uint32_t*)(s))[13]; \ - *((uint32_t*)(d06)) = ((uint32_t*)(s))[14]; \ - *((uint32_t*)(d07)) = ((uint32_t*)(s))[15]; - -// Add 4 values, fewer dependencies than sequential addition. +// +// Memory functions +// n = number of 512 bit (64 byte) vectors + +static inline void memset_zero_512( __m512i *dst, const int n ) +{ for ( int i = 0; i < n; i++ ) dst[i] = m512_zero; } + +static inline void memset_512( __m512i *dst, const __m512i a, const int n ) +{ for ( int i = 0; i < n; i++ ) dst[i] = a; } + +static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) +{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } + +// Sum 4 values, fewer dependencies than sequential addition. #define mm512_add4_64( a, b, c, d ) \ _mm512_add_epi64( _mm512_add_epi64( a, b ), _mm512_add_epi64( c, d ) ) @@ -243,373 +222,321 @@ static inline __m512i m512_neg1_fn() _mm512_add_epi8( _mm512_add_epi8( a, b ), _mm512_add_epi8( c, d ) ) #define mm512_xor4( a, b, c, d ) \ - _mm512_xor_si512( _mm512_xor_si256( a, b ), _mm512_xor_si256( c, d ) ) - + _mm512_xor_si512( _mm512_xor_si512( a, b ), _mm512_xor_si512( c, d ) ) // // Bit rotations. -// AVX512F has built-in bit fixed and variable rotation for 64 & 32 bit -// elements. There is no bit rotation or shift for larger elements. +// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit +// elements and can be called directly. But they only accept immediate 8 +// for control arg. 
// // _mm512_rol_epi64, _mm512_ror_epi64, _mm512_rol_epi32, _mm512_ror_epi32 // _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32 // -// Here is a bit rotate for 16 bit elements: -#define mm512_ror_16( v, c ) \ - _mm512_or_si512( _mm512_srli_epi16( v, c ), \ - _mm512_slli_epi16( v, 16-(c) ) -#define mm512_rol_16( v, c ) \ - _mm512_or_si512( _mm512_slli_epi16( v, c ), \ - _mm512_srli_epi16( v, 16-(c) ) + +// For convenience and consistency with AVX2 +#define mm512_ror_64 _mm512_ror_epi64 +#define mm512_rol_64 _mm512_rol_epi64 +#define mm512_ror_32 _mm512_ror_epi32 +#define mm512_rol_32 _mm512_rol_epi32 + +static inline __m512i mm512_ror_var_64( const __m512i v, const int c ) +{ + return _mm512_or_si512( _mm512_srli_epi64( v, c ), + _mm512_slli_epi64( v, 64-c ) ); +} + +static inline __m512i mm512_rol_var_64( const __m512i v, const int c ) +{ + return _mm512_or_si512( _mm512_slli_epi64( v, c ), + _mm512_srli_epi64( v, 64-c ) ); +} + +static inline __m512i mm512_ror_var_32( const __m512i v, const int c ) +{ + return _mm512_or_si512( _mm512_srli_epi32( v, c ), + _mm512_slli_epi32( v, 32-c ) ); +} + +static inline __m512i mm512_rol_var_32( const __m512i v, const int c ) +{ + return _mm512_or_si512( _mm512_slli_epi32( v, c ), + _mm512_srli_epi32( v, 32-c ) ); +} + +static inline __m512i mm512_ror_16( __m512i const v, const int c ) +{ + return _mm512_or_si512( _mm512_srli_epi16( v, c ), + _mm512_slli_epi16( v, 16-c ) ); +} + +static inline __m512i mm512_rol_16( const __m512i v, const int c ) +{ + return _mm512_or_si512( _mm512_slli_epi16( v, c ), + _mm512_srli_epi16( v, 16-c ) ); +} + +// Rotations using a vector control index are very slow due to overhead +// to generate the index vector. Repeated rotations using the same index +// are better handled by the calling function where the index only needs +// to be generated once then reused very efficiently. +// Permutes and shuffles using an immediate index are significantly faster. 
+ +// +// Swap bytes in vector elements, vectorized endian conversion. + +#define mm512_bswap_64( v ) \ + _mm512_shuffle_epi8( v, \ + m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \ + 0x28292a2b2c2d2e2f, 0x2021222324252627, \ + 0x18191a1b1c1d1e1f, 0x1011121314151617, \ + 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) + +#define mm512_bswap_32( v ) \ + _mm512_shuffle_epi8( v, \ + m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \ + 0x2c2d2e2f28292a2b, 0x2425262720212223, \ + 0x1c1d1e1f18191a1b, 0x1415161710111213, \ + 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) + +#define mm512_bswap_16( v ) \ + _mm512_shuffle_epi8( v, \ + m512_const_64( 0x3e3f3c3d3a3b3839, 0x3637343532333031, \ + 0x2e2f2c2d2a2b2829, 0x2627242522232021, \ + 0x1e1f1c1d1a1b1819, 0x1617141512131011, \ + 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) + +// Source and destination are pointers, may point to same memory. +// 8 lanes of 64 bytes each +#define mm512_block_bswap_64( d, s ) do \ +{ \ + __m512i ctl = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \ + 0x28292a2b2c2d2e2f, 0x2021222324252627, \ + 0x18191a1b1c1d1e1f, 0x1011121314151617, \ + 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \ + casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \ + casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \ + casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \ + casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \ + casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \ + casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \ + casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \ + casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \ +} while(0) + +// 16 lanes of 32 bytes each +#define mm512_block_bswap_32( d, s ) do \ +{ \ + __m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \ + 0x2c2d2e2f28292a2b, 0x2425262720212223, \ + 
0x1c1d1e1f18191a1b, 0x1415161710111213, \ + 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ + casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \ + casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \ + casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \ + casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \ + casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \ + casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \ + casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \ + casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \ +} while(0) + // // Rotate elements in 512 bit vector. -#define mm512_swap_256( v ) _mm512_alignr_epi64( v, v, 4 ) +static inline __m512i mm512_swap_256( const __m512i v ) +{ return _mm512_alignr_epi64( v, v, 4 ); } + +static inline __m512i mm512_ror_1x128( const __m512i v ) +{ return _mm512_alignr_epi64( v, v, 2 ); } -#define mm512_ror_1x128( v ) _mm512_alignr_epi64( v, v, 2 ) -#define mm512_rol_1x128( v ) _mm512_alignr_epi64( v, v, 6 ) +static inline __m512i mm512_rol_1x128( const __m512i v ) +{ return _mm512_alignr_epi64( v, v, 6 ); } -#define mm512_ror_1x64( v ) _mm512_alignr_epi64( v, v, 1 ) -#define mm512_rol_1x64( v ) _mm512_alignr_epi64( v, v, 7 ) +static inline __m512i mm512_ror_1x64( const __m512i v ) +{ return _mm512_alignr_epi64( v, v, 1 ); } -#define mm512_ror_1x32( v ) _mm512_alignr_epi32( v, v, 1 ) -#define mm512_rol_1x32( v ) _mm512_alignr_epi32( v, v, 15 ) +static inline __m512i mm512_rol_1x64( const __m512i v ) +{ return _mm512_alignr_epi64( v, v, 7 ); } -// Generic for odd rotations -#define mm512_ror_x64( v, n ) _mm512_alignr_epi64( v, v, n ) +static inline __m512i mm512_ror_1x32( const __m512i v ) +{ return _mm512_alignr_epi32( v, v, 1 ); } -#define mm512_ror_x32( v, n ) _mm512_alignr_epi32( v, v, n ) +static inline __m512i mm512_rol_1x32( const __m512i v ) +{ return 
_mm512_alignr_epi32( v, v, 15 ); } +static inline __m512i mm512_ror_x64( const __m512i v, const int n ) +{ return _mm512_alignr_epi64( v, v, n ); } +static inline __m512i mm512_ror_x32( const __m512i v, const int n ) +{ return _mm512_alignr_epi32( v, v, n ); } #define mm512_ror_1x16( v ) \ - _mm512_permutexvar_epi16( v, m512_const_64( \ + _mm512_permutexvar_epi16( m512_const_64( \ 0x0000001F001E001D, 0x001C001B001A0019, \ 0X0018001700160015, 0X0014001300120011, \ 0X0010000F000E000D, 0X000C000B000A0009, \ - 0X0008000700060005, 0X0004000300020001 ) ) + 0X0008000700060005, 0X0004000300020001 ), v ) #define mm512_rol_1x16( v ) \ - _mm512_permutexvar_epi16( v, m512_const_64( \ + _mm512_permutexvar_epi16( m512_const_64( \ 0x001E001D001C001B, 0x001A001900180017, \ 0X0016001500140013, 0X001200110010000F, \ 0X000E000D000C000B, 0X000A000900080007, \ - 0X0006000500040003, 0X000200010000001F ) ) - + 0X0006000500040003, 0X000200010000001F ), v ) #define mm512_ror_1x8( v ) \ - _mm512_permutexvar_epi8( v, m512_const_64( \ + _mm512_shuffle_epi8( v, m512_const_64( \ 0x003F3E3D3C3B3A39, 0x3837363534333231, \ 0x302F2E2D2C2B2A29, 0x2827262524232221, \ 0x201F1E1D1C1B1A19. 0x1817161514131211, \ 0x100F0E0D0C0B0A09, 0x0807060504030201 ) ) #define mm512_rol_1x8( v ) \ - _mm512_permutexvar_epi8( v, m512_const_64( \ + _mm512_shuffle_epi8( v, m512_const_64( \ 0x3E3D3C3B3A393837, 0x363534333231302F. 
\ 0x2E2D2C2B2A292827, 0x262524232221201F, \ 0x1E1D1C1B1A191817, 0x161514131211100F, \ 0x0E0D0C0B0A090807, 0x060504030201003F ) ) -// Invert vector: {3,2,1,0} -> {0,1,2,3} -#define mm512_invert_128( v ) _mm512_permute4f128_epi32( a, 0x1b ) - -#define mm512_invert_64( v ) \ - _mm512_permutex_epi64( v, m512_const_64( 0,1,2,3,4,5,6,7 ) ) - -#define mm512_invert_32( v ) \ - _mm512_permutexvar_epi32( v, _mm512_set_epi32( \ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 ) ) - - -#define mm512_invert_16( v ) \ - _mm512_permutexvar_epi16( v, _mm512_set_epi32( \ - 0x00000001, 0x00020003, 0x00040005, 0x00060007, \ - 0x00080009, 0x000A000B, 0x000C000D, 0x000E000F, \ - 0x00100011, 0x00120013, 0x00140015, 0x00160017, \ - 0x00180019, 0x001A001B, 0x001C001D, 0x001E001F ) ) - -#define mm512_invert_8( v ) \ - _mm512_permutexvar_epi8( v, _mm512_set_epi32( \ - 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F, \ - 0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F, \ - 0x20212223, 0x24252627, 0x28292A2B, 0x2C2D2E2F, \ - 0x30313233, 0x34353637, 0x38393A3B, 0x3C3D3E3F ) ) - // // Rotate elements within 256 bit lanes of 512 bit vector. 
// Swap hi & lo 128 bits in each 256 bit lane -#define mm512_swap128_256( v ) _mm512_permutex_epi64( v, 0x4e ) +#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e ) // Rotate 256 bit lanes by one 64 bit element -#define mm512_ror1x64_256( v ) _mm512_permutex_epi64( v, 0x39 ) -#define mm512_rol1x64_256( v ) _mm512_permutex_epi64( v, 0x93 ) +#define mm512_ror256_64( v ) _mm512_permutex_epi64( v, 0x39 ) +#define mm512_rol256_64( v ) _mm512_permutex_epi64( v, 0x93 ) // Rotate 256 bit lanes by one 32 bit element -#define mm512_ror1x32_256( v ) \ - _mm512_permutexvar_epi32( v, _mm512_set_epi32( \ - 8,15,14,13,12,11,10, 9, 0, 7, 6, 5, 4, 3, 2, 1 ) ) -#define mm512_rol1x32_256( v ) \ - _mm512_permutexvar_epi32( v, _mm512_set_epi32( \ - 14,13,12,11,10, 9, 8,15, 6, 5, 4, 3, 2, 1, 0, 7 ) ) -#define mm512_ror1x16_256( v ) \ - _mm512_permutexvar_epi16( v, _mm512_set_epi32( \ - 0x0010001F, 0x001E001D, 0x001C001B, 0x001A0019, \ - 0x00180017, 0x00160015, 0x00140013, 0x00120011, \ - 0x0000000F, 0x000E000D, 0x000C000B, 0x000A0009, \ - 0x00080007, 0x00060005, 0x00040003, 0x00020001 ) ) - -#define mm512_rol1x16_256( v ) \ - _mm512_permutexvar_epi16( v, _mm512_set_epi32( \ - 0x001E001D, 0x001C001B, 0x001A0019, 0x00180017, \ - 0x00160015, 0x00140013, 0x00120011, 0x0000000F, \ - 0x000E000D, 0x000C000B, 0x000A0009, 0x00080007, \ - 0x00060005, 0x00040003, 0x00020001, 0x0000001F ) ) - -#define mm512_ror1x8_256( v ) \ - _mm512_permutexvar_epi8( v, _mm512_set_epi32( \ - 0x203F3E3D, 0x3C3B3A39, 0x38373635, 0x34333231, \ - 0x302F2E2D, 0x2C2B2A29, 0x28272625, 0x24232221, \ - 0x001F1E1D, 0x1C1B1A19, 0x18171615, 0x14131211, \ - 0x100F0E0D, 0x0C0B0A09, 0x08070605, 0x04030201 ) ) - -#define mm512_rol1x8_256( v ) \ - _mm512_permutexvar_epi8( v, _mm512_set_epi32( \ - 0x3E3D3C3B, 0x3A393837, 0x36353433, 0x3231302F, \ - 0x2E2D2C2B, 0x2A292827, 0x26252423, 0x2221203F, \ - 0x1E1D1C1B, 0x1A191817, 0x16151413, 0x1211100F, \ - 0x0E0D0C0B, 0x0A090807, 0x06050403, 0x0201001F ) ) +#define 
mm512_ror256_32( v ) \ + _mm512_permutexvar_epi32( m512_const_64( \ + 0x000000080000000f, 0x0000000e0000000d, \ + 0x0000000c0000000b, 0x0000000a00000009, \ + 0x0000000000000007, 0x0000000600000005, \ + 0x0000000400000003, 0x0000000200000001 ), v ) + +#define mm512_rol256_32( v ) \ + _mm512_permutexvar_epi32( m512_const_64( \ + 0x0000000e0000000d, 0x0000000c0000000b, \ + 0x0000000a00000009, 0x000000080000000f, \ + 0x0000000600000005, 0x0000000400000003, \ + 0x0000000200000001, 0x0000000000000007 ), v ) + +#define mm512_ror256_16( v ) \ + _mm512_permutexvar_epi16( m512_const_64( \ + 0x00100001001e001d, 0x001c001b001a0019, \ + 0x0018001700160015, 0x0014001300120011, \ + 0x0000000f000e000d, 0x000c000b000a0009, \ + 0x0008000700060005, 0x0004000300020001 ), v ) + +#define mm512_rol256_16( v ) \ + _mm512_permutexvar_epi16( m512_const_64( \ + 0x001e001d001c001b, 0x001a001900180017, \ + 0x0016001500140013, 0x001200110010001f, \ + 0x000e000d000c000b, 0x000a000900080007, \ + 0x0006000500040003, 0x000200010000000f ), v ) + +#define mm512_ror256_8( v ) \ + _mm512_shuffle_epi8( v, m512_const_64( \ + 0x203f3e3d3c3b3a39, 0x3837363534333231, \ + 0x302f2e2d2c2b2a29, 0x2827262524232221, \ + 0x001f1e1d1c1b1a19, 0x1817161514131211, \ + 0x100f0e0d0c0b0a09, 0x0807060504030201 ) ) + +#define mm512_rol256_8( v ) \ + _mm512_shuffle_epi8( v, m512_const_64( \ + 0x3e3d3c3b3a393837, 0x363534333231302f, \ + 0x2e2d2c2b2a292827, 0x262524232221203f, \ + 0x1e1d1c1b1a191817, 0x161514131211100f, \ + 0x0e0d0c0b0a090807, 0x060504030201001f ) ) // // Rotate elements within 128 bit lanes of 512 bit vector. 
-// Swap hi & lo 64 bits in each 128 bit lane -#define mm512_swap64_128( v ) _mm512_permutex_epi64( v, 0xb1 ) +// Swap 64 bits in each 128 bit lane +#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e ) // Rotate 128 bit lanes by one 32 bit element -#define mm512_ror1x32_128( v ) _mm512_shuffle_epi32( v, 0x39 ) -#define mm512_rol1x32_128( v ) _mm512_shuffle_epi32( v, 0x93 ) - -#define mm512_ror1x16_128( v ) \ - _mm512_permutexvar_epi16( v, m512_const_64( \ - 0x0018001F001E001D, 0x001C001B001A0019, \ - 0x0010001700160015, 0x0014001300120011, \ - 0x0008000F000E000D, 0x000C000B000A0009, \ - 0x0000000700060005, 0x0004000300020001 ) ) - -#define mm512_rol1x16_128( v ) \ - _mm512_permutexvar_epi16( v, m512_const_64( \ - 0x001E001D001C001B, 0x001A00190018001F, \ - 0x0016001500140013, 0x0012001100100017, \ - 0x000E000D000C000B, 0x000A00090008000F, \ - 0x0006000500040003, 0x0002000100000007 ) ) - -#define mm512_ror1x8_128( v ) \ - _mm512_permutexvar_epi8( v, m512_const_64( \ - 0x303F3E3D3C3B3A39, 0x3837363534333231, \ - 0x202F2E2D2C2B2A29, 0x2827262524232221, \ - 0x101F1E1D1C1B1A19, 0x1817161514131211, \ - 0x000F0E0D0C0B0A09, 0x0807060504030201 ) ) - -#define mm512_rol1x8_128( v ) \ - _mm512_permutexvar_epi8( v, m512_const_64( \ - 0x3E3D3C3B3A393837, 0x363534333231303F, \ - 0x2E2D2C2B2A292827, 0x262524232221202F, \ - 0x1E1D1C1B1A191817, 0x161514131211101F, \ - 0x0E0D0C0B0A090807, 0x060504030201000F ) ) - -// Rotate 128 bit lanes by c bytes. 
-#define mm512_bror_128( v, c ) \ - _mm512_or_si512( _mm512_bsrli_epi128( v, c ), \ - _mm512_bslli_epi128( v, 16-(c) ) ) -#define mm512_brol_128( v, c ) \ - _mm512_or_si512( _mm512_bslli_epi128( v, c ), \ - _mm512_bsrli_epi128( v, 16-(c) ) ) +#define mm512_ror128_32( v ) _mm512_shuffle_epi32( v, 0x39 ) +#define mm512_rol128_32( v ) _mm512_shuffle_epi32( v, 0x93 ) +// Rotate right 128 bit lanes by c bytes +static inline __m512i mm512_ror128_x8( const __m512i v, const int c ) +{ return _mm512_alignr_epi8( v, v, c ); } -// -// Rotate elements within 64 bit lanes. - -// Swap 32 bit elements in each 64 bit lane -#define mm512_swap32_64( v ) _mm512_shuffle_epi32( v, 0xb1 ) - -// _mm512_set_epi8 doesn't seem to work - -// Rotate each 64 bit lane by one 16 bit element. -#define mm512_ror1x16_64( v ) \ - _mm512_permutexvar_epi8( v, _mm512_set_epi32( \ - 0x39383F3E, 0x3D3C3B3A, 0x31303736, 0x35343332, \ - 0x29282F2E, 0x2D2C2B2A, 0x21202726, 0x25242322, \ - 0x19181F1E, 0x1D1C1B1A, 0x11101716, 0x15141312, \ - 0x09080F0E, 0x0D0C0B0A, 0x01000706, 0x05040302 ) ) - -#define mm512_rol1x16_64( v ) \ - _mm512_permutexvar_epi8( v, _mm512_set_epi32( \ - 0x3D3C3B3A, 0x39383F3E, 0x35343332, 0x31303736 \ - 0x2D2C2B2A, 0x29282F2E, 0x25242322, 0x21202726 \ - 0x1D1C1B1A, 0x19181F1E, 0x15141312, 0x11101716 \ - 0x0D0C0B0A, 0x09080F0E, 0x05040302, 0x01000706 ) ) - -// Rotate each 64 bit lane by one byte. 
-#define mm512_ror1x8_64( v ) \ - _mm512_permutexvar_epi8( v, _mm512_set_epi32( \ - 0x383F3E3D, 0x3C3B3A39, 0x30373635, 0x34333231, \ - 0x282F2E2D, 0x2C2B2A29, 0x20272625, 0x24232221, \ - 0x181F1E1D, 0x1C1B1A19, 0x10171615, 0x14131211, \ - 0x080F0E0D, 0x0C0B0A09, 0x00070605, 0x0403020 ) -#define mm512_rol1x8_64( v ) \ - _mm512_permutexvar_epi8( v, _mm512_set_epi32( \ - 0x3E3D3C3B, 0x3A39383F, 0x36353433, 0x32313037, \ - 0x2E2D2C2B, 0x2A29282F, 0x26252423, 0x22212027, \ - 0x1E1D1C1B, 0x1A19181F, 0x16151413, 0x12111017, \ - 0x0E0D0C0B, 0x0A09080F, 0x06050403, 0x02010007 ) +// Swap 32 bits in each 64 bit lane. +#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 ) -// -// Rotate elements within 32 bit lanes. - -#define mm512_swap16_32( v ) \ - _mm512_permutexvar_epi8( v, _mm512_set_epi32( \ - 0x001D001C, 0x001F001E, 0x00190018, 0x001B001A, \ - 0x00150014, 0x00170016, 0x00110010, 0x00130012, \ - 0x000D000C, 0x000F000E, 0x00190008, 0x000B000A, \ - 0x00050004, 0x00070006, 0x00110000, 0x00030002 ) - -#define mm512_ror1x8_32( v ) \ - _mm512_permutexvar_epi8( v, _mm512_set_epi32( \ - 0x3C3F3E3D, 0x383B3A39, 0x34373635, 0x30333231, \ - 0x2C2F2E2D, 0x282B2A29, 0x24272625, 0x20232221, \ - 0x1C1F1E1D, 0x181B1A19, 0x14171615, 0x10131211, \ - 0x0C0F0E0D, 0x080B0A09, 0x04070605, 0x00030201 ) ) - -#define mm512_rol1x8_32( v ) \ - _mm512_permutexvar_epi8( v, _mm512_set_epi32( \ - 0x3E3D3C3F, 0x3A39383B, 0x36353437, 0x32313033, \ - 0x2E2D2C2F, 0x2A29282B, 0x26252427, 0x22212023, \ - 0x1E1D1C1F, 0x1A19181B, 0x16151417, 0x12111013, \ - 0x0E0D0C0F, 0x0A09080B, 0x06050407, 0x02010003 ) ) - -// -// Swap bytes in vector elements, vectorized bswap. 
- -#define mm512_bswap_64( v ) \ - _mm512_permutexvar_epi8( v, _mm512_set_epi32( \ - 0x38393A3B, 0x3C3D3E3F, 0x20313233, 0x34353637, \ - 0x28292A2B, 0x2C2D2E2F, 0x20212223, 0x34353637, \ - 0x18191A1B, 0x1C1D1E1F, 0x10111213, 0x14151617, \ - 0x08090A0B, 0x0C0D0E0F, 0x00010203, 0x04050607 ) ) - -#define mm512_bswap_32( v ) \ - _mm512_permutexvar_epi8( v, _mm512_set_epi32( \ - 0x3C3D3E3F, 0x38393A3B, 0x34353637, 0x30313233, \ - 0x3C3D3E3F, 0x38393A3B, 0x34353637, 0x30313233, \ - 0x3C3D3E3F, 0x38393A3B, 0x34353637, 0x30313233, \ - 0x3C3D3E3F, 0x38393A3B, 0x34353637, 0x30313233 ) ) - -#define mm512_bswap_16( v ) \ - _mm512_permutexvar_epi8( v, _mm512_set_epi32( \ - 0x3E3F3C3D, 0x3A3B3839, 0x36373435, 0x32333031, \ - 0x2E2F2C2D, 0x2A2B2829, 0x26272425, 0x22232021, \ - 0x1E1F1C1D, 0x1A1B1819, 0x16171415, 0x12131011, \ - 0x0E0F0C0D, 0x0A0B0809, 0x06070405, 0x02030001 ) ) // // Rotate elements from 2 512 bit vectors in place, source arguments // are overwritten. -// These can all be done with 2 permutex2var instructions but they are -// slower than either xor or alignr. 
-#define mm512_swap512_1024(v1, v2) \ - v1 = _mm512_xor_si512(v1, v2); \ - v2 = _mm512_xor_si512(v1, v2); \ - v1 = _mm512_xor_si512(v1, v2); +#define mm512_swap1024_512( v1, v2 ) \ + v1 = _mm512_xor_si512( v1, v2 ); \ + v2 = _mm512_xor_si512( v1, v2 ); \ + v1 = _mm512_xor_si512( v1, v2 ); -#define mm512_ror1x256_1024( v1, v2 ) \ +#define mm512_ror1024_256( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \ v1 = _mm512_alignr_epi64( v2, v1, 4 ); \ v2 = t; \ } while(0) -#define mm512_rol1x256_1024( v1, v2 ) \ +#define mm512_rol1024_256( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \ v2 = _mm512_alignr_epi64( v2, v1, 4 ); \ v1 = t; \ } while(0) -#define mm512_ror1x128_1024( v1, v2 ) \ +#define mm512_ror1024_128( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \ v1 = _mm512_alignr_epi64( v2, v1, 2 ); \ v2 = t; \ } while(0) -#define mm512_rol1x128_1024( v1, v2 ) \ +#define mm512_rol1024_128( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \ v2 = _mm512_alignr_epi64( v2, v1, 6 ); \ v1 = t; \ } while(0) -#define mm512_ror1x64_1024( v1, v2 ) \ +#define mm512_ror1024_64( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \ v1 = _mm512_alignr_epi64( v2, v1, 1 ); \ v2 = t; \ } while(0) -#define mm512_rol1x64_1024( v1, v2 ) \ +#define mm512_rol1024_64( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \ v2 = _mm512_alignr_epi64( v2, v1, 7 ); \ v1 = t; \ } while(0) -#define mm512_ror1x32_1024( v1, v2 ) \ +#define mm512_ror1024_32( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \ v1 = _mm512_alignr_epi32( v2, v1, 1 ); \ v2 = t; \ } while(0) -#define mm512_rol1x32_1024( v1, v2 ) \ +#define mm512_rol1024_32( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \ v2 = _mm512_alignr_epi32( v2, v1, 15 ); \ v1 = t; \ } while(0) -#define mm512_ror1x16_1024( v1, v2 ) \ -do { \ - __m512i t = _mm512_alignr_epi8( v1, v2, 2 ); \ - v1 = _mm512_alignr_epi8( v2, v1, 2 ); \ 
- v2 = t; \ -} while(0) - -#define mm512_rol1x16_1024( v1, v2 ) \ -do { \ - __m512i t = _mm512_alignr_epi8( v1, v2, 62 ); \ - v2 = _mm512_alignr_epi8( v2, v1, 62 ); \ - v1 = t; \ -} while(0) - -#define mm512_ror1x8_1024( v1, v2 ) \ -do { \ - __m512i t = _mm512_alignr_epi8( v1, v2, 1 ); \ - v1 = _mm512_alignr_epi8( v2, v1, 1 ); \ - v2 = t; \ -} while(0) - -#define mm512_rol1x8_1024( v1, v2 ) \ -do { \ - __m512i t = _mm512_alignr_epi8( v1, v2, 63 ); \ - v2 = _mm512_alignr_epi8( v2, v1, 63 ); \ - v1 = t; \ -} while(0) - #endif // AVX512 #endif // SIMD_512_H__ diff --git a/simd-utils/simd-64.h b/simd-utils/simd-64.h index 3add748..e74066b 100644 --- a/simd-utils/simd-64.h +++ b/simd-utils/simd-64.h @@ -1,18 +1,18 @@ #if !defined(SIMD_64_H__) #define SIMD_64_H__ 1 -#if defined(__MMX__) +#if defined(__MMX__) && defined(__SSE__) //////////////////////////////////////////////////////////////// // // 64 bit MMX vectors. // -// There are rumours MMX wil be removed. Although casting with int64 -// works there is likely some overhead to move the data to An MMX register -// and back. - +// This code is not used anywhere annd likely never will. It's intent was +// to support 2 way parallel hashing using SSE2 for 64 bit, and MMX for 32 +// bit hash functions, but was never implemented. 
// Pseudo constants + /* #define m64_zero _mm_setzero_si64() #define m64_one_64 _mm_set_pi32( 0UL, 1UL ) @@ -30,78 +30,67 @@ #define casti_m64(p,i) (((__m64*)(p))[(i)]) -// cast all arguments as the're likely to be uint64_t - // Bitwise not: ~(a) -#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 ) +//#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 ) +#define mm64_not( a ) ( (__m64)( ~( (uint64_t)(a) ) ) // Unary negate elements -#define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, (__m64)v ) -#define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, (__m64)v ) -#define mm64_negate_8( v ) _mm_sub_pi8( m64_zero, (__m64)v ) +#define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, v ) +#define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, v ) +#define mm64_negate_8( v ) _mm_sub_pi8( m64_zero, v ) // Rotate bits in packed elements of 64 bit vector #define mm64_rol_64( a, n ) \ - _mm_or_si64( _mm_slli_si64( (__m64)(a), n ), \ - _mm_srli_si64( (__m64)(a), 64-(n) ) ) + _mm_or_si64( _mm_slli_si64( a, n ), \ + _mm_srli_si64( a, 64-(n) ) ) #define mm64_ror_64( a, n ) \ - _mm_or_si64( _mm_srli_si64( (__m64)(a), n ), \ - _mm_slli_si64( (__m64)(a), 64-(n) ) ) + _mm_or_si64( _mm_srli_si64( a, n ), \ + _mm_slli_si64( a, 64-(n) ) ) #define mm64_rol_32( a, n ) \ - _mm_or_si64( _mm_slli_pi32( (__m64)(a), n ), \ - _mm_srli_pi32( (__m64)(a), 32-(n) ) ) + _mm_or_si64( _mm_slli_pi32( a, n ), \ + _mm_srli_pi32( a, 32-(n) ) ) #define mm64_ror_32( a, n ) \ - _mm_or_si64( _mm_srli_pi32( (__m64)(a), n ), \ - _mm_slli_pi32( (__m64)(a), 32-(n) ) ) + _mm_or_si64( _mm_srli_pi32( a, n ), \ + _mm_slli_pi32( a, 32-(n) ) ) #define mm64_rol_16( a, n ) \ - _mm_or_si64( _mm_slli_pi16( (__m64)(a), n ), \ - _mm_srli_pi16( (__m64)(a), 16-(n) ) ) + _mm_or_si64( _mm_slli_pi16( a, n ), \ + _mm_srli_pi16( a, 16-(n) ) ) #define mm64_ror_16( a, n ) \ - _mm_or_si64( _mm_srli_pi16( (__m64)(a), n ), \ - _mm_slli_pi16( (__m64)(a), 16-(n) ) ) + _mm_or_si64( _mm_srli_pi16( a, n ), \ + _mm_slli_pi16( a, 16-(n) ) ) // 
Rotate packed elements accross lanes. Useful for byte swap and byte // rotation. -// _mm_shuffle_pi8 requires SSSE3 while _mm_shuffle_pi16 requires SSE -// even though these are MMX instructions. - // Swap hi & lo 32 bits. -#define mm64_swap32( a ) _mm_shuffle_pi16( (__m64)(a), 0x4e ) +#define mm64_swap_32( a ) _mm_shuffle_pi16( a, 0x4e ) -#define mm64_ror1x16_64( a ) _mm_shuffle_pi16( (__m64)(a), 0x39 ) -#define mm64_rol1x16_64( a ) _mm_shuffle_pi16( (__m64)(a), 0x93 ) +#define mm64_ror64_1x16( a ) _mm_shuffle_pi16( a, 0x39 ) +#define mm64_rol64_1x16( a ) _mm_shuffle_pi16( a, 0x93 ) // Swap hi & lo 16 bits of each 32 bit element -#define mm64_swap16_32( a ) _mm_shuffle_pi16( (__m64)(a), 0xb1 ) +#define mm64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 ) #if defined(__SSSE3__) // Endian byte swap packed elements -// A vectorized version of the u64 bswap, use when data already in MMX reg. -#define mm64_bswap_64( v ) \ - _mm_shuffle_pi8( (__m64)v, (__m64)0x0001020304050607 ) - #define mm64_bswap_32( v ) \ - _mm_shuffle_pi8( (__m64)v, (__m64)0x0405060700010203 ) + _mm_shuffle_pi8( v, (__m64)0x0405060700010203 ) #define mm64_bswap_16( v ) \ - _mm_shuffle_pi8( (__m64)v, (__m64)0x0607040502030001 ); + _mm_shuffle_pi8( v, (__m64)0x0607040502030001 ); -#else +// Rotate right by c bytes +static inline __m64 mm64_ror_x8( __m64 v, const int c ) +{ return _mm_alignr_pi8( v, v, c ); } -#define mm64_bswap_64( v ) \ - (__m64)__builtin_bswap64( (uint64_t)v ) +#else -// These exist only for compatibility with CPUs without SSSE3. MMX doesn't -// have extract 32 instruction so pointers are needed to access elements. -// It' more efficient for the caller to use scalar variables and call -// bswap_32 directly. #define mm64_bswap_32( v ) \ _mm_set_pi32( __builtin_bswap32( ((uint32_t*)&v)[1] ), \ __builtin_bswap32( ((uint32_t*)&v)[0] ) ) @@ -114,17 +103,6 @@ #endif -// 64 bit mem functions use integral sizes instead of bytes, data must -// be aligned to 64 bits. 
-static inline void memcpy_m64( __m64 *dst, const __m64 *src, int n ) -{ for ( int i = 0; i < n; i++ ) dst[i] = src[i]; } - -static inline void memset_zero_m64( __m64 *src, int n ) -{ for ( int i = 0; i < n; i++ ) src[i] = (__m64)0ULL; } - -static inline void memset_m64( __m64 *dst, const __m64 a, int n ) -{ for ( int i = 0; i < n; i++ ) dst[i] = a; } - #endif // MMX #endif // SIMD_64_H__ diff --git a/simd-utils/simd-int.h b/simd-utils/simd-int.h index 1268214..0108505 100644 --- a/simd-utils/simd-int.h +++ b/simd-utils/simd-int.h @@ -1,76 +1,51 @@ -#if !defined(SIMD_SCALAR_H__) -#define SIMD_SCALAR_H__ 1 - -/////////////////////////////////// -// -// Integers up to 128 bits. -// -// These utilities enhance support for integers up to 128 bits. -// All standard operations are supported on 128 bit integers except -// numeric constant representation and IO. 128 bit integers must be built -// and displayed as 2 64 bit halves, just like the old times. -// -// Some utilities are also provided for smaller integers, most notably -// bit rotation. - -// MMX has no extract instruction for 32 bit elements so this: -// Lo is trivial, high is a simple shift. -// Input may be uint64_t or __m64, returns uint32_t. -#define u64_extr_lo32(a) ( (uint32_t)( (uint64_t)(a) ) ) -#define u64_extr_hi32(a) ( (uint32_t)( ((uint64_t)(a)) >> 32) ) - -#define u64_extr_32( a, n ) ( (uint32_t)( (a) >> ( ( 2-(n)) <<5 ) ) ) -#define u64_extr_16( a, n ) ( (uint16_t)( (a) >> ( ( 4-(n)) <<4 ) ) ) -#define u64_extr_8( a, n ) ( (uint8_t) ( (a) >> ( ( 8-(n)) <<3 ) ) ) - -// Rotate bits in various sized integers. 
-#define u64_ror_64( x, c ) \ - (uint64_t)( ( (uint64_t)(x) >> (c) ) | ( (uint64_t)(x) << (64-(c)) ) ) -#define u64_rol_64( x, c ) \ - (uint64_t)( ( (uint64_t)(x) << (c) ) | ( (uint64_t)(x) >> (64-(c)) ) ) -#define u32_ror_32( x, c ) \ - (uint32_t)( ( (uint32_t)(x) >> (c) ) | ( (uint32_t)(x) << (32-(c)) ) ) -#define u32_rol_32( x, c ) \ - (uint32_t)( ( (uint32_t)(x) << (c) ) | ( (uint32_t)(x) >> (32-(c)) ) ) -#define u16_ror_16( x, c ) \ - (uint16_t)( ( (uint16_t)(x) >> (c) ) | ( (uint16_t)(x) << (16-(c)) ) ) -#define u16rol_16( x, c ) \ - (uint16_t)( ( (uint16_t)(x) << (c) ) | ( (uint16_t)(x) >> (16-(c)) ) ) -#define u8_ror_8( x, c ) \ - (uint8_t) ( ( (uint8_t) (x) >> (c) ) | ( (uint8_t) (x) << ( 8-(c)) ) ) -#define u8_rol_8( x, c ) \ - (uint8_t) ( ( (uint8_t) (x) << (c) ) | ( (uint8_t) (x) >> ( 8-(c)) ) ) +#if !defined(SIMD_INT_H__) +#define SIMD_INT_H__ 1 // Endian byte swap #define bswap_64( a ) __builtin_bswap64( a ) #define bswap_32( a ) __builtin_bswap32( a ) -// 64 bit mem functions use integral sizes instead of bytes, data must -// be aligned to 64 bits. Mostly for scaled indexing convenience. -static inline void memcpy_64( uint64_t *dst, const uint64_t *src, int n ) -{ for ( int i = 0; i < n; i++ ) dst[i] = src[i]; } - -static inline void memset_zero_64( uint64_t *src, int n ) -{ for ( int i = 0; i < n; i++ ) src[i] = 0ull; } +// safe division, integer or floating point +#define safe_div( dividend, divisor, safe_result ) \ + ( (divisor) == 0 ? safe_result : ( (dividend) / (divisor) ) ) -static inline void memset_64( uint64_t *dst, const uint64_t a, int n ) -{ for ( int i = 0; i < n; i++ ) dst[i] = a; } - -#if defined (GCC_INT128) /////////////////////////////////////// // // 128 bit integers // -// 128 bit integers are inneficient and not a shortcut for __m128i. +// 128 bit integers are inneficient and not a shortcut for __m128i. +// Native type __int128 supported starting with GCC-4.8. +// +// __int128 uses two 64 bit GPRs to hold the data. 
The main benefits are +// for 128 bit arithmetic. Vectors are preferred when 128 bit arith +// is not required. int128 also works better with other integer sizes. +// Vectors benefit from wider registers. +// +// For safety use typecasting on all numeric arguments. +// +// Use typecasting for conversion to/from 128 bit vector: +// __m128i v128 = (__m128i)my_int128l +// __m256i v256 = _mm256_set_m128i( (__m128i)my_int128, (__m128i)my_int128 ); +// my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 ); -// No real need or use. -//#define u128_neg1 ((uint128_t)(-1)) +// Compiler check for __int128 support +// Configure also has a test for int128. -// usefull for making constants. -#define mk_uint128( hi, lo ) \ - ( ( (uint128_t)(hi) << 64 ) | ( (uint128_t)(lo) ) ) +#if !(defined(__arm__) || defined(__aarch64__)) +#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) ) + #define GCC_INT128 1 +#endif +#if !defined(GCC_INT128) + #warning "__int128 not supported, requires GCC-4.8 or newer." +#endif +#endif +#if defined(GCC_INT128) + +// Familiar looking type names +typedef __int128 int128_t; +typedef unsigned __int128 uint128_t; // Extracting the low bits is a trivial cast. // These specialized functions are optimized while providing a @@ -78,20 +53,8 @@ static inline void memset_64( uint64_t *dst, const uint64_t a, int n ) #define u128_hi64( x ) ( (uint64_t)( (uint128_t)(x) >> 64 ) ) #define u128_lo64( x ) ( (uint64_t)(x) ) -// Generic extract, don't use for extracting low bits, cast instead. -#define u128_extr_64( a, n ) ( (uint64_t)( (a) >> ( ( 2-(n)) <<6 ) ) ) -#define u128_extr_32( a, n ) ( (uint32_t)( (a) >> ( ( 4-(n)) <<5 ) ) ) -#define u128_extr_16( a, n ) ( (uint16_t)( (a) >> ( ( 8-(n)) <<4 ) ) ) -#define u128_extr_8( a, n ) ( (uint8_t) ( (a) >> ( (16-(n)) <<3 ) ) ) - -// Not much need for this but it fills a gap. 
-#define u128_ror_128( x, c ) \ - ( ( (uint128_t)(x) >> (c) ) | ( (uint128_t)(x) << (128-(c)) ) ) -#define u128_rol_128( x, c ) \ - ( ( (uint128_t)(x) << (c) ) | ( (uint128_t)(x) >> (128-(c)) ) ) - #endif // GCC_INT128 -#endif // SIMD_SCALAR_H__ +#endif // SIMD_INT_H__ diff --git a/sse2neon.h b/sse2neon.h new file mode 100644 index 0000000..5cec266 --- /dev/null +++ b/sse2neon.h @@ -0,0 +1,7113 @@ +#ifndef SSE2NEON_H +#define SSE2NEON_H + +// This header file provides a simple API translation layer +// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions +// +// This header file does not yet translate all of the SSE intrinsics. +// +// Contributors to this work are: +// John W. Ratcliff +// Brandon Rowlett +// Ken Fast +// Eric van Beurden +// Alexander Potylitsin +// Hasindu Gamaarachchi +// Jim Huang +// Mark Cheng +// Malcolm James MacLeod +// Devin Hussey (easyaspi314) +// Sebastian Pop +// Developer Ecosystem Engineering +// Danila Kutenin +// François Turban (JishinMaster) +// Pei-Hsuan Hung +// Yang-Hao Yuan +// Syoyo Fujita +// Brecht Van Lommel + +/* + * sse2neon is freely redistributable under the MIT License. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* Tunable configurations */ + +/* Enable precise implementation of math operations + * This would slow down the computation a bit, but gives consistent result with + * x86 SSE2. (e.g. would solve a hole or NaN pixel in the rendering result) + */ +/* _mm_min_ps and _mm_max_ps */ +#ifndef SSE2NEON_PRECISE_MINMAX +#define SSE2NEON_PRECISE_MINMAX (0) +#endif +/* _mm_rcp_ps and _mm_div_ps */ +#ifndef SSE2NEON_PRECISE_DIV +#define SSE2NEON_PRECISE_DIV (0) +#endif +/* _mm_sqrt_ps and _mm_rsqrt_ps */ +#ifndef SSE2NEON_PRECISE_SQRT +#define SSE2NEON_PRECISE_SQRT (0) +#endif + +#if defined(__GNUC__) || defined(__clang__) +#pragma push_macro("FORCE_INLINE") +#pragma push_macro("ALIGN_STRUCT") +#define FORCE_INLINE static inline __attribute__((always_inline)) +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#endif +#ifndef unlikely +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif +#else +#error "Macro name collisions may happen with unsupported compiler." +#ifdef FORCE_INLINE +#undef FORCE_INLINE +#endif +#define FORCE_INLINE static inline +#ifndef ALIGN_STRUCT +#define ALIGN_STRUCT(x) __declspec(align(x)) +#endif +#endif +#ifndef likely +#define likely(x) (x) +#endif +#ifndef unlikely +#define unlikely(x) (x) +#endif + +#include +#include + +/* Architecture-specific build options */ +/* FIXME: #pragma GCC push_options is only available on GCC */ +#if defined(__GNUC__) +#if defined(__arm__) && __ARM_ARCH == 7 +/* According to ARM C Language Extensions Architecture specification, + * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON) + * architecture supported. 
+ */ +#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) +#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." +#endif +#if !defined(__clang__) +#pragma GCC push_options +#pragma GCC target("fpu=neon") +#endif +#elif defined(__aarch64__) +#if !defined(__clang__) +#pragma GCC push_options +#pragma GCC target("+simd") +#endif +#else +#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A." +#endif +#endif + +#include + +/* Rounding functions require either Aarch64 instructions or libm failback */ +#if !defined(__aarch64__) +#include +#endif + +/* "__has_builtin" can be used to query support for built-in functions + * provided by gcc/clang and other compilers that support it. + */ +#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */ +/* Compatibility with gcc <= 9 */ +#if __GNUC__ <= 9 +#define __has_builtin(x) HAS##x +#define HAS__builtin_popcount 1 +#define HAS__builtin_popcountll 1 +#else +#define __has_builtin(x) 0 +#endif +#endif + +/** + * MACRO for shuffle parameter for _mm_shuffle_ps(). + * Argument fp3 is a digit[0123] that represents the fp from argument "b" + * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same + * for fp2 in result. fp1 is a digit[0123] that represents the fp from + * argument "a" of mm_shuffle_ps that will be places in fp1 of result. + * fp0 is the same for fp0 of result. + */ +#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) + +/* Rounding mode macros. 
*/ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 +#define _MM_FROUND_NO_EXC 0x08 +#define _MM_ROUND_NEAREST 0x0000 +#define _MM_ROUND_DOWN 0x2000 +#define _MM_ROUND_UP 0x4000 +#define _MM_ROUND_TOWARD_ZERO 0x6000 + +/* indicate immediate constant argument in a given range */ +#define __constrange(a, b) const + +/* A few intrinsics accept traditional data types like ints or floats, but + * most operate on data types that are specific to SSE. + * If a vector type ends in d, it contains doubles, and if it does not have + * a suffix, it contains floats. An integer vector type can contain any type + * of integer, from chars to shorts to unsigned long longs. + */ +typedef int64x1_t __m64; +typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ +// On ARM 32-bit architecture, the float64x2_t is not supported. +// The data type __m128d should be represented in a different way for related +// intrinsic conversion. 
+#if defined(__aarch64__) +typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ +#else +typedef float32x4_t __m128d; +#endif +typedef int64x2_t __m128i; /* 128-bit vector containing integers */ + +/* type-safe casting between types */ + +#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) +#define vreinterpretq_m128_f32(x) (x) +#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) + +#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) +#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) +#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) +#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) +#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) +#define vreinterpretq_f32_m128(x) (x) +#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) + +#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) +#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) +#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) +#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) +#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) +#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) +#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) +#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) +#define vreinterpretq_m128i_s64(x) (x) + +#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) +#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) +#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) +#define vreinterpretq_m128i_u64(x) 
vreinterpretq_s64_u64(x) + +#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x) +#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) +#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) +#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) +#define vreinterpretq_s64_m128i(x) (x) + +#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) +#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) +#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) +#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) + +#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) +#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) +#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) +#define vreinterpret_m64_s64(x) (x) + +#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) +#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) +#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) +#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) + +#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) +#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) +#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) + +#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) +#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x) +#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x) +#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) + +#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) +#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) +#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) +#define vreinterpret_s64_m64(x) (x) + +#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) + +#if defined(__aarch64__) +#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x) + +#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x) 
+#define vreinterpretq_m128d_f64(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) + +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x) + +#define vreinterpretq_f64_m128d(x) (x) +#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x) +#else +#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128d_f32(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_f32_m128d(x) (x) +#endif + +// A struct is defined in this header file called 'SIMDVec' which can be used +// by applications which attempt to access the contents of an _m128 struct +// directly. It is important to note that accessing the __m128 struct directly +// is bad coding practice by Microsoft: @see: +// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx +// +// However, some legacy source code may try to access the contents of an __m128 +// struct directly so the developer can use the SIMDVec as an alias for it. Any +// casting must be done manually by the developer, as you cannot cast or +// otherwise alias the base NEON data type for intrinsic operations. +// +// union intended to allow direct access to an __m128 variable using the names +// that the MSVC compiler provides. This union should really only be used when +// trying to access the members of the vector as integer values. GCC/clang +// allow native access to the float members through a simple array access +// operator (in C since 4.6, in C++ since 4.8). +// +// Ideally direct accesses to SIMD vectors should not be used since it can cause +// a performance hit. 
If it really is needed however, the original __m128 +// variable can be aliased with a pointer to this union and used to access +// individual components. The use of this union should be hidden behind a macro +// that is used throughout the codebase to access the members instead of always +// declaring this type of variable. +typedef union ALIGN_STRUCT(16) SIMDVec { + float m128_f32[4]; // as floats - DON'T USE. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. + uint32_t m128_u32[4]; // as unsigned 32-bit integers. + uint64_t m128_u64[2]; // as unsigned 64-bit integers. +} SIMDVec; + +// casting using SIMDVec +#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *)&x)->m128_u64[n]) +#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *)&x)->m128_u32[n]) +#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *)&x)->m128_u8[n]) + +/* Backwards compatibility for compilers with lack of specific type support */ + +// Older gcc does not define vld1q_u8_x4 type +#if defined(__GNUC__) && !defined(__clang__) && \ + ((__GNUC__ == 10 && (__GNUC_MINOR__ <= 2)) || \ + (__GNUC__ == 9 && (__GNUC_MINOR__ <= 3)) || \ + (__GNUC__ == 8 && (__GNUC_MINOR__ <= 4)) || __GNUC__ <= 7) +FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) { + uint8x16x4_t ret; + ret.val[0] = vld1q_u8(p + 0); + ret.val[1] = vld1q_u8(p + 16); + ret.val[2] = vld1q_u8(p + 32); + ret.val[3] = vld1q_u8(p + 48); + return ret; +} +#else +// Wraps vld1q_u8_x4 +FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) { + return vld1q_u8_x4(p); +} +#endif + +/* Function Naming Conventions + * The naming convention of SSE intrinsics is straightforward. 
A generic SSE + * intrinsic function is given as follows: + * _mm__ + * + * The parts of this format are given as follows: + * 1. describes the operation performed by the intrinsic + * 2. identifies the data type of the function's primary arguments + * + * This last part, , is a little complicated. It identifies the + * content of the input values, and can be set to any of the following values: + * + ps - vectors contain floats (ps stands for packed single-precision) + * + pd - vectors cantain doubles (pd stands for packed double-precision) + * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit + * signed integers + * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit + * unsigned integers + * + si128 - unspecified 128-bit vector or 256-bit vector + * + m128/m128i/m128d - identifies input vector types when they are different + * than the type of the returned vector + * + * For example, _mm_setzero_ps. The _mm implies that the function returns + * a 128-bit vector. The _ps at the end implies that the argument vectors + * contain floats. + * + * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8) + * // Set packed 16-bit integers. 
128 bits, 8 short, per 16 bits + * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + * // Set packed 8-bit integers + * // 128 bits, 16 chars, per 8 bits + * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11, + * 4, 5, 12, 13, 6, 7, 14, 15); + * // Shuffle packed 8-bit integers + * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb + * + * Data (Number, Binary, Byte Index): + +------+------+-------------+------+------+-------------+ + | 1 | 2 | 3 | 4 | Number + +------+------+------+------+------+------+------+------+ + | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary + +------+------+------+------+------+------+------+------+ + | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index + +------+------+------+------+------+------+------+------+ + + +------+------+------+------+------+------+------+------+ + | 5 | 6 | 7 | 8 | Number + +------+------+------+------+------+------+------+------+ + | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary + +------+------+------+------+------+------+------+------+ + | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index + +------+------+------+------+------+------+------+------+ + * Index (Byte Index): + +------+------+------+------+------+------+------+------+ + | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | + +------+------+------+------+------+------+------+------+ + + +------+------+------+------+------+------+------+------+ + | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | + +------+------+------+------+------+------+------+------+ + * Result: + +------+------+------+------+------+------+------+------+ + | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index + +------+------+------+------+------+------+------+------+ + | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary + +------+------+------+------+------+------+------+------+ + | 256 | 2 | 5 | 6 | Number + +------+------+------+------+------+------+------+------+ + + +------+------+------+------+------+------+------+------+ + | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | 
Index + +------+------+------+------+------+------+------+------+ + | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary + +------+------+------+------+------+------+------+------+ + | 3 | 7 | 4 | 8 | Number + +------+------+------+------+------+------+-------------+ + */ + +/* Set/get methods */ + +/* Constants for use with _mm_prefetch. */ +enum _mm_hint { + _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */ + _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */ + _MM_HINT_T1 = 2, /* load data to L2 cache only */ + _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */ + _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */ + _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */ + _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */ + _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */ +}; + +// Loads one cache line of data from address p to a location closer to the +// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx +FORCE_INLINE void _mm_prefetch(const void *p, int i) { + (void)i; + __builtin_prefetch(p); +} + +// Pause the processor. This is typically used in spin-wait loops and depending +// on the x86 processor typical values are in the 40-100 cycle range. The +// 'yield' instruction isn't a good fit beacuse it's effectively a nop on most +// Arm cores. Experience with several databases has shown has shown an 'isb' is +// a reasonable approximation. +FORCE_INLINE void _mm_pause() { __asm__ __volatile__("isb\n"); } + +// Copy the lower single-precision (32-bit) floating-point element of a to dst. 
+// +// dst[31:0] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32 +FORCE_INLINE float _mm_cvtss_f32(__m128 a) { + return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); +} + +// Convert the lower single-precision (32-bit) floating-point element in b to a +// double-precision (64-bit) floating-point element, store the result in the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// +// dst[63:0] := Convert_FP32_To_FP64(b[31:0]) +// dst[127:64] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd +FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) { + double d = (double)vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0)); +#else + return vreinterpretq_m128d_s64( + vsetq_lane_s64(*(int64_t *)&d, vreinterpretq_s64_m128d(a), 0)); +#endif +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// +// dst[31:0] := Convert_FP32_To_Int32(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32 +#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. 
+// +// dst[63:0] := Convert_FP32_To_Int64(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64 +FORCE_INLINE int _mm_cvtss_si64(__m128 a) { +#if defined(__aarch64__) + return vgetq_lane_s64( + vreinterpretq_s64_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))), 0); +#else + float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32_t diff = data - floor(data); + if (diff > 0.5) + return (int64_t)ceil(data); + if (unlikely(diff == 0.5)) { + int64_t f = (int64_t)floor(data); + int64_t c = (int64_t)ceil(data); + return c & 1 ? f : c; + } + return (int64_t)floor(data); +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi +FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a) { + return vreinterpret_m64_s32( + vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// +// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si +FORCE_INLINE int _mm_cvtt_ss2si(__m128 a) { + return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. 
+// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32 +#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// +// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32 +#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// +// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64 +FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a) { + return vgetq_lane_s64( + vmovl_s32(vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))), 0); +} + +// Sets the 128-bit value to zero +// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx +FORCE_INLINE __m128i _mm_setzero_si128(void) { + return vreinterpretq_m128i_s32(vdupq_n_s32(0)); +} + +// Clears the four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setzero_ps(void) { + return vreinterpretq_m128_f32(vdupq_n_f32(0)); +} + +// Return vector of type __m128d with all elements set to zero. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd +FORCE_INLINE __m128d _mm_setzero_pd(void) { +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vdupq_n_f64(0)); +#else + return vreinterpretq_m128d_f32(vdupq_n_f32(0)); +#endif +} + +// Sets the four single-precision, floating-point values to w. 
+// +// r0 := r1 := r2 := r3 := w +// +// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set1_ps(float _w) { + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Sets the four single-precision, floating-point values to w. +// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set_ps1(float _w) { + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Sets the four single-precision, floating-point values to the four inputs. +// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) { + float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Copy single-precision (32-bit) floating-point element a to the lower element +// of dst, and zero the upper 3 elements. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss +FORCE_INLINE __m128 _mm_set_ss(float a) { + float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Sets the four single-precision, floating-point values to the four inputs in +// reverse order. +// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) { + float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Sets the 8 signed 16-bit integer values in reverse order. +// +// Return Value +// r0 := w0 +// r1 := w1 +// ... 
+// r7 := w7 +FORCE_INLINE __m128i _mm_setr_epi16(short w0, short w1, short w2, short w3, + short w4, short w5, short w6, short w7) { + int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; + return vreinterpretq_m128i_s16(vld1q_s16((int16_t *)data)); +} + +// Sets the 4 signed 32-bit integer values in reverse order +// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx +FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) { + int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Set packed 64-bit integers in dst with the supplied values in reverse order. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64 +FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) { + return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); +} + +// Sets the 16 signed 8-bit integer values to b. +// +// r0 := b +// r1 := b +// ... +// r15 := b +// +// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set1_epi8(signed char w) { + return vreinterpretq_m128i_s8(vdupq_n_s8(w)); +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd +FORCE_INLINE __m128d _mm_set1_pd(double d) { +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vdupq_n_f64(d)); +#else + return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *)&d)); +#endif +} + +// Sets the 8 signed 16-bit integer values to w. +// +// r0 := w +// r1 := w +// ... +// r7 := w +// +// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set1_epi16(short w) { + return vreinterpretq_m128i_s16(vdupq_n_s16(w)); +} + +// Sets the 16 signed 8-bit integer values. 
+// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx +FORCE_INLINE __m128i +_mm_set_epi8(signed char b15, signed char b14, signed char b13, signed char b12, + signed char b11, signed char b10, signed char b9, signed char b8, + signed char b7, signed char b6, signed char b5, signed char b4, + signed char b3, signed char b2, signed char b1, signed char b0) { + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t)b0, (int8_t)b1, (int8_t)b2, (int8_t)b3, + (int8_t)b4, (int8_t)b5, (int8_t)b6, (int8_t)b7, + (int8_t)b8, (int8_t)b9, (int8_t)b10, (int8_t)b11, + (int8_t)b12, (int8_t)b13, (int8_t)b14, (int8_t)b15}; + return (__m128i)vld1q_s8(data); +} + +// Sets the 8 signed 16-bit integer values. +// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set_epi16(short i7, short i6, short i5, short i4, + short i3, short i2, short i1, short i0) { + int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + return vreinterpretq_m128i_s16(vld1q_s16(data)); +} + +// Sets the 16 signed 8-bit integer values in reverse order. +// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx +FORCE_INLINE __m128i _mm_setr_epi8( + signed char b0, signed char b1, signed char b2, signed char b3, + signed char b4, signed char b5, signed char b6, signed char b7, + signed char b8, signed char b9, signed char b10, signed char b11, + signed char b12, signed char b13, signed char b14, signed char b15) { + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t)b0, (int8_t)b1, (int8_t)b2, (int8_t)b3, + (int8_t)b4, (int8_t)b5, (int8_t)b6, (int8_t)b7, + (int8_t)b8, (int8_t)b9, (int8_t)b10, (int8_t)b11, + (int8_t)b12, (int8_t)b13, (int8_t)b14, (int8_t)b15}; + return (__m128i)vld1q_s8(data); +} + +// Sets the 4 signed 32-bit integer values to i. 
+// +// r0 := i +// r1 := i +// r2 := i +// r3 := I +// +// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set1_epi32(int _i) { + return vreinterpretq_m128i_s32(vdupq_n_s32(_i)); +} + +// Sets the 2 signed 64-bit integer values to i. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100) +FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) { + return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t)_i)); +} + +// Sets the 2 signed 64-bit integer values to i. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x +FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) { + return vreinterpretq_m128i_s64(vdupq_n_s64(_i)); +} + +// Sets the 4 signed 32-bit integer values. +// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) { + int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Returns the __m128i structure with its two 64-bit integer values +// initialized to the values of the two 64-bit integers passed in. +// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx +FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2) { + return vreinterpretq_m128i_s64( + vcombine_s64(vcreate_s64(i2), vcreate_s64(i1))); +} + +// Returns the __m128i structure with its two 64-bit integer values +// initialized to the values of the two 64-bit integers passed in. +// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx +FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2) { + return _mm_set_epi64x((int64_t)i1, (int64_t)i2); +} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd +FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) { + double ALIGN_STRUCT(16) data[2] = {e0, e1}; +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vld1q_f64((float64_t *)data)); +#else + return vreinterpretq_m128d_f32(vld1q_f32((float32_t *)data)); +#endif +} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values in reverse order. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd +FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) { + return _mm_set_pd(e0, e1); +} + +// Copy double-precision (64-bit) floating-point element a to the lower element +// of dst, and zero the upper element. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd +FORCE_INLINE __m128d _mm_set_sd(double a) { return _mm_set_pd(0, a); } + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1 +#define _mm_set_pd1 _mm_set1_pd + +// Stores four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx +FORCE_INLINE void _mm_store_ps(float *p, __m128 a) { + vst1q_f32(p, vreinterpretq_f32_m128(a)); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. 
+// +// MEM[mem_addr+31:mem_addr] := a[31:0] +// MEM[mem_addr+63:mem_addr+32] := a[31:0] +// MEM[mem_addr+95:mem_addr+64] := a[31:0] +// MEM[mem_addr+127:mem_addr+96] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1 +FORCE_INLINE void _mm_store_ps1(float *p, __m128 a) { + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + vst1q_f32(p, vdupq_n_f32(a0)); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// +// MEM[mem_addr+31:mem_addr] := a[31:0] +// MEM[mem_addr+63:mem_addr+32] := a[31:0] +// MEM[mem_addr+95:mem_addr+64] := a[31:0] +// MEM[mem_addr+127:mem_addr+96] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps +#define _mm_store1_ps _mm_store_ps1 + +// Store 4 single-precision (32-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// MEM[mem_addr+31:mem_addr] := a[127:96] +// MEM[mem_addr+63:mem_addr+32] := a[95:64] +// MEM[mem_addr+95:mem_addr+64] := a[63:32] +// MEM[mem_addr+127:mem_addr+96] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps +FORCE_INLINE void _mm_storer_ps(float *p, __m128 a) { + float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a)); + float32x4_t rev = vextq_f32(tmp, tmp, 2); + vst1q_f32(p, rev); +} + +// Stores four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx +FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) { + vst1q_f32(p, vreinterpretq_f32_m128(a)); +} + +// Stores four 32-bit integer values as (as a __m128i value) at the address p. 
// Store 128 bits of integer data to a 16-byte aligned address.
// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) {
  vst1q_s32((int32_t *)p, vreinterpretq_s32_m128i(a));
}

// Store 128 bits of integer data; no alignment requirement.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) {
  vst1q_s32((int32_t *)p, vreinterpretq_s32_m128i(a));
}

// Store the low 64 bits of a to unaligned address p.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64
FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a) {
  vst1q_lane_s64((int64_t *)p, vreinterpretq_s64_m128i(a), 0);
}

// Store the low 32 bits of a to unaligned address p.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32
FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a) {
  vst1q_lane_s32((int32_t *)p, vreinterpretq_s32_m128i(a), 0);
}

// Store the low 16 bits of a to unaligned address p.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16
FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a) {
  vst1q_lane_s16((int16_t *)p, vreinterpretq_s16_m128i(a), 0);
}

// Store only the lowest float of a to p.
// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
FORCE_INLINE void _mm_store_ss(float *p, __m128 a) {
  vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
}
// Store both packed doubles from a to a 16-byte aligned address.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) {
#if defined(__aarch64__)
  vst1q_f64((float64_t *)mem_addr, vreinterpretq_f64_m128d(a));
#else
  // ARMv7: the double bit patterns travel in a f32 vector.
  vst1q_f32((float32_t *)mem_addr, vreinterpretq_f32_m128d(a));
#endif
}

// Store only the upper double: MEM[mem_addr+63:mem_addr] := a[127:64].
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd
FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) {
#if defined(__aarch64__)
  vst1_f64((float64_t *)mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
#else
  vst1_f32((float32_t *)mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
#endif
}

// Store only the lower double: MEM[mem_addr+63:mem_addr] := a[63:0].
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd
FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) {
#if defined(__aarch64__)
  vst1_f64((float64_t *)mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
#else
  vst1_f32((float32_t *)mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
#endif
}

// Store the two doubles of a with their positions swapped; mem_addr must be
// 16-byte aligned.
//
// MEM[mem_addr+63:mem_addr]     := a[127:64]
// MEM[mem_addr+127:mem_addr+64] := a[63:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd
FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) {
  float32x4_t bits = vreinterpretq_f32_m128d(a);
  _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(bits, bits, 2)));
}

// Broadcast the lower double of a into two consecutive memory slots.
// mem_addr must be 16-byte aligned.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1
FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) {
#if defined(__aarch64__)
  float64x1_t lower = vget_low_f64(vreinterpretq_f64_m128d(a));
  vst1q_f64((float64_t *)mem_addr,
            vreinterpretq_f64_m128d(vcombine_f64(lower, lower)));
#else
  float32x2_t lower = vget_low_f32(vreinterpretq_f32_m128d(a));
  vst1q_f32((float32_t *)mem_addr,
            vreinterpretq_f32_m128d(vcombine_f32(lower, lower)));
#endif
}

// Store the lower double of a; no alignment requirement.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_store_sd
FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a) {
#if defined(__aarch64__)
  vst1_f64((float64_t *)mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
#else
  vst1_u64((uint64_t *)mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
#endif
}

// _mm_store1_pd is the broadcast store under another name.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd
#define _mm_store1_pd _mm_store_pd1

// Unaligned store of both packed doubles (alignment is free on NEON).
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) {
  _mm_store_pd(mem_addr, a);
}
// Copy the low 64 bits of b into the low 64 bits of *a; the high 64 bits of
// *a are preserved.
// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) {
  uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
  uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
  *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
}

// Store the lower two floats of a to the address p.
//
// *p0 := a0
// *p1 := a1
//
// Fix: wrap a in vreinterpretq_f32_m128 like every other helper in this file
// instead of relying on __m128 aliasing float32x4_t directly.
// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) {
  *p = vreinterpret_m64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
}

// Store the upper two floats of a to the address p.
//
// *p0 := a2
// *p1 := a3
//
// Fix: same vreinterpretq_f32_m128 wrapper as above for type consistency.
// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) {
  *p = vreinterpret_m64_f32(vget_high_f32(vreinterpretq_f32_m128(a)));
}

// Load one float and broadcast it into all four lanes.
// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load1_ps(const float *p) {
  return vreinterpretq_m128_f32(vld1q_dup_f32(p));
}

// _mm_load_ps1 is the same broadcast load under another name.
//
// dst[31:0]   := MEM[mem_addr+31:mem_addr]
// dst[63:32]  := MEM[mem_addr+31:mem_addr]
// dst[95:64]  := MEM[mem_addr+31:mem_addr]
// dst[127:96] := MEM[mem_addr+31:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
#define _mm_load_ps1 _mm_load1_ps
// Set the lower two float lanes from 64 bits loaded at p; the upper two
// lanes pass through from a.
//
// r0 := *p0
// r1 := *p1
// r2 := a2
// r3 := a3
//
// Fix: wrap a in vreinterpretq_f32_m128 like every other helper in this file
// instead of relying on __m128 aliasing float32x4_t directly.
// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) {
  return vreinterpretq_m128_f32(
      vcombine_f32(vld1_f32((const float32_t *)p),
                   vget_high_f32(vreinterpretq_f32_m128(a))));
}

// Load 4 floats from a 16-byte aligned address into dst in reverse order.
//
// dst[31:0]   := MEM[mem_addr+127:mem_addr+96]
// dst[63:32]  := MEM[mem_addr+95:mem_addr+64]
// dst[95:64]  := MEM[mem_addr+63:mem_addr+32]
// dst[127:96] := MEM[mem_addr+31:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
FORCE_INLINE __m128 _mm_loadr_ps(const float *p) {
  // vrev64q swaps within each 64-bit half; vext then swaps the halves.
  float32x4_t v = vrev64q_f32(vld1q_f32(p));
  return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
}

// Set the upper two float lanes from 64 bits loaded at p; the lower two
// lanes pass through from a.
//
// r0 := a0
// r1 := a1
// r2 := *p0
// r3 := *p1
//
// Fix: same vreinterpretq_f32_m128 wrapper as above for type consistency.
// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) {
  return vreinterpretq_m128_f32(
      vcombine_f32(vget_low_f32(vreinterpretq_f32_m128(a)),
                   vld1_f32((const float32_t *)p)));
}

// Load four packed floats from a 16-byte aligned address.
// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load_ps(const float *p) {
  return vreinterpretq_m128_f32(vld1q_f32(p));
}
// Unaligned load of four packed floats. NEON loads carry no alignment
// requirement, so this is identical to _mm_load_ps.
// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_loadu_ps(const float *p) {
  return vreinterpretq_m128_f32(vld1q_f32(p));
}

// Load one unaligned 16-bit integer into lane 0; all remaining bits zero.
//
// dst[15:0]  := MEM[mem_addr+15:mem_addr]
// dst[MAX:16] := 0
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
FORCE_INLINE __m128i _mm_loadu_si16(const void *p) {
  return vreinterpretq_m128i_s16(
      vsetq_lane_s16(*(const int16_t *)p, vdupq_n_s16(0), 0));
}

// Load one unaligned 64-bit integer into the low half; the high half is zero.
//
// dst[63:0]  := MEM[mem_addr+63:mem_addr]
// dst[MAX:64] := 0
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
FORCE_INLINE __m128i _mm_loadu_si64(const void *p) {
  int64x1_t lower = vld1_s64((const int64_t *)p);
  return vreinterpretq_m128i_s64(vcombine_s64(lower, vdup_n_s64(0)));
}

// Load one double into the lower lane and zero the upper lane; no alignment
// requirement.
//
// dst[63:0]   := MEM[mem_addr+63:mem_addr]
// dst[127:64] := 0
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
FORCE_INLINE __m128d _mm_load_sd(const double *p) {
#if defined(__aarch64__)
  return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
#else
  // ARMv7: copy the double's 8 bytes as two f32 bit patterns, zero the rest.
  const float *bits = (const float *)p;
  float ALIGN_STRUCT(16) buf[4] = {bits[0], bits[1], 0, 0};
  return vreinterpretq_m128d_f32(vld1q_f32(buf));
#endif
}
// Load two packed doubles from a 16-byte aligned address.
//
// dst[127:0] := MEM[mem_addr+127:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
FORCE_INLINE __m128d _mm_load_pd(const double *p) {
#if defined(__aarch64__)
  return vreinterpretq_m128d_f64(vld1q_f64(p));
#else
  // ARMv7: carry the double bit patterns in a f32 vector.
  const float *bits = (const float *)p;
  float ALIGN_STRUCT(16) buf[4] = {bits[0], bits[1], bits[2], bits[3]};
  return vreinterpretq_m128d_f32(vld1q_f32(buf));
#endif
}

// Unaligned variant; NEON loads do not require alignment.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
FORCE_INLINE __m128d _mm_loadu_pd(const double *p) { return _mm_load_pd(p); }

// Load one float into lane 0 and clear the other three lanes.
// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_load_ss(const float *p) {
  return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
}

// Load a 64-bit integer into the low half of dst; the high half is zeroed.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) {
  int32x2_t lower = vld1_s32((int32_t const *)p);
  return vreinterpretq_m128i_s32(vcombine_s32(lower, vcreate_s32(0)));
}
// Load one double into the lower lane of dst; the upper lane is copied from
// a. No alignment requirement.
//
// dst[63:0]   := MEM[mem_addr+63:mem_addr]
// dst[127:64] := a[127:64]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) {
#if defined(__aarch64__)
  return vreinterpretq_m128d_f64(
      vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
#else
  return vreinterpretq_m128d_f32(vcombine_f32(
      vld1_f32((const float *)p), vget_high_f32(vreinterpretq_f32_m128d(a))));
#endif
}

// Load two doubles from a 16-byte aligned address with their order swapped.
//
// dst[63:0]   := MEM[mem_addr+127:mem_addr+64]
// dst[127:64] := MEM[mem_addr+63:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
FORCE_INLINE __m128d _mm_loadr_pd(const double *p) {
#if defined(__aarch64__)
  float64x2_t both = vld1q_f64(p);
  return vreinterpretq_m128d_f64(vextq_f64(both, both, 1));
#else
  int64x2_t both = vld1q_s64((const int64_t *)p);
  return vreinterpretq_m128d_s64(vextq_s64(both, both, 1));
#endif
}

// Replace lane 0 of a with lane 0 of b; lanes 1-3 come from a.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) {
  float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
  return vreinterpretq_m128_f32(
      vsetq_lane_f32(b0, vreinterpretq_f32_m128(a), 0));
}
// Move the lower double of b into the lower lane of dst; the upper lane is
// copied from a.
//
// dst[63:0]   := b[63:0]
// dst[127:64] := a[127:64]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) {
  float32x2_t lower = vget_low_f32(vreinterpretq_f32_m128d(b));
  float32x2_t upper = vget_high_f32(vreinterpretq_f32_m128d(a));
  return vreinterpretq_m128d_f32(vcombine_f32(lower, upper));
}

// Keep the low 64-bit integer of a and zero the upper 64 bits.
//
// dst[63:0]   := a[63:0]
// dst[127:64] := 0
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
FORCE_INLINE __m128i _mm_move_epi64(__m128i a) {
  return vreinterpretq_m128i_s64(
      vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
}

// Return a __m128 with indeterminate contents (matches SSE semantics).
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
FORCE_INLINE __m128 _mm_undefined_ps(void) {
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
  __m128 ret;  // deliberately left uninitialized
  return ret;
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}

/* Logic/Binary operations */

// Bitwise AND-NOT of the four float lanes of a and b.
//
// r0 := ~a0 & b0
// r1 := ~a1 & b1
// r2 := ~a2 & b2
// r3 := ~a3 & b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) {
  // vbicq computes (first & ~second), hence the swapped argument order.
  return vreinterpretq_m128_s32(
      vbicq_s32(vreinterpretq_s32_m128(b), vreinterpretq_s32_m128(a)));
}
// Bitwise (~a) AND b on the two 64-bit double lanes.
//
// FOR j := 0 to 1
//   i := j*64
//   dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) {
  // vbicq computes (first & ~second), hence the swapped argument order.
  return vreinterpretq_m128d_s64(
      vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
}

// Bitwise (~a) AND b over the full 128-bit integer value.
//
// r := (~a) & b
//
// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) {
  // vbicq computes (first & ~second), hence the swapped argument order.
  return vreinterpretq_m128i_s32(
      vbicq_s32(vreinterpretq_s32_m128i(b), vreinterpretq_s32_m128i(a)));
}

// Bitwise AND over the full 128-bit integer value.
//
// r := a & b
//
// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) {
  return vreinterpretq_m128i_s32(
      vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Bitwise AND of the four float lanes of a and b.
//
// r0 := a0 & b0
// r1 := a1 & b1
// r2 := a2 & b2
// r3 := a3 & b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) {
  return vreinterpretq_m128_s32(
      vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}
// Bitwise AND on the two 64-bit double lanes of a and b.
//
// FOR j := 0 to 1
//   i := j*64
//   dst[i+63:i] := a[i+63:i] AND b[i+63:i]
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) {
  return vreinterpretq_m128d_s64(
      vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
}

// Bitwise OR of the four float lanes of a and b.
// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) {
  return vreinterpretq_m128_s32(
      vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}

// Bitwise XOR (exclusive-or) of the four float lanes of a and b.
// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) {
  return vreinterpretq_m128_s32(
      veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}

// Bitwise XOR on the two 64-bit double lanes of a and b.
//
// FOR j := 0 to 1
//   i := j*64
//   dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) {
  return vreinterpretq_m128d_s64(
      veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
}

// Bitwise OR on the two 64-bit double lanes of a and b.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) {
  return vreinterpretq_m128d_s64(
      vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
}
// Bitwise OR over the full 128-bit integer value.
//
// r := a | b
//
// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) {
  return vreinterpretq_m128i_s32(
      vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Bitwise XOR over the full 128-bit integer value.
// https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) {
  return vreinterpretq_m128i_s32(
      veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Broadcast the lower double of a to both lanes of dst.
// Fix: use `#if defined(__aarch64__)` like every other conditional in this
// file; the bare `#if (__aarch64__)` form is inconsistent and warns under
// -Wundef when the macro is not defined.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd
FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) {
#if defined(__aarch64__)
  return vreinterpretq_m128d_f64(
      vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
#else
  return vreinterpretq_m128d_u64(
      vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
#endif
}

// Duplicate the odd-indexed float lanes: dst = {a1, a1, a3, a3}.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) {
#if __has_builtin(__builtin_shufflevector)
  return vreinterpretq_m128_f32(__builtin_shufflevector(
      vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
#else
  float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
  float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
  float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
  return vreinterpretq_m128_f32(vld1q_f32(data));
#endif
}
// Duplicate the even-indexed float lanes: dst = {a0, a0, a2, a2}.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) {
#if __has_builtin(__builtin_shufflevector)
  return vreinterpretq_m128_f32(__builtin_shufflevector(
      vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
#else
  float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
  float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
  float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
  return vreinterpretq_m128_f32(vld1q_f32(data));
#endif
}

// Move the upper two floats of B into the lower two lanes of the result; the
// upper two lanes come from A.
//
// r3 := a3
// r2 := a2
// r1 := b3
// r0 := b2
FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B) {
  float32x2_t a_hi = vget_high_f32(vreinterpretq_f32_m128(__A));
  float32x2_t b_hi = vget_high_f32(vreinterpretq_f32_m128(__B));
  return vreinterpretq_m128_f32(vcombine_f32(b_hi, a_hi));
}

// Move the lower two floats of B into the upper two lanes of the result; the
// lower two lanes come from A.
//
// r3 := b1
// r2 := b0
// r1 := a1
// r0 := a0
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) {
  float32x2_t a_lo = vget_low_f32(vreinterpretq_f32_m128(__A));
  float32x2_t b_lo = vget_low_f32(vreinterpretq_f32_m128(__B));
  return vreinterpretq_m128_f32(vcombine_f32(a_lo, b_lo));
}
// Build an 8-bit mask from the most significant bit of each byte of a.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8
FORCE_INLINE int _mm_movemask_pi8(__m64 a) {
  uint8x8_t input = vreinterpret_u8_m64(a);
#if defined(__aarch64__)
  // Scale each sign bit into its mask position, then horizontal-add.
  static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
  uint8x8_t tmp = vshr_n_u8(input, 7);
  return vaddv_u8(vshl_u8(tmp, shift));
#else
  // Same shift-accumulate trick as _mm_movemask_epi8.
  uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
  uint32x2_t paired16 =
      vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
  uint8x8_t paired32 = vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
  return vget_lane_u8(paired32, 0) | ((int)vget_lane_u8(paired32, 4) << 4);
#endif
}

// Per-lane absolute value of four signed 32-bit integers.
//
// FOR j := 0 to 3
//   i := j*32
//   dst[i+31:i] := ABS(a[i+31:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) {
  return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
}

// Per-lane absolute value of eight signed 16-bit integers.
//
// FOR j := 0 to 7
//   i := j*16
//   dst[i+15:i] := ABS(a[i+15:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) {
  return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
}
// Per-lane absolute value of sixteen signed 8-bit integers.
//
// FOR j := 0 to 15
//   i := j*8
//   dst[i+7:i] := ABS(a[i+7:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) {
  return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
}

// Per-lane absolute value of two signed 32-bit integers (64-bit vector).
//
// FOR j := 0 to 1
//   i := j*32
//   dst[i+31:i] := ABS(a[i+31:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) {
  return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
}

// Per-lane absolute value of four signed 16-bit integers (64-bit vector).
//
// FOR j := 0 to 3
//   i := j*16
//   dst[i+15:i] := ABS(a[i+15:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) {
  return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
}

// Per-lane absolute value of eight signed 8-bit integers (64-bit vector).
//
// FOR j := 0 to 7
//   i := j*8
//   dst[i+7:i] := ABS(a[i+7:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) {
  return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
}
// SSSE3 _mm_alignr_epi8: concatenate a (high) : b (low) into a 32-byte value,
// shift right by imm bytes, keep the low 16 bytes.
//
// tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)
// dst[127:0] := tmp[127:0]
//
// Implemented with vextq_u8, whose index must fold to a compile-time constant.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8
#define _mm_alignr_epi8(a, b, imm)                                       \
  __extension__({                                                        \
    __m128i ret;                                                         \
    if (unlikely((imm) >= 32)) {                                         \
      /* Everything shifted out: result is zero. */                      \
      ret = _mm_setzero_si128();                                         \
    } else {                                                             \
      uint8x16_t lo, hi;                                                 \
      if (imm >= 16) {                                                   \
        /* Only bytes of a remain; pad with zeros from above. */         \
        const int idx = imm - 16;                                        \
        lo = vreinterpretq_u8_m128i(a);                                  \
        hi = vdupq_n_u8(0);                                              \
        ret = vreinterpretq_m128i_u8(vextq_u8(lo, hi, idx));             \
      } else {                                                           \
        /* Window straddles b (low) and a (high). */                     \
        const int idx = imm;                                             \
        lo = vreinterpretq_u8_m128i(b);                                  \
        hi = vreinterpretq_u8_m128i(a);                                  \
        ret = vreinterpretq_m128i_u8(vextq_u8(lo, hi, idx));             \
      }                                                                  \
    }                                                                    \
    ret;                                                                 \
  })

// MMX _mm_alignr_pi8: concatenate a (high) : b (low) into a 16-byte value,
// shift right by imm bytes, keep the low 8 bytes.
//
// tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
// dst[63:0]  := tmp[63:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8
#define _mm_alignr_pi8(a, b, imm)                                        \
  __extension__({                                                        \
    __m64 ret;                                                           \
    if (unlikely((imm) >= 16)) {                                         \
      /* Everything shifted out: result is zero. */                      \
      ret = vreinterpret_m64_s8(vdup_n_s8(0));                           \
    } else {                                                             \
      uint8x8_t lo, hi;                                                  \
      if (imm >= 8) {                                                    \
        /* Only bytes of a remain; pad with zeros from above. */         \
        const int idx = imm - 8;                                         \
        lo = vreinterpret_u8_m64(a);                                     \
        hi = vdup_n_u8(0);                                               \
        ret = vreinterpret_m64_u8(vext_u8(lo, hi, idx));                 \
      } else {                                                           \
        /* Window straddles b (low) and a (high). */                     \
        const int idx = imm;                                             \
        lo = vreinterpret_u8_m64(b);                                     \
        hi = vreinterpret_u8_m64(a);                                     \
        ret = vreinterpret_m64_u8(vext_u8(lo, hi, idx));                 \
      }                                                                  \
    }                                                                    \
    ret;                                                                 \
  })
// Helper for _mm_shuffle_ps: dst = {a2, a3, b0, b1}.
FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) {
  float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
  float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
  return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
}

// Helper for _mm_shuffle_ps: dst = {a1, a0, b3, b2}.
FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) {
  float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
  float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
  return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
}

// Helper for _mm_shuffle_ps: dst = {a1, a2, b3, b0}.
FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) {
  float32x2_t a21 = vget_high_f32(
      vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
  float32x2_t b03 = vget_low_f32(
      vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
  return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
}

// Helper for _mm_shuffle_ps: dst = {a3, a0, b1, b2}.
FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) {
  float32x2_t a03 = vget_low_f32(
      vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
  float32x2_t b21 = vget_high_f32(
      vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
  return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
}

// Helper for _mm_shuffle_ps: dst = {a0, a1, b0, b1}.
FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) {
  float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
  float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
  return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
}

// Helper for _mm_shuffle_ps: dst = {a1, a0, b0, b1}.
FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) {
  float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
  float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
  return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
}

// Helper for _mm_shuffle_ps: dst = {a1, a0, b1, b0}.
FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) {
  float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
  float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
  return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
}

// Helper for _mm_shuffle_ps: keep the low 64 bits of a in the low half and
// put the high 64 bits of b in the high half.
FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) {
  float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
  float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
  return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
}

// Helper for _mm_shuffle_ps: dst = {a1, a1, b0, b0}.
FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) {
  float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
  float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
  return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
}

// Helper for _mm_shuffle_ps: dst = {a2, a2, b0, b0}.
FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) {
  float32x2_t a22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
  float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
  return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
}

// Helper for _mm_shuffle_ps: dst = {a0, a0, b2, b2}.
FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) {
  float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
  float32x2_t b22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
  return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
}

// Helper for _mm_shuffle_ps: dst = {a2, a0, b2, b3}.
FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) {
  float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
  float32x2_t a22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
  float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
  float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
  return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
}

// Helper for _mm_shuffle_ps: dst = {a3, a3, b1, b1}.
FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) {
  float32x2_t a33 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
  float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
  return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
}

// Helper for _mm_shuffle_ps: dst = {a0, a1, b0, b2}.
FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) {
  float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
  float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
  float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
  float32x2_t b20 = vset_lane_f32(b2, b00, 1);
  return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
}

// Helper for _mm_shuffle_ps: dst = {a1, a0, b0, b2}.
// Fix: extract lane 2 of b through vreinterpretq_f32_m128 like the sibling
// helpers, instead of passing the raw __m128 to vgetq_lane_f32.
FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) {
  float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
  float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
  float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
  float32x2_t b20 = vset_lane_f32(b2, b00, 1);
  return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
}

// Helper for _mm_shuffle_ps: dst = {a2, a3, b0, b2}.
// Fix: same vreinterpretq_f32_m128 wrapper on the vgetq_lane_f32 argument.
FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) {
  float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
  float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
  float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
  float32x2_t b20 = vset_lane_f32(b2, b00, 1);
  return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
}

// NEON does not support a general purpose permute intrinsic.
// Selects four specific single-precision, floating-point values from a and b,
// based on the mask i.
+// +// C equivalent: +// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, +// __constrange(0, 255) int imm) { +// __m128 ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; +// return ret; +// } +// +// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx +#define _mm_shuffle_ps_default(a, b, imm) \ + __extension__({ \ + float32x4_t ret; \ + ret = \ + vmovq_n_f32(vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), ret, \ + 1); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), ret, \ + 2); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), ret, \ + 3); \ + vreinterpretq_m128_f32(ret); \ + }) + +// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) +// int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_ps(a, b, imm) \ + __extension__({ \ + float32x4_t _input1 = vreinterpretq_f32_m128(a); \ + float32x4_t _input2 = vreinterpretq_f32_m128(b); \ + float32x4_t _shuf = __builtin_shufflevector( \ + _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128_f32(_shuf); \ + }) +#else // generic +#define _mm_shuffle_ps(a, b, imm) \ + __extension__({ \ + __m128 ret; \ + switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_ps_1032((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_ps_2301((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_ps_0321((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_ps_2103((a), (b)); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_movelh_ps((a), (b)); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_ps_1001((a), (b)); \ + 
break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_ps_0101((a), (b)); \ + break; \ + case _MM_SHUFFLE(3, 2, 1, 0): \ + ret = _mm_shuffle_ps_3210((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 0, 1, 1): \ + ret = _mm_shuffle_ps_0011((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 0, 2, 2): \ + ret = _mm_shuffle_ps_0022((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 2, 0, 0): \ + ret = _mm_shuffle_ps_2200((a), (b)); \ + break; \ + case _MM_SHUFFLE(3, 2, 0, 2): \ + ret = _mm_shuffle_ps_3202((a), (b)); \ + break; \ + case _MM_SHUFFLE(3, 2, 3, 2): \ + ret = _mm_movehl_ps((b), (a)); \ + break; \ + case _MM_SHUFFLE(1, 1, 3, 3): \ + ret = _mm_shuffle_ps_1133((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 0, 1, 0): \ + ret = _mm_shuffle_ps_2010((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 0, 0, 1): \ + ret = _mm_shuffle_ps_2001((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 0, 3, 2): \ + ret = _mm_shuffle_ps_2032((a), (b)); \ + break; \ + default: \ + ret = _mm_shuffle_ps_default((a), (b), (imm)); \ + break; \ + } \ + ret; \ + }) +#endif + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of a and places it into the high end of the result. +FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a) { + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a10)); +} + +// takes the lower two 32-bit values from a and swaps them and places in low end +// of result takes the higher two 32 bit values from a and swaps them and places +// in high end of result. 
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a) { + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a23)); +} + +// rotates the least significant 32 bits into the most signficant 32 bits, and +// shifts the rest down +FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a) { + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1)); +} + +// rotates the most significant 32 bits into the least signficant 32 bits, and +// shifts the rest up +FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) { + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3)); +} + +// gets the lower 64 bits of a, and places it in the upper 64 bits +// gets the lower 64 bits of a and places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) { + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a10, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the +// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) { + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the +// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and +// places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) { + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) { + int32x2_t a11 = 
vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1); + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) { + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) { + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); +} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8 +FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) { + int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a + uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b + uint8x16_t idx_masked = + vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits +#if defined(__aarch64__) + return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); +#elif defined(__GNUC__) + int8x16_t ret; + // %e and %f represent the even and odd D registers + // respectively. 
+ __asm__ __volatile__("vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n" + "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n" + : [ret] "=&w"(ret) + : [tbl] "w"(tbl), [idx] "w"(idx_masked)); + return vreinterpretq_m128i_s8(ret); +#else + // use this line if testing on aarch64 + int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)}; + return vreinterpretq_m128i_s8( + vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)), + vtbl2_s8(a_split, vget_high_u8(idx_masked)))); +#endif +} + +// C equivalent: +// __m128i _mm_shuffle_epi32_default(__m128i a, +// __constrange(0, 255) int imm) { +// __m128i ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; +// return ret; +// } +#define _mm_shuffle_epi32_default(a, imm) \ + __extension__({ \ + int32x4_t ret; \ + ret = vmovq_n_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), ret, \ + 1); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), ret, \ + 2); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), ret, \ + 3); \ + vreinterpretq_m128i_s32(ret); \ + }) + +// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255) +// int imm) +#if defined(__aarch64__) +#define _mm_shuffle_epi32_splat(a, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \ + }) +#else +#define _mm_shuffle_epi32_splat(a, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \ + }) +#endif + +// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. 
+// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx +// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, +// __constrange(0,255) int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_epi32(a, imm) \ + __extension__({ \ + int32x4_t _input = vreinterpretq_s32_m128i(a); \ + int32x4_t _shuf = __builtin_shufflevector( \ + _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, ((imm) >> 4) & 0x3, \ + ((imm) >> 6) & 0x3); \ + vreinterpretq_m128i_s32(_shuf); \ + }) +#else // generic +#define _mm_shuffle_epi32(a, imm) \ + __extension__({ \ + __m128i ret; \ + switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_epi_1032((a)); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_epi_2301((a)); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_epi_0321((a)); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_epi_2103((a)); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_shuffle_epi_1010((a)); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_epi_1001((a)); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_epi_0101((a)); \ + break; \ + case _MM_SHUFFLE(2, 2, 1, 1): \ + ret = _mm_shuffle_epi_2211((a)); \ + break; \ + case _MM_SHUFFLE(0, 1, 2, 2): \ + ret = _mm_shuffle_epi_0122((a)); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 2): \ + ret = _mm_shuffle_epi_3332((a)); \ + break; \ + case _MM_SHUFFLE(0, 0, 0, 0): \ + ret = _mm_shuffle_epi32_splat((a), 0); \ + break; \ + case _MM_SHUFFLE(1, 1, 1, 1): \ + ret = _mm_shuffle_epi32_splat((a), 1); \ + break; \ + case _MM_SHUFFLE(2, 2, 2, 2): \ + ret = _mm_shuffle_epi32_splat((a), 2); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 3): \ + ret = _mm_shuffle_epi32_splat((a), 3); \ + break; \ + default: \ + ret = _mm_shuffle_epi32_default((a), (imm)); \ + break; \ + } \ + ret; \ + }) +#endif + +// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified +// by imm. 
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100) +// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a, +// __constrange(0,255) int +// imm) +#define _mm_shufflelo_epi16_function(a, imm) \ + __extension__({ \ + int16x8_t ret = vreinterpretq_s16_m128i(a); \ + int16x4_t lowBits = vget_low_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, 1); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, 2); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, 3); \ + vreinterpretq_m128i_s16(ret); \ + }) + +// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, +// __constrange(0,255) int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shufflelo_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = __builtin_shufflevector( \ + _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ + (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) +#endif + +// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified +// by imm. 
+// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx +// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a, +// __constrange(0,255) int +// imm) +#define _mm_shufflehi_epi16_function(a, imm) \ + __extension__({ \ + int16x8_t ret = vreinterpretq_s16_m128i(a); \ + int16x4_t highBits = vget_high_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, 5); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, 6); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, 7); \ + vreinterpretq_m128i_s16(ret); \ + }) + +// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, +// __constrange(0,255) int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shufflehi_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = __builtin_shufflevector( \ + _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ + (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ + (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) +#endif + +// Shuffle double-precision (64-bit) floating-point elements using the control +// in imm8, and store the results in dst. +// +// dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +// dst[127:64] := (imm8[1] == 0) ? 
b[63:0] : b[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_pd(a, b, imm8) \ + vreinterpretq_m128d_s64(__builtin_shufflevector( \ + vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \ + ((imm8 & 0x2) >> 1) + 2)) +#else +#define _mm_shuffle_pd(a, b, imm8) \ + _mm_castsi128_pd(_mm_set_epi64x( \ + vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ + vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) +#endif + +// Blend packed 16-bit integers from a and b using control mask imm8, and store +// the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF imm8[j] +// dst[i+15:i] := b[i+15:i] +// ELSE +// dst[i+15:i] := a[i+15:i] +// FI +// ENDFOR +// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, +// __constrange(0,255) int imm) +#define _mm_blend_epi16(a, b, imm) \ + __extension__({ \ + const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t)-1 : 0x0, \ + ((imm) & (1 << 1)) ? (uint16_t)-1 : 0x0, \ + ((imm) & (1 << 2)) ? (uint16_t)-1 : 0x0, \ + ((imm) & (1 << 3)) ? (uint16_t)-1 : 0x0, \ + ((imm) & (1 << 4)) ? (uint16_t)-1 : 0x0, \ + ((imm) & (1 << 5)) ? (uint16_t)-1 : 0x0, \ + ((imm) & (1 << 6)) ? (uint16_t)-1 : 0x0, \ + ((imm) & (1 << 7)) ? (uint16_t)-1 : 0x0}; \ + uint16x8_t _mask_vec = vld1q_u16(_mask); \ + uint16x8_t _a = vreinterpretq_u16_m128i(a); \ + uint16x8_t _b = vreinterpretq_u16_m128i(b); \ + vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \ + }) + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using control mask imm8, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd +#define _mm_blend_pd(a, b, imm) \ + __extension__({ \ + const uint64_t _mask[2] = { \ + ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \ + ((imm) & (1 << 1)) ? 
~UINT64_C(0) : UINT64_C(0)}; \ + uint64x2_t _mask_vec = vld1q_u64(_mask); \ + uint64x2_t _a = vreinterpretq_u64_m128d(a); \ + uint64x2_t _b = vreinterpretq_u64_m128d(b); \ + vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \ + }) + +// Blend packed 8-bit integers from a and b using mask, and store the results in +// dst. +// +// FOR j := 0 to 15 +// i := j*8 +// IF mask[i+7] +// dst[i+7:i] := b[i+7:i] +// ELSE +// dst[i+7:i] := a[i+7:i] +// FI +// ENDFOR +FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) { + // Use a signed shift right to create a mask with the sign bit + uint8x16_t mask = + vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7)); + uint8x16_t a = vreinterpretq_u8_m128i(_a); + uint8x16_t b = vreinterpretq_u8_m128i(_b); + return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a)); +} + +/* Shifts */ + +// Shift packed 16-bit integers in a right by imm while shifting in sign +// bits, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16 +FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) { + const int count = (imm & ~15) ? 15 : imm; + return (__m128i)vshlq_s16((int16x8_t)a, vdupq_n_s16(-count)); +} + +// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while +// shifting in zeros. +// +// r0 := a0 << count +// r1 := a1 << count +// ... +// r7 := a7 << count +// +// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx +#define _mm_slli_epi16(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm)) <= 0) { \ + ret = a; \ + } \ + if (unlikely((imm) > 15)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_s16( \ + vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \ + } \ + ret; \ + }) + +// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while +// shifting in zeros. 
: +// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx +// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm) +FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) { + if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */ + return a; + if (unlikely(imm > 31)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s32( + vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm))); +} + +// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) { + if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */ + return a; + if (unlikely(imm > 63)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s64( + vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm))); +} + +// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF imm8[7:0] > 15 +// dst[i+15:i] := 0 +// ELSE +// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16 +#define _mm_srli_epi16(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely(imm) == 0) { \ + ret = a; \ + } \ + if (likely(0 < (imm) && (imm) < 16)) { \ + ret = vreinterpretq_m128i_u16( \ + vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \ + } else { \ + ret = _mm_setzero_si128(); \ + } \ + ret; \ + }) + +// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. 
//
//   FOR j := 0 to 3
//       i := j*32
//       IF imm8[7:0] > 31
//           dst[i+31:i] := 0
//       ELSE
//           dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
//       FI
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_srli_epi32(a, imm)                                                \
    __extension__({                                                           \
        __m128i ret;                                                          \
        if (unlikely((imm) == 0)) {                                           \
            ret = a;                                                          \
        /* BUGFIX: `else` added; previously imm == 0 set ret = a and then */  \
        /* fell through to the trailing else, returning zero instead of a. */ \
        } else if (likely(0 < (imm) && (imm) < 32)) {                         \
            ret = vreinterpretq_m128i_u32(                                    \
                vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm))));  \
        } else {                                                              \
            ret = _mm_setzero_si128();                                        \
        }                                                                     \
        ret;                                                                  \
    })

// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
//
//   FOR j := 0 to 1
//       i := j*64
//       IF imm8[7:0] > 63
//           dst[i+63:i] := 0
//       ELSE
//           dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
//       FI
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
#define _mm_srli_epi64(a, imm)                                                \
    __extension__({                                                           \
        __m128i ret;                                                          \
        if (unlikely((imm) == 0)) {                                           \
            ret = a;                                                          \
        /* BUGFIX: `else` added; previously imm == 0 set ret = a and then */  \
        /* fell through to the trailing else, returning zero instead of a. */ \
        } else if (likely(0 < (imm) && (imm) < 64)) {                         \
            ret = vreinterpretq_m128i_u64(                                    \
                vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm))));  \
        } else {                                                              \
            ret = _mm_setzero_si128();                                        \
        }                                                                     \
        ret;                                                                  \
    })

// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
// and store the results in dst.
//
//   FOR j := 0 to 3
//       i := j*32
//       IF imm8[7:0] > 31
//           dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
//       ELSE
//           dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
//       FI
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_srai_epi32(a, imm)                                                \
    __extension__({                                                           \
        __m128i ret;                                                          \
        if (unlikely((imm) == 0)) {                                           \
            ret = a;                                                          \
        /* BUGFIX: `else` added; previously imm == 0 set ret = a and then */  \
        /* fell through to the trailing else, sign-filling every lane */      \
        /* instead of returning a unchanged. */                               \
        } else if (likely(0 < (imm) && (imm) < 32)) {                         \
            ret = vreinterpretq_m128i_s32(                                    \
                vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm))));  \
        } else {                                                              \
            ret = vreinterpretq_m128i_s32(                                    \
                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));                 \
        }                                                                     \
        ret;                                                                  \
    })

// Shifts the 128-bit value in a right by imm bytes while shifting in zeros.
// imm must be an immediate.
//
//   r := srl(a, imm*8)
//
// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
#define _mm_srli_si128(a, imm)                                                \
    __extension__({                                                           \
        __m128i ret;                                                          \
        if (unlikely((imm) <= 0)) {                                           \
            ret = a;                                                          \
        /* BUGFIX: `else` added so imm <= 0 does not fall through and */      \
        /* clobber ret via the vextq_s8 branch (immediate must be 0..15). */  \
        } else if (unlikely((imm) > 15)) {                                    \
            ret = _mm_setzero_si128();                                        \
        } else {                                                              \
            ret = vreinterpretq_m128i_s8(                                     \
                vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm)));   \
        }                                                                     \
        ret;                                                                  \
    })

// Shifts the 128-bit value in a left by imm bytes while shifting in zeros.
// imm must be an immediate.
//
//   r := a << (imm * 8)
//
// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
#define _mm_slli_si128(a, imm)                                                \
    __extension__({                                                           \
        __m128i ret;                                                          \
        if (unlikely((imm) <= 0)) {                                           \
            ret = a;                                                          \
        /* BUGFIX: `else` added so imm <= 0 does not fall through to the */   \
        /* vextq_s8 branch, where 16 - imm would exceed the 0..15 */          \
        /* immediate range. */                                                \
        } else if (unlikely((imm) > 15)) {                                    \
            ret = _mm_setzero_si128();                                        \
        } else {                                                              \
            ret = vreinterpretq_m128i_s8(vextq_s8(                            \
                vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm)));       \
        }                                                                     \
        ret;                                                                  \
    })

// Compute the square root of packed double-precision (64-bit) floating-point
// elements in a, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd +FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) { +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a))); +#else + double a0 = sqrt(((double *)&a)[0]); + double a1 = sqrt(((double *)&a)[1]); + return _mm_set_pd(a1, a0); +#endif +} + +// Compute the square root of the lower double-precision (64-bit) floating-point +// element in b, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd +FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) { +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_sqrt_pd(b)); +#else + return _mm_set_pd(((double *)&a)[1], sqrt(((double *)&b)[0])); +#endif +} + +// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while +// shifting in zeros. +// +// r0 := a0 << count +// r1 := a1 << count +// ... +// r7 := a7 << count +// +// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) { + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16((int16_t)c); + return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); +} + +// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while +// shifting in zeros. 
+// +// r0 := a0 << count +// r1 := a1 << count +// r2 := a2 << count +// r3 := a3 << count +// +// https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) { + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32((int32_t)c); + return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); +} + +// Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while +// shifting in zeros. +// +// r0 := a0 << count +// r1 := a1 << count +// +// https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) { + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64((int64_t)c); + return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); +} + +// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits +// while shifting in zeros. +// +// r0 := srl(a0, count) +// r1 := srl(a1, count) +// ... +// r7 := srl(a7, count) +// +// https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) { + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16(-(int16_t)c); + return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); +} + +// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits +// while shifting in zeros. 
+// +// r0 := srl(a0, count) +// r1 := srl(a1, count) +// r2 := srl(a2, count) +// r3 := srl(a3, count) +// +// https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) { + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32(-(int32_t)c); + return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); +} + +// Shifts the 2 signed or unsigned 64-bit integers in a right by count bits +// while shifting in zeros. +// +// r0 := srl(a0, count) +// r1 := srl(a1, count) +// +// https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) { + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64(-(int64_t)c); + return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); +} + +// NEON does not provide a version of this function. +// Creates a 16-bit mask from the most significant bits of the 16 signed or +// unsigned 8-bit integers in a and zero extends the upper bits. +// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx +FORCE_INLINE int _mm_movemask_epi8(__m128i a) { + // Use increasingly wide shifts+adds to collect the sign bits + // together. + // Since the widening shifts would be rather confusing to follow in little + // endian, everything will be illustrated in big endian order instead. This + // has a different result - the bits would actually be reversed on a big + // endian machine. + + // Starting input (only half the elements are shown): + // 89 ff 1d c0 00 10 99 33 + uint8x16_t input = vreinterpretq_u8_m128i(a); + + // Shift out everything but the sign bits with an unsigned shift right. 
+ // + // Bytes of the vector:: + // 89 ff 1d c0 00 10 99 33 + // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7) + // | | | | | | | | + // 01 01 00 01 00 00 01 00 + // + // Bits of first important lane(s): + // 10001001 (89) + // \______ + // | + // 00000001 (01) + uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); + + // Merge the even lanes together with a 16-bit unsigned shift right + add. + // 'xx' represents garbage data which will be ignored in the final result. + // In the important bytes, the add functions like a binary OR. + // + // 01 01 00 01 00 00 01 00 + // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7)) + // \| \| \| \| + // xx 03 xx 01 xx 00 xx 02 + // + // 00000001 00000001 (01 01) + // \_______ | + // \| + // xxxxxxxx xxxxxx11 (xx 03) + uint32x4_t paired16 = + vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + + // Repeat with a wider 32-bit shift + add. + // xx 03 xx 01 xx 00 xx 02 + // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >> + // 14)) + // \| \| + // xx xx xx 0d xx xx xx 02 + // + // 00000011 00000001 (03 01) + // \\_____ || + // '----.\|| + // xxxxxxxx xxxx1101 (xx 0d) + uint64x2_t paired32 = + vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + + // Last, an even wider 64-bit shift + add to get our result in the low 8 bit + // lanes. xx xx xx 0d xx xx xx 02 + // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >> + // 28)) + // \| + // xx xx xx xx xx xx xx d2 + // + // 00001101 00000010 (0d 02) + // \ \___ | | + // '---. \| | + // xxxxxxxx 11010010 (xx d2) + uint8x16_t paired64 = + vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + + // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts. + // xx xx xx xx xx xx xx d2 + // || return paired64[0] + // d2 + // Note: Little endian would return the correct value 4b (01001011) instead. 
+ return vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8); +} + +// Set each bit of mask dst based on the most significant bit of the +// corresponding packed double-precision (64-bit) floating-point element in a. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd +FORCE_INLINE int _mm_movemask_pd(__m128d a) { + uint64x2_t input = vreinterpretq_u64_m128d(a); + uint64x2_t high_bits = vshrq_n_u64(input, 63); + return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1); +} + +// Copy the lower 64-bit integer in a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64 +FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) { + return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a))); +} + +// Copy the 64-bit integer a to the lower element of dst, and zero the upper +// element. +// +// dst[63:0] := a[63:0] +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64 +FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) { + return vreinterpretq_m128i_s64( + vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0))); +} + +// NEON does not provide this method +// Creates a 4-bit mask from the most significant bits of the four +// single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx +FORCE_INLINE int _mm_movemask_ps(__m128 a) { + uint32x4_t input = vreinterpretq_u32_m128(a); +#if defined(__aarch64__) + static const int32x4_t shift = {0, 1, 2, 3}; + uint32x4_t tmp = vshrq_n_u32(input, 31); + return vaddvq_u32(vshlq_u32(tmp, shift)); +#else + // Uses the exact same method as _mm_movemask_epi8, see that for details. + // Shift out everything but the sign bits with a 32-bit unsigned shift + // right. 
+ uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31)); + // Merge the two pairs together with a 64-bit unsigned shift right + add. + uint8x16_t paired = + vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); + // Extract the result. + return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); +#endif +} + +// Compute the bitwise NOT of a and then AND with a 128-bit vector containing +// all 1's, and return 1 if the result is zero, otherwise return 0. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones +FORCE_INLINE int _mm_test_all_ones(__m128i a) { + return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == + ~(uint64_t)0; +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and return 1 if the result is zero, otherwise return 0. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros +FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) { + int64x2_t a_and_mask = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask)); + return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0 + : 1; +} + +/* Math operations */ + +// Subtracts the four single-precision, floating-point values of a and b. +// +// r0 := a0 - b0 +// r1 := a1 - b1 +// r2 := a2 - b2 +// r3 := a3 - b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) { + return vreinterpretq_m128_f32( + vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Subtract the lower single-precision (32-bit) floating-point element in b from +// the lower single-precision (32-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper 3 packed elements from +// a to the upper elements of dst. 
// Subtract the lower single-precision (32-bit) floating-point element in b
// from the lower element of a, store the result in the lower element of dst,
// and copy the upper 3 packed elements from a.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) {
    return _mm_move_ss(a, _mm_sub_ps(a, b));
}

// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi64
FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) {
    return vreinterpretq_m128i_s64(
        vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
}

// Subtracts the 4 signed or unsigned 32-bit integers of b from those of a.
// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) {
    return vreinterpretq_m128i_s32(
        vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Subtract packed 16-bit integers in b from packed 16-bit integers in a.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16
FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) {
    return vreinterpretq_m128i_s16(
        vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Subtract packed 8-bit integers in b from packed 8-bit integers in a.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8
FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) {
    return vreinterpretq_m128i_s8(
        vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}
// Subtract 64-bit integer b from 64-bit integer a.
//
// dst[63:0] := a[63:0] - b[63:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) {
    return vreinterpret_m64_s64(
        vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
}

// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
// integers of a, with unsigned saturation.
// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) {
    return vreinterpretq_m128i_u16(
        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}

// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
// integers of a, with unsigned saturation.
//
// r0  := UnsignedSaturate(a0 - b0)
// ...
// r15 := UnsignedSaturate(a15 - b15)
//
// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) {
    return vreinterpretq_m128i_u8(
        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}

// The unordered scalar-double comparisons are implemented as aliases of the
// ordered variants here.
#define _mm_ucomieq_sd _mm_comieq_sd
#define _mm_ucomige_sd _mm_comige_sd
#define _mm_ucomigt_sd _mm_comigt_sd
#define _mm_ucomile_sd _mm_comile_sd
#define _mm_ucomilt_sd _mm_comilt_sd
#define _mm_ucomineq_sd _mm_comineq_sd

// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit
// integers of a, with signed saturation.
//
// r0  := SignedSaturate(a0 - b0)
// ...
// r15 := SignedSaturate(a15 - b15)
//
// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) {
    return vreinterpretq_m128i_s8(
        vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}
// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit
// integers of a, with signed saturation.
//
// r0 := SignedSaturate(a0 - b0)
// ...
// r7 := SignedSaturate(a7 - b7)
//
// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) {
    return vreinterpretq_m128i_s16(
        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Subtract packed double-precision (64-bit) floating-point elements in b from
// packed double-precision elements in a.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) {
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // Fallback for targets without float64 NEON vectors: compute each element
    // through scalar doubles and reload the result.
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] - db[0];
    c[1] = da[1] - db[1];
    return vld1q_f32((float32_t *) c);
#endif
}

// Subtract the lower double-precision (64-bit) floating-point element in b
// from the lower element of a, store the result in the lower element of dst,
// and copy the upper element from a.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b) {
    return _mm_move_sd(a, _mm_sub_pd(a, b));
}
// Add packed unsigned 16-bit integers in a and b using saturation.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) {
    return vreinterpretq_m128i_u16(
        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}

// Negate packed 8-bit integers in a when the corresponding signed 8-bit
// integer in b is negative; lanes of dst are zeroed when the corresponding
// lane of b is zero, and copied from a otherwise.
FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) {
    int8x16_t a = vreinterpretq_s8_m128i(_a);
    int8x16_t b = vreinterpretq_s8_m128i(_b);

    // Arithmetic shift right replicates the sign bit (faster than vclt):
    // (b < 0) ? 0xFF : 0
    uint8x16_t neg_mask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));

    // (b == 0) ? 0xFF : 0
#if defined(__aarch64__)
    int8x16_t zero_mask = vreinterpretq_s8_u8(vceqzq_s8(b));
#else
    int8x16_t zero_mask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
#endif

    // Select -a where b is negative, a elsewhere, then clear lanes where
    // b == 0 (bit-clear acts as "and-not").
    int8x16_t picked = vbslq_s8(neg_mask, vnegq_s8(a), a);
    return vreinterpretq_m128i_s8(vbicq_s8(picked, zero_mask));
}

// Negate packed 16-bit integers in a when the corresponding signed 16-bit
// integer in b is negative; lanes of dst are zeroed when the corresponding
// lane of b is zero, and copied from a otherwise.
FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) {
    int16x8_t a = vreinterpretq_s16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);

    // (b < 0) ? 0xFFFF : 0 via sign-replicating shift.
    uint16x8_t neg_mask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));

    // (b == 0) ? 0xFFFF : 0
#if defined(__aarch64__)
    int16x8_t zero_mask = vreinterpretq_s16_u16(vceqzq_s16(b));
#else
    int16x8_t zero_mask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
#endif

    int16x8_t picked = vbslq_s16(neg_mask, vnegq_s16(a), a);
    return vreinterpretq_m128i_s16(vbicq_s16(picked, zero_mask));
}

// Negate packed 32-bit integers in a when the corresponding signed 32-bit
// integer in b is negative; lanes of dst are zeroed when the corresponding
// lane of b is zero, and copied from a otherwise.
FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) {
    int32x4_t a = vreinterpretq_s32_m128i(_a);
    int32x4_t b = vreinterpretq_s32_m128i(_b);

    // (b < 0) ? 0xFFFFFFFF : 0 via sign-replicating shift.
    uint32x4_t neg_mask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));

    // (b == 0) ? 0xFFFFFFFF : 0
#if defined(__aarch64__)
    int32x4_t zero_mask = vreinterpretq_s32_u32(vceqzq_s32(b));
#else
    int32x4_t zero_mask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
#endif

    int32x4_t picked = vbslq_s32(neg_mask, vnegq_s32(a), a);
    return vreinterpretq_m128i_s32(vbicq_s32(picked, zero_mask));
}
// Negate packed 16-bit integers in a when the corresponding signed 16-bit
// integer in b is negative; lanes of dst are zeroed when the corresponding
// lane of b is zero, and copied from a otherwise.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) {
    int16x4_t a = vreinterpret_s16_m64(_a);
    int16x4_t b = vreinterpret_s16_m64(_b);

    // (b < 0) ? 0xFFFF : 0 via sign-replicating shift (faster than vclt).
    uint16x4_t neg_mask = vreinterpret_u16_s16(vshr_n_s16(b, 15));

    // (b == 0) ? 0xFFFF : 0
#if defined(__aarch64__)
    int16x4_t zero_mask = vreinterpret_s16_u16(vceqz_s16(b));
#else
    int16x4_t zero_mask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
#endif

    // Select -a where b is negative, a elsewhere, then clear zero lanes.
    int16x4_t picked = vbsl_s16(neg_mask, vneg_s16(a), a);
    return vreinterpret_m64_s16(vbic_s16(picked, zero_mask));
}

// Negate packed 32-bit integers in a when the corresponding signed 32-bit
// integer in b is negative; lanes of dst are zeroed when the corresponding
// lane of b is zero, and copied from a otherwise.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) {
    int32x2_t a = vreinterpret_s32_m64(_a);
    int32x2_t b = vreinterpret_s32_m64(_b);

    // (b < 0) ? 0xFFFFFFFF : 0 via sign-replicating shift.
    uint32x2_t neg_mask = vreinterpret_u32_s32(vshr_n_s32(b, 31));

    // (b == 0) ? 0xFFFFFFFF : 0
#if defined(__aarch64__)
    int32x2_t zero_mask = vreinterpret_s32_u32(vceqz_s32(b));
#else
    int32x2_t zero_mask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
#endif

    int32x2_t picked = vbsl_s32(neg_mask, vneg_s32(a), a);
    return vreinterpret_m64_s32(vbic_s32(picked, zero_mask));
}

// Negate packed 8-bit integers in a when the corresponding signed 8-bit
// integer in b is negative; lanes of dst are zeroed when the corresponding
// lane of b is zero, and copied from a otherwise.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) {
    int8x8_t a = vreinterpret_s8_m64(_a);
    int8x8_t b = vreinterpret_s8_m64(_b);

    // (b < 0) ? 0xFF : 0 via sign-replicating shift.
    uint8x8_t neg_mask = vreinterpret_u8_s8(vshr_n_s8(b, 7));

    // (b == 0) ? 0xFF : 0
#if defined(__aarch64__)
    int8x8_t zero_mask = vreinterpret_s8_u8(vceqz_s8(b));
#else
    int8x8_t zero_mask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
#endif

    int8x8_t picked = vbsl_s8(neg_mask, vneg_s8(a), a);
    return vreinterpret_m64_s8(vbic_s8(picked, zero_mask));
}
// Average packed unsigned 16-bit integers in a and b (rounding average).
//
// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) {
    return vreinterpret_m64_u16(
        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
}

// Average packed unsigned 8-bit integers in a and b (rounding average).
//
// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) {
    return vreinterpret_m64_u8(
        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
}

// MMX-style aliases for the averaging intrinsics above.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
#define _m_pavgw(a, b) _mm_avg_pu16(a, b)

// Extract a 16-bit integer from a, selected with imm8.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)

// Copy a to dst, inserting the 16-bit integer i at the position selected by
// imm8.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
// MMX-style aliases for the packed min/max/movemask/multiply/SAD helpers.

// Packed signed 16-bit maximum.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)

// Packed unsigned 8-bit maximum.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
#define _m_pmaxub(a, b) _mm_max_pu8(a, b)

// Packed signed 16-bit minimum.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
#define _m_pminsw(a, b) _mm_min_pi16(a, b)

// Packed unsigned 8-bit minimum.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
#define _m_pminub(a, b) _mm_min_pu8(a, b)

// Create a mask from the most significant bit of each 8-bit element in a.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb
#define _m_pmovmskb(a) _mm_movemask_pi8(a)

// Multiply packed unsigned 16-bit integers, keeping the high 16 bits of each
// 32-bit intermediate product.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)

// Sum of absolute differences of packed unsigned 8-bit integers.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw
#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
// Computes the rounding average of the 16 unsigned 8-bit integers in a and b.
//
// r0  := (a0 + b0 + 1) >> 1
// ...
// r15 := (a15 + b15 + 1) >> 1
//
// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) {
    return vreinterpretq_m128i_u8(
        vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}

// Shift a left by imm8 bytes while shifting in zeros.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128
#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)

// Shift a right by imm8 bytes while shifting in zeros.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128
#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)

// Computes the rounding average of the 8 unsigned 16-bit integers in a and b.
//
// r0 := (a0 + b0 + 1) >> 1
// ...
// r7 := (a7 + b7 + 1) >> 1
//
// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) {
    // Use the file-wide vreinterpretq_m128i_* convention rather than a bare
    // C cast, so this keeps compiling under strict __m128i typedefs.
    return vreinterpretq_m128i_u16(
        vrhaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}

// Adds the four single-precision, floating-point values of a and b.
//
// r0 := a0 + b0
// r1 := a1 + b1
// r2 := a2 + b2
// r3 := a3 + b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) {
    return vreinterpretq_m128_f32(
        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}
// Add packed double-precision (64-bit) floating-point elements in a and b.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) {
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // Fallback for targets without float64 NEON vectors: compute element-wise
    // through scalar doubles and reload.
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] + db[0];
    c[1] = da[1] + db[1];
    return vld1q_f32((float32_t *) c);
#endif
}

// Add the lower double-precision (64-bit) floating-point element in a and b,
// store the result in the lower element of dst, and copy the upper element
// from a.
//
// dst[63:0]   := a[63:0] + b[63:0]
// dst[127:64] := a[127:64]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) {
#if defined(__aarch64__)
    return _mm_move_sd(a, _mm_add_pd(a, b));
#else
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] + db[0];
    c[1] = da[1];
    return vld1q_f32((float32_t *) c);
#endif
}

// Add 64-bit integers a and b.
//
// dst[63:0] := a[63:0] + b[63:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) {
    return vreinterpret_m64_s64(
        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
}

// Adds the scalar single-precision floating point values of a and b; the
// upper three lanes of the result are copied from a (adding zero to them).
// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) {
    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
    // Reinterpret `a` through the file-wide helper instead of passing the
    // __m128 directly to vaddq_f32 (required when __m128 is a strict type).
    return vreinterpretq_m128_f32(
        vaddq_f32(vreinterpretq_f32_m128(a), value));
}
// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
// unsigned 64-bit integers in b.
// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) {
    return vreinterpretq_m128i_s64(
        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
}

// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
// unsigned 32-bit integers in b.
//
// r0 := a0 + b0
// r1 := a1 + b1
// r2 := a2 + b2
// r3 := a3 + b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) {
    return vreinterpretq_m128i_s32(
        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
// unsigned 16-bit integers in b.
// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) {
    return vreinterpretq_m128i_s16(
        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
// unsigned 8-bit integers in b.
// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) {
    return vreinterpretq_m128i_s8(
        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in
// b, with signed saturation.
//
// r0 := SignedSaturate(a0 + b0)
// ...
// r7 := SignedSaturate(a7 + b7)
//
// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) {
    return vreinterpretq_m128i_s16(
        vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}
// Add packed signed 8-bit integers in a and b using saturation.
//
// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) {
    return vreinterpretq_m128i_s8(
        vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers
// in b, with unsigned saturation.
// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) {
    return vreinterpretq_m128i_u8(
        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}

// Multiplies the 8 signed or unsigned 16-bit integers from a by those from b,
// keeping the low 16 bits of each product.
//
// r0 := (a0 * b0)[15:0]
// ...
// r7 := (a7 * b7)[15:0]
//
// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) {
    return vreinterpretq_m128i_s16(
        vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Multiplies the 4 signed or unsigned 32-bit integers from a by those from b,
// keeping the low 32 bits of each product.
// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) {
    return vreinterpretq_m128i_s32(
        vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}
// Multiply packed unsigned 16-bit integers, keeping the high 16 bits of each
// 32-bit intermediate product.
//
// tmp[31:0]   := a[i+15:i] * b[i+15:i]
// dst[i+15:i] := tmp[31:16]
//
// NOTE(review): this is a duplicate of the earlier _m_pmulhuw definition;
// the replacement text is identical, so the redefinition is harmless.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)

// Multiplies the four single-precision, floating-point values of a and b.
//
// r0 := a0 * b0
// r1 := a1 * b1
// r2 := a2 * b2
// r3 := a3 * b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) {
    return vreinterpretq_m128_f32(
        vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Multiply packed double-precision (64-bit) floating-point elements in a and
// b.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) {
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // Fallback for targets without float64 NEON vectors: compute element-wise
    // through scalar doubles and reload.
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] * db[0];
    c[1] = da[1] * db[1];
    return vld1q_f32((float32_t *) c);
#endif
}

// Multiply the lower double-precision (64-bit) floating-point element in a
// and b, store the result in the lower element of dst, and copy the upper
// element from a.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) {
    return _mm_move_sd(a, _mm_mul_pd(a, b));
}
// Multiply the lower single-precision (32-bit) floating-point element in a
// and b, store the result in the lower element of dst, and copy the upper 3
// packed elements from a.
//
// dst[31:0]   := a[31:0] * b[31:0]
// dst[127:32] := a[127:32]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) {
    return _mm_move_ss(a, _mm_mul_ps(a, b));
}

// Multiply the low unsigned 32-bit integers from each packed 64-bit element
// in a and b, producing unsigned 64-bit results.
//
// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) {
    // vmull_u32 widens instead of masking, so narrow the 64-bit lanes first
    // to grab just the low 32 bits of each.
    uint32x2_t lo_a = vmovn_u64(vreinterpretq_u64_m128i(a));
    uint32x2_t lo_b = vmovn_u64(vreinterpretq_u64_m128i(b));
    return vreinterpretq_m128i_u64(vmull_u32(lo_a, lo_b));
}

// Multiply the low unsigned 32-bit integers from a and b.
//
// dst[63:0] := a[31:0] * b[31:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) {
    return vreinterpret_m64_u64(vget_low_u64(
        vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
}

// Multiply the low signed 32-bit integers from each packed 64-bit element in
// a and b, producing signed 64-bit results.
//
// r0 := (int64_t)(int32_t) a0 * (int64_t)(int32_t) b0
// r1 := (int64_t)(int32_t) a2 * (int64_t)(int32_t) b2
FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) {
    // vmull_s32 widens instead of masking, so narrow the 64-bit lanes first.
    int32x2_t lo_a = vmovn_s64(vreinterpretq_s64_m128i(a));
    int32x2_t lo_b = vmovn_s64(vreinterpretq_s64_m128i(b));
    return vreinterpretq_m128i_s64(vmull_s32(lo_a, lo_b));
}
// Multiplies the 8 signed 16-bit integers from a by those from b, then sums
// adjacent 32-bit products pairwise.
//
// r0 := (a0 * b0) + (a1 * b1)
// r1 := (a2 * b2) + (a3 * b3)
// r2 := (a4 * b4) + (a5 * b5)
// r3 := (a6 * b6) + (a7 * b7)
// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) {
    // Widening multiplies of the low and high halves, then pairwise-add to
    // collapse adjacent products.
    int32x4_t prod_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
                                  vget_low_s16(vreinterpretq_s16_m128i(b)));
    int32x4_t prod_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
                                  vget_high_s16(vreinterpretq_s16_m128i(b)));

    int32x2_t sum_lo = vpadd_s32(vget_low_s32(prod_lo), vget_high_s32(prod_lo));
    int32x2_t sum_hi = vpadd_s32(vget_low_s32(prod_hi), vget_high_s32(prod_hi));

    return vreinterpretq_m128i_s32(vcombine_s32(sum_lo, sum_hi));
}

// Conditionally store 8-bit integer elements from a into memory using mask
// (elements are stored only where the mask byte's highest bit is set).
// mem_addr does not need to be aligned on any particular boundary.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128
FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr) {
    // Replicate each mask byte's sign bit across the byte, read the current
    // memory contents, blend, and write back.
    int8x16_t select = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
    __m128 current = _mm_load_ps((const float *) mem_addr);
    int8x16_t blended =
        vbslq_s8(vreinterpretq_u8_s8(select), vreinterpretq_s8_m128i(a),
                 vreinterpretq_s8_m128(current));
    vst1q_s8((int8_t *) mem_addr, blended);
}
// Multiply packed signed 16-bit integers in a and b, producing intermediate
// signed 32-bit integers; shift right by 15 with rounding and keep the low
// 16 bits of each result.
//
// r[i] := Round(((int32_t) a[i] * (int32_t) b[i]) >> 15)
FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) {
    // vqrdmulhq_s16 is close but saturates, so it cannot be used here.

    // Widening multiplies of the low and high halves.
    int32x4_t prod_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
                                  vget_low_s16(vreinterpretq_s16_m128i(b)));
    int32x4_t prod_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
                                  vget_high_s16(vreinterpretq_s16_m128i(b)));

    // Rounding narrowing shift right: (int16_t)((prod + 16384) >> 15).
    int16x4_t narrow_lo = vrshrn_n_s32(prod_lo, 15);
    int16x4_t narrow_hi = vrshrn_n_s32(prod_hi, 15);

    return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
}

// Vertically multiply each unsigned 8-bit integer from a with the
// corresponding signed 8-bit integer from b, producing intermediate signed
// 16-bit integers; horizontally add adjacent pairs with signed saturation.
//
// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8] * b[i+15:i+8] +
//                                   a[i+7:i] * b[i+7:i] )
FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) {
#if defined(__aarch64__)
    uint8x16_t a = vreinterpretq_u8_m128i(_a);
    int8x16_t b = vreinterpretq_s8_m128i(_b);
    // Widen both halves (a zero-extended, b sign-extended), multiply, then
    // de-interleave with uzp1/uzp2 and saturating-add the pairs.
    int16x8_t prod_lo =
        vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
                  vmovl_s8(vget_low_s8(b)));
    int16x8_t prod_hi =
        vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
                  vmovl_s8(vget_high_s8(b)));
    return vreinterpretq_m128i_s16(
        vqaddq_s16(vuzp1q_s16(prod_lo, prod_hi), vuzp2q_s16(prod_lo, prod_hi)));
#else
    // x86 zero-extends a but sign-extends b, so the two operands must be
    // widened differently before multiplying.
    uint16x8_t a = vreinterpretq_u16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);

    // Zero extend a: odd bytes via logical shift, even bytes via bit-clear.
    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
    int16x8_t a_even =
        vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));

    // Sign extend b by shifting left then arithmetic-shifting right.
    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
    int16x8_t b_odd = vshrq_n_s16(b, 8);

    int16x8_t prod_even = vmulq_s16(a_even, b_even);
    int16x8_t prod_odd = vmulq_s16(a_odd, b_odd);

    // Saturating add of the even/odd products.
    return vreinterpretq_m128i_s16(vqaddq_s16(prod_even, prod_odd));
#endif
}

// Computes the fused multiply-add of 32-bit floating point numbers:
// returns a * b + c.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd
FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c) {
#if defined(__aarch64__)
    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c),
                                            vreinterpretq_f32_m128(b),
                                            vreinterpretq_f32_m128(a)));
#else
    return _mm_add_ps(_mm_mul_ps(a, b), c);
#endif
}
+// +// FOR j := 0 to 1 +// i := j*64 +// IF ((j & 1) == 0) +// dst[i+63:i] := a[i+63:i] - b[i+63:i] +// ELSE +// dst[i+63:i] := a[i+63:i] + b[i+63:i] +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd +FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b) { + __m128d mask = _mm_set_pd(1.0f, -1.0f); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a), + vreinterpretq_f64_m128d(b), + vreinterpretq_f64_m128d(mask))); +#else + return _mm_add_pd(_mm_mul_pd(b, mask), a); +#endif +} + +// Alternatively add and subtract packed single-precision (32-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps +FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) { + __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f}; + return _mm_fmadd_ps(b, mask, a); +} + +// Horizontally add adjacent pairs of double-precision (64-bit) floating-point +// elements in a and b, and pack the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd +FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) { +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *)&a; + double *db = (double *)&b; + double c[] = {da[0] + da[1], db[0] + db[1]}; + return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *)c)); +#endif +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce two +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of 64-bit elements in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8 +FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) { + uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t)a, (uint8x16_t)b)); + uint16_t r0 = t[0] + t[1] + t[2] + t[3]; + uint16_t r4 = t[4] + t[5] + t[6] + t[7]; + uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0); + return (__m128i)vsetq_lane_u16(r4, r, 4); +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8 +FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) { + uint16x4_t t = + vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); + uint16_t r0 = t[0] + t[1] + t[2] + t[3]; + return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0)); +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// +// FOR j := 0 to 7 +// i := j*8 +// tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) +// ENDFOR +// dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + +// tmp[47:40] + tmp[55:48] + tmp[63:56] dst[63:16] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw +#define _m_psadbw(a, b) _mm_sad_pu8(a, b) + +// Divides the four single-precision, floating-point values of a and b. 
+// +// r0 := a0 / b0 +// r1 := a1 / b1 +// r2 := a2 / b2 +// r3 := a3 / b3 +// +// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx +FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) { +#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV + return vreinterpretq_m128_f32( + vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b)); + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); +#if SSE2NEON_PRECISE_DIV + // Additional Netwon-Raphson iteration for accuracy + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); +#endif + return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip)); +#endif +} + +// Divides the scalar single-precision floating point value of a by b. +// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx +FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) { + float32_t value = vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Divide packed double-precision (64-bit) floating-point elements in a by +// packed elements in b, and store the results in dst. 
+// +// FOR j := 0 to 1 +// i := 64*j +// dst[i+63:i] := a[i+63:i] / b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd +FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) { +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *)&a; + double *db = (double *)&b; + double c[2]; + c[0] = da[0] / db[0]; + c[1] = da[1] / db[1]; + return vld1q_f32((float32_t *)c); +#endif +} + +// Divide the lower double-precision (64-bit) floating-point element in a by the +// lower double-precision (64-bit) floating-point element in b, store the result +// in the lower element of dst, and copy the upper element from a to the upper +// element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd +FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) { +#if defined(__aarch64__) + float64x2_t tmp = + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_f64( + vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1)); +#else + return _mm_move_sd(a, _mm_div_pd(a, b)); +#endif +} + +// Compute the approximate reciprocal of packed single-precision (32-bit) +// floating-point elements in a, and store the results in dst. The maximum +// relative error for this approximation is less than 1.5*2^-12. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps +FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) { + float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in)); + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); +#if SSE2NEON_PRECISE_DIV + // Additional Netwon-Raphson iteration for accuracy + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); +#endif + return vreinterpretq_m128_f32(recip); +} + +// Compute the approximate reciprocal of the lower single-precision (32-bit) +// floating-point element in a, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. The +// maximum relative error for this approximation is less than 1.5*2^-12. +// +// dst[31:0] := (1.0 / a[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss +FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) { + return _mm_move_ss(a, _mm_rcp_ps(a)); +} + +// Computes the approximations of square roots of the four single-precision, +// floating-point values of a. First computes reciprocal square roots and then +// reciprocals of the four values. +// +// r0 := sqrt(a0) +// r1 := sqrt(a1) +// r2 := sqrt(a2) +// r3 := sqrt(a3) +// +// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) { +#if SSE2NEON_PRECISE_SQRT + float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in)); + + // Test for vrsqrteq_f32(0) -> positive infinity case. + // Change to zero, so that s * 1/sqrt(s) result is zero too. 
+ const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000); + const uint32x4_t div_by_zero = + vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip)); + recip = vreinterpretq_f32_u32( + vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip))); + + // Additional Netwon-Raphson iteration for accuracy + recip = vmulq_f32( + vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), recip); + recip = vmulq_f32( + vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), recip); + + // sqrt(s) = s * 1/sqrt(s) + return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip)); +#elif defined(__aarch64__) + return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in))); +#else + float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in)); + float32x4_t sq = vrecpeq_f32(recipsq); + return vreinterpretq_m128_f32(sq); +#endif +} + +// Computes the approximation of the square root of the scalar single-precision +// floating point value of in. +// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) { + float32_t value = vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0)); +} + +// Computes the approximations of the reciprocal square roots of the four +// single-precision floating point values of in. +// The current precision is 1% error. 
+// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx +FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) { + float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in)); +#if SSE2NEON_PRECISE_SQRT + // Additional Netwon-Raphson iteration for accuracy + out = vmulq_f32( + out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); + out = vmulq_f32( + out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); +#endif + return vreinterpretq_m128_f32(out); +} + +// Compute the approximate reciprocal square root of the lower single-precision +// (32-bit) floating-point element in a, store the result in the lower element +// of dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss +FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) { + return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0); +} + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 +FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) { + return vreinterpret_m64_s16( + vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 +#define _m_pmaxsw(a, b) _mm_max_pi16(a, b) + +// Computes the maximums of the four single-precision, floating-point values of +// a and b. 
+// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx +FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) { +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vbslq_f32(vcltq_f32(_b, _a), _a, _b); +#else + return vreinterpretq_m128_f32( + vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8 +FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) { + return vreinterpret_m64_u8( + vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8 +#define _m_pmaxub(a, b) _mm_max_pu8(a, b) + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 +FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) { + return vreinterpret_m64_s16( + vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. 
+// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 +#define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Computes the minima of the four single-precision, floating-point values of a +// and b. +// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) { +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vbslq_f32(vcltq_f32(_a, _b), _a, _b); +#else + return vreinterpretq_m128_f32( + vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 +FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) { + return vreinterpret_m64_u8( + vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 +#define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Computes the maximum of the two lower scalar single-precision floating point +// values of a and b. +// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) { + float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Computes the minimum of the two lower scalar single-precision floating point +// values of a and b. 
+// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) { + float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the +// 16 unsigned 8-bit integers from b. +// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx +FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) { + return vreinterpretq_m128i_u8( + vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed maximum values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd +FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) { +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + uint64_t a0 = (uint64_t)vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t)vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t)vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t)vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *)&a0) > (*(double *)&b0) ? a0 : b0; + d[1] = (*(double *)&a1) > (*(double *)&b1) ? a1 : b1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the maximum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd +FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b) { +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_max_pd(a, b)); +#else + double *da = (double *)&a; + double *db = (double *)&b; + double c[2] = {fmax(da[0], db[0]), da[1]}; + return vld1q_f32((float32_t *)c); +#endif +} + +// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the +// 16 unsigned 8-bit integers from b. +// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx +FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) { + return vreinterpretq_m128i_u8( + vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed minimum values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd +FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) { +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + uint64_t a0 = (uint64_t)vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t)vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t)vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t)vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *)&a0) < (*(double *)&b0) ? a0 : b0; + d[1] = (*(double *)&a1) < (*(double *)&b1) ? a1 : b1; + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the minimum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd +FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b) { +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_min_pd(a, b)); +#else + double *da = (double *)&a; + double *db = (double *)&b; + double c[2] = {fmin(da[0], db[0]), da[1]}; + return vld1q_f32((float32_t *)c); +#endif +} + +// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 +// signed 16-bit integers from b. +// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx +FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) { + return vreinterpretq_m128i_s16( + vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8 +FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) { + return vreinterpretq_m128i_s8( + vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed unsigned 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16 +FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b) { + return vreinterpretq_m128i_u16( + vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8 +FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b) { + return vreinterpretq_m128i_s8( + vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed unsigned 16-bit integers in a and b, and store packed minimum +// values in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16 +FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b) { + return vreinterpretq_m128i_u16( + vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8 +// signed 16-bit integers from b. +// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx +FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) { + return vreinterpretq_m128i_s16( + vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// epi versions of min/max +// Computes the pariwise maximums of the four signed 32-bit integer values of a +// and b. +// +// A 128-bit parameter that can be defined with the following equations: +// r0 := (a0 > b0) ? a0 : b0 +// r1 := (a1 > b1) ? a1 : b1 +// r2 := (a2 > b2) ? a2 : b2 +// r3 := (a3 > b3) ? a3 : b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx +FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b) { + return vreinterpretq_m128i_s32( + vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Computes the pariwise minima of the four signed 32-bit integer values of a +// and b. +// +// A 128-bit parameter that can be defined with the following equations: +// r0 := (a0 < b0) ? a0 : b0 +// r1 := (a1 < b1) ? a1 : b1 +// r2 := (a2 < b2) ? a2 : b2 +// r3 := (a3 < b3) ? a3 : b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx +FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b) { + return vreinterpretq_m128i_s32( + vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed unsigned 32-bit integers in a and b, and store packed maximum +// values in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32 +FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) { + return vreinterpretq_m128i_u32( + vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); +} + +// Compare packed unsigned 32-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32 +FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) { + return vreinterpretq_m128i_u32( + vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16 +FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) { + return vreinterpret_m64_u16(vshrn_n_u32( + vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16)); +} + +// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit +// integers from b. +// +// r0 := (a0 * b0)[31:16] +// r1 := (a1 * b1)[31:16] +// ... 
+// r7 := (a7 * b7)[31:16] +// +// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) { + /* FIXME: issue with large values because of result saturation */ + // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), + // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return + // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1)); + int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b)); + int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ + int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); + return vreinterpretq_m128i_u16(r.val[1]); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16 +FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) { + uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a)); + uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b)); + uint32x4_t ab3210 = vmull_u16(a3210, b3210); +#if defined(__aarch64__) + uint32x4_t ab7654 = + vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); + uint16x8_t r = + vuzp2q_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); + return vreinterpretq_m128i_u16(r); +#else + uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a)); + uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b)); + uint32x4_t ab7654 = vmull_u16(a7654, b7654); + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); + return vreinterpretq_m128i_u16(r.val[1]); +#endif +} + +// Computes pairwise add of each argument as single-precision, floating-point +// values a and b. +// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx +FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) { +#if defined(__aarch64__) + return vreinterpretq_m128_f32( + vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32( + vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32))); +#endif +} + +// Computes pairwise add of each argument as a 16-bit signed or unsigned integer +// values a and b. 
+FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) { + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); +#if defined(__aarch64__) + return vreinterpretq_m128i_s16(vpaddq_s16(a, b)); +#else + return vreinterpretq_m128i_s16( + vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)), + vpadd_s16(vget_low_s16(b), vget_high_s16(b)))); +#endif +} + +// Horizontally subtract adjacent pairs of double-precision (64-bit) +// floating-point elements in a and b, and pack the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd +FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b) { +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vsubq_f64( + vuzp1q_f64(vreinterpretq_f64_m128d(_a), vreinterpretq_f64_m128d(_b)), + vuzp2q_f64(vreinterpretq_f64_m128d(_a), vreinterpretq_f64_m128d(_b)))); +#else + double *da = (double *)&_a; + double *db = (double *)&_b; + double c[] = {da[0] - da[1], db[0] - db[1]}; + return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *)c)); +#endif +} + +// Horizontally substract adjacent pairs of single-precision (32-bit) +// floating-point elements in a and b, and pack the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps +FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) { +#if defined(__aarch64__) + return vreinterpretq_m128_f32(vsubq_f32( + vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)), + vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)))); +#else + float32x4x2_t c = + vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)); + return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1])); +#endif +} + +// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the +// signed 16-bit results in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16 +FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b) { + return vreinterpret_m64_s16( + vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the +// signed 32-bit results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32 +FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) { + return vreinterpret_m64_s32( + vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))); +} + +// Computes pairwise difference of each argument as a 16-bit signed or unsigned +// integer values a and b. +FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b) { + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|a4|a6|b0|b2|b4|b6] + // [a1|a3|a5|a7|b1|b3|b5|b7] + int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); + int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); + // Subtract + return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357)); +} + +// Computes saturated pairwise sub of each argument as a 16-bit signed +// integer values a and b. 
+FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) { +#if defined(__aarch64__) + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_s64_s16(vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|a4|a6|b0|b2|b4|b6] + // [a1|a3|a5|a7|b1|b3|b5|b7] + int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); + int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); + // Saturated add + return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357)); +#endif +} + +// Computes saturated pairwise difference of each argument as a 16-bit signed +// integer values a and b. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16 +FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) { +#if defined(__aarch64__) + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_s64_s16(vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|a4|a6|b0|b2|b4|b6] + // [a1|a3|a5|a7|b1|b3|b5|b7] + int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); + int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); + // Saturated subtract + return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357)); +#endif +} + +// Computes pairwise add of each argument as a 32-bit signed or unsigned integer +// values a and b. 
+FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) { + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + return vreinterpretq_m128i_s32( + vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)), + vpadd_s32(vget_low_s32(b), vget_high_s32(b)))); +} + +// Computes pairwise difference of each argument as a 32-bit signed or unsigned +// integer values a and b. +FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) { + int64x2_t a = vreinterpretq_s64_m128i(_a); + int64x2_t b = vreinterpretq_s64_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|b0|b2] + // [a1|a2|b1|b3] + int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b)); + int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32)); + // Subtract + return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13)); +} + +// Kahan summation for accurate summation of floating-point numbers. +// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html +FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y) { + y -= *c; + float t = *sum + y; + *c = (t - *sum) - y; + *sum = t; +} + +// Conditionally multiply the packed single-precision (32-bit) floating-point +// elements in a and b using the high 4 bits in imm8, sum the four products, +// and conditionally store the sum in dst using the low 4 bits of imm. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps +FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) { +#if defined(__aarch64__) + /* shortcuts */ + if (imm == 0xFF) { + return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b))); + } + if (imm == 0x7F) { + float32x4_t m = _mm_mul_ps(a, b); + m[3] = 0; + return _mm_set1_ps(vaddvq_f32(m)); + } +#endif + + float s = 0, c = 0; + float32x4_t f32a = vreinterpretq_f32_m128(a); + float32x4_t f32b = vreinterpretq_f32_m128(b); + + /* To improve the accuracy of floating-point summation, Kahan algorithm + * is used for each operation. 
+ */ + if (imm & (1 << 4)) + _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]); + if (imm & (1 << 5)) + _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]); + if (imm & (1 << 6)) + _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]); + if (imm & (1 << 7)) + _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]); + s += c; + + float32x4_t res = { + (imm & 0x1) ? s : 0, + (imm & 0x2) ? s : 0, + (imm & 0x4) ? s : 0, + (imm & 0x8) ? s : 0, + }; + return vreinterpretq_m128_f32(res); +} + +/* Compare operations */ + +// Compares for less than +// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) { + return vreinterpretq_m128_u32( + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for less than +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100) +FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) { + return _mm_move_ss(a, _mm_cmplt_ps(a, b)); +} + +// Compares for greater than. +// +// r0 := (a0 > b0) ? 0xffffffff : 0x0 +// r1 := (a1 > b1) ? 0xffffffff : 0x0 +// r2 := (a2 > b2) ? 0xffffffff : 0x0 +// r3 := (a3 > b3) ? 0xffffffff : 0x0 +// +// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) { + return vreinterpretq_m128_u32( + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100) +FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) { + return _mm_move_ss(a, _mm_cmpgt_ps(a, b)); +} + +// Compares for greater than or equal. +// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) { + return vreinterpretq_m128_u32( + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for greater than or equal. 
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100) +FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) { + return _mm_move_ss(a, _mm_cmpge_ps(a, b)); +} + +// Compares for less than or equal. +// +// r0 := (a0 <= b0) ? 0xffffffff : 0x0 +// r1 := (a1 <= b1) ? 0xffffffff : 0x0 +// r2 := (a2 <= b2) ? 0xffffffff : 0x0 +// r3 := (a3 <= b3) ? 0xffffffff : 0x0 +// +// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) { + return vreinterpretq_m128_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100) +FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) { + return _mm_move_ss(a, _mm_cmple_ps(a, b)); +} + +// Compares for equality. +// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) { + return vreinterpretq_m128_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for equality. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100) +FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) { + return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); +} + +// Compares for inequality. +// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) { + return vreinterpretq_m128_u32(vmvnq_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compares for inequality. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100) +FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) { + return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); +} + +// Compares for not greater than or equal. 
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) { + return _mm_cmplt_ps(a, b); +} + +// Compares for not greater than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) { + return _mm_cmplt_ss(a, b); +} + +// Compares for not greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100) +FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) { + return _mm_cmple_ps(a, b); +} + +// Compares for not greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) +FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) { + return _mm_cmple_ss(a, b); +} + +// Compares for not less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) { + return _mm_cmpgt_ps(a, b); +} + +// Compares for not less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) { + return _mm_cmpgt_ss(a, b); +} + +// Compares for not less than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) { + return _mm_cmpge_ps(a, b); +} + +// Compares for not less than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) { + return _mm_cmpge_ss(a, b); +} + +// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or +// unsigned 8-bit integers in b for equality. 
+// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx +FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) { + return vreinterpretq_m128i_u8( + vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for equality, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd +FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b) { +#if defined(__aarch64__) + return vreinterpretq_m128d_u64( + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for equality, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd +FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b) { + return _mm_move_sd(a, _mm_cmpeq_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than-or-equal, and store the results in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd +FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) { +#if defined(__aarch64__) + return vreinterpretq_m128d_u64( + vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + uint64_t a0 = (uint64_t)vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t)vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t)vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t)vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *)&a0) >= (*(double *)&b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (*(double *)&a1) >= (*(double *)&b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd +FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) { +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_cmpge_pd(a, b)); +#else + // expand "_mm_cmpge_pd()" to reduce unnecessary operations + uint64_t a0 = (uint64_t)vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t)vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t)vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *)&a0) >= (*(double *)&b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or +// unsigned 16-bit integers in b for equality. 
+// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) { + return vreinterpretq_m128i_u16( + vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed 32-bit integers in a and b for equality, and store the results +// in dst +FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) { + return vreinterpretq_m128i_u32( + vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed 64-bit integers in a and b for equality, and store the results +// in dst +FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) { +#if defined(__aarch64__) + return vreinterpretq_m128i_u64( + vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b))); +#else + // ARMv7 lacks vceqq_u64 + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped)); +#endif +} + +// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers +// in b for lesser than. +// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx +FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) { + return vreinterpretq_m128i_u8( + vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for less-than, and store the results in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd +FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) { +#if defined(__aarch64__) + return vreinterpretq_m128d_u64( + vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + uint64_t a0 = (uint64_t)vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t)vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t)vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t)vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *)&a0) < (*(double *)&b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (*(double *)&a1) < (*(double *)&b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for less-than, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd +FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) { +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_cmplt_pd(a, b)); +#else + uint64_t a0 = (uint64_t)vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t)vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t)vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *)&a0) < (*(double *)&b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-equal, and store the results in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd +FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b) { +#if defined(__aarch64__) + return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64( + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))))); +#else + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped))); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-equal, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd +FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b) { + return _mm_move_sd(a, _mm_cmpneq_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-greater-than-or-equal, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd +FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b) { + return _mm_cmplt_pd(a, b); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-greater-than-or-equal, store the result in the lower element of +// dst, and copy the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd +FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b) { + return _mm_cmplt_sd(a, b); +} + +// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers +// in b for greater than. +// +// r0 := (a0 > b0) ? 0xff : 0x0 +// r1 := (a1 > b1) ? 0xff : 0x0 +// ... +// r15 := (a15 > b15) ? 
0xff : 0x0 +// +// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) { + return vreinterpretq_m128i_u8( + vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd +FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b) { +#if defined(__aarch64__) + return vreinterpretq_m128d_u64( + vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + uint64_t a0 = (uint64_t)vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t)vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t)vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t)vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *)&a0) > (*(double *)&b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (*(double *)&a1) > (*(double *)&b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for greater-than, store the result in the lower element of dst, and copy +// the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd +FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b) { +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_cmpgt_pd(a, b)); +#else + // expand "_mm_cmpge_pd()" to reduce unnecessary operations + uint64_t a0 = (uint64_t)vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t)vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t)vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *)&a0) > (*(double *)&b0) ? 
~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for less-than-or-equal, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd +FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b) { +#if defined(__aarch64__) + return vreinterpretq_m128d_u64( + vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + uint64_t a0 = (uint64_t)vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t)vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t)vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t)vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *)&a0) <= (*(double *)&b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (*(double *)&a1) <= (*(double *)&b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for less-than-or-equal, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd +FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b) { +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_cmple_pd(a, b)); +#else + // expand "_mm_cmpge_pd()" to reduce unnecessary operations + uint64_t a0 = (uint64_t)vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t)vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t)vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *)&a0) <= (*(double *)&b0) ? 
~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers +// in b for less than. +// +// r0 := (a0 < b0) ? 0xffff : 0x0 +// r1 := (a1 < b1) ? 0xffff : 0x0 +// ... +// r7 := (a7 < b7) ? 0xffff : 0x0 +// +// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) { + return vreinterpretq_m128i_u16( + vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers +// in b for greater than. +// +// r0 := (a0 > b0) ? 0xffff : 0x0 +// r1 := (a1 > b1) ? 0xffff : 0x0 +// ... +// r7 := (a7 > b7) ? 0xffff : 0x0 +// +// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) { + return vreinterpretq_m128i_u16( + vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers +// in b for less than. +// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) { + return vreinterpretq_m128i_u32( + vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers +// in b for greater than. +// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) { + return vreinterpretq_m128i_u32( + vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers +// in b for greater than. 
+FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) { +#if defined(__aarch64__) + return vreinterpretq_m128i_u64( + vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else + // ARMv7 lacks vcgtq_s64. + // This is based off of Clang's SSE2 polyfill: + // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi)) + + // Mask the sign bit out since we need a signed AND an unsigned comparison + // and it is ugly to try and split them. + int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull)); + int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask); + int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask); + // Check if a > b + int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask)); + // Copy upper mask to lower mask + // a_hi > b_hi + int64x2_t gt_hi = vshrq_n_s64(greater, 63); + // Copy lower mask to upper mask + // a_lo > b_lo + int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32); + // Compare for equality + int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask)); + // Copy upper mask to lower mask + // a_hi == b_hi + int64x2_t eq_hi = vshrq_n_s64(equal, 63); + // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi) + int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi)); + return vreinterpretq_m128i_s64(ret); +#endif +} + +// Compares the four 32-bit floats in a and b to check if any values are NaN. +// Ordered compare between each value returns true for "orderable" and false for +// "not orderable" (NaN). 
+// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see +// also: +// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean +// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics +FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) { + // Note: NEON does not have ordered compare builtin + // Need to compare a eq a and b eq b to check for NaN + // Do AND of results to get final + uint32x4_t ceqaa = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t ceqbb = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); +} + +// Compares for ordered. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100) +FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) { + return _mm_move_ss(a, _mm_cmpord_ps(a, b)); +} + +// Compares for unordered. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100) +FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) { + uint32x4_t f32a = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t f32b = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b))); +} + +// Compares for unordered. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100) +FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) { + return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a less than operation. : +// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important +// note!! The documentation on MSDN is incorrect! If either of the values is a +// NAN the docs say you will get a one, but in fact, it will return a zero!! 
+FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) { + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_lt_b = + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a greater than operation. : +// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx +FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) { + // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_gt_b = + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a less than or equal operation. : +// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx +FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) { + // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_le_b = + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 
1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a greater than or equal operation. : +// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx +FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) { + // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_ge_b = + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using an equality operation. : +// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx +FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) { + // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_eq_b = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using an inequality operation. 
: +// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx +FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) { + // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); + uint32x4_t a_neq_b = vmvnq_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); + return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0; +} + +// according to the documentation, these intrinsics behave the same as the +// non-'u' versions. We'll just alias them here. +#define _mm_ucomieq_ss _mm_comieq_ss +#define _mm_ucomige_ss _mm_comige_ss +#define _mm_ucomigt_ss _mm_comigt_ss +#define _mm_ucomile_ss _mm_comile_ss +#define _mm_ucomilt_ss _mm_comilt_ss +#define _mm_ucomineq_ss _mm_comineq_ss + +/* Conversions */ + +// Convert packed signed 32-bit integers in b to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, and copy the upper 2 packed elements from a to the upper elements of +// dst. +// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(b[63:32]) +// dst[95:64] := a[95:64] +// dst[127:96] := a[127:96] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps +FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) { + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. 
+// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss +FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) { + return vreinterpretq_m128_f32( + vsetq_lane_f32((float)b, vreinterpretq_f32_m128(a), 0)); +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss +#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) + +// Convert the signed 64-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// +// dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss +FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b) { + return vreinterpretq_m128_f32( + vsetq_lane_f32((float)b, vreinterpretq_f32_m128(a), 0)); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si +FORCE_INLINE int _mm_cvt_ss2si(__m128 a) { +#if defined(__aarch64__) + return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0); +#else + float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32_t diff = data - floor(data); + if (diff > 0.5) + return (int32_t)ceil(data); + if (unlikely(diff == 0.5)) { + int32_t f = (int32_t)floor(data); + int32_t c = (int32_t)ceil(data); + return c & 1 ? 
f : c; + } + return (int32_t)floor(data); +#endif +} + +// Convert packed 16-bit integers in a to packed single-precision (32-bit) +// floating-point elements, and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// m := j*32 +// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps +FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) { + return vreinterpretq_m128_f32( + vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a)))); +} + +// Convert packed 32-bit integers in b to packed single-precision (32-bit) +// floating-point elements, store the results in the lower 2 elements of dst, +// and copy the upper 2 packed elements from a to the upper elements of dst. +// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(b[63:32]) +// dst[95:64] := a[95:64] +// dst[127:96] := a[127:96] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps +FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) { + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, then convert the packed signed 32-bit integers in b to +// single-precision (32-bit) floating-point element, and store the results in +// the upper 2 elements of dst. 
+// +// dst[31:0] := Convert_Int32_To_FP32(a[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(a[63:32]) +// dst[95:64] := Convert_Int32_To_FP32(b[31:0]) +// dst[127:96] := Convert_Int32_To_FP32(b[63:32]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps +FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) { + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); +} + +// Convert the lower packed 8-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*8 +// m := j*32 +// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps +FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) { + return vreinterpretq_m128_f32( + vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); +} + +// Convert packed unsigned 16-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// m := j*32 +// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps +FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) { + return vreinterpretq_m128_f32( + vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); +} + +// Convert the lower packed unsigned 8-bit integers in a to packed +// single-precision (32-bit) floating-point elements, and store the results in +// dst. 
+// +// FOR j := 0 to 3 +// i := j*8 +// m := j*32 +// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps +FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) { + return vreinterpretq_m128_f32( + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); +} + +// Converts the four single-precision, floating-point values of a to signed +// 32-bit integer values using truncate. +// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) { + return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// +// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64 +FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) { +#if defined(__aarch64__) + return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); +#else + double ret = *((double *)&a); + return (int64_t)ret; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. 
+// +// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x +#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) + +// Converts the four signed 32-bit integer values of a to single-precision, +// floating-point values +// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) { + return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); +} + +// Convert packed signed 32-bit integers in a to packed double-precision +// (64-bit) floating-point elements, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*32 +// m := j*64 +// dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd +FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a) { +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))))); +#else + double a0 = (double)vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); + double a1 = (double)vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Convert packed signed 32-bit integers in a to packed double-precision +// (64-bit) floating-point elements, and store the results in dst. 
+// +// FOR j := 0 to 1 +// i := j*32 +// m := j*64 +// dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd +FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a) { +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a)))); +#else + double a0 = (double)vget_lane_s32(vreinterpret_s32_m64(a), 0); + double a1 = (double)vget_lane_s32(vreinterpret_s32_m64(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Converts the four unsigned 8-bit integers in the lower 16 bits to four +// unsigned 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) { + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + return vreinterpretq_m128i_u16(u16x8); +} + +// Converts the four unsigned 8-bit integers in the lower 32 bits to four +// unsigned 32-bit integers. +// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx +FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) { + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_u32(u32x4); +} + +// Converts the two unsigned 8-bit integers in the lower 16 bits to two +// unsigned 64-bit integers. 
+FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) { + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Converts the four unsigned 8-bit integers in the lower 16 bits to four +// unsigned 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) { + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + return vreinterpretq_m128i_s16(s16x8); +} + +// Converts the four unsigned 8-bit integers in the lower 32 bits to four +// unsigned 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) { + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_s32(s32x4); +} + +// Converts the two signed 8-bit integers in the lower 32 bits to four +// signed 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) { + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ + int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_s64(s64x2); +} + +// Converts the four signed 16-bit integers in the lower 64 bits to four signed +// 32-bit integers. 
+FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) { + return vreinterpretq_m128i_s32( + vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a)))); +} + +// Converts the two signed 16-bit integers in the lower 32 bits to two signed +// 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) { + int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ + int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_s64(s64x2); +} + +// Converts the four unsigned 16-bit integers in the lower 64 bits to four +// unsigned 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) { + return vreinterpretq_m128i_u32( + vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a)))); +} + +// Converts the two unsigned 16-bit integers in the lower 32 bits to two +// unsigned 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) { + uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Converts the two unsigned 32-bit integers in the lower 64 bits to two +// unsigned 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) { + return vreinterpretq_m128i_u64( + vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a)))); +} + +// Converts the two signed 32-bit integers in the lower 64 bits to two signed +// 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) { + return vreinterpretq_m128i_s64( + vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))); +} + +// Converts the four single-precision, floating-point values of a to signed +// 32-bit integer values. 
+// +// r0 := (int) a0 +// r1 := (int) a1 +// r2 := (int) a2 +// r3 := (int) a3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx +// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A +// does not support! It is supported on ARMv8-A however. +FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) { +#if defined(__aarch64__) + return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); +#else + uint32x4_t signmask = vdupq_n_u32(0x80000000); + float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), + vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( + vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = + vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + int32x4_t plusone = vreinterpretq_s32_u32( + vshrq_n_u32(vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ + int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), + vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + float32x4_t delta = + vsubq_f32(vreinterpretq_f32_m128(a), + vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ + return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal)); +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 16-bit integers, and store the results in dst. Note: this intrinsic +// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and +// 0x7FFFFFFF. +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16 +FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) { + return vreinterpret_m64_s16( + vmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a)))); +} + +// Copy the lower 32-bit integer in a to dst. 
+// +// dst[31:0] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32 +FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) { + return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64 +FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) { + return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x +#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) + +// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, +// zero extending the upper bits. +// +// r0 := a +// r1 := 0x0 +// r2 := 0x0 +// r3 := 0x0 +// +// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) { + return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); +} + +// Moves 64-bit integer a to the least significant 64 bits of an __m128 object, +// zero extending the upper bits. +// +// r0 := a +// r1 := 0x0 +FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) { + return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0)); +} + +// Cast vector of type __m128 to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd +FORCE_INLINE __m128d _mm_castps_pd(__m128 a) { + return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a)); +} + +// Applies a type cast to reinterpret four 32-bit floating point values passed +// in as a 128-bit parameter as packed 32-bit integers. 
+// https://msdn.microsoft.com/en-us/library/bb514099.aspx +FORCE_INLINE __m128i _mm_castps_si128(__m128 a) { + return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a)); +} + +// Cast vector of type __m128i to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd +FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) { +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a)); +#else + return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a)); +#endif +} + +// Applies a type cast to reinterpret four 32-bit integers passed in as a +// 128-bit parameter as packed 32-bit floating point values. +// https://msdn.microsoft.com/en-us/library/bb514029.aspx +FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) { + return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a)); +} + +// Loads 128-bit value. : +// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx +FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) { + return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *)p)); +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd +FORCE_INLINE __m128d _mm_load1_pd(const double *p) { +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vld1q_dup_f64(p)); +#else + return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *)p)); +#endif +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. 
+// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1 +#define _mm_load_pd1 _mm_load1_pd + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd +#define _mm_loaddup_pd _mm_load1_pd + +// Load a double-precision (64-bit) floating-point element from memory into the +// upper element of dst, and copy the lower element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. +// +// dst[63:0] := a[63:0] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd +FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) { +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); +#else + return vreinterpretq_m128d_f32(vcombine_f32( + vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *)p))); +#endif +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1 +#define _mm_load_pd1 _mm_load1_pd + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd +#define _mm_loaddup_pd _mm_load1_pd + +// Loads 128-bit value. 
: +// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx +FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) { + return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *)p)); +} + +// Load unaligned 32-bit integer from memory into the first element of dst. +// +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[MAX:32] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32 +FORCE_INLINE __m128i _mm_loadu_si32(const void *p) { + return vreinterpretq_m128i_s32( + vsetq_lane_s32(*(const int32_t *)p, vdupq_n_s32(0), 0)); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed single-precision (32-bit) floating-point elements, and store the +// results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// k := 64*j +// dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k]) +// ENDFOR +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps +FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) { +#if defined(__aarch64__) + float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); + return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); +#else + float a0 = (float)((double *)&a)[0]; + float a1 = (float)((double *)&a)[1]; + return _mm_set_ps(0, 0, a1, a0); +#endif +} + +// Copy the lower double-precision (64-bit) floating-point element of a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64 +FORCE_INLINE double _mm_cvtsd_f64(__m128d a) { +#if defined(__aarch64__) + return (double)vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); +#else + return ((double *)&a)[0]; +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed double-precision (64-bit) floating-point elements, and store the +// results in dst. 
+// +// FOR j := 0 to 1 +// i := 64*j +// k := 32*j +// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd +FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) { +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a)))); +#else + double a0 = (double)vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + double a1 = (double)vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Cast vector of type __m128d to type __m128i. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128 +FORCE_INLINE __m128i _mm_castpd_si128(__m128d a) { + return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a)); +} + +// Cast vector of type __m128d to type __m128. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps +FORCE_INLINE __m128 _mm_castpd_ps(__m128d a) { + return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a)); +} + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps +FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) { + // Use a signed shift right to create a mask with the sign bit + uint32x4_t mask = + vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31)); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps +FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) { + const uint32_t ALIGN_STRUCT(16) + data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, + ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; + uint32x4_t mask = vld1q_u32(data); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using mask, and store the results in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd +FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) { + uint64x2_t mask = + vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63)); +#if defined(__aarch64__) + float64x2_t a = vreinterpretq_f64_m128d(_a); + float64x2_t b = vreinterpretq_f64_m128d(_b); + return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a)); +#else + uint64x2_t a = vreinterpretq_u64_m128d(_a); + uint64x2_t b = vreinterpretq_u64_m128d(_b); + return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a)); +#endif +} + +typedef struct { + uint16_t res0; + uint8_t res1 : 6; + uint8_t bit22 : 1; + uint8_t bit23 : 1; + uint8_t res2; +#if defined(__aarch64__) + uint32_t res3; +#endif +} fpcr_bitfield; + +// Macro: Set the rounding mode bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. The rounding mode may contain any of +// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, +// _MM_ROUND_TOWARD_ZERO +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE +FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) { + union { + fpcr_bitfield field; +#if defined(__aarch64__) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) + asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */ +#else + asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + switch (rounding) { + case _MM_ROUND_TOWARD_ZERO: + r.field.bit22 = 1; + r.field.bit23 = 1; + break; + case _MM_ROUND_DOWN: + r.field.bit22 = 0; + r.field.bit23 = 1; + break; + case _MM_ROUND_UP: + r.field.bit22 = 1; + r.field.bit23 = 0; + break; + default: //_MM_ROUND_NEAREST + r.field.bit22 = 0; + r.field.bit23 = 0; + } + +#if defined(__aarch64__) + asm volatile("msr FPCR, %0" ::"r"(r)); /* write */ +#else + asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */ +#endif +} + +FORCE_INLINE void 
_mm_setcsr(unsigned int a) { _MM_SET_ROUNDING_MODE(a); } + +// Round the packed single-precision (32-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed single-precision +// floating-point elements in dst. +// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps +FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) { +#if defined(__aarch64__) + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); + default: //_MM_FROUND_CUR_DIRECTION + return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); + } +#else + float *v_float = (float *)&a; + __m128 zero, neg_inf, pos_inf; + + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return _mm_cvtepi32_ps(_mm_cvtps_epi32(a)); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return (__m128){floorf(v_float[0]), floorf(v_float[1]), floorf(v_float[2]), + floorf(v_float[3])}; + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]), + ceilf(v_float[3])}; + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f); + neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]), + floorf(v_float[2]), floorf(v_float[3])); + pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]), + ceilf(v_float[2]), ceilf(v_float[3])); + return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero)); + default: //_MM_FROUND_CUR_DIRECTION + return (__m128){roundf(v_float[0]), 
roundf(v_float[1]), roundf(v_float[2]), + roundf(v_float[3])}; + } +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi +FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) { +#if defined(__aarch64__) + return vreinterpret_m64_s32( + vget_low_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)))); +#else + return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(vreinterpretq_f32_m128( + _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))))); +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32 +#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) + +// Round the packed single-precision (32-bit) floating-point elements in a up to +// an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps +FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) { + return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +} + +// Round the lower single-precision (32-bit) floating-point element in b up to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. 
+// +// dst[31:0] := CEIL(b[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss +FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b) { + return _mm_move_ss( + a, _mm_round_ps(b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); +} + +// Round the packed single-precision (32-bit) floating-point elements in a down +// to an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps +FORCE_INLINE __m128 _mm_floor_ps(__m128 a) { + return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +} + +// Round the lower single-precision (32-bit) floating-point element in b down to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. +// +// dst[31:0] := FLOOR(b[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss +FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) { + return _mm_move_ss( + a, _mm_round_ps(b, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); +} + +// Load 128-bits of integer data from unaligned memory into dst. This intrinsic +// may perform better than _mm_loadu_si128 when the data crosses a cache line +// boundary. +// +// dst[127:0] := MEM[mem_addr+127:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128 +#define _mm_lddqu_si128 _mm_loadu_si128 + +/* Miscellaneous Operations */ + +// Shifts the 8 signed 16-bit integers in a right by count bits while shifting +// in the sign bit. +// +// r0 := a0 >> count +// r1 := a1 >> count +// ... 
+// r7 := a7 >> count +// +// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) { + int64_t c = (int64_t)vget_low_s64((int64x2_t)count); + if (unlikely(c > 15)) + return _mm_cmplt_epi16(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t)a, vdupq_n_s16(-c))); +} + +// Shifts the 4 signed 32-bit integers in a right by count bits while shifting +// in the sign bit. +// +// r0 := a0 >> count +// r1 := a1 >> count +// r2 := a2 >> count +// r3 := a3 >> count +// +// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx +FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) { + int64_t c = (int64_t)vget_low_s64((int64x2_t)count); + if (unlikely(c > 31)) + return _mm_cmplt_epi32(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t)a, vdupq_n_s32(-c))); +} + +// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and +// saturates. +// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) { + return vreinterpretq_m128i_s8( + vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), + vqmovn_s16(vreinterpretq_s16_m128i(b)))); +} + +// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned +// integers and saturates. +// +// r0 := UnsignedSaturate(a0) +// r1 := UnsignedSaturate(a1) +// ... +// r7 := UnsignedSaturate(a7) +// r8 := UnsignedSaturate(b0) +// r9 := UnsignedSaturate(b1) +// ... +// r15 := UnsignedSaturate(b7) +// +// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx +FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) { + return vreinterpretq_m128i_u8( + vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), + vqmovun_s16(vreinterpretq_s16_m128i(b)))); +} + +// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers +// and saturates. 
//
// r0 := SignedSaturate(a0)
// r1 := SignedSaturate(a1)
// r2 := SignedSaturate(a2)
// r3 := SignedSaturate(a3)
// r4 := SignedSaturate(b0)
// r5 := SignedSaturate(b1)
// r6 := SignedSaturate(b2)
// r7 := SignedSaturate(b3)
//
// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) {
    // vqmovn narrows each 32-bit lane to 16 bits with signed saturation.
    return vreinterpretq_m128i_s16(
        vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
                     vqmovn_s32(vreinterpretq_s32_m128i(b))));
}

// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
// integers and saturates.
//
// r0 := UnsignedSaturate(a0)
// r1 := UnsignedSaturate(a1)
// r2 := UnsignedSaturate(a2)
// r3 := UnsignedSaturate(a3)
// r4 := UnsignedSaturate(b0)
// r5 := UnsignedSaturate(b1)
// r6 := UnsignedSaturate(b2)
// r7 := UnsignedSaturate(b3)
FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) {
    // vqmovun treats the input as signed and clamps to [0, 65535], matching
    // the x86 PACKUSDW semantics (negative lanes become 0).
    return vreinterpretq_m128i_u16(
        vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
                     vqmovun_s32(vreinterpretq_s32_m128i(b))));
}

// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
// 8 signed or unsigned 8-bit integers in b.
//
// r0 := a0
// r1 := b0
// r2 := a1
// r3 := b1
// ...
// r14 := a7
// r15 := b7
//
// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) {
#if defined(__aarch64__)
    return vreinterpretq_m128i_s8(
        vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
#else
    // ARMv7 lacks vzip1q; zip the low halves and recombine both result halves.
    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
    int8x8x2_t result = vzip_s8(a1, b1);
    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
#endif
}

// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
// lower 4 signed or unsigned 16-bit integers in b.
//
// r0 := a0
// r1 := b0
// r2 := a1
// r3 := b1
// r4 := a2
// r5 := b2
// r6 := a3
// r7 := b3
//
// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) {
#if defined(__aarch64__)
    return vreinterpretq_m128i_s16(
        vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
#else
    // ARMv7 fallback: zip the 64-bit low halves, then recombine.
    int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
    int16x4x2_t result = vzip_s16(a1, b1);
    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
#endif
}

// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the
// lower 2 signed or unsigned 32 - bit integers in b.
//
// r0 := a0
// r1 := b0
// r2 := a1
// r3 := b1
//
// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) {
#if defined(__aarch64__)
    return vreinterpretq_m128i_s32(
        vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
#else
    int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
    int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
    int32x2x2_t result = vzip_s32(a1, b1);
    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
#endif
}

// Interleaves the lower signed or unsigned 64-bit integer in a with the lower
// signed or unsigned 64-bit integer in b.
//
// r0 := a0
// r1 := b0
FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) {
    int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
    int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
    return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
}

// Selects and interleaves the lower two single-precision, floating-point values
// from a and b.
//
// r0 := a0
// r1 := b0
// r2 := a1
// r3 := b1
//
// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) {
#if defined(__aarch64__)
    return vreinterpretq_m128_f32(
        vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    // ARMv7 fallback: zip the low 64-bit halves, then recombine.
    float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
    float32x2x2_t result = vzip_f32(a1, b1);
    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
#endif
}

// Unpack and interleave double-precision (64-bit) floating-point elements from
// the low half of a and b, and store the results in dst.
//
// DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
//   dst[63:0] := src1[63:0]
//   dst[127:64] := src2[63:0]
//   RETURN dst[127:0]
// }
// dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd
FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) {
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // Pure 64-bit lane moves, so the integer view is bit-exact for doubles.
    return vreinterpretq_m128d_s64(
        vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
                     vget_low_s64(vreinterpretq_s64_m128d(b))));
#endif
}

// Unpack and interleave double-precision (64-bit) floating-point elements from
// the high half of a and b, and store the results in dst.
//
// DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
//   dst[63:0] := src1[127:64]
//   dst[127:64] := src2[127:64]
//   RETURN dst[127:0]
// }
// dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd
FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) {
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // Pure 64-bit lane moves, so the integer view is bit-exact for doubles.
    return vreinterpretq_m128d_s64(
        vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
                     vget_high_s64(vreinterpretq_s64_m128d(b))));
#endif
}

// Selects and interleaves the upper two single-precision, floating-point values
// from a and b.
//
// r0 := a2
// r1 := b2
// r2 := a3
// r3 := b3
//
// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) {
#if defined(__aarch64__)
    return vreinterpretq_m128_f32(
        vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
    float32x2x2_t result = vzip_f32(a1, b1);
    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
#endif
}

// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
// 8 signed or unsigned 8-bit integers in b.
//
// r0 := a8
// r1 := b8
// r2 := a9
// r3 := b9
// ...
// r14 := a15
// r15 := b15
//
// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) {
#if defined(__aarch64__)
    return vreinterpretq_m128i_s8(
        vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
#else
    // ARMv7 fallback: zip the high 64-bit halves, then recombine.
    int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
    int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
    int8x8x2_t result = vzip_s8(a1, b1);
    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
#endif
}

// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
// upper 4 signed or unsigned 16-bit integers in b.
//
// r0 := a4
// r1 := b4
// r2 := a5
// r3 := b5
// r4 := a6
// r5 := b6
// r6 := a7
// r7 := b7
//
// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) {
#if defined(__aarch64__)
    return vreinterpretq_m128i_s16(
        vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
#else
    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
    int16x4x2_t result = vzip_s16(a1, b1);
    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
#endif
}

// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
// upper 2 signed or unsigned 32-bit integers in b.
// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) {
#if defined(__aarch64__)
    return vreinterpretq_m128i_s32(
        vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
#else
    // ARMv7 fallback: zip the high 64-bit halves, then recombine.
    int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
    int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
    int32x2x2_t result = vzip_s32(a1, b1);
    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
#endif
}

// Interleaves the upper signed or unsigned 64-bit integer in a with the
// upper signed or unsigned 64-bit integer in b.
//
// r0 := a1
// r1 := b1
FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) {
    int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
    int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
    return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
}

// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
// in a, store the minimum and index in dst, and zero the remaining bits in dst.
//
// index[2:0] := 0
// min[15:0] := a[15:0]
// FOR j := 0 to 7
//   i := j*16
//   IF a[i+15:i] < min[15:0]
//     index[2:0] := j
//     min[15:0] := a[i+15:i]
//   FI
// ENDFOR
// dst[15:0] := min[15:0]
// dst[18:16] := index[2:0]
// dst[127:19] := 0
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) {
    __m128i dst;
    uint16_t min, idx = 0;
    // Find the minimum value
#if defined(__aarch64__)
    min = vminvq_u16(vreinterpretq_u16_m128i(a));
#else
    // ARMv7 has no across-vector minimum; reduce with pairwise mins instead.
    __m64 tmp;
    tmp =
        vreinterpret_m64_u16(vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
                                      vget_high_u16(vreinterpretq_u16_m128i(a))));
    tmp = vreinterpret_m64_u16(
        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
    tmp = vreinterpret_m64_u16(
        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
    min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
#endif
    // Get the index of the minimum value
    int i;
    for (i = 0; i < 8; i++) {
        // Lane 0 is re-tested each pass: a is shifted right by one 16-bit
        // element per iteration, so the loop scans lanes in ascending order
        // and therefore returns the LOWEST matching index, as x86 requires.
        if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
            idx = (uint16_t)i;
            break;
        }
        a = _mm_srli_si128(a, 2);
    }
    // Generate result
    dst = _mm_setzero_si128();
    dst = vreinterpretq_m128i_u16(
        vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
    dst = vreinterpretq_m128i_u16(
        vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
    return dst;
}

// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
// otherwise set CF to 0. Return the CF value.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) {
    // CF = ((~a) & b) == 0
    int64x2_t s64 =
        vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
                  vreinterpretq_s64_m128i(b));
    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
}

// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
// otherwise set CF to 0. Return the ZF value.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) {
    // ZF = (a & b) == 0
    int64x2_t s64 =
        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
}

// Extracts the selected signed or unsigned 8-bit integer from a and zero
// extends. imm must be a compile-time constant lane index.
// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))

// Inserts the least significant 8 bits of b into the selected 8-bit integer
// of a.
// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
//                                      __constrange(0,16) int imm)
#define _mm_insert_epi8(a, b, imm)                                 \
    __extension__({                                                \
        vreinterpretq_m128i_s8(                                    \
            vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
    })

// Extracts the selected signed or unsigned 16-bit integer from a and zero
// extends.
// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
#define _mm_extract_epi16(a, imm) \
    vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))

// Extract a 16-bit integer from a, selected with imm8, and store the result in
// the lower element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16
#define _mm_extract_pi16(a, imm) \
    (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))

// Inserts the least significant 16 bits of b into the selected 16-bit integer
// of a.
// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
//                                       __constrange(0,8) int imm)
#define _mm_insert_epi16(a, b, imm)                                  \
    __extension__({                                                  \
        vreinterpretq_m128i_s16(                                     \
            vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
    })

// Copy a to dst, and insert the 16-bit integer i into dst at the location
// specified by imm8.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
#define _mm_insert_pi16(a, b, imm)                                             \
    __extension__({                                                            \
        vreinterpret_m64_s16(vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
    })

// Extracts the selected signed or unsigned 32-bit integer from a and zero
// extends.
// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
#define _mm_extract_epi32(a, imm) \
    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))

// Extracts the selected single-precision (32-bit) floating-point from a.
// Note: returns the raw bit pattern of the lane as an int, per EXTRACTPS.
// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))

// Inserts the least significant 32 bits of b into the selected 32-bit integer
// of a.
// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
//                                       __constrange(0,4) int imm)
#define _mm_insert_epi32(a, b, imm)                                  \
    __extension__({                                                  \
        vreinterpretq_m128i_s32(                                     \
            vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
    })

// Extracts the selected signed or unsigned 64-bit integer from a and zero
// extends.
+// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) +#define _mm_extract_epi64(a, imm) \ + vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) + +// Inserts the least significant 64 bits of b into the selected 64-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, +// __constrange(0,2) int imm) +#define _mm_insert_epi64(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s64( \ + vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \ + }) + +// Count the number of bits set to 1 in unsigned 32-bit integer a, and +// return that count in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32 +FORCE_INLINE int _mm_popcnt_u32(unsigned int a) { +#if defined(__aarch64__) +#if __has_builtin(__builtin_popcount) + return __builtin_popcount(a); +#else + return (int)vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t)a))); +#endif +#else + uint32_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + + input_val = vld1_u8((uint8_t *)&a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + + vst1_u32(&count, count32x2_val); + return count; +#endif +} + +// Count the number of bits set to 1 in unsigned 64-bit integer a, and +// return that count in dst. 
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) {
#if defined(__aarch64__)
#if __has_builtin(__builtin_popcountll)
    return __builtin_popcountll(a);
#else
    return (int64_t)vaddlv_u8(vcnt_u8(vcreate_u8(a)));
#endif
#else
    // ARMv7: per-byte popcount (vcnt) followed by pairwise widening adds to
    // fold the 8 byte-counts into a single 64-bit total.
    uint64_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;
    uint64x1_t count64x1_val;

    input_val = vld1_u8((uint8_t *)&a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);
    count64x1_val = vpaddl_u32(count32x2_val);
    vst1_u64(&count, count64x1_val);
    return count;
#endif
}

// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
// transposed matrix in these vectors (row0 now contains column 0, etc.).
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                              \
    do {                                                                       \
        float32x4x2_t ROW01 = vtrnq_f32(row0, row1);                           \
        float32x4x2_t ROW23 = vtrnq_f32(row2, row3);                           \
        row0 =                                                                 \
            vcombine_f32(vget_low_f32(ROW01.val[0]), vget_low_f32(ROW23.val[0])); \
        row1 =                                                                 \
            vcombine_f32(vget_low_f32(ROW01.val[1]), vget_low_f32(ROW23.val[1])); \
        row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),                       \
                            vget_high_f32(ROW23.val[0]));                      \
        row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),                       \
                            vget_high_f32(ROW23.val[1]));                      \
    } while (0)

/* Crypto Extensions */

#if defined(__ARM_FEATURE_CRYPTO)
// Wraps vmull_p64
FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) {
    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
    return vreinterpretq_u64_p128(vmull_p64(a, b));
}
#else  // ARMv7 polyfill
// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
//
// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
// 64-bit->128-bit polynomial multiply.
//
// It needs some work and is somewhat slow, but it is still faster than all
// known scalar methods.
//
// Algorithm adapted to C from
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
// from "Fast Software Polynomial Multiplication on ARM Processors Using the
// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
// (https://hal.inria.fr/hal-01506572)
static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) {
    poly8x8_t a = vreinterpret_p8_u64(_a);
    poly8x8_t b = vreinterpret_p8_u64(_b);

    // Masks
    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
                                    vcreate_u8(0x00000000ffffffff));
    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
                                    vcreate_u8(0x0000000000000000));

    // Do the multiplies, rotating with vext to get all combinations
    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
    uint8x16_t e =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
    uint8x16_t f =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
    uint8x16_t g =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
    uint8x16_t h =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
    uint8x16_t i =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
    uint8x16_t j =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
    uint8x16_t k =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // L = A0 * B4

    // Add cross products
    uint8x16_t l = veorq_u8(e, f);  // L = E + F
    uint8x16_t m = veorq_u8(g, h);  // M = G + H
    uint8x16_t n = veorq_u8(i, j);  // N = I + J

    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
    // instructions.
#if defined(__aarch64__)
    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
#else
    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
#endif
    // t0 = (L) (P0 + P1) << 8
    // t1 = (M) (P2 + P3) << 16
    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);

    // t2 = (N) (P4 + P5) << 24
    // t3 = (K) (P6 + P7) << 32
    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);

    // De-interleave
#if defined(__aarch64__)
    uint8x16_t t0 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t1 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t2 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
    uint8x16_t t3 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
#else
    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
#endif
    // Shift the cross products
    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32

    // Accumulate the products
    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
    uint8x16_t mix = veorq_u8(d, cross1);
    uint8x16_t r = veorq_u8(mix, cross2);
    return vreinterpretq_u64_u8(r);
}
#endif  // ARMv7 polyfill

// Perform a carry-less multiplication of two 64-bit integers, selected from a
// and b according to imm8, and store the results in dst.
// Only bits 0 and 4 of imm select operands, matching PCLMULQDQ.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128
FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b,
                                          const int imm) {
    uint64x2_t a = vreinterpretq_u64_m128i(_a);
    uint64x2_t b = vreinterpretq_u64_m128i(_b);
    switch (imm & 0x11) {
    case 0x00:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
    case 0x01:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
    case 0x10:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
    case 0x11:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
    default:
        abort();
    }
}

#if !defined(__ARM_FEATURE_CRYPTO)
/* clang-format off */
// The standard AES forward S-box, expressed as 256 applications of a
// caller-supplied macro w() so the same data can generate both the plain
// byte table and the premultiplied round tables below.
#define SSE2NEON_AES_DATA(w)                                                   \
    {                                                                          \
        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f),         \
            w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7),     \
            w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa),     \
            w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf),     \
            w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93),     \
            w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5),     \
            w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04),     \
            w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),     \
            w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2),     \
            w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e),     \
            w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29),     \
            w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed),     \
            w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe),     \
            w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef),     \
            w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45),     \
            w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),     \
            w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38),     \
            w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff),     \
            w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f),     \
            w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d),     \
            w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f),     \
            w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee),     \
            w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0),     \
            w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),     \
            w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4),     \
            w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5),     \
            w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65),     \
            w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e),     \
            w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74),     \
            w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e),     \
            w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61),     \
            w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),     \
            w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e),     \
            w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55),     \
            w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf),     \
            w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f),     \
            w(0xb0), w(0x54), w(0xbb), w(0x16)                                 \
    }
/* clang-format on */

/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
#define SSE2NEON_AES_H0(x) (x)
static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
#undef SSE2NEON_AES_H0

// In the absence of crypto extensions, implement aesenc using regular neon
// intrinsics instead. See:
// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
// for more information Reproduced with permission of the author.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey) {
#if defined(__aarch64__)
    static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
                                         0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb};
    static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
                                       0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);

    // shift rows
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

    // sub bytes
    // TBL/TBX cover 64 table bytes at a time; the (w - 0x40) rebasing selects
    // the next 64-entry slice of the S-box for each lookup.
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);

    // mix columns
    w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b);
    w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

    // add round key
    return vreinterpretq_m128i_u8(w) ^ RoundKey;

#else /* ARMv7-A NEON implementation */
#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                                       \
    (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
     (b0))
#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
#define SSE2NEON_AES_U0(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
#define SSE2NEON_AES_U1(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
#define SSE2NEON_AES_U2(p) \
    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
#define SSE2NEON_AES_U3(p) \
    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
        SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
        SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
        SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
        SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
    };
#undef SSE2NEON_AES_B2W
#undef SSE2NEON_AES_F2
#undef SSE2NEON_AES_F3
#undef SSE2NEON_AES_U0
#undef SSE2NEON_AES_U1
#undef SSE2NEON_AES_U2
#undef SSE2NEON_AES_U3

    uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
    uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
    uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
    uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));

    __m128i out =
        _mm_set_epi32((aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
                       aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
                      (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
                       aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
                      (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
                       aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
                      (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
                       aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));

    return _mm_xor_si128(out, RoundKey);
#endif
}

// Perform the last round of an AES encryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) {
    /* FIXME: optimized for NEON */
    // The last AES round omits MixColumns: the index pattern below performs
    // ShiftRows and SubBytes together via S-box lookups, then AddRoundKey.
    uint8_t v[4][4] = {
        [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
        [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
        [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
        [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
    };
    for (int i = 0; i < 16; i++)
        vreinterpretq_nth_u8_m128i(a, i) =
            v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
    return a;
}

// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
// This instruction generates a round key for AES encryption. See
// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
// for details.
+// +// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon) { + uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)); + uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)); + for (int i = 0; i < 4; ++i) { + ((uint8_t *)&X1)[i] = SSE2NEON_sbox[((uint8_t *)&X1)[i]]; + ((uint8_t *)&X3)[i] = SSE2NEON_sbox[((uint8_t *)&X3)[i]]; + } + return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, + ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); +} +#undef SSE2NEON_AES_DATA + +#else /* __ARM_FEATURE_CRYPTO */ +// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and +// AESMC and then manually applying the real key as an xor operation. This +// unfortunately means an additional xor op; the compiler should be able to +// optimize this away for repeated calls however. See +// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a +// for more details. +FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) { + return vreinterpretq_m128i_u8( + vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^ + vreinterpretq_u8_m128i(b)); +} + +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) { + return _mm_xor_si128(vreinterpretq_m128i_u8( + vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + RoundKey); +} + +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) { + // AESE does ShiftRows and SubBytes on A + uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); + + uint8x16_t dest = { + // Undo ShiftRows step from AESE and extract X1 and X3 + u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) + u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) + u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) + u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) + }; + uint32x4_t 
r = {0, (unsigned)rcon, 0, (unsigned)rcon}; + return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); +} +#endif + +/* Streaming Extensions */ + +// Guarantees that every preceding store is globally visible before any +// subsequent store. +// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx +FORCE_INLINE void _mm_sfence(void) { __sync_synchronize(); } + +// Store 64-bits of integer data from a into memory using a non-temporal memory +// hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi +FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a) { + vst1_s64((int64_t *)p, vreinterpret_s64_m64(a)); +} + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating- +// point elements) from a into memory using a non-temporal memory hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps +FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) { +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (float32x4_t *)p); +#else + vst1q_f32(p, vreinterpretq_f32_m128(a)); +#endif +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory using a non-temporal memory hint. mem_addr must +// be aligned on a 16-byte boundary or a general-protection exception may be +// generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd +FORCE_INLINE void _mm_stream_pd(double *p, __m128d a) { +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (float32x4_t *)p); +#elif defined(__aarch64__) + vst1q_f64(p, vreinterpretq_f64_m128d(a)); +#else + vst1q_s64((int64_t *)p, vreinterpretq_s64_m128d(a)); +#endif +} + +// Stores the data in a to the address p without polluting the caches. If the +// cache line containing address p is already in the cache, the cache will be +// updated. 
+// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx +FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) { +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, p); +#else + vst1q_s64((int64_t *)p, vreinterpretq_s64_m128i(a)); +#endif +} + +// Store 32-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32 +FORCE_INLINE void _mm_stream_si32(int *p, int a) { + vst1q_lane_s32((int32_t *)p, vdupq_n_s32(a), 0); +} + +// Load 128-bits of integer data from memory into dst using a non-temporal +// memory hint. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// dst[127:0] := MEM[mem_addr+127:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128 +FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) { +#if __has_builtin(__builtin_nontemporal_store) + return __builtin_nontemporal_load(p); +#else + return vreinterpretq_m128i_s64(vld1q_s64((int64_t *)p)); +#endif +} + +// Cache line containing p is flushed and invalidated from all caches in the +// coherency domain. : +// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx +FORCE_INLINE void _mm_clflush(void const *p) { + (void)p; + // no corollary for Neon? +} + +// Allocate aligned blocks of memory. 
+// https://software.intel.com/en-us/ +// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks +FORCE_INLINE void *_mm_malloc(size_t size, size_t align) { + void *ptr; + if (align == 1) + return malloc(size); + if (align == 2 || (sizeof(void *) == 8 && align == 4)) + align = sizeof(void *); + if (!posix_memalign(&ptr, align, size)) + return ptr; + return NULL; +} + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64 +FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr) { + int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7); + __m128 b = _mm_load_ps((const float *)mem_addr); + int8x8_t masked = + vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a), + vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b)))); + vst1_s8((int8_t *)mem_addr, masked); +} + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq +#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr) + +// Free aligned memory that was allocated with _mm_malloc. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free +FORCE_INLINE void _mm_free(void *addr) { free(addr); } + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 8-bit integer v. 
+// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) { +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc ^= v; + for (int bit = 0; bit < 8; bit++) { + if (crc & 1) + crc = (crc >> 1) ^ UINT32_C(0x82f63b78); + else + crc = (crc >> 1); + } +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 16-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) { +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u8(crc, v & 0xff); + crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 32-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) { +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u16(crc, v & 0xffff); + crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 64-bit integer v. 
+// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100) +FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) { +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff); + crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff); +#endif + return crc; +} + +#if defined(__GNUC__) || defined(__clang__) +#pragma pop_macro("ALIGN_STRUCT") +#pragma pop_macro("FORCE_INLINE") +#endif + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC pop_options +#endif + +#endif diff --git a/sysinfos.c b/sysinfos.c index cf8fb8f..1eb426c 100644 --- a/sysinfos.c +++ b/sysinfos.c @@ -1,8 +1,13 @@ +#if !defined(SYSINFOS_C__) +#define SYSINFOS_C__ + /** * Unit to read cpu informations * * tpruvot 2014 - */ + * JayDDee 2019 + * +*/ #include #include @@ -13,14 +18,48 @@ #ifndef WIN32 +// 1035g1: /sys/devices/platform/coretemp.0/hwmon/hwmon3/temp1_input +// 1035g1: /sys/class/hwmon/hwmon1/temp1_input wrong temp +// ryzen has no /sys/devices/platform/coretemp.0 +// ryzen: /sys/class/hwmon/hwmon0 +// 2400: /sys/class/hwmon/hwmon0/temp1_input incorrect temp +// 2400 has no /sys/class/hwmon/hwmon2/temp1_input +// 2400 /sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input ok +// 6700 /sys/devices/platform/coretemp.0/hwmon/hwmon2/temp1_input +// 6700 /sys/class/hwmon/hwmon2/temp1_input +// /sys/devices/platform/coretemp.0/hwmon/hwmon0/temp2_input never exists +// /sys/class/hwmon/hwmon0/temp2_input doesn't exist or shows wrong temp (sys16) +// /sys/class/hwmon/hwmon0/device/temp1_input doesn't exist + + +// the first 3 will find i5-2400, i7-6700k, r7-1700, i5-1035g1. +// The others are left in for legacy, some should probably be removed. 
+#define HWMON_PATH1 \ + "/sys/devices/platform/coretemp.0/hwmon/hwmon3/temp1_input" + +#define HWMON_PATH2 \ + "/sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input" + +#define HWMON_PATH3 \ + "/sys/devices/platform/coretemp.0/hwmon/hwmon2/temp1_input" + #define HWMON_PATH \ "/sys/class/hwmon/hwmon2/temp1_input" + +// need this for Ryzen #define HWMON_ALT \ "/sys/class/hwmon/hwmon0/temp1_input" + +/* #define HWMON_ALT1 \ "/sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input" +*/ + +// This shows wrong temp on i5-1035g1 #define HWMON_ALT2 \ "/sys/class/hwmon/hwmon1/temp1_input" + +// None of these work on any of the cpus above. #define HWMON_ALT3 \ "/sys/devices/platform/coretemp.0/hwmon/hwmon0/temp2_input" #define HWMON_ALT4 \ @@ -28,90 +67,114 @@ #define HWMON_ALT5 \ "/sys/class/hwmon/hwmon0/device/temp1_input" -static float linux_cputemp(int core) +static inline float linux_cputemp(int core) { float tc = 0.0; - FILE *fd = fopen(HWMON_PATH, "r"); + FILE *fd; uint32_t val = 0; - if (!fd) - fd = fopen(HWMON_ALT, "r"); + fd = fopen(HWMON_PATH1, "r"); - if (!fd) - fd = fopen(HWMON_ALT2, "r"); + if (!fd) + fd = fopen(HWMON_PATH2, "r"); - if (!fd) - fd = fopen(HWMON_ALT3, "r"); - - if (!fd) - fd = fopen(HWMON_ALT4, "r"); + if (!fd) + fd = fopen(HWMON_PATH3, "r"); - if (!fd) - fd = fopen(HWMON_ALT5, "r"); + if (!fd) + fd = fopen(HWMON_PATH, "r"); + if (!fd) + fd = fopen(HWMON_ALT, "r"); + if (!fd) return tc; - if (fscanf(fd, "%d", &val)) + if ( fscanf( fd, "%d", &val ) ) tc = val / 1000.0; - fclose(fd); + fclose( fd ); return tc; } -#define CPUFREQ_PATH \ - "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq" -static uint32_t linux_cpufreq(int core) + +#define CPUFREQ_PATH0\ + "/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq" + +#define CPUFREQ_PATHn \ + "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_cur_freq" + +static inline float linux_cpufreq(int core) { - FILE *fd = fopen(CPUFREQ_PATH, "r"); - uint32_t freq = 0; + FILE *fd = fopen( CPUFREQ_PATH0, 
"r" ); + long int freq = 0; - if (!fd) - return freq; + if ( !fd ) return (float)freq; + if ( !fscanf( fd, "%ld", &freq ) ) freq = 0; + fclose( fd ); + return (float)freq; +} - if (!fscanf(fd, "%d", &freq)) - return freq; +static inline void linux_cpu_hilo_freq( float *lo, float *hi ) +{ + long int freq = 0, hi_freq = 0, lo_freq = 0x7fffffff; - return freq; + for ( int i = 0; i < num_cpus; i++ ) + { + char path[64]; + sprintf( path, CPUFREQ_PATHn, i ); + FILE *fd = fopen( path, "r" ); + if ( !fd ) return; + else if ( fscanf( fd, "%ld", &freq ) ) + { + if ( freq > hi_freq ) hi_freq = freq; + if ( freq < lo_freq ) lo_freq = freq; + } + fclose( fd ); + } + *hi = (float)hi_freq; + *lo = (float)lo_freq; } + #else /* WIN32 */ -static float win32_cputemp(int core) +static inline float win32_cputemp( int core ) { // todo return 0.0; } + #endif /* !WIN32 */ /* exports */ -float cpu_temp(int core) +static inline float cpu_temp( int core ) { #ifdef WIN32 - return win32_cputemp(core); + return win32_cputemp( core ); #else - return linux_cputemp(core); + return linux_cputemp( core ); #endif } -uint32_t cpu_clock(int core) +static inline uint32_t cpu_clock( int core ) { #ifdef WIN32 return 0; #else - return linux_cpufreq(core); + return linux_cpufreq( core ); #endif } -int cpu_fanpercent() +static inline int cpu_fanpercent() { return 0; } -#ifndef __arm__ +#if !(defined(__arm__) || defined(__aarch64__)) static inline void cpuid(int functionnumber, int output[4]) { #if defined (_MSC_VER) || defined (__INTEL_COMPILER) // Microsoft or Intel compiler, intrin.h included @@ -142,7 +205,7 @@ static inline void cpuid(int functionnumber, int output[4]) { #define cpuid(fn, out) out[0] = 0; #endif -void cpu_getname(char *outbuf, size_t maxsz) +static inline void cpu_getname(char *outbuf, size_t maxsz) { memset(outbuf, 0, maxsz); #ifdef WIN32 @@ -190,7 +253,7 @@ void cpu_getname(char *outbuf, size_t maxsz) #endif } -void cpu_getmodelid(char *outbuf, size_t maxsz) +static inline void 
cpu_getmodelid(char *outbuf, size_t maxsz) { memset(outbuf, 0, maxsz); #ifdef WIN32 @@ -259,32 +322,51 @@ void cpu_getmodelid(char *outbuf, size_t maxsz) #define CPU_BRAND_2 (0x80000003) #define CPU_BRAND_3 (0x80000004) +// Registers #define EAX_Reg (0) #define EBX_Reg (1) #define ECX_Reg (2) #define EDX_Reg (3) -#define XSAVE_Flag (1<<26) // ECX -#define OSXSAVE_Flag (1<<27) -#define AVX1_Flag (1<<28) +// Feature flags + +// CPU_INFO ECX +#define SSE3_Flag 1 +#define SSSE3_Flag (1<< 9) #define XOP_Flag (1<<11) #define FMA3_Flag (1<<12) #define AES_Flag (1<<25) +#define SSE41_Flag (1<<19) #define SSE42_Flag (1<<20) +#define AES_Flag (1<<25) +#define XSAVE_Flag (1<<26) +#define OSXSAVE_Flag (1<<27) +#define AVX_Flag (1<<28) -#define SSE_Flag (1<<25) // EDX +// CPU_INFO EDX +#define SSE_Flag (1<<25) #define SSE2_Flag (1<<26) -#define AVX2_Flag (1<< 5) // ADV EBX +// EXTENDED_FEATURES EBX +#define AVX2_Flag (1<< 5) #define AVX512F_Flag (1<<16) +#define AVX512DQ_Flag (1<<17) #define SHA_Flag (1<<29) +#define AVX512BW_Flag (1<<30) +#define AVX512VL_Flag (1<<31) + +// EXTENDED_FEATURES ECX +#define AVX512VBMI_Flag (1<<1) +#define AVX512VBMI2_Flag (1<<6) +#define VAES_Flag (1<<9) -// Use this to detect presence of feature -#define AVX1_mask (AVX1_Flag|XSAVE_Flag|OSXSAVE_Flag) -#define FMA3_mask (FMA3_Flag|AVX1_mask) +// Use this to detect presence of feature +#define AVX_mask (AVX_Flag|XSAVE_Flag|OSXSAVE_Flag) +#define FMA3_mask (FMA3_Flag|AVX_mask) +#define AVX512_mask (AVX512VL_Flag|AVX512BW_Flag|AVX512DQ_Flag|AVX512F_Flag) -static inline bool has_sha_() +static inline bool has_sha() { #ifdef __arm__ return false; @@ -295,10 +377,7 @@ static inline bool has_sha_() #endif } -bool has_sha() { return has_sha_(); } - - -static inline bool has_sse2_() +static inline bool has_sse2() { #ifdef __arm__ return false; @@ -309,10 +388,8 @@ static inline bool has_sse2_() #endif } -bool has_sse2() { return has_sse2_(); } - -// nehalem and above, no AVX1 on nehalem -static inline bool 
has_aes_ni_() +// nehalem and above, no AVX on nehalem +static inline bool has_aes_ni() { #ifdef __arm__ return false; @@ -323,24 +400,20 @@ static inline bool has_aes_ni_() #endif } -bool has_aes_ni() { return has_aes_ni_(); } - // westmere and above -static inline bool has_avx1_() +static inline bool has_avx() { #ifdef __arm__ return false; #else int cpu_info[4] = { 0 }; cpuid( CPU_INFO, cpu_info ); - return ( ( cpu_info[ ECX_Reg ] & AVX1_mask ) == AVX1_mask ); + return ( ( cpu_info[ ECX_Reg ] & AVX_mask ) == AVX_mask ); #endif } -bool has_avx1() { return has_avx1_(); } - // haswell and above -static inline bool has_avx2_() +static inline bool has_avx2() { #ifdef __arm__ return false; @@ -351,9 +424,7 @@ static inline bool has_avx2_() #endif } -bool has_avx2() { return has_avx2_(); } - -static inline bool has_avx512f_() +static inline bool has_avx512f() { #ifdef __arm__ return false; @@ -364,24 +435,86 @@ static inline bool has_avx512f_() #endif } -bool has_avx512f() { return has_avx512f_(); } +static inline bool has_avx512dq() +{ +#ifdef __arm__ + return false; +#else + int cpu_info[4] = { 0 }; + cpuid( EXTENDED_FEATURES, cpu_info ); + return cpu_info[ EBX_Reg ] & AVX512DQ_Flag; +#endif +} + +static inline bool has_avx512bw() +{ +#ifdef __arm__ + return false; +#else + int cpu_info[4] = { 0 }; + cpuid( EXTENDED_FEATURES, cpu_info ); + return cpu_info[ EBX_Reg ] & AVX512BW_Flag; +#endif +} +static inline bool has_avx512vl() +{ +#ifdef __arm__ + return false; +#else + int cpu_info[4] = { 0 }; + cpuid( EXTENDED_FEATURES, cpu_info ); + return cpu_info[ EBX_Reg ] & AVX512VL_Flag; +#endif +} + +// Minimum to be useful +static inline bool has_avx512() +{ +#ifdef __arm__ + return false; +#else + int cpu_info[4] = { 0 }; + cpuid( EXTENDED_FEATURES, cpu_info ); + return ( ( cpu_info[ EBX_Reg ] & AVX512_mask ) == AVX512_mask ); +#endif +} + +// AMD Zen3 added support for 256 bit VAES without requiring AVX512. 
+// The original Intel spec requires AVX512F to support 512 bit VAES and +// requires AVX512VL to support 256 bit VAES. +// The CPUID VAES bit alone can't distiguish 256 vs 512 bit. +// If necessary: +// VAES 256 & 512 = VAES && AVX512VL +// VAES 512 = VAES && AVX512F +// VAES 256 = ( VAES && AVX512VL ) || ( VAES && !AVX512F ) +// VAES 512 only = VAES && AVX512F && !AVX512VL +// VAES 256 only = VAES && !AVX512F + +static inline bool has_vaes() +{ +#ifdef __arm__ + return false; +#else + int cpu_info[4] = { 0 }; + cpuid( EXTENDED_FEATURES, cpu_info ); + return cpu_info[ ECX_Reg ] & VAES_Flag; +#endif +} // AMD only -static inline bool has_xop_() +static inline bool has_xop() { #ifdef __arm__ return false; #else int cpu_info[4] = { 0 }; - cpuid( CPU_INFO, cpu_info ); + cpuid( EXTENDED_CPU_INFO, cpu_info ); return cpu_info[ ECX_Reg ] & XOP_Flag; #endif } -bool has_xop() { return has_xop_(); } - -static inline bool has_fma3_() +static inline bool has_fma3() { #ifdef __arm__ return false; @@ -392,9 +525,7 @@ static inline bool has_fma3_() #endif } -bool has_fma3() { return has_fma3_(); } - -static inline bool has_sse42_() +static inline bool has_sse42() { #ifdef __arm__ return false; @@ -405,9 +536,7 @@ static inline bool has_sse42_() #endif } -bool has_sse42() { return has_sse42_(); } - -static inline bool has_sse_() +static inline bool has_sse() { #ifdef __arm__ return false; @@ -418,16 +547,14 @@ static inline bool has_sse_() #endif } -bool has_sse() { return has_sse_(); } - -uint32_t cpuid_get_highest_function_number() +static inline uint32_t cpuid_get_highest_function_number() { uint32_t cpu_info[4] = {0}; cpuid( VENDOR_ID, cpu_info); return cpu_info[ EAX_Reg ]; } -void cpuid_get_highest_function( char* s ) +static inline void cpuid_get_highest_function( char* s ) { uint32_t fn = cpuid_get_highest_function_number(); switch (fn) @@ -449,7 +576,7 @@ void cpuid_get_highest_function( char* s ) } } -void cpu_bestfeature(char *outbuf, size_t maxsz) +static inline void 
cpu_bestfeature(char *outbuf, size_t maxsz) { #ifdef __arm__ sprintf(outbuf, "ARM"); @@ -459,19 +586,19 @@ void cpu_bestfeature(char *outbuf, size_t maxsz) cpuid( CPU_INFO, cpu_info ); cpuid( EXTENDED_FEATURES, cpu_info_adv ); - if ( has_avx1_() && has_avx2_() ) + if ( has_avx() && has_avx2() ) sprintf(outbuf, "AVX2"); - else if ( has_avx1_() ) - sprintf(outbuf, "AVX1"); - else if ( has_fma3_() ) + else if ( has_avx() ) + sprintf(outbuf, "AVX"); + else if ( has_fma3() ) sprintf(outbuf, "FMA3"); - else if ( has_xop_() ) + else if ( has_xop() ) sprintf(outbuf, "XOP"); - else if ( has_sse42_() ) + else if ( has_sse42() ) sprintf(outbuf, "SSE42"); - else if ( has_sse2_() ) + else if ( has_sse2() ) sprintf(outbuf, "SSE2"); - else if ( has_sse_() ) + else if ( has_sse() ) sprintf(outbuf, "SSE"); else *outbuf = '\0'; @@ -479,7 +606,7 @@ void cpu_bestfeature(char *outbuf, size_t maxsz) #endif } -void cpu_brand_string( char* s ) +static inline void cpu_brand_string( char* s ) { #ifdef __arm__ sprintf( s, "ARM" ); @@ -498,3 +625,5 @@ void cpu_brand_string( char* s ) #endif } +#endif // SYSINFOS_C__ + diff --git a/uint256.cpp b/uint256.cpp deleted file mode 100644 index c8af90b..0000000 --- a/uint256.cpp +++ /dev/null @@ -1,40 +0,0 @@ -#include "uint256.h" - -#ifdef __cplusplus -extern "C"{ -#endif - -#include "miner.h" - -// compute the diff ratio between a found hash and the target -double hash_target_ratio(uint32_t* hash, uint32_t* target) -{ - uint256 h, t; - double dhash; - - if (!opt_showdiff) - return 0.0; - - memcpy(&t, (void*) target, 32); - memcpy(&h, (void*) hash, 32); - - dhash = h.getdouble(); - if (dhash > 0.) 
- return t.getdouble() / dhash; - else - return dhash; -} - -// store ratio in work struct -void work_set_target_ratio( struct work* work, uint32_t* hash ) -{ - // only if the option is enabled (to reduce cpu usage) - if (opt_showdiff) { - work->shareratio = hash_target_ratio(hash, work->target); - work->sharediff = work->targetdiff * work->shareratio; - } -} - -#ifdef __cplusplus -} -#endif diff --git a/uint256.h b/uint256.h deleted file mode 100644 index 2a252c9..0000000 --- a/uint256.h +++ /dev/null @@ -1,784 +0,0 @@ -// Copyright (c) 2009-2010 Satoshi Nakamoto -// Copyright (c) 2009-2012 The Bitcoin developers -// Distributed under the MIT/X11 software license, see the accompanying -// file COPYING or http://www.opensource.org/licenses/mit-license.php. -#ifndef BITCOIN_UINT256_H -#define BITCOIN_UINT256_H - -#include -#include -#include -#include -#include -#include - -typedef long long int64; -typedef unsigned long long uint64; - - -inline int Testuint256AdHoc(std::vector vArg); - - - -/** Base class without constructors for uint256 and uint160. - * This makes the compiler let you use it in a union. 
- */ -template -class base_uint -{ -protected: - enum { WIDTH=BITS/32 }; - uint32_t pn[WIDTH]; -public: - - bool operator!() const - { - for (int i = 0; i < WIDTH; i++) - if (pn[i] != 0) - return false; - return true; - } - - const base_uint operator~() const - { - base_uint ret; - for (int i = 0; i < WIDTH; i++) - ret.pn[i] = ~pn[i]; - return ret; - } - - const base_uint operator-() const - { - base_uint ret; - for (int i = 0; i < WIDTH; i++) - ret.pn[i] = ~pn[i]; - ret++; - return ret; - } - - double getdouble() const - { - double ret = 0.0; - double fact = 1.0; - for (int i = 0; i < WIDTH; i++) { - ret += fact * pn[i]; - fact *= 4294967296.0; - } - return ret; - } - - base_uint& operator=(uint64 b) - { - pn[0] = (unsigned int)b; - pn[1] = (unsigned int)(b >> 32); - for (int i = 2; i < WIDTH; i++) - pn[i] = 0; - return *this; - } - - base_uint& operator^=(const base_uint& b) - { - for (int i = 0; i < WIDTH; i++) - pn[i] ^= b.pn[i]; - return *this; - } - - base_uint& operator&=(const base_uint& b) - { - for (int i = 0; i < WIDTH; i++) - pn[i] &= b.pn[i]; - return *this; - } - - base_uint& operator|=(const base_uint& b) - { - for (int i = 0; i < WIDTH; i++) - pn[i] |= b.pn[i]; - return *this; - } - - base_uint& operator^=(uint64 b) - { - pn[0] ^= (unsigned int)b; - pn[1] ^= (unsigned int)(b >> 32); - return *this; - } - - base_uint& operator|=(uint64 b) - { - pn[0] |= (unsigned int)b; - pn[1] |= (unsigned int)(b >> 32); - return *this; - } - - base_uint& operator<<=(unsigned int shift) - { - base_uint a(*this); - for (int i = 0; i < WIDTH; i++) - pn[i] = 0; - int k = shift / 32; - shift = shift % 32; - for (int i = 0; i < WIDTH; i++) - { - if (i+k+1 < WIDTH && shift != 0) - pn[i+k+1] |= (a.pn[i] >> (32-shift)); - if (i+k < WIDTH) - pn[i+k] |= (a.pn[i] << shift); - } - return *this; - } - - base_uint& operator>>=(unsigned int shift) - { - base_uint a(*this); - for (int i = 0; i < WIDTH; i++) - pn[i] = 0; - int k = shift / 32; - shift = shift % 32; - for (int i = 0; 
i < WIDTH; i++) - { - if (i-k-1 >= 0 && shift != 0) - pn[i-k-1] |= (a.pn[i] << (32-shift)); - if (i-k >= 0) - pn[i-k] |= (a.pn[i] >> shift); - } - return *this; - } - - base_uint& operator+=(const base_uint& b) - { - uint64 carry = 0; - for (int i = 0; i < WIDTH; i++) - { - uint64 n = carry + pn[i] + b.pn[i]; - pn[i] = n & 0xffffffff; - carry = n >> 32; - } - return *this; - } - - base_uint& operator-=(const base_uint& b) - { - *this += -b; - return *this; - } - - base_uint& operator+=(uint64 b64) - { - base_uint b; - b = b64; - *this += b; - return *this; - } - - base_uint& operator-=(uint64 b64) - { - base_uint b; - b = b64; - *this += -b; - return *this; - } - - - base_uint& operator++() - { - // prefix operator - int i = 0; - while (++pn[i] == 0 && i < WIDTH-1) - i++; - return *this; - } - - const base_uint operator++(int) - { - // postfix operator - const base_uint ret = *this; - ++(*this); - return ret; - } - - base_uint& operator--() - { - // prefix operator - int i = 0; - while (--pn[i] == -1 && i < WIDTH-1) - i++; - return *this; - } - - const base_uint operator--(int) - { - // postfix operator - const base_uint ret = *this; - --(*this); - return ret; - } - - - friend inline bool operator<(const base_uint& a, const base_uint& b) - { - for (int i = base_uint::WIDTH-1; i >= 0; i--) - { - if (a.pn[i] < b.pn[i]) - return true; - else if (a.pn[i] > b.pn[i]) - return false; - } - return false; - } - - friend inline bool operator<=(const base_uint& a, const base_uint& b) - { - for (int i = base_uint::WIDTH-1; i >= 0; i--) - { - if (a.pn[i] < b.pn[i]) - return true; - else if (a.pn[i] > b.pn[i]) - return false; - } - return true; - } - - friend inline bool operator>(const base_uint& a, const base_uint& b) - { - for (int i = base_uint::WIDTH-1; i >= 0; i--) - { - if (a.pn[i] > b.pn[i]) - return true; - else if (a.pn[i] < b.pn[i]) - return false; - } - return false; - } - - friend inline bool operator>=(const base_uint& a, const base_uint& b) - { - for (int i = 
base_uint::WIDTH-1; i >= 0; i--) - { - if (a.pn[i] > b.pn[i]) - return true; - else if (a.pn[i] < b.pn[i]) - return false; - } - return true; - } - - friend inline bool operator==(const base_uint& a, const base_uint& b) - { - for (int i = 0; i < base_uint::WIDTH; i++) - if (a.pn[i] != b.pn[i]) - return false; - return true; - } - - friend inline bool operator==(const base_uint& a, uint64 b) - { - if (a.pn[0] != (unsigned int)b) - return false; - if (a.pn[1] != (unsigned int)(b >> 32)) - return false; - for (int i = 2; i < base_uint::WIDTH; i++) - if (a.pn[i] != 0) - return false; - return true; - } - - friend inline bool operator!=(const base_uint& a, const base_uint& b) - { - return (!(a == b)); - } - - friend inline bool operator!=(const base_uint& a, uint64 b) - { - return (!(a == b)); - } - - - - std::string GetHex() const - { - char psz[sizeof(pn)*2 + 1]; - for (unsigned int i = 0; i < sizeof(pn); i++) - sprintf(psz + i*2, "%02x", ((unsigned char*)pn)[sizeof(pn) - i - 1]); - return std::string(psz, psz + sizeof(pn)*2); - } - - void SetHex(const char* psz) - { - for (int i = 0; i < WIDTH; i++) - pn[i] = 0; - - // skip leading spaces - while (isspace(*psz)) - psz++; - - // skip 0x - if (psz[0] == '0' && tolower(psz[1]) == 'x') - psz += 2; - - // hex string to uint - static const unsigned char phexdigit[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,1,2,3,4,5,6,7,8,9,0,0,0,0,0,0, 0,0xa,0xb,0xc,0xd,0xe,0xf,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0xa,0xb,0xc,0xd,0xe,0xf,0,0,0,0,0,0,0,0,0 }; - const char* pbegin = psz; - while (phexdigit[(unsigned char)*psz] || *psz == '0') - psz++; - psz--; - unsigned char* p1 = (unsigned char*)pn; - unsigned char* pend = p1 + WIDTH * 4; - while (psz >= pbegin && p1 < pend) - { - *p1 = phexdigit[(unsigned char)*psz--]; - if (psz >= pbegin) - { - *p1 |= (phexdigit[(unsigned char)*psz--] << 4); - p1++; - } - } - } - - void SetHex(const std::string& str) - { 
- SetHex(str.c_str()); - } - - std::string ToString() const - { - return (GetHex()); - } - - unsigned char* begin() - { - return (unsigned char*)&pn[0]; - } - - unsigned char* end() - { - return (unsigned char*)&pn[WIDTH]; - } - - const unsigned char* begin() const - { - return (unsigned char*)&pn[0]; - } - - const unsigned char* end() const - { - return (unsigned char*)&pn[WIDTH]; - } - - unsigned int size() const - { - return sizeof(pn); - } - - uint64 Get64(int n=0) const - { - return pn[2*n] | (uint64)pn[2*n+1] << 32; - } - -// unsigned int GetSerializeSize(int nType=0, int nVersion=PROTOCOL_VERSION) const - unsigned int GetSerializeSize(int nType, int nVersion) const - { - return sizeof(pn); - } - - template -// void Serialize(Stream& s, int nType=0, int nVersion=PROTOCOL_VERSION) const - void Serialize(Stream& s, int nType, int nVersion) const - { - s.write((char*)pn, sizeof(pn)); - } - - template -// void Unserialize(Stream& s, int nType=0, int nVersion=PROTOCOL_VERSION) - void Unserialize(Stream& s, int nType, int nVersion) - { - s.read((char*)pn, sizeof(pn)); - } - - - friend class uint160; - friend class uint256; - friend inline int Testuint256AdHoc(std::vector vArg); -}; - -typedef base_uint<160> base_uint160; -typedef base_uint<256> base_uint256; - - - -// -// uint160 and uint256 could be implemented as templates, but to keep -// compile errors and debugging cleaner, they're copy and pasted. 
-// - - - -////////////////////////////////////////////////////////////////////////////// -// -// uint160 -// - -/** 160-bit unsigned integer */ -class uint160 : public base_uint160 -{ -public: - typedef base_uint160 basetype; - - uint160() - { - for (int i = 0; i < WIDTH; i++) - pn[i] = 0; - } - - uint160(const basetype& b) - { - for (int i = 0; i < WIDTH; i++) - pn[i] = b.pn[i]; - } - - uint160& operator=(const basetype& b) - { - for (int i = 0; i < WIDTH; i++) - pn[i] = b.pn[i]; - return *this; - } - - uint160(uint64 b) - { - pn[0] = (unsigned int)b; - pn[1] = (unsigned int)(b >> 32); - for (int i = 2; i < WIDTH; i++) - pn[i] = 0; - } - - uint160& operator=(uint64 b) - { - pn[0] = (unsigned int)b; - pn[1] = (unsigned int)(b >> 32); - for (int i = 2; i < WIDTH; i++) - pn[i] = 0; - return *this; - } - - explicit uint160(const std::string& str) - { - SetHex(str); - } - - explicit uint160(const std::vector& vch) - { - if (vch.size() == sizeof(pn)) - memcpy(pn, &vch[0], sizeof(pn)); - else - *this = 0; - } -}; - -inline bool operator==(const uint160& a, uint64 b) { return (base_uint160)a == b; } -inline bool operator!=(const uint160& a, uint64 b) { return (base_uint160)a != b; } -inline const uint160 operator<<(const base_uint160& a, unsigned int shift) { return uint160(a) <<= shift; } -inline const uint160 operator>>(const base_uint160& a, unsigned int shift) { return uint160(a) >>= shift; } -inline const uint160 operator<<(const uint160& a, unsigned int shift) { return uint160(a) <<= shift; } -inline const uint160 operator>>(const uint160& a, unsigned int shift) { return uint160(a) >>= shift; } - -inline const uint160 operator^(const base_uint160& a, const base_uint160& b) { return uint160(a) ^= b; } -inline const uint160 operator&(const base_uint160& a, const base_uint160& b) { return uint160(a) &= b; } -inline const uint160 operator|(const base_uint160& a, const base_uint160& b) { return uint160(a) |= b; } -inline const uint160 operator+(const base_uint160& a, 
const base_uint160& b) { return uint160(a) += b; } -inline const uint160 operator-(const base_uint160& a, const base_uint160& b) { return uint160(a) -= b; } - -inline bool operator<(const base_uint160& a, const uint160& b) { return (base_uint160)a < (base_uint160)b; } -inline bool operator<=(const base_uint160& a, const uint160& b) { return (base_uint160)a <= (base_uint160)b; } -inline bool operator>(const base_uint160& a, const uint160& b) { return (base_uint160)a > (base_uint160)b; } -inline bool operator>=(const base_uint160& a, const uint160& b) { return (base_uint160)a >= (base_uint160)b; } -inline bool operator==(const base_uint160& a, const uint160& b) { return (base_uint160)a == (base_uint160)b; } -inline bool operator!=(const base_uint160& a, const uint160& b) { return (base_uint160)a != (base_uint160)b; } -inline const uint160 operator^(const base_uint160& a, const uint160& b) { return (base_uint160)a ^ (base_uint160)b; } -inline const uint160 operator&(const base_uint160& a, const uint160& b) { return (base_uint160)a & (base_uint160)b; } -inline const uint160 operator|(const base_uint160& a, const uint160& b) { return (base_uint160)a | (base_uint160)b; } -inline const uint160 operator+(const base_uint160& a, const uint160& b) { return (base_uint160)a + (base_uint160)b; } -inline const uint160 operator-(const base_uint160& a, const uint160& b) { return (base_uint160)a - (base_uint160)b; } - -inline bool operator<(const uint160& a, const base_uint160& b) { return (base_uint160)a < (base_uint160)b; } -inline bool operator<=(const uint160& a, const base_uint160& b) { return (base_uint160)a <= (base_uint160)b; } -inline bool operator>(const uint160& a, const base_uint160& b) { return (base_uint160)a > (base_uint160)b; } -inline bool operator>=(const uint160& a, const base_uint160& b) { return (base_uint160)a >= (base_uint160)b; } -inline bool operator==(const uint160& a, const base_uint160& b) { return (base_uint160)a == (base_uint160)b; } -inline bool 
operator!=(const uint160& a, const base_uint160& b) { return (base_uint160)a != (base_uint160)b; } -inline const uint160 operator^(const uint160& a, const base_uint160& b) { return (base_uint160)a ^ (base_uint160)b; } -inline const uint160 operator&(const uint160& a, const base_uint160& b) { return (base_uint160)a & (base_uint160)b; } -inline const uint160 operator|(const uint160& a, const base_uint160& b) { return (base_uint160)a | (base_uint160)b; } -inline const uint160 operator+(const uint160& a, const base_uint160& b) { return (base_uint160)a + (base_uint160)b; } -inline const uint160 operator-(const uint160& a, const base_uint160& b) { return (base_uint160)a - (base_uint160)b; } - -inline bool operator<(const uint160& a, const uint160& b) { return (base_uint160)a < (base_uint160)b; } -inline bool operator<=(const uint160& a, const uint160& b) { return (base_uint160)a <= (base_uint160)b; } -inline bool operator>(const uint160& a, const uint160& b) { return (base_uint160)a > (base_uint160)b; } -inline bool operator>=(const uint160& a, const uint160& b) { return (base_uint160)a >= (base_uint160)b; } -inline bool operator==(const uint160& a, const uint160& b) { return (base_uint160)a == (base_uint160)b; } -inline bool operator!=(const uint160& a, const uint160& b) { return (base_uint160)a != (base_uint160)b; } -inline const uint160 operator^(const uint160& a, const uint160& b) { return (base_uint160)a ^ (base_uint160)b; } -inline const uint160 operator&(const uint160& a, const uint160& b) { return (base_uint160)a & (base_uint160)b; } -inline const uint160 operator|(const uint160& a, const uint160& b) { return (base_uint160)a | (base_uint160)b; } -inline const uint160 operator+(const uint160& a, const uint160& b) { return (base_uint160)a + (base_uint160)b; } -inline const uint160 operator-(const uint160& a, const uint160& b) { return (base_uint160)a - (base_uint160)b; } - - - - - - -////////////////////////////////////////////////////////////////////////////// -// 
-// uint256 -// - -/** 256-bit unsigned integer */ -class uint256 : public base_uint256 -{ -public: - typedef base_uint256 basetype; - - uint256() - { - for (int i = 0; i < WIDTH; i++) - pn[i] = 0; - } - - uint256(const basetype& b) - { - for (int i = 0; i < WIDTH; i++) - pn[i] = b.pn[i]; - } - - uint256& operator=(const basetype& b) - { - for (int i = 0; i < WIDTH; i++) - pn[i] = b.pn[i]; - return *this; - } - - uint256(uint64 b) - { - pn[0] = (unsigned int)b; - pn[1] = (unsigned int)(b >> 32); - for (int i = 2; i < WIDTH; i++) - pn[i] = 0; - } - - uint256& operator=(uint64 b) - { - pn[0] = (unsigned int)b; - pn[1] = (unsigned int)(b >> 32); - for (int i = 2; i < WIDTH; i++) - pn[i] = 0; - return *this; - } - - explicit uint256(const std::string& str) - { - SetHex(str); - } - - explicit uint256(const std::vector& vch) - { - if (vch.size() == sizeof(pn)) - memcpy(pn, &vch[0], sizeof(pn)); - else - *this = 0; - } -}; - -inline bool operator==(const uint256& a, uint64 b) { return (base_uint256)a == b; } -inline bool operator!=(const uint256& a, uint64 b) { return (base_uint256)a != b; } -inline const uint256 operator<<(const base_uint256& a, unsigned int shift) { return uint256(a) <<= shift; } -inline const uint256 operator>>(const base_uint256& a, unsigned int shift) { return uint256(a) >>= shift; } -inline const uint256 operator<<(const uint256& a, unsigned int shift) { return uint256(a) <<= shift; } -inline const uint256 operator>>(const uint256& a, unsigned int shift) { return uint256(a) >>= shift; } - -inline const uint256 operator^(const base_uint256& a, const base_uint256& b) { return uint256(a) ^= b; } -inline const uint256 operator&(const base_uint256& a, const base_uint256& b) { return uint256(a) &= b; } -inline const uint256 operator|(const base_uint256& a, const base_uint256& b) { return uint256(a) |= b; } -inline const uint256 operator+(const base_uint256& a, const base_uint256& b) { return uint256(a) += b; } -inline const uint256 operator-(const 
base_uint256& a, const base_uint256& b) { return uint256(a) -= b; } - -inline bool operator<(const base_uint256& a, const uint256& b) { return (base_uint256)a < (base_uint256)b; } -inline bool operator<=(const base_uint256& a, const uint256& b) { return (base_uint256)a <= (base_uint256)b; } -inline bool operator>(const base_uint256& a, const uint256& b) { return (base_uint256)a > (base_uint256)b; } -inline bool operator>=(const base_uint256& a, const uint256& b) { return (base_uint256)a >= (base_uint256)b; } -inline bool operator==(const base_uint256& a, const uint256& b) { return (base_uint256)a == (base_uint256)b; } -inline bool operator!=(const base_uint256& a, const uint256& b) { return (base_uint256)a != (base_uint256)b; } -inline const uint256 operator^(const base_uint256& a, const uint256& b) { return (base_uint256)a ^ (base_uint256)b; } -inline const uint256 operator&(const base_uint256& a, const uint256& b) { return (base_uint256)a & (base_uint256)b; } -inline const uint256 operator|(const base_uint256& a, const uint256& b) { return (base_uint256)a | (base_uint256)b; } -inline const uint256 operator+(const base_uint256& a, const uint256& b) { return (base_uint256)a + (base_uint256)b; } -inline const uint256 operator-(const base_uint256& a, const uint256& b) { return (base_uint256)a - (base_uint256)b; } - -inline bool operator<(const uint256& a, const base_uint256& b) { return (base_uint256)a < (base_uint256)b; } -inline bool operator<=(const uint256& a, const base_uint256& b) { return (base_uint256)a <= (base_uint256)b; } -inline bool operator>(const uint256& a, const base_uint256& b) { return (base_uint256)a > (base_uint256)b; } -inline bool operator>=(const uint256& a, const base_uint256& b) { return (base_uint256)a >= (base_uint256)b; } -inline bool operator==(const uint256& a, const base_uint256& b) { return (base_uint256)a == (base_uint256)b; } -inline bool operator!=(const uint256& a, const base_uint256& b) { return (base_uint256)a != 
(base_uint256)b; } -inline const uint256 operator^(const uint256& a, const base_uint256& b) { return (base_uint256)a ^ (base_uint256)b; } -inline const uint256 operator&(const uint256& a, const base_uint256& b) { return (base_uint256)a & (base_uint256)b; } -inline const uint256 operator|(const uint256& a, const base_uint256& b) { return (base_uint256)a | (base_uint256)b; } -inline const uint256 operator+(const uint256& a, const base_uint256& b) { return (base_uint256)a + (base_uint256)b; } -inline const uint256 operator-(const uint256& a, const base_uint256& b) { return (base_uint256)a - (base_uint256)b; } - -inline bool operator<(const uint256& a, const uint256& b) { return (base_uint256)a < (base_uint256)b; } -inline bool operator<=(const uint256& a, const uint256& b) { return (base_uint256)a <= (base_uint256)b; } -inline bool operator>(const uint256& a, const uint256& b) { return (base_uint256)a > (base_uint256)b; } -inline bool operator>=(const uint256& a, const uint256& b) { return (base_uint256)a >= (base_uint256)b; } -inline bool operator==(const uint256& a, const uint256& b) { return (base_uint256)a == (base_uint256)b; } -inline bool operator!=(const uint256& a, const uint256& b) { return (base_uint256)a != (base_uint256)b; } -inline const uint256 operator^(const uint256& a, const uint256& b) { return (base_uint256)a ^ (base_uint256)b; } -inline const uint256 operator&(const uint256& a, const uint256& b) { return (base_uint256)a & (base_uint256)b; } -inline const uint256 operator|(const uint256& a, const uint256& b) { return (base_uint256)a | (base_uint256)b; } -inline const uint256 operator+(const uint256& a, const uint256& b) { return (base_uint256)a + (base_uint256)b; } -inline const uint256 operator-(const uint256& a, const uint256& b) { return (base_uint256)a - (base_uint256)b; } - - - - - - - - - - -#ifdef TEST_UINT256 - -inline int Testuint256AdHoc(std::vector vArg) -{ - uint256 g(0); - - - printf("%s\n", g.ToString().c_str()); - g--; 
printf("g--\n"); - printf("%s\n", g.ToString().c_str()); - g--; printf("g--\n"); - printf("%s\n", g.ToString().c_str()); - g++; printf("g++\n"); - printf("%s\n", g.ToString().c_str()); - g++; printf("g++\n"); - printf("%s\n", g.ToString().c_str()); - g++; printf("g++\n"); - printf("%s\n", g.ToString().c_str()); - g++; printf("g++\n"); - printf("%s\n", g.ToString().c_str()); - - - - uint256 a(7); - printf("a=7\n"); - printf("%s\n", a.ToString().c_str()); - - uint256 b; - printf("b undefined\n"); - printf("%s\n", b.ToString().c_str()); - int c = 3; - - a = c; - a.pn[3] = 15; - printf("%s\n", a.ToString().c_str()); - uint256 k(c); - - a = 5; - a.pn[3] = 15; - printf("%s\n", a.ToString().c_str()); - b = 1; - b <<= 52; - - a |= b; - - a ^= 0x500; - - printf("a %s\n", a.ToString().c_str()); - - a = a | b | (uint256)0x1000; - - - printf("a %s\n", a.ToString().c_str()); - printf("b %s\n", b.ToString().c_str()); - - a = 0xfffffffe; - a.pn[4] = 9; - - printf("%s\n", a.ToString().c_str()); - a++; - printf("%s\n", a.ToString().c_str()); - a++; - printf("%s\n", a.ToString().c_str()); - a++; - printf("%s\n", a.ToString().c_str()); - a++; - printf("%s\n", a.ToString().c_str()); - - a--; - printf("%s\n", a.ToString().c_str()); - a--; - printf("%s\n", a.ToString().c_str()); - a--; - printf("%s\n", a.ToString().c_str()); - uint256 d = a--; - printf("%s\n", d.ToString().c_str()); - printf("%s\n", a.ToString().c_str()); - a--; - printf("%s\n", a.ToString().c_str()); - a--; - printf("%s\n", a.ToString().c_str()); - - d = a; - - printf("%s\n", d.ToString().c_str()); - for (int i = uint256::WIDTH-1; i >= 0; i--) printf("%08x", d.pn[i]); printf("\n"); - - uint256 neg = d; - neg = ~neg; - printf("%s\n", neg.ToString().c_str()); - - - uint256 e = uint256("0xABCDEF123abcdef12345678909832180000011111111"); - printf("\n"); - printf("%s\n", e.ToString().c_str()); - - - printf("\n"); - uint256 x1 = uint256("0xABCDEF123abcdef12345678909832180000011111111"); - uint256 x2; - printf("%s\n", 
x1.ToString().c_str()); - for (int i = 0; i < 270; i += 4) - { - x2 = x1 << i; - printf("%s\n", x2.ToString().c_str()); - } - - printf("\n"); - printf("%s\n", x1.ToString().c_str()); - for (int i = 0; i < 270; i += 4) - { - x2 = x1; - x2 >>= i; - printf("%s\n", x2.ToString().c_str()); - } - - - for (int i = 0; i < 100; i++) - { - uint256 k = (~uint256(0) >> i); - printf("%s\n", k.ToString().c_str()); - } - - for (int i = 0; i < 100; i++) - { - uint256 k = (~uint256(0) << i); - printf("%s\n", k.ToString().c_str()); - } - - return (0); -} - -#endif - -#endif diff --git a/util.c b/util.c index 7a43355..d22ad25 100644 --- a/util.c +++ b/util.c @@ -12,30 +12,31 @@ #define _GNU_SOURCE #include -#include -#include +#include "sysinfos.c" #include -#include -#include -#include -#include -#include +#include #include -#include +#include #include -#include -#include -#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include //#include #if defined(WIN32) -#include -#include #include "compat/winansi.h" +#include +#include #else -#include #include #include +#include #endif #ifndef _MSC_VER @@ -43,2357 +44,2270 @@ #include #endif -#include "miner.h" -#include "elist.h" +//#include "miner.h" #include "algo-gate-api.h" +#include "elist.h" -//extern pthread_mutex_t stats_lock; +// extern pthread_mutex_t stats_lock; struct data_buffer { - void *buf; - size_t len; + void *buf; + size_t len; }; struct upload_buffer { - const void *buf; - size_t len; - size_t pos; + const void *buf; + size_t len; + size_t pos; }; struct header_info { - char *lp_path; - char *reason; - char *stratum_url; + char *lp_path; + char *reason; + char *stratum_url; }; struct tq_ent { - void *data; - struct list_head q_node; + void *data; + struct list_head q_node; }; struct thread_q { - struct list_head q; + struct list_head q; - bool frozen; + bool frozen; - pthread_mutex_t mutex; - pthread_cond_t cond; + pthread_mutex_t mutex; + pthread_cond_t cond; }; -void 
applog(int prio, const char *fmt, ...) -{ - va_list ap; +bool is_power_of_2(int n) { + while (n > 1) { + if (n % 2 != 0) + return false; + n = n / 2; + } + return true; +} - va_start(ap, fmt); +void applog2(int prio, const char *fmt, ...) { + va_list ap; + + va_start(ap, fmt); #ifdef HAVE_SYSLOG_H - if (use_syslog) { - va_list ap2; - char *buf; - int len; - - /* custom colors to syslog prio */ - if (prio > LOG_DEBUG) { - switch (prio) { - case LOG_BLUE: prio = LOG_NOTICE; break; - } - } - - va_copy(ap2, ap); - len = vsnprintf(NULL, 0, fmt, ap2) + 1; - va_end(ap2); - buf = alloca(len); - if (vsnprintf(buf, len, fmt, ap) >= 0) - syslog(prio, "%s", buf); - } + if (use_syslog) { + va_list ap2; + char *buf; + int len; + + /* custom colors to syslog prio */ + if (prio > LOG_DEBUG) { + switch (prio) { + case LOG_BLUE: + prio = LOG_NOTICE; + break; + } + } + + va_copy(ap2, ap); + len = vsnprintf(NULL, 0, fmt, ap2) + 1; + va_end(ap2); + buf = alloca(len); + if (vsnprintf(buf, len, fmt, ap) >= 0) + syslog(prio, "%s", buf); + } #else - if (0) {} + if (0) { + } #endif - else { - const char* color = ""; - char *f; - int len; - struct tm tm; - time_t now = time(NULL); - - localtime_r(&now, &tm); - - switch (prio) { - case LOG_ERR: color = CL_RED; break; - case LOG_WARNING: color = CL_YLW; break; - case LOG_NOTICE: color = CL_WHT; break; - case LOG_INFO: color = ""; break; - case LOG_DEBUG: color = CL_GRY; break; - - case LOG_BLUE: - prio = LOG_NOTICE; - color = CL_CYN; - break; - } - if (!use_colors) - color = ""; - - len = 64 + (int) strlen(fmt) + 2; - f = (char*) malloc(len); - sprintf(f, "[%d-%02d-%02d %02d:%02d:%02d]%s %s%s\n", - tm.tm_year + 1900, - tm.tm_mon + 1, - tm.tm_mday, - tm.tm_hour, - tm.tm_min, - tm.tm_sec, - color, - fmt, - use_colors ? 
CL_N : "" - ); - pthread_mutex_lock(&applog_lock); - vfprintf(stdout, f, ap); /* atomic write to stdout */ - fflush(stdout); - free(f); - pthread_mutex_unlock(&applog_lock); - } - va_end(ap); -} - -void log_sw_err( char* filename, int line_number, char* msg ) -{ - applog( LOG_ERR, "SW_ERR: %s:%d, %s", filename, line_number, msg ); + else { + const char *color = ""; + char *f; + int len; + // struct tm tm; + // time_t now = time(NULL); + + // localtime_r(&now, &tm); + + switch (prio) { + case LOG_ERR: + color = CL_RED; + break; + case LOG_WARNING: + color = CL_YLW; + break; + case LOG_NOTICE: + color = CL_WHT; + break; + case LOG_INFO: + color = ""; + break; + case LOG_DEBUG: + color = CL_GRY; + break; + + case LOG_BLUE: + prio = LOG_NOTICE; + color = CL_CYN; + break; + } + if (!use_colors) + color = ""; + + len = 64 + (int)strlen(fmt) + 2; + f = (char *)malloc(len); + sprintf(f, " %s %s%s\n", + // sprintf(f, "[%d-%02d-%02d %02d:%02d:%02d]%s %s%s\n", + // tm.tm_year + 1900, + // tm.tm_mon + 1, + // tm.tm_mday, + // tm.tm_hour, + // tm.tm_min, + // tm.tm_sec, + color, fmt, use_colors ? CL_N : ""); + pthread_mutex_lock(&applog_lock); + vfprintf(stdout, f, ap); /* atomic write to stdout */ + fflush(stdout); + free(f); + pthread_mutex_unlock(&applog_lock); + } + va_end(ap); +} + +void applog(int prio, const char *fmt, ...) 
{ + va_list ap; + + va_start(ap, fmt); + +#ifdef HAVE_SYSLOG_H + if (use_syslog) { + va_list ap2; + char *buf; + int len; + + /* custom colors to syslog prio */ + if (prio > LOG_DEBUG) { + switch (prio) { + case LOG_BLUE: + prio = LOG_NOTICE; + break; + } + } + + va_copy(ap2, ap); + len = vsnprintf(NULL, 0, fmt, ap2) + 1; + va_end(ap2); + buf = alloca(len); + if (vsnprintf(buf, len, fmt, ap) >= 0) + syslog(prio, "%s", buf); + } +#else + if (0) { + } +#endif + else { + const char *color = ""; + char *f; + int len; + struct tm tm; + time_t now = time(NULL); + + localtime_r(&now, &tm); + + switch (prio) { + case LOG_ERR: + color = CL_RED; + break; + case LOG_WARNING: + color = CL_YLW; + break; + case LOG_NOTICE: + color = CL_WHT; + break; + case LOG_INFO: + color = ""; + break; + case LOG_DEBUG: + color = CL_GRY; + break; + + case LOG_BLUE: + prio = LOG_NOTICE; + color = CL_CYN; + break; + } + if (!use_colors) + color = ""; + + len = 64 + (int)strlen(fmt) + 2; + f = (char *)malloc(len); + sprintf(f, "[%d-%02d-%02d %02d:%02d:%02d]%s %s%s\n", tm.tm_year + 1900, + tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, color, + fmt, use_colors ? CL_N : ""); + pthread_mutex_lock(&applog_lock); + vfprintf(stdout, f, ap); /* atomic write to stdout */ + fflush(stdout); + free(f); + pthread_mutex_unlock(&applog_lock); + } + va_end(ap); +} + +void log_sw_err(char *filename, int line_number, char *msg) { + applog(LOG_ERR, "SW_ERR: %s:%d, %s", filename, line_number, msg); } /* Get default config.json path (will be system specific) */ -void get_defconfig_path(char *out, size_t bufsize, char *argv0) -{ - char *cmd = strdup(argv0); - char *dir = dirname(cmd); - const char *sep = strstr(dir, "\\") ? "\\" : "/"; - struct stat info = { 0 }; +void get_defconfig_path(char *out, size_t bufsize, char *argv0) { + char *cmd = strdup(argv0); + char *dir = dirname(cmd); + const char *sep = strstr(dir, "\\") ? 
"\\" : "/"; + struct stat info = {0}; #ifdef WIN32 - snprintf(out, bufsize, "%s\\cpuminer\\cpuminer-conf.json", getenv("APPDATA")); + snprintf(out, bufsize, "%s\\cpuminer\\cpuminer-conf.json", getenv("APPDATA")); #else - snprintf(out, bufsize, "%s\\.cpuminer\\cpuminer-conf.json", getenv("HOME")); + snprintf(out, bufsize, "%s\\.cpuminer\\cpuminer-conf.json", getenv("HOME")); #endif - if (dir && stat(out, &info) != 0) { - snprintf(out, bufsize, "%s%scpuminer-conf.json", dir, sep); - } - if (stat(out, &info) != 0) { - out[0] = '\0'; - return; - } - out[bufsize - 1] = '\0'; - free(cmd); -} - - -void format_hashrate(double hashrate, char *output) -{ - char prefix = '\0'; - - if (hashrate < 10000) { - // nop - } - else if (hashrate < 1e7) { - prefix = 'k'; - hashrate *= 1e-3; - } - else if (hashrate < 1e10) { - prefix = 'M'; - hashrate *= 1e-6; - } - else if (hashrate < 1e13) { - prefix = 'G'; - hashrate *= 1e-9; - } - else { - prefix = 'T'; - hashrate *= 1e-12; - } - - sprintf( - output, - prefix ? "%.2f %cH/s" : "%.2f H/s%c", - hashrate, prefix - ); + if (dir && stat(out, &info) != 0) { + snprintf(out, bufsize, "%s%scpuminer-conf.json", dir, sep); + } + if (stat(out, &info) != 0) { + out[0] = '\0'; + return; + } + out[bufsize - 1] = '\0'; + free(cmd); +} + +void format_hashrate(double hashrate, char *output) { + char prefix = '\0'; + + if (hashrate < 10000) { + // nop + } else if (hashrate < 1e7) { + prefix = 'k'; + hashrate *= 1e-3; + } else if (hashrate < 1e10) { + prefix = 'M'; + hashrate *= 1e-6; + } else if (hashrate < 1e13) { + prefix = 'G'; + hashrate *= 1e-9; + } else { + prefix = 'T'; + hashrate *= 1e-12; + } + + sprintf(output, prefix ? "%.2f %cH/s" : "%.2f H/s%c", hashrate, prefix); } /* Modify the representation of integer numbers which would cause an overflow * so that they are treated as floating-point numbers. * This is a hack to overcome the limitations of some versions of Jansson. 
*/ -static char *hack_json_numbers(const char *in) -{ - char *out; - int i, off, intoff; - bool in_str, in_int; - - out = (char*) calloc(2 * strlen(in) + 1, 1); - if (!out) - return NULL; - off = intoff = 0; - in_str = in_int = false; - for (i = 0; in[i]; i++) { - char c = in[i]; - if (c == '"') { - in_str = !in_str; - } else if (c == '\\') { - out[off++] = c; - if (!in[++i]) - break; - } else if (!in_str && !in_int && isdigit(c)) { - intoff = off; - in_int = true; - } else if (in_int && !isdigit(c)) { - if (c != '.' && c != 'e' && c != 'E' && c != '+' && c != '-') { - in_int = false; - if (off - intoff > 4) { - char *end; +static char *hack_json_numbers(const char *in) { + char *out; + int i, off, intoff; + bool in_str, in_int; + + out = (char *)calloc(2 * strlen(in) + 1, 1); + if (!out) + return NULL; + off = intoff = 0; + in_str = in_int = false; + for (i = 0; in[i]; i++) { + char c = in[i]; + if (c == '"') { + in_str = !in_str; + } else if (c == '\\') { + out[off++] = c; + if (!in[++i]) + break; + } else if (!in_str && !in_int && isdigit(c)) { + intoff = off; + in_int = true; + } else if (in_int && !isdigit(c)) { + if (c != '.' 
&& c != 'e' && c != 'E' && c != '+' && c != '-') { + in_int = false; + if (off - intoff > 4) { + char *end; #if JSON_INTEGER_IS_LONG_LONG - errno = 0; - strtoll(out + intoff, &end, 10); - if (!*end && errno == ERANGE) { + errno = 0; + strtoll(out + intoff, &end, 10); + if (!*end && errno == ERANGE) { #else - long l; - errno = 0; - l = strtol(out + intoff, &end, 10); - if (!*end && (errno == ERANGE || l > INT_MAX)) { + long l; + errno = 0; + l = strtol(out + intoff, &end, 10); + if (!*end && (errno == ERANGE || l > INT_MAX)) { #endif - out[off++] = '.'; - out[off++] = '0'; - } - } - } - } - out[off++] = in[i]; - } - return out; + out[off++] = '.'; + out[off++] = '0'; + } + } + } + } + out[off++] = in[i]; + } + return out; } -static void databuf_free(struct data_buffer *db) -{ - if (!db) - return; +static void databuf_free(struct data_buffer *db) { + if (!db) + return; - free(db->buf); + free(db->buf); - memset(db, 0, sizeof(*db)); + memset(db, 0, sizeof(*db)); } static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb, - void *user_data) -{ - struct data_buffer *db = (struct data_buffer *) user_data; - size_t len = size * nmemb; - size_t oldlen, newlen; - void *newmem; - static const unsigned char zero = 0; + void *user_data) { + struct data_buffer *db = (struct data_buffer *)user_data; + size_t len = size * nmemb; + size_t oldlen, newlen; + void *newmem; + static const unsigned char zero = 0; - oldlen = db->len; - newlen = oldlen + len; + oldlen = db->len; + newlen = oldlen + len; - newmem = realloc(db->buf, newlen + 1); - if (!newmem) - return 0; + newmem = realloc(db->buf, newlen + 1); + if (!newmem) + return 0; - db->buf = newmem; - db->len = newlen; - memcpy((uchar*) db->buf + oldlen, ptr, len); - memcpy((uchar*) db->buf + newlen, &zero, 1); /* null terminate */ + db->buf = newmem; + db->len = newlen; + memcpy((uchar *)db->buf + oldlen, ptr, len); + memcpy((uchar *)db->buf + newlen, &zero, 1); /* null terminate */ - return len; + return len; } 
static size_t upload_data_cb(void *ptr, size_t size, size_t nmemb, - void *user_data) -{ - struct upload_buffer *ub = (struct upload_buffer *) user_data; - size_t len = size * nmemb; + void *user_data) { + struct upload_buffer *ub = (struct upload_buffer *)user_data; + size_t len = size * nmemb; - if (len > ub->len - ub->pos) - len = ub->len - ub->pos; + if (len > ub->len - ub->pos) + len = ub->len - ub->pos; - if (len) { - memcpy(ptr, ((uchar*)ub->buf) + ub->pos, len); - ub->pos += len; - } + if (len) { + memcpy(ptr, ((uchar *)ub->buf) + ub->pos, len); + ub->pos += len; + } - return len; + return len; } #if LIBCURL_VERSION_NUM >= 0x071200 -static int seek_data_cb(void *user_data, curl_off_t offset, int origin) -{ - struct upload_buffer *ub = (struct upload_buffer *) user_data; - - switch (origin) { - case SEEK_SET: - ub->pos = (size_t) offset; - break; - case SEEK_CUR: - ub->pos += (size_t) offset; - break; - case SEEK_END: - ub->pos = ub->len + (size_t) offset; - break; - default: - return 1; /* CURL_SEEKFUNC_FAIL */ - } - - return 0; /* CURL_SEEKFUNC_OK */ +static int seek_data_cb(void *user_data, curl_off_t offset, int origin) { + struct upload_buffer *ub = (struct upload_buffer *)user_data; + + switch (origin) { + case SEEK_SET: + ub->pos = (size_t)offset; + break; + case SEEK_CUR: + ub->pos += (size_t)offset; + break; + case SEEK_END: + ub->pos = ub->len + (size_t)offset; + break; + default: + return 1; /* CURL_SEEKFUNC_FAIL */ + } + + return 0; /* CURL_SEEKFUNC_OK */ } #endif -static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data) -{ - struct header_info *hi = (struct header_info *) user_data; - size_t remlen, slen, ptrlen = size * nmemb; - char *rem, *val = NULL, *key = NULL; - void *tmp; - - val = (char*) calloc(1, ptrlen); - key = (char*) calloc(1, ptrlen); - if (!key || !val) - goto out; - - tmp = memchr(ptr, ':', ptrlen); - if (!tmp || (tmp == ptr)) /* skip empty keys / blanks */ - goto out; - slen = (char*)tmp - (char*)ptr; - 
if ((slen + 1) == ptrlen) /* skip key w/ no value */ - goto out; - memcpy(key, ptr, slen); /* store & nul term key */ - key[slen] = 0; - - rem = (char*)ptr + slen + 1; /* trim value's leading whitespace */ - remlen = ptrlen - slen - 1; - while ((remlen > 0) && (isspace(*rem))) { - remlen--; - rem++; - } - - memcpy(val, rem, remlen); /* store value, trim trailing ws */ - val[remlen] = 0; - while ((*val) && (isspace(val[strlen(val) - 1]))) { - val[strlen(val) - 1] = 0; - } - - if (!strcasecmp("X-Long-Polling", key)) { - hi->lp_path = val; /* steal memory reference */ - val = NULL; - } - - if (!strcasecmp("X-Reject-Reason", key)) { - hi->reason = val; /* steal memory reference */ - val = NULL; - } - - if (!strcasecmp("X-Stratum", key)) { - hi->stratum_url = val; /* steal memory reference */ - val = NULL; - } +static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, + void *user_data) { + struct header_info *hi = (struct header_info *)user_data; + size_t remlen, slen, ptrlen = size * nmemb; + char *rem, *val = NULL, *key = NULL; + void *tmp; + + val = (char *)calloc(1, ptrlen); + key = (char *)calloc(1, ptrlen); + if (!key || !val) + goto out; + + tmp = memchr(ptr, ':', ptrlen); + if (!tmp || (tmp == ptr)) /* skip empty keys / blanks */ + goto out; + slen = (char *)tmp - (char *)ptr; + if ((slen + 1) == ptrlen) /* skip key w/ no value */ + goto out; + memcpy(key, ptr, slen); /* store & nul term key */ + key[slen] = 0; + + rem = (char *)ptr + slen + 1; /* trim value's leading whitespace */ + remlen = ptrlen - slen - 1; + while ((remlen > 0) && (isspace(*rem))) { + remlen--; + rem++; + } + + memcpy(val, rem, remlen); /* store value, trim trailing ws */ + val[remlen] = 0; + while ((*val) && (isspace(val[strlen(val) - 1]))) { + val[strlen(val) - 1] = 0; + } + + if (!strcasecmp("X-Long-Polling", key)) { + hi->lp_path = val; /* steal memory reference */ + val = NULL; + } + + if (!strcasecmp("X-Reject-Reason", key)) { + hi->reason = val; /* steal memory reference */ + 
val = NULL; + } + + if (!strcasecmp("X-Stratum", key)) { + hi->stratum_url = val; /* steal memory reference */ + val = NULL; + } out: - free(key); - free(val); - return ptrlen; + free(key); + free(val); + return ptrlen; } #if LIBCURL_VERSION_NUM >= 0x070f06 static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, - curlsocktype purpose) -{ + curlsocktype purpose) { #ifdef __linux - int tcp_keepcnt = 3; + int tcp_keepcnt = 3; #endif - int tcp_keepintvl = 50; - int tcp_keepidle = 50; + int tcp_keepintvl = 50; + int tcp_keepidle = 50; #ifndef WIN32 - int keepalive = 1; - if (unlikely(setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive, - sizeof(keepalive)))) - return 1; + int keepalive = 1; + if (unlikely(setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive, + sizeof(keepalive)))) + return 1; #ifdef __linux - if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPCNT, - &tcp_keepcnt, sizeof(tcp_keepcnt)))) - return 1; - if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, - &tcp_keepidle, sizeof(tcp_keepidle)))) - return 1; - if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, - &tcp_keepintvl, sizeof(tcp_keepintvl)))) - return 1; + if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPCNT, &tcp_keepcnt, + sizeof(tcp_keepcnt)))) + return 1; + if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, &tcp_keepidle, + sizeof(tcp_keepidle)))) + return 1; + if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, &tcp_keepintvl, + sizeof(tcp_keepintvl)))) + return 1; #endif /* __linux */ #ifdef __APPLE_CC__ - if (unlikely(setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, - &tcp_keepintvl, sizeof(tcp_keepintvl)))) - return 1; + if (unlikely(setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, &tcp_keepintvl, + sizeof(tcp_keepintvl)))) + return 1; #endif /* __APPLE_CC__ */ -#else /* WIN32 */ - struct tcp_keepalive vals; - vals.onoff = 1; - vals.keepalivetime = tcp_keepidle * 1000; - vals.keepaliveinterval = tcp_keepintvl * 1000; - DWORD outputBytes; - if (unlikely(WSAIoctl(fd, SIO_KEEPALIVE_VALS, &vals, 
sizeof(vals), - NULL, 0, &outputBytes, NULL, NULL))) - return 1; +#else /* WIN32 */ + struct tcp_keepalive vals; + vals.onoff = 1; + vals.keepalivetime = tcp_keepidle * 1000; + vals.keepaliveinterval = tcp_keepintvl * 1000; + DWORD outputBytes; + if (unlikely(WSAIoctl(fd, SIO_KEEPALIVE_VALS, &vals, sizeof(vals), NULL, 0, + &outputBytes, NULL, NULL))) + return 1; #endif /* WIN32 */ - return 0; + return 0; } #endif -json_t *json_rpc_call(CURL *curl, const char *url, - const char *userpass, const char *rpc_req, - int *curl_err, int flags) -{ - json_t *val, *err_val, *res_val; - int rc; - long http_rc; - struct data_buffer all_data = {0}; - struct upload_buffer upload_data; - char *json_buf; - json_error_t err; - struct curl_slist *headers = NULL; - char len_hdr[64]; - char curl_err_str[CURL_ERROR_SIZE] = { 0 }; - long timeout = (flags & JSON_RPC_LONGPOLL) ? opt_timeout : 30; - struct header_info hi = {0}; - - /* it is assumed that 'curl' is freshly [re]initialized at this pt */ - - if (opt_protocol) - curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); - curl_easy_setopt(curl, CURLOPT_URL, url); - if (opt_cert) - curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert); -// - curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false); - - curl_easy_setopt(curl, CURLOPT_ENCODING, ""); - curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0); - curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); - curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data); - curl_easy_setopt(curl, CURLOPT_READFUNCTION, upload_data_cb); - curl_easy_setopt(curl, CURLOPT_READDATA, &upload_data); +json_t *json_rpc_call(CURL *curl, const char *url, const char *userpass, + const char *rpc_req, int *curl_err, int flags) { + json_t *val, *err_val, *res_val; + int rc; + long http_rc; + struct data_buffer all_data = {0}; + struct upload_buffer upload_data; + char *json_buf; + json_error_t err; + struct curl_slist *headers = 
NULL; + char len_hdr[64]; + char curl_err_str[CURL_ERROR_SIZE] = {0}; + long timeout = (flags & JSON_RPC_LONGPOLL) ? opt_timeout : 30; + struct header_info hi = {0}; + + applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); + /* it is assumed that 'curl' is freshly [re]initialized at this pt */ + + if (opt_protocol) + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); + curl_easy_setopt(curl, CURLOPT_URL, url); + if (opt_cert) + curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert); + // + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false); + + curl_easy_setopt(curl, CURLOPT_ENCODING, ""); + curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0); + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data); + curl_easy_setopt(curl, CURLOPT_READFUNCTION, upload_data_cb); + curl_easy_setopt(curl, CURLOPT_READDATA, &upload_data); #if LIBCURL_VERSION_NUM >= 0x071200 - curl_easy_setopt(curl, CURLOPT_SEEKFUNCTION, &seek_data_cb); - curl_easy_setopt(curl, CURLOPT_SEEKDATA, &upload_data); + curl_easy_setopt(curl, CURLOPT_SEEKFUNCTION, &seek_data_cb); + curl_easy_setopt(curl, CURLOPT_SEEKDATA, &upload_data); #endif - curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str); - if (opt_redirect) - curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); - curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout); - curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, resp_hdr_cb); - curl_easy_setopt(curl, CURLOPT_HEADERDATA, &hi); - if (opt_proxy) { - curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); - curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); - } - if (userpass) { - curl_easy_setopt(curl, CURLOPT_USERPWD, userpass); - curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC); - } + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str); + if (opt_redirect) + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); + curl_easy_setopt(curl, 
CURLOPT_TIMEOUT, timeout); + curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, resp_hdr_cb); + curl_easy_setopt(curl, CURLOPT_HEADERDATA, &hi); + if (opt_proxy) { + curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); + } + if (userpass) { + curl_easy_setopt(curl, CURLOPT_USERPWD, userpass); + curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC); + } #if LIBCURL_VERSION_NUM >= 0x070f06 - if (flags & JSON_RPC_LONGPOLL) - curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); + if (flags & JSON_RPC_LONGPOLL) + curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); #endif - curl_easy_setopt(curl, CURLOPT_POST, 1); - - if (opt_protocol) - applog(LOG_DEBUG, "JSON protocol request:\n%s\n", rpc_req); - - upload_data.buf = rpc_req; - upload_data.len = strlen(rpc_req); - upload_data.pos = 0; - sprintf(len_hdr, "Content-Length: %lu", - (unsigned long) upload_data.len); - - headers = curl_slist_append(headers, "Content-Type: application/json"); - headers = curl_slist_append(headers, len_hdr); - headers = curl_slist_append(headers, "User-Agent: " USER_AGENT); - headers = curl_slist_append(headers, "X-Mining-Extensions: longpoll reject-reason"); - //headers = curl_slist_append(headers, "Accept:"); /* disable Accept hdr*/ - //headers = curl_slist_append(headers, "Expect:"); /* disable Expect hdr*/ - - curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); - - rc = curl_easy_perform(curl); - if (curl_err != NULL) - *curl_err = rc; - if (rc) { - curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_rc); - if (!((flags & JSON_RPC_LONGPOLL) && rc == CURLE_OPERATION_TIMEDOUT) && - !((flags & JSON_RPC_QUIET_404) && http_rc == 404)) - applog(LOG_ERR, "HTTP request failed: %s", curl_err_str); - if (curl_err && (flags & JSON_RPC_QUIET_404) && http_rc == 404) - *curl_err = CURLE_OK; - goto err_out; - } - - /* If X-Stratum was found, activate Stratum */ - if (want_stratum && hi.stratum_url && - 
!strncasecmp(hi.stratum_url, "stratum+tcp://", 14)) { - have_stratum = true; - tq_push(thr_info[stratum_thr_id].q, hi.stratum_url); - hi.stratum_url = NULL; - } - - /* If X-Long-Polling was found, activate long polling */ - if (!have_longpoll && want_longpoll && hi.lp_path && !have_gbt && - allow_getwork && !have_stratum) { - have_longpoll = true; - tq_push(thr_info[longpoll_thr_id].q, hi.lp_path); - hi.lp_path = NULL; - } - - if (!all_data.buf) { - applog(LOG_ERR, "Empty data received in json_rpc_call."); - goto err_out; - } - - json_buf = hack_json_numbers((char*) all_data.buf); - errno = 0; /* needed for Jansson < 2.1 */ - val = JSON_LOADS(json_buf, &err); - free(json_buf); - if (!val) { - applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - goto err_out; - } - - if (opt_protocol) { - char *s = json_dumps(val, JSON_INDENT(3)); - applog(LOG_DEBUG, "JSON protocol response:\n%s", s); - free(s); - } - - /* JSON-RPC valid response returns a 'result' and a null 'error'. */ - res_val = json_object_get(val, "result"); - err_val = json_object_get(val, "error"); - - if (!res_val || (err_val && !json_is_null(err_val) - && !(flags & JSON_RPC_IGNOREERR))) { - - char *s = NULL; - - if (err_val) { - s = json_dumps(err_val, 0); - json_t *msg = json_object_get(err_val, "message"); - json_t *err_code = json_object_get(err_val, "code"); - if (curl_err && json_integer_value(err_code)) - *curl_err = (int)json_integer_value(err_code); - - if (msg && json_is_string(msg)) { - free(s); - s = strdup(json_string_value(msg)); - if (have_longpoll && s && !strcmp(s, "method not getwork")) { - json_decref(err_val); - free(s); - goto err_out; - } - } - json_decref(err_val); - } - else - s = strdup("(unknown reason)"); - - if (!curl_err || opt_debug) - applog(LOG_ERR, "JSON-RPC call failed: %s", s); - - free(s); - - goto err_out; - } - - if (hi.reason) - json_object_set_new(val, "reject-reason", json_string(hi.reason)); - - databuf_free(&all_data); - 
curl_slist_free_all(headers); - curl_easy_reset(curl); - return val; + curl_easy_setopt(curl, CURLOPT_POST, 1); + + if (opt_protocol) + applog(LOG_DEBUG, "JSON protocol request:\n%s\n", rpc_req); + + upload_data.buf = rpc_req; + upload_data.len = strlen(rpc_req); + upload_data.pos = 0; + sprintf(len_hdr, "Content-Length: %lu", (unsigned long)upload_data.len); + + headers = curl_slist_append(headers, "Content-Type: application/json"); + headers = curl_slist_append(headers, len_hdr); + headers = curl_slist_append(headers, "User-Agent: " USER_AGENT); + headers = + curl_slist_append(headers, "X-Mining-Extensions: longpoll reject-reason"); + // headers = curl_slist_append(headers, "Accept:"); /* disable Accept hdr*/ + // headers = curl_slist_append(headers, "Expect:"); /* disable Expect hdr*/ + + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + + rc = curl_easy_perform(curl); + if (curl_err != NULL) + *curl_err = rc; + if (rc) { + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_rc); + if (!((flags & JSON_RPC_LONGPOLL) && rc == CURLE_OPERATION_TIMEDOUT) && + !((flags & JSON_RPC_QUIET_404) && http_rc == 404)) + applog(LOG_ERR, "HTTP request failed: %s", curl_err_str); + if (curl_err && (flags & JSON_RPC_QUIET_404) && http_rc == 404) + *curl_err = CURLE_OK; + goto err_out; + } + + // want_stratum is useless, and so is this code it seems. Nothing in + // hi appears to be set. 
+ /* If X-Stratum was found, activate Stratum */ + if (want_stratum && hi.stratum_url && + !strncasecmp(hi.stratum_url, "stratum+tcp://", 14)) { + have_stratum = true; + tq_push(thr_info[stratum_thr_id].q, hi.stratum_url); + hi.stratum_url = NULL; + } + + /* If X-Long-Polling was found, activate long polling */ + if (!have_longpoll && want_longpoll && hi.lp_path && !have_gbt && + allow_getwork && !have_stratum) { + have_longpoll = true; + tq_push(thr_info[longpoll_thr_id].q, hi.lp_path); + hi.lp_path = NULL; + } + + if (!all_data.buf) { + applog(LOG_ERR, "Empty data received in json_rpc_call."); + goto err_out; + } + + json_buf = hack_json_numbers((char *)all_data.buf); + errno = 0; /* needed for Jansson < 2.1 */ + val = JSON_LOADS(json_buf, &err); + free(json_buf); + if (!val) { + applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); + goto err_out; + } + + if (opt_protocol) { + char *s = json_dumps(val, JSON_INDENT(3)); + applog(LOG_DEBUG, "JSON protocol response:\n%s", s); + free(s); + } + + /* JSON-RPC valid response returns a 'result' and a null 'error'. 
*/ + res_val = json_object_get(val, "result"); + err_val = json_object_get(val, "error"); + + if (!res_val || + (err_val && !json_is_null(err_val) && !(flags & JSON_RPC_IGNOREERR))) { + + char *s = NULL; + + if (err_val) { + s = json_dumps(err_val, 0); + json_t *msg = json_object_get(err_val, "message"); + json_t *err_code = json_object_get(err_val, "code"); + if (curl_err && json_integer_value(err_code)) + *curl_err = (int)json_integer_value(err_code); + + if (msg && json_is_string(msg)) { + free(s); + s = strdup(json_string_value(msg)); + if (have_longpoll && s && !strcmp(s, "method not getwork")) { + json_decref(err_val); + free(s); + goto err_out; + } + } + json_decref(err_val); + } else + s = strdup("(unknown reason)"); + + if (!curl_err || opt_debug) + applog(LOG_ERR, "JSON-RPC call failed: %s", s); + + free(s); + + goto err_out; + } + + if (hi.reason) + json_object_set_new(val, "reject-reason", json_string(hi.reason)); + + databuf_free(&all_data); + curl_slist_free_all(headers); + curl_easy_reset(curl); + return val; err_out: - free(hi.lp_path); - free(hi.reason); - free(hi.stratum_url); - databuf_free(&all_data); - curl_slist_free_all(headers); - curl_easy_reset(curl); - return NULL; + free(hi.lp_path); + free(hi.reason); + free(hi.stratum_url); + databuf_free(&all_data); + curl_slist_free_all(headers); + curl_easy_reset(curl); + return NULL; } /* used to load a remote config */ -json_t* json_load_url(char* cfg_url, json_error_t *err) -{ - char err_str[CURL_ERROR_SIZE] = { 0 }; - struct data_buffer all_data = { 0 }; - int rc = 0; json_t *cfg = NULL; - CURL *curl = curl_easy_init(); - if (unlikely(!curl)) { - applog(LOG_ERR, "Remote config init failed!"); - return NULL; - } - curl_easy_setopt(curl, CURLOPT_URL, cfg_url); - curl_easy_setopt(curl, CURLOPT_FRESH_CONNECT, 1); - curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 15); - curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, err_str); - curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); - curl_easy_setopt(curl, 
CURLOPT_TCP_NODELAY, 1); - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data); - if (opt_proxy) { - curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); - curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); - } else if (getenv("http_proxy")) { - if (getenv("all_proxy")) - curl_easy_setopt(curl, CURLOPT_PROXY, getenv("all_proxy")); - else if (getenv("ALL_PROXY")) - curl_easy_setopt(curl, CURLOPT_PROXY, getenv("ALL_PROXY")); - else - curl_easy_setopt(curl, CURLOPT_PROXY, ""); - } - rc = curl_easy_perform(curl); - if (rc) { - applog(LOG_ERR, "Remote config read failed: %s", err_str); - goto err_out; - } - if (!all_data.buf || !all_data.len) { - applog(LOG_ERR, "Empty data received for config"); - goto err_out; - } - - cfg = JSON_LOADS((char*)all_data.buf, err); +json_t *json_load_url(char *cfg_url, json_error_t *err) { + char err_str[CURL_ERROR_SIZE] = {0}; + struct data_buffer all_data = {0}; + int rc = 0; + json_t *cfg = NULL; + CURL *curl = curl_easy_init(); + if (unlikely(!curl)) { + applog(LOG_ERR, "Remote config init failed!"); + return NULL; + } + curl_easy_setopt(curl, CURLOPT_URL, cfg_url); + curl_easy_setopt(curl, CURLOPT_FRESH_CONNECT, 1); + curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 15); + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, err_str); + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data); + if (opt_proxy) { + curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); + } else if (getenv("http_proxy")) { + if (getenv("all_proxy")) + curl_easy_setopt(curl, CURLOPT_PROXY, getenv("all_proxy")); + else if (getenv("ALL_PROXY")) + curl_easy_setopt(curl, CURLOPT_PROXY, getenv("ALL_PROXY")); + else + curl_easy_setopt(curl, CURLOPT_PROXY, ""); + } + rc = curl_easy_perform(curl); 
+ if (rc) { + applog(LOG_ERR, "Remote config read failed: %s", err_str); + goto err_out; + } + if (!all_data.buf || !all_data.len) { + applog(LOG_ERR, "Empty data received for config"); + goto err_out; + } + + cfg = JSON_LOADS((char *)all_data.buf, err); err_out: - curl_easy_cleanup(curl); - return cfg; -} - -void cbin2hex(char *out, const char *in, size_t len) -{ - if (out) { - unsigned int i; - for (i = 0; i < len; i++) - sprintf(out + (i * 2), "%02x", (uint8_t)in[i]); - } -} - -void bin2hex(char *s, const unsigned char *p, size_t len) -{ - for (size_t i = 0; i < len; i++) - sprintf(s + (i * 2), "%02x", (unsigned int) p[i]); -} - -char *abin2hex(const unsigned char *p, size_t len) -{ - char *s = (char*) malloc((len * 2) + 1); - if (!s) - return NULL; - bin2hex(s, p, len); - return s; -} - -bool hex2bin(unsigned char *p, const char *hexstr, size_t len) -{ - char hex_byte[3]; - char *ep; - - hex_byte[2] = '\0'; - - while (*hexstr && len) { - if (!hexstr[1]) { - applog(LOG_ERR, "hex2bin str truncated"); - return false; - } - hex_byte[0] = hexstr[0]; - hex_byte[1] = hexstr[1]; - *p = (unsigned char) strtol(hex_byte, &ep, 16); - if (*ep) { - applog(LOG_ERR, "hex2bin failed on '%s'", hex_byte); - return false; - } - p++; - hexstr += 2; - len--; - } - - return(!len) ? true : false; -/* return (len == 0 && *hexstr == 0) ? 
true : false; */ -} - -int varint_encode(unsigned char *p, uint64_t n) -{ - int i; - if (n < 0xfd) { - p[0] = (uchar) n; - return 1; - } - if (n <= 0xffff) { - p[0] = 0xfd; - p[1] = n & 0xff; - p[2] = (uchar) (n >> 8); - return 3; - } - if (n <= 0xffffffff) { - p[0] = 0xfe; - for (i = 1; i < 5; i++) { - p[i] = n & 0xff; - n >>= 8; - } - return 5; - } - p[0] = 0xff; - for (i = 1; i < 9; i++) { - p[i] = n & 0xff; - n >>= 8; - } - return 9; -} - -static const char b58digits[] = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"; - -static bool b58dec(unsigned char *bin, size_t binsz, const char *b58) -{ - size_t i, j; - uint64_t t; - uint32_t c; - uint32_t *outi; - size_t outisz = (binsz + 3) / 4; - int rem = binsz % 4; - uint32_t remmask = 0xffffffff << (8 * rem); - size_t b58sz = strlen(b58); - bool rc = false; - - outi = (uint32_t *) calloc(outisz, sizeof(*outi)); - - for (i = 0; i < b58sz; ++i) { - for (c = 0; b58digits[c] != b58[i]; c++) - if (!b58digits[c]) - goto out; - for (j = outisz; j--; ) { - t = (uint64_t)outi[j] * 58 + c; - c = t >> 32; - outi[j] = t & 0xffffffff; - } - if (c || outi[0] & remmask) - goto out; - } - - j = 0; - switch (rem) { - case 3: - *(bin++) = (outi[0] >> 16) & 0xff; - case 2: - *(bin++) = (outi[0] >> 8) & 0xff; - case 1: - *(bin++) = outi[0] & 0xff; - ++j; - default: - break; - } - for (; j < outisz; ++j) { - be32enc((uint32_t *)bin, outi[j]); - bin += sizeof(uint32_t); - } - - rc = true; + curl_easy_cleanup(curl); + return cfg; +} + +// Segwit BEGIN +void memrev(unsigned char *p, size_t len) { + unsigned char c, *q; + for (q = p + len - 1; p < q; p++, q--) { + c = *p; + *p = *q; + *q = c; + } +} +// Segwit END + +void cbin2hex(char *out, const char *in, size_t len) { + if (out) { + unsigned int i; + for (i = 0; i < len; i++) + sprintf(out + (i * 2), "%02x", (uint8_t)in[i]); + } +} + +void bin2hex(char *s, const unsigned char *p, size_t len) { + for (size_t i = 0; i < len; i++) + sprintf(s + (i * 2), "%02x", (unsigned 
int)p[i]); +} + +char *abin2hex(const unsigned char *p, size_t len) { + char *s = (char *)malloc((len * 2) + 1); + if (!s) + return NULL; + bin2hex(s, p, len); + return s; +} + +bool hex2bin(unsigned char *p, const char *hexstr, size_t len) { + char hex_byte[3]; + char *ep; + + hex_byte[2] = '\0'; + + while (*hexstr && len) { + if (!hexstr[1]) { + applog(LOG_ERR, "hex2bin str truncated"); + return false; + } + hex_byte[0] = hexstr[0]; + hex_byte[1] = hexstr[1]; + *p = (unsigned char)strtol(hex_byte, &ep, 16); + if (*ep) { + applog(LOG_ERR, "hex2bin failed on '%s'", hex_byte); + return false; + } + p++; + hexstr += 2; + len--; + } + + return (!len) ? true : false; + /* return (len == 0 && *hexstr == 0) ? true : false; */ +} + +int varint_encode(unsigned char *p, uint64_t n) { + int i; + if (n < 0xfd) { + p[0] = (uchar)n; + return 1; + } + if (n <= 0xffff) { + p[0] = 0xfd; + p[1] = n & 0xff; + p[2] = (uchar)(n >> 8); + return 3; + } + if (n <= 0xffffffff) { + p[0] = 0xfe; + for (i = 1; i < 5; i++) { + p[i] = n & 0xff; + n >>= 8; + } + return 5; + } + p[0] = 0xff; + for (i = 1; i < 9; i++) { + p[i] = n & 0xff; + n >>= 8; + } + return 9; +} + +static const char b58digits[] = + "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"; + +static bool b58dec(unsigned char *bin, size_t binsz, const char *b58) { + size_t i, j; + uint64_t t; + uint32_t c; + uint32_t *outi; + size_t outisz = (binsz + 3) / 4; + int rem = binsz % 4; + uint32_t remmask = 0xffffffff << (8 * rem); + size_t b58sz = strlen(b58); + bool rc = false; + + outi = (uint32_t *)calloc(outisz, sizeof(*outi)); + + for (i = 0; i < b58sz; ++i) { + for (c = 0; b58digits[c] != b58[i]; c++) + if (!b58digits[c]) + goto out; + for (j = outisz; j--;) { + t = (uint64_t)outi[j] * 58 + c; + c = t >> 32; + outi[j] = t & 0xffffffff; + } + if (c || outi[0] & remmask) + goto out; + } + + j = 0; + switch (rem) { + case 3: + *(bin++) = (outi[0] >> 16) & 0xff; + case 2: + *(bin++) = (outi[0] >> 8) & 0xff; + case 1: + 
*(bin++) = outi[0] & 0xff; + ++j; + default: + break; + } + for (; j < outisz; ++j) { + be32enc((uint32_t *)bin, outi[j]); + bin += sizeof(uint32_t); + } + + rc = true; out: - free(outi); - return rc; -} - -static int b58check(unsigned char *bin, size_t binsz, const char *b58) -{ - unsigned char buf[32]; - int i; - - sha256d(buf, bin, (int) (binsz - 4)); - if (memcmp(&bin[binsz - 4], buf, 4)) - return -1; - - /* Check number of zeros is correct AFTER verifying checksum - * (to avoid possibility of accessing the string beyond the end) */ - for (i = 0; bin[i] == '\0' && b58[i] == '1'; ++i); - if (bin[i] == '\0' || b58[i] == '1') - return -3; - - return bin[0]; -} - -bool jobj_binary(const json_t *obj, const char *key, void *buf, size_t buflen) -{ - const char *hexstr; - json_t *tmp; - - tmp = json_object_get(obj, key); - if (unlikely(!tmp)) { - applog(LOG_ERR, "JSON key '%s' not found", key); - return false; - } - hexstr = json_string_value(tmp); - if (unlikely(!hexstr)) { - applog(LOG_ERR, "JSON key '%s' is not a string", key); - return false; - } - if (!hex2bin((uchar*) buf, hexstr, buflen)) - return false; - - return true; -} - -size_t address_to_script(unsigned char *out, size_t outsz, const char *addr) -{ - unsigned char addrbin[25]; - int addrver; - size_t rv; - - if (!b58dec(addrbin, sizeof(addrbin), addr)) - return 0; - addrver = b58check(addrbin, sizeof(addrbin), addr); - if (addrver < 0) - return 0; - switch (addrver) { - case 5: /* Bitcoin script hash */ - case 196: /* Testnet script hash */ - if (outsz < (rv = 23)) - return rv; - out[ 0] = 0xa9; /* OP_HASH160 */ - out[ 1] = 0x14; /* push 20 bytes */ - memcpy(&out[2], &addrbin[1], 20); - out[22] = 0x87; /* OP_EQUAL */ - return rv; - default: - if (outsz < (rv = 25)) - return rv; - out[ 0] = 0x76; /* OP_DUP */ - out[ 1] = 0xa9; /* OP_HASH160 */ - out[ 2] = 0x14; /* push 20 bytes */ - memcpy(&out[3], &addrbin[1], 20); - out[23] = 0x88; /* OP_EQUALVERIFY */ - out[24] = 0xac; /* OP_CHECKSIG */ - return rv; - } 
+ free(outi); + return rc; +} + +static int b58check(unsigned char *bin, size_t binsz, const char *b58) { + unsigned char buf[32]; + int i; + + sha256d(buf, bin, (int)(binsz - 4)); + if (memcmp(&bin[binsz - 4], buf, 4)) + return -1; + + /* Check number of zeros is correct AFTER verifying checksum + * (to avoid possibility of accessing the string beyond the end) */ + for (i = 0; bin[i] == '\0' && b58[i] == '1'; ++i) + ; + if (bin[i] == '\0' || b58[i] == '1') + return -3; + + return bin[0]; +} + +bool jobj_binary(const json_t *obj, const char *key, void *buf, size_t buflen) { + const char *hexstr; + json_t *tmp; + + tmp = json_object_get(obj, key); + if (unlikely(!tmp)) { + applog(LOG_ERR, "JSON key '%s' not found", key); + return false; + } + hexstr = json_string_value(tmp); + if (unlikely(!hexstr)) { + applog(LOG_ERR, "JSON key '%s' is not a string", key); + return false; + } + if (!hex2bin((uchar *)buf, hexstr, buflen)) + return false; + + return true; +} + +size_t address_to_script(unsigned char *out, size_t outsz, const char *addr) { + unsigned char addrbin[pk_buffer_size_max]; + int addrver; + size_t rv; + + if (!b58dec(addrbin, outsz, addr)) + return 0; + + addrver = b58check(addrbin, outsz, addr); + if (addrver < 0) + return 0; + + switch (addrver) { + case 5: /* Bitcoin script hash */ + case 196: /* Testnet script hash */ + if (outsz < (rv = 23)) + return rv; + out[0] = 0xa9; /* OP_HASH160 */ + out[1] = 0x14; /* push 20 bytes */ + memcpy(&out[2], &addrbin[1], 20); + out[22] = 0x87; /* OP_EQUAL */ + return rv; + default: + if (outsz < (rv = 25)) + return rv; + out[0] = 0x76; /* OP_DUP */ + out[1] = 0xa9; /* OP_HASH160 */ + out[2] = 0x14; /* push 20 bytes */ + memcpy(&out[3], &addrbin[1], 20); + out[23] = 0x88; /* OP_EQUALVERIFY */ + out[24] = 0xac; /* OP_CHECKSIG */ + return rv; + } } /* Subtract the `struct timeval' values X and Y, storing the result in RESULT. Return 1 if the difference is negative, otherwise 0. 
*/ int timeval_subtract(struct timeval *result, struct timeval *x, - struct timeval *y) -{ - /* Perform the carry for the later subtraction by updating Y. */ - if (x->tv_usec < y->tv_usec) { - int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1; - y->tv_usec -= 1000000 * nsec; - y->tv_sec += nsec; - } - if (x->tv_usec - y->tv_usec > 1000000) { - int nsec = (x->tv_usec - y->tv_usec) / 1000000; - y->tv_usec += 1000000 * nsec; - y->tv_sec -= nsec; - } - - /* Compute the time remaining to wait. - * `tv_usec' is certainly positive. */ - result->tv_sec = x->tv_sec - y->tv_sec; - result->tv_usec = x->tv_usec - y->tv_usec; - - /* Return 1 if result is negative. */ - return x->tv_sec < y->tv_sec; -} - -bool fulltest(const uint32_t *hash, const uint32_t *target) -{ - int i; - bool rc = true; - - for (i = 7; i >= 0; i--) { - if (hash[i] > target[i]) { - rc = false; - break; - } - if (hash[i] < target[i]) { - rc = true; - break; - } - } - - if (opt_debug) { - uint32_t hash_be[8], target_be[8]; - char hash_str[65], target_str[65]; - - for (i = 0; i < 8; i++) { - be32enc(hash_be + i, hash[7 - i]); - be32enc(target_be + i, target[7 - i]); - } - bin2hex(hash_str, (unsigned char *)hash_be, 32); - bin2hex(target_str, (unsigned char *)target_be, 32); - - applog(LOG_DEBUG, "DEBUG: %s\nHash: %s\nTarget: %s", - rc ? 
"hash <= target" - : "hash > target (false positive)", - hash_str, - target_str); - } - - return rc; -} - -void diff_to_target(uint32_t *target, double diff) -{ - uint64_t m; - int k; - - for (k = 6; k > 0 && diff > 1.0; k--) - diff /= 4294967296.0; - m = (uint64_t)(4294901760.0 / diff); - if (m == 0 && k == 6) - memset(target, 0xff, 32); - else { - memset(target, 0, 32); - target[k] = (uint32_t)m; - target[k + 1] = (uint32_t)(m >> 32); - } -} - -// Only used by stratum pools -void work_set_target(struct work* work, double diff) -{ - diff_to_target(work->target, diff); - work->targetdiff = diff; -} - -// Only used by longpoll pools -double target_to_diff(uint32_t* target) -{ - uchar* tgt = (uchar*) target; - uint64_t m = - (uint64_t)tgt[29] << 56 | - (uint64_t)tgt[28] << 48 | - (uint64_t)tgt[27] << 40 | - (uint64_t)tgt[26] << 32 | - (uint64_t)tgt[25] << 24 | - (uint64_t)tgt[24] << 16 | - (uint64_t)tgt[23] << 8 | - (uint64_t)tgt[22] << 0; - - if (!m) - return 0.; - else - return (double)0x0000ffff00000000/m; + struct timeval *y) { + /* Perform the carry for the later subtraction by updating Y. */ + if (x->tv_usec < y->tv_usec) { + int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1; + y->tv_usec -= 1000000 * nsec; + y->tv_sec += nsec; + } + if (x->tv_usec - y->tv_usec > 1000000) { + int nsec = (x->tv_usec - y->tv_usec) / 1000000; + y->tv_usec += 1000000 * nsec; + y->tv_sec -= nsec; + } + + /* Compute the time remaining to wait. + * `tv_usec' is certainly positive. */ + result->tv_sec = x->tv_sec - y->tv_sec; + result->tv_usec = x->tv_usec - y->tv_usec; + + /* Return 1 if result is negative. 
*/ + return x->tv_sec < y->tv_sec; +} + +// Deprecated +bool fulltest(const uint32_t *hash, const uint32_t *target) { + int i; + bool rc = true; + + for (i = 7; i >= 0; i--) { + if (hash[i] > target[i]) { + rc = false; + break; + } + if (hash[i] < target[i]) { + rc = true; + break; + } + } + + if (opt_debug) { + uint32_t hash_be[8], target_be[8]; + char hash_str[65], target_str[65]; + + for (i = 0; i < 8; i++) { + be32enc(hash_be + i, hash[7 - i]); + be32enc(target_be + i, target[7 - i]); + } + bin2hex(hash_str, (unsigned char *)hash_be, 32); + bin2hex(target_str, (unsigned char *)target_be, 32); + + applog(LOG_DEBUG, "DEBUG: %s\nHash: %s\nTarget: %s", + rc ? "hash <= target" : "hash > target (false positive)", hash_str, + target_str); + } + return rc; +} + +// Mathmatically the difficulty is simply the reciprocal of the hash: d = 1/h. +// Both are real numbers but the hash (target) is represented as a 256 bit +// fixed point number with the upper 32 bits representing the whole integer +// part and the lower 224 bits representing the fractional part: +// target[ 255:224 ] = trunc( 1/diff ) +// target[ 223: 0 ] = frac( 1/diff ) +// +// The 256 bit hash is exact but any floating point representation is not. +// Stratum provides the target difficulty as double precision, inexcact, +// which must be converted to a hash target. The converted hash target will +// likely be less precise due to inexact input and conversion error. +// On the other hand getwork provides a 256 bit hash target which is exact. +// +// How much precision is needed? +// +// 128 bit types are implemented in software by the compiler on 64 bit +// hardware resulting in lower performance and more error than would be +// expected with a hardware 128 bit implementaion. +// Float80 exploits the internals of the FP unit which provide a 64 bit +// mantissa in an 80 bit register with hardware rounding. When the destination +// is double the data is rounded to float64 format. 
Long double returns all +// 80 bits without rounding and including any accumulated computation error. +// Float80 does not fit efficiently in memory. +// +// Significant digits: +// 256 bit hash: 76 +// float: 7 (float32, 80 bits with rounding to 32 bits) +// double: 15 (float64, 80 bits with rounding to 64 bits) +// long double: 19 (float80, 80 bits with no rounding) +// __float128: 33 (128 bits with no rounding) +// uint32_t: 9 +// uint64_t: 19 +// uint128_t 38 +// +// The concept of significant digits doesn't apply to the 256 bit hash +// representation. It's fixed point making leading zeros significant, +// limiting its range and precision due to fewer zon-zero significant digits. +// +// Doing calculations with float128 and uint128 increases precision for +// target_to_diff, but doesn't help with stratum diff being limited to +// double precision. Is the extra precision really worth the extra cost? +// With float128 the error rate is 1/1e33 compared with 1/1e15 for double. +// For double that's 1 error in every petahash with a very low difficulty, +// not a likely situation. With higher difficulty effective precision +// increases. +// +// Unfortunately I can't get float128 to work so long double (float80) is +// as precise as it gets. +// All calculations will be done using long double then converted to double. +// This prevents introducing significant new error while taking advantage +// of HW rounding. + +#if defined(GCC_INT128) + +void diff_to_hash(uint32_t *target, const double diff) { + uint128_t *targ = (uint128_t *)target; + register long double m = 1. / diff; + // targ[0] = 0; + targ[0] = -1; + targ[1] = (uint128_t)(m * exp96); } +double hash_to_diff(const void *target) { + const uint128_t *targ = (const uint128_t *)target; + register long double m = ((long double)targ[1] / exp96); + // + ( (long double)targ[0] / exp160 ); + return (double)(1. 
/ m); +} + +inline bool valid_hash(const void *hash, const void *target) { + const uint128_t *h = (const uint128_t *)hash; + const uint128_t *t = (const uint128_t *)target; + if (h[1] > t[1]) + return false; + if (h[1] < t[1]) + return true; + if (h[0] > t[0]) + return false; + return true; +} + +#else + +void diff_to_hash(uint32_t *target, const double diff) { + uint64_t *targ = (uint64_t *)target; + register long double m = (1. / diff) * exp32; + // targ[1] = targ[0] = 0; + targ[1] = targ[0] = -1; + targ[3] = (uint64_t)m; + targ[2] = (uint64_t)((m - (long double)targ[3]) * exp64); +} + +double hash_to_diff(const void *target) { + const uint64_t *targ = (const uint64_t *)target; + register long double m = + ((long double)targ[3] / exp32) + ((long double)targ[2] / exp96); + return (double)(1. / m); +} + +inline bool valid_hash(const void *hash, const void *target) { + const uint64_t *h = (const uint64_t *)hash; + const uint64_t *t = (const uint64_t *)target; + if (h[3] > t[3]) + return false; + if (h[3] < t[3]) + return true; + if (h[2] > t[2]) + return false; + if (h[2] < t[2]) + return true; + if (h[1] > t[1]) + return false; + if (h[1] < t[1]) + return true; + if (h[0] > t[0]) + return false; + return true; +} + +#endif + #ifdef WIN32 #define socket_blocks() (WSAGetLastError() == WSAEWOULDBLOCK) #else #define socket_blocks() (errno == EAGAIN || errno == EWOULDBLOCK) #endif -static bool send_line(curl_socket_t sock, char *s) -{ - size_t sent = 0; - int len; +static bool send_line(struct stratum_ctx *sctx, char *s) { + size_t sent = 0; + int len; + + len = (int)strlen(s); + s[len++] = '\n'; - len = (int) strlen(s); - s[len++] = '\n'; + while (len > 0) { + struct timeval timeout = {0, 0}; + int n; + fd_set wd; - while (len > 0) { - struct timeval timeout = {0, 0}; - int n; - fd_set wd; + FD_ZERO(&wd); + FD_SET(sctx->sock, &wd); + if (select((int)(sctx->sock + 1), NULL, &wd, NULL, &timeout) < 1) + return false; - FD_ZERO(&wd); - FD_SET(sock, &wd); - if (select((int) 
(sock + 1), NULL, &wd, NULL, &timeout) < 1) - return false; - n = send(sock, s + sent, len, 0); - if (n < 0) { - if (!socket_blocks()) - return false; - n = 0; - } - sent += n; - len -= n; - } +#if LIBCURL_VERSION_NUM >= 0x071802 - return true; + CURLcode rc = curl_easy_send(sctx->curl, s + sent, len, (size_t *)&n); + if (rc != CURLE_OK) { + if (rc != CURLE_AGAIN) +#else + n = send(sock, s + sent, len, 0); + if (n < 0) { + if (!socket_blocks()) +#endif + return false; + n = 0; + } + sent += n; + len -= n; + } + + return true; } -bool stratum_send_line(struct stratum_ctx *sctx, char *s) -{ - bool ret = false; +bool stratum_send_line(struct stratum_ctx *sctx, char *s) { + bool ret = false; - if (opt_protocol) - applog(LOG_DEBUG, "> %s", s); + if (opt_protocol) + applog(LOG_DEBUG, "> %s", s); - pthread_mutex_lock(&sctx->sock_lock); - ret = send_line(sctx->sock, s); - pthread_mutex_unlock(&sctx->sock_lock); + pthread_mutex_lock(&sctx->sock_lock); + ret = send_line(sctx, s); + pthread_mutex_unlock(&sctx->sock_lock); - return ret; + return ret; } -static bool socket_full(curl_socket_t sock, int timeout) -{ - struct timeval tv; - fd_set rd; +static bool socket_full(curl_socket_t sock, int timeout) { + struct timeval tv; + fd_set rd; - FD_ZERO(&rd); - FD_SET(sock, &rd); - tv.tv_sec = timeout; - tv.tv_usec = 0; - if (select((int)(sock + 1), &rd, NULL, NULL, &tv) > 0) - return true; - return false; + FD_ZERO(&rd); + FD_SET(sock, &rd); + tv.tv_sec = timeout; + tv.tv_usec = 0; + if (select((int)(sock + 1), &rd, NULL, NULL, &tv) > 0) + return true; + return false; } -bool stratum_socket_full(struct stratum_ctx *sctx, int timeout) -{ - return strlen(sctx->sockbuf) || socket_full(sctx->sock, timeout); +bool stratum_socket_full(struct stratum_ctx *sctx, int timeout) { + return strlen(sctx->sockbuf) || socket_full(sctx->sock, timeout); } #define RBUFSIZE 2048 #define RECVSIZE (RBUFSIZE - 4) -static void stratum_buffer_append(struct stratum_ctx *sctx, const char *s) -{ - size_t old, 
n; - - old = strlen(sctx->sockbuf); - n = old + strlen(s) + 1; - if (n >= sctx->sockbuf_size) { - sctx->sockbuf_size = n + (RBUFSIZE - (n % RBUFSIZE)); - sctx->sockbuf = (char*) realloc(sctx->sockbuf, sctx->sockbuf_size); - } - strcpy(sctx->sockbuf + old, s); -} - -char *stratum_recv_line(struct stratum_ctx *sctx) -{ - ssize_t len, buflen; - char *tok, *sret = NULL; - - if (!strstr(sctx->sockbuf, "\n")) { - bool ret = true; - time_t rstart; - - time(&rstart); - if (!socket_full(sctx->sock, 60)) { - applog(LOG_WARNING, "stratum_recv_line timed out"); - goto out; - } - do { - char s[RBUFSIZE]; - ssize_t n; - - memset(s, 0, RBUFSIZE); - n = recv(sctx->sock, s, RECVSIZE, 0); - if (!n) { - ret = false; - break; - } - if (n < 0) { - if (!socket_blocks() || !socket_full(sctx->sock, 1)) { - ret = false; - break; - } - } else - stratum_buffer_append(sctx, s); - } while (time(NULL) - rstart < 60 && !strstr(sctx->sockbuf, "\n")); - - if (!ret) { - applog(LOG_WARNING, "stratum_recv_line failed"); - goto out; - } - } - - buflen = (ssize_t) strlen(sctx->sockbuf); - tok = strtok(sctx->sockbuf, "\n"); - if (!tok) { - applog(LOG_ERR, "stratum_recv_line failed to parse a newline-terminated string"); - goto out; - } - sret = strdup(tok); - len = (ssize_t) strlen(sret); - - if (buflen > len + 1) - memmove(sctx->sockbuf, sctx->sockbuf + len + 1, buflen - len + 1); - else - sctx->sockbuf[0] = '\0'; +static void stratum_buffer_append(struct stratum_ctx *sctx, const char *s) { + size_t old, n; + + old = strlen(sctx->sockbuf); + n = old + strlen(s) + 1; + if (n >= sctx->sockbuf_size) { + sctx->sockbuf_size = n + (RBUFSIZE - (n % RBUFSIZE)); + sctx->sockbuf = (char *)realloc(sctx->sockbuf, sctx->sockbuf_size); + } + strcpy(sctx->sockbuf + old, s); +} + +char *stratum_recv_line(struct stratum_ctx *sctx) { + ssize_t len, buflen; + char *tok, *sret = NULL; + + if (!strstr(sctx->sockbuf, "\n")) { + bool ret = true; + time_t rstart; + + time(&rstart); + if (!socket_full(sctx->sock, 60)) { + 
applog(LOG_WARNING, "stratum_recv_line timed out"); + goto out; + } + do { + char s[RBUFSIZE]; + ssize_t n; + + memset(s, 0, RBUFSIZE); + +#if LIBCURL_VERSION_NUM >= 0x071802 + + CURLcode rc = curl_easy_recv(sctx->curl, s, RECVSIZE, (size_t *)&n); + if (rc == CURLE_OK && !n) { + ret = false; + break; + } + if (rc != CURLE_OK) { + if (rc != CURLE_AGAIN || !socket_full(sctx->sock, 1)) { +#else + + n = recv(sctx->sock, s, RECVSIZE, 0); + if (!n) { + ret = false; + break; + } + if (n < 0) { + if (!socket_blocks() || !socket_full(sctx->sock, 1)) { +#endif + ret = false; + break; + } + } else + stratum_buffer_append(sctx, s); + } while (time(NULL) - rstart < 60 && !strstr(sctx->sockbuf, "\n")); + + if (!ret) { + applog(LOG_WARNING, "stratum_recv_line failed"); + goto out; + } + } + + buflen = (ssize_t)strlen(sctx->sockbuf); + tok = strtok(sctx->sockbuf, "\n"); + if (!tok) { + applog(LOG_ERR, + "stratum_recv_line failed to parse a newline-terminated string"); + goto out; + } + sret = strdup(tok); + len = (ssize_t)strlen(sret); + + if (buflen > len + 1) + memmove(sctx->sockbuf, sctx->sockbuf + len + 1, buflen - len + 1); + else + sctx->sockbuf[0] = '\0'; out: - if (sret && opt_protocol) - applog(LOG_DEBUG, "< %s", sret); - return sret; + if (sret && opt_protocol) + applog(LOG_DEBUG, "< %s", sret); + return sret; } #if LIBCURL_VERSION_NUM >= 0x071101 static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose, - struct curl_sockaddr *addr) -{ - curl_socket_t *sock = (curl_socket_t*) clientp; - *sock = socket(addr->family, addr->socktype, addr->protocol); - return *sock; + struct curl_sockaddr *addr) { + curl_socket_t *sock = (curl_socket_t *)clientp; + *sock = socket(addr->family, addr->socktype, addr->protocol); + return *sock; } #endif -bool stratum_connect(struct stratum_ctx *sctx, const char *url) -{ - CURL *curl; - int rc; - - pthread_mutex_lock(&sctx->sock_lock); - if (sctx->curl) - curl_easy_cleanup(sctx->curl); - sctx->curl = curl_easy_init(); - if 
(!sctx->curl) { - applog(LOG_ERR, "CURL initialization failed"); - pthread_mutex_unlock(&sctx->sock_lock); - return false; - } - curl = sctx->curl; - if (!sctx->sockbuf) { - sctx->sockbuf = (char*) calloc(RBUFSIZE, 1); - sctx->sockbuf_size = RBUFSIZE; - } - sctx->sockbuf[0] = '\0'; - pthread_mutex_unlock(&sctx->sock_lock); - if (url != sctx->url) { - free(sctx->url); - sctx->url = strdup(url); - } - free(sctx->curl_url); - sctx->curl_url = (char*) malloc(strlen(url)); - sprintf(sctx->curl_url, "http%s", strstr(url, "://")); - - if (opt_protocol) - curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); - curl_easy_setopt(curl, CURLOPT_URL, sctx->curl_url); - curl_easy_setopt(curl, CURLOPT_FRESH_CONNECT, 1); - curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 30); - curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, sctx->curl_err_str); - curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); - curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); - if (opt_proxy) { - curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); - curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); - } - curl_easy_setopt(curl, CURLOPT_HTTPPROXYTUNNEL, 1); +bool stratum_connect(struct stratum_ctx *sctx, const char *url) { + CURL *curl; + int rc; + + pthread_mutex_lock(&sctx->sock_lock); + if (sctx->curl) + curl_easy_cleanup(sctx->curl); + sctx->curl = curl_easy_init(); + if (!sctx->curl) { + applog(LOG_ERR, "CURL initialization failed"); + pthread_mutex_unlock(&sctx->sock_lock); + return false; + } + curl = sctx->curl; + if (!sctx->sockbuf) { + sctx->sockbuf = (char *)calloc(RBUFSIZE, 1); + sctx->sockbuf_size = RBUFSIZE; + } + sctx->sockbuf[0] = '\0'; + pthread_mutex_unlock(&sctx->sock_lock); + if (url != sctx->url) { + free(sctx->url); + sctx->url = strdup(url); + } + free(sctx->curl_url); + sctx->curl_url = (char *)malloc(strlen(url)); + sprintf(sctx->curl_url, "http%s", + strstr(url, "s://") ? 
strstr(url, "s://") : strstr(url, "://")); + + if (opt_protocol) + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); + curl_easy_setopt(curl, CURLOPT_URL, sctx->curl_url); + curl_easy_setopt(curl, CURLOPT_FRESH_CONNECT, 1); + curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 30); + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, sctx->curl_err_str); + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0); + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0); + if (opt_proxy) { + curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); + } + curl_easy_setopt(curl, CURLOPT_HTTPPROXYTUNNEL, 1); #if LIBCURL_VERSION_NUM >= 0x070f06 - curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); + curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); #endif #if LIBCURL_VERSION_NUM >= 0x071101 - curl_easy_setopt(curl, CURLOPT_OPENSOCKETFUNCTION, opensocket_grab_cb); - curl_easy_setopt(curl, CURLOPT_OPENSOCKETDATA, &sctx->sock); + curl_easy_setopt(curl, CURLOPT_OPENSOCKETFUNCTION, opensocket_grab_cb); + curl_easy_setopt(curl, CURLOPT_OPENSOCKETDATA, &sctx->sock); #endif - curl_easy_setopt(curl, CURLOPT_CONNECT_ONLY, 1); + curl_easy_setopt(curl, CURLOPT_CONNECT_ONLY, 1); - rc = curl_easy_perform(curl); - if (rc) { - applog(LOG_ERR, "Stratum connection failed: %s", sctx->curl_err_str); - curl_easy_cleanup(curl); - sctx->curl = NULL; - return false; - } + rc = curl_easy_perform(curl); + if (rc) { + applog(LOG_ERR, "Stratum connection failed: %s", sctx->curl_err_str); + curl_easy_cleanup(curl); + sctx->curl = NULL; + return false; + } #if LIBCURL_VERSION_NUM < 0x071101 - /* CURLINFO_LASTSOCKET is broken on Win64; only use it as a last resort */ - curl_easy_getinfo(curl, CURLINFO_LASTSOCKET, (long *)&sctx->sock); + /* CURLINFO_LASTSOCKET is broken on Win64; only use it as a last resort */ + curl_easy_getinfo(curl, 
CURLINFO_LASTSOCKET, (long *)&sctx->sock); #endif - return true; -} - -void stratum_disconnect(struct stratum_ctx *sctx) -{ - pthread_mutex_lock(&sctx->sock_lock); - if (sctx->curl) { - curl_easy_cleanup(sctx->curl); - sctx->curl = NULL; - sctx->sockbuf[0] = '\0'; - } - pthread_mutex_unlock(&sctx->sock_lock); -} - -static const char *get_stratum_session_id(json_t *val) -{ - json_t *arr_val; - int i, n; - - arr_val = json_array_get(val, 0); - if (!arr_val || !json_is_array(arr_val)) - return NULL; - n = (int) json_array_size(arr_val); - for (i = 0; i < n; i++) { - const char *notify; - json_t *arr = json_array_get(arr_val, i); - - if (!arr || !json_is_array(arr)) - break; - notify = json_string_value(json_array_get(arr, 0)); - if (!notify) - continue; - if (!strcasecmp(notify, "mining.notify")) - return json_string_value(json_array_get(arr, 1)); - } - return NULL; -} - -static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, int pndx) -{ - const char* xnonce1; - int xn2_size; - - xnonce1 = json_string_value(json_array_get(params, pndx)); - if (!xnonce1) { - applog(LOG_ERR, "Failed to get extranonce1"); - goto out; - } - xn2_size = (int) json_integer_value(json_array_get(params, pndx+1)); - if (!xn2_size) { - applog(LOG_ERR, "Failed to get extranonce2_size"); - goto out; - } - if (xn2_size < 2 || xn2_size > 16) { - applog(LOG_INFO, "Failed to get valid n2size in parse_extranonce"); - goto out; - } - - pthread_mutex_lock(&sctx->work_lock); - if (sctx->xnonce1) - free(sctx->xnonce1); - sctx->xnonce1_size = strlen(xnonce1) / 2; - sctx->xnonce1 = (uchar*) calloc(1, sctx->xnonce1_size); - if (unlikely(!sctx->xnonce1)) { - applog(LOG_ERR, "Failed to alloc xnonce1"); - pthread_mutex_unlock(&sctx->work_lock); - goto out; - } - hex2bin(sctx->xnonce1, xnonce1, sctx->xnonce1_size); - sctx->xnonce2_size = xn2_size; - pthread_mutex_unlock(&sctx->work_lock); - - if (pndx == 0 && opt_debug) /* pool dynamic change */ - applog(LOG_DEBUG, "Stratum set nonce %s 
with extranonce2 size=%d", - xnonce1, xn2_size); - - return true; + return true; +} + +void stratum_disconnect(struct stratum_ctx *sctx) { + pthread_mutex_lock(&sctx->sock_lock); + if (sctx->curl) { + curl_easy_cleanup(sctx->curl); + sctx->curl = NULL; + sctx->sockbuf[0] = '\0'; + } + pthread_mutex_unlock(&sctx->sock_lock); +} + +static const char *get_stratum_session_id(json_t *val) { + json_t *arr_val; + int i, n; + + arr_val = json_array_get(val, 0); + if (!arr_val || !json_is_array(arr_val)) + return NULL; + n = (int)json_array_size(arr_val); + for (i = 0; i < n; i++) { + const char *notify; + json_t *arr = json_array_get(arr_val, i); + + if (!arr || !json_is_array(arr)) + break; + notify = json_string_value(json_array_get(arr, 0)); + if (!notify) + continue; + if (!strcasecmp(notify, "mining.notify")) + return json_string_value(json_array_get(arr, 1)); + } + return NULL; +} + +static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, + int pndx) { + const char *xnonce1; + int xn2_size; + + xnonce1 = json_string_value(json_array_get(params, pndx)); + if (!xnonce1) { + applog(LOG_ERR, "Failed to get extranonce1"); + goto out; + } + xn2_size = (int)json_integer_value(json_array_get(params, pndx + 1)); + if (!xn2_size) { + applog(LOG_ERR, "Failed to get extranonce2_size"); + goto out; + } + if (xn2_size < 2 || xn2_size > 16) { + applog(LOG_INFO, "Failed to get valid n2size in parse_extranonce"); + goto out; + } + + pthread_mutex_lock(&sctx->work_lock); + if (sctx->xnonce1) + free(sctx->xnonce1); + sctx->xnonce1_size = strlen(xnonce1) / 2; + sctx->xnonce1 = (uchar *)calloc(1, sctx->xnonce1_size); + if (unlikely(!sctx->xnonce1)) { + applog(LOG_ERR, "Failed to alloc xnonce1"); + pthread_mutex_unlock(&sctx->work_lock); + goto out; + } + hex2bin(sctx->xnonce1, xnonce1, sctx->xnonce1_size); + sctx->xnonce2_size = xn2_size; + pthread_mutex_unlock(&sctx->work_lock); + + if (!opt_quiet && !sctx->dev) /* pool dynamic change */ + applog(LOG_INFO, 
"Stratum extranonce1= %s, extranonce2 size= %d", xnonce1, + xn2_size); + // if (pndx == 0 && opt_debug) + // applog(LOG_DEBUG, "Stratum set nonce %s with extranonce2 + // size=%d", xnonce1, xn2_size); + + return true; out: - return false; + return false; } -bool stratum_subscribe(struct stratum_ctx *sctx) -{ - char *s, *sret = NULL; - const char *sid; - json_t *val = NULL, *res_val, *err_val; - json_error_t err; - bool ret = false, retry = false; - - if (jsonrpc_2) - return true; +bool stratum_subscribe(struct stratum_ctx *sctx) { + char *s, *sret = NULL; + const char *sid; + json_t *val = NULL, *res_val, *err_val; + json_error_t err; + bool ret = false, retry = false; start: - s = (char*) malloc(128 + (sctx->session_id ? strlen(sctx->session_id) : 0)); - if (retry) - sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": []}"); - else if (sctx->session_id) - sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\", \"%s\"]}", sctx->session_id); - else - sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\"]}"); - - if (!stratum_send_line(sctx, s)) { - applog(LOG_ERR, "stratum_subscribe send failed"); - goto out; - } - - if (!socket_full(sctx->sock, 30)) { - applog(LOG_ERR, "stratum_subscribe timed out"); - goto out; - } - - sret = stratum_recv_line(sctx); - if (!sret) - goto out; - - val = JSON_LOADS(sret, &err); - free(sret); - if (!val) { - applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - goto out; - } - - res_val = json_object_get(val, "result"); - err_val = json_object_get(val, "error"); - - if (!res_val || json_is_null(res_val) || - (err_val && !json_is_null(err_val))) { - if (opt_debug || retry) { - free(s); - if (err_val) - s = json_dumps(err_val, JSON_INDENT(3)); - else - s = strdup("(unknown reason)"); - applog(LOG_ERR, "JSON-RPC call failed: %s", s); - } - goto out; - } - - sid = get_stratum_session_id(res_val); - if (opt_debug && sid) - 
applog(LOG_DEBUG, "Stratum session id: %s", sid); - - pthread_mutex_lock(&sctx->work_lock); - if (sctx->session_id) - free(sctx->session_id); - sctx->session_id = sid ? strdup(sid) : NULL; - sctx->next_diff = 1.0; - pthread_mutex_unlock(&sctx->work_lock); - - // sid is param 1, extranonce params are 2 and 3 - if (!stratum_parse_extranonce(sctx, res_val, 1)) { - goto out; - } - - ret = true; + s = (char *)malloc(128 + (sctx->session_id ? strlen(sctx->session_id) : 0)); + if (retry) + sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": []}"); + else if (sctx->session_id) + sprintf(s, + "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": " + "[\"" USER_AGENT "\", \"%s\"]}", + sctx->session_id); + else + sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": " + "[\"" USER_AGENT "\"]}"); + + if (!stratum_send_line(sctx, s)) { + applog(LOG_ERR, "stratum_subscribe send failed"); + goto out; + } + + if (!socket_full(sctx->sock, 30)) { + applog(LOG_ERR, "stratum_subscribe timed out"); + goto out; + } + + sret = stratum_recv_line(sctx); + if (!sret) + goto out; + + val = JSON_LOADS(sret, &err); + free(sret); + if (!val) { + applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); + goto out; + } + + res_val = json_object_get(val, "result"); + err_val = json_object_get(val, "error"); + + if (!res_val || json_is_null(res_val) || + (err_val && !json_is_null(err_val))) { + if (opt_debug || retry) { + free(s); + if (err_val) + s = json_dumps(err_val, JSON_INDENT(3)); + else + s = strdup("(unknown reason)"); + applog(LOG_ERR, "JSON-RPC call failed: %s", s); + } + goto out; + } + + sid = get_stratum_session_id(res_val); + if (opt_debug && sid) + applog(LOG_DEBUG, "Stratum session id: %s", sid); + + pthread_mutex_lock(&sctx->work_lock); + if (sctx->session_id) + free(sctx->session_id); + sctx->session_id = sid ? 
strdup(sid) : NULL; + sctx->next_diff = 1.0; + pthread_mutex_unlock(&sctx->work_lock); + + // sid is param 1, extranonce params are 2 and 3 + if (!stratum_parse_extranonce(sctx, res_val, 1)) { + goto out; + } + + ret = true; out: - free(s); - if (val) - json_decref(val); - - if (!ret) { - if (sret && !retry) { - retry = true; - goto start; - } - } - - return ret; -} - -extern bool opt_extranonce; - -bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass) -{ - json_t *val = NULL, *res_val, *err_val; - char *s, *sret; - json_error_t err; - bool ret = false; - - if (jsonrpc_2) { - s = (char*) malloc(300 + strlen(user) + strlen(pass)); - sprintf(s, "{\"method\": \"login\", \"params\": {" - "\"login\": \"%s\", \"pass\": \"%s\", \"agent\": \"%s\"}, \"id\": 1}", - user, pass, USER_AGENT); - } else { - s = (char*) malloc(80 + strlen(user) + strlen(pass)); - sprintf(s, "{\"id\": 2, \"method\": \"mining.authorize\", \"params\": [\"%s\", \"%s\"]}", - user, pass); - } - - if (!stratum_send_line(sctx, s)) - goto out; - - while (1) { - sret = stratum_recv_line(sctx); - if (!sret) - goto out; - if (!stratum_handle_method(sctx, sret)) - break; - free(sret); - } - - val = JSON_LOADS(sret, &err); - free(sret); - if (!val) { - applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - goto out; - } - - res_val = json_object_get(val, "result"); - err_val = json_object_get(val, "error"); - - if (!res_val || json_is_false(res_val) || - (err_val && !json_is_null(err_val))) { - applog(LOG_ERR, "Stratum authentication failed"); - goto out; - } - - if (jsonrpc_2) { - rpc2_login_decode(val); - json_t *job_val = json_object_get(res_val, "job"); - pthread_mutex_lock(&sctx->work_lock); - if(job_val) rpc2_job_decode(job_val, &sctx->work); - sctx->job.job_id = strdup(sctx->work.job_id); - pthread_mutex_unlock(&sctx->work_lock); - } - - ret = true; - - if (!opt_extranonce) - goto out; - - // subscribe to extranonce (optional) - sprintf(s, "{\"id\": 3, 
\"method\": \"mining.extranonce.subscribe\", \"params\": []}"); - - if (!stratum_send_line(sctx, s)) - goto out; - - if (!socket_full(sctx->sock, 3)) { - if (opt_debug) - applog(LOG_DEBUG, "stratum extranonce subscribe timed out"); - goto out; - } - - sret = stratum_recv_line(sctx); - if (sret) { - json_t *extra = JSON_LOADS(sret, &err); - if (!extra) { - applog(LOG_WARNING, "JSON decode failed(%d): %s", err.line, err.text); - } else { - if (json_integer_value(json_object_get(extra, "id")) != 3) { - // we receive a standard method if extranonce is ignored - if (!stratum_handle_method(sctx, sret)) - applog(LOG_WARNING, "Stratum answer id is not correct!"); - } -// res_val = json_object_get(extra, "result"); -// if (opt_debug && (!res_val || json_is_false(res_val))) -// applog(LOG_DEBUG, "extranonce subscribe not supported"); - json_decref(extra); - } - free(sret); - } + free(s); + if (val) + json_decref(val); + + if (!ret) { + if (sret && !retry) { + retry = true; + goto start; + } + } + + return ret; +} + +bool stratum_authorize(struct stratum_ctx *sctx, const char *user, + const char *pass) { + json_t *val = NULL, *res_val, *err_val; + char *s, *sret; + json_error_t err; + bool ret = false; + + s = (char *)malloc(80 + strlen(user) + strlen(pass)); + sprintf(s, + "{\"id\": 2, \"method\": \"mining.authorize\", \"params\": [\"%s\", " + "\"%s\"]}", + user, pass); + + if (!stratum_send_line(sctx, s)) + goto out; + + while (1) { + sret = stratum_recv_line(sctx); + if (!sret) + goto out; + if (!stratum_handle_method(sctx, sret)) + break; + free(sret); + } + + val = JSON_LOADS(sret, &err); + free(sret); + if (!val) { + applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); + goto out; + } + + res_val = json_object_get(val, "result"); + err_val = json_object_get(val, "error"); + + if (!res_val || json_is_false(res_val) || + (err_val && !json_is_null(err_val))) { + if (false) { + applog(LOG_ERR, "Dev stratum authentication failed"); + + } else { + 
applog(LOG_ERR, "Stratum authentication failed"); + } + goto out; + } + + ret = true; + + if (!opt_extranonce) + goto out; + + // subscribe to extranonce (optional) + sprintf(s, "{\"id\": 3, \"method\": \"mining.extranonce.subscribe\", " + "\"params\": []}"); + + if (!stratum_send_line(sctx, s)) + goto out; + + if (!sctx->dev || opt_debug) { + if (!socket_full(sctx->sock, 3)) { + applog(LOG_WARNING, "Extranonce disabled, subscribe timed out"); + opt_extranonce = false; + goto out; + } + if (!opt_quiet) + applog(LOG_INFO, "Extranonce subscription enabled"); + } + + sret = stratum_recv_line(sctx); + if (sret) { + json_t *extra = JSON_LOADS(sret, &err); + if (!extra) { + applog(LOG_WARNING, "JSON decode failed(%d): %s", err.line, err.text); + } else { + if (json_integer_value(json_object_get(extra, "id")) != 3) { + // we receive a standard method if extranonce is ignored + if (!stratum_handle_method(sctx, sret)) + applog(LOG_WARNING, "Stratum answer id is not correct!"); + } + res_val = json_object_get(extra, "result"); + // if (opt_debug && (!res_val || + // json_is_false(res_val))) applog(LOG_DEBUG, "extranonce subscribe not + // supported"); + json_decref(extra); + } + free(sret); + } out: - free(s); - if (val) - json_decref(val); - - return ret; -} - -// -------------------- RPC 2.0 (XMR/AEON) ------------------------- - -//extern pthread_mutex_t rpc2_login_lock; -//extern pthread_mutex_t rpc2_job_lock; - -bool rpc2_login_decode(const json_t *val) -{ - const char *id; - const char *s; - - json_t *res = json_object_get(val, "result"); - if(!res) { - applog(LOG_ERR, "JSON invalid result"); - goto err_out; - } - - json_t *tmp; - tmp = json_object_get(res, "id"); - if(!tmp) { - applog(LOG_ERR, "JSON inval id"); - goto err_out; - } - id = json_string_value(tmp); - if(!id) { - applog(LOG_ERR, "JSON id is not a string"); - goto err_out; - } - - memcpy(&rpc2_id, id, 64); - - if(opt_debug) - applog(LOG_DEBUG, "Auth id: %s", id); - - tmp = json_object_get(res, "status"); - 
if(!tmp) { - applog(LOG_ERR, "JSON inval status"); - goto err_out; - } - s = json_string_value(tmp); - if(!s) { - applog(LOG_ERR, "JSON status is not a string"); - goto err_out; - } - if(strcmp(s, "OK")) { - applog(LOG_ERR, "JSON returned status \"%s\"", s); - return false; - } - - return true; - -err_out: - applog(LOG_WARNING,"%s: fail", __func__); - return false; -} - -json_t* json_rpc2_call_recur(CURL *curl, const char *url, const char *userpass, - json_t *rpc_req, int *curl_err, int flags, int recur) -{ - if(recur >= 5) { - if(opt_debug) - applog(LOG_DEBUG, "Failed to call rpc command after %i tries", recur); - return NULL; - } - if(!strcmp(rpc2_id, "")) { - if(opt_debug) - applog(LOG_DEBUG, "Tried to call rpc2 command before authentication"); - return NULL; - } - json_t *params = json_object_get(rpc_req, "params"); - if (params) { - json_t *auth_id = json_object_get(params, "id"); - if (auth_id) { - json_string_set(auth_id, rpc2_id); - } - } - json_t *res = json_rpc_call(curl, url, userpass, json_dumps(rpc_req, 0), - curl_err, flags | JSON_RPC_IGNOREERR); - if(!res) goto end; - json_t *error = json_object_get(res, "error"); - if(!error) goto end; - json_t *message; - if(json_is_string(error)) - message = error; - else - message = json_object_get(error, "message"); - if(!message || !json_is_string(message)) goto end; - const char *mes = json_string_value(message); - if(!strcmp(mes, "Unauthenticated")) { - pthread_mutex_lock(&rpc2_login_lock); - rpc2_login(curl); - sleep(1); - pthread_mutex_unlock(&rpc2_login_lock); - return json_rpc2_call_recur(curl, url, userpass, rpc_req, - curl_err, flags, recur + 1); - } else if(!strcmp(mes, "Low difficulty share") || !strcmp(mes, "Block expired") || !strcmp(mes, "Invalid job id") || !strcmp(mes, "Duplicate share")) { - json_t *result = json_object_get(res, "result"); - if(!result) { - goto end; - } - json_object_set(result, "reject-reason", json_string(mes)); - } else { - applog(LOG_ERR, "json_rpc2.0 error: %s", mes); - 
return NULL; - } - end: - return res; -} - -json_t *json_rpc2_call(CURL *curl, const char *url, const char *userpass, const char *rpc_req, int *curl_err, int flags) -{ - json_t* req_json = JSON_LOADS(rpc_req, NULL); - json_t* res = json_rpc2_call_recur(curl, url, userpass, req_json, curl_err, flags, 0); - json_decref(req_json); - return res; -} - -bool rpc2_job_decode(const json_t *job, struct work *work) -{ - if (!jsonrpc_2) { - applog(LOG_ERR, "Tried to decode job without JSON-RPC 2.0"); - return false; - } - json_t *tmp; - tmp = json_object_get(job, "job_id"); - if (!tmp) { - applog(LOG_ERR, "JSON invalid job id"); - goto err_out; - } - const char *job_id = json_string_value(tmp); - tmp = json_object_get(job, "blob"); - if (!tmp) { - applog(LOG_ERR, "JSON invalid blob"); - goto err_out; - } - const char *hexblob = json_string_value(tmp); - size_t blobLen = strlen(hexblob); - if (blobLen % 2 != 0 || ((blobLen / 2) < 40 && blobLen != 0) || (blobLen / 2) > 128) { - applog(LOG_ERR, "JSON invalid blob length"); - goto err_out; - } - if (blobLen != 0) { - uint32_t target = 0; - pthread_mutex_lock(&rpc2_job_lock); - uchar *blob = (uchar*) malloc(blobLen / 2); - if (!hex2bin(blob, hexblob, blobLen / 2)) { - applog(LOG_ERR, "JSON invalid blob"); - pthread_mutex_unlock(&rpc2_job_lock); - goto err_out; - } - rpc2_bloblen = blobLen / 2; - if (rpc2_blob) free(rpc2_blob); - rpc2_blob = (char*) malloc(rpc2_bloblen); - if (!rpc2_blob) { - applog(LOG_ERR, "RPC2 OOM!"); - goto err_out; - } - memcpy(rpc2_blob, blob, blobLen / 2); - free(blob); - - jobj_binary(job, "target", &target, 4); - if(rpc2_target != target) - { - double hashrate = 0.0; - pthread_mutex_lock(&stats_lock); - for (int i = 0; i < opt_n_threads; i++) - hashrate += thr_hashrates[i]; - pthread_mutex_unlock(&stats_lock); - double diff = trunc( ( ((double)0xffffffff) / target ) ); - if ( !opt_quiet ) - // xmr pool diff can change a lot... 
- applog(LOG_WARNING, "Stratum difficulty set to %g", diff); - stratum_diff = diff; - rpc2_target = target; - } - - if (rpc2_job_id) free(rpc2_job_id); - rpc2_job_id = strdup(job_id); - pthread_mutex_unlock(&rpc2_job_lock); - } - if(work) { - if (!rpc2_blob) { - applog(LOG_WARNING, "Work requested before it was received"); - goto err_out; - } - memcpy(work->data, rpc2_blob, rpc2_bloblen); - memset(work->target, 0xff, sizeof(work->target)); - work->target[7] = rpc2_target; - if (work->job_id) free(work->job_id); - work->job_id = strdup(rpc2_job_id); - } - return true; + free(s); + if (val) + json_decref(val); -err_out: - applog(LOG_WARNING, "%s", __func__); - return false; + return ret; } /** * Extract bloc height L H... here len=3, height=0x1333e8 * "...0000000000ffffffff2703e83313062f503253482f043d61105408" */ -static uint32_t getblocheight(struct stratum_ctx *sctx) -{ - uint32_t height = 0; - uint8_t hlen = 0, *p, *m; - - // find 0xffff tag - p = (uint8_t*) sctx->job.coinbase + 32; - m = p + 128; - while (*p != 0xff && p < m) p++; - while (*p == 0xff && p < m) p++; - if (*(p-1) == 0xff && *(p-2) == 0xff) { - p++; hlen = *p; - p++; height = le16dec(p); - p += 2; - switch (hlen) { - case 4: - height += 0x10000UL * le16dec(p); - break; - case 3: - height += 0x10000UL * (*p); - break; - } - } - return height; -} - -static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) -{ - const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime; - const char *denom10 = NULL, *denom100 = NULL, *denom1000 = NULL, - *denom10000 = NULL, *prooffullnode = NULL; - const char *extradata = NULL; - size_t coinb1_size, coinb2_size; - bool clean, ret = false; - int merkle_count, i, p = 0; - json_t *merkle_arr; - uchar **merkle = NULL; - int jsize = json_array_size(params); - bool has_claim = ( opt_algo == ALGO_LBRY ) && ( jsize == 10 ); - bool has_roots = ( opt_algo == ALGO_PHI2 ) && ( jsize == 10 ); - bool is_veil = ( opt_algo == ALGO_X16RT_VEIL ); - - job_id = 
json_string_value(json_array_get(params, p++)); - prevhash = json_string_value(json_array_get(params, p++)); - if ( has_claim ) - { - extradata = json_string_value(json_array_get(params, p++)); - if ( !extradata || strlen( extradata ) != 64 ) - { - applog(LOG_ERR, "Stratum notify: invalid claim parameter"); - goto out; - } - } - else if ( has_roots ) - { - extradata = json_string_value(json_array_get(params, p++)); - if ( !extradata || strlen( extradata ) != 128 ) - { - applog(LOG_ERR, "Stratum notify: invalid UTXO root parameter"); - goto out; - } - } - if ( is_veil ) - { - denom10 = json_string_value(json_array_get(params, p++)); - denom100 = json_string_value(json_array_get(params, p++)); - denom1000 = json_string_value(json_array_get(params, p++)); - denom10000 = json_string_value(json_array_get(params, p++)); - prooffullnode = json_string_value(json_array_get(params, p++)); - } - - coinb1 = json_string_value(json_array_get(params, p++)); - coinb2 = json_string_value(json_array_get(params, p++)); - merkle_arr = json_array_get(params, p++); - if (!merkle_arr || !json_is_array(merkle_arr)) - goto out; - merkle_count = (int) json_array_size(merkle_arr); - version = json_string_value(json_array_get(params, p++)); - nbits = json_string_value(json_array_get(params, p++)); - stime = json_string_value(json_array_get(params, p++)); - clean = json_is_true(json_array_get(params, p)); p++; - - if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime || - strlen(prevhash) != 64 || strlen(version) != 8 || - strlen(nbits) != 8 || strlen(stime) != 8) { - applog(LOG_ERR, "Stratum notify: invalid parameters"); - goto out; - } - - if ( is_veil ) - { - if ( !denom10 || !denom100 || !denom1000 || !denom10000 - || !prooffullnode || strlen(denom10) != 64 || strlen(denom100) != 64 - || strlen(denom1000) != 64 || strlen(denom10000) != 64 - || strlen(prooffullnode) != 64 ) - { - applog(LOG_ERR, "Stratum notify: invalid veil parameters"); - goto out; - } - } - - if 
( merkle_count ) - merkle = (uchar**) malloc(merkle_count * sizeof(char *)); - for ( i = 0; i < merkle_count; i++ ) - { - const char *s = json_string_value(json_array_get(merkle_arr, i)); - if (!s || strlen(s) != 64) { - while (i--) - free(merkle[i]); - free(merkle); - applog(LOG_ERR, "Stratum notify: invalid Merkle branch"); - goto out; - } - merkle[i] = (uchar*) malloc(32); - hex2bin(merkle[i], s, 32); - } - - pthread_mutex_lock(&sctx->work_lock); - - coinb1_size = strlen(coinb1) / 2; - coinb2_size = strlen(coinb2) / 2; - sctx->job.coinbase_size = coinb1_size + sctx->xnonce1_size + - sctx->xnonce2_size + coinb2_size; - sctx->job.coinbase = (uchar*) realloc(sctx->job.coinbase, sctx->job.coinbase_size); - sctx->job.xnonce2 = sctx->job.coinbase + coinb1_size + sctx->xnonce1_size; - hex2bin(sctx->job.coinbase, coinb1, coinb1_size); - memcpy(sctx->job.coinbase + coinb1_size, sctx->xnonce1, sctx->xnonce1_size); - if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id)) - memset(sctx->job.xnonce2, 0, sctx->xnonce2_size); - hex2bin(sctx->job.xnonce2 + sctx->xnonce2_size, coinb2, coinb2_size); - free(sctx->job.job_id); - sctx->job.job_id = strdup(job_id); - hex2bin(sctx->job.prevhash, prevhash, 32); - if (has_claim) hex2bin(sctx->job.extra, extradata, 32); - if (has_roots) hex2bin(sctx->job.extra, extradata, 64); - - if ( is_veil ) - { - hex2bin(sctx->job.denom10, denom10, 32); - hex2bin(sctx->job.denom100, denom100, 32); - hex2bin(sctx->job.denom1000, denom1000, 32); - hex2bin(sctx->job.denom10000, denom10000, 32); - hex2bin(sctx->job.proofoffullnode, prooffullnode, 32); - } - - sctx->bloc_height = getblocheight(sctx); - - for (i = 0; i < sctx->job.merkle_count; i++) - free(sctx->job.merkle[i]); - - free(sctx->job.merkle); - sctx->job.merkle = merkle; - sctx->job.merkle_count = merkle_count; - - hex2bin(sctx->job.version, version, 4); - hex2bin(sctx->job.nbits, nbits, 4); - hex2bin(sctx->job.ntime, stime, 4); - sctx->job.clean = clean; - - sctx->job.diff = 
sctx->next_diff; - - pthread_mutex_unlock(&sctx->work_lock); - - ret = true; +static uint32_t getblocheight(struct stratum_ctx *sctx) { + uint32_t height = 0; + uint8_t hlen = 0, *p, *m; + + // find 0xffff tag + p = (uint8_t *)sctx->job.coinbase + 32; + m = p + 128; + while (*p != 0xff && p < m) + p++; + while (*p == 0xff && p < m) + p++; + if (*(p - 1) == 0xff && *(p - 2) == 0xff) { + p++; + hlen = *p; + p++; + height = le16dec(p); + p += 2; + switch (hlen) { + case 4: + height += 0x10000UL * le16dec(p); + break; + case 3: + height += 0x10000UL * (*p); + break; + } + } + return height; +} + +static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) { + const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime; + const char *finalsaplinghash = NULL; + size_t coinb1_size, coinb2_size; + bool clean, ret = false; + int merkle_count, i, p = 0; + json_t *merkle_arr; + uchar **merkle = NULL; + + job_id = json_string_value(json_array_get(params, p++)); + prevhash = json_string_value(json_array_get(params, p++)); + + coinb1 = json_string_value(json_array_get(params, p++)); + coinb2 = json_string_value(json_array_get(params, p++)); + merkle_arr = json_array_get(params, p++); + if (!merkle_arr || !json_is_array(merkle_arr)) + goto out; + merkle_count = (int)json_array_size(merkle_arr); + version = json_string_value(json_array_get(params, p++)); + nbits = json_string_value(json_array_get(params, p++)); + stime = json_string_value(json_array_get(params, p++)); + clean = json_is_true(json_array_get(params, p)); + p++; + + if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || + !stime || strlen(prevhash) != 64 || strlen(version) != 8 || + strlen(nbits) != 8 || strlen(stime) != 8) { + applog(LOG_ERR, "Stratum notify: invalid parameters"); + goto out; + } + + hex2bin(sctx->job.version, version, 4); + + if (opt_sapling) { + finalsaplinghash = json_string_value(json_array_get(params, 9)); + if (!finalsaplinghash || 
strlen(finalsaplinghash) != 64) { + applog(LOG_ERR, "Stratum notify: invalid sapling parameters"); + goto out; + } + } + + if (merkle_count) + merkle = (uchar **)malloc(merkle_count * sizeof(char *)); + for (i = 0; i < merkle_count; i++) { + const char *s = json_string_value(json_array_get(merkle_arr, i)); + if (!s || strlen(s) != 64) { + while (i--) + free(merkle[i]); + free(merkle); + applog(LOG_ERR, "Stratum notify: invalid Merkle branch"); + goto out; + } + merkle[i] = (uchar *)malloc(32); + hex2bin(merkle[i], s, 32); + } + + pthread_mutex_lock(&sctx->work_lock); + + coinb1_size = strlen(coinb1) / 2; + coinb2_size = strlen(coinb2) / 2; + sctx->job.coinbase_size = + coinb1_size + sctx->xnonce1_size + sctx->xnonce2_size + coinb2_size; + sctx->job.coinbase = + (uchar *)realloc(sctx->job.coinbase, sctx->job.coinbase_size); + sctx->job.xnonce2 = sctx->job.coinbase + coinb1_size + sctx->xnonce1_size; + hex2bin(sctx->job.coinbase, coinb1, coinb1_size); + memcpy(sctx->job.coinbase + coinb1_size, sctx->xnonce1, sctx->xnonce1_size); + if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id)) + memset(sctx->job.xnonce2, 0, sctx->xnonce2_size); + hex2bin(sctx->job.xnonce2 + sctx->xnonce2_size, coinb2, coinb2_size); + free(sctx->job.job_id); + sctx->job.job_id = strdup(job_id); + hex2bin(sctx->job.prevhash, prevhash, 32); + if (opt_sapling) + hex2bin(sctx->job.final_sapling_hash, finalsaplinghash, 32); + + sctx->block_height = getblocheight(sctx); + + for (i = 0; i < sctx->job.merkle_count; i++) + free(sctx->job.merkle[i]); + + free(sctx->job.merkle); + sctx->job.merkle = merkle; + sctx->job.merkle_count = merkle_count; + + hex2bin(sctx->job.nbits, nbits, 4); + hex2bin(sctx->job.ntime, stime, 4); + sctx->job.clean = clean; + + sctx->job.diff = sctx->next_diff; + + pthread_mutex_unlock(&sctx->work_lock); + + ret = true; out: - return ret; + return ret; } -static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params) -{ - double diff; - - diff = 
json_number_value(json_array_get(params, 0)); - if (diff == 0) - return false; +static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params) { + double diff; - pthread_mutex_lock(&sctx->work_lock); - sctx->next_diff = diff; - pthread_mutex_unlock(&sctx->work_lock); + diff = json_number_value(json_array_get(params, 0)); + if (diff == 0) + return false; - /* store for api stats */ - stratum_diff = diff; - - if ( !opt_quiet ) - applog(LOG_BLUE, "Stratum difficulty set to %g", diff); - - return true; + pthread_mutex_lock(&sctx->work_lock); + sctx->next_diff = diff; + pthread_mutex_unlock(&sctx->work_lock); + return true; } -static bool stratum_reconnect(struct stratum_ctx *sctx, json_t *params) -{ - json_t *port_val; - char *url; - const char *host; - int port; +static bool stratum_reconnect(struct stratum_ctx *sctx, json_t *params) { + json_t *port_val; + char *url; + const char *host; + int port; - host = json_string_value(json_array_get(params, 0)); - port_val = json_array_get(params, 1); - if (json_is_string(port_val)) - port = atoi(json_string_value(port_val)); - else - port = (int) json_integer_value(port_val); - if (!host || !port) - return false; + host = json_string_value(json_array_get(params, 0)); + port_val = json_array_get(params, 1); + if (json_is_string(port_val)) + port = atoi(json_string_value(port_val)); + else + port = (int)json_integer_value(port_val); + if (!host || !port) + return false; - url = (char*) malloc(32 + strlen(host)); - sprintf(url, "stratum+tcp://%s:%d", host, port); + url = (char *)malloc(32 + strlen(host)); - if (!opt_redirect) { - applog(LOG_INFO, "Ignoring request to reconnect to %s", url); - free(url); - return true; - } + strncpy(url, sctx->url, 15); + sprintf(strstr(url, "://") + 3, "%s:%d", host, port); - applog(LOG_NOTICE, "Server requested reconnection to %s", url); + if (!opt_redirect) { + applog(LOG_INFO, "Ignoring request to reconnect to %s", url); + free(url); + return true; + } + if (sctx->dev) { + 
applog(LOG_NOTICE, "Server requested reconnection to dev pool %s", url); + } else { + applog(LOG_NOTICE, "Server requested reconnection to %s", url); + } - free(sctx->url); - sctx->url = url; - stratum_disconnect(sctx); + free(sctx->url); + sctx->url = url; + stratum_disconnect(sctx); - return true; + return true; } -static bool json_object_set_error(json_t *result, int code, const char *msg) -{ - json_t *val = json_object(); - json_object_set_new(val, "code", json_integer(code)); - json_object_set_new(val, "message", json_string(msg)); - return json_object_set_new(result, "error", val) != -1; +static bool json_object_set_error(json_t *result, int code, const char *msg) { + json_t *val = json_object(); + json_object_set_new(val, "code", json_integer(code)); + json_object_set_new(val, "message", json_string(msg)); + return json_object_set_new(result, "error", val) != -1; } /* allow to report algo perf to the pool for algo stats */ -static bool stratum_benchdata(json_t *result, json_t *params, int thr_id) -{ - char algo[64] = { 0 }; - char cpuname[80] = { 0 }; - char vendorid[32] = { 0 }; - char compiler[32] = { 0 }; - char arch[16] = { 0 }; - char os[8]; - char *p; - double cpufreq = 0; - json_t *val; - - if (!opt_stratum_stats) return false; - - get_currentalgo(algo, sizeof(algo)); +static bool stratum_benchdata(json_t *result, json_t *params, int thr_id) { + char algo[64] = {0}; + char cpuname[80] = {0}; + char vendorid[32] = {0}; + char compiler[32] = {0}; + char arch[16] = {0}; + char os[8]; + char *p; + double cpufreq = 0; + json_t *val; + + if (!opt_stratum_stats) + return false; + + get_currentalgo(algo, sizeof(algo)); #if defined(WIN32) && (defined(_M_X64) || defined(__x86_64__)) - strcpy(os, "win64"); + strcpy(os, "win64"); #else - strcpy(os, is_windows() ? "win32" : "linux"); + strcpy(os, is_windows() ? 
"win32" : "linux"); #endif #ifdef _MSC_VER - sprintf(compiler, "MSVC %d\n", msver()); + sprintf(compiler, "MSVC %d\n", msver()); #elif defined(__clang__) - sprintf(compiler, "clang %s\n", __clang_version__); + sprintf(compiler, "clang %s\n", __clang_version__); #elif defined(__GNUC__) - sprintf(compiler, "GCC %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__); + sprintf(compiler, "GCC %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, + __GNUC_PATCHLEVEL__); #endif #ifdef __AVX2__ - strcat(compiler, " AVX2"); + strcat(compiler, " AVX2"); #elif defined(__AVX__) - strcat(compiler, " AVX"); + strcat(compiler, " AVX"); #elif defined(__FMA4__) - strcat(compiler, " FMA4"); + strcat(compiler, " FMA4"); #elif defined(__FMA3__) - strcat(compiler, " FMA3"); + strcat(compiler, " FMA3"); #elif defined(__SSE4_2__) - strcat(compiler, " SSE4.2"); + strcat(compiler, " SSE4.2"); #elif defined(__SSE4_1__) - strcat(compiler, " SSE4"); + strcat(compiler, " SSE4"); #elif defined(__SSE3__) - strcat(compiler, " SSE3"); + strcat(compiler, " SSE3"); #elif defined(__SSE2__) - strcat(compiler, " SSE2"); + strcat(compiler, " SSE2"); #elif defined(__SSE__) - strcat(compiler, " SSE"); + strcat(compiler, " SSE"); #endif - cpu_bestfeature(arch, 16); - if (has_aes_ni()) strcat(arch, " NI"); + cpu_bestfeature(arch, 16); + if (has_aes_ni()) + strcat(arch, " NI"); + + cpu_getmodelid(vendorid, 32); + cpu_getname(cpuname, 80); + p = strstr(cpuname, " @ "); + if (p) { + // linux only + char freq[32] = {0}; + *p = '\0'; + p += 3; + snprintf(freq, 32, "%s", p); + cpufreq = atof(freq); + p = strstr(freq, "GHz"); + if (p) + cpufreq *= 1000; + applog(LOG_NOTICE, "sharing CPU stats with freq %s", freq); + } - cpu_getmodelid(vendorid, 32); - cpu_getname(cpuname, 80); - p = strstr(cpuname, " @ "); - if (p) { - // linux only - char freq[32] = { 0 }; - *p = '\0'; p += 3; - snprintf(freq, 32, "%s", p); - cpufreq = atof(freq); - p = strstr(freq, "GHz"); if (p) cpufreq *= 1000; - applog(LOG_NOTICE, "sharing CPU stats 
with freq %s", freq); - } + compiler[31] = '\0'; - compiler[31] = '\0'; + val = json_object(); + json_object_set_new(val, "algo", json_string(algo)); + json_object_set_new(val, "type", json_string("cpu")); + json_object_set_new(val, "device", json_string(cpuname)); + json_object_set_new(val, "vendorid", json_string(vendorid)); + json_object_set_new(val, "arch", json_string(arch)); + json_object_set_new(val, "freq", json_integer((uint64_t)cpufreq)); + json_object_set_new(val, "memf", json_integer(0)); + json_object_set_new(val, "power", json_integer(0)); + json_object_set_new(val, "khashes", + json_real((double)global_hashrate / 1000.0)); + json_object_set_new(val, "intensity", json_real(opt_priority)); + json_object_set_new(val, "throughput", json_integer(opt_n_threads)); + json_object_set_new(val, "client", + json_string(PACKAGE_NAME "/" PACKAGE_VERSION)); + json_object_set_new(val, "os", json_string(os)); + json_object_set_new(val, "driver", json_string(compiler)); - val = json_object(); - json_object_set_new(val, "algo", json_string(algo)); - json_object_set_new(val, "type", json_string("cpu")); - json_object_set_new(val, "device", json_string(cpuname)); - json_object_set_new(val, "vendorid", json_string(vendorid)); - json_object_set_new(val, "arch", json_string(arch)); - json_object_set_new(val, "freq", json_integer((uint64_t)cpufreq)); - json_object_set_new(val, "memf", json_integer(0)); - json_object_set_new(val, "power", json_integer(0)); - json_object_set_new(val, "khashes", json_real((double)global_hashrate / 1000.0)); - json_object_set_new(val, "intensity", json_real(opt_priority)); - json_object_set_new(val, "throughput", json_integer(opt_n_threads)); - json_object_set_new(val, "client", json_string(PACKAGE_NAME "/" PACKAGE_VERSION)); - json_object_set_new(val, "os", json_string(os)); - json_object_set_new(val, "driver", json_string(compiler)); + json_object_set_new(result, "result", val); - json_object_set_new(result, "result", val); - - return true; + 
return true; } -static bool stratum_get_stats(struct stratum_ctx *sctx, json_t *id, json_t *params) -{ - char *s; - json_t *val; - bool ret; +static bool stratum_get_stats(struct stratum_ctx *sctx, json_t *id, + json_t *params) { + char *s; + json_t *val; + bool ret; + + if (!id || json_is_null(id)) + return false; - if (!id || json_is_null(id)) - return false; + val = json_object(); + json_object_set(val, "id", id); - val = json_object(); - json_object_set(val, "id", id); + ret = stratum_benchdata(val, params, 0); - ret = stratum_benchdata(val, params, 0); + if (!ret) { + json_object_set_error(val, 1, "disabled"); // EPERM + } else { + json_object_set_new(val, "error", json_null()); + } - if (!ret) { - json_object_set_error(val, 1, "disabled"); //EPERM - } else { - json_object_set_new(val, "error", json_null()); - } + s = json_dumps(val, 0); + ret = stratum_send_line(sctx, s); + json_decref(val); + free(s); - s = json_dumps(val, 0); - ret = stratum_send_line(sctx, s); - json_decref(val); - free(s); + return ret; +} + +static bool stratum_unknown_method(struct stratum_ctx *sctx, json_t *id) { + char *s; + json_t *val; + bool ret = false; - return ret; -} - -static bool stratum_unknown_method(struct stratum_ctx *sctx, json_t *id) -{ - char *s; - json_t *val; - bool ret = false; - - if (!id || json_is_null(id)) - return ret; + if (!id || json_is_null(id)) + return ret; + + val = json_object(); + json_object_set(val, "id", id); + json_object_set_new(val, "result", json_false()); + json_object_set_error(val, 38, "unknown method"); // ENOSYS + + s = json_dumps(val, 0); + ret = stratum_send_line(sctx, s); + json_decref(val); + free(s); + + return ret; +} + +static bool stratum_pong(struct stratum_ctx *sctx, json_t *id) { + char buf[64]; + bool ret = false; - val = json_object(); - json_object_set(val, "id", id); - json_object_set_new(val, "result", json_false()); - json_object_set_error(val, 38, "unknown method"); // ENOSYS - - s = json_dumps(val, 0); - ret = 
stratum_send_line(sctx, s); - json_decref(val); - free(s); - - return ret; -} - -static bool stratum_pong(struct stratum_ctx *sctx, json_t *id) -{ - char buf[64]; - bool ret = false; - - if (!id || json_is_null(id)) - return ret; - - sprintf(buf, "{\"id\":%d,\"result\":\"pong\",\"error\":null}", - (int) json_integer_value(id)); - ret = stratum_send_line(sctx, buf); - - return ret; -} - -static bool stratum_get_algo(struct stratum_ctx *sctx, json_t *id, json_t *params) -{ - char algo[64] = { 0 }; - char *s; - json_t *val; - bool ret = true; - - if (!id || json_is_null(id)) - return false; - - get_currentalgo(algo, sizeof(algo)); - - val = json_object(); - json_object_set(val, "id", id); - json_object_set_new(val, "error", json_null()); - json_object_set_new(val, "result", json_string(algo)); - - s = json_dumps(val, 0); - ret = stratum_send_line(sctx, s); - json_decref(val); - free(s); - - return ret; -} - - -static bool stratum_get_version(struct stratum_ctx *sctx, json_t *id) -{ - char *s; - json_t *val; - bool ret; - - if (!id || json_is_null(id)) - return false; - - val = json_object(); - json_object_set(val, "id", id); - json_object_set_new(val, "error", json_null()); - json_object_set_new(val, "result", json_string(USER_AGENT)); - s = json_dumps(val, 0); - ret = stratum_send_line(sctx, s); - json_decref(val); - free(s); - - return ret; -} - -static bool stratum_show_message(struct stratum_ctx *sctx, json_t *id, json_t *params) -{ - char *s; - json_t *val; - bool ret; - - val = json_array_get(params, 0); - if (val) - applog(LOG_NOTICE, "MESSAGE FROM SERVER: %s", json_string_value(val)); - - if (!id || json_is_null(id)) - return true; - - val = json_object(); - json_object_set(val, "id", id); - json_object_set_new(val, "error", json_null()); - json_object_set_new(val, "result", json_true()); - s = json_dumps(val, 0); - ret = stratum_send_line(sctx, s); - json_decref(val); - free(s); - - return ret; -} - -bool stratum_handle_method(struct stratum_ctx *sctx, const 
char *s) -{ - json_t *val, *id, *params; - json_error_t err; - const char *method; - bool ret = false; - - val = JSON_LOADS(s, &err); - if (!val) { - applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - goto out; - } - - method = json_string_value(json_object_get(val, "method")); - if (!method) - goto out; - - params = json_object_get(val, "params"); - - if (jsonrpc_2) { - if (!strcasecmp(method, "job")) { - ret = rpc2_stratum_job(sctx, params); - } - goto out; - } - - id = json_object_get(val, "id"); - - if (!strcasecmp(method, "mining.notify")) { - ret = stratum_notify(sctx, params); - goto out; - } - if (!strcasecmp(method, "mining.ping")) { // cgminer 4.7.1+ - if (opt_debug) applog(LOG_DEBUG, "Pool ping"); - ret = stratum_pong(sctx, id); - goto out; - } - if (!strcasecmp(method, "mining.set_difficulty")) { - ret = stratum_set_difficulty(sctx, params); - goto out; - } - if (!strcasecmp(method, "mining.set_extranonce")) { - ret = stratum_parse_extranonce(sctx, params, 0); - goto out; - } - if (!strcasecmp(method, "client.reconnect")) { - ret = stratum_reconnect(sctx, params); - goto out; - } - if (!strcasecmp(method, "client.get_algo")) { - // will prevent wrong algo parameters on a pool, will be used as test on rejects - if (!opt_quiet) applog(LOG_NOTICE, "Pool asked your algo parameter"); - ret = stratum_get_algo(sctx, id, params); - goto out; - } - if (!strcasecmp(method, "client.get_stats")) { - // optional to fill device benchmarks - ret = stratum_get_stats(sctx, id, params); - goto out; - } - if (!strcasecmp(method, "client.get_version")) { - ret = stratum_get_version(sctx, id); - goto out; - } - if (!strcasecmp(method, "client.show_message")) { - ret = stratum_show_message(sctx, id, params); - goto out; - } - - if (!ret) { - // don't fail = disconnect stratum on unknown (and optional?) 
methods - if (opt_debug) applog(LOG_WARNING, "unknown stratum method %s!", method); - ret = stratum_unknown_method(sctx, id); - } + if (!id || json_is_null(id)) + return ret; + + sprintf(buf, "{\"id\":%d,\"result\":\"pong\",\"error\":null}", + (int)json_integer_value(id)); + ret = stratum_send_line(sctx, buf); + + return ret; +} + +static bool stratum_get_algo(struct stratum_ctx *sctx, json_t *id, + json_t *params) { + char algo[64] = {0}; + char *s; + json_t *val; + bool ret = true; + + if (!id || json_is_null(id)) + return false; + + get_currentalgo(algo, sizeof(algo)); + + val = json_object(); + json_object_set(val, "id", id); + json_object_set_new(val, "error", json_null()); + json_object_set_new(val, "result", json_string(algo)); + + s = json_dumps(val, 0); + ret = stratum_send_line(sctx, s); + json_decref(val); + free(s); + + return ret; +} + +static bool stratum_get_version(struct stratum_ctx *sctx, json_t *id) { + char *s; + json_t *val; + bool ret; + + if (!id || json_is_null(id)) + return false; + + val = json_object(); + json_object_set(val, "id", id); + json_object_set_new(val, "error", json_null()); + json_object_set_new(val, "result", json_string(USER_AGENT)); + s = json_dumps(val, 0); + ret = stratum_send_line(sctx, s); + json_decref(val); + free(s); + + return ret; +} + +static bool stratum_show_message(struct stratum_ctx *sctx, json_t *id, + json_t *params) { + char *s; + json_t *val; + bool ret; + + val = json_array_get(params, 0); + if (val) + applog(LOG_NOTICE, "MESSAGE FROM SERVER: %s", json_string_value(val)); + + if (!id || json_is_null(id)) + return true; + + val = json_object(); + json_object_set(val, "id", id); + json_object_set_new(val, "error", json_null()); + json_object_set_new(val, "result", json_true()); + s = json_dumps(val, 0); + ret = stratum_send_line(sctx, s); + json_decref(val); + free(s); + + return ret; +} + +bool stratum_handle_method(struct stratum_ctx *sctx, const char *s) { + json_t *val, *id, *params; + json_error_t err; 
+ const char *method; + bool ret = false; + + val = JSON_LOADS(s, &err); + if (!val) { + applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); + goto out; + } + + method = json_string_value(json_object_get(val, "method")); + if (!method) + goto out; + + params = json_object_get(val, "params"); + + id = json_object_get(val, "id"); + + if (!strcasecmp(method, "mining.notify")) { + ret = stratum_notify(sctx, params); + sctx->new_job = true; + goto out; + } + if (!strcasecmp(method, "mining.ping")) { // cgminer 4.7.1+ + if (opt_debug) + applog(LOG_DEBUG, "Pool ping"); + ret = stratum_pong(sctx, id); + goto out; + } + if (!strcasecmp(method, "mining.set_difficulty")) { + ret = stratum_set_difficulty(sctx, params); + goto out; + } + if (!strcasecmp(method, "mining.set_extranonce")) { + ret = stratum_parse_extranonce(sctx, params, 0); + goto out; + } + if (!strcasecmp(method, "client.reconnect")) { + ret = stratum_reconnect(sctx, params); + goto out; + } + if (!strcasecmp(method, "client.get_algo")) { + // will prevent wrong algo parameters on a pool, will be used as test on + // rejects + if (!opt_quiet) + applog(LOG_NOTICE, "Pool asked your algo parameter"); + ret = stratum_get_algo(sctx, id, params); + goto out; + } + if (!strcasecmp(method, "client.get_stats")) { + // optional to fill device benchmarks + ret = stratum_get_stats(sctx, id, params); + goto out; + } + if (!strcasecmp(method, "client.get_version")) { + ret = stratum_get_version(sctx, id); + goto out; + } + if (!strcasecmp(method, "client.show_message")) { + ret = stratum_show_message(sctx, id, params); + goto out; + } + + if (!ret) { + // don't fail = disconnect stratum on unknown (and optional?) 
methods + if (opt_debug) + applog(LOG_WARNING, "unknown stratum method %s!", method); + ret = stratum_unknown_method(sctx, id); + } out: - if (val) - json_decref(val); + if (val) + json_decref(val); - return ret; + return ret; } -struct thread_q *tq_new(void) -{ - struct thread_q *tq; +struct thread_q *tq_new(void) { + struct thread_q *tq; - tq = (struct thread_q*) calloc(1, sizeof(*tq)); - if (!tq) - return NULL; + tq = (struct thread_q *)calloc(1, sizeof(*tq)); + if (!tq) + return NULL; - INIT_LIST_HEAD(&tq->q); - pthread_mutex_init(&tq->mutex, NULL); - pthread_cond_init(&tq->cond, NULL); + INIT_LIST_HEAD(&tq->q); + pthread_mutex_init(&tq->mutex, NULL); + pthread_cond_init(&tq->cond, NULL); - return tq; + return tq; } -void tq_free(struct thread_q *tq) -{ - struct tq_ent *ent, *iter; +void tq_free(struct thread_q *tq) { + struct tq_ent *ent, *iter; - if (!tq) - return; + if (!tq) + return; - list_for_each_entry_safe(ent, iter, &tq->q, q_node, struct tq_ent) { - list_del(&ent->q_node); - free(ent); - } + list_for_each_entry_safe(ent, iter, &tq->q, q_node, struct tq_ent) { + list_del(&ent->q_node); + free(ent); + } - pthread_cond_destroy(&tq->cond); - pthread_mutex_destroy(&tq->mutex); + pthread_cond_destroy(&tq->cond); + pthread_mutex_destroy(&tq->mutex); - memset(tq, 0, sizeof(*tq)); /* poison */ - free(tq); + memset(tq, 0, sizeof(*tq)); /* poison */ + free(tq); } -static void tq_freezethaw(struct thread_q *tq, bool frozen) -{ - pthread_mutex_lock(&tq->mutex); +static void tq_freezethaw(struct thread_q *tq, bool frozen) { + pthread_mutex_lock(&tq->mutex); - tq->frozen = frozen; + tq->frozen = frozen; - pthread_cond_signal(&tq->cond); - pthread_mutex_unlock(&tq->mutex); + pthread_cond_signal(&tq->cond); + pthread_mutex_unlock(&tq->mutex); } -void tq_freeze(struct thread_q *tq) -{ - tq_freezethaw(tq, true); -} +void tq_freeze(struct thread_q *tq) { tq_freezethaw(tq, true); } -void tq_thaw(struct thread_q *tq) -{ - tq_freezethaw(tq, false); -} +void tq_thaw(struct 
thread_q *tq) { tq_freezethaw(tq, false); } -bool tq_push(struct thread_q *tq, void *data) -{ - struct tq_ent *ent; - bool rc = true; +bool tq_push(struct thread_q *tq, void *data) { + struct tq_ent *ent; + bool rc = true; - ent = (struct tq_ent*) calloc(1, sizeof(*ent)); - if (!ent) - return false; + ent = (struct tq_ent *)calloc(1, sizeof(*ent)); + if (!ent) + return false; - ent->data = data; - INIT_LIST_HEAD(&ent->q_node); + ent->data = data; + INIT_LIST_HEAD(&ent->q_node); - pthread_mutex_lock(&tq->mutex); + pthread_mutex_lock(&tq->mutex); - if (!tq->frozen) { - list_add_tail(&ent->q_node, &tq->q); - } else { - free(ent); - rc = false; - } + if (!tq->frozen) { + list_add_tail(&ent->q_node, &tq->q); + } else { + free(ent); + rc = false; + } - pthread_cond_signal(&tq->cond); - pthread_mutex_unlock(&tq->mutex); + pthread_cond_signal(&tq->cond); + pthread_mutex_unlock(&tq->mutex); - return rc; + return rc; } -void *tq_pop(struct thread_q *tq, const struct timespec *abstime) -{ - struct tq_ent *ent; - void *rval = NULL; - int rc; +void *tq_pop(struct thread_q *tq, const struct timespec *abstime) { + struct tq_ent *ent; + void *rval = NULL; + int rc; - pthread_mutex_lock(&tq->mutex); + pthread_mutex_lock(&tq->mutex); - if (!list_empty(&tq->q)) - goto pop; + if (!list_empty(&tq->q)) { + goto pop; + } - if (abstime) - rc = pthread_cond_timedwait(&tq->cond, &tq->mutex, abstime); - else - rc = pthread_cond_wait(&tq->cond, &tq->mutex); - if (rc) - goto out; - if (list_empty(&tq->q)) - goto out; + if (abstime) { + rc = pthread_cond_timedwait(&tq->cond, &tq->mutex, abstime); + } else { + rc = pthread_cond_wait(&tq->cond, &tq->mutex); + } + if (rc) + goto out; + if (list_empty(&tq->q)) + goto out; pop: - ent = list_entry(tq->q.next, struct tq_ent, q_node); - rval = ent->data; + ent = list_entry(tq->q.next, struct tq_ent, q_node); + rval = ent->data; - list_del(&ent->q_node); - free(ent); + list_del(&ent->q_node); + free(ent); out: - pthread_mutex_unlock(&tq->mutex); - 
return rval; + pthread_mutex_unlock(&tq->mutex); + return rval; } /* sprintf can be used in applog */ -static char* format_hash(char* buf, uint8_t *hash) -{ - int len = 0; - for (int i=0; i < 32; i += 4) { - len += sprintf(buf+len, "%02x%02x%02x%02x ", - hash[i], hash[i+1], hash[i+2], hash[i+3]); - } - return buf; -} - -void applog_compare_hash(void *hash, void *hash_ref) -{ - char s[256] = ""; - int len = 0; - uchar* hash1 = (uchar*)hash; - uchar* hash2 = (uchar*)hash_ref; - for (int i=0; i < 32; i += 4) { - const char *color = memcmp(hash1+i, hash2+i, 4) ? CL_WHT : CL_GRY; - len += sprintf(s+len, "%s%02x%02x%02x%02x " CL_GRY, color, - hash1[i], hash1[i+1], hash1[i+2], hash1[i+3]); - s[len] = '\0'; - } - applog(LOG_DEBUG, "%s", s); +static char *format_hash(char *buf, uint8_t *hash) { + int len = 0; + for (int i = 0; i < 32; i += 4) { + len += sprintf(buf + len, "%02x%02x%02x%02x ", hash[i], hash[i + 1], + hash[i + 2], hash[i + 3]); + } + return buf; } -void applog_hash(void *hash) -{ - char s[128] = {'\0'}; - applog(LOG_DEBUG, "%s", format_hash(s, (uchar*) hash)); +void applog_compare_hash(void *hash, void *hash_ref) { + char s[256] = ""; + int len = 0; + uchar *hash1 = (uchar *)hash; + uchar *hash2 = (uchar *)hash_ref; + for (int i = 0; i < 32; i += 4) { + const char *color = memcmp(hash1 + i, hash2 + i, 4) ? 
CL_WHT : CL_GRY; + len += sprintf(s + len, "%s%02x%02x%02x%02x " CL_GRY, color, hash1[i], + hash1[i + 1], hash1[i + 2], hash1[i + 3]); + s[len] = '\0'; + } + applog(LOG_DEBUG, "%s", s); } -void applog_hex(void *data, int len) -{ - char* hex = abin2hex((uchar*)data, len); - applog(LOG_DEBUG, "%s", hex); - free(hex); +void applog_hash(void *hash) { + char s[128] = {'\0'}; + applog(LOG_DEBUG, "%s", format_hash(s, (uchar *)hash)); } -void applog_hash64(void *hash) -{ - char s[128] = {'\0'}; - char t[128] = {'\0'}; - applog(LOG_DEBUG, "%s %s", format_hash(s, (uchar*)hash), format_hash(t, &((uchar*)hash)[32])); +void applog_hex(void *data, int len) { + char *hex = abin2hex((uchar *)data, len); + applog(LOG_DEBUG, "%s", hex); + free(hex); } -#define printpfx(n,h) \ - printf("%s%11s%s: %s\n", CL_CYN, n, CL_N, format_hash(s, (uint8_t*) h)) +void applog_hash64(void *hash) { + char s[128] = {'\0'}; + char t[128] = {'\0'}; + applog(LOG_DEBUG, "%s %s", format_hash(s, (uchar *)hash), + format_hash(t, &((uchar *)hash)[32])); +} -void print_hash_tests(void) -{ - uchar *scratchbuf = NULL; - char hash[128], s[80]; - char buf[192] = { 0 }; - int algo; - scratchbuf = (uchar*) calloc(128, 1024); +#define printpfx(n, h) \ + printf("%s%11s%s: %s\n", CL_CYN, n, CL_N, format_hash(s, (uint8_t *)h)) - printf(CL_WHT "CPU HASH ON EMPTY BUFFER RESULTS:" CL_N "\n\n"); +void print_hash_tests(void) { + uchar *scratchbuf = NULL; + char hash[128], s[80]; + char buf[192] = {0}; + int algo; + scratchbuf = (uchar *)calloc(128, 1024); - //buf[0] = 1; buf[64] = 2; // for endian tests - for ( algo=0; algo < ALGO_COUNT; algo++ ) - { - exec_hash_function( algo, &hash[0], &buf[0] ); - printpfx( algo_names[algo], hash ); - } + printf(CL_WHT "CPU HASH ON EMPTY BUFFER RESULTS:" CL_N "\n\n"); - printf("\n"); + // buf[0] = 1; buf[64] = 2; // for endian tests + for (algo = 0; algo < ALGO_COUNT; algo++) { + exec_hash_function(algo, &hash[0], &buf[0]); + printpfx(algo_names[algo], hash); + } - free(scratchbuf); -} + 
printf("\n"); + free(scratchbuf); +} diff --git a/winbuild-cross.sh b/winbuild-cross.sh index 42f1549..157dbef 100755 --- a/winbuild-cross.sh +++ b/winbuild-cross.sh @@ -10,10 +10,12 @@ # define some local variables export LOCAL_LIB="$HOME/usr/lib" - -export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl" - export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32" +export MINGW_LIB="/usr/x86_64-w64-mingw32/lib" +# set correct gcc version +export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/10-win32" +# used by GCC +export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl" # make link to local gmp header file. ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h @@ -22,80 +24,74 @@ ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h #sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac # make release directory and copy selected DLLs. -mkdir release -cp README.txt release/ -cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/ -cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/ -cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libstdc++-6.dll release/ -cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libgcc_s_seh-1.dll release/ -cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/ -cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/ + +rm -rf bin/win/ > /dev/null + +mkdir -p bin/win +cp $MINGW_LIB/zlib1.dll bin/win/ +cp $MINGW_LIB/libwinpthread-1.dll bin/win/ +cp $GCC_MINGW_LIB/libstdc++-6.dll bin/win/ +cp $GCC_MINGW_LIB/libgcc_s_seh-1.dll bin/win/ +cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll bin/win/ +cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll bin/win/ + +DFLAGS="-Wall -fno-common -Wno-comment -Wno-maybe-uninitialized" + +# Start building... 
+ +# 1 - Architecture +# 2 - Output suffix +# 3 - Additional options +compile() { make distclean || echo clean rm -f config.status ./autogen.sh || echo done -CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS -make -j 16 +CFLAGS="-O3 -march=${1} ${3} ${DFLAGS}" ./configure ${CONFIGURE_ARGS} +make -j 8 strip -s cpuminer.exe -mv cpuminer.exe release/cpuminer-zen.exe +cp cpuminer.exe bin/win/cpuminer-${2}.exe -#make clean || echo clean -#CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure $CONFIGURE_ARGS -#make -#strip -s cpuminer.exe -#mv cpuminer.exe release/cpuminer-avx-sha.exe +} -make clean || echo clean -rm -f config.status -CFLAGS="-O3 -march=core-avx2 -Wall" ./configure $CONFIGURE_ARGS -make -j 16 -strip -s cpuminer.exe -mv cpuminer.exe release/cpuminer-avx2.exe +# Icelake AVX512 SHA VAES +compile "icelake-client" "avx512-sha-vaes" -#make clean || echo clean -#rm -f config.status -#CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS -#make -j -#strip -s cpuminer.exe -#mv cpuminer.exe release/cpuminer-aes-sha.exe +# Rocketlake AVX512 SHA AES +compile "cascadelake" "avx512-sha" "-msha" +# Slylake-X AVX512 AES +compile "skylake-avx512" "avx512" -make clean || echo clean -rm -f config.status -CFLAGS="-O3 -march=corei7-avx -Wall" ./configure $CONFIGURE_ARGS -make -j 16 -strip -s cpuminer.exe -mv cpuminer.exe release/cpuminer-avx.exe +# Haswell AVX2 AES +# GCC 9 doesn't include AES with core-avx2 +compile "core-avx2" "avx2" "-maes" -# -march=westmere is supported in gcc5 -make clean || echo clean -rm -f config.status -CFLAGS="-O3 -march=westmere -Wall" ./configure $CONFIGURE_ARGS -#CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS -make -j 16 -strip -s cpuminer.exe -mv cpuminer.exe release/cpuminer-aes-sse42.exe - -#make clean || echo clean -#rm -f config.status -#CFLAGS="-O3 -march=corei7 -Wall" ./configure $CONFIGURE_ARGS -#make -#strip -s cpuminer.exe -#mv cpuminer.exe release/cpuminer-sse42.exe - -#make clean || echo clean -#rm 
-f config.status -#CFLAGS="-O3 -march=core2 -Wall" ./configure $CONFIGURE_ARGS -#make -#strip -s cpuminer.exe -#mv cpuminer.exe release/cpuminer-ssse3.exe -#make clean || echo clean - -make clean || echo clean -rm -f config.status -CFLAGS="-O3 -msse2 -Wall" ./configure $CONFIGURE_ARGS -make -j 16 -strip -s cpuminer.exe -mv cpuminer.exe release/cpuminer-sse2.exe -make clean || echo clean +# Sandybridge AVX AES +compile "corei7-avx" "avx" "-maes" + +# Westmere SSE4.2 AES +compile "westmere" "aes-sse42" + +# Nehalem SSE4.2 +compile "corei7" "sse42" + +# Core2 SSSE3 +compile "core2" "ssse3" + +# Generic SSE2 +compile "x86-64" "sse2" "-msse2" + +# AMD Zen1 AVX2 SHA +compile "znver1" "zen" + +# AMD Zen3 AVX2 SHA VAES +compile "znver2" "zen3" "-mvaes" + +# Build native +compile "native" "native" "-mtune=native" +ls -l bin/win +if ( $(ls bin/win/*.exe | wc -l) != 12 ); then + echo "Some binaries did not compile?" +fi