From 5a2c6d232e2f8dec1a302510956c30140b8413cd Mon Sep 17 00:00:00 2001
From: nicehashdev
Date: Mon, 22 Aug 2016 13:17:31 +0200
Subject: [PATCH] Faster Lyra2RE

---
 algo/aes_ni/README                  |   14 +
 algo/aes_ni/api.h                   |    2 +
 algo/aes_ni/architectures           |    1 +
 algo/aes_ni/brg_endian.h            |  133 +++
 algo/aes_ni/brg_types.h             |  234 +++++
 algo/aes_ni/groestl-asm-aes.h       | 1043 ++++++++++++++++++++
 algo/aes_ni/groestl-asm-avx.h       | 1105 +++++++++++++++++++++
 algo/aes_ni/groestl-asm-vperm.h     | 1397 +++++++++++++++++++++++++++
 algo/aes_ni/groestl-intr-aes.h      |  965 ++++++++++++++++++
 algo/aes_ni/groestl-intr-avx.h      | 1072 ++++++++++++++++++++
 algo/aes_ni/groestl-intr-vperm.h    | 1294 +++++++++++++++++++++
 algo/aes_ni/groestl-version.h       |   16 +
 algo/aes_ni/groestl256-asm-aes.h    |  529 ++++++++++
 algo/aes_ni/groestl256-asm-avx.h    |  519 ++++++++++
 algo/aes_ni/groestl256-asm-vperm.h  |  856 ++++++++++++++++
 algo/aes_ni/groestl256-intr-aes.h   |  496 ++++++++++
 algo/aes_ni/groestl256-intr-avx.h   |  482 +++++++++
 algo/aes_ni/groestl256-intr-vperm.h |  793 +++++++++++++++
 algo/aes_ni/hash-groestl.c          |  306 ++++++
 algo/aes_ni/hash-groestl.h          |  110 +++
 algo/aes_ni/hash-groestl256.c       |  318 ++++++
 algo/aes_ni/hash-groestl256.h       |  116 +++
 algo/aes_ni/implementors            |    3 +
 algo/lyra2re.c                      |   65 +-
 cpu-miner.c                         |    6 +-
 lyra2/Lyra2.c                       |   99 +-
 lyra2/Lyra2.h                       |   10 +-
 lyra2/Sponge.c                      | 1384 +++++++++++++++-----------
 lyra2/Sponge.h                      |  171 ++--
 mingw64avx.sh                       |    2 +-
 mingw64avx2.sh                      |    2 +-
 mingw64sse2.sh                      |    2 +-
 32 files changed, 12839 insertions(+), 706 deletions(-)
 create mode 100644 algo/aes_ni/README
 create mode 100644 algo/aes_ni/api.h
 create mode 100644 algo/aes_ni/architectures
 create mode 100644 algo/aes_ni/brg_endian.h
 create mode 100644 algo/aes_ni/brg_types.h
 create mode 100644 algo/aes_ni/groestl-asm-aes.h
 create mode 100644 algo/aes_ni/groestl-asm-avx.h
 create mode 100644 algo/aes_ni/groestl-asm-vperm.h
 create mode 100644 algo/aes_ni/groestl-intr-aes.h
 create mode 100644 algo/aes_ni/groestl-intr-avx.h
 create mode 100644 algo/aes_ni/groestl-intr-vperm.h
 create mode 100644 algo/aes_ni/groestl-version.h
 create mode 100644 algo/aes_ni/groestl256-asm-aes.h
 create mode 100644 algo/aes_ni/groestl256-asm-avx.h
 create mode 100644 algo/aes_ni/groestl256-asm-vperm.h
 create mode 100644 algo/aes_ni/groestl256-intr-aes.h
 create mode 100644 algo/aes_ni/groestl256-intr-avx.h
 create mode 100644 algo/aes_ni/groestl256-intr-vperm.h
 create mode 100644 algo/aes_ni/hash-groestl.c
 create mode 100644 algo/aes_ni/hash-groestl.h
 create mode 100644 algo/aes_ni/hash-groestl256.c
 create mode 100644 algo/aes_ni/hash-groestl256.h
 create mode 100644 algo/aes_ni/implementors

diff --git a/algo/aes_ni/README b/algo/aes_ni/README
new file mode 100644
index 000000000..e55be0b59
--- /dev/null
+++ b/algo/aes_ni/README
@@ -0,0 +1,14 @@
+This package contains an implementation of the Groestl-512 hash
+function optimized for the Intel AES instructions.
+
+Authors are Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
+
+There are no known present or future claims by a copyright holder that
+the distribution of this software infringes the copyright. In
+particular, the author of the software is not making such claims and
+does not intend to make such claims.
+
+Moreover, there are no known present or future claims by a patent
+holder that the use of this software infringes the patent. In
+particular, the author of the software is not making such claims and
+does not intend to make such claims.
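For reference while reading the SSE/AES code added below: Groestl's MixBytes step works in GF(2^8) with the same 0x1b reduction constant as AES, and the MUL2/VMUL2 macros in the new headers double 16 field elements at a time (pcmpgtb builds a 0xFF mask for bytes whose top bit is set, paddb shifts every byte left, pand keeps 0x1b where reduction is needed, pxor applies it). A minimal scalar C sketch of the same operation, not part of the patch:

#include <stdint.h>

/* Multiply one GF(2^8) element by 2, reducing by x^8+x^4+x^3+x+1 (0x11b).
 * This mirrors what the MUL2/VMUL2 macros compute per byte of an xmm register. */
static uint8_t xtime(uint8_t a)
{
    uint8_t mask = (uint8_t)-(a >> 7);          /* 0xFF if the top bit is set, else 0x00 */
    return (uint8_t)((a << 1) ^ (mask & 0x1b)); /* shift left, conditionally xor 0x1b   */
}

/* The same doubling applied to a 16-byte lane, i.e. one xmm register's worth of state. */
static void mul2_lane(uint8_t v[16])
{
    for (int i = 0; i < 16; i++)
        v[i] = xtime(v[i]);
}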
diff --git a/algo/aes_ni/api.h b/algo/aes_ni/api.h new file mode 100644 index 000000000..e56a47f18 --- /dev/null +++ b/algo/aes_ni/api.h @@ -0,0 +1,2 @@ +#define CRYPTO_BYTES 64 +#define CRYPTO_VERSION "2.2" diff --git a/algo/aes_ni/architectures b/algo/aes_ni/architectures new file mode 100644 index 000000000..21d5bd8c7 --- /dev/null +++ b/algo/aes_ni/architectures @@ -0,0 +1 @@ +amd64 diff --git a/algo/aes_ni/brg_endian.h b/algo/aes_ni/brg_endian.h new file mode 100644 index 000000000..e3cf0d11d --- /dev/null +++ b/algo/aes_ni/brg_endian.h @@ -0,0 +1,133 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The redistribution and use of this software (with or without changes) + is allowed without the payment of fees or royalties provided that: + + 1. source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + 2. binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation; + + 3. the name of the copyright holder is not used to endorse products + built using this software without specific written permission. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue Date: 20/12/2007 +*/ + +#ifndef _BRG_ENDIAN_H +#define _BRG_ENDIAN_H + +#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ +#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ + +/* Include files where endian defines and byteswap functions may reside */ +#if defined( __sun ) +# include +#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) +# include +#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ + defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) +# include +#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) +# if !defined( __MINGW32__ ) && !defined( _AIX ) +# include +# if !defined( __BEOS__ ) +# include +# endif +# endif +#endif + +/* Now attempt to set the define for platform byte order using any */ +/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ +/* seem to encompass most endian symbol definitions */ + +#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) +# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) +# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( _BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( _LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) +# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN +# 
define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) +# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +/* if the platform byte order could not be determined, then try to */ +/* set this define using common machine defines */ +#if !defined(PLATFORM_BYTE_ORDER) + +#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ + defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ + defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ + defined( vax ) || defined( vms ) || defined( VMS ) || \ + defined( __VMS ) || defined( _M_X64 ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + +#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ + defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ + defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ + defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ + defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ + defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ + defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#else +# error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order +#endif + +#endif + +#endif diff --git a/algo/aes_ni/brg_types.h b/algo/aes_ni/brg_types.h new file mode 100644 index 000000000..fd603b752 --- /dev/null +++ b/algo/aes_ni/brg_types.h @@ -0,0 +1,234 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. + + (a few lines added by Soeren S. Thomsen, October 2008) + + LICENSE TERMS + + The redistribution and use of this software (with or without changes) + is allowed without the payment of fees or royalties provided that: + + 1. source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + 2. binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation; + + 3. the name of the copyright holder is not used to endorse products + built using this software without specific written permission. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. 
+ --------------------------------------------------------------------------- + Issue Date: 20/12/2007 + + The unsigned integer types defined here are of the form uint_t where + is the length of the type; for example, the unsigned 32-bit type is + 'uint_32t'. These are NOT the same as the 'C99 integer types' that are + defined in the inttypes.h and stdint.h headers since attempts to use these + types have shown that support for them is still highly variable. However, + since the latter are of the form uint_t, a regular expression search + and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t') + can be used to convert the types used here to the C99 standard types. +*/ + +#ifndef _BRG_TYPES_H +#define _BRG_TYPES_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include + +#if defined( _MSC_VER ) && ( _MSC_VER >= 1300 ) +# include +# define ptrint_t intptr_t +#elif defined( __GNUC__ ) && ( __GNUC__ >= 3 ) +# include +# define ptrint_t intptr_t +#else +# define ptrint_t int +#endif + +#ifndef BRG_UI8 +# define BRG_UI8 +# if UCHAR_MAX == 255u + typedef unsigned char uint_8t; +# else +# error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h +# endif +#endif + +#ifndef BRG_UI16 +# define BRG_UI16 +# if USHRT_MAX == 65535u + typedef unsigned short uint_16t; +# else +# error Please define uint_16t as a 16-bit unsigned short type in brg_types.h +# endif +#endif + +#ifndef BRG_UI32 +# define BRG_UI32 +# if UINT_MAX == 4294967295u +# define li_32(h) 0x##h##u + typedef unsigned int uint_32t; +# elif ULONG_MAX == 4294967295u +# define li_32(h) 0x##h##ul + typedef unsigned long uint_32t; +# elif defined( _CRAY ) +# error This code needs 32-bit data types, which Cray machines do not provide +# else +# error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h +# endif +#endif + +#ifndef BRG_UI64 +# if defined( __BORLANDC__ ) && !defined( __MSDOS__ ) +# define BRG_UI64 +# define li_64(h) 0x##h##ui64 + typedef unsigned __int64 uint_64t; +# elif defined( _MSC_VER ) && ( _MSC_VER < 1300 ) /* 1300 == VC++ 7.0 */ +# define BRG_UI64 +# define li_64(h) 0x##h##ui64 + typedef unsigned __int64 uint_64t; +# elif defined( __sun ) && defined( ULONG_MAX ) && ULONG_MAX == 0xfffffffful +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned long long uint_64t; +# elif defined( __MVS__ ) +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned int long long uint_64t; +# elif defined( UINT_MAX ) && UINT_MAX > 4294967295u +# if UINT_MAX == 18446744073709551615u +# define BRG_UI64 +# define li_64(h) 0x##h##u + typedef unsigned int uint_64t; +# endif +# elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u +# if ULONG_MAX == 18446744073709551615ul +# define BRG_UI64 +# define li_64(h) 0x##h##ul + typedef unsigned long uint_64t; +# endif +# elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u +# if ULLONG_MAX == 18446744073709551615ull +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned long long uint_64t; +# endif +# elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u +# if ULONG_LONG_MAX == 18446744073709551615ull +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned long long uint_64t; +# endif +# endif +#endif + +#if !defined( BRG_UI64 ) +# if defined( NEED_UINT_64T ) +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned long long uint_64t; + /*# error Please define uint_64t as an unsigned 64 bit type in brg_types.h*/ +# endif +#endif + +#ifndef RETURN_VALUES +# define RETURN_VALUES 
+# if defined( DLL_EXPORT ) +# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER ) +# define VOID_RETURN __declspec( dllexport ) void __stdcall +# define INT_RETURN __declspec( dllexport ) int __stdcall +# elif defined( __GNUC__ ) +# define VOID_RETURN __declspec( __dllexport__ ) void +# define INT_RETURN __declspec( __dllexport__ ) int +# else +# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers +# endif +# elif defined( DLL_IMPORT ) +# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER ) +# define VOID_RETURN __declspec( dllimport ) void __stdcall +# define INT_RETURN __declspec( dllimport ) int __stdcall +# elif defined( __GNUC__ ) +# define VOID_RETURN __declspec( __dllimport__ ) void +# define INT_RETURN __declspec( __dllimport__ ) int +# else +# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers +# endif +# elif defined( __WATCOMC__ ) +# define VOID_RETURN void __cdecl +# define INT_RETURN int __cdecl +# else +# define VOID_RETURN void +# define INT_RETURN int +# endif +#endif + +/* These defines are used to detect and set the memory alignment of pointers. + Note that offsets are in bytes. + + ALIGN_OFFSET(x,n) return the positive or zero offset of + the memory addressed by the pointer 'x' + from an address that is aligned on an + 'n' byte boundary ('n' is a power of 2) + + ALIGN_FLOOR(x,n) return a pointer that points to memory + that is aligned on an 'n' byte boundary + and is not higher than the memory address + pointed to by 'x' ('n' is a power of 2) + + ALIGN_CEIL(x,n) return a pointer that points to memory + that is aligned on an 'n' byte boundary + and is not lower than the memory address + pointed to by 'x' ('n' is a power of 2) +*/ + +#define ALIGN_OFFSET(x,n) (((ptrint_t)(x)) & ((n) - 1)) +#define ALIGN_FLOOR(x,n) ((uint_8t*)(x) - ( ((ptrint_t)(x)) & ((n) - 1))) +#define ALIGN_CEIL(x,n) ((uint_8t*)(x) + (-((ptrint_t)(x)) & ((n) - 1))) + +/* These defines are used to declare buffers in a way that allows + faster operations on longer variables to be used. In all these + defines 'size' must be a power of 2 and >= 8. NOTE that the + buffer size is in bytes but the type length is in bits + + UNIT_TYPEDEF(x,size) declares a variable 'x' of length + 'size' bits + + BUFR_TYPEDEF(x,size,bsize) declares a buffer 'x' of length 'bsize' + bytes defined as an array of variables + each of 'size' bits (bsize must be a + multiple of size / 8) + + UNIT_CAST(x,size) casts a variable to a type of + length 'size' bits + + UPTR_CAST(x,size) casts a pointer to a pointer to a + varaiable of length 'size' bits +*/ + +#define UI_TYPE(size) uint_##size##t +#define UNIT_TYPEDEF(x,size) typedef UI_TYPE(size) x +#define BUFR_TYPEDEF(x,size,bsize) typedef UI_TYPE(size) x[bsize / (size >> 3)] +#define UNIT_CAST(x,size) ((UI_TYPE(size) )(x)) +#define UPTR_CAST(x,size) ((UI_TYPE(size)*)(x)) + + /* Added by Soeren S. Thomsen (begin) */ +#define u8 uint_8t +#define u32 uint_32t +#define u64 uint_64t + /* (end) */ + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/algo/aes_ni/groestl-asm-aes.h b/algo/aes_ni/groestl-asm-aes.h new file mode 100644 index 000000000..c4e44a4d6 --- /dev/null +++ b/algo/aes_ni/groestl-asm-aes.h @@ -0,0 +1,1043 @@ +/* groestl-asm-aes.h Aug 2011 + * + * Groestl implementation with inline assembly using ssse3, sse4.1, and aes + * instructions. + * Authors: Günther A. 
Roland, Martin Schläffer, Krystian Matusiewicz + * + * This code is placed in the public domain + */ + +#include "hash-groestl.h" +/* global constants */ +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; +__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16]; +__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16]; +__attribute__ ((aligned (16))) unsigned char ALL_1B[16]; +__attribute__ ((aligned (16))) unsigned char ALL_FF[16]; + +/* temporary variables */ +__attribute__ ((aligned (16))) unsigned char QTEMP[8*16]; +__attribute__ ((aligned (16))) unsigned char TEMP[3*16]; + + +#define tos(a) #a +#define tostr(a) tos(a) + + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b */ +#define MUL2(i, j, k){\ + asm("pxor xmm"tostr(j)", xmm"tostr(j)"");\ + asm("pcmpgtb xmm"tostr(j)", xmm"tostr(i)"");\ + asm("paddb xmm"tostr(i)", xmm"tostr(i)"");\ + asm("pand xmm"tostr(j)", xmm"tostr(k)"");\ + asm("pxor xmm"tostr(i)", xmm"tostr(j)"");\ +}/**/ + +/* Yet another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + asm("movdqa xmm"tostr(b6)", xmm"tostr(a0)"");\ + asm("movdqa xmm"tostr(b7)", xmm"tostr(a1)"");\ + asm("pxor xmm"tostr(a0)", xmm"tostr(a1)"");\ + asm("movdqa xmm"tostr(b0)", xmm"tostr(a2)"");\ + asm("pxor xmm"tostr(a1)", xmm"tostr(a2)"");\ + asm("movdqa xmm"tostr(b1)", xmm"tostr(a3)"");\ + asm("pxor xmm"tostr(a2)", xmm"tostr(a3)"");\ + asm("movdqa xmm"tostr(b2)", xmm"tostr(a4)"");\ + asm("pxor xmm"tostr(a3)", xmm"tostr(a4)"");\ + asm("movdqa xmm"tostr(b3)", xmm"tostr(a5)"");\ + asm("pxor xmm"tostr(a4)", xmm"tostr(a5)"");\ + asm("movdqa xmm"tostr(b4)", xmm"tostr(a6)"");\ + asm("pxor xmm"tostr(a5)", xmm"tostr(a6)"");\ + asm("movdqa xmm"tostr(b5)", xmm"tostr(a7)"");\ + asm("pxor xmm"tostr(a6)", xmm"tostr(a7)"");\ + asm("pxor xmm"tostr(a7)", xmm"tostr(b6)"");\ + \ + /* build y4 y5 y6 ... 
in regs xmm8, xmm9, xmm10 by adding t_i*/\ + asm("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\ + asm("pxor xmm"tostr(b6)", xmm"tostr(a4)"");\ + asm("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ + asm("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\ + asm("pxor xmm"tostr(b2)", xmm"tostr(a6)"");\ + asm("pxor xmm"tostr(b0)", xmm"tostr(a6)"");\ + /* spill values y_4, y_5 to memory */\ + asm("movaps [TEMP+0*16], xmm"tostr(b0)"");\ + asm("pxor xmm"tostr(b3)", xmm"tostr(a7)"");\ + asm("pxor xmm"tostr(b1)", xmm"tostr(a7)"");\ + asm("movaps [TEMP+1*16], xmm"tostr(b1)"");\ + asm("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\ + asm("pxor xmm"tostr(b2)", xmm"tostr(a0)"");\ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + asm("movdqa xmm"tostr(b0)", xmm"tostr(a0)"");\ + asm("pxor xmm"tostr(b5)", xmm"tostr(a1)"");\ + asm("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\ + asm("movdqa xmm"tostr(b1)", xmm"tostr(a1)"");\ + asm("pxor xmm"tostr(b6)", xmm"tostr(a2)"");\ + asm("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\ + asm("movaps [TEMP+2*16], xmm"tostr(a2)"");\ + asm("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\ + asm("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ + \ + /* compute x_i = t_i + t_{i+3} */\ + asm("pxor xmm"tostr(a0)", xmm"tostr(a3)"");\ + asm("pxor xmm"tostr(a1)", xmm"tostr(a4)"");\ + asm("pxor xmm"tostr(a2)", xmm"tostr(a5)"");\ + asm("pxor xmm"tostr(a3)", xmm"tostr(a6)"");\ + asm("pxor xmm"tostr(a4)", xmm"tostr(a7)"");\ + asm("pxor xmm"tostr(a5)", xmm"tostr(b0)"");\ + asm("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ + asm("pxor xmm"tostr(a7)", [TEMP+2*16]");\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + asm("movaps xmm"tostr(b1)", [ALL_1B]");\ + MUL2(a0, b0, b1);\ + asm("pxor xmm"tostr(a0)", [TEMP+0*16]");\ + MUL2(a1, b0, b1);\ + asm("pxor xmm"tostr(a1)", [TEMP+1*16]");\ + MUL2(a2, b0, b1);\ + asm("pxor xmm"tostr(a2)", xmm"tostr(b2)"");\ + MUL2(a3, b0, b1);\ + asm("pxor xmm"tostr(a3)", xmm"tostr(b3)"");\ + MUL2(a4, b0, b1);\ + asm("pxor xmm"tostr(a4)", xmm"tostr(b4)"");\ + MUL2(a5, b0, b1);\ + asm("pxor xmm"tostr(a5)", xmm"tostr(b5)"");\ + MUL2(a6, b0, b1);\ + asm("pxor xmm"tostr(a6)", xmm"tostr(b6)"");\ + MUL2(a7, b0, b1);\ + asm("pxor xmm"tostr(a7)", xmm"tostr(b7)"");\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... 
*/\ + MUL2(a0, b0, b1);\ + asm("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\ + MUL2(a1, b0, b1);\ + asm("pxor xmm"tostr(b6)", xmm"tostr(a1)"");\ + MUL2(a2, b0, b1);\ + asm("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\ + MUL2(a5, b0, b1);\ + asm("pxor xmm"tostr(b2)", xmm"tostr(a5)"");\ + MUL2(a6, b0, b1);\ + asm("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\ + MUL2(a7, b0, b1);\ + asm("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\ + MUL2(a3, b0, b1);\ + MUL2(a4, b0, b1);\ + asm("movaps xmm"tostr(b0)", [TEMP+0*16]");\ + asm("movaps xmm"tostr(b1)", [TEMP+1*16]");\ + asm("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\ + asm("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\ +}/*MixBytes*/ + +#if (LENGTH <= 256) + +#define SET_CONSTANTS(){\ + ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ + ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ + ((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\ + ((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\ + ((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\ + ((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\ + ((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\ + ((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\ + ((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\ + ((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\ + ((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\ + ((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\ + ((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\ + ((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\ + ((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\ + ((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\ + ((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\ + ((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ + ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ + ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ + }\ + ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ +}while(0); + +#define Push_All_Regs() do{\ +/* not using any... + asm("push rax");\ + asm("push rbx");\ + asm("push rcx");*/\ +}while(0); + +#define Pop_All_Regs() do{\ +/* not using any... 
+ asm("pop rcx");\ + asm("pop rbx");\ + asm("pop rax");*/\ +}while(0); + +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant */\ + asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ + asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ + asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ + /* ShiftBytes + SubBytes (interleaved) */\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ + asm ("aesenclast xmm"tostr(a0)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ + asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ + asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ + asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ + asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\ + asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ + asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ + asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ +} + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ + \ + asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ + \ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ + \ + asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\ + asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\ + \ + asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ + asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ + asm ("pshufd 
xmm"tostr(t0)", xmm"tostr(t0)", 216");\ + \ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\ + \ + asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\ + asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\ + asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\ + asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\ + asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ + asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\ + asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\ + asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\ + asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\ + asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\ + asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\ + asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\ + asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i7)", 
xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ +}/**/ + + +void INIT(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + asm volatile ("emms"); + + /* load IV into registers xmm12 - xmm15 */ + asm ("movaps xmm12, [rdi+0*16]"); + asm ("movaps xmm13, [rdi+1*16]"); + asm ("movaps xmm14, [rdi+2*16]"); + asm ("movaps xmm15, [rdi+3*16]"); + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* store transposed IV */ + asm ("movaps [rdi+0*16], xmm12"); + asm ("movaps [rdi+1*16], xmm2"); + asm ("movaps [rdi+2*16], xmm6"); + asm ("movaps [rdi+3*16], xmm7"); + + asm volatile ("emms"); + asm (".att_syntax noprefix"); +} + +void TF512(u64* h, u64* m) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + /* message M in rsi */ + +#ifdef IACA_TRACE + IACA_START; +#endif + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load message into registers xmm12 - xmm15 (Q = message) */ + asm ("movaps xmm12, [rsi+0*16]"); + asm ("movaps xmm13, [rsi+1*16]"); + asm ("movaps xmm14, [rsi+2*16]"); + asm ("movaps xmm15, [rsi+3*16]"); + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (2x64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm0, [rdi+1*16]"); + asm ("movaps xmm4, [rdi+2*16]"); + asm ("movaps xmm5, [rdi+3*16]"); + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + asm ("pxor xmm8, xmm12"); + asm ("pxor xmm0, xmm2"); + asm ("pxor xmm4, xmm6"); + asm ("pxor xmm5, xmm7"); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + asm ("pxor xmm0, xmm8"); + asm ("pxor xmm1, xmm10"); + asm ("pxor xmm2, xmm12"); + asm ("pxor xmm3, xmm14"); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + asm ("pxor xmm0, [rdi+0*16]"); + asm ("pxor xmm1, [rdi+1*16]"); + asm ("pxor xmm2, [rdi+2*16]"); + asm ("pxor xmm3, [rdi+3*16]"); + + /* store CV */ + asm ("movaps [rdi+0*16], xmm0"); + asm ("movaps [rdi+1*16], xmm1"); + asm ("movaps [rdi+2*16], xmm2"); + asm ("movaps [rdi+3*16], xmm3"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); 
+ +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF512(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm10, [rdi+1*16]"); + asm ("movaps xmm12, [rdi+2*16]"); + asm ("movaps xmm14, [rdi+3*16]"); + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm10, [rdi+1*16]"); + asm ("pxor xmm12, [rdi+2*16]"); + asm ("pxor xmm14, [rdi+3*16]"); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); + + /* we only need to return the truncated half of the state */ + asm ("movaps [rdi+2*16], xmm9"); + asm ("movaps [rdi+3*16], xmm11"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + + return; +} + +#endif + +#if (LENGTH > 256) + +#define SET_CONSTANTS(){\ + ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_FF)[0] = 0xffffffffffffffffULL;\ + ((u64*)ALL_FF)[1] = 0xffffffffffffffffULL;\ + ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ + ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ + ((u64*)SUBSH_MASK)[ 0] = 0x0b0e0104070a0d00ULL;\ + ((u64*)SUBSH_MASK)[ 1] = 0x0306090c0f020508ULL;\ + ((u64*)SUBSH_MASK)[ 2] = 0x0c0f0205080b0e01ULL;\ + ((u64*)SUBSH_MASK)[ 3] = 0x04070a0d00030609ULL;\ + ((u64*)SUBSH_MASK)[ 4] = 0x0d000306090c0f02ULL;\ + ((u64*)SUBSH_MASK)[ 5] = 0x05080b0e0104070aULL;\ + ((u64*)SUBSH_MASK)[ 6] = 0x0e0104070a0d0003ULL;\ + ((u64*)SUBSH_MASK)[ 7] = 0x06090c0f0205080bULL;\ + ((u64*)SUBSH_MASK)[ 8] = 0x0f0205080b0e0104ULL;\ + ((u64*)SUBSH_MASK)[ 9] = 0x070a0d000306090cULL;\ + ((u64*)SUBSH_MASK)[10] = 0x000306090c0f0205ULL;\ + ((u64*)SUBSH_MASK)[11] = 0x080b0e0104070a0dULL;\ + ((u64*)SUBSH_MASK)[12] = 0x0104070a0d000306ULL;\ + ((u64*)SUBSH_MASK)[13] = 0x090c0f0205080b0eULL;\ + ((u64*)SUBSH_MASK)[14] = 0x06090c0f0205080bULL;\ + ((u64*)SUBSH_MASK)[15] = 0x0e0104070a0d0003ULL;\ + for(i = 0; i < ROUNDS1024; i++)\ + {\ + ((u64*)ROUND_CONST_P)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0xf0e0d0c0b0a09080ULL;\ + ((u64*)ROUND_CONST_P)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ + ((u64*)ROUND_CONST_Q)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0x0f1f2f3f4f5f6f7fULL;\ + ((u64*)ROUND_CONST_Q)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ + }\ +}while(0); + +#define Push_All_Regs() do{\ + asm("push rax");\ + asm("push rbx");\ + asm("push rcx");\ +}while(0); + +#define Pop_All_Regs() do{\ + asm("pop rcx");\ + asm("pop rbx");\ + asm("pop rax");\ +}while(0); + +/* one round + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* SubBytes */\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(b0)"");\ + asm ("aesenclast 
xmm"tostr(a0)", xmm"tostr(b0)"");\ + asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\ + asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\ + asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\ + asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\ + asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\ + asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\ + asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +#define ROUNDS_P(){\ + asm ("xor rax, rax");\ + asm ("xor rbx, rbx");\ + asm ("add bl, 2");\ + asm ("1:");\ + /* AddRoundConstant P1024 */\ + asm ("pxor xmm8, [ROUND_CONST_P+eax*8]");\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + asm ("pshufb xmm8, [SUBSH_MASK+0*16]");\ + asm ("pshufb xmm9, [SUBSH_MASK+1*16]");\ + asm ("pshufb xmm10, [SUBSH_MASK+2*16]");\ + asm ("pshufb xmm11, [SUBSH_MASK+3*16]");\ + asm ("pshufb xmm12, [SUBSH_MASK+4*16]");\ + asm ("pshufb xmm13, [SUBSH_MASK+5*16]");\ + asm ("pshufb xmm14, [SUBSH_MASK+6*16]");\ + asm ("pshufb xmm15, [SUBSH_MASK+7*16]");\ + /* SubBytes + MixBytes */\ + SUBMIX(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + \ + /* AddRoundConstant P1024 */\ + asm ("pxor xmm0, [ROUND_CONST_P+ebx*8]");\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + asm ("pshufb xmm0, [SUBSH_MASK+0*16]");\ + asm ("pshufb xmm1, [SUBSH_MASK+1*16]");\ + asm ("pshufb xmm2, [SUBSH_MASK+2*16]");\ + asm ("pshufb xmm3, [SUBSH_MASK+3*16]");\ + asm ("pshufb xmm4, [SUBSH_MASK+4*16]");\ + asm ("pshufb xmm5, [SUBSH_MASK+5*16]");\ + asm ("pshufb xmm6, [SUBSH_MASK+6*16]");\ + asm ("pshufb xmm7, [SUBSH_MASK+7*16]");\ + /* SubBytes + MixBytes */\ + SUBMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + asm ("add al, 4");\ + asm ("add bl, 4");\ + asm ("mov rcx, rax");\ + asm ("sub cl, 28");\ + asm ("jb 1b");\ +} + +#define ROUNDS_Q(){\ + asm ("xor rax, rax");\ + asm ("xor rbx, rbx");\ + asm ("add bl, 2");\ + asm ("2:");\ + /* AddRoundConstant Q1024 */\ + asm ("movaps xmm1, [ALL_FF]");\ + asm ("pxor xmm8, xmm1");\ + asm ("pxor xmm9, xmm1");\ + asm ("pxor xmm10, xmm1");\ + asm ("pxor xmm11, xmm1");\ + asm ("pxor xmm12, xmm1");\ + asm ("pxor xmm13, xmm1");\ + asm ("pxor xmm14, xmm1");\ + asm ("pxor xmm15, [ROUND_CONST_Q+eax*8]");\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + asm ("pshufb xmm8, [SUBSH_MASK+1*16]");\ + asm ("pshufb xmm9, [SUBSH_MASK+3*16]");\ + asm ("pshufb xmm10, [SUBSH_MASK+5*16]");\ + asm ("pshufb xmm11, [SUBSH_MASK+7*16]");\ + asm ("pshufb xmm12, [SUBSH_MASK+0*16]");\ + asm ("pshufb xmm13, [SUBSH_MASK+2*16]");\ + asm ("pshufb xmm14, [SUBSH_MASK+4*16]");\ + asm ("pshufb xmm15, [SUBSH_MASK+6*16]");\ + /* SubBytes + MixBytes */\ + SUBMIX(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + \ + /* AddConstant */\ + asm ("movaps xmm9, [ALL_FF]");\ + asm ("pxor xmm0, xmm9");\ + asm ("pxor xmm1, xmm9");\ + asm ("pxor xmm2, xmm9");\ + asm ("pxor xmm3, xmm9");\ + asm ("pxor xmm4, xmm9");\ + asm ("pxor xmm5, xmm9");\ + asm ("pxor xmm6, xmm9");\ + asm ("pxor xmm7, [ROUND_CONST_Q+ebx*8]");\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + asm ("pshufb xmm0, [SUBSH_MASK+1*16]");\ + asm ("pshufb xmm1, [SUBSH_MASK+3*16]");\ + asm ("pshufb xmm2, [SUBSH_MASK+5*16]");\ + asm ("pshufb xmm3, [SUBSH_MASK+7*16]");\ + asm ("pshufb xmm4, [SUBSH_MASK+0*16]");\ + asm ("pshufb xmm5, [SUBSH_MASK+2*16]");\ + asm ("pshufb xmm6, [SUBSH_MASK+4*16]");\ + asm ("pshufb xmm7, [SUBSH_MASK+6*16]");\ + /* SubBytes + MixBytes */\ + SUBMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + 
asm ("add al, 4");\ + asm ("add bl, 4");\ + asm ("mov rcx, rax");\ + asm ("sub cl, 28");\ + asm ("jb 2b");\ +} + +/* Matrix Transpose + * input is a 1024-bit state with two columns in one xmm + * output is a 1024-bit state with two rows in one xmm + * inputs: i0-i7 + * outputs: i0-i7 + * clobbers: t0-t7 + */ +#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ + asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ + \ + asm ("pshufb xmm"tostr(i6)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ + asm ("movdqa xmm"tostr(t1)", xmm"tostr(i2)"");\ + asm ("pshufb xmm"tostr(i4)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i5)", xmm"tostr(t0)"");\ + asm ("movdqa xmm"tostr(t2)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(t3)", xmm"tostr(i6)"");\ + asm ("pshufb xmm"tostr(i7)", xmm"tostr(t0)"");\ + \ + /* continue with unpack using 4 temp registers */\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(i0)"");\ + asm ("punpckhwd xmm"tostr(t2)", xmm"tostr(i5)"");\ + asm ("punpcklwd xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpckhwd xmm"tostr(t3)", xmm"tostr(i7)"");\ + asm ("punpcklwd xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i1)"");\ + asm ("punpckhwd xmm"tostr(t1)", xmm"tostr(i3)"");\ + asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ + \ + /* shuffle with immediate */\ + asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ + asm ("pshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ + asm ("pshufd xmm"tostr(t2)", xmm"tostr(t2)", 216");\ + asm ("pshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ + asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ + asm ("pshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ + asm ("pshufd xmm"tostr(i6)", xmm"tostr(i6)", 216");\ + \ + /* continue with unpack */\ + asm ("movdqa xmm"tostr(t4)", xmm"tostr(i0)"");\ + asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("punpckhdq xmm"tostr(t4)", xmm"tostr(i2)"");\ + asm ("movdqa xmm"tostr(t5)", xmm"tostr(t0)"");\ + asm ("punpckldq xmm"tostr(t0)", xmm"tostr(t1)"");\ + asm ("punpckhdq xmm"tostr(t5)", xmm"tostr(t1)"");\ + asm ("movdqa xmm"tostr(t6)", xmm"tostr(i4)"");\ + asm ("punpckldq xmm"tostr(i4)", xmm"tostr(i6)"");\ + asm ("movdqa xmm"tostr(t7)", xmm"tostr(t2)"");\ + asm ("punpckhdq xmm"tostr(t6)", xmm"tostr(i6)"");\ + asm ("movdqa xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("punpckldq xmm"tostr(t2)", xmm"tostr(t3)"");\ + asm ("movdqa xmm"tostr(i3)", xmm"tostr(t0)"");\ + asm ("punpckhdq xmm"tostr(t7)", xmm"tostr(t3)"");\ + \ + /* there are now 2 rows in each xmm */\ + /* unpack to get 1 row of CV in each xmm */\ + asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ + asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(i4)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(i4)", xmm"tostr(t4)"");\ + asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t2)"");\ + asm ("movdqa xmm"tostr(i5)", xmm"tostr(t4)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t2)"");\ + asm ("movdqa xmm"tostr(i6)", xmm"tostr(t5)"");\ + asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t6)"");\ + asm ("movdqa xmm"tostr(i7)", xmm"tostr(t5)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t6)"");\ + asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t7)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t7)"");\ + /* 
transpose done */\ +}/**/ + +/* Matrix Transpose Inverse + * input is a 1024-bit state with two rows in one xmm + * output is a 1024-bit state with two columns in one xmm + * inputs: i0-i7 + * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) + * clobbers: t0-t4 + */ +#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ + /* transpose matrix to get output format */\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i1)"");\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpckhqdq xmm"tostr(t0)", xmm"tostr(i3)"");\ + asm ("movdqa xmm"tostr(t1)", xmm"tostr(i4)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpckhqdq xmm"tostr(t1)", xmm"tostr(i5)"");\ + asm ("movdqa xmm"tostr(t2)", xmm"tostr(i6)"");\ + asm ("movaps xmm"tostr(o0)", [TRANSP_MASK]");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("punpckhqdq xmm"tostr(t2)", xmm"tostr(i7)"");\ + /* load transpose mask into a register, because it will be used 8 times */\ + asm ("pshufb xmm"tostr(i0)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(i2)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(i4)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(i6)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(o1)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(t0)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(t1)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(t2)", xmm"tostr(o0)"");\ + /* continue with unpack using 4 temp registers */\ + asm ("movdqa xmm"tostr(t3)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(o1)"");\ + asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(t4)", xmm"tostr(t1)"");\ + \ + asm ("punpckhwd xmm"tostr(t3)", xmm"tostr(i6)"");\ + asm ("punpcklwd xmm"tostr(i4)", xmm"tostr(i6)"");\ + asm ("punpckhwd xmm"tostr(o0)", xmm"tostr(i2)"");\ + asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("punpckhwd xmm"tostr(o2)", xmm"tostr(t0)"");\ + asm ("punpcklwd xmm"tostr(o1)", xmm"tostr(t0)"");\ + asm ("punpckhwd xmm"tostr(t4)", xmm"tostr(t2)"");\ + asm ("punpcklwd xmm"tostr(t1)", xmm"tostr(t2)"");\ + /* shuffle with immediate */\ + asm ("pshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ + asm ("pshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ + asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ + asm ("pshufd xmm"tostr(o2)", xmm"tostr(o2)", 216");\ + asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("pshufd xmm"tostr(o0)", xmm"tostr(o0)", 216");\ + asm ("pshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ + asm ("pshufd xmm"tostr(t4)", xmm"tostr(t4)", 216");\ + /* continue with unpack */\ + asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(i3)", xmm"tostr(o0)"");\ + asm ("movdqa xmm"tostr(i5)", xmm"tostr(o1)"");\ + asm ("movdqa xmm"tostr(i7)", xmm"tostr(o2)"");\ + asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("punpckhdq xmm"tostr(i1)", xmm"tostr(i4)"");\ + asm ("punpckldq xmm"tostr(o0)", xmm"tostr(t3)"");\ + asm ("punpckhdq xmm"tostr(i3)", xmm"tostr(t3)"");\ + asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t1)"");\ + asm ("punpckhdq xmm"tostr(i5)", xmm"tostr(t1)"");\ + asm ("punpckldq xmm"tostr(o2)", xmm"tostr(t4)"");\ + asm ("punpckhdq xmm"tostr(i7)", xmm"tostr(t4)"");\ + /* transpose done */\ +}/**/ + + +void INIT(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + asm volatile 
("emms"); + + /* load IV into registers xmm8 - xmm15 */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm9, [rdi+1*16]"); + asm ("movaps xmm10, [rdi+2*16]"); + asm ("movaps xmm11, [rdi+3*16]"); + asm ("movaps xmm12, [rdi+4*16]"); + asm ("movaps xmm13, [rdi+5*16]"); + asm ("movaps xmm14, [rdi+6*16]"); + asm ("movaps xmm15, [rdi+7*16]"); + + /* transform chaining value from column ordering into row ordering */ + Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); + + /* store transposed IV */ + asm ("movaps [rdi+0*16], xmm8"); + asm ("movaps [rdi+1*16], xmm9"); + asm ("movaps [rdi+2*16], xmm10"); + asm ("movaps [rdi+3*16], xmm11"); + asm ("movaps [rdi+4*16], xmm12"); + asm ("movaps [rdi+5*16], xmm13"); + asm ("movaps [rdi+6*16], xmm14"); + asm ("movaps [rdi+7*16], xmm15"); + + asm volatile ("emms"); + asm (".att_syntax noprefix"); +} + +void TF1024(u64* h, u64* m) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + /* message M in rsi */ + +#ifdef IACA_TRACE + IACA_START; +#endif + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load message into registers xmm8 - xmm15 (Q = message) */ + asm ("movaps xmm8, [rsi+0*16]"); + asm ("movaps xmm9, [rsi+1*16]"); + asm ("movaps xmm10, [rsi+2*16]"); + asm ("movaps xmm11, [rsi+3*16]"); + asm ("movaps xmm12, [rsi+4*16]"); + asm ("movaps xmm13, [rsi+5*16]"); + asm ("movaps xmm14, [rsi+6*16]"); + asm ("movaps xmm15, [rsi+7*16]"); + + /* transform message M from column ordering into row ordering */ + Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); + + /* store message M (Q input) for later */ + asm ("movaps [QTEMP+0*16], xmm8"); + asm ("movaps [QTEMP+1*16], xmm9"); + asm ("movaps [QTEMP+2*16], xmm10"); + asm ("movaps [QTEMP+3*16], xmm11"); + asm ("movaps [QTEMP+4*16], xmm12"); + asm ("movaps [QTEMP+5*16], xmm13"); + asm ("movaps [QTEMP+6*16], xmm14"); + asm ("movaps [QTEMP+7*16], xmm15"); + + /* xor CV to message to get P input */ + /* result: CV+M in xmm8...xmm15 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm9, [rdi+1*16]"); + asm ("pxor xmm10, [rdi+2*16]"); + asm ("pxor xmm11, [rdi+3*16]"); + asm ("pxor xmm12, [rdi+4*16]"); + asm ("pxor xmm13, [rdi+5*16]"); + asm ("pxor xmm14, [rdi+6*16]"); + asm ("pxor xmm15, [rdi+7*16]"); + + /* compute permutation P */ + /* result: P(CV+M) in xmm8...xmm15 */ + ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV+M)+CV in xmm8...xmm15 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm9, [rdi+1*16]"); + asm ("pxor xmm10, [rdi+2*16]"); + asm ("pxor xmm11, [rdi+3*16]"); + asm ("pxor xmm12, [rdi+4*16]"); + asm ("pxor xmm13, [rdi+5*16]"); + asm ("pxor xmm14, [rdi+6*16]"); + asm ("pxor xmm15, [rdi+7*16]"); + + /* store P(CV+M)+CV */ + asm ("movaps [rdi+0*16], xmm8"); + asm ("movaps [rdi+1*16], xmm9"); + asm ("movaps [rdi+2*16], xmm10"); + asm ("movaps [rdi+3*16], xmm11"); + asm ("movaps [rdi+4*16], xmm12"); + asm ("movaps [rdi+5*16], xmm13"); + asm ("movaps [rdi+6*16], xmm14"); + asm ("movaps [rdi+7*16], xmm15"); + + /* load message M (Q input) into xmm8-15 */ + asm ("movaps xmm8, [QTEMP+0*16]"); + asm ("movaps xmm9, [QTEMP+1*16]"); + asm ("movaps xmm10, [QTEMP+2*16]"); + asm ("movaps xmm11, [QTEMP+3*16]"); + asm ("movaps xmm12, [QTEMP+4*16]"); + asm ("movaps xmm13, [QTEMP+5*16]"); + asm ("movaps xmm14, [QTEMP+6*16]"); + asm ("movaps xmm15, [QTEMP+7*16]"); + + /* compute permutation Q */ + /* result: Q(M) in xmm8...xmm15 */ + ROUNDS_Q(); + + /* xor Q output */ + /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ + asm 
("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm9, [rdi+1*16]"); + asm ("pxor xmm10, [rdi+2*16]"); + asm ("pxor xmm11, [rdi+3*16]"); + asm ("pxor xmm12, [rdi+4*16]"); + asm ("pxor xmm13, [rdi+5*16]"); + asm ("pxor xmm14, [rdi+6*16]"); + asm ("pxor xmm15, [rdi+7*16]"); + + /* store CV */ + asm ("movaps [rdi+0*16], xmm8"); + asm ("movaps [rdi+1*16], xmm9"); + asm ("movaps [rdi+2*16], xmm10"); + asm ("movaps [rdi+3*16], xmm11"); + asm ("movaps [rdi+4*16], xmm12"); + asm ("movaps [rdi+5*16], xmm13"); + asm ("movaps [rdi+6*16], xmm14"); + asm ("movaps [rdi+7*16], xmm15"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + +#ifdef IACA_TRACE + IACA_END; +#endif + + return; +} + +void OF1024(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load CV into registers xmm8 - xmm15 */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm9, [rdi+1*16]"); + asm ("movaps xmm10, [rdi+2*16]"); + asm ("movaps xmm11, [rdi+3*16]"); + asm ("movaps xmm12, [rdi+4*16]"); + asm ("movaps xmm13, [rdi+5*16]"); + asm ("movaps xmm14, [rdi+6*16]"); + asm ("movaps xmm15, [rdi+7*16]"); + + /* compute permutation P */ + /* result: P(CV) in xmm8...xmm15 */ + ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8...xmm15 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm9, [rdi+1*16]"); + asm ("pxor xmm10, [rdi+2*16]"); + asm ("pxor xmm11, [rdi+3*16]"); + asm ("pxor xmm12, [rdi+4*16]"); + asm ("pxor xmm13, [rdi+5*16]"); + asm ("pxor xmm14, [rdi+6*16]"); + asm ("pxor xmm15, [rdi+7*16]"); + + /* transpose CV back from row ordering to column ordering */ + /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ + Matrix_Transpose_INV(8, 9, 10, 11, 12, 13, 14, 15, 4, 0, 6, 1, 2, 3, 5, 7); + + /* we only need to return the truncated half of the state */ + asm ("movaps [rdi+4*16], xmm0"); + asm ("movaps [rdi+5*16], xmm6"); + asm ("movaps [rdi+6*16], xmm13"); + asm ("movaps [rdi+7*16], xmm15"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + + return; +} + +#endif + diff --git a/algo/aes_ni/groestl-asm-avx.h b/algo/aes_ni/groestl-asm-avx.h new file mode 100644 index 000000000..6e8be1be4 --- /dev/null +++ b/algo/aes_ni/groestl-asm-avx.h @@ -0,0 +1,1105 @@ +/* groestl-asm-avx.h Aug 2011 + * + * Groestl implementation with inline assembly using ssse3, sse4.1, aes and avx + * instructions. + * Author: Günther A. 
Roland, Martin Schläffer, Krystian Matusiewicz + * + * This code is placed in the public domain + */ + +#include "hash-groestl.h" + +/* global variables */ +__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Lx[16]; +__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; +__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; +__attribute__ ((aligned (32))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; +__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; +__attribute__ ((aligned (32))) unsigned char TRANSP_MASK[16]; +__attribute__ ((aligned (32))) unsigned char SUBSH_MASK[8*16]; +__attribute__ ((aligned (32))) unsigned char ALL_1B[32]; +__attribute__ ((aligned (32))) unsigned char ALL_FF[32]; + +/* temporary variables */ +__attribute__ ((aligned (32))) unsigned char TEMP[6*32]; + + +#define tos(a) #a +#define tostr(a) tos(a) + +#if (LENGTH <= 256) + +#define SET_CONSTANTS(){\ + ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ + ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ + ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\ + ((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\ + ((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\ + ((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\ + ((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\ + ((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\ + ((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\ + ((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\ + ((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\ + ((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\ + ((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\ + ((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\ + ((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\ + ((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\ + ((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\ + ((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ + ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ + ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ + }\ + ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ +}while(0); + +#define Push_All_Regs() do{\ +/* not using any... + asm("push rax");\ + asm("push rbx");\ + asm("push rcx");*/\ +}while(0); + +#define Pop_All_Regs() do{\ +/* not using any... + asm("pop rcx");\ + asm("pop rbx");\ + asm("pop rax");*/\ +}while(0); + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b + * xmm[z] has to be zero */ +#define VMUL2(i, j, k, z){\ + asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\ + asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ + asm("vpand xmm"tostr(j)", xmm"tostr(j)", xmm"tostr(k)"");\ + asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ +}/**/ + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b + * xmm[z] has to be zero */ +#define VMUL2v2(i, j, k, z){\ + asm("vpblendvb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(k)", xmm"tostr(i)"");\ + asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ + asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ +}/**/ + +/* Yet another implementation of MixBytes. 
+ This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\ + asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a2)"");\ + asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a3)"");\ + asm("vmovdqa xmm"tostr(b2)", xmm"tostr(a4)"");\ + asm("vmovdqa xmm"tostr(b3)", xmm"tostr(a5)"");\ + asm("vmovdqa xmm"tostr(b4)", xmm"tostr(a6)"");\ + asm("vmovdqa xmm"tostr(b5)", xmm"tostr(a7)"");\ + asm("vmovdqa xmm"tostr(b6)", xmm"tostr(a0)"");\ + asm("vmovdqa xmm"tostr(b7)", xmm"tostr(a1)"");\ + \ + /* t_i = a_i + a_{i+1} */\ + asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a1)"");\ + asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a2)"");\ + asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a3)"");\ + asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a4)"");\ + asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a5)"");\ + asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(a6)"");\ + asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(a7)"");\ + asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b6)"");\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a4)"");\ + asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a5)"");\ + asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a6)"");\ + asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a7)"");\ + asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a0)"");\ + asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a1)"");\ + asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a2)"");\ + asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a3)"");\ + \ + asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a6)"");\ + asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a7)"");\ + asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a0)"");\ + asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a1)"");\ + asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a2)"");\ + asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a3)"");\ + asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a4)"");\ + asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a5)"");\ + \ + /* spill values y_4, y_5 to memory */\ + asm("vmovaps [TEMP+0*16], xmm"tostr(b0)"");\ + asm("vmovaps [TEMP+1*16], xmm"tostr(b1)"");\ + asm("vmovaps [TEMP+2*16], xmm"tostr(b2)"");\ + \ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a0)"");\ + asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a1)"");\ + asm("vmovaps [TEMP+3*16], xmm"tostr(a2)"");\ + \ + /* compute x_i = t_i + t_{i+3} */\ + asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a3)"");\ + asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a4)"");\ + asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a5)"");\ + asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a6)"");\ + asm("vpxor xmm"tostr(a4)", 
xmm"tostr(a4)", xmm"tostr(a7)"");\ + asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\ + asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\ + asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [TEMP+3*16]");\ + \ + /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + asm("vmovaps xmm"tostr(b1)", [ALL_1B]");\ + asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(b2)"");\ + VMUL2(a7, b0, b1, b2);\ + VMUL2(a6, b0, b1, b2);\ + VMUL2(a5, b0, b1, b2);\ + VMUL2(a4, b0, b1, b2);\ + VMUL2(a3, b0, b1, b2);\ + VMUL2(a2, b0, b1, b2);\ + VMUL2(a1, b0, b1, b2);\ + VMUL2(a0, b0, b1, b2);\ + \ + /* compute w_i : add y_{i+4} */\ + asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [TEMP+0*16]");\ + asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", [TEMP+1*16]");\ + asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", [TEMP+2*16]");\ + asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b3)"");\ + asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b4)"");\ + asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b5)"");\ + asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b6)"");\ + asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b7)"");\ + \ + /*compute v_i: double w_i */\ + VMUL2(a0, b0, b1, b2);\ + VMUL2(a1, b0, b1, b2);\ + VMUL2(a2, b0, b1, b2);\ + VMUL2(a3, b0, b1, b2);\ + VMUL2(a4, b0, b1, b2);\ + VMUL2(a5, b0, b1, b2);\ + VMUL2(a6, b0, b1, b2);\ + VMUL2(a7, b0, b1, b2);\ + \ + /* add to y_4 y_5 .. v3, v4, ... */\ + asm("vpxor xmm"tostr(b0)", xmm"tostr(a3)", [TEMP+0*16]");\ + asm("vpxor xmm"tostr(b1)", xmm"tostr(a4)", [TEMP+1*16]");\ + asm("vpxor xmm"tostr(b2)", xmm"tostr(a5)", [TEMP+2*16]");\ + asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a6)"");\ + asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a7)"");\ + asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a0)"");\ + asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a1)"");\ + asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a2)"");\ +}/*MixBytes*/ + +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant */\ + asm ("vmovaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ + asm ("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ + asm ("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ + /* ShiftBytes + SubBytes (interleaved) */\ + asm ("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a0)", xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ + asm ("vaesenclast xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a1)", xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ + asm ("vaesenclast xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a2)", xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ + asm ("vaesenclast xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a3)", xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ + asm ("vaesenclast xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a4)", xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ + asm ("vaesenclast xmm"tostr(a4)", xmm"tostr(a4)", 
xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a5)", xmm"tostr(a5)", [SUBSH_MASK+5*16]");\ + asm ("vaesenclast xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a6)", xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ + asm ("vaesenclast xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a7)", xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ + asm ("vaesenclast xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b0)"");\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ +} + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + asm ("vmovaps xmm"tostr(t0)", [TRANSP_MASK]");\ +\ + asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i1)", xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i3)", xmm"tostr(i3)", xmm"tostr(t0)"");\ +\ + asm ("vpunpckhwd xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpckhwd xmm"tostr(t0)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpcklwd xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ +\ + asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("vpshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ + asm ("vpshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ + asm ("vpshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ +\ + asm ("vpunpckhdq xmm"tostr(o2)", xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("vpunpckhdq xmm"tostr(o3)", xmm"tostr(o1)", xmm"tostr(t0)"");\ + asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("vpunpckldq xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("vpunpcklqdq xmm"tostr(o2)", xmm"tostr(i1)", xmm"tostr(i5)"");\ + asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i1)", xmm"tostr(i5)"");\ + asm ("vpunpcklqdq xmm"tostr(o4)", xmm"tostr(i2)", xmm"tostr(i6)"");\ + asm ("vpunpckhqdq xmm"tostr(o5)", xmm"tostr(i2)", xmm"tostr(i6)"");\ + 
asm ("vpunpcklqdq xmm"tostr(o6)", xmm"tostr(i3)", xmm"tostr(i7)"");\ + asm ("vpunpckhqdq xmm"tostr(o7)", xmm"tostr(i3)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + asm ("vpunpckhqdq xmm"tostr(o0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpckhqdq xmm"tostr(o2)", xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + asm ("vpxor xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(t0)"");\ + asm ("vpunpckhqdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("vpunpckhqdq xmm"tostr(i3)", xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("vpunpckhqdq xmm"tostr(i5)", xmm"tostr(i4)", xmm"tostr(t0)"");\ + asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(t0)"");\ + asm ("vpunpckhqdq xmm"tostr(i7)", xmm"tostr(i6)", xmm"tostr(t0)"");\ + asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ +}/**/ + + +void INIT(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + asm volatile ("emms"); + + /* load IV into registers xmm12 - xmm15 */ + asm ("vmovaps xmm12, [rdi+0*16]"); + asm ("vmovaps xmm13, [rdi+1*16]"); + asm ("vmovaps xmm14, [rdi+2*16]"); + asm ("vmovaps xmm15, [rdi+3*16]"); + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* store transposed IV */ + asm ("vmovaps [rdi+0*16], xmm12"); + asm ("vmovaps [rdi+1*16], xmm2"); + asm ("vmovaps [rdi+2*16], xmm6"); + asm ("vmovaps [rdi+3*16], xmm7"); + + asm volatile ("emms"); + asm (".att_syntax noprefix"); +} + +void TF512(u64* h, u64* m) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + /* message M in rsi */ + +#ifdef IACA_TRACE + IACA_START; +#endif + + asm 
(".intel_syntax noprefix"); + Push_All_Regs(); + + /* load message into registers xmm12 - xmm15 (Q = message) */ + asm ("vmovaps xmm12, [rsi+0*16]"); + asm ("vmovaps xmm13, [rsi+1*16]"); + asm ("vmovaps xmm14, [rsi+2*16]"); + asm ("vmovaps xmm15, [rsi+3*16]"); + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* load previous chaining value and xor message to CV to get input of P */ + /* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + asm ("vpxor xmm8, xmm12, [rdi+0*16]"); + asm ("vpxor xmm0, xmm2, [rdi+1*16]"); + asm ("vpxor xmm4, xmm6, [rdi+2*16]"); + asm ("vpxor xmm5, xmm7, [rdi+3*16]"); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + asm ("vpxor xmm0, xmm0, xmm8"); + asm ("vpxor xmm1, xmm1, xmm10"); + asm ("vpxor xmm2, xmm2, xmm12"); + asm ("vpxor xmm3, xmm3, xmm14"); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + asm ("vpxor xmm0, xmm0, [rdi+0*16]"); + asm ("vpxor xmm1, xmm1, [rdi+1*16]"); + asm ("vpxor xmm2, xmm2, [rdi+2*16]"); + asm ("vpxor xmm3, xmm3, [rdi+3*16]"); + + /* store CV */ + asm ("vmovaps [rdi+0*16], xmm0"); + asm ("vmovaps [rdi+1*16], xmm1"); + asm ("vmovaps [rdi+2*16], xmm2"); + asm ("vmovaps [rdi+3*16], xmm3"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF512(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + asm ("vmovaps xmm8, [rdi+0*16]"); + asm ("vmovaps xmm10, [rdi+1*16]"); + asm ("vmovaps xmm12, [rdi+2*16]"); + asm ("vmovaps xmm14, [rdi+3*16]"); + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + asm ("vpxor xmm8, xmm8, [rdi+0*16]"); + asm ("vpxor xmm10, xmm10, [rdi+1*16]"); + asm ("vpxor xmm12, xmm12, [rdi+2*16]"); + asm ("vpxor xmm14, xmm14, [rdi+3*16]"); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); + + /* we only need to return the truncated half of the state */ + asm ("vmovaps [rdi+2*16], xmm9"); + asm ("vmovaps [rdi+3*16], xmm11"); + + Pop_All_Regs(); + asm 
(".att_syntax noprefix"); + + return; +} + +#endif + +#if (LENGTH > 256) + +#define SET_CONSTANTS(){\ + ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ + ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ + ((u64*)ALL_FF)[0] = 0xffffffffffffffffULL;\ + ((u64*)ALL_FF)[1] = 0xffffffffffffffffULL;\ + ((u64*)ALL_FF)[2] = 0x0000000000000000ULL;\ + ((u64*)ALL_FF)[3] = 0x0000000000000000ULL;\ + ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_1B)[2] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_1B)[3] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)SUBSH_MASK)[ 0] = 0x0b0e0104070a0d00ULL;\ + ((u64*)SUBSH_MASK)[ 1] = 0x0306090c0f020508ULL;\ + ((u64*)SUBSH_MASK)[ 2] = 0x0c0f0205080b0e01ULL;\ + ((u64*)SUBSH_MASK)[ 3] = 0x04070a0d00030609ULL;\ + ((u64*)SUBSH_MASK)[ 4] = 0x0d000306090c0f02ULL;\ + ((u64*)SUBSH_MASK)[ 5] = 0x05080b0e0104070aULL;\ + ((u64*)SUBSH_MASK)[ 6] = 0x0e0104070a0d0003ULL;\ + ((u64*)SUBSH_MASK)[ 7] = 0x06090c0f0205080bULL;\ + ((u64*)SUBSH_MASK)[ 8] = 0x0f0205080b0e0104ULL;\ + ((u64*)SUBSH_MASK)[ 9] = 0x070a0d000306090cULL;\ + ((u64*)SUBSH_MASK)[10] = 0x000306090c0f0205ULL;\ + ((u64*)SUBSH_MASK)[11] = 0x080b0e0104070a0dULL;\ + ((u64*)SUBSH_MASK)[12] = 0x0104070a0d000306ULL;\ + ((u64*)SUBSH_MASK)[13] = 0x090c0f0205080b0eULL;\ + ((u64*)SUBSH_MASK)[14] = 0x06090c0f0205080bULL;\ + ((u64*)SUBSH_MASK)[15] = 0x0e0104070a0d0003ULL;\ + for(i = 0; i < ROUNDS1024; i++)\ + {\ + ((u64*)ROUND_CONST_P)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0xf0e0d0c0b0a09080ULL;\ + ((u64*)ROUND_CONST_P)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ + ((u64*)ROUND_CONST_Q)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0x0f1f2f3f4f5f6f7fULL;\ + ((u64*)ROUND_CONST_Q)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ + }\ +}while(0); + +#define Push_All_Regs() do{\ + asm("push rax");\ + asm("push rbx");\ + asm("push rcx");\ +}while(0); + +#define Pop_All_Regs() do{\ + asm("pop rcx");\ + asm("pop rbx");\ + asm("pop rax");\ +}while(0); + +/* AVX MUL2 + * ymm[i] will be multiplied by 2 + * ymm[j] will be lost + * ymm[k] has to be all 0x1b + * ymm[z] has to be zero + * clobbers: t2, t3 */ +#define VMUL2(i, j, k, z, ih, jh){\ + asm("vextractf128 xmm"tostr(ih)", ymm"tostr(i)", 1");\ + asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\ + asm("vpcmpgtb xmm"tostr(jh)", xmm"tostr(z)", xmm"tostr(ih)"");\ + asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ + asm("vpaddb xmm"tostr(ih)", xmm"tostr(ih)", xmm"tostr(ih)"");\ + asm("vinsertf128 ymm"tostr(j)", ymm"tostr(j)", xmm"tostr(jh)", 1");\ + asm("vinsertf128 ymm"tostr(i)", ymm"tostr(i)", xmm"tostr(ih)", 1");\ + asm("vandpd ymm"tostr(j)", ymm"tostr(j)", ymm"tostr(k)"");\ + asm("vxorpd ymm"tostr(i)", ymm"tostr(i)", ymm"tostr(j)"");\ +}/**/ + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b + * xmm[z] has to be zero */ +#define VMUL2v2(i, j, k, z){\ + asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\ + asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ + asm("vpand xmm"tostr(j)", xmm"tostr(j)", xmm"tostr(k)"");\ + asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ +}/**/ + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b + * xmm[z] has to be zero */ +#define VMUL2v3(i, j, k, z){\ + asm("vpblendvb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(k)", xmm"tostr(i)"");\ + asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ + asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ +}/**/ + +/* Yet 
another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* ymm"tostr(8..ymm"tostr(15 = a2 a3... a0 a1 */\ + asm("vmovdqa ymm"tostr(b0)", ymm"tostr(a2)"");\ + asm("vmovdqa ymm"tostr(b1)", ymm"tostr(a3)"");\ + asm("vmovdqa ymm"tostr(b2)", ymm"tostr(a4)"");\ + asm("vmovdqa ymm"tostr(b3)", ymm"tostr(a5)"");\ + asm("vmovdqa ymm"tostr(b4)", ymm"tostr(a6)"");\ + asm("vmovdqa ymm"tostr(b5)", ymm"tostr(a7)"");\ + asm("vmovdqa ymm"tostr(b6)", ymm"tostr(a0)"");\ + asm("vmovdqa ymm"tostr(b7)", ymm"tostr(a1)"");\ + \ + /* t_i = a_i + a_{i+1} */\ + asm("vxorpd ymm"tostr(a0)", ymm"tostr(a0)", ymm"tostr(a1)"");\ + asm("vxorpd ymm"tostr(a1)", ymm"tostr(a1)", ymm"tostr(a2)"");\ + asm("vxorpd ymm"tostr(a2)", ymm"tostr(a2)", ymm"tostr(a3)"");\ + asm("vxorpd ymm"tostr(a3)", ymm"tostr(a3)", ymm"tostr(a4)"");\ + asm("vxorpd ymm"tostr(a4)", ymm"tostr(a4)", ymm"tostr(a5)"");\ + asm("vxorpd ymm"tostr(a5)", ymm"tostr(a5)", ymm"tostr(a6)"");\ + asm("vxorpd ymm"tostr(a6)", ymm"tostr(a6)", ymm"tostr(a7)"");\ + asm("vxorpd ymm"tostr(a7)", ymm"tostr(a7)", ymm"tostr(b6)"");\ + \ + /* build y4 y5 y6 ... 
in regs ymm8, ymm9, ymm10 by adding t_i*/\ + asm("vxorpd ymm"tostr(b0)", ymm"tostr(b0)", ymm"tostr(a4)"");\ + asm("vxorpd ymm"tostr(b1)", ymm"tostr(b1)", ymm"tostr(a5)"");\ + asm("vxorpd ymm"tostr(b2)", ymm"tostr(b2)", ymm"tostr(a6)"");\ + asm("vxorpd ymm"tostr(b3)", ymm"tostr(b3)", ymm"tostr(a7)"");\ + asm("vxorpd ymm"tostr(b4)", ymm"tostr(b4)", ymm"tostr(a0)"");\ + asm("vxorpd ymm"tostr(b5)", ymm"tostr(b5)", ymm"tostr(a1)"");\ + asm("vxorpd ymm"tostr(b6)", ymm"tostr(b6)", ymm"tostr(a2)"");\ + asm("vxorpd ymm"tostr(b7)", ymm"tostr(b7)", ymm"tostr(a3)"");\ + \ + asm("vxorpd ymm"tostr(b0)", ymm"tostr(b0)", ymm"tostr(a6)"");\ + asm("vxorpd ymm"tostr(b1)", ymm"tostr(b1)", ymm"tostr(a7)"");\ + asm("vxorpd ymm"tostr(b2)", ymm"tostr(b2)", ymm"tostr(a0)"");\ + asm("vxorpd ymm"tostr(b3)", ymm"tostr(b3)", ymm"tostr(a1)"");\ + asm("vxorpd ymm"tostr(b4)", ymm"tostr(b4)", ymm"tostr(a2)"");\ + asm("vxorpd ymm"tostr(b5)", ymm"tostr(b5)", ymm"tostr(a3)"");\ + asm("vxorpd ymm"tostr(b6)", ymm"tostr(b6)", ymm"tostr(a4)"");\ + asm("vxorpd ymm"tostr(b7)", ymm"tostr(b7)", ymm"tostr(a5)"");\ + \ + /* spill values y_4, y_5 to memory */\ + asm("vmovaps [TEMP+0*32], ymm"tostr(b0)"");\ + asm("vmovaps [TEMP+1*32], ymm"tostr(b1)"");\ + asm("vmovaps [TEMP+2*32], ymm"tostr(b2)"");\ + asm("vmovaps [TEMP+3*32], ymm"tostr(b3)"");\ + asm("vmovaps [TEMP+4*32], ymm"tostr(b4)"");\ + \ + /* save values t0, t1, t2 to ymm8, ymm9 and memory */\ + asm("vmovdqa ymm"tostr(b0)", ymm"tostr(a0)"");\ + asm("vmovdqa ymm"tostr(b1)", ymm"tostr(a1)"");\ + asm("vmovaps [TEMP+5*32], ymm"tostr(a2)"");\ + \ + /* compute x_i = t_i + t_{i+3} */\ + asm("vxorpd ymm"tostr(a0)", ymm"tostr(a0)", ymm"tostr(a3)"");\ + asm("vxorpd ymm"tostr(a1)", ymm"tostr(a1)", ymm"tostr(a4)"");\ + asm("vxorpd ymm"tostr(a2)", ymm"tostr(a2)", ymm"tostr(a5)"");\ + asm("vxorpd ymm"tostr(a3)", ymm"tostr(a3)", ymm"tostr(a6)"");\ + asm("vxorpd ymm"tostr(a4)", ymm"tostr(a4)", ymm"tostr(a7)"");\ + asm("vxorpd ymm"tostr(a5)", ymm"tostr(a5)", ymm"tostr(b0)"");\ + asm("vxorpd ymm"tostr(a6)", ymm"tostr(a6)", ymm"tostr(b1)"");\ + asm("vxorpd ymm"tostr(a7)", ymm"tostr(a7)", [TEMP+5*32]");\ + \ + /*compute z_i : double x_i using temp ymm8 and 1B ymm9 */\ + asm("vmovaps ymm"tostr(b1)", [ALL_1B]");\ + asm("vxorpd ymm"tostr(b2)", ymm"tostr(b2)", ymm"tostr(b2)"");\ + VMUL2(a7, b0, b1, b2, b3, b4);\ + VMUL2(a6, b0, b1, b2, b3, b4);\ + VMUL2(a5, b0, b1, b2, b3, b4);\ + VMUL2(a4, b0, b1, b2, b3, b4);\ + VMUL2(a3, b0, b1, b2, b3, b4);\ + VMUL2(a2, b0, b1, b2, b3, b4);\ + VMUL2(a1, b0, b1, b2, b3, b4);\ + VMUL2(a0, b0, b1, b2, b3, b4);\ + \ + /* compute w_i : add y_{i+4} */\ + asm("vxorpd ymm"tostr(a0)", ymm"tostr(a0)", [TEMP+0*32]");\ + asm("vxorpd ymm"tostr(a1)", ymm"tostr(a1)", [TEMP+1*32]");\ + asm("vxorpd ymm"tostr(a2)", ymm"tostr(a2)", [TEMP+2*32]");\ + asm("vxorpd ymm"tostr(a3)", ymm"tostr(a3)", [TEMP+3*32]");\ + asm("vxorpd ymm"tostr(a4)", ymm"tostr(a4)", [TEMP+4*32]");\ + asm("vxorpd ymm"tostr(a5)", ymm"tostr(a5)", ymm"tostr(b5)"");\ + asm("vxorpd ymm"tostr(a6)", ymm"tostr(a6)", ymm"tostr(b6)"");\ + asm("vxorpd ymm"tostr(a7)", ymm"tostr(a7)", ymm"tostr(b7)"");\ + \ + /*compute v_i: double w_i */\ + VMUL2(a0, b0, b1, b2, b3, b4);\ + VMUL2(a1, b0, b1, b2, b3, b4);\ + VMUL2(a2, b0, b1, b2, b3, b4);\ + VMUL2(a3, b0, b1, b2, b3, b4);\ + VMUL2(a4, b0, b1, b2, b3, b4);\ + VMUL2(a5, b0, b1, b2, b3, b4);\ + VMUL2(a6, b0, b1, b2, b3, b4);\ + VMUL2(a7, b0, b1, b2, b3, b4);\ + \ + /* add to y_4 y_5 .. v3, v4, ... 
*/\ + asm("vxorpd ymm"tostr(b0)", ymm"tostr(a3)", [TEMP+0*32]");\ + asm("vxorpd ymm"tostr(b1)", ymm"tostr(a4)", [TEMP+1*32]");\ + asm("vxorpd ymm"tostr(b2)", ymm"tostr(a5)", [TEMP+2*32]");\ + asm("vxorpd ymm"tostr(b3)", ymm"tostr(a6)", [TEMP+3*32]");\ + asm("vxorpd ymm"tostr(b4)", ymm"tostr(a7)", [TEMP+4*32]");\ + asm("vxorpd ymm"tostr(b5)", ymm"tostr(a0)", ymm"tostr(b5)"");\ + asm("vxorpd ymm"tostr(b6)", ymm"tostr(a1)", ymm"tostr(b6)"");\ + asm("vxorpd ymm"tostr(b7)", ymm"tostr(a2)", ymm"tostr(b7)"");\ +}/*MixBytes*/ + +/* AVX SubShift + * inputs: + * * i + * * c0 (must be 0) + * * ShiftP + * * ShiftQ + * output i = S[Shift(i_1, ShiftQ)|Shift(i_0, ShiftP)] + * clobbers: t0 + * */ +#define SubShift(i, t0, c0, ShiftP, ShiftQ){\ + asm("vextractf128 xmm"tostr(t0)", ymm"tostr(i)", 1");\ + asm("vpshufb xmm"tostr(i)", xmm"tostr(i)", [SUBSH_MASK+"tostr(ShiftP)"*16]");\ + asm("vpshufb xmm"tostr(t0)", xmm"tostr(t0)", [SUBSH_MASK+"tostr(ShiftQ)"*16]");\ + asm("vaesenclast xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(c0)"");\ + asm("vaesenclast xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(c0)"");\ + asm("vinsertf128 ymm"tostr(i)", ymm"tostr(i)", xmm"tostr(t0)", 1");\ +}/**/ + +/* one round + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define SUBSHIFTMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* ShiftBytes + SubBytes */\ + asm ("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(b0)"");\ + SubShift(a0, b1, b0, 0, 1);\ + SubShift(a1, b1, b0, 1, 3);\ + SubShift(a2, b1, b0, 2, 5);\ + SubShift(a3, b1, b0, 3, 7);\ + SubShift(a4, b1, b0, 4, 0);\ + SubShift(a5, b1, b0, 5, 2);\ + SubShift(a6, b1, b0, 6, 4);\ + SubShift(a7, b1, b0, 7, 6);\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +#define ROUNDS_P_Q(){\ + asm ("xor rax, rax");\ + asm ("1:");\ + /* AddRoundConstant */\ + asm ("vxorpd ymm6, ymm6, ymm6");\ + asm ("vinsertf128 ymm7, ymm6, [ROUND_CONST_Q+eax*8], 1");\ + asm ("vinsertf128 ymm6, ymm6, [ALL_FF], 1");\ + asm ("vinsertf128 ymm0, ymm6, [ROUND_CONST_P+eax*8], 0");\ + asm ("vxorpd ymm0, ymm8, ymm0");\ + asm ("vxorpd ymm1, ymm9, ymm6");\ + asm ("vxorpd ymm2, ymm10, ymm6");\ + asm ("vxorpd ymm3, ymm11, ymm6");\ + asm ("vxorpd ymm4, ymm12, ymm6");\ + asm ("vxorpd ymm5, ymm13, ymm6");\ + asm ("vxorpd ymm6, ymm14, ymm6");\ + asm ("vxorpd ymm7, ymm15, ymm7");\ + /* SubBytes + ShiftBytes + MixBytes */\ + SUBSHIFTMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + asm ("add al, 2");\ + asm ("mov rbx, rax");\ + asm ("sub bl, 28");\ + asm ("jb 1b");\ +} + +/* Matrix Transpose + * input is a 1024-bit state with two columns in one xmm + * output is a 1024-bit state with two rows in one xmm + * inputs: i0-i7 + * outputs: i0-i7 + * clobbers: t0-t7 + */ +#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ + asm ("vmovaps xmm"tostr(t0)", [TRANSP_MASK]");\ +\ + asm ("vpshufb xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i1)", xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i3)", xmm"tostr(i3)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i5)", xmm"tostr(i5)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i7)", xmm"tostr(i7)", xmm"tostr(t0)"");\ +\ + /* continue with unpack */\ + asm ("vpunpckhwd xmm"tostr(t0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm 
("vpunpckhwd xmm"tostr(t1)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpckhwd xmm"tostr(t2)", xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("vpunpckhwd xmm"tostr(t3)", xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpcklwd xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpcklwd xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("vpunpcklwd xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ +\ + /* shuffle with immediate */\ + asm ("vpshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ + asm ("vpshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ + asm ("vpshufd xmm"tostr(t2)", xmm"tostr(t2)", 216");\ + asm ("vpshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ + asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("vpshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ + asm ("vpshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ + asm ("vpshufd xmm"tostr(i6)", xmm"tostr(i6)", 216");\ +\ + /* continue with unpack */\ + asm ("vpunpckhdq xmm"tostr(t4)", xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("vpunpckhdq xmm"tostr(t5)", xmm"tostr(t0)", xmm"tostr(t1)"");\ + asm ("vpunpckldq xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(t1)"");\ + asm ("vpunpckhdq xmm"tostr(t6)", xmm"tostr(i4)", xmm"tostr(i6)"");\ + asm ("vpunpckldq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i6)"");\ + asm ("vpunpckhdq xmm"tostr(t7)", xmm"tostr(t2)", xmm"tostr(t3)"");\ + asm ("vpunpckldq xmm"tostr(t2)", xmm"tostr(t2)", xmm"tostr(t3)"");\ +\ + /* there are now 2 rows in each xmm */\ + /* unpack to get 1 row of CV in each xmm */\ + asm ("vpunpckhqdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(t0)", xmm"tostr(t2)"");\ + asm ("vpunpckhqdq xmm"tostr(i3)", xmm"tostr(t0)", xmm"tostr(t2)"");\ + asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(t4)", xmm"tostr(t6)"");\ + asm ("vpunpckhqdq xmm"tostr(i5)", xmm"tostr(t4)", xmm"tostr(t6)"");\ + asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(t5)", xmm"tostr(t7)"");\ + asm ("vpunpckhqdq xmm"tostr(i7)", xmm"tostr(t5)", xmm"tostr(t7)"");\ + /* transpose done */\ +}/**/ + +/* Matrix Transpose Inverse + * input is a 1024-bit state with two rows in one xmm + * output is a 1024-bit state with two columns in one xmm + * inputs: i0-i7 + * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) + * clobbers: t0-t4 + */ +#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ + asm ("vmovaps xmm"tostr(o0)", [TRANSP_MASK]");\ + /* transpose matrix to get output format */\ + asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpckhqdq xmm"tostr(t0)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpckhqdq xmm"tostr(t1)", xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("vpunpckhqdq xmm"tostr(t2)", xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ + /* load transpose mask into a register, because it will be used 8 times */\ + asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(o0)"");\ + asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(o0)"");\ + asm ("vpshufb xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(o0)"");\ + asm ("vpshufb 
xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(o0)"");\ + asm ("vpshufb xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(o0)"");\ + asm ("vpshufb xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(o0)"");\ + asm ("vpshufb xmm"tostr(t1)", xmm"tostr(t1)", xmm"tostr(o0)"");\ + asm ("vpshufb xmm"tostr(t2)", xmm"tostr(t2)", xmm"tostr(o0)"");\ + /* continue with unpack */\ + asm ("vpunpckhwd xmm"tostr(t3)", xmm"tostr(i4)", xmm"tostr(i6)"");\ + asm ("vpunpcklwd xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i6)"");\ + asm ("vpunpckhwd xmm"tostr(o0)", xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("vpunpckhwd xmm"tostr(o2)", xmm"tostr(o1)", xmm"tostr(t0)"");\ + asm ("vpunpcklwd xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t0)"");\ + asm ("vpunpckhwd xmm"tostr(t4)", xmm"tostr(t1)", xmm"tostr(t2)"");\ + asm ("vpunpcklwd xmm"tostr(t1)", xmm"tostr(t1)", xmm"tostr(t2)"");\ + /* shuffle with immediate */\ + asm ("vpshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ + asm ("vpshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ + asm ("vpshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ + asm ("vpshufd xmm"tostr(o2)", xmm"tostr(o2)", 216");\ + asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("vpshufd xmm"tostr(o0)", xmm"tostr(o0)", 216");\ + asm ("vpshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ + asm ("vpshufd xmm"tostr(t4)", xmm"tostr(t4)", 216");\ + /* continue with unpack */\ + asm ("vpunpckhdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("vpunpckhdq xmm"tostr(i3)", xmm"tostr(o0)", xmm"tostr(t3)"");\ + asm ("vpunpckldq xmm"tostr(o0)", xmm"tostr(o0)", xmm"tostr(t3)"");\ + asm ("vpunpckhdq xmm"tostr(i5)", xmm"tostr(o1)", xmm"tostr(t1)"");\ + asm ("vpunpckldq xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t1)"");\ + asm ("vpunpckhdq xmm"tostr(i7)", xmm"tostr(o2)", xmm"tostr(t4)"");\ + asm ("vpunpckldq xmm"tostr(o2)", xmm"tostr(o2)", xmm"tostr(t4)"");\ + /* transpose done */\ +}/**/ + + +void INIT(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + asm volatile ("emms"); + + /* load IV into registers xmm8 - xmm15 */ + asm ("vmovaps xmm8, [rdi+0*16]"); + asm ("vmovaps xmm9, [rdi+1*16]"); + asm ("vmovaps xmm10, [rdi+2*16]"); + asm ("vmovaps xmm11, [rdi+3*16]"); + asm ("vmovaps xmm12, [rdi+4*16]"); + asm ("vmovaps xmm13, [rdi+5*16]"); + asm ("vmovaps xmm14, [rdi+6*16]"); + asm ("vmovaps xmm15, [rdi+7*16]"); + + /* transform chaining value from column ordering into row ordering */ + Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); + + /* store transposed IV */ + asm ("vmovaps [rdi+0*16], xmm8"); + asm ("vmovaps [rdi+1*16], xmm9"); + asm ("vmovaps [rdi+2*16], xmm10"); + asm ("vmovaps [rdi+3*16], xmm11"); + asm ("vmovaps [rdi+4*16], xmm12"); + asm ("vmovaps [rdi+5*16], xmm13"); + asm ("vmovaps [rdi+6*16], xmm14"); + asm ("vmovaps [rdi+7*16], xmm15"); + + asm volatile ("emms"); + asm (".att_syntax noprefix"); +} + +void TF1024(u64* h, u64* m) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + /* message M in rsi */ + +#ifdef IACA_TRACE + IACA_START; +#endif + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load message into registers xmm8...xmm15 (Q = message) */ + asm ("vmovaps xmm0, [rsi+0*16]"); + asm ("vmovaps xmm1, [rsi+1*16]"); + asm ("vmovaps xmm2, [rsi+2*16]"); + asm ("vmovaps xmm3, [rsi+3*16]"); + asm ("vmovaps xmm4, [rsi+4*16]"); + asm ("vmovaps xmm5, [rsi+5*16]"); + asm 
("vmovaps xmm6, [rsi+6*16]"); + asm ("vmovaps xmm7, [rsi+7*16]"); + + /* transform message M from column ordering into row ordering */ + Matrix_Transpose(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + + /* load previous chaining value and xor message to CV to get input of P */ + /* we put two rows (2x64 bit) of the CV into one 128-bit xmm register */ + /* result: CV+M in xmm8...xmm15 */ + asm ("vpxor xmm8, xmm0, [rdi+0*16]"); + asm ("vpxor xmm9, xmm1, [rdi+1*16]"); + asm ("vpxor xmm10, xmm2, [rdi+2*16]"); + asm ("vpxor xmm11, xmm3, [rdi+3*16]"); + asm ("vpxor xmm12, xmm4, [rdi+4*16]"); + asm ("vpxor xmm13, xmm5, [rdi+5*16]"); + asm ("vpxor xmm14, xmm6, [rdi+6*16]"); + asm ("vpxor xmm15, xmm7, [rdi+7*16]"); + + /* generate AVX registers with Q in high and P in low 128 bits */ + asm ("vinsertf128 ymm8, ymm8, xmm0, 1"); + asm ("vinsertf128 ymm9, ymm9, xmm1, 1"); + asm ("vinsertf128 ymm10, ymm10, xmm2, 1"); + asm ("vinsertf128 ymm11, ymm11, xmm3, 1"); + asm ("vinsertf128 ymm12, ymm12, xmm4, 1"); + asm ("vinsertf128 ymm13, ymm13, xmm5, 1"); + asm ("vinsertf128 ymm14, ymm14, xmm6, 1"); + asm ("vinsertf128 ymm15, ymm15, xmm7, 1"); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* extract output of Q to xmm0...xmm7 */ + asm ("vextractf128 xmm0, ymm8, 1"); + asm ("vextractf128 xmm1, ymm9, 1"); + asm ("vextractf128 xmm2, ymm10, 1"); + asm ("vextractf128 xmm3, ymm11, 1"); + asm ("vextractf128 xmm4, ymm12, 1"); + asm ("vextractf128 xmm5, ymm13, 1"); + asm ("vextractf128 xmm6, ymm14, 1"); + asm ("vextractf128 xmm7, ymm15, 1"); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm8...xmm15 */ + asm ("vpxor xmm8, xmm8, xmm0"); + asm ("vpxor xmm9, xmm9, xmm1"); + asm ("vpxor xmm10, xmm10, xmm2"); + asm ("vpxor xmm11, xmm11, xmm3"); + asm ("vpxor xmm12, xmm12, xmm4"); + asm ("vpxor xmm13, xmm13, xmm5"); + asm ("vpxor xmm14, xmm14, xmm6"); + asm ("vpxor xmm15, xmm15, xmm7"); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm8...xmm15 */ + asm ("vpxor xmm8, xmm8, [rdi+0*16]"); + asm ("vpxor xmm9, xmm9, [rdi+1*16]"); + asm ("vpxor xmm10, xmm10, [rdi+2*16]"); + asm ("vpxor xmm11, xmm11, [rdi+3*16]"); + asm ("vpxor xmm12, xmm12, [rdi+4*16]"); + asm ("vpxor xmm13, xmm13, [rdi+5*16]"); + asm ("vpxor xmm14, xmm14, [rdi+6*16]"); + asm ("vpxor xmm15, xmm15, [rdi+7*16]"); + + /* store CV */ + asm ("vmovaps [rdi+0*16], xmm8"); + asm ("vmovaps [rdi+1*16], xmm9"); + asm ("vmovaps [rdi+2*16], xmm10"); + asm ("vmovaps [rdi+3*16], xmm11"); + asm ("vmovaps [rdi+4*16], xmm12"); + asm ("vmovaps [rdi+5*16], xmm13"); + asm ("vmovaps [rdi+6*16], xmm14"); + asm ("vmovaps [rdi+7*16], xmm15"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF1024(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + asm ("vpxor xmm0, xmm0, xmm0"); + + /* load CV into registers xmm8...xmm15 */ + asm ("vmovaps xmm8, [rdi+0*16]"); + asm ("vmovaps xmm9, [rdi+1*16]"); + asm ("vmovaps xmm10, [rdi+2*16]"); + asm ("vmovaps xmm11, [rdi+3*16]"); + asm ("vmovaps xmm12, [rdi+4*16]"); + asm ("vmovaps xmm13, [rdi+5*16]"); + asm ("vmovaps xmm14, [rdi+6*16]"); + asm ("vmovaps xmm15, [rdi+7*16]"); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8...xmm15 */ + ROUNDS_P_Q(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8...xmm15 */ + asm ("vpxor xmm8, xmm8, [rdi+0*16]"); + asm ("vpxor xmm9, xmm9, 
[rdi+1*16]"); + asm ("vpxor xmm10, xmm10, [rdi+2*16]"); + asm ("vpxor xmm11, xmm11, [rdi+3*16]"); + asm ("vpxor xmm12, xmm12, [rdi+4*16]"); + asm ("vpxor xmm13, xmm13, [rdi+5*16]"); + asm ("vpxor xmm14, xmm14, [rdi+6*16]"); + asm ("vpxor xmm15, xmm15, [rdi+7*16]"); + + /* transpose CV back from row ordering to column ordering */ + /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ + Matrix_Transpose_INV(8, 9, 10, 11, 12, 13, 14, 15, 4, 0, 6, 1, 2, 3, 5, 7); + + /* we only need to return the truncated half of the state */ + asm ("vmovaps [rdi+4*16], xmm0"); + asm ("vmovaps [rdi+5*16], xmm6"); + asm ("vmovaps [rdi+6*16], xmm13"); + asm ("vmovaps [rdi+7*16], xmm15"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + + return; +} + +#endif + diff --git a/algo/aes_ni/groestl-asm-vperm.h b/algo/aes_ni/groestl-asm-vperm.h new file mode 100644 index 000000000..f8ae27caa --- /dev/null +++ b/algo/aes_ni/groestl-asm-vperm.h @@ -0,0 +1,1397 @@ +/* groestl-asm-vperm.h Aug 2011 + * + * Groestl implementation with inline assembly using ssse3 instructions. + * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz + * + * Based on the vperm and aes_ni implementations of the hash function Groestl + * by Cagdas Calik http://www.metu.edu.tr/~ccalik/ + * Institute of Applied Mathematics, Middle East Technical University, Turkey + * + * This code is placed in the public domain + */ + +#include "hash-groestl.h" + +/* global constants */ +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; +__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16]; +__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16]; +__attribute__ ((aligned (16))) unsigned char ALL_0F[16]; +__attribute__ ((aligned (16))) unsigned char ALL_15[16]; +__attribute__ ((aligned (16))) unsigned char ALL_1B[16]; +__attribute__ ((aligned (16))) unsigned char ALL_63[16]; +__attribute__ ((aligned (16))) unsigned char ALL_FF[16]; +__attribute__ ((aligned (16))) unsigned char VPERM_IPT[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_OPT[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_INV[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_SB1[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_SB2[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_SB4[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_SBO[2*16]; + +/* temporary variables */ +__attribute__ ((aligned (16))) unsigned char TEMP_MUL1[8*16]; +__attribute__ ((aligned (16))) unsigned char TEMP_MUL2[8*16]; +__attribute__ ((aligned (16))) unsigned char TEMP_MUL4[1*16]; +__attribute__ ((aligned (16))) unsigned char QTEMP[8*16]; +__attribute__ ((aligned (16))) unsigned char TEMP[8*16]; + + +#define tos(a) #a +#define tostr(a) tos(a) + +#define SET_SHARED_CONSTANTS(){\ + ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ + ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ + ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_63)[ 0] = 0x6363636363636363ULL;\ + ((u64*)ALL_63)[ 1] = 0x6363636363636363ULL;\ + ((u64*)ALL_0F)[ 0] = 0x0F0F0F0F0F0F0F0FULL;\ + ((u64*)ALL_0F)[ 1] = 0x0F0F0F0F0F0F0F0FULL;\ + ((u64*)VPERM_IPT)[ 0] = 0x4C01307D317C4D00ULL;\ + 
((u64*)VPERM_IPT)[ 1] = 0xCD80B1FCB0FDCC81ULL;\ + ((u64*)VPERM_IPT)[ 2] = 0xC2B2E8985A2A7000ULL;\ + ((u64*)VPERM_IPT)[ 3] = 0xCABAE09052227808ULL;\ + ((u64*)VPERM_OPT)[ 0] = 0x01EDBD5150BCEC00ULL;\ + ((u64*)VPERM_OPT)[ 1] = 0xE10D5DB1B05C0CE0ULL;\ + ((u64*)VPERM_OPT)[ 2] = 0xFF9F4929D6B66000ULL;\ + ((u64*)VPERM_OPT)[ 3] = 0xF7974121DEBE6808ULL;\ + ((u64*)VPERM_INV)[ 0] = 0x01040A060F0B0780ULL;\ + ((u64*)VPERM_INV)[ 1] = 0x030D0E0C02050809ULL;\ + ((u64*)VPERM_INV)[ 2] = 0x0E05060F0D080180ULL;\ + ((u64*)VPERM_INV)[ 3] = 0x040703090A0B0C02ULL;\ + ((u64*)VPERM_SB1)[ 0] = 0x3618D415FAE22300ULL;\ + ((u64*)VPERM_SB1)[ 1] = 0x3BF7CCC10D2ED9EFULL;\ + ((u64*)VPERM_SB1)[ 2] = 0xB19BE18FCB503E00ULL;\ + ((u64*)VPERM_SB1)[ 3] = 0xA5DF7A6E142AF544ULL;\ + ((u64*)VPERM_SB2)[ 0] = 0x69EB88400AE12900ULL;\ + ((u64*)VPERM_SB2)[ 1] = 0xC2A163C8AB82234AULL;\ + ((u64*)VPERM_SB2)[ 2] = 0xE27A93C60B712400ULL;\ + ((u64*)VPERM_SB2)[ 3] = 0x5EB7E955BC982FCDULL;\ + ((u64*)VPERM_SB4)[ 0] = 0x3D50AED7C393EA00ULL;\ + ((u64*)VPERM_SB4)[ 1] = 0xBA44FE79876D2914ULL;\ + ((u64*)VPERM_SB4)[ 2] = 0xE1E937A03FD64100ULL;\ + ((u64*)VPERM_SB4)[ 3] = 0xA876DE9749087E9FULL;\ +/*((u64*)VPERM_SBO)[ 0] = 0xCFE474A55FBB6A00ULL;\ + ((u64*)VPERM_SBO)[ 1] = 0x8E1E90D1412B35FAULL;\ + ((u64*)VPERM_SBO)[ 2] = 0xD0D26D176FBDC700ULL;\ + ((u64*)VPERM_SBO)[ 3] = 0x15AABF7AC502A878ULL;*/\ + ((u64*)ALL_15)[ 0] = 0x1515151515151515ULL;\ + ((u64*)ALL_15)[ 1] = 0x1515151515151515ULL;\ +}/**/ + +/* VPERM + * Transform w/o settings c* + * transforms 2 rows to/from "vperm mode" + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0, a1 = 2 rows + * table = transformation table to use + * t*, c* = clobbers + * outputs: + * a0, a1 = 2 rows transformed with table + * */ +#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\ + asm ("movdqa xmm"tostr(t1)", xmm"tostr(c0)"");\ + asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\ + asm ("pandn xmm"tostr(t1)", xmm"tostr(a1)"");\ + asm ("psrld xmm"tostr(t0)", 4");\ + asm ("psrld xmm"tostr(t1)", 4");\ + asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\ + asm ("pand xmm"tostr(a1)", xmm"tostr(c0)"");\ + asm ("movdqa xmm"tostr(t2)", xmm"tostr(c2)"");\ + asm ("movdqa xmm"tostr(t3)", xmm"tostr(c2)"");\ + asm ("pshufb xmm"tostr(t2)", xmm"tostr(a0)"");\ + asm ("pshufb xmm"tostr(t3)", xmm"tostr(a1)"");\ + asm ("movdqa xmm"tostr(a0)", xmm"tostr(c1)"");\ + asm ("movdqa xmm"tostr(a1)", xmm"tostr(c1)"");\ + asm ("pshufb xmm"tostr(a0)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(a1)", xmm"tostr(t1)"");\ + asm ("pxor xmm"tostr(a0)", xmm"tostr(t2)"");\ + asm ("pxor xmm"tostr(a1)", xmm"tostr(t3)"");\ +}/**/ + +#define VPERM_Transform_Set_Const(table, c0, c1, c2){\ + asm ("movaps xmm"tostr(c0)", [ALL_0F]");\ + asm ("movaps xmm"tostr(c1)", ["tostr(table)"+0*16]");\ + asm ("movaps xmm"tostr(c2)", ["tostr(table)"+1*16]");\ +}/**/ + +/* VPERM + * Transform + * transforms 2 rows to/from "vperm mode" + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0, a1 = 2 rows + * table = transformation table to use + * t*, c* = clobbers + * outputs: + * a0, a1 = 2 rows transformed with table + * */ +#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\ + VPERM_Transform_Set_Const(table, c0, c1, c2);\ + VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ +}/**/ + +/* VPERM + * Transform State + * inputs: + * 
a0-a3 = state + * table = transformation table to use + * t* = clobbers + * outputs: + * a0-a3 = transformed state + * */ +#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\ + VPERM_Transform_Set_Const(table, c0, c1, c2);\ + VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ + VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\ +}/**/ + +/* VPERM + * Add Constant to State + * inputs: + * a0-a7 = state + * constant = constant to add + * t0 = clobber + * outputs: + * a0-a7 = state + constant + * */ +#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\ + asm ("movaps xmm"tostr(t0)", ["tostr(constant)"]");\ + asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a1)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a2)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a3)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a4)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a5)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a6)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a7)", xmm"tostr(t0)"");\ +}/**/ + +/* VPERM + * Set Substitute Core Constants + * */ +#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\ + VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\ +}/**/ + +/* VPERM + * Substitute Core + * first part of sbox inverse computation + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0 = 1 row + * t*, c* = clobbers + * outputs: + * b0a, b0b = inputs for lookup step + * */ +#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\ + asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\ + asm ("psrld xmm"tostr(t0)", 4");\ + asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\ + asm ("movdqa xmm"tostr(b0a)", "tostr(c1)"");\ + asm ("pshufb xmm"tostr(b0a)", xmm"tostr(a0)"");\ + asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\ + asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\ + asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(b0b)", xmm"tostr(b0a)"");\ + asm ("movdqa xmm"tostr(t1)", xmm"tostr(c2)"");\ + asm ("pshufb xmm"tostr(t1)", xmm"tostr(a0)"");\ + asm ("pxor xmm"tostr(t1)", xmm"tostr(b0a)"");\ + asm ("movdqa xmm"tostr(b0a)", xmm"tostr(c2)"");\ + asm ("pshufb xmm"tostr(b0a)", xmm"tostr(b0b)"");\ + asm ("pxor xmm"tostr(b0a)", xmm"tostr(a0)"");\ + asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\ + asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t1)"");\ + asm ("pxor xmm"tostr(b0b)", xmm"tostr(t0)"");\ +}/**/ + +/* VPERM + * Lookup + * second part of sbox inverse computation + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0a, a0b = output of Substitution Core + * table = lookup table to use (*1 / *2 / *4) + * t0 = clobber + * outputs: + * b0 = output of sbox + multiplication + * */ +#define VPERM_Lookup(a0a, a0b, table, b0, t0){\ + asm ("movaps xmm"tostr(b0)", ["tostr(table)"+0*16]");\ + asm ("movaps xmm"tostr(t0)", ["tostr(table)"+1*16]");\ + asm ("pshufb xmm"tostr(b0)", xmm"tostr(a0b)"");\ + asm ("pshufb xmm"tostr(t0)", xmm"tostr(a0a)"");\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(t0)"");\ +}/**/ + +/* VPERM + * SubBytes and *2 / *4 + * this function is derived from: + * Constant-time SSSE3 AES core implementation + * by Mike Hamburg + * and + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0-a7 = state + * t*, c* = clobbers + * outputs: + * a0-a7 = state * 4 + 
* c2 = row0 * 2 -> b0 + * c1 = row7 * 2 -> b3 + * c0 = row7 * 1 -> b4 + * t2 = row4 * 1 -> b7 + * TEMP_MUL1 = row(i) * 1 + * TEMP_MUL2 = row(i) * 2 + * + * call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */ +#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\ + /* set Constants */\ + VPERM_Substitute_Core_Set_Const(c0, c1, c2);\ + /* row 1 */\ + VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+1*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+1*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\ + /* --- */\ + /* row 2 */\ + VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+2*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+2*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\ + /* --- */\ + /* row 3 */\ + VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+3*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+3*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\ + /* --- */\ + /* row 5 */\ + VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+5*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+5*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\ + /* --- */\ + /* row 6 */\ + VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+6*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+6*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\ + /* --- */\ + /* row 7 */\ + VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+7*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\ + VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\ + /* --- */\ + /* row 4 */\ + VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+4*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\ + /* --- */\ + /* row 0 */\ + VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\ + VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\ + asm ("movaps [TEMP_MUL2+0*16], xmm"tostr(c2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\ + /* --- */\ +}/**/ + + +/* Optimized MixBytes + * inputs: + * a0-a7 = (row0-row7) * 4 + * b0 = row0 * 2 + * b3 = row7 * 2 + * b4 = row7 * 1 + * b7 = row4 * 1 + * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2 + * output: b0-b7 + * */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* save one value */\ + asm ("movaps [TEMP_MUL4], xmm"tostr(a3)"");\ + /* 1 */\ + asm ("movdqa xmm"tostr(b1)", xmm"tostr(a0)"");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(b4)""); /* -> helper! 
*/\ + asm ("pxor xmm"tostr(b1)", [TEMP_MUL2+3*16]");\ + asm ("movdqa xmm"tostr(b2)", xmm"tostr(b1)"");\ + \ + /* 2 */\ + asm ("movdqa xmm"tostr(b5)", xmm"tostr(a1)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(a4)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(b7)""); /* -> helper! */\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(b3)""); /* -> helper! */\ + asm ("movdqa xmm"tostr(b6)", xmm"tostr(b5)"");\ + \ + /* 4 */\ + asm ("pxor xmm"tostr(b7)", xmm"tostr(a6)"");\ + /*asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+4*16]"); -> helper! */\ + asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+6*16]");\ + asm ("pxor xmm"tostr(b7)", [TEMP_MUL2+1*16]");\ + asm ("pxor xmm"tostr(b7)", xmm"tostr(b3)""); /* -> helper! */\ + asm ("pxor xmm"tostr(b2)", xmm"tostr(b7)"");\ + \ + /* 3 */\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(a7)"");\ + asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+5*16]");\ + asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+7*16]");\ + /*asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+0*16]"); -> helper! */\ + asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+2*16]");\ + asm ("movdqa xmm"tostr(b3)", xmm"tostr(b0)"");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(b0)"");\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(b7)""); /* moved from 4 */\ + \ + /* 5 */\ + asm ("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\ + /*asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+0*16]"); -> helper! */\ + asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+2*16]");\ + asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+3*16]");\ + asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+5*16]");\ + asm ("pxor xmm"tostr(b3)", xmm"tostr(b4)"");\ + asm ("pxor xmm"tostr(b6)", xmm"tostr(b4)"");\ + \ + /* 6 */\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+1*16]");\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+3*16]");\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+4*16]");\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+6*16]");\ + asm ("pxor xmm"tostr(b4)", xmm"tostr(a3)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ + asm ("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\ + \ + /* 7 */\ + asm ("pxor xmm"tostr(a1)", [TEMP_MUL1+1*16]");\ + asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+4*16]");\ + asm ("pxor xmm"tostr(b2)", xmm"tostr(a1)"");\ + asm ("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\ + \ + /* 8 */\ + asm ("pxor xmm"tostr(a5)", [TEMP_MUL1+5*16]");\ + asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+0*16]");\ + asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\ + asm ("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\ + \ + /* 9 */\ + asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+2*16]");\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+5*16]");\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ + \ + /* 10 */\ + asm ("movaps xmm"tostr(a1)", [TEMP_MUL1+6*16]");\ + asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+1*16]");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(a1)"");\ + asm ("pxor xmm"tostr(b4)", xmm"tostr(a1)"");\ + \ + /* 11 */\ + asm ("movaps xmm"tostr(a5)", [TEMP_MUL1+3*16]");\ + asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+6*16]");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ + asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\ + \ + /* 12 */\ + asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+7*16]");\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+2*16]");\ + asm ("pxor xmm"tostr(b2)", xmm"tostr(a3)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ + \ + /* 13 */\ + asm ("pxor xmm"tostr(b0)", [TEMP_MUL4]");\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\ + asm ("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\ + asm ("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\ + asm ("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\ + asm 
("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\ +}/**/ + +#if (LENGTH <= 256) + +#define SET_CONSTANTS(){\ + SET_SHARED_CONSTANTS();\ + ((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\ + ((u64*)SUBSH_MASK)[ 1] = 0x080f0e0d0c0b0a09ULL;\ + ((u64*)SUBSH_MASK)[ 2] = 0x0007060504030201ULL;\ + ((u64*)SUBSH_MASK)[ 3] = 0x0a09080f0e0d0c0bULL;\ + ((u64*)SUBSH_MASK)[ 4] = 0x0100070605040302ULL;\ + ((u64*)SUBSH_MASK)[ 5] = 0x0c0b0a09080f0e0dULL;\ + ((u64*)SUBSH_MASK)[ 6] = 0x0201000706050403ULL;\ + ((u64*)SUBSH_MASK)[ 7] = 0x0e0d0c0b0a09080fULL;\ + ((u64*)SUBSH_MASK)[ 8] = 0x0302010007060504ULL;\ + ((u64*)SUBSH_MASK)[ 9] = 0x0f0e0d0c0b0a0908ULL;\ + ((u64*)SUBSH_MASK)[10] = 0x0403020100070605ULL;\ + ((u64*)SUBSH_MASK)[11] = 0x09080f0e0d0c0b0aULL;\ + ((u64*)SUBSH_MASK)[12] = 0x0504030201000706ULL;\ + ((u64*)SUBSH_MASK)[13] = 0x0b0a09080f0e0d0cULL;\ + ((u64*)SUBSH_MASK)[14] = 0x0605040302010007ULL;\ + ((u64*)SUBSH_MASK)[15] = 0x0d0c0b0a09080f0eULL;\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ + ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ + ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ + }\ + ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ +}/**/ + +#define Push_All_Regs(){\ +/* not using any... + asm("push rax");\ + asm("push rbx");\ + asm("push rcx");*/\ +}/**/ + +#define Pop_All_Regs(){\ +/* not using any... + asm("pop rcx");\ + asm("pop rbx");\ + asm("pop rax");*/\ +}/**/ + + +/* vperm: + * transformation before rounds with ipt + * first round add transformed constant + * middle rounds: add constant XOR 0x15...15 + * last round: additionally add 0x15...15 after MB + * transformation after rounds with opt + */ +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant + ShiftBytes (interleaved) */\ + asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ + asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ + asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\ + asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ + asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ + asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\ + asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ + asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ + asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ + asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ + asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\ + asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ + asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ + asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ + /* SubBytes + Multiplication by 2 and 4 */\ + VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +}/**/ + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ + ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + 
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ +} + + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ +\ + asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ +\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ +\ + asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\ + asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\ +\ + asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ + asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ + asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ +\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\ +\ + asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\ + asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\ + asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\ + asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\ + asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ + asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\ + asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\ + asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\ + asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\ + asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\ + asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ + asm 
("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\ + asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\ + asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ +}/**/ + + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst_CNT2(i, j){\ + asm ("movaps xmm0, [ROUND_CONST_L0+"tostr(i)"*16]");\ + asm ("movaps xmm1, [ROUND_CONST_L7+"tostr(i)"*16]");\ + asm ("movaps xmm2, [ROUND_CONST_L0+"tostr(j)"*16]");\ + asm ("movaps xmm3, [ROUND_CONST_L7+"tostr(j)"*16]");\ + VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ + asm ("pxor xmm0, [ALL_15]");\ + asm ("pxor xmm1, [ALL_15]");\ + asm ("pxor xmm2, [ALL_15]");\ + asm ("pxor xmm3, [ALL_15]");\ + asm ("movaps [ROUND_CONST_L0+"tostr(i)"*16], xmm0");\ + asm ("movaps [ROUND_CONST_L7+"tostr(i)"*16], xmm1");\ + asm ("movaps [ROUND_CONST_L0+"tostr(j)"*16], xmm2");\ + asm ("movaps [ROUND_CONST_L7+"tostr(j)"*16], xmm3");\ +}/**/ + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst(){\ + asm ("movaps xmm0, [ROUND_CONST_Lx]");\ + VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ + asm ("pxor xmm0, [ALL_15]");\ + asm ("movaps [ROUND_CONST_Lx], xmm0");\ + VPERM_Transform_RoundConst_CNT2(0, 1);\ + VPERM_Transform_RoundConst_CNT2(2, 3);\ + VPERM_Transform_RoundConst_CNT2(4, 5);\ + VPERM_Transform_RoundConst_CNT2(6, 7);\ + VPERM_Transform_RoundConst_CNT2(8, 9);\ +}/**/ + +void INIT(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + asm volatile ("emms"); + + /* transform round constants into VPERM mode */ + 
VPERM_Transform_RoundConst(); + + /* load IV into registers xmm12 - xmm15 */ + asm ("movaps xmm12, [rdi+0*16]"); + asm ("movaps xmm13, [rdi+1*16]"); + asm ("movaps xmm14, [rdi+2*16]"); + asm ("movaps xmm15, [rdi+3*16]"); + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* store transposed IV */ + asm ("movaps [rdi+0*16], xmm12"); + asm ("movaps [rdi+1*16], xmm2"); + asm ("movaps [rdi+2*16], xmm6"); + asm ("movaps [rdi+3*16], xmm7"); + + asm volatile ("emms"); + asm (".att_syntax noprefix"); +} + +void TF512(u64* h, u64* m) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + /* message M in rsi */ + +#ifdef IACA_TRACE + IACA_START; +#endif + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load message into registers xmm12 - xmm15 (Q = message) */ + asm ("movaps xmm12, [rsi+0*16]"); + asm ("movaps xmm13, [rsi+1*16]"); + asm ("movaps xmm14, [rsi+2*16]"); + asm ("movaps xmm15, [rsi+3*16]"); + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm0, [rdi+1*16]"); + asm ("movaps xmm4, [rdi+2*16]"); + asm ("movaps xmm5, [rdi+3*16]"); + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + asm ("pxor xmm8, xmm12"); + asm ("pxor xmm0, xmm2"); + asm ("pxor xmm4, xmm6"); + asm ("pxor xmm5, xmm7"); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + asm ("pxor xmm0, xmm8"); + asm ("pxor xmm1, xmm10"); + asm ("pxor xmm2, xmm12"); + asm ("pxor xmm3, xmm14"); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + asm ("pxor xmm0, [rdi+0*16]"); + asm ("pxor xmm1, [rdi+1*16]"); + asm ("pxor xmm2, [rdi+2*16]"); + asm ("pxor xmm3, [rdi+3*16]"); + + /* store CV */ + asm ("movaps [rdi+0*16], xmm0"); + asm ("movaps [rdi+1*16], xmm1"); + asm ("movaps [rdi+2*16], xmm2"); + asm ("movaps [rdi+3*16], xmm3"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + +#ifdef IACA_TRACE + IACA_END; +#endif + + return; +} + +void OF512(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm10, [rdi+1*16]"); + asm ("movaps xmm12, [rdi+2*16]"); + asm ("movaps xmm14, [rdi+3*16]"); + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register 
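     (OF512 is the output transformation: it applies P to the final chaining
     value, xors the chaining value back in, and keeps only the truncated
     second half of the state as the digest, as the two stores at the end of
     this function show.)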
*/ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm10, [rdi+1*16]"); + asm ("pxor xmm12, [rdi+2*16]"); + asm ("pxor xmm14, [rdi+3*16]"); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); + VPERM_Transform(9, 11, VPERM_OPT, 0, 1, 2, 3, 5, 6, 7); + + /* we only need to return the truncated half of the state */ + asm ("movaps [rdi+2*16], xmm9"); + asm ("movaps [rdi+3*16], xmm11"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + + return; +} + +#endif + +#if (LENGTH > 256) + +#define SET_CONSTANTS(){\ + SET_SHARED_CONSTANTS();\ + ((u64*)ALL_FF)[0] = 0xffffffffffffffffULL;\ + ((u64*)ALL_FF)[1] = 0xffffffffffffffffULL;\ + ((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\ + ((u64*)SUBSH_MASK)[ 1] = 0x0f0e0d0c0b0a0908ULL;\ + ((u64*)SUBSH_MASK)[ 2] = 0x0807060504030201ULL;\ + ((u64*)SUBSH_MASK)[ 3] = 0x000f0e0d0c0b0a09ULL;\ + ((u64*)SUBSH_MASK)[ 4] = 0x0908070605040302ULL;\ + ((u64*)SUBSH_MASK)[ 5] = 0x01000f0e0d0c0b0aULL;\ + ((u64*)SUBSH_MASK)[ 6] = 0x0a09080706050403ULL;\ + ((u64*)SUBSH_MASK)[ 7] = 0x0201000f0e0d0c0bULL;\ + ((u64*)SUBSH_MASK)[ 8] = 0x0b0a090807060504ULL;\ + ((u64*)SUBSH_MASK)[ 9] = 0x030201000f0e0d0cULL;\ + ((u64*)SUBSH_MASK)[10] = 0x0c0b0a0908070605ULL;\ + ((u64*)SUBSH_MASK)[11] = 0x04030201000f0e0dULL;\ + ((u64*)SUBSH_MASK)[12] = 0x0d0c0b0a09080706ULL;\ + ((u64*)SUBSH_MASK)[13] = 0x0504030201000f0eULL;\ + ((u64*)SUBSH_MASK)[14] = 0x0201000f0e0d0c0bULL;\ + ((u64*)SUBSH_MASK)[15] = 0x0a09080706050403ULL;\ + for(i = 0; i < ROUNDS1024; i++)\ + {\ + ((u64*)ROUND_CONST_P)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0xf0e0d0c0b0a09080ULL;\ + ((u64*)ROUND_CONST_P)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ + ((u64*)ROUND_CONST_Q)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0x0f1f2f3f4f5f6f7fULL;\ + ((u64*)ROUND_CONST_Q)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ + }\ +}/**/ + +#define Push_All_Regs(){\ + asm("push rax");\ + asm("push rbx");\ + asm("push rcx");\ +}/**/ + +#define Pop_All_Regs(){\ + asm("pop rcx");\ + asm("pop rbx");\ + asm("pop rax");\ +}/**/ + +/* one round + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* SubBytes + Multiplication */\ + VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +}/**/ + +#define ROUNDS_P(){\ + asm ("xor rax, rax");\ + asm ("xor rbx, rbx");\ + asm ("add bl, 2");\ + asm ("1:");\ + /* AddRoundConstant P1024 */\ + asm ("pxor xmm8, [ROUND_CONST_P+eax*8]");\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + asm ("pshufb xmm8, [SUBSH_MASK+0*16]");\ + asm ("pshufb xmm9, [SUBSH_MASK+1*16]");\ + asm ("pshufb xmm10, [SUBSH_MASK+2*16]");\ + asm ("pshufb xmm11, [SUBSH_MASK+3*16]");\ + asm ("pshufb xmm12, [SUBSH_MASK+4*16]");\ + asm ("pshufb xmm13, [SUBSH_MASK+5*16]");\ + asm ("pshufb xmm14, [SUBSH_MASK+6*16]");\ + asm 
("pshufb xmm15, [SUBSH_MASK+7*16]");\ + /* SubBytes + MixBytes */\ + SUBMIX(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + VPERM_Add_Constant(0, 1, 2, 3, 4, 5, 6, 7, ALL_15, 8);\ + /* AddRoundConstant P1024 */\ + asm ("pxor xmm0, [ROUND_CONST_P+ebx*8]");\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + asm ("pshufb xmm0, [SUBSH_MASK+0*16]");\ + asm ("pshufb xmm1, [SUBSH_MASK+1*16]");\ + asm ("pshufb xmm2, [SUBSH_MASK+2*16]");\ + asm ("pshufb xmm3, [SUBSH_MASK+3*16]");\ + asm ("pshufb xmm4, [SUBSH_MASK+4*16]");\ + asm ("pshufb xmm5, [SUBSH_MASK+5*16]");\ + asm ("pshufb xmm6, [SUBSH_MASK+6*16]");\ + asm ("pshufb xmm7, [SUBSH_MASK+7*16]");\ + /* SubBytes + MixBytes */\ + SUBMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ + asm ("add al, 4");\ + asm ("add bl, 4");\ + asm ("mov rcx, rax");\ + asm ("sub cl, 28");\ + asm ("jb 1b");\ +}/**/ + +#define ROUNDS_Q(){\ + VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 1);\ + asm ("xor rax, rax");\ + asm ("xor rbx, rbx");\ + asm ("add bl, 2");\ + asm ("2:");\ + /* AddRoundConstant Q1024 */\ + asm ("movaps xmm1, [ALL_FF]");\ + asm ("pxor xmm8, xmm1");\ + asm ("pxor xmm9, xmm1");\ + asm ("pxor xmm10, xmm1");\ + asm ("pxor xmm11, xmm1");\ + asm ("pxor xmm12, xmm1");\ + asm ("pxor xmm13, xmm1");\ + asm ("pxor xmm14, xmm1");\ + asm ("pxor xmm15, [ROUND_CONST_Q+eax*8]");\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + asm ("pshufb xmm8, [SUBSH_MASK+1*16]");\ + asm ("pshufb xmm9, [SUBSH_MASK+3*16]");\ + asm ("pshufb xmm10, [SUBSH_MASK+5*16]");\ + asm ("pshufb xmm11, [SUBSH_MASK+7*16]");\ + asm ("pshufb xmm12, [SUBSH_MASK+0*16]");\ + asm ("pshufb xmm13, [SUBSH_MASK+2*16]");\ + asm ("pshufb xmm14, [SUBSH_MASK+4*16]");\ + asm ("pshufb xmm15, [SUBSH_MASK+6*16]");\ + /* SubBytes + MixBytes */\ + SUBMIX(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + /* AddRoundConstant Q1024 */\ + asm ("movaps xmm9, [ALL_FF]");\ + asm ("pxor xmm0, xmm9");\ + asm ("pxor xmm1, xmm9");\ + asm ("pxor xmm2, xmm9");\ + asm ("pxor xmm3, xmm9");\ + asm ("pxor xmm4, xmm9");\ + asm ("pxor xmm5, xmm9");\ + asm ("pxor xmm6, xmm9");\ + asm ("pxor xmm7, [ROUND_CONST_Q+ebx*8]");\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + asm ("pshufb xmm0, [SUBSH_MASK+1*16]");\ + asm ("pshufb xmm1, [SUBSH_MASK+3*16]");\ + asm ("pshufb xmm2, [SUBSH_MASK+5*16]");\ + asm ("pshufb xmm3, [SUBSH_MASK+7*16]");\ + asm ("pshufb xmm4, [SUBSH_MASK+0*16]");\ + asm ("pshufb xmm5, [SUBSH_MASK+2*16]");\ + asm ("pshufb xmm6, [SUBSH_MASK+4*16]");\ + asm ("pshufb xmm7, [SUBSH_MASK+6*16]");\ + /* SubBytes + MixBytes */\ + SUBMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + asm ("add al, 4");\ + asm ("add bl, 4");\ + asm ("mov rcx, rax");\ + asm ("sub cl, 28");\ + asm ("jb 2b");\ + VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 1);\ +}/**/ + + +/* Matrix Transpose + * input is a 1024-bit state with two columns in one xmm + * output is a 1024-bit state with two rows in one xmm + * inputs: i0-i7 + * outputs: i0-i7 + * clobbers: t0-t7 + */ +#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ + asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ +\ + asm ("pshufb xmm"tostr(i6)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ + asm ("movdqa xmm"tostr(t1)", xmm"tostr(i2)"");\ + asm ("pshufb xmm"tostr(i4)", 
xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i5)", xmm"tostr(t0)"");\ + asm ("movdqa xmm"tostr(t2)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(t3)", xmm"tostr(i6)"");\ + asm ("pshufb xmm"tostr(i7)", xmm"tostr(t0)"");\ +\ + /* continue with unpack using 4 temp registers */\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(i0)"");\ + asm ("punpckhwd xmm"tostr(t2)", xmm"tostr(i5)"");\ + asm ("punpcklwd xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpckhwd xmm"tostr(t3)", xmm"tostr(i7)"");\ + asm ("punpcklwd xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i1)"");\ + asm ("punpckhwd xmm"tostr(t1)", xmm"tostr(i3)"");\ + asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ +\ + /* shuffle with immediate */\ + asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ + asm ("pshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ + asm ("pshufd xmm"tostr(t2)", xmm"tostr(t2)", 216");\ + asm ("pshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ + asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ + asm ("pshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ + asm ("pshufd xmm"tostr(i6)", xmm"tostr(i6)", 216");\ +\ + /* continue with unpack */\ + asm ("movdqa xmm"tostr(t4)", xmm"tostr(i0)"");\ + asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("punpckhdq xmm"tostr(t4)", xmm"tostr(i2)"");\ + asm ("movdqa xmm"tostr(t5)", xmm"tostr(t0)"");\ + asm ("punpckldq xmm"tostr(t0)", xmm"tostr(t1)"");\ + asm ("punpckhdq xmm"tostr(t5)", xmm"tostr(t1)"");\ + asm ("movdqa xmm"tostr(t6)", xmm"tostr(i4)"");\ + asm ("punpckldq xmm"tostr(i4)", xmm"tostr(i6)"");\ + asm ("movdqa xmm"tostr(t7)", xmm"tostr(t2)"");\ + asm ("punpckhdq xmm"tostr(t6)", xmm"tostr(i6)"");\ + asm ("movdqa xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("punpckldq xmm"tostr(t2)", xmm"tostr(t3)"");\ + asm ("movdqa xmm"tostr(i3)", xmm"tostr(t0)"");\ + asm ("punpckhdq xmm"tostr(t7)", xmm"tostr(t3)"");\ +\ + /* there are now 2 rows in each xmm */\ + /* unpack to get 1 row of CV in each xmm */\ + asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ + asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(i4)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(i4)", xmm"tostr(t4)"");\ + asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t2)"");\ + asm ("movdqa xmm"tostr(i5)", xmm"tostr(t4)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t2)"");\ + asm ("movdqa xmm"tostr(i6)", xmm"tostr(t5)"");\ + asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t6)"");\ + asm ("movdqa xmm"tostr(i7)", xmm"tostr(t5)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t6)"");\ + asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t7)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t7)"");\ + /* transpose done */\ +}/**/ + +/* Matrix Transpose Inverse + * input is a 1024-bit state with two rows in one xmm + * output is a 1024-bit state with two columns in one xmm + * inputs: i0-i7 + * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) + * clobbers: t0-t4 + */ +#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ + /* transpose matrix to get output format */\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i1)"");\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpckhqdq xmm"tostr(t0)", xmm"tostr(i3)"");\ + asm ("movdqa xmm"tostr(t1)", xmm"tostr(i4)"");\ + asm ("punpcklqdq 
xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpckhqdq xmm"tostr(t1)", xmm"tostr(i5)"");\ + asm ("movdqa xmm"tostr(t2)", xmm"tostr(i6)"");\ + asm ("movaps xmm"tostr(o0)", [TRANSP_MASK]");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("punpckhqdq xmm"tostr(t2)", xmm"tostr(i7)"");\ + /* load transpose mask into a register, because it will be used 8 times */\ + asm ("pshufb xmm"tostr(i0)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(i2)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(i4)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(i6)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(o1)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(t0)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(t1)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(t2)", xmm"tostr(o0)"");\ + /* continue with unpack using 4 temp registers */\ + asm ("movdqa xmm"tostr(t3)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(o1)"");\ + asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(t4)", xmm"tostr(t1)"");\ + \ + asm ("punpckhwd xmm"tostr(t3)", xmm"tostr(i6)"");\ + asm ("punpcklwd xmm"tostr(i4)", xmm"tostr(i6)"");\ + asm ("punpckhwd xmm"tostr(o0)", xmm"tostr(i2)"");\ + asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("punpckhwd xmm"tostr(o2)", xmm"tostr(t0)"");\ + asm ("punpcklwd xmm"tostr(o1)", xmm"tostr(t0)"");\ + asm ("punpckhwd xmm"tostr(t4)", xmm"tostr(t2)"");\ + asm ("punpcklwd xmm"tostr(t1)", xmm"tostr(t2)"");\ + /* shuffle with immediate */\ + asm ("pshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ + asm ("pshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ + asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ + asm ("pshufd xmm"tostr(o2)", xmm"tostr(o2)", 216");\ + asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("pshufd xmm"tostr(o0)", xmm"tostr(o0)", 216");\ + asm ("pshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ + asm ("pshufd xmm"tostr(t4)", xmm"tostr(t4)", 216");\ + /* continue with unpack */\ + asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(i3)", xmm"tostr(o0)"");\ + asm ("movdqa xmm"tostr(i5)", xmm"tostr(o1)"");\ + asm ("movdqa xmm"tostr(i7)", xmm"tostr(o2)"");\ + asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("punpckhdq xmm"tostr(i1)", xmm"tostr(i4)"");\ + asm ("punpckldq xmm"tostr(o0)", xmm"tostr(t3)"");\ + asm ("punpckhdq xmm"tostr(i3)", xmm"tostr(t3)"");\ + asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t1)"");\ + asm ("punpckhdq xmm"tostr(i5)", xmm"tostr(t1)"");\ + asm ("punpckldq xmm"tostr(o2)", xmm"tostr(t4)"");\ + asm ("punpckhdq xmm"tostr(i7)", xmm"tostr(t4)"");\ + /* transpose done */\ +}/**/ + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst_CNT2(i, j){\ + asm ("movaps xmm0, [ROUND_CONST_P+"tostr(i)"*16]");\ + asm ("movaps xmm1, [ROUND_CONST_P+"tostr(j)"*16]");\ + asm ("movaps xmm2, [ROUND_CONST_Q+"tostr(i)"*16]");\ + asm ("movaps xmm3, [ROUND_CONST_Q+"tostr(j)"*16]");\ + VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ + asm ("pxor xmm2, [ALL_15]");\ + asm ("pxor xmm3, [ALL_15]");\ + asm ("movaps [ROUND_CONST_P+"tostr(i)"*16], xmm0");\ + asm ("movaps [ROUND_CONST_P+"tostr(j)"*16], xmm1");\ + asm ("movaps [ROUND_CONST_Q+"tostr(i)"*16], xmm2");\ + asm ("movaps [ROUND_CONST_Q+"tostr(j)"*16], xmm3");\ +}/**/ + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst(){\ + VPERM_Transform_RoundConst_CNT2(0, 1);\ + VPERM_Transform_RoundConst_CNT2(2, 3);\ + VPERM_Transform_RoundConst_CNT2(4, 5);\ + VPERM_Transform_RoundConst_CNT2(6, 7);\ + 
VPERM_Transform_RoundConst_CNT2(8, 9);\ + VPERM_Transform_RoundConst_CNT2(10, 11);\ + VPERM_Transform_RoundConst_CNT2(12, 13);\ + asm ("movaps xmm0, [ALL_FF]");\ + VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ + asm ("pxor xmm0, [ALL_15]");\ + asm ("movaps [ALL_FF], xmm0");\ +}/**/ + + +void INIT(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + asm volatile ("emms"); + + /* transform round constants into VPERM mode */ + VPERM_Transform_RoundConst(); + + /* load IV into registers xmm8 - xmm15 */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm9, [rdi+1*16]"); + asm ("movaps xmm10, [rdi+2*16]"); + asm ("movaps xmm11, [rdi+3*16]"); + asm ("movaps xmm12, [rdi+4*16]"); + asm ("movaps xmm13, [rdi+5*16]"); + asm ("movaps xmm14, [rdi+6*16]"); + asm ("movaps xmm15, [rdi+7*16]"); + + /* transform chaining value from column ordering into row ordering */ + VPERM_Transform_State( 8, 9, 10, 11, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); + VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); + Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); + + /* store transposed IV */ + asm ("movaps [rdi+0*16], xmm8"); + asm ("movaps [rdi+1*16], xmm9"); + asm ("movaps [rdi+2*16], xmm10"); + asm ("movaps [rdi+3*16], xmm11"); + asm ("movaps [rdi+4*16], xmm12"); + asm ("movaps [rdi+5*16], xmm13"); + asm ("movaps [rdi+6*16], xmm14"); + asm ("movaps [rdi+7*16], xmm15"); + + asm volatile ("emms"); + asm (".att_syntax noprefix"); +} + +void TF1024(u64* h, u64* m) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + /* message M in rsi */ + +#ifdef IACA_TRACE + IACA_START; +#endif + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load message into registers xmm8 - xmm15 (Q = message) */ + asm ("movaps xmm8, [rsi+0*16]"); + asm ("movaps xmm9, [rsi+1*16]"); + asm ("movaps xmm10, [rsi+2*16]"); + asm ("movaps xmm11, [rsi+3*16]"); + asm ("movaps xmm12, [rsi+4*16]"); + asm ("movaps xmm13, [rsi+5*16]"); + asm ("movaps xmm14, [rsi+6*16]"); + asm ("movaps xmm15, [rsi+7*16]"); + + /* transform message M from column ordering into row ordering */ + VPERM_Transform_State( 8, 9, 10, 11, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); + VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); + Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); + + /* store message M (Q input) for later */ + asm ("movaps [QTEMP+0*16], xmm8"); + asm ("movaps [QTEMP+1*16], xmm9"); + asm ("movaps [QTEMP+2*16], xmm10"); + asm ("movaps [QTEMP+3*16], xmm11"); + asm ("movaps [QTEMP+4*16], xmm12"); + asm ("movaps [QTEMP+5*16], xmm13"); + asm ("movaps [QTEMP+6*16], xmm14"); + asm ("movaps [QTEMP+7*16], xmm15"); + + /* xor CV to message to get P input */ + /* result: CV+M in xmm8...xmm15 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm9, [rdi+1*16]"); + asm ("pxor xmm10, [rdi+2*16]"); + asm ("pxor xmm11, [rdi+3*16]"); + asm ("pxor xmm12, [rdi+4*16]"); + asm ("pxor xmm13, [rdi+5*16]"); + asm ("pxor xmm14, [rdi+6*16]"); + asm ("pxor xmm15, [rdi+7*16]"); + + /* compute permutation P */ + /* result: P(CV+M) in xmm8...xmm15 */ + ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV+M)+CV in xmm8...xmm15 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm9, [rdi+1*16]"); + asm ("pxor xmm10, [rdi+2*16]"); + asm ("pxor xmm11, [rdi+3*16]"); + asm ("pxor xmm12, [rdi+4*16]"); + asm ("pxor xmm13, [rdi+5*16]"); + asm ("pxor xmm14, [rdi+6*16]"); + asm ("pxor xmm15, [rdi+7*16]"); + + /* 
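     (Only P has been applied so far.  The full compression step is
     P(CV+M) + CV + Q(M): Q is computed next from the message copy saved in
     QTEMP and xored onto this intermediate result.)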
store P(CV+M)+CV */ + asm ("movaps [rdi+0*16], xmm8"); + asm ("movaps [rdi+1*16], xmm9"); + asm ("movaps [rdi+2*16], xmm10"); + asm ("movaps [rdi+3*16], xmm11"); + asm ("movaps [rdi+4*16], xmm12"); + asm ("movaps [rdi+5*16], xmm13"); + asm ("movaps [rdi+6*16], xmm14"); + asm ("movaps [rdi+7*16], xmm15"); + + /* load message M (Q input) into xmm8-15 */ + asm ("movaps xmm8, [QTEMP+0*16]"); + asm ("movaps xmm9, [QTEMP+1*16]"); + asm ("movaps xmm10, [QTEMP+2*16]"); + asm ("movaps xmm11, [QTEMP+3*16]"); + asm ("movaps xmm12, [QTEMP+4*16]"); + asm ("movaps xmm13, [QTEMP+5*16]"); + asm ("movaps xmm14, [QTEMP+6*16]"); + asm ("movaps xmm15, [QTEMP+7*16]"); + + /* compute permutation Q */ + /* result: Q(M) in xmm8...xmm15 */ + ROUNDS_Q(); + + /* xor Q output */ + /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm9, [rdi+1*16]"); + asm ("pxor xmm10, [rdi+2*16]"); + asm ("pxor xmm11, [rdi+3*16]"); + asm ("pxor xmm12, [rdi+4*16]"); + asm ("pxor xmm13, [rdi+5*16]"); + asm ("pxor xmm14, [rdi+6*16]"); + asm ("pxor xmm15, [rdi+7*16]"); + + /* store CV */ + asm ("movaps [rdi+0*16], xmm8"); + asm ("movaps [rdi+1*16], xmm9"); + asm ("movaps [rdi+2*16], xmm10"); + asm ("movaps [rdi+3*16], xmm11"); + asm ("movaps [rdi+4*16], xmm12"); + asm ("movaps [rdi+5*16], xmm13"); + asm ("movaps [rdi+6*16], xmm14"); + asm ("movaps [rdi+7*16], xmm15"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + +#ifdef IACA_TRACE + IACA_END; +#endif + + return; +} + +void OF1024(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load CV into registers xmm8 - xmm15 */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm9, [rdi+1*16]"); + asm ("movaps xmm10, [rdi+2*16]"); + asm ("movaps xmm11, [rdi+3*16]"); + asm ("movaps xmm12, [rdi+4*16]"); + asm ("movaps xmm13, [rdi+5*16]"); + asm ("movaps xmm14, [rdi+6*16]"); + asm ("movaps xmm15, [rdi+7*16]"); + + /* compute permutation P */ + /* result: P(CV) in xmm8...xmm15 */ + ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8...xmm15 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm9, [rdi+1*16]"); + asm ("pxor xmm10, [rdi+2*16]"); + asm ("pxor xmm11, [rdi+3*16]"); + asm ("pxor xmm12, [rdi+4*16]"); + asm ("pxor xmm13, [rdi+5*16]"); + asm ("pxor xmm14, [rdi+6*16]"); + asm ("pxor xmm15, [rdi+7*16]"); + + /* transpose CV back from row ordering to column ordering */ + /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ + Matrix_Transpose_INV(8, 9, 10, 11, 12, 13, 14, 15, 4, 0, 6, 1, 2, 3, 5, 7); + VPERM_Transform_State( 0, 6, 13, 15, VPERM_OPT, 1, 2, 3, 5, 7, 10, 12); + + /* we only need to return the truncated half of the state */ + asm ("movaps [rdi+4*16], xmm0"); + asm ("movaps [rdi+5*16], xmm6"); + asm ("movaps [rdi+6*16], xmm13"); + asm ("movaps [rdi+7*16], xmm15"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + + return; +} + +#endif + diff --git a/algo/aes_ni/groestl-intr-aes.h b/algo/aes_ni/groestl-intr-aes.h new file mode 100644 index 000000000..3502c0358 --- /dev/null +++ b/algo/aes_ni/groestl-intr-aes.h @@ -0,0 +1,965 @@ +/* groestl-intr-aes.h Aug 2011 + * + * Groestl implementation with intrinsics using ssse3, sse4.1, and aes + * instructions. + * Author: Günther A. 
Roland, Martin Schläffer, Krystian Matusiewicz + * + * This code is placed in the public domain + */ + +#include +#include +#include "hash-groestl.h" + +/* global constants */ +__m128i ROUND_CONST_Lx; +__m128i ROUND_CONST_L0[ROUNDS512]; +__m128i ROUND_CONST_L7[ROUNDS512]; +__m128i ROUND_CONST_P[ROUNDS1024]; +__m128i ROUND_CONST_Q[ROUNDS1024]; +__m128i TRANSP_MASK; +__m128i SUBSH_MASK[8]; +__m128i ALL_1B; +__m128i ALL_FF; + + +#define tos(a) #a +#define tostr(a) tos(a) + + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b */ +#define MUL2(i, j, k){\ + j = _mm_xor_si128(j, j);\ + j = _mm_cmpgt_epi8(j, i);\ + i = _mm_add_epi8(i, i);\ + j = _mm_and_si128(j, k);\ + i = _mm_xor_si128(i, j);\ +} + + /**/ + +/* Yet another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + b6 = a0;\ + b7 = a1;\ + a0 = _mm_xor_si128(a0, a1);\ + b0 = a2;\ + a1 = _mm_xor_si128(a1, a2);\ + b1 = a3;\ + a2 = _mm_xor_si128(a2, a3);\ + b2 = a4;\ + a3 = _mm_xor_si128(a3, a4);\ + b3 = a5;\ + a4 = _mm_xor_si128(a4, a5);\ + b4 = a6;\ + a5 = _mm_xor_si128(a5, a6);\ + b5 = a7;\ + a6 = _mm_xor_si128(a6, a7);\ + a7 = _mm_xor_si128(a7, b6);\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + b0 = _mm_xor_si128(b0, a4);\ + b6 = _mm_xor_si128(b6, a4);\ + b1 = _mm_xor_si128(b1, a5);\ + b7 = _mm_xor_si128(b7, a5);\ + b2 = _mm_xor_si128(b2, a6);\ + b0 = _mm_xor_si128(b0, a6);\ + /* spill values y_4, y_5 to memory */\ + TEMP0 = b0;\ + b3 = _mm_xor_si128(b3, a7);\ + b1 = _mm_xor_si128(b1, a7);\ + TEMP1 = b1;\ + b4 = _mm_xor_si128(b4, a0);\ + b2 = _mm_xor_si128(b2, a0);\ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b5 = _mm_xor_si128(b5, a1);\ + b3 = _mm_xor_si128(b3, a1);\ + b1 = a1;\ + b6 = _mm_xor_si128(b6, a2);\ + b4 = _mm_xor_si128(b4, a2);\ + TEMP2 = a2;\ + b7 = _mm_xor_si128(b7, a3);\ + b5 = _mm_xor_si128(b5, a3);\ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm_xor_si128(a0, a3);\ + a1 = _mm_xor_si128(a1, a4);\ + a2 = _mm_xor_si128(a2, a5);\ + a3 = _mm_xor_si128(a3, a6);\ + a4 = _mm_xor_si128(a4, a7);\ + a5 = _mm_xor_si128(a5, b0);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, TEMP2);\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = ALL_1B;\ + MUL2(a0, b0, b1);\ + a0 = _mm_xor_si128(a0, TEMP0);\ + MUL2(a1, b0, b1);\ + a1 = _mm_xor_si128(a1, TEMP1);\ + MUL2(a2, b0, b1);\ + a2 = _mm_xor_si128(a2, b2);\ + MUL2(a3, b0, b1);\ + a3 = _mm_xor_si128(a3, b3);\ + MUL2(a4, b0, b1);\ + a4 = _mm_xor_si128(a4, b4);\ + MUL2(a5, b0, b1);\ + a5 = _mm_xor_si128(a5, b5);\ + MUL2(a6, b0, b1);\ + a6 = _mm_xor_si128(a6, b6);\ + MUL2(a7, b0, b1);\ + a7 = _mm_xor_si128(a7, b7);\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... 
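     (For reference: MUL2 above is the usual GF(2^8) "xtime" applied to all
     16 bytes of a register at once.  A rough scalar sketch of the same
     doubling for a single byte x, in plain C:

         y = (uint8_t)(x << 1);
         if (x & 0x80) y ^= 0x1b;   reduction by the AES polynomial 0x11b

     The cmpgt/add/and/xor sequence in MUL2 does the same thing branch-free:
     the signed byte compare against zero yields a 0xff mask exactly where a
     byte has its top bit set, and that mask selects the 0x1b reduction.
     The MUL2 calls in this block are the z_i = 2*x_i and v_i = 2*w_i
     doublings from the relations listed in the comment above this macro.)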
*/\ + MUL2(a0, b0, b1);\ + b5 = _mm_xor_si128(b5, a0);\ + MUL2(a1, b0, b1);\ + b6 = _mm_xor_si128(b6, a1);\ + MUL2(a2, b0, b1);\ + b7 = _mm_xor_si128(b7, a2);\ + MUL2(a5, b0, b1);\ + b2 = _mm_xor_si128(b2, a5);\ + MUL2(a6, b0, b1);\ + b3 = _mm_xor_si128(b3, a6);\ + MUL2(a7, b0, b1);\ + b4 = _mm_xor_si128(b4, a7);\ + MUL2(a3, b0, b1);\ + MUL2(a4, b0, b1);\ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm_xor_si128(b0, a3);\ + b1 = _mm_xor_si128(b1, a4);\ +}/*MixBytes*/ + +#if (LENGTH <= 256) + +#define SET_CONSTANTS(){\ + ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\ + SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\ + SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\ + SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\ + SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\ + SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\ + SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\ + SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ + }\ + ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ +}while(0); \ + +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant */\ + b1 = ROUND_CONST_Lx;\ + a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ + a1 = _mm_xor_si128(a1, b1);\ + a2 = _mm_xor_si128(a2, b1);\ + a3 = _mm_xor_si128(a3, b1);\ + a4 = _mm_xor_si128(a4, b1);\ + a5 = _mm_xor_si128(a5, b1);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ + \ + /* ShiftBytes + SubBytes (interleaved) */\ + b0 = _mm_xor_si128(b0, b0);\ + a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ + a0 = _mm_aesenclast_si128(a0, b0);\ + a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ + a1 = _mm_aesenclast_si128(a1, b0);\ + a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ + a2 = _mm_aesenclast_si128(a2, b0);\ + a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ + a3 = _mm_aesenclast_si128(a3, b0);\ + a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ + a4 = _mm_aesenclast_si128(a4, b0);\ + a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ + a5 = _mm_aesenclast_si128(a5, b0);\ + a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ + a6 = _mm_aesenclast_si128(a6, b0);\ + a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ + a7 = _mm_aesenclast_si128(a7, b0);\ + \ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 
xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ +} + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + t0 = TRANSP_MASK;\ + \ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ + \ + o1 = i0;\ + t0 = i2;\ + \ + i0 = _mm_unpacklo_epi16(i0, i1);\ + o1 = _mm_unpackhi_epi16(o1, i1);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + t0 = _mm_unpackhi_epi16(t0, i3);\ + \ + i0 = _mm_shuffle_epi32(i0, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + t0 = _mm_shuffle_epi32(t0, 216);\ + \ + o2 = i0;\ + o3 = o1;\ + \ + i0 = _mm_unpacklo_epi32(i0, i2);\ + o1 = _mm_unpacklo_epi32(o1, t0);\ + o2 = _mm_unpackhi_epi32(o2, i2);\ + o3 = _mm_unpackhi_epi32(o3, t0);\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + o1 = i0;\ + o2 = i1;\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + o1 = _mm_unpackhi_epi64(o1, i4);\ + o3 = i1;\ + o4 = i2;\ + o2 = _mm_unpacklo_epi64(o2, i5);\ + o3 = _mm_unpackhi_epi64(o3, i5);\ + o5 = i2;\ + o6 = i3;\ + o4 = _mm_unpacklo_epi64(o4, i6);\ + o5 = _mm_unpackhi_epi64(o5, i6);\ + o7 = i3;\ + o6 = _mm_unpacklo_epi64(o6, i7);\ + o7 = _mm_unpackhi_epi64(o7, i7);\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + o0 = i0;\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o0 = _mm_unpackhi_epi64(o0, i1);\ + o1 = i2;\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + o1 = _mm_unpackhi_epi64(o1, i3);\ + o2 = i4;\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + o2 = _mm_unpackhi_epi64(o2, i5);\ + o3 = i6;\ + i6 = _mm_unpacklo_epi64(i6, i7);\ + o3 = _mm_unpackhi_epi64(o3, i7);\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + t0 = _mm_xor_si128(t0, t0);\ + i1 = i0;\ + i3 = i2;\ + i5 = i4;\ 
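  /* (each input register is duplicated so that its two 64-bit rows can be */\
  /* separated: unpacklo keeps the low row next to zero, while unpackhi    */\
  /* moves the high row of the copy into its low half)                     */\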
+ i7 = i6;\ + i0 = _mm_unpacklo_epi64(i0, t0);\ + i1 = _mm_unpackhi_epi64(i1, t0);\ + i2 = _mm_unpacklo_epi64(i2, t0);\ + i3 = _mm_unpackhi_epi64(i3, t0);\ + i4 = _mm_unpacklo_epi64(i4, t0);\ + i5 = _mm_unpackhi_epi64(i5, t0);\ + i6 = _mm_unpacklo_epi64(i6, t0);\ + i7 = _mm_unpackhi_epi64(i7, t0);\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ +endif\ +}/**/ + + +void INIT(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7; + static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm12 - xmm15 */ + xmm12 = chaining[0]; + xmm13 = chaining[1]; + xmm14 = chaining[2]; + xmm15 = chaining[3]; + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* store transposed IV */ + chaining[0] = xmm12; + chaining[1] = xmm2; + chaining[2] = xmm6; + chaining[3] = xmm7; +} + +void TF512(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm12 - xmm15 */ + xmm12 = message[0]; + xmm13 = message[1]; + xmm14 = message[2]; + xmm15 = message[3]; + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + xmm8 = chaining[0]; + xmm0 = chaining[1]; + xmm4 = chaining[2]; + xmm5 = chaining[3]; + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + xmm8 = _mm_xor_si128(xmm8, xmm12); + xmm0 = _mm_xor_si128(xmm0, xmm2); + xmm4 = _mm_xor_si128(xmm4, xmm6); + xmm5 = _mm_xor_si128(xmm5, xmm7); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, xmm8); + xmm1 = _mm_xor_si128(xmm1, xmm10); + xmm2 = _mm_xor_si128(xmm2, xmm12); + xmm3 = _mm_xor_si128(xmm3, xmm14); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + xmm0 = 
_mm_xor_si128(xmm0, (chaining[0])); + xmm1 = _mm_xor_si128(xmm1, (chaining[1])); + xmm2 = _mm_xor_si128(xmm2, (chaining[2])); + xmm3 = _mm_xor_si128(xmm3, (chaining[3])); + + /* store CV */ + chaining[0] = xmm0; + chaining[1] = xmm1; + chaining[2] = xmm2; + chaining[3] = xmm3; + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF512(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + xmm8 = chaining[0]; + xmm10 = chaining[1]; + xmm12 = chaining[2]; + xmm14 = chaining[3]; + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm10 = _mm_xor_si128(xmm10, (chaining[1])); + xmm12 = _mm_xor_si128(xmm12, (chaining[2])); + xmm14 = _mm_xor_si128(xmm14, (chaining[3])); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); + + /* we only need to return the truncated half of the state */ + chaining[2] = xmm9; + chaining[3] = xmm11; +} + +#endif + +#if (LENGTH > 256) + +#define SET_CONSTANTS(){\ + ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ + ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + SUBSH_MASK[0] = _mm_set_epi32(0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00);\ + SUBSH_MASK[1] = _mm_set_epi32(0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01);\ + SUBSH_MASK[2] = _mm_set_epi32(0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02);\ + SUBSH_MASK[3] = _mm_set_epi32(0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003);\ + SUBSH_MASK[4] = _mm_set_epi32(0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104);\ + SUBSH_MASK[5] = _mm_set_epi32(0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205);\ + SUBSH_MASK[6] = _mm_set_epi32(0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306);\ + SUBSH_MASK[7] = _mm_set_epi32(0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b);\ + for(i = 0; i < ROUNDS1024; i++)\ + {\ + ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\ + }\ +}while(0);\ + +/* one round + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* SubBytes */\ + b0 = _mm_xor_si128(b0, b0);\ + a0 = _mm_aesenclast_si128(a0, b0);\ + a1 = _mm_aesenclast_si128(a1, b0);\ + a2 = _mm_aesenclast_si128(a2, 
b0);\ + a3 = _mm_aesenclast_si128(a3, b0);\ + a4 = _mm_aesenclast_si128(a4, b0);\ + a5 = _mm_aesenclast_si128(a5, b0);\ + a6 = _mm_aesenclast_si128(a6, b0);\ + a7 = _mm_aesenclast_si128(a7, b0);\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +#define ROUNDS_P(){\ + u8 round_counter = 0;\ + for(round_counter = 0; round_counter < 14; round_counter+=2) {\ + /* AddRoundConstant P1024 */\ + xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\ + xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\ + xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\ + xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[3]));\ + xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[4]));\ + xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[5]));\ + xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[6]));\ + xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[7]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + \ + /* AddRoundConstant P1024 */\ + xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\ + xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\ + xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\ + xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[3]));\ + xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[4]));\ + xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\ + xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\ + xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + }\ +} + +#define ROUNDS_Q(){\ + u8 round_counter = 0;\ + for(round_counter = 0; round_counter < 14; round_counter+=2) {\ + /* AddRoundConstant Q1024 */\ + xmm1 = ALL_FF;\ + xmm8 = _mm_xor_si128(xmm8, xmm1);\ + xmm9 = _mm_xor_si128(xmm9, xmm1);\ + xmm10 = _mm_xor_si128(xmm10, xmm1);\ + xmm11 = _mm_xor_si128(xmm11, xmm1);\ + xmm12 = _mm_xor_si128(xmm12, xmm1);\ + xmm13 = _mm_xor_si128(xmm13, xmm1);\ + xmm14 = _mm_xor_si128(xmm14, xmm1);\ + xmm15 = _mm_xor_si128(xmm15, (ROUND_CONST_Q[round_counter]));\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[1]));\ + xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[3]));\ + xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[5]));\ + xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[7]));\ + xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[0]));\ + xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[2]));\ + xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[4]));\ + xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[6]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + \ + /* AddRoundConstant Q1024 */\ + xmm9 = ALL_FF;\ + xmm0 = _mm_xor_si128(xmm0, xmm9);\ + xmm1 = _mm_xor_si128(xmm1, xmm9);\ + xmm2 = _mm_xor_si128(xmm2, xmm9);\ + xmm3 = _mm_xor_si128(xmm3, xmm9);\ + xmm4 = _mm_xor_si128(xmm4, xmm9);\ + xmm5 = _mm_xor_si128(xmm5, xmm9);\ + xmm6 = _mm_xor_si128(xmm6, xmm9);\ + xmm7 = _mm_xor_si128(xmm7, (ROUND_CONST_Q[round_counter+1]));\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[1]));\ + xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[3]));\ + xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[5]));\ + xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[7]));\ + xmm4 = 
_mm_shuffle_epi8(xmm4, (SUBSH_MASK[0]));\ + xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[2]));\ + xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[4]));\ + xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[6]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + }\ +} + +/* Matrix Transpose + * input is a 1024-bit state with two columns in one xmm + * output is a 1024-bit state with two rows in one xmm + * inputs: i0-i7 + * outputs: i0-i7 + * clobbers: t0-t7 + */ +#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ + t0 = TRANSP_MASK;\ +\ + i6 = _mm_shuffle_epi8(i6, t0);\ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ + t1 = i2;\ + i4 = _mm_shuffle_epi8(i4, t0);\ + i5 = _mm_shuffle_epi8(i5, t0);\ + t2 = i4;\ + t3 = i6;\ + i7 = _mm_shuffle_epi8(i7, t0);\ +\ + /* continue with unpack using 4 temp registers */\ + t0 = i0;\ + t2 = _mm_unpackhi_epi16(t2, i5);\ + i4 = _mm_unpacklo_epi16(i4, i5);\ + t3 = _mm_unpackhi_epi16(t3, i7);\ + i6 = _mm_unpacklo_epi16(i6, i7);\ + t0 = _mm_unpackhi_epi16(t0, i1);\ + t1 = _mm_unpackhi_epi16(t1, i3);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + i0 = _mm_unpacklo_epi16(i0, i1);\ +\ + /* shuffle with immediate */\ + t0 = _mm_shuffle_epi32(t0, 216);\ + t1 = _mm_shuffle_epi32(t1, 216);\ + t2 = _mm_shuffle_epi32(t2, 216);\ + t3 = _mm_shuffle_epi32(t3, 216);\ + i0 = _mm_shuffle_epi32(i0, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + i4 = _mm_shuffle_epi32(i4, 216);\ + i6 = _mm_shuffle_epi32(i6, 216);\ +\ + /* continue with unpack */\ + t4 = i0;\ + i0 = _mm_unpacklo_epi32(i0, i2);\ + t4 = _mm_unpackhi_epi32(t4, i2);\ + t5 = t0;\ + t0 = _mm_unpacklo_epi32(t0, t1);\ + t5 = _mm_unpackhi_epi32(t5, t1);\ + t6 = i4;\ + i4 = _mm_unpacklo_epi32(i4, i6);\ + t7 = t2;\ + t6 = _mm_unpackhi_epi32(t6, i6);\ + i2 = t0;\ + t2 = _mm_unpacklo_epi32(t2, t3);\ + i3 = t0;\ + t7 = _mm_unpackhi_epi32(t7, t3);\ +\ + /* there are now 2 rows in each xmm */\ + /* unpack to get 1 row of CV in each xmm */\ + i1 = i0;\ + i1 = _mm_unpackhi_epi64(i1, i4);\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + i4 = t4;\ + i3 = _mm_unpackhi_epi64(i3, t2);\ + i5 = t4;\ + i2 = _mm_unpacklo_epi64(i2, t2);\ + i6 = t5;\ + i5 = _mm_unpackhi_epi64(i5, t6);\ + i7 = t5;\ + i4 = _mm_unpacklo_epi64(i4, t6);\ + i7 = _mm_unpackhi_epi64(i7, t7);\ + i6 = _mm_unpacklo_epi64(i6, t7);\ + /* transpose done */\ +}/**/ + +/* Matrix Transpose Inverse + * input is a 1024-bit state with two rows in one xmm + * output is a 1024-bit state with two columns in one xmm + * inputs: i0-i7 + * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) + * clobbers: t0-t4 + */ +#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ + /* transpose matrix to get output format */\ + o1 = i0;\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o1 = _mm_unpackhi_epi64(o1, i1);\ + t0 = i2;\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + t0 = _mm_unpackhi_epi64(t0, i3);\ + t1 = i4;\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + t1 = _mm_unpackhi_epi64(t1, i5);\ + t2 = i6;\ + o0 = TRANSP_MASK;\ + i6 = _mm_unpacklo_epi64(i6, i7);\ + t2 = _mm_unpackhi_epi64(t2, i7);\ + /* load transpose mask into a register, because it will be used 8 times */\ + i0 = _mm_shuffle_epi8(i0, o0);\ + i2 = _mm_shuffle_epi8(i2, o0);\ + i4 = _mm_shuffle_epi8(i4, o0);\ + i6 = _mm_shuffle_epi8(i6, o0);\ + o1 = _mm_shuffle_epi8(o1, o0);\ + t0 = _mm_shuffle_epi8(t0, o0);\ + t1 = _mm_shuffle_epi8(t1, 
o0);\ + t2 = _mm_shuffle_epi8(t2, o0);\ + /* continue with unpack using 4 temp registers */\ + t3 = i4;\ + o2 = o1;\ + o0 = i0;\ + t4 = t1;\ + \ + t3 = _mm_unpackhi_epi16(t3, i6);\ + i4 = _mm_unpacklo_epi16(i4, i6);\ + o0 = _mm_unpackhi_epi16(o0, i2);\ + i0 = _mm_unpacklo_epi16(i0, i2);\ + o2 = _mm_unpackhi_epi16(o2, t0);\ + o1 = _mm_unpacklo_epi16(o1, t0);\ + t4 = _mm_unpackhi_epi16(t4, t2);\ + t1 = _mm_unpacklo_epi16(t1, t2);\ + /* shuffle with immediate */\ + i4 = _mm_shuffle_epi32(i4, 216);\ + t3 = _mm_shuffle_epi32(t3, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + o2 = _mm_shuffle_epi32(o2, 216);\ + i0 = _mm_shuffle_epi32(i0, 216);\ + o0 = _mm_shuffle_epi32(o0, 216);\ + t1 = _mm_shuffle_epi32(t1, 216);\ + t4 = _mm_shuffle_epi32(t4, 216);\ + /* continue with unpack */\ + i1 = i0;\ + i3 = o0;\ + i5 = o1;\ + i7 = o2;\ + i0 = _mm_unpacklo_epi32(i0, i4);\ + i1 = _mm_unpackhi_epi32(i1, i4);\ + o0 = _mm_unpacklo_epi32(o0, t3);\ + i3 = _mm_unpackhi_epi32(i3, t3);\ + o1 = _mm_unpacklo_epi32(o1, t1);\ + i5 = _mm_unpackhi_epi32(i5, t1);\ + o2 = _mm_unpacklo_epi32(o2, t4);\ + i7 = _mm_unpackhi_epi32(i7, t4);\ + /* transpose done */\ +}/**/ + + +void INIT(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm8 - xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + /* transform chaining value from column ordering into row ordering */ + Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store transposed IV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; +} + +void TF1024(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i QTEMP[8]; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm8 - xmm15 (Q = message) */ + xmm8 = message[0]; + xmm9 = message[1]; + xmm10 = message[2]; + xmm11 = message[3]; + xmm12 = message[4]; + xmm13 = message[5]; + xmm14 = message[6]; + xmm15 = message[7]; + + /* transform message M from column ordering into row ordering */ + Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store message M (Q input) for later */ + QTEMP[0] = xmm8; + QTEMP[1] = xmm9; + QTEMP[2] = xmm10; + QTEMP[3] = xmm11; + QTEMP[4] = xmm12; + QTEMP[5] = xmm13; + QTEMP[6] = xmm14; + QTEMP[7] = xmm15; + + /* xor CV to message to get P input */ + /* result: CV+M in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, (chaining[7])); + + /* compute permutation P */ + /* result: P(CV+M) in xmm8...xmm15 */ + 
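/* For orientation: the sequence below -- ROUNDS_P() on CV+M, a feed-forward
 * XOR of CV, then ROUNDS_Q() on M and a final XOR into the stored state --
 * computes the Groestl compression f(h, m) = P(h ^ m) ^ Q(m) ^ h on the
 * 1024-bit state. A minimal scalar sketch of that structure only; the
 * permutation callbacks are hypothetical stand-ins for the 14-round P and Q:
 */
#include <stdint.h>

typedef void (*groestl_perm_fn)(uint64_t state[16]);   /* 1024-bit state = 16 u64 words */

static void compress1024_sketch(uint64_t h[16], const uint64_t m[16],
                                groestl_perm_fn P, groestl_perm_fn Q)
{
    uint64_t p[16], q[16];
    for (int i = 0; i < 16; i++) p[i] = h[i] ^ m[i];    /* P input: CV + M */
    for (int i = 0; i < 16; i++) q[i] = m[i];           /* Q input: M      */
    P(p);
    Q(q);
    for (int i = 0; i < 16; i++)
        h[i] ^= p[i] ^ q[i];    /* new CV = P(CV+M) + CV + Q(M) */
}
/* Unlike the 512-bit path, where ROUNDS_P_Q runs both permutations interleaved
 * in the same registers, this 1024-bit path computes P and Q one after the
 * other, with the Q input parked in QTEMP[] in the meantime. */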
ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV+M)+CV in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, (chaining[7])); + + /* store P(CV+M)+CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; + + /* load message M (Q input) into xmm8-15 */ + xmm8 = QTEMP[0]; + xmm9 = QTEMP[1]; + xmm10 = QTEMP[2]; + xmm11 = QTEMP[3]; + xmm12 = QTEMP[4]; + xmm13 = QTEMP[5]; + xmm14 = QTEMP[6]; + xmm15 = QTEMP[7]; + + /* compute permutation Q */ + /* result: Q(M) in xmm8...xmm15 */ + ROUNDS_Q(); + + /* xor Q output */ + /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, (chaining[7])); + + /* store CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; + +#ifdef IACA_TRACE + IACA_END; +#endif + + return; +} + +void OF1024(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + + /* load CV into registers xmm8 - xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + /* compute permutation P */ + /* result: P(CV) in xmm8...xmm15 */ + ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, (chaining[7])); + + /* transpose CV back from row ordering to column ordering */ + /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ + Matrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7); + + /* we only need to return the truncated half of the state */ + chaining[4] = xmm0; + chaining[5] = xmm6; + chaining[6] = xmm13; + chaining[7] = xmm15; + + return; +} + +#endif + diff --git a/algo/aes_ni/groestl-intr-avx.h b/algo/aes_ni/groestl-intr-avx.h new file mode 100644 index 000000000..97f08dd69 --- /dev/null +++ b/algo/aes_ni/groestl-intr-avx.h @@ -0,0 +1,1072 @@ +/* groestl-intr-avx.h Aug 2011 + * + * Groestl implementation with intrinsics using ssse3, sse4.1, aes and avx + * instructions. + * Author: Günther A. 
Roland, Martin Schläffer, Krystian Matusiewicz + * + * This code is placed in the public domain + */ + +#include +#include +#include +#include "hash-groestl.h" + +/* global constants */ +__m128i ROUND_CONST_Lx; +__m128i ROUND_CONST_L0[ROUNDS512]; +__m128i ROUND_CONST_L7[ROUNDS512]; +__m128i ROUND_CONST_P[ROUNDS1024]; +__m128i ROUND_CONST_Q[ROUNDS1024]; +__m128i TRANSP_MASK; +__m128i SUBSH_MASK[8]; +__m128i ALL_FF; +#if LENGTH <= 256 +__m128i ALL_1B; +#else +__m256d ALL_1B; +#endif + +#define tos(a) #a +#define tostr(a) tos(a) + +#define insert_m128i_in_m256d(ymm, xmm, pos) (_mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castpd_si256(ymm), xmm, pos))) +#define extract_m128i_from_m256d(ymm, pos) (_mm256_extractf128_si256(_mm256_castpd_si256(ymm), pos)) + +#if (LENGTH <= 256) + +#define SET_CONSTANTS(){\ + ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\ + SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\ + SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\ + SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\ + SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\ + SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\ + SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\ + SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ + }\ + ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ +}while(0); + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b + * xmm[z] has to be zero */ +#define VMUL2(i, j, k, z){\ + j = _mm_cmpgt_epi8(z, i);\ + i = _mm_add_epi8(i, i);\ + j = _mm_and_si128(j, k);\ + i = _mm_xor_si128(i, j);\ +}/**/ + +/* Yet another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* xmm"tostr(8..xmm"tostr(15 = a2 a3... 
a0 a1 */\ + b0 = a2;\ + b1 = a3;\ + b2 = a4;\ + b3 = a5;\ + b4 = a6;\ + b5 = a7;\ + b6 = a0;\ + b7 = a1;\ + \ + /* t_i = a_i + a_{i+1} */\ + a0 = _mm_xor_si128(a0, a1);\ + a1 = _mm_xor_si128(a1, a2);\ + a2 = _mm_xor_si128(a2, a3);\ + a3 = _mm_xor_si128(a3, a4);\ + a4 = _mm_xor_si128(a4, a5);\ + a5 = _mm_xor_si128(a5, a6);\ + a6 = _mm_xor_si128(a6, a7);\ + a7 = _mm_xor_si128(a7, b6);\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + b0 = _mm_xor_si128(b0, a4);\ + b1 = _mm_xor_si128(b1, a5);\ + b2 = _mm_xor_si128(b2, a6);\ + b3 = _mm_xor_si128(b3, a7);\ + b4 = _mm_xor_si128(b4, a0);\ + b5 = _mm_xor_si128(b5, a1);\ + b6 = _mm_xor_si128(b6, a2);\ + b7 = _mm_xor_si128(b7, a3);\ + \ + b0 = _mm_xor_si128(b0, a6);\ + b1 = _mm_xor_si128(b1, a7);\ + b2 = _mm_xor_si128(b2, a0);\ + b3 = _mm_xor_si128(b3, a1);\ + b4 = _mm_xor_si128(b4, a2);\ + b5 = _mm_xor_si128(b5, a3);\ + b6 = _mm_xor_si128(b6, a4);\ + b7 = _mm_xor_si128(b7, a5);\ + \ + /* spill values y_4, y_5 to memory */\ + TEMP0 = b0;\ + TEMP1 = b1;\ + TEMP2 = b2;\ + \ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b1 = a1;\ + TEMP3 = a2;\ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm_xor_si128(a0, a3);\ + a1 = _mm_xor_si128(a1, a4);\ + a2 = _mm_xor_si128(a2, a5);\ + a3 = _mm_xor_si128(a3, a6);\ + a4 = _mm_xor_si128(a4, a7);\ + a5 = _mm_xor_si128(a5, b0);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, TEMP3);\ + \ + /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + b1 = ALL_1B;\ + b2 = _mm_xor_si128(b2, b2);\ + VMUL2(a7, b0, b1, b2);\ + VMUL2(a6, b0, b1, b2);\ + VMUL2(a5, b0, b1, b2);\ + VMUL2(a4, b0, b1, b2);\ + VMUL2(a3, b0, b1, b2);\ + VMUL2(a2, b0, b1, b2);\ + VMUL2(a1, b0, b1, b2);\ + VMUL2(a0, b0, b1, b2);\ + \ + /* compute w_i : add y_{i+4} */\ + a0 = _mm_xor_si128(a0, TEMP0);\ + a1 = _mm_xor_si128(a1, TEMP1);\ + a2 = _mm_xor_si128(a2, TEMP2);\ + a3 = _mm_xor_si128(a3, b3);\ + a4 = _mm_xor_si128(a4, b4);\ + a5 = _mm_xor_si128(a5, b5);\ + a6 = _mm_xor_si128(a6, b6);\ + a7 = _mm_xor_si128(a7, b7);\ + \ + /*compute v_i: double w_i */\ + VMUL2(a0, b0, b1, b2);\ + VMUL2(a1, b0, b1, b2);\ + VMUL2(a2, b0, b1, b2);\ + VMUL2(a3, b0, b1, b2);\ + VMUL2(a4, b0, b1, b2);\ + VMUL2(a5, b0, b1, b2);\ + VMUL2(a6, b0, b1, b2);\ + VMUL2(a7, b0, b1, b2);\ + \ + /* add to y_4 y_5 .. v3, v4, ... 
*/\ + b0 = _mm_xor_si128(a3, TEMP0);\ + b1 = _mm_xor_si128(a4, TEMP1);\ + b2 = _mm_xor_si128(a5, TEMP2);\ + b3 = _mm_xor_si128(b3, a6);\ + b4 = _mm_xor_si128(b4, a7);\ + b5 = _mm_xor_si128(b5, a0);\ + b6 = _mm_xor_si128(b6, a1);\ + b7 = _mm_xor_si128(b7, a2);\ +}/*MixBytes*/ + +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* Add Round Constant */\ + b1 = ROUND_CONST_Lx;\ + a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ + a1 = _mm_xor_si128(a1, b1);\ + a2 = _mm_xor_si128(a2, b1);\ + a3 = _mm_xor_si128(a3, b1);\ + a4 = _mm_xor_si128(a4, b1);\ + a5 = _mm_xor_si128(a5, b1);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ + \ + /* ShiftBytes + SubBytes (interleaved) */\ + b0 = _mm_xor_si128(b0, b0);\ + a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ + a0 = _mm_aesenclast_si128(a0, b0);\ + a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ + a1 = _mm_aesenclast_si128(a1, b0);\ + a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ + a2 = _mm_aesenclast_si128(a2, b0);\ + a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ + a3 = _mm_aesenclast_si128(a3, b0);\ + a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ + a4 = _mm_aesenclast_si128(a4, b0);\ + a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ + a5 = _mm_aesenclast_si128(a5, b0);\ + a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ + a6 = _mm_aesenclast_si128(a6, b0);\ + a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ + a7 = _mm_aesenclast_si128(a7, b0);\ + \ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ +} + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + t0 = TRANSP_MASK;\ + \ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ + \ + o1 = _mm_unpackhi_epi16(i0, i1);\ + i0 = _mm_unpacklo_epi16(i0, i1);\ + t0 = _mm_unpackhi_epi16(i2, i3);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + \ + i0 = _mm_shuffle_epi32(i0, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + t0 
= _mm_shuffle_epi32(t0, 216);\ + \ + o2 = _mm_unpackhi_epi32(i0, i2);\ + o3 = _mm_unpackhi_epi32(o1, t0);\ + i0 = _mm_unpacklo_epi32(i0, i2);\ + o1 = _mm_unpacklo_epi32(o1, t0);\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + o1 = _mm_unpackhi_epi64(i0, i4);\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + o2 = _mm_unpacklo_epi64(i1, i5);\ + o3 = _mm_unpackhi_epi64(i1, i5);\ + o4 = _mm_unpacklo_epi64(i2, i6);\ + o5 = _mm_unpackhi_epi64(i2, i6);\ + o6 = _mm_unpacklo_epi64(i3, i7);\ + o7 = _mm_unpackhi_epi64(i3, i7);\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + o0 = _mm_unpackhi_epi64(i0, i1);\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o1 = _mm_unpackhi_epi64(i2, i3);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + o2 = _mm_unpackhi_epi64(i4, i5);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + o3 = _mm_unpackhi_epi64(i6, i7);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + t0 = _mm_xor_si128(t0, t0);\ + i1 = _mm_unpackhi_epi64(i0, t0);\ + i0 = _mm_unpacklo_epi64(i0, t0);\ + i3 = _mm_unpackhi_epi64(i2, t0);\ + i2 = _mm_unpacklo_epi64(i2, t0);\ + i5 = _mm_unpackhi_epi64(i4, t0);\ + i4 = _mm_unpacklo_epi64(i4, t0);\ + i7 = _mm_unpackhi_epi64(i6, t0);\ + i6 = _mm_unpacklo_epi64(i6, t0);\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ +}/**/ + + +void INIT(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7; + static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm12 - xmm15 */ + xmm12 = chaining[0]; + xmm13 = chaining[1]; + xmm14 = chaining[2]; + xmm15 = chaining[3]; + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* store transposed IV */ + chaining[0] = xmm12; + chaining[1] = xmm2; + chaining[2] = xmm6; + chaining[3] = xmm7; +} + +void TF512(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, 
xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + static __m128i TEMP3; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm12 - xmm15 */ + xmm12 = message[0]; + xmm13 = message[1]; + xmm14 = message[2]; + xmm15 = message[3]; + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* load previous chaining value and xor message to CV to get input of P */ + /* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + xmm8 = _mm_xor_si128(xmm12, chaining[0]); + xmm0 = _mm_xor_si128(xmm2, chaining[1]); + xmm4 = _mm_xor_si128(xmm6, chaining[2]); + xmm5 = _mm_xor_si128(xmm7, chaining[3]); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, xmm8); + xmm1 = _mm_xor_si128(xmm1, xmm10); + xmm2 = _mm_xor_si128(xmm2, xmm12); + xmm3 = _mm_xor_si128(xmm3, xmm14); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, chaining[0]); + xmm1 = _mm_xor_si128(xmm1, chaining[1]); + xmm2 = _mm_xor_si128(xmm2, chaining[2]); + xmm3 = _mm_xor_si128(xmm3, chaining[3]); + + /* store CV */ + chaining[0] = xmm0; + chaining[1] = xmm1; + chaining[2] = xmm2; + chaining[3] = xmm3; + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF512(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + static __m128i TEMP3; + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + xmm8 = chaining[0]; + xmm10 = chaining[1]; + xmm12 = chaining[2]; + xmm14 = chaining[3]; + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm10 = _mm_xor_si128(xmm10, (chaining[1])); + xmm12 = _mm_xor_si128(xmm12, (chaining[2])); + xmm14 = _mm_xor_si128(xmm14, (chaining[3])); + + /* transform state back from row ordering into column ordering */ + /* result: final 
hash value in xmm9, xmm11 */ + Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); + + /* we only need to return the truncated half of the state */ + chaining[2] = xmm9; + chaining[3] = xmm11; +} + +#endif + +#if (LENGTH > 256) + +#define SET_CONSTANTS(){\ + __m128i xmm0, xmm1;\ + __m256d ymm0;\ + xmm0 = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + xmm1 = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + ymm0 = insert_m128i_in_m256d(ymm0, xmm0, 0);\ + ymm0 = insert_m128i_in_m256d(ymm0, xmm1, 1);\ + ALL_1B = ymm0;\ + ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + SUBSH_MASK[0] = _mm_set_epi32(0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00);\ + SUBSH_MASK[1] = _mm_set_epi32(0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01);\ + SUBSH_MASK[2] = _mm_set_epi32(0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02);\ + SUBSH_MASK[3] = _mm_set_epi32(0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003);\ + SUBSH_MASK[4] = _mm_set_epi32(0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104);\ + SUBSH_MASK[5] = _mm_set_epi32(0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205);\ + SUBSH_MASK[6] = _mm_set_epi32(0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306);\ + SUBSH_MASK[7] = _mm_set_epi32(0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b);\ + for(i = 0; i < ROUNDS1024; i++)\ + {\ + ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\ + }\ +}while(0); + +/* AVX MUL2 + * input: i + * output i = 2 * i + * */ +#define VMUL2(i){\ + xmmZERO = _mm_xor_si128(xmmZERO, xmmZERO);\ + xmmIL = extract_m128i_from_m256d(i, 0);\ + xmmIH = extract_m128i_from_m256d(i, 1);\ + xmmJL = _mm_cmpgt_epi8(xmmZERO, xmmIL);\ + xmmJH = _mm_cmpgt_epi8(xmmZERO, xmmIH);\ + xmmIL = _mm_add_epi8(xmmIL, xmmIL);\ + xmmIH = _mm_add_epi8(xmmIH, xmmIH);\ + ymmJ = insert_m128i_in_m256d(ymmJ, xmmJL, 0);\ + ymmJ = insert_m128i_in_m256d(ymmJ, xmmJH, 1);\ + ymmJ = _mm256_and_pd(ymmJ, ALL_1B);\ + i = insert_m128i_in_m256d(i, xmmIL, 0);\ + i = insert_m128i_in_m256d(i, xmmIH, 1);\ + i = _mm256_xor_pd(i, ymmJ);\ +}/**/ + +/* AVX SubShift + * inputs: + * * i + * * c0 (must be 0) + * * ShiftP + * * ShiftQ + * output i = S(Shift(i_1, ShiftQ)|Shift(i_0, ShiftP)) + * clobbers: t0 + * */ +#define SubShift(i, ShiftP, ShiftQ){\ + xmmZERO = _mm_xor_si128(xmmZERO, xmmZERO);\ + xmmIL = extract_m128i_from_m256d(i, 0);\ + xmmIH = extract_m128i_from_m256d(i, 1);\ + xmmIL = _mm_shuffle_epi8(xmmIL, SUBSH_MASK[ShiftP]);\ + xmmIH = _mm_shuffle_epi8(xmmIH, SUBSH_MASK[ShiftQ]);\ + xmmIL = _mm_aesenclast_si128(xmmIL, xmmZERO);\ + xmmIH = _mm_aesenclast_si128(xmmIH, xmmZERO);\ + i = insert_m128i_in_m256d(i, xmmIL, 0);\ + i = insert_m128i_in_m256d(i, xmmIH, 1);\ +}/**/ + +/* Yet another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). 
+ but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\ + b0 = a2;\ + b1 = a3;\ + b2 = a4;\ + b3 = a5;\ + b4 = a6;\ + b5 = a7;\ + b6 = a0;\ + b7 = a1;\ + \ + /* t_i = a_i + a_{i+1} */\ + a0 = _mm256_xor_pd(a0, a1);\ + a1 = _mm256_xor_pd(a1, a2);\ + a2 = _mm256_xor_pd(a2, a3);\ + a3 = _mm256_xor_pd(a3, a4);\ + a4 = _mm256_xor_pd(a4, a5);\ + a5 = _mm256_xor_pd(a5, a6);\ + a6 = _mm256_xor_pd(a6, a7);\ + a7 = _mm256_xor_pd(a7, b6);\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + b0 = _mm256_xor_pd(b0, a4);\ + b1 = _mm256_xor_pd(b1, a5);\ + b2 = _mm256_xor_pd(b2, a6);\ + b3 = _mm256_xor_pd(b3, a7);\ + b4 = _mm256_xor_pd(b4, a0);\ + b5 = _mm256_xor_pd(b5, a1);\ + b6 = _mm256_xor_pd(b6, a2);\ + b7 = _mm256_xor_pd(b7, a3);\ + \ + b0 = _mm256_xor_pd(b0, a6);\ + b1 = _mm256_xor_pd(b1, a7);\ + b2 = _mm256_xor_pd(b2, a0);\ + b3 = _mm256_xor_pd(b3, a1);\ + b4 = _mm256_xor_pd(b4, a2);\ + b5 = _mm256_xor_pd(b5, a3);\ + b6 = _mm256_xor_pd(b6, a4);\ + b7 = _mm256_xor_pd(b7, a5);\ + \ + /* spill values y_4, y_5 to memory */\ + TEMP0 = b0;\ + TEMP1 = b1;\ + TEMP2 = b2;\ + \ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b1 = a1;\ + TEMP3 = a2;\ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm256_xor_pd(a0, a3);\ + a1 = _mm256_xor_pd(a1, a4);\ + a2 = _mm256_xor_pd(a2, a5);\ + a3 = _mm256_xor_pd(a3, a6);\ + a4 = _mm256_xor_pd(a4, a7);\ + a5 = _mm256_xor_pd(a5, b0);\ + a6 = _mm256_xor_pd(a6, b1);\ + a7 = _mm256_xor_pd(a7, TEMP3);\ + \ + /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + b1 = ALL_1B;\ + b2 = _mm256_xor_pd(b2, b2);\ + VMUL2(a7);\ + VMUL2(a6);\ + VMUL2(a5);\ + VMUL2(a4);\ + VMUL2(a3);\ + VMUL2(a2);\ + VMUL2(a1);\ + VMUL2(a0);\ + \ + /* compute w_i : add y_{i+4} */\ + a0 = _mm256_xor_pd(a0, TEMP0);\ + a1 = _mm256_xor_pd(a1, TEMP1);\ + a2 = _mm256_xor_pd(a2, TEMP2);\ + a3 = _mm256_xor_pd(a3, b3);\ + a4 = _mm256_xor_pd(a4, b4);\ + a5 = _mm256_xor_pd(a5, b5);\ + a6 = _mm256_xor_pd(a6, b6);\ + a7 = _mm256_xor_pd(a7, b7);\ + \ + /*compute v_i: double w_i */\ + VMUL2(a0);\ + VMUL2(a1);\ + VMUL2(a2);\ + VMUL2(a3);\ + VMUL2(a4);\ + VMUL2(a5);\ + VMUL2(a6);\ + VMUL2(a7);\ + \ + /* add to y_4 y_5 .. v3, v4, ... 
*/\ + b0 = _mm256_xor_pd(a3, TEMP0);\ + b1 = _mm256_xor_pd(a4, TEMP1);\ + b2 = _mm256_xor_pd(a5, TEMP2);\ + b3 = _mm256_xor_pd(b3, a6);\ + b4 = _mm256_xor_pd(b4, a7);\ + b5 = _mm256_xor_pd(b5, a0);\ + b6 = _mm256_xor_pd(b6, a1);\ + b7 = _mm256_xor_pd(b7, a2);\ +}/*MixBytes*/ + +/* one round + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define SUBSHIFTMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* ShiftBytes + SubBytes */\ + SubShift(a0, 0, 1);\ + SubShift(a1, 1, 3);\ + SubShift(a2, 2, 5);\ + SubShift(a3, 3, 7);\ + SubShift(a4, 4, 0);\ + SubShift(a5, 5, 2);\ + SubShift(a6, 6, 4);\ + SubShift(a7, 7, 6);\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +#define ROUNDS_P_Q(){\ + u8 round_counter = 0;\ + for(round_counter = 0; round_counter < 14; round_counter++) {\ + /* AddRoundConstant */\ + ymm6 = _mm256_xor_pd(ymm6, ymm6);\ + ymm7 = insert_m128i_in_m256d(ymm6, ROUND_CONST_Q[round_counter], 1);\ + ymm6 = insert_m128i_in_m256d(ymm6, ALL_FF, 1);\ + ymm0 = insert_m128i_in_m256d(ymm6, ROUND_CONST_P[round_counter], 0);\ + ymm0 = _mm256_xor_pd(ymm8, ymm0);\ + ymm1 = _mm256_xor_pd(ymm9, ymm6);\ + ymm2 = _mm256_xor_pd(ymm10, ymm6);\ + ymm3 = _mm256_xor_pd(ymm11, ymm6);\ + ymm4 = _mm256_xor_pd(ymm12, ymm6);\ + ymm5 = _mm256_xor_pd(ymm13, ymm6);\ + ymm6 = _mm256_xor_pd(ymm14, ymm6);\ + ymm7 = _mm256_xor_pd(ymm15, ymm7);\ + /* SubBytes + ShiftBytes + MixBytes */\ + SUBSHIFTMIX(ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15);\ + }\ +} + +/* Matrix Transpose + * input is a 1024-bit state with two columns in one xmm + * output is a 1024-bit state with two rows in one xmm + * inputs: i0-i7 + * outputs: i0-i7 + * clobbers: t0-t7 + */ +#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ + t0 = TRANSP_MASK;\ + \ + i6 = _mm_shuffle_epi8(i6, t0);\ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ + i4 = _mm_shuffle_epi8(i4, t0);\ + i5 = _mm_shuffle_epi8(i5, t0);\ + i7 = _mm_shuffle_epi8(i7, t0);\ + \ + /* continue with unpack */\ + t0 = _mm_unpackhi_epi16(i0, i1);\ + t1 = _mm_unpackhi_epi16(i2, i3);\ + t2 = _mm_unpackhi_epi16(i4, i5);\ + t3 = _mm_unpackhi_epi16(i6, i7);\ + i0 = _mm_unpacklo_epi16(i0, i1);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + i4 = _mm_unpacklo_epi16(i4, i5);\ + i6 = _mm_unpacklo_epi16(i6, i7);\ + \ + /* shuffle with immediate */\ + t0 = _mm_shuffle_epi32(t0, 216);\ + t1 = _mm_shuffle_epi32(t1, 216);\ + t2 = _mm_shuffle_epi32(t2, 216);\ + t3 = _mm_shuffle_epi32(t3, 216);\ + i0 = _mm_shuffle_epi32(i0, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + i4 = _mm_shuffle_epi32(i4, 216);\ + i6 = _mm_shuffle_epi32(i6, 216);\ + \ + /* continue with unpack */\ + t4 = _mm_unpackhi_epi32(i0, i2);\ + i0 = _mm_unpacklo_epi32(i0, i2);\ + t5 = _mm_unpackhi_epi32(t0, t1);\ + t0 = _mm_unpacklo_epi32(t0, t1);\ + t6 = _mm_unpackhi_epi32(i4, i6);\ + i4 = _mm_unpacklo_epi32(i4, i6);\ + t7 = _mm_unpackhi_epi32(t2, t3);\ + t2 = _mm_unpacklo_epi32(t2, t3);\ + \ + /* there are now 2 rows in each xmm */\ + /* unpack to get 1 row of CV in each xmm */\ + i1 = _mm_unpackhi_epi64(i0, i4);\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + i2 = _mm_unpacklo_epi64(t0, t2);\ + i3 = _mm_unpackhi_epi64(t0, t2);\ + i4 = _mm_unpacklo_epi64(t4, t6);\ + i5 = _mm_unpackhi_epi64(t4, t6);\ + i6 = _mm_unpacklo_epi64(t5, t7);\ + i7 = _mm_unpackhi_epi64(t5, t7);\ + /* transpose done */\ +}/**/ + +/* Matrix 
Transpose Inverse + * input is a 1024-bit state with two rows in one xmm + * output is a 1024-bit state with two columns in one xmm + * inputs: i0-i7 + * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) + * clobbers: t0-t4 + */ +#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ + o0 = TRANSP_MASK;\ + /* transpose matrix to get output format */\ + o1 = _mm_unpackhi_epi64(i0, i1);\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + t0 = _mm_unpackhi_epi64(i2, i3);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + t1 = _mm_unpackhi_epi64(i4, i5);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + t2 = _mm_unpackhi_epi64(i6, i7);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ + /* load transpose mask into a register, because it will be used 8 times */\ + i0 = _mm_shuffle_epi8(i0, o0);\ + i2 = _mm_shuffle_epi8(i2, o0);\ + i4 = _mm_shuffle_epi8(i4, o0);\ + i6 = _mm_shuffle_epi8(i6, o0);\ + o1 = _mm_shuffle_epi8(o1, o0);\ + t0 = _mm_shuffle_epi8(t0, o0);\ + t1 = _mm_shuffle_epi8(t1, o0);\ + t2 = _mm_shuffle_epi8(t2, o0);\ + /* continue with unpack */\ + t3 = _mm_unpackhi_epi16(i4, i6);\ + i4 = _mm_unpacklo_epi16(i4, i6);\ + o0 = _mm_unpackhi_epi16(i0, i2);\ + i0 = _mm_unpacklo_epi16(i0, i2);\ + o2 = _mm_unpackhi_epi16(o1, t0);\ + o1 = _mm_unpacklo_epi16(o1, t0);\ + t4 = _mm_unpackhi_epi16(t1, t2);\ + t1 = _mm_unpacklo_epi16(t1, t2);\ + /* shuffle with immediate */\ + i4 = _mm_shuffle_epi32(i4, 216);\ + t3 = _mm_shuffle_epi32(t3, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + o2 = _mm_shuffle_epi32(o2, 216);\ + i0 = _mm_shuffle_epi32(i0, 216);\ + o0 = _mm_shuffle_epi32(o0, 216);\ + t1 = _mm_shuffle_epi32(t1, 216);\ + t4 = _mm_shuffle_epi32(t4, 216);\ + /* continue with unpack */\ + i1 = _mm_unpackhi_epi32(i0, i4);\ + i0 = _mm_unpacklo_epi32(i0, i4);\ + i3 = _mm_unpackhi_epi32(o0, t3);\ + o0 = _mm_unpacklo_epi32(o0, t3);\ + i5 = _mm_unpackhi_epi32(o1, t1);\ + o1 = _mm_unpacklo_epi32(o1, t1);\ + i7 = _mm_unpackhi_epi32(o2, t4);\ + o2 = _mm_unpacklo_epi32(o2, t4);\ + /* transpose done */\ +}/**/ + +void INIT(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm8 - xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + /* transform chaining value from column ordering into row ordering */ + Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store transposed IV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; +} + +void TF1024(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i xmmIL, xmmIH, xmmJL, xmmJH, xmmZERO; + static __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7; + static __m256d ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15; + static __m256d ymmJ; + static __m256d TEMP0; + static __m256d TEMP1; + static __m256d TEMP2; + static __m256d TEMP3; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm8 - xmm15 (Q = message) */ + xmm0 = message[0]; + xmm1 = 
message[1]; + xmm2 = message[2]; + xmm3 = message[3]; + xmm4 = message[4]; + xmm5 = message[5]; + xmm6 = message[6]; + xmm7 = message[7]; + + /* transform message M from column ordering into row ordering */ + Matrix_Transpose(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* load previous chaining value and xor message to CV to get input of P */ + /* we put two rows (2x64 bit) of the CV into one 128-bit xmm register */ + /* result: CV+M in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm0, chaining[0]); + xmm9 = _mm_xor_si128(xmm1, chaining[1]); + xmm10 = _mm_xor_si128(xmm2, chaining[2]); + xmm11 = _mm_xor_si128(xmm3, chaining[3]); + xmm12 = _mm_xor_si128(xmm4, chaining[4]); + xmm13 = _mm_xor_si128(xmm5, chaining[5]); + xmm14 = _mm_xor_si128(xmm6, chaining[6]); + xmm15 = _mm_xor_si128(xmm7, chaining[7]); + + /* generate AVX registers with Q in high and P in low 128 bits */ + ymm8 = insert_m128i_in_m256d(ymm8, xmm8, 0); + ymm9 = insert_m128i_in_m256d(ymm9, xmm9, 0); + ymm10 = insert_m128i_in_m256d(ymm10, xmm10, 0); + ymm11 = insert_m128i_in_m256d(ymm11, xmm11, 0); + ymm12 = insert_m128i_in_m256d(ymm12, xmm12, 0); + ymm13 = insert_m128i_in_m256d(ymm13, xmm13, 0); + ymm14 = insert_m128i_in_m256d(ymm14, xmm14, 0); + ymm15 = insert_m128i_in_m256d(ymm15, xmm15, 0); + + ymm8 = insert_m128i_in_m256d(ymm8, xmm0, 1); + ymm9 = insert_m128i_in_m256d(ymm9, xmm1, 1); + ymm10 = insert_m128i_in_m256d(ymm10, xmm2, 1); + ymm11 = insert_m128i_in_m256d(ymm11, xmm3, 1); + ymm12 = insert_m128i_in_m256d(ymm12, xmm4, 1); + ymm13 = insert_m128i_in_m256d(ymm13, xmm5, 1); + ymm14 = insert_m128i_in_m256d(ymm14, xmm6, 1); + ymm15 = insert_m128i_in_m256d(ymm15, xmm7, 1); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* extract Q to xmm */ + xmm0 = extract_m128i_from_m256d(ymm8, 1); + xmm1 = extract_m128i_from_m256d(ymm9, 1); + xmm2 = extract_m128i_from_m256d(ymm10, 1); + xmm3 = extract_m128i_from_m256d(ymm11, 1); + xmm4 = extract_m128i_from_m256d(ymm12, 1); + xmm5 = extract_m128i_from_m256d(ymm13, 1); + xmm6 = extract_m128i_from_m256d(ymm14, 1); + xmm7 = extract_m128i_from_m256d(ymm15, 1); + + /* extract P to xmm */ + xmm8 = extract_m128i_from_m256d(ymm8, 0); + xmm9 = extract_m128i_from_m256d(ymm9, 0); + xmm10 = extract_m128i_from_m256d(ymm10, 0); + xmm11 = extract_m128i_from_m256d(ymm11, 0); + xmm12 = extract_m128i_from_m256d(ymm12, 0); + xmm13 = extract_m128i_from_m256d(ymm13, 0); + xmm14 = extract_m128i_from_m256d(ymm14, 0); + xmm15 = extract_m128i_from_m256d(ymm15, 0); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, xmm0); + xmm9 = _mm_xor_si128(xmm9, xmm1); + xmm10 = _mm_xor_si128(xmm10, xmm2); + xmm11 = _mm_xor_si128(xmm11, xmm3); + xmm12 = _mm_xor_si128(xmm12, xmm4); + xmm13 = _mm_xor_si128(xmm13, xmm5); + xmm14 = _mm_xor_si128(xmm14, xmm6); + xmm15 = _mm_xor_si128(xmm15, xmm7); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, chaining[0]); + xmm9 = _mm_xor_si128(xmm9, chaining[1]); + xmm10 = _mm_xor_si128(xmm10, chaining[2]); + xmm11 = _mm_xor_si128(xmm11, chaining[3]); + xmm12 = _mm_xor_si128(xmm12, chaining[4]); + xmm13 = _mm_xor_si128(xmm13, chaining[5]); + xmm14 = _mm_xor_si128(xmm14, chaining[6]); + xmm15 = _mm_xor_si128(xmm15, chaining[7]); + + /* store CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + 
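/* The P/Q pairing used in this AVX path relies on the insert_m128i_in_m256d /
 * extract_m128i_from_m256d helpers defined near the top of this file: the
 * integer state is carried in __m256d registers because plain AVX (without
 * AVX2) only provides 256-bit bitwise operations on floating-point types
 * (_mm256_xor_pd, _mm256_and_pd), with P kept in the low 128-bit lane and Q
 * in the high lane. A small standalone sketch of that lane-packing idiom,
 * assuming only an AVX-capable target; the names pack_pq / unpack_pq / xor_pq
 * are illustrative and not part of this patch:
 */
#include <immintrin.h>

static inline __m256d pack_pq(__m128i p, __m128i q)
{
    __m256i t = _mm256_castsi128_si256(p);        /* P -> bits 0..127    */
    t = _mm256_insertf128_si256(t, q, 1);         /* Q -> bits 128..255  */
    return _mm256_castsi256_pd(t);                /* reinterpret, no move */
}

static inline void unpack_pq(__m256d y, __m128i *p, __m128i *q)
{
    __m256i t = _mm256_castpd_si256(y);
    *p = _mm256_extractf128_si256(t, 0);
    *q = _mm256_extractf128_si256(t, 1);
}

static inline __m256d xor_pq(__m256d a, __m256d b)
{
    return _mm256_xor_pd(a, b);                   /* one op XORs both halves */
}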
chaining[6] = xmm14; + chaining[7] = xmm15; + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF1024(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i xmmIL, xmmIH, xmmJL, xmmJH, xmmZERO; + static __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7; + static __m256d ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15; + static __m256d ymmJ; + static __m256d TEMP0; + static __m256d TEMP1; + static __m256d TEMP2; + static __m256d TEMP3; + + /* load CV into registers xmm8...xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + xmm0 = _mm_xor_si128(xmm0, xmm0); + + /* generate AVX registers with Q in high and P in low 128 bits */ + ymm8 = insert_m128i_in_m256d(ymm8, xmm8, 0); + ymm9 = insert_m128i_in_m256d(ymm9, xmm9, 0); + ymm10 = insert_m128i_in_m256d(ymm10, xmm10, 0); + ymm11 = insert_m128i_in_m256d(ymm11, xmm11, 0); + ymm12 = insert_m128i_in_m256d(ymm12, xmm12, 0); + ymm13 = insert_m128i_in_m256d(ymm13, xmm13, 0); + ymm14 = insert_m128i_in_m256d(ymm14, xmm14, 0); + ymm15 = insert_m128i_in_m256d(ymm15, xmm15, 0); + + ymm8 = insert_m128i_in_m256d(ymm8, xmm0, 1); + ymm9 = insert_m128i_in_m256d(ymm9, xmm0, 1); + ymm10 = insert_m128i_in_m256d(ymm10, xmm0, 1); + ymm11 = insert_m128i_in_m256d(ymm11, xmm0, 1); + ymm12 = insert_m128i_in_m256d(ymm12, xmm0, 1); + ymm13 = insert_m128i_in_m256d(ymm13, xmm0, 1); + ymm14 = insert_m128i_in_m256d(ymm14, xmm0, 1); + ymm15 = insert_m128i_in_m256d(ymm15, xmm0, 1); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8...xmm15 */ + ROUNDS_P_Q(); + + xmm8 = extract_m128i_from_m256d(ymm8, 0); + xmm9 = extract_m128i_from_m256d(ymm9, 0); + xmm10 = extract_m128i_from_m256d(ymm10, 0); + xmm11 = extract_m128i_from_m256d(ymm11, 0); + xmm12 = extract_m128i_from_m256d(ymm12, 0); + xmm13 = extract_m128i_from_m256d(ymm13, 0); + xmm14 = extract_m128i_from_m256d(ymm14, 0); + xmm15 = extract_m128i_from_m256d(ymm15, 0); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, chaining[0]); + xmm9 = _mm_xor_si128(xmm9, chaining[1]); + xmm10 = _mm_xor_si128(xmm10, chaining[2]); + xmm11 = _mm_xor_si128(xmm11, chaining[3]); + xmm12 = _mm_xor_si128(xmm12, chaining[4]); + xmm13 = _mm_xor_si128(xmm13, chaining[5]); + xmm14 = _mm_xor_si128(xmm14, chaining[6]); + xmm15 = _mm_xor_si128(xmm15, chaining[7]); + + /* transpose CV back from row ordering to column ordering */ + /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ + Matrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7); + + /* we only need to return the truncated half of the state */ + chaining[0] = xmm8; + chaining[1] = xmm4; + chaining[2] = xmm9; + chaining[3] = xmm11; + chaining[4] = xmm0; + chaining[5] = xmm6; + chaining[6] = xmm13; + chaining[7] = xmm15; + + return; +}//OF1024() + +#endif + diff --git a/algo/aes_ni/groestl-intr-vperm.h b/algo/aes_ni/groestl-intr-vperm.h new file mode 100644 index 000000000..c75522961 --- /dev/null +++ b/algo/aes_ni/groestl-intr-vperm.h @@ -0,0 +1,1294 @@ +/* groestl-intr-vperm.h Aug 2011 + * + * Groestl implementation with intrinsics using ssse3 instructions. + * Author: Günther A. 
Roland, Martin Schläffer + * + * Based on the vperm and aes_ni implementations of the hash function Groestl + * by Cagdas Calik http://www.metu.edu.tr/~ccalik/ + * Institute of Applied Mathematics, Middle East Technical University, Turkey + * + * This code is placed in the public domain + */ + +#include +#include "hash-groestl.h" + +/* global constants */ +__m128i ROUND_CONST_Lx; +__m128i ROUND_CONST_L0[ROUNDS512]; +__m128i ROUND_CONST_L7[ROUNDS512]; +__m128i ROUND_CONST_P[ROUNDS1024]; +__m128i ROUND_CONST_Q[ROUNDS1024]; +__m128i TRANSP_MASK; +__m128i SUBSH_MASK[8]; +__m128i ALL_0F; +__m128i ALL_15; +__m128i ALL_1B; +__m128i ALL_63; +__m128i ALL_FF; +__m128i VPERM_IPT[2]; +__m128i VPERM_OPT[2]; +__m128i VPERM_INV[2]; +__m128i VPERM_SB1[2]; +__m128i VPERM_SB2[2]; +__m128i VPERM_SB4[2]; +__m128i VPERM_SBO[2]; + + +#define tos(a) #a +#define tostr(a) tos(a) + +#define SET_SHARED_CONSTANTS(){\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + ALL_63 = _mm_set_epi32(0x63636363, 0x63636363, 0x63636363, 0x63636363);\ + ALL_0F = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);\ + ALL_15 = _mm_set_epi32(0x15151515, 0x15151515, 0x15151515, 0x15151515);\ + VPERM_IPT[0] = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);\ + VPERM_IPT[1] = _mm_set_epi32(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);\ + VPERM_OPT[0] = _mm_set_epi32(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);\ + VPERM_OPT[1] = _mm_set_epi32(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);\ + VPERM_INV[0] = _mm_set_epi32(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);\ + VPERM_INV[1] = _mm_set_epi32(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);\ + VPERM_SB1[0] = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);\ + VPERM_SB1[1] = _mm_set_epi32(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);\ + VPERM_SB2[0] = _mm_set_epi32(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);\ + VPERM_SB2[1] = _mm_set_epi32(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);\ + VPERM_SB4[0] = _mm_set_epi32(0xBA44FE79, 0x876D2914, 0x3D50AED7, 0xC393EA00);\ + VPERM_SB4[1] = _mm_set_epi32(0xA876DE97, 0x49087E9F, 0xE1E937A0, 0x3FD64100);\ +}/**/ + +/* VPERM + * Transform w/o settings c* + * transforms 2 rows to/from "vperm mode" + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0, a1 = 2 rows + * table = transformation table to use + * t*, c* = clobbers + * outputs: + * a0, a1 = 2 rows transformed with table + * */ +#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\ + t0 = c0;\ + t1 = c0;\ + t0 = _mm_andnot_si128(t0, a0);\ + t1 = _mm_andnot_si128(t1, a1);\ + t0 = _mm_srli_epi32(t0, 4);\ + t1 = _mm_srli_epi32(t1, 4);\ + a0 = _mm_and_si128(a0, c0);\ + a1 = _mm_and_si128(a1, c0);\ + t2 = c2;\ + t3 = c2;\ + t2 = _mm_shuffle_epi8(t2, a0);\ + t3 = _mm_shuffle_epi8(t3, a1);\ + a0 = c1;\ + a1 = c1;\ + a0 = _mm_shuffle_epi8(a0, t0);\ + a1 = _mm_shuffle_epi8(a1, t1);\ + a0 = _mm_xor_si128(a0, t2);\ + a1 = _mm_xor_si128(a1, t3);\ +}/**/ + +#define VPERM_Transform_Set_Const(table, c0, c1, c2){\ + c0 = ALL_0F;\ + c1 = ((__m128i*) table )[0];\ + c2 = ((__m128i*) table )[1];\ +}/**/ + +/* VPERM + * Transform + * transforms 2 rows to/from "vperm mode" + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0, a1 = 2 rows + * table = transformation 
table to use + * t*, c* = clobbers + * outputs: + * a0, a1 = 2 rows transformed with table + * */ +#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\ + VPERM_Transform_Set_Const(table, c0, c1, c2);\ + VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ +}/**/ + +/* VPERM + * Transform State + * inputs: + * a0-a3 = state + * table = transformation table to use + * t* = clobbers + * outputs: + * a0-a3 = transformed state + * */ +#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\ + VPERM_Transform_Set_Const(table, c0, c1, c2);\ + VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ + VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\ +}/**/ + +/* VPERM + * Add Constant to State + * inputs: + * a0-a7 = state + * constant = constant to add + * t0 = clobber + * outputs: + * a0-a7 = state + constant + * */ +#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\ + t0 = constant;\ + a0 = _mm_xor_si128(a0, t0);\ + a1 = _mm_xor_si128(a1, t0);\ + a2 = _mm_xor_si128(a2, t0);\ + a3 = _mm_xor_si128(a3, t0);\ + a4 = _mm_xor_si128(a4, t0);\ + a5 = _mm_xor_si128(a5, t0);\ + a6 = _mm_xor_si128(a6, t0);\ + a7 = _mm_xor_si128(a7, t0);\ +}/**/ + +/* VPERM + * Set Substitute Core Constants + * */ +#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\ + VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\ +}/**/ + +/* VPERM + * Substitute Core + * first part of sbox inverse computation + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0 = 1 row + * t*, c* = clobbers + * outputs: + * b0a, b0b = inputs for lookup step + * */ +#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\ + t0 = c0;\ + t0 = _mm_andnot_si128(t0, a0);\ + t0 = _mm_srli_epi32(t0, 4);\ + a0 = _mm_and_si128(a0, c0);\ + b0a = c1;\ + b0a = _mm_shuffle_epi8(b0a, a0);\ + a0 = _mm_xor_si128(a0, t0);\ + b0b = c2;\ + b0b = _mm_shuffle_epi8(b0b, t0);\ + b0b = _mm_xor_si128(b0b, b0a);\ + t1 = c2;\ + t1 = _mm_shuffle_epi8(t1, a0);\ + t1 = _mm_xor_si128(t1, b0a);\ + b0a = c2;\ + b0a = _mm_shuffle_epi8(b0a, b0b);\ + b0a = _mm_xor_si128(b0a, a0);\ + b0b = c2;\ + b0b = _mm_shuffle_epi8(b0b, t1);\ + b0b = _mm_xor_si128(b0b, t0);\ +}/**/ + +/* VPERM + * Lookup + * second part of sbox inverse computation + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0a, a0b = output of Substitution Core + * table = lookup table to use (*1 / *2 / *4) + * t0 = clobber + * outputs: + * b0 = output of sbox + multiplication + * */ +#define VPERM_Lookup(a0a, a0b, table, b0, t0){\ + b0 = ((__m128i*) table )[0];\ + t0 = ((__m128i*) table )[1];\ + b0 = _mm_shuffle_epi8(b0, a0b);\ + t0 = _mm_shuffle_epi8(t0, a0a);\ + b0 = _mm_xor_si128(b0, t0);\ +}/**/ + +/* VPERM + * SubBytes and *2 / *4 + * this function is derived from: + * Constant-time SSSE3 AES core implementation + * by Mike Hamburg + * and + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0-a7 = state + * t*, c* = clobbers + * outputs: + * a0-a7 = state * 4 + * c2 = row0 * 2 -> b0 + * c1 = row7 * 2 -> b3 + * c0 = row7 * 1 -> b4 + * t2 = row4 * 1 -> b7 + * TEMP_MUL1 = row(i) * 1 + * TEMP_MUL2 = row(i) * 2 + * + * call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */ +#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\ + /* set 
Constants */\ + VPERM_Substitute_Core_Set_Const(c0, c1, c2);\ + /* row 1 */\ + VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[1] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[1] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\ + /* --- */\ + /* row 2 */\ + VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[2] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[2] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\ + /* --- */\ + /* row 3 */\ + VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[3] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[3] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\ + /* --- */\ + /* row 5 */\ + VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[5] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[5] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\ + /* --- */\ + /* row 6 */\ + VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[6] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[6] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\ + /* --- */\ + /* row 7 */\ + VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[7] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\ + VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\ + /* --- */\ + /* row 4 */\ + VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[4] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\ + /* --- */\ + /* row 0 */\ + VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\ + VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\ + TEMP_MUL2[0] = c2;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\ + /* --- */\ +}/**/ + + +/* Optimized MixBytes + * inputs: + * a0-a7 = (row0-row7) * 4 + * b0 = row0 * 2 + * b3 = row7 * 2 + * b4 = row7 * 1 + * b7 = row4 * 1 + * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2 + * output: b0-b7 + * */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* save one value */\ + TEMP_MUL4 = a3;\ + /* 1 */\ + b1 = a0;\ + b1 = _mm_xor_si128(b1, a5);\ + b1 = _mm_xor_si128(b1, b4); /* -> helper! */\ + b1 = _mm_xor_si128(b1, (TEMP_MUL2[3]));\ + b2 = b1;\ + \ + /* 2 */\ + b5 = a1;\ + b5 = _mm_xor_si128(b5, a4);\ + b5 = _mm_xor_si128(b5, b7); /* -> helper! */\ + b5 = _mm_xor_si128(b5, b3); /* -> helper! */\ + b6 = b5;\ + \ + /* 4 */\ + b7 = _mm_xor_si128(b7, a6);\ + /*b7 = _mm_xor_si128(b7, (TEMP_MUL1[4])); -> helper! */\ + b7 = _mm_xor_si128(b7, (TEMP_MUL1[6]));\ + b7 = _mm_xor_si128(b7, (TEMP_MUL2[1]));\ + b7 = _mm_xor_si128(b7, b3); /* -> helper! */\ + b2 = _mm_xor_si128(b2, b7);\ + \ + /* 3 */\ + b0 = _mm_xor_si128(b0, a7);\ + b0 = _mm_xor_si128(b0, (TEMP_MUL1[5]));\ + b0 = _mm_xor_si128(b0, (TEMP_MUL1[7]));\ + /*b0 = _mm_xor_si128(b0, (TEMP_MUL2[0])); -> helper! */\ + b0 = _mm_xor_si128(b0, (TEMP_MUL2[2]));\ + b3 = b0;\ + b1 = _mm_xor_si128(b1, b0);\ + b0 = _mm_xor_si128(b0, b7); /* moved from 4 */\ + \ + /* 5 */\ + b4 = _mm_xor_si128(b4, a2);\ + /*b4 = _mm_xor_si128(b4, (TEMP_MUL1[0])); -> helper! 
*/\ + b4 = _mm_xor_si128(b4, (TEMP_MUL1[2]));\ + b4 = _mm_xor_si128(b4, (TEMP_MUL2[3]));\ + b4 = _mm_xor_si128(b4, (TEMP_MUL2[5]));\ + b3 = _mm_xor_si128(b3, b4);\ + b6 = _mm_xor_si128(b6, b4);\ + \ + /* 6 */\ + a3 = _mm_xor_si128(a3, (TEMP_MUL1[1]));\ + a3 = _mm_xor_si128(a3, (TEMP_MUL1[3]));\ + a3 = _mm_xor_si128(a3, (TEMP_MUL2[4]));\ + a3 = _mm_xor_si128(a3, (TEMP_MUL2[6]));\ + b4 = _mm_xor_si128(b4, a3);\ + b5 = _mm_xor_si128(b5, a3);\ + b7 = _mm_xor_si128(b7, a3);\ + \ + /* 7 */\ + a1 = _mm_xor_si128(a1, (TEMP_MUL1[1]));\ + a1 = _mm_xor_si128(a1, (TEMP_MUL2[4]));\ + b2 = _mm_xor_si128(b2, a1);\ + b3 = _mm_xor_si128(b3, a1);\ + \ + /* 8 */\ + a5 = _mm_xor_si128(a5, (TEMP_MUL1[5]));\ + a5 = _mm_xor_si128(a5, (TEMP_MUL2[0]));\ + b6 = _mm_xor_si128(b6, a5);\ + b7 = _mm_xor_si128(b7, a5);\ + \ + /* 9 */\ + a3 = TEMP_MUL1[2];\ + a3 = _mm_xor_si128(a3, (TEMP_MUL2[5]));\ + b0 = _mm_xor_si128(b0, a3);\ + b5 = _mm_xor_si128(b5, a3);\ + \ + /* 10 */\ + a1 = TEMP_MUL1[6];\ + a1 = _mm_xor_si128(a1, (TEMP_MUL2[1]));\ + b1 = _mm_xor_si128(b1, a1);\ + b4 = _mm_xor_si128(b4, a1);\ + \ + /* 11 */\ + a5 = TEMP_MUL1[3];\ + a5 = _mm_xor_si128(a5, (TEMP_MUL2[6]));\ + b1 = _mm_xor_si128(b1, a5);\ + b6 = _mm_xor_si128(b6, a5);\ + \ + /* 12 */\ + a3 = TEMP_MUL1[7];\ + a3 = _mm_xor_si128(a3, (TEMP_MUL2[2]));\ + b2 = _mm_xor_si128(b2, a3);\ + b5 = _mm_xor_si128(b5, a3);\ + \ + /* 13 */\ + b0 = _mm_xor_si128(b0, (TEMP_MUL4));\ + b0 = _mm_xor_si128(b0, a4);\ + b1 = _mm_xor_si128(b1, a4);\ + b3 = _mm_xor_si128(b3, a6);\ + b4 = _mm_xor_si128(b4, a0);\ + b4 = _mm_xor_si128(b4, a7);\ + b5 = _mm_xor_si128(b5, a0);\ + b7 = _mm_xor_si128(b7, a2);\ +}/**/ + +#if (LENGTH <= 256) + +#define SET_CONSTANTS(){\ + SET_SHARED_CONSTANTS();\ + SUBSH_MASK[0] = _mm_set_epi32(0x080f0e0d, 0x0c0b0a09, 0x07060504, 0x03020100);\ + SUBSH_MASK[1] = _mm_set_epi32(0x0a09080f, 0x0e0d0c0b, 0x00070605, 0x04030201);\ + SUBSH_MASK[2] = _mm_set_epi32(0x0c0b0a09, 0x080f0e0d, 0x01000706, 0x05040302);\ + SUBSH_MASK[3] = _mm_set_epi32(0x0e0d0c0b, 0x0a09080f, 0x02010007, 0x06050403);\ + SUBSH_MASK[4] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x03020100, 0x07060504);\ + SUBSH_MASK[5] = _mm_set_epi32(0x09080f0e, 0x0d0c0b0a, 0x04030201, 0x00070605);\ + SUBSH_MASK[6] = _mm_set_epi32(0x0b0a0908, 0x0f0e0d0c, 0x05040302, 0x01000706);\ + SUBSH_MASK[7] = _mm_set_epi32(0x0d0c0b0a, 0x09080f0e, 0x06050403, 0x02010007);\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ + }\ + ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ +}/**/ + +/* vperm: + * transformation before rounds with ipt + * first round add transformed constant + * middle rounds: add constant XOR 0x15...15 + * last round: additionally add 0x15...15 after MB + * transformation after rounds with opt + */ +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant + ShiftBytes (interleaved) */\ + b1 = ROUND_CONST_Lx;\ + a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ + a1 = _mm_xor_si128(a1, b1);\ + a2 = _mm_xor_si128(a2, b1);\ + a3 = _mm_xor_si128(a3, b1);\ + a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ + a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ + a4 = _mm_xor_si128(a4, b1);\ + a2 = _mm_shuffle_epi8(a2, 
(SUBSH_MASK[2]));\ + a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ + a5 = _mm_xor_si128(a5, b1);\ + a6 = _mm_xor_si128(a6, b1);\ + a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ + a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ + a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ + a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ + a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ + /* SubBytes + Multiplication by 2 and 4 */\ + VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +}/**/ + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\ + ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\ +} + + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + t0 = TRANSP_MASK;\ +\ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ +\ + o1 = i0;\ + t0 = i2;\ +\ + i0 = _mm_unpacklo_epi16(i0, i1);\ + o1 = _mm_unpackhi_epi16(o1, i1);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + t0 = _mm_unpackhi_epi16(t0, i3);\ +\ + i0 = _mm_shuffle_epi32(i0, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + t0 = _mm_shuffle_epi32(t0, 216);\ +\ + o2 = i0;\ + o3 = o1;\ +\ + i0 = _mm_unpacklo_epi32(i0, i2);\ + o1 = _mm_unpacklo_epi32(o1, t0);\ + o2 = _mm_unpackhi_epi32(o2, i2);\ + o3 = _mm_unpackhi_epi32(o3, t0);\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + o1 = i0;\ + o2 = i1;\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + o1 = _mm_unpackhi_epi64(o1, i4);\ + o3 = i1;\ + o4 = i2;\ + o2 = _mm_unpacklo_epi64(o2, i5);\ + o3 = 
_mm_unpackhi_epi64(o3, i5);\ + o5 = i2;\ + o6 = i3;\ + o4 = _mm_unpacklo_epi64(o4, i6);\ + o5 = _mm_unpackhi_epi64(o5, i6);\ + o7 = i3;\ + o6 = _mm_unpacklo_epi64(o6, i7);\ + o7 = _mm_unpackhi_epi64(o7, i7);\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + o0 = i0;\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o0 = _mm_unpackhi_epi64(o0, i1);\ + o1 = i2;\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + o1 = _mm_unpackhi_epi64(o1, i3);\ + o2 = i4;\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + o2 = _mm_unpackhi_epi64(o2, i5);\ + o3 = i6;\ + i6 = _mm_unpacklo_epi64(i6, i7);\ + o3 = _mm_unpackhi_epi64(o3, i7);\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + t0 = _mm_xor_si128(t0, t0);\ + i1 = i0;\ + i3 = i2;\ + i5 = i4;\ + i7 = i6;\ + i0 = _mm_unpacklo_epi64(i0, t0);\ + i1 = _mm_unpackhi_epi64(i1, t0);\ + i2 = _mm_unpacklo_epi64(i2, t0);\ + i3 = _mm_unpackhi_epi64(i3, t0);\ + i4 = _mm_unpacklo_epi64(i4, t0);\ + i5 = _mm_unpackhi_epi64(i5, t0);\ + i6 = _mm_unpacklo_epi64(i6, t0);\ + i7 = _mm_unpackhi_epi64(i7, t0);\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ +}/**/ + + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst_CNT2(i, j){\ + xmm0 = ROUND_CONST_L0[i];\ + xmm1 = ROUND_CONST_L7[i];\ + xmm2 = ROUND_CONST_L0[j];\ + xmm3 = ROUND_CONST_L7[j];\ + VPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ + xmm0 = _mm_xor_si128(xmm0, (ALL_15));\ + xmm1 = _mm_xor_si128(xmm1, (ALL_15));\ + xmm2 = _mm_xor_si128(xmm2, (ALL_15));\ + xmm3 = _mm_xor_si128(xmm3, (ALL_15));\ + ROUND_CONST_L0[i] = xmm0;\ + ROUND_CONST_L7[i] = xmm1;\ + ROUND_CONST_L0[j] = xmm2;\ + ROUND_CONST_L7[j] = xmm3;\ +}/**/ + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst(){\ + xmm0 = ROUND_CONST_Lx;\ + VPERM_Transform(xmm0, xmm1, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ + xmm0 = _mm_xor_si128(xmm0, (ALL_15));\ + ROUND_CONST_Lx = xmm0;\ + VPERM_Transform_RoundConst_CNT2(0, 1);\ + VPERM_Transform_RoundConst_CNT2(2, 3);\ + VPERM_Transform_RoundConst_CNT2(4, 5);\ + VPERM_Transform_RoundConst_CNT2(6, 7);\ + VPERM_Transform_RoundConst_CNT2(8, 9);\ +}/**/ + +void INIT(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, /*xmm11,*/ xmm12, xmm13, xmm14, xmm15; + + /* transform round constants into VPERM mode */ + VPERM_Transform_RoundConst(); + + /* load IV into registers xmm12 - xmm15 */ + xmm12 = chaining[0]; + xmm13 = chaining[1]; + xmm14 = chaining[2]; + xmm15 = chaining[3]; + + /* 
transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* store transposed IV */ + chaining[0] = xmm12; + chaining[1] = xmm2; + chaining[2] = xmm6; + chaining[3] = xmm7; +} + +void TF512(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP_MUL1[8]; + static __m128i TEMP_MUL2[8]; + static __m128i TEMP_MUL4; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm12 - xmm15 */ + xmm12 = message[0]; + xmm13 = message[1]; + xmm14 = message[2]; + xmm15 = message[3]; + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + xmm8 = chaining[0]; + xmm0 = chaining[1]; + xmm4 = chaining[2]; + xmm5 = chaining[3]; + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + xmm8 = _mm_xor_si128(xmm8, xmm12); + xmm0 = _mm_xor_si128(xmm0, xmm2); + xmm4 = _mm_xor_si128(xmm4, xmm6); + xmm5 = _mm_xor_si128(xmm5, xmm7); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, xmm8); + xmm1 = _mm_xor_si128(xmm1, xmm10); + xmm2 = _mm_xor_si128(xmm2, xmm12); + xmm3 = _mm_xor_si128(xmm3, xmm14); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, (chaining[0])); + xmm1 = _mm_xor_si128(xmm1, (chaining[1])); + xmm2 = _mm_xor_si128(xmm2, (chaining[2])); + xmm3 = _mm_xor_si128(xmm3, (chaining[3])); + + /* store CV */ + chaining[0] = xmm0; + chaining[1] = xmm1; + chaining[2] = xmm2; + chaining[3] = xmm3; + +#ifdef IACA_TRACE + IACA_END; +#endif + + return; +} + +void OF512(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP_MUL1[8]; + static __m128i TEMP_MUL2[8]; + static __m128i TEMP_MUL4; + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + xmm8 = chaining[0]; + xmm10 = chaining[1]; + xmm12 = chaining[2]; + xmm14 = chaining[3]; + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm 
register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm10 = _mm_xor_si128(xmm10, (chaining[1])); + xmm12 = _mm_xor_si128(xmm12, (chaining[2])); + xmm14 = _mm_xor_si128(xmm14, (chaining[3])); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); + VPERM_Transform(xmm9, xmm11, VPERM_OPT, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); + + /* we only need to return the truncated half of the state */ + chaining[2] = xmm9; + chaining[3] = xmm11; + + return; +}//OF512() + +#endif + +#if (LENGTH > 256) + +#define SET_CONSTANTS(){\ + SET_SHARED_CONSTANTS();\ + ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ + SUBSH_MASK[0] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);\ + SUBSH_MASK[1] = _mm_set_epi32(0x000f0e0d, 0x0c0b0a09, 0x08070605, 0x04030201);\ + SUBSH_MASK[2] = _mm_set_epi32(0x01000f0e, 0x0d0c0b0a, 0x09080706, 0x05040302);\ + SUBSH_MASK[3] = _mm_set_epi32(0x0201000f, 0x0e0d0c0b, 0x0a090807, 0x06050403);\ + SUBSH_MASK[4] = _mm_set_epi32(0x03020100, 0x0f0e0d0c, 0x0b0a0908, 0x07060504);\ + SUBSH_MASK[5] = _mm_set_epi32(0x04030201, 0x000f0e0d, 0x0c0b0a09, 0x08070605);\ + SUBSH_MASK[6] = _mm_set_epi32(0x05040302, 0x01000f0e, 0x0d0c0b0a, 0x09080706);\ + SUBSH_MASK[7] = _mm_set_epi32(0x0a090807, 0x06050403, 0x0201000f, 0x0e0d0c0b);\ + for(i = 0; i < ROUNDS1024; i++)\ + {\ + ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\ + }\ +}/**/ + +/* one round + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* SubBytes + Multiplication */\ + VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +}/**/ + +#define ROUNDS_P(){\ + u8 round_counter = 0;\ + for(round_counter = 0; round_counter < 14; round_counter+=2) {\ + /* AddRoundConstant P1024 */\ + xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\ + xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\ + xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\ + xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[3]));\ + xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[4]));\ + xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[5]));\ + xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[6]));\ + xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[7]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + VPERM_Add_Constant(xmm0, 
xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, ALL_15, xmm8);\ + \ + /* AddRoundConstant P1024 */\ + xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\ + xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\ + xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\ + xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[3]));\ + xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[4]));\ + xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\ + xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\ + xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\ + }\ +}/**/ + +#define ROUNDS_Q(){\ + VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm1);\ + u8 round_counter = 0;\ + for(round_counter = 0; round_counter < 14; round_counter+=2) {\ + /* AddRoundConstant Q1024 */\ + xmm1 = ALL_FF;\ + xmm8 = _mm_xor_si128(xmm8, xmm1);\ + xmm9 = _mm_xor_si128(xmm9, xmm1);\ + xmm10 = _mm_xor_si128(xmm10, xmm1);\ + xmm11 = _mm_xor_si128(xmm11, xmm1);\ + xmm12 = _mm_xor_si128(xmm12, xmm1);\ + xmm13 = _mm_xor_si128(xmm13, xmm1);\ + xmm14 = _mm_xor_si128(xmm14, xmm1);\ + xmm15 = _mm_xor_si128(xmm15, (ROUND_CONST_Q[round_counter]));\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[1]));\ + xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[3]));\ + xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[5]));\ + xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[7]));\ + xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[0]));\ + xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[2]));\ + xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[4]));\ + xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[6]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + \ + /* AddRoundConstant Q1024 */\ + xmm9 = ALL_FF;\ + xmm0 = _mm_xor_si128(xmm0, xmm9);\ + xmm1 = _mm_xor_si128(xmm1, xmm9);\ + xmm2 = _mm_xor_si128(xmm2, xmm9);\ + xmm3 = _mm_xor_si128(xmm3, xmm9);\ + xmm4 = _mm_xor_si128(xmm4, xmm9);\ + xmm5 = _mm_xor_si128(xmm5, xmm9);\ + xmm6 = _mm_xor_si128(xmm6, xmm9);\ + xmm7 = _mm_xor_si128(xmm7, (ROUND_CONST_Q[round_counter+1]));\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[1]));\ + xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[3]));\ + xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[5]));\ + xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[7]));\ + xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[0]));\ + xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[2]));\ + xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[4]));\ + xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[6]));\ + /* SubBytes + MixBytes*/ \ + SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + }\ + VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm1);\ +}/**/ + + +/* Matrix Transpose + * input is a 1024-bit state with two columns in one xmm + * output is a 1024-bit state with two rows in one xmm + * inputs: i0-i7 + * outputs: i0-i7 + * clobbers: t0-t7 + */ +#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ + t0 = TRANSP_MASK;\ +\ + i6 = _mm_shuffle_epi8(i6, t0);\ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = 
_mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ + t1 = i2;\ + i4 = _mm_shuffle_epi8(i4, t0);\ + i5 = _mm_shuffle_epi8(i5, t0);\ + t2 = i4;\ + t3 = i6;\ + i7 = _mm_shuffle_epi8(i7, t0);\ +\ + /* continue with unpack using 4 temp registers */\ + t0 = i0;\ + t2 = _mm_unpackhi_epi16(t2, i5);\ + i4 = _mm_unpacklo_epi16(i4, i5);\ + t3 = _mm_unpackhi_epi16(t3, i7);\ + i6 = _mm_unpacklo_epi16(i6, i7);\ + t0 = _mm_unpackhi_epi16(t0, i1);\ + t1 = _mm_unpackhi_epi16(t1, i3);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + i0 = _mm_unpacklo_epi16(i0, i1);\ +\ + /* shuffle with immediate */\ + t0 = _mm_shuffle_epi32(t0, 216);\ + t1 = _mm_shuffle_epi32(t1, 216);\ + t2 = _mm_shuffle_epi32(t2, 216);\ + t3 = _mm_shuffle_epi32(t3, 216);\ + i0 = _mm_shuffle_epi32(i0, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + i4 = _mm_shuffle_epi32(i4, 216);\ + i6 = _mm_shuffle_epi32(i6, 216);\ +\ + /* continue with unpack */\ + t4 = i0;\ + i0 = _mm_unpacklo_epi32(i0, i2);\ + t4 = _mm_unpackhi_epi32(t4, i2);\ + t5 = t0;\ + t0 = _mm_unpacklo_epi32(t0, t1);\ + t5 = _mm_unpackhi_epi32(t5, t1);\ + t6 = i4;\ + i4 = _mm_unpacklo_epi32(i4, i6);\ + t7 = t2;\ + t6 = _mm_unpackhi_epi32(t6, i6);\ + i2 = t0;\ + t2 = _mm_unpacklo_epi32(t2, t3);\ + i3 = t0;\ + t7 = _mm_unpackhi_epi32(t7, t3);\ +\ + /* there are now 2 rows in each xmm */\ + /* unpack to get 1 row of CV in each xmm */\ + i1 = i0;\ + i1 = _mm_unpackhi_epi64(i1, i4);\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + i4 = t4;\ + i3 = _mm_unpackhi_epi64(i3, t2);\ + i5 = t4;\ + i2 = _mm_unpacklo_epi64(i2, t2);\ + i6 = t5;\ + i5 = _mm_unpackhi_epi64(i5, t6);\ + i7 = t5;\ + i4 = _mm_unpacklo_epi64(i4, t6);\ + i7 = _mm_unpackhi_epi64(i7, t7);\ + i6 = _mm_unpacklo_epi64(i6, t7);\ + /* transpose done */\ +}/**/ + +/* Matrix Transpose Inverse + * input is a 1024-bit state with two rows in one xmm + * output is a 1024-bit state with two columns in one xmm + * inputs: i0-i7 + * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) + * clobbers: t0-t4 + */ +#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ + /* transpose matrix to get output format */\ + o1 = i0;\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o1 = _mm_unpackhi_epi64(o1, i1);\ + t0 = i2;\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + t0 = _mm_unpackhi_epi64(t0, i3);\ + t1 = i4;\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + t1 = _mm_unpackhi_epi64(t1, i5);\ + t2 = i6;\ + o0 = TRANSP_MASK;\ + i6 = _mm_unpacklo_epi64(i6, i7);\ + t2 = _mm_unpackhi_epi64(t2, i7);\ + /* load transpose mask into a register, because it will be used 8 times */\ + i0 = _mm_shuffle_epi8(i0, o0);\ + i2 = _mm_shuffle_epi8(i2, o0);\ + i4 = _mm_shuffle_epi8(i4, o0);\ + i6 = _mm_shuffle_epi8(i6, o0);\ + o1 = _mm_shuffle_epi8(o1, o0);\ + t0 = _mm_shuffle_epi8(t0, o0);\ + t1 = _mm_shuffle_epi8(t1, o0);\ + t2 = _mm_shuffle_epi8(t2, o0);\ + /* continue with unpack using 4 temp registers */\ + t3 = i4;\ + o2 = o1;\ + o0 = i0;\ + t4 = t1;\ + \ + t3 = _mm_unpackhi_epi16(t3, i6);\ + i4 = _mm_unpacklo_epi16(i4, i6);\ + o0 = _mm_unpackhi_epi16(o0, i2);\ + i0 = _mm_unpacklo_epi16(i0, i2);\ + o2 = _mm_unpackhi_epi16(o2, t0);\ + o1 = _mm_unpacklo_epi16(o1, t0);\ + t4 = _mm_unpackhi_epi16(t4, t2);\ + t1 = _mm_unpacklo_epi16(t1, t2);\ + /* shuffle with immediate */\ + i4 = _mm_shuffle_epi32(i4, 216);\ + t3 = _mm_shuffle_epi32(t3, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + o2 = _mm_shuffle_epi32(o2, 216);\ + i0 = _mm_shuffle_epi32(i0, 216);\ + o0 = _mm_shuffle_epi32(o0, 216);\ + t1 = _mm_shuffle_epi32(t1, 216);\ + t4 = _mm_shuffle_epi32(t4, 216);\ + /* continue 
with unpack */\ + i1 = i0;\ + i3 = o0;\ + i5 = o1;\ + i7 = o2;\ + i0 = _mm_unpacklo_epi32(i0, i4);\ + i1 = _mm_unpackhi_epi32(i1, i4);\ + o0 = _mm_unpacklo_epi32(o0, t3);\ + i3 = _mm_unpackhi_epi32(i3, t3);\ + o1 = _mm_unpacklo_epi32(o1, t1);\ + i5 = _mm_unpackhi_epi32(i5, t1);\ + o2 = _mm_unpacklo_epi32(o2, t4);\ + i7 = _mm_unpackhi_epi32(i7, t4);\ + /* transpose done */\ +}/**/ + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst_CNT2(i, j){\ + xmm0 = ROUND_CONST_P[i];\ + xmm1 = ROUND_CONST_P[j];\ + xmm2 = ROUND_CONST_Q[i];\ + xmm3 = ROUND_CONST_Q[j];\ + VPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ + xmm2 = _mm_xor_si128(xmm2, (ALL_15));\ + xmm3 = _mm_xor_si128(xmm3, (ALL_15));\ + ROUND_CONST_P[i] = xmm0;\ + ROUND_CONST_P[j] = xmm1;\ + ROUND_CONST_Q[i] = xmm2;\ + ROUND_CONST_Q[j] = xmm3;\ +}/**/ + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst(){\ + VPERM_Transform_RoundConst_CNT2(0, 1);\ + VPERM_Transform_RoundConst_CNT2(2, 3);\ + VPERM_Transform_RoundConst_CNT2(4, 5);\ + VPERM_Transform_RoundConst_CNT2(6, 7);\ + VPERM_Transform_RoundConst_CNT2(8, 9);\ + VPERM_Transform_RoundConst_CNT2(10, 11);\ + VPERM_Transform_RoundConst_CNT2(12, 13);\ + xmm0 = ALL_FF;\ + VPERM_Transform(xmm0, xmm1, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ + xmm0 = _mm_xor_si128(xmm0, (ALL_15));\ + ALL_FF = xmm0;\ +}/**/ + + +void INIT(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + + /* transform round constants into VPERM mode */ + VPERM_Transform_RoundConst(); + + /* load IV into registers xmm8 - xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + /* transform chaining value from column ordering into row ordering */ + VPERM_Transform_State(xmm8, xmm9, xmm10, xmm11, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store transposed IV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; +} + +void TF1024(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP_MUL1[8]; + static __m128i TEMP_MUL2[8]; + static __m128i TEMP_MUL4; + static __m128i QTEMP[8]; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm8 - xmm15 (Q = message) */ + xmm8 = message[0]; + xmm9 = message[1]; + xmm10 = message[2]; + xmm11 = message[3]; + xmm12 = message[4]; + xmm13 = message[5]; + xmm14 = message[6]; + xmm15 = message[7]; + + /* transform message M from column ordering into row ordering */ + VPERM_Transform_State(xmm8, xmm9, xmm10, xmm11, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + Matrix_Transpose(xmm8, 
xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store message M (Q input) for later */ + QTEMP[0] = xmm8; + QTEMP[1] = xmm9; + QTEMP[2] = xmm10; + QTEMP[3] = xmm11; + QTEMP[4] = xmm12; + QTEMP[5] = xmm13; + QTEMP[6] = xmm14; + QTEMP[7] = xmm15; + + /* xor CV to message to get P input */ + /* result: CV+M in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, (chaining[7])); + + /* compute permutation P */ + /* result: P(CV+M) in xmm8...xmm15 */ + ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV+M)+CV in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, (chaining[7])); + + /* store P(CV+M)+CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; + + /* load message M (Q input) into xmm8-15 */ + xmm8 = QTEMP[0]; + xmm9 = QTEMP[1]; + xmm10 = QTEMP[2]; + xmm11 = QTEMP[3]; + xmm12 = QTEMP[4]; + xmm13 = QTEMP[5]; + xmm14 = QTEMP[6]; + xmm15 = QTEMP[7]; + + /* compute permutation Q */ + /* result: Q(M) in xmm8...xmm15 */ + ROUNDS_Q(); + + /* xor Q output */ + /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, (chaining[7])); + + /* store CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; + +#ifdef IACA_TRACE + IACA_END; +#endif + + return; +} + +void OF1024(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP_MUL1[8]; + static __m128i TEMP_MUL2[8]; + static __m128i TEMP_MUL4; + + /* load CV into registers xmm8 - xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + /* compute permutation P */ + /* result: P(CV) in xmm8...xmm15 */ + ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, 
(chaining[7])); + + /* transpose CV back from row ordering to column ordering */ + /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ + Matrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7); + VPERM_Transform_State(xmm0, xmm6, xmm13, xmm15, VPERM_OPT, xmm1, xmm2, xmm3, xmm5, xmm7, xmm10, xmm12); + + /* we only need to return the truncated half of the state */ + chaining[4] = xmm0; + chaining[5] = xmm6; + chaining[6] = xmm13; + chaining[7] = xmm15; + + return; +} + +#endif + diff --git a/algo/aes_ni/groestl-version.h b/algo/aes_ni/groestl-version.h new file mode 100644 index 000000000..cdbd81627 --- /dev/null +++ b/algo/aes_ni/groestl-version.h @@ -0,0 +1,16 @@ +// specify assembly or intrinsics implementation +//#define TASM +#define TINTR + +//#define AES_NI + +//#ifdef AES_NI +// specify AES-NI, AVX (with AES-NI) or vector-permute implementation + +//#ifndef NO_AES_NI + +#define VAES +// #define VAVX +// #define VVPERM + +//#endif diff --git a/algo/aes_ni/groestl256-asm-aes.h b/algo/aes_ni/groestl256-asm-aes.h new file mode 100644 index 000000000..0810b5e83 --- /dev/null +++ b/algo/aes_ni/groestl256-asm-aes.h @@ -0,0 +1,529 @@ +/* groestl-asm-aes.h Aug 2011 + * + * Groestl implementation with inline assembly using ssse3, sse4.1, and aes + * instructions. + * Authors: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz + * + * This code is placed in the public domain + */ + +#include "hash-groestl256.h" +/* global constants */ +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; +__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16]; +__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16]; +__attribute__ ((aligned (16))) unsigned char ALL_1B[16]; +__attribute__ ((aligned (16))) unsigned char ALL_FF[16]; + +/* temporary variables */ +__attribute__ ((aligned (16))) unsigned char QTEMP[8*16]; +__attribute__ ((aligned (16))) unsigned char TEMP[3*16]; + + +#define tos(a) #a +#define tostr(a) tos(a) + + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b */ +#define MUL2(i, j, k){\ + asm("pxor xmm"tostr(j)", xmm"tostr(j)"");\ + asm("pcmpgtb xmm"tostr(j)", xmm"tostr(i)"");\ + asm("paddb xmm"tostr(i)", xmm"tostr(i)"");\ + asm("pand xmm"tostr(j)", xmm"tostr(k)"");\ + asm("pxor xmm"tostr(i)", xmm"tostr(j)"");\ +}/**/ + +/* Yet another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. 
Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + asm("movdqa xmm"tostr(b6)", xmm"tostr(a0)"");\ + asm("movdqa xmm"tostr(b7)", xmm"tostr(a1)"");\ + asm("pxor xmm"tostr(a0)", xmm"tostr(a1)"");\ + asm("movdqa xmm"tostr(b0)", xmm"tostr(a2)"");\ + asm("pxor xmm"tostr(a1)", xmm"tostr(a2)"");\ + asm("movdqa xmm"tostr(b1)", xmm"tostr(a3)"");\ + asm("pxor xmm"tostr(a2)", xmm"tostr(a3)"");\ + asm("movdqa xmm"tostr(b2)", xmm"tostr(a4)"");\ + asm("pxor xmm"tostr(a3)", xmm"tostr(a4)"");\ + asm("movdqa xmm"tostr(b3)", xmm"tostr(a5)"");\ + asm("pxor xmm"tostr(a4)", xmm"tostr(a5)"");\ + asm("movdqa xmm"tostr(b4)", xmm"tostr(a6)"");\ + asm("pxor xmm"tostr(a5)", xmm"tostr(a6)"");\ + asm("movdqa xmm"tostr(b5)", xmm"tostr(a7)"");\ + asm("pxor xmm"tostr(a6)", xmm"tostr(a7)"");\ + asm("pxor xmm"tostr(a7)", xmm"tostr(b6)"");\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + asm("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\ + asm("pxor xmm"tostr(b6)", xmm"tostr(a4)"");\ + asm("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ + asm("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\ + asm("pxor xmm"tostr(b2)", xmm"tostr(a6)"");\ + asm("pxor xmm"tostr(b0)", xmm"tostr(a6)"");\ + /* spill values y_4, y_5 to memory */\ + asm("movaps [TEMP+0*16], xmm"tostr(b0)"");\ + asm("pxor xmm"tostr(b3)", xmm"tostr(a7)"");\ + asm("pxor xmm"tostr(b1)", xmm"tostr(a7)"");\ + asm("movaps [TEMP+1*16], xmm"tostr(b1)"");\ + asm("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\ + asm("pxor xmm"tostr(b2)", xmm"tostr(a0)"");\ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + asm("movdqa xmm"tostr(b0)", xmm"tostr(a0)"");\ + asm("pxor xmm"tostr(b5)", xmm"tostr(a1)"");\ + asm("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\ + asm("movdqa xmm"tostr(b1)", xmm"tostr(a1)"");\ + asm("pxor xmm"tostr(b6)", xmm"tostr(a2)"");\ + asm("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\ + asm("movaps [TEMP+2*16], xmm"tostr(a2)"");\ + asm("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\ + asm("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ + \ + /* compute x_i = t_i + t_{i+3} */\ + asm("pxor xmm"tostr(a0)", xmm"tostr(a3)"");\ + asm("pxor xmm"tostr(a1)", xmm"tostr(a4)"");\ + asm("pxor xmm"tostr(a2)", xmm"tostr(a5)"");\ + asm("pxor xmm"tostr(a3)", xmm"tostr(a6)"");\ + asm("pxor xmm"tostr(a4)", xmm"tostr(a7)"");\ + asm("pxor xmm"tostr(a5)", xmm"tostr(b0)"");\ + asm("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ + asm("pxor xmm"tostr(a7)", [TEMP+2*16]");\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + asm("movaps xmm"tostr(b1)", [ALL_1B]");\ + MUL2(a0, b0, b1);\ + asm("pxor xmm"tostr(a0)", [TEMP+0*16]");\ + MUL2(a1, b0, b1);\ + asm("pxor xmm"tostr(a1)", [TEMP+1*16]");\ + MUL2(a2, b0, b1);\ + asm("pxor xmm"tostr(a2)", xmm"tostr(b2)"");\ + MUL2(a3, b0, b1);\ + asm("pxor xmm"tostr(a3)", xmm"tostr(b3)"");\ + MUL2(a4, b0, b1);\ + asm("pxor xmm"tostr(a4)", xmm"tostr(b4)"");\ + MUL2(a5, b0, b1);\ + asm("pxor xmm"tostr(a5)", xmm"tostr(b5)"");\ + MUL2(a6, b0, b1);\ + asm("pxor xmm"tostr(a6)", xmm"tostr(b6)"");\ + MUL2(a7, b0, b1);\ + asm("pxor xmm"tostr(a7)", xmm"tostr(b7)"");\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... 
*/\ + MUL2(a0, b0, b1);\ + asm("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\ + MUL2(a1, b0, b1);\ + asm("pxor xmm"tostr(b6)", xmm"tostr(a1)"");\ + MUL2(a2, b0, b1);\ + asm("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\ + MUL2(a5, b0, b1);\ + asm("pxor xmm"tostr(b2)", xmm"tostr(a5)"");\ + MUL2(a6, b0, b1);\ + asm("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\ + MUL2(a7, b0, b1);\ + asm("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\ + MUL2(a3, b0, b1);\ + MUL2(a4, b0, b1);\ + asm("movaps xmm"tostr(b0)", [TEMP+0*16]");\ + asm("movaps xmm"tostr(b1)", [TEMP+1*16]");\ + asm("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\ + asm("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\ +}/*MixBytes*/ + +#define SET_CONSTANTS(){\ + ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ + ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ + ((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\ + ((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\ + ((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\ + ((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\ + ((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\ + ((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\ + ((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\ + ((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\ + ((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\ + ((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\ + ((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\ + ((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\ + ((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\ + ((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\ + ((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\ + ((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ + ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ + ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ + }\ + ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ +}while(0); + +#define Push_All_Regs() do{\ +/* not using any... + asm("push rax");\ + asm("push rbx");\ + asm("push rcx");*/\ +}while(0); + +#define Pop_All_Regs() do{\ +/* not using any... 
+ asm("pop rcx");\ + asm("pop rbx");\ + asm("pop rax");*/\ +}while(0); + +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant */\ + asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ + asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ + asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ + /* ShiftBytes + SubBytes (interleaved) */\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ + asm ("aesenclast xmm"tostr(a0)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ + asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ + asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ + asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ + asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\ + asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ + asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ + asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ +} + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ + \ + asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ + \ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ + \ + asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\ + asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\ + \ + asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ + asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ + asm ("pshufd 
xmm"tostr(t0)", xmm"tostr(t0)", 216");\ + \ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\ + \ + asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\ + asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\ + asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\ + asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\ + asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ + asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\ + asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\ + asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\ + asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\ + asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\ + asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\ + asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\ + asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i7)", 
xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ +}/**/ + + +void INIT256(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + asm volatile ("emms"); + + /* load IV into registers xmm12 - xmm15 */ + asm ("movaps xmm12, [rdi+0*16]"); + asm ("movaps xmm13, [rdi+1*16]"); + asm ("movaps xmm14, [rdi+2*16]"); + asm ("movaps xmm15, [rdi+3*16]"); + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* store transposed IV */ + asm ("movaps [rdi+0*16], xmm12"); + asm ("movaps [rdi+1*16], xmm2"); + asm ("movaps [rdi+2*16], xmm6"); + asm ("movaps [rdi+3*16], xmm7"); + + asm volatile ("emms"); + asm (".att_syntax noprefix"); +} + +void TF512(u64* h, u64* m) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + /* message M in rsi */ + +#ifdef IACA_TRACE + IACA_START; +#endif + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load message into registers xmm12 - xmm15 (Q = message) */ + asm ("movaps xmm12, [rsi+0*16]"); + asm ("movaps xmm13, [rsi+1*16]"); + asm ("movaps xmm14, [rsi+2*16]"); + asm ("movaps xmm15, [rsi+3*16]"); + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (2x64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm0, [rdi+1*16]"); + asm ("movaps xmm4, [rdi+2*16]"); + asm ("movaps xmm5, [rdi+3*16]"); + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + asm ("pxor xmm8, xmm12"); + asm ("pxor xmm0, xmm2"); + asm ("pxor xmm4, xmm6"); + asm ("pxor xmm5, xmm7"); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + asm ("pxor xmm0, xmm8"); + asm ("pxor xmm1, xmm10"); + asm ("pxor xmm2, xmm12"); + asm ("pxor xmm3, xmm14"); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + asm ("pxor xmm0, [rdi+0*16]"); + asm ("pxor xmm1, [rdi+1*16]"); + asm ("pxor xmm2, [rdi+2*16]"); + asm ("pxor xmm3, [rdi+3*16]"); + + /* store CV */ + asm ("movaps [rdi+0*16], xmm0"); + asm ("movaps [rdi+1*16], xmm1"); + asm ("movaps [rdi+2*16], xmm2"); + asm ("movaps [rdi+3*16], xmm3"); + + Pop_All_Regs(); + asm (".att_syntax 
noprefix"); + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF512(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm10, [rdi+1*16]"); + asm ("movaps xmm12, [rdi+2*16]"); + asm ("movaps xmm14, [rdi+3*16]"); + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm10, [rdi+1*16]"); + asm ("pxor xmm12, [rdi+2*16]"); + asm ("pxor xmm14, [rdi+3*16]"); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); + + /* we only need to return the truncated half of the state */ + asm ("movaps [rdi+2*16], xmm9"); + asm ("movaps [rdi+3*16], xmm11"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + + return; +} + diff --git a/algo/aes_ni/groestl256-asm-avx.h b/algo/aes_ni/groestl256-asm-avx.h new file mode 100644 index 000000000..e7cb4c782 --- /dev/null +++ b/algo/aes_ni/groestl256-asm-avx.h @@ -0,0 +1,519 @@ +/* groestl-asm-avx.h Aug 2011 + * + * Groestl implementation with inline assembly using ssse3, sse4.1, aes and avx + * instructions. + * Author: Günther A. 
Roland, Martin Schläffer, Krystian Matusiewicz + * + * This code is placed in the public domain + */ + +#include "hash-groestl256.h" + +/* global variables */ +__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Lx[16]; +__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; +__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; +__attribute__ ((aligned (32))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; +__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; +__attribute__ ((aligned (32))) unsigned char TRANSP_MASK[16]; +__attribute__ ((aligned (32))) unsigned char SUBSH_MASK[8*16]; +__attribute__ ((aligned (32))) unsigned char ALL_1B[32]; +__attribute__ ((aligned (32))) unsigned char ALL_FF[32]; + +/* temporary variables */ +__attribute__ ((aligned (32))) unsigned char TEMP[6*32]; + + +#define tos(a) #a +#define tostr(a) tos(a) + +#define SET_CONSTANTS(){\ + ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ + ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ + ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\ + ((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\ + ((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\ + ((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\ + ((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\ + ((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\ + ((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\ + ((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\ + ((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\ + ((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\ + ((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\ + ((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\ + ((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\ + ((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\ + ((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\ + ((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ + ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ + ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ + }\ + ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ +}while(0); + +#define Push_All_Regs() do{\ +/* not using any... + asm("push rax");\ + asm("push rbx");\ + asm("push rcx");*/\ +}while(0); + +#define Pop_All_Regs() do{\ +/* not using any... + asm("pop rcx");\ + asm("pop rbx");\ + asm("pop rax");*/\ +}while(0); + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b + * xmm[z] has to be zero */ +#define VMUL2(i, j, k, z){\ + asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\ + asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ + asm("vpand xmm"tostr(j)", xmm"tostr(j)", xmm"tostr(k)"");\ + asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ +}/**/ + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b + * xmm[z] has to be zero */ +#define VMUL2v2(i, j, k, z){\ + asm("vpblendvb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(k)", xmm"tostr(i)"");\ + asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ + asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ +}/**/ + +/* Yet another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". 
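+   (For reference: MixBytes is multiplication by the circulant matrix
+    circ(02, 02, 03, 04, 05, 03, 05, 07) over GF(2^8), i.e. with row indices
+    taken mod 8 each output row is
+    b_i = 02*a_i + 02*a_{i+1} + 03*a_{i+2} + 04*a_{i+3} + 05*a_{i+4} + 03*a_{i+5} + 05*a_{i+6} + 07*a_{i+7};
+    the relations below compute this product using only XORs and byte doublings.)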
+ Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\ + asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a2)"");\ + asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a3)"");\ + asm("vmovdqa xmm"tostr(b2)", xmm"tostr(a4)"");\ + asm("vmovdqa xmm"tostr(b3)", xmm"tostr(a5)"");\ + asm("vmovdqa xmm"tostr(b4)", xmm"tostr(a6)"");\ + asm("vmovdqa xmm"tostr(b5)", xmm"tostr(a7)"");\ + asm("vmovdqa xmm"tostr(b6)", xmm"tostr(a0)"");\ + asm("vmovdqa xmm"tostr(b7)", xmm"tostr(a1)"");\ + \ + /* t_i = a_i + a_{i+1} */\ + asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a1)"");\ + asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a2)"");\ + asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a3)"");\ + asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a4)"");\ + asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a5)"");\ + asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(a6)"");\ + asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(a7)"");\ + asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b6)"");\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a4)"");\ + asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a5)"");\ + asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a6)"");\ + asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a7)"");\ + asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a0)"");\ + asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a1)"");\ + asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a2)"");\ + asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a3)"");\ + \ + asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a6)"");\ + asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a7)"");\ + asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a0)"");\ + asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a1)"");\ + asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a2)"");\ + asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a3)"");\ + asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a4)"");\ + asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a5)"");\ + \ + /* spill values y_4, y_5 to memory */\ + asm("vmovaps [TEMP+0*16], xmm"tostr(b0)"");\ + asm("vmovaps [TEMP+1*16], xmm"tostr(b1)"");\ + asm("vmovaps [TEMP+2*16], xmm"tostr(b2)"");\ + \ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a0)"");\ + asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a1)"");\ + asm("vmovaps [TEMP+3*16], xmm"tostr(a2)"");\ + \ + /* compute x_i = t_i + t_{i+3} */\ + asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a3)"");\ + asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a4)"");\ + asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a5)"");\ + asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a6)"");\ + asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a7)"");\ + asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", 
xmm"tostr(b0)"");\ + asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\ + asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [TEMP+3*16]");\ + \ + /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + asm("vmovaps xmm"tostr(b1)", [ALL_1B]");\ + asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(b2)"");\ + VMUL2(a7, b0, b1, b2);\ + VMUL2(a6, b0, b1, b2);\ + VMUL2(a5, b0, b1, b2);\ + VMUL2(a4, b0, b1, b2);\ + VMUL2(a3, b0, b1, b2);\ + VMUL2(a2, b0, b1, b2);\ + VMUL2(a1, b0, b1, b2);\ + VMUL2(a0, b0, b1, b2);\ + \ + /* compute w_i : add y_{i+4} */\ + asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [TEMP+0*16]");\ + asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", [TEMP+1*16]");\ + asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", [TEMP+2*16]");\ + asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b3)"");\ + asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b4)"");\ + asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b5)"");\ + asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b6)"");\ + asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b7)"");\ + \ + /*compute v_i: double w_i */\ + VMUL2(a0, b0, b1, b2);\ + VMUL2(a1, b0, b1, b2);\ + VMUL2(a2, b0, b1, b2);\ + VMUL2(a3, b0, b1, b2);\ + VMUL2(a4, b0, b1, b2);\ + VMUL2(a5, b0, b1, b2);\ + VMUL2(a6, b0, b1, b2);\ + VMUL2(a7, b0, b1, b2);\ + \ + /* add to y_4 y_5 .. v3, v4, ... */\ + asm("vpxor xmm"tostr(b0)", xmm"tostr(a3)", [TEMP+0*16]");\ + asm("vpxor xmm"tostr(b1)", xmm"tostr(a4)", [TEMP+1*16]");\ + asm("vpxor xmm"tostr(b2)", xmm"tostr(a5)", [TEMP+2*16]");\ + asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a6)"");\ + asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a7)"");\ + asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a0)"");\ + asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a1)"");\ + asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a2)"");\ +}/*MixBytes*/ + +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant */\ + asm ("vmovaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ + asm ("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ + asm ("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ + /* ShiftBytes + SubBytes (interleaved) */\ + asm ("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a0)", xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ + asm ("vaesenclast xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a1)", xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ + asm ("vaesenclast xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a2)", xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ + asm ("vaesenclast xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a3)", xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ + asm ("vaesenclast xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a4)", xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ + asm ("vaesenclast xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a5)", xmm"tostr(a5)", 
[SUBSH_MASK+5*16]");\ + asm ("vaesenclast xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a6)", xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ + asm ("vaesenclast xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a7)", xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ + asm ("vaesenclast xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b0)"");\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ +} + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + asm ("vmovaps xmm"tostr(t0)", [TRANSP_MASK]");\ +\ + asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i1)", xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i3)", xmm"tostr(i3)", xmm"tostr(t0)"");\ +\ + asm ("vpunpckhwd xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpckhwd xmm"tostr(t0)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpcklwd xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ +\ + asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("vpshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ + asm ("vpshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ + asm ("vpshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ +\ + asm ("vpunpckhdq xmm"tostr(o2)", xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("vpunpckhdq xmm"tostr(o3)", xmm"tostr(o1)", xmm"tostr(t0)"");\ + asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("vpunpckldq xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("vpunpcklqdq xmm"tostr(o2)", xmm"tostr(i1)", xmm"tostr(i5)"");\ + asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i1)", xmm"tostr(i5)"");\ + asm ("vpunpcklqdq xmm"tostr(o4)", xmm"tostr(i2)", xmm"tostr(i6)"");\ + asm ("vpunpckhqdq xmm"tostr(o5)", xmm"tostr(i2)", xmm"tostr(i6)"");\ + asm ("vpunpcklqdq xmm"tostr(o6)", xmm"tostr(i3)", xmm"tostr(i7)"");\ 
+ asm ("vpunpckhqdq xmm"tostr(o7)", xmm"tostr(i3)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + asm ("vpunpckhqdq xmm"tostr(o0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpckhqdq xmm"tostr(o2)", xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + asm ("vpxor xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(t0)"");\ + asm ("vpunpckhqdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("vpunpckhqdq xmm"tostr(i3)", xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("vpunpckhqdq xmm"tostr(i5)", xmm"tostr(i4)", xmm"tostr(t0)"");\ + asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(t0)"");\ + asm ("vpunpckhqdq xmm"tostr(i7)", xmm"tostr(i6)", xmm"tostr(t0)"");\ + asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ +}/**/ + + +void INIT256(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + asm volatile ("emms"); + + /* load IV into registers xmm12 - xmm15 */ + asm ("vmovaps xmm12, [rdi+0*16]"); + asm ("vmovaps xmm13, [rdi+1*16]"); + asm ("vmovaps xmm14, [rdi+2*16]"); + asm ("vmovaps xmm15, [rdi+3*16]"); + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* store transposed IV */ + asm ("vmovaps [rdi+0*16], xmm12"); + asm ("vmovaps [rdi+1*16], xmm2"); + asm ("vmovaps [rdi+2*16], xmm6"); + asm ("vmovaps [rdi+3*16], xmm7"); + + asm volatile ("emms"); + asm (".att_syntax noprefix"); +} + +void TF512(u64* h, u64* m) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + /* message M in rsi */ + +#ifdef IACA_TRACE + IACA_START; +#endif + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load message 
into registers xmm12 - xmm15 (Q = message) */ + asm ("vmovaps xmm12, [rsi+0*16]"); + asm ("vmovaps xmm13, [rsi+1*16]"); + asm ("vmovaps xmm14, [rsi+2*16]"); + asm ("vmovaps xmm15, [rsi+3*16]"); + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* load previous chaining value and xor message to CV to get input of P */ + /* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + asm ("vpxor xmm8, xmm12, [rdi+0*16]"); + asm ("vpxor xmm0, xmm2, [rdi+1*16]"); + asm ("vpxor xmm4, xmm6, [rdi+2*16]"); + asm ("vpxor xmm5, xmm7, [rdi+3*16]"); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + asm ("vpxor xmm0, xmm0, xmm8"); + asm ("vpxor xmm1, xmm1, xmm10"); + asm ("vpxor xmm2, xmm2, xmm12"); + asm ("vpxor xmm3, xmm3, xmm14"); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + asm ("vpxor xmm0, xmm0, [rdi+0*16]"); + asm ("vpxor xmm1, xmm1, [rdi+1*16]"); + asm ("vpxor xmm2, xmm2, [rdi+2*16]"); + asm ("vpxor xmm3, xmm3, [rdi+3*16]"); + + /* store CV */ + asm ("vmovaps [rdi+0*16], xmm0"); + asm ("vmovaps [rdi+1*16], xmm1"); + asm ("vmovaps [rdi+2*16], xmm2"); + asm ("vmovaps [rdi+3*16], xmm3"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF512(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + asm ("vmovaps xmm8, [rdi+0*16]"); + asm ("vmovaps xmm10, [rdi+1*16]"); + asm ("vmovaps xmm12, [rdi+2*16]"); + asm ("vmovaps xmm14, [rdi+3*16]"); + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + asm ("vpxor xmm8, xmm8, [rdi+0*16]"); + asm ("vpxor xmm10, xmm10, [rdi+1*16]"); + asm ("vpxor xmm12, xmm12, [rdi+2*16]"); + asm ("vpxor xmm14, xmm14, [rdi+3*16]"); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); + + /* we only need to return the truncated half of the state */ + asm ("vmovaps [rdi+2*16], xmm9"); + asm ("vmovaps [rdi+3*16], xmm11"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + + return; +} + diff --git 
a/algo/aes_ni/groestl256-asm-vperm.h b/algo/aes_ni/groestl256-asm-vperm.h new file mode 100644 index 000000000..a25ade795 --- /dev/null +++ b/algo/aes_ni/groestl256-asm-vperm.h @@ -0,0 +1,856 @@ +/* groestl-asm-vperm.h Aug 2011 + * + * Groestl implementation with inline assembly using ssse3 instructions. + * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz + * + * Based on the vperm and aes_ni implementations of the hash function Groestl + * by Cagdas Calik http://www.metu.edu.tr/~ccalik/ + * Institute of Applied Mathematics, Middle East Technical University, Turkey + * + * This code is placed in the public domain + */ + +#include "hash-groestl256.h" + +/* global constants */ +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; +__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16]; +__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16]; +__attribute__ ((aligned (16))) unsigned char ALL_0F[16]; +__attribute__ ((aligned (16))) unsigned char ALL_15[16]; +__attribute__ ((aligned (16))) unsigned char ALL_1B[16]; +__attribute__ ((aligned (16))) unsigned char ALL_63[16]; +__attribute__ ((aligned (16))) unsigned char ALL_FF[16]; +__attribute__ ((aligned (16))) unsigned char VPERM_IPT[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_OPT[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_INV[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_SB1[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_SB2[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_SB4[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_SBO[2*16]; + +/* temporary variables */ +__attribute__ ((aligned (16))) unsigned char TEMP_MUL1[8*16]; +__attribute__ ((aligned (16))) unsigned char TEMP_MUL2[8*16]; +__attribute__ ((aligned (16))) unsigned char TEMP_MUL4[1*16]; +__attribute__ ((aligned (16))) unsigned char QTEMP[8*16]; +__attribute__ ((aligned (16))) unsigned char TEMP[8*16]; + + +#define tos(a) #a +#define tostr(a) tos(a) + +#define SET_SHARED_CONSTANTS(){\ + ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ + ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ + ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_63)[ 0] = 0x6363636363636363ULL;\ + ((u64*)ALL_63)[ 1] = 0x6363636363636363ULL;\ + ((u64*)ALL_0F)[ 0] = 0x0F0F0F0F0F0F0F0FULL;\ + ((u64*)ALL_0F)[ 1] = 0x0F0F0F0F0F0F0F0FULL;\ + ((u64*)VPERM_IPT)[ 0] = 0x4C01307D317C4D00ULL;\ + ((u64*)VPERM_IPT)[ 1] = 0xCD80B1FCB0FDCC81ULL;\ + ((u64*)VPERM_IPT)[ 2] = 0xC2B2E8985A2A7000ULL;\ + ((u64*)VPERM_IPT)[ 3] = 0xCABAE09052227808ULL;\ + ((u64*)VPERM_OPT)[ 0] = 0x01EDBD5150BCEC00ULL;\ + ((u64*)VPERM_OPT)[ 1] = 0xE10D5DB1B05C0CE0ULL;\ + ((u64*)VPERM_OPT)[ 2] = 0xFF9F4929D6B66000ULL;\ + ((u64*)VPERM_OPT)[ 3] = 0xF7974121DEBE6808ULL;\ + ((u64*)VPERM_INV)[ 0] = 0x01040A060F0B0780ULL;\ + ((u64*)VPERM_INV)[ 1] = 0x030D0E0C02050809ULL;\ + ((u64*)VPERM_INV)[ 2] = 0x0E05060F0D080180ULL;\ + ((u64*)VPERM_INV)[ 3] = 0x040703090A0B0C02ULL;\ + ((u64*)VPERM_SB1)[ 0] = 0x3618D415FAE22300ULL;\ + ((u64*)VPERM_SB1)[ 1] = 0x3BF7CCC10D2ED9EFULL;\ + ((u64*)VPERM_SB1)[ 2] = 0xB19BE18FCB503E00ULL;\ + ((u64*)VPERM_SB1)[ 3] = 0xA5DF7A6E142AF544ULL;\ + ((u64*)VPERM_SB2)[ 0] 
= 0x69EB88400AE12900ULL;\ + ((u64*)VPERM_SB2)[ 1] = 0xC2A163C8AB82234AULL;\ + ((u64*)VPERM_SB2)[ 2] = 0xE27A93C60B712400ULL;\ + ((u64*)VPERM_SB2)[ 3] = 0x5EB7E955BC982FCDULL;\ + ((u64*)VPERM_SB4)[ 0] = 0x3D50AED7C393EA00ULL;\ + ((u64*)VPERM_SB4)[ 1] = 0xBA44FE79876D2914ULL;\ + ((u64*)VPERM_SB4)[ 2] = 0xE1E937A03FD64100ULL;\ + ((u64*)VPERM_SB4)[ 3] = 0xA876DE9749087E9FULL;\ +/*((u64*)VPERM_SBO)[ 0] = 0xCFE474A55FBB6A00ULL;\ + ((u64*)VPERM_SBO)[ 1] = 0x8E1E90D1412B35FAULL;\ + ((u64*)VPERM_SBO)[ 2] = 0xD0D26D176FBDC700ULL;\ + ((u64*)VPERM_SBO)[ 3] = 0x15AABF7AC502A878ULL;*/\ + ((u64*)ALL_15)[ 0] = 0x1515151515151515ULL;\ + ((u64*)ALL_15)[ 1] = 0x1515151515151515ULL;\ +}/**/ + +/* VPERM + * Transform w/o settings c* + * transforms 2 rows to/from "vperm mode" + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0, a1 = 2 rows + * table = transformation table to use + * t*, c* = clobbers + * outputs: + * a0, a1 = 2 rows transformed with table + * */ +#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\ + asm ("movdqa xmm"tostr(t1)", xmm"tostr(c0)"");\ + asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\ + asm ("pandn xmm"tostr(t1)", xmm"tostr(a1)"");\ + asm ("psrld xmm"tostr(t0)", 4");\ + asm ("psrld xmm"tostr(t1)", 4");\ + asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\ + asm ("pand xmm"tostr(a1)", xmm"tostr(c0)"");\ + asm ("movdqa xmm"tostr(t2)", xmm"tostr(c2)"");\ + asm ("movdqa xmm"tostr(t3)", xmm"tostr(c2)"");\ + asm ("pshufb xmm"tostr(t2)", xmm"tostr(a0)"");\ + asm ("pshufb xmm"tostr(t3)", xmm"tostr(a1)"");\ + asm ("movdqa xmm"tostr(a0)", xmm"tostr(c1)"");\ + asm ("movdqa xmm"tostr(a1)", xmm"tostr(c1)"");\ + asm ("pshufb xmm"tostr(a0)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(a1)", xmm"tostr(t1)"");\ + asm ("pxor xmm"tostr(a0)", xmm"tostr(t2)"");\ + asm ("pxor xmm"tostr(a1)", xmm"tostr(t3)"");\ +}/**/ + +#define VPERM_Transform_Set_Const(table, c0, c1, c2){\ + asm ("movaps xmm"tostr(c0)", [ALL_0F]");\ + asm ("movaps xmm"tostr(c1)", ["tostr(table)"+0*16]");\ + asm ("movaps xmm"tostr(c2)", ["tostr(table)"+1*16]");\ +}/**/ + +/* VPERM + * Transform + * transforms 2 rows to/from "vperm mode" + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0, a1 = 2 rows + * table = transformation table to use + * t*, c* = clobbers + * outputs: + * a0, a1 = 2 rows transformed with table + * */ +#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\ + VPERM_Transform_Set_Const(table, c0, c1, c2);\ + VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ +}/**/ + +/* VPERM + * Transform State + * inputs: + * a0-a3 = state + * table = transformation table to use + * t* = clobbers + * outputs: + * a0-a3 = transformed state + * */ +#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\ + VPERM_Transform_Set_Const(table, c0, c1, c2);\ + VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ + VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\ +}/**/ + +/* VPERM + * Add Constant to State + * inputs: + * a0-a7 = state + * constant = constant to add + * t0 = clobber + * outputs: + * a0-a7 = state + constant + * */ +#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\ + asm ("movaps xmm"tostr(t0)", ["tostr(constant)"]");\ + asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a1)", 
xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a2)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a3)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a4)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a5)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a6)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a7)", xmm"tostr(t0)"");\ +}/**/ + +/* VPERM + * Set Substitute Core Constants + * */ +#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\ + VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\ +}/**/ + +/* VPERM + * Substitute Core + * first part of sbox inverse computation + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0 = 1 row + * t*, c* = clobbers + * outputs: + * b0a, b0b = inputs for lookup step + * */ +#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\ + asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\ + asm ("psrld xmm"tostr(t0)", 4");\ + asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\ + asm ("movdqa xmm"tostr(b0a)", "tostr(c1)"");\ + asm ("pshufb xmm"tostr(b0a)", xmm"tostr(a0)"");\ + asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\ + asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\ + asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(b0b)", xmm"tostr(b0a)"");\ + asm ("movdqa xmm"tostr(t1)", xmm"tostr(c2)"");\ + asm ("pshufb xmm"tostr(t1)", xmm"tostr(a0)"");\ + asm ("pxor xmm"tostr(t1)", xmm"tostr(b0a)"");\ + asm ("movdqa xmm"tostr(b0a)", xmm"tostr(c2)"");\ + asm ("pshufb xmm"tostr(b0a)", xmm"tostr(b0b)"");\ + asm ("pxor xmm"tostr(b0a)", xmm"tostr(a0)"");\ + asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\ + asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t1)"");\ + asm ("pxor xmm"tostr(b0b)", xmm"tostr(t0)"");\ +}/**/ + +/* VPERM + * Lookup + * second part of sbox inverse computation + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0a, a0b = output of Substitution Core + * table = lookup table to use (*1 / *2 / *4) + * t0 = clobber + * outputs: + * b0 = output of sbox + multiplication + * */ +#define VPERM_Lookup(a0a, a0b, table, b0, t0){\ + asm ("movaps xmm"tostr(b0)", ["tostr(table)"+0*16]");\ + asm ("movaps xmm"tostr(t0)", ["tostr(table)"+1*16]");\ + asm ("pshufb xmm"tostr(b0)", xmm"tostr(a0b)"");\ + asm ("pshufb xmm"tostr(t0)", xmm"tostr(a0a)"");\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(t0)"");\ +}/**/ + +/* VPERM + * SubBytes and *2 / *4 + * this function is derived from: + * Constant-time SSSE3 AES core implementation + * by Mike Hamburg + * and + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0-a7 = state + * t*, c* = clobbers + * outputs: + * a0-a7 = state * 4 + * c2 = row0 * 2 -> b0 + * c1 = row7 * 2 -> b3 + * c0 = row7 * 1 -> b4 + * t2 = row4 * 1 -> b7 + * TEMP_MUL1 = row(i) * 1 + * TEMP_MUL2 = row(i) * 2 + * + * call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */ +#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\ + /* set Constants */\ + VPERM_Substitute_Core_Set_Const(c0, c1, c2);\ + /* row 1 */\ + VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+1*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+1*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\ + /* --- */\ + /* row 2 */\ + 
VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+2*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+2*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\ + /* --- */\ + /* row 3 */\ + VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+3*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+3*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\ + /* --- */\ + /* row 5 */\ + VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+5*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+5*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\ + /* --- */\ + /* row 6 */\ + VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+6*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+6*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\ + /* --- */\ + /* row 7 */\ + VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+7*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\ + VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\ + /* --- */\ + /* row 4 */\ + VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+4*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\ + /* --- */\ + /* row 0 */\ + VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\ + VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\ + asm ("movaps [TEMP_MUL2+0*16], xmm"tostr(c2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\ + /* --- */\ +}/**/ + + +/* Optimized MixBytes + * inputs: + * a0-a7 = (row0-row7) * 4 + * b0 = row0 * 2 + * b3 = row7 * 2 + * b4 = row7 * 1 + * b7 = row4 * 1 + * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2 + * output: b0-b7 + * */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* save one value */\ + asm ("movaps [TEMP_MUL4], xmm"tostr(a3)"");\ + /* 1 */\ + asm ("movdqa xmm"tostr(b1)", xmm"tostr(a0)"");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(b4)""); /* -> helper! */\ + asm ("pxor xmm"tostr(b1)", [TEMP_MUL2+3*16]");\ + asm ("movdqa xmm"tostr(b2)", xmm"tostr(b1)"");\ + \ + /* 2 */\ + asm ("movdqa xmm"tostr(b5)", xmm"tostr(a1)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(a4)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(b7)""); /* -> helper! */\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(b3)""); /* -> helper! */\ + asm ("movdqa xmm"tostr(b6)", xmm"tostr(b5)"");\ + \ + /* 4 */\ + asm ("pxor xmm"tostr(b7)", xmm"tostr(a6)"");\ + /*asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+4*16]"); -> helper! */\ + asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+6*16]");\ + asm ("pxor xmm"tostr(b7)", [TEMP_MUL2+1*16]");\ + asm ("pxor xmm"tostr(b7)", xmm"tostr(b3)""); /* -> helper! 
*/\ + asm ("pxor xmm"tostr(b2)", xmm"tostr(b7)"");\ + \ + /* 3 */\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(a7)"");\ + asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+5*16]");\ + asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+7*16]");\ + /*asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+0*16]"); -> helper! */\ + asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+2*16]");\ + asm ("movdqa xmm"tostr(b3)", xmm"tostr(b0)"");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(b0)"");\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(b7)""); /* moved from 4 */\ + \ + /* 5 */\ + asm ("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\ + /*asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+0*16]"); -> helper! */\ + asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+2*16]");\ + asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+3*16]");\ + asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+5*16]");\ + asm ("pxor xmm"tostr(b3)", xmm"tostr(b4)"");\ + asm ("pxor xmm"tostr(b6)", xmm"tostr(b4)"");\ + \ + /* 6 */\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+1*16]");\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+3*16]");\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+4*16]");\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+6*16]");\ + asm ("pxor xmm"tostr(b4)", xmm"tostr(a3)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ + asm ("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\ + \ + /* 7 */\ + asm ("pxor xmm"tostr(a1)", [TEMP_MUL1+1*16]");\ + asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+4*16]");\ + asm ("pxor xmm"tostr(b2)", xmm"tostr(a1)"");\ + asm ("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\ + \ + /* 8 */\ + asm ("pxor xmm"tostr(a5)", [TEMP_MUL1+5*16]");\ + asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+0*16]");\ + asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\ + asm ("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\ + \ + /* 9 */\ + asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+2*16]");\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+5*16]");\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ + \ + /* 10 */\ + asm ("movaps xmm"tostr(a1)", [TEMP_MUL1+6*16]");\ + asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+1*16]");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(a1)"");\ + asm ("pxor xmm"tostr(b4)", xmm"tostr(a1)"");\ + \ + /* 11 */\ + asm ("movaps xmm"tostr(a5)", [TEMP_MUL1+3*16]");\ + asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+6*16]");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ + asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\ + \ + /* 12 */\ + asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+7*16]");\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+2*16]");\ + asm ("pxor xmm"tostr(b2)", xmm"tostr(a3)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ + \ + /* 13 */\ + asm ("pxor xmm"tostr(b0)", [TEMP_MUL4]");\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\ + asm ("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\ + asm ("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\ + asm ("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\ + asm ("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\ +}/**/ + +//#if (LENGTH <= 256) + +#define SET_CONSTANTS(){\ + SET_SHARED_CONSTANTS();\ + ((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\ + ((u64*)SUBSH_MASK)[ 1] = 0x080f0e0d0c0b0a09ULL;\ + ((u64*)SUBSH_MASK)[ 2] = 0x0007060504030201ULL;\ + ((u64*)SUBSH_MASK)[ 3] = 0x0a09080f0e0d0c0bULL;\ + ((u64*)SUBSH_MASK)[ 4] = 0x0100070605040302ULL;\ + ((u64*)SUBSH_MASK)[ 5] = 0x0c0b0a09080f0e0dULL;\ + ((u64*)SUBSH_MASK)[ 6] = 0x0201000706050403ULL;\ + ((u64*)SUBSH_MASK)[ 7] = 0x0e0d0c0b0a09080fULL;\ + ((u64*)SUBSH_MASK)[ 8] = 0x0302010007060504ULL;\ + ((u64*)SUBSH_MASK)[ 9] = 0x0f0e0d0c0b0a0908ULL;\ + ((u64*)SUBSH_MASK)[10] = 0x0403020100070605ULL;\ + 
((u64*)SUBSH_MASK)[11] = 0x09080f0e0d0c0b0aULL;\ + ((u64*)SUBSH_MASK)[12] = 0x0504030201000706ULL;\ + ((u64*)SUBSH_MASK)[13] = 0x0b0a09080f0e0d0cULL;\ + ((u64*)SUBSH_MASK)[14] = 0x0605040302010007ULL;\ + ((u64*)SUBSH_MASK)[15] = 0x0d0c0b0a09080f0eULL;\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ + ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ + ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ + }\ + ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ +}/**/ + +#define Push_All_Regs(){\ +/* not using any... + asm("push rax");\ + asm("push rbx");\ + asm("push rcx");*/\ +}/**/ + +#define Pop_All_Regs(){\ +/* not using any... + asm("pop rcx");\ + asm("pop rbx");\ + asm("pop rax");*/\ +}/**/ + + +/* vperm: + * transformation before rounds with ipt + * first round add transformed constant + * middle rounds: add constant XOR 0x15...15 + * last round: additionally add 0x15...15 after MB + * transformation after rounds with opt + */ +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant + ShiftBytes (interleaved) */\ + asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ + asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ + asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\ + asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ + asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ + asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\ + asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ + asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ + asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ + asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ + asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\ + asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ + asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ + asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ + /* SubBytes + Multiplication by 2 and 4 */\ + VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +}/**/ + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ + ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ +} + + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: 
i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ +\ + asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ +\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ +\ + asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\ + asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\ +\ + asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ + asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ + asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ +\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\ +\ + asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\ + asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\ + asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\ + asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\ + asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ + asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\ + asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\ + asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\ + asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\ + asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\ + asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 
512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\ + asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\ + asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ +}/**/ + + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst_CNT2(i, j){\ + asm ("movaps xmm0, [ROUND_CONST_L0+"tostr(i)"*16]");\ + asm ("movaps xmm1, [ROUND_CONST_L7+"tostr(i)"*16]");\ + asm ("movaps xmm2, [ROUND_CONST_L0+"tostr(j)"*16]");\ + asm ("movaps xmm3, [ROUND_CONST_L7+"tostr(j)"*16]");\ + VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ + asm ("pxor xmm0, [ALL_15]");\ + asm ("pxor xmm1, [ALL_15]");\ + asm ("pxor xmm2, [ALL_15]");\ + asm ("pxor xmm3, [ALL_15]");\ + asm ("movaps [ROUND_CONST_L0+"tostr(i)"*16], xmm0");\ + asm ("movaps [ROUND_CONST_L7+"tostr(i)"*16], xmm1");\ + asm ("movaps [ROUND_CONST_L0+"tostr(j)"*16], xmm2");\ + asm ("movaps [ROUND_CONST_L7+"tostr(j)"*16], xmm3");\ +}/**/ + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst(){\ + asm ("movaps xmm0, [ROUND_CONST_Lx]");\ + VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ + asm ("pxor xmm0, [ALL_15]");\ + asm ("movaps [ROUND_CONST_Lx], xmm0");\ + VPERM_Transform_RoundConst_CNT2(0, 1);\ + VPERM_Transform_RoundConst_CNT2(2, 3);\ + VPERM_Transform_RoundConst_CNT2(4, 5);\ + VPERM_Transform_RoundConst_CNT2(6, 7);\ + VPERM_Transform_RoundConst_CNT2(8, 9);\ +}/**/ + +void INIT256(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + asm volatile ("emms"); + + /* transform round constants into VPERM mode */ + VPERM_Transform_RoundConst(); + + /* load IV into registers xmm12 - xmm15 */ + asm ("movaps xmm12, [rdi+0*16]"); + asm ("movaps xmm13, [rdi+1*16]"); + asm ("movaps xmm14, [rdi+2*16]"); + asm ("movaps xmm15, [rdi+3*16]"); + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* store transposed IV */ + asm ("movaps [rdi+0*16], xmm12"); + asm ("movaps [rdi+1*16], xmm2"); + asm ("movaps [rdi+2*16], xmm6"); + asm ("movaps [rdi+3*16], xmm7"); + + asm volatile ("emms"); + asm 
(".att_syntax noprefix"); +} + +void TF512(u64* h, u64* m) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + /* message M in rsi */ + +#ifdef IACA_TRACE + IACA_START; +#endif + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load message into registers xmm12 - xmm15 (Q = message) */ + asm ("movaps xmm12, [rsi+0*16]"); + asm ("movaps xmm13, [rsi+1*16]"); + asm ("movaps xmm14, [rsi+2*16]"); + asm ("movaps xmm15, [rsi+3*16]"); + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm0, [rdi+1*16]"); + asm ("movaps xmm4, [rdi+2*16]"); + asm ("movaps xmm5, [rdi+3*16]"); + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + asm ("pxor xmm8, xmm12"); + asm ("pxor xmm0, xmm2"); + asm ("pxor xmm4, xmm6"); + asm ("pxor xmm5, xmm7"); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + asm ("pxor xmm0, xmm8"); + asm ("pxor xmm1, xmm10"); + asm ("pxor xmm2, xmm12"); + asm ("pxor xmm3, xmm14"); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + asm ("pxor xmm0, [rdi+0*16]"); + asm ("pxor xmm1, [rdi+1*16]"); + asm ("pxor xmm2, [rdi+2*16]"); + asm ("pxor xmm3, [rdi+3*16]"); + + /* store CV */ + asm ("movaps [rdi+0*16], xmm0"); + asm ("movaps [rdi+1*16], xmm1"); + asm ("movaps [rdi+2*16], xmm2"); + asm ("movaps [rdi+3*16], xmm3"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + +#ifdef IACA_TRACE + IACA_END; +#endif + + return; +} + +void OF512(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm10, [rdi+1*16]"); + asm ("movaps xmm12, [rdi+2*16]"); + asm ("movaps xmm14, [rdi+3*16]"); + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm10, [rdi+1*16]"); + asm ("pxor xmm12, [rdi+2*16]"); + asm ("pxor xmm14, [rdi+3*16]"); + + /* transform state back from row ordering into column ordering */ 
+ /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); + VPERM_Transform(9, 11, VPERM_OPT, 0, 1, 2, 3, 5, 6, 7); + + /* we only need to return the truncated half of the state */ + asm ("movaps [rdi+2*16], xmm9"); + asm ("movaps [rdi+3*16], xmm11"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + + return; +} + + diff --git a/algo/aes_ni/groestl256-intr-aes.h b/algo/aes_ni/groestl256-intr-aes.h new file mode 100644 index 000000000..9ef6e1bc4 --- /dev/null +++ b/algo/aes_ni/groestl256-intr-aes.h @@ -0,0 +1,496 @@ +/* groestl-intr-aes.h Aug 2011 + * + * Groestl implementation with intrinsics using ssse3, sse4.1, and aes + * instructions. + * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz + * + * This code is placed in the public domain + */ + +#include +#include +#include "hash-groestl256.h" + +/* global constants */ +__m128i ROUND_CONST_Lx; +__m128i ROUND_CONST_L0[ROUNDS512]; +__m128i ROUND_CONST_L7[ROUNDS512]; +__m128i ROUND_CONST_P[ROUNDS1024]; +__m128i ROUND_CONST_Q[ROUNDS1024]; +__m128i TRANSP_MASK; +__m128i SUBSH_MASK[8]; +__m128i ALL_1B; +__m128i ALL_FF; + + +#define tos(a) #a +#define tostr(a) tos(a) + + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b */ +#define MUL2(i, j, k){\ + j = _mm_xor_si128(j, j);\ + j = _mm_cmpgt_epi8(j, i);\ + i = _mm_add_epi8(i, i);\ + j = _mm_and_si128(j, k);\ + i = _mm_xor_si128(i, j);\ +} + + /**/ + +/* Yet another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + b6 = a0;\ + b7 = a1;\ + a0 = _mm_xor_si128(a0, a1);\ + b0 = a2;\ + a1 = _mm_xor_si128(a1, a2);\ + b1 = a3;\ + a2 = _mm_xor_si128(a2, a3);\ + b2 = a4;\ + a3 = _mm_xor_si128(a3, a4);\ + b3 = a5;\ + a4 = _mm_xor_si128(a4, a5);\ + b4 = a6;\ + a5 = _mm_xor_si128(a5, a6);\ + b5 = a7;\ + a6 = _mm_xor_si128(a6, a7);\ + a7 = _mm_xor_si128(a7, b6);\ + \ + /* build y4 y5 y6 ... 
in regs xmm8, xmm9, xmm10 by adding t_i*/\ + b0 = _mm_xor_si128(b0, a4);\ + b6 = _mm_xor_si128(b6, a4);\ + b1 = _mm_xor_si128(b1, a5);\ + b7 = _mm_xor_si128(b7, a5);\ + b2 = _mm_xor_si128(b2, a6);\ + b0 = _mm_xor_si128(b0, a6);\ + /* spill values y_4, y_5 to memory */\ + TEMP0 = b0;\ + b3 = _mm_xor_si128(b3, a7);\ + b1 = _mm_xor_si128(b1, a7);\ + TEMP1 = b1;\ + b4 = _mm_xor_si128(b4, a0);\ + b2 = _mm_xor_si128(b2, a0);\ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b5 = _mm_xor_si128(b5, a1);\ + b3 = _mm_xor_si128(b3, a1);\ + b1 = a1;\ + b6 = _mm_xor_si128(b6, a2);\ + b4 = _mm_xor_si128(b4, a2);\ + TEMP2 = a2;\ + b7 = _mm_xor_si128(b7, a3);\ + b5 = _mm_xor_si128(b5, a3);\ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm_xor_si128(a0, a3);\ + a1 = _mm_xor_si128(a1, a4);\ + a2 = _mm_xor_si128(a2, a5);\ + a3 = _mm_xor_si128(a3, a6);\ + a4 = _mm_xor_si128(a4, a7);\ + a5 = _mm_xor_si128(a5, b0);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, TEMP2);\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = ALL_1B;\ + MUL2(a0, b0, b1);\ + a0 = _mm_xor_si128(a0, TEMP0);\ + MUL2(a1, b0, b1);\ + a1 = _mm_xor_si128(a1, TEMP1);\ + MUL2(a2, b0, b1);\ + a2 = _mm_xor_si128(a2, b2);\ + MUL2(a3, b0, b1);\ + a3 = _mm_xor_si128(a3, b3);\ + MUL2(a4, b0, b1);\ + a4 = _mm_xor_si128(a4, b4);\ + MUL2(a5, b0, b1);\ + a5 = _mm_xor_si128(a5, b5);\ + MUL2(a6, b0, b1);\ + a6 = _mm_xor_si128(a6, b6);\ + MUL2(a7, b0, b1);\ + a7 = _mm_xor_si128(a7, b7);\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... */\ + MUL2(a0, b0, b1);\ + b5 = _mm_xor_si128(b5, a0);\ + MUL2(a1, b0, b1);\ + b6 = _mm_xor_si128(b6, a1);\ + MUL2(a2, b0, b1);\ + b7 = _mm_xor_si128(b7, a2);\ + MUL2(a5, b0, b1);\ + b2 = _mm_xor_si128(b2, a5);\ + MUL2(a6, b0, b1);\ + b3 = _mm_xor_si128(b3, a6);\ + MUL2(a7, b0, b1);\ + b4 = _mm_xor_si128(b4, a7);\ + MUL2(a3, b0, b1);\ + MUL2(a4, b0, b1);\ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm_xor_si128(b0, a3);\ + b1 = _mm_xor_si128(b1, a4);\ +}/*MixBytes*/ + +#define SET_CONSTANTS(){\ + ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\ + SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\ + SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\ + SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\ + SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\ + SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\ + SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\ + SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ + }\ + ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ +}while(0); \ + +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant */\ + b1 = ROUND_CONST_Lx;\ + a0 = _mm_xor_si128(a0, 
(ROUND_CONST_L0[i]));\ + a1 = _mm_xor_si128(a1, b1);\ + a2 = _mm_xor_si128(a2, b1);\ + a3 = _mm_xor_si128(a3, b1);\ + a4 = _mm_xor_si128(a4, b1);\ + a5 = _mm_xor_si128(a5, b1);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ + \ + /* ShiftBytes + SubBytes (interleaved) */\ + b0 = _mm_xor_si128(b0, b0);\ + a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ + a0 = _mm_aesenclast_si128(a0, b0);\ + a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ + a1 = _mm_aesenclast_si128(a1, b0);\ + a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ + a2 = _mm_aesenclast_si128(a2, b0);\ + a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ + a3 = _mm_aesenclast_si128(a3, b0);\ + a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ + a4 = _mm_aesenclast_si128(a4, b0);\ + a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ + a5 = _mm_aesenclast_si128(a5, b0);\ + a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ + a6 = _mm_aesenclast_si128(a6, b0);\ + a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ + a7 = _mm_aesenclast_si128(a7, b0);\ + \ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ +} + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + t0 = TRANSP_MASK;\ + \ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ + \ + o1 = i0;\ + t0 = i2;\ + \ + i0 = _mm_unpacklo_epi16(i0, i1);\ + o1 = _mm_unpackhi_epi16(o1, i1);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + t0 = _mm_unpackhi_epi16(t0, i3);\ + \ + i0 = _mm_shuffle_epi32(i0, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + t0 = _mm_shuffle_epi32(t0, 216);\ + \ + o2 = i0;\ + o3 = o1;\ + \ + i0 = _mm_unpacklo_epi32(i0, i2);\ + o1 = _mm_unpacklo_epi32(o1, t0);\ + o2 = _mm_unpackhi_epi32(o2, i2);\ + o3 = _mm_unpackhi_epi32(o3, t0);\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + 
* * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + o1 = i0;\ + o2 = i1;\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + o1 = _mm_unpackhi_epi64(o1, i4);\ + o3 = i1;\ + o4 = i2;\ + o2 = _mm_unpacklo_epi64(o2, i5);\ + o3 = _mm_unpackhi_epi64(o3, i5);\ + o5 = i2;\ + o6 = i3;\ + o4 = _mm_unpacklo_epi64(o4, i6);\ + o5 = _mm_unpackhi_epi64(o5, i6);\ + o7 = i3;\ + o6 = _mm_unpacklo_epi64(o6, i7);\ + o7 = _mm_unpackhi_epi64(o7, i7);\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + o0 = i0;\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o0 = _mm_unpackhi_epi64(o0, i1);\ + o1 = i2;\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + o1 = _mm_unpackhi_epi64(o1, i3);\ + o2 = i4;\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + o2 = _mm_unpackhi_epi64(o2, i5);\ + o3 = i6;\ + i6 = _mm_unpacklo_epi64(i6, i7);\ + o3 = _mm_unpackhi_epi64(o3, i7);\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + t0 = _mm_xor_si128(t0, t0);\ + i1 = i0;\ + i3 = i2;\ + i5 = i4;\ + i7 = i6;\ + i0 = _mm_unpacklo_epi64(i0, t0);\ + i1 = _mm_unpackhi_epi64(i1, t0);\ + i2 = _mm_unpacklo_epi64(i2, t0);\ + i3 = _mm_unpackhi_epi64(i3, t0);\ + i4 = _mm_unpacklo_epi64(i4, t0);\ + i5 = _mm_unpackhi_epi64(i5, t0);\ + i6 = _mm_unpacklo_epi64(i6, t0);\ + i7 = _mm_unpackhi_epi64(i7, t0);\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ +}/**/ + + +void INIT256(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7; + static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm12 - xmm15 */ + xmm12 = chaining[0]; + xmm13 = chaining[1]; + xmm14 = chaining[2]; + xmm15 = chaining[3]; + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* store transposed IV */ + chaining[0] = xmm12; + chaining[1] = xmm2; + chaining[2] = xmm6; + chaining[3] = xmm7; +} + +void TF512(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm12 - xmm15 */ + xmm12 = message[0]; + xmm13 = message[1]; + xmm14 = message[2]; + xmm15 = 
message[3]; + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + xmm8 = chaining[0]; + xmm0 = chaining[1]; + xmm4 = chaining[2]; + xmm5 = chaining[3]; + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + xmm8 = _mm_xor_si128(xmm8, xmm12); + xmm0 = _mm_xor_si128(xmm0, xmm2); + xmm4 = _mm_xor_si128(xmm4, xmm6); + xmm5 = _mm_xor_si128(xmm5, xmm7); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, xmm8); + xmm1 = _mm_xor_si128(xmm1, xmm10); + xmm2 = _mm_xor_si128(xmm2, xmm12); + xmm3 = _mm_xor_si128(xmm3, xmm14); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, (chaining[0])); + xmm1 = _mm_xor_si128(xmm1, (chaining[1])); + xmm2 = _mm_xor_si128(xmm2, (chaining[2])); + xmm3 = _mm_xor_si128(xmm3, (chaining[3])); + + /* store CV */ + chaining[0] = xmm0; + chaining[1] = xmm1; + chaining[2] = xmm2; + chaining[3] = xmm3; + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF512(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + xmm8 = chaining[0]; + xmm10 = chaining[1]; + xmm12 = chaining[2]; + xmm14 = chaining[3]; + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm10 = _mm_xor_si128(xmm10, (chaining[1])); + xmm12 = _mm_xor_si128(xmm12, (chaining[2])); + xmm14 = _mm_xor_si128(xmm14, (chaining[3])); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); + + /* we only need to return the truncated half of the state */ + chaining[2] = xmm9; + chaining[3] = xmm11; +} + + diff --git 
a/algo/aes_ni/groestl256-intr-avx.h b/algo/aes_ni/groestl256-intr-avx.h new file mode 100644 index 000000000..3eb8397d9 --- /dev/null +++ b/algo/aes_ni/groestl256-intr-avx.h @@ -0,0 +1,482 @@ +/* groestl-intr-avx.h Aug 2011 + * + * Groestl implementation with intrinsics using ssse3, sse4.1, aes and avx + * instructions. + * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz + * + * This code is placed in the public domain + */ + +#include +#include +#include +#include "hash-groestl256.h" + +/* global constants */ +__m128i ROUND_CONST_Lx; +__m128i ROUND_CONST_L0[ROUNDS512]; +__m128i ROUND_CONST_L7[ROUNDS512]; +__m128i ROUND_CONST_P[ROUNDS1024]; +__m128i ROUND_CONST_Q[ROUNDS1024]; +__m128i TRANSP_MASK; +__m128i SUBSH_MASK[8]; +__m128i ALL_FF; +//#if LENGTH <= 256 +__m128i ALL_1B; +//#else +//__m256d ALL_1B; +//#endif + +#define tos(a) #a +#define tostr(a) tos(a) + +#define insert_m128i_in_m256d(ymm, xmm, pos) (_mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castpd_si256(ymm), xmm, pos))) +#define extract_m128i_from_m256d(ymm, pos) (_mm256_extractf128_si256(_mm256_castpd_si256(ymm), pos)) + +#define SET_CONSTANTS(){\ + ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\ + SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\ + SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\ + SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\ + SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\ + SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\ + SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\ + SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ + }\ + ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ +}while(0); + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b + * xmm[z] has to be zero */ +#define VMUL2(i, j, k, z){\ + j = _mm_cmpgt_epi8(z, i);\ + i = _mm_add_epi8(i, i);\ + j = _mm_and_si128(j, k);\ + i = _mm_xor_si128(i, j);\ +}/**/ + +/* Yet another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* xmm"tostr(8..xmm"tostr(15 = a2 a3... 
a0 a1 */\ + b0 = a2;\ + b1 = a3;\ + b2 = a4;\ + b3 = a5;\ + b4 = a6;\ + b5 = a7;\ + b6 = a0;\ + b7 = a1;\ + \ + /* t_i = a_i + a_{i+1} */\ + a0 = _mm_xor_si128(a0, a1);\ + a1 = _mm_xor_si128(a1, a2);\ + a2 = _mm_xor_si128(a2, a3);\ + a3 = _mm_xor_si128(a3, a4);\ + a4 = _mm_xor_si128(a4, a5);\ + a5 = _mm_xor_si128(a5, a6);\ + a6 = _mm_xor_si128(a6, a7);\ + a7 = _mm_xor_si128(a7, b6);\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + b0 = _mm_xor_si128(b0, a4);\ + b1 = _mm_xor_si128(b1, a5);\ + b2 = _mm_xor_si128(b2, a6);\ + b3 = _mm_xor_si128(b3, a7);\ + b4 = _mm_xor_si128(b4, a0);\ + b5 = _mm_xor_si128(b5, a1);\ + b6 = _mm_xor_si128(b6, a2);\ + b7 = _mm_xor_si128(b7, a3);\ + \ + b0 = _mm_xor_si128(b0, a6);\ + b1 = _mm_xor_si128(b1, a7);\ + b2 = _mm_xor_si128(b2, a0);\ + b3 = _mm_xor_si128(b3, a1);\ + b4 = _mm_xor_si128(b4, a2);\ + b5 = _mm_xor_si128(b5, a3);\ + b6 = _mm_xor_si128(b6, a4);\ + b7 = _mm_xor_si128(b7, a5);\ + \ + /* spill values y_4, y_5 to memory */\ + TEMP0 = b0;\ + TEMP1 = b1;\ + TEMP2 = b2;\ + \ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b1 = a1;\ + TEMP3 = a2;\ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm_xor_si128(a0, a3);\ + a1 = _mm_xor_si128(a1, a4);\ + a2 = _mm_xor_si128(a2, a5);\ + a3 = _mm_xor_si128(a3, a6);\ + a4 = _mm_xor_si128(a4, a7);\ + a5 = _mm_xor_si128(a5, b0);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, TEMP3);\ + \ + /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + b1 = ALL_1B;\ + b2 = _mm_xor_si128(b2, b2);\ + VMUL2(a7, b0, b1, b2);\ + VMUL2(a6, b0, b1, b2);\ + VMUL2(a5, b0, b1, b2);\ + VMUL2(a4, b0, b1, b2);\ + VMUL2(a3, b0, b1, b2);\ + VMUL2(a2, b0, b1, b2);\ + VMUL2(a1, b0, b1, b2);\ + VMUL2(a0, b0, b1, b2);\ + \ + /* compute w_i : add y_{i+4} */\ + a0 = _mm_xor_si128(a0, TEMP0);\ + a1 = _mm_xor_si128(a1, TEMP1);\ + a2 = _mm_xor_si128(a2, TEMP2);\ + a3 = _mm_xor_si128(a3, b3);\ + a4 = _mm_xor_si128(a4, b4);\ + a5 = _mm_xor_si128(a5, b5);\ + a6 = _mm_xor_si128(a6, b6);\ + a7 = _mm_xor_si128(a7, b7);\ + \ + /*compute v_i: double w_i */\ + VMUL2(a0, b0, b1, b2);\ + VMUL2(a1, b0, b1, b2);\ + VMUL2(a2, b0, b1, b2);\ + VMUL2(a3, b0, b1, b2);\ + VMUL2(a4, b0, b1, b2);\ + VMUL2(a5, b0, b1, b2);\ + VMUL2(a6, b0, b1, b2);\ + VMUL2(a7, b0, b1, b2);\ + \ + /* add to y_4 y_5 .. v3, v4, ... 
*/\ + b0 = _mm_xor_si128(a3, TEMP0);\ + b1 = _mm_xor_si128(a4, TEMP1);\ + b2 = _mm_xor_si128(a5, TEMP2);\ + b3 = _mm_xor_si128(b3, a6);\ + b4 = _mm_xor_si128(b4, a7);\ + b5 = _mm_xor_si128(b5, a0);\ + b6 = _mm_xor_si128(b6, a1);\ + b7 = _mm_xor_si128(b7, a2);\ +}/*MixBytes*/ + +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* Add Round Constant */\ + b1 = ROUND_CONST_Lx;\ + a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ + a1 = _mm_xor_si128(a1, b1);\ + a2 = _mm_xor_si128(a2, b1);\ + a3 = _mm_xor_si128(a3, b1);\ + a4 = _mm_xor_si128(a4, b1);\ + a5 = _mm_xor_si128(a5, b1);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ + \ + /* ShiftBytes + SubBytes (interleaved) */\ + b0 = _mm_xor_si128(b0, b0);\ + a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ + a0 = _mm_aesenclast_si128(a0, b0);\ + a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ + a1 = _mm_aesenclast_si128(a1, b0);\ + a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ + a2 = _mm_aesenclast_si128(a2, b0);\ + a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ + a3 = _mm_aesenclast_si128(a3, b0);\ + a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ + a4 = _mm_aesenclast_si128(a4, b0);\ + a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ + a5 = _mm_aesenclast_si128(a5, b0);\ + a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ + a6 = _mm_aesenclast_si128(a6, b0);\ + a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ + a7 = _mm_aesenclast_si128(a7, b0);\ + \ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ +} + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + t0 = TRANSP_MASK;\ + \ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ + \ + o1 = _mm_unpackhi_epi16(i0, i1);\ + i0 = _mm_unpacklo_epi16(i0, i1);\ + t0 = _mm_unpackhi_epi16(i2, i3);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + \ + i0 = _mm_shuffle_epi32(i0, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + t0 
= _mm_shuffle_epi32(t0, 216);\ + \ + o2 = _mm_unpackhi_epi32(i0, i2);\ + o3 = _mm_unpackhi_epi32(o1, t0);\ + i0 = _mm_unpacklo_epi32(i0, i2);\ + o1 = _mm_unpacklo_epi32(o1, t0);\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + o1 = _mm_unpackhi_epi64(i0, i4);\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + o2 = _mm_unpacklo_epi64(i1, i5);\ + o3 = _mm_unpackhi_epi64(i1, i5);\ + o4 = _mm_unpacklo_epi64(i2, i6);\ + o5 = _mm_unpackhi_epi64(i2, i6);\ + o6 = _mm_unpacklo_epi64(i3, i7);\ + o7 = _mm_unpackhi_epi64(i3, i7);\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + o0 = _mm_unpackhi_epi64(i0, i1);\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o1 = _mm_unpackhi_epi64(i2, i3);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + o2 = _mm_unpackhi_epi64(i4, i5);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + o3 = _mm_unpackhi_epi64(i6, i7);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + t0 = _mm_xor_si128(t0, t0);\ + i1 = _mm_unpackhi_epi64(i0, t0);\ + i0 = _mm_unpacklo_epi64(i0, t0);\ + i3 = _mm_unpackhi_epi64(i2, t0);\ + i2 = _mm_unpacklo_epi64(i2, t0);\ + i5 = _mm_unpackhi_epi64(i4, t0);\ + i4 = _mm_unpacklo_epi64(i4, t0);\ + i7 = _mm_unpackhi_epi64(i6, t0);\ + i6 = _mm_unpacklo_epi64(i6, t0);\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ +}/**/ + + +void INIT256(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7; + static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm12 - xmm15 */ + xmm12 = chaining[0]; + xmm13 = chaining[1]; + xmm14 = chaining[2]; + xmm15 = chaining[3]; + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* store transposed IV */ + chaining[0] = xmm12; + chaining[1] = xmm2; + chaining[2] = xmm6; + chaining[3] = xmm7; +} + +void TF512(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, 
xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + static __m128i TEMP3; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm12 - xmm15 */ + xmm12 = message[0]; + xmm13 = message[1]; + xmm14 = message[2]; + xmm15 = message[3]; + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* load previous chaining value and xor message to CV to get input of P */ + /* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + xmm8 = _mm_xor_si128(xmm12, chaining[0]); + xmm0 = _mm_xor_si128(xmm2, chaining[1]); + xmm4 = _mm_xor_si128(xmm6, chaining[2]); + xmm5 = _mm_xor_si128(xmm7, chaining[3]); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, xmm8); + xmm1 = _mm_xor_si128(xmm1, xmm10); + xmm2 = _mm_xor_si128(xmm2, xmm12); + xmm3 = _mm_xor_si128(xmm3, xmm14); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, chaining[0]); + xmm1 = _mm_xor_si128(xmm1, chaining[1]); + xmm2 = _mm_xor_si128(xmm2, chaining[2]); + xmm3 = _mm_xor_si128(xmm3, chaining[3]); + + /* store CV */ + chaining[0] = xmm0; + chaining[1] = xmm1; + chaining[2] = xmm2; + chaining[3] = xmm3; + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF512(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + static __m128i TEMP3; + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + xmm8 = chaining[0]; + xmm10 = chaining[1]; + xmm12 = chaining[2]; + xmm14 = chaining[3]; + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm10 = _mm_xor_si128(xmm10, (chaining[1])); + xmm12 = _mm_xor_si128(xmm12, (chaining[2])); + xmm14 = _mm_xor_si128(xmm14, (chaining[3])); + + /* transform state back from row ordering into column ordering */ + /* result: final 
hash value in xmm9, xmm11 */ + Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); + + /* we only need to return the truncated half of the state */ + chaining[2] = xmm9; + chaining[3] = xmm11; +} + + diff --git a/algo/aes_ni/groestl256-intr-vperm.h b/algo/aes_ni/groestl256-intr-vperm.h new file mode 100644 index 000000000..f6baa17e6 --- /dev/null +++ b/algo/aes_ni/groestl256-intr-vperm.h @@ -0,0 +1,793 @@ +/* groestl-intr-vperm.h Aug 2011 + * + * Groestl implementation with intrinsics using ssse3 instructions. + * Author: Günther A. Roland, Martin Schläffer + * + * Based on the vperm and aes_ni implementations of the hash function Groestl + * by Cagdas Calik http://www.metu.edu.tr/~ccalik/ + * Institute of Applied Mathematics, Middle East Technical University, Turkey + * + * This code is placed in the public domain + */ + +#include +#include "hash-groestl256.h" + +/* global constants */ +__m128i ROUND_CONST_Lx; +__m128i ROUND_CONST_L0[ROUNDS512]; +__m128i ROUND_CONST_L7[ROUNDS512]; +__m128i ROUND_CONST_P[ROUNDS1024]; +__m128i ROUND_CONST_Q[ROUNDS1024]; +__m128i TRANSP_MASK; +__m128i SUBSH_MASK[8]; +__m128i ALL_0F; +__m128i ALL_15; +__m128i ALL_1B; +__m128i ALL_63; +__m128i ALL_FF; +__m128i VPERM_IPT[2]; +__m128i VPERM_OPT[2]; +__m128i VPERM_INV[2]; +__m128i VPERM_SB1[2]; +__m128i VPERM_SB2[2]; +__m128i VPERM_SB4[2]; +__m128i VPERM_SBO[2]; + + +#define tos(a) #a +#define tostr(a) tos(a) + +#define SET_SHARED_CONSTANTS(){\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + ALL_63 = _mm_set_epi32(0x63636363, 0x63636363, 0x63636363, 0x63636363);\ + ALL_0F = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);\ + ALL_15 = _mm_set_epi32(0x15151515, 0x15151515, 0x15151515, 0x15151515);\ + VPERM_IPT[0] = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);\ + VPERM_IPT[1] = _mm_set_epi32(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);\ + VPERM_OPT[0] = _mm_set_epi32(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);\ + VPERM_OPT[1] = _mm_set_epi32(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);\ + VPERM_INV[0] = _mm_set_epi32(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);\ + VPERM_INV[1] = _mm_set_epi32(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);\ + VPERM_SB1[0] = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);\ + VPERM_SB1[1] = _mm_set_epi32(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);\ + VPERM_SB2[0] = _mm_set_epi32(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);\ + VPERM_SB2[1] = _mm_set_epi32(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);\ + VPERM_SB4[0] = _mm_set_epi32(0xBA44FE79, 0x876D2914, 0x3D50AED7, 0xC393EA00);\ + VPERM_SB4[1] = _mm_set_epi32(0xA876DE97, 0x49087E9F, 0xE1E937A0, 0x3FD64100);\ +}/**/ + +/* VPERM + * Transform w/o settings c* + * transforms 2 rows to/from "vperm mode" + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0, a1 = 2 rows + * table = transformation table to use + * t*, c* = clobbers + * outputs: + * a0, a1 = 2 rows transformed with table + * */ +#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\ + t0 = c0;\ + t1 = c0;\ + t0 = _mm_andnot_si128(t0, a0);\ + t1 = _mm_andnot_si128(t1, a1);\ + t0 = _mm_srli_epi32(t0, 4);\ + t1 = _mm_srli_epi32(t1, 4);\ + a0 = _mm_and_si128(a0, c0);\ + a1 = _mm_and_si128(a1, c0);\ + t2 = c2;\ + t3 = c2;\ + t2 = _mm_shuffle_epi8(t2, a0);\ + t3 = 
_mm_shuffle_epi8(t3, a1);\ + a0 = c1;\ + a1 = c1;\ + a0 = _mm_shuffle_epi8(a0, t0);\ + a1 = _mm_shuffle_epi8(a1, t1);\ + a0 = _mm_xor_si128(a0, t2);\ + a1 = _mm_xor_si128(a1, t3);\ +}/**/ + +#define VPERM_Transform_Set_Const(table, c0, c1, c2){\ + c0 = ALL_0F;\ + c1 = ((__m128i*) table )[0];\ + c2 = ((__m128i*) table )[1];\ +}/**/ + +/* VPERM + * Transform + * transforms 2 rows to/from "vperm mode" + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0, a1 = 2 rows + * table = transformation table to use + * t*, c* = clobbers + * outputs: + * a0, a1 = 2 rows transformed with table + * */ +#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\ + VPERM_Transform_Set_Const(table, c0, c1, c2);\ + VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ +}/**/ + +/* VPERM + * Transform State + * inputs: + * a0-a3 = state + * table = transformation table to use + * t* = clobbers + * outputs: + * a0-a3 = transformed state + * */ +#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\ + VPERM_Transform_Set_Const(table, c0, c1, c2);\ + VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ + VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\ +}/**/ + +/* VPERM + * Add Constant to State + * inputs: + * a0-a7 = state + * constant = constant to add + * t0 = clobber + * outputs: + * a0-a7 = state + constant + * */ +#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\ + t0 = constant;\ + a0 = _mm_xor_si128(a0, t0);\ + a1 = _mm_xor_si128(a1, t0);\ + a2 = _mm_xor_si128(a2, t0);\ + a3 = _mm_xor_si128(a3, t0);\ + a4 = _mm_xor_si128(a4, t0);\ + a5 = _mm_xor_si128(a5, t0);\ + a6 = _mm_xor_si128(a6, t0);\ + a7 = _mm_xor_si128(a7, t0);\ +}/**/ + +/* VPERM + * Set Substitute Core Constants + * */ +#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\ + VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\ +}/**/ + +/* VPERM + * Substitute Core + * first part of sbox inverse computation + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0 = 1 row + * t*, c* = clobbers + * outputs: + * b0a, b0b = inputs for lookup step + * */ +#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\ + t0 = c0;\ + t0 = _mm_andnot_si128(t0, a0);\ + t0 = _mm_srli_epi32(t0, 4);\ + a0 = _mm_and_si128(a0, c0);\ + b0a = c1;\ + b0a = _mm_shuffle_epi8(b0a, a0);\ + a0 = _mm_xor_si128(a0, t0);\ + b0b = c2;\ + b0b = _mm_shuffle_epi8(b0b, t0);\ + b0b = _mm_xor_si128(b0b, b0a);\ + t1 = c2;\ + t1 = _mm_shuffle_epi8(t1, a0);\ + t1 = _mm_xor_si128(t1, b0a);\ + b0a = c2;\ + b0a = _mm_shuffle_epi8(b0a, b0b);\ + b0a = _mm_xor_si128(b0a, a0);\ + b0b = c2;\ + b0b = _mm_shuffle_epi8(b0b, t1);\ + b0b = _mm_xor_si128(b0b, t0);\ +}/**/ + +/* VPERM + * Lookup + * second part of sbox inverse computation + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0a, a0b = output of Substitution Core + * table = lookup table to use (*1 / *2 / *4) + * t0 = clobber + * outputs: + * b0 = output of sbox + multiplication + * */ +#define VPERM_Lookup(a0a, a0b, table, b0, t0){\ + b0 = ((__m128i*) table )[0];\ + t0 = ((__m128i*) table )[1];\ + b0 = _mm_shuffle_epi8(b0, a0b);\ + t0 = _mm_shuffle_epi8(t0, a0a);\ + b0 = _mm_xor_si128(b0, t0);\ +}/**/ + +/* VPERM + * SubBytes and *2 / *4 + * this function is derived from: + * Constant-time SSSE3 
AES core implementation + * by Mike Hamburg + * and + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0-a7 = state + * t*, c* = clobbers + * outputs: + * a0-a7 = state * 4 + * c2 = row0 * 2 -> b0 + * c1 = row7 * 2 -> b3 + * c0 = row7 * 1 -> b4 + * t2 = row4 * 1 -> b7 + * TEMP_MUL1 = row(i) * 1 + * TEMP_MUL2 = row(i) * 2 + * + * call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */ +#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\ + /* set Constants */\ + VPERM_Substitute_Core_Set_Const(c0, c1, c2);\ + /* row 1 */\ + VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[1] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[1] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\ + /* --- */\ + /* row 2 */\ + VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[2] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[2] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\ + /* --- */\ + /* row 3 */\ + VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[3] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[3] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\ + /* --- */\ + /* row 5 */\ + VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[5] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[5] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\ + /* --- */\ + /* row 6 */\ + VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[6] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[6] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\ + /* --- */\ + /* row 7 */\ + VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[7] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\ + VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\ + /* --- */\ + /* row 4 */\ + VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[4] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\ + /* --- */\ + /* row 0 */\ + VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\ + VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\ + TEMP_MUL2[0] = c2;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\ + /* --- */\ +}/**/ + + +/* Optimized MixBytes + * inputs: + * a0-a7 = (row0-row7) * 4 + * b0 = row0 * 2 + * b3 = row7 * 2 + * b4 = row7 * 1 + * b7 = row4 * 1 + * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2 + * output: b0-b7 + * */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* save one value */\ + TEMP_MUL4 = a3;\ + /* 1 */\ + b1 = a0;\ + b1 = _mm_xor_si128(b1, a5);\ + b1 = _mm_xor_si128(b1, b4); /* -> helper! */\ + b1 = _mm_xor_si128(b1, (TEMP_MUL2[3]));\ + b2 = b1;\ + \ + /* 2 */\ + b5 = a1;\ + b5 = _mm_xor_si128(b5, a4);\ + b5 = _mm_xor_si128(b5, b7); /* -> helper! */\ + b5 = _mm_xor_si128(b5, b3); /* -> helper! */\ + b6 = b5;\ + \ + /* 4 */\ + b7 = _mm_xor_si128(b7, a6);\ + /*b7 = _mm_xor_si128(b7, (TEMP_MUL1[4])); -> helper! 
*/\ + b7 = _mm_xor_si128(b7, (TEMP_MUL1[6]));\ + b7 = _mm_xor_si128(b7, (TEMP_MUL2[1]));\ + b7 = _mm_xor_si128(b7, b3); /* -> helper! */\ + b2 = _mm_xor_si128(b2, b7);\ + \ + /* 3 */\ + b0 = _mm_xor_si128(b0, a7);\ + b0 = _mm_xor_si128(b0, (TEMP_MUL1[5]));\ + b0 = _mm_xor_si128(b0, (TEMP_MUL1[7]));\ + /*b0 = _mm_xor_si128(b0, (TEMP_MUL2[0])); -> helper! */\ + b0 = _mm_xor_si128(b0, (TEMP_MUL2[2]));\ + b3 = b0;\ + b1 = _mm_xor_si128(b1, b0);\ + b0 = _mm_xor_si128(b0, b7); /* moved from 4 */\ + \ + /* 5 */\ + b4 = _mm_xor_si128(b4, a2);\ + /*b4 = _mm_xor_si128(b4, (TEMP_MUL1[0])); -> helper! */\ + b4 = _mm_xor_si128(b4, (TEMP_MUL1[2]));\ + b4 = _mm_xor_si128(b4, (TEMP_MUL2[3]));\ + b4 = _mm_xor_si128(b4, (TEMP_MUL2[5]));\ + b3 = _mm_xor_si128(b3, b4);\ + b6 = _mm_xor_si128(b6, b4);\ + \ + /* 6 */\ + a3 = _mm_xor_si128(a3, (TEMP_MUL1[1]));\ + a3 = _mm_xor_si128(a3, (TEMP_MUL1[3]));\ + a3 = _mm_xor_si128(a3, (TEMP_MUL2[4]));\ + a3 = _mm_xor_si128(a3, (TEMP_MUL2[6]));\ + b4 = _mm_xor_si128(b4, a3);\ + b5 = _mm_xor_si128(b5, a3);\ + b7 = _mm_xor_si128(b7, a3);\ + \ + /* 7 */\ + a1 = _mm_xor_si128(a1, (TEMP_MUL1[1]));\ + a1 = _mm_xor_si128(a1, (TEMP_MUL2[4]));\ + b2 = _mm_xor_si128(b2, a1);\ + b3 = _mm_xor_si128(b3, a1);\ + \ + /* 8 */\ + a5 = _mm_xor_si128(a5, (TEMP_MUL1[5]));\ + a5 = _mm_xor_si128(a5, (TEMP_MUL2[0]));\ + b6 = _mm_xor_si128(b6, a5);\ + b7 = _mm_xor_si128(b7, a5);\ + \ + /* 9 */\ + a3 = TEMP_MUL1[2];\ + a3 = _mm_xor_si128(a3, (TEMP_MUL2[5]));\ + b0 = _mm_xor_si128(b0, a3);\ + b5 = _mm_xor_si128(b5, a3);\ + \ + /* 10 */\ + a1 = TEMP_MUL1[6];\ + a1 = _mm_xor_si128(a1, (TEMP_MUL2[1]));\ + b1 = _mm_xor_si128(b1, a1);\ + b4 = _mm_xor_si128(b4, a1);\ + \ + /* 11 */\ + a5 = TEMP_MUL1[3];\ + a5 = _mm_xor_si128(a5, (TEMP_MUL2[6]));\ + b1 = _mm_xor_si128(b1, a5);\ + b6 = _mm_xor_si128(b6, a5);\ + \ + /* 12 */\ + a3 = TEMP_MUL1[7];\ + a3 = _mm_xor_si128(a3, (TEMP_MUL2[2]));\ + b2 = _mm_xor_si128(b2, a3);\ + b5 = _mm_xor_si128(b5, a3);\ + \ + /* 13 */\ + b0 = _mm_xor_si128(b0, (TEMP_MUL4));\ + b0 = _mm_xor_si128(b0, a4);\ + b1 = _mm_xor_si128(b1, a4);\ + b3 = _mm_xor_si128(b3, a6);\ + b4 = _mm_xor_si128(b4, a0);\ + b4 = _mm_xor_si128(b4, a7);\ + b5 = _mm_xor_si128(b5, a0);\ + b7 = _mm_xor_si128(b7, a2);\ +}/**/ + +#define SET_CONSTANTS(){\ + SET_SHARED_CONSTANTS();\ + SUBSH_MASK[0] = _mm_set_epi32(0x080f0e0d, 0x0c0b0a09, 0x07060504, 0x03020100);\ + SUBSH_MASK[1] = _mm_set_epi32(0x0a09080f, 0x0e0d0c0b, 0x00070605, 0x04030201);\ + SUBSH_MASK[2] = _mm_set_epi32(0x0c0b0a09, 0x080f0e0d, 0x01000706, 0x05040302);\ + SUBSH_MASK[3] = _mm_set_epi32(0x0e0d0c0b, 0x0a09080f, 0x02010007, 0x06050403);\ + SUBSH_MASK[4] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x03020100, 0x07060504);\ + SUBSH_MASK[5] = _mm_set_epi32(0x09080f0e, 0x0d0c0b0a, 0x04030201, 0x00070605);\ + SUBSH_MASK[6] = _mm_set_epi32(0x0b0a0908, 0x0f0e0d0c, 0x05040302, 0x01000706);\ + SUBSH_MASK[7] = _mm_set_epi32(0x0d0c0b0a, 0x09080f0e, 0x06050403, 0x02010007);\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ + }\ + ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ +}/**/ + +/* vperm: + * transformation before rounds with ipt + * first round add transformed constant + * middle rounds: add constant XOR 0x15...15 + * last round: additionally add 0x15...15 after MB + * 
transformation after rounds with opt + */ +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant + ShiftBytes (interleaved) */\ + b1 = ROUND_CONST_Lx;\ + a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ + a1 = _mm_xor_si128(a1, b1);\ + a2 = _mm_xor_si128(a2, b1);\ + a3 = _mm_xor_si128(a3, b1);\ + a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ + a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ + a4 = _mm_xor_si128(a4, b1);\ + a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ + a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ + a5 = _mm_xor_si128(a5, b1);\ + a6 = _mm_xor_si128(a6, b1);\ + a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ + a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ + a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ + a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ + a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ + /* SubBytes + Multiplication by 2 and 4 */\ + VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +}/**/ + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\ + ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\ +} + + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + t0 = TRANSP_MASK;\ +\ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ +\ + o1 = i0;\ + t0 = i2;\ +\ + i0 = _mm_unpacklo_epi16(i0, i1);\ + o1 = _mm_unpackhi_epi16(o1, i1);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + t0 = _mm_unpackhi_epi16(t0, i3);\ +\ + i0 = _mm_shuffle_epi32(i0, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + t0 = _mm_shuffle_epi32(t0, 216);\ +\ + o2 = i0;\ + o3 = o1;\ +\ + i0 = _mm_unpacklo_epi32(i0, i2);\ + o1 = _mm_unpacklo_epi32(o1, t0);\ + o2 = _mm_unpackhi_epi32(o2, i2);\ + o3 = _mm_unpackhi_epi32(o3, t0);\ +}/**/ + +/* Matrix Transpose Step 2 + * input 
are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + o1 = i0;\ + o2 = i1;\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + o1 = _mm_unpackhi_epi64(o1, i4);\ + o3 = i1;\ + o4 = i2;\ + o2 = _mm_unpacklo_epi64(o2, i5);\ + o3 = _mm_unpackhi_epi64(o3, i5);\ + o5 = i2;\ + o6 = i3;\ + o4 = _mm_unpacklo_epi64(o4, i6);\ + o5 = _mm_unpackhi_epi64(o5, i6);\ + o7 = i3;\ + o6 = _mm_unpacklo_epi64(o6, i7);\ + o7 = _mm_unpackhi_epi64(o7, i7);\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + o0 = i0;\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o0 = _mm_unpackhi_epi64(o0, i1);\ + o1 = i2;\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + o1 = _mm_unpackhi_epi64(o1, i3);\ + o2 = i4;\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + o2 = _mm_unpackhi_epi64(o2, i5);\ + o3 = i6;\ + i6 = _mm_unpacklo_epi64(i6, i7);\ + o3 = _mm_unpackhi_epi64(o3, i7);\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + t0 = _mm_xor_si128(t0, t0);\ + i1 = i0;\ + i3 = i2;\ + i5 = i4;\ + i7 = i6;\ + i0 = _mm_unpacklo_epi64(i0, t0);\ + i1 = _mm_unpackhi_epi64(i1, t0);\ + i2 = _mm_unpacklo_epi64(i2, t0);\ + i3 = _mm_unpackhi_epi64(i3, t0);\ + i4 = _mm_unpacklo_epi64(i4, t0);\ + i5 = _mm_unpackhi_epi64(i5, t0);\ + i6 = _mm_unpacklo_epi64(i6, t0);\ + i7 = _mm_unpackhi_epi64(i7, t0);\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ +}/**/ + + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst_CNT2(i, j){\ + xmm0 = ROUND_CONST_L0[i];\ + xmm1 = ROUND_CONST_L7[i];\ + xmm2 = ROUND_CONST_L0[j];\ + xmm3 = ROUND_CONST_L7[j];\ + VPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ + xmm0 = _mm_xor_si128(xmm0, (ALL_15));\ + xmm1 = _mm_xor_si128(xmm1, (ALL_15));\ + xmm2 = _mm_xor_si128(xmm2, (ALL_15));\ + xmm3 = _mm_xor_si128(xmm3, (ALL_15));\ + ROUND_CONST_L0[i] = xmm0;\ + ROUND_CONST_L7[i] = xmm1;\ + ROUND_CONST_L0[j] = xmm2;\ + ROUND_CONST_L7[j] = xmm3;\ +}/**/ + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst(){\ + xmm0 = ROUND_CONST_Lx;\ + VPERM_Transform(xmm0, xmm1, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ + xmm0 = _mm_xor_si128(xmm0, (ALL_15));\ + ROUND_CONST_Lx = xmm0;\ + VPERM_Transform_RoundConst_CNT2(0, 1);\ + 
VPERM_Transform_RoundConst_CNT2(2, 3);\ + VPERM_Transform_RoundConst_CNT2(4, 5);\ + VPERM_Transform_RoundConst_CNT2(6, 7);\ + VPERM_Transform_RoundConst_CNT2(8, 9);\ +}/**/ + +void INIT256(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, /*xmm11,*/ xmm12, xmm13, xmm14, xmm15; + + /* transform round constants into VPERM mode */ + VPERM_Transform_RoundConst(); + + /* load IV into registers xmm12 - xmm15 */ + xmm12 = chaining[0]; + xmm13 = chaining[1]; + xmm14 = chaining[2]; + xmm15 = chaining[3]; + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* store transposed IV */ + chaining[0] = xmm12; + chaining[1] = xmm2; + chaining[2] = xmm6; + chaining[3] = xmm7; +} + +void TF512(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP_MUL1[8]; + static __m128i TEMP_MUL2[8]; + static __m128i TEMP_MUL4; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm12 - xmm15 */ + xmm12 = message[0]; + xmm13 = message[1]; + xmm14 = message[2]; + xmm15 = message[3]; + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + xmm8 = chaining[0]; + xmm0 = chaining[1]; + xmm4 = chaining[2]; + xmm5 = chaining[3]; + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + xmm8 = _mm_xor_si128(xmm8, xmm12); + xmm0 = _mm_xor_si128(xmm0, xmm2); + xmm4 = _mm_xor_si128(xmm4, xmm6); + xmm5 = _mm_xor_si128(xmm5, xmm7); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, xmm8); + xmm1 = _mm_xor_si128(xmm1, xmm10); + xmm2 = _mm_xor_si128(xmm2, xmm12); + xmm3 = _mm_xor_si128(xmm3, xmm14); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, (chaining[0])); + xmm1 = _mm_xor_si128(xmm1, (chaining[1])); + xmm2 = _mm_xor_si128(xmm2, (chaining[2])); + xmm3 = _mm_xor_si128(xmm3, (chaining[3])); + + /* store CV */ + chaining[0] = xmm0; + chaining[1] = xmm1; + chaining[2] = xmm2; + chaining[3] = xmm3; + +#ifdef 
IACA_TRACE + IACA_END; +#endif + + return; +} + +void OF512(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP_MUL1[8]; + static __m128i TEMP_MUL2[8]; + static __m128i TEMP_MUL4; + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + xmm8 = chaining[0]; + xmm10 = chaining[1]; + xmm12 = chaining[2]; + xmm14 = chaining[3]; + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm10 = _mm_xor_si128(xmm10, (chaining[1])); + xmm12 = _mm_xor_si128(xmm12, (chaining[2])); + xmm14 = _mm_xor_si128(xmm14, (chaining[3])); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); + VPERM_Transform(xmm9, xmm11, VPERM_OPT, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); + + /* we only need to return the truncated half of the state */ + chaining[2] = xmm9; + chaining[3] = xmm11; + + return; +}//OF512() + + + diff --git a/algo/aes_ni/hash-groestl.c b/algo/aes_ni/hash-groestl.c new file mode 100644 index 000000000..47c6a1276 --- /dev/null +++ b/algo/aes_ni/hash-groestl.c @@ -0,0 +1,306 @@ +/* hash.c Aug 2011 + * + * Groestl implementation for different versions. + * Author: Krystian Matusiewicz, Günther A. 
Roland, Martin Schläffer + * + * This code is placed in the public domain + */ + +#include "hash-groestl.h" +#include "miner.h" + +#ifndef NO_AES_NI + +#include "groestl-version.h" + +#ifdef TASM + #ifdef VAES + #include "groestl-asm-aes.h" + #else + #ifdef VAVX + #include "groestl-asm-avx.h" + #else + #ifdef VVPERM + #include "groestl-asm-vperm.h" + #else + #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) + #endif + #endif + #endif +#else + #ifdef TINTR + #ifdef VAES + #include "groestl-intr-aes.h" + #else + #ifdef VAVX + #include "groestl-intr-avx.h" + #else + #ifdef VVPERM + #include "groestl-intr-vperm.h" + #else + #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) + #endif + #endif + #endif + #else + #error NO TYPE SPECIFIED (-DT[ASM/INTR]) + #endif +#endif + + +/* digest up to len bytes of input (full blocks only) */ +void Transform(hashState_groestl *ctx, + const u8 *in, + unsigned long long len) { + /* increment block counter */ + ctx->block_counter += len/SIZE; + + /* digest message, one block at a time */ + for (; len >= SIZE; len -= SIZE, in += SIZE) +#if LENGTH<=256 + TF512((u64*)ctx->chaining, (u64*)in); +#else + TF1024((u64*)ctx->chaining, (u64*)in); +#endif + + asm volatile ("emms"); +} + +/* given state h, do h <- P(h)+h */ +void OutputTransformation(hashState_groestl *ctx) { + /* determine variant */ +#if (LENGTH <= 256) + OF512((u64*)ctx->chaining); +#else + OF1024((u64*)ctx->chaining); +#endif + + asm volatile ("emms"); +} + +/* initialise context */ +HashReturn_gr init_groestl(hashState_groestl* ctx) { + u8 i = 0; + /* output size (in bits) must be a positive integer less than or + equal to 512, and divisible by 8 */ + if (LENGTH <= 0 || (LENGTH%8) || LENGTH > 512) + return BAD_HASHBITLEN_GR; + + /* set number of state columns and state size depending on + variant */ + ctx->columns = COLS; + ctx->statesize = SIZE; +#if (LENGTH <= 256) + ctx->v = SHoRT; +#else + ctx->v = LoNG; +#endif + + SET_CONSTANTS(); + + for (i=0; i<SIZE/8; i++) ctx->chaining[i] = 0; + for (i=0; i<SIZE; i++) ctx->buffer[i] = 0; + + if (ctx->chaining == NULL || ctx->buffer == NULL) + return FAIL_GR; + + /* set initial value */ + ctx->chaining[ctx->columns-1] = U64BIG((u64)LENGTH); + + INIT(ctx->chaining); + + /* set other variables */ + ctx->buf_ptr = 0; + ctx->block_counter = 0; + ctx->bits_in_last_byte = 0; + + return SUCCESS_GR; +} + + +HashReturn_gr reinit_groestl(hashState_groestl* ctx) + { + int i; + for (i=0; i<SIZE/8; i++) ctx->chaining[i] = 0; + for (i=0; i<SIZE; i++) ctx->buffer[i] = 0; + + if (ctx->chaining == NULL || ctx->buffer == NULL) + return FAIL_GR; + + /* set initial value */ + ctx->chaining[ctx->columns-1] = U64BIG((u64)LENGTH); + + INIT(ctx->chaining); + + /* set other variables */ + ctx->buf_ptr = 0; + ctx->block_counter = 0; + ctx->bits_in_last_byte = 0; + + return SUCCESS_GR; +} + + +/* update state with databitlen bits of input */ +HashReturn_gr update_groestl(hashState_groestl* ctx, + const BitSequence_gr* input, + DataLength_gr databitlen) { + int index = 0; + int msglen = (int)(databitlen/8); + int rem = (int)(databitlen%8); + + /* non-integral number of message bytes can only be supplied in the + last call to this function */ + if (ctx->bits_in_last_byte) return FAIL_GR; + + /* if the buffer contains data that has not yet been digested, first + add data to buffer until full */ + +// The following block of code never gets hit when hashing x11 or quark +// leave it here in case it might be needed. 
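+// (Presumably this path stays dead because init/reinit_groestl reset buf_ptr to 0
+// and these algos make a single update_groestl call per hash, so the buffer is
+// always empty when this function is entered.)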
+// if (ctx->buf_ptr) +// { +// while (ctx->buf_ptr < ctx->statesize && index < msglen) +// { +// ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; +// } +// if (ctx->buf_ptr < ctx->statesize) +// { +// /* buffer still not full, return */ +// if (rem) +// { +// ctx->bits_in_last_byte = rem; +// ctx->buffer[(int)ctx->buf_ptr++] = input[index]; +// } +// return SUCCESS_GR; +// } +// /* digest buffer */ +// ctx->buf_ptr = 0; +// printf("error\n"); +// Transform(ctx, ctx->buffer, ctx->statesize); +// end dead code +// } + + /* digest bulk of message */ + Transform(ctx, input+index, msglen-index); + index += ((msglen-index)/ctx->statesize)*ctx->statesize; + + /* store remaining data in buffer */ + while (index < msglen) + { + ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; + } + +// Another block that doesn't get used by x11 or quark +// /* if non-integral number of bytes have been supplied, store +// remaining bits in last byte, together with information about +// number of bits */ +// if (rem) +// { +// ctx->bits_in_last_byte = rem; +// ctx->buffer[(int)ctx->buf_ptr++] = input[index]; +// } + + return SUCCESS_GR; +} + +#define BILB ctx->bits_in_last_byte + +/* finalise: process remaining data (including padding), perform + output transformation, and write hash result to 'output' */ +HashReturn_gr final_groestl(hashState_groestl* ctx, + BitSequence_gr* output) { + int i, j = 0, hashbytelen = LENGTH/8; + u8 *s = (BitSequence_gr*)ctx->chaining; + + /* pad with '1'-bit and first few '0'-bits */ + if (BILB) { + ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-BILB); + BILB = 0; + } + else ctx->buffer[(int)ctx->buf_ptr++] = 0x80; + + /* pad with '0'-bits */ + if (ctx->buf_ptr > ctx->statesize-LENGTHFIELDLEN) { + /* padding requires two blocks */ + while (ctx->buf_ptr < ctx->statesize) { + ctx->buffer[(int)ctx->buf_ptr++] = 0; + } + /* digest first padding block */ + Transform(ctx, ctx->buffer, ctx->statesize); + ctx->buf_ptr = 0; + } + while (ctx->buf_ptr < ctx->statesize-LENGTHFIELDLEN) { + ctx->buffer[(int)ctx->buf_ptr++] = 0; + } + + /* length padding */ + ctx->block_counter++; + ctx->buf_ptr = ctx->statesize; + while (ctx->buf_ptr > ctx->statesize-LENGTHFIELDLEN) { + ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter; + ctx->block_counter >>= 8; + } + + /* digest final padding block */ + Transform(ctx, ctx->buffer, ctx->statesize); + /* perform output transformation */ + OutputTransformation(ctx); + + /* store hash result in output */ + for (i = ctx->statesize-hashbytelen; i < ctx->statesize; i++,j++) { + output[j] = s[i]; + } + + /* zeroise relevant variables and deallocate memory */ + + for (i = 0; i < ctx->columns; i++) { + ctx->chaining[i] = 0; + } + + for (i = 0; i < ctx->statesize; i++) { + ctx->buffer[i] = 0; + } +// free(ctx->chaining); +// free(ctx->buffer); + + return SUCCESS_GR; +} + +/* hash bit sequence */ +HashReturn_gr hash_groestl(int hashbitlen, + const BitSequence_gr* data, + DataLength_gr databitlen, + BitSequence_gr* hashval) { + HashReturn_gr ret; + hashState_groestl context; + + /* initialise */ + if ((ret = init_groestl(&context)) != SUCCESS_GR) + return ret; + + /* process message */ + if ((ret = update_groestl(&context, data, databitlen)) != SUCCESS_GR) + return ret; + + /* finalise */ + ret = final_groestl(&context, hashval); + + return ret; +} + +/* eBash API */ +#ifdef crypto_hash_BYTES +int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long inlen) +{ + if (hash_groestl(crypto_hash_BYTES * 8, in, inlen 
* 8,out) == SUCCESS_GR) return 0; + return -1; +} +#endif + +#endif diff --git a/algo/aes_ni/hash-groestl.h b/algo/aes_ni/hash-groestl.h new file mode 100644 index 000000000..24603d395 --- /dev/null +++ b/algo/aes_ni/hash-groestl.h @@ -0,0 +1,110 @@ +/* hash.h Aug 2011 + * + * Groestl implementation for different versions. + * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer + * + * This code is placed in the public domain + */ + +#ifndef __hash_h +#define __hash_h + +#include +#if defined(_WIN64) || defined(__WINDOWS__) +#include +#endif +#include + +/* eBash API begin */ +/* +#include "crypto_hash.h" +#ifdef crypto_hash_BYTES + +#include +#include +#include +typedef crypto_uint8 u8; +typedef crypto_uint32 u32; +typedef crypto_uint64 u64; +#endif + * / +/* eBash API end */ + +#define LENGTH (512) + +#include "brg_endian.h" +#define NEED_UINT_64T +#include "brg_types.h" + +#ifdef IACA_TRACE + #include IACA_MARKS +#endif + +#ifndef LENGTH +#define LENGTH (256) +#endif + +/* some sizes (number of bytes) */ +#define ROWS (8) +#define LENGTHFIELDLEN (ROWS) +#define COLS512 (8) +#define COLS1024 (16) +#define SIZE512 ((ROWS)*(COLS512)) +#define SIZE1024 ((ROWS)*(COLS1024)) +#define ROUNDS512 (10) +#define ROUNDS1024 (14) + +#if LENGTH<=256 +#define COLS (COLS512) +#define SIZE (SIZE512) +#define ROUNDS (ROUNDS512) +#else +#define COLS (COLS1024) +#define SIZE (SIZE1024) +#define ROUNDS (ROUNDS1024) +#endif + +#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) + +#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) +#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n))))) +#define U64BIG(a) (a) +#endif /* IS_BIG_ENDIAN */ + +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) +#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n))) +#define U64BIG(a) \ + ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \ + (ROTL64(a,24) & li_64(0000FF000000FF00)) | \ + (ROTL64(a,40) & li_64(00FF000000FF0000)) | \ + (ROTL64(a,56) & li_64(FF000000FF000000))) +#endif /* IS_LITTLE_ENDIAN */ + +typedef enum { LoNG, SHoRT } Var; + +/* NIST API begin */ + +typedef unsigned char BitSequence_gr; +typedef unsigned long long DataLength_gr; +typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr; + +typedef struct { + __attribute__ ((aligned (32))) u64 chaining[SIZE/8]; /* actual state */ + __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */ + u64 block_counter; /* message block counter */ + int buf_ptr; /* data buffer pointer */ + int bits_in_last_byte; /* no. of message bits in last byte of + data buffer */ + int columns; /* no. of columns in state */ + int statesize; /* total no. of bytes in state */ + Var v; /* LONG or SHORT */ +} hashState_groestl; + +HashReturn_gr init_groestl(hashState_groestl*); +HashReturn_gr reinit_groestl(hashState_groestl*); +HashReturn_gr update_groestl(hashState_groestl*, const BitSequence_gr*, DataLength_gr); +HashReturn_gr final_groestl(hashState_groestl*, BitSequence_gr*); +HashReturn_gr hash_groestl(int, const BitSequence_gr*, DataLength_gr, BitSequence_gr*); +/* NIST API end */ + +#endif /* __hash_h */ diff --git a/algo/aes_ni/hash-groestl256.c b/algo/aes_ni/hash-groestl256.c new file mode 100644 index 000000000..19f0f7877 --- /dev/null +++ b/algo/aes_ni/hash-groestl256.c @@ -0,0 +1,318 @@ +/* hash.c Aug 2011 + * + * Groestl implementation for different versions. + * Author: Krystian Matusiewicz, Günther A. 
Roland, Martin Schläffer + * + * This code is placed in the public domain + */ + +#include "hash-groestl256.h" +//#include "miner.h" + +//#ifndef NO_AES_NI +#ifdef __AVX2__ + +#include "groestl-version.h" + +//#ifdef TASM +// #ifdef VAES +// #include "groestl256-asm-aes.h" +// #else +// #ifdef VAVX +// #include "groestl256-asm-avx.h" +// #else +// #ifdef VVPERM +// #include "groestl256-asm-vperm.h" +// #else +// #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) +// #endif +// #endif +// #endif +//#else +// #ifdef TINTR +// #ifdef VAES +// #include "groestl256-intr-aes.h" +// #else +// #ifdef VAVX + #include "groestl256-intr-avx.h" +// #else +// #ifdef VVPERM +// #include "groestl256-intr-vperm.h" +// #else +// #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) +// #endif +// #endif +// #endif +// #else +// #error NO TYPE SPECIFIED (-DT[ASM/INTR]) +// #endif +//#endif + + +/* digest up to len bytes of input (full blocks only) */ +void Transform256(hashState_groestl256 *ctx, + const u8 *in, + unsigned long long len) { + /* increment block counter */ + ctx->block_counter += len/SIZE; + + /* digest message, one block at a time */ + for (; len >= SIZE; len -= SIZE, in += SIZE) +//#if LENGTH<=256 + TF512((u64*)ctx->chaining, (u64*)in); +//#else +// TF1024((u64*)ctx->chaining, (u64*)in); +//#endif + +#ifdef _MSC_VER + //__asm emms +#else + asm volatile ("emms"); +#endif +} + +/* given state h, do h <- P(h)+h */ +void OutputTransformation256(hashState_groestl256 *ctx) { + /* determine variant */ +//#if (LENGTH <= 256) + OF512((u64*)ctx->chaining); +//#else +// OF1024((u64*)ctx->chaining); +//#endif + +#ifdef _MSC_VER + //__asm emms +#else + asm volatile("emms"); +#endif +} + +/* initialise context */ +HashReturn_gr init_groestl256(hashState_groestl256* ctx) { + u8 i = 0; + /* output size (in bits) must be a positive integer less than or + equal to 512, and divisible by 8 */ +// if (LENGTH <= 0 || (LENGTH%8) || LENGTH > 512) +// return BAD_HASHBITLEN_GR; + + /* set number of state columns and state size depending on + variant */ + ctx->columns = COLS; + ctx->statesize = SIZE; +//#if (LENGTH <= 256) + ctx->v = SHoRT; +//#else +// ctx->v = LoNG; +//#endif + + SET_CONSTANTS(); + + for (i=0; ichaining[i] = 0; + for (i=0; ibuffer[i] = 0; + + if (ctx->chaining == NULL || ctx->buffer == NULL) + return FAIL_GR; + + /* set initial value */ +// ctx->chaining[ctx->columns-1] = U64BIG((u64)LENGTH); + ctx->chaining[ctx->columns-1] = U64BIG((u64)256); + + INIT256(ctx->chaining); + + /* set other variables */ + ctx->buf_ptr = 0; + ctx->block_counter = 0; + ctx->bits_in_last_byte = 0; + + return SUCCESS_GR; +} + + +HashReturn_gr reinit_groestl256(hashState_groestl256* ctx) + { + int i; + for (i=0; ichaining[i] = 0; + for (i=0; ibuffer[i] = 0; + + if (ctx->chaining == NULL || ctx->buffer == NULL) + return FAIL_GR; + + /* set initial value */ +// ctx->chaining[ctx->columns-1] = U64BIG((u64)LENGTH); + ctx->chaining[ctx->columns-1] = 256; + + INIT256(ctx->chaining); + + /* set other variables */ + ctx->buf_ptr = 0; + ctx->block_counter = 0; + ctx->bits_in_last_byte = 0; + + return SUCCESS_GR; +} + + +/* update state with databitlen bits of input */ +HashReturn_gr update_groestl256(hashState_groestl256* ctx, + const BitSequence_gr* input, + DataLength_gr databitlen) { + int index = 0; + int msglen = (int)(databitlen/8); + int rem = (int)(databitlen%8); + + /* non-integral number of message bytes can only be supplied in the + last call to this function */ + if (ctx->bits_in_last_byte) return FAIL_GR; + + /* if the buffer 
contains data that has not yet been digested, first + add data to buffer until full */ + +// The following block of code never gets hit when hashing x11 or quark +// leave it here in case it might be needed. +// if (ctx->buf_ptr) +// { +// while (ctx->buf_ptr < ctx->statesize && index < msglen) +// { +// ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; +// } +// if (ctx->buf_ptr < ctx->statesize) +// { +// /* buffer still not full, return */ +// if (rem) +// { +// ctx->bits_in_last_byte = rem; +// ctx->buffer[(int)ctx->buf_ptr++] = input[index]; +// } +// return SUCCESS_GR; +// } +// /* digest buffer */ +// ctx->buf_ptr = 0; +// printf("error\n"); +// Transform(ctx, ctx->buffer, ctx->statesize); +// end dead code +// } + + /* digest bulk of message */ + Transform256(ctx, input+index, msglen-index); + index += ((msglen-index)/ctx->statesize)*ctx->statesize; + + /* store remaining data in buffer */ + while (index < msglen) + { + ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; + } + +// Another block that doesn't get used by x11 or quark +// /* if non-integral number of bytes have been supplied, store +// remaining bits in last byte, together with information about +// number of bits */ +// if (rem) +// { +// ctx->bits_in_last_byte = rem; +// ctx->buffer[(int)ctx->buf_ptr++] = input[index]; +// } + + return SUCCESS_GR; +} + +#define BILB ctx->bits_in_last_byte + +/* finalise: process remaining data (including padding), perform + output transformation, and write hash result to 'output' */ +HashReturn_gr final_groestl256(hashState_groestl256* ctx, + BitSequence_gr* output) { +// int i, j = 0, hashbytelen = LENGTH/8; + int i, j = 0, hashbytelen = 256/8; + u8 *s = (BitSequence_gr*)ctx->chaining; + + /* pad with '1'-bit and first few '0'-bits */ + if (BILB) { + ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-BILB); + BILB = 0; + } + else ctx->buffer[(int)ctx->buf_ptr++] = 0x80; + + /* pad with '0'-bits */ + if (ctx->buf_ptr > ctx->statesize-LENGTHFIELDLEN) { + /* padding requires two blocks */ + while (ctx->buf_ptr < ctx->statesize) { + ctx->buffer[(int)ctx->buf_ptr++] = 0; + } + /* digest first padding block */ + Transform256(ctx, ctx->buffer, ctx->statesize); + ctx->buf_ptr = 0; + } + while (ctx->buf_ptr < ctx->statesize-LENGTHFIELDLEN) { + ctx->buffer[(int)ctx->buf_ptr++] = 0; + } + + /* length padding */ + ctx->block_counter++; + ctx->buf_ptr = ctx->statesize; + while (ctx->buf_ptr > ctx->statesize-LENGTHFIELDLEN) { + ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter; + ctx->block_counter >>= 8; + } + + /* digest final padding block */ + Transform256(ctx, ctx->buffer, ctx->statesize); + /* perform output transformation */ + OutputTransformation256(ctx); + + /* store hash result in output */ + for (i = ctx->statesize-hashbytelen; i < ctx->statesize; i++,j++) { + output[j] = s[i]; + } + + /* zeroise relevant variables and deallocate memory */ + + for (i = 0; i < ctx->columns; i++) { + ctx->chaining[i] = 0; + } + + for (i = 0; i < ctx->statesize; i++) { + ctx->buffer[i] = 0; + } +// free(ctx->chaining); +// free(ctx->buffer); + + return SUCCESS_GR; +} + +/* hash bit sequence */ +//HashReturn_gr hash_groestl256(int hashbitlen, +// const BitSequence_gr* data, +// DataLength_gr databitlen, +// BitSequence_gr* hashval) { +// HashReturn_gr ret; +// hashState_groestl256 context; +// +// +// if ((ret = init_groestl256(&context)) != SUCCESS_GR) +// return ret; +// +// +// if ((ret = update_groestl256(&context, data, databitlen)) != SUCCESS_GR) +// return ret; 
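For byte-aligned input (bits_in_last_byte == 0), the padding written by final_groestl256 above is a single 0x80 byte, a zero fill, and the block counter stored big-endian in the last LENGTHFIELDLEN (8) bytes of the 64-byte block. A minimal sketch of that single-block layout, assuming the message leaves room for the length field (the function and variable names below are illustrative, not part of this patch):

#include <stdint.h>
#include <string.h>

/* Sketch only: fill the final 64-byte Groestl-256 block for a byte-aligned
   message. 'used' message bytes already sit at the start of 'blk'; 'blocks'
   is the total block count including this padding block. Assumes used < 56,
   i.e. the single-block case; the two-block case is handled separately in
   final_groestl256 above. */
static void groestl256_pad_sketch(uint8_t blk[64], size_t used, uint64_t blocks)
{
    blk[used] = 0x80;                          /* 1-bit, then zero padding */
    memset(blk + used + 1, 0, 56 - (used + 1));
    for (int i = 0; i < 8; i++)                /* big-endian length field  */
        blk[63 - i] = (uint8_t)(blocks >> (8 * i));
}

For the 32-byte value that lyra2_hash feeds to update_groestl256, this gives blk[32] = 0x80, zeros through blk[55], and a counter of 1 in blk[56..63].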
+// +// +// ret = final_groestl256(&context, hashval); +// +// return ret; +//} + +/* eBash API */ +//#ifdef crypto_hash_BYTES +//int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long inlen) +//{ +// if (hash_groestl(crypto_hash_BYTES * 8, in, inlen * 8,out) == SUCCESS_GR) return 0; +// return -1; +//} +//#endif + +#endif diff --git a/algo/aes_ni/hash-groestl256.h b/algo/aes_ni/hash-groestl256.h new file mode 100644 index 000000000..b4dcfe07b --- /dev/null +++ b/algo/aes_ni/hash-groestl256.h @@ -0,0 +1,116 @@ +/* hash.h Aug 2011 + * + * Groestl implementation for different versions. + * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer + * + * This code is placed in the public domain + */ + +#ifndef __hash_h +#define __hash_h + +#include +#if defined(_WIN64) || defined(__WINDOWS__) +#include +#endif +#include + +/* eBash API begin */ +/* +#include "crypto_hash.h" +#ifdef crypto_hash_BYTES + +#include +#include +#include +typedef crypto_uint8 u8; +typedef crypto_uint32 u32; +typedef crypto_uint64 u64; +#endif + */ +/* eBash API end */ + +//#define LENGTH (512) + +#include "brg_endian.h" +#define NEED_UINT_64T +#include "brg_types.h" + +#ifdef IACA_TRACE + #include IACA_MARKS +#endif + +//#ifndef LENGTH +//#define LENGTH (256) +//#endif + +/* some sizes (number of bytes) */ +#define ROWS (8) +#define LENGTHFIELDLEN (ROWS) +#define COLS512 (8) +#define COLS1024 (16) +#define SIZE512 ((ROWS)*(COLS512)) +#define SIZE1024 ((ROWS)*(COLS1024)) +#define ROUNDS512 (10) +#define ROUNDS1024 (14) + +//#if LENGTH<=256 +#define COLS (COLS512) +#define SIZE (SIZE512) +#define ROUNDS (ROUNDS512) +//#else +//#define COLS (COLS1024) +//#define SIZE (SIZE1024) +//#define ROUNDS (ROUNDS1024) +//#endif + +#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) + +#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) +#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n))))) +#define U64BIG(a) (a) +#endif /* IS_BIG_ENDIAN */ + +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) +#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n))) +#define U64BIG(a) \ + ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \ + (ROTL64(a,24) & li_64(0000FF000000FF00)) | \ + (ROTL64(a,40) & li_64(00FF000000FF0000)) | \ + (ROTL64(a,56) & li_64(FF000000FF000000))) +#endif /* IS_LITTLE_ENDIAN */ + +typedef enum { LoNG, SHoRT } Var; + +/* NIST API begin */ + +typedef unsigned char BitSequence_gr; +typedef unsigned long long DataLength_gr; +typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr; + +typedef struct { +#ifndef _MSC_VER + __attribute__ ((aligned (32))) +#endif + u64 chaining[SIZE/8]; /* actual state */ +#ifndef _MSC_VER + __attribute__ ((aligned (32))) +#endif + BitSequence_gr buffer[SIZE]; /* data buffer */ + u64 block_counter; /* message block counter */ + int buf_ptr; /* data buffer pointer */ + int bits_in_last_byte; /* no. of message bits in last byte of + data buffer */ + int columns; /* no. of columns in state */ + int statesize; /* total no. 
of bytes in state */ + Var v; /* LONG or SHORT */ +} hashState_groestl256; + +HashReturn_gr init_groestl(hashState_groestl256*); +HashReturn_gr reinit_groestl(hashState_groestl256*); +HashReturn_gr update_groestl(hashState_groestl256*, const BitSequence_gr*, DataLength_gr); +HashReturn_gr final_groestl(hashState_groestl256*, BitSequence_gr*); +HashReturn_gr hash_groestl(int, const BitSequence_gr*, DataLength_gr, BitSequence_gr*); +/* NIST API end */ + +#endif /* __hash_h */ diff --git a/algo/aes_ni/implementors b/algo/aes_ni/implementors new file mode 100644 index 000000000..e7ac1b28c --- /dev/null +++ b/algo/aes_ni/implementors @@ -0,0 +1,3 @@ +Krystian Matusiewicz +Günther A. Roland +Martin Schläffer diff --git a/algo/lyra2re.c b/algo/lyra2re.c index 38fb62bb4..655c06ae5 100644 --- a/algo/lyra2re.c +++ b/algo/lyra2re.c @@ -9,7 +9,7 @@ #include "miner.h" -void lyra2_hash(void *state, const void *input) +/*void lyra2_hash(void *state, const void *input) { sph_blake256_context ctx_blake; sph_keccak256_context ctx_keccak; @@ -36,6 +36,69 @@ void lyra2_hash(void *state, const void *input) sph_groestl256(&ctx_groestl, hashB, 32); sph_groestl256_close(&ctx_groestl, hashA); + memcpy(state, hashA, 32); +}*/ + + +#ifdef __AVX2__ +//#define __AES_NI +#endif + +#ifdef __AES_NI +#include "algo/aes_ni/hash-groestl256.h" +#endif + +typedef struct { + sph_blake256_context blake; + sph_keccak256_context keccak; + sph_skein256_context skein; +#ifdef __AES_NI + hashState_groestl256 groestl; +#else + sph_groestl256_context groestl; +#endif +} lyra2re_ctx_holder; + +lyra2re_ctx_holder lyra2re_ctx; + +void init_lyra2re_ctx() +{ + sph_blake256_init(&lyra2re_ctx.blake); + sph_keccak256_init(&lyra2re_ctx.keccak); + sph_skein256_init(&lyra2re_ctx.skein); +#ifdef __AES_NI + init_groestl256(&lyra2re_ctx.groestl); +#else + sph_groestl256_init(&lyra2re_ctx.groestl); +#endif +} + +void lyra2_hash(void *state, const void *input) +{ + lyra2re_ctx_holder ctx; + memcpy(&ctx, &lyra2re_ctx, sizeof(lyra2re_ctx)); + + uint32_t hashA[8], hashB[8]; + + sph_blake256(&ctx.blake, input, 80); + sph_blake256_close(&ctx.blake, hashA); + + sph_keccak256(&ctx.keccak, hashA, 32); + sph_keccak256_close(&ctx.keccak, hashB); + + LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); + + sph_skein256(&ctx.skein, hashA, 32); + sph_skein256_close(&ctx.skein, hashB); + +#ifdef __AES_NI + update_groestl256(&ctx.groestl, hashB, 256); + final_groestl256(&ctx.groestl, hashA); +#else + sph_groestl256(&ctx.groestl, hashB, 32); + sph_groestl256_close(&ctx.groestl, hashA); +#endif + memcpy(state, hashA, 32); } diff --git a/cpu-miner.c b/cpu-miner.c index 8008acffc..1798a964a 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -2994,7 +2994,8 @@ int main(int argc, char *argv[]) { if (opt_algo == ALGO_QUARK) { init_quarkhash_contexts(); - } else if(opt_algo == ALGO_CRYPTONIGHT) { + } + else if (opt_algo == ALGO_CRYPTONIGHT) { jsonrpc_2 = true; opt_extranonce = false; aes_ni_supported = has_aes_ni(); @@ -3003,6 +3004,9 @@ int main(int argc, char *argv[]) { applog(LOG_INFO, "CPU Supports AES-NI: %s", aes_ni_supported ? 
"YES" : "NO"); } } + else if (opt_algo == ALGO_LYRA2) { + init_lyra2re_ctx(); + } if (!opt_benchmark && !rpc_url) { fprintf(stderr, "%s: no URL supplied\n", argv[0]); diff --git a/lyra2/Lyra2.c b/lyra2/Lyra2.c index b79ec9b16..f0c9b5439 100644 --- a/lyra2/Lyra2.c +++ b/lyra2/Lyra2.c @@ -21,9 +21,9 @@ #include #include #include - -#include "Lyra2.h" -#include "Sponge.h" +#include "compat.h" +#include "lyra2.h" +#include "sponge.h" /** * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords @@ -44,7 +44,7 @@ * * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation) */ -int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols) +int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols) { //============================= Basic variables ============================// int64_t row = 2; //index of row to be processed @@ -55,25 +55,32 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 int64_t i; //auxiliary iteration counter + int64_t v64; // 64bit var for memcpy //==========================================================================/ //========== Initializing the Memory Matrix and pointers to it =============// //Tries to allocate enough space for the whole memory matrix - i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES); - uint64_t *wholeMatrix = (uint64_t*) malloc((size_t) i); + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + // for Lyra2REv2, nCols = 4, v1 was using 8 + const int64_t BLOCK_LEN = (nCols == 4) ? 
BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES; + + i = (int64_t)ROW_LEN_BYTES * nRows; + uint64_t _ALIGN(256) *wholeMatrix = malloc(i); if (wholeMatrix == NULL) { return -1; } - memset(wholeMatrix, 0, (size_t) i); + memset(wholeMatrix, 0, i); //Allocates pointers to each row of the matrix - uint64_t **memMatrix = malloc((size_t) nRows * sizeof(uint64_t*)); + uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows); if (memMatrix == NULL) { return -1; } //Places the pointers in the correct positions uint64_t *ptrWord = wholeMatrix; - for (i = 0; i < (int64_t) nRows; i++) { + for (i = 0; i < nRows; i++) { memMatrix[i] = ptrWord; ptrWord += ROW_LEN_INT64; } @@ -84,32 +91,38 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //but this ensures that the password copied locally will be overwritten as soon as possible //First, we clean enough blocks for the password, salt, basil and padding - uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof (uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; + int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; byte *ptrByte = (byte*) wholeMatrix; - memset(ptrByte, 0, (size_t) nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES); //Prepends the password - memcpy(ptrByte, pwd, (size_t) pwdlen); + memcpy(ptrByte, pwd, pwdlen); ptrByte += pwdlen; //Concatenates the salt - memcpy(ptrByte, salt, (size_t) saltlen); + memcpy(ptrByte, salt, saltlen); ptrByte += saltlen; + memset(ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - (saltlen + pwdlen)); + //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface - memcpy(ptrByte, &kLen, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &pwdlen, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &saltlen, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &timeCost, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &nRows, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &nCols, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &kLen, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = pwdlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = saltlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = timeCost; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nRows; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nCols; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); //Now comes the padding *ptrByte = 0x80; //first byte of padding: right after the password @@ -120,30 +133,27 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //======================= Initializing the Sponge State ====================// //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) - uint64_t *state = malloc(16 * sizeof (uint64_t)); - if (state == NULL) { - return -1; - } + uint64_t _ALIGN(256) state[16]; initState(state); //==========================================================================/ //================================ Setup Phase =============================// //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits ptrWord = wholeMatrix; - for (i = 0; i < (int64_t) 
nBlocksInput; i++) { + for (i = 0; i < nBlocksInput; i++) { absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) - ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES; //goes to next block of pad(pwd || salt || basil) + ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil) } //Initializes M[0] and M[1] - reducedSqueezeRow0(state, memMatrix[0]); //The locally copied password is most likely overwritten here + reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here - reducedDuplexRow1(state, memMatrix[0], memMatrix[1]); + reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols); do { //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) - reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); + reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); //updates the value of row* (deterministically picked during Setup)) rowa = (rowa + step) & (window - 1); @@ -159,36 +169,35 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * gap = -gap; //inverts the modifier to the step } - } while (row < (int64_t) nRows); + } while (row < nRows); //==========================================================================/ //============================ Wandering Phase =============================// row = 0; //Resets the visitation to the first row of the memory matrix - for (tau = 1; tau <= (int64_t) timeCost; tau++) { + for (tau = 1; tau <= timeCost; tau++) { //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 step = (tau % 2 == 0) ? -1 : nRows / 2 - 1; do { //Selects a pseudorandom index row* //------------------------------------------------------------------------------------------ - //rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) - rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE) //------------------------------------------------------------------------------------------ //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] - reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); + reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); //update prev: it now points to the last row ever computed prev = row; //updates row: goes to the next row to be computed //------------------------------------------------------------------------------------------ - //row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) - row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) //------------------------------------------------------------------------------------------ } while (row != 0); } - //==========================================================================/ //============================ Wrap-up Phase ===============================// //Absorbs the last block of the memory matrix @@ -196,16 +205,10 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //Squeezes the key squeeze(state, K, (unsigned int) kLen); - 
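Note that the wandering phase above now selects rows with a bitmask rather than the generic modulo, which is only valid because lyra2_hash calls LYRA2 with nRows = 8, a power of two. A small self-contained check of that equivalence (illustrative code, not part of the patch):

#include <assert.h>
#include <stdint.h>

/* Sketch only: for a power-of-two nRows, (x % nRows) == (x & (nRows - 1)),
   so the masked row selection used in the wandering phase is equivalent to
   the generic form for Lyra2RE's fixed nRows = 8. */
int main(void)
{
    const uint64_t nRows = 8;                  /* power of two, as in Lyra2RE */
    for (uint64_t x = 0; x < 100000; x++)
        assert((x % nRows) == (x & (nRows - 1)));
    return 0;
}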
//==========================================================================/ //========================= Freeing the memory =============================// free(memMatrix); free(wholeMatrix); - //Wiping out the sponge's internal state before freeing it - memset(state, 0, 16 * sizeof (uint64_t)); - free(state); - //==========================================================================/ - return 0; } diff --git a/lyra2/Lyra2.h b/lyra2/Lyra2.h index 229b2c9cc..edf917927 100644 --- a/lyra2/Lyra2.h +++ b/lyra2/Lyra2.h @@ -37,14 +37,6 @@ typedef unsigned char byte; #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes #endif -#ifndef N_COLS - #define N_COLS 8 //Number of columns in the memory matrix: fixed to 64 by default -#endif - -#define ROW_LEN_INT64 (BLOCK_LEN_INT64 * N_COLS) //Total length of a row: N_COLS blocks -#define ROW_LEN_BYTES (ROW_LEN_INT64 * 8) //Number of bytes per row - - -int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols); +int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols); #endif /* LYRA2_H_ */ diff --git a/lyra2/Sponge.c b/lyra2/Sponge.c index e0a001e0e..cc042809f 100644 --- a/lyra2/Sponge.c +++ b/lyra2/Sponge.c @@ -21,10 +21,9 @@ #include #include #include -#include "Sponge.h" -#include "Lyra2.h" - - +#include +#include "sponge.h" +#include "lyra2.h" /** * Initializes the Sponge State. The first 512 bits are set to zeros and the remainder @@ -37,20 +36,97 @@ * * @param state The 1024-bit array to be initialized */ - void initState(uint64_t state[/*16*/]) { - //First 512 bis are zeros - memset(state, 0, 64); - //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV - - state[8] = blake2b_IV[0]; - state[9] = blake2b_IV[1]; - state[10] = blake2b_IV[2]; - state[11] = blake2b_IV[3]; - state[12] = blake2b_IV[4]; - state[13] = blake2b_IV[5]; - state[14] = blake2b_IV[6]; - state[15] = blake2b_IV[7]; +void initState(uint64_t state[/*16*/]) { + //First 512 bis are zeros + memset(state, 0, 64); + //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV + state[8] = blake2b_IV[0]; + state[9] = blake2b_IV[1]; + state[10] = blake2b_IV[2]; + state[11] = blake2b_IV[3]; + state[12] = blake2b_IV[4]; + state[13] = blake2b_IV[5]; + state[14] = blake2b_IV[6]; + state[15] = blake2b_IV[7]; +} +#ifdef _MSC_VER +__forceinline +#else +inline +#endif +static void lyra_round( uint64_t *v ) +{ +#ifdef __AVX2__ + + __m256i a = _mm256_load_si256( (__m256i*)(&v[ 0]) ); + __m256i b = _mm256_load_si256( (__m256i*)(&v[ 4]) ); + __m256i c = _mm256_load_si256( (__m256i*)(&v[ 8]) ); + __m256i d = _mm256_load_si256( (__m256i*)(&v[12]) ); + + G_4X64( a, b, c, d ); + + // swap words + b = mm256_rotl256_1x64( b ); + c = mm256_swap128( c ); + d = mm256_rotr256_1x64( d ); + + G_4X64( a, b, c, d ); + + // unswap + b = mm256_rotr256_1x64( b ); + c = mm256_swap128( c ); + d = mm256_rotl256_1x64( d ); + + _mm256_store_si256( (__m256i*)(&v[ 0]), a ); + _mm256_store_si256( (__m256i*)(&v[ 4]), b ); + _mm256_store_si256( (__m256i*)(&v[ 8]), c ); + _mm256_store_si256( (__m256i*)(&v[12]), d ); + +#elif defined __AVX__ + + __m128i a0, a1, b0, b1, c0, c1, d0, d1; + + a0 = _mm_load_si128( (__m128i*)(&v[ 0]) ); + a1 = _mm_load_si128( (__m128i*)(&v[ 2]) ); + b0 = _mm_load_si128( (__m128i*)(&v[ 4]) ); + b1 = _mm_load_si128( (__m128i*)(&v[ 6]) ); + c0 = _mm_load_si128( (__m128i*)(&v[ 
8]) ); + c1 = _mm_load_si128( (__m128i*)(&v[10]) ); + d0 = _mm_load_si128( (__m128i*)(&v[12]) ); + d1 = _mm_load_si128( (__m128i*)(&v[14]) ); + + G_2X64( a0, b0, c0, d0 ); + G_2X64( a1, b1, c1, d1 ); + + // swap words + mm128_rotl256_1x64( b0, b1 ); + mm128_swap128( c0, c1 ); + mm128_rotr256_1x64( d0, d1 ); + + G_2X64( a0, b0, c0, d0 ); + G_2X64( a1, b1, c1, d1 ); + + // unswap + mm128_rotr256_1x64( b0, b1 ); + mm128_swap128( c0, c1 ); + mm128_rotl256_1x64( d0, d1 ); + + _mm_store_si128( (__m128i*)(&v[ 0]), a0 ); + _mm_store_si128( (__m128i*)(&v[ 2]), a1 ); + _mm_store_si128( (__m128i*)(&v[ 4]), b0 ); + _mm_store_si128( (__m128i*)(&v[ 6]), b1 ); + _mm_store_si128( (__m128i*)(&v[ 8]), c0 ); + _mm_store_si128( (__m128i*)(&v[10]), c1 ); + _mm_store_si128( (__m128i*)(&v[12]), d0 ); + _mm_store_si128( (__m128i*)(&v[14]), d1 ); + +#else + + // macro assumes v is defined + ROUND_LYRA(0); + +#endif } /** @@ -58,27 +134,39 @@ * * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function */ -__inline static void blake2bLyra(uint64_t *v) { - ROUND_LYRA(0); - ROUND_LYRA(1); - ROUND_LYRA(2); - ROUND_LYRA(3); - ROUND_LYRA(4); - ROUND_LYRA(5); - ROUND_LYRA(6); - ROUND_LYRA(7); - ROUND_LYRA(8); - ROUND_LYRA(9); - ROUND_LYRA(10); - ROUND_LYRA(11); +#ifdef _MSC_VER +__forceinline +#else +__inline +#endif +static void blake2bLyra(uint64_t *v) +{ + lyra_round( v ); + lyra_round( v ); + lyra_round( v ); + lyra_round( v ); + lyra_round( v ); + lyra_round( v ); + lyra_round( v ); + lyra_round( v ); + lyra_round( v ); + lyra_round( v ); + lyra_round( v ); + lyra_round( v ); } /** * Executes a reduced version of Blake2b's G function with only one round * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function */ -__inline static void reducedBlake2bLyra(uint64_t *v) { - ROUND_LYRA(0); +#ifdef _MSC_VER +__forceinline +#else +__inline +#endif +static void reducedBlake2bLyra(uint64_t *v) { + + lyra_round( v ); } /** @@ -89,21 +177,24 @@ __inline static void reducedBlake2bLyra(uint64_t *v) { * @param out Array that will receive the data squeezed * @param len The number of bytes to be squeezed into the "out" array */ - void squeeze(uint64_t *state, byte *out, unsigned int len) { - int fullBlocks = len / BLOCK_LEN_BYTES; - byte *ptr = out; - int i; - //Squeezes full blocks - for (i = 0; i < fullBlocks; i++) { - memcpy(ptr, state, BLOCK_LEN_BYTES); - blake2bLyra(state); - ptr += BLOCK_LEN_BYTES; - } - - //Squeezes remaining bytes - memcpy(ptr, state, (len % BLOCK_LEN_BYTES)); +void squeeze(uint64_t *state, byte *out, unsigned int len) +{ + int fullBlocks = len / BLOCK_LEN_BYTES; + byte *ptr = out; + int i; + + //Squeezes full blocks + for (i = 0; i < fullBlocks; i++) { + memcpy(ptr, state, BLOCK_LEN_BYTES); + blake2bLyra(state); + ptr += BLOCK_LEN_BYTES; + } + + //Squeezes remaining bytes + memcpy(ptr, state, (len % BLOCK_LEN_BYTES)); } + /** * Performs an absorb operation for a single block (BLOCK_LEN_INT64 words * of type uint64_t), using Blake2b's G function as the internal permutation @@ -111,23 +202,78 @@ __inline static void reducedBlake2bLyra(uint64_t *v) { * @param state The current state of the sponge * @param in The block to be absorbed (BLOCK_LEN_INT64 words) */ -void absorbBlock(uint64_t *state, const uint64_t *in) { - //XORs the first BLOCK_LEN_INT64 words of "in" with the current state - state[0] ^= in[0]; - state[1] ^= in[1]; - state[2] ^= in[2]; - state[3] ^= in[3]; - state[4] ^= in[4]; - state[5] ^= in[5]; - state[6] ^= in[6]; - state[7] ^= in[7]; - state[8] 
^= in[8]; - state[9] ^= in[9]; - state[10] ^= in[10]; - state[11] ^= in[11]; - - //Applies the transformation f to the sponge's state - blake2bLyra(state); +void absorbBlock(uint64_t *state, const uint64_t *in) +{ +//XORs the first BLOCK_LEN_INT64 words of "in" with the current state +#if defined __AVX2__ + + __m256i state_v[2], in_v[2]; + + state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + in_v [0] = _mm256_loadu_si256( (__m256i*)(&in[0]) ); + state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); + in_v [1] = _mm256_loadu_si256( (__m256i*)(&in[4]) ); + state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + in_v [2] = _mm256_loadu_si256( (__m256i*)(&in[8]) ); + + _mm256_store_si256( (__m256i*)&state[0], + _mm256_xor_si256( state_v[0], in_v[0] ) ); + _mm256_store_si256( (__m256i*)&state[4], + _mm256_xor_si256( state_v[1], in_v[1] ) ); + _mm256_store_si256( (__m256i*)&state[8], + _mm256_xor_si256( state_v[2], in_v[2] ) ); + +#elif defined __AVX__ + + __m128i state_v[4], in_v[4]; + + state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); + state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); + state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); + state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); + state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); + state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); + + in_v[0] = _mm_loadu_si128( (__m128i*)(&in[0]) ); + in_v[1] = _mm_loadu_si128( (__m128i*)(&in[2]) ); + in_v[2] = _mm_loadu_si128( (__m128i*)(&in[4]) ); + in_v[3] = _mm_loadu_si128( (__m128i*)(&in[6]) ); + in_v[4] = _mm_loadu_si128( (__m128i*)(&in[8]) ); + in_v[5] = _mm_loadu_si128( (__m128i*)(&in[10]) ); + + _mm_store_si128( (__m128i*)(&state[0]), + _mm_xor_si128( state_v[0], in_v[0] ) ); + _mm_store_si128( (__m128i*)(&state[2]), + _mm_xor_si128( state_v[1], in_v[1] ) ); + _mm_store_si128( (__m128i*)(&state[4]), + _mm_xor_si128( state_v[2], in_v[2] ) ); + _mm_store_si128( (__m128i*)(&state[6]), + _mm_xor_si128( state_v[3], in_v[3] ) ); + _mm_store_si128( (__m128i*)(&state[8]), + _mm_xor_si128( state_v[4], in_v[4] ) ); + _mm_store_si128( (__m128i*)(&state[10]), + _mm_xor_si128( state_v[5], in_v[5] ) ); + +#else + + state[0] ^= in[0]; + state[1] ^= in[1]; + state[2] ^= in[2]; + state[3] ^= in[3]; + state[4] ^= in[4]; + state[5] ^= in[5]; + state[6] ^= in[6]; + state[7] ^= in[7]; + state[8] ^= in[8]; + state[9] ^= in[9]; + state[10] ^= in[10]; + state[11] ^= in[11]; + +#endif + +//Applies the transformation f to the sponge's state +blake2bLyra(state); + } /** @@ -137,25 +283,63 @@ void absorbBlock(uint64_t *state, const uint64_t *in) { * @param state The current state of the sponge * @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words) */ -void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { - //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state - state[0] ^= in[0]; - state[1] ^= in[1]; - state[2] ^= in[2]; - state[3] ^= in[3]; - state[4] ^= in[4]; - state[5] ^= in[5]; - state[6] ^= in[6]; - state[7] ^= in[7]; - - //Applies the transformation f to the sponge's state - blake2bLyra(state); -/* - for(int i = 0; i<16; i++) { - printf(" final state %d %08x %08x in %08x %08x\n", i, (uint32_t)(state[i] & 0xFFFFFFFFULL), (uint32_t)(state[i] >> 32), - (uint32_t)(in[i] & 0xFFFFFFFFULL), (uint32_t)(in[i] >> 32)); - } -*/ +void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) +{ + +//XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state +#if defined __AVX2__ + + __m256i state_v[2], 
in_v[2]; + + state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + in_v [0] = _mm256_loadu_si256( (__m256i*)(&in[0]) ); + state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); + in_v [1] = _mm256_loadu_si256( (__m256i*)(&in[4]) ); + + _mm256_store_si256( (__m256i*)(&state[0]), + _mm256_xor_si256( state_v[0], in_v[0] ) ); + _mm256_store_si256( (__m256i*)(&state[4]), + _mm256_xor_si256( state_v[1], in_v[1] ) ); + +#elif defined __AVX__ + + __m128i state_v[4], in_v[4]; + + state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); + state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); + state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); + state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); + + in_v[0] = _mm_loadu_si128( (__m128i*)(&in[0]) ); + in_v[1] = _mm_loadu_si128( (__m128i*)(&in[2]) ); + in_v[2] = _mm_loadu_si128( (__m128i*)(&in[4]) ); + in_v[3] = _mm_loadu_si128( (__m128i*)(&in[6]) ); + + _mm_store_si128( (__m128i*)(&state[0]), + _mm_xor_si128( state_v[0], in_v[0] ) ); + _mm_store_si128( (__m128i*)(&state[2]), + _mm_xor_si128( state_v[1], in_v[1] ) ); + _mm_store_si128( (__m128i*)(&state[4]), + _mm_xor_si128( state_v[2], in_v[2] ) ); + _mm_store_si128( (__m128i*)(&state[6]), + _mm_xor_si128( state_v[3], in_v[3] ) ); + +#else + + state[0] ^= in[0]; + state[1] ^= in[1]; + state[2] ^= in[2]; + state[3] ^= in[3]; + state[4] ^= in[4]; + state[5] ^= in[5]; + state[6] ^= in[6]; + state[7] ^= in[7]; + +#endif + +//Applies the transformation f to the sponge's state +blake2bLyra(state); + } /** @@ -166,36 +350,31 @@ void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { * @param state The current state of the sponge * @param rowOut Row to receive the data squeezed */ -void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) { - uint64_t* ptrWord = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] - int i; - //M[row][C-1-col] = H.reduced_squeeze() - for (i = 0; i < N_COLS; i++) { - - ptrWord[0] = state[0]; - ptrWord[1] = state[1]; - ptrWord[2] = state[2]; - ptrWord[3] = state[3]; - ptrWord[4] = state[4]; - ptrWord[5] = state[5]; - ptrWord[6] = state[6]; - ptrWord[7] = state[7]; - ptrWord[8] = state[8]; - ptrWord[9] = state[9]; - ptrWord[10] = state[10]; - ptrWord[11] = state[11]; - /* -for (int i = 0; i<12; i++) { - printf(" after reducedSqueezeRow0 %d %08x %08x in %08x %08x\n", i, (uint32_t)(ptrWord[i] & 0xFFFFFFFFULL), (uint32_t)(ptrWord[i] >> 32), - (uint32_t)(state[i] & 0xFFFFFFFFULL), (uint32_t)(state[i] >> 32)); +void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut, const uint32_t nCols) +{ + uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] + unsigned int i; + //M[row][C-1-col] = H.reduced_squeeze() + for (i = 0; i < nCols; i++) { + ptrWord[0] = state[0]; + ptrWord[1] = state[1]; + ptrWord[2] = state[2]; + ptrWord[3] = state[3]; + ptrWord[4] = state[4]; + ptrWord[5] = state[5]; + ptrWord[6] = state[6]; + ptrWord[7] = state[7]; + ptrWord[8] = state[8]; + ptrWord[9] = state[9]; + ptrWord[10] = state[10]; + ptrWord[11] = state[11]; + + //Goes to next block (column) that will receive the squeezed data + ptrWord -= BLOCK_LEN_INT64; + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); } -*/ - //Goes to next block (column) that will receive the squeezed data - ptrWord -= BLOCK_LEN_INT64; - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - } } /** @@ -207,35 +386,137 @@ for (int i = 0; i<12; i++) { * @param rowIn Row 
to feed the sponge * @param rowOut Row to receive the sponge's output */ - void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) { - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row - int i; - - for (i = 0; i < N_COLS; i++) { - - //Absorbing "M[prev][col]" - state[0] ^= (ptrWordIn[0]); - state[1] ^= (ptrWordIn[1]); - state[2] ^= (ptrWordIn[2]); - state[3] ^= (ptrWordIn[3]); - state[4] ^= (ptrWordIn[4]); - state[5] ^= (ptrWordIn[5]); - state[6] ^= (ptrWordIn[6]); - state[7] ^= (ptrWordIn[7]); - state[8] ^= (ptrWordIn[8]); - state[9] ^= (ptrWordIn[9]); - state[10] ^= (ptrWordIn[10]); - state[11] ^= (ptrWordIn[11]); - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - //M[row][C-1-col] = M[prev][col] XOR rand - ptrWordOut[0] = ptrWordIn[0] ^ state[0]; - ptrWordOut[1] = ptrWordIn[1] ^ state[1]; - ptrWordOut[2] = ptrWordIn[2] ^ state[2]; - ptrWordOut[3] = ptrWordIn[3] ^ state[3]; +void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const uint32_t nCols) +{ + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + unsigned int i; + + for (i = 0; i < nCols; i++) + { + //Absorbing "M[prev][col]" + #if defined __AVX2__ + + __m256i state_v[3], in_v[3]; + + state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); + state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); + in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) ); + state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) ); + + _mm256_store_si256( (__m256i*)(&state[0]), + _mm256_xor_si256( state_v[0], in_v[0] ) ); + _mm256_store_si256( (__m256i*)(&state[4]), + _mm256_xor_si256( state_v[1], in_v[1] ) ); + _mm256_store_si256( (__m256i*)(&state[8]), + _mm256_xor_si256( state_v[2], in_v[2] ) ); + + #elif defined __AVX__ + + __m128i state_v[6], in_v[6]; + + state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); + state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); + state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); + state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); + state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); + state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); + + in_v[0] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[0]) ); + in_v[1] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[2]) ); + in_v[2] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[4]) ); + in_v[3] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[6]) ); + in_v[4] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[8]) ); + in_v[5] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[10]) ); + + _mm_store_si128( (__m128i*)(&state[0]), + _mm_xor_si128( state_v[0], in_v[0] ) ); + _mm_store_si128( (__m128i*)(&state[2]), + _mm_xor_si128( state_v[1], in_v[1] ) ); + _mm_store_si128( (__m128i*)(&state[4]), + _mm_xor_si128( state_v[2], in_v[2] ) ); + _mm_store_si128( (__m128i*)(&state[6]), + _mm_xor_si128( state_v[3], in_v[3] ) ); + _mm_store_si128( (__m128i*)(&state[8]), + _mm_xor_si128( state_v[4], in_v[4] ) ); + _mm_store_si128( (__m128i*)(&state[10]), + _mm_xor_si128( state_v[5], in_v[5] ) ); + + #else + + state[0] ^= (ptrWordIn[0]); + state[1] ^= (ptrWordIn[1]); + state[2] ^= (ptrWordIn[2]); + state[3] ^= (ptrWordIn[3]); + state[4] ^= (ptrWordIn[4]); + state[5] ^= (ptrWordIn[5]); + state[6] ^= 
(ptrWordIn[6]); + state[7] ^= (ptrWordIn[7]); + state[8] ^= (ptrWordIn[8]); + state[9] ^= (ptrWordIn[9]); + state[10] ^= (ptrWordIn[10]); + state[11] ^= (ptrWordIn[11]); + + #endif + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][C-1-col] = M[prev][col] XOR rand + #if defined __AVX2__ +// in_v should not need to be reloaded, but it does and it segfaults if +// loading alogned + state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); + state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); + in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) ); + state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) ); + + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), + _mm256_xor_si256( state_v[0], in_v[0] ) ); + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), + _mm256_xor_si256( state_v[1], in_v[1] ) ); + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), + _mm256_xor_si256( state_v[2], in_v[2] ) ); + + #elif defined __AVX__ + + state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); + state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); + state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); + state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); + state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); + state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); + + in_v[0] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[0]) ); + in_v[1] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[2]) ); + in_v[2] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[4]) ); + in_v[3] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[6]) ); + in_v[4] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[8]) ); + in_v[5] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[10]) ); + + _mm_storeu_si128( (__m128i*)(&ptrWordOut[0]), + _mm_xor_si128( state_v[0], in_v[0] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[2]), + _mm_xor_si128( state_v[1], in_v[1] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[4]), + _mm_xor_si128( state_v[2], in_v[2] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[6]), + _mm_xor_si128( state_v[3], in_v[3] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[8]), + _mm_xor_si128( state_v[4], in_v[4] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[10]), + _mm_xor_si128( state_v[5], in_v[5] ) ); + + #else + + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; ptrWordOut[4] = ptrWordIn[4] ^ state[4]; ptrWordOut[5] = ptrWordIn[5] ^ state[5]; ptrWordOut[6] = ptrWordIn[6] ^ state[6]; @@ -244,13 +525,13 @@ for (int i = 0; i<12; i++) { ptrWordOut[9] = ptrWordIn[9] ^ state[9]; ptrWordOut[10] = ptrWordIn[10] ^ state[10]; ptrWordOut[11] = ptrWordIn[11] ^ state[11]; + #endif - - //Input: next column (i.e., next block in sequence) - ptrWordIn += BLOCK_LEN_INT64; - //Output: goes to previous column - ptrWordOut -= BLOCK_LEN_INT64; - } + //Input: next column (i.e., next block in sequence) + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } } /** @@ -267,13 +548,94 @@ for (int i = 0; i<12; i++) { * @param rowOut Row receiving the output * */ - void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordOut = rowOut + 
(N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row - int i; - for (i = 0; i < N_COLS; i++) { - //Absorbing "M[prev] [+] M[row*]" +void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols) +{ + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + unsigned int i; + + for (i = 0; i < nCols; i++) + { + //Absorbing "M[prev] [+] M[row*]" + #if defined __AVX2__ + + __m256i state_v[3], in_v[3], inout_v[3]; + + state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); + inout_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[0]) ); + state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); + in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) ); + inout_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[4]) ); + state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) ); + inout_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[8]) ); + + _mm256_store_si256( (__m256i*)(&state[0]), + _mm256_xor_si256( state_v[0], + _mm256_add_epi64( in_v[0], + inout_v[0] ) ) ); + _mm256_store_si256( (__m256i*)(&state[4]), + _mm256_xor_si256( state_v[1], + _mm256_add_epi64( in_v[1], + inout_v[1] ) ) ); + _mm256_store_si256( (__m256i*)(&state[8]), + _mm256_xor_si256( state_v[2], + _mm256_add_epi64( in_v[2], + inout_v[2] ) ) ); + #elif defined __AVX__ + + __m128i state_v[6], in_v[6], inout_v[6]; + + state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); + state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); + state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); + state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); + state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); + state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); + + inout_v[0] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[0]) ); + inout_v[1] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[2]) ); + inout_v[2] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[4]) ); + inout_v[3] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[6]) ); + inout_v[4] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[8]) ); + inout_v[5] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[10]) ); + + in_v[0] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[0]) ); + in_v[1] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[2]) ); + in_v[2] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[4]) ); + in_v[3] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[6]) ); + in_v[4] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[8]) ); + in_v[5] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[10]) ); + + _mm_store_si128( (__m128i*)(&state[0]), + _mm_xor_si128( state_v[0], + _mm_add_epi64( in_v[0], + inout_v[0] ) ) ); + _mm_store_si128( (__m128i*)(&state[2]), + _mm_xor_si128( state_v[1], + _mm_add_epi64( in_v[1], + inout_v[1] ) ) ); + _mm_store_si128( (__m128i*)(&state[4]), + _mm_xor_si128( state_v[2], + _mm_add_epi64( in_v[2], + inout_v[2] ) ) ); + _mm_store_si128( (__m128i*)(&state[6]), + _mm_xor_si128( state_v[3], + _mm_add_epi64( in_v[3], + inout_v[3] ) ) ); + _mm_store_si128( (__m128i*)(&state[8]), + _mm_xor_si128( state_v[4], + _mm_add_epi64( in_v[4], + inout_v[4] ) ) ); + _mm_store_si128( (__m128i*)(&state[10]), + _mm_xor_si128( state_v[5], + _mm_add_epi64( in_v[5], + inout_v[5] ) ) ); + + #else + state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); state[2] ^= (ptrWordIn[2] 
+ ptrWordInOut[2]); @@ -286,44 +648,93 @@ for (int i = 0; i<12; i++) { state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - //M[row][col] = M[prev][col] XOR rand - ptrWordOut[0] = ptrWordIn[0] ^ state[0]; - ptrWordOut[1] = ptrWordIn[1] ^ state[1]; - ptrWordOut[2] = ptrWordIn[2] ^ state[2]; - ptrWordOut[3] = ptrWordIn[3] ^ state[3]; - ptrWordOut[4] = ptrWordIn[4] ^ state[4]; - ptrWordOut[5] = ptrWordIn[5] ^ state[5]; - ptrWordOut[6] = ptrWordIn[6] ^ state[6]; - ptrWordOut[7] = ptrWordIn[7] ^ state[7]; - ptrWordOut[8] = ptrWordIn[8] ^ state[8]; - ptrWordOut[9] = ptrWordIn[9] ^ state[9]; - ptrWordOut[10] = ptrWordIn[10] ^ state[10]; - ptrWordOut[11] = ptrWordIn[11] ^ state[11]; - - //M[row*][col] = M[row*][col] XOR rotW(rand) - ptrWordInOut[0] ^= state[11]; - ptrWordInOut[1] ^= state[0]; - ptrWordInOut[2] ^= state[1]; - ptrWordInOut[3] ^= state[2]; - ptrWordInOut[4] ^= state[3]; - ptrWordInOut[5] ^= state[4]; - ptrWordInOut[6] ^= state[5]; - ptrWordInOut[7] ^= state[6]; - ptrWordInOut[8] ^= state[7]; - ptrWordInOut[9] ^= state[8]; - ptrWordInOut[10] ^= state[9]; - ptrWordInOut[11] ^= state[10]; - - //Inputs: next column (i.e., next block in sequence) - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - //Output: goes to previous column - ptrWordOut -= BLOCK_LEN_INT64; - } + #endif + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][col] = M[prev][col] XOR rand + #if defined __AVX2__ + + state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); + state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); + in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) ); + state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) ); + + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), + _mm256_xor_si256( state_v[0], in_v[0] ) ); + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), + _mm256_xor_si256( state_v[1], in_v[1] ) ); + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), + _mm256_xor_si256( state_v[2], in_v[2] ) ); + + #elif defined __AVX__ + + state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); + state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); + state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); + state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); + state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); + state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); + + in_v[0] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[0]) ); + in_v[1] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[2]) ); + in_v[2] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[4]) ); + in_v[3] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[6]) ); + in_v[4] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[8]) ); + in_v[5] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[10]) ); + + _mm_storeu_si128( (__m128i*)(&ptrWordOut[0]), + _mm_xor_si128( state_v[0], in_v[0] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[2]), + _mm_xor_si128( state_v[1], in_v[1] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[4]), + _mm_xor_si128( state_v[2], in_v[2] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[6]), + _mm_xor_si128( state_v[3], in_v[3] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[8]), + _mm_xor_si128( state_v[4], in_v[4] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[10]), + 
_mm_xor_si128( state_v[5], in_v[5] ) ); + + #else + + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; + ptrWordOut[4] = ptrWordIn[4] ^ state[4]; + ptrWordOut[5] = ptrWordIn[5] ^ state[5]; + ptrWordOut[6] = ptrWordIn[6] ^ state[6]; + ptrWordOut[7] = ptrWordIn[7] ^ state[7]; + ptrWordOut[8] = ptrWordIn[8] ^ state[8]; + ptrWordOut[9] = ptrWordIn[9] ^ state[9]; + ptrWordOut[10] = ptrWordIn[10] ^ state[10]; + ptrWordOut[11] = ptrWordIn[11] ^ state[11]; + #endif + + //M[row*][col] = M[row*][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Inputs: next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } } /** @@ -340,410 +751,203 @@ for (int i = 0; i<12; i++) { * @param rowOut Row receiving the output * */ -void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row - int i; - - for (i = 0; i < N_COLS; i++) { - - //Absorbing "M[prev] [+] M[row*]" - state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); - state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); - state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); - state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); - state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); - state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); - state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); - state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); - state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); - state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); - state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); - state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - //M[rowOut][col] = M[rowOut][col] XOR rand - ptrWordOut[0] ^= state[0]; - ptrWordOut[1] ^= state[1]; - ptrWordOut[2] ^= state[2]; - ptrWordOut[3] ^= state[3]; - ptrWordOut[4] ^= state[4]; - ptrWordOut[5] ^= state[5]; - ptrWordOut[6] ^= state[6]; - ptrWordOut[7] ^= state[7]; - ptrWordOut[8] ^= state[8]; - ptrWordOut[9] ^= state[9]; - ptrWordOut[10] ^= state[10]; - ptrWordOut[11] ^= state[11]; - - //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) - ptrWordInOut[0] ^= state[11]; - ptrWordInOut[1] ^= state[0]; - ptrWordInOut[2] ^= state[1]; - ptrWordInOut[3] ^= state[2]; - ptrWordInOut[4] ^= state[3]; - ptrWordInOut[5] ^= state[4]; - ptrWordInOut[6] ^= state[5]; - ptrWordInOut[7] ^= state[6]; - ptrWordInOut[8] ^= state[7]; - ptrWordInOut[9] ^= state[8]; - ptrWordInOut[10] ^= state[9]; - ptrWordInOut[11] ^= state[10]; - - //Goes to next block - ptrWordOut += BLOCK_LEN_INT64; - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - } -} - - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -/** - * Performs a duplex operation over "M[rowInOut] [+] M[rowIn]", writing 
the output "rand" - * on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit - * rotation to the left. - * - * @param state The current state of the sponge - * @param rowIn Row used only as input - * @param rowInOut Row used as input and to receive output after rotation - * @param rowOut Row receiving the output - * - */ -/* -inline void reducedDuplexRowSetupOLD(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row - int i; - for (i = 0; i < N_COLS; i++) { - - //Absorbing "M[rowInOut] XOR M[rowIn]" - state[0] ^= ptrWordInOut[0] ^ ptrWordIn[0]; - state[1] ^= ptrWordInOut[1] ^ ptrWordIn[1]; - state[2] ^= ptrWordInOut[2] ^ ptrWordIn[2]; - state[3] ^= ptrWordInOut[3] ^ ptrWordIn[3]; - state[4] ^= ptrWordInOut[4] ^ ptrWordIn[4]; - state[5] ^= ptrWordInOut[5] ^ ptrWordIn[5]; - state[6] ^= ptrWordInOut[6] ^ ptrWordIn[6]; - state[7] ^= ptrWordInOut[7] ^ ptrWordIn[7]; - state[8] ^= ptrWordInOut[8] ^ ptrWordIn[8]; - state[9] ^= ptrWordInOut[9] ^ ptrWordIn[9]; - state[10] ^= ptrWordInOut[10] ^ ptrWordIn[10]; - state[11] ^= ptrWordInOut[11] ^ ptrWordIn[11]; - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - //M[row][col] = rand - ptrWordOut[0] = state[0]; - ptrWordOut[1] = state[1]; - ptrWordOut[2] = state[2]; - ptrWordOut[3] = state[3]; - ptrWordOut[4] = state[4]; - ptrWordOut[5] = state[5]; - ptrWordOut[6] = state[6]; - ptrWordOut[7] = state[7]; - ptrWordOut[8] = state[8]; - ptrWordOut[9] = state[9]; - ptrWordOut[10] = state[10]; - ptrWordOut[11] = state[11]; - - - //M[row*][col] = M[row*][col] XOR rotW(rand) - ptrWordInOut[0] ^= state[10]; - ptrWordInOut[1] ^= state[11]; - ptrWordInOut[2] ^= state[0]; - ptrWordInOut[3] ^= state[1]; - ptrWordInOut[4] ^= state[2]; - ptrWordInOut[5] ^= state[3]; - ptrWordInOut[6] ^= state[4]; - ptrWordInOut[7] ^= state[5]; - ptrWordInOut[8] ^= state[6]; - ptrWordInOut[9] ^= state[7]; - ptrWordInOut[10] ^= state[8]; - ptrWordInOut[11] ^= state[9]; - - //Goes to next column (i.e., next block in sequence) - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - ptrWordOut += BLOCK_LEN_INT64; - } -} -*/ - -/** - * Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", writing the output "rand" - * on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit - * rotation to the left. 
- * - * @param state The current state of the sponge - * @param rowIn Row used only as input - * @param rowInOut Row used as input and to receive output after rotation - * @param rowOut Row receiving the output - * - */ -/* -inline void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row - int i; - for (i = 0; i < N_COLS; i++) { - - //Absorbing "M[rowInOut] XOR M[rowIn]" - state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; - state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; - state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; - state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; - state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; - state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; - state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; - state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; - state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; - state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; - state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; - state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - - //M[row*][col] = M[row*][col] XOR rotW(rand) - ptrWordInOut[0] ^= state[10]; - ptrWordInOut[1] ^= state[11]; - ptrWordInOut[2] ^= state[0]; - ptrWordInOut[3] ^= state[1]; - ptrWordInOut[4] ^= state[2]; - ptrWordInOut[5] ^= state[3]; - ptrWordInOut[6] ^= state[4]; - ptrWordInOut[7] ^= state[5]; - ptrWordInOut[8] ^= state[6]; - ptrWordInOut[9] ^= state[7]; - ptrWordInOut[10] ^= state[8]; - ptrWordInOut[11] ^= state[9]; - - - //M[row][col] = rand - ptrWordOut[0] = state[0] ^ ptrWordIn[0]; - ptrWordOut[1] = state[1] ^ ptrWordIn[1]; - ptrWordOut[2] = state[2] ^ ptrWordIn[2]; - ptrWordOut[3] = state[3] ^ ptrWordIn[3]; - ptrWordOut[4] = state[4] ^ ptrWordIn[4]; - ptrWordOut[5] = state[5] ^ ptrWordIn[5]; - ptrWordOut[6] = state[6] ^ ptrWordIn[6]; - ptrWordOut[7] = state[7] ^ ptrWordIn[7]; - ptrWordOut[8] = state[8] ^ ptrWordIn[8]; - ptrWordOut[9] = state[9] ^ ptrWordIn[9]; - ptrWordOut[10] = state[10] ^ ptrWordIn[10]; - ptrWordOut[11] = state[11] ^ ptrWordIn[11]; - - //Goes to next column (i.e., next block in sequence) - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - ptrWordOut += BLOCK_LEN_INT64; - } -} -*/ - -/** - * Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", writing the output "rand" - * on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit - * rotation to the left. 
- * - * @param state The current state of the sponge - * @param rowIn Row used only as input - * @param rowInOut Row used as input and to receive output after rotation - * @param rowOut Row receiving the output - * - */ -/* -inline void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordOut = rowOut; - int i; - - for (i = 0; i < N_COLS / 2; i++) { - //Absorbing "M[rowInOut] XOR M[rowIn]" - state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; - state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; - state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; - state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; - state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; - state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; - state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; - state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; - state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; - state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; - state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; - state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - - //M[row*][col] = M[row*][col] XOR rotW(rand) - ptrWordInOut[0] ^= state[10]; - ptrWordInOut[1] ^= state[11]; - ptrWordInOut[2] ^= state[0]; - ptrWordInOut[3] ^= state[1]; - ptrWordInOut[4] ^= state[2]; - ptrWordInOut[5] ^= state[3]; - ptrWordInOut[6] ^= state[4]; - ptrWordInOut[7] ^= state[5]; - ptrWordInOut[8] ^= state[6]; - ptrWordInOut[9] ^= state[7]; - ptrWordInOut[10] ^= state[8]; - ptrWordInOut[11] ^= state[9]; - - - //M[row][col] = rand - ptrWordOut[0] = state[0] ^ ptrWordIn[0]; - ptrWordOut[1] = state[1] ^ ptrWordIn[1]; - ptrWordOut[2] = state[2] ^ ptrWordIn[2]; - ptrWordOut[3] = state[3] ^ ptrWordIn[3]; - ptrWordOut[4] = state[4] ^ ptrWordIn[4]; - ptrWordOut[5] = state[5] ^ ptrWordIn[5]; - ptrWordOut[6] = state[6] ^ ptrWordIn[6]; - ptrWordOut[7] = state[7] ^ ptrWordIn[7]; - ptrWordOut[8] = state[8] ^ ptrWordIn[8]; - ptrWordOut[9] = state[9] ^ ptrWordIn[9]; - ptrWordOut[10] = state[10] ^ ptrWordIn[10]; - ptrWordOut[11] = state[11] ^ ptrWordIn[11]; - - //Goes to next column (i.e., next block in sequence) - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - ptrWordOut += 2 * BLOCK_LEN_INT64; - } - - ptrWordOut = rowOut + BLOCK_LEN_INT64; - for (i = 0; i < N_COLS / 2; i++) { - //Absorbing "M[rowInOut] XOR M[rowIn]" - state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; - state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; - state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; - state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; - state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; - state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; - state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; - state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; - state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; - state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; - state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; - state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - - //M[row*][col] = M[row*][col] XOR rotW(rand) - ptrWordInOut[0] ^= state[10]; - ptrWordInOut[1] ^= state[11]; - ptrWordInOut[2] ^= state[0]; - ptrWordInOut[3] ^= state[1]; - ptrWordInOut[4] ^= state[2]; - ptrWordInOut[5] ^= state[3]; - ptrWordInOut[6] ^= state[4]; - ptrWordInOut[7] ^= state[5]; - ptrWordInOut[8] ^= state[6]; - ptrWordInOut[9] ^= state[7]; - ptrWordInOut[10] ^= state[8]; 
- ptrWordInOut[11] ^= state[9]; - - - //M[row][col] = rand - ptrWordOut[0] = state[0] ^ ptrWordIn[0]; - ptrWordOut[1] = state[1] ^ ptrWordIn[1]; - ptrWordOut[2] = state[2] ^ ptrWordIn[2]; - ptrWordOut[3] = state[3] ^ ptrWordIn[3]; - ptrWordOut[4] = state[4] ^ ptrWordIn[4]; - ptrWordOut[5] = state[5] ^ ptrWordIn[5]; - ptrWordOut[6] = state[6] ^ ptrWordIn[6]; - ptrWordOut[7] = state[7] ^ ptrWordIn[7]; - ptrWordOut[8] = state[8] ^ ptrWordIn[8]; - ptrWordOut[9] = state[9] ^ ptrWordIn[9]; - ptrWordOut[10] = state[10] ^ ptrWordIn[10]; - ptrWordOut[11] = state[11] ^ ptrWordIn[11]; - - //Goes to next column (i.e., next block in sequence) - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - ptrWordOut += 2 * BLOCK_LEN_INT64; - } -} -*/ - -/** - * Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", using the output "rand" - * to make "M[rowOut][col] = M[rowOut][col] XOR rand" and "M[rowInOut] = M[rowInOut] XOR rotW(rand)", - * where rotW is a 64-bit rotation to the left. - * - * @param state The current state of the sponge - * @param rowIn Row used only as input - * @param rowInOut Row used as input and to receive output after rotation - * @param rowOut Row receiving the output - * - */ -/* -inline void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row - int i; - for (i = 0; i < N_COLS; i++) { - - //Absorbing "M[rowInOut] XOR M[rowIn]" - state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; - state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; - state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; - state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; - state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; - state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; - state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; - state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; - state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; - state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; - state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; - state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - //M[rowOut][col] = M[rowOut][col] XOR rand - ptrWordOut[0] ^= state[0]; - ptrWordOut[1] ^= state[1]; - ptrWordOut[2] ^= state[2]; - ptrWordOut[3] ^= state[3]; - ptrWordOut[4] ^= state[4]; - ptrWordOut[5] ^= state[5]; - ptrWordOut[6] ^= state[6]; - ptrWordOut[7] ^= state[7]; - ptrWordOut[8] ^= state[8]; - ptrWordOut[9] ^= state[9]; - ptrWordOut[10] ^= state[10]; - ptrWordOut[11] ^= state[11]; - - //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) - - - //Goes to next block - ptrWordOut += BLOCK_LEN_INT64; - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - } +void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols) +{ + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row + unsigned int i; + + for (i = 0; i < nCols; i++) + { + + //Absorbing "M[prev] [+] M[row*]" + #if defined __AVX2__ + + __m256i state_v[3], in_v[3], inout_v[3]; + #define out_v in_v // reuse register in next code block + + state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); + inout_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[0]) 
); + state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); + in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) ); + inout_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[4]) ); + state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) ); + inout_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[8]) ); + + _mm256_store_si256( (__m256i*)(&state[0]), + _mm256_xor_si256( state_v[0], + _mm256_add_epi64( in_v[0], + inout_v[0] ) ) ); + _mm256_store_si256( (__m256i*)(&state[4]), + _mm256_xor_si256( state_v[1], + _mm256_add_epi64( in_v[1], + inout_v[1] ) ) ); + _mm256_store_si256( (__m256i*)(&state[8]), + _mm256_xor_si256( state_v[2], + _mm256_add_epi64( in_v[2], + inout_v[2] ) ) ); + #elif defined __AVX__ + + __m128i state_v[6], in_v[6], inout_v[6]; + #define out_v in_v // reuse register in next code block + + state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); + state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); + state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); + state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); + state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); + state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); + + inout_v[0] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[0]) ); + inout_v[1] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[2]) ); + inout_v[2] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[4]) ); + inout_v[3] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[6]) ); + inout_v[4] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[8]) ); + inout_v[5] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[10]) ); + + in_v[0] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[0]) ); + in_v[1] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[2]) ); + in_v[2] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[4]) ); + in_v[3] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[6]) ); + in_v[4] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[8]) ); + in_v[5] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[10]) ); + + _mm_store_si128( (__m128i*)(&state[0]), + _mm_xor_si128( state_v[0], + _mm_add_epi64( in_v[0], + inout_v[0] ) ) ); + _mm_store_si128( (__m128i*)(&state[2]), + _mm_xor_si128( state_v[1], + _mm_add_epi64( in_v[1], + inout_v[1] ) ) ); + _mm_store_si128( (__m128i*)(&state[4]), + _mm_xor_si128( state_v[2], + _mm_add_epi64( in_v[2], + inout_v[2] ) ) ); + _mm_store_si128( (__m128i*)(&state[6]), + _mm_xor_si128( state_v[3], + _mm_add_epi64( in_v[3], + inout_v[3] ) ) ); + _mm_store_si128( (__m128i*)(&state[8]), + _mm_xor_si128( state_v[4], + _mm_add_epi64( in_v[4], + inout_v[4] ) ) ); + _mm_store_si128( (__m128i*)(&state[10]), + _mm_xor_si128( state_v[5], + _mm_add_epi64( in_v[5], + inout_v[5] ) ) ); + + #else + + state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); + state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); + state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); + state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); + state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); + state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); + state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); + state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); + state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); + state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); + state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); + state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); + #endif + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[rowOut][col] = M[rowOut][col] XOR rand + #if defined __AVX2__ + + state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + out_v [0] = _mm256_loadu_si256( 
(__m256i*)(&ptrWordOut[0]) ); + state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); + out_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[4]) ); + state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + out_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[8]) ); + + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), + _mm256_xor_si256( state_v[0], out_v[0] ) ); + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), + _mm256_xor_si256( state_v[1], out_v[1] ) ); + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), + _mm256_xor_si256( state_v[2], out_v[2] ) ); + + #elif defined __AVX__ + + state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); + state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); + state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); + state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); + state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); + state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); + + out_v[0] = _mm_loadu_si128( (__m128i*)(&ptrWordOut[0]) ); + out_v[1] = _mm_loadu_si128( (__m128i*)(&ptrWordOut[2]) ); + out_v[2] = _mm_loadu_si128( (__m128i*)(&ptrWordOut[4]) ); + out_v[3] = _mm_loadu_si128( (__m128i*)(&ptrWordOut[6]) ); + out_v[4] = _mm_loadu_si128( (__m128i*)(&ptrWordOut[8]) ); + out_v[5] = _mm_loadu_si128( (__m128i*)(&ptrWordOut[10]) ); + + _mm_storeu_si128( (__m128i*)(&ptrWordOut[0]), + _mm_xor_si128( state_v[0], out_v[0] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[2]), + _mm_xor_si128( state_v[1], out_v[1] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[4]), + _mm_xor_si128( state_v[2], out_v[2] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[6]), + _mm_xor_si128( state_v[3], out_v[3] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[8]), + _mm_xor_si128( state_v[4], out_v[4] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[10]), + _mm_xor_si128( state_v[5], out_v[5] ) ); + + #else + + ptrWordOut[0] ^= state[0]; + ptrWordOut[1] ^= state[1]; + ptrWordOut[2] ^= state[2]; + ptrWordOut[3] ^= state[3]; + ptrWordOut[4] ^= state[4]; + ptrWordOut[5] ^= state[5]; + ptrWordOut[6] ^= state[6]; + ptrWordOut[7] ^= state[7]; + ptrWordOut[8] ^= state[8]; + ptrWordOut[9] ^= state[9]; + ptrWordOut[10] ^= state[10]; + ptrWordOut[11] ^= state[11]; + + #endif + + //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Goes to next block + ptrWordOut += BLOCK_LEN_INT64; + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + } } -*/ /** - Prints an array of unsigned chars + * Prints an array of unsigned chars */ -void printArray(unsigned char *array, unsigned int size, char *name) { +void printArray(unsigned char *array, unsigned int size, char *name) +{ unsigned int i; printf("%s: ", name); for (i = 0; i < size; i++) { diff --git a/lyra2/Sponge.h b/lyra2/Sponge.h index 9bd8ed664..d151ef837 100644 --- a/lyra2/Sponge.h +++ b/lyra2/Sponge.h @@ -24,53 +24,129 @@ #include -#if defined(__GNUC__) -#define ALIGN __attribute__ ((aligned(32))) -#elif defined(_MSC_VER) -#define ALIGN __declspec(align(32)) -#else -#define ALIGN -#endif - - -/*Blake2b IV Array*/ +/* Blake2b IV Array */ static const uint64_t blake2b_IV[8] = { - 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, - 
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, - 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, - 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL }; -/*Blake2b's rotation*/ -static __inline uint64_t rotr64( const uint64_t w, const unsigned c ){ - return ( w >> c ) | ( w << ( 64 - c ) ); +/* Blake2b's rotation */ +static +#ifdef _MSC_VER +__forceinline +#else +__inline +#endif +uint64_t rotr64(const uint64_t w, const unsigned c) { + return ( w >> c ) | ( w << ( 64 - c ) ); } -/*Blake2b's G function*/ -#define G(r,i,a,b,c,d) \ - do { \ - a = a + b; \ - d = rotr64(d ^ a, 32); \ - c = c + d; \ - b = rotr64(b ^ c, 24); \ - a = a + b; \ - d = rotr64(d ^ a, 16); \ - c = c + d; \ - b = rotr64(b ^ c, 63); \ +#if defined __AVX2__ + +// _m256i +#define mm256_rotr_64(w,c) _mm256_or_si256(_mm256_srli_epi64(w, c), \ + _mm256_slli_epi64(w, 64 - c)) + +// Rotate uint64 by one uint64 +// __m256i +#define mm256_rotl256_1x64(s) _mm256_permute4x64_epi64( s, 0x39 ) +#define mm256_rotr256_1x64(s) _mm256_permute4x64_epi64( s, 0x93 ) + +// swap hi and lo 128 bits in 256 bit vector +// _m256i +#define mm256_swap128(s) _mm256_permute2f128_si256( s, s, 1 ) + +// void +#define G_4X64(a,b,c,d) \ + a = _mm256_add_epi64( a, b ); \ + d = mm256_rotr_64( _mm256_xor_si256( d, a), 32 ); \ + c = _mm256_add_epi64( c, d ); \ + b = mm256_rotr_64( _mm256_xor_si256( b, c ), 24 ); \ + a = _mm256_add_epi64( a, b ); \ + d = mm256_rotr_64( _mm256_xor_si256( d, a ), 16 ); \ + c = _mm256_add_epi64( c, d ); \ + b = mm256_rotr_64( _mm256_xor_si256( b, c ), 63 ); + +#elif defined __AVX__ + +// _m128i +#define mm_rotr_64(w,c) _mm_or_si128(_mm_srli_epi64(w, c), \ + _mm_slli_epi64(w, 64 - c)) + +// swap 128 bit source vectors +// void +#define mm128_swap128(s0, s1) s0 = _mm_xor_si128(s0, s1); \ + s1 = _mm_xor_si128(s0, s1); \ + s0 = _mm_xor_si128(s0, s1); + +// swap uint64 in source vector +// __m128i +#define mm128_swap64(s) _mm_or_si128( _mm_slli_si128( s, 8 ), \ + _mm_srli_si128( s, 8 ) ) + +// rotate 2 128 bit vectors as one 256 vector by 1 uint64 +//void +#define mm128_rotl256_1x64(s0, s1) do { \ + __m128i t; \ + s0 = mm128_swap64( s0); \ + s1 = mm128_swap64( s1); \ + t = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \ + _mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \ + s1 = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \ + _mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \ + s0 = t; \ +} while(0) + +#define mm128_rotr256_1x64(s0, s1) do { \ + __m128i t; \ + s0 = mm128_swap64( s0); \ + s1 = mm128_swap64( s1); \ + t = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \ + _mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \ + s1 = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \ + _mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \ + s0 = t; \ +} while(0) + + +#define G_2X64(a,b,c,d) \ + a = _mm_add_epi64( a, b ); \ + d = mm_rotr_64( _mm_xor_si128( d, a), 32 ); \ + c = _mm_add_epi64( c, d ); \ + b = mm_rotr_64( _mm_xor_si128( b, c ), 24 ); \ + a = _mm_add_epi64( a, b ); \ + d = mm_rotr_64( _mm_xor_si128( d, a ), 16 ); \ + c = _mm_add_epi64( c, d ); \ + b = mm_rotr_64( _mm_xor_si128( b, c ), 63 ); + +#endif // AVX2 + +/* Blake2b's G function */ +#define 
G(r,i,a,b,c,d) do { \ + a = a + b; \ + d = rotr64(d ^ a, 32); \ + c = c + d; \ + b = rotr64(b ^ c, 24); \ + a = a + b; \ + d = rotr64(d ^ a, 16); \ + c = c + d; \ + b = rotr64(b ^ c, 63); \ } while(0) /*One Round of the Blake2b's compression function*/ -#define ROUND_LYRA(r) \ - G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ - G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ - G(r,2,v[ 2],v[ 6],v[10],v[14]); \ - G(r,3,v[ 3],v[ 7],v[11],v[15]); \ - G(r,4,v[ 0],v[ 5],v[10],v[15]); \ - G(r,5,v[ 1],v[ 6],v[11],v[12]); \ - G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ - G(r,7,v[ 3],v[ 4],v[ 9],v[14]); +#define ROUND_LYRA(r) \ + G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ + G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ + G(r,2,v[ 2],v[ 6],v[10],v[14]); \ + G(r,3,v[ 3],v[ 7],v[11],v[15]); \ + G(r,4,v[ 0],v[ 5],v[10],v[15]); \ + G(r,5,v[ 1],v[ 6],v[11],v[12]); \ + G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ + G(r,7,v[ 3],v[ 4],v[ 9],v[14]); //---- Housekeeping @@ -78,31 +154,18 @@ void initState(uint64_t state[/*16*/]); //---- Squeezes void squeeze(uint64_t *state, unsigned char *out, unsigned int len); -void reducedSqueezeRow0(uint64_t* state, uint64_t* row); +void reducedSqueezeRow0(uint64_t* state, uint64_t* row, const uint32_t nCols); //---- Absorbs void absorbBlock(uint64_t *state, const uint64_t *in); void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in); //---- Duplexes -void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut); -void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); +void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const uint32_t nCols); +void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols); +void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols); //---- Misc void printArray(unsigned char *array, unsigned int size, char *name); -//////////////////////////////////////////////////////////////////////////////////////////////// - - -////TESTS//// -//void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -//void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -//void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2); -//void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -//void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -//void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -///////////// - - #endif /* SPONGE_H_ */ diff --git a/mingw64avx.sh b/mingw64avx.sh index eddd80ae2..8955ce6c5 100644 --- a/mingw64avx.sh +++ b/mingw64avx.sh @@ -21,7 +21,7 @@ windres res/icon.rc icon.o ./configure --build=x86_64-w64-mingw32 --with-crypto=$SSL_PREFIX --with-curl=$CURL_PREFIX \ CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS" LDFLAGS="icon.o" -make +make -j8 strip -p --strip-debug --strip-unneeded cpuminer.exe diff --git a/mingw64avx2.sh b/mingw64avx2.sh index eda8a2651..8b73506ad 100644 --- a/mingw64avx2.sh +++ b/mingw64avx2.sh @@ -21,7 +21,7 @@ windres res/icon.rc icon.o ./configure --build=x86_64-w64-mingw32 --with-crypto=$SSL_PREFIX --with-curl=$CURL_PREFIX \ CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS" LDFLAGS="icon.o" -make +make -j8 strip -p --strip-debug 
--strip-unneeded cpuminer.exe diff --git a/mingw64sse2.sh b/mingw64sse2.sh index c49270b24..eaa19214e 100644 --- a/mingw64sse2.sh +++ b/mingw64sse2.sh @@ -21,7 +21,7 @@ windres res/icon.rc icon.o ./configure --build=x86_64-w64-mingw32 --with-crypto=$SSL_PREFIX --with-curl=$CURL_PREFIX \ CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS" LDFLAGS="icon.o" -make +make -j8 strip -p --strip-debug --strip-unneeded cpuminer.exe
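
For reference, the portable #else path of the new reducedDuplexRow above can be read as the following standalone scalar sketch. It is illustrative only and not part of the patch: BLOCK_LEN_INT64 is assumed to be 12 (the 96-byte rate of the 16-word sponge state), reducedBlake2bLyra is modelled as a single round of Blake2b's compression (one ROUND_LYRA, as in the reference Lyra2 code), and the name reducedDuplexRow_scalar is hypothetical. The rotW feedback is a rotation of the 12-word rate by one 64-bit word, matching the "ptrWordInOut[0] ^= state[11]" pattern in the patch.

/* Standalone scalar sketch of the reduced duplex-row step (C99).
 * Assumptions: BLOCK_LEN_INT64 == 12 and reducedBlake2bLyra == one ROUND_LYRA. */
#include <stdint.h>

#define BLOCK_LEN_INT64 12   /* rate in 64-bit words (assumed) */

static inline uint64_t rotr64(const uint64_t w, const unsigned c) {
    return (w >> c) | (w << (64 - c));
}

/* Blake2b's G function, as defined in Sponge.h (round/index args dropped). */
#define G(a,b,c,d) do {                  \
    a = a + b; d = rotr64(d ^ a, 32);    \
    c = c + d; b = rotr64(b ^ c, 24);    \
    a = a + b; d = rotr64(d ^ a, 16);    \
    c = c + d; b = rotr64(b ^ c, 63);    \
} while(0)

/* One round of Blake2b's compression over the 16-word sponge state
 * (assumed to be what reducedBlake2bLyra applies). */
static void reducedBlake2bLyra(uint64_t *v) {
    G(v[0], v[4], v[ 8], v[12]);
    G(v[1], v[5], v[ 9], v[13]);
    G(v[2], v[6], v[10], v[14]);
    G(v[3], v[7], v[11], v[15]);
    G(v[0], v[5], v[10], v[15]);
    G(v[1], v[6], v[11], v[12]);
    G(v[2], v[7], v[ 8], v[13]);
    G(v[3], v[4], v[ 9], v[14]);
}

/* Absorb M[prev] [+] M[row*], apply one reduced round, XOR the rate into
 * M[row], then feed rotW(rand) (one-word rotation) back into M[row*]. */
void reducedDuplexRow_scalar(uint64_t *state, uint64_t *rowIn,
                             uint64_t *rowInOut, uint64_t *rowOut,
                             const uint32_t nCols) {
    uint64_t *in = rowIn, *io = rowInOut, *out = rowOut;
    for (uint32_t i = 0; i < nCols; i++) {
        for (int j = 0; j < 12; j++)              /* absorb word-wise sum   */
            state[j] ^= in[j] + io[j];
        reducedBlake2bLyra(state);
        for (int j = 0; j < 12; j++)              /* M[rowOut][col] ^= rand */
            out[j] ^= state[j];
        for (int j = 0; j < 12; j++)              /* M[row*][col] ^= rotW(rand) */
            io[j] ^= state[(j + 11) % 12];
        in  += BLOCK_LEN_INT64;
        io  += BLOCK_LEN_INT64;
        out += BLOCK_LEN_INT64;
    }
}

The AVX2 and AVX paths in the patch compute exactly these word-wise additions and XORs, four or two 64-bit words per vector operation, with the sponge state kept 32-byte aligned so the _mm256_load_si256/_mm_load_si128 loads are valid.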