From 5a2c6d232e2f8dec1a302510956c30140b8413cd Mon Sep 17 00:00:00 2001
From: nicehashdev
Date: Mon, 22 Aug 2016 13:17:31 +0200
Subject: [PATCH] Faster Lyra2RE

---
 algo/aes_ni/README                  |   14 +
 algo/aes_ni/api.h                   |    2 +
 algo/aes_ni/architectures           |    1 +
 algo/aes_ni/brg_endian.h            |  133 +++
 algo/aes_ni/brg_types.h             |  234 +++++
 algo/aes_ni/groestl-asm-aes.h       | 1043 ++++++++++++++++++++
 algo/aes_ni/groestl-asm-avx.h       | 1105 +++++++++++++++++++++
 algo/aes_ni/groestl-asm-vperm.h     | 1397 +++++++++++++++++++++++++++
 algo/aes_ni/groestl-intr-aes.h      |  965 ++++++++++++++++++
 algo/aes_ni/groestl-intr-avx.h      | 1072 ++++++++++++++++++++
 algo/aes_ni/groestl-intr-vperm.h    | 1294 +++++++++++++++++++++
 algo/aes_ni/groestl-version.h       |   16 +
 algo/aes_ni/groestl256-asm-aes.h    |  529 ++++++++++
 algo/aes_ni/groestl256-asm-avx.h    |  519 ++++++++++
 algo/aes_ni/groestl256-asm-vperm.h  |  856 ++++++++++++++++
 algo/aes_ni/groestl256-intr-aes.h   |  496 ++++++++++
 algo/aes_ni/groestl256-intr-avx.h   |  482 +++++++++
 algo/aes_ni/groestl256-intr-vperm.h |  793 +++++++++++++++
 algo/aes_ni/hash-groestl.c          |  306 ++++++
 algo/aes_ni/hash-groestl.h          |  110 +++
 algo/aes_ni/hash-groestl256.c       |  318 ++++++
 algo/aes_ni/hash-groestl256.h       |  116 +++
 algo/aes_ni/implementors            |    3 +
 algo/lyra2re.c                      |   65 +-
 cpu-miner.c                         |    6 +-
 lyra2/Lyra2.c                       |   99 +-
 lyra2/Lyra2.h                       |   10 +-
 lyra2/Sponge.c                      | 1384 +++++++++++++++-----------
 lyra2/Sponge.h                      |  171 ++--
 mingw64avx.sh                       |    2 +-
 mingw64avx2.sh                      |    2 +-
 mingw64sse2.sh                      |    2 +-
 32 files changed, 12839 insertions(+), 706 deletions(-)
 create mode 100644 algo/aes_ni/README
 create mode 100644 algo/aes_ni/api.h
 create mode 100644 algo/aes_ni/architectures
 create mode 100644 algo/aes_ni/brg_endian.h
 create mode 100644 algo/aes_ni/brg_types.h
 create mode 100644 algo/aes_ni/groestl-asm-aes.h
 create mode 100644 algo/aes_ni/groestl-asm-avx.h
 create mode 100644 algo/aes_ni/groestl-asm-vperm.h
 create mode 100644 algo/aes_ni/groestl-intr-aes.h
 create mode 100644 algo/aes_ni/groestl-intr-avx.h
 create mode 100644 algo/aes_ni/groestl-intr-vperm.h
 create mode 100644 algo/aes_ni/groestl-version.h
 create mode 100644 algo/aes_ni/groestl256-asm-aes.h
 create mode 100644 algo/aes_ni/groestl256-asm-avx.h
 create mode 100644 algo/aes_ni/groestl256-asm-vperm.h
 create mode 100644 algo/aes_ni/groestl256-intr-aes.h
 create mode 100644 algo/aes_ni/groestl256-intr-avx.h
 create mode 100644 algo/aes_ni/groestl256-intr-vperm.h
 create mode 100644 algo/aes_ni/hash-groestl.c
 create mode 100644 algo/aes_ni/hash-groestl.h
 create mode 100644 algo/aes_ni/hash-groestl256.c
 create mode 100644 algo/aes_ni/hash-groestl256.h
 create mode 100644 algo/aes_ni/implementors

diff --git a/algo/aes_ni/README b/algo/aes_ni/README
new file mode 100644
index 000000000..e55be0b59
--- /dev/null
+++ b/algo/aes_ni/README
@@ -0,0 +1,14 @@
+This package contains an implementation of the Groestl-512 hash
+function optimized for the Intel AES instructions.
+
+Authors are Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
+
+There are no known present or future claims by a copyright holder that
+the distribution of this software infringes the copyright. In
+particular, the author of the software is not making such claims and
+does not intend to make such claims.
+
+Moreover, there are no known present or future claims by a patent
+holder that the use of this software infringes the patent. In
+particular, the author of the software is not making such claims and
+does not intend to make such claims.
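For reference while reading the SSE/AES code added below: Groestl's MixBytes step works in GF(2^8) with the same 0x1b reduction constant as AES, and the MUL2/VMUL2 macros in the new headers double 16 field elements at a time (pcmpgtb builds a 0xFF mask for bytes whose top bit is set, paddb shifts every byte left, pand keeps 0x1b where reduction is needed, pxor applies it). A minimal scalar C sketch of the same operation, not part of the patch:

#include <stdint.h>

/* Multiply one GF(2^8) element by 2, reducing by x^8+x^4+x^3+x+1 (0x11b).
 * This mirrors what the MUL2/VMUL2 macros compute per byte of an xmm register. */
static uint8_t xtime(uint8_t a)
{
    uint8_t mask = (uint8_t)-(a >> 7);          /* 0xFF if the top bit is set, else 0x00 */
    return (uint8_t)((a << 1) ^ (mask & 0x1b)); /* shift left, conditionally xor 0x1b   */
}

/* The same doubling applied to a 16-byte lane, i.e. one xmm register's worth of state. */
static void mul2_lane(uint8_t v[16])
{
    for (int i = 0; i < 16; i++)
        v[i] = xtime(v[i]);
}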
diff --git a/algo/aes_ni/api.h b/algo/aes_ni/api.h new file mode 100644 index 000000000..e56a47f18 --- /dev/null +++ b/algo/aes_ni/api.h @@ -0,0 +1,2 @@ +#define CRYPTO_BYTES 64 +#define CRYPTO_VERSION "2.2" diff --git a/algo/aes_ni/architectures b/algo/aes_ni/architectures new file mode 100644 index 000000000..21d5bd8c7 --- /dev/null +++ b/algo/aes_ni/architectures @@ -0,0 +1 @@ +amd64 diff --git a/algo/aes_ni/brg_endian.h b/algo/aes_ni/brg_endian.h new file mode 100644 index 000000000..e3cf0d11d --- /dev/null +++ b/algo/aes_ni/brg_endian.h @@ -0,0 +1,133 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The redistribution and use of this software (with or without changes) + is allowed without the payment of fees or royalties provided that: + + 1. source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + 2. binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation; + + 3. the name of the copyright holder is not used to endorse products + built using this software without specific written permission. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue Date: 20/12/2007 +*/ + +#ifndef _BRG_ENDIAN_H +#define _BRG_ENDIAN_H + +#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ +#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ + +/* Include files where endian defines and byteswap functions may reside */ +#if defined( __sun ) +# include +#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) +# include +#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ + defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) +# include +#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) +# if !defined( __MINGW32__ ) && !defined( _AIX ) +# include +# if !defined( __BEOS__ ) +# include +# endif +# endif +#endif + +/* Now attempt to set the define for platform byte order using any */ +/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ +/* seem to encompass most endian symbol definitions */ + +#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) +# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) +# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( _BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( _LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) +# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN +# 
define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) +# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +/* if the platform byte order could not be determined, then try to */ +/* set this define using common machine defines */ +#if !defined(PLATFORM_BYTE_ORDER) + +#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ + defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ + defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ + defined( vax ) || defined( vms ) || defined( VMS ) || \ + defined( __VMS ) || defined( _M_X64 ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + +#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ + defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ + defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ + defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ + defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ + defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ + defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#else +# error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order +#endif + +#endif + +#endif diff --git a/algo/aes_ni/brg_types.h b/algo/aes_ni/brg_types.h new file mode 100644 index 000000000..fd603b752 --- /dev/null +++ b/algo/aes_ni/brg_types.h @@ -0,0 +1,234 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. + + (a few lines added by Soeren S. Thomsen, October 2008) + + LICENSE TERMS + + The redistribution and use of this software (with or without changes) + is allowed without the payment of fees or royalties provided that: + + 1. source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + 2. binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation; + + 3. the name of the copyright holder is not used to endorse products + built using this software without specific written permission. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. 
+ --------------------------------------------------------------------------- + Issue Date: 20/12/2007 + + The unsigned integer types defined here are of the form uint_t where + is the length of the type; for example, the unsigned 32-bit type is + 'uint_32t'. These are NOT the same as the 'C99 integer types' that are + defined in the inttypes.h and stdint.h headers since attempts to use these + types have shown that support for them is still highly variable. However, + since the latter are of the form uint_t, a regular expression search + and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t') + can be used to convert the types used here to the C99 standard types. +*/ + +#ifndef _BRG_TYPES_H +#define _BRG_TYPES_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include + +#if defined( _MSC_VER ) && ( _MSC_VER >= 1300 ) +# include +# define ptrint_t intptr_t +#elif defined( __GNUC__ ) && ( __GNUC__ >= 3 ) +# include +# define ptrint_t intptr_t +#else +# define ptrint_t int +#endif + +#ifndef BRG_UI8 +# define BRG_UI8 +# if UCHAR_MAX == 255u + typedef unsigned char uint_8t; +# else +# error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h +# endif +#endif + +#ifndef BRG_UI16 +# define BRG_UI16 +# if USHRT_MAX == 65535u + typedef unsigned short uint_16t; +# else +# error Please define uint_16t as a 16-bit unsigned short type in brg_types.h +# endif +#endif + +#ifndef BRG_UI32 +# define BRG_UI32 +# if UINT_MAX == 4294967295u +# define li_32(h) 0x##h##u + typedef unsigned int uint_32t; +# elif ULONG_MAX == 4294967295u +# define li_32(h) 0x##h##ul + typedef unsigned long uint_32t; +# elif defined( _CRAY ) +# error This code needs 32-bit data types, which Cray machines do not provide +# else +# error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h +# endif +#endif + +#ifndef BRG_UI64 +# if defined( __BORLANDC__ ) && !defined( __MSDOS__ ) +# define BRG_UI64 +# define li_64(h) 0x##h##ui64 + typedef unsigned __int64 uint_64t; +# elif defined( _MSC_VER ) && ( _MSC_VER < 1300 ) /* 1300 == VC++ 7.0 */ +# define BRG_UI64 +# define li_64(h) 0x##h##ui64 + typedef unsigned __int64 uint_64t; +# elif defined( __sun ) && defined( ULONG_MAX ) && ULONG_MAX == 0xfffffffful +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned long long uint_64t; +# elif defined( __MVS__ ) +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned int long long uint_64t; +# elif defined( UINT_MAX ) && UINT_MAX > 4294967295u +# if UINT_MAX == 18446744073709551615u +# define BRG_UI64 +# define li_64(h) 0x##h##u + typedef unsigned int uint_64t; +# endif +# elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u +# if ULONG_MAX == 18446744073709551615ul +# define BRG_UI64 +# define li_64(h) 0x##h##ul + typedef unsigned long uint_64t; +# endif +# elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u +# if ULLONG_MAX == 18446744073709551615ull +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned long long uint_64t; +# endif +# elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u +# if ULONG_LONG_MAX == 18446744073709551615ull +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned long long uint_64t; +# endif +# endif +#endif + +#if !defined( BRG_UI64 ) +# if defined( NEED_UINT_64T ) +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned long long uint_64t; + /*# error Please define uint_64t as an unsigned 64 bit type in brg_types.h*/ +# endif +#endif + +#ifndef RETURN_VALUES +# define RETURN_VALUES 
+# if defined( DLL_EXPORT ) +# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER ) +# define VOID_RETURN __declspec( dllexport ) void __stdcall +# define INT_RETURN __declspec( dllexport ) int __stdcall +# elif defined( __GNUC__ ) +# define VOID_RETURN __declspec( __dllexport__ ) void +# define INT_RETURN __declspec( __dllexport__ ) int +# else +# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers +# endif +# elif defined( DLL_IMPORT ) +# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER ) +# define VOID_RETURN __declspec( dllimport ) void __stdcall +# define INT_RETURN __declspec( dllimport ) int __stdcall +# elif defined( __GNUC__ ) +# define VOID_RETURN __declspec( __dllimport__ ) void +# define INT_RETURN __declspec( __dllimport__ ) int +# else +# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers +# endif +# elif defined( __WATCOMC__ ) +# define VOID_RETURN void __cdecl +# define INT_RETURN int __cdecl +# else +# define VOID_RETURN void +# define INT_RETURN int +# endif +#endif + +/* These defines are used to detect and set the memory alignment of pointers. + Note that offsets are in bytes. + + ALIGN_OFFSET(x,n) return the positive or zero offset of + the memory addressed by the pointer 'x' + from an address that is aligned on an + 'n' byte boundary ('n' is a power of 2) + + ALIGN_FLOOR(x,n) return a pointer that points to memory + that is aligned on an 'n' byte boundary + and is not higher than the memory address + pointed to by 'x' ('n' is a power of 2) + + ALIGN_CEIL(x,n) return a pointer that points to memory + that is aligned on an 'n' byte boundary + and is not lower than the memory address + pointed to by 'x' ('n' is a power of 2) +*/ + +#define ALIGN_OFFSET(x,n) (((ptrint_t)(x)) & ((n) - 1)) +#define ALIGN_FLOOR(x,n) ((uint_8t*)(x) - ( ((ptrint_t)(x)) & ((n) - 1))) +#define ALIGN_CEIL(x,n) ((uint_8t*)(x) + (-((ptrint_t)(x)) & ((n) - 1))) + +/* These defines are used to declare buffers in a way that allows + faster operations on longer variables to be used. In all these + defines 'size' must be a power of 2 and >= 8. NOTE that the + buffer size is in bytes but the type length is in bits + + UNIT_TYPEDEF(x,size) declares a variable 'x' of length + 'size' bits + + BUFR_TYPEDEF(x,size,bsize) declares a buffer 'x' of length 'bsize' + bytes defined as an array of variables + each of 'size' bits (bsize must be a + multiple of size / 8) + + UNIT_CAST(x,size) casts a variable to a type of + length 'size' bits + + UPTR_CAST(x,size) casts a pointer to a pointer to a + varaiable of length 'size' bits +*/ + +#define UI_TYPE(size) uint_##size##t +#define UNIT_TYPEDEF(x,size) typedef UI_TYPE(size) x +#define BUFR_TYPEDEF(x,size,bsize) typedef UI_TYPE(size) x[bsize / (size >> 3)] +#define UNIT_CAST(x,size) ((UI_TYPE(size) )(x)) +#define UPTR_CAST(x,size) ((UI_TYPE(size)*)(x)) + + /* Added by Soeren S. Thomsen (begin) */ +#define u8 uint_8t +#define u32 uint_32t +#define u64 uint_64t + /* (end) */ + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/algo/aes_ni/groestl-asm-aes.h b/algo/aes_ni/groestl-asm-aes.h new file mode 100644 index 000000000..c4e44a4d6 --- /dev/null +++ b/algo/aes_ni/groestl-asm-aes.h @@ -0,0 +1,1043 @@ +/* groestl-asm-aes.h Aug 2011 + * + * Groestl implementation with inline assembly using ssse3, sse4.1, and aes + * instructions. + * Authors: Günther A. 
Roland, Martin Schläffer, Krystian Matusiewicz + * + * This code is placed in the public domain + */ + +#include "hash-groestl.h" +/* global constants */ +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; +__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16]; +__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16]; +__attribute__ ((aligned (16))) unsigned char ALL_1B[16]; +__attribute__ ((aligned (16))) unsigned char ALL_FF[16]; + +/* temporary variables */ +__attribute__ ((aligned (16))) unsigned char QTEMP[8*16]; +__attribute__ ((aligned (16))) unsigned char TEMP[3*16]; + + +#define tos(a) #a +#define tostr(a) tos(a) + + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b */ +#define MUL2(i, j, k){\ + asm("pxor xmm"tostr(j)", xmm"tostr(j)"");\ + asm("pcmpgtb xmm"tostr(j)", xmm"tostr(i)"");\ + asm("paddb xmm"tostr(i)", xmm"tostr(i)"");\ + asm("pand xmm"tostr(j)", xmm"tostr(k)"");\ + asm("pxor xmm"tostr(i)", xmm"tostr(j)"");\ +}/**/ + +/* Yet another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + asm("movdqa xmm"tostr(b6)", xmm"tostr(a0)"");\ + asm("movdqa xmm"tostr(b7)", xmm"tostr(a1)"");\ + asm("pxor xmm"tostr(a0)", xmm"tostr(a1)"");\ + asm("movdqa xmm"tostr(b0)", xmm"tostr(a2)"");\ + asm("pxor xmm"tostr(a1)", xmm"tostr(a2)"");\ + asm("movdqa xmm"tostr(b1)", xmm"tostr(a3)"");\ + asm("pxor xmm"tostr(a2)", xmm"tostr(a3)"");\ + asm("movdqa xmm"tostr(b2)", xmm"tostr(a4)"");\ + asm("pxor xmm"tostr(a3)", xmm"tostr(a4)"");\ + asm("movdqa xmm"tostr(b3)", xmm"tostr(a5)"");\ + asm("pxor xmm"tostr(a4)", xmm"tostr(a5)"");\ + asm("movdqa xmm"tostr(b4)", xmm"tostr(a6)"");\ + asm("pxor xmm"tostr(a5)", xmm"tostr(a6)"");\ + asm("movdqa xmm"tostr(b5)", xmm"tostr(a7)"");\ + asm("pxor xmm"tostr(a6)", xmm"tostr(a7)"");\ + asm("pxor xmm"tostr(a7)", xmm"tostr(b6)"");\ + \ + /* build y4 y5 y6 ... 
in regs xmm8, xmm9, xmm10 by adding t_i*/\ + asm("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\ + asm("pxor xmm"tostr(b6)", xmm"tostr(a4)"");\ + asm("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ + asm("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\ + asm("pxor xmm"tostr(b2)", xmm"tostr(a6)"");\ + asm("pxor xmm"tostr(b0)", xmm"tostr(a6)"");\ + /* spill values y_4, y_5 to memory */\ + asm("movaps [TEMP+0*16], xmm"tostr(b0)"");\ + asm("pxor xmm"tostr(b3)", xmm"tostr(a7)"");\ + asm("pxor xmm"tostr(b1)", xmm"tostr(a7)"");\ + asm("movaps [TEMP+1*16], xmm"tostr(b1)"");\ + asm("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\ + asm("pxor xmm"tostr(b2)", xmm"tostr(a0)"");\ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + asm("movdqa xmm"tostr(b0)", xmm"tostr(a0)"");\ + asm("pxor xmm"tostr(b5)", xmm"tostr(a1)"");\ + asm("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\ + asm("movdqa xmm"tostr(b1)", xmm"tostr(a1)"");\ + asm("pxor xmm"tostr(b6)", xmm"tostr(a2)"");\ + asm("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\ + asm("movaps [TEMP+2*16], xmm"tostr(a2)"");\ + asm("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\ + asm("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ + \ + /* compute x_i = t_i + t_{i+3} */\ + asm("pxor xmm"tostr(a0)", xmm"tostr(a3)"");\ + asm("pxor xmm"tostr(a1)", xmm"tostr(a4)"");\ + asm("pxor xmm"tostr(a2)", xmm"tostr(a5)"");\ + asm("pxor xmm"tostr(a3)", xmm"tostr(a6)"");\ + asm("pxor xmm"tostr(a4)", xmm"tostr(a7)"");\ + asm("pxor xmm"tostr(a5)", xmm"tostr(b0)"");\ + asm("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ + asm("pxor xmm"tostr(a7)", [TEMP+2*16]");\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + asm("movaps xmm"tostr(b1)", [ALL_1B]");\ + MUL2(a0, b0, b1);\ + asm("pxor xmm"tostr(a0)", [TEMP+0*16]");\ + MUL2(a1, b0, b1);\ + asm("pxor xmm"tostr(a1)", [TEMP+1*16]");\ + MUL2(a2, b0, b1);\ + asm("pxor xmm"tostr(a2)", xmm"tostr(b2)"");\ + MUL2(a3, b0, b1);\ + asm("pxor xmm"tostr(a3)", xmm"tostr(b3)"");\ + MUL2(a4, b0, b1);\ + asm("pxor xmm"tostr(a4)", xmm"tostr(b4)"");\ + MUL2(a5, b0, b1);\ + asm("pxor xmm"tostr(a5)", xmm"tostr(b5)"");\ + MUL2(a6, b0, b1);\ + asm("pxor xmm"tostr(a6)", xmm"tostr(b6)"");\ + MUL2(a7, b0, b1);\ + asm("pxor xmm"tostr(a7)", xmm"tostr(b7)"");\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... 
*/\ + MUL2(a0, b0, b1);\ + asm("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\ + MUL2(a1, b0, b1);\ + asm("pxor xmm"tostr(b6)", xmm"tostr(a1)"");\ + MUL2(a2, b0, b1);\ + asm("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\ + MUL2(a5, b0, b1);\ + asm("pxor xmm"tostr(b2)", xmm"tostr(a5)"");\ + MUL2(a6, b0, b1);\ + asm("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\ + MUL2(a7, b0, b1);\ + asm("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\ + MUL2(a3, b0, b1);\ + MUL2(a4, b0, b1);\ + asm("movaps xmm"tostr(b0)", [TEMP+0*16]");\ + asm("movaps xmm"tostr(b1)", [TEMP+1*16]");\ + asm("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\ + asm("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\ +}/*MixBytes*/ + +#if (LENGTH <= 256) + +#define SET_CONSTANTS(){\ + ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ + ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ + ((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\ + ((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\ + ((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\ + ((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\ + ((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\ + ((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\ + ((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\ + ((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\ + ((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\ + ((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\ + ((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\ + ((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\ + ((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\ + ((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\ + ((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\ + ((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ + ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ + ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ + }\ + ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ +}while(0); + +#define Push_All_Regs() do{\ +/* not using any... + asm("push rax");\ + asm("push rbx");\ + asm("push rcx");*/\ +}while(0); + +#define Pop_All_Regs() do{\ +/* not using any... 
+ asm("pop rcx");\ + asm("pop rbx");\ + asm("pop rax");*/\ +}while(0); + +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant */\ + asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ + asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ + asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ + /* ShiftBytes + SubBytes (interleaved) */\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ + asm ("aesenclast xmm"tostr(a0)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ + asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ + asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ + asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ + asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\ + asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ + asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ + asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ +} + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ + \ + asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ + \ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ + \ + asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\ + asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\ + \ + asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ + asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ + asm ("pshufd 
xmm"tostr(t0)", xmm"tostr(t0)", 216");\ + \ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\ + \ + asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\ + asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\ + asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\ + asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\ + asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ + asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\ + asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\ + asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\ + asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\ + asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\ + asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\ + asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\ + asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i7)", 
xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ +}/**/ + + +void INIT(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + asm volatile ("emms"); + + /* load IV into registers xmm12 - xmm15 */ + asm ("movaps xmm12, [rdi+0*16]"); + asm ("movaps xmm13, [rdi+1*16]"); + asm ("movaps xmm14, [rdi+2*16]"); + asm ("movaps xmm15, [rdi+3*16]"); + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* store transposed IV */ + asm ("movaps [rdi+0*16], xmm12"); + asm ("movaps [rdi+1*16], xmm2"); + asm ("movaps [rdi+2*16], xmm6"); + asm ("movaps [rdi+3*16], xmm7"); + + asm volatile ("emms"); + asm (".att_syntax noprefix"); +} + +void TF512(u64* h, u64* m) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + /* message M in rsi */ + +#ifdef IACA_TRACE + IACA_START; +#endif + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load message into registers xmm12 - xmm15 (Q = message) */ + asm ("movaps xmm12, [rsi+0*16]"); + asm ("movaps xmm13, [rsi+1*16]"); + asm ("movaps xmm14, [rsi+2*16]"); + asm ("movaps xmm15, [rsi+3*16]"); + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (2x64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm0, [rdi+1*16]"); + asm ("movaps xmm4, [rdi+2*16]"); + asm ("movaps xmm5, [rdi+3*16]"); + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + asm ("pxor xmm8, xmm12"); + asm ("pxor xmm0, xmm2"); + asm ("pxor xmm4, xmm6"); + asm ("pxor xmm5, xmm7"); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + asm ("pxor xmm0, xmm8"); + asm ("pxor xmm1, xmm10"); + asm ("pxor xmm2, xmm12"); + asm ("pxor xmm3, xmm14"); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + asm ("pxor xmm0, [rdi+0*16]"); + asm ("pxor xmm1, [rdi+1*16]"); + asm ("pxor xmm2, [rdi+2*16]"); + asm ("pxor xmm3, [rdi+3*16]"); + + /* store CV */ + asm ("movaps [rdi+0*16], xmm0"); + asm ("movaps [rdi+1*16], xmm1"); + asm ("movaps [rdi+2*16], xmm2"); + asm ("movaps [rdi+3*16], xmm3"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); 
+ +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF512(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm10, [rdi+1*16]"); + asm ("movaps xmm12, [rdi+2*16]"); + asm ("movaps xmm14, [rdi+3*16]"); + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm10, [rdi+1*16]"); + asm ("pxor xmm12, [rdi+2*16]"); + asm ("pxor xmm14, [rdi+3*16]"); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); + + /* we only need to return the truncated half of the state */ + asm ("movaps [rdi+2*16], xmm9"); + asm ("movaps [rdi+3*16], xmm11"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + + return; +} + +#endif + +#if (LENGTH > 256) + +#define SET_CONSTANTS(){\ + ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_FF)[0] = 0xffffffffffffffffULL;\ + ((u64*)ALL_FF)[1] = 0xffffffffffffffffULL;\ + ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ + ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ + ((u64*)SUBSH_MASK)[ 0] = 0x0b0e0104070a0d00ULL;\ + ((u64*)SUBSH_MASK)[ 1] = 0x0306090c0f020508ULL;\ + ((u64*)SUBSH_MASK)[ 2] = 0x0c0f0205080b0e01ULL;\ + ((u64*)SUBSH_MASK)[ 3] = 0x04070a0d00030609ULL;\ + ((u64*)SUBSH_MASK)[ 4] = 0x0d000306090c0f02ULL;\ + ((u64*)SUBSH_MASK)[ 5] = 0x05080b0e0104070aULL;\ + ((u64*)SUBSH_MASK)[ 6] = 0x0e0104070a0d0003ULL;\ + ((u64*)SUBSH_MASK)[ 7] = 0x06090c0f0205080bULL;\ + ((u64*)SUBSH_MASK)[ 8] = 0x0f0205080b0e0104ULL;\ + ((u64*)SUBSH_MASK)[ 9] = 0x070a0d000306090cULL;\ + ((u64*)SUBSH_MASK)[10] = 0x000306090c0f0205ULL;\ + ((u64*)SUBSH_MASK)[11] = 0x080b0e0104070a0dULL;\ + ((u64*)SUBSH_MASK)[12] = 0x0104070a0d000306ULL;\ + ((u64*)SUBSH_MASK)[13] = 0x090c0f0205080b0eULL;\ + ((u64*)SUBSH_MASK)[14] = 0x06090c0f0205080bULL;\ + ((u64*)SUBSH_MASK)[15] = 0x0e0104070a0d0003ULL;\ + for(i = 0; i < ROUNDS1024; i++)\ + {\ + ((u64*)ROUND_CONST_P)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0xf0e0d0c0b0a09080ULL;\ + ((u64*)ROUND_CONST_P)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ + ((u64*)ROUND_CONST_Q)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0x0f1f2f3f4f5f6f7fULL;\ + ((u64*)ROUND_CONST_Q)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ + }\ +}while(0); + +#define Push_All_Regs() do{\ + asm("push rax");\ + asm("push rbx");\ + asm("push rcx");\ +}while(0); + +#define Pop_All_Regs() do{\ + asm("pop rcx");\ + asm("pop rbx");\ + asm("pop rax");\ +}while(0); + +/* one round + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* SubBytes */\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(b0)"");\ + asm ("aesenclast 
xmm"tostr(a0)", xmm"tostr(b0)"");\ + asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\ + asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\ + asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\ + asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\ + asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\ + asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\ + asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +#define ROUNDS_P(){\ + asm ("xor rax, rax");\ + asm ("xor rbx, rbx");\ + asm ("add bl, 2");\ + asm ("1:");\ + /* AddRoundConstant P1024 */\ + asm ("pxor xmm8, [ROUND_CONST_P+eax*8]");\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + asm ("pshufb xmm8, [SUBSH_MASK+0*16]");\ + asm ("pshufb xmm9, [SUBSH_MASK+1*16]");\ + asm ("pshufb xmm10, [SUBSH_MASK+2*16]");\ + asm ("pshufb xmm11, [SUBSH_MASK+3*16]");\ + asm ("pshufb xmm12, [SUBSH_MASK+4*16]");\ + asm ("pshufb xmm13, [SUBSH_MASK+5*16]");\ + asm ("pshufb xmm14, [SUBSH_MASK+6*16]");\ + asm ("pshufb xmm15, [SUBSH_MASK+7*16]");\ + /* SubBytes + MixBytes */\ + SUBMIX(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + \ + /* AddRoundConstant P1024 */\ + asm ("pxor xmm0, [ROUND_CONST_P+ebx*8]");\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + asm ("pshufb xmm0, [SUBSH_MASK+0*16]");\ + asm ("pshufb xmm1, [SUBSH_MASK+1*16]");\ + asm ("pshufb xmm2, [SUBSH_MASK+2*16]");\ + asm ("pshufb xmm3, [SUBSH_MASK+3*16]");\ + asm ("pshufb xmm4, [SUBSH_MASK+4*16]");\ + asm ("pshufb xmm5, [SUBSH_MASK+5*16]");\ + asm ("pshufb xmm6, [SUBSH_MASK+6*16]");\ + asm ("pshufb xmm7, [SUBSH_MASK+7*16]");\ + /* SubBytes + MixBytes */\ + SUBMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + asm ("add al, 4");\ + asm ("add bl, 4");\ + asm ("mov rcx, rax");\ + asm ("sub cl, 28");\ + asm ("jb 1b");\ +} + +#define ROUNDS_Q(){\ + asm ("xor rax, rax");\ + asm ("xor rbx, rbx");\ + asm ("add bl, 2");\ + asm ("2:");\ + /* AddRoundConstant Q1024 */\ + asm ("movaps xmm1, [ALL_FF]");\ + asm ("pxor xmm8, xmm1");\ + asm ("pxor xmm9, xmm1");\ + asm ("pxor xmm10, xmm1");\ + asm ("pxor xmm11, xmm1");\ + asm ("pxor xmm12, xmm1");\ + asm ("pxor xmm13, xmm1");\ + asm ("pxor xmm14, xmm1");\ + asm ("pxor xmm15, [ROUND_CONST_Q+eax*8]");\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + asm ("pshufb xmm8, [SUBSH_MASK+1*16]");\ + asm ("pshufb xmm9, [SUBSH_MASK+3*16]");\ + asm ("pshufb xmm10, [SUBSH_MASK+5*16]");\ + asm ("pshufb xmm11, [SUBSH_MASK+7*16]");\ + asm ("pshufb xmm12, [SUBSH_MASK+0*16]");\ + asm ("pshufb xmm13, [SUBSH_MASK+2*16]");\ + asm ("pshufb xmm14, [SUBSH_MASK+4*16]");\ + asm ("pshufb xmm15, [SUBSH_MASK+6*16]");\ + /* SubBytes + MixBytes */\ + SUBMIX(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + \ + /* AddConstant */\ + asm ("movaps xmm9, [ALL_FF]");\ + asm ("pxor xmm0, xmm9");\ + asm ("pxor xmm1, xmm9");\ + asm ("pxor xmm2, xmm9");\ + asm ("pxor xmm3, xmm9");\ + asm ("pxor xmm4, xmm9");\ + asm ("pxor xmm5, xmm9");\ + asm ("pxor xmm6, xmm9");\ + asm ("pxor xmm7, [ROUND_CONST_Q+ebx*8]");\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + asm ("pshufb xmm0, [SUBSH_MASK+1*16]");\ + asm ("pshufb xmm1, [SUBSH_MASK+3*16]");\ + asm ("pshufb xmm2, [SUBSH_MASK+5*16]");\ + asm ("pshufb xmm3, [SUBSH_MASK+7*16]");\ + asm ("pshufb xmm4, [SUBSH_MASK+0*16]");\ + asm ("pshufb xmm5, [SUBSH_MASK+2*16]");\ + asm ("pshufb xmm6, [SUBSH_MASK+4*16]");\ + asm ("pshufb xmm7, [SUBSH_MASK+6*16]");\ + /* SubBytes + MixBytes */\ + SUBMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + 
asm ("add al, 4");\ + asm ("add bl, 4");\ + asm ("mov rcx, rax");\ + asm ("sub cl, 28");\ + asm ("jb 2b");\ +} + +/* Matrix Transpose + * input is a 1024-bit state with two columns in one xmm + * output is a 1024-bit state with two rows in one xmm + * inputs: i0-i7 + * outputs: i0-i7 + * clobbers: t0-t7 + */ +#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ + asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ + \ + asm ("pshufb xmm"tostr(i6)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ + asm ("movdqa xmm"tostr(t1)", xmm"tostr(i2)"");\ + asm ("pshufb xmm"tostr(i4)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i5)", xmm"tostr(t0)"");\ + asm ("movdqa xmm"tostr(t2)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(t3)", xmm"tostr(i6)"");\ + asm ("pshufb xmm"tostr(i7)", xmm"tostr(t0)"");\ + \ + /* continue with unpack using 4 temp registers */\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(i0)"");\ + asm ("punpckhwd xmm"tostr(t2)", xmm"tostr(i5)"");\ + asm ("punpcklwd xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpckhwd xmm"tostr(t3)", xmm"tostr(i7)"");\ + asm ("punpcklwd xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i1)"");\ + asm ("punpckhwd xmm"tostr(t1)", xmm"tostr(i3)"");\ + asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ + \ + /* shuffle with immediate */\ + asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ + asm ("pshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ + asm ("pshufd xmm"tostr(t2)", xmm"tostr(t2)", 216");\ + asm ("pshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ + asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ + asm ("pshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ + asm ("pshufd xmm"tostr(i6)", xmm"tostr(i6)", 216");\ + \ + /* continue with unpack */\ + asm ("movdqa xmm"tostr(t4)", xmm"tostr(i0)"");\ + asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("punpckhdq xmm"tostr(t4)", xmm"tostr(i2)"");\ + asm ("movdqa xmm"tostr(t5)", xmm"tostr(t0)"");\ + asm ("punpckldq xmm"tostr(t0)", xmm"tostr(t1)"");\ + asm ("punpckhdq xmm"tostr(t5)", xmm"tostr(t1)"");\ + asm ("movdqa xmm"tostr(t6)", xmm"tostr(i4)"");\ + asm ("punpckldq xmm"tostr(i4)", xmm"tostr(i6)"");\ + asm ("movdqa xmm"tostr(t7)", xmm"tostr(t2)"");\ + asm ("punpckhdq xmm"tostr(t6)", xmm"tostr(i6)"");\ + asm ("movdqa xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("punpckldq xmm"tostr(t2)", xmm"tostr(t3)"");\ + asm ("movdqa xmm"tostr(i3)", xmm"tostr(t0)"");\ + asm ("punpckhdq xmm"tostr(t7)", xmm"tostr(t3)"");\ + \ + /* there are now 2 rows in each xmm */\ + /* unpack to get 1 row of CV in each xmm */\ + asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ + asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(i4)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(i4)", xmm"tostr(t4)"");\ + asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t2)"");\ + asm ("movdqa xmm"tostr(i5)", xmm"tostr(t4)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t2)"");\ + asm ("movdqa xmm"tostr(i6)", xmm"tostr(t5)"");\ + asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t6)"");\ + asm ("movdqa xmm"tostr(i7)", xmm"tostr(t5)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t6)"");\ + asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t7)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t7)"");\ + /* 
transpose done */\ +}/**/ + +/* Matrix Transpose Inverse + * input is a 1024-bit state with two rows in one xmm + * output is a 1024-bit state with two columns in one xmm + * inputs: i0-i7 + * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) + * clobbers: t0-t4 + */ +#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ + /* transpose matrix to get output format */\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i1)"");\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpckhqdq xmm"tostr(t0)", xmm"tostr(i3)"");\ + asm ("movdqa xmm"tostr(t1)", xmm"tostr(i4)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpckhqdq xmm"tostr(t1)", xmm"tostr(i5)"");\ + asm ("movdqa xmm"tostr(t2)", xmm"tostr(i6)"");\ + asm ("movaps xmm"tostr(o0)", [TRANSP_MASK]");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("punpckhqdq xmm"tostr(t2)", xmm"tostr(i7)"");\ + /* load transpose mask into a register, because it will be used 8 times */\ + asm ("pshufb xmm"tostr(i0)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(i2)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(i4)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(i6)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(o1)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(t0)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(t1)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(t2)", xmm"tostr(o0)"");\ + /* continue with unpack using 4 temp registers */\ + asm ("movdqa xmm"tostr(t3)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(o1)"");\ + asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(t4)", xmm"tostr(t1)"");\ + \ + asm ("punpckhwd xmm"tostr(t3)", xmm"tostr(i6)"");\ + asm ("punpcklwd xmm"tostr(i4)", xmm"tostr(i6)"");\ + asm ("punpckhwd xmm"tostr(o0)", xmm"tostr(i2)"");\ + asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("punpckhwd xmm"tostr(o2)", xmm"tostr(t0)"");\ + asm ("punpcklwd xmm"tostr(o1)", xmm"tostr(t0)"");\ + asm ("punpckhwd xmm"tostr(t4)", xmm"tostr(t2)"");\ + asm ("punpcklwd xmm"tostr(t1)", xmm"tostr(t2)"");\ + /* shuffle with immediate */\ + asm ("pshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ + asm ("pshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ + asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ + asm ("pshufd xmm"tostr(o2)", xmm"tostr(o2)", 216");\ + asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("pshufd xmm"tostr(o0)", xmm"tostr(o0)", 216");\ + asm ("pshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ + asm ("pshufd xmm"tostr(t4)", xmm"tostr(t4)", 216");\ + /* continue with unpack */\ + asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(i3)", xmm"tostr(o0)"");\ + asm ("movdqa xmm"tostr(i5)", xmm"tostr(o1)"");\ + asm ("movdqa xmm"tostr(i7)", xmm"tostr(o2)"");\ + asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("punpckhdq xmm"tostr(i1)", xmm"tostr(i4)"");\ + asm ("punpckldq xmm"tostr(o0)", xmm"tostr(t3)"");\ + asm ("punpckhdq xmm"tostr(i3)", xmm"tostr(t3)"");\ + asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t1)"");\ + asm ("punpckhdq xmm"tostr(i5)", xmm"tostr(t1)"");\ + asm ("punpckldq xmm"tostr(o2)", xmm"tostr(t4)"");\ + asm ("punpckhdq xmm"tostr(i7)", xmm"tostr(t4)"");\ + /* transpose done */\ +}/**/ + + +void INIT(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + asm volatile 
("emms"); + + /* load IV into registers xmm8 - xmm15 */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm9, [rdi+1*16]"); + asm ("movaps xmm10, [rdi+2*16]"); + asm ("movaps xmm11, [rdi+3*16]"); + asm ("movaps xmm12, [rdi+4*16]"); + asm ("movaps xmm13, [rdi+5*16]"); + asm ("movaps xmm14, [rdi+6*16]"); + asm ("movaps xmm15, [rdi+7*16]"); + + /* transform chaining value from column ordering into row ordering */ + Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); + + /* store transposed IV */ + asm ("movaps [rdi+0*16], xmm8"); + asm ("movaps [rdi+1*16], xmm9"); + asm ("movaps [rdi+2*16], xmm10"); + asm ("movaps [rdi+3*16], xmm11"); + asm ("movaps [rdi+4*16], xmm12"); + asm ("movaps [rdi+5*16], xmm13"); + asm ("movaps [rdi+6*16], xmm14"); + asm ("movaps [rdi+7*16], xmm15"); + + asm volatile ("emms"); + asm (".att_syntax noprefix"); +} + +void TF1024(u64* h, u64* m) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + /* message M in rsi */ + +#ifdef IACA_TRACE + IACA_START; +#endif + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load message into registers xmm8 - xmm15 (Q = message) */ + asm ("movaps xmm8, [rsi+0*16]"); + asm ("movaps xmm9, [rsi+1*16]"); + asm ("movaps xmm10, [rsi+2*16]"); + asm ("movaps xmm11, [rsi+3*16]"); + asm ("movaps xmm12, [rsi+4*16]"); + asm ("movaps xmm13, [rsi+5*16]"); + asm ("movaps xmm14, [rsi+6*16]"); + asm ("movaps xmm15, [rsi+7*16]"); + + /* transform message M from column ordering into row ordering */ + Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); + + /* store message M (Q input) for later */ + asm ("movaps [QTEMP+0*16], xmm8"); + asm ("movaps [QTEMP+1*16], xmm9"); + asm ("movaps [QTEMP+2*16], xmm10"); + asm ("movaps [QTEMP+3*16], xmm11"); + asm ("movaps [QTEMP+4*16], xmm12"); + asm ("movaps [QTEMP+5*16], xmm13"); + asm ("movaps [QTEMP+6*16], xmm14"); + asm ("movaps [QTEMP+7*16], xmm15"); + + /* xor CV to message to get P input */ + /* result: CV+M in xmm8...xmm15 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm9, [rdi+1*16]"); + asm ("pxor xmm10, [rdi+2*16]"); + asm ("pxor xmm11, [rdi+3*16]"); + asm ("pxor xmm12, [rdi+4*16]"); + asm ("pxor xmm13, [rdi+5*16]"); + asm ("pxor xmm14, [rdi+6*16]"); + asm ("pxor xmm15, [rdi+7*16]"); + + /* compute permutation P */ + /* result: P(CV+M) in xmm8...xmm15 */ + ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV+M)+CV in xmm8...xmm15 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm9, [rdi+1*16]"); + asm ("pxor xmm10, [rdi+2*16]"); + asm ("pxor xmm11, [rdi+3*16]"); + asm ("pxor xmm12, [rdi+4*16]"); + asm ("pxor xmm13, [rdi+5*16]"); + asm ("pxor xmm14, [rdi+6*16]"); + asm ("pxor xmm15, [rdi+7*16]"); + + /* store P(CV+M)+CV */ + asm ("movaps [rdi+0*16], xmm8"); + asm ("movaps [rdi+1*16], xmm9"); + asm ("movaps [rdi+2*16], xmm10"); + asm ("movaps [rdi+3*16], xmm11"); + asm ("movaps [rdi+4*16], xmm12"); + asm ("movaps [rdi+5*16], xmm13"); + asm ("movaps [rdi+6*16], xmm14"); + asm ("movaps [rdi+7*16], xmm15"); + + /* load message M (Q input) into xmm8-15 */ + asm ("movaps xmm8, [QTEMP+0*16]"); + asm ("movaps xmm9, [QTEMP+1*16]"); + asm ("movaps xmm10, [QTEMP+2*16]"); + asm ("movaps xmm11, [QTEMP+3*16]"); + asm ("movaps xmm12, [QTEMP+4*16]"); + asm ("movaps xmm13, [QTEMP+5*16]"); + asm ("movaps xmm14, [QTEMP+6*16]"); + asm ("movaps xmm15, [QTEMP+7*16]"); + + /* compute permutation Q */ + /* result: Q(M) in xmm8...xmm15 */ + ROUNDS_Q(); + + /* xor Q output */ + /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ + asm 
("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm9, [rdi+1*16]"); + asm ("pxor xmm10, [rdi+2*16]"); + asm ("pxor xmm11, [rdi+3*16]"); + asm ("pxor xmm12, [rdi+4*16]"); + asm ("pxor xmm13, [rdi+5*16]"); + asm ("pxor xmm14, [rdi+6*16]"); + asm ("pxor xmm15, [rdi+7*16]"); + + /* store CV */ + asm ("movaps [rdi+0*16], xmm8"); + asm ("movaps [rdi+1*16], xmm9"); + asm ("movaps [rdi+2*16], xmm10"); + asm ("movaps [rdi+3*16], xmm11"); + asm ("movaps [rdi+4*16], xmm12"); + asm ("movaps [rdi+5*16], xmm13"); + asm ("movaps [rdi+6*16], xmm14"); + asm ("movaps [rdi+7*16], xmm15"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + +#ifdef IACA_TRACE + IACA_END; +#endif + + return; +} + +void OF1024(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load CV into registers xmm8 - xmm15 */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm9, [rdi+1*16]"); + asm ("movaps xmm10, [rdi+2*16]"); + asm ("movaps xmm11, [rdi+3*16]"); + asm ("movaps xmm12, [rdi+4*16]"); + asm ("movaps xmm13, [rdi+5*16]"); + asm ("movaps xmm14, [rdi+6*16]"); + asm ("movaps xmm15, [rdi+7*16]"); + + /* compute permutation P */ + /* result: P(CV) in xmm8...xmm15 */ + ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8...xmm15 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm9, [rdi+1*16]"); + asm ("pxor xmm10, [rdi+2*16]"); + asm ("pxor xmm11, [rdi+3*16]"); + asm ("pxor xmm12, [rdi+4*16]"); + asm ("pxor xmm13, [rdi+5*16]"); + asm ("pxor xmm14, [rdi+6*16]"); + asm ("pxor xmm15, [rdi+7*16]"); + + /* transpose CV back from row ordering to column ordering */ + /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ + Matrix_Transpose_INV(8, 9, 10, 11, 12, 13, 14, 15, 4, 0, 6, 1, 2, 3, 5, 7); + + /* we only need to return the truncated half of the state */ + asm ("movaps [rdi+4*16], xmm0"); + asm ("movaps [rdi+5*16], xmm6"); + asm ("movaps [rdi+6*16], xmm13"); + asm ("movaps [rdi+7*16], xmm15"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + + return; +} + +#endif + diff --git a/algo/aes_ni/groestl-asm-avx.h b/algo/aes_ni/groestl-asm-avx.h new file mode 100644 index 000000000..6e8be1be4 --- /dev/null +++ b/algo/aes_ni/groestl-asm-avx.h @@ -0,0 +1,1105 @@ +/* groestl-asm-avx.h Aug 2011 + * + * Groestl implementation with inline assembly using ssse3, sse4.1, aes and avx + * instructions. + * Author: Günther A. 
Roland, Martin Schläffer, Krystian Matusiewicz + * + * This code is placed in the public domain + */ + +#include "hash-groestl.h" + +/* global variables */ +__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Lx[16]; +__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; +__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; +__attribute__ ((aligned (32))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; +__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; +__attribute__ ((aligned (32))) unsigned char TRANSP_MASK[16]; +__attribute__ ((aligned (32))) unsigned char SUBSH_MASK[8*16]; +__attribute__ ((aligned (32))) unsigned char ALL_1B[32]; +__attribute__ ((aligned (32))) unsigned char ALL_FF[32]; + +/* temporary variables */ +__attribute__ ((aligned (32))) unsigned char TEMP[6*32]; + + +#define tos(a) #a +#define tostr(a) tos(a) + +#if (LENGTH <= 256) + +#define SET_CONSTANTS(){\ + ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ + ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ + ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\ + ((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\ + ((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\ + ((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\ + ((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\ + ((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\ + ((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\ + ((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\ + ((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\ + ((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\ + ((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\ + ((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\ + ((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\ + ((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\ + ((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\ + ((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ + ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ + ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ + }\ + ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ +}while(0); + +#define Push_All_Regs() do{\ +/* not using any... + asm("push rax");\ + asm("push rbx");\ + asm("push rcx");*/\ +}while(0); + +#define Pop_All_Regs() do{\ +/* not using any... + asm("pop rcx");\ + asm("pop rbx");\ + asm("pop rax");*/\ +}while(0); + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b + * xmm[z] has to be zero */ +#define VMUL2(i, j, k, z){\ + asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\ + asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ + asm("vpand xmm"tostr(j)", xmm"tostr(j)", xmm"tostr(k)"");\ + asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ +}/**/ + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b + * xmm[z] has to be zero */ +#define VMUL2v2(i, j, k, z){\ + asm("vpblendvb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(k)", xmm"tostr(i)"");\ + asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ + asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ +}/**/ + +/* Yet another implementation of MixBytes. 
+ This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\ + asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a2)"");\ + asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a3)"");\ + asm("vmovdqa xmm"tostr(b2)", xmm"tostr(a4)"");\ + asm("vmovdqa xmm"tostr(b3)", xmm"tostr(a5)"");\ + asm("vmovdqa xmm"tostr(b4)", xmm"tostr(a6)"");\ + asm("vmovdqa xmm"tostr(b5)", xmm"tostr(a7)"");\ + asm("vmovdqa xmm"tostr(b6)", xmm"tostr(a0)"");\ + asm("vmovdqa xmm"tostr(b7)", xmm"tostr(a1)"");\ + \ + /* t_i = a_i + a_{i+1} */\ + asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a1)"");\ + asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a2)"");\ + asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a3)"");\ + asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a4)"");\ + asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a5)"");\ + asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(a6)"");\ + asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(a7)"");\ + asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b6)"");\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a4)"");\ + asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a5)"");\ + asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a6)"");\ + asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a7)"");\ + asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a0)"");\ + asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a1)"");\ + asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a2)"");\ + asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a3)"");\ + \ + asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a6)"");\ + asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a7)"");\ + asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a0)"");\ + asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a1)"");\ + asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a2)"");\ + asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a3)"");\ + asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a4)"");\ + asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a5)"");\ + \ + /* spill values y_4, y_5 to memory */\ + asm("vmovaps [TEMP+0*16], xmm"tostr(b0)"");\ + asm("vmovaps [TEMP+1*16], xmm"tostr(b1)"");\ + asm("vmovaps [TEMP+2*16], xmm"tostr(b2)"");\ + \ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a0)"");\ + asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a1)"");\ + asm("vmovaps [TEMP+3*16], xmm"tostr(a2)"");\ + \ + /* compute x_i = t_i + t_{i+3} */\ + asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a3)"");\ + asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a4)"");\ + asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a5)"");\ + asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a6)"");\ + asm("vpxor xmm"tostr(a4)", 
xmm"tostr(a4)", xmm"tostr(a7)"");\ + asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\ + asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\ + asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [TEMP+3*16]");\ + \ + /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + asm("vmovaps xmm"tostr(b1)", [ALL_1B]");\ + asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(b2)"");\ + VMUL2(a7, b0, b1, b2);\ + VMUL2(a6, b0, b1, b2);\ + VMUL2(a5, b0, b1, b2);\ + VMUL2(a4, b0, b1, b2);\ + VMUL2(a3, b0, b1, b2);\ + VMUL2(a2, b0, b1, b2);\ + VMUL2(a1, b0, b1, b2);\ + VMUL2(a0, b0, b1, b2);\ + \ + /* compute w_i : add y_{i+4} */\ + asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [TEMP+0*16]");\ + asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", [TEMP+1*16]");\ + asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", [TEMP+2*16]");\ + asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b3)"");\ + asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b4)"");\ + asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b5)"");\ + asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b6)"");\ + asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b7)"");\ + \ + /*compute v_i: double w_i */\ + VMUL2(a0, b0, b1, b2);\ + VMUL2(a1, b0, b1, b2);\ + VMUL2(a2, b0, b1, b2);\ + VMUL2(a3, b0, b1, b2);\ + VMUL2(a4, b0, b1, b2);\ + VMUL2(a5, b0, b1, b2);\ + VMUL2(a6, b0, b1, b2);\ + VMUL2(a7, b0, b1, b2);\ + \ + /* add to y_4 y_5 .. v3, v4, ... */\ + asm("vpxor xmm"tostr(b0)", xmm"tostr(a3)", [TEMP+0*16]");\ + asm("vpxor xmm"tostr(b1)", xmm"tostr(a4)", [TEMP+1*16]");\ + asm("vpxor xmm"tostr(b2)", xmm"tostr(a5)", [TEMP+2*16]");\ + asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a6)"");\ + asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a7)"");\ + asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a0)"");\ + asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a1)"");\ + asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a2)"");\ +}/*MixBytes*/ + +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant */\ + asm ("vmovaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ + asm ("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ + asm ("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ + /* ShiftBytes + SubBytes (interleaved) */\ + asm ("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a0)", xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ + asm ("vaesenclast xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a1)", xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ + asm ("vaesenclast xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a2)", xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ + asm ("vaesenclast xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a3)", xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ + asm ("vaesenclast xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a4)", xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ + asm ("vaesenclast xmm"tostr(a4)", xmm"tostr(a4)", 
xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a5)", xmm"tostr(a5)", [SUBSH_MASK+5*16]");\ + asm ("vaesenclast xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a6)", xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ + asm ("vaesenclast xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a7)", xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ + asm ("vaesenclast xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b0)"");\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ +} + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + asm ("vmovaps xmm"tostr(t0)", [TRANSP_MASK]");\ +\ + asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i1)", xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i3)", xmm"tostr(i3)", xmm"tostr(t0)"");\ +\ + asm ("vpunpckhwd xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpckhwd xmm"tostr(t0)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpcklwd xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ +\ + asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("vpshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ + asm ("vpshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ + asm ("vpshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ +\ + asm ("vpunpckhdq xmm"tostr(o2)", xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("vpunpckhdq xmm"tostr(o3)", xmm"tostr(o1)", xmm"tostr(t0)"");\ + asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("vpunpckldq xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("vpunpcklqdq xmm"tostr(o2)", xmm"tostr(i1)", xmm"tostr(i5)"");\ + asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i1)", xmm"tostr(i5)"");\ + asm ("vpunpcklqdq xmm"tostr(o4)", xmm"tostr(i2)", xmm"tostr(i6)"");\ + asm ("vpunpckhqdq xmm"tostr(o5)", xmm"tostr(i2)", xmm"tostr(i6)"");\ + 
asm ("vpunpcklqdq xmm"tostr(o6)", xmm"tostr(i3)", xmm"tostr(i7)"");\ + asm ("vpunpckhqdq xmm"tostr(o7)", xmm"tostr(i3)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + asm ("vpunpckhqdq xmm"tostr(o0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpckhqdq xmm"tostr(o2)", xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + asm ("vpxor xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(t0)"");\ + asm ("vpunpckhqdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("vpunpckhqdq xmm"tostr(i3)", xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("vpunpckhqdq xmm"tostr(i5)", xmm"tostr(i4)", xmm"tostr(t0)"");\ + asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(t0)"");\ + asm ("vpunpckhqdq xmm"tostr(i7)", xmm"tostr(i6)", xmm"tostr(t0)"");\ + asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ +}/**/ + + +void INIT(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + asm volatile ("emms"); + + /* load IV into registers xmm12 - xmm15 */ + asm ("vmovaps xmm12, [rdi+0*16]"); + asm ("vmovaps xmm13, [rdi+1*16]"); + asm ("vmovaps xmm14, [rdi+2*16]"); + asm ("vmovaps xmm15, [rdi+3*16]"); + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* store transposed IV */ + asm ("vmovaps [rdi+0*16], xmm12"); + asm ("vmovaps [rdi+1*16], xmm2"); + asm ("vmovaps [rdi+2*16], xmm6"); + asm ("vmovaps [rdi+3*16], xmm7"); + + asm volatile ("emms"); + asm (".att_syntax noprefix"); +} + +void TF512(u64* h, u64* m) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + /* message M in rsi */ + +#ifdef IACA_TRACE + IACA_START; +#endif + + asm 
(".intel_syntax noprefix"); + Push_All_Regs(); + + /* load message into registers xmm12 - xmm15 (Q = message) */ + asm ("vmovaps xmm12, [rsi+0*16]"); + asm ("vmovaps xmm13, [rsi+1*16]"); + asm ("vmovaps xmm14, [rsi+2*16]"); + asm ("vmovaps xmm15, [rsi+3*16]"); + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* load previous chaining value and xor message to CV to get input of P */ + /* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + asm ("vpxor xmm8, xmm12, [rdi+0*16]"); + asm ("vpxor xmm0, xmm2, [rdi+1*16]"); + asm ("vpxor xmm4, xmm6, [rdi+2*16]"); + asm ("vpxor xmm5, xmm7, [rdi+3*16]"); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + asm ("vpxor xmm0, xmm0, xmm8"); + asm ("vpxor xmm1, xmm1, xmm10"); + asm ("vpxor xmm2, xmm2, xmm12"); + asm ("vpxor xmm3, xmm3, xmm14"); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + asm ("vpxor xmm0, xmm0, [rdi+0*16]"); + asm ("vpxor xmm1, xmm1, [rdi+1*16]"); + asm ("vpxor xmm2, xmm2, [rdi+2*16]"); + asm ("vpxor xmm3, xmm3, [rdi+3*16]"); + + /* store CV */ + asm ("vmovaps [rdi+0*16], xmm0"); + asm ("vmovaps [rdi+1*16], xmm1"); + asm ("vmovaps [rdi+2*16], xmm2"); + asm ("vmovaps [rdi+3*16], xmm3"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF512(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + asm ("vmovaps xmm8, [rdi+0*16]"); + asm ("vmovaps xmm10, [rdi+1*16]"); + asm ("vmovaps xmm12, [rdi+2*16]"); + asm ("vmovaps xmm14, [rdi+3*16]"); + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + asm ("vpxor xmm8, xmm8, [rdi+0*16]"); + asm ("vpxor xmm10, xmm10, [rdi+1*16]"); + asm ("vpxor xmm12, xmm12, [rdi+2*16]"); + asm ("vpxor xmm14, xmm14, [rdi+3*16]"); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); + + /* we only need to return the truncated half of the state */ + asm ("vmovaps [rdi+2*16], xmm9"); + asm ("vmovaps [rdi+3*16], xmm11"); + + Pop_All_Regs(); + asm 
(".att_syntax noprefix"); + + return; +} + +#endif + +#if (LENGTH > 256) + +#define SET_CONSTANTS(){\ + ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ + ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ + ((u64*)ALL_FF)[0] = 0xffffffffffffffffULL;\ + ((u64*)ALL_FF)[1] = 0xffffffffffffffffULL;\ + ((u64*)ALL_FF)[2] = 0x0000000000000000ULL;\ + ((u64*)ALL_FF)[3] = 0x0000000000000000ULL;\ + ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_1B)[2] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_1B)[3] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)SUBSH_MASK)[ 0] = 0x0b0e0104070a0d00ULL;\ + ((u64*)SUBSH_MASK)[ 1] = 0x0306090c0f020508ULL;\ + ((u64*)SUBSH_MASK)[ 2] = 0x0c0f0205080b0e01ULL;\ + ((u64*)SUBSH_MASK)[ 3] = 0x04070a0d00030609ULL;\ + ((u64*)SUBSH_MASK)[ 4] = 0x0d000306090c0f02ULL;\ + ((u64*)SUBSH_MASK)[ 5] = 0x05080b0e0104070aULL;\ + ((u64*)SUBSH_MASK)[ 6] = 0x0e0104070a0d0003ULL;\ + ((u64*)SUBSH_MASK)[ 7] = 0x06090c0f0205080bULL;\ + ((u64*)SUBSH_MASK)[ 8] = 0x0f0205080b0e0104ULL;\ + ((u64*)SUBSH_MASK)[ 9] = 0x070a0d000306090cULL;\ + ((u64*)SUBSH_MASK)[10] = 0x000306090c0f0205ULL;\ + ((u64*)SUBSH_MASK)[11] = 0x080b0e0104070a0dULL;\ + ((u64*)SUBSH_MASK)[12] = 0x0104070a0d000306ULL;\ + ((u64*)SUBSH_MASK)[13] = 0x090c0f0205080b0eULL;\ + ((u64*)SUBSH_MASK)[14] = 0x06090c0f0205080bULL;\ + ((u64*)SUBSH_MASK)[15] = 0x0e0104070a0d0003ULL;\ + for(i = 0; i < ROUNDS1024; i++)\ + {\ + ((u64*)ROUND_CONST_P)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0xf0e0d0c0b0a09080ULL;\ + ((u64*)ROUND_CONST_P)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ + ((u64*)ROUND_CONST_Q)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0x0f1f2f3f4f5f6f7fULL;\ + ((u64*)ROUND_CONST_Q)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ + }\ +}while(0); + +#define Push_All_Regs() do{\ + asm("push rax");\ + asm("push rbx");\ + asm("push rcx");\ +}while(0); + +#define Pop_All_Regs() do{\ + asm("pop rcx");\ + asm("pop rbx");\ + asm("pop rax");\ +}while(0); + +/* AVX MUL2 + * ymm[i] will be multiplied by 2 + * ymm[j] will be lost + * ymm[k] has to be all 0x1b + * ymm[z] has to be zero + * clobbers: t2, t3 */ +#define VMUL2(i, j, k, z, ih, jh){\ + asm("vextractf128 xmm"tostr(ih)", ymm"tostr(i)", 1");\ + asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\ + asm("vpcmpgtb xmm"tostr(jh)", xmm"tostr(z)", xmm"tostr(ih)"");\ + asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ + asm("vpaddb xmm"tostr(ih)", xmm"tostr(ih)", xmm"tostr(ih)"");\ + asm("vinsertf128 ymm"tostr(j)", ymm"tostr(j)", xmm"tostr(jh)", 1");\ + asm("vinsertf128 ymm"tostr(i)", ymm"tostr(i)", xmm"tostr(ih)", 1");\ + asm("vandpd ymm"tostr(j)", ymm"tostr(j)", ymm"tostr(k)"");\ + asm("vxorpd ymm"tostr(i)", ymm"tostr(i)", ymm"tostr(j)"");\ +}/**/ + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b + * xmm[z] has to be zero */ +#define VMUL2v2(i, j, k, z){\ + asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\ + asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ + asm("vpand xmm"tostr(j)", xmm"tostr(j)", xmm"tostr(k)"");\ + asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ +}/**/ + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b + * xmm[z] has to be zero */ +#define VMUL2v3(i, j, k, z){\ + asm("vpblendvb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(k)", xmm"tostr(i)"");\ + asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ + asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ +}/**/ + +/* Yet 
another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* ymm"tostr(8..ymm"tostr(15 = a2 a3... a0 a1 */\ + asm("vmovdqa ymm"tostr(b0)", ymm"tostr(a2)"");\ + asm("vmovdqa ymm"tostr(b1)", ymm"tostr(a3)"");\ + asm("vmovdqa ymm"tostr(b2)", ymm"tostr(a4)"");\ + asm("vmovdqa ymm"tostr(b3)", ymm"tostr(a5)"");\ + asm("vmovdqa ymm"tostr(b4)", ymm"tostr(a6)"");\ + asm("vmovdqa ymm"tostr(b5)", ymm"tostr(a7)"");\ + asm("vmovdqa ymm"tostr(b6)", ymm"tostr(a0)"");\ + asm("vmovdqa ymm"tostr(b7)", ymm"tostr(a1)"");\ + \ + /* t_i = a_i + a_{i+1} */\ + asm("vxorpd ymm"tostr(a0)", ymm"tostr(a0)", ymm"tostr(a1)"");\ + asm("vxorpd ymm"tostr(a1)", ymm"tostr(a1)", ymm"tostr(a2)"");\ + asm("vxorpd ymm"tostr(a2)", ymm"tostr(a2)", ymm"tostr(a3)"");\ + asm("vxorpd ymm"tostr(a3)", ymm"tostr(a3)", ymm"tostr(a4)"");\ + asm("vxorpd ymm"tostr(a4)", ymm"tostr(a4)", ymm"tostr(a5)"");\ + asm("vxorpd ymm"tostr(a5)", ymm"tostr(a5)", ymm"tostr(a6)"");\ + asm("vxorpd ymm"tostr(a6)", ymm"tostr(a6)", ymm"tostr(a7)"");\ + asm("vxorpd ymm"tostr(a7)", ymm"tostr(a7)", ymm"tostr(b6)"");\ + \ + /* build y4 y5 y6 ... 
in regs ymm8, ymm9, ymm10 by adding t_i*/\ + asm("vxorpd ymm"tostr(b0)", ymm"tostr(b0)", ymm"tostr(a4)"");\ + asm("vxorpd ymm"tostr(b1)", ymm"tostr(b1)", ymm"tostr(a5)"");\ + asm("vxorpd ymm"tostr(b2)", ymm"tostr(b2)", ymm"tostr(a6)"");\ + asm("vxorpd ymm"tostr(b3)", ymm"tostr(b3)", ymm"tostr(a7)"");\ + asm("vxorpd ymm"tostr(b4)", ymm"tostr(b4)", ymm"tostr(a0)"");\ + asm("vxorpd ymm"tostr(b5)", ymm"tostr(b5)", ymm"tostr(a1)"");\ + asm("vxorpd ymm"tostr(b6)", ymm"tostr(b6)", ymm"tostr(a2)"");\ + asm("vxorpd ymm"tostr(b7)", ymm"tostr(b7)", ymm"tostr(a3)"");\ + \ + asm("vxorpd ymm"tostr(b0)", ymm"tostr(b0)", ymm"tostr(a6)"");\ + asm("vxorpd ymm"tostr(b1)", ymm"tostr(b1)", ymm"tostr(a7)"");\ + asm("vxorpd ymm"tostr(b2)", ymm"tostr(b2)", ymm"tostr(a0)"");\ + asm("vxorpd ymm"tostr(b3)", ymm"tostr(b3)", ymm"tostr(a1)"");\ + asm("vxorpd ymm"tostr(b4)", ymm"tostr(b4)", ymm"tostr(a2)"");\ + asm("vxorpd ymm"tostr(b5)", ymm"tostr(b5)", ymm"tostr(a3)"");\ + asm("vxorpd ymm"tostr(b6)", ymm"tostr(b6)", ymm"tostr(a4)"");\ + asm("vxorpd ymm"tostr(b7)", ymm"tostr(b7)", ymm"tostr(a5)"");\ + \ + /* spill values y_4, y_5 to memory */\ + asm("vmovaps [TEMP+0*32], ymm"tostr(b0)"");\ + asm("vmovaps [TEMP+1*32], ymm"tostr(b1)"");\ + asm("vmovaps [TEMP+2*32], ymm"tostr(b2)"");\ + asm("vmovaps [TEMP+3*32], ymm"tostr(b3)"");\ + asm("vmovaps [TEMP+4*32], ymm"tostr(b4)"");\ + \ + /* save values t0, t1, t2 to ymm8, ymm9 and memory */\ + asm("vmovdqa ymm"tostr(b0)", ymm"tostr(a0)"");\ + asm("vmovdqa ymm"tostr(b1)", ymm"tostr(a1)"");\ + asm("vmovaps [TEMP+5*32], ymm"tostr(a2)"");\ + \ + /* compute x_i = t_i + t_{i+3} */\ + asm("vxorpd ymm"tostr(a0)", ymm"tostr(a0)", ymm"tostr(a3)"");\ + asm("vxorpd ymm"tostr(a1)", ymm"tostr(a1)", ymm"tostr(a4)"");\ + asm("vxorpd ymm"tostr(a2)", ymm"tostr(a2)", ymm"tostr(a5)"");\ + asm("vxorpd ymm"tostr(a3)", ymm"tostr(a3)", ymm"tostr(a6)"");\ + asm("vxorpd ymm"tostr(a4)", ymm"tostr(a4)", ymm"tostr(a7)"");\ + asm("vxorpd ymm"tostr(a5)", ymm"tostr(a5)", ymm"tostr(b0)"");\ + asm("vxorpd ymm"tostr(a6)", ymm"tostr(a6)", ymm"tostr(b1)"");\ + asm("vxorpd ymm"tostr(a7)", ymm"tostr(a7)", [TEMP+5*32]");\ + \ + /*compute z_i : double x_i using temp ymm8 and 1B ymm9 */\ + asm("vmovaps ymm"tostr(b1)", [ALL_1B]");\ + asm("vxorpd ymm"tostr(b2)", ymm"tostr(b2)", ymm"tostr(b2)"");\ + VMUL2(a7, b0, b1, b2, b3, b4);\ + VMUL2(a6, b0, b1, b2, b3, b4);\ + VMUL2(a5, b0, b1, b2, b3, b4);\ + VMUL2(a4, b0, b1, b2, b3, b4);\ + VMUL2(a3, b0, b1, b2, b3, b4);\ + VMUL2(a2, b0, b1, b2, b3, b4);\ + VMUL2(a1, b0, b1, b2, b3, b4);\ + VMUL2(a0, b0, b1, b2, b3, b4);\ + \ + /* compute w_i : add y_{i+4} */\ + asm("vxorpd ymm"tostr(a0)", ymm"tostr(a0)", [TEMP+0*32]");\ + asm("vxorpd ymm"tostr(a1)", ymm"tostr(a1)", [TEMP+1*32]");\ + asm("vxorpd ymm"tostr(a2)", ymm"tostr(a2)", [TEMP+2*32]");\ + asm("vxorpd ymm"tostr(a3)", ymm"tostr(a3)", [TEMP+3*32]");\ + asm("vxorpd ymm"tostr(a4)", ymm"tostr(a4)", [TEMP+4*32]");\ + asm("vxorpd ymm"tostr(a5)", ymm"tostr(a5)", ymm"tostr(b5)"");\ + asm("vxorpd ymm"tostr(a6)", ymm"tostr(a6)", ymm"tostr(b6)"");\ + asm("vxorpd ymm"tostr(a7)", ymm"tostr(a7)", ymm"tostr(b7)"");\ + \ + /*compute v_i: double w_i */\ + VMUL2(a0, b0, b1, b2, b3, b4);\ + VMUL2(a1, b0, b1, b2, b3, b4);\ + VMUL2(a2, b0, b1, b2, b3, b4);\ + VMUL2(a3, b0, b1, b2, b3, b4);\ + VMUL2(a4, b0, b1, b2, b3, b4);\ + VMUL2(a5, b0, b1, b2, b3, b4);\ + VMUL2(a6, b0, b1, b2, b3, b4);\ + VMUL2(a7, b0, b1, b2, b3, b4);\ + \ + /* add to y_4 y_5 .. v3, v4, ... 
*/\ + asm("vxorpd ymm"tostr(b0)", ymm"tostr(a3)", [TEMP+0*32]");\ + asm("vxorpd ymm"tostr(b1)", ymm"tostr(a4)", [TEMP+1*32]");\ + asm("vxorpd ymm"tostr(b2)", ymm"tostr(a5)", [TEMP+2*32]");\ + asm("vxorpd ymm"tostr(b3)", ymm"tostr(a6)", [TEMP+3*32]");\ + asm("vxorpd ymm"tostr(b4)", ymm"tostr(a7)", [TEMP+4*32]");\ + asm("vxorpd ymm"tostr(b5)", ymm"tostr(a0)", ymm"tostr(b5)"");\ + asm("vxorpd ymm"tostr(b6)", ymm"tostr(a1)", ymm"tostr(b6)"");\ + asm("vxorpd ymm"tostr(b7)", ymm"tostr(a2)", ymm"tostr(b7)"");\ +}/*MixBytes*/ + +/* AVX SubShift + * inputs: + * * i + * * c0 (must be 0) + * * ShiftP + * * ShiftQ + * output i = S[Shift(i_1, ShiftQ)|Shift(i_0, ShiftP)] + * clobbers: t0 + * */ +#define SubShift(i, t0, c0, ShiftP, ShiftQ){\ + asm("vextractf128 xmm"tostr(t0)", ymm"tostr(i)", 1");\ + asm("vpshufb xmm"tostr(i)", xmm"tostr(i)", [SUBSH_MASK+"tostr(ShiftP)"*16]");\ + asm("vpshufb xmm"tostr(t0)", xmm"tostr(t0)", [SUBSH_MASK+"tostr(ShiftQ)"*16]");\ + asm("vaesenclast xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(c0)"");\ + asm("vaesenclast xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(c0)"");\ + asm("vinsertf128 ymm"tostr(i)", ymm"tostr(i)", xmm"tostr(t0)", 1");\ +}/**/ + +/* one round + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define SUBSHIFTMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* ShiftBytes + SubBytes */\ + asm ("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(b0)"");\ + SubShift(a0, b1, b0, 0, 1);\ + SubShift(a1, b1, b0, 1, 3);\ + SubShift(a2, b1, b0, 2, 5);\ + SubShift(a3, b1, b0, 3, 7);\ + SubShift(a4, b1, b0, 4, 0);\ + SubShift(a5, b1, b0, 5, 2);\ + SubShift(a6, b1, b0, 6, 4);\ + SubShift(a7, b1, b0, 7, 6);\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +#define ROUNDS_P_Q(){\ + asm ("xor rax, rax");\ + asm ("1:");\ + /* AddRoundConstant */\ + asm ("vxorpd ymm6, ymm6, ymm6");\ + asm ("vinsertf128 ymm7, ymm6, [ROUND_CONST_Q+eax*8], 1");\ + asm ("vinsertf128 ymm6, ymm6, [ALL_FF], 1");\ + asm ("vinsertf128 ymm0, ymm6, [ROUND_CONST_P+eax*8], 0");\ + asm ("vxorpd ymm0, ymm8, ymm0");\ + asm ("vxorpd ymm1, ymm9, ymm6");\ + asm ("vxorpd ymm2, ymm10, ymm6");\ + asm ("vxorpd ymm3, ymm11, ymm6");\ + asm ("vxorpd ymm4, ymm12, ymm6");\ + asm ("vxorpd ymm5, ymm13, ymm6");\ + asm ("vxorpd ymm6, ymm14, ymm6");\ + asm ("vxorpd ymm7, ymm15, ymm7");\ + /* SubBytes + ShiftBytes + MixBytes */\ + SUBSHIFTMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + asm ("add al, 2");\ + asm ("mov rbx, rax");\ + asm ("sub bl, 28");\ + asm ("jb 1b");\ +} + +/* Matrix Transpose + * input is a 1024-bit state with two columns in one xmm + * output is a 1024-bit state with two rows in one xmm + * inputs: i0-i7 + * outputs: i0-i7 + * clobbers: t0-t7 + */ +#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ + asm ("vmovaps xmm"tostr(t0)", [TRANSP_MASK]");\ +\ + asm ("vpshufb xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i1)", xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i3)", xmm"tostr(i3)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i5)", xmm"tostr(i5)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i7)", xmm"tostr(i7)", xmm"tostr(t0)"");\ +\ + /* continue with unpack */\ + asm ("vpunpckhwd xmm"tostr(t0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm 
("vpunpckhwd xmm"tostr(t1)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpckhwd xmm"tostr(t2)", xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("vpunpckhwd xmm"tostr(t3)", xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpcklwd xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpcklwd xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("vpunpcklwd xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ +\ + /* shuffle with immediate */\ + asm ("vpshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ + asm ("vpshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ + asm ("vpshufd xmm"tostr(t2)", xmm"tostr(t2)", 216");\ + asm ("vpshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ + asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("vpshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ + asm ("vpshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ + asm ("vpshufd xmm"tostr(i6)", xmm"tostr(i6)", 216");\ +\ + /* continue with unpack */\ + asm ("vpunpckhdq xmm"tostr(t4)", xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("vpunpckhdq xmm"tostr(t5)", xmm"tostr(t0)", xmm"tostr(t1)"");\ + asm ("vpunpckldq xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(t1)"");\ + asm ("vpunpckhdq xmm"tostr(t6)", xmm"tostr(i4)", xmm"tostr(i6)"");\ + asm ("vpunpckldq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i6)"");\ + asm ("vpunpckhdq xmm"tostr(t7)", xmm"tostr(t2)", xmm"tostr(t3)"");\ + asm ("vpunpckldq xmm"tostr(t2)", xmm"tostr(t2)", xmm"tostr(t3)"");\ +\ + /* there are now 2 rows in each xmm */\ + /* unpack to get 1 row of CV in each xmm */\ + asm ("vpunpckhqdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(t0)", xmm"tostr(t2)"");\ + asm ("vpunpckhqdq xmm"tostr(i3)", xmm"tostr(t0)", xmm"tostr(t2)"");\ + asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(t4)", xmm"tostr(t6)"");\ + asm ("vpunpckhqdq xmm"tostr(i5)", xmm"tostr(t4)", xmm"tostr(t6)"");\ + asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(t5)", xmm"tostr(t7)"");\ + asm ("vpunpckhqdq xmm"tostr(i7)", xmm"tostr(t5)", xmm"tostr(t7)"");\ + /* transpose done */\ +}/**/ + +/* Matrix Transpose Inverse + * input is a 1024-bit state with two rows in one xmm + * output is a 1024-bit state with two columns in one xmm + * inputs: i0-i7 + * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) + * clobbers: t0-t4 + */ +#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ + asm ("vmovaps xmm"tostr(o0)", [TRANSP_MASK]");\ + /* transpose matrix to get output format */\ + asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpckhqdq xmm"tostr(t0)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpckhqdq xmm"tostr(t1)", xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("vpunpckhqdq xmm"tostr(t2)", xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ + /* load transpose mask into a register, because it will be used 8 times */\ + asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(o0)"");\ + asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(o0)"");\ + asm ("vpshufb xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(o0)"");\ + asm ("vpshufb 
xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(o0)"");\ + asm ("vpshufb xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(o0)"");\ + asm ("vpshufb xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(o0)"");\ + asm ("vpshufb xmm"tostr(t1)", xmm"tostr(t1)", xmm"tostr(o0)"");\ + asm ("vpshufb xmm"tostr(t2)", xmm"tostr(t2)", xmm"tostr(o0)"");\ + /* continue with unpack */\ + asm ("vpunpckhwd xmm"tostr(t3)", xmm"tostr(i4)", xmm"tostr(i6)"");\ + asm ("vpunpcklwd xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i6)"");\ + asm ("vpunpckhwd xmm"tostr(o0)", xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("vpunpckhwd xmm"tostr(o2)", xmm"tostr(o1)", xmm"tostr(t0)"");\ + asm ("vpunpcklwd xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t0)"");\ + asm ("vpunpckhwd xmm"tostr(t4)", xmm"tostr(t1)", xmm"tostr(t2)"");\ + asm ("vpunpcklwd xmm"tostr(t1)", xmm"tostr(t1)", xmm"tostr(t2)"");\ + /* shuffle with immediate */\ + asm ("vpshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ + asm ("vpshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ + asm ("vpshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ + asm ("vpshufd xmm"tostr(o2)", xmm"tostr(o2)", 216");\ + asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("vpshufd xmm"tostr(o0)", xmm"tostr(o0)", 216");\ + asm ("vpshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ + asm ("vpshufd xmm"tostr(t4)", xmm"tostr(t4)", 216");\ + /* continue with unpack */\ + asm ("vpunpckhdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("vpunpckhdq xmm"tostr(i3)", xmm"tostr(o0)", xmm"tostr(t3)"");\ + asm ("vpunpckldq xmm"tostr(o0)", xmm"tostr(o0)", xmm"tostr(t3)"");\ + asm ("vpunpckhdq xmm"tostr(i5)", xmm"tostr(o1)", xmm"tostr(t1)"");\ + asm ("vpunpckldq xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t1)"");\ + asm ("vpunpckhdq xmm"tostr(i7)", xmm"tostr(o2)", xmm"tostr(t4)"");\ + asm ("vpunpckldq xmm"tostr(o2)", xmm"tostr(o2)", xmm"tostr(t4)"");\ + /* transpose done */\ +}/**/ + + +void INIT(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + asm volatile ("emms"); + + /* load IV into registers xmm8 - xmm15 */ + asm ("vmovaps xmm8, [rdi+0*16]"); + asm ("vmovaps xmm9, [rdi+1*16]"); + asm ("vmovaps xmm10, [rdi+2*16]"); + asm ("vmovaps xmm11, [rdi+3*16]"); + asm ("vmovaps xmm12, [rdi+4*16]"); + asm ("vmovaps xmm13, [rdi+5*16]"); + asm ("vmovaps xmm14, [rdi+6*16]"); + asm ("vmovaps xmm15, [rdi+7*16]"); + + /* transform chaining value from column ordering into row ordering */ + Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); + + /* store transposed IV */ + asm ("vmovaps [rdi+0*16], xmm8"); + asm ("vmovaps [rdi+1*16], xmm9"); + asm ("vmovaps [rdi+2*16], xmm10"); + asm ("vmovaps [rdi+3*16], xmm11"); + asm ("vmovaps [rdi+4*16], xmm12"); + asm ("vmovaps [rdi+5*16], xmm13"); + asm ("vmovaps [rdi+6*16], xmm14"); + asm ("vmovaps [rdi+7*16], xmm15"); + + asm volatile ("emms"); + asm (".att_syntax noprefix"); +} + +void TF1024(u64* h, u64* m) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + /* message M in rsi */ + +#ifdef IACA_TRACE + IACA_START; +#endif + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load message into registers xmm8...xmm15 (Q = message) */ + asm ("vmovaps xmm0, [rsi+0*16]"); + asm ("vmovaps xmm1, [rsi+1*16]"); + asm ("vmovaps xmm2, [rsi+2*16]"); + asm ("vmovaps xmm3, [rsi+3*16]"); + asm ("vmovaps xmm4, [rsi+4*16]"); + asm ("vmovaps xmm5, [rsi+5*16]"); + asm 
("vmovaps xmm6, [rsi+6*16]"); + asm ("vmovaps xmm7, [rsi+7*16]"); + + /* transform message M from column ordering into row ordering */ + Matrix_Transpose(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + + /* load previous chaining value and xor message to CV to get input of P */ + /* we put two rows (2x64 bit) of the CV into one 128-bit xmm register */ + /* result: CV+M in xmm8...xmm15 */ + asm ("vpxor xmm8, xmm0, [rdi+0*16]"); + asm ("vpxor xmm9, xmm1, [rdi+1*16]"); + asm ("vpxor xmm10, xmm2, [rdi+2*16]"); + asm ("vpxor xmm11, xmm3, [rdi+3*16]"); + asm ("vpxor xmm12, xmm4, [rdi+4*16]"); + asm ("vpxor xmm13, xmm5, [rdi+5*16]"); + asm ("vpxor xmm14, xmm6, [rdi+6*16]"); + asm ("vpxor xmm15, xmm7, [rdi+7*16]"); + + /* generate AVX registers with Q in high and P in low 128 bits */ + asm ("vinsertf128 ymm8, ymm8, xmm0, 1"); + asm ("vinsertf128 ymm9, ymm9, xmm1, 1"); + asm ("vinsertf128 ymm10, ymm10, xmm2, 1"); + asm ("vinsertf128 ymm11, ymm11, xmm3, 1"); + asm ("vinsertf128 ymm12, ymm12, xmm4, 1"); + asm ("vinsertf128 ymm13, ymm13, xmm5, 1"); + asm ("vinsertf128 ymm14, ymm14, xmm6, 1"); + asm ("vinsertf128 ymm15, ymm15, xmm7, 1"); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* extract output of Q to xmm0...xmm7 */ + asm ("vextractf128 xmm0, ymm8, 1"); + asm ("vextractf128 xmm1, ymm9, 1"); + asm ("vextractf128 xmm2, ymm10, 1"); + asm ("vextractf128 xmm3, ymm11, 1"); + asm ("vextractf128 xmm4, ymm12, 1"); + asm ("vextractf128 xmm5, ymm13, 1"); + asm ("vextractf128 xmm6, ymm14, 1"); + asm ("vextractf128 xmm7, ymm15, 1"); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm8...xmm15 */ + asm ("vpxor xmm8, xmm8, xmm0"); + asm ("vpxor xmm9, xmm9, xmm1"); + asm ("vpxor xmm10, xmm10, xmm2"); + asm ("vpxor xmm11, xmm11, xmm3"); + asm ("vpxor xmm12, xmm12, xmm4"); + asm ("vpxor xmm13, xmm13, xmm5"); + asm ("vpxor xmm14, xmm14, xmm6"); + asm ("vpxor xmm15, xmm15, xmm7"); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm8...xmm15 */ + asm ("vpxor xmm8, xmm8, [rdi+0*16]"); + asm ("vpxor xmm9, xmm9, [rdi+1*16]"); + asm ("vpxor xmm10, xmm10, [rdi+2*16]"); + asm ("vpxor xmm11, xmm11, [rdi+3*16]"); + asm ("vpxor xmm12, xmm12, [rdi+4*16]"); + asm ("vpxor xmm13, xmm13, [rdi+5*16]"); + asm ("vpxor xmm14, xmm14, [rdi+6*16]"); + asm ("vpxor xmm15, xmm15, [rdi+7*16]"); + + /* store CV */ + asm ("vmovaps [rdi+0*16], xmm8"); + asm ("vmovaps [rdi+1*16], xmm9"); + asm ("vmovaps [rdi+2*16], xmm10"); + asm ("vmovaps [rdi+3*16], xmm11"); + asm ("vmovaps [rdi+4*16], xmm12"); + asm ("vmovaps [rdi+5*16], xmm13"); + asm ("vmovaps [rdi+6*16], xmm14"); + asm ("vmovaps [rdi+7*16], xmm15"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF1024(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + asm ("vpxor xmm0, xmm0, xmm0"); + + /* load CV into registers xmm8...xmm15 */ + asm ("vmovaps xmm8, [rdi+0*16]"); + asm ("vmovaps xmm9, [rdi+1*16]"); + asm ("vmovaps xmm10, [rdi+2*16]"); + asm ("vmovaps xmm11, [rdi+3*16]"); + asm ("vmovaps xmm12, [rdi+4*16]"); + asm ("vmovaps xmm13, [rdi+5*16]"); + asm ("vmovaps xmm14, [rdi+6*16]"); + asm ("vmovaps xmm15, [rdi+7*16]"); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8...xmm15 */ + ROUNDS_P_Q(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8...xmm15 */ + asm ("vpxor xmm8, xmm8, [rdi+0*16]"); + asm ("vpxor xmm9, xmm9, 
[rdi+1*16]"); + asm ("vpxor xmm10, xmm10, [rdi+2*16]"); + asm ("vpxor xmm11, xmm11, [rdi+3*16]"); + asm ("vpxor xmm12, xmm12, [rdi+4*16]"); + asm ("vpxor xmm13, xmm13, [rdi+5*16]"); + asm ("vpxor xmm14, xmm14, [rdi+6*16]"); + asm ("vpxor xmm15, xmm15, [rdi+7*16]"); + + /* transpose CV back from row ordering to column ordering */ + /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ + Matrix_Transpose_INV(8, 9, 10, 11, 12, 13, 14, 15, 4, 0, 6, 1, 2, 3, 5, 7); + + /* we only need to return the truncated half of the state */ + asm ("vmovaps [rdi+4*16], xmm0"); + asm ("vmovaps [rdi+5*16], xmm6"); + asm ("vmovaps [rdi+6*16], xmm13"); + asm ("vmovaps [rdi+7*16], xmm15"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + + return; +} + +#endif + diff --git a/algo/aes_ni/groestl-asm-vperm.h b/algo/aes_ni/groestl-asm-vperm.h new file mode 100644 index 000000000..f8ae27caa --- /dev/null +++ b/algo/aes_ni/groestl-asm-vperm.h @@ -0,0 +1,1397 @@ +/* groestl-asm-vperm.h Aug 2011 + * + * Groestl implementation with inline assembly using ssse3 instructions. + * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz + * + * Based on the vperm and aes_ni implementations of the hash function Groestl + * by Cagdas Calik http://www.metu.edu.tr/~ccalik/ + * Institute of Applied Mathematics, Middle East Technical University, Turkey + * + * This code is placed in the public domain + */ + +#include "hash-groestl.h" + +/* global constants */ +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; +__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16]; +__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16]; +__attribute__ ((aligned (16))) unsigned char ALL_0F[16]; +__attribute__ ((aligned (16))) unsigned char ALL_15[16]; +__attribute__ ((aligned (16))) unsigned char ALL_1B[16]; +__attribute__ ((aligned (16))) unsigned char ALL_63[16]; +__attribute__ ((aligned (16))) unsigned char ALL_FF[16]; +__attribute__ ((aligned (16))) unsigned char VPERM_IPT[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_OPT[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_INV[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_SB1[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_SB2[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_SB4[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_SBO[2*16]; + +/* temporary variables */ +__attribute__ ((aligned (16))) unsigned char TEMP_MUL1[8*16]; +__attribute__ ((aligned (16))) unsigned char TEMP_MUL2[8*16]; +__attribute__ ((aligned (16))) unsigned char TEMP_MUL4[1*16]; +__attribute__ ((aligned (16))) unsigned char QTEMP[8*16]; +__attribute__ ((aligned (16))) unsigned char TEMP[8*16]; + + +#define tos(a) #a +#define tostr(a) tos(a) + +#define SET_SHARED_CONSTANTS(){\ + ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ + ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ + ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_63)[ 0] = 0x6363636363636363ULL;\ + ((u64*)ALL_63)[ 1] = 0x6363636363636363ULL;\ + ((u64*)ALL_0F)[ 0] = 0x0F0F0F0F0F0F0F0FULL;\ + ((u64*)ALL_0F)[ 1] = 0x0F0F0F0F0F0F0F0FULL;\ + ((u64*)VPERM_IPT)[ 0] = 0x4C01307D317C4D00ULL;\ + 
((u64*)VPERM_IPT)[ 1] = 0xCD80B1FCB0FDCC81ULL;\ + ((u64*)VPERM_IPT)[ 2] = 0xC2B2E8985A2A7000ULL;\ + ((u64*)VPERM_IPT)[ 3] = 0xCABAE09052227808ULL;\ + ((u64*)VPERM_OPT)[ 0] = 0x01EDBD5150BCEC00ULL;\ + ((u64*)VPERM_OPT)[ 1] = 0xE10D5DB1B05C0CE0ULL;\ + ((u64*)VPERM_OPT)[ 2] = 0xFF9F4929D6B66000ULL;\ + ((u64*)VPERM_OPT)[ 3] = 0xF7974121DEBE6808ULL;\ + ((u64*)VPERM_INV)[ 0] = 0x01040A060F0B0780ULL;\ + ((u64*)VPERM_INV)[ 1] = 0x030D0E0C02050809ULL;\ + ((u64*)VPERM_INV)[ 2] = 0x0E05060F0D080180ULL;\ + ((u64*)VPERM_INV)[ 3] = 0x040703090A0B0C02ULL;\ + ((u64*)VPERM_SB1)[ 0] = 0x3618D415FAE22300ULL;\ + ((u64*)VPERM_SB1)[ 1] = 0x3BF7CCC10D2ED9EFULL;\ + ((u64*)VPERM_SB1)[ 2] = 0xB19BE18FCB503E00ULL;\ + ((u64*)VPERM_SB1)[ 3] = 0xA5DF7A6E142AF544ULL;\ + ((u64*)VPERM_SB2)[ 0] = 0x69EB88400AE12900ULL;\ + ((u64*)VPERM_SB2)[ 1] = 0xC2A163C8AB82234AULL;\ + ((u64*)VPERM_SB2)[ 2] = 0xE27A93C60B712400ULL;\ + ((u64*)VPERM_SB2)[ 3] = 0x5EB7E955BC982FCDULL;\ + ((u64*)VPERM_SB4)[ 0] = 0x3D50AED7C393EA00ULL;\ + ((u64*)VPERM_SB4)[ 1] = 0xBA44FE79876D2914ULL;\ + ((u64*)VPERM_SB4)[ 2] = 0xE1E937A03FD64100ULL;\ + ((u64*)VPERM_SB4)[ 3] = 0xA876DE9749087E9FULL;\ +/*((u64*)VPERM_SBO)[ 0] = 0xCFE474A55FBB6A00ULL;\ + ((u64*)VPERM_SBO)[ 1] = 0x8E1E90D1412B35FAULL;\ + ((u64*)VPERM_SBO)[ 2] = 0xD0D26D176FBDC700ULL;\ + ((u64*)VPERM_SBO)[ 3] = 0x15AABF7AC502A878ULL;*/\ + ((u64*)ALL_15)[ 0] = 0x1515151515151515ULL;\ + ((u64*)ALL_15)[ 1] = 0x1515151515151515ULL;\ +}/**/ + +/* VPERM + * Transform w/o settings c* + * transforms 2 rows to/from "vperm mode" + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0, a1 = 2 rows + * table = transformation table to use + * t*, c* = clobbers + * outputs: + * a0, a1 = 2 rows transformed with table + * */ +#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\ + asm ("movdqa xmm"tostr(t1)", xmm"tostr(c0)"");\ + asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\ + asm ("pandn xmm"tostr(t1)", xmm"tostr(a1)"");\ + asm ("psrld xmm"tostr(t0)", 4");\ + asm ("psrld xmm"tostr(t1)", 4");\ + asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\ + asm ("pand xmm"tostr(a1)", xmm"tostr(c0)"");\ + asm ("movdqa xmm"tostr(t2)", xmm"tostr(c2)"");\ + asm ("movdqa xmm"tostr(t3)", xmm"tostr(c2)"");\ + asm ("pshufb xmm"tostr(t2)", xmm"tostr(a0)"");\ + asm ("pshufb xmm"tostr(t3)", xmm"tostr(a1)"");\ + asm ("movdqa xmm"tostr(a0)", xmm"tostr(c1)"");\ + asm ("movdqa xmm"tostr(a1)", xmm"tostr(c1)"");\ + asm ("pshufb xmm"tostr(a0)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(a1)", xmm"tostr(t1)"");\ + asm ("pxor xmm"tostr(a0)", xmm"tostr(t2)"");\ + asm ("pxor xmm"tostr(a1)", xmm"tostr(t3)"");\ +}/**/ + +#define VPERM_Transform_Set_Const(table, c0, c1, c2){\ + asm ("movaps xmm"tostr(c0)", [ALL_0F]");\ + asm ("movaps xmm"tostr(c1)", ["tostr(table)"+0*16]");\ + asm ("movaps xmm"tostr(c2)", ["tostr(table)"+1*16]");\ +}/**/ + +/* VPERM + * Transform + * transforms 2 rows to/from "vperm mode" + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0, a1 = 2 rows + * table = transformation table to use + * t*, c* = clobbers + * outputs: + * a0, a1 = 2 rows transformed with table + * */ +#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\ + VPERM_Transform_Set_Const(table, c0, c1, c2);\ + VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ +}/**/ + +/* VPERM + * Transform State + * inputs: + * 
a0-a3 = state + * table = transformation table to use + * t* = clobbers + * outputs: + * a0-a3 = transformed state + * */ +#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\ + VPERM_Transform_Set_Const(table, c0, c1, c2);\ + VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ + VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\ +}/**/ + +/* VPERM + * Add Constant to State + * inputs: + * a0-a7 = state + * constant = constant to add + * t0 = clobber + * outputs: + * a0-a7 = state + constant + * */ +#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\ + asm ("movaps xmm"tostr(t0)", ["tostr(constant)"]");\ + asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a1)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a2)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a3)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a4)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a5)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a6)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a7)", xmm"tostr(t0)"");\ +}/**/ + +/* VPERM + * Set Substitute Core Constants + * */ +#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\ + VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\ +}/**/ + +/* VPERM + * Substitute Core + * first part of sbox inverse computation + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0 = 1 row + * t*, c* = clobbers + * outputs: + * b0a, b0b = inputs for lookup step + * */ +#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\ + asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\ + asm ("psrld xmm"tostr(t0)", 4");\ + asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\ + asm ("movdqa xmm"tostr(b0a)", "tostr(c1)"");\ + asm ("pshufb xmm"tostr(b0a)", xmm"tostr(a0)"");\ + asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\ + asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\ + asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(b0b)", xmm"tostr(b0a)"");\ + asm ("movdqa xmm"tostr(t1)", xmm"tostr(c2)"");\ + asm ("pshufb xmm"tostr(t1)", xmm"tostr(a0)"");\ + asm ("pxor xmm"tostr(t1)", xmm"tostr(b0a)"");\ + asm ("movdqa xmm"tostr(b0a)", xmm"tostr(c2)"");\ + asm ("pshufb xmm"tostr(b0a)", xmm"tostr(b0b)"");\ + asm ("pxor xmm"tostr(b0a)", xmm"tostr(a0)"");\ + asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\ + asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t1)"");\ + asm ("pxor xmm"tostr(b0b)", xmm"tostr(t0)"");\ +}/**/ + +/* VPERM + * Lookup + * second part of sbox inverse computation + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0a, a0b = output of Substitution Core + * table = lookup table to use (*1 / *2 / *4) + * t0 = clobber + * outputs: + * b0 = output of sbox + multiplication + * */ +#define VPERM_Lookup(a0a, a0b, table, b0, t0){\ + asm ("movaps xmm"tostr(b0)", ["tostr(table)"+0*16]");\ + asm ("movaps xmm"tostr(t0)", ["tostr(table)"+1*16]");\ + asm ("pshufb xmm"tostr(b0)", xmm"tostr(a0b)"");\ + asm ("pshufb xmm"tostr(t0)", xmm"tostr(a0a)"");\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(t0)"");\ +}/**/ + +/* VPERM + * SubBytes and *2 / *4 + * this function is derived from: + * Constant-time SSSE3 AES core implementation + * by Mike Hamburg + * and + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0-a7 = state + * t*, c* = clobbers + * outputs: + * a0-a7 = state * 4 + 
* c2 = row0 * 2 -> b0 + * c1 = row7 * 2 -> b3 + * c0 = row7 * 1 -> b4 + * t2 = row4 * 1 -> b7 + * TEMP_MUL1 = row(i) * 1 + * TEMP_MUL2 = row(i) * 2 + * + * call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */ +#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\ + /* set Constants */\ + VPERM_Substitute_Core_Set_Const(c0, c1, c2);\ + /* row 1 */\ + VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+1*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+1*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\ + /* --- */\ + /* row 2 */\ + VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+2*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+2*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\ + /* --- */\ + /* row 3 */\ + VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+3*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+3*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\ + /* --- */\ + /* row 5 */\ + VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+5*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+5*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\ + /* --- */\ + /* row 6 */\ + VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+6*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+6*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\ + /* --- */\ + /* row 7 */\ + VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+7*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\ + VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\ + /* --- */\ + /* row 4 */\ + VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+4*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\ + /* --- */\ + /* row 0 */\ + VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\ + VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\ + asm ("movaps [TEMP_MUL2+0*16], xmm"tostr(c2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\ + /* --- */\ +}/**/ + + +/* Optimized MixBytes + * inputs: + * a0-a7 = (row0-row7) * 4 + * b0 = row0 * 2 + * b3 = row7 * 2 + * b4 = row7 * 1 + * b7 = row4 * 1 + * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2 + * output: b0-b7 + * */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* save one value */\ + asm ("movaps [TEMP_MUL4], xmm"tostr(a3)"");\ + /* 1 */\ + asm ("movdqa xmm"tostr(b1)", xmm"tostr(a0)"");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(b4)""); /* -> helper! 
*/\ + asm ("pxor xmm"tostr(b1)", [TEMP_MUL2+3*16]");\ + asm ("movdqa xmm"tostr(b2)", xmm"tostr(b1)"");\ + \ + /* 2 */\ + asm ("movdqa xmm"tostr(b5)", xmm"tostr(a1)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(a4)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(b7)""); /* -> helper! */\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(b3)""); /* -> helper! */\ + asm ("movdqa xmm"tostr(b6)", xmm"tostr(b5)"");\ + \ + /* 4 */\ + asm ("pxor xmm"tostr(b7)", xmm"tostr(a6)"");\ + /*asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+4*16]"); -> helper! */\ + asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+6*16]");\ + asm ("pxor xmm"tostr(b7)", [TEMP_MUL2+1*16]");\ + asm ("pxor xmm"tostr(b7)", xmm"tostr(b3)""); /* -> helper! */\ + asm ("pxor xmm"tostr(b2)", xmm"tostr(b7)"");\ + \ + /* 3 */\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(a7)"");\ + asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+5*16]");\ + asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+7*16]");\ + /*asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+0*16]"); -> helper! */\ + asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+2*16]");\ + asm ("movdqa xmm"tostr(b3)", xmm"tostr(b0)"");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(b0)"");\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(b7)""); /* moved from 4 */\ + \ + /* 5 */\ + asm ("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\ + /*asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+0*16]"); -> helper! */\ + asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+2*16]");\ + asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+3*16]");\ + asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+5*16]");\ + asm ("pxor xmm"tostr(b3)", xmm"tostr(b4)"");\ + asm ("pxor xmm"tostr(b6)", xmm"tostr(b4)"");\ + \ + /* 6 */\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+1*16]");\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+3*16]");\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+4*16]");\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+6*16]");\ + asm ("pxor xmm"tostr(b4)", xmm"tostr(a3)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ + asm ("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\ + \ + /* 7 */\ + asm ("pxor xmm"tostr(a1)", [TEMP_MUL1+1*16]");\ + asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+4*16]");\ + asm ("pxor xmm"tostr(b2)", xmm"tostr(a1)"");\ + asm ("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\ + \ + /* 8 */\ + asm ("pxor xmm"tostr(a5)", [TEMP_MUL1+5*16]");\ + asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+0*16]");\ + asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\ + asm ("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\ + \ + /* 9 */\ + asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+2*16]");\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+5*16]");\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ + \ + /* 10 */\ + asm ("movaps xmm"tostr(a1)", [TEMP_MUL1+6*16]");\ + asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+1*16]");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(a1)"");\ + asm ("pxor xmm"tostr(b4)", xmm"tostr(a1)"");\ + \ + /* 11 */\ + asm ("movaps xmm"tostr(a5)", [TEMP_MUL1+3*16]");\ + asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+6*16]");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ + asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\ + \ + /* 12 */\ + asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+7*16]");\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+2*16]");\ + asm ("pxor xmm"tostr(b2)", xmm"tostr(a3)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ + \ + /* 13 */\ + asm ("pxor xmm"tostr(b0)", [TEMP_MUL4]");\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\ + asm ("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\ + asm ("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\ + asm ("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\ + asm 
("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\ +}/**/ + +#if (LENGTH <= 256) + +#define SET_CONSTANTS(){\ + SET_SHARED_CONSTANTS();\ + ((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\ + ((u64*)SUBSH_MASK)[ 1] = 0x080f0e0d0c0b0a09ULL;\ + ((u64*)SUBSH_MASK)[ 2] = 0x0007060504030201ULL;\ + ((u64*)SUBSH_MASK)[ 3] = 0x0a09080f0e0d0c0bULL;\ + ((u64*)SUBSH_MASK)[ 4] = 0x0100070605040302ULL;\ + ((u64*)SUBSH_MASK)[ 5] = 0x0c0b0a09080f0e0dULL;\ + ((u64*)SUBSH_MASK)[ 6] = 0x0201000706050403ULL;\ + ((u64*)SUBSH_MASK)[ 7] = 0x0e0d0c0b0a09080fULL;\ + ((u64*)SUBSH_MASK)[ 8] = 0x0302010007060504ULL;\ + ((u64*)SUBSH_MASK)[ 9] = 0x0f0e0d0c0b0a0908ULL;\ + ((u64*)SUBSH_MASK)[10] = 0x0403020100070605ULL;\ + ((u64*)SUBSH_MASK)[11] = 0x09080f0e0d0c0b0aULL;\ + ((u64*)SUBSH_MASK)[12] = 0x0504030201000706ULL;\ + ((u64*)SUBSH_MASK)[13] = 0x0b0a09080f0e0d0cULL;\ + ((u64*)SUBSH_MASK)[14] = 0x0605040302010007ULL;\ + ((u64*)SUBSH_MASK)[15] = 0x0d0c0b0a09080f0eULL;\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ + ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ + ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ + }\ + ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ +}/**/ + +#define Push_All_Regs(){\ +/* not using any... + asm("push rax");\ + asm("push rbx");\ + asm("push rcx");*/\ +}/**/ + +#define Pop_All_Regs(){\ +/* not using any... + asm("pop rcx");\ + asm("pop rbx");\ + asm("pop rax");*/\ +}/**/ + + +/* vperm: + * transformation before rounds with ipt + * first round add transformed constant + * middle rounds: add constant XOR 0x15...15 + * last round: additionally add 0x15...15 after MB + * transformation after rounds with opt + */ +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant + ShiftBytes (interleaved) */\ + asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ + asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ + asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\ + asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ + asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ + asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\ + asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ + asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ + asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ + asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ + asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\ + asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ + asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ + asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ + /* SubBytes + Multiplication by 2 and 4 */\ + VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +}/**/ + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ + ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + 
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ +} + + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ +\ + asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ +\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ +\ + asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\ + asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\ +\ + asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ + asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ + asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ +\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\ +\ + asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\ + asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\ + asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\ + asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\ + asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ + asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\ + asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\ + asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\ + asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\ + asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\ + asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ + asm 
("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\ + asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\ + asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ +}/**/ + + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst_CNT2(i, j){\ + asm ("movaps xmm0, [ROUND_CONST_L0+"tostr(i)"*16]");\ + asm ("movaps xmm1, [ROUND_CONST_L7+"tostr(i)"*16]");\ + asm ("movaps xmm2, [ROUND_CONST_L0+"tostr(j)"*16]");\ + asm ("movaps xmm3, [ROUND_CONST_L7+"tostr(j)"*16]");\ + VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ + asm ("pxor xmm0, [ALL_15]");\ + asm ("pxor xmm1, [ALL_15]");\ + asm ("pxor xmm2, [ALL_15]");\ + asm ("pxor xmm3, [ALL_15]");\ + asm ("movaps [ROUND_CONST_L0+"tostr(i)"*16], xmm0");\ + asm ("movaps [ROUND_CONST_L7+"tostr(i)"*16], xmm1");\ + asm ("movaps [ROUND_CONST_L0+"tostr(j)"*16], xmm2");\ + asm ("movaps [ROUND_CONST_L7+"tostr(j)"*16], xmm3");\ +}/**/ + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst(){\ + asm ("movaps xmm0, [ROUND_CONST_Lx]");\ + VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ + asm ("pxor xmm0, [ALL_15]");\ + asm ("movaps [ROUND_CONST_Lx], xmm0");\ + VPERM_Transform_RoundConst_CNT2(0, 1);\ + VPERM_Transform_RoundConst_CNT2(2, 3);\ + VPERM_Transform_RoundConst_CNT2(4, 5);\ + VPERM_Transform_RoundConst_CNT2(6, 7);\ + VPERM_Transform_RoundConst_CNT2(8, 9);\ +}/**/ + +void INIT(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + asm volatile ("emms"); + + /* transform round constants into VPERM mode */ + 
VPERM_Transform_RoundConst(); + + /* load IV into registers xmm12 - xmm15 */ + asm ("movaps xmm12, [rdi+0*16]"); + asm ("movaps xmm13, [rdi+1*16]"); + asm ("movaps xmm14, [rdi+2*16]"); + asm ("movaps xmm15, [rdi+3*16]"); + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* store transposed IV */ + asm ("movaps [rdi+0*16], xmm12"); + asm ("movaps [rdi+1*16], xmm2"); + asm ("movaps [rdi+2*16], xmm6"); + asm ("movaps [rdi+3*16], xmm7"); + + asm volatile ("emms"); + asm (".att_syntax noprefix"); +} + +void TF512(u64* h, u64* m) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + /* message M in rsi */ + +#ifdef IACA_TRACE + IACA_START; +#endif + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load message into registers xmm12 - xmm15 (Q = message) */ + asm ("movaps xmm12, [rsi+0*16]"); + asm ("movaps xmm13, [rsi+1*16]"); + asm ("movaps xmm14, [rsi+2*16]"); + asm ("movaps xmm15, [rsi+3*16]"); + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm0, [rdi+1*16]"); + asm ("movaps xmm4, [rdi+2*16]"); + asm ("movaps xmm5, [rdi+3*16]"); + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + asm ("pxor xmm8, xmm12"); + asm ("pxor xmm0, xmm2"); + asm ("pxor xmm4, xmm6"); + asm ("pxor xmm5, xmm7"); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + asm ("pxor xmm0, xmm8"); + asm ("pxor xmm1, xmm10"); + asm ("pxor xmm2, xmm12"); + asm ("pxor xmm3, xmm14"); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + asm ("pxor xmm0, [rdi+0*16]"); + asm ("pxor xmm1, [rdi+1*16]"); + asm ("pxor xmm2, [rdi+2*16]"); + asm ("pxor xmm3, [rdi+3*16]"); + + /* store CV */ + asm ("movaps [rdi+0*16], xmm0"); + asm ("movaps [rdi+1*16], xmm1"); + asm ("movaps [rdi+2*16], xmm2"); + asm ("movaps [rdi+3*16], xmm3"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + +#ifdef IACA_TRACE + IACA_END; +#endif + + return; +} + +void OF512(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm10, [rdi+1*16]"); + asm ("movaps xmm12, [rdi+2*16]"); + asm ("movaps xmm14, [rdi+3*16]"); + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register 
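     (OF512 is the output transformation: it applies P to the final chaining
     value, xors the chaining value back in, and keeps only the truncated
     second half of the state as the digest, as the two stores at the end of
     this function show.)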
*/ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm10, [rdi+1*16]"); + asm ("pxor xmm12, [rdi+2*16]"); + asm ("pxor xmm14, [rdi+3*16]"); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); + VPERM_Transform(9, 11, VPERM_OPT, 0, 1, 2, 3, 5, 6, 7); + + /* we only need to return the truncated half of the state */ + asm ("movaps [rdi+2*16], xmm9"); + asm ("movaps [rdi+3*16], xmm11"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + + return; +} + +#endif + +#if (LENGTH > 256) + +#define SET_CONSTANTS(){\ + SET_SHARED_CONSTANTS();\ + ((u64*)ALL_FF)[0] = 0xffffffffffffffffULL;\ + ((u64*)ALL_FF)[1] = 0xffffffffffffffffULL;\ + ((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\ + ((u64*)SUBSH_MASK)[ 1] = 0x0f0e0d0c0b0a0908ULL;\ + ((u64*)SUBSH_MASK)[ 2] = 0x0807060504030201ULL;\ + ((u64*)SUBSH_MASK)[ 3] = 0x000f0e0d0c0b0a09ULL;\ + ((u64*)SUBSH_MASK)[ 4] = 0x0908070605040302ULL;\ + ((u64*)SUBSH_MASK)[ 5] = 0x01000f0e0d0c0b0aULL;\ + ((u64*)SUBSH_MASK)[ 6] = 0x0a09080706050403ULL;\ + ((u64*)SUBSH_MASK)[ 7] = 0x0201000f0e0d0c0bULL;\ + ((u64*)SUBSH_MASK)[ 8] = 0x0b0a090807060504ULL;\ + ((u64*)SUBSH_MASK)[ 9] = 0x030201000f0e0d0cULL;\ + ((u64*)SUBSH_MASK)[10] = 0x0c0b0a0908070605ULL;\ + ((u64*)SUBSH_MASK)[11] = 0x04030201000f0e0dULL;\ + ((u64*)SUBSH_MASK)[12] = 0x0d0c0b0a09080706ULL;\ + ((u64*)SUBSH_MASK)[13] = 0x0504030201000f0eULL;\ + ((u64*)SUBSH_MASK)[14] = 0x0201000f0e0d0c0bULL;\ + ((u64*)SUBSH_MASK)[15] = 0x0a09080706050403ULL;\ + for(i = 0; i < ROUNDS1024; i++)\ + {\ + ((u64*)ROUND_CONST_P)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0xf0e0d0c0b0a09080ULL;\ + ((u64*)ROUND_CONST_P)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ + ((u64*)ROUND_CONST_Q)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0x0f1f2f3f4f5f6f7fULL;\ + ((u64*)ROUND_CONST_Q)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ + }\ +}/**/ + +#define Push_All_Regs(){\ + asm("push rax");\ + asm("push rbx");\ + asm("push rcx");\ +}/**/ + +#define Pop_All_Regs(){\ + asm("pop rcx");\ + asm("pop rbx");\ + asm("pop rax");\ +}/**/ + +/* one round + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* SubBytes + Multiplication */\ + VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +}/**/ + +#define ROUNDS_P(){\ + asm ("xor rax, rax");\ + asm ("xor rbx, rbx");\ + asm ("add bl, 2");\ + asm ("1:");\ + /* AddRoundConstant P1024 */\ + asm ("pxor xmm8, [ROUND_CONST_P+eax*8]");\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + asm ("pshufb xmm8, [SUBSH_MASK+0*16]");\ + asm ("pshufb xmm9, [SUBSH_MASK+1*16]");\ + asm ("pshufb xmm10, [SUBSH_MASK+2*16]");\ + asm ("pshufb xmm11, [SUBSH_MASK+3*16]");\ + asm ("pshufb xmm12, [SUBSH_MASK+4*16]");\ + asm ("pshufb xmm13, [SUBSH_MASK+5*16]");\ + asm ("pshufb xmm14, [SUBSH_MASK+6*16]");\ + asm 
("pshufb xmm15, [SUBSH_MASK+7*16]");\ + /* SubBytes + MixBytes */\ + SUBMIX(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + VPERM_Add_Constant(0, 1, 2, 3, 4, 5, 6, 7, ALL_15, 8);\ + /* AddRoundConstant P1024 */\ + asm ("pxor xmm0, [ROUND_CONST_P+ebx*8]");\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + asm ("pshufb xmm0, [SUBSH_MASK+0*16]");\ + asm ("pshufb xmm1, [SUBSH_MASK+1*16]");\ + asm ("pshufb xmm2, [SUBSH_MASK+2*16]");\ + asm ("pshufb xmm3, [SUBSH_MASK+3*16]");\ + asm ("pshufb xmm4, [SUBSH_MASK+4*16]");\ + asm ("pshufb xmm5, [SUBSH_MASK+5*16]");\ + asm ("pshufb xmm6, [SUBSH_MASK+6*16]");\ + asm ("pshufb xmm7, [SUBSH_MASK+7*16]");\ + /* SubBytes + MixBytes */\ + SUBMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ + asm ("add al, 4");\ + asm ("add bl, 4");\ + asm ("mov rcx, rax");\ + asm ("sub cl, 28");\ + asm ("jb 1b");\ +}/**/ + +#define ROUNDS_Q(){\ + VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 1);\ + asm ("xor rax, rax");\ + asm ("xor rbx, rbx");\ + asm ("add bl, 2");\ + asm ("2:");\ + /* AddRoundConstant Q1024 */\ + asm ("movaps xmm1, [ALL_FF]");\ + asm ("pxor xmm8, xmm1");\ + asm ("pxor xmm9, xmm1");\ + asm ("pxor xmm10, xmm1");\ + asm ("pxor xmm11, xmm1");\ + asm ("pxor xmm12, xmm1");\ + asm ("pxor xmm13, xmm1");\ + asm ("pxor xmm14, xmm1");\ + asm ("pxor xmm15, [ROUND_CONST_Q+eax*8]");\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + asm ("pshufb xmm8, [SUBSH_MASK+1*16]");\ + asm ("pshufb xmm9, [SUBSH_MASK+3*16]");\ + asm ("pshufb xmm10, [SUBSH_MASK+5*16]");\ + asm ("pshufb xmm11, [SUBSH_MASK+7*16]");\ + asm ("pshufb xmm12, [SUBSH_MASK+0*16]");\ + asm ("pshufb xmm13, [SUBSH_MASK+2*16]");\ + asm ("pshufb xmm14, [SUBSH_MASK+4*16]");\ + asm ("pshufb xmm15, [SUBSH_MASK+6*16]");\ + /* SubBytes + MixBytes */\ + SUBMIX(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + /* AddRoundConstant Q1024 */\ + asm ("movaps xmm9, [ALL_FF]");\ + asm ("pxor xmm0, xmm9");\ + asm ("pxor xmm1, xmm9");\ + asm ("pxor xmm2, xmm9");\ + asm ("pxor xmm3, xmm9");\ + asm ("pxor xmm4, xmm9");\ + asm ("pxor xmm5, xmm9");\ + asm ("pxor xmm6, xmm9");\ + asm ("pxor xmm7, [ROUND_CONST_Q+ebx*8]");\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + asm ("pshufb xmm0, [SUBSH_MASK+1*16]");\ + asm ("pshufb xmm1, [SUBSH_MASK+3*16]");\ + asm ("pshufb xmm2, [SUBSH_MASK+5*16]");\ + asm ("pshufb xmm3, [SUBSH_MASK+7*16]");\ + asm ("pshufb xmm4, [SUBSH_MASK+0*16]");\ + asm ("pshufb xmm5, [SUBSH_MASK+2*16]");\ + asm ("pshufb xmm6, [SUBSH_MASK+4*16]");\ + asm ("pshufb xmm7, [SUBSH_MASK+6*16]");\ + /* SubBytes + MixBytes */\ + SUBMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + asm ("add al, 4");\ + asm ("add bl, 4");\ + asm ("mov rcx, rax");\ + asm ("sub cl, 28");\ + asm ("jb 2b");\ + VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 1);\ +}/**/ + + +/* Matrix Transpose + * input is a 1024-bit state with two columns in one xmm + * output is a 1024-bit state with two rows in one xmm + * inputs: i0-i7 + * outputs: i0-i7 + * clobbers: t0-t7 + */ +#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ + asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ +\ + asm ("pshufb xmm"tostr(i6)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ + asm ("movdqa xmm"tostr(t1)", xmm"tostr(i2)"");\ + asm ("pshufb xmm"tostr(i4)", 
xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i5)", xmm"tostr(t0)"");\ + asm ("movdqa xmm"tostr(t2)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(t3)", xmm"tostr(i6)"");\ + asm ("pshufb xmm"tostr(i7)", xmm"tostr(t0)"");\ +\ + /* continue with unpack using 4 temp registers */\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(i0)"");\ + asm ("punpckhwd xmm"tostr(t2)", xmm"tostr(i5)"");\ + asm ("punpcklwd xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpckhwd xmm"tostr(t3)", xmm"tostr(i7)"");\ + asm ("punpcklwd xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i1)"");\ + asm ("punpckhwd xmm"tostr(t1)", xmm"tostr(i3)"");\ + asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ +\ + /* shuffle with immediate */\ + asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ + asm ("pshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ + asm ("pshufd xmm"tostr(t2)", xmm"tostr(t2)", 216");\ + asm ("pshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ + asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ + asm ("pshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ + asm ("pshufd xmm"tostr(i6)", xmm"tostr(i6)", 216");\ +\ + /* continue with unpack */\ + asm ("movdqa xmm"tostr(t4)", xmm"tostr(i0)"");\ + asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("punpckhdq xmm"tostr(t4)", xmm"tostr(i2)"");\ + asm ("movdqa xmm"tostr(t5)", xmm"tostr(t0)"");\ + asm ("punpckldq xmm"tostr(t0)", xmm"tostr(t1)"");\ + asm ("punpckhdq xmm"tostr(t5)", xmm"tostr(t1)"");\ + asm ("movdqa xmm"tostr(t6)", xmm"tostr(i4)"");\ + asm ("punpckldq xmm"tostr(i4)", xmm"tostr(i6)"");\ + asm ("movdqa xmm"tostr(t7)", xmm"tostr(t2)"");\ + asm ("punpckhdq xmm"tostr(t6)", xmm"tostr(i6)"");\ + asm ("movdqa xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("punpckldq xmm"tostr(t2)", xmm"tostr(t3)"");\ + asm ("movdqa xmm"tostr(i3)", xmm"tostr(t0)"");\ + asm ("punpckhdq xmm"tostr(t7)", xmm"tostr(t3)"");\ +\ + /* there are now 2 rows in each xmm */\ + /* unpack to get 1 row of CV in each xmm */\ + asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ + asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(i4)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(i4)", xmm"tostr(t4)"");\ + asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t2)"");\ + asm ("movdqa xmm"tostr(i5)", xmm"tostr(t4)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t2)"");\ + asm ("movdqa xmm"tostr(i6)", xmm"tostr(t5)"");\ + asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t6)"");\ + asm ("movdqa xmm"tostr(i7)", xmm"tostr(t5)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t6)"");\ + asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t7)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t7)"");\ + /* transpose done */\ +}/**/ + +/* Matrix Transpose Inverse + * input is a 1024-bit state with two rows in one xmm + * output is a 1024-bit state with two columns in one xmm + * inputs: i0-i7 + * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) + * clobbers: t0-t4 + */ +#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ + /* transpose matrix to get output format */\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i1)"");\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpckhqdq xmm"tostr(t0)", xmm"tostr(i3)"");\ + asm ("movdqa xmm"tostr(t1)", xmm"tostr(i4)"");\ + asm ("punpcklqdq 
xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpckhqdq xmm"tostr(t1)", xmm"tostr(i5)"");\ + asm ("movdqa xmm"tostr(t2)", xmm"tostr(i6)"");\ + asm ("movaps xmm"tostr(o0)", [TRANSP_MASK]");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("punpckhqdq xmm"tostr(t2)", xmm"tostr(i7)"");\ + /* load transpose mask into a register, because it will be used 8 times */\ + asm ("pshufb xmm"tostr(i0)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(i2)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(i4)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(i6)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(o1)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(t0)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(t1)", xmm"tostr(o0)"");\ + asm ("pshufb xmm"tostr(t2)", xmm"tostr(o0)"");\ + /* continue with unpack using 4 temp registers */\ + asm ("movdqa xmm"tostr(t3)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(o1)"");\ + asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(t4)", xmm"tostr(t1)"");\ + \ + asm ("punpckhwd xmm"tostr(t3)", xmm"tostr(i6)"");\ + asm ("punpcklwd xmm"tostr(i4)", xmm"tostr(i6)"");\ + asm ("punpckhwd xmm"tostr(o0)", xmm"tostr(i2)"");\ + asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("punpckhwd xmm"tostr(o2)", xmm"tostr(t0)"");\ + asm ("punpcklwd xmm"tostr(o1)", xmm"tostr(t0)"");\ + asm ("punpckhwd xmm"tostr(t4)", xmm"tostr(t2)"");\ + asm ("punpcklwd xmm"tostr(t1)", xmm"tostr(t2)"");\ + /* shuffle with immediate */\ + asm ("pshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ + asm ("pshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ + asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ + asm ("pshufd xmm"tostr(o2)", xmm"tostr(o2)", 216");\ + asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("pshufd xmm"tostr(o0)", xmm"tostr(o0)", 216");\ + asm ("pshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ + asm ("pshufd xmm"tostr(t4)", xmm"tostr(t4)", 216");\ + /* continue with unpack */\ + asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(i3)", xmm"tostr(o0)"");\ + asm ("movdqa xmm"tostr(i5)", xmm"tostr(o1)"");\ + asm ("movdqa xmm"tostr(i7)", xmm"tostr(o2)"");\ + asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("punpckhdq xmm"tostr(i1)", xmm"tostr(i4)"");\ + asm ("punpckldq xmm"tostr(o0)", xmm"tostr(t3)"");\ + asm ("punpckhdq xmm"tostr(i3)", xmm"tostr(t3)"");\ + asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t1)"");\ + asm ("punpckhdq xmm"tostr(i5)", xmm"tostr(t1)"");\ + asm ("punpckldq xmm"tostr(o2)", xmm"tostr(t4)"");\ + asm ("punpckhdq xmm"tostr(i7)", xmm"tostr(t4)"");\ + /* transpose done */\ +}/**/ + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst_CNT2(i, j){\ + asm ("movaps xmm0, [ROUND_CONST_P+"tostr(i)"*16]");\ + asm ("movaps xmm1, [ROUND_CONST_P+"tostr(j)"*16]");\ + asm ("movaps xmm2, [ROUND_CONST_Q+"tostr(i)"*16]");\ + asm ("movaps xmm3, [ROUND_CONST_Q+"tostr(j)"*16]");\ + VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ + asm ("pxor xmm2, [ALL_15]");\ + asm ("pxor xmm3, [ALL_15]");\ + asm ("movaps [ROUND_CONST_P+"tostr(i)"*16], xmm0");\ + asm ("movaps [ROUND_CONST_P+"tostr(j)"*16], xmm1");\ + asm ("movaps [ROUND_CONST_Q+"tostr(i)"*16], xmm2");\ + asm ("movaps [ROUND_CONST_Q+"tostr(j)"*16], xmm3");\ +}/**/ + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst(){\ + VPERM_Transform_RoundConst_CNT2(0, 1);\ + VPERM_Transform_RoundConst_CNT2(2, 3);\ + VPERM_Transform_RoundConst_CNT2(4, 5);\ + VPERM_Transform_RoundConst_CNT2(6, 7);\ + 
VPERM_Transform_RoundConst_CNT2(8, 9);\ + VPERM_Transform_RoundConst_CNT2(10, 11);\ + VPERM_Transform_RoundConst_CNT2(12, 13);\ + asm ("movaps xmm0, [ALL_FF]");\ + VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ + asm ("pxor xmm0, [ALL_15]");\ + asm ("movaps [ALL_FF], xmm0");\ +}/**/ + + +void INIT(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + asm volatile ("emms"); + + /* transform round constants into VPERM mode */ + VPERM_Transform_RoundConst(); + + /* load IV into registers xmm8 - xmm15 */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm9, [rdi+1*16]"); + asm ("movaps xmm10, [rdi+2*16]"); + asm ("movaps xmm11, [rdi+3*16]"); + asm ("movaps xmm12, [rdi+4*16]"); + asm ("movaps xmm13, [rdi+5*16]"); + asm ("movaps xmm14, [rdi+6*16]"); + asm ("movaps xmm15, [rdi+7*16]"); + + /* transform chaining value from column ordering into row ordering */ + VPERM_Transform_State( 8, 9, 10, 11, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); + VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); + Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); + + /* store transposed IV */ + asm ("movaps [rdi+0*16], xmm8"); + asm ("movaps [rdi+1*16], xmm9"); + asm ("movaps [rdi+2*16], xmm10"); + asm ("movaps [rdi+3*16], xmm11"); + asm ("movaps [rdi+4*16], xmm12"); + asm ("movaps [rdi+5*16], xmm13"); + asm ("movaps [rdi+6*16], xmm14"); + asm ("movaps [rdi+7*16], xmm15"); + + asm volatile ("emms"); + asm (".att_syntax noprefix"); +} + +void TF1024(u64* h, u64* m) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + /* message M in rsi */ + +#ifdef IACA_TRACE + IACA_START; +#endif + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load message into registers xmm8 - xmm15 (Q = message) */ + asm ("movaps xmm8, [rsi+0*16]"); + asm ("movaps xmm9, [rsi+1*16]"); + asm ("movaps xmm10, [rsi+2*16]"); + asm ("movaps xmm11, [rsi+3*16]"); + asm ("movaps xmm12, [rsi+4*16]"); + asm ("movaps xmm13, [rsi+5*16]"); + asm ("movaps xmm14, [rsi+6*16]"); + asm ("movaps xmm15, [rsi+7*16]"); + + /* transform message M from column ordering into row ordering */ + VPERM_Transform_State( 8, 9, 10, 11, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); + VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); + Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); + + /* store message M (Q input) for later */ + asm ("movaps [QTEMP+0*16], xmm8"); + asm ("movaps [QTEMP+1*16], xmm9"); + asm ("movaps [QTEMP+2*16], xmm10"); + asm ("movaps [QTEMP+3*16], xmm11"); + asm ("movaps [QTEMP+4*16], xmm12"); + asm ("movaps [QTEMP+5*16], xmm13"); + asm ("movaps [QTEMP+6*16], xmm14"); + asm ("movaps [QTEMP+7*16], xmm15"); + + /* xor CV to message to get P input */ + /* result: CV+M in xmm8...xmm15 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm9, [rdi+1*16]"); + asm ("pxor xmm10, [rdi+2*16]"); + asm ("pxor xmm11, [rdi+3*16]"); + asm ("pxor xmm12, [rdi+4*16]"); + asm ("pxor xmm13, [rdi+5*16]"); + asm ("pxor xmm14, [rdi+6*16]"); + asm ("pxor xmm15, [rdi+7*16]"); + + /* compute permutation P */ + /* result: P(CV+M) in xmm8...xmm15 */ + ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV+M)+CV in xmm8...xmm15 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm9, [rdi+1*16]"); + asm ("pxor xmm10, [rdi+2*16]"); + asm ("pxor xmm11, [rdi+3*16]"); + asm ("pxor xmm12, [rdi+4*16]"); + asm ("pxor xmm13, [rdi+5*16]"); + asm ("pxor xmm14, [rdi+6*16]"); + asm ("pxor xmm15, [rdi+7*16]"); + + /* 
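     (Only P has been applied so far.  The full compression step is
     P(CV+M) + CV + Q(M): Q is computed next from the message copy saved in
     QTEMP and xored onto this intermediate result.)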
store P(CV+M)+CV */ + asm ("movaps [rdi+0*16], xmm8"); + asm ("movaps [rdi+1*16], xmm9"); + asm ("movaps [rdi+2*16], xmm10"); + asm ("movaps [rdi+3*16], xmm11"); + asm ("movaps [rdi+4*16], xmm12"); + asm ("movaps [rdi+5*16], xmm13"); + asm ("movaps [rdi+6*16], xmm14"); + asm ("movaps [rdi+7*16], xmm15"); + + /* load message M (Q input) into xmm8-15 */ + asm ("movaps xmm8, [QTEMP+0*16]"); + asm ("movaps xmm9, [QTEMP+1*16]"); + asm ("movaps xmm10, [QTEMP+2*16]"); + asm ("movaps xmm11, [QTEMP+3*16]"); + asm ("movaps xmm12, [QTEMP+4*16]"); + asm ("movaps xmm13, [QTEMP+5*16]"); + asm ("movaps xmm14, [QTEMP+6*16]"); + asm ("movaps xmm15, [QTEMP+7*16]"); + + /* compute permutation Q */ + /* result: Q(M) in xmm8...xmm15 */ + ROUNDS_Q(); + + /* xor Q output */ + /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm9, [rdi+1*16]"); + asm ("pxor xmm10, [rdi+2*16]"); + asm ("pxor xmm11, [rdi+3*16]"); + asm ("pxor xmm12, [rdi+4*16]"); + asm ("pxor xmm13, [rdi+5*16]"); + asm ("pxor xmm14, [rdi+6*16]"); + asm ("pxor xmm15, [rdi+7*16]"); + + /* store CV */ + asm ("movaps [rdi+0*16], xmm8"); + asm ("movaps [rdi+1*16], xmm9"); + asm ("movaps [rdi+2*16], xmm10"); + asm ("movaps [rdi+3*16], xmm11"); + asm ("movaps [rdi+4*16], xmm12"); + asm ("movaps [rdi+5*16], xmm13"); + asm ("movaps [rdi+6*16], xmm14"); + asm ("movaps [rdi+7*16], xmm15"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + +#ifdef IACA_TRACE + IACA_END; +#endif + + return; +} + +void OF1024(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load CV into registers xmm8 - xmm15 */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm9, [rdi+1*16]"); + asm ("movaps xmm10, [rdi+2*16]"); + asm ("movaps xmm11, [rdi+3*16]"); + asm ("movaps xmm12, [rdi+4*16]"); + asm ("movaps xmm13, [rdi+5*16]"); + asm ("movaps xmm14, [rdi+6*16]"); + asm ("movaps xmm15, [rdi+7*16]"); + + /* compute permutation P */ + /* result: P(CV) in xmm8...xmm15 */ + ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8...xmm15 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm9, [rdi+1*16]"); + asm ("pxor xmm10, [rdi+2*16]"); + asm ("pxor xmm11, [rdi+3*16]"); + asm ("pxor xmm12, [rdi+4*16]"); + asm ("pxor xmm13, [rdi+5*16]"); + asm ("pxor xmm14, [rdi+6*16]"); + asm ("pxor xmm15, [rdi+7*16]"); + + /* transpose CV back from row ordering to column ordering */ + /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ + Matrix_Transpose_INV(8, 9, 10, 11, 12, 13, 14, 15, 4, 0, 6, 1, 2, 3, 5, 7); + VPERM_Transform_State( 0, 6, 13, 15, VPERM_OPT, 1, 2, 3, 5, 7, 10, 12); + + /* we only need to return the truncated half of the state */ + asm ("movaps [rdi+4*16], xmm0"); + asm ("movaps [rdi+5*16], xmm6"); + asm ("movaps [rdi+6*16], xmm13"); + asm ("movaps [rdi+7*16], xmm15"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + + return; +} + +#endif + diff --git a/algo/aes_ni/groestl-intr-aes.h b/algo/aes_ni/groestl-intr-aes.h new file mode 100644 index 000000000..3502c0358 --- /dev/null +++ b/algo/aes_ni/groestl-intr-aes.h @@ -0,0 +1,965 @@ +/* groestl-intr-aes.h Aug 2011 + * + * Groestl implementation with intrinsics using ssse3, sse4.1, and aes + * instructions. + * Author: Günther A. 
Roland, Martin Schläffer, Krystian Matusiewicz + * + * This code is placed in the public domain + */ + +#include +#include +#include "hash-groestl.h" + +/* global constants */ +__m128i ROUND_CONST_Lx; +__m128i ROUND_CONST_L0[ROUNDS512]; +__m128i ROUND_CONST_L7[ROUNDS512]; +__m128i ROUND_CONST_P[ROUNDS1024]; +__m128i ROUND_CONST_Q[ROUNDS1024]; +__m128i TRANSP_MASK; +__m128i SUBSH_MASK[8]; +__m128i ALL_1B; +__m128i ALL_FF; + + +#define tos(a) #a +#define tostr(a) tos(a) + + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b */ +#define MUL2(i, j, k){\ + j = _mm_xor_si128(j, j);\ + j = _mm_cmpgt_epi8(j, i);\ + i = _mm_add_epi8(i, i);\ + j = _mm_and_si128(j, k);\ + i = _mm_xor_si128(i, j);\ +} + + /**/ + +/* Yet another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + b6 = a0;\ + b7 = a1;\ + a0 = _mm_xor_si128(a0, a1);\ + b0 = a2;\ + a1 = _mm_xor_si128(a1, a2);\ + b1 = a3;\ + a2 = _mm_xor_si128(a2, a3);\ + b2 = a4;\ + a3 = _mm_xor_si128(a3, a4);\ + b3 = a5;\ + a4 = _mm_xor_si128(a4, a5);\ + b4 = a6;\ + a5 = _mm_xor_si128(a5, a6);\ + b5 = a7;\ + a6 = _mm_xor_si128(a6, a7);\ + a7 = _mm_xor_si128(a7, b6);\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + b0 = _mm_xor_si128(b0, a4);\ + b6 = _mm_xor_si128(b6, a4);\ + b1 = _mm_xor_si128(b1, a5);\ + b7 = _mm_xor_si128(b7, a5);\ + b2 = _mm_xor_si128(b2, a6);\ + b0 = _mm_xor_si128(b0, a6);\ + /* spill values y_4, y_5 to memory */\ + TEMP0 = b0;\ + b3 = _mm_xor_si128(b3, a7);\ + b1 = _mm_xor_si128(b1, a7);\ + TEMP1 = b1;\ + b4 = _mm_xor_si128(b4, a0);\ + b2 = _mm_xor_si128(b2, a0);\ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b5 = _mm_xor_si128(b5, a1);\ + b3 = _mm_xor_si128(b3, a1);\ + b1 = a1;\ + b6 = _mm_xor_si128(b6, a2);\ + b4 = _mm_xor_si128(b4, a2);\ + TEMP2 = a2;\ + b7 = _mm_xor_si128(b7, a3);\ + b5 = _mm_xor_si128(b5, a3);\ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm_xor_si128(a0, a3);\ + a1 = _mm_xor_si128(a1, a4);\ + a2 = _mm_xor_si128(a2, a5);\ + a3 = _mm_xor_si128(a3, a6);\ + a4 = _mm_xor_si128(a4, a7);\ + a5 = _mm_xor_si128(a5, b0);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, TEMP2);\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = ALL_1B;\ + MUL2(a0, b0, b1);\ + a0 = _mm_xor_si128(a0, TEMP0);\ + MUL2(a1, b0, b1);\ + a1 = _mm_xor_si128(a1, TEMP1);\ + MUL2(a2, b0, b1);\ + a2 = _mm_xor_si128(a2, b2);\ + MUL2(a3, b0, b1);\ + a3 = _mm_xor_si128(a3, b3);\ + MUL2(a4, b0, b1);\ + a4 = _mm_xor_si128(a4, b4);\ + MUL2(a5, b0, b1);\ + a5 = _mm_xor_si128(a5, b5);\ + MUL2(a6, b0, b1);\ + a6 = _mm_xor_si128(a6, b6);\ + MUL2(a7, b0, b1);\ + a7 = _mm_xor_si128(a7, b7);\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... 
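     (For reference: MUL2 above is the usual GF(2^8) "xtime" applied to all
     16 bytes of a register at once.  A rough scalar sketch of the same
     doubling for a single byte x, in plain C:

         y = (uint8_t)(x << 1);
         if (x & 0x80) y ^= 0x1b;   reduction by the AES polynomial 0x11b

     The cmpgt/add/and/xor sequence in MUL2 does the same thing branch-free:
     the signed byte compare against zero yields a 0xff mask exactly where a
     byte has its top bit set, and that mask selects the 0x1b reduction.
     The MUL2 calls in this block are the z_i = 2*x_i and v_i = 2*w_i
     doublings from the relations listed in the comment above this macro.)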
*/\ + MUL2(a0, b0, b1);\ + b5 = _mm_xor_si128(b5, a0);\ + MUL2(a1, b0, b1);\ + b6 = _mm_xor_si128(b6, a1);\ + MUL2(a2, b0, b1);\ + b7 = _mm_xor_si128(b7, a2);\ + MUL2(a5, b0, b1);\ + b2 = _mm_xor_si128(b2, a5);\ + MUL2(a6, b0, b1);\ + b3 = _mm_xor_si128(b3, a6);\ + MUL2(a7, b0, b1);\ + b4 = _mm_xor_si128(b4, a7);\ + MUL2(a3, b0, b1);\ + MUL2(a4, b0, b1);\ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm_xor_si128(b0, a3);\ + b1 = _mm_xor_si128(b1, a4);\ +}/*MixBytes*/ + +#if (LENGTH <= 256) + +#define SET_CONSTANTS(){\ + ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\ + SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\ + SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\ + SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\ + SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\ + SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\ + SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\ + SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ + }\ + ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ +}while(0); \ + +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant */\ + b1 = ROUND_CONST_Lx;\ + a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ + a1 = _mm_xor_si128(a1, b1);\ + a2 = _mm_xor_si128(a2, b1);\ + a3 = _mm_xor_si128(a3, b1);\ + a4 = _mm_xor_si128(a4, b1);\ + a5 = _mm_xor_si128(a5, b1);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ + \ + /* ShiftBytes + SubBytes (interleaved) */\ + b0 = _mm_xor_si128(b0, b0);\ + a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ + a0 = _mm_aesenclast_si128(a0, b0);\ + a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ + a1 = _mm_aesenclast_si128(a1, b0);\ + a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ + a2 = _mm_aesenclast_si128(a2, b0);\ + a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ + a3 = _mm_aesenclast_si128(a3, b0);\ + a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ + a4 = _mm_aesenclast_si128(a4, b0);\ + a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ + a5 = _mm_aesenclast_si128(a5, b0);\ + a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ + a6 = _mm_aesenclast_si128(a6, b0);\ + a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ + a7 = _mm_aesenclast_si128(a7, b0);\ + \ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 
xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ +} + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + t0 = TRANSP_MASK;\ + \ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ + \ + o1 = i0;\ + t0 = i2;\ + \ + i0 = _mm_unpacklo_epi16(i0, i1);\ + o1 = _mm_unpackhi_epi16(o1, i1);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + t0 = _mm_unpackhi_epi16(t0, i3);\ + \ + i0 = _mm_shuffle_epi32(i0, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + t0 = _mm_shuffle_epi32(t0, 216);\ + \ + o2 = i0;\ + o3 = o1;\ + \ + i0 = _mm_unpacklo_epi32(i0, i2);\ + o1 = _mm_unpacklo_epi32(o1, t0);\ + o2 = _mm_unpackhi_epi32(o2, i2);\ + o3 = _mm_unpackhi_epi32(o3, t0);\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + o1 = i0;\ + o2 = i1;\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + o1 = _mm_unpackhi_epi64(o1, i4);\ + o3 = i1;\ + o4 = i2;\ + o2 = _mm_unpacklo_epi64(o2, i5);\ + o3 = _mm_unpackhi_epi64(o3, i5);\ + o5 = i2;\ + o6 = i3;\ + o4 = _mm_unpacklo_epi64(o4, i6);\ + o5 = _mm_unpackhi_epi64(o5, i6);\ + o7 = i3;\ + o6 = _mm_unpacklo_epi64(o6, i7);\ + o7 = _mm_unpackhi_epi64(o7, i7);\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + o0 = i0;\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o0 = _mm_unpackhi_epi64(o0, i1);\ + o1 = i2;\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + o1 = _mm_unpackhi_epi64(o1, i3);\ + o2 = i4;\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + o2 = _mm_unpackhi_epi64(o2, i5);\ + o3 = i6;\ + i6 = _mm_unpacklo_epi64(i6, i7);\ + o3 = _mm_unpackhi_epi64(o3, i7);\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + t0 = _mm_xor_si128(t0, t0);\ + i1 = i0;\ + i3 = i2;\ + i5 = i4;\ 
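  /* (each input register is duplicated so that its two 64-bit rows can be */\
  /* separated: unpacklo keeps the low row next to zero, while unpackhi    */\
  /* moves the high row of the copy into its low half)                     */\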
+ i7 = i6;\ + i0 = _mm_unpacklo_epi64(i0, t0);\ + i1 = _mm_unpackhi_epi64(i1, t0);\ + i2 = _mm_unpacklo_epi64(i2, t0);\ + i3 = _mm_unpackhi_epi64(i3, t0);\ + i4 = _mm_unpacklo_epi64(i4, t0);\ + i5 = _mm_unpackhi_epi64(i5, t0);\ + i6 = _mm_unpacklo_epi64(i6, t0);\ + i7 = _mm_unpackhi_epi64(i7, t0);\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ +endif\ +}/**/ + + +void INIT(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7; + static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm12 - xmm15 */ + xmm12 = chaining[0]; + xmm13 = chaining[1]; + xmm14 = chaining[2]; + xmm15 = chaining[3]; + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* store transposed IV */ + chaining[0] = xmm12; + chaining[1] = xmm2; + chaining[2] = xmm6; + chaining[3] = xmm7; +} + +void TF512(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm12 - xmm15 */ + xmm12 = message[0]; + xmm13 = message[1]; + xmm14 = message[2]; + xmm15 = message[3]; + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + xmm8 = chaining[0]; + xmm0 = chaining[1]; + xmm4 = chaining[2]; + xmm5 = chaining[3]; + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + xmm8 = _mm_xor_si128(xmm8, xmm12); + xmm0 = _mm_xor_si128(xmm0, xmm2); + xmm4 = _mm_xor_si128(xmm4, xmm6); + xmm5 = _mm_xor_si128(xmm5, xmm7); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, xmm8); + xmm1 = _mm_xor_si128(xmm1, xmm10); + xmm2 = _mm_xor_si128(xmm2, xmm12); + xmm3 = _mm_xor_si128(xmm3, xmm14); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + xmm0 = 
_mm_xor_si128(xmm0, (chaining[0])); + xmm1 = _mm_xor_si128(xmm1, (chaining[1])); + xmm2 = _mm_xor_si128(xmm2, (chaining[2])); + xmm3 = _mm_xor_si128(xmm3, (chaining[3])); + + /* store CV */ + chaining[0] = xmm0; + chaining[1] = xmm1; + chaining[2] = xmm2; + chaining[3] = xmm3; + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF512(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + xmm8 = chaining[0]; + xmm10 = chaining[1]; + xmm12 = chaining[2]; + xmm14 = chaining[3]; + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm10 = _mm_xor_si128(xmm10, (chaining[1])); + xmm12 = _mm_xor_si128(xmm12, (chaining[2])); + xmm14 = _mm_xor_si128(xmm14, (chaining[3])); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); + + /* we only need to return the truncated half of the state */ + chaining[2] = xmm9; + chaining[3] = xmm11; +} + +#endif + +#if (LENGTH > 256) + +#define SET_CONSTANTS(){\ + ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ + ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + SUBSH_MASK[0] = _mm_set_epi32(0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00);\ + SUBSH_MASK[1] = _mm_set_epi32(0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01);\ + SUBSH_MASK[2] = _mm_set_epi32(0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02);\ + SUBSH_MASK[3] = _mm_set_epi32(0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003);\ + SUBSH_MASK[4] = _mm_set_epi32(0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104);\ + SUBSH_MASK[5] = _mm_set_epi32(0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205);\ + SUBSH_MASK[6] = _mm_set_epi32(0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306);\ + SUBSH_MASK[7] = _mm_set_epi32(0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b);\ + for(i = 0; i < ROUNDS1024; i++)\ + {\ + ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\ + }\ +}while(0);\ + +/* one round + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* SubBytes */\ + b0 = _mm_xor_si128(b0, b0);\ + a0 = _mm_aesenclast_si128(a0, b0);\ + a1 = _mm_aesenclast_si128(a1, b0);\ + a2 = _mm_aesenclast_si128(a2, 
b0);\ + a3 = _mm_aesenclast_si128(a3, b0);\ + a4 = _mm_aesenclast_si128(a4, b0);\ + a5 = _mm_aesenclast_si128(a5, b0);\ + a6 = _mm_aesenclast_si128(a6, b0);\ + a7 = _mm_aesenclast_si128(a7, b0);\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +#define ROUNDS_P(){\ + u8 round_counter = 0;\ + for(round_counter = 0; round_counter < 14; round_counter+=2) {\ + /* AddRoundConstant P1024 */\ + xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\ + xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\ + xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\ + xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[3]));\ + xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[4]));\ + xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[5]));\ + xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[6]));\ + xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[7]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + \ + /* AddRoundConstant P1024 */\ + xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\ + xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\ + xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\ + xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[3]));\ + xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[4]));\ + xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\ + xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\ + xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + }\ +} + +#define ROUNDS_Q(){\ + u8 round_counter = 0;\ + for(round_counter = 0; round_counter < 14; round_counter+=2) {\ + /* AddRoundConstant Q1024 */\ + xmm1 = ALL_FF;\ + xmm8 = _mm_xor_si128(xmm8, xmm1);\ + xmm9 = _mm_xor_si128(xmm9, xmm1);\ + xmm10 = _mm_xor_si128(xmm10, xmm1);\ + xmm11 = _mm_xor_si128(xmm11, xmm1);\ + xmm12 = _mm_xor_si128(xmm12, xmm1);\ + xmm13 = _mm_xor_si128(xmm13, xmm1);\ + xmm14 = _mm_xor_si128(xmm14, xmm1);\ + xmm15 = _mm_xor_si128(xmm15, (ROUND_CONST_Q[round_counter]));\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[1]));\ + xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[3]));\ + xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[5]));\ + xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[7]));\ + xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[0]));\ + xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[2]));\ + xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[4]));\ + xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[6]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + \ + /* AddRoundConstant Q1024 */\ + xmm9 = ALL_FF;\ + xmm0 = _mm_xor_si128(xmm0, xmm9);\ + xmm1 = _mm_xor_si128(xmm1, xmm9);\ + xmm2 = _mm_xor_si128(xmm2, xmm9);\ + xmm3 = _mm_xor_si128(xmm3, xmm9);\ + xmm4 = _mm_xor_si128(xmm4, xmm9);\ + xmm5 = _mm_xor_si128(xmm5, xmm9);\ + xmm6 = _mm_xor_si128(xmm6, xmm9);\ + xmm7 = _mm_xor_si128(xmm7, (ROUND_CONST_Q[round_counter+1]));\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[1]));\ + xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[3]));\ + xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[5]));\ + xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[7]));\ + xmm4 = 
_mm_shuffle_epi8(xmm4, (SUBSH_MASK[0]));\ + xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[2]));\ + xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[4]));\ + xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[6]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + }\ +} + +/* Matrix Transpose + * input is a 1024-bit state with two columns in one xmm + * output is a 1024-bit state with two rows in one xmm + * inputs: i0-i7 + * outputs: i0-i7 + * clobbers: t0-t7 + */ +#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ + t0 = TRANSP_MASK;\ +\ + i6 = _mm_shuffle_epi8(i6, t0);\ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ + t1 = i2;\ + i4 = _mm_shuffle_epi8(i4, t0);\ + i5 = _mm_shuffle_epi8(i5, t0);\ + t2 = i4;\ + t3 = i6;\ + i7 = _mm_shuffle_epi8(i7, t0);\ +\ + /* continue with unpack using 4 temp registers */\ + t0 = i0;\ + t2 = _mm_unpackhi_epi16(t2, i5);\ + i4 = _mm_unpacklo_epi16(i4, i5);\ + t3 = _mm_unpackhi_epi16(t3, i7);\ + i6 = _mm_unpacklo_epi16(i6, i7);\ + t0 = _mm_unpackhi_epi16(t0, i1);\ + t1 = _mm_unpackhi_epi16(t1, i3);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + i0 = _mm_unpacklo_epi16(i0, i1);\ +\ + /* shuffle with immediate */\ + t0 = _mm_shuffle_epi32(t0, 216);\ + t1 = _mm_shuffle_epi32(t1, 216);\ + t2 = _mm_shuffle_epi32(t2, 216);\ + t3 = _mm_shuffle_epi32(t3, 216);\ + i0 = _mm_shuffle_epi32(i0, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + i4 = _mm_shuffle_epi32(i4, 216);\ + i6 = _mm_shuffle_epi32(i6, 216);\ +\ + /* continue with unpack */\ + t4 = i0;\ + i0 = _mm_unpacklo_epi32(i0, i2);\ + t4 = _mm_unpackhi_epi32(t4, i2);\ + t5 = t0;\ + t0 = _mm_unpacklo_epi32(t0, t1);\ + t5 = _mm_unpackhi_epi32(t5, t1);\ + t6 = i4;\ + i4 = _mm_unpacklo_epi32(i4, i6);\ + t7 = t2;\ + t6 = _mm_unpackhi_epi32(t6, i6);\ + i2 = t0;\ + t2 = _mm_unpacklo_epi32(t2, t3);\ + i3 = t0;\ + t7 = _mm_unpackhi_epi32(t7, t3);\ +\ + /* there are now 2 rows in each xmm */\ + /* unpack to get 1 row of CV in each xmm */\ + i1 = i0;\ + i1 = _mm_unpackhi_epi64(i1, i4);\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + i4 = t4;\ + i3 = _mm_unpackhi_epi64(i3, t2);\ + i5 = t4;\ + i2 = _mm_unpacklo_epi64(i2, t2);\ + i6 = t5;\ + i5 = _mm_unpackhi_epi64(i5, t6);\ + i7 = t5;\ + i4 = _mm_unpacklo_epi64(i4, t6);\ + i7 = _mm_unpackhi_epi64(i7, t7);\ + i6 = _mm_unpacklo_epi64(i6, t7);\ + /* transpose done */\ +}/**/ + +/* Matrix Transpose Inverse + * input is a 1024-bit state with two rows in one xmm + * output is a 1024-bit state with two columns in one xmm + * inputs: i0-i7 + * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) + * clobbers: t0-t4 + */ +#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ + /* transpose matrix to get output format */\ + o1 = i0;\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o1 = _mm_unpackhi_epi64(o1, i1);\ + t0 = i2;\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + t0 = _mm_unpackhi_epi64(t0, i3);\ + t1 = i4;\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + t1 = _mm_unpackhi_epi64(t1, i5);\ + t2 = i6;\ + o0 = TRANSP_MASK;\ + i6 = _mm_unpacklo_epi64(i6, i7);\ + t2 = _mm_unpackhi_epi64(t2, i7);\ + /* load transpose mask into a register, because it will be used 8 times */\ + i0 = _mm_shuffle_epi8(i0, o0);\ + i2 = _mm_shuffle_epi8(i2, o0);\ + i4 = _mm_shuffle_epi8(i4, o0);\ + i6 = _mm_shuffle_epi8(i6, o0);\ + o1 = _mm_shuffle_epi8(o1, o0);\ + t0 = _mm_shuffle_epi8(t0, o0);\ + t1 = _mm_shuffle_epi8(t1, 
o0);\ + t2 = _mm_shuffle_epi8(t2, o0);\ + /* continue with unpack using 4 temp registers */\ + t3 = i4;\ + o2 = o1;\ + o0 = i0;\ + t4 = t1;\ + \ + t3 = _mm_unpackhi_epi16(t3, i6);\ + i4 = _mm_unpacklo_epi16(i4, i6);\ + o0 = _mm_unpackhi_epi16(o0, i2);\ + i0 = _mm_unpacklo_epi16(i0, i2);\ + o2 = _mm_unpackhi_epi16(o2, t0);\ + o1 = _mm_unpacklo_epi16(o1, t0);\ + t4 = _mm_unpackhi_epi16(t4, t2);\ + t1 = _mm_unpacklo_epi16(t1, t2);\ + /* shuffle with immediate */\ + i4 = _mm_shuffle_epi32(i4, 216);\ + t3 = _mm_shuffle_epi32(t3, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + o2 = _mm_shuffle_epi32(o2, 216);\ + i0 = _mm_shuffle_epi32(i0, 216);\ + o0 = _mm_shuffle_epi32(o0, 216);\ + t1 = _mm_shuffle_epi32(t1, 216);\ + t4 = _mm_shuffle_epi32(t4, 216);\ + /* continue with unpack */\ + i1 = i0;\ + i3 = o0;\ + i5 = o1;\ + i7 = o2;\ + i0 = _mm_unpacklo_epi32(i0, i4);\ + i1 = _mm_unpackhi_epi32(i1, i4);\ + o0 = _mm_unpacklo_epi32(o0, t3);\ + i3 = _mm_unpackhi_epi32(i3, t3);\ + o1 = _mm_unpacklo_epi32(o1, t1);\ + i5 = _mm_unpackhi_epi32(i5, t1);\ + o2 = _mm_unpacklo_epi32(o2, t4);\ + i7 = _mm_unpackhi_epi32(i7, t4);\ + /* transpose done */\ +}/**/ + + +void INIT(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm8 - xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + /* transform chaining value from column ordering into row ordering */ + Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store transposed IV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; +} + +void TF1024(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i QTEMP[8]; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm8 - xmm15 (Q = message) */ + xmm8 = message[0]; + xmm9 = message[1]; + xmm10 = message[2]; + xmm11 = message[3]; + xmm12 = message[4]; + xmm13 = message[5]; + xmm14 = message[6]; + xmm15 = message[7]; + + /* transform message M from column ordering into row ordering */ + Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store message M (Q input) for later */ + QTEMP[0] = xmm8; + QTEMP[1] = xmm9; + QTEMP[2] = xmm10; + QTEMP[3] = xmm11; + QTEMP[4] = xmm12; + QTEMP[5] = xmm13; + QTEMP[6] = xmm14; + QTEMP[7] = xmm15; + + /* xor CV to message to get P input */ + /* result: CV+M in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, (chaining[7])); + + /* compute permutation P */ + /* result: P(CV+M) in xmm8...xmm15 */ + 
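/* For orientation: the sequence below -- ROUNDS_P() on CV+M, a feed-forward
 * XOR of CV, then ROUNDS_Q() on M and a final XOR into the stored state --
 * computes the Groestl compression f(h, m) = P(h ^ m) ^ Q(m) ^ h on the
 * 1024-bit state. A minimal scalar sketch of that structure only; the
 * permutation callbacks are hypothetical stand-ins for the 14-round P and Q:
 */
#include <stdint.h>

typedef void (*groestl_perm_fn)(uint64_t state[16]);   /* 1024-bit state = 16 u64 words */

static void compress1024_sketch(uint64_t h[16], const uint64_t m[16],
                                groestl_perm_fn P, groestl_perm_fn Q)
{
    uint64_t p[16], q[16];
    for (int i = 0; i < 16; i++) p[i] = h[i] ^ m[i];    /* P input: CV + M */
    for (int i = 0; i < 16; i++) q[i] = m[i];           /* Q input: M      */
    P(p);
    Q(q);
    for (int i = 0; i < 16; i++)
        h[i] ^= p[i] ^ q[i];    /* new CV = P(CV+M) + CV + Q(M) */
}
/* Unlike the 512-bit path, where ROUNDS_P_Q runs both permutations interleaved
 * in the same registers, this 1024-bit path computes P and Q one after the
 * other, with the Q input parked in QTEMP[] in the meantime. */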
ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV+M)+CV in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, (chaining[7])); + + /* store P(CV+M)+CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; + + /* load message M (Q input) into xmm8-15 */ + xmm8 = QTEMP[0]; + xmm9 = QTEMP[1]; + xmm10 = QTEMP[2]; + xmm11 = QTEMP[3]; + xmm12 = QTEMP[4]; + xmm13 = QTEMP[5]; + xmm14 = QTEMP[6]; + xmm15 = QTEMP[7]; + + /* compute permutation Q */ + /* result: Q(M) in xmm8...xmm15 */ + ROUNDS_Q(); + + /* xor Q output */ + /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, (chaining[7])); + + /* store CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; + +#ifdef IACA_TRACE + IACA_END; +#endif + + return; +} + +void OF1024(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + + /* load CV into registers xmm8 - xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + /* compute permutation P */ + /* result: P(CV) in xmm8...xmm15 */ + ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, (chaining[7])); + + /* transpose CV back from row ordering to column ordering */ + /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ + Matrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7); + + /* we only need to return the truncated half of the state */ + chaining[4] = xmm0; + chaining[5] = xmm6; + chaining[6] = xmm13; + chaining[7] = xmm15; + + return; +} + +#endif + diff --git a/algo/aes_ni/groestl-intr-avx.h b/algo/aes_ni/groestl-intr-avx.h new file mode 100644 index 000000000..97f08dd69 --- /dev/null +++ b/algo/aes_ni/groestl-intr-avx.h @@ -0,0 +1,1072 @@ +/* groestl-intr-avx.h Aug 2011 + * + * Groestl implementation with intrinsics using ssse3, sse4.1, aes and avx + * instructions. + * Author: Günther A. 
Roland, Martin Schläffer, Krystian Matusiewicz + * + * This code is placed in the public domain + */ + +#include +#include +#include +#include "hash-groestl.h" + +/* global constants */ +__m128i ROUND_CONST_Lx; +__m128i ROUND_CONST_L0[ROUNDS512]; +__m128i ROUND_CONST_L7[ROUNDS512]; +__m128i ROUND_CONST_P[ROUNDS1024]; +__m128i ROUND_CONST_Q[ROUNDS1024]; +__m128i TRANSP_MASK; +__m128i SUBSH_MASK[8]; +__m128i ALL_FF; +#if LENGTH <= 256 +__m128i ALL_1B; +#else +__m256d ALL_1B; +#endif + +#define tos(a) #a +#define tostr(a) tos(a) + +#define insert_m128i_in_m256d(ymm, xmm, pos) (_mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castpd_si256(ymm), xmm, pos))) +#define extract_m128i_from_m256d(ymm, pos) (_mm256_extractf128_si256(_mm256_castpd_si256(ymm), pos)) + +#if (LENGTH <= 256) + +#define SET_CONSTANTS(){\ + ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\ + SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\ + SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\ + SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\ + SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\ + SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\ + SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\ + SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ + }\ + ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ +}while(0); + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b + * xmm[z] has to be zero */ +#define VMUL2(i, j, k, z){\ + j = _mm_cmpgt_epi8(z, i);\ + i = _mm_add_epi8(i, i);\ + j = _mm_and_si128(j, k);\ + i = _mm_xor_si128(i, j);\ +}/**/ + +/* Yet another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* xmm"tostr(8..xmm"tostr(15 = a2 a3... 
a0 a1 */\ + b0 = a2;\ + b1 = a3;\ + b2 = a4;\ + b3 = a5;\ + b4 = a6;\ + b5 = a7;\ + b6 = a0;\ + b7 = a1;\ + \ + /* t_i = a_i + a_{i+1} */\ + a0 = _mm_xor_si128(a0, a1);\ + a1 = _mm_xor_si128(a1, a2);\ + a2 = _mm_xor_si128(a2, a3);\ + a3 = _mm_xor_si128(a3, a4);\ + a4 = _mm_xor_si128(a4, a5);\ + a5 = _mm_xor_si128(a5, a6);\ + a6 = _mm_xor_si128(a6, a7);\ + a7 = _mm_xor_si128(a7, b6);\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + b0 = _mm_xor_si128(b0, a4);\ + b1 = _mm_xor_si128(b1, a5);\ + b2 = _mm_xor_si128(b2, a6);\ + b3 = _mm_xor_si128(b3, a7);\ + b4 = _mm_xor_si128(b4, a0);\ + b5 = _mm_xor_si128(b5, a1);\ + b6 = _mm_xor_si128(b6, a2);\ + b7 = _mm_xor_si128(b7, a3);\ + \ + b0 = _mm_xor_si128(b0, a6);\ + b1 = _mm_xor_si128(b1, a7);\ + b2 = _mm_xor_si128(b2, a0);\ + b3 = _mm_xor_si128(b3, a1);\ + b4 = _mm_xor_si128(b4, a2);\ + b5 = _mm_xor_si128(b5, a3);\ + b6 = _mm_xor_si128(b6, a4);\ + b7 = _mm_xor_si128(b7, a5);\ + \ + /* spill values y_4, y_5 to memory */\ + TEMP0 = b0;\ + TEMP1 = b1;\ + TEMP2 = b2;\ + \ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b1 = a1;\ + TEMP3 = a2;\ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm_xor_si128(a0, a3);\ + a1 = _mm_xor_si128(a1, a4);\ + a2 = _mm_xor_si128(a2, a5);\ + a3 = _mm_xor_si128(a3, a6);\ + a4 = _mm_xor_si128(a4, a7);\ + a5 = _mm_xor_si128(a5, b0);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, TEMP3);\ + \ + /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + b1 = ALL_1B;\ + b2 = _mm_xor_si128(b2, b2);\ + VMUL2(a7, b0, b1, b2);\ + VMUL2(a6, b0, b1, b2);\ + VMUL2(a5, b0, b1, b2);\ + VMUL2(a4, b0, b1, b2);\ + VMUL2(a3, b0, b1, b2);\ + VMUL2(a2, b0, b1, b2);\ + VMUL2(a1, b0, b1, b2);\ + VMUL2(a0, b0, b1, b2);\ + \ + /* compute w_i : add y_{i+4} */\ + a0 = _mm_xor_si128(a0, TEMP0);\ + a1 = _mm_xor_si128(a1, TEMP1);\ + a2 = _mm_xor_si128(a2, TEMP2);\ + a3 = _mm_xor_si128(a3, b3);\ + a4 = _mm_xor_si128(a4, b4);\ + a5 = _mm_xor_si128(a5, b5);\ + a6 = _mm_xor_si128(a6, b6);\ + a7 = _mm_xor_si128(a7, b7);\ + \ + /*compute v_i: double w_i */\ + VMUL2(a0, b0, b1, b2);\ + VMUL2(a1, b0, b1, b2);\ + VMUL2(a2, b0, b1, b2);\ + VMUL2(a3, b0, b1, b2);\ + VMUL2(a4, b0, b1, b2);\ + VMUL2(a5, b0, b1, b2);\ + VMUL2(a6, b0, b1, b2);\ + VMUL2(a7, b0, b1, b2);\ + \ + /* add to y_4 y_5 .. v3, v4, ... 
*/\ + b0 = _mm_xor_si128(a3, TEMP0);\ + b1 = _mm_xor_si128(a4, TEMP1);\ + b2 = _mm_xor_si128(a5, TEMP2);\ + b3 = _mm_xor_si128(b3, a6);\ + b4 = _mm_xor_si128(b4, a7);\ + b5 = _mm_xor_si128(b5, a0);\ + b6 = _mm_xor_si128(b6, a1);\ + b7 = _mm_xor_si128(b7, a2);\ +}/*MixBytes*/ + +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* Add Round Constant */\ + b1 = ROUND_CONST_Lx;\ + a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ + a1 = _mm_xor_si128(a1, b1);\ + a2 = _mm_xor_si128(a2, b1);\ + a3 = _mm_xor_si128(a3, b1);\ + a4 = _mm_xor_si128(a4, b1);\ + a5 = _mm_xor_si128(a5, b1);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ + \ + /* ShiftBytes + SubBytes (interleaved) */\ + b0 = _mm_xor_si128(b0, b0);\ + a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ + a0 = _mm_aesenclast_si128(a0, b0);\ + a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ + a1 = _mm_aesenclast_si128(a1, b0);\ + a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ + a2 = _mm_aesenclast_si128(a2, b0);\ + a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ + a3 = _mm_aesenclast_si128(a3, b0);\ + a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ + a4 = _mm_aesenclast_si128(a4, b0);\ + a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ + a5 = _mm_aesenclast_si128(a5, b0);\ + a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ + a6 = _mm_aesenclast_si128(a6, b0);\ + a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ + a7 = _mm_aesenclast_si128(a7, b0);\ + \ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ +} + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + t0 = TRANSP_MASK;\ + \ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ + \ + o1 = _mm_unpackhi_epi16(i0, i1);\ + i0 = _mm_unpacklo_epi16(i0, i1);\ + t0 = _mm_unpackhi_epi16(i2, i3);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + \ + i0 = _mm_shuffle_epi32(i0, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + t0 
= _mm_shuffle_epi32(t0, 216);\ + \ + o2 = _mm_unpackhi_epi32(i0, i2);\ + o3 = _mm_unpackhi_epi32(o1, t0);\ + i0 = _mm_unpacklo_epi32(i0, i2);\ + o1 = _mm_unpacklo_epi32(o1, t0);\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + o1 = _mm_unpackhi_epi64(i0, i4);\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + o2 = _mm_unpacklo_epi64(i1, i5);\ + o3 = _mm_unpackhi_epi64(i1, i5);\ + o4 = _mm_unpacklo_epi64(i2, i6);\ + o5 = _mm_unpackhi_epi64(i2, i6);\ + o6 = _mm_unpacklo_epi64(i3, i7);\ + o7 = _mm_unpackhi_epi64(i3, i7);\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + o0 = _mm_unpackhi_epi64(i0, i1);\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o1 = _mm_unpackhi_epi64(i2, i3);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + o2 = _mm_unpackhi_epi64(i4, i5);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + o3 = _mm_unpackhi_epi64(i6, i7);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + t0 = _mm_xor_si128(t0, t0);\ + i1 = _mm_unpackhi_epi64(i0, t0);\ + i0 = _mm_unpacklo_epi64(i0, t0);\ + i3 = _mm_unpackhi_epi64(i2, t0);\ + i2 = _mm_unpacklo_epi64(i2, t0);\ + i5 = _mm_unpackhi_epi64(i4, t0);\ + i4 = _mm_unpacklo_epi64(i4, t0);\ + i7 = _mm_unpackhi_epi64(i6, t0);\ + i6 = _mm_unpacklo_epi64(i6, t0);\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ +}/**/ + + +void INIT(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7; + static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm12 - xmm15 */ + xmm12 = chaining[0]; + xmm13 = chaining[1]; + xmm14 = chaining[2]; + xmm15 = chaining[3]; + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* store transposed IV */ + chaining[0] = xmm12; + chaining[1] = xmm2; + chaining[2] = xmm6; + chaining[3] = xmm7; +} + +void TF512(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, 
xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + static __m128i TEMP3; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm12 - xmm15 */ + xmm12 = message[0]; + xmm13 = message[1]; + xmm14 = message[2]; + xmm15 = message[3]; + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* load previous chaining value and xor message to CV to get input of P */ + /* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + xmm8 = _mm_xor_si128(xmm12, chaining[0]); + xmm0 = _mm_xor_si128(xmm2, chaining[1]); + xmm4 = _mm_xor_si128(xmm6, chaining[2]); + xmm5 = _mm_xor_si128(xmm7, chaining[3]); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, xmm8); + xmm1 = _mm_xor_si128(xmm1, xmm10); + xmm2 = _mm_xor_si128(xmm2, xmm12); + xmm3 = _mm_xor_si128(xmm3, xmm14); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, chaining[0]); + xmm1 = _mm_xor_si128(xmm1, chaining[1]); + xmm2 = _mm_xor_si128(xmm2, chaining[2]); + xmm3 = _mm_xor_si128(xmm3, chaining[3]); + + /* store CV */ + chaining[0] = xmm0; + chaining[1] = xmm1; + chaining[2] = xmm2; + chaining[3] = xmm3; + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF512(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + static __m128i TEMP3; + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + xmm8 = chaining[0]; + xmm10 = chaining[1]; + xmm12 = chaining[2]; + xmm14 = chaining[3]; + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm10 = _mm_xor_si128(xmm10, (chaining[1])); + xmm12 = _mm_xor_si128(xmm12, (chaining[2])); + xmm14 = _mm_xor_si128(xmm14, (chaining[3])); + + /* transform state back from row ordering into column ordering */ + /* result: final 
hash value in xmm9, xmm11 */ + Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); + + /* we only need to return the truncated half of the state */ + chaining[2] = xmm9; + chaining[3] = xmm11; +} + +#endif + +#if (LENGTH > 256) + +#define SET_CONSTANTS(){\ + __m128i xmm0, xmm1;\ + __m256d ymm0;\ + xmm0 = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + xmm1 = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + ymm0 = insert_m128i_in_m256d(ymm0, xmm0, 0);\ + ymm0 = insert_m128i_in_m256d(ymm0, xmm1, 1);\ + ALL_1B = ymm0;\ + ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + SUBSH_MASK[0] = _mm_set_epi32(0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00);\ + SUBSH_MASK[1] = _mm_set_epi32(0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01);\ + SUBSH_MASK[2] = _mm_set_epi32(0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02);\ + SUBSH_MASK[3] = _mm_set_epi32(0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003);\ + SUBSH_MASK[4] = _mm_set_epi32(0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104);\ + SUBSH_MASK[5] = _mm_set_epi32(0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205);\ + SUBSH_MASK[6] = _mm_set_epi32(0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306);\ + SUBSH_MASK[7] = _mm_set_epi32(0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b);\ + for(i = 0; i < ROUNDS1024; i++)\ + {\ + ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\ + }\ +}while(0); + +/* AVX MUL2 + * input: i + * output i = 2 * i + * */ +#define VMUL2(i){\ + xmmZERO = _mm_xor_si128(xmmZERO, xmmZERO);\ + xmmIL = extract_m128i_from_m256d(i, 0);\ + xmmIH = extract_m128i_from_m256d(i, 1);\ + xmmJL = _mm_cmpgt_epi8(xmmZERO, xmmIL);\ + xmmJH = _mm_cmpgt_epi8(xmmZERO, xmmIH);\ + xmmIL = _mm_add_epi8(xmmIL, xmmIL);\ + xmmIH = _mm_add_epi8(xmmIH, xmmIH);\ + ymmJ = insert_m128i_in_m256d(ymmJ, xmmJL, 0);\ + ymmJ = insert_m128i_in_m256d(ymmJ, xmmJH, 1);\ + ymmJ = _mm256_and_pd(ymmJ, ALL_1B);\ + i = insert_m128i_in_m256d(i, xmmIL, 0);\ + i = insert_m128i_in_m256d(i, xmmIH, 1);\ + i = _mm256_xor_pd(i, ymmJ);\ +}/**/ + +/* AVX SubShift + * inputs: + * * i + * * c0 (must be 0) + * * ShiftP + * * ShiftQ + * output i = S(Shift(i_1, ShiftQ)|Shift(i_0, ShiftP)) + * clobbers: t0 + * */ +#define SubShift(i, ShiftP, ShiftQ){\ + xmmZERO = _mm_xor_si128(xmmZERO, xmmZERO);\ + xmmIL = extract_m128i_from_m256d(i, 0);\ + xmmIH = extract_m128i_from_m256d(i, 1);\ + xmmIL = _mm_shuffle_epi8(xmmIL, SUBSH_MASK[ShiftP]);\ + xmmIH = _mm_shuffle_epi8(xmmIH, SUBSH_MASK[ShiftQ]);\ + xmmIL = _mm_aesenclast_si128(xmmIL, xmmZERO);\ + xmmIH = _mm_aesenclast_si128(xmmIH, xmmZERO);\ + i = insert_m128i_in_m256d(i, xmmIL, 0);\ + i = insert_m128i_in_m256d(i, xmmIH, 1);\ +}/**/ + +/* Yet another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). 
+ but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\ + b0 = a2;\ + b1 = a3;\ + b2 = a4;\ + b3 = a5;\ + b4 = a6;\ + b5 = a7;\ + b6 = a0;\ + b7 = a1;\ + \ + /* t_i = a_i + a_{i+1} */\ + a0 = _mm256_xor_pd(a0, a1);\ + a1 = _mm256_xor_pd(a1, a2);\ + a2 = _mm256_xor_pd(a2, a3);\ + a3 = _mm256_xor_pd(a3, a4);\ + a4 = _mm256_xor_pd(a4, a5);\ + a5 = _mm256_xor_pd(a5, a6);\ + a6 = _mm256_xor_pd(a6, a7);\ + a7 = _mm256_xor_pd(a7, b6);\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + b0 = _mm256_xor_pd(b0, a4);\ + b1 = _mm256_xor_pd(b1, a5);\ + b2 = _mm256_xor_pd(b2, a6);\ + b3 = _mm256_xor_pd(b3, a7);\ + b4 = _mm256_xor_pd(b4, a0);\ + b5 = _mm256_xor_pd(b5, a1);\ + b6 = _mm256_xor_pd(b6, a2);\ + b7 = _mm256_xor_pd(b7, a3);\ + \ + b0 = _mm256_xor_pd(b0, a6);\ + b1 = _mm256_xor_pd(b1, a7);\ + b2 = _mm256_xor_pd(b2, a0);\ + b3 = _mm256_xor_pd(b3, a1);\ + b4 = _mm256_xor_pd(b4, a2);\ + b5 = _mm256_xor_pd(b5, a3);\ + b6 = _mm256_xor_pd(b6, a4);\ + b7 = _mm256_xor_pd(b7, a5);\ + \ + /* spill values y_4, y_5 to memory */\ + TEMP0 = b0;\ + TEMP1 = b1;\ + TEMP2 = b2;\ + \ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b1 = a1;\ + TEMP3 = a2;\ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm256_xor_pd(a0, a3);\ + a1 = _mm256_xor_pd(a1, a4);\ + a2 = _mm256_xor_pd(a2, a5);\ + a3 = _mm256_xor_pd(a3, a6);\ + a4 = _mm256_xor_pd(a4, a7);\ + a5 = _mm256_xor_pd(a5, b0);\ + a6 = _mm256_xor_pd(a6, b1);\ + a7 = _mm256_xor_pd(a7, TEMP3);\ + \ + /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + b1 = ALL_1B;\ + b2 = _mm256_xor_pd(b2, b2);\ + VMUL2(a7);\ + VMUL2(a6);\ + VMUL2(a5);\ + VMUL2(a4);\ + VMUL2(a3);\ + VMUL2(a2);\ + VMUL2(a1);\ + VMUL2(a0);\ + \ + /* compute w_i : add y_{i+4} */\ + a0 = _mm256_xor_pd(a0, TEMP0);\ + a1 = _mm256_xor_pd(a1, TEMP1);\ + a2 = _mm256_xor_pd(a2, TEMP2);\ + a3 = _mm256_xor_pd(a3, b3);\ + a4 = _mm256_xor_pd(a4, b4);\ + a5 = _mm256_xor_pd(a5, b5);\ + a6 = _mm256_xor_pd(a6, b6);\ + a7 = _mm256_xor_pd(a7, b7);\ + \ + /*compute v_i: double w_i */\ + VMUL2(a0);\ + VMUL2(a1);\ + VMUL2(a2);\ + VMUL2(a3);\ + VMUL2(a4);\ + VMUL2(a5);\ + VMUL2(a6);\ + VMUL2(a7);\ + \ + /* add to y_4 y_5 .. v3, v4, ... 
*/\ + b0 = _mm256_xor_pd(a3, TEMP0);\ + b1 = _mm256_xor_pd(a4, TEMP1);\ + b2 = _mm256_xor_pd(a5, TEMP2);\ + b3 = _mm256_xor_pd(b3, a6);\ + b4 = _mm256_xor_pd(b4, a7);\ + b5 = _mm256_xor_pd(b5, a0);\ + b6 = _mm256_xor_pd(b6, a1);\ + b7 = _mm256_xor_pd(b7, a2);\ +}/*MixBytes*/ + +/* one round + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define SUBSHIFTMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* ShiftBytes + SubBytes */\ + SubShift(a0, 0, 1);\ + SubShift(a1, 1, 3);\ + SubShift(a2, 2, 5);\ + SubShift(a3, 3, 7);\ + SubShift(a4, 4, 0);\ + SubShift(a5, 5, 2);\ + SubShift(a6, 6, 4);\ + SubShift(a7, 7, 6);\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +#define ROUNDS_P_Q(){\ + u8 round_counter = 0;\ + for(round_counter = 0; round_counter < 14; round_counter++) {\ + /* AddRoundConstant */\ + ymm6 = _mm256_xor_pd(ymm6, ymm6);\ + ymm7 = insert_m128i_in_m256d(ymm6, ROUND_CONST_Q[round_counter], 1);\ + ymm6 = insert_m128i_in_m256d(ymm6, ALL_FF, 1);\ + ymm0 = insert_m128i_in_m256d(ymm6, ROUND_CONST_P[round_counter], 0);\ + ymm0 = _mm256_xor_pd(ymm8, ymm0);\ + ymm1 = _mm256_xor_pd(ymm9, ymm6);\ + ymm2 = _mm256_xor_pd(ymm10, ymm6);\ + ymm3 = _mm256_xor_pd(ymm11, ymm6);\ + ymm4 = _mm256_xor_pd(ymm12, ymm6);\ + ymm5 = _mm256_xor_pd(ymm13, ymm6);\ + ymm6 = _mm256_xor_pd(ymm14, ymm6);\ + ymm7 = _mm256_xor_pd(ymm15, ymm7);\ + /* SubBytes + ShiftBytes + MixBytes */\ + SUBSHIFTMIX(ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15);\ + }\ +} + +/* Matrix Transpose + * input is a 1024-bit state with two columns in one xmm + * output is a 1024-bit state with two rows in one xmm + * inputs: i0-i7 + * outputs: i0-i7 + * clobbers: t0-t7 + */ +#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ + t0 = TRANSP_MASK;\ + \ + i6 = _mm_shuffle_epi8(i6, t0);\ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ + i4 = _mm_shuffle_epi8(i4, t0);\ + i5 = _mm_shuffle_epi8(i5, t0);\ + i7 = _mm_shuffle_epi8(i7, t0);\ + \ + /* continue with unpack */\ + t0 = _mm_unpackhi_epi16(i0, i1);\ + t1 = _mm_unpackhi_epi16(i2, i3);\ + t2 = _mm_unpackhi_epi16(i4, i5);\ + t3 = _mm_unpackhi_epi16(i6, i7);\ + i0 = _mm_unpacklo_epi16(i0, i1);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + i4 = _mm_unpacklo_epi16(i4, i5);\ + i6 = _mm_unpacklo_epi16(i6, i7);\ + \ + /* shuffle with immediate */\ + t0 = _mm_shuffle_epi32(t0, 216);\ + t1 = _mm_shuffle_epi32(t1, 216);\ + t2 = _mm_shuffle_epi32(t2, 216);\ + t3 = _mm_shuffle_epi32(t3, 216);\ + i0 = _mm_shuffle_epi32(i0, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + i4 = _mm_shuffle_epi32(i4, 216);\ + i6 = _mm_shuffle_epi32(i6, 216);\ + \ + /* continue with unpack */\ + t4 = _mm_unpackhi_epi32(i0, i2);\ + i0 = _mm_unpacklo_epi32(i0, i2);\ + t5 = _mm_unpackhi_epi32(t0, t1);\ + t0 = _mm_unpacklo_epi32(t0, t1);\ + t6 = _mm_unpackhi_epi32(i4, i6);\ + i4 = _mm_unpacklo_epi32(i4, i6);\ + t7 = _mm_unpackhi_epi32(t2, t3);\ + t2 = _mm_unpacklo_epi32(t2, t3);\ + \ + /* there are now 2 rows in each xmm */\ + /* unpack to get 1 row of CV in each xmm */\ + i1 = _mm_unpackhi_epi64(i0, i4);\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + i2 = _mm_unpacklo_epi64(t0, t2);\ + i3 = _mm_unpackhi_epi64(t0, t2);\ + i4 = _mm_unpacklo_epi64(t4, t6);\ + i5 = _mm_unpackhi_epi64(t4, t6);\ + i6 = _mm_unpacklo_epi64(t5, t7);\ + i7 = _mm_unpackhi_epi64(t5, t7);\ + /* transpose done */\ +}/**/ + +/* Matrix 
Transpose Inverse + * input is a 1024-bit state with two rows in one xmm + * output is a 1024-bit state with two columns in one xmm + * inputs: i0-i7 + * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) + * clobbers: t0-t4 + */ +#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ + o0 = TRANSP_MASK;\ + /* transpose matrix to get output format */\ + o1 = _mm_unpackhi_epi64(i0, i1);\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + t0 = _mm_unpackhi_epi64(i2, i3);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + t1 = _mm_unpackhi_epi64(i4, i5);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + t2 = _mm_unpackhi_epi64(i6, i7);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ + /* load transpose mask into a register, because it will be used 8 times */\ + i0 = _mm_shuffle_epi8(i0, o0);\ + i2 = _mm_shuffle_epi8(i2, o0);\ + i4 = _mm_shuffle_epi8(i4, o0);\ + i6 = _mm_shuffle_epi8(i6, o0);\ + o1 = _mm_shuffle_epi8(o1, o0);\ + t0 = _mm_shuffle_epi8(t0, o0);\ + t1 = _mm_shuffle_epi8(t1, o0);\ + t2 = _mm_shuffle_epi8(t2, o0);\ + /* continue with unpack */\ + t3 = _mm_unpackhi_epi16(i4, i6);\ + i4 = _mm_unpacklo_epi16(i4, i6);\ + o0 = _mm_unpackhi_epi16(i0, i2);\ + i0 = _mm_unpacklo_epi16(i0, i2);\ + o2 = _mm_unpackhi_epi16(o1, t0);\ + o1 = _mm_unpacklo_epi16(o1, t0);\ + t4 = _mm_unpackhi_epi16(t1, t2);\ + t1 = _mm_unpacklo_epi16(t1, t2);\ + /* shuffle with immediate */\ + i4 = _mm_shuffle_epi32(i4, 216);\ + t3 = _mm_shuffle_epi32(t3, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + o2 = _mm_shuffle_epi32(o2, 216);\ + i0 = _mm_shuffle_epi32(i0, 216);\ + o0 = _mm_shuffle_epi32(o0, 216);\ + t1 = _mm_shuffle_epi32(t1, 216);\ + t4 = _mm_shuffle_epi32(t4, 216);\ + /* continue with unpack */\ + i1 = _mm_unpackhi_epi32(i0, i4);\ + i0 = _mm_unpacklo_epi32(i0, i4);\ + i3 = _mm_unpackhi_epi32(o0, t3);\ + o0 = _mm_unpacklo_epi32(o0, t3);\ + i5 = _mm_unpackhi_epi32(o1, t1);\ + o1 = _mm_unpacklo_epi32(o1, t1);\ + i7 = _mm_unpackhi_epi32(o2, t4);\ + o2 = _mm_unpacklo_epi32(o2, t4);\ + /* transpose done */\ +}/**/ + +void INIT(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm8 - xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + /* transform chaining value from column ordering into row ordering */ + Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store transposed IV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; +} + +void TF1024(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i xmmIL, xmmIH, xmmJL, xmmJH, xmmZERO; + static __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7; + static __m256d ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15; + static __m256d ymmJ; + static __m256d TEMP0; + static __m256d TEMP1; + static __m256d TEMP2; + static __m256d TEMP3; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm8 - xmm15 (Q = message) */ + xmm0 = message[0]; + xmm1 = 
message[1]; + xmm2 = message[2]; + xmm3 = message[3]; + xmm4 = message[4]; + xmm5 = message[5]; + xmm6 = message[6]; + xmm7 = message[7]; + + /* transform message M from column ordering into row ordering */ + Matrix_Transpose(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* load previous chaining value and xor message to CV to get input of P */ + /* we put two rows (2x64 bit) of the CV into one 128-bit xmm register */ + /* result: CV+M in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm0, chaining[0]); + xmm9 = _mm_xor_si128(xmm1, chaining[1]); + xmm10 = _mm_xor_si128(xmm2, chaining[2]); + xmm11 = _mm_xor_si128(xmm3, chaining[3]); + xmm12 = _mm_xor_si128(xmm4, chaining[4]); + xmm13 = _mm_xor_si128(xmm5, chaining[5]); + xmm14 = _mm_xor_si128(xmm6, chaining[6]); + xmm15 = _mm_xor_si128(xmm7, chaining[7]); + + /* generate AVX registers with Q in high and P in low 128 bits */ + ymm8 = insert_m128i_in_m256d(ymm8, xmm8, 0); + ymm9 = insert_m128i_in_m256d(ymm9, xmm9, 0); + ymm10 = insert_m128i_in_m256d(ymm10, xmm10, 0); + ymm11 = insert_m128i_in_m256d(ymm11, xmm11, 0); + ymm12 = insert_m128i_in_m256d(ymm12, xmm12, 0); + ymm13 = insert_m128i_in_m256d(ymm13, xmm13, 0); + ymm14 = insert_m128i_in_m256d(ymm14, xmm14, 0); + ymm15 = insert_m128i_in_m256d(ymm15, xmm15, 0); + + ymm8 = insert_m128i_in_m256d(ymm8, xmm0, 1); + ymm9 = insert_m128i_in_m256d(ymm9, xmm1, 1); + ymm10 = insert_m128i_in_m256d(ymm10, xmm2, 1); + ymm11 = insert_m128i_in_m256d(ymm11, xmm3, 1); + ymm12 = insert_m128i_in_m256d(ymm12, xmm4, 1); + ymm13 = insert_m128i_in_m256d(ymm13, xmm5, 1); + ymm14 = insert_m128i_in_m256d(ymm14, xmm6, 1); + ymm15 = insert_m128i_in_m256d(ymm15, xmm7, 1); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* extract Q to xmm */ + xmm0 = extract_m128i_from_m256d(ymm8, 1); + xmm1 = extract_m128i_from_m256d(ymm9, 1); + xmm2 = extract_m128i_from_m256d(ymm10, 1); + xmm3 = extract_m128i_from_m256d(ymm11, 1); + xmm4 = extract_m128i_from_m256d(ymm12, 1); + xmm5 = extract_m128i_from_m256d(ymm13, 1); + xmm6 = extract_m128i_from_m256d(ymm14, 1); + xmm7 = extract_m128i_from_m256d(ymm15, 1); + + /* extract P to xmm */ + xmm8 = extract_m128i_from_m256d(ymm8, 0); + xmm9 = extract_m128i_from_m256d(ymm9, 0); + xmm10 = extract_m128i_from_m256d(ymm10, 0); + xmm11 = extract_m128i_from_m256d(ymm11, 0); + xmm12 = extract_m128i_from_m256d(ymm12, 0); + xmm13 = extract_m128i_from_m256d(ymm13, 0); + xmm14 = extract_m128i_from_m256d(ymm14, 0); + xmm15 = extract_m128i_from_m256d(ymm15, 0); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, xmm0); + xmm9 = _mm_xor_si128(xmm9, xmm1); + xmm10 = _mm_xor_si128(xmm10, xmm2); + xmm11 = _mm_xor_si128(xmm11, xmm3); + xmm12 = _mm_xor_si128(xmm12, xmm4); + xmm13 = _mm_xor_si128(xmm13, xmm5); + xmm14 = _mm_xor_si128(xmm14, xmm6); + xmm15 = _mm_xor_si128(xmm15, xmm7); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, chaining[0]); + xmm9 = _mm_xor_si128(xmm9, chaining[1]); + xmm10 = _mm_xor_si128(xmm10, chaining[2]); + xmm11 = _mm_xor_si128(xmm11, chaining[3]); + xmm12 = _mm_xor_si128(xmm12, chaining[4]); + xmm13 = _mm_xor_si128(xmm13, chaining[5]); + xmm14 = _mm_xor_si128(xmm14, chaining[6]); + xmm15 = _mm_xor_si128(xmm15, chaining[7]); + + /* store CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + 
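/* The P/Q pairing used in this AVX path relies on the insert_m128i_in_m256d /
 * extract_m128i_from_m256d helpers defined near the top of this file: the
 * integer state is carried in __m256d registers because plain AVX (without
 * AVX2) only provides 256-bit bitwise operations on floating-point types
 * (_mm256_xor_pd, _mm256_and_pd), with P kept in the low 128-bit lane and Q
 * in the high lane. A small standalone sketch of that lane-packing idiom,
 * assuming only an AVX-capable target; the names pack_pq / unpack_pq / xor_pq
 * are illustrative and not part of this patch:
 */
#include <immintrin.h>

static inline __m256d pack_pq(__m128i p, __m128i q)
{
    __m256i t = _mm256_castsi128_si256(p);        /* P -> bits 0..127    */
    t = _mm256_insertf128_si256(t, q, 1);         /* Q -> bits 128..255  */
    return _mm256_castsi256_pd(t);                /* reinterpret, no move */
}

static inline void unpack_pq(__m256d y, __m128i *p, __m128i *q)
{
    __m256i t = _mm256_castpd_si256(y);
    *p = _mm256_extractf128_si256(t, 0);
    *q = _mm256_extractf128_si256(t, 1);
}

static inline __m256d xor_pq(__m256d a, __m256d b)
{
    return _mm256_xor_pd(a, b);                   /* one op XORs both halves */
}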
chaining[6] = xmm14; + chaining[7] = xmm15; + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF1024(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i xmmIL, xmmIH, xmmJL, xmmJH, xmmZERO; + static __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7; + static __m256d ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15; + static __m256d ymmJ; + static __m256d TEMP0; + static __m256d TEMP1; + static __m256d TEMP2; + static __m256d TEMP3; + + /* load CV into registers xmm8...xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + xmm0 = _mm_xor_si128(xmm0, xmm0); + + /* generate AVX registers with Q in high and P in low 128 bits */ + ymm8 = insert_m128i_in_m256d(ymm8, xmm8, 0); + ymm9 = insert_m128i_in_m256d(ymm9, xmm9, 0); + ymm10 = insert_m128i_in_m256d(ymm10, xmm10, 0); + ymm11 = insert_m128i_in_m256d(ymm11, xmm11, 0); + ymm12 = insert_m128i_in_m256d(ymm12, xmm12, 0); + ymm13 = insert_m128i_in_m256d(ymm13, xmm13, 0); + ymm14 = insert_m128i_in_m256d(ymm14, xmm14, 0); + ymm15 = insert_m128i_in_m256d(ymm15, xmm15, 0); + + ymm8 = insert_m128i_in_m256d(ymm8, xmm0, 1); + ymm9 = insert_m128i_in_m256d(ymm9, xmm0, 1); + ymm10 = insert_m128i_in_m256d(ymm10, xmm0, 1); + ymm11 = insert_m128i_in_m256d(ymm11, xmm0, 1); + ymm12 = insert_m128i_in_m256d(ymm12, xmm0, 1); + ymm13 = insert_m128i_in_m256d(ymm13, xmm0, 1); + ymm14 = insert_m128i_in_m256d(ymm14, xmm0, 1); + ymm15 = insert_m128i_in_m256d(ymm15, xmm0, 1); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8...xmm15 */ + ROUNDS_P_Q(); + + xmm8 = extract_m128i_from_m256d(ymm8, 0); + xmm9 = extract_m128i_from_m256d(ymm9, 0); + xmm10 = extract_m128i_from_m256d(ymm10, 0); + xmm11 = extract_m128i_from_m256d(ymm11, 0); + xmm12 = extract_m128i_from_m256d(ymm12, 0); + xmm13 = extract_m128i_from_m256d(ymm13, 0); + xmm14 = extract_m128i_from_m256d(ymm14, 0); + xmm15 = extract_m128i_from_m256d(ymm15, 0); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, chaining[0]); + xmm9 = _mm_xor_si128(xmm9, chaining[1]); + xmm10 = _mm_xor_si128(xmm10, chaining[2]); + xmm11 = _mm_xor_si128(xmm11, chaining[3]); + xmm12 = _mm_xor_si128(xmm12, chaining[4]); + xmm13 = _mm_xor_si128(xmm13, chaining[5]); + xmm14 = _mm_xor_si128(xmm14, chaining[6]); + xmm15 = _mm_xor_si128(xmm15, chaining[7]); + + /* transpose CV back from row ordering to column ordering */ + /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ + Matrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7); + + /* we only need to return the truncated half of the state */ + chaining[0] = xmm8; + chaining[1] = xmm4; + chaining[2] = xmm9; + chaining[3] = xmm11; + chaining[4] = xmm0; + chaining[5] = xmm6; + chaining[6] = xmm13; + chaining[7] = xmm15; + + return; +}//OF1024() + +#endif + diff --git a/algo/aes_ni/groestl-intr-vperm.h b/algo/aes_ni/groestl-intr-vperm.h new file mode 100644 index 000000000..c75522961 --- /dev/null +++ b/algo/aes_ni/groestl-intr-vperm.h @@ -0,0 +1,1294 @@ +/* groestl-intr-vperm.h Aug 2011 + * + * Groestl implementation with intrinsics using ssse3 instructions. + * Author: Günther A. 
Roland, Martin Schläffer + * + * Based on the vperm and aes_ni implementations of the hash function Groestl + * by Cagdas Calik http://www.metu.edu.tr/~ccalik/ + * Institute of Applied Mathematics, Middle East Technical University, Turkey + * + * This code is placed in the public domain + */ + +#include +#include "hash-groestl.h" + +/* global constants */ +__m128i ROUND_CONST_Lx; +__m128i ROUND_CONST_L0[ROUNDS512]; +__m128i ROUND_CONST_L7[ROUNDS512]; +__m128i ROUND_CONST_P[ROUNDS1024]; +__m128i ROUND_CONST_Q[ROUNDS1024]; +__m128i TRANSP_MASK; +__m128i SUBSH_MASK[8]; +__m128i ALL_0F; +__m128i ALL_15; +__m128i ALL_1B; +__m128i ALL_63; +__m128i ALL_FF; +__m128i VPERM_IPT[2]; +__m128i VPERM_OPT[2]; +__m128i VPERM_INV[2]; +__m128i VPERM_SB1[2]; +__m128i VPERM_SB2[2]; +__m128i VPERM_SB4[2]; +__m128i VPERM_SBO[2]; + + +#define tos(a) #a +#define tostr(a) tos(a) + +#define SET_SHARED_CONSTANTS(){\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + ALL_63 = _mm_set_epi32(0x63636363, 0x63636363, 0x63636363, 0x63636363);\ + ALL_0F = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);\ + ALL_15 = _mm_set_epi32(0x15151515, 0x15151515, 0x15151515, 0x15151515);\ + VPERM_IPT[0] = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);\ + VPERM_IPT[1] = _mm_set_epi32(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);\ + VPERM_OPT[0] = _mm_set_epi32(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);\ + VPERM_OPT[1] = _mm_set_epi32(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);\ + VPERM_INV[0] = _mm_set_epi32(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);\ + VPERM_INV[1] = _mm_set_epi32(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);\ + VPERM_SB1[0] = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);\ + VPERM_SB1[1] = _mm_set_epi32(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);\ + VPERM_SB2[0] = _mm_set_epi32(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);\ + VPERM_SB2[1] = _mm_set_epi32(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);\ + VPERM_SB4[0] = _mm_set_epi32(0xBA44FE79, 0x876D2914, 0x3D50AED7, 0xC393EA00);\ + VPERM_SB4[1] = _mm_set_epi32(0xA876DE97, 0x49087E9F, 0xE1E937A0, 0x3FD64100);\ +}/**/ + +/* VPERM + * Transform w/o settings c* + * transforms 2 rows to/from "vperm mode" + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0, a1 = 2 rows + * table = transformation table to use + * t*, c* = clobbers + * outputs: + * a0, a1 = 2 rows transformed with table + * */ +#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\ + t0 = c0;\ + t1 = c0;\ + t0 = _mm_andnot_si128(t0, a0);\ + t1 = _mm_andnot_si128(t1, a1);\ + t0 = _mm_srli_epi32(t0, 4);\ + t1 = _mm_srli_epi32(t1, 4);\ + a0 = _mm_and_si128(a0, c0);\ + a1 = _mm_and_si128(a1, c0);\ + t2 = c2;\ + t3 = c2;\ + t2 = _mm_shuffle_epi8(t2, a0);\ + t3 = _mm_shuffle_epi8(t3, a1);\ + a0 = c1;\ + a1 = c1;\ + a0 = _mm_shuffle_epi8(a0, t0);\ + a1 = _mm_shuffle_epi8(a1, t1);\ + a0 = _mm_xor_si128(a0, t2);\ + a1 = _mm_xor_si128(a1, t3);\ +}/**/ + +#define VPERM_Transform_Set_Const(table, c0, c1, c2){\ + c0 = ALL_0F;\ + c1 = ((__m128i*) table )[0];\ + c2 = ((__m128i*) table )[1];\ +}/**/ + +/* VPERM + * Transform + * transforms 2 rows to/from "vperm mode" + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0, a1 = 2 rows + * table = transformation 
table to use + * t*, c* = clobbers + * outputs: + * a0, a1 = 2 rows transformed with table + * */ +#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\ + VPERM_Transform_Set_Const(table, c0, c1, c2);\ + VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ +}/**/ + +/* VPERM + * Transform State + * inputs: + * a0-a3 = state + * table = transformation table to use + * t* = clobbers + * outputs: + * a0-a3 = transformed state + * */ +#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\ + VPERM_Transform_Set_Const(table, c0, c1, c2);\ + VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ + VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\ +}/**/ + +/* VPERM + * Add Constant to State + * inputs: + * a0-a7 = state + * constant = constant to add + * t0 = clobber + * outputs: + * a0-a7 = state + constant + * */ +#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\ + t0 = constant;\ + a0 = _mm_xor_si128(a0, t0);\ + a1 = _mm_xor_si128(a1, t0);\ + a2 = _mm_xor_si128(a2, t0);\ + a3 = _mm_xor_si128(a3, t0);\ + a4 = _mm_xor_si128(a4, t0);\ + a5 = _mm_xor_si128(a5, t0);\ + a6 = _mm_xor_si128(a6, t0);\ + a7 = _mm_xor_si128(a7, t0);\ +}/**/ + +/* VPERM + * Set Substitute Core Constants + * */ +#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\ + VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\ +}/**/ + +/* VPERM + * Substitute Core + * first part of sbox inverse computation + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0 = 1 row + * t*, c* = clobbers + * outputs: + * b0a, b0b = inputs for lookup step + * */ +#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\ + t0 = c0;\ + t0 = _mm_andnot_si128(t0, a0);\ + t0 = _mm_srli_epi32(t0, 4);\ + a0 = _mm_and_si128(a0, c0);\ + b0a = c1;\ + b0a = _mm_shuffle_epi8(b0a, a0);\ + a0 = _mm_xor_si128(a0, t0);\ + b0b = c2;\ + b0b = _mm_shuffle_epi8(b0b, t0);\ + b0b = _mm_xor_si128(b0b, b0a);\ + t1 = c2;\ + t1 = _mm_shuffle_epi8(t1, a0);\ + t1 = _mm_xor_si128(t1, b0a);\ + b0a = c2;\ + b0a = _mm_shuffle_epi8(b0a, b0b);\ + b0a = _mm_xor_si128(b0a, a0);\ + b0b = c2;\ + b0b = _mm_shuffle_epi8(b0b, t1);\ + b0b = _mm_xor_si128(b0b, t0);\ +}/**/ + +/* VPERM + * Lookup + * second part of sbox inverse computation + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0a, a0b = output of Substitution Core + * table = lookup table to use (*1 / *2 / *4) + * t0 = clobber + * outputs: + * b0 = output of sbox + multiplication + * */ +#define VPERM_Lookup(a0a, a0b, table, b0, t0){\ + b0 = ((__m128i*) table )[0];\ + t0 = ((__m128i*) table )[1];\ + b0 = _mm_shuffle_epi8(b0, a0b);\ + t0 = _mm_shuffle_epi8(t0, a0a);\ + b0 = _mm_xor_si128(b0, t0);\ +}/**/ + +/* VPERM + * SubBytes and *2 / *4 + * this function is derived from: + * Constant-time SSSE3 AES core implementation + * by Mike Hamburg + * and + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0-a7 = state + * t*, c* = clobbers + * outputs: + * a0-a7 = state * 4 + * c2 = row0 * 2 -> b0 + * c1 = row7 * 2 -> b3 + * c0 = row7 * 1 -> b4 + * t2 = row4 * 1 -> b7 + * TEMP_MUL1 = row(i) * 1 + * TEMP_MUL2 = row(i) * 2 + * + * call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */ +#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\ + /* set 
Constants */\ + VPERM_Substitute_Core_Set_Const(c0, c1, c2);\ + /* row 1 */\ + VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[1] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[1] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\ + /* --- */\ + /* row 2 */\ + VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[2] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[2] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\ + /* --- */\ + /* row 3 */\ + VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[3] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[3] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\ + /* --- */\ + /* row 5 */\ + VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[5] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[5] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\ + /* --- */\ + /* row 6 */\ + VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[6] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[6] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\ + /* --- */\ + /* row 7 */\ + VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[7] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\ + VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\ + /* --- */\ + /* row 4 */\ + VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[4] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\ + /* --- */\ + /* row 0 */\ + VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\ + VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\ + TEMP_MUL2[0] = c2;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\ + /* --- */\ +}/**/ + + +/* Optimized MixBytes + * inputs: + * a0-a7 = (row0-row7) * 4 + * b0 = row0 * 2 + * b3 = row7 * 2 + * b4 = row7 * 1 + * b7 = row4 * 1 + * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2 + * output: b0-b7 + * */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* save one value */\ + TEMP_MUL4 = a3;\ + /* 1 */\ + b1 = a0;\ + b1 = _mm_xor_si128(b1, a5);\ + b1 = _mm_xor_si128(b1, b4); /* -> helper! */\ + b1 = _mm_xor_si128(b1, (TEMP_MUL2[3]));\ + b2 = b1;\ + \ + /* 2 */\ + b5 = a1;\ + b5 = _mm_xor_si128(b5, a4);\ + b5 = _mm_xor_si128(b5, b7); /* -> helper! */\ + b5 = _mm_xor_si128(b5, b3); /* -> helper! */\ + b6 = b5;\ + \ + /* 4 */\ + b7 = _mm_xor_si128(b7, a6);\ + /*b7 = _mm_xor_si128(b7, (TEMP_MUL1[4])); -> helper! */\ + b7 = _mm_xor_si128(b7, (TEMP_MUL1[6]));\ + b7 = _mm_xor_si128(b7, (TEMP_MUL2[1]));\ + b7 = _mm_xor_si128(b7, b3); /* -> helper! */\ + b2 = _mm_xor_si128(b2, b7);\ + \ + /* 3 */\ + b0 = _mm_xor_si128(b0, a7);\ + b0 = _mm_xor_si128(b0, (TEMP_MUL1[5]));\ + b0 = _mm_xor_si128(b0, (TEMP_MUL1[7]));\ + /*b0 = _mm_xor_si128(b0, (TEMP_MUL2[0])); -> helper! */\ + b0 = _mm_xor_si128(b0, (TEMP_MUL2[2]));\ + b3 = b0;\ + b1 = _mm_xor_si128(b1, b0);\ + b0 = _mm_xor_si128(b0, b7); /* moved from 4 */\ + \ + /* 5 */\ + b4 = _mm_xor_si128(b4, a2);\ + /*b4 = _mm_xor_si128(b4, (TEMP_MUL1[0])); -> helper! 
*/\ + b4 = _mm_xor_si128(b4, (TEMP_MUL1[2]));\ + b4 = _mm_xor_si128(b4, (TEMP_MUL2[3]));\ + b4 = _mm_xor_si128(b4, (TEMP_MUL2[5]));\ + b3 = _mm_xor_si128(b3, b4);\ + b6 = _mm_xor_si128(b6, b4);\ + \ + /* 6 */\ + a3 = _mm_xor_si128(a3, (TEMP_MUL1[1]));\ + a3 = _mm_xor_si128(a3, (TEMP_MUL1[3]));\ + a3 = _mm_xor_si128(a3, (TEMP_MUL2[4]));\ + a3 = _mm_xor_si128(a3, (TEMP_MUL2[6]));\ + b4 = _mm_xor_si128(b4, a3);\ + b5 = _mm_xor_si128(b5, a3);\ + b7 = _mm_xor_si128(b7, a3);\ + \ + /* 7 */\ + a1 = _mm_xor_si128(a1, (TEMP_MUL1[1]));\ + a1 = _mm_xor_si128(a1, (TEMP_MUL2[4]));\ + b2 = _mm_xor_si128(b2, a1);\ + b3 = _mm_xor_si128(b3, a1);\ + \ + /* 8 */\ + a5 = _mm_xor_si128(a5, (TEMP_MUL1[5]));\ + a5 = _mm_xor_si128(a5, (TEMP_MUL2[0]));\ + b6 = _mm_xor_si128(b6, a5);\ + b7 = _mm_xor_si128(b7, a5);\ + \ + /* 9 */\ + a3 = TEMP_MUL1[2];\ + a3 = _mm_xor_si128(a3, (TEMP_MUL2[5]));\ + b0 = _mm_xor_si128(b0, a3);\ + b5 = _mm_xor_si128(b5, a3);\ + \ + /* 10 */\ + a1 = TEMP_MUL1[6];\ + a1 = _mm_xor_si128(a1, (TEMP_MUL2[1]));\ + b1 = _mm_xor_si128(b1, a1);\ + b4 = _mm_xor_si128(b4, a1);\ + \ + /* 11 */\ + a5 = TEMP_MUL1[3];\ + a5 = _mm_xor_si128(a5, (TEMP_MUL2[6]));\ + b1 = _mm_xor_si128(b1, a5);\ + b6 = _mm_xor_si128(b6, a5);\ + \ + /* 12 */\ + a3 = TEMP_MUL1[7];\ + a3 = _mm_xor_si128(a3, (TEMP_MUL2[2]));\ + b2 = _mm_xor_si128(b2, a3);\ + b5 = _mm_xor_si128(b5, a3);\ + \ + /* 13 */\ + b0 = _mm_xor_si128(b0, (TEMP_MUL4));\ + b0 = _mm_xor_si128(b0, a4);\ + b1 = _mm_xor_si128(b1, a4);\ + b3 = _mm_xor_si128(b3, a6);\ + b4 = _mm_xor_si128(b4, a0);\ + b4 = _mm_xor_si128(b4, a7);\ + b5 = _mm_xor_si128(b5, a0);\ + b7 = _mm_xor_si128(b7, a2);\ +}/**/ + +#if (LENGTH <= 256) + +#define SET_CONSTANTS(){\ + SET_SHARED_CONSTANTS();\ + SUBSH_MASK[0] = _mm_set_epi32(0x080f0e0d, 0x0c0b0a09, 0x07060504, 0x03020100);\ + SUBSH_MASK[1] = _mm_set_epi32(0x0a09080f, 0x0e0d0c0b, 0x00070605, 0x04030201);\ + SUBSH_MASK[2] = _mm_set_epi32(0x0c0b0a09, 0x080f0e0d, 0x01000706, 0x05040302);\ + SUBSH_MASK[3] = _mm_set_epi32(0x0e0d0c0b, 0x0a09080f, 0x02010007, 0x06050403);\ + SUBSH_MASK[4] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x03020100, 0x07060504);\ + SUBSH_MASK[5] = _mm_set_epi32(0x09080f0e, 0x0d0c0b0a, 0x04030201, 0x00070605);\ + SUBSH_MASK[6] = _mm_set_epi32(0x0b0a0908, 0x0f0e0d0c, 0x05040302, 0x01000706);\ + SUBSH_MASK[7] = _mm_set_epi32(0x0d0c0b0a, 0x09080f0e, 0x06050403, 0x02010007);\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ + }\ + ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ +}/**/ + +/* vperm: + * transformation before rounds with ipt + * first round add transformed constant + * middle rounds: add constant XOR 0x15...15 + * last round: additionally add 0x15...15 after MB + * transformation after rounds with opt + */ +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant + ShiftBytes (interleaved) */\ + b1 = ROUND_CONST_Lx;\ + a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ + a1 = _mm_xor_si128(a1, b1);\ + a2 = _mm_xor_si128(a2, b1);\ + a3 = _mm_xor_si128(a3, b1);\ + a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ + a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ + a4 = _mm_xor_si128(a4, b1);\ + a2 = _mm_shuffle_epi8(a2, 
(SUBSH_MASK[2]));\ + a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ + a5 = _mm_xor_si128(a5, b1);\ + a6 = _mm_xor_si128(a6, b1);\ + a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ + a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ + a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ + a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ + a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ + /* SubBytes + Multiplication by 2 and 4 */\ + VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +}/**/ + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\ + ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\ +} + + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + t0 = TRANSP_MASK;\ +\ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ +\ + o1 = i0;\ + t0 = i2;\ +\ + i0 = _mm_unpacklo_epi16(i0, i1);\ + o1 = _mm_unpackhi_epi16(o1, i1);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + t0 = _mm_unpackhi_epi16(t0, i3);\ +\ + i0 = _mm_shuffle_epi32(i0, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + t0 = _mm_shuffle_epi32(t0, 216);\ +\ + o2 = i0;\ + o3 = o1;\ +\ + i0 = _mm_unpacklo_epi32(i0, i2);\ + o1 = _mm_unpacklo_epi32(o1, t0);\ + o2 = _mm_unpackhi_epi32(o2, i2);\ + o3 = _mm_unpackhi_epi32(o3, t0);\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + o1 = i0;\ + o2 = i1;\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + o1 = _mm_unpackhi_epi64(o1, i4);\ + o3 = i1;\ + o4 = i2;\ + o2 = _mm_unpacklo_epi64(o2, i5);\ + o3 = 
_mm_unpackhi_epi64(o3, i5);\ + o5 = i2;\ + o6 = i3;\ + o4 = _mm_unpacklo_epi64(o4, i6);\ + o5 = _mm_unpackhi_epi64(o5, i6);\ + o7 = i3;\ + o6 = _mm_unpacklo_epi64(o6, i7);\ + o7 = _mm_unpackhi_epi64(o7, i7);\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + o0 = i0;\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o0 = _mm_unpackhi_epi64(o0, i1);\ + o1 = i2;\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + o1 = _mm_unpackhi_epi64(o1, i3);\ + o2 = i4;\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + o2 = _mm_unpackhi_epi64(o2, i5);\ + o3 = i6;\ + i6 = _mm_unpacklo_epi64(i6, i7);\ + o3 = _mm_unpackhi_epi64(o3, i7);\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + t0 = _mm_xor_si128(t0, t0);\ + i1 = i0;\ + i3 = i2;\ + i5 = i4;\ + i7 = i6;\ + i0 = _mm_unpacklo_epi64(i0, t0);\ + i1 = _mm_unpackhi_epi64(i1, t0);\ + i2 = _mm_unpacklo_epi64(i2, t0);\ + i3 = _mm_unpackhi_epi64(i3, t0);\ + i4 = _mm_unpacklo_epi64(i4, t0);\ + i5 = _mm_unpackhi_epi64(i5, t0);\ + i6 = _mm_unpacklo_epi64(i6, t0);\ + i7 = _mm_unpackhi_epi64(i7, t0);\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ +}/**/ + + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst_CNT2(i, j){\ + xmm0 = ROUND_CONST_L0[i];\ + xmm1 = ROUND_CONST_L7[i];\ + xmm2 = ROUND_CONST_L0[j];\ + xmm3 = ROUND_CONST_L7[j];\ + VPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ + xmm0 = _mm_xor_si128(xmm0, (ALL_15));\ + xmm1 = _mm_xor_si128(xmm1, (ALL_15));\ + xmm2 = _mm_xor_si128(xmm2, (ALL_15));\ + xmm3 = _mm_xor_si128(xmm3, (ALL_15));\ + ROUND_CONST_L0[i] = xmm0;\ + ROUND_CONST_L7[i] = xmm1;\ + ROUND_CONST_L0[j] = xmm2;\ + ROUND_CONST_L7[j] = xmm3;\ +}/**/ + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst(){\ + xmm0 = ROUND_CONST_Lx;\ + VPERM_Transform(xmm0, xmm1, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ + xmm0 = _mm_xor_si128(xmm0, (ALL_15));\ + ROUND_CONST_Lx = xmm0;\ + VPERM_Transform_RoundConst_CNT2(0, 1);\ + VPERM_Transform_RoundConst_CNT2(2, 3);\ + VPERM_Transform_RoundConst_CNT2(4, 5);\ + VPERM_Transform_RoundConst_CNT2(6, 7);\ + VPERM_Transform_RoundConst_CNT2(8, 9);\ +}/**/ + +void INIT(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, /*xmm11,*/ xmm12, xmm13, xmm14, xmm15; + + /* transform round constants into VPERM mode */ + VPERM_Transform_RoundConst(); + + /* load IV into registers xmm12 - xmm15 */ + xmm12 = chaining[0]; + xmm13 = chaining[1]; + xmm14 = chaining[2]; + xmm15 = chaining[3]; + + /* 
transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* store transposed IV */ + chaining[0] = xmm12; + chaining[1] = xmm2; + chaining[2] = xmm6; + chaining[3] = xmm7; +} + +void TF512(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP_MUL1[8]; + static __m128i TEMP_MUL2[8]; + static __m128i TEMP_MUL4; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm12 - xmm15 */ + xmm12 = message[0]; + xmm13 = message[1]; + xmm14 = message[2]; + xmm15 = message[3]; + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + xmm8 = chaining[0]; + xmm0 = chaining[1]; + xmm4 = chaining[2]; + xmm5 = chaining[3]; + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + xmm8 = _mm_xor_si128(xmm8, xmm12); + xmm0 = _mm_xor_si128(xmm0, xmm2); + xmm4 = _mm_xor_si128(xmm4, xmm6); + xmm5 = _mm_xor_si128(xmm5, xmm7); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, xmm8); + xmm1 = _mm_xor_si128(xmm1, xmm10); + xmm2 = _mm_xor_si128(xmm2, xmm12); + xmm3 = _mm_xor_si128(xmm3, xmm14); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, (chaining[0])); + xmm1 = _mm_xor_si128(xmm1, (chaining[1])); + xmm2 = _mm_xor_si128(xmm2, (chaining[2])); + xmm3 = _mm_xor_si128(xmm3, (chaining[3])); + + /* store CV */ + chaining[0] = xmm0; + chaining[1] = xmm1; + chaining[2] = xmm2; + chaining[3] = xmm3; + +#ifdef IACA_TRACE + IACA_END; +#endif + + return; +} + +void OF512(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP_MUL1[8]; + static __m128i TEMP_MUL2[8]; + static __m128i TEMP_MUL4; + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + xmm8 = chaining[0]; + xmm10 = chaining[1]; + xmm12 = chaining[2]; + xmm14 = chaining[3]; + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm 
register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm10 = _mm_xor_si128(xmm10, (chaining[1])); + xmm12 = _mm_xor_si128(xmm12, (chaining[2])); + xmm14 = _mm_xor_si128(xmm14, (chaining[3])); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); + VPERM_Transform(xmm9, xmm11, VPERM_OPT, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); + + /* we only need to return the truncated half of the state */ + chaining[2] = xmm9; + chaining[3] = xmm11; + + return; +}//OF512() + +#endif + +#if (LENGTH > 256) + +#define SET_CONSTANTS(){\ + SET_SHARED_CONSTANTS();\ + ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ + SUBSH_MASK[0] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);\ + SUBSH_MASK[1] = _mm_set_epi32(0x000f0e0d, 0x0c0b0a09, 0x08070605, 0x04030201);\ + SUBSH_MASK[2] = _mm_set_epi32(0x01000f0e, 0x0d0c0b0a, 0x09080706, 0x05040302);\ + SUBSH_MASK[3] = _mm_set_epi32(0x0201000f, 0x0e0d0c0b, 0x0a090807, 0x06050403);\ + SUBSH_MASK[4] = _mm_set_epi32(0x03020100, 0x0f0e0d0c, 0x0b0a0908, 0x07060504);\ + SUBSH_MASK[5] = _mm_set_epi32(0x04030201, 0x000f0e0d, 0x0c0b0a09, 0x08070605);\ + SUBSH_MASK[6] = _mm_set_epi32(0x05040302, 0x01000f0e, 0x0d0c0b0a, 0x09080706);\ + SUBSH_MASK[7] = _mm_set_epi32(0x0a090807, 0x06050403, 0x0201000f, 0x0e0d0c0b);\ + for(i = 0; i < ROUNDS1024; i++)\ + {\ + ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\ + }\ +}/**/ + +/* one round + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* SubBytes + Multiplication */\ + VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +}/**/ + +#define ROUNDS_P(){\ + u8 round_counter = 0;\ + for(round_counter = 0; round_counter < 14; round_counter+=2) {\ + /* AddRoundConstant P1024 */\ + xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\ + xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\ + xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\ + xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[3]));\ + xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[4]));\ + xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[5]));\ + xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[6]));\ + xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[7]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + VPERM_Add_Constant(xmm0, 
xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, ALL_15, xmm8);\ + \ + /* AddRoundConstant P1024 */\ + xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\ + xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\ + xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\ + xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[3]));\ + xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[4]));\ + xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\ + xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\ + xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\ + }\ +}/**/ + +#define ROUNDS_Q(){\ + VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm1);\ + u8 round_counter = 0;\ + for(round_counter = 0; round_counter < 14; round_counter+=2) {\ + /* AddRoundConstant Q1024 */\ + xmm1 = ALL_FF;\ + xmm8 = _mm_xor_si128(xmm8, xmm1);\ + xmm9 = _mm_xor_si128(xmm9, xmm1);\ + xmm10 = _mm_xor_si128(xmm10, xmm1);\ + xmm11 = _mm_xor_si128(xmm11, xmm1);\ + xmm12 = _mm_xor_si128(xmm12, xmm1);\ + xmm13 = _mm_xor_si128(xmm13, xmm1);\ + xmm14 = _mm_xor_si128(xmm14, xmm1);\ + xmm15 = _mm_xor_si128(xmm15, (ROUND_CONST_Q[round_counter]));\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[1]));\ + xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[3]));\ + xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[5]));\ + xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[7]));\ + xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[0]));\ + xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[2]));\ + xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[4]));\ + xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[6]));\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + \ + /* AddRoundConstant Q1024 */\ + xmm9 = ALL_FF;\ + xmm0 = _mm_xor_si128(xmm0, xmm9);\ + xmm1 = _mm_xor_si128(xmm1, xmm9);\ + xmm2 = _mm_xor_si128(xmm2, xmm9);\ + xmm3 = _mm_xor_si128(xmm3, xmm9);\ + xmm4 = _mm_xor_si128(xmm4, xmm9);\ + xmm5 = _mm_xor_si128(xmm5, xmm9);\ + xmm6 = _mm_xor_si128(xmm6, xmm9);\ + xmm7 = _mm_xor_si128(xmm7, (ROUND_CONST_Q[round_counter+1]));\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[1]));\ + xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[3]));\ + xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[5]));\ + xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[7]));\ + xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[0]));\ + xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[2]));\ + xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[4]));\ + xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[6]));\ + /* SubBytes + MixBytes*/ \ + SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + }\ + VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm1);\ +}/**/ + + +/* Matrix Transpose + * input is a 1024-bit state with two columns in one xmm + * output is a 1024-bit state with two rows in one xmm + * inputs: i0-i7 + * outputs: i0-i7 + * clobbers: t0-t7 + */ +#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ + t0 = TRANSP_MASK;\ +\ + i6 = _mm_shuffle_epi8(i6, t0);\ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = 
_mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ + t1 = i2;\ + i4 = _mm_shuffle_epi8(i4, t0);\ + i5 = _mm_shuffle_epi8(i5, t0);\ + t2 = i4;\ + t3 = i6;\ + i7 = _mm_shuffle_epi8(i7, t0);\ +\ + /* continue with unpack using 4 temp registers */\ + t0 = i0;\ + t2 = _mm_unpackhi_epi16(t2, i5);\ + i4 = _mm_unpacklo_epi16(i4, i5);\ + t3 = _mm_unpackhi_epi16(t3, i7);\ + i6 = _mm_unpacklo_epi16(i6, i7);\ + t0 = _mm_unpackhi_epi16(t0, i1);\ + t1 = _mm_unpackhi_epi16(t1, i3);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + i0 = _mm_unpacklo_epi16(i0, i1);\ +\ + /* shuffle with immediate */\ + t0 = _mm_shuffle_epi32(t0, 216);\ + t1 = _mm_shuffle_epi32(t1, 216);\ + t2 = _mm_shuffle_epi32(t2, 216);\ + t3 = _mm_shuffle_epi32(t3, 216);\ + i0 = _mm_shuffle_epi32(i0, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + i4 = _mm_shuffle_epi32(i4, 216);\ + i6 = _mm_shuffle_epi32(i6, 216);\ +\ + /* continue with unpack */\ + t4 = i0;\ + i0 = _mm_unpacklo_epi32(i0, i2);\ + t4 = _mm_unpackhi_epi32(t4, i2);\ + t5 = t0;\ + t0 = _mm_unpacklo_epi32(t0, t1);\ + t5 = _mm_unpackhi_epi32(t5, t1);\ + t6 = i4;\ + i4 = _mm_unpacklo_epi32(i4, i6);\ + t7 = t2;\ + t6 = _mm_unpackhi_epi32(t6, i6);\ + i2 = t0;\ + t2 = _mm_unpacklo_epi32(t2, t3);\ + i3 = t0;\ + t7 = _mm_unpackhi_epi32(t7, t3);\ +\ + /* there are now 2 rows in each xmm */\ + /* unpack to get 1 row of CV in each xmm */\ + i1 = i0;\ + i1 = _mm_unpackhi_epi64(i1, i4);\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + i4 = t4;\ + i3 = _mm_unpackhi_epi64(i3, t2);\ + i5 = t4;\ + i2 = _mm_unpacklo_epi64(i2, t2);\ + i6 = t5;\ + i5 = _mm_unpackhi_epi64(i5, t6);\ + i7 = t5;\ + i4 = _mm_unpacklo_epi64(i4, t6);\ + i7 = _mm_unpackhi_epi64(i7, t7);\ + i6 = _mm_unpacklo_epi64(i6, t7);\ + /* transpose done */\ +}/**/ + +/* Matrix Transpose Inverse + * input is a 1024-bit state with two rows in one xmm + * output is a 1024-bit state with two columns in one xmm + * inputs: i0-i7 + * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) + * clobbers: t0-t4 + */ +#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ + /* transpose matrix to get output format */\ + o1 = i0;\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o1 = _mm_unpackhi_epi64(o1, i1);\ + t0 = i2;\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + t0 = _mm_unpackhi_epi64(t0, i3);\ + t1 = i4;\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + t1 = _mm_unpackhi_epi64(t1, i5);\ + t2 = i6;\ + o0 = TRANSP_MASK;\ + i6 = _mm_unpacklo_epi64(i6, i7);\ + t2 = _mm_unpackhi_epi64(t2, i7);\ + /* load transpose mask into a register, because it will be used 8 times */\ + i0 = _mm_shuffle_epi8(i0, o0);\ + i2 = _mm_shuffle_epi8(i2, o0);\ + i4 = _mm_shuffle_epi8(i4, o0);\ + i6 = _mm_shuffle_epi8(i6, o0);\ + o1 = _mm_shuffle_epi8(o1, o0);\ + t0 = _mm_shuffle_epi8(t0, o0);\ + t1 = _mm_shuffle_epi8(t1, o0);\ + t2 = _mm_shuffle_epi8(t2, o0);\ + /* continue with unpack using 4 temp registers */\ + t3 = i4;\ + o2 = o1;\ + o0 = i0;\ + t4 = t1;\ + \ + t3 = _mm_unpackhi_epi16(t3, i6);\ + i4 = _mm_unpacklo_epi16(i4, i6);\ + o0 = _mm_unpackhi_epi16(o0, i2);\ + i0 = _mm_unpacklo_epi16(i0, i2);\ + o2 = _mm_unpackhi_epi16(o2, t0);\ + o1 = _mm_unpacklo_epi16(o1, t0);\ + t4 = _mm_unpackhi_epi16(t4, t2);\ + t1 = _mm_unpacklo_epi16(t1, t2);\ + /* shuffle with immediate */\ + i4 = _mm_shuffle_epi32(i4, 216);\ + t3 = _mm_shuffle_epi32(t3, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + o2 = _mm_shuffle_epi32(o2, 216);\ + i0 = _mm_shuffle_epi32(i0, 216);\ + o0 = _mm_shuffle_epi32(o0, 216);\ + t1 = _mm_shuffle_epi32(t1, 216);\ + t4 = _mm_shuffle_epi32(t4, 216);\ + /* continue 
with unpack */\ + i1 = i0;\ + i3 = o0;\ + i5 = o1;\ + i7 = o2;\ + i0 = _mm_unpacklo_epi32(i0, i4);\ + i1 = _mm_unpackhi_epi32(i1, i4);\ + o0 = _mm_unpacklo_epi32(o0, t3);\ + i3 = _mm_unpackhi_epi32(i3, t3);\ + o1 = _mm_unpacklo_epi32(o1, t1);\ + i5 = _mm_unpackhi_epi32(i5, t1);\ + o2 = _mm_unpacklo_epi32(o2, t4);\ + i7 = _mm_unpackhi_epi32(i7, t4);\ + /* transpose done */\ +}/**/ + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst_CNT2(i, j){\ + xmm0 = ROUND_CONST_P[i];\ + xmm1 = ROUND_CONST_P[j];\ + xmm2 = ROUND_CONST_Q[i];\ + xmm3 = ROUND_CONST_Q[j];\ + VPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ + xmm2 = _mm_xor_si128(xmm2, (ALL_15));\ + xmm3 = _mm_xor_si128(xmm3, (ALL_15));\ + ROUND_CONST_P[i] = xmm0;\ + ROUND_CONST_P[j] = xmm1;\ + ROUND_CONST_Q[i] = xmm2;\ + ROUND_CONST_Q[j] = xmm3;\ +}/**/ + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst(){\ + VPERM_Transform_RoundConst_CNT2(0, 1);\ + VPERM_Transform_RoundConst_CNT2(2, 3);\ + VPERM_Transform_RoundConst_CNT2(4, 5);\ + VPERM_Transform_RoundConst_CNT2(6, 7);\ + VPERM_Transform_RoundConst_CNT2(8, 9);\ + VPERM_Transform_RoundConst_CNT2(10, 11);\ + VPERM_Transform_RoundConst_CNT2(12, 13);\ + xmm0 = ALL_FF;\ + VPERM_Transform(xmm0, xmm1, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ + xmm0 = _mm_xor_si128(xmm0, (ALL_15));\ + ALL_FF = xmm0;\ +}/**/ + + +void INIT(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + + /* transform round constants into VPERM mode */ + VPERM_Transform_RoundConst(); + + /* load IV into registers xmm8 - xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + /* transform chaining value from column ordering into row ordering */ + VPERM_Transform_State(xmm8, xmm9, xmm10, xmm11, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store transposed IV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; +} + +void TF1024(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP_MUL1[8]; + static __m128i TEMP_MUL2[8]; + static __m128i TEMP_MUL4; + static __m128i QTEMP[8]; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm8 - xmm15 (Q = message) */ + xmm8 = message[0]; + xmm9 = message[1]; + xmm10 = message[2]; + xmm11 = message[3]; + xmm12 = message[4]; + xmm13 = message[5]; + xmm14 = message[6]; + xmm15 = message[7]; + + /* transform message M from column ordering into row ordering */ + VPERM_Transform_State(xmm8, xmm9, xmm10, xmm11, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + Matrix_Transpose(xmm8, 
xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store message M (Q input) for later */ + QTEMP[0] = xmm8; + QTEMP[1] = xmm9; + QTEMP[2] = xmm10; + QTEMP[3] = xmm11; + QTEMP[4] = xmm12; + QTEMP[5] = xmm13; + QTEMP[6] = xmm14; + QTEMP[7] = xmm15; + + /* xor CV to message to get P input */ + /* result: CV+M in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, (chaining[7])); + + /* compute permutation P */ + /* result: P(CV+M) in xmm8...xmm15 */ + ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV+M)+CV in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, (chaining[7])); + + /* store P(CV+M)+CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; + + /* load message M (Q input) into xmm8-15 */ + xmm8 = QTEMP[0]; + xmm9 = QTEMP[1]; + xmm10 = QTEMP[2]; + xmm11 = QTEMP[3]; + xmm12 = QTEMP[4]; + xmm13 = QTEMP[5]; + xmm14 = QTEMP[6]; + xmm15 = QTEMP[7]; + + /* compute permutation Q */ + /* result: Q(M) in xmm8...xmm15 */ + ROUNDS_Q(); + + /* xor Q output */ + /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, (chaining[7])); + + /* store CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; + +#ifdef IACA_TRACE + IACA_END; +#endif + + return; +} + +void OF1024(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP_MUL1[8]; + static __m128i TEMP_MUL2[8]; + static __m128i TEMP_MUL4; + + /* load CV into registers xmm8 - xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + /* compute permutation P */ + /* result: P(CV) in xmm8...xmm15 */ + ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8...xmm15 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm9 = _mm_xor_si128(xmm9, (chaining[1])); + xmm10 = _mm_xor_si128(xmm10, (chaining[2])); + xmm11 = _mm_xor_si128(xmm11, (chaining[3])); + xmm12 = _mm_xor_si128(xmm12, (chaining[4])); + xmm13 = _mm_xor_si128(xmm13, (chaining[5])); + xmm14 = _mm_xor_si128(xmm14, (chaining[6])); + xmm15 = _mm_xor_si128(xmm15, 
(chaining[7])); + + /* transpose CV back from row ordering to column ordering */ + /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ + Matrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7); + VPERM_Transform_State(xmm0, xmm6, xmm13, xmm15, VPERM_OPT, xmm1, xmm2, xmm3, xmm5, xmm7, xmm10, xmm12); + + /* we only need to return the truncated half of the state */ + chaining[4] = xmm0; + chaining[5] = xmm6; + chaining[6] = xmm13; + chaining[7] = xmm15; + + return; +} + +#endif + diff --git a/algo/aes_ni/groestl-version.h b/algo/aes_ni/groestl-version.h new file mode 100644 index 000000000..cdbd81627 --- /dev/null +++ b/algo/aes_ni/groestl-version.h @@ -0,0 +1,16 @@ +// specify assembly or intrinsics implementation +//#define TASM +#define TINTR + +//#define AES_NI + +//#ifdef AES_NI +// specify AES-NI, AVX (with AES-NI) or vector-permute implementation + +//#ifndef NO_AES_NI + +#define VAES +// #define VAVX +// #define VVPERM + +//#endif diff --git a/algo/aes_ni/groestl256-asm-aes.h b/algo/aes_ni/groestl256-asm-aes.h new file mode 100644 index 000000000..0810b5e83 --- /dev/null +++ b/algo/aes_ni/groestl256-asm-aes.h @@ -0,0 +1,529 @@ +/* groestl-asm-aes.h Aug 2011 + * + * Groestl implementation with inline assembly using ssse3, sse4.1, and aes + * instructions. + * Authors: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz + * + * This code is placed in the public domain + */ + +#include "hash-groestl256.h" +/* global constants */ +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; +__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16]; +__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16]; +__attribute__ ((aligned (16))) unsigned char ALL_1B[16]; +__attribute__ ((aligned (16))) unsigned char ALL_FF[16]; + +/* temporary variables */ +__attribute__ ((aligned (16))) unsigned char QTEMP[8*16]; +__attribute__ ((aligned (16))) unsigned char TEMP[3*16]; + + +#define tos(a) #a +#define tostr(a) tos(a) + + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b */ +#define MUL2(i, j, k){\ + asm("pxor xmm"tostr(j)", xmm"tostr(j)"");\ + asm("pcmpgtb xmm"tostr(j)", xmm"tostr(i)"");\ + asm("paddb xmm"tostr(i)", xmm"tostr(i)"");\ + asm("pand xmm"tostr(j)", xmm"tostr(k)"");\ + asm("pxor xmm"tostr(i)", xmm"tostr(j)"");\ +}/**/ + +/* Yet another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. 
Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + asm("movdqa xmm"tostr(b6)", xmm"tostr(a0)"");\ + asm("movdqa xmm"tostr(b7)", xmm"tostr(a1)"");\ + asm("pxor xmm"tostr(a0)", xmm"tostr(a1)"");\ + asm("movdqa xmm"tostr(b0)", xmm"tostr(a2)"");\ + asm("pxor xmm"tostr(a1)", xmm"tostr(a2)"");\ + asm("movdqa xmm"tostr(b1)", xmm"tostr(a3)"");\ + asm("pxor xmm"tostr(a2)", xmm"tostr(a3)"");\ + asm("movdqa xmm"tostr(b2)", xmm"tostr(a4)"");\ + asm("pxor xmm"tostr(a3)", xmm"tostr(a4)"");\ + asm("movdqa xmm"tostr(b3)", xmm"tostr(a5)"");\ + asm("pxor xmm"tostr(a4)", xmm"tostr(a5)"");\ + asm("movdqa xmm"tostr(b4)", xmm"tostr(a6)"");\ + asm("pxor xmm"tostr(a5)", xmm"tostr(a6)"");\ + asm("movdqa xmm"tostr(b5)", xmm"tostr(a7)"");\ + asm("pxor xmm"tostr(a6)", xmm"tostr(a7)"");\ + asm("pxor xmm"tostr(a7)", xmm"tostr(b6)"");\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + asm("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\ + asm("pxor xmm"tostr(b6)", xmm"tostr(a4)"");\ + asm("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ + asm("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\ + asm("pxor xmm"tostr(b2)", xmm"tostr(a6)"");\ + asm("pxor xmm"tostr(b0)", xmm"tostr(a6)"");\ + /* spill values y_4, y_5 to memory */\ + asm("movaps [TEMP+0*16], xmm"tostr(b0)"");\ + asm("pxor xmm"tostr(b3)", xmm"tostr(a7)"");\ + asm("pxor xmm"tostr(b1)", xmm"tostr(a7)"");\ + asm("movaps [TEMP+1*16], xmm"tostr(b1)"");\ + asm("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\ + asm("pxor xmm"tostr(b2)", xmm"tostr(a0)"");\ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + asm("movdqa xmm"tostr(b0)", xmm"tostr(a0)"");\ + asm("pxor xmm"tostr(b5)", xmm"tostr(a1)"");\ + asm("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\ + asm("movdqa xmm"tostr(b1)", xmm"tostr(a1)"");\ + asm("pxor xmm"tostr(b6)", xmm"tostr(a2)"");\ + asm("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\ + asm("movaps [TEMP+2*16], xmm"tostr(a2)"");\ + asm("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\ + asm("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ + \ + /* compute x_i = t_i + t_{i+3} */\ + asm("pxor xmm"tostr(a0)", xmm"tostr(a3)"");\ + asm("pxor xmm"tostr(a1)", xmm"tostr(a4)"");\ + asm("pxor xmm"tostr(a2)", xmm"tostr(a5)"");\ + asm("pxor xmm"tostr(a3)", xmm"tostr(a6)"");\ + asm("pxor xmm"tostr(a4)", xmm"tostr(a7)"");\ + asm("pxor xmm"tostr(a5)", xmm"tostr(b0)"");\ + asm("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ + asm("pxor xmm"tostr(a7)", [TEMP+2*16]");\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + asm("movaps xmm"tostr(b1)", [ALL_1B]");\ + MUL2(a0, b0, b1);\ + asm("pxor xmm"tostr(a0)", [TEMP+0*16]");\ + MUL2(a1, b0, b1);\ + asm("pxor xmm"tostr(a1)", [TEMP+1*16]");\ + MUL2(a2, b0, b1);\ + asm("pxor xmm"tostr(a2)", xmm"tostr(b2)"");\ + MUL2(a3, b0, b1);\ + asm("pxor xmm"tostr(a3)", xmm"tostr(b3)"");\ + MUL2(a4, b0, b1);\ + asm("pxor xmm"tostr(a4)", xmm"tostr(b4)"");\ + MUL2(a5, b0, b1);\ + asm("pxor xmm"tostr(a5)", xmm"tostr(b5)"");\ + MUL2(a6, b0, b1);\ + asm("pxor xmm"tostr(a6)", xmm"tostr(b6)"");\ + MUL2(a7, b0, b1);\ + asm("pxor xmm"tostr(a7)", xmm"tostr(b7)"");\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... 
*/\ + MUL2(a0, b0, b1);\ + asm("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\ + MUL2(a1, b0, b1);\ + asm("pxor xmm"tostr(b6)", xmm"tostr(a1)"");\ + MUL2(a2, b0, b1);\ + asm("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\ + MUL2(a5, b0, b1);\ + asm("pxor xmm"tostr(b2)", xmm"tostr(a5)"");\ + MUL2(a6, b0, b1);\ + asm("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\ + MUL2(a7, b0, b1);\ + asm("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\ + MUL2(a3, b0, b1);\ + MUL2(a4, b0, b1);\ + asm("movaps xmm"tostr(b0)", [TEMP+0*16]");\ + asm("movaps xmm"tostr(b1)", [TEMP+1*16]");\ + asm("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\ + asm("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\ +}/*MixBytes*/ + +#define SET_CONSTANTS(){\ + ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ + ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ + ((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\ + ((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\ + ((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\ + ((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\ + ((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\ + ((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\ + ((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\ + ((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\ + ((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\ + ((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\ + ((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\ + ((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\ + ((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\ + ((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\ + ((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\ + ((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ + ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ + ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ + }\ + ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ +}while(0); + +#define Push_All_Regs() do{\ +/* not using any... + asm("push rax");\ + asm("push rbx");\ + asm("push rcx");*/\ +}while(0); + +#define Pop_All_Regs() do{\ +/* not using any... 
+ asm("pop rcx");\ + asm("pop rbx");\ + asm("pop rax");*/\ +}while(0); + +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant */\ + asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ + asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ + asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ + /* ShiftBytes + SubBytes (interleaved) */\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ + asm ("aesenclast xmm"tostr(a0)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ + asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ + asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ + asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ + asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\ + asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ + asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\ + asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ + asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ +} + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ + \ + asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ + \ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ + \ + asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\ + asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\ + \ + asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ + asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ + asm ("pshufd 
xmm"tostr(t0)", xmm"tostr(t0)", 216");\ + \ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\ + \ + asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\ + asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\ + asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\ + asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\ + asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ + asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\ + asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\ + asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\ + asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\ + asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\ + asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\ + asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\ + asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i7)", 
xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ +}/**/ + + +void INIT256(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + asm volatile ("emms"); + + /* load IV into registers xmm12 - xmm15 */ + asm ("movaps xmm12, [rdi+0*16]"); + asm ("movaps xmm13, [rdi+1*16]"); + asm ("movaps xmm14, [rdi+2*16]"); + asm ("movaps xmm15, [rdi+3*16]"); + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* store transposed IV */ + asm ("movaps [rdi+0*16], xmm12"); + asm ("movaps [rdi+1*16], xmm2"); + asm ("movaps [rdi+2*16], xmm6"); + asm ("movaps [rdi+3*16], xmm7"); + + asm volatile ("emms"); + asm (".att_syntax noprefix"); +} + +void TF512(u64* h, u64* m) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + /* message M in rsi */ + +#ifdef IACA_TRACE + IACA_START; +#endif + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load message into registers xmm12 - xmm15 (Q = message) */ + asm ("movaps xmm12, [rsi+0*16]"); + asm ("movaps xmm13, [rsi+1*16]"); + asm ("movaps xmm14, [rsi+2*16]"); + asm ("movaps xmm15, [rsi+3*16]"); + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (2x64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm0, [rdi+1*16]"); + asm ("movaps xmm4, [rdi+2*16]"); + asm ("movaps xmm5, [rdi+3*16]"); + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + asm ("pxor xmm8, xmm12"); + asm ("pxor xmm0, xmm2"); + asm ("pxor xmm4, xmm6"); + asm ("pxor xmm5, xmm7"); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + asm ("pxor xmm0, xmm8"); + asm ("pxor xmm1, xmm10"); + asm ("pxor xmm2, xmm12"); + asm ("pxor xmm3, xmm14"); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + asm ("pxor xmm0, [rdi+0*16]"); + asm ("pxor xmm1, [rdi+1*16]"); + asm ("pxor xmm2, [rdi+2*16]"); + asm ("pxor xmm3, [rdi+3*16]"); + + /* store CV */ + asm ("movaps [rdi+0*16], xmm0"); + asm ("movaps [rdi+1*16], xmm1"); + asm ("movaps [rdi+2*16], xmm2"); + asm ("movaps [rdi+3*16], xmm3"); + + Pop_All_Regs(); + asm (".att_syntax 
noprefix"); + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF512(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm10, [rdi+1*16]"); + asm ("movaps xmm12, [rdi+2*16]"); + asm ("movaps xmm14, [rdi+3*16]"); + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm10, [rdi+1*16]"); + asm ("pxor xmm12, [rdi+2*16]"); + asm ("pxor xmm14, [rdi+3*16]"); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); + + /* we only need to return the truncated half of the state */ + asm ("movaps [rdi+2*16], xmm9"); + asm ("movaps [rdi+3*16], xmm11"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + + return; +} + diff --git a/algo/aes_ni/groestl256-asm-avx.h b/algo/aes_ni/groestl256-asm-avx.h new file mode 100644 index 000000000..e7cb4c782 --- /dev/null +++ b/algo/aes_ni/groestl256-asm-avx.h @@ -0,0 +1,519 @@ +/* groestl-asm-avx.h Aug 2011 + * + * Groestl implementation with inline assembly using ssse3, sse4.1, aes and avx + * instructions. + * Author: Günther A. 
Roland, Martin Schläffer, Krystian Matusiewicz + * + * This code is placed in the public domain + */ + +#include "hash-groestl256.h" + +/* global variables */ +__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Lx[16]; +__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; +__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; +__attribute__ ((aligned (32))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; +__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; +__attribute__ ((aligned (32))) unsigned char TRANSP_MASK[16]; +__attribute__ ((aligned (32))) unsigned char SUBSH_MASK[8*16]; +__attribute__ ((aligned (32))) unsigned char ALL_1B[32]; +__attribute__ ((aligned (32))) unsigned char ALL_FF[32]; + +/* temporary variables */ +__attribute__ ((aligned (32))) unsigned char TEMP[6*32]; + + +#define tos(a) #a +#define tostr(a) tos(a) + +#define SET_CONSTANTS(){\ + ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ + ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ + ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\ + ((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\ + ((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\ + ((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\ + ((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\ + ((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\ + ((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\ + ((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\ + ((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\ + ((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\ + ((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\ + ((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\ + ((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\ + ((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\ + ((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\ + ((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ + ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ + ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ + }\ + ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ +}while(0); + +#define Push_All_Regs() do{\ +/* not using any... + asm("push rax");\ + asm("push rbx");\ + asm("push rcx");*/\ +}while(0); + +#define Pop_All_Regs() do{\ +/* not using any... + asm("pop rcx");\ + asm("pop rbx");\ + asm("pop rax");*/\ +}while(0); + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b + * xmm[z] has to be zero */ +#define VMUL2(i, j, k, z){\ + asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\ + asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ + asm("vpand xmm"tostr(j)", xmm"tostr(j)", xmm"tostr(k)"");\ + asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ +}/**/ + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b + * xmm[z] has to be zero */ +#define VMUL2v2(i, j, k, z){\ + asm("vpblendvb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(k)", xmm"tostr(i)"");\ + asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ + asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ +}/**/ + +/* Yet another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". 
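+   (For reference: MixBytes is multiplication by the circulant matrix
+    circ(02, 02, 03, 04, 05, 03, 05, 07) over GF(2^8), i.e. with row indices
+    taken mod 8 each output row is
+    b_i = 02*a_i + 02*a_{i+1} + 03*a_{i+2} + 04*a_{i+3} + 05*a_{i+4} + 03*a_{i+5} + 05*a_{i+6} + 07*a_{i+7};
+    the relations below compute this product using only XORs and byte doublings.)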
+ Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\ + asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a2)"");\ + asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a3)"");\ + asm("vmovdqa xmm"tostr(b2)", xmm"tostr(a4)"");\ + asm("vmovdqa xmm"tostr(b3)", xmm"tostr(a5)"");\ + asm("vmovdqa xmm"tostr(b4)", xmm"tostr(a6)"");\ + asm("vmovdqa xmm"tostr(b5)", xmm"tostr(a7)"");\ + asm("vmovdqa xmm"tostr(b6)", xmm"tostr(a0)"");\ + asm("vmovdqa xmm"tostr(b7)", xmm"tostr(a1)"");\ + \ + /* t_i = a_i + a_{i+1} */\ + asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a1)"");\ + asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a2)"");\ + asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a3)"");\ + asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a4)"");\ + asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a5)"");\ + asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(a6)"");\ + asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(a7)"");\ + asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b6)"");\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a4)"");\ + asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a5)"");\ + asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a6)"");\ + asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a7)"");\ + asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a0)"");\ + asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a1)"");\ + asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a2)"");\ + asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a3)"");\ + \ + asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a6)"");\ + asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a7)"");\ + asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a0)"");\ + asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a1)"");\ + asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a2)"");\ + asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a3)"");\ + asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a4)"");\ + asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a5)"");\ + \ + /* spill values y_4, y_5 to memory */\ + asm("vmovaps [TEMP+0*16], xmm"tostr(b0)"");\ + asm("vmovaps [TEMP+1*16], xmm"tostr(b1)"");\ + asm("vmovaps [TEMP+2*16], xmm"tostr(b2)"");\ + \ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a0)"");\ + asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a1)"");\ + asm("vmovaps [TEMP+3*16], xmm"tostr(a2)"");\ + \ + /* compute x_i = t_i + t_{i+3} */\ + asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a3)"");\ + asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a4)"");\ + asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a5)"");\ + asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a6)"");\ + asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a7)"");\ + asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", 
xmm"tostr(b0)"");\ + asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\ + asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [TEMP+3*16]");\ + \ + /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + asm("vmovaps xmm"tostr(b1)", [ALL_1B]");\ + asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(b2)"");\ + VMUL2(a7, b0, b1, b2);\ + VMUL2(a6, b0, b1, b2);\ + VMUL2(a5, b0, b1, b2);\ + VMUL2(a4, b0, b1, b2);\ + VMUL2(a3, b0, b1, b2);\ + VMUL2(a2, b0, b1, b2);\ + VMUL2(a1, b0, b1, b2);\ + VMUL2(a0, b0, b1, b2);\ + \ + /* compute w_i : add y_{i+4} */\ + asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [TEMP+0*16]");\ + asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", [TEMP+1*16]");\ + asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", [TEMP+2*16]");\ + asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b3)"");\ + asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b4)"");\ + asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b5)"");\ + asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b6)"");\ + asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b7)"");\ + \ + /*compute v_i: double w_i */\ + VMUL2(a0, b0, b1, b2);\ + VMUL2(a1, b0, b1, b2);\ + VMUL2(a2, b0, b1, b2);\ + VMUL2(a3, b0, b1, b2);\ + VMUL2(a4, b0, b1, b2);\ + VMUL2(a5, b0, b1, b2);\ + VMUL2(a6, b0, b1, b2);\ + VMUL2(a7, b0, b1, b2);\ + \ + /* add to y_4 y_5 .. v3, v4, ... */\ + asm("vpxor xmm"tostr(b0)", xmm"tostr(a3)", [TEMP+0*16]");\ + asm("vpxor xmm"tostr(b1)", xmm"tostr(a4)", [TEMP+1*16]");\ + asm("vpxor xmm"tostr(b2)", xmm"tostr(a5)", [TEMP+2*16]");\ + asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a6)"");\ + asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a7)"");\ + asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a0)"");\ + asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a1)"");\ + asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a2)"");\ +}/*MixBytes*/ + +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant */\ + asm ("vmovaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ + asm ("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ + asm ("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\ + asm ("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ + /* ShiftBytes + SubBytes (interleaved) */\ + asm ("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a0)", xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ + asm ("vaesenclast xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a1)", xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ + asm ("vaesenclast xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a2)", xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ + asm ("vaesenclast xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a3)", xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ + asm ("vaesenclast xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a4)", xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ + asm ("vaesenclast xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a5)", xmm"tostr(a5)", 
[SUBSH_MASK+5*16]");\ + asm ("vaesenclast xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a6)", xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ + asm ("vaesenclast xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b0)"");\ + asm ("vpshufb xmm"tostr(a7)", xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ + asm ("vaesenclast xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b0)"");\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ +} + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + asm ("vmovaps xmm"tostr(t0)", [TRANSP_MASK]");\ +\ + asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i1)", xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("vpshufb xmm"tostr(i3)", xmm"tostr(i3)", xmm"tostr(t0)"");\ +\ + asm ("vpunpckhwd xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpckhwd xmm"tostr(t0)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpcklwd xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ +\ + asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("vpshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ + asm ("vpshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ + asm ("vpshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ +\ + asm ("vpunpckhdq xmm"tostr(o2)", xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("vpunpckhdq xmm"tostr(o3)", xmm"tostr(o1)", xmm"tostr(t0)"");\ + asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("vpunpckldq xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("vpunpcklqdq xmm"tostr(o2)", xmm"tostr(i1)", xmm"tostr(i5)"");\ + asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i1)", xmm"tostr(i5)"");\ + asm ("vpunpcklqdq xmm"tostr(o4)", xmm"tostr(i2)", xmm"tostr(i6)"");\ + asm ("vpunpckhqdq xmm"tostr(o5)", xmm"tostr(i2)", xmm"tostr(i6)"");\ + asm ("vpunpcklqdq xmm"tostr(o6)", xmm"tostr(i3)", xmm"tostr(i7)"");\ 
+ asm ("vpunpckhqdq xmm"tostr(o7)", xmm"tostr(i3)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + asm ("vpunpckhqdq xmm"tostr(o0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpckhqdq xmm"tostr(o2)", xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + asm ("vpxor xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(t0)"");\ + asm ("vpunpckhqdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("vpunpckhqdq xmm"tostr(i3)", xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("vpunpckhqdq xmm"tostr(i5)", xmm"tostr(i4)", xmm"tostr(t0)"");\ + asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(t0)"");\ + asm ("vpunpckhqdq xmm"tostr(i7)", xmm"tostr(i6)", xmm"tostr(t0)"");\ + asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ +}/**/ + + +void INIT256(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + asm volatile ("emms"); + + /* load IV into registers xmm12 - xmm15 */ + asm ("vmovaps xmm12, [rdi+0*16]"); + asm ("vmovaps xmm13, [rdi+1*16]"); + asm ("vmovaps xmm14, [rdi+2*16]"); + asm ("vmovaps xmm15, [rdi+3*16]"); + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* store transposed IV */ + asm ("vmovaps [rdi+0*16], xmm12"); + asm ("vmovaps [rdi+1*16], xmm2"); + asm ("vmovaps [rdi+2*16], xmm6"); + asm ("vmovaps [rdi+3*16], xmm7"); + + asm volatile ("emms"); + asm (".att_syntax noprefix"); +} + +void TF512(u64* h, u64* m) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + /* message M in rsi */ + +#ifdef IACA_TRACE + IACA_START; +#endif + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load message 
into registers xmm12 - xmm15 (Q = message) */ + asm ("vmovaps xmm12, [rsi+0*16]"); + asm ("vmovaps xmm13, [rsi+1*16]"); + asm ("vmovaps xmm14, [rsi+2*16]"); + asm ("vmovaps xmm15, [rsi+3*16]"); + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* load previous chaining value and xor message to CV to get input of P */ + /* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + asm ("vpxor xmm8, xmm12, [rdi+0*16]"); + asm ("vpxor xmm0, xmm2, [rdi+1*16]"); + asm ("vpxor xmm4, xmm6, [rdi+2*16]"); + asm ("vpxor xmm5, xmm7, [rdi+3*16]"); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + asm ("vpxor xmm0, xmm0, xmm8"); + asm ("vpxor xmm1, xmm1, xmm10"); + asm ("vpxor xmm2, xmm2, xmm12"); + asm ("vpxor xmm3, xmm3, xmm14"); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + asm ("vpxor xmm0, xmm0, [rdi+0*16]"); + asm ("vpxor xmm1, xmm1, [rdi+1*16]"); + asm ("vpxor xmm2, xmm2, [rdi+2*16]"); + asm ("vpxor xmm3, xmm3, [rdi+3*16]"); + + /* store CV */ + asm ("vmovaps [rdi+0*16], xmm0"); + asm ("vmovaps [rdi+1*16], xmm1"); + asm ("vmovaps [rdi+2*16], xmm2"); + asm ("vmovaps [rdi+3*16], xmm3"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF512(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + asm ("vmovaps xmm8, [rdi+0*16]"); + asm ("vmovaps xmm10, [rdi+1*16]"); + asm ("vmovaps xmm12, [rdi+2*16]"); + asm ("vmovaps xmm14, [rdi+3*16]"); + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + asm ("vpxor xmm8, xmm8, [rdi+0*16]"); + asm ("vpxor xmm10, xmm10, [rdi+1*16]"); + asm ("vpxor xmm12, xmm12, [rdi+2*16]"); + asm ("vpxor xmm14, xmm14, [rdi+3*16]"); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); + + /* we only need to return the truncated half of the state */ + asm ("vmovaps [rdi+2*16], xmm9"); + asm ("vmovaps [rdi+3*16], xmm11"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + + return; +} + diff --git 
a/algo/aes_ni/groestl256-asm-vperm.h b/algo/aes_ni/groestl256-asm-vperm.h new file mode 100644 index 000000000..a25ade795 --- /dev/null +++ b/algo/aes_ni/groestl256-asm-vperm.h @@ -0,0 +1,856 @@ +/* groestl-asm-vperm.h Aug 2011 + * + * Groestl implementation with inline assembly using ssse3 instructions. + * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz + * + * Based on the vperm and aes_ni implementations of the hash function Groestl + * by Cagdas Calik http://www.metu.edu.tr/~ccalik/ + * Institute of Applied Mathematics, Middle East Technical University, Turkey + * + * This code is placed in the public domain + */ + +#include "hash-groestl256.h" + +/* global constants */ +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; +__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; +__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16]; +__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16]; +__attribute__ ((aligned (16))) unsigned char ALL_0F[16]; +__attribute__ ((aligned (16))) unsigned char ALL_15[16]; +__attribute__ ((aligned (16))) unsigned char ALL_1B[16]; +__attribute__ ((aligned (16))) unsigned char ALL_63[16]; +__attribute__ ((aligned (16))) unsigned char ALL_FF[16]; +__attribute__ ((aligned (16))) unsigned char VPERM_IPT[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_OPT[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_INV[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_SB1[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_SB2[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_SB4[2*16]; +__attribute__ ((aligned (16))) unsigned char VPERM_SBO[2*16]; + +/* temporary variables */ +__attribute__ ((aligned (16))) unsigned char TEMP_MUL1[8*16]; +__attribute__ ((aligned (16))) unsigned char TEMP_MUL2[8*16]; +__attribute__ ((aligned (16))) unsigned char TEMP_MUL4[1*16]; +__attribute__ ((aligned (16))) unsigned char QTEMP[8*16]; +__attribute__ ((aligned (16))) unsigned char TEMP[8*16]; + + +#define tos(a) #a +#define tostr(a) tos(a) + +#define SET_SHARED_CONSTANTS(){\ + ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ + ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ + ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ + ((u64*)ALL_63)[ 0] = 0x6363636363636363ULL;\ + ((u64*)ALL_63)[ 1] = 0x6363636363636363ULL;\ + ((u64*)ALL_0F)[ 0] = 0x0F0F0F0F0F0F0F0FULL;\ + ((u64*)ALL_0F)[ 1] = 0x0F0F0F0F0F0F0F0FULL;\ + ((u64*)VPERM_IPT)[ 0] = 0x4C01307D317C4D00ULL;\ + ((u64*)VPERM_IPT)[ 1] = 0xCD80B1FCB0FDCC81ULL;\ + ((u64*)VPERM_IPT)[ 2] = 0xC2B2E8985A2A7000ULL;\ + ((u64*)VPERM_IPT)[ 3] = 0xCABAE09052227808ULL;\ + ((u64*)VPERM_OPT)[ 0] = 0x01EDBD5150BCEC00ULL;\ + ((u64*)VPERM_OPT)[ 1] = 0xE10D5DB1B05C0CE0ULL;\ + ((u64*)VPERM_OPT)[ 2] = 0xFF9F4929D6B66000ULL;\ + ((u64*)VPERM_OPT)[ 3] = 0xF7974121DEBE6808ULL;\ + ((u64*)VPERM_INV)[ 0] = 0x01040A060F0B0780ULL;\ + ((u64*)VPERM_INV)[ 1] = 0x030D0E0C02050809ULL;\ + ((u64*)VPERM_INV)[ 2] = 0x0E05060F0D080180ULL;\ + ((u64*)VPERM_INV)[ 3] = 0x040703090A0B0C02ULL;\ + ((u64*)VPERM_SB1)[ 0] = 0x3618D415FAE22300ULL;\ + ((u64*)VPERM_SB1)[ 1] = 0x3BF7CCC10D2ED9EFULL;\ + ((u64*)VPERM_SB1)[ 2] = 0xB19BE18FCB503E00ULL;\ + ((u64*)VPERM_SB1)[ 3] = 0xA5DF7A6E142AF544ULL;\ + ((u64*)VPERM_SB2)[ 0] 
= 0x69EB88400AE12900ULL;\ + ((u64*)VPERM_SB2)[ 1] = 0xC2A163C8AB82234AULL;\ + ((u64*)VPERM_SB2)[ 2] = 0xE27A93C60B712400ULL;\ + ((u64*)VPERM_SB2)[ 3] = 0x5EB7E955BC982FCDULL;\ + ((u64*)VPERM_SB4)[ 0] = 0x3D50AED7C393EA00ULL;\ + ((u64*)VPERM_SB4)[ 1] = 0xBA44FE79876D2914ULL;\ + ((u64*)VPERM_SB4)[ 2] = 0xE1E937A03FD64100ULL;\ + ((u64*)VPERM_SB4)[ 3] = 0xA876DE9749087E9FULL;\ +/*((u64*)VPERM_SBO)[ 0] = 0xCFE474A55FBB6A00ULL;\ + ((u64*)VPERM_SBO)[ 1] = 0x8E1E90D1412B35FAULL;\ + ((u64*)VPERM_SBO)[ 2] = 0xD0D26D176FBDC700ULL;\ + ((u64*)VPERM_SBO)[ 3] = 0x15AABF7AC502A878ULL;*/\ + ((u64*)ALL_15)[ 0] = 0x1515151515151515ULL;\ + ((u64*)ALL_15)[ 1] = 0x1515151515151515ULL;\ +}/**/ + +/* VPERM + * Transform w/o settings c* + * transforms 2 rows to/from "vperm mode" + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0, a1 = 2 rows + * table = transformation table to use + * t*, c* = clobbers + * outputs: + * a0, a1 = 2 rows transformed with table + * */ +#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\ + asm ("movdqa xmm"tostr(t1)", xmm"tostr(c0)"");\ + asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\ + asm ("pandn xmm"tostr(t1)", xmm"tostr(a1)"");\ + asm ("psrld xmm"tostr(t0)", 4");\ + asm ("psrld xmm"tostr(t1)", 4");\ + asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\ + asm ("pand xmm"tostr(a1)", xmm"tostr(c0)"");\ + asm ("movdqa xmm"tostr(t2)", xmm"tostr(c2)"");\ + asm ("movdqa xmm"tostr(t3)", xmm"tostr(c2)"");\ + asm ("pshufb xmm"tostr(t2)", xmm"tostr(a0)"");\ + asm ("pshufb xmm"tostr(t3)", xmm"tostr(a1)"");\ + asm ("movdqa xmm"tostr(a0)", xmm"tostr(c1)"");\ + asm ("movdqa xmm"tostr(a1)", xmm"tostr(c1)"");\ + asm ("pshufb xmm"tostr(a0)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(a1)", xmm"tostr(t1)"");\ + asm ("pxor xmm"tostr(a0)", xmm"tostr(t2)"");\ + asm ("pxor xmm"tostr(a1)", xmm"tostr(t3)"");\ +}/**/ + +#define VPERM_Transform_Set_Const(table, c0, c1, c2){\ + asm ("movaps xmm"tostr(c0)", [ALL_0F]");\ + asm ("movaps xmm"tostr(c1)", ["tostr(table)"+0*16]");\ + asm ("movaps xmm"tostr(c2)", ["tostr(table)"+1*16]");\ +}/**/ + +/* VPERM + * Transform + * transforms 2 rows to/from "vperm mode" + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0, a1 = 2 rows + * table = transformation table to use + * t*, c* = clobbers + * outputs: + * a0, a1 = 2 rows transformed with table + * */ +#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\ + VPERM_Transform_Set_Const(table, c0, c1, c2);\ + VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ +}/**/ + +/* VPERM + * Transform State + * inputs: + * a0-a3 = state + * table = transformation table to use + * t* = clobbers + * outputs: + * a0-a3 = transformed state + * */ +#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\ + VPERM_Transform_Set_Const(table, c0, c1, c2);\ + VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ + VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\ +}/**/ + +/* VPERM + * Add Constant to State + * inputs: + * a0-a7 = state + * constant = constant to add + * t0 = clobber + * outputs: + * a0-a7 = state + constant + * */ +#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\ + asm ("movaps xmm"tostr(t0)", ["tostr(constant)"]");\ + asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a1)", 
xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a2)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a3)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a4)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a5)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a6)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(a7)", xmm"tostr(t0)"");\ +}/**/ + +/* VPERM + * Set Substitute Core Constants + * */ +#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\ + VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\ +}/**/ + +/* VPERM + * Substitute Core + * first part of sbox inverse computation + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0 = 1 row + * t*, c* = clobbers + * outputs: + * b0a, b0b = inputs for lookup step + * */ +#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\ + asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\ + asm ("psrld xmm"tostr(t0)", 4");\ + asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\ + asm ("movdqa xmm"tostr(b0a)", "tostr(c1)"");\ + asm ("pshufb xmm"tostr(b0a)", xmm"tostr(a0)"");\ + asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\ + asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\ + asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t0)"");\ + asm ("pxor xmm"tostr(b0b)", xmm"tostr(b0a)"");\ + asm ("movdqa xmm"tostr(t1)", xmm"tostr(c2)"");\ + asm ("pshufb xmm"tostr(t1)", xmm"tostr(a0)"");\ + asm ("pxor xmm"tostr(t1)", xmm"tostr(b0a)"");\ + asm ("movdqa xmm"tostr(b0a)", xmm"tostr(c2)"");\ + asm ("pshufb xmm"tostr(b0a)", xmm"tostr(b0b)"");\ + asm ("pxor xmm"tostr(b0a)", xmm"tostr(a0)"");\ + asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\ + asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t1)"");\ + asm ("pxor xmm"tostr(b0b)", xmm"tostr(t0)"");\ +}/**/ + +/* VPERM + * Lookup + * second part of sbox inverse computation + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0a, a0b = output of Substitution Core + * table = lookup table to use (*1 / *2 / *4) + * t0 = clobber + * outputs: + * b0 = output of sbox + multiplication + * */ +#define VPERM_Lookup(a0a, a0b, table, b0, t0){\ + asm ("movaps xmm"tostr(b0)", ["tostr(table)"+0*16]");\ + asm ("movaps xmm"tostr(t0)", ["tostr(table)"+1*16]");\ + asm ("pshufb xmm"tostr(b0)", xmm"tostr(a0b)"");\ + asm ("pshufb xmm"tostr(t0)", xmm"tostr(a0a)"");\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(t0)"");\ +}/**/ + +/* VPERM + * SubBytes and *2 / *4 + * this function is derived from: + * Constant-time SSSE3 AES core implementation + * by Mike Hamburg + * and + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0-a7 = state + * t*, c* = clobbers + * outputs: + * a0-a7 = state * 4 + * c2 = row0 * 2 -> b0 + * c1 = row7 * 2 -> b3 + * c0 = row7 * 1 -> b4 + * t2 = row4 * 1 -> b7 + * TEMP_MUL1 = row(i) * 1 + * TEMP_MUL2 = row(i) * 2 + * + * call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */ +#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\ + /* set Constants */\ + VPERM_Substitute_Core_Set_Const(c0, c1, c2);\ + /* row 1 */\ + VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+1*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+1*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\ + /* --- */\ + /* row 2 */\ + 
VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+2*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+2*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\ + /* --- */\ + /* row 3 */\ + VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+3*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+3*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\ + /* --- */\ + /* row 5 */\ + VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+5*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+5*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\ + /* --- */\ + /* row 6 */\ + VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+6*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+6*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\ + /* --- */\ + /* row 7 */\ + VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, xmm##c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + asm ("movaps [TEMP_MUL1+7*16], xmm"tostr(t2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\ + VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\ + /* --- */\ + /* row 4 */\ + VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + asm ("movaps [TEMP_MUL2+4*16], xmm"tostr(t3)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\ + /* --- */\ + /* row 0 */\ + VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\ + VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\ + asm ("movaps [TEMP_MUL2+0*16], xmm"tostr(c2)"");\ + VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\ + /* --- */\ +}/**/ + + +/* Optimized MixBytes + * inputs: + * a0-a7 = (row0-row7) * 4 + * b0 = row0 * 2 + * b3 = row7 * 2 + * b4 = row7 * 1 + * b7 = row4 * 1 + * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2 + * output: b0-b7 + * */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* save one value */\ + asm ("movaps [TEMP_MUL4], xmm"tostr(a3)"");\ + /* 1 */\ + asm ("movdqa xmm"tostr(b1)", xmm"tostr(a0)"");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(b4)""); /* -> helper! */\ + asm ("pxor xmm"tostr(b1)", [TEMP_MUL2+3*16]");\ + asm ("movdqa xmm"tostr(b2)", xmm"tostr(b1)"");\ + \ + /* 2 */\ + asm ("movdqa xmm"tostr(b5)", xmm"tostr(a1)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(a4)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(b7)""); /* -> helper! */\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(b3)""); /* -> helper! */\ + asm ("movdqa xmm"tostr(b6)", xmm"tostr(b5)"");\ + \ + /* 4 */\ + asm ("pxor xmm"tostr(b7)", xmm"tostr(a6)"");\ + /*asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+4*16]"); -> helper! */\ + asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+6*16]");\ + asm ("pxor xmm"tostr(b7)", [TEMP_MUL2+1*16]");\ + asm ("pxor xmm"tostr(b7)", xmm"tostr(b3)""); /* -> helper! 
*/\ + asm ("pxor xmm"tostr(b2)", xmm"tostr(b7)"");\ + \ + /* 3 */\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(a7)"");\ + asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+5*16]");\ + asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+7*16]");\ + /*asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+0*16]"); -> helper! */\ + asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+2*16]");\ + asm ("movdqa xmm"tostr(b3)", xmm"tostr(b0)"");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(b0)"");\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(b7)""); /* moved from 4 */\ + \ + /* 5 */\ + asm ("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\ + /*asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+0*16]"); -> helper! */\ + asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+2*16]");\ + asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+3*16]");\ + asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+5*16]");\ + asm ("pxor xmm"tostr(b3)", xmm"tostr(b4)"");\ + asm ("pxor xmm"tostr(b6)", xmm"tostr(b4)"");\ + \ + /* 6 */\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+1*16]");\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+3*16]");\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+4*16]");\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+6*16]");\ + asm ("pxor xmm"tostr(b4)", xmm"tostr(a3)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ + asm ("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\ + \ + /* 7 */\ + asm ("pxor xmm"tostr(a1)", [TEMP_MUL1+1*16]");\ + asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+4*16]");\ + asm ("pxor xmm"tostr(b2)", xmm"tostr(a1)"");\ + asm ("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\ + \ + /* 8 */\ + asm ("pxor xmm"tostr(a5)", [TEMP_MUL1+5*16]");\ + asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+0*16]");\ + asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\ + asm ("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\ + \ + /* 9 */\ + asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+2*16]");\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+5*16]");\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ + \ + /* 10 */\ + asm ("movaps xmm"tostr(a1)", [TEMP_MUL1+6*16]");\ + asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+1*16]");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(a1)"");\ + asm ("pxor xmm"tostr(b4)", xmm"tostr(a1)"");\ + \ + /* 11 */\ + asm ("movaps xmm"tostr(a5)", [TEMP_MUL1+3*16]");\ + asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+6*16]");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ + asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\ + \ + /* 12 */\ + asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+7*16]");\ + asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+2*16]");\ + asm ("pxor xmm"tostr(b2)", xmm"tostr(a3)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ + \ + /* 13 */\ + asm ("pxor xmm"tostr(b0)", [TEMP_MUL4]");\ + asm ("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\ + asm ("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\ + asm ("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\ + asm ("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\ + asm ("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\ + asm ("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\ + asm ("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\ +}/**/ + +//#if (LENGTH <= 256) + +#define SET_CONSTANTS(){\ + SET_SHARED_CONSTANTS();\ + ((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\ + ((u64*)SUBSH_MASK)[ 1] = 0x080f0e0d0c0b0a09ULL;\ + ((u64*)SUBSH_MASK)[ 2] = 0x0007060504030201ULL;\ + ((u64*)SUBSH_MASK)[ 3] = 0x0a09080f0e0d0c0bULL;\ + ((u64*)SUBSH_MASK)[ 4] = 0x0100070605040302ULL;\ + ((u64*)SUBSH_MASK)[ 5] = 0x0c0b0a09080f0e0dULL;\ + ((u64*)SUBSH_MASK)[ 6] = 0x0201000706050403ULL;\ + ((u64*)SUBSH_MASK)[ 7] = 0x0e0d0c0b0a09080fULL;\ + ((u64*)SUBSH_MASK)[ 8] = 0x0302010007060504ULL;\ + ((u64*)SUBSH_MASK)[ 9] = 0x0f0e0d0c0b0a0908ULL;\ + ((u64*)SUBSH_MASK)[10] = 0x0403020100070605ULL;\ + 
((u64*)SUBSH_MASK)[11] = 0x09080f0e0d0c0b0aULL;\ + ((u64*)SUBSH_MASK)[12] = 0x0504030201000706ULL;\ + ((u64*)SUBSH_MASK)[13] = 0x0b0a09080f0e0d0cULL;\ + ((u64*)SUBSH_MASK)[14] = 0x0605040302010007ULL;\ + ((u64*)SUBSH_MASK)[15] = 0x0d0c0b0a09080f0eULL;\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ + ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ + ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ + }\ + ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ + ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ +}/**/ + +#define Push_All_Regs(){\ +/* not using any... + asm("push rax");\ + asm("push rbx");\ + asm("push rcx");*/\ +}/**/ + +#define Pop_All_Regs(){\ +/* not using any... + asm("pop rcx");\ + asm("pop rbx");\ + asm("pop rax");*/\ +}/**/ + + +/* vperm: + * transformation before rounds with ipt + * first round add transformed constant + * middle rounds: add constant XOR 0x15...15 + * last round: additionally add 0x15...15 after MB + * transformation after rounds with opt + */ +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant + ShiftBytes (interleaved) */\ + asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ + asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ + asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\ + asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ + asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ + asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\ + asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ + asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ + asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\ + asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ + asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ + asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\ + asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ + asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ + asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ + /* SubBytes + Multiplication by 2 and 4 */\ + VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +}/**/ + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ + ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ + ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ + VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ +} + + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: 
i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ +\ + asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ +\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ +\ + asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\ + asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\ +\ + asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ + asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ + asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ + asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ +\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\ +\ + asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ + asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\ + asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\ + asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ + asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\ + asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\ + asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ + asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\ + asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\ + asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\ + asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\ + asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\ + asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\ + asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\ + asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ + asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ + asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 
512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\ + asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ + asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\ + asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\ + asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\ + asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ + asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ + asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ + asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ +}/**/ + + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst_CNT2(i, j){\ + asm ("movaps xmm0, [ROUND_CONST_L0+"tostr(i)"*16]");\ + asm ("movaps xmm1, [ROUND_CONST_L7+"tostr(i)"*16]");\ + asm ("movaps xmm2, [ROUND_CONST_L0+"tostr(j)"*16]");\ + asm ("movaps xmm3, [ROUND_CONST_L7+"tostr(j)"*16]");\ + VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ + asm ("pxor xmm0, [ALL_15]");\ + asm ("pxor xmm1, [ALL_15]");\ + asm ("pxor xmm2, [ALL_15]");\ + asm ("pxor xmm3, [ALL_15]");\ + asm ("movaps [ROUND_CONST_L0+"tostr(i)"*16], xmm0");\ + asm ("movaps [ROUND_CONST_L7+"tostr(i)"*16], xmm1");\ + asm ("movaps [ROUND_CONST_L0+"tostr(j)"*16], xmm2");\ + asm ("movaps [ROUND_CONST_L7+"tostr(j)"*16], xmm3");\ +}/**/ + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst(){\ + asm ("movaps xmm0, [ROUND_CONST_Lx]");\ + VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ + asm ("pxor xmm0, [ALL_15]");\ + asm ("movaps [ROUND_CONST_Lx], xmm0");\ + VPERM_Transform_RoundConst_CNT2(0, 1);\ + VPERM_Transform_RoundConst_CNT2(2, 3);\ + VPERM_Transform_RoundConst_CNT2(4, 5);\ + VPERM_Transform_RoundConst_CNT2(6, 7);\ + VPERM_Transform_RoundConst_CNT2(8, 9);\ +}/**/ + +void INIT256(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + asm volatile ("emms"); + + /* transform round constants into VPERM mode */ + VPERM_Transform_RoundConst(); + + /* load IV into registers xmm12 - xmm15 */ + asm ("movaps xmm12, [rdi+0*16]"); + asm ("movaps xmm13, [rdi+1*16]"); + asm ("movaps xmm14, [rdi+2*16]"); + asm ("movaps xmm15, [rdi+3*16]"); + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* store transposed IV */ + asm ("movaps [rdi+0*16], xmm12"); + asm ("movaps [rdi+1*16], xmm2"); + asm ("movaps [rdi+2*16], xmm6"); + asm ("movaps [rdi+3*16], xmm7"); + + asm volatile ("emms"); + asm 
(".att_syntax noprefix"); +} + +void TF512(u64* h, u64* m) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + /* message M in rsi */ + +#ifdef IACA_TRACE + IACA_START; +#endif + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load message into registers xmm12 - xmm15 (Q = message) */ + asm ("movaps xmm12, [rsi+0*16]"); + asm ("movaps xmm13, [rsi+1*16]"); + asm ("movaps xmm14, [rsi+2*16]"); + asm ("movaps xmm15, [rsi+3*16]"); + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); + Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm0, [rdi+1*16]"); + asm ("movaps xmm4, [rdi+2*16]"); + asm ("movaps xmm5, [rdi+3*16]"); + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + asm ("pxor xmm8, xmm12"); + asm ("pxor xmm0, xmm2"); + asm ("pxor xmm4, xmm6"); + asm ("pxor xmm5, xmm7"); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + asm ("pxor xmm0, xmm8"); + asm ("pxor xmm1, xmm10"); + asm ("pxor xmm2, xmm12"); + asm ("pxor xmm3, xmm14"); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + asm ("pxor xmm0, [rdi+0*16]"); + asm ("pxor xmm1, [rdi+1*16]"); + asm ("pxor xmm2, [rdi+2*16]"); + asm ("pxor xmm3, [rdi+3*16]"); + + /* store CV */ + asm ("movaps [rdi+0*16], xmm0"); + asm ("movaps [rdi+1*16], xmm1"); + asm ("movaps [rdi+2*16], xmm2"); + asm ("movaps [rdi+3*16], xmm3"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + +#ifdef IACA_TRACE + IACA_END; +#endif + + return; +} + +void OF512(u64* h) +{ + /* __cdecl calling convention: */ + /* chaining value CV in rdi */ + + asm (".intel_syntax noprefix"); + Push_All_Regs(); + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + asm ("movaps xmm8, [rdi+0*16]"); + asm ("movaps xmm10, [rdi+1*16]"); + asm ("movaps xmm12, [rdi+2*16]"); + asm ("movaps xmm14, [rdi+3*16]"); + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + asm ("pxor xmm8, [rdi+0*16]"); + asm ("pxor xmm10, [rdi+1*16]"); + asm ("pxor xmm12, [rdi+2*16]"); + asm ("pxor xmm14, [rdi+3*16]"); + + /* transform state back from row ordering into column ordering */ 
+ /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); + VPERM_Transform(9, 11, VPERM_OPT, 0, 1, 2, 3, 5, 6, 7); + + /* we only need to return the truncated half of the state */ + asm ("movaps [rdi+2*16], xmm9"); + asm ("movaps [rdi+3*16], xmm11"); + + Pop_All_Regs(); + asm (".att_syntax noprefix"); + + return; +} + + diff --git a/algo/aes_ni/groestl256-intr-aes.h b/algo/aes_ni/groestl256-intr-aes.h new file mode 100644 index 000000000..9ef6e1bc4 --- /dev/null +++ b/algo/aes_ni/groestl256-intr-aes.h @@ -0,0 +1,496 @@ +/* groestl-intr-aes.h Aug 2011 + * + * Groestl implementation with intrinsics using ssse3, sse4.1, and aes + * instructions. + * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz + * + * This code is placed in the public domain + */ + +#include +#include +#include "hash-groestl256.h" + +/* global constants */ +__m128i ROUND_CONST_Lx; +__m128i ROUND_CONST_L0[ROUNDS512]; +__m128i ROUND_CONST_L7[ROUNDS512]; +__m128i ROUND_CONST_P[ROUNDS1024]; +__m128i ROUND_CONST_Q[ROUNDS1024]; +__m128i TRANSP_MASK; +__m128i SUBSH_MASK[8]; +__m128i ALL_1B; +__m128i ALL_FF; + + +#define tos(a) #a +#define tostr(a) tos(a) + + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b */ +#define MUL2(i, j, k){\ + j = _mm_xor_si128(j, j);\ + j = _mm_cmpgt_epi8(j, i);\ + i = _mm_add_epi8(i, i);\ + j = _mm_and_si128(j, k);\ + i = _mm_xor_si128(i, j);\ +} + + /**/ + +/* Yet another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + b6 = a0;\ + b7 = a1;\ + a0 = _mm_xor_si128(a0, a1);\ + b0 = a2;\ + a1 = _mm_xor_si128(a1, a2);\ + b1 = a3;\ + a2 = _mm_xor_si128(a2, a3);\ + b2 = a4;\ + a3 = _mm_xor_si128(a3, a4);\ + b3 = a5;\ + a4 = _mm_xor_si128(a4, a5);\ + b4 = a6;\ + a5 = _mm_xor_si128(a5, a6);\ + b5 = a7;\ + a6 = _mm_xor_si128(a6, a7);\ + a7 = _mm_xor_si128(a7, b6);\ + \ + /* build y4 y5 y6 ... 
in regs xmm8, xmm9, xmm10 by adding t_i*/\ + b0 = _mm_xor_si128(b0, a4);\ + b6 = _mm_xor_si128(b6, a4);\ + b1 = _mm_xor_si128(b1, a5);\ + b7 = _mm_xor_si128(b7, a5);\ + b2 = _mm_xor_si128(b2, a6);\ + b0 = _mm_xor_si128(b0, a6);\ + /* spill values y_4, y_5 to memory */\ + TEMP0 = b0;\ + b3 = _mm_xor_si128(b3, a7);\ + b1 = _mm_xor_si128(b1, a7);\ + TEMP1 = b1;\ + b4 = _mm_xor_si128(b4, a0);\ + b2 = _mm_xor_si128(b2, a0);\ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b5 = _mm_xor_si128(b5, a1);\ + b3 = _mm_xor_si128(b3, a1);\ + b1 = a1;\ + b6 = _mm_xor_si128(b6, a2);\ + b4 = _mm_xor_si128(b4, a2);\ + TEMP2 = a2;\ + b7 = _mm_xor_si128(b7, a3);\ + b5 = _mm_xor_si128(b5, a3);\ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm_xor_si128(a0, a3);\ + a1 = _mm_xor_si128(a1, a4);\ + a2 = _mm_xor_si128(a2, a5);\ + a3 = _mm_xor_si128(a3, a6);\ + a4 = _mm_xor_si128(a4, a7);\ + a5 = _mm_xor_si128(a5, b0);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, TEMP2);\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = ALL_1B;\ + MUL2(a0, b0, b1);\ + a0 = _mm_xor_si128(a0, TEMP0);\ + MUL2(a1, b0, b1);\ + a1 = _mm_xor_si128(a1, TEMP1);\ + MUL2(a2, b0, b1);\ + a2 = _mm_xor_si128(a2, b2);\ + MUL2(a3, b0, b1);\ + a3 = _mm_xor_si128(a3, b3);\ + MUL2(a4, b0, b1);\ + a4 = _mm_xor_si128(a4, b4);\ + MUL2(a5, b0, b1);\ + a5 = _mm_xor_si128(a5, b5);\ + MUL2(a6, b0, b1);\ + a6 = _mm_xor_si128(a6, b6);\ + MUL2(a7, b0, b1);\ + a7 = _mm_xor_si128(a7, b7);\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... */\ + MUL2(a0, b0, b1);\ + b5 = _mm_xor_si128(b5, a0);\ + MUL2(a1, b0, b1);\ + b6 = _mm_xor_si128(b6, a1);\ + MUL2(a2, b0, b1);\ + b7 = _mm_xor_si128(b7, a2);\ + MUL2(a5, b0, b1);\ + b2 = _mm_xor_si128(b2, a5);\ + MUL2(a6, b0, b1);\ + b3 = _mm_xor_si128(b3, a6);\ + MUL2(a7, b0, b1);\ + b4 = _mm_xor_si128(b4, a7);\ + MUL2(a3, b0, b1);\ + MUL2(a4, b0, b1);\ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm_xor_si128(b0, a3);\ + b1 = _mm_xor_si128(b1, a4);\ +}/*MixBytes*/ + +#define SET_CONSTANTS(){\ + ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\ + SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\ + SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\ + SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\ + SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\ + SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\ + SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\ + SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ + }\ + ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ +}while(0); \ + +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant */\ + b1 = ROUND_CONST_Lx;\ + a0 = _mm_xor_si128(a0, 
(ROUND_CONST_L0[i]));\ + a1 = _mm_xor_si128(a1, b1);\ + a2 = _mm_xor_si128(a2, b1);\ + a3 = _mm_xor_si128(a3, b1);\ + a4 = _mm_xor_si128(a4, b1);\ + a5 = _mm_xor_si128(a5, b1);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ + \ + /* ShiftBytes + SubBytes (interleaved) */\ + b0 = _mm_xor_si128(b0, b0);\ + a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ + a0 = _mm_aesenclast_si128(a0, b0);\ + a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ + a1 = _mm_aesenclast_si128(a1, b0);\ + a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ + a2 = _mm_aesenclast_si128(a2, b0);\ + a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ + a3 = _mm_aesenclast_si128(a3, b0);\ + a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ + a4 = _mm_aesenclast_si128(a4, b0);\ + a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ + a5 = _mm_aesenclast_si128(a5, b0);\ + a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ + a6 = _mm_aesenclast_si128(a6, b0);\ + a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ + a7 = _mm_aesenclast_si128(a7, b0);\ + \ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ +} + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + t0 = TRANSP_MASK;\ + \ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ + \ + o1 = i0;\ + t0 = i2;\ + \ + i0 = _mm_unpacklo_epi16(i0, i1);\ + o1 = _mm_unpackhi_epi16(o1, i1);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + t0 = _mm_unpackhi_epi16(t0, i3);\ + \ + i0 = _mm_shuffle_epi32(i0, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + t0 = _mm_shuffle_epi32(t0, 216);\ + \ + o2 = i0;\ + o3 = o1;\ + \ + i0 = _mm_unpacklo_epi32(i0, i2);\ + o1 = _mm_unpacklo_epi32(o1, t0);\ + o2 = _mm_unpackhi_epi32(o2, i2);\ + o3 = _mm_unpackhi_epi32(o3, t0);\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + 
* * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + o1 = i0;\ + o2 = i1;\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + o1 = _mm_unpackhi_epi64(o1, i4);\ + o3 = i1;\ + o4 = i2;\ + o2 = _mm_unpacklo_epi64(o2, i5);\ + o3 = _mm_unpackhi_epi64(o3, i5);\ + o5 = i2;\ + o6 = i3;\ + o4 = _mm_unpacklo_epi64(o4, i6);\ + o5 = _mm_unpackhi_epi64(o5, i6);\ + o7 = i3;\ + o6 = _mm_unpacklo_epi64(o6, i7);\ + o7 = _mm_unpackhi_epi64(o7, i7);\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + o0 = i0;\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o0 = _mm_unpackhi_epi64(o0, i1);\ + o1 = i2;\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + o1 = _mm_unpackhi_epi64(o1, i3);\ + o2 = i4;\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + o2 = _mm_unpackhi_epi64(o2, i5);\ + o3 = i6;\ + i6 = _mm_unpacklo_epi64(i6, i7);\ + o3 = _mm_unpackhi_epi64(o3, i7);\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + t0 = _mm_xor_si128(t0, t0);\ + i1 = i0;\ + i3 = i2;\ + i5 = i4;\ + i7 = i6;\ + i0 = _mm_unpacklo_epi64(i0, t0);\ + i1 = _mm_unpackhi_epi64(i1, t0);\ + i2 = _mm_unpacklo_epi64(i2, t0);\ + i3 = _mm_unpackhi_epi64(i3, t0);\ + i4 = _mm_unpacklo_epi64(i4, t0);\ + i5 = _mm_unpackhi_epi64(i5, t0);\ + i6 = _mm_unpacklo_epi64(i6, t0);\ + i7 = _mm_unpackhi_epi64(i7, t0);\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ +}/**/ + + +void INIT256(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7; + static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm12 - xmm15 */ + xmm12 = chaining[0]; + xmm13 = chaining[1]; + xmm14 = chaining[2]; + xmm15 = chaining[3]; + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* store transposed IV */ + chaining[0] = xmm12; + chaining[1] = xmm2; + chaining[2] = xmm6; + chaining[3] = xmm7; +} + +void TF512(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm12 - xmm15 */ + xmm12 = message[0]; + xmm13 = message[1]; + xmm14 = message[2]; + xmm15 = 
message[3]; + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + xmm8 = chaining[0]; + xmm0 = chaining[1]; + xmm4 = chaining[2]; + xmm5 = chaining[3]; + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + xmm8 = _mm_xor_si128(xmm8, xmm12); + xmm0 = _mm_xor_si128(xmm0, xmm2); + xmm4 = _mm_xor_si128(xmm4, xmm6); + xmm5 = _mm_xor_si128(xmm5, xmm7); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, xmm8); + xmm1 = _mm_xor_si128(xmm1, xmm10); + xmm2 = _mm_xor_si128(xmm2, xmm12); + xmm3 = _mm_xor_si128(xmm3, xmm14); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, (chaining[0])); + xmm1 = _mm_xor_si128(xmm1, (chaining[1])); + xmm2 = _mm_xor_si128(xmm2, (chaining[2])); + xmm3 = _mm_xor_si128(xmm3, (chaining[3])); + + /* store CV */ + chaining[0] = xmm0; + chaining[1] = xmm1; + chaining[2] = xmm2; + chaining[3] = xmm3; + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF512(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + xmm8 = chaining[0]; + xmm10 = chaining[1]; + xmm12 = chaining[2]; + xmm14 = chaining[3]; + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm10 = _mm_xor_si128(xmm10, (chaining[1])); + xmm12 = _mm_xor_si128(xmm12, (chaining[2])); + xmm14 = _mm_xor_si128(xmm14, (chaining[3])); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); + + /* we only need to return the truncated half of the state */ + chaining[2] = xmm9; + chaining[3] = xmm11; +} + + diff --git 
a/algo/aes_ni/groestl256-intr-avx.h b/algo/aes_ni/groestl256-intr-avx.h new file mode 100644 index 000000000..3eb8397d9 --- /dev/null +++ b/algo/aes_ni/groestl256-intr-avx.h @@ -0,0 +1,482 @@ +/* groestl-intr-avx.h Aug 2011 + * + * Groestl implementation with intrinsics using ssse3, sse4.1, aes and avx + * instructions. + * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz + * + * This code is placed in the public domain + */ + +#include +#include +#include +#include "hash-groestl256.h" + +/* global constants */ +__m128i ROUND_CONST_Lx; +__m128i ROUND_CONST_L0[ROUNDS512]; +__m128i ROUND_CONST_L7[ROUNDS512]; +__m128i ROUND_CONST_P[ROUNDS1024]; +__m128i ROUND_CONST_Q[ROUNDS1024]; +__m128i TRANSP_MASK; +__m128i SUBSH_MASK[8]; +__m128i ALL_FF; +//#if LENGTH <= 256 +__m128i ALL_1B; +//#else +//__m256d ALL_1B; +//#endif + +#define tos(a) #a +#define tostr(a) tos(a) + +#define insert_m128i_in_m256d(ymm, xmm, pos) (_mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castpd_si256(ymm), xmm, pos))) +#define extract_m128i_from_m256d(ymm, pos) (_mm256_extractf128_si256(_mm256_castpd_si256(ymm), pos)) + +#define SET_CONSTANTS(){\ + ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\ + SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\ + SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\ + SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\ + SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\ + SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\ + SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\ + SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ + }\ + ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ +}while(0); + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b + * xmm[z] has to be zero */ +#define VMUL2(i, j, k, z){\ + j = _mm_cmpgt_epi8(z, i);\ + i = _mm_add_epi8(i, i);\ + j = _mm_and_si128(j, k);\ + i = _mm_xor_si128(i, j);\ +}/**/ + +/* Yet another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* xmm"tostr(8..xmm"tostr(15 = a2 a3... 
a0 a1 */\ + b0 = a2;\ + b1 = a3;\ + b2 = a4;\ + b3 = a5;\ + b4 = a6;\ + b5 = a7;\ + b6 = a0;\ + b7 = a1;\ + \ + /* t_i = a_i + a_{i+1} */\ + a0 = _mm_xor_si128(a0, a1);\ + a1 = _mm_xor_si128(a1, a2);\ + a2 = _mm_xor_si128(a2, a3);\ + a3 = _mm_xor_si128(a3, a4);\ + a4 = _mm_xor_si128(a4, a5);\ + a5 = _mm_xor_si128(a5, a6);\ + a6 = _mm_xor_si128(a6, a7);\ + a7 = _mm_xor_si128(a7, b6);\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + b0 = _mm_xor_si128(b0, a4);\ + b1 = _mm_xor_si128(b1, a5);\ + b2 = _mm_xor_si128(b2, a6);\ + b3 = _mm_xor_si128(b3, a7);\ + b4 = _mm_xor_si128(b4, a0);\ + b5 = _mm_xor_si128(b5, a1);\ + b6 = _mm_xor_si128(b6, a2);\ + b7 = _mm_xor_si128(b7, a3);\ + \ + b0 = _mm_xor_si128(b0, a6);\ + b1 = _mm_xor_si128(b1, a7);\ + b2 = _mm_xor_si128(b2, a0);\ + b3 = _mm_xor_si128(b3, a1);\ + b4 = _mm_xor_si128(b4, a2);\ + b5 = _mm_xor_si128(b5, a3);\ + b6 = _mm_xor_si128(b6, a4);\ + b7 = _mm_xor_si128(b7, a5);\ + \ + /* spill values y_4, y_5 to memory */\ + TEMP0 = b0;\ + TEMP1 = b1;\ + TEMP2 = b2;\ + \ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b1 = a1;\ + TEMP3 = a2;\ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm_xor_si128(a0, a3);\ + a1 = _mm_xor_si128(a1, a4);\ + a2 = _mm_xor_si128(a2, a5);\ + a3 = _mm_xor_si128(a3, a6);\ + a4 = _mm_xor_si128(a4, a7);\ + a5 = _mm_xor_si128(a5, b0);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, TEMP3);\ + \ + /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + b1 = ALL_1B;\ + b2 = _mm_xor_si128(b2, b2);\ + VMUL2(a7, b0, b1, b2);\ + VMUL2(a6, b0, b1, b2);\ + VMUL2(a5, b0, b1, b2);\ + VMUL2(a4, b0, b1, b2);\ + VMUL2(a3, b0, b1, b2);\ + VMUL2(a2, b0, b1, b2);\ + VMUL2(a1, b0, b1, b2);\ + VMUL2(a0, b0, b1, b2);\ + \ + /* compute w_i : add y_{i+4} */\ + a0 = _mm_xor_si128(a0, TEMP0);\ + a1 = _mm_xor_si128(a1, TEMP1);\ + a2 = _mm_xor_si128(a2, TEMP2);\ + a3 = _mm_xor_si128(a3, b3);\ + a4 = _mm_xor_si128(a4, b4);\ + a5 = _mm_xor_si128(a5, b5);\ + a6 = _mm_xor_si128(a6, b6);\ + a7 = _mm_xor_si128(a7, b7);\ + \ + /*compute v_i: double w_i */\ + VMUL2(a0, b0, b1, b2);\ + VMUL2(a1, b0, b1, b2);\ + VMUL2(a2, b0, b1, b2);\ + VMUL2(a3, b0, b1, b2);\ + VMUL2(a4, b0, b1, b2);\ + VMUL2(a5, b0, b1, b2);\ + VMUL2(a6, b0, b1, b2);\ + VMUL2(a7, b0, b1, b2);\ + \ + /* add to y_4 y_5 .. v3, v4, ... 
*/\ + b0 = _mm_xor_si128(a3, TEMP0);\ + b1 = _mm_xor_si128(a4, TEMP1);\ + b2 = _mm_xor_si128(a5, TEMP2);\ + b3 = _mm_xor_si128(b3, a6);\ + b4 = _mm_xor_si128(b4, a7);\ + b5 = _mm_xor_si128(b5, a0);\ + b6 = _mm_xor_si128(b6, a1);\ + b7 = _mm_xor_si128(b7, a2);\ +}/*MixBytes*/ + +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* Add Round Constant */\ + b1 = ROUND_CONST_Lx;\ + a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ + a1 = _mm_xor_si128(a1, b1);\ + a2 = _mm_xor_si128(a2, b1);\ + a3 = _mm_xor_si128(a3, b1);\ + a4 = _mm_xor_si128(a4, b1);\ + a5 = _mm_xor_si128(a5, b1);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ + \ + /* ShiftBytes + SubBytes (interleaved) */\ + b0 = _mm_xor_si128(b0, b0);\ + a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ + a0 = _mm_aesenclast_si128(a0, b0);\ + a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ + a1 = _mm_aesenclast_si128(a1, b0);\ + a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ + a2 = _mm_aesenclast_si128(a2, b0);\ + a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ + a3 = _mm_aesenclast_si128(a3, b0);\ + a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ + a4 = _mm_aesenclast_si128(a4, b0);\ + a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ + a5 = _mm_aesenclast_si128(a5, b0);\ + a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ + a6 = _mm_aesenclast_si128(a6, b0);\ + a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ + a7 = _mm_aesenclast_si128(a7, b0);\ + \ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ +} + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + t0 = TRANSP_MASK;\ + \ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ + \ + o1 = _mm_unpackhi_epi16(i0, i1);\ + i0 = _mm_unpacklo_epi16(i0, i1);\ + t0 = _mm_unpackhi_epi16(i2, i3);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + \ + i0 = _mm_shuffle_epi32(i0, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + t0 
= _mm_shuffle_epi32(t0, 216);\ + \ + o2 = _mm_unpackhi_epi32(i0, i2);\ + o3 = _mm_unpackhi_epi32(o1, t0);\ + i0 = _mm_unpacklo_epi32(i0, i2);\ + o1 = _mm_unpacklo_epi32(o1, t0);\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + o1 = _mm_unpackhi_epi64(i0, i4);\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + o2 = _mm_unpacklo_epi64(i1, i5);\ + o3 = _mm_unpackhi_epi64(i1, i5);\ + o4 = _mm_unpacklo_epi64(i2, i6);\ + o5 = _mm_unpackhi_epi64(i2, i6);\ + o6 = _mm_unpacklo_epi64(i3, i7);\ + o7 = _mm_unpackhi_epi64(i3, i7);\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + o0 = _mm_unpackhi_epi64(i0, i1);\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o1 = _mm_unpackhi_epi64(i2, i3);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + o2 = _mm_unpackhi_epi64(i4, i5);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + o3 = _mm_unpackhi_epi64(i6, i7);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + t0 = _mm_xor_si128(t0, t0);\ + i1 = _mm_unpackhi_epi64(i0, t0);\ + i0 = _mm_unpacklo_epi64(i0, t0);\ + i3 = _mm_unpackhi_epi64(i2, t0);\ + i2 = _mm_unpacklo_epi64(i2, t0);\ + i5 = _mm_unpackhi_epi64(i4, t0);\ + i4 = _mm_unpacklo_epi64(i4, t0);\ + i7 = _mm_unpackhi_epi64(i6, t0);\ + i6 = _mm_unpacklo_epi64(i6, t0);\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ +}/**/ + + +void INIT256(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7; + static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm12 - xmm15 */ + xmm12 = chaining[0]; + xmm13 = chaining[1]; + xmm14 = chaining[2]; + xmm15 = chaining[3]; + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* store transposed IV */ + chaining[0] = xmm12; + chaining[1] = xmm2; + chaining[2] = xmm6; + chaining[3] = xmm7; +} + +void TF512(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, 
xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + static __m128i TEMP3; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm12 - xmm15 */ + xmm12 = message[0]; + xmm13 = message[1]; + xmm14 = message[2]; + xmm15 = message[3]; + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* load previous chaining value and xor message to CV to get input of P */ + /* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + xmm8 = _mm_xor_si128(xmm12, chaining[0]); + xmm0 = _mm_xor_si128(xmm2, chaining[1]); + xmm4 = _mm_xor_si128(xmm6, chaining[2]); + xmm5 = _mm_xor_si128(xmm7, chaining[3]); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, xmm8); + xmm1 = _mm_xor_si128(xmm1, xmm10); + xmm2 = _mm_xor_si128(xmm2, xmm12); + xmm3 = _mm_xor_si128(xmm3, xmm14); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, chaining[0]); + xmm1 = _mm_xor_si128(xmm1, chaining[1]); + xmm2 = _mm_xor_si128(xmm2, chaining[2]); + xmm3 = _mm_xor_si128(xmm3, chaining[3]); + + /* store CV */ + chaining[0] = xmm0; + chaining[1] = xmm1; + chaining[2] = xmm2; + chaining[3] = xmm3; + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF512(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + static __m128i TEMP3; + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + xmm8 = chaining[0]; + xmm10 = chaining[1]; + xmm12 = chaining[2]; + xmm14 = chaining[3]; + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm10 = _mm_xor_si128(xmm10, (chaining[1])); + xmm12 = _mm_xor_si128(xmm12, (chaining[2])); + xmm14 = _mm_xor_si128(xmm14, (chaining[3])); + + /* transform state back from row ordering into column ordering */ + /* result: final 
hash value in xmm9, xmm11 */ + Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); + + /* we only need to return the truncated half of the state */ + chaining[2] = xmm9; + chaining[3] = xmm11; +} + + diff --git a/algo/aes_ni/groestl256-intr-vperm.h b/algo/aes_ni/groestl256-intr-vperm.h new file mode 100644 index 000000000..f6baa17e6 --- /dev/null +++ b/algo/aes_ni/groestl256-intr-vperm.h @@ -0,0 +1,793 @@ +/* groestl-intr-vperm.h Aug 2011 + * + * Groestl implementation with intrinsics using ssse3 instructions. + * Author: Günther A. Roland, Martin Schläffer + * + * Based on the vperm and aes_ni implementations of the hash function Groestl + * by Cagdas Calik http://www.metu.edu.tr/~ccalik/ + * Institute of Applied Mathematics, Middle East Technical University, Turkey + * + * This code is placed in the public domain + */ + +#include +#include "hash-groestl256.h" + +/* global constants */ +__m128i ROUND_CONST_Lx; +__m128i ROUND_CONST_L0[ROUNDS512]; +__m128i ROUND_CONST_L7[ROUNDS512]; +__m128i ROUND_CONST_P[ROUNDS1024]; +__m128i ROUND_CONST_Q[ROUNDS1024]; +__m128i TRANSP_MASK; +__m128i SUBSH_MASK[8]; +__m128i ALL_0F; +__m128i ALL_15; +__m128i ALL_1B; +__m128i ALL_63; +__m128i ALL_FF; +__m128i VPERM_IPT[2]; +__m128i VPERM_OPT[2]; +__m128i VPERM_INV[2]; +__m128i VPERM_SB1[2]; +__m128i VPERM_SB2[2]; +__m128i VPERM_SB4[2]; +__m128i VPERM_SBO[2]; + + +#define tos(a) #a +#define tostr(a) tos(a) + +#define SET_SHARED_CONSTANTS(){\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + ALL_63 = _mm_set_epi32(0x63636363, 0x63636363, 0x63636363, 0x63636363);\ + ALL_0F = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);\ + ALL_15 = _mm_set_epi32(0x15151515, 0x15151515, 0x15151515, 0x15151515);\ + VPERM_IPT[0] = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);\ + VPERM_IPT[1] = _mm_set_epi32(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);\ + VPERM_OPT[0] = _mm_set_epi32(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);\ + VPERM_OPT[1] = _mm_set_epi32(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);\ + VPERM_INV[0] = _mm_set_epi32(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);\ + VPERM_INV[1] = _mm_set_epi32(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);\ + VPERM_SB1[0] = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);\ + VPERM_SB1[1] = _mm_set_epi32(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);\ + VPERM_SB2[0] = _mm_set_epi32(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);\ + VPERM_SB2[1] = _mm_set_epi32(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);\ + VPERM_SB4[0] = _mm_set_epi32(0xBA44FE79, 0x876D2914, 0x3D50AED7, 0xC393EA00);\ + VPERM_SB4[1] = _mm_set_epi32(0xA876DE97, 0x49087E9F, 0xE1E937A0, 0x3FD64100);\ +}/**/ + +/* VPERM + * Transform w/o settings c* + * transforms 2 rows to/from "vperm mode" + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0, a1 = 2 rows + * table = transformation table to use + * t*, c* = clobbers + * outputs: + * a0, a1 = 2 rows transformed with table + * */ +#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\ + t0 = c0;\ + t1 = c0;\ + t0 = _mm_andnot_si128(t0, a0);\ + t1 = _mm_andnot_si128(t1, a1);\ + t0 = _mm_srli_epi32(t0, 4);\ + t1 = _mm_srli_epi32(t1, 4);\ + a0 = _mm_and_si128(a0, c0);\ + a1 = _mm_and_si128(a1, c0);\ + t2 = c2;\ + t3 = c2;\ + t2 = _mm_shuffle_epi8(t2, a0);\ + t3 = 
_mm_shuffle_epi8(t3, a1);\ + a0 = c1;\ + a1 = c1;\ + a0 = _mm_shuffle_epi8(a0, t0);\ + a1 = _mm_shuffle_epi8(a1, t1);\ + a0 = _mm_xor_si128(a0, t2);\ + a1 = _mm_xor_si128(a1, t3);\ +}/**/ + +#define VPERM_Transform_Set_Const(table, c0, c1, c2){\ + c0 = ALL_0F;\ + c1 = ((__m128i*) table )[0];\ + c2 = ((__m128i*) table )[1];\ +}/**/ + +/* VPERM + * Transform + * transforms 2 rows to/from "vperm mode" + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0, a1 = 2 rows + * table = transformation table to use + * t*, c* = clobbers + * outputs: + * a0, a1 = 2 rows transformed with table + * */ +#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\ + VPERM_Transform_Set_Const(table, c0, c1, c2);\ + VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ +}/**/ + +/* VPERM + * Transform State + * inputs: + * a0-a3 = state + * table = transformation table to use + * t* = clobbers + * outputs: + * a0-a3 = transformed state + * */ +#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\ + VPERM_Transform_Set_Const(table, c0, c1, c2);\ + VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ + VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\ +}/**/ + +/* VPERM + * Add Constant to State + * inputs: + * a0-a7 = state + * constant = constant to add + * t0 = clobber + * outputs: + * a0-a7 = state + constant + * */ +#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\ + t0 = constant;\ + a0 = _mm_xor_si128(a0, t0);\ + a1 = _mm_xor_si128(a1, t0);\ + a2 = _mm_xor_si128(a2, t0);\ + a3 = _mm_xor_si128(a3, t0);\ + a4 = _mm_xor_si128(a4, t0);\ + a5 = _mm_xor_si128(a5, t0);\ + a6 = _mm_xor_si128(a6, t0);\ + a7 = _mm_xor_si128(a7, t0);\ +}/**/ + +/* VPERM + * Set Substitute Core Constants + * */ +#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\ + VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\ +}/**/ + +/* VPERM + * Substitute Core + * first part of sbox inverse computation + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0 = 1 row + * t*, c* = clobbers + * outputs: + * b0a, b0b = inputs for lookup step + * */ +#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\ + t0 = c0;\ + t0 = _mm_andnot_si128(t0, a0);\ + t0 = _mm_srli_epi32(t0, 4);\ + a0 = _mm_and_si128(a0, c0);\ + b0a = c1;\ + b0a = _mm_shuffle_epi8(b0a, a0);\ + a0 = _mm_xor_si128(a0, t0);\ + b0b = c2;\ + b0b = _mm_shuffle_epi8(b0b, t0);\ + b0b = _mm_xor_si128(b0b, b0a);\ + t1 = c2;\ + t1 = _mm_shuffle_epi8(t1, a0);\ + t1 = _mm_xor_si128(t1, b0a);\ + b0a = c2;\ + b0a = _mm_shuffle_epi8(b0a, b0b);\ + b0a = _mm_xor_si128(b0a, a0);\ + b0b = c2;\ + b0b = _mm_shuffle_epi8(b0b, t1);\ + b0b = _mm_xor_si128(b0b, t0);\ +}/**/ + +/* VPERM + * Lookup + * second part of sbox inverse computation + * this function is derived from: + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0a, a0b = output of Substitution Core + * table = lookup table to use (*1 / *2 / *4) + * t0 = clobber + * outputs: + * b0 = output of sbox + multiplication + * */ +#define VPERM_Lookup(a0a, a0b, table, b0, t0){\ + b0 = ((__m128i*) table )[0];\ + t0 = ((__m128i*) table )[1];\ + b0 = _mm_shuffle_epi8(b0, a0b);\ + t0 = _mm_shuffle_epi8(t0, a0a);\ + b0 = _mm_xor_si128(b0, t0);\ +}/**/ + +/* VPERM + * SubBytes and *2 / *4 + * this function is derived from: + * Constant-time SSSE3 
AES core implementation + * by Mike Hamburg + * and + * vperm and aes_ni implementations of hash function Grostl + * by Cagdas CALIK + * inputs: + * a0-a7 = state + * t*, c* = clobbers + * outputs: + * a0-a7 = state * 4 + * c2 = row0 * 2 -> b0 + * c1 = row7 * 2 -> b3 + * c0 = row7 * 1 -> b4 + * t2 = row4 * 1 -> b7 + * TEMP_MUL1 = row(i) * 1 + * TEMP_MUL2 = row(i) * 2 + * + * call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */ +#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\ + /* set Constants */\ + VPERM_Substitute_Core_Set_Const(c0, c1, c2);\ + /* row 1 */\ + VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[1] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[1] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\ + /* --- */\ + /* row 2 */\ + VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[2] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[2] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\ + /* --- */\ + /* row 3 */\ + VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[3] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[3] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\ + /* --- */\ + /* row 5 */\ + VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[5] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[5] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\ + /* --- */\ + /* row 6 */\ + VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[6] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[6] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\ + /* --- */\ + /* row 7 */\ + VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, c1, c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ + TEMP_MUL1[7] = t2;\ + VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\ + VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\ + /* --- */\ + /* row 4 */\ + VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\ + VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ + TEMP_MUL2[4] = t3;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\ + /* --- */\ + /* row 0 */\ + VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\ + VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\ + VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\ + TEMP_MUL2[0] = c2;\ + VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\ + /* --- */\ +}/**/ + + +/* Optimized MixBytes + * inputs: + * a0-a7 = (row0-row7) * 4 + * b0 = row0 * 2 + * b3 = row7 * 2 + * b4 = row7 * 1 + * b7 = row4 * 1 + * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2 + * output: b0-b7 + * */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* save one value */\ + TEMP_MUL4 = a3;\ + /* 1 */\ + b1 = a0;\ + b1 = _mm_xor_si128(b1, a5);\ + b1 = _mm_xor_si128(b1, b4); /* -> helper! */\ + b1 = _mm_xor_si128(b1, (TEMP_MUL2[3]));\ + b2 = b1;\ + \ + /* 2 */\ + b5 = a1;\ + b5 = _mm_xor_si128(b5, a4);\ + b5 = _mm_xor_si128(b5, b7); /* -> helper! */\ + b5 = _mm_xor_si128(b5, b3); /* -> helper! */\ + b6 = b5;\ + \ + /* 4 */\ + b7 = _mm_xor_si128(b7, a6);\ + /*b7 = _mm_xor_si128(b7, (TEMP_MUL1[4])); -> helper! 
*/\ + b7 = _mm_xor_si128(b7, (TEMP_MUL1[6]));\ + b7 = _mm_xor_si128(b7, (TEMP_MUL2[1]));\ + b7 = _mm_xor_si128(b7, b3); /* -> helper! */\ + b2 = _mm_xor_si128(b2, b7);\ + \ + /* 3 */\ + b0 = _mm_xor_si128(b0, a7);\ + b0 = _mm_xor_si128(b0, (TEMP_MUL1[5]));\ + b0 = _mm_xor_si128(b0, (TEMP_MUL1[7]));\ + /*b0 = _mm_xor_si128(b0, (TEMP_MUL2[0])); -> helper! */\ + b0 = _mm_xor_si128(b0, (TEMP_MUL2[2]));\ + b3 = b0;\ + b1 = _mm_xor_si128(b1, b0);\ + b0 = _mm_xor_si128(b0, b7); /* moved from 4 */\ + \ + /* 5 */\ + b4 = _mm_xor_si128(b4, a2);\ + /*b4 = _mm_xor_si128(b4, (TEMP_MUL1[0])); -> helper! */\ + b4 = _mm_xor_si128(b4, (TEMP_MUL1[2]));\ + b4 = _mm_xor_si128(b4, (TEMP_MUL2[3]));\ + b4 = _mm_xor_si128(b4, (TEMP_MUL2[5]));\ + b3 = _mm_xor_si128(b3, b4);\ + b6 = _mm_xor_si128(b6, b4);\ + \ + /* 6 */\ + a3 = _mm_xor_si128(a3, (TEMP_MUL1[1]));\ + a3 = _mm_xor_si128(a3, (TEMP_MUL1[3]));\ + a3 = _mm_xor_si128(a3, (TEMP_MUL2[4]));\ + a3 = _mm_xor_si128(a3, (TEMP_MUL2[6]));\ + b4 = _mm_xor_si128(b4, a3);\ + b5 = _mm_xor_si128(b5, a3);\ + b7 = _mm_xor_si128(b7, a3);\ + \ + /* 7 */\ + a1 = _mm_xor_si128(a1, (TEMP_MUL1[1]));\ + a1 = _mm_xor_si128(a1, (TEMP_MUL2[4]));\ + b2 = _mm_xor_si128(b2, a1);\ + b3 = _mm_xor_si128(b3, a1);\ + \ + /* 8 */\ + a5 = _mm_xor_si128(a5, (TEMP_MUL1[5]));\ + a5 = _mm_xor_si128(a5, (TEMP_MUL2[0]));\ + b6 = _mm_xor_si128(b6, a5);\ + b7 = _mm_xor_si128(b7, a5);\ + \ + /* 9 */\ + a3 = TEMP_MUL1[2];\ + a3 = _mm_xor_si128(a3, (TEMP_MUL2[5]));\ + b0 = _mm_xor_si128(b0, a3);\ + b5 = _mm_xor_si128(b5, a3);\ + \ + /* 10 */\ + a1 = TEMP_MUL1[6];\ + a1 = _mm_xor_si128(a1, (TEMP_MUL2[1]));\ + b1 = _mm_xor_si128(b1, a1);\ + b4 = _mm_xor_si128(b4, a1);\ + \ + /* 11 */\ + a5 = TEMP_MUL1[3];\ + a5 = _mm_xor_si128(a5, (TEMP_MUL2[6]));\ + b1 = _mm_xor_si128(b1, a5);\ + b6 = _mm_xor_si128(b6, a5);\ + \ + /* 12 */\ + a3 = TEMP_MUL1[7];\ + a3 = _mm_xor_si128(a3, (TEMP_MUL2[2]));\ + b2 = _mm_xor_si128(b2, a3);\ + b5 = _mm_xor_si128(b5, a3);\ + \ + /* 13 */\ + b0 = _mm_xor_si128(b0, (TEMP_MUL4));\ + b0 = _mm_xor_si128(b0, a4);\ + b1 = _mm_xor_si128(b1, a4);\ + b3 = _mm_xor_si128(b3, a6);\ + b4 = _mm_xor_si128(b4, a0);\ + b4 = _mm_xor_si128(b4, a7);\ + b5 = _mm_xor_si128(b5, a0);\ + b7 = _mm_xor_si128(b7, a2);\ +}/**/ + +#define SET_CONSTANTS(){\ + SET_SHARED_CONSTANTS();\ + SUBSH_MASK[0] = _mm_set_epi32(0x080f0e0d, 0x0c0b0a09, 0x07060504, 0x03020100);\ + SUBSH_MASK[1] = _mm_set_epi32(0x0a09080f, 0x0e0d0c0b, 0x00070605, 0x04030201);\ + SUBSH_MASK[2] = _mm_set_epi32(0x0c0b0a09, 0x080f0e0d, 0x01000706, 0x05040302);\ + SUBSH_MASK[3] = _mm_set_epi32(0x0e0d0c0b, 0x0a09080f, 0x02010007, 0x06050403);\ + SUBSH_MASK[4] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x03020100, 0x07060504);\ + SUBSH_MASK[5] = _mm_set_epi32(0x09080f0e, 0x0d0c0b0a, 0x04030201, 0x00070605);\ + SUBSH_MASK[6] = _mm_set_epi32(0x0b0a0908, 0x0f0e0d0c, 0x05040302, 0x01000706);\ + SUBSH_MASK[7] = _mm_set_epi32(0x0d0c0b0a, 0x09080f0e, 0x06050403, 0x02010007);\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ + }\ + ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ +}/**/ + +/* vperm: + * transformation before rounds with ipt + * first round add transformed constant + * middle rounds: add constant XOR 0x15...15 + * last round: additionally add 0x15...15 after MB + * 
transformation after rounds with opt + */ +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant + ShiftBytes (interleaved) */\ + b1 = ROUND_CONST_Lx;\ + a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ + a1 = _mm_xor_si128(a1, b1);\ + a2 = _mm_xor_si128(a2, b1);\ + a3 = _mm_xor_si128(a3, b1);\ + a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ + a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ + a4 = _mm_xor_si128(a4, b1);\ + a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ + a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ + a5 = _mm_xor_si128(a5, b1);\ + a6 = _mm_xor_si128(a6, b1);\ + a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ + a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ + a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ + a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ + a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ + /* SubBytes + Multiplication by 2 and 4 */\ + VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +}/**/ + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\ + ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\ +} + + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + t0 = TRANSP_MASK;\ +\ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ +\ + o1 = i0;\ + t0 = i2;\ +\ + i0 = _mm_unpacklo_epi16(i0, i1);\ + o1 = _mm_unpackhi_epi16(o1, i1);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + t0 = _mm_unpackhi_epi16(t0, i3);\ +\ + i0 = _mm_shuffle_epi32(i0, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + t0 = _mm_shuffle_epi32(t0, 216);\ +\ + o2 = i0;\ + o3 = o1;\ +\ + i0 = _mm_unpacklo_epi32(i0, i2);\ + o1 = _mm_unpacklo_epi32(o1, t0);\ + o2 = _mm_unpackhi_epi32(o2, i2);\ + o3 = _mm_unpackhi_epi32(o3, t0);\ +}/**/ + +/* Matrix Transpose Step 2 + * input 
are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + o1 = i0;\ + o2 = i1;\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + o1 = _mm_unpackhi_epi64(o1, i4);\ + o3 = i1;\ + o4 = i2;\ + o2 = _mm_unpacklo_epi64(o2, i5);\ + o3 = _mm_unpackhi_epi64(o3, i5);\ + o5 = i2;\ + o6 = i3;\ + o4 = _mm_unpacklo_epi64(o4, i6);\ + o5 = _mm_unpackhi_epi64(o5, i6);\ + o7 = i3;\ + o6 = _mm_unpacklo_epi64(o6, i7);\ + o7 = _mm_unpackhi_epi64(o7, i7);\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + o0 = i0;\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o0 = _mm_unpackhi_epi64(o0, i1);\ + o1 = i2;\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + o1 = _mm_unpackhi_epi64(o1, i3);\ + o2 = i4;\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + o2 = _mm_unpackhi_epi64(o2, i5);\ + o3 = i6;\ + i6 = _mm_unpacklo_epi64(i6, i7);\ + o3 = _mm_unpackhi_epi64(o3, i7);\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + t0 = _mm_xor_si128(t0, t0);\ + i1 = i0;\ + i3 = i2;\ + i5 = i4;\ + i7 = i6;\ + i0 = _mm_unpacklo_epi64(i0, t0);\ + i1 = _mm_unpackhi_epi64(i1, t0);\ + i2 = _mm_unpacklo_epi64(i2, t0);\ + i3 = _mm_unpackhi_epi64(i3, t0);\ + i4 = _mm_unpacklo_epi64(i4, t0);\ + i5 = _mm_unpackhi_epi64(i5, t0);\ + i6 = _mm_unpacklo_epi64(i6, t0);\ + i7 = _mm_unpackhi_epi64(i7, t0);\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ +}/**/ + + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst_CNT2(i, j){\ + xmm0 = ROUND_CONST_L0[i];\ + xmm1 = ROUND_CONST_L7[i];\ + xmm2 = ROUND_CONST_L0[j];\ + xmm3 = ROUND_CONST_L7[j];\ + VPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ + xmm0 = _mm_xor_si128(xmm0, (ALL_15));\ + xmm1 = _mm_xor_si128(xmm1, (ALL_15));\ + xmm2 = _mm_xor_si128(xmm2, (ALL_15));\ + xmm3 = _mm_xor_si128(xmm3, (ALL_15));\ + ROUND_CONST_L0[i] = xmm0;\ + ROUND_CONST_L7[i] = xmm1;\ + ROUND_CONST_L0[j] = xmm2;\ + ROUND_CONST_L7[j] = xmm3;\ +}/**/ + +/* transform round constants into VPERM mode */ +#define VPERM_Transform_RoundConst(){\ + xmm0 = ROUND_CONST_Lx;\ + VPERM_Transform(xmm0, xmm1, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ + xmm0 = _mm_xor_si128(xmm0, (ALL_15));\ + ROUND_CONST_Lx = xmm0;\ + VPERM_Transform_RoundConst_CNT2(0, 1);\ + 
VPERM_Transform_RoundConst_CNT2(2, 3);\ + VPERM_Transform_RoundConst_CNT2(4, 5);\ + VPERM_Transform_RoundConst_CNT2(6, 7);\ + VPERM_Transform_RoundConst_CNT2(8, 9);\ +}/**/ + +void INIT256(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, /*xmm11,*/ xmm12, xmm13, xmm14, xmm15; + + /* transform round constants into VPERM mode */ + VPERM_Transform_RoundConst(); + + /* load IV into registers xmm12 - xmm15 */ + xmm12 = chaining[0]; + xmm13 = chaining[1]; + xmm14 = chaining[2]; + xmm15 = chaining[3]; + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* store transposed IV */ + chaining[0] = xmm12; + chaining[1] = xmm2; + chaining[2] = xmm6; + chaining[3] = xmm7; +} + +void TF512(u64* h, u64* m) +{ + __m128i* const chaining = (__m128i*) h; + __m128i* const message = (__m128i*) m; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP_MUL1[8]; + static __m128i TEMP_MUL2[8]; + static __m128i TEMP_MUL4; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm12 - xmm15 */ + xmm12 = message[0]; + xmm13 = message[1]; + xmm14 = message[2]; + xmm15 = message[3]; + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + xmm8 = chaining[0]; + xmm0 = chaining[1]; + xmm4 = chaining[2]; + xmm5 = chaining[3]; + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + xmm8 = _mm_xor_si128(xmm8, xmm12); + xmm0 = _mm_xor_si128(xmm0, xmm2); + xmm4 = _mm_xor_si128(xmm4, xmm6); + xmm5 = _mm_xor_si128(xmm5, xmm7); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, xmm8); + xmm1 = _mm_xor_si128(xmm1, xmm10); + xmm2 = _mm_xor_si128(xmm2, xmm12); + xmm3 = _mm_xor_si128(xmm3, xmm14); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, (chaining[0])); + xmm1 = _mm_xor_si128(xmm1, (chaining[1])); + xmm2 = _mm_xor_si128(xmm2, (chaining[2])); + xmm3 = _mm_xor_si128(xmm3, (chaining[3])); + + /* store CV */ + chaining[0] = xmm0; + chaining[1] = xmm1; + chaining[2] = xmm2; + chaining[3] = xmm3; + +#ifdef 
IACA_TRACE + IACA_END; +#endif + + return; +} + +void OF512(u64* h) +{ + __m128i* const chaining = (__m128i*) h; + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP_MUL1[8]; + static __m128i TEMP_MUL2[8]; + static __m128i TEMP_MUL4; + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + xmm8 = chaining[0]; + xmm10 = chaining[1]; + xmm12 = chaining[2]; + xmm14 = chaining[3]; + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm10 = _mm_xor_si128(xmm10, (chaining[1])); + xmm12 = _mm_xor_si128(xmm12, (chaining[2])); + xmm14 = _mm_xor_si128(xmm14, (chaining[3])); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); + VPERM_Transform(xmm9, xmm11, VPERM_OPT, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); + + /* we only need to return the truncated half of the state */ + chaining[2] = xmm9; + chaining[3] = xmm11; + + return; +}//OF512() + + + diff --git a/algo/aes_ni/hash-groestl.c b/algo/aes_ni/hash-groestl.c new file mode 100644 index 000000000..47c6a1276 --- /dev/null +++ b/algo/aes_ni/hash-groestl.c @@ -0,0 +1,306 @@ +/* hash.c Aug 2011 + * + * Groestl implementation for different versions. + * Author: Krystian Matusiewicz, Günther A. 
Roland, Martin Schläffer + * + * This code is placed in the public domain + */ + +#include "hash-groestl.h" +#include "miner.h" + +#ifndef NO_AES_NI + +#include "groestl-version.h" + +#ifdef TASM + #ifdef VAES + #include "groestl-asm-aes.h" + #else + #ifdef VAVX + #include "groestl-asm-avx.h" + #else + #ifdef VVPERM + #include "groestl-asm-vperm.h" + #else + #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) + #endif + #endif + #endif +#else + #ifdef TINTR + #ifdef VAES + #include "groestl-intr-aes.h" + #else + #ifdef VAVX + #include "groestl-intr-avx.h" + #else + #ifdef VVPERM + #include "groestl-intr-vperm.h" + #else + #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) + #endif + #endif + #endif + #else + #error NO TYPE SPECIFIED (-DT[ASM/INTR]) + #endif +#endif + + +/* digest up to len bytes of input (full blocks only) */ +void Transform(hashState_groestl *ctx, + const u8 *in, + unsigned long long len) { + /* increment block counter */ + ctx->block_counter += len/SIZE; + + /* digest message, one block at a time */ + for (; len >= SIZE; len -= SIZE, in += SIZE) +#if LENGTH<=256 + TF512((u64*)ctx->chaining, (u64*)in); +#else + TF1024((u64*)ctx->chaining, (u64*)in); +#endif + + asm volatile ("emms"); +} + +/* given state h, do h <- P(h)+h */ +void OutputTransformation(hashState_groestl *ctx) { + /* determine variant */ +#if (LENGTH <= 256) + OF512((u64*)ctx->chaining); +#else + OF1024((u64*)ctx->chaining); +#endif + + asm volatile ("emms"); +} + +/* initialise context */ +HashReturn_gr init_groestl(hashState_groestl* ctx) { + u8 i = 0; + /* output size (in bits) must be a positive integer less than or + equal to 512, and divisible by 8 */ + if (LENGTH <= 0 || (LENGTH%8) || LENGTH > 512) + return BAD_HASHBITLEN_GR; + + /* set number of state columns and state size depending on + variant */ + ctx->columns = COLS; + ctx->statesize = SIZE; +#if (LENGTH <= 256) + ctx->v = SHoRT; +#else + ctx->v = LoNG; +#endif + + SET_CONSTANTS(); + + for (i=0; i<SIZE/8; i++) ctx->chaining[i] = 0; + for (i=0; i<SIZE; i++) ctx->buffer[i] = 0; + + if (ctx->chaining == NULL || ctx->buffer == NULL) + return FAIL_GR; + + /* set initial value */ + ctx->chaining[ctx->columns-1] = U64BIG((u64)LENGTH); + + INIT(ctx->chaining); + + /* set other variables */ + ctx->buf_ptr = 0; + ctx->block_counter = 0; + ctx->bits_in_last_byte = 0; + + return SUCCESS_GR; +} + + +HashReturn_gr reinit_groestl(hashState_groestl* ctx) + { + int i; + for (i=0; i<SIZE/8; i++) ctx->chaining[i] = 0; + for (i=0; i<SIZE; i++) ctx->buffer[i] = 0; + + if (ctx->chaining == NULL || ctx->buffer == NULL) + return FAIL_GR; + + /* set initial value */ + ctx->chaining[ctx->columns-1] = U64BIG((u64)LENGTH); + + INIT(ctx->chaining); + + /* set other variables */ + ctx->buf_ptr = 0; + ctx->block_counter = 0; + ctx->bits_in_last_byte = 0; + + return SUCCESS_GR; +} + + +/* update state with databitlen bits of input */ +HashReturn_gr update_groestl(hashState_groestl* ctx, + const BitSequence_gr* input, + DataLength_gr databitlen) { + int index = 0; + int msglen = (int)(databitlen/8); + int rem = (int)(databitlen%8); + + /* non-integral number of message bytes can only be supplied in the + last call to this function */ + if (ctx->bits_in_last_byte) return FAIL_GR; + + /* if the buffer contains data that has not yet been digested, first + add data to buffer until full */ + +// The following block of code never gets hit when hashing x11 or quark +// leave it here in case it might be needed. 
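+// (Presumably this path stays dead because init/reinit_groestl reset buf_ptr to 0
+// and these algos make a single update_groestl call per hash, so the buffer is
+// always empty when this function is entered.)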
+// if (ctx->buf_ptr) +// { +// while (ctx->buf_ptr < ctx->statesize && index < msglen) +// { +// ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; +// } +// if (ctx->buf_ptr < ctx->statesize) +// { +// /* buffer still not full, return */ +// if (rem) +// { +// ctx->bits_in_last_byte = rem; +// ctx->buffer[(int)ctx->buf_ptr++] = input[index]; +// } +// return SUCCESS_GR; +// } +// /* digest buffer */ +// ctx->buf_ptr = 0; +// printf("error\n"); +// Transform(ctx, ctx->buffer, ctx->statesize); +// end dead code +// } + + /* digest bulk of message */ + Transform(ctx, input+index, msglen-index); + index += ((msglen-index)/ctx->statesize)*ctx->statesize; + + /* store remaining data in buffer */ + while (index < msglen) + { + ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; + } + +// Another block that doesn't get used by x11 or quark +// /* if non-integral number of bytes have been supplied, store +// remaining bits in last byte, together with information about +// number of bits */ +// if (rem) +// { +// ctx->bits_in_last_byte = rem; +// ctx->buffer[(int)ctx->buf_ptr++] = input[index]; +// } + + return SUCCESS_GR; +} + +#define BILB ctx->bits_in_last_byte + +/* finalise: process remaining data (including padding), perform + output transformation, and write hash result to 'output' */ +HashReturn_gr final_groestl(hashState_groestl* ctx, + BitSequence_gr* output) { + int i, j = 0, hashbytelen = LENGTH/8; + u8 *s = (BitSequence_gr*)ctx->chaining; + + /* pad with '1'-bit and first few '0'-bits */ + if (BILB) { + ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-BILB); + BILB = 0; + } + else ctx->buffer[(int)ctx->buf_ptr++] = 0x80; + + /* pad with '0'-bits */ + if (ctx->buf_ptr > ctx->statesize-LENGTHFIELDLEN) { + /* padding requires two blocks */ + while (ctx->buf_ptr < ctx->statesize) { + ctx->buffer[(int)ctx->buf_ptr++] = 0; + } + /* digest first padding block */ + Transform(ctx, ctx->buffer, ctx->statesize); + ctx->buf_ptr = 0; + } + while (ctx->buf_ptr < ctx->statesize-LENGTHFIELDLEN) { + ctx->buffer[(int)ctx->buf_ptr++] = 0; + } + + /* length padding */ + ctx->block_counter++; + ctx->buf_ptr = ctx->statesize; + while (ctx->buf_ptr > ctx->statesize-LENGTHFIELDLEN) { + ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter; + ctx->block_counter >>= 8; + } + + /* digest final padding block */ + Transform(ctx, ctx->buffer, ctx->statesize); + /* perform output transformation */ + OutputTransformation(ctx); + + /* store hash result in output */ + for (i = ctx->statesize-hashbytelen; i < ctx->statesize; i++,j++) { + output[j] = s[i]; + } + + /* zeroise relevant variables and deallocate memory */ + + for (i = 0; i < ctx->columns; i++) { + ctx->chaining[i] = 0; + } + + for (i = 0; i < ctx->statesize; i++) { + ctx->buffer[i] = 0; + } +// free(ctx->chaining); +// free(ctx->buffer); + + return SUCCESS_GR; +} + +/* hash bit sequence */ +HashReturn_gr hash_groestl(int hashbitlen, + const BitSequence_gr* data, + DataLength_gr databitlen, + BitSequence_gr* hashval) { + HashReturn_gr ret; + hashState_groestl context; + + /* initialise */ + if ((ret = init_groestl(&context)) != SUCCESS_GR) + return ret; + + /* process message */ + if ((ret = update_groestl(&context, data, databitlen)) != SUCCESS_GR) + return ret; + + /* finalise */ + ret = final_groestl(&context, hashval); + + return ret; +} + +/* eBash API */ +#ifdef crypto_hash_BYTES +int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long inlen) +{ + if (hash_groestl(crypto_hash_BYTES * 8, in, inlen 
* 8,out) == SUCCESS_GR) return 0; + return -1; +} +#endif + +#endif diff --git a/algo/aes_ni/hash-groestl.h b/algo/aes_ni/hash-groestl.h new file mode 100644 index 000000000..24603d395 --- /dev/null +++ b/algo/aes_ni/hash-groestl.h @@ -0,0 +1,110 @@ +/* hash.h Aug 2011 + * + * Groestl implementation for different versions. + * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer + * + * This code is placed in the public domain + */ + +#ifndef __hash_h +#define __hash_h + +#include +#if defined(_WIN64) || defined(__WINDOWS__) +#include +#endif +#include + +/* eBash API begin */ +/* +#include "crypto_hash.h" +#ifdef crypto_hash_BYTES + +#include +#include +#include +typedef crypto_uint8 u8; +typedef crypto_uint32 u32; +typedef crypto_uint64 u64; +#endif + * / +/* eBash API end */ + +#define LENGTH (512) + +#include "brg_endian.h" +#define NEED_UINT_64T +#include "brg_types.h" + +#ifdef IACA_TRACE + #include IACA_MARKS +#endif + +#ifndef LENGTH +#define LENGTH (256) +#endif + +/* some sizes (number of bytes) */ +#define ROWS (8) +#define LENGTHFIELDLEN (ROWS) +#define COLS512 (8) +#define COLS1024 (16) +#define SIZE512 ((ROWS)*(COLS512)) +#define SIZE1024 ((ROWS)*(COLS1024)) +#define ROUNDS512 (10) +#define ROUNDS1024 (14) + +#if LENGTH<=256 +#define COLS (COLS512) +#define SIZE (SIZE512) +#define ROUNDS (ROUNDS512) +#else +#define COLS (COLS1024) +#define SIZE (SIZE1024) +#define ROUNDS (ROUNDS1024) +#endif + +#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) + +#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) +#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n))))) +#define U64BIG(a) (a) +#endif /* IS_BIG_ENDIAN */ + +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) +#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n))) +#define U64BIG(a) \ + ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \ + (ROTL64(a,24) & li_64(0000FF000000FF00)) | \ + (ROTL64(a,40) & li_64(00FF000000FF0000)) | \ + (ROTL64(a,56) & li_64(FF000000FF000000))) +#endif /* IS_LITTLE_ENDIAN */ + +typedef enum { LoNG, SHoRT } Var; + +/* NIST API begin */ + +typedef unsigned char BitSequence_gr; +typedef unsigned long long DataLength_gr; +typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr; + +typedef struct { + __attribute__ ((aligned (32))) u64 chaining[SIZE/8]; /* actual state */ + __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */ + u64 block_counter; /* message block counter */ + int buf_ptr; /* data buffer pointer */ + int bits_in_last_byte; /* no. of message bits in last byte of + data buffer */ + int columns; /* no. of columns in state */ + int statesize; /* total no. of bytes in state */ + Var v; /* LONG or SHORT */ +} hashState_groestl; + +HashReturn_gr init_groestl(hashState_groestl*); +HashReturn_gr reinit_groestl(hashState_groestl*); +HashReturn_gr update_groestl(hashState_groestl*, const BitSequence_gr*, DataLength_gr); +HashReturn_gr final_groestl(hashState_groestl*, BitSequence_gr*); +HashReturn_gr hash_groestl(int, const BitSequence_gr*, DataLength_gr, BitSequence_gr*); +/* NIST API end */ + +#endif /* __hash_h */ diff --git a/algo/aes_ni/hash-groestl256.c b/algo/aes_ni/hash-groestl256.c new file mode 100644 index 000000000..19f0f7877 --- /dev/null +++ b/algo/aes_ni/hash-groestl256.c @@ -0,0 +1,318 @@ +/* hash.c Aug 2011 + * + * Groestl implementation for different versions. + * Author: Krystian Matusiewicz, Günther A. 
Roland, Martin Schläffer + * + * This code is placed in the public domain + */ + +#include "hash-groestl256.h" +//#include "miner.h" + +//#ifndef NO_AES_NI +#ifdef __AVX2__ + +#include "groestl-version.h" + +//#ifdef TASM +// #ifdef VAES +// #include "groestl256-asm-aes.h" +// #else +// #ifdef VAVX +// #include "groestl256-asm-avx.h" +// #else +// #ifdef VVPERM +// #include "groestl256-asm-vperm.h" +// #else +// #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) +// #endif +// #endif +// #endif +//#else +// #ifdef TINTR +// #ifdef VAES +// #include "groestl256-intr-aes.h" +// #else +// #ifdef VAVX + #include "groestl256-intr-avx.h" +// #else +// #ifdef VVPERM +// #include "groestl256-intr-vperm.h" +// #else +// #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) +// #endif +// #endif +// #endif +// #else +// #error NO TYPE SPECIFIED (-DT[ASM/INTR]) +// #endif +//#endif + + +/* digest up to len bytes of input (full blocks only) */ +void Transform256(hashState_groestl256 *ctx, + const u8 *in, + unsigned long long len) { + /* increment block counter */ + ctx->block_counter += len/SIZE; + + /* digest message, one block at a time */ + for (; len >= SIZE; len -= SIZE, in += SIZE) +//#if LENGTH<=256 + TF512((u64*)ctx->chaining, (u64*)in); +//#else +// TF1024((u64*)ctx->chaining, (u64*)in); +//#endif + +#ifdef _MSC_VER + //__asm emms +#else + asm volatile ("emms"); +#endif +} + +/* given state h, do h <- P(h)+h */ +void OutputTransformation256(hashState_groestl256 *ctx) { + /* determine variant */ +//#if (LENGTH <= 256) + OF512((u64*)ctx->chaining); +//#else +// OF1024((u64*)ctx->chaining); +//#endif + +#ifdef _MSC_VER + //__asm emms +#else + asm volatile("emms"); +#endif +} + +/* initialise context */ +HashReturn_gr init_groestl256(hashState_groestl256* ctx) { + u8 i = 0; + /* output size (in bits) must be a positive integer less than or + equal to 512, and divisible by 8 */ +// if (LENGTH <= 0 || (LENGTH%8) || LENGTH > 512) +// return BAD_HASHBITLEN_GR; + + /* set number of state columns and state size depending on + variant */ + ctx->columns = COLS; + ctx->statesize = SIZE; +//#if (LENGTH <= 256) + ctx->v = SHoRT; +//#else +// ctx->v = LoNG; +//#endif + + SET_CONSTANTS(); + + for (i=0; ichaining[i] = 0; + for (i=0; ibuffer[i] = 0; + + if (ctx->chaining == NULL || ctx->buffer == NULL) + return FAIL_GR; + + /* set initial value */ +// ctx->chaining[ctx->columns-1] = U64BIG((u64)LENGTH); + ctx->chaining[ctx->columns-1] = U64BIG((u64)256); + + INIT256(ctx->chaining); + + /* set other variables */ + ctx->buf_ptr = 0; + ctx->block_counter = 0; + ctx->bits_in_last_byte = 0; + + return SUCCESS_GR; +} + + +HashReturn_gr reinit_groestl256(hashState_groestl256* ctx) + { + int i; + for (i=0; ichaining[i] = 0; + for (i=0; ibuffer[i] = 0; + + if (ctx->chaining == NULL || ctx->buffer == NULL) + return FAIL_GR; + + /* set initial value */ +// ctx->chaining[ctx->columns-1] = U64BIG((u64)LENGTH); + ctx->chaining[ctx->columns-1] = 256; + + INIT256(ctx->chaining); + + /* set other variables */ + ctx->buf_ptr = 0; + ctx->block_counter = 0; + ctx->bits_in_last_byte = 0; + + return SUCCESS_GR; +} + + +/* update state with databitlen bits of input */ +HashReturn_gr update_groestl256(hashState_groestl256* ctx, + const BitSequence_gr* input, + DataLength_gr databitlen) { + int index = 0; + int msglen = (int)(databitlen/8); + int rem = (int)(databitlen%8); + + /* non-integral number of message bytes can only be supplied in the + last call to this function */ + if (ctx->bits_in_last_byte) return FAIL_GR; + + /* if the buffer 
contains data that has not yet been digested, first + add data to buffer until full */ + +// The following block of code never gets hit when hashing x11 or quark +// leave it here in case it might be needed. +// if (ctx->buf_ptr) +// { +// while (ctx->buf_ptr < ctx->statesize && index < msglen) +// { +// ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; +// } +// if (ctx->buf_ptr < ctx->statesize) +// { +// /* buffer still not full, return */ +// if (rem) +// { +// ctx->bits_in_last_byte = rem; +// ctx->buffer[(int)ctx->buf_ptr++] = input[index]; +// } +// return SUCCESS_GR; +// } +// /* digest buffer */ +// ctx->buf_ptr = 0; +// printf("error\n"); +// Transform(ctx, ctx->buffer, ctx->statesize); +// end dead code +// } + + /* digest bulk of message */ + Transform256(ctx, input+index, msglen-index); + index += ((msglen-index)/ctx->statesize)*ctx->statesize; + + /* store remaining data in buffer */ + while (index < msglen) + { + ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; + } + +// Another block that doesn't get used by x11 or quark +// /* if non-integral number of bytes have been supplied, store +// remaining bits in last byte, together with information about +// number of bits */ +// if (rem) +// { +// ctx->bits_in_last_byte = rem; +// ctx->buffer[(int)ctx->buf_ptr++] = input[index]; +// } + + return SUCCESS_GR; +} + +#define BILB ctx->bits_in_last_byte + +/* finalise: process remaining data (including padding), perform + output transformation, and write hash result to 'output' */ +HashReturn_gr final_groestl256(hashState_groestl256* ctx, + BitSequence_gr* output) { +// int i, j = 0, hashbytelen = LENGTH/8; + int i, j = 0, hashbytelen = 256/8; + u8 *s = (BitSequence_gr*)ctx->chaining; + + /* pad with '1'-bit and first few '0'-bits */ + if (BILB) { + ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-BILB); + BILB = 0; + } + else ctx->buffer[(int)ctx->buf_ptr++] = 0x80; + + /* pad with '0'-bits */ + if (ctx->buf_ptr > ctx->statesize-LENGTHFIELDLEN) { + /* padding requires two blocks */ + while (ctx->buf_ptr < ctx->statesize) { + ctx->buffer[(int)ctx->buf_ptr++] = 0; + } + /* digest first padding block */ + Transform256(ctx, ctx->buffer, ctx->statesize); + ctx->buf_ptr = 0; + } + while (ctx->buf_ptr < ctx->statesize-LENGTHFIELDLEN) { + ctx->buffer[(int)ctx->buf_ptr++] = 0; + } + + /* length padding */ + ctx->block_counter++; + ctx->buf_ptr = ctx->statesize; + while (ctx->buf_ptr > ctx->statesize-LENGTHFIELDLEN) { + ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter; + ctx->block_counter >>= 8; + } + + /* digest final padding block */ + Transform256(ctx, ctx->buffer, ctx->statesize); + /* perform output transformation */ + OutputTransformation256(ctx); + + /* store hash result in output */ + for (i = ctx->statesize-hashbytelen; i < ctx->statesize; i++,j++) { + output[j] = s[i]; + } + + /* zeroise relevant variables and deallocate memory */ + + for (i = 0; i < ctx->columns; i++) { + ctx->chaining[i] = 0; + } + + for (i = 0; i < ctx->statesize; i++) { + ctx->buffer[i] = 0; + } +// free(ctx->chaining); +// free(ctx->buffer); + + return SUCCESS_GR; +} + +/* hash bit sequence */ +//HashReturn_gr hash_groestl256(int hashbitlen, +// const BitSequence_gr* data, +// DataLength_gr databitlen, +// BitSequence_gr* hashval) { +// HashReturn_gr ret; +// hashState_groestl256 context; +// +// +// if ((ret = init_groestl256(&context)) != SUCCESS_GR) +// return ret; +// +// +// if ((ret = update_groestl256(&context, data, databitlen)) != SUCCESS_GR) +// return ret; 
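For byte-aligned input (bits_in_last_byte == 0), the padding written by final_groestl256 above is a single 0x80 byte, a zero fill, and the block counter stored big-endian in the last LENGTHFIELDLEN (8) bytes of the 64-byte block. A minimal sketch of that single-block layout, assuming the message leaves room for the length field (the function and variable names below are illustrative, not part of this patch):

#include <stdint.h>
#include <string.h>

/* Sketch only: fill the final 64-byte Groestl-256 block for a byte-aligned
   message. 'used' message bytes already sit at the start of 'blk'; 'blocks'
   is the total block count including this padding block. Assumes used < 56,
   i.e. the single-block case; the two-block case is handled separately in
   final_groestl256 above. */
static void groestl256_pad_sketch(uint8_t blk[64], size_t used, uint64_t blocks)
{
    blk[used] = 0x80;                          /* 1-bit, then zero padding */
    memset(blk + used + 1, 0, 56 - (used + 1));
    for (int i = 0; i < 8; i++)                /* big-endian length field  */
        blk[63 - i] = (uint8_t)(blocks >> (8 * i));
}

For the 32-byte value that lyra2_hash feeds to update_groestl256, this gives blk[32] = 0x80, zeros through blk[55], and a counter of 1 in blk[56..63].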
+// +// +// ret = final_groestl256(&context, hashval); +// +// return ret; +//} + +/* eBash API */ +//#ifdef crypto_hash_BYTES +//int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long inlen) +//{ +// if (hash_groestl(crypto_hash_BYTES * 8, in, inlen * 8,out) == SUCCESS_GR) return 0; +// return -1; +//} +//#endif + +#endif diff --git a/algo/aes_ni/hash-groestl256.h b/algo/aes_ni/hash-groestl256.h new file mode 100644 index 000000000..b4dcfe07b --- /dev/null +++ b/algo/aes_ni/hash-groestl256.h @@ -0,0 +1,116 @@ +/* hash.h Aug 2011 + * + * Groestl implementation for different versions. + * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer + * + * This code is placed in the public domain + */ + +#ifndef __hash_h +#define __hash_h + +#include +#if defined(_WIN64) || defined(__WINDOWS__) +#include +#endif +#include + +/* eBash API begin */ +/* +#include "crypto_hash.h" +#ifdef crypto_hash_BYTES + +#include +#include +#include +typedef crypto_uint8 u8; +typedef crypto_uint32 u32; +typedef crypto_uint64 u64; +#endif + */ +/* eBash API end */ + +//#define LENGTH (512) + +#include "brg_endian.h" +#define NEED_UINT_64T +#include "brg_types.h" + +#ifdef IACA_TRACE + #include IACA_MARKS +#endif + +//#ifndef LENGTH +//#define LENGTH (256) +//#endif + +/* some sizes (number of bytes) */ +#define ROWS (8) +#define LENGTHFIELDLEN (ROWS) +#define COLS512 (8) +#define COLS1024 (16) +#define SIZE512 ((ROWS)*(COLS512)) +#define SIZE1024 ((ROWS)*(COLS1024)) +#define ROUNDS512 (10) +#define ROUNDS1024 (14) + +//#if LENGTH<=256 +#define COLS (COLS512) +#define SIZE (SIZE512) +#define ROUNDS (ROUNDS512) +//#else +//#define COLS (COLS1024) +//#define SIZE (SIZE1024) +//#define ROUNDS (ROUNDS1024) +//#endif + +#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) + +#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) +#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n))))) +#define U64BIG(a) (a) +#endif /* IS_BIG_ENDIAN */ + +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) +#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n))) +#define U64BIG(a) \ + ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \ + (ROTL64(a,24) & li_64(0000FF000000FF00)) | \ + (ROTL64(a,40) & li_64(00FF000000FF0000)) | \ + (ROTL64(a,56) & li_64(FF000000FF000000))) +#endif /* IS_LITTLE_ENDIAN */ + +typedef enum { LoNG, SHoRT } Var; + +/* NIST API begin */ + +typedef unsigned char BitSequence_gr; +typedef unsigned long long DataLength_gr; +typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr; + +typedef struct { +#ifndef _MSC_VER + __attribute__ ((aligned (32))) +#endif + u64 chaining[SIZE/8]; /* actual state */ +#ifndef _MSC_VER + __attribute__ ((aligned (32))) +#endif + BitSequence_gr buffer[SIZE]; /* data buffer */ + u64 block_counter; /* message block counter */ + int buf_ptr; /* data buffer pointer */ + int bits_in_last_byte; /* no. of message bits in last byte of + data buffer */ + int columns; /* no. of columns in state */ + int statesize; /* total no. 
of bytes in state */ + Var v; /* LONG or SHORT */ +} hashState_groestl256; + +HashReturn_gr init_groestl(hashState_groestl256*); +HashReturn_gr reinit_groestl(hashState_groestl256*); +HashReturn_gr update_groestl(hashState_groestl256*, const BitSequence_gr*, DataLength_gr); +HashReturn_gr final_groestl(hashState_groestl256*, BitSequence_gr*); +HashReturn_gr hash_groestl(int, const BitSequence_gr*, DataLength_gr, BitSequence_gr*); +/* NIST API end */ + +#endif /* __hash_h */ diff --git a/algo/aes_ni/implementors b/algo/aes_ni/implementors new file mode 100644 index 000000000..e7ac1b28c --- /dev/null +++ b/algo/aes_ni/implementors @@ -0,0 +1,3 @@ +Krystian Matusiewicz +Günther A. Roland +Martin Schläffer diff --git a/algo/lyra2re.c b/algo/lyra2re.c index 38fb62bb4..655c06ae5 100644 --- a/algo/lyra2re.c +++ b/algo/lyra2re.c @@ -9,7 +9,7 @@ #include "miner.h" -void lyra2_hash(void *state, const void *input) +/*void lyra2_hash(void *state, const void *input) { sph_blake256_context ctx_blake; sph_keccak256_context ctx_keccak; @@ -36,6 +36,69 @@ void lyra2_hash(void *state, const void *input) sph_groestl256(&ctx_groestl, hashB, 32); sph_groestl256_close(&ctx_groestl, hashA); + memcpy(state, hashA, 32); +}*/ + + +#ifdef __AVX2__ +//#define __AES_NI +#endif + +#ifdef __AES_NI +#include "algo/aes_ni/hash-groestl256.h" +#endif + +typedef struct { + sph_blake256_context blake; + sph_keccak256_context keccak; + sph_skein256_context skein; +#ifdef __AES_NI + hashState_groestl256 groestl; +#else + sph_groestl256_context groestl; +#endif +} lyra2re_ctx_holder; + +lyra2re_ctx_holder lyra2re_ctx; + +void init_lyra2re_ctx() +{ + sph_blake256_init(&lyra2re_ctx.blake); + sph_keccak256_init(&lyra2re_ctx.keccak); + sph_skein256_init(&lyra2re_ctx.skein); +#ifdef __AES_NI + init_groestl256(&lyra2re_ctx.groestl); +#else + sph_groestl256_init(&lyra2re_ctx.groestl); +#endif +} + +void lyra2_hash(void *state, const void *input) +{ + lyra2re_ctx_holder ctx; + memcpy(&ctx, &lyra2re_ctx, sizeof(lyra2re_ctx)); + + uint32_t hashA[8], hashB[8]; + + sph_blake256(&ctx.blake, input, 80); + sph_blake256_close(&ctx.blake, hashA); + + sph_keccak256(&ctx.keccak, hashA, 32); + sph_keccak256_close(&ctx.keccak, hashB); + + LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); + + sph_skein256(&ctx.skein, hashA, 32); + sph_skein256_close(&ctx.skein, hashB); + +#ifdef __AES_NI + update_groestl256(&ctx.groestl, hashB, 256); + final_groestl256(&ctx.groestl, hashA); +#else + sph_groestl256(&ctx.groestl, hashB, 32); + sph_groestl256_close(&ctx.groestl, hashA); +#endif + memcpy(state, hashA, 32); } diff --git a/cpu-miner.c b/cpu-miner.c index 8008acffc..1798a964a 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -2994,7 +2994,8 @@ int main(int argc, char *argv[]) { if (opt_algo == ALGO_QUARK) { init_quarkhash_contexts(); - } else if(opt_algo == ALGO_CRYPTONIGHT) { + } + else if (opt_algo == ALGO_CRYPTONIGHT) { jsonrpc_2 = true; opt_extranonce = false; aes_ni_supported = has_aes_ni(); @@ -3003,6 +3004,9 @@ int main(int argc, char *argv[]) { applog(LOG_INFO, "CPU Supports AES-NI: %s", aes_ni_supported ? 
"YES" : "NO"); } } + else if (opt_algo == ALGO_LYRA2) { + init_lyra2re_ctx(); + } if (!opt_benchmark && !rpc_url) { fprintf(stderr, "%s: no URL supplied\n", argv[0]); diff --git a/lyra2/Lyra2.c b/lyra2/Lyra2.c index b79ec9b16..f0c9b5439 100644 --- a/lyra2/Lyra2.c +++ b/lyra2/Lyra2.c @@ -21,9 +21,9 @@ #include #include #include - -#include "Lyra2.h" -#include "Sponge.h" +#include "compat.h" +#include "lyra2.h" +#include "sponge.h" /** * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords @@ -44,7 +44,7 @@ * * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation) */ -int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols) +int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols) { //============================= Basic variables ============================// int64_t row = 2; //index of row to be processed @@ -55,25 +55,32 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 int64_t i; //auxiliary iteration counter + int64_t v64; // 64bit var for memcpy //==========================================================================/ //========== Initializing the Memory Matrix and pointers to it =============// //Tries to allocate enough space for the whole memory matrix - i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES); - uint64_t *wholeMatrix = (uint64_t*) malloc((size_t) i); + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + // for Lyra2REv2, nCols = 4, v1 was using 8 + const int64_t BLOCK_LEN = (nCols == 4) ? 
BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES; + + i = (int64_t)ROW_LEN_BYTES * nRows; + uint64_t _ALIGN(256) *wholeMatrix = malloc(i); if (wholeMatrix == NULL) { return -1; } - memset(wholeMatrix, 0, (size_t) i); + memset(wholeMatrix, 0, i); //Allocates pointers to each row of the matrix - uint64_t **memMatrix = malloc((size_t) nRows * sizeof(uint64_t*)); + uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows); if (memMatrix == NULL) { return -1; } //Places the pointers in the correct positions uint64_t *ptrWord = wholeMatrix; - for (i = 0; i < (int64_t) nRows; i++) { + for (i = 0; i < nRows; i++) { memMatrix[i] = ptrWord; ptrWord += ROW_LEN_INT64; } @@ -84,32 +91,38 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //but this ensures that the password copied locally will be overwritten as soon as possible //First, we clean enough blocks for the password, salt, basil and padding - uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof (uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; + int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; byte *ptrByte = (byte*) wholeMatrix; - memset(ptrByte, 0, (size_t) nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES); //Prepends the password - memcpy(ptrByte, pwd, (size_t) pwdlen); + memcpy(ptrByte, pwd, pwdlen); ptrByte += pwdlen; //Concatenates the salt - memcpy(ptrByte, salt, (size_t) saltlen); + memcpy(ptrByte, salt, saltlen); ptrByte += saltlen; + memset(ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - (saltlen + pwdlen)); + //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface - memcpy(ptrByte, &kLen, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &pwdlen, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &saltlen, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &timeCost, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &nRows, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &nCols, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &kLen, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = pwdlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = saltlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = timeCost; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nRows; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nCols; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); //Now comes the padding *ptrByte = 0x80; //first byte of padding: right after the password @@ -120,30 +133,27 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //======================= Initializing the Sponge State ====================// //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) - uint64_t *state = malloc(16 * sizeof (uint64_t)); - if (state == NULL) { - return -1; - } + uint64_t _ALIGN(256) state[16]; initState(state); //==========================================================================/ //================================ Setup Phase =============================// //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits ptrWord = wholeMatrix; - for (i = 0; i < (int64_t) 
nBlocksInput; i++) { + for (i = 0; i < nBlocksInput; i++) { absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) - ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES; //goes to next block of pad(pwd || salt || basil) + ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil) } //Initializes M[0] and M[1] - reducedSqueezeRow0(state, memMatrix[0]); //The locally copied password is most likely overwritten here + reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here - reducedDuplexRow1(state, memMatrix[0], memMatrix[1]); + reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols); do { //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) - reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); + reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); //updates the value of row* (deterministically picked during Setup)) rowa = (rowa + step) & (window - 1); @@ -159,36 +169,35 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * gap = -gap; //inverts the modifier to the step } - } while (row < (int64_t) nRows); + } while (row < nRows); //==========================================================================/ //============================ Wandering Phase =============================// row = 0; //Resets the visitation to the first row of the memory matrix - for (tau = 1; tau <= (int64_t) timeCost; tau++) { + for (tau = 1; tau <= timeCost; tau++) { //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 step = (tau % 2 == 0) ? -1 : nRows / 2 - 1; do { //Selects a pseudorandom index row* //------------------------------------------------------------------------------------------ - //rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) - rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE) //------------------------------------------------------------------------------------------ //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] - reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); + reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); //update prev: it now points to the last row ever computed prev = row; //updates row: goes to the next row to be computed //------------------------------------------------------------------------------------------ - //row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) - row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) //------------------------------------------------------------------------------------------ } while (row != 0); } - //==========================================================================/ //============================ Wrap-up Phase ===============================// //Absorbs the last block of the memory matrix @@ -196,16 +205,10 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //Squeezes the key squeeze(state, K, (unsigned int) kLen); - 
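Note that the wandering phase above now selects rows with a bitmask rather than the generic modulo, which is only valid because lyra2_hash calls LYRA2 with nRows = 8, a power of two. A small self-contained check of that equivalence (illustrative code, not part of the patch):

#include <assert.h>
#include <stdint.h>

/* Sketch only: for a power-of-two nRows, (x % nRows) == (x & (nRows - 1)),
   so the masked row selection used in the wandering phase is equivalent to
   the generic form for Lyra2RE's fixed nRows = 8. */
int main(void)
{
    const uint64_t nRows = 8;                  /* power of two, as in Lyra2RE */
    for (uint64_t x = 0; x < 100000; x++)
        assert((x % nRows) == (x & (nRows - 1)));
    return 0;
}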
//==========================================================================/ //========================= Freeing the memory =============================// free(memMatrix); free(wholeMatrix); - //Wiping out the sponge's internal state before freeing it - memset(state, 0, 16 * sizeof (uint64_t)); - free(state); - //==========================================================================/ - return 0; } diff --git a/lyra2/Lyra2.h b/lyra2/Lyra2.h index 229b2c9cc..edf917927 100644 --- a/lyra2/Lyra2.h +++ b/lyra2/Lyra2.h @@ -37,14 +37,6 @@ typedef unsigned char byte; #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes #endif -#ifndef N_COLS - #define N_COLS 8 //Number of columns in the memory matrix: fixed to 64 by default -#endif - -#define ROW_LEN_INT64 (BLOCK_LEN_INT64 * N_COLS) //Total length of a row: N_COLS blocks -#define ROW_LEN_BYTES (ROW_LEN_INT64 * 8) //Number of bytes per row - - -int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols); +int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols); #endif /* LYRA2_H_ */ diff --git a/lyra2/Sponge.c b/lyra2/Sponge.c index e0a001e0e..cc042809f 100644 --- a/lyra2/Sponge.c +++ b/lyra2/Sponge.c @@ -21,10 +21,9 @@ #include #include #include -#include "Sponge.h" -#include "Lyra2.h" - - +#include +#include "sponge.h" +#include "lyra2.h" /** * Initializes the Sponge State. The first 512 bits are set to zeros and the remainder @@ -37,20 +36,97 @@ * * @param state The 1024-bit array to be initialized */ - void initState(uint64_t state[/*16*/]) { - //First 512 bis are zeros - memset(state, 0, 64); - //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV - - state[8] = blake2b_IV[0]; - state[9] = blake2b_IV[1]; - state[10] = blake2b_IV[2]; - state[11] = blake2b_IV[3]; - state[12] = blake2b_IV[4]; - state[13] = blake2b_IV[5]; - state[14] = blake2b_IV[6]; - state[15] = blake2b_IV[7]; +void initState(uint64_t state[/*16*/]) { + //First 512 bis are zeros + memset(state, 0, 64); + //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV + state[8] = blake2b_IV[0]; + state[9] = blake2b_IV[1]; + state[10] = blake2b_IV[2]; + state[11] = blake2b_IV[3]; + state[12] = blake2b_IV[4]; + state[13] = blake2b_IV[5]; + state[14] = blake2b_IV[6]; + state[15] = blake2b_IV[7]; +} +#ifdef _MSC_VER +__forceinline +#else +inline +#endif +static void lyra_round( uint64_t *v ) +{ +#ifdef __AVX2__ + + __m256i a = _mm256_load_si256( (__m256i*)(&v[ 0]) ); + __m256i b = _mm256_load_si256( (__m256i*)(&v[ 4]) ); + __m256i c = _mm256_load_si256( (__m256i*)(&v[ 8]) ); + __m256i d = _mm256_load_si256( (__m256i*)(&v[12]) ); + + G_4X64( a, b, c, d ); + + // swap words + b = mm256_rotl256_1x64( b ); + c = mm256_swap128( c ); + d = mm256_rotr256_1x64( d ); + + G_4X64( a, b, c, d ); + + // unswap + b = mm256_rotr256_1x64( b ); + c = mm256_swap128( c ); + d = mm256_rotl256_1x64( d ); + + _mm256_store_si256( (__m256i*)(&v[ 0]), a ); + _mm256_store_si256( (__m256i*)(&v[ 4]), b ); + _mm256_store_si256( (__m256i*)(&v[ 8]), c ); + _mm256_store_si256( (__m256i*)(&v[12]), d ); + +#elif defined __AVX__ + + __m128i a0, a1, b0, b1, c0, c1, d0, d1; + + a0 = _mm_load_si128( (__m128i*)(&v[ 0]) ); + a1 = _mm_load_si128( (__m128i*)(&v[ 2]) ); + b0 = _mm_load_si128( (__m128i*)(&v[ 4]) ); + b1 = _mm_load_si128( (__m128i*)(&v[ 6]) ); + c0 = _mm_load_si128( (__m128i*)(&v[ 
8]) ); + c1 = _mm_load_si128( (__m128i*)(&v[10]) ); + d0 = _mm_load_si128( (__m128i*)(&v[12]) ); + d1 = _mm_load_si128( (__m128i*)(&v[14]) ); + + G_2X64( a0, b0, c0, d0 ); + G_2X64( a1, b1, c1, d1 ); + + // swap words + mm128_rotl256_1x64( b0, b1 ); + mm128_swap128( c0, c1 ); + mm128_rotr256_1x64( d0, d1 ); + + G_2X64( a0, b0, c0, d0 ); + G_2X64( a1, b1, c1, d1 ); + + // unswap + mm128_rotr256_1x64( b0, b1 ); + mm128_swap128( c0, c1 ); + mm128_rotl256_1x64( d0, d1 ); + + _mm_store_si128( (__m128i*)(&v[ 0]), a0 ); + _mm_store_si128( (__m128i*)(&v[ 2]), a1 ); + _mm_store_si128( (__m128i*)(&v[ 4]), b0 ); + _mm_store_si128( (__m128i*)(&v[ 6]), b1 ); + _mm_store_si128( (__m128i*)(&v[ 8]), c0 ); + _mm_store_si128( (__m128i*)(&v[10]), c1 ); + _mm_store_si128( (__m128i*)(&v[12]), d0 ); + _mm_store_si128( (__m128i*)(&v[14]), d1 ); + +#else + + // macro assumes v is defined + ROUND_LYRA(0); + +#endif } /** @@ -58,27 +134,39 @@ * * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function */ -__inline static void blake2bLyra(uint64_t *v) { - ROUND_LYRA(0); - ROUND_LYRA(1); - ROUND_LYRA(2); - ROUND_LYRA(3); - ROUND_LYRA(4); - ROUND_LYRA(5); - ROUND_LYRA(6); - ROUND_LYRA(7); - ROUND_LYRA(8); - ROUND_LYRA(9); - ROUND_LYRA(10); - ROUND_LYRA(11); +#ifdef _MSC_VER +__forceinline +#else +__inline +#endif +static void blake2bLyra(uint64_t *v) +{ + lyra_round( v ); + lyra_round( v ); + lyra_round( v ); + lyra_round( v ); + lyra_round( v ); + lyra_round( v ); + lyra_round( v ); + lyra_round( v ); + lyra_round( v ); + lyra_round( v ); + lyra_round( v ); + lyra_round( v ); } /** * Executes a reduced version of Blake2b's G function with only one round * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function */ -__inline static void reducedBlake2bLyra(uint64_t *v) { - ROUND_LYRA(0); +#ifdef _MSC_VER +__forceinline +#else +__inline +#endif +static void reducedBlake2bLyra(uint64_t *v) { + + lyra_round( v ); } /** @@ -89,21 +177,24 @@ __inline static void reducedBlake2bLyra(uint64_t *v) { * @param out Array that will receive the data squeezed * @param len The number of bytes to be squeezed into the "out" array */ - void squeeze(uint64_t *state, byte *out, unsigned int len) { - int fullBlocks = len / BLOCK_LEN_BYTES; - byte *ptr = out; - int i; - //Squeezes full blocks - for (i = 0; i < fullBlocks; i++) { - memcpy(ptr, state, BLOCK_LEN_BYTES); - blake2bLyra(state); - ptr += BLOCK_LEN_BYTES; - } - - //Squeezes remaining bytes - memcpy(ptr, state, (len % BLOCK_LEN_BYTES)); +void squeeze(uint64_t *state, byte *out, unsigned int len) +{ + int fullBlocks = len / BLOCK_LEN_BYTES; + byte *ptr = out; + int i; + + //Squeezes full blocks + for (i = 0; i < fullBlocks; i++) { + memcpy(ptr, state, BLOCK_LEN_BYTES); + blake2bLyra(state); + ptr += BLOCK_LEN_BYTES; + } + + //Squeezes remaining bytes + memcpy(ptr, state, (len % BLOCK_LEN_BYTES)); } + /** * Performs an absorb operation for a single block (BLOCK_LEN_INT64 words * of type uint64_t), using Blake2b's G function as the internal permutation @@ -111,23 +202,78 @@ __inline static void reducedBlake2bLyra(uint64_t *v) { * @param state The current state of the sponge * @param in The block to be absorbed (BLOCK_LEN_INT64 words) */ -void absorbBlock(uint64_t *state, const uint64_t *in) { - //XORs the first BLOCK_LEN_INT64 words of "in" with the current state - state[0] ^= in[0]; - state[1] ^= in[1]; - state[2] ^= in[2]; - state[3] ^= in[3]; - state[4] ^= in[4]; - state[5] ^= in[5]; - state[6] ^= in[6]; - state[7] ^= in[7]; - state[8] 
^= in[8]; - state[9] ^= in[9]; - state[10] ^= in[10]; - state[11] ^= in[11]; - - //Applies the transformation f to the sponge's state - blake2bLyra(state); +void absorbBlock(uint64_t *state, const uint64_t *in) +{ +//XORs the first BLOCK_LEN_INT64 words of "in" with the current state +#if defined __AVX2__ + + __m256i state_v[2], in_v[2]; + + state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + in_v [0] = _mm256_loadu_si256( (__m256i*)(&in[0]) ); + state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); + in_v [1] = _mm256_loadu_si256( (__m256i*)(&in[4]) ); + state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + in_v [2] = _mm256_loadu_si256( (__m256i*)(&in[8]) ); + + _mm256_store_si256( (__m256i*)&state[0], + _mm256_xor_si256( state_v[0], in_v[0] ) ); + _mm256_store_si256( (__m256i*)&state[4], + _mm256_xor_si256( state_v[1], in_v[1] ) ); + _mm256_store_si256( (__m256i*)&state[8], + _mm256_xor_si256( state_v[2], in_v[2] ) ); + +#elif defined __AVX__ + + __m128i state_v[4], in_v[4]; + + state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); + state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); + state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); + state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); + state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); + state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); + + in_v[0] = _mm_loadu_si128( (__m128i*)(&in[0]) ); + in_v[1] = _mm_loadu_si128( (__m128i*)(&in[2]) ); + in_v[2] = _mm_loadu_si128( (__m128i*)(&in[4]) ); + in_v[3] = _mm_loadu_si128( (__m128i*)(&in[6]) ); + in_v[4] = _mm_loadu_si128( (__m128i*)(&in[8]) ); + in_v[5] = _mm_loadu_si128( (__m128i*)(&in[10]) ); + + _mm_store_si128( (__m128i*)(&state[0]), + _mm_xor_si128( state_v[0], in_v[0] ) ); + _mm_store_si128( (__m128i*)(&state[2]), + _mm_xor_si128( state_v[1], in_v[1] ) ); + _mm_store_si128( (__m128i*)(&state[4]), + _mm_xor_si128( state_v[2], in_v[2] ) ); + _mm_store_si128( (__m128i*)(&state[6]), + _mm_xor_si128( state_v[3], in_v[3] ) ); + _mm_store_si128( (__m128i*)(&state[8]), + _mm_xor_si128( state_v[4], in_v[4] ) ); + _mm_store_si128( (__m128i*)(&state[10]), + _mm_xor_si128( state_v[5], in_v[5] ) ); + +#else + + state[0] ^= in[0]; + state[1] ^= in[1]; + state[2] ^= in[2]; + state[3] ^= in[3]; + state[4] ^= in[4]; + state[5] ^= in[5]; + state[6] ^= in[6]; + state[7] ^= in[7]; + state[8] ^= in[8]; + state[9] ^= in[9]; + state[10] ^= in[10]; + state[11] ^= in[11]; + +#endif + +//Applies the transformation f to the sponge's state +blake2bLyra(state); + } /** @@ -137,25 +283,63 @@ void absorbBlock(uint64_t *state, const uint64_t *in) { * @param state The current state of the sponge * @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words) */ -void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { - //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state - state[0] ^= in[0]; - state[1] ^= in[1]; - state[2] ^= in[2]; - state[3] ^= in[3]; - state[4] ^= in[4]; - state[5] ^= in[5]; - state[6] ^= in[6]; - state[7] ^= in[7]; - - //Applies the transformation f to the sponge's state - blake2bLyra(state); -/* - for(int i = 0; i<16; i++) { - printf(" final state %d %08x %08x in %08x %08x\n", i, (uint32_t)(state[i] & 0xFFFFFFFFULL), (uint32_t)(state[i] >> 32), - (uint32_t)(in[i] & 0xFFFFFFFFULL), (uint32_t)(in[i] >> 32)); - } -*/ +void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) +{ + +//XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state +#if defined __AVX2__ + + __m256i state_v[2], 
in_v[2]; + + state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + in_v [0] = _mm256_loadu_si256( (__m256i*)(&in[0]) ); + state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); + in_v [1] = _mm256_loadu_si256( (__m256i*)(&in[4]) ); + + _mm256_store_si256( (__m256i*)(&state[0]), + _mm256_xor_si256( state_v[0], in_v[0] ) ); + _mm256_store_si256( (__m256i*)(&state[4]), + _mm256_xor_si256( state_v[1], in_v[1] ) ); + +#elif defined __AVX__ + + __m128i state_v[4], in_v[4]; + + state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); + state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); + state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); + state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); + + in_v[0] = _mm_loadu_si128( (__m128i*)(&in[0]) ); + in_v[1] = _mm_loadu_si128( (__m128i*)(&in[2]) ); + in_v[2] = _mm_loadu_si128( (__m128i*)(&in[4]) ); + in_v[3] = _mm_loadu_si128( (__m128i*)(&in[6]) ); + + _mm_store_si128( (__m128i*)(&state[0]), + _mm_xor_si128( state_v[0], in_v[0] ) ); + _mm_store_si128( (__m128i*)(&state[2]), + _mm_xor_si128( state_v[1], in_v[1] ) ); + _mm_store_si128( (__m128i*)(&state[4]), + _mm_xor_si128( state_v[2], in_v[2] ) ); + _mm_store_si128( (__m128i*)(&state[6]), + _mm_xor_si128( state_v[3], in_v[3] ) ); + +#else + + state[0] ^= in[0]; + state[1] ^= in[1]; + state[2] ^= in[2]; + state[3] ^= in[3]; + state[4] ^= in[4]; + state[5] ^= in[5]; + state[6] ^= in[6]; + state[7] ^= in[7]; + +#endif + +//Applies the transformation f to the sponge's state +blake2bLyra(state); + } /** @@ -166,36 +350,31 @@ void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { * @param state The current state of the sponge * @param rowOut Row to receive the data squeezed */ -void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) { - uint64_t* ptrWord = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] - int i; - //M[row][C-1-col] = H.reduced_squeeze() - for (i = 0; i < N_COLS; i++) { - - ptrWord[0] = state[0]; - ptrWord[1] = state[1]; - ptrWord[2] = state[2]; - ptrWord[3] = state[3]; - ptrWord[4] = state[4]; - ptrWord[5] = state[5]; - ptrWord[6] = state[6]; - ptrWord[7] = state[7]; - ptrWord[8] = state[8]; - ptrWord[9] = state[9]; - ptrWord[10] = state[10]; - ptrWord[11] = state[11]; - /* -for (int i = 0; i<12; i++) { - printf(" after reducedSqueezeRow0 %d %08x %08x in %08x %08x\n", i, (uint32_t)(ptrWord[i] & 0xFFFFFFFFULL), (uint32_t)(ptrWord[i] >> 32), - (uint32_t)(state[i] & 0xFFFFFFFFULL), (uint32_t)(state[i] >> 32)); +void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut, const uint32_t nCols) +{ + uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] + unsigned int i; + //M[row][C-1-col] = H.reduced_squeeze() + for (i = 0; i < nCols; i++) { + ptrWord[0] = state[0]; + ptrWord[1] = state[1]; + ptrWord[2] = state[2]; + ptrWord[3] = state[3]; + ptrWord[4] = state[4]; + ptrWord[5] = state[5]; + ptrWord[6] = state[6]; + ptrWord[7] = state[7]; + ptrWord[8] = state[8]; + ptrWord[9] = state[9]; + ptrWord[10] = state[10]; + ptrWord[11] = state[11]; + + //Goes to next block (column) that will receive the squeezed data + ptrWord -= BLOCK_LEN_INT64; + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); } -*/ - //Goes to next block (column) that will receive the squeezed data - ptrWord -= BLOCK_LEN_INT64; - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - } } /** @@ -207,35 +386,137 @@ for (int i = 0; i<12; i++) { * @param rowIn Row 
to feed the sponge * @param rowOut Row to receive the sponge's output */ - void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) { - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row - int i; - - for (i = 0; i < N_COLS; i++) { - - //Absorbing "M[prev][col]" - state[0] ^= (ptrWordIn[0]); - state[1] ^= (ptrWordIn[1]); - state[2] ^= (ptrWordIn[2]); - state[3] ^= (ptrWordIn[3]); - state[4] ^= (ptrWordIn[4]); - state[5] ^= (ptrWordIn[5]); - state[6] ^= (ptrWordIn[6]); - state[7] ^= (ptrWordIn[7]); - state[8] ^= (ptrWordIn[8]); - state[9] ^= (ptrWordIn[9]); - state[10] ^= (ptrWordIn[10]); - state[11] ^= (ptrWordIn[11]); - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - //M[row][C-1-col] = M[prev][col] XOR rand - ptrWordOut[0] = ptrWordIn[0] ^ state[0]; - ptrWordOut[1] = ptrWordIn[1] ^ state[1]; - ptrWordOut[2] = ptrWordIn[2] ^ state[2]; - ptrWordOut[3] = ptrWordIn[3] ^ state[3]; +void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const uint32_t nCols) +{ + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + unsigned int i; + + for (i = 0; i < nCols; i++) + { + //Absorbing "M[prev][col]" + #if defined __AVX2__ + + __m256i state_v[3], in_v[3]; + + state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); + state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); + in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) ); + state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) ); + + _mm256_store_si256( (__m256i*)(&state[0]), + _mm256_xor_si256( state_v[0], in_v[0] ) ); + _mm256_store_si256( (__m256i*)(&state[4]), + _mm256_xor_si256( state_v[1], in_v[1] ) ); + _mm256_store_si256( (__m256i*)(&state[8]), + _mm256_xor_si256( state_v[2], in_v[2] ) ); + + #elif defined __AVX__ + + __m128i state_v[6], in_v[6]; + + state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); + state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); + state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); + state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); + state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); + state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); + + in_v[0] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[0]) ); + in_v[1] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[2]) ); + in_v[2] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[4]) ); + in_v[3] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[6]) ); + in_v[4] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[8]) ); + in_v[5] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[10]) ); + + _mm_store_si128( (__m128i*)(&state[0]), + _mm_xor_si128( state_v[0], in_v[0] ) ); + _mm_store_si128( (__m128i*)(&state[2]), + _mm_xor_si128( state_v[1], in_v[1] ) ); + _mm_store_si128( (__m128i*)(&state[4]), + _mm_xor_si128( state_v[2], in_v[2] ) ); + _mm_store_si128( (__m128i*)(&state[6]), + _mm_xor_si128( state_v[3], in_v[3] ) ); + _mm_store_si128( (__m128i*)(&state[8]), + _mm_xor_si128( state_v[4], in_v[4] ) ); + _mm_store_si128( (__m128i*)(&state[10]), + _mm_xor_si128( state_v[5], in_v[5] ) ); + + #else + + state[0] ^= (ptrWordIn[0]); + state[1] ^= (ptrWordIn[1]); + state[2] ^= (ptrWordIn[2]); + state[3] ^= (ptrWordIn[3]); + state[4] ^= (ptrWordIn[4]); + state[5] ^= (ptrWordIn[5]); + state[6] ^= 
(ptrWordIn[6]); + state[7] ^= (ptrWordIn[7]); + state[8] ^= (ptrWordIn[8]); + state[9] ^= (ptrWordIn[9]); + state[10] ^= (ptrWordIn[10]); + state[11] ^= (ptrWordIn[11]); + + #endif + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][C-1-col] = M[prev][col] XOR rand + #if defined __AVX2__ +// in_v should not need to be reloaded, but it does and it segfaults if +// loading alogned + state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); + state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); + in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) ); + state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) ); + + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), + _mm256_xor_si256( state_v[0], in_v[0] ) ); + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), + _mm256_xor_si256( state_v[1], in_v[1] ) ); + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), + _mm256_xor_si256( state_v[2], in_v[2] ) ); + + #elif defined __AVX__ + + state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); + state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); + state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); + state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); + state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); + state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); + + in_v[0] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[0]) ); + in_v[1] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[2]) ); + in_v[2] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[4]) ); + in_v[3] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[6]) ); + in_v[4] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[8]) ); + in_v[5] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[10]) ); + + _mm_storeu_si128( (__m128i*)(&ptrWordOut[0]), + _mm_xor_si128( state_v[0], in_v[0] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[2]), + _mm_xor_si128( state_v[1], in_v[1] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[4]), + _mm_xor_si128( state_v[2], in_v[2] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[6]), + _mm_xor_si128( state_v[3], in_v[3] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[8]), + _mm_xor_si128( state_v[4], in_v[4] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[10]), + _mm_xor_si128( state_v[5], in_v[5] ) ); + + #else + + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; ptrWordOut[4] = ptrWordIn[4] ^ state[4]; ptrWordOut[5] = ptrWordIn[5] ^ state[5]; ptrWordOut[6] = ptrWordIn[6] ^ state[6]; @@ -244,13 +525,13 @@ for (int i = 0; i<12; i++) { ptrWordOut[9] = ptrWordIn[9] ^ state[9]; ptrWordOut[10] = ptrWordIn[10] ^ state[10]; ptrWordOut[11] = ptrWordIn[11] ^ state[11]; + #endif - - //Input: next column (i.e., next block in sequence) - ptrWordIn += BLOCK_LEN_INT64; - //Output: goes to previous column - ptrWordOut -= BLOCK_LEN_INT64; - } + //Input: next column (i.e., next block in sequence) + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } } /** @@ -267,13 +548,94 @@ for (int i = 0; i<12; i++) { * @param rowOut Row receiving the output * */ - void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordOut = rowOut + 
(N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row - int i; - for (i = 0; i < N_COLS; i++) { - //Absorbing "M[prev] [+] M[row*]" +void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols) +{ + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + unsigned int i; + + for (i = 0; i < nCols; i++) + { + //Absorbing "M[prev] [+] M[row*]" + #if defined __AVX2__ + + __m256i state_v[3], in_v[3], inout_v[3]; + + state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); + inout_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[0]) ); + state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); + in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) ); + inout_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[4]) ); + state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) ); + inout_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[8]) ); + + _mm256_store_si256( (__m256i*)(&state[0]), + _mm256_xor_si256( state_v[0], + _mm256_add_epi64( in_v[0], + inout_v[0] ) ) ); + _mm256_store_si256( (__m256i*)(&state[4]), + _mm256_xor_si256( state_v[1], + _mm256_add_epi64( in_v[1], + inout_v[1] ) ) ); + _mm256_store_si256( (__m256i*)(&state[8]), + _mm256_xor_si256( state_v[2], + _mm256_add_epi64( in_v[2], + inout_v[2] ) ) ); + #elif defined __AVX__ + + __m128i state_v[6], in_v[6], inout_v[6]; + + state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); + state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); + state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); + state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); + state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); + state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); + + inout_v[0] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[0]) ); + inout_v[1] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[2]) ); + inout_v[2] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[4]) ); + inout_v[3] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[6]) ); + inout_v[4] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[8]) ); + inout_v[5] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[10]) ); + + in_v[0] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[0]) ); + in_v[1] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[2]) ); + in_v[2] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[4]) ); + in_v[3] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[6]) ); + in_v[4] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[8]) ); + in_v[5] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[10]) ); + + _mm_store_si128( (__m128i*)(&state[0]), + _mm_xor_si128( state_v[0], + _mm_add_epi64( in_v[0], + inout_v[0] ) ) ); + _mm_store_si128( (__m128i*)(&state[2]), + _mm_xor_si128( state_v[1], + _mm_add_epi64( in_v[1], + inout_v[1] ) ) ); + _mm_store_si128( (__m128i*)(&state[4]), + _mm_xor_si128( state_v[2], + _mm_add_epi64( in_v[2], + inout_v[2] ) ) ); + _mm_store_si128( (__m128i*)(&state[6]), + _mm_xor_si128( state_v[3], + _mm_add_epi64( in_v[3], + inout_v[3] ) ) ); + _mm_store_si128( (__m128i*)(&state[8]), + _mm_xor_si128( state_v[4], + _mm_add_epi64( in_v[4], + inout_v[4] ) ) ); + _mm_store_si128( (__m128i*)(&state[10]), + _mm_xor_si128( state_v[5], + _mm_add_epi64( in_v[5], + inout_v[5] ) ) ); + + #else + state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); state[2] ^= (ptrWordIn[2] 
+ ptrWordInOut[2]); @@ -286,44 +648,93 @@ for (int i = 0; i<12; i++) { state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - //M[row][col] = M[prev][col] XOR rand - ptrWordOut[0] = ptrWordIn[0] ^ state[0]; - ptrWordOut[1] = ptrWordIn[1] ^ state[1]; - ptrWordOut[2] = ptrWordIn[2] ^ state[2]; - ptrWordOut[3] = ptrWordIn[3] ^ state[3]; - ptrWordOut[4] = ptrWordIn[4] ^ state[4]; - ptrWordOut[5] = ptrWordIn[5] ^ state[5]; - ptrWordOut[6] = ptrWordIn[6] ^ state[6]; - ptrWordOut[7] = ptrWordIn[7] ^ state[7]; - ptrWordOut[8] = ptrWordIn[8] ^ state[8]; - ptrWordOut[9] = ptrWordIn[9] ^ state[9]; - ptrWordOut[10] = ptrWordIn[10] ^ state[10]; - ptrWordOut[11] = ptrWordIn[11] ^ state[11]; - - //M[row*][col] = M[row*][col] XOR rotW(rand) - ptrWordInOut[0] ^= state[11]; - ptrWordInOut[1] ^= state[0]; - ptrWordInOut[2] ^= state[1]; - ptrWordInOut[3] ^= state[2]; - ptrWordInOut[4] ^= state[3]; - ptrWordInOut[5] ^= state[4]; - ptrWordInOut[6] ^= state[5]; - ptrWordInOut[7] ^= state[6]; - ptrWordInOut[8] ^= state[7]; - ptrWordInOut[9] ^= state[8]; - ptrWordInOut[10] ^= state[9]; - ptrWordInOut[11] ^= state[10]; - - //Inputs: next column (i.e., next block in sequence) - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - //Output: goes to previous column - ptrWordOut -= BLOCK_LEN_INT64; - } + #endif + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][col] = M[prev][col] XOR rand + #if defined __AVX2__ + + state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); + state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); + in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) ); + state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) ); + + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), + _mm256_xor_si256( state_v[0], in_v[0] ) ); + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), + _mm256_xor_si256( state_v[1], in_v[1] ) ); + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), + _mm256_xor_si256( state_v[2], in_v[2] ) ); + + #elif defined __AVX__ + + state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); + state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); + state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); + state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); + state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); + state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); + + in_v[0] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[0]) ); + in_v[1] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[2]) ); + in_v[2] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[4]) ); + in_v[3] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[6]) ); + in_v[4] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[8]) ); + in_v[5] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[10]) ); + + _mm_storeu_si128( (__m128i*)(&ptrWordOut[0]), + _mm_xor_si128( state_v[0], in_v[0] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[2]), + _mm_xor_si128( state_v[1], in_v[1] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[4]), + _mm_xor_si128( state_v[2], in_v[2] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[6]), + _mm_xor_si128( state_v[3], in_v[3] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[8]), + _mm_xor_si128( state_v[4], in_v[4] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[10]), + 
_mm_xor_si128( state_v[5], in_v[5] ) ); + + #else + + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; + ptrWordOut[4] = ptrWordIn[4] ^ state[4]; + ptrWordOut[5] = ptrWordIn[5] ^ state[5]; + ptrWordOut[6] = ptrWordIn[6] ^ state[6]; + ptrWordOut[7] = ptrWordIn[7] ^ state[7]; + ptrWordOut[8] = ptrWordIn[8] ^ state[8]; + ptrWordOut[9] = ptrWordIn[9] ^ state[9]; + ptrWordOut[10] = ptrWordIn[10] ^ state[10]; + ptrWordOut[11] = ptrWordIn[11] ^ state[11]; + #endif + + //M[row*][col] = M[row*][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Inputs: next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } } /** @@ -340,410 +751,203 @@ for (int i = 0; i<12; i++) { * @param rowOut Row receiving the output * */ -void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row - int i; - - for (i = 0; i < N_COLS; i++) { - - //Absorbing "M[prev] [+] M[row*]" - state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); - state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); - state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); - state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); - state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); - state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); - state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); - state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); - state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); - state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); - state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); - state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - //M[rowOut][col] = M[rowOut][col] XOR rand - ptrWordOut[0] ^= state[0]; - ptrWordOut[1] ^= state[1]; - ptrWordOut[2] ^= state[2]; - ptrWordOut[3] ^= state[3]; - ptrWordOut[4] ^= state[4]; - ptrWordOut[5] ^= state[5]; - ptrWordOut[6] ^= state[6]; - ptrWordOut[7] ^= state[7]; - ptrWordOut[8] ^= state[8]; - ptrWordOut[9] ^= state[9]; - ptrWordOut[10] ^= state[10]; - ptrWordOut[11] ^= state[11]; - - //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) - ptrWordInOut[0] ^= state[11]; - ptrWordInOut[1] ^= state[0]; - ptrWordInOut[2] ^= state[1]; - ptrWordInOut[3] ^= state[2]; - ptrWordInOut[4] ^= state[3]; - ptrWordInOut[5] ^= state[4]; - ptrWordInOut[6] ^= state[5]; - ptrWordInOut[7] ^= state[6]; - ptrWordInOut[8] ^= state[7]; - ptrWordInOut[9] ^= state[8]; - ptrWordInOut[10] ^= state[9]; - ptrWordInOut[11] ^= state[10]; - - //Goes to next block - ptrWordOut += BLOCK_LEN_INT64; - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - } -} - - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -/** - * Performs a duplex operation over "M[rowInOut] [+] M[rowIn]", writing 
the output "rand" - * on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit - * rotation to the left. - * - * @param state The current state of the sponge - * @param rowIn Row used only as input - * @param rowInOut Row used as input and to receive output after rotation - * @param rowOut Row receiving the output - * - */ -/* -inline void reducedDuplexRowSetupOLD(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row - int i; - for (i = 0; i < N_COLS; i++) { - - //Absorbing "M[rowInOut] XOR M[rowIn]" - state[0] ^= ptrWordInOut[0] ^ ptrWordIn[0]; - state[1] ^= ptrWordInOut[1] ^ ptrWordIn[1]; - state[2] ^= ptrWordInOut[2] ^ ptrWordIn[2]; - state[3] ^= ptrWordInOut[3] ^ ptrWordIn[3]; - state[4] ^= ptrWordInOut[4] ^ ptrWordIn[4]; - state[5] ^= ptrWordInOut[5] ^ ptrWordIn[5]; - state[6] ^= ptrWordInOut[6] ^ ptrWordIn[6]; - state[7] ^= ptrWordInOut[7] ^ ptrWordIn[7]; - state[8] ^= ptrWordInOut[8] ^ ptrWordIn[8]; - state[9] ^= ptrWordInOut[9] ^ ptrWordIn[9]; - state[10] ^= ptrWordInOut[10] ^ ptrWordIn[10]; - state[11] ^= ptrWordInOut[11] ^ ptrWordIn[11]; - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - //M[row][col] = rand - ptrWordOut[0] = state[0]; - ptrWordOut[1] = state[1]; - ptrWordOut[2] = state[2]; - ptrWordOut[3] = state[3]; - ptrWordOut[4] = state[4]; - ptrWordOut[5] = state[5]; - ptrWordOut[6] = state[6]; - ptrWordOut[7] = state[7]; - ptrWordOut[8] = state[8]; - ptrWordOut[9] = state[9]; - ptrWordOut[10] = state[10]; - ptrWordOut[11] = state[11]; - - - //M[row*][col] = M[row*][col] XOR rotW(rand) - ptrWordInOut[0] ^= state[10]; - ptrWordInOut[1] ^= state[11]; - ptrWordInOut[2] ^= state[0]; - ptrWordInOut[3] ^= state[1]; - ptrWordInOut[4] ^= state[2]; - ptrWordInOut[5] ^= state[3]; - ptrWordInOut[6] ^= state[4]; - ptrWordInOut[7] ^= state[5]; - ptrWordInOut[8] ^= state[6]; - ptrWordInOut[9] ^= state[7]; - ptrWordInOut[10] ^= state[8]; - ptrWordInOut[11] ^= state[9]; - - //Goes to next column (i.e., next block in sequence) - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - ptrWordOut += BLOCK_LEN_INT64; - } -} -*/ - -/** - * Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", writing the output "rand" - * on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit - * rotation to the left. 
- * - * @param state The current state of the sponge - * @param rowIn Row used only as input - * @param rowInOut Row used as input and to receive output after rotation - * @param rowOut Row receiving the output - * - */ -/* -inline void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row - int i; - for (i = 0; i < N_COLS; i++) { - - //Absorbing "M[rowInOut] XOR M[rowIn]" - state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; - state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; - state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; - state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; - state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; - state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; - state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; - state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; - state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; - state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; - state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; - state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - - //M[row*][col] = M[row*][col] XOR rotW(rand) - ptrWordInOut[0] ^= state[10]; - ptrWordInOut[1] ^= state[11]; - ptrWordInOut[2] ^= state[0]; - ptrWordInOut[3] ^= state[1]; - ptrWordInOut[4] ^= state[2]; - ptrWordInOut[5] ^= state[3]; - ptrWordInOut[6] ^= state[4]; - ptrWordInOut[7] ^= state[5]; - ptrWordInOut[8] ^= state[6]; - ptrWordInOut[9] ^= state[7]; - ptrWordInOut[10] ^= state[8]; - ptrWordInOut[11] ^= state[9]; - - - //M[row][col] = rand - ptrWordOut[0] = state[0] ^ ptrWordIn[0]; - ptrWordOut[1] = state[1] ^ ptrWordIn[1]; - ptrWordOut[2] = state[2] ^ ptrWordIn[2]; - ptrWordOut[3] = state[3] ^ ptrWordIn[3]; - ptrWordOut[4] = state[4] ^ ptrWordIn[4]; - ptrWordOut[5] = state[5] ^ ptrWordIn[5]; - ptrWordOut[6] = state[6] ^ ptrWordIn[6]; - ptrWordOut[7] = state[7] ^ ptrWordIn[7]; - ptrWordOut[8] = state[8] ^ ptrWordIn[8]; - ptrWordOut[9] = state[9] ^ ptrWordIn[9]; - ptrWordOut[10] = state[10] ^ ptrWordIn[10]; - ptrWordOut[11] = state[11] ^ ptrWordIn[11]; - - //Goes to next column (i.e., next block in sequence) - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - ptrWordOut += BLOCK_LEN_INT64; - } -} -*/ - -/** - * Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", writing the output "rand" - * on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit - * rotation to the left. 
- * - * @param state The current state of the sponge - * @param rowIn Row used only as input - * @param rowInOut Row used as input and to receive output after rotation - * @param rowOut Row receiving the output - * - */ -/* -inline void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordOut = rowOut; - int i; - - for (i = 0; i < N_COLS / 2; i++) { - //Absorbing "M[rowInOut] XOR M[rowIn]" - state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; - state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; - state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; - state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; - state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; - state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; - state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; - state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; - state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; - state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; - state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; - state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - - //M[row*][col] = M[row*][col] XOR rotW(rand) - ptrWordInOut[0] ^= state[10]; - ptrWordInOut[1] ^= state[11]; - ptrWordInOut[2] ^= state[0]; - ptrWordInOut[3] ^= state[1]; - ptrWordInOut[4] ^= state[2]; - ptrWordInOut[5] ^= state[3]; - ptrWordInOut[6] ^= state[4]; - ptrWordInOut[7] ^= state[5]; - ptrWordInOut[8] ^= state[6]; - ptrWordInOut[9] ^= state[7]; - ptrWordInOut[10] ^= state[8]; - ptrWordInOut[11] ^= state[9]; - - - //M[row][col] = rand - ptrWordOut[0] = state[0] ^ ptrWordIn[0]; - ptrWordOut[1] = state[1] ^ ptrWordIn[1]; - ptrWordOut[2] = state[2] ^ ptrWordIn[2]; - ptrWordOut[3] = state[3] ^ ptrWordIn[3]; - ptrWordOut[4] = state[4] ^ ptrWordIn[4]; - ptrWordOut[5] = state[5] ^ ptrWordIn[5]; - ptrWordOut[6] = state[6] ^ ptrWordIn[6]; - ptrWordOut[7] = state[7] ^ ptrWordIn[7]; - ptrWordOut[8] = state[8] ^ ptrWordIn[8]; - ptrWordOut[9] = state[9] ^ ptrWordIn[9]; - ptrWordOut[10] = state[10] ^ ptrWordIn[10]; - ptrWordOut[11] = state[11] ^ ptrWordIn[11]; - - //Goes to next column (i.e., next block in sequence) - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - ptrWordOut += 2 * BLOCK_LEN_INT64; - } - - ptrWordOut = rowOut + BLOCK_LEN_INT64; - for (i = 0; i < N_COLS / 2; i++) { - //Absorbing "M[rowInOut] XOR M[rowIn]" - state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; - state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; - state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; - state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; - state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; - state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; - state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; - state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; - state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; - state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; - state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; - state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - - //M[row*][col] = M[row*][col] XOR rotW(rand) - ptrWordInOut[0] ^= state[10]; - ptrWordInOut[1] ^= state[11]; - ptrWordInOut[2] ^= state[0]; - ptrWordInOut[3] ^= state[1]; - ptrWordInOut[4] ^= state[2]; - ptrWordInOut[5] ^= state[3]; - ptrWordInOut[6] ^= state[4]; - ptrWordInOut[7] ^= state[5]; - ptrWordInOut[8] ^= state[6]; - ptrWordInOut[9] ^= state[7]; - ptrWordInOut[10] ^= state[8]; 
- ptrWordInOut[11] ^= state[9]; - - - //M[row][col] = rand - ptrWordOut[0] = state[0] ^ ptrWordIn[0]; - ptrWordOut[1] = state[1] ^ ptrWordIn[1]; - ptrWordOut[2] = state[2] ^ ptrWordIn[2]; - ptrWordOut[3] = state[3] ^ ptrWordIn[3]; - ptrWordOut[4] = state[4] ^ ptrWordIn[4]; - ptrWordOut[5] = state[5] ^ ptrWordIn[5]; - ptrWordOut[6] = state[6] ^ ptrWordIn[6]; - ptrWordOut[7] = state[7] ^ ptrWordIn[7]; - ptrWordOut[8] = state[8] ^ ptrWordIn[8]; - ptrWordOut[9] = state[9] ^ ptrWordIn[9]; - ptrWordOut[10] = state[10] ^ ptrWordIn[10]; - ptrWordOut[11] = state[11] ^ ptrWordIn[11]; - - //Goes to next column (i.e., next block in sequence) - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - ptrWordOut += 2 * BLOCK_LEN_INT64; - } -} -*/ - -/** - * Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", using the output "rand" - * to make "M[rowOut][col] = M[rowOut][col] XOR rand" and "M[rowInOut] = M[rowInOut] XOR rotW(rand)", - * where rotW is a 64-bit rotation to the left. - * - * @param state The current state of the sponge - * @param rowIn Row used only as input - * @param rowInOut Row used as input and to receive output after rotation - * @param rowOut Row receiving the output - * - */ -/* -inline void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row - int i; - for (i = 0; i < N_COLS; i++) { - - //Absorbing "M[rowInOut] XOR M[rowIn]" - state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; - state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; - state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; - state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; - state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; - state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; - state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; - state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; - state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; - state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; - state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; - state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - //M[rowOut][col] = M[rowOut][col] XOR rand - ptrWordOut[0] ^= state[0]; - ptrWordOut[1] ^= state[1]; - ptrWordOut[2] ^= state[2]; - ptrWordOut[3] ^= state[3]; - ptrWordOut[4] ^= state[4]; - ptrWordOut[5] ^= state[5]; - ptrWordOut[6] ^= state[6]; - ptrWordOut[7] ^= state[7]; - ptrWordOut[8] ^= state[8]; - ptrWordOut[9] ^= state[9]; - ptrWordOut[10] ^= state[10]; - ptrWordOut[11] ^= state[11]; - - //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) - - - //Goes to next block - ptrWordOut += BLOCK_LEN_INT64; - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - } +void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols) +{ + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row + unsigned int i; + + for (i = 0; i < nCols; i++) + { + + //Absorbing "M[prev] [+] M[row*]" + #if defined __AVX2__ + + __m256i state_v[3], in_v[3], inout_v[3]; + #define out_v in_v // reuse register in next code block + + state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); + inout_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[0]) 
); + state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); + in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) ); + inout_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[4]) ); + state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) ); + inout_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[8]) ); + + _mm256_store_si256( (__m256i*)(&state[0]), + _mm256_xor_si256( state_v[0], + _mm256_add_epi64( in_v[0], + inout_v[0] ) ) ); + _mm256_store_si256( (__m256i*)(&state[4]), + _mm256_xor_si256( state_v[1], + _mm256_add_epi64( in_v[1], + inout_v[1] ) ) ); + _mm256_store_si256( (__m256i*)(&state[8]), + _mm256_xor_si256( state_v[2], + _mm256_add_epi64( in_v[2], + inout_v[2] ) ) ); + #elif defined __AVX__ + + __m128i state_v[6], in_v[6], inout_v[6]; + #define out_v in_v // reuse register in next code block + + state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); + state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); + state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); + state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); + state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); + state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); + + inout_v[0] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[0]) ); + inout_v[1] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[2]) ); + inout_v[2] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[4]) ); + inout_v[3] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[6]) ); + inout_v[4] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[8]) ); + inout_v[5] = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[10]) ); + + in_v[0] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[0]) ); + in_v[1] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[2]) ); + in_v[2] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[4]) ); + in_v[3] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[6]) ); + in_v[4] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[8]) ); + in_v[5] = _mm_loadu_si128( (__m128i*)(&ptrWordIn[10]) ); + + _mm_store_si128( (__m128i*)(&state[0]), + _mm_xor_si128( state_v[0], + _mm_add_epi64( in_v[0], + inout_v[0] ) ) ); + _mm_store_si128( (__m128i*)(&state[2]), + _mm_xor_si128( state_v[1], + _mm_add_epi64( in_v[1], + inout_v[1] ) ) ); + _mm_store_si128( (__m128i*)(&state[4]), + _mm_xor_si128( state_v[2], + _mm_add_epi64( in_v[2], + inout_v[2] ) ) ); + _mm_store_si128( (__m128i*)(&state[6]), + _mm_xor_si128( state_v[3], + _mm_add_epi64( in_v[3], + inout_v[3] ) ) ); + _mm_store_si128( (__m128i*)(&state[8]), + _mm_xor_si128( state_v[4], + _mm_add_epi64( in_v[4], + inout_v[4] ) ) ); + _mm_store_si128( (__m128i*)(&state[10]), + _mm_xor_si128( state_v[5], + _mm_add_epi64( in_v[5], + inout_v[5] ) ) ); + + #else + + state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); + state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); + state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); + state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); + state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); + state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); + state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); + state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); + state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); + state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); + state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); + state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); + #endif + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[rowOut][col] = M[rowOut][col] XOR rand + #if defined __AVX2__ + + state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + out_v [0] = _mm256_loadu_si256( 
(__m256i*)(&ptrWordOut[0]) ); + state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); + out_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[4]) ); + state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + out_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[8]) ); + + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), + _mm256_xor_si256( state_v[0], out_v[0] ) ); + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), + _mm256_xor_si256( state_v[1], out_v[1] ) ); + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), + _mm256_xor_si256( state_v[2], out_v[2] ) ); + + #elif defined __AVX__ + + state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); + state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); + state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); + state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) ); + state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) ); + state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) ); + + out_v[0] = _mm_loadu_si128( (__m128i*)(&ptrWordOut[0]) ); + out_v[1] = _mm_loadu_si128( (__m128i*)(&ptrWordOut[2]) ); + out_v[2] = _mm_loadu_si128( (__m128i*)(&ptrWordOut[4]) ); + out_v[3] = _mm_loadu_si128( (__m128i*)(&ptrWordOut[6]) ); + out_v[4] = _mm_loadu_si128( (__m128i*)(&ptrWordOut[8]) ); + out_v[5] = _mm_loadu_si128( (__m128i*)(&ptrWordOut[10]) ); + + _mm_storeu_si128( (__m128i*)(&ptrWordOut[0]), + _mm_xor_si128( state_v[0], out_v[0] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[2]), + _mm_xor_si128( state_v[1], out_v[1] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[4]), + _mm_xor_si128( state_v[2], out_v[2] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[6]), + _mm_xor_si128( state_v[3], out_v[3] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[8]), + _mm_xor_si128( state_v[4], out_v[4] ) ); + _mm_storeu_si128( (__m128i*)(&ptrWordOut[10]), + _mm_xor_si128( state_v[5], out_v[5] ) ); + + #else + + ptrWordOut[0] ^= state[0]; + ptrWordOut[1] ^= state[1]; + ptrWordOut[2] ^= state[2]; + ptrWordOut[3] ^= state[3]; + ptrWordOut[4] ^= state[4]; + ptrWordOut[5] ^= state[5]; + ptrWordOut[6] ^= state[6]; + ptrWordOut[7] ^= state[7]; + ptrWordOut[8] ^= state[8]; + ptrWordOut[9] ^= state[9]; + ptrWordOut[10] ^= state[10]; + ptrWordOut[11] ^= state[11]; + + #endif + + //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Goes to next block + ptrWordOut += BLOCK_LEN_INT64; + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + } } -*/ /** - Prints an array of unsigned chars + * Prints an array of unsigned chars */ -void printArray(unsigned char *array, unsigned int size, char *name) { +void printArray(unsigned char *array, unsigned int size, char *name) +{ unsigned int i; printf("%s: ", name); for (i = 0; i < size; i++) { diff --git a/lyra2/Sponge.h b/lyra2/Sponge.h index 9bd8ed664..d151ef837 100644 --- a/lyra2/Sponge.h +++ b/lyra2/Sponge.h @@ -24,53 +24,129 @@ #include -#if defined(__GNUC__) -#define ALIGN __attribute__ ((aligned(32))) -#elif defined(_MSC_VER) -#define ALIGN __declspec(align(32)) -#else -#define ALIGN -#endif - - -/*Blake2b IV Array*/ +/* Blake2b IV Array */ static const uint64_t blake2b_IV[8] = { - 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, - 
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, - 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, - 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL }; -/*Blake2b's rotation*/ -static __inline uint64_t rotr64( const uint64_t w, const unsigned c ){ - return ( w >> c ) | ( w << ( 64 - c ) ); +/* Blake2b's rotation */ +static +#ifdef _MSC_VER +__forceinline +#else +__inline +#endif +uint64_t rotr64(const uint64_t w, const unsigned c) { + return ( w >> c ) | ( w << ( 64 - c ) ); } -/*Blake2b's G function*/ -#define G(r,i,a,b,c,d) \ - do { \ - a = a + b; \ - d = rotr64(d ^ a, 32); \ - c = c + d; \ - b = rotr64(b ^ c, 24); \ - a = a + b; \ - d = rotr64(d ^ a, 16); \ - c = c + d; \ - b = rotr64(b ^ c, 63); \ +#if defined __AVX2__ + +// _m256i +#define mm256_rotr_64(w,c) _mm256_or_si256(_mm256_srli_epi64(w, c), \ + _mm256_slli_epi64(w, 64 - c)) + +// Rotate uint64 by one uint64 +// __m256i +#define mm256_rotl256_1x64(s) _mm256_permute4x64_epi64( s, 0x39 ) +#define mm256_rotr256_1x64(s) _mm256_permute4x64_epi64( s, 0x93 ) + +// swap hi and lo 128 bits in 256 bit vector +// _m256i +#define mm256_swap128(s) _mm256_permute2f128_si256( s, s, 1 ) + +// void +#define G_4X64(a,b,c,d) \ + a = _mm256_add_epi64( a, b ); \ + d = mm256_rotr_64( _mm256_xor_si256( d, a), 32 ); \ + c = _mm256_add_epi64( c, d ); \ + b = mm256_rotr_64( _mm256_xor_si256( b, c ), 24 ); \ + a = _mm256_add_epi64( a, b ); \ + d = mm256_rotr_64( _mm256_xor_si256( d, a ), 16 ); \ + c = _mm256_add_epi64( c, d ); \ + b = mm256_rotr_64( _mm256_xor_si256( b, c ), 63 ); + +#elif defined __AVX__ + +// _m128i +#define mm_rotr_64(w,c) _mm_or_si128(_mm_srli_epi64(w, c), \ + _mm_slli_epi64(w, 64 - c)) + +// swap 128 bit source vectors +// void +#define mm128_swap128(s0, s1) s0 = _mm_xor_si128(s0, s1); \ + s1 = _mm_xor_si128(s0, s1); \ + s0 = _mm_xor_si128(s0, s1); + +// swap uint64 in source vector +// __m128i +#define mm128_swap64(s) _mm_or_si128( _mm_slli_si128( s, 8 ), \ + _mm_srli_si128( s, 8 ) ) + +// rotate 2 128 bit vectors as one 256 vector by 1 uint64 +//void +#define mm128_rotl256_1x64(s0, s1) do { \ + __m128i t; \ + s0 = mm128_swap64( s0); \ + s1 = mm128_swap64( s1); \ + t = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \ + _mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \ + s1 = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \ + _mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \ + s0 = t; \ +} while(0) + +#define mm128_rotr256_1x64(s0, s1) do { \ + __m128i t; \ + s0 = mm128_swap64( s0); \ + s1 = mm128_swap64( s1); \ + t = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \ + _mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \ + s1 = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \ + _mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \ + s0 = t; \ +} while(0) + + +#define G_2X64(a,b,c,d) \ + a = _mm_add_epi64( a, b ); \ + d = mm_rotr_64( _mm_xor_si128( d, a), 32 ); \ + c = _mm_add_epi64( c, d ); \ + b = mm_rotr_64( _mm_xor_si128( b, c ), 24 ); \ + a = _mm_add_epi64( a, b ); \ + d = mm_rotr_64( _mm_xor_si128( d, a ), 16 ); \ + c = _mm_add_epi64( c, d ); \ + b = mm_rotr_64( _mm_xor_si128( b, c ), 63 ); + +#endif // AVX2 + +/* Blake2b's G function */ +#define 
G(r,i,a,b,c,d) do { \ + a = a + b; \ + d = rotr64(d ^ a, 32); \ + c = c + d; \ + b = rotr64(b ^ c, 24); \ + a = a + b; \ + d = rotr64(d ^ a, 16); \ + c = c + d; \ + b = rotr64(b ^ c, 63); \ } while(0) /*One Round of the Blake2b's compression function*/ -#define ROUND_LYRA(r) \ - G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ - G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ - G(r,2,v[ 2],v[ 6],v[10],v[14]); \ - G(r,3,v[ 3],v[ 7],v[11],v[15]); \ - G(r,4,v[ 0],v[ 5],v[10],v[15]); \ - G(r,5,v[ 1],v[ 6],v[11],v[12]); \ - G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ - G(r,7,v[ 3],v[ 4],v[ 9],v[14]); +#define ROUND_LYRA(r) \ + G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ + G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ + G(r,2,v[ 2],v[ 6],v[10],v[14]); \ + G(r,3,v[ 3],v[ 7],v[11],v[15]); \ + G(r,4,v[ 0],v[ 5],v[10],v[15]); \ + G(r,5,v[ 1],v[ 6],v[11],v[12]); \ + G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ + G(r,7,v[ 3],v[ 4],v[ 9],v[14]); //---- Housekeeping @@ -78,31 +154,18 @@ void initState(uint64_t state[/*16*/]); //---- Squeezes void squeeze(uint64_t *state, unsigned char *out, unsigned int len); -void reducedSqueezeRow0(uint64_t* state, uint64_t* row); +void reducedSqueezeRow0(uint64_t* state, uint64_t* row, const uint32_t nCols); //---- Absorbs void absorbBlock(uint64_t *state, const uint64_t *in); void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in); //---- Duplexes -void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut); -void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); +void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const uint32_t nCols); +void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols); +void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols); //---- Misc void printArray(unsigned char *array, unsigned int size, char *name); -//////////////////////////////////////////////////////////////////////////////////////////////// - - -////TESTS//// -//void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -//void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -//void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2); -//void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -//void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -//void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -///////////// - - #endif /* SPONGE_H_ */ diff --git a/mingw64avx.sh b/mingw64avx.sh index eddd80ae2..8955ce6c5 100644 --- a/mingw64avx.sh +++ b/mingw64avx.sh @@ -21,7 +21,7 @@ windres res/icon.rc icon.o ./configure --build=x86_64-w64-mingw32 --with-crypto=$SSL_PREFIX --with-curl=$CURL_PREFIX \ CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS" LDFLAGS="icon.o" -make +make -j8 strip -p --strip-debug --strip-unneeded cpuminer.exe diff --git a/mingw64avx2.sh b/mingw64avx2.sh index eda8a2651..8b73506ad 100644 --- a/mingw64avx2.sh +++ b/mingw64avx2.sh @@ -21,7 +21,7 @@ windres res/icon.rc icon.o ./configure --build=x86_64-w64-mingw32 --with-crypto=$SSL_PREFIX --with-curl=$CURL_PREFIX \ CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS" LDFLAGS="icon.o" -make +make -j8 strip -p --strip-debug 
--strip-unneeded cpuminer.exe diff --git a/mingw64sse2.sh b/mingw64sse2.sh index c49270b24..eaa19214e 100644 --- a/mingw64sse2.sh +++ b/mingw64sse2.sh @@ -21,7 +21,7 @@ windres res/icon.rc icon.o ./configure --build=x86_64-w64-mingw32 --with-crypto=$SSL_PREFIX --with-curl=$CURL_PREFIX \ CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS" LDFLAGS="icon.o" -make +make -j8 strip -p --strip-debug --strip-unneeded cpuminer.exe
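
For reference, the portable #else path of the new reducedDuplexRow above can be read as the following standalone scalar sketch. It is illustrative only and not part of the patch: BLOCK_LEN_INT64 is assumed to be 12 (the 96-byte rate of the 16-word sponge state), reducedBlake2bLyra is modelled as a single round of Blake2b's compression (one ROUND_LYRA, as in the reference Lyra2 code), and the name reducedDuplexRow_scalar is hypothetical. The rotW feedback is a rotation of the 12-word rate by one 64-bit word, matching the "ptrWordInOut[0] ^= state[11]" pattern in the patch.

/* Standalone scalar sketch of the reduced duplex-row step (C99).
 * Assumptions: BLOCK_LEN_INT64 == 12 and reducedBlake2bLyra == one ROUND_LYRA. */
#include <stdint.h>

#define BLOCK_LEN_INT64 12   /* rate in 64-bit words (assumed) */

static inline uint64_t rotr64(const uint64_t w, const unsigned c) {
    return (w >> c) | (w << (64 - c));
}

/* Blake2b's G function, as defined in Sponge.h (round/index args dropped). */
#define G(a,b,c,d) do {                  \
    a = a + b; d = rotr64(d ^ a, 32);    \
    c = c + d; b = rotr64(b ^ c, 24);    \
    a = a + b; d = rotr64(d ^ a, 16);    \
    c = c + d; b = rotr64(b ^ c, 63);    \
} while(0)

/* One round of Blake2b's compression over the 16-word sponge state
 * (assumed to be what reducedBlake2bLyra applies). */
static void reducedBlake2bLyra(uint64_t *v) {
    G(v[0], v[4], v[ 8], v[12]);
    G(v[1], v[5], v[ 9], v[13]);
    G(v[2], v[6], v[10], v[14]);
    G(v[3], v[7], v[11], v[15]);
    G(v[0], v[5], v[10], v[15]);
    G(v[1], v[6], v[11], v[12]);
    G(v[2], v[7], v[ 8], v[13]);
    G(v[3], v[4], v[ 9], v[14]);
}

/* Absorb M[prev] [+] M[row*], apply one reduced round, XOR the rate into
 * M[row], then feed rotW(rand) (one-word rotation) back into M[row*]. */
void reducedDuplexRow_scalar(uint64_t *state, uint64_t *rowIn,
                             uint64_t *rowInOut, uint64_t *rowOut,
                             const uint32_t nCols) {
    uint64_t *in = rowIn, *io = rowInOut, *out = rowOut;
    for (uint32_t i = 0; i < nCols; i++) {
        for (int j = 0; j < 12; j++)              /* absorb word-wise sum   */
            state[j] ^= in[j] + io[j];
        reducedBlake2bLyra(state);
        for (int j = 0; j < 12; j++)              /* M[rowOut][col] ^= rand */
            out[j] ^= state[j];
        for (int j = 0; j < 12; j++)              /* M[row*][col] ^= rotW(rand) */
            io[j] ^= state[(j + 11) % 12];
        in  += BLOCK_LEN_INT64;
        io  += BLOCK_LEN_INT64;
        out += BLOCK_LEN_INT64;
    }
}

The AVX2 and AVX paths in the patch compute exactly these word-wise additions and XORs, four or two 64-bit words per vector operation, with the sponge state kept 32-byte aligned so the _mm256_load_si256/_mm_load_si128 loads are valid.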