/*
 *
 * Copyright (c) 2010-2011
 * ntldr PGP key ID - 0x1B6A24550F33E44A
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#if defined(USE_AVX) && !defined(__INTEL_COMPILER)
 #error Please use Intel C++ Compiler
#endif

#include <intrin.h>
#include "serpent.h"
#include "xts_fast.h"

#ifdef USE_AVX
 #include <immintrin.h>
 #include "xts_serpent_avx.h"
#else
 #include <emmintrin.h>
 #include "xts_serpent_sse2.h"
#endif

// transpose four 128-bit blocks into bitsliced form: afterwards each
// of _B0.._B3 holds word i of all four blocks
#define transpose(_B0, _B1, _B2, _B3) { \
    __m128i _T0 = _mm_unpacklo_epi32(_B0, _B1); \
    __m128i _T1 = _mm_unpacklo_epi32(_B2, _B3); \
    __m128i _T2 = _mm_unpackhi_epi32(_B0, _B1); \
    __m128i _T3 = _mm_unpackhi_epi32(_B2, _B3); \
    _B0 = _mm_unpacklo_epi64(_T0, _T1); \
    _B1 = _mm_unpackhi_epi64(_T0, _T1); \
    _B2 = _mm_unpacklo_epi64(_T2, _T3); \
    _B3 = _mm_unpackhi_epi64(_T2, _T3); \
}

// mix round key "round" into the four bitsliced words
#define KXf(_B0, _B1, _B2, _B3, _ctx, round) \
    _B0 = _mm_xor_si128(_B0, _mm_set1_epi32((_ctx)->expkey[4*round  ])); \
    _B1 = _mm_xor_si128(_B1, _mm_set1_epi32((_ctx)->expkey[4*round+1])); \
    _B2 = _mm_xor_si128(_B2, _mm_set1_epi32((_ctx)->expkey[4*round+2])); \
    _B3 = _mm_xor_si128(_B3, _mm_set1_epi32((_ctx)->expkey[4*round+3]));

#define NOT_SI128(_X) ( \
    _mm_xor_si128(_X, _mm_set1_epi32(0xFFFFFFFF)) )

#define ROL_SI128(_X, _rot) ( \
    _mm_or_si128(_mm_slli_epi32(_X, _rot), _mm_srli_epi32(_X, 32-_rot)) )

#define ROR_SI128(_X, _rot) ( ROL_SI128(_X, (32-_rot)) )

#define LTf(_B0, _B1, _B2, _B3) \
    _B0 = ROL_SI128(_B0, 13); \
    _B2 = ROL_SI128(_B2, 3); \
    _B1 = _mm_xor_si128(_B1, _B0); \
    _B1 = _mm_xor_si128(_B1, _B2); \
    _B3 = _mm_xor_si128(_B3, _B2); \
    _B3 = _mm_xor_si128(_B3, _mm_slli_epi32(_B0, 3)); \
    _B1 = ROL_SI128(_B1, 1); \
    _B3 = ROL_SI128(_B3, 7); \
    _B0 = _mm_xor_si128(_B0, _B1); \
    _B0 = _mm_xor_si128(_B0, _B3); \
    _B2 = _mm_xor_si128(_B2, _B3); \
    _B2 = _mm_xor_si128(_B2, _mm_slli_epi32(_B1, 7)); \
    _B0 = ROL_SI128(_B0, 5); \
    _B2 = ROL_SI128(_B2, 22);

#define ITf(_B0, _B1, _B2, _B3) \
    _B2 = ROR_SI128(_B2, 22); \
    _B0 = ROR_SI128(_B0, 5); \
    _B2 = _mm_xor_si128(_B2, _B3); \
    _B2 = _mm_xor_si128(_B2, _mm_slli_epi32(_B1, 7)); \
    _B0 = _mm_xor_si128(_B0, _B1); \
    _B0 = _mm_xor_si128(_B0, _B3); \
    _B3 = ROR_SI128(_B3, 7); \
    _B1 = ROR_SI128(_B1, 1); \
    _B3 = _mm_xor_si128(_B3, _B2); \
    _B3 = _mm_xor_si128(_B3, _mm_slli_epi32(_B0, 3)); \
    _B1 = _mm_xor_si128(_B1, _B0); \
    _B1 = _mm_xor_si128(_B1, _B2); \
    _B2 = ROR_SI128(_B2, 3); \
    _B0 = ROR_SI128(_B0, 13);
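/*
 * sE1..sE8 and sD1..sD8 below are the eight Serpent S-boxes and their
 * inverses in bitsliced (boolean-logic) form. Each macro transforms four
 * 32-bit words at once; because every __m128i lane carries a word from a
 * different block after transpose(), one invocation applies the S-box to
 * four blocks in parallel.
 */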
#define sE1(_B0, _B1, _B2, _B3) { \
    __m128i _tt = _B1; \
    _B3 = _mm_xor_si128(_B3, _B0); \
    _B1 = _mm_and_si128(_B1, _B3); \
    _tt = _mm_xor_si128(_tt, _B2); \
    _B1 = _mm_xor_si128(_B1, _B0); \
    _B0 = _mm_or_si128(_B0, _B3); \
    _B0 = _mm_xor_si128(_B0, _tt); \
    _tt = _mm_xor_si128(_tt, _B3); \
    _B3 = _mm_xor_si128(_B3, _B2); \
    _B2 = _mm_or_si128(_B2, _B1); \
    _B2 = _mm_xor_si128(_B2, _tt); \
    _tt = NOT_SI128(_tt); \
    _tt = _mm_or_si128(_tt, _B1); \
    _B1 = _mm_xor_si128(_B1, _B3); \
    _B1 = _mm_xor_si128(_B1, _tt); \
    _B3 = _mm_or_si128(_B3, _B0); \
    _B1 = _mm_xor_si128(_B1, _B3); \
    _tt = _mm_xor_si128(_tt, _B3); \
    _B3 = _B0; \
    _B0 = _B1; \
    _B1 = _tt; \
}

#define sE2(_B0, _B1, _B2, _B3) { \
    __m128i _tt; \
    _B0 = NOT_SI128(_B0); \
    _B2 = NOT_SI128(_B2); \
    _tt = _B0; \
    _B0 = _mm_and_si128(_B0, _B1); \
    _B2 = _mm_xor_si128(_B2, _B0); \
    _B0 = _mm_or_si128(_B0, _B3); \
    _B3 = _mm_xor_si128(_B3, _B2); \
    _B1 = _mm_xor_si128(_B1, _B0); \
    _B0 = _mm_xor_si128(_B0, _tt); \
    _tt = _mm_or_si128(_tt, _B1); \
    _B1 = _mm_xor_si128(_B1, _B3); \
    _B2 = _mm_or_si128(_B2, _B0); \
    _B2 = _mm_and_si128(_B2, _tt); \
    _B0 = _mm_xor_si128(_B0, _B1); \
    _B1 = _mm_and_si128(_B1, _B2); \
    _B1 = _mm_xor_si128(_B1, _B0); \
    _B0 = _mm_and_si128(_B0, _B2); \
    _tt = _mm_xor_si128(_tt, _B0); \
    _B0 = _B2; \
    _B2 = _B3; \
    _B3 = _B1; \
    _B1 = _tt; \
}

#define sE3(_B0, _B1, _B2, _B3) { \
    __m128i _tt = _B0; \
    _B0 = _mm_and_si128(_B0, _B2); \
    _B0 = _mm_xor_si128(_B0, _B3); \
    _B2 = _mm_xor_si128(_B2, _B1); \
    _B2 = _mm_xor_si128(_B2, _B0); \
    _B3 = _mm_or_si128(_B3, _tt); \
    _B3 = _mm_xor_si128(_B3, _B1); \
    _tt = _mm_xor_si128(_tt, _B2); \
    _B1 = _B3; \
    _B3 = _mm_or_si128(_B3, _tt); \
    _B3 = _mm_xor_si128(_B3, _B0); \
    _B0 = _mm_and_si128(_B0, _B1); \
    _tt = _mm_xor_si128(_tt, _B0); \
    _B1 = _mm_xor_si128(_B1, _B3); \
    _B1 = _mm_xor_si128(_B1, _tt); \
    _B0 = _B2; \
    _B2 = _B1; \
    _B1 = _B3; \
    _B3 = NOT_SI128(_tt); \
}

#define sE4(_B0, _B1, _B2, _B3) { \
    __m128i _tt = _B0; \
    _B0 = _mm_or_si128(_B0, _B3); \
    _B3 = _mm_xor_si128(_B3, _B1); \
    _B1 = _mm_and_si128(_B1, _tt); \
    _tt = _mm_xor_si128(_tt, _B2); \
    _B2 = _mm_xor_si128(_B2, _B3); \
    _B3 = _mm_and_si128(_B3, _B0); \
    _tt = _mm_or_si128(_tt, _B1); \
    _B3 = _mm_xor_si128(_B3, _tt); \
    _B0 = _mm_xor_si128(_B0, _B1); \
    _tt = _mm_and_si128(_tt, _B0); \
    _B1 = _mm_xor_si128(_B1, _B3); \
    _tt = _mm_xor_si128(_tt, _B2); \
    _B1 = _mm_or_si128(_B1, _B0); \
    _B1 = _mm_xor_si128(_B1, _B2); \
    _B0 = _mm_xor_si128(_B0, _B3); \
    _B2 = _B1; \
    _B1 = _mm_or_si128(_B1, _B3); \
    _B0 = _mm_xor_si128(_B0, _B1); \
    _B1 = _B2; \
    _B2 = _B3; \
    _B3 = _tt; \
}

#define sE5(_B0, _B1, _B2, _B3) { \
    __m128i _tt; \
    _B1 = _mm_xor_si128(_B1, _B3); \
    _B3 = NOT_SI128(_B3); \
    _B2 = _mm_xor_si128(_B2, _B3); \
    _B3 = _mm_xor_si128(_B3, _B0); \
    _tt = _B1; \
    _B1 = _mm_and_si128(_B1, _B3); \
    _B1 = _mm_xor_si128(_B1, _B2); \
    _tt = _mm_xor_si128(_tt, _B3); \
    _B0 = _mm_xor_si128(_B0, _tt); \
    _B2 = _mm_and_si128(_B2, _tt); \
    _B2 = _mm_xor_si128(_B2, _B0); \
    _B0 = _mm_and_si128(_B0, _B1); \
    _B3 = _mm_xor_si128(_B3, _B0); \
    _tt = _mm_or_si128(_tt, _B1); \
    _tt = _mm_xor_si128(_tt, _B0); \
    _B0 = _mm_or_si128(_B0, _B3); \
    _B0 = _mm_xor_si128(_B0, _B2); \
    _B2 = _mm_and_si128(_B2, _B3); \
    _B0 = NOT_SI128(_B0); \
    _tt = _mm_xor_si128(_tt, _B2); \
    _B2 = _B0; \
    _B0 = _B1; \
    _B1 = _tt; \
}

#define sE6(_B0, _B1, _B2, _B3) { \
    __m128i _tt; \
    _B0 = _mm_xor_si128(_B0, _B1); \
    _B1 = _mm_xor_si128(_B1, _B3); \
    _B3 = NOT_SI128(_B3); \
    _tt = _B1; \
    _B1 = _mm_and_si128(_B1, _B0); \
    _B2 = _mm_xor_si128(_B2, _B3); \
    _B1 = _mm_xor_si128(_B1, _B2); \
    _B2 = _mm_or_si128(_B2, _tt); \
    _tt = _mm_xor_si128(_tt, _B3); \
    _B3 = _mm_and_si128(_B3, _B1); \
    _B3 = _mm_xor_si128(_B3, _B0); \
    _tt = _mm_xor_si128(_tt, _B1); \
    _tt = _mm_xor_si128(_tt, _B2); \
    _B2 = _mm_xor_si128(_B2, _B0); \
    _B0 = _mm_and_si128(_B0, _B3); \
    _B2 = NOT_SI128(_B2); \
    _B0 = _mm_xor_si128(_B0, _tt); \
    _tt = _mm_or_si128(_tt, _B3); \
    _tt = _mm_xor_si128(_tt, _B2); \
    _B2 = _B0; \
    _B0 = _B1; \
    _B1 = _B3; \
    _B3 = _tt; \
}
#define sE7(_B0, _B1, _B2, _B3) { \
    __m128i _tt; \
    _B2 = NOT_SI128(_B2); \
    _tt = _B3; \
    _B3 = _mm_and_si128(_B3, _B0); \
    _B0 = _mm_xor_si128(_B0, _tt); \
    _B3 = _mm_xor_si128(_B3, _B2); \
    _B2 = _mm_or_si128(_B2, _tt); \
    _B1 = _mm_xor_si128(_B1, _B3); \
    _B2 = _mm_xor_si128(_B2, _B0); \
    _B0 = _mm_or_si128(_B0, _B1); \
    _B2 = _mm_xor_si128(_B2, _B1); \
    _tt = _mm_xor_si128(_tt, _B0); \
    _B0 = _mm_or_si128(_B0, _B3); \
    _B0 = _mm_xor_si128(_B0, _B2); \
    _tt = _mm_xor_si128(_tt, _B3); \
    _tt = _mm_xor_si128(_tt, _B0); \
    _B3 = NOT_SI128(_B3); \
    _B2 = _mm_and_si128(_B2, _tt); \
    _B3 = _mm_xor_si128(_B3, _B2); \
    _B2 = _tt; \
}

#define sE8(_B0, _B1, _B2, _B3) { \
    __m128i _tt = _B1; \
    _B1 = _mm_or_si128(_B1, _B2); \
    _B1 = _mm_xor_si128(_B1, _B3); \
    _tt = _mm_xor_si128(_tt, _B2); \
    _B2 = _mm_xor_si128(_B2, _B1); \
    _B3 = _mm_or_si128(_B3, _tt); \
    _B3 = _mm_and_si128(_B3, _B0); \
    _tt = _mm_xor_si128(_tt, _B2); \
    _B3 = _mm_xor_si128(_B3, _B1); \
    _B1 = _mm_or_si128(_B1, _tt); \
    _B1 = _mm_xor_si128(_B1, _B0); \
    _B0 = _mm_or_si128(_B0, _tt); \
    _B0 = _mm_xor_si128(_B0, _B2); \
    _B1 = _mm_xor_si128(_B1, _tt); \
    _B2 = _mm_xor_si128(_B2, _B1); \
    _B1 = _mm_and_si128(_B1, _B0); \
    _B1 = _mm_xor_si128(_B1, _tt); \
    _B2 = NOT_SI128(_B2); \
    _B2 = _mm_or_si128(_B2, _B0); \
    _tt = _mm_xor_si128(_tt, _B2); \
    _B2 = _B1; \
    _B1 = _B3; \
    _B3 = _B0; \
    _B0 = _tt; \
}
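/* Inverse S-boxes sD1..sD8, used by the decryption round function below. */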
#define sD1(_B0, _B1, _B2, _B3) { \
    __m128i _tt = _B1; \
    _B2 = NOT_SI128(_B2); \
    _B1 = _mm_or_si128(_B1, _B0); \
    _tt = NOT_SI128(_tt); \
    _B1 = _mm_xor_si128(_B1, _B2); \
    _B2 = _mm_or_si128(_B2, _tt); \
    _B1 = _mm_xor_si128(_B1, _B3); \
    _B0 = _mm_xor_si128(_B0, _tt); \
    _B2 = _mm_xor_si128(_B2, _B0); \
    _B0 = _mm_and_si128(_B0, _B3); \
    _tt = _mm_xor_si128(_tt, _B0); \
    _B0 = _mm_or_si128(_B0, _B1); \
    _B0 = _mm_xor_si128(_B0, _B2); \
    _B3 = _mm_xor_si128(_B3, _tt); \
    _B2 = _mm_xor_si128(_B2, _B1); \
    _B3 = _mm_xor_si128(_B3, _B0); \
    _B3 = _mm_xor_si128(_B3, _B1); \
    _B2 = _mm_and_si128(_B2, _B3); \
    _tt = _mm_xor_si128(_tt, _B2); \
    _B2 = _B1; \
    _B1 = _tt; \
}

#define sD2(_B0, _B1, _B2, _B3) { \
    __m128i _tt = _B1; \
    _B1 = _mm_xor_si128(_B1, _B3); \
    _B3 = _mm_and_si128(_B3, _B1); \
    _tt = _mm_xor_si128(_tt, _B2); \
    _B3 = _mm_xor_si128(_B3, _B0); \
    _B0 = _mm_or_si128(_B0, _B1); \
    _B2 = _mm_xor_si128(_B2, _B3); \
    _B0 = _mm_xor_si128(_B0, _tt); \
    _B0 = _mm_or_si128(_B0, _B2); \
    _B1 = _mm_xor_si128(_B1, _B3); \
    _B0 = _mm_xor_si128(_B0, _B1); \
    _B1 = _mm_or_si128(_B1, _B3); \
    _B1 = _mm_xor_si128(_B1, _B0); \
    _tt = NOT_SI128(_tt); \
    _tt = _mm_xor_si128(_tt, _B1); \
    _B1 = _mm_or_si128(_B1, _B0); \
    _B1 = _mm_xor_si128(_B1, _B0); \
    _B1 = _mm_or_si128(_B1, _tt); \
    _B3 = _mm_xor_si128(_B3, _B1); \
    _B1 = _B0; \
    _B0 = _tt; \
    _tt = _B2; \
    _B2 = _B3; \
    _B3 = _tt; \
}

#define sD3(_B0, _B1, _B2, _B3) { \
    __m128i _tt; \
    _B2 = _mm_xor_si128(_B2, _B3); \
    _B3 = _mm_xor_si128(_B3, _B0); \
    _tt = _B3; \
    _B3 = _mm_and_si128(_B3, _B2); \
    _B3 = _mm_xor_si128(_B3, _B1); \
    _B1 = _mm_or_si128(_B1, _B2); \
    _B1 = _mm_xor_si128(_B1, _tt); \
    _tt = _mm_and_si128(_tt, _B3); \
    _B2 = _mm_xor_si128(_B2, _B3); \
    _tt = _mm_and_si128(_tt, _B0); \
    _tt = _mm_xor_si128(_tt, _B2); \
    _B2 = _mm_and_si128(_B2, _B1); \
    _B2 = _mm_or_si128(_B2, _B0); \
    _B3 = NOT_SI128(_B3); \
    _B2 = _mm_xor_si128(_B2, _B3); \
    _B0 = _mm_xor_si128(_B0, _B3); \
    _B0 = _mm_and_si128(_B0, _B1); \
    _B3 = _mm_xor_si128(_B3, _tt); \
    _B3 = _mm_xor_si128(_B3, _B0); \
    _B0 = _B1; \
    _B1 = _tt; \
}

#define sD4(_B0, _B1, _B2, _B3) { \
    __m128i _tt = _B2; \
    _B2 = _mm_xor_si128(_B2, _B1); \
    _B0 = _mm_xor_si128(_B0, _B2); \
    _tt = _mm_and_si128(_tt, _B2); \
    _tt = _mm_xor_si128(_tt, _B0); \
    _B0 = _mm_and_si128(_B0, _B1); \
    _B1 = _mm_xor_si128(_B1, _B3); \
    _B3 = _mm_or_si128(_B3, _tt); \
    _B2 = _mm_xor_si128(_B2, _B3); \
    _B0 = _mm_xor_si128(_B0, _B3); \
    _B1 = _mm_xor_si128(_B1, _tt); \
    _B3 = _mm_and_si128(_B3, _B2); \
    _B3 = _mm_xor_si128(_B3, _B1); \
    _B1 = _mm_xor_si128(_B1, _B0); \
    _B1 = _mm_or_si128(_B1, _B2); \
    _B0 = _mm_xor_si128(_B0, _B3); \
    _B1 = _mm_xor_si128(_B1, _tt); \
    _B0 = _mm_xor_si128(_B0, _B1); \
    _tt = _B0; \
    _B0 = _B2; \
    _B2 = _B3; \
    _B3 = _tt; \
}

#define sD5(_B0, _B1, _B2, _B3) { \
    __m128i _tt = _B2; \
    _B2 = _mm_and_si128(_B2, _B3); \
    _B2 = _mm_xor_si128(_B2, _B1); \
    _B1 = _mm_or_si128(_B1, _B3); \
    _B1 = _mm_and_si128(_B1, _B0); \
    _tt = _mm_xor_si128(_tt, _B2); \
    _tt = _mm_xor_si128(_tt, _B1); \
    _B1 = _mm_and_si128(_B1, _B2); \
    _B0 = NOT_SI128(_B0); \
    _B3 = _mm_xor_si128(_B3, _tt); \
    _B1 = _mm_xor_si128(_B1, _B3); \
    _B3 = _mm_and_si128(_B3, _B0); \
    _B3 = _mm_xor_si128(_B3, _B2); \
    _B0 = _mm_xor_si128(_B0, _B1); \
    _B2 = _mm_and_si128(_B2, _B0); \
    _B3 = _mm_xor_si128(_B3, _B0); \
    _B2 = _mm_xor_si128(_B2, _tt); \
    _B2 = _mm_or_si128(_B2, _B3); \
    _B3 = _mm_xor_si128(_B3, _B0); \
    _B2 = _mm_xor_si128(_B2, _B1); \
    _B1 = _B3; \
    _B3 = _tt; \
}

#define sD6(_B0, _B1, _B2, _B3) { \
    __m128i _tt = _B3; \
    _B1 = NOT_SI128(_B1); \
    _B2 = _mm_xor_si128(_B2, _B1); \
    _B3 = _mm_or_si128(_B3, _B0); \
    _B3 = _mm_xor_si128(_B3, _B2); \
    _B2 = _mm_or_si128(_B2, _B1); \
    _B2 = _mm_and_si128(_B2, _B0); \
    _tt = _mm_xor_si128(_tt, _B3); \
    _B2 = _mm_xor_si128(_B2, _tt); \
    _tt = _mm_or_si128(_tt, _B0); \
    _tt = _mm_xor_si128(_tt, _B1); \
    _B1 = _mm_and_si128(_B1, _B2); \
    _B1 = _mm_xor_si128(_B1, _B3); \
    _tt = _mm_xor_si128(_tt, _B2); \
    _B3 = _mm_and_si128(_B3, _tt); \
    _tt = _mm_xor_si128(_tt, _B1); \
    _B3 = _mm_xor_si128(_B3, _tt); \
    _tt = NOT_SI128(_tt); \
    _B3 = _mm_xor_si128(_B3, _B0); \
    _B0 = _B1; \
    _B1 = _tt; \
    _tt = _B3; \
    _B3 = _B2; \
    _B2 = _tt; \
}

#define sD7(_B0, _B1, _B2, _B3) { \
    __m128i _tt = _B2; \
    _B0 = _mm_xor_si128(_B0, _B2); \
    _B2 = _mm_and_si128(_B2, _B0); \
    _tt = _mm_xor_si128(_tt, _B3); \
    _B2 = NOT_SI128(_B2); \
    _B3 = _mm_xor_si128(_B3, _B1); \
    _B2 = _mm_xor_si128(_B2, _B3); \
    _tt = _mm_or_si128(_tt, _B0); \
    _B0 = _mm_xor_si128(_B0, _B2); \
    _B3 = _mm_xor_si128(_B3, _tt); \
    _tt = _mm_xor_si128(_tt, _B1); \
    _B1 = _mm_and_si128(_B1, _B3); \
    _B1 = _mm_xor_si128(_B1, _B0); \
    _B0 = _mm_xor_si128(_B0, _B3); \
    _B0 = _mm_or_si128(_B0, _B2); \
    _B3 = _mm_xor_si128(_B3, _B1); \
    _tt = _mm_xor_si128(_tt, _B0); \
    _B0 = _B1; \
    _B1 = _B2; \
    _B2 = _tt; \
}

#define sD8(_B0, _B1, _B2, _B3) { \
    __m128i _tt = _B2; \
    _B2 = _mm_xor_si128(_B2, _B0); \
    _B0 = _mm_and_si128(_B0, _B3); \
    _tt = _mm_or_si128(_tt, _B3); \
    _B2 = NOT_SI128(_B2); \
    _B3 = _mm_xor_si128(_B3, _B1); \
    _B1 = _mm_or_si128(_B1, _B0); \
    _B0 = _mm_xor_si128(_B0, _B2); \
    _B2 = _mm_and_si128(_B2, _tt); \
    _B3 = _mm_and_si128(_B3, _tt); \
    _B1 = _mm_xor_si128(_B1, _B2); \
    _B2 = _mm_xor_si128(_B2, _B0); \
    _B0 = _mm_or_si128(_B0, _B2); \
    _tt = _mm_xor_si128(_tt, _B1); \
    _B0 = _mm_xor_si128(_B0, _B3); \
    _B3 = _mm_xor_si128(_B3, _tt); \
    _tt = _mm_or_si128(_tt, _B0); \
    _B3 = _mm_xor_si128(_B3, _B2); \
    _tt = _mm_xor_si128(_tt, _B2); \
    _B2 = _B1; \
    _B1 = _B0; \
    _B0 = _B3; \
    _B3 = _tt; \
}
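/*
 * Full Serpent-256 in bitsliced form: 32 rounds, each consisting of key
 * mixing (KXf), an S-box layer and the linear transform (LTf); the last
 * round applies the extra key mixing with round key 32 in place of the
 * linear transform. The surrounding transpose() calls convert four blocks
 * into bitsliced layout and back. Decryption runs the same schedule in
 * reverse with the inverse S-boxes and inverse linear transform (ITf).
 */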
#define serpent256_sse2_encrypt(_B0, _B1, _B2, _B3, _ctx) \
    transpose(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx,  0); sE1(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx,  1); sE2(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx,  2); sE3(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx,  3); sE4(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx,  4); sE5(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx,  5); sE6(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx,  6); sE7(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx,  7); sE8(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx,  8); sE1(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx,  9); sE2(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 10); sE3(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 11); sE4(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 12); sE5(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 13); sE6(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 14); sE7(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 15); sE8(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 16); sE1(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 17); sE2(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 18); sE3(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 19); sE4(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 20); sE5(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 21); sE6(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 22); sE7(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 23); sE8(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 24); sE1(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 25); sE2(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 26); sE3(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 27); sE4(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 28); sE5(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 29); sE6(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 30); sE7(_B0,_B1,_B2,_B3); LTf(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 31); sE8(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 32); \
    transpose(_B0,_B1,_B2,_B3);

#define serpent256_sse2_decrypt(_B0, _B1, _B2, _B3, _ctx) \
    transpose(_B0,_B1,_B2,_B3); \
    KXf(_B0,_B1,_B2,_B3,_ctx, 32); sD8(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 31); \
    ITf(_B0,_B1,_B2,_B3); sD7(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 30); \
    ITf(_B0,_B1,_B2,_B3); sD6(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 29); \
    ITf(_B0,_B1,_B2,_B3); sD5(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 28); \
    ITf(_B0,_B1,_B2,_B3); sD4(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 27); \
    ITf(_B0,_B1,_B2,_B3); sD3(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 26); \
    ITf(_B0,_B1,_B2,_B3); sD2(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 25); \
    ITf(_B0,_B1,_B2,_B3); sD1(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 24); \
    ITf(_B0,_B1,_B2,_B3); sD8(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 23); \
    ITf(_B0,_B1,_B2,_B3); sD7(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 22); \
    ITf(_B0,_B1,_B2,_B3); sD6(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 21); \
    ITf(_B0,_B1,_B2,_B3); sD5(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 20); \
    ITf(_B0,_B1,_B2,_B3); sD4(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 19); \
    ITf(_B0,_B1,_B2,_B3); sD3(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 18); \
    ITf(_B0,_B1,_B2,_B3); sD2(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 17); \
    ITf(_B0,_B1,_B2,_B3); sD1(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 16); \
    ITf(_B0,_B1,_B2,_B3); sD8(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 15); \
    ITf(_B0,_B1,_B2,_B3); sD7(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 14); \
    ITf(_B0,_B1,_B2,_B3); sD6(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 13); \
    ITf(_B0,_B1,_B2,_B3); sD5(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 12); \
    ITf(_B0,_B1,_B2,_B3); sD4(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 11); \
    ITf(_B0,_B1,_B2,_B3); sD3(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx, 10); \
    ITf(_B0,_B1,_B2,_B3); sD2(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx,  9); \
    ITf(_B0,_B1,_B2,_B3); sD1(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx,  8); \
    ITf(_B0,_B1,_B2,_B3); sD8(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx,  7); \
    ITf(_B0,_B1,_B2,_B3); sD7(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx,  6); \
    ITf(_B0,_B1,_B2,_B3); sD6(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx,  5); \
    ITf(_B0,_B1,_B2,_B3); sD5(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx,  4); \
    ITf(_B0,_B1,_B2,_B3); sD4(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx,  3); \
    ITf(_B0,_B1,_B2,_B3); sD3(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx,  2); \
    ITf(_B0,_B1,_B2,_B3); sD2(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx,  1); \
    ITf(_B0,_B1,_B2,_B3); sD1(_B0,_B1,_B2,_B3); KXf(_B0,_B1,_B2,_B3,_ctx,  0); \
    transpose(_B0,_B1,_B2,_B3);
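/*
 * Derive the next XTS tweak from the previous one: a 128-bit left shift by
 * one bit, with the carried-out bit folded back in via the GF(2^128)
 * reduction polynomial x^128 + x^7 + x^2 + x + 1 (the constant 135 = 0x87).
 * SSE2 has no full 128-bit shift, so the carry between the two 64-bit
 * halves is assembled with byte and word shifts.
 */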
#define sse2_next_tweak(_N, _O) { \
    __m128i _tt = _O; \
    __m128i _t2; \
    _tt = _mm_srai_epi16(_tt, 8); \
    _tt = _mm_srli_si128(_tt, 15); \
    _tt = _mm_and_si128(_tt, _mm_setr_epi32(135,0,0,0)); \
    _t2 = _O; \
    _t2 = _mm_slli_si128(_t2, 8); \
    _t2 = _mm_srli_si128(_t2, 7); \
    _t2 = _mm_srli_epi64(_t2, 7); \
    _N = _O; \
    _N = _mm_slli_epi64(_N, 1); \
    _N = _mm_or_si128(_N, _t2); \
    _N = _mm_xor_si128(_N, _tt); \
}

#ifdef USE_AVX
void _stdcall xts_serpent_avx_encrypt(const unsigned char *in, unsigned char *out, size_t len, unsigned __int64 offset, xts_key *key)
#else
void _stdcall xts_serpent_sse2_encrypt(const unsigned char *in, unsigned char *out, size_t len, unsigned __int64 offset, xts_key *key)
#endif
{
    __m128i t0, t1, t2, t3;
    __m128i b0, b1, b2, b3;
    __m128i idx;
    int     i;

    ((unsigned __int64*)&idx)[0] = offset / XTS_SECTOR_SIZE;
    ((unsigned __int64*)&idx)[1] = 0;

    do
    {
        // update tweak unit index
        ((unsigned __int64*)&idx)[0]++;
        // derive first tweak value
        serpent256_encrypt((unsigned char*)&idx, (unsigned char*)&t0, &key->tweak_k.serpent);

        for (i = 0; i < XTS_BLOCKS_IN_SECTOR / 4; i++)
        {
            // derive t1-t3
            sse2_next_tweak(t1, t0);
            sse2_next_tweak(t2, t1);
            sse2_next_tweak(t3, t2);

            // load and tweak 4 blocks
            b0 = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(in + 0 )), t0);
            b1 = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(in + 16)), t1);
            b2 = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(in + 32)), t2);
            b3 = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(in + 48)), t3);

            // encrypt / decrypt
            serpent256_sse2_encrypt(b0, b1, b2, b3, &key->crypt_k.serpent);

            // tweak and store 4 blocks
            _mm_storeu_si128((__m128i*)(out + 0 ), _mm_xor_si128(b0, t0));
            _mm_storeu_si128((__m128i*)(out + 16), _mm_xor_si128(b1, t1));
            _mm_storeu_si128((__m128i*)(out + 32), _mm_xor_si128(b2, t2));
            _mm_storeu_si128((__m128i*)(out + 48), _mm_xor_si128(b3, t3));

            // derive next t0
            sse2_next_tweak(t0, t3);

            // update pointers
            in  += XTS_BLOCK_SIZE*4;
            out += XTS_BLOCK_SIZE*4;
        }
    } while (len -= XTS_SECTOR_SIZE);
}
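/*
 * The decryption path below mirrors the encryption path above; the
 * per-sector tweaks are always derived by encrypting the sector index
 * with the tweak key, regardless of direction.
 */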
#ifdef USE_AVX
void _stdcall xts_serpent_avx_decrypt(const unsigned char *in, unsigned char *out, size_t len, unsigned __int64 offset, xts_key *key)
#else
void _stdcall xts_serpent_sse2_decrypt(const unsigned char *in, unsigned char *out, size_t len, unsigned __int64 offset, xts_key *key)
#endif
{
    __m128i t0, t1, t2, t3;
    __m128i b0, b1, b2, b3;
    __m128i idx;
    int     i;

    ((unsigned __int64*)&idx)[0] = offset / XTS_SECTOR_SIZE;
    ((unsigned __int64*)&idx)[1] = 0;

    do
    {
        // update tweak unit index
        ((unsigned __int64*)&idx)[0]++;
        // derive first tweak value
        serpent256_encrypt((unsigned char*)&idx, (unsigned char*)&t0, &key->tweak_k.serpent);

        for (i = 0; i < XTS_BLOCKS_IN_SECTOR / 4; i++)
        {
            // derive t1-t3
            sse2_next_tweak(t1, t0);
            sse2_next_tweak(t2, t1);
            sse2_next_tweak(t3, t2);

            // load and tweak 4 blocks
            b0 = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(in + 0 )), t0);
            b1 = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(in + 16)), t1);
            b2 = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(in + 32)), t2);
            b3 = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(in + 48)), t3);

            // encrypt / decrypt
            serpent256_sse2_decrypt(b0, b1, b2, b3, &key->crypt_k.serpent);

            // tweak and store 4 blocks
            _mm_storeu_si128((__m128i*)(out + 0 ), _mm_xor_si128(b0, t0));
            _mm_storeu_si128((__m128i*)(out + 16), _mm_xor_si128(b1, t1));
            _mm_storeu_si128((__m128i*)(out + 32), _mm_xor_si128(b2, t2));
            _mm_storeu_si128((__m128i*)(out + 48), _mm_xor_si128(b3, t3));

            // derive next t0
            sse2_next_tweak(t0, t3);

            // update pointers
            in  += XTS_BLOCK_SIZE*4;
            out += XTS_BLOCK_SIZE*4;
        }
    } while (len -= XTS_SECTOR_SIZE);
}

#ifdef USE_AVX
int _stdcall xts_serpent_avx_available()
{
    int succs = 0;

    __asm {
        mov  eax, 1
        cpuid
        and  ecx, 0x18000000 // check 27 bit (OS uses XSAVE/XRSTOR)
        cmp  ecx, 0x18000000 // and 28 (AVX supported by CPU)
        jne  not_supported
        xor  ecx, ecx        // XFEATURE_ENABLED_MASK/XCR0 register number = 0
        xgetbv               // XFEATURE_ENABLED_MASK register is in edx:eax
        and  eax, 6
        cmp  eax, 6          // check the AVX registers restore at context switch
        jne  not_supported
        mov  [succs], 1
not_supported:
    }
    return succs;
}
#else
int _stdcall xts_serpent_sse2_available()
{
    int info[4];

    __cpuid(info, 1);
    return (info[3] & (1 << 26)) != 0;
}
#endif
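/*
 * Illustrative usage sketch (not part of the original source). It assumes a
 * buffer holding exactly one sector and an xts_key that has already been
 * initialized by the key-setup code declared in xts_fast.h; the helper name
 * my_encrypt_sector and the in-place buffer handling are hypothetical.
 *
 *   static void my_encrypt_sector(unsigned char *sector, unsigned __int64 byte_offset, xts_key *key)
 *   {
 *       // byte_offset must be a multiple of XTS_SECTOR_SIZE and the buffer
 *       // must be XTS_SECTOR_SIZE bytes long; in == out is fine because each
 *       // group of four blocks is loaded before it is stored.
 *   #ifdef USE_AVX
 *       xts_serpent_avx_encrypt(sector, sector, XTS_SECTOR_SIZE, byte_offset, key);
 *   #else
 *       if (xts_serpent_sse2_available()) {
 *           xts_serpent_sse2_encrypt(sector, sector, XTS_SECTOR_SIZE, byte_offset, key);
 *       }
 *   #endif
 *   }
 */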