Crypto++
// salsa.cpp - written and placed in the public domain by Wei Dai

// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM salsa.cpp" to generate MASM code

#include "pch.h"

#ifndef CRYPTOPP_GENERATE_X64_MASM

#include "salsa.h"
#include "misc.h"
#include "argnames.h"
#include "cpu.h"

NAMESPACE_BEGIN(CryptoPP)

void Salsa20_TestInstantiations()
{
    Salsa20::Encryption x;
}

void Salsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
{
    m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);

    if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
        throw InvalidRounds(Salsa20::StaticAlgorithmName(), m_rounds);

    // m_state is reordered for SSE2
    GetBlock<word32, LittleEndian> get1(key);
    get1(m_state[13])(m_state[10])(m_state[7])(m_state[4]);
    GetBlock<word32, LittleEndian> get2(key + length - 16);
    get2(m_state[15])(m_state[12])(m_state[9])(m_state[6]);

    // "expand 16-byte k" or "expand 32-byte k"
    m_state[0] = 0x61707865;
    m_state[1] = (length == 16) ? 0x3120646e : 0x3320646e;
    m_state[2] = (length == 16) ? 0x79622d36 : 0x79622d32;
    m_state[3] = 0x6b206574;
}

void Salsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
{
    assert(length==8);
    GetBlock<word32, LittleEndian> get(IV);
    get(m_state[14])(m_state[11]);
    m_state[8] = m_state[5] = 0;
}

void Salsa20_Policy::SeekToIteration(lword iterationCount)
{
    m_state[8] = (word32)iterationCount;
    m_state[5] = (word32)SafeRightShift<32>(iterationCount);
}

#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
unsigned int Salsa20_Policy::GetAlignment() const
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
    if (HasSSE2())
        return 16;
    else
#endif
        return GetAlignmentOf<word32>();
}

unsigned int Salsa20_Policy::GetOptimalBlockSize() const
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
    if (HasSSE2())
        return 4*BYTES_PER_ITERATION;
    else
#endif
        return BYTES_PER_ITERATION;
}
#endif

#ifdef CRYPTOPP_X64_MASM_AVAILABLE
extern "C" {
void Salsa20_OperateKeystream(byte *output, const byte *input, size_t iterationCount, int rounds, void *state);
}
#endif

#pragma warning(disable: 4731)    // frame pointer register 'ebp' modified by inline assembly code

void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
{
#endif    // #ifdef CRYPTOPP_GENERATE_X64_MASM

#ifdef CRYPTOPP_X64_MASM_AVAILABLE
    Salsa20_OperateKeystream(output, input, iterationCount, m_rounds, m_state.data());
    return;
#endif

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#ifdef CRYPTOPP_GENERATE_X64_MASM
    ALIGN 8
    Salsa20_OperateKeystream PROC FRAME
    mov r10, [rsp + 5*8]    ; state
    alloc_stack(10*16 + 32*16 + 8)
    save_xmm128 xmm6, 0200h
    save_xmm128 xmm7, 0210h
    save_xmm128 xmm8, 0220h
    save_xmm128 xmm9, 0230h
    save_xmm128 xmm10, 0240h
    save_xmm128 xmm11, 0250h
    save_xmm128 xmm12, 0260h
    save_xmm128 xmm13, 0270h
    save_xmm128 xmm14, 0280h
    save_xmm128 xmm15, 0290h
    .endprolog

#define REG_output          rcx
#define REG_input           rdx
#define REG_iterationCount  r8
#define REG_state           r10
#define REG_rounds          r9d        // rounds is the 4th integer argument, passed in r9 under the Win64 ABI
#define REG_roundsLeft      eax
#define REG_temp32          r11d
#define REG_temp            r11
#define SSE2_WORKSPACE      rsp
#else
    if (HasSSE2())
    {
#if CRYPTOPP_BOOL_X64
#define REG_output          %4
#define REG_input           %1
#define REG_iterationCount  %2
#define REG_state           %3
#define REG_rounds          %0
#define REG_roundsLeft      eax
#define REG_temp32          edx
#define REG_temp            rdx
#define SSE2_WORKSPACE      %5

        FixedSizeAlignedSecBlock<byte, 32*16> workspace;
#else
#define REG_output          edi
#define REG_input           eax
#define REG_iterationCount  ecx
#define REG_state           esi
#define REG_rounds          edx
#define REG_roundsLeft      ebx
#define REG_temp32          ebp
#define REG_temp            ebp
#define SSE2_WORKSPACE      esp + WORD_SZ
#endif

#ifdef __GNUC__
        __asm__ __volatile__
        (
            ".intel_syntax noprefix;"
            AS_PUSH_IF86( bx)
#else
        void *s = m_state.data();
        word32 r = m_rounds;

        AS2( mov REG_iterationCount, iterationCount)
        AS2( mov REG_input, input)
        AS2( mov REG_output, output)
        AS2( mov REG_state, s)
        AS2( mov REG_rounds, r)
#endif
#endif    // #ifndef CRYPTOPP_GENERATE_X64_MASM

    AS_PUSH_IF86( bp)
    AS2( cmp REG_iterationCount, 4)
    ASJ( jl, 5, f)

#if CRYPTOPP_BOOL_X86
    AS2( mov ebx, esp)
    AS2( and esp, -16)
    AS2( sub esp, 32*16)
    AS1( push ebx)
#endif

#define SSE2_EXPAND_S(i, j) \
    ASS( pshufd xmm4, xmm##i, j, j, j, j) \
    AS2( movdqa [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)

    AS2( movdqa xmm0, [REG_state + 0*16])
    AS2( movdqa xmm1, [REG_state + 1*16])
    AS2( movdqa xmm2, [REG_state + 2*16])
    AS2( movdqa xmm3, [REG_state + 3*16])
    SSE2_EXPAND_S(0, 0)
    SSE2_EXPAND_S(0, 1)
    SSE2_EXPAND_S(0, 2)
    SSE2_EXPAND_S(0, 3)
    SSE2_EXPAND_S(1, 0)
    SSE2_EXPAND_S(1, 2)
    SSE2_EXPAND_S(1, 3)
    SSE2_EXPAND_S(2, 1)
    SSE2_EXPAND_S(2, 2)
    SSE2_EXPAND_S(2, 3)
    SSE2_EXPAND_S(3, 0)
    SSE2_EXPAND_S(3, 1)
    SSE2_EXPAND_S(3, 2)
    SSE2_EXPAND_S(3, 3)

#define SSE2_EXPAND_S85(i) \
    AS2( mov dword ptr [SSE2_WORKSPACE + 8*16 + i*4 + 256], REG_roundsLeft) \
    AS2( mov dword ptr [SSE2_WORKSPACE + 5*16 + i*4 + 256], REG_temp32) \
    AS2( add REG_roundsLeft, 1) \
    AS2( adc REG_temp32, 0)

    ASL(1)
    AS2( mov REG_roundsLeft, dword ptr [REG_state + 8*4])
    AS2( mov REG_temp32, dword ptr [REG_state + 5*4])
    SSE2_EXPAND_S85(0)
    SSE2_EXPAND_S85(1)
    SSE2_EXPAND_S85(2)
    SSE2_EXPAND_S85(3)
    AS2( mov dword ptr [REG_state + 8*4], REG_roundsLeft)
    AS2( mov dword ptr [REG_state + 5*4], REG_temp32)

#define SSE2_QUARTER_ROUND(a, b, d, i) \
    AS2( movdqa xmm4, xmm##d) \
    AS2( paddd xmm4, xmm##a) \
    AS2( movdqa xmm5, xmm4) \
    AS2( pslld xmm4, i) \
    AS2( psrld xmm5, 32-i) \
    AS2( pxor xmm##b, xmm4) \
    AS2( pxor xmm##b, xmm5)

#define L01(A,B,C,D,a,b,c,d,i)  AS2( movdqa xmm##A, [SSE2_WORKSPACE + d*16 + i*256])    /* y3 */
#define L02(A,B,C,D,a,b,c,d,i)  AS2( movdqa xmm##C, [SSE2_WORKSPACE + a*16 + i*256])    /* y0 */
#define L03(A,B,C,D,a,b,c,d,i)  AS2( paddd xmm##A, xmm##C)    /* y0+y3 */
#define L04(A,B,C,D,a,b,c,d,i)  AS2( movdqa xmm##B, xmm##A)
#define L05(A,B,C,D,a,b,c,d,i)  AS2( pslld xmm##A, 7)
#define L06(A,B,C,D,a,b,c,d,i)  AS2( psrld xmm##B, 32-7)
#define L07(A,B,C,D,a,b,c,d,i)  AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
#define L08(A,B,C,D,a,b,c,d,i)  AS2( pxor xmm##A, xmm##B)    /* z1 */
#define L09(A,B,C,D,a,b,c,d,i)  AS2( movdqa [SSE2_WORKSPACE + b*16], xmm##A)
#define L10(A,B,C,D,a,b,c,d,i)  AS2( movdqa xmm##B, xmm##A)
#define L11(A,B,C,D,a,b,c,d,i)  AS2( paddd xmm##A, xmm##C)    /* z1+y0 */
#define L12(A,B,C,D,a,b,c,d,i)  AS2( movdqa xmm##D, xmm##A)
#define L13(A,B,C,D,a,b,c,d,i)  AS2( pslld xmm##A, 9)
#define L14(A,B,C,D,a,b,c,d,i)  AS2( psrld xmm##D, 32-9)
#define L15(A,B,C,D,a,b,c,d,i)  AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
#define L16(A,B,C,D,a,b,c,d,i)  AS2( pxor xmm##A, xmm##D)    /* z2 */
#define L17(A,B,C,D,a,b,c,d,i)  AS2( movdqa [SSE2_WORKSPACE + c*16], xmm##A)
#define L18(A,B,C,D,a,b,c,d,i)  AS2( movdqa xmm##D, xmm##A)
#define L19(A,B,C,D,a,b,c,d,i)  AS2( paddd xmm##A, xmm##B)    /* z2+z1 */
#define L20(A,B,C,D,a,b,c,d,i)  AS2( movdqa xmm##B, xmm##A)
#define L21(A,B,C,D,a,b,c,d,i)  AS2( pslld xmm##A, 13)
#define L22(A,B,C,D,a,b,c,d,i)  AS2( psrld xmm##B, 32-13)
#define L23(A,B,C,D,a,b,c,d,i)  AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
#define L24(A,B,C,D,a,b,c,d,i)  AS2( pxor xmm##A, xmm##B)    /* z3 */
#define L25(A,B,C,D,a,b,c,d,i)  AS2( movdqa [SSE2_WORKSPACE + d*16], xmm##A)
#define L26(A,B,C,D,a,b,c,d,i)  AS2( paddd xmm##A, xmm##D)    /* z3+z2 */
#define L27(A,B,C,D,a,b,c,d,i)  AS2( movdqa xmm##D, xmm##A)
#define L28(A,B,C,D,a,b,c,d,i)  AS2( pslld xmm##A, 18)
#define L29(A,B,C,D,a,b,c,d,i)  AS2( psrld xmm##D, 32-18)
#define L30(A,B,C,D,a,b,c,d,i)  AS2( pxor xmm##A, xmm##C)    /* xor y0 */
#define L31(A,B,C,D,a,b,c,d,i)  AS2( pxor xmm##A, xmm##D)    /* z0 */
#define L32(A,B,C,D,a,b,c,d,i)  AS2( movdqa [SSE2_WORKSPACE + a*16], xmm##A)

#define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h) \
    L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) \
    L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) \
    L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) \
    L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) \
    L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) \
    L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) \
    L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) \
    L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) \
    L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) \
    L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) \
    L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) \
    L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) \
    L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) \
    L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) \
    L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) \
    L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) \
    L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) \
    L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) \
    L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) \
    L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) \
    L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) \
    L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) \
    L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) \
    L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) \
    L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) \
    L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) \
    L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) \
    L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) \
    L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) \
    L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) \
    L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) \
    L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i)

#define SSE2_QUARTER_ROUND_X16(i, a, b, c, d, e, f, g, h, A, B, C, D, E, F, G, H) \
    L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) L01(8,9,10,11, A,B,C,D, i) L01(12,13,14,15, E,F,G,H, i) \
    L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) L02(8,9,10,11, A,B,C,D, i) L02(12,13,14,15, E,F,G,H, i) \
    L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) L03(8,9,10,11, A,B,C,D, i) L03(12,13,14,15, E,F,G,H, i) \
    L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) L04(8,9,10,11, A,B,C,D, i) L04(12,13,14,15, E,F,G,H, i) \
    L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) L05(8,9,10,11, A,B,C,D, i) L05(12,13,14,15, E,F,G,H, i) \
    L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) L06(8,9,10,11, A,B,C,D, i) L06(12,13,14,15, E,F,G,H, i) \
    L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) L07(8,9,10,11, A,B,C,D, i) L07(12,13,14,15, E,F,G,H, i) \
    L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) L08(8,9,10,11, A,B,C,D, i) L08(12,13,14,15, E,F,G,H, i) \
    L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) L09(8,9,10,11, A,B,C,D, i) L09(12,13,14,15, E,F,G,H, i) \
    L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) L10(8,9,10,11, A,B,C,D, i) L10(12,13,14,15, E,F,G,H, i) \
    L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) L11(8,9,10,11, A,B,C,D, i) L11(12,13,14,15, E,F,G,H, i) \
    L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) L12(8,9,10,11, A,B,C,D, i) L12(12,13,14,15, E,F,G,H, i) \
    L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) L13(8,9,10,11, A,B,C,D, i) L13(12,13,14,15, E,F,G,H, i) \
    L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) L14(8,9,10,11, A,B,C,D, i) L14(12,13,14,15, E,F,G,H, i) \
    L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) L15(8,9,10,11, A,B,C,D, i) L15(12,13,14,15, E,F,G,H, i) \
    L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) L16(8,9,10,11, A,B,C,D, i) L16(12,13,14,15, E,F,G,H, i) \
    L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) L17(8,9,10,11, A,B,C,D, i) L17(12,13,14,15, E,F,G,H, i) \
    L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) L18(8,9,10,11, A,B,C,D, i) L18(12,13,14,15, E,F,G,H, i) \
    L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) L19(8,9,10,11, A,B,C,D, i) L19(12,13,14,15, E,F,G,H, i) \
    L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) L20(8,9,10,11, A,B,C,D, i) L20(12,13,14,15, E,F,G,H, i) \
    L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) L21(8,9,10,11, A,B,C,D, i) L21(12,13,14,15, E,F,G,H, i) \
    L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) L22(8,9,10,11, A,B,C,D, i) L22(12,13,14,15, E,F,G,H, i) \
    L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) L23(8,9,10,11, A,B,C,D, i) L23(12,13,14,15, E,F,G,H, i) \
    L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) L24(8,9,10,11, A,B,C,D, i) L24(12,13,14,15, E,F,G,H, i) \
    L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) L25(8,9,10,11, A,B,C,D, i) L25(12,13,14,15, E,F,G,H, i) \
    L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) L26(8,9,10,11, A,B,C,D, i) L26(12,13,14,15, E,F,G,H, i) \
    L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) L27(8,9,10,11, A,B,C,D, i) L27(12,13,14,15, E,F,G,H, i) \
    L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) L28(8,9,10,11, A,B,C,D, i) L28(12,13,14,15, E,F,G,H, i) \
    L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) L29(8,9,10,11, A,B,C,D, i) L29(12,13,14,15, E,F,G,H, i) \
    L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) L30(8,9,10,11, A,B,C,D, i) L30(12,13,14,15, E,F,G,H, i) \
    L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) L31(8,9,10,11, A,B,C,D, i) L31(12,13,14,15, E,F,G,H, i) \
    L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i) L32(8,9,10,11, A,B,C,D, i) L32(12,13,14,15, E,F,G,H, i)

#if CRYPTOPP_BOOL_X64
    SSE2_QUARTER_ROUND_X16(1, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
#else
    SSE2_QUARTER_ROUND_X8(1, 2, 6, 10, 14, 3, 7, 11, 15)
    SSE2_QUARTER_ROUND_X8(1, 0, 4, 8, 12, 1, 5, 9, 13)
#endif
    AS2( mov REG_roundsLeft, REG_rounds)
    ASJ( jmp, 2, f)

    ASL(SSE2_Salsa_Output)
    AS2( movdqa xmm0, xmm4)
    AS2( punpckldq xmm4, xmm5)
    AS2( movdqa xmm1, xmm6)
    AS2( punpckldq xmm6, xmm7)
    AS2( movdqa xmm2, xmm4)
    AS2( punpcklqdq xmm4, xmm6)    // e
    AS2( punpckhqdq xmm2, xmm6)    // f
    AS2( punpckhdq xmm0, xmm5)
    AS2( punpckhdq xmm1, xmm7)
    AS2( movdqa xmm6, xmm0)
    AS2( punpcklqdq xmm0, xmm1)    // g
    AS2( punpckhqdq xmm6, xmm1)    // h
    AS_XMM_OUTPUT4(SSE2_Salsa_Output_A, REG_input, REG_output, 4, 2, 0, 6, 1, 0, 4, 8, 12, 1)
    AS1( ret)

    ASL(6)
#if CRYPTOPP_BOOL_X64
    SSE2_QUARTER_ROUND_X16(0, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
    ASL(2)
    SSE2_QUARTER_ROUND_X16(0, 0, 13, 10, 7, 1, 14, 11, 4, 2, 15, 8, 5, 3, 12, 9, 6)
#else
    SSE2_QUARTER_ROUND_X8(0, 2, 6, 10, 14, 3, 7, 11, 15)
    SSE2_QUARTER_ROUND_X8(0, 0, 4, 8, 12, 1, 5, 9, 13)
    ASL(2)
    SSE2_QUARTER_ROUND_X8(0, 2, 15, 8, 5, 3, 12, 9, 6)
    SSE2_QUARTER_ROUND_X8(0, 0, 13, 10, 7, 1, 14, 11, 4)
#endif
    AS2( sub REG_roundsLeft, 2)
    ASJ( jnz, 6, b)

#define SSE2_OUTPUT_4(a, b, c, d) \
    AS2( movdqa xmm4, [SSE2_WORKSPACE + a*16 + 256]) \
    AS2( paddd xmm4, [SSE2_WORKSPACE + a*16]) \
    AS2( movdqa xmm5, [SSE2_WORKSPACE + b*16 + 256]) \
    AS2( paddd xmm5, [SSE2_WORKSPACE + b*16]) \
    AS2( movdqa xmm6, [SSE2_WORKSPACE + c*16 + 256]) \
    AS2( paddd xmm6, [SSE2_WORKSPACE + c*16]) \
    AS2( movdqa xmm7, [SSE2_WORKSPACE + d*16 + 256]) \
    AS2( paddd xmm7, [SSE2_WORKSPACE + d*16]) \
    ASC( call, SSE2_Salsa_Output)

    SSE2_OUTPUT_4(0, 13, 10, 7)
    SSE2_OUTPUT_4(4, 1, 14, 11)
    SSE2_OUTPUT_4(8, 5, 2, 15)
    SSE2_OUTPUT_4(12, 9, 6, 3)
    AS2( test REG_input, REG_input)
    ASJ( jz, 9, f)
    AS2( add REG_input, 12*16)
    ASL(9)
    AS2( add REG_output, 12*16)
    AS2( sub REG_iterationCount, 4)
    AS2( cmp REG_iterationCount, 4)
    ASJ( jge, 1, b)
    AS_POP_IF86( sp)

    ASL(5)
    AS2( sub REG_iterationCount, 1)
    ASJ( jl, 4, f)
    AS2( movdqa xmm0, [REG_state + 0*16])
    AS2( movdqa xmm1, [REG_state + 1*16])
    AS2( movdqa xmm2, [REG_state + 2*16])
    AS2( movdqa xmm3, [REG_state + 3*16])
    AS2( mov REG_roundsLeft, REG_rounds)

    ASL(0)
    SSE2_QUARTER_ROUND(0, 1, 3, 7)
    SSE2_QUARTER_ROUND(1, 2, 0, 9)
    SSE2_QUARTER_ROUND(2, 3, 1, 13)
    SSE2_QUARTER_ROUND(3, 0, 2, 18)
    ASS( pshufd xmm1, xmm1, 2, 1, 0, 3)
    ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
    ASS( pshufd xmm3, xmm3, 0, 3, 2, 1)
    SSE2_QUARTER_ROUND(0, 3, 1, 7)
    SSE2_QUARTER_ROUND(3, 2, 0, 9)
    SSE2_QUARTER_ROUND(2, 1, 3, 13)
    SSE2_QUARTER_ROUND(1, 0, 2, 18)
    ASS( pshufd xmm1, xmm1, 0, 3, 2, 1)
    ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
    ASS( pshufd xmm3, xmm3, 2, 1, 0, 3)
    AS2( sub REG_roundsLeft, 2)
    ASJ( jnz, 0, b)

    AS2( paddd xmm0, [REG_state + 0*16])
    AS2( paddd xmm1, [REG_state + 1*16])
    AS2( paddd xmm2, [REG_state + 2*16])
    AS2( paddd xmm3, [REG_state + 3*16])

    AS2( add dword ptr [REG_state + 8*4], 1)
    AS2( adc dword ptr [REG_state + 5*4], 0)

    AS2( pcmpeqb xmm6, xmm6)              // all ones
    AS2( psrlq xmm6, 32)                  // lo32 mask
    ASS( pshufd xmm7, xmm6, 0, 1, 2, 3)   // hi32 mask
    AS2( movdqa xmm4, xmm0)
    AS2( movdqa xmm5, xmm3)
    AS2( pand xmm0, xmm7)
    AS2( pand xmm4, xmm6)
    AS2( pand xmm3, xmm6)
    AS2( pand xmm5, xmm7)
    AS2( por xmm4, xmm5)    // 0,13,2,15
    AS2( movdqa xmm5, xmm1)
    AS2( pand xmm1, xmm7)
    AS2( pand xmm5, xmm6)
    AS2( por xmm0, xmm5)    // 4,1,6,3
    AS2( pand xmm6, xmm2)
    AS2( pand xmm2, xmm7)
    AS2( por xmm1, xmm6)    // 8,5,10,7
    AS2( por xmm2, xmm3)    // 12,9,14,11

    AS2( movdqa xmm5, xmm4)
    AS2( movdqa xmm6, xmm0)
    AS3( shufpd xmm4, xmm1, 2)    // 0,13,10,7
    AS3( shufpd xmm0, xmm2, 2)    // 4,1,14,11
    AS3( shufpd xmm1, xmm5, 2)    // 8,5,2,15
    AS3( shufpd xmm2, xmm6, 2)    // 12,9,6,3

    // output keystream
    AS_XMM_OUTPUT4(SSE2_Salsa_Output_B, REG_input, REG_output, 4, 0, 1, 2, 3, 0, 1, 2, 3, 4)
    ASJ( jmp, 5, b)
    ASL(4)

    AS_POP_IF86( bp)
#ifdef __GNUC__
    AS_POP_IF86( bx)
        ".att_syntax prefix;"
            :
#if CRYPTOPP_BOOL_X64
            : "r" (m_rounds), "r" (input), "r" (iterationCount), "r" (m_state.data()), "r" (output), "r" (workspace.m_ptr)
            : "%eax", "%edx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
#else
            : "d" (m_rounds), "a" (input), "c" (iterationCount), "S" (m_state.data()), "D" (output)
            : "memory", "cc"
#endif
        );
#endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
    movdqa xmm6, [rsp + 0200h]
    movdqa xmm7, [rsp + 0210h]
    movdqa xmm8, [rsp + 0220h]
    movdqa xmm9, [rsp + 0230h]
    movdqa xmm10, [rsp + 0240h]
    movdqa xmm11, [rsp + 0250h]
    movdqa xmm12, [rsp + 0260h]
    movdqa xmm13, [rsp + 0270h]
    movdqa xmm14, [rsp + 0280h]
    movdqa xmm15, [rsp + 0290h]
    add rsp, 10*16 + 32*16 + 8
    ret
    Salsa20_OperateKeystream ENDP
#else
    }
    else
#endif
#endif
#ifndef CRYPTOPP_GENERATE_X64_MASM
    {
        word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;

        while (iterationCount--)
        {
            x0 = m_state[0];    x1 = m_state[1];    x2 = m_state[2];    x3 = m_state[3];
            x4 = m_state[4];    x5 = m_state[5];    x6 = m_state[6];    x7 = m_state[7];
            x8 = m_state[8];    x9 = m_state[9];    x10 = m_state[10];  x11 = m_state[11];
            x12 = m_state[12];  x13 = m_state[13];  x14 = m_state[14];  x15 = m_state[15];

            for (int i=m_rounds; i>0; i-=2)
            {
#define QUARTER_ROUND(a, b, c, d) \
                b = b ^ rotlFixed(a + d, 7);  \
                c = c ^ rotlFixed(b + a, 9);  \
                d = d ^ rotlFixed(c + b, 13); \
                a = a ^ rotlFixed(d + c, 18);

                QUARTER_ROUND(x0, x4, x8, x12)
                QUARTER_ROUND(x1, x5, x9, x13)
                QUARTER_ROUND(x2, x6, x10, x14)
                QUARTER_ROUND(x3, x7, x11, x15)

                QUARTER_ROUND(x0, x13, x10, x7)
                QUARTER_ROUND(x1, x14, x11, x4)
                QUARTER_ROUND(x2, x15, x8, x5)
                QUARTER_ROUND(x3, x12, x9, x6)
            }

#define SALSA_OUTPUT(x) {\
            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\
            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\
            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\
            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\
            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\
            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\
            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\
            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\
            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\
            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\
            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\
            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);}

#ifndef CRYPTOPP_DOXYGEN_PROCESSING
            CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION);
#endif

            if (++m_state[8] == 0)
                ++m_state[5];
        }
    }
}    // see comment above if an internal compiler error occurs here

void XSalsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
{
    m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);

    if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
        throw InvalidRounds(XSalsa20::StaticAlgorithmName(), m_rounds);

    GetUserKey(LITTLE_ENDIAN_ORDER, m_key.begin(), m_key.size(), key, length);
    if (length == 16)
        memcpy(m_key.begin()+4, m_key.begin(), 16);

    // "expand 32-byte k"
    m_state[0] = 0x61707865;
    m_state[1] = 0x3320646e;
    m_state[2] = 0x79622d32;
    m_state[3] = 0x6b206574;
}

void XSalsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
{
    assert(length==24);

    word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;

    GetBlock<word32, LittleEndian> get(IV);
    get(x14)(x11)(x8)(x5)(m_state[14])(m_state[11]);

    x13 = m_key[0];     x10 = m_key[1];     x7 = m_key[2];      x4 = m_key[3];
    x15 = m_key[4];     x12 = m_key[5];     x9 = m_key[6];      x6 = m_key[7];
    x0 = m_state[0];    x1 = m_state[1];    x2 = m_state[2];    x3 = m_state[3];

    for (int i=m_rounds; i>0; i-=2)
    {
        QUARTER_ROUND(x0, x4, x8, x12)
        QUARTER_ROUND(x1, x5, x9, x13)
        QUARTER_ROUND(x2, x6, x10, x14)
        QUARTER_ROUND(x3, x7, x11, x15)

        QUARTER_ROUND(x0, x13, x10, x7)
        QUARTER_ROUND(x1, x14, x11, x4)
        QUARTER_ROUND(x2, x15, x8, x5)
        QUARTER_ROUND(x3, x12, x9, x6)
    }

    m_state[13] = x0;   m_state[10] = x1;   m_state[7] = x2;    m_state[4] = x3;
    m_state[15] = x14;  m_state[12] = x11;  m_state[9] = x8;    m_state[6] = x5;
    m_state[8] = m_state[5] = 0;
}

NAMESPACE_END

#endif    // #ifndef CRYPTOPP_GENERATE_X64_MASM
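Usage note (not part of salsa.cpp): the policies above are exposed to applications through Crypto++'s SymmetricCipher interface, so the file is normally driven via SetKeyWithIV() and ProcessData() rather than called directly. Below is a minimal sketch of that usage. The key/IV sizes (16 or 32 bytes for the key, 8 bytes for Salsa20's IV, 24 for XSalsa20's) follow CipherSetKey/CipherResynchronize above; the file name, the flat include paths, and the sample plaintext are illustrative assumptions.

// salsa_usage_sketch.cpp - hypothetical example; assumes the Crypto++ headers are on the include path
#include "salsa.h"      // Salsa20, XSalsa20
#include "osrng.h"      // AutoSeededRandomPool
#include "secblock.h"   // SecByteBlock
#include <cstring>
#include <iostream>

int main()
{
    using namespace CryptoPP;

    AutoSeededRandomPool prng;

    // Salsa20: 16- or 32-byte key, 8-byte IV (see Salsa20_Policy::CipherSetKey/CipherResynchronize)
    SecByteBlock key(32), iv(8);
    prng.GenerateBlock(key, key.size());
    prng.GenerateBlock(iv, iv.size());

    byte plaintext[16] = "attack at dawn!";
    byte ciphertext[16], recovered[16];

    Salsa20::Encryption enc;
    enc.SetKeyWithIV(key, key.size(), iv, iv.size());
    enc.ProcessData(ciphertext, plaintext, sizeof(plaintext));

    // Decryption applies the same keystream XOR; Seek() on the cipher maps to SeekToIteration() above
    Salsa20::Decryption dec;
    dec.SetKeyWithIV(key, key.size(), iv, iv.size());
    dec.ProcessData(recovered, ciphertext, sizeof(ciphertext));

    std::cout << (std::memcmp(plaintext, recovered, sizeof(plaintext)) == 0 ? "ok" : "mismatch") << std::endl;

    // XSalsa20 uses the same interface with a 24-byte IV (see XSalsa20_Policy::CipherResynchronize)
    SecByteBlock xiv(24);
    prng.GenerateBlock(xiv, xiv.size());
    XSalsa20::Encryption xenc;
    xenc.SetKeyWithIV(key, key.size(), xiv, xiv.size());

    return 0;
}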