/root/src/xen/xen/crypto/vmac.c
Line | Count | Source |
1 | | /* -------------------------------------------------------------------------- |
2 | | * VMAC and VHASH Implementation by Ted Krovetz (tdk@acm.org) and Wei Dai. |
3 | | * This implementation is hereby placed in the public domain. |
4 | | * The authors offer no warranty. Use at your own risk. |
5 | | * Please send bug reports to the authors. |
6 | | * Last modified: 17 APR 08, 1700 PDT |
7 | | * ----------------------------------------------------------------------- */ |
8 | | |
9 | | /* start for Xen */ |
10 | | #include <xen/init.h> |
11 | | #include <xen/types.h> |
12 | | #include <xen/lib.h> |
13 | | #include <crypto/vmac.h> |
14 | | #define UINT64_C(x) x##ULL |
15 | | /* end for Xen */ |
16 | | |
17 | | /* Enable code tuned for 64-bit registers; otherwise tuned for 32-bit */ |
18 | | #ifndef VMAC_ARCH_64 |
19 | | #define VMAC_ARCH_64 (__x86_64__ || __ppc64__ || _M_X64) |
20 | | #endif |
21 | | |
22 | | /* Enable code tuned for Intel SSE2 instruction set */ |
23 | | #if ((__SSE2__ || (_M_IX86_FP >= 2)) && ( ! VMAC_ARCH_64)) |
24 | | #define VMAC_USE_SSE2 1 |
25 | | #include <emmintrin.h> |
26 | | #endif |
27 | | |
28 | | /* Native word reads. Update (or define via compiler) if incorrect */ |
29 | | #ifndef VMAC_ARCH_BIG_ENDIAN /* Assume big-endian unless on the list */ |
30 | | #define VMAC_ARCH_BIG_ENDIAN \ |
31 | | (!(__x86_64__ || __i386__ || _M_IX86 || \ |
32 | | _M_X64 || __ARMEL__ || __MIPSEL__)) |
33 | | #endif |
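/* ----------------------------------------------------------------------- *
 * Illustrative sketch (not in the original file): a runtime probe a port
 * can use to confirm the compile-time VMAC_ARCH_BIG_ENDIAN guess above.
 * ----------------------------------------------------------------------- */
#if 0 /* example only */
static int arch_is_big_endian(void)
{
    const union { uint32_t u; unsigned char b[4]; } probe = { 0x01020304u };
    return probe.b[0] == 0x01; /* 1 on big-endian, 0 on little-endian */
}
/* e.g. ASSERT(arch_is_big_endian() == VMAC_ARCH_BIG_ENDIAN); */
#endif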
34 | | |
35 | | /* ----------------------------------------------------------------------- */ |
36 | | /* Constants and masks */ |
37 | | |
38 | | const uint64_t p64 = UINT64_C(0xfffffffffffffeff); /* 2^64 - 257 prime */ |
39 | | const uint64_t m62 = UINT64_C(0x3fffffffffffffff); /* 62-bit mask */ |
40 | | const uint64_t m63 = UINT64_C(0x7fffffffffffffff); /* 63-bit mask */ |
41 | | const uint64_t m64 = UINT64_C(0xffffffffffffffff); /* 64-bit mask */ |
42 | | const uint64_t mpoly = UINT64_C(0x1fffffff1fffffff); /* Poly key mask */ |
43 | | |
44 | | /* ----------------------------------------------------------------------- * |
45 | | * The following routines are used in this implementation. They are |
46 | | * written via macros to simulate zero-overhead call-by-reference. |
47 | | * All have default implementations for when they are not defined in an |
48 | | * architecture-specific manner. |
49 | | * |
50 | | * MUL64: 64x64->128-bit multiplication |
51 | | * PMUL64: assumes top bits cleared on inputs |
52 | | * ADD128: 128x128->128-bit addition |
53 | | * GET_REVERSED_64: load and byte-reverse 64-bit word |
54 | | * ----------------------------------------------------------------------- */ |
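/* Reference semantics for ADD128 and MUL64, as a hedged sketch: it assumes
 * a compiler providing the non-standard __int128 type, which the portable
 * fallbacks further below deliberately avoid.  Example only:
 */
#if 0
#define ADD128_REF(rh,rl,ih,il)                                           \
    { unsigned __int128 _s = (((unsigned __int128)(rh) << 64) | (rl))     \
                           + (((unsigned __int128)(ih) << 64) | (il));    \
      (rh) = (uint64_t)(_s >> 64); (rl) = (uint64_t)_s;                   \
    }
#define MUL64_REF(rh,rl,i1,i2)                                            \
    { unsigned __int128 _p = (unsigned __int128)(i1) * (i2);              \
      (rh) = (uint64_t)(_p >> 64); (rl) = (uint64_t)_p;                   \
    }
#endif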
55 | | |
56 | | /* ----------------------------------------------------------------------- */ |
57 | | #if (__GNUC__ && (__x86_64__ || __amd64__)) |
58 | | /* ----------------------------------------------------------------------- */ |
59 | | |
60 | | #define ADD128(rh,rl,ih,il) \ |
61 | 0 | asm ("addq %3, %1 \n\t" \ |
62 | 0 | "adcq %2, %0" \ |
63 | 0 | : "+r"(rh),"+r"(rl) \ |
64 | 0 | : "r"(ih),"r"(il) : "cc"); |
65 | | |
66 | | #define MUL64(rh,rl,i1,i2) \ |
67 | 0 | asm ("mulq %3" : "=a"(rl), "=d"(rh) : "a"(i1), "r"(i2) : "cc") |
68 | | |
69 | 0 | #define PMUL64 MUL64 |
70 | | |
71 | | #define GET_REVERSED_64(p) \ |
72 | 0 | ({uint64_t x; \ |
73 | 0 | asm ("bswapq %0" : "=r" (x) : "0"(*(uint64_t *)(p))); x;}) |
74 | | |
75 | | /* ----------------------------------------------------------------------- */ |
76 | | #elif (__GNUC__ && __i386__) |
77 | | /* ----------------------------------------------------------------------- */ |
78 | | |
79 | | #define GET_REVERSED_64(p) \ |
80 | | ({ uint64_t x; \ |
81 | | uint32_t *tp = (uint32_t *)(p); \ |
82 | | asm ("bswap %%edx\n\t" \ |
83 | | "bswap %%eax" \ |
84 | | : "=A"(x) \ |
85 | | : "a"(tp[1]), "d"(tp[0])); \ |
86 | | x; }) |
87 | | |
88 | | /* ----------------------------------------------------------------------- */ |
89 | | #elif (__GNUC__ && __ppc64__) |
90 | | /* ----------------------------------------------------------------------- */ |
91 | | |
92 | | #define ADD128(rh,rl,ih,il) \ |
93 | | asm volatile ( "addc %1, %1, %3 \n\t" \ |
94 | | "adde %0, %0, %2" \ |
95 | | : "+r"(rh),"+r"(rl) \ |
96 | | : "r"(ih),"r"(il)); |
97 | | |
98 | | #define MUL64(rh,rl,i1,i2) \ |
99 | | { uint64_t _i1 = (i1), _i2 = (i2); \ |
100 | | rl = _i1 * _i2; \ |
101 | | asm volatile ("mulhdu %0, %1, %2" : "=r" (rh) : "r" (_i1), "r" (_i2));\ |
102 | | } |
103 | | |
104 | | #define PMUL64 MUL64 |
105 | | |
106 | | #define GET_REVERSED_64(p) \ |
107 | | ({ uint32_t hi, lo, *_p = (uint32_t *)(p); \ |
108 | | asm volatile ("lwbrx %0, %1, %2" : "=r"(lo) : "b%"(0), "r"(_p) ); \ |
109 | | asm volatile ("lwbrx %0, %1, %2" : "=r"(hi) : "b%"(4), "r"(_p) ); \ |
110 | | ((uint64_t)hi << 32) | (uint64_t)lo; } ) |
111 | | |
112 | | /* ----------------------------------------------------------------------- */ |
113 | | #elif (__GNUC__ && (__ppc__ || __PPC__)) |
114 | | /* ----------------------------------------------------------------------- */ |
115 | | |
116 | | #define GET_REVERSED_64(p) \ |
117 | | ({ uint32_t hi, lo, *_p = (uint32_t *)(p); \ |
118 | | asm volatile ("lwbrx %0, %1, %2" : "=r"(lo) : "b%"(0), "r"(_p) ); \ |
119 | | asm volatile ("lwbrx %0, %1, %2" : "=r"(hi) : "b%"(4), "r"(_p) ); \ |
120 | | ((uint64_t)hi << 32) | (uint64_t)lo; } ) |
121 | | |
122 | | /* ----------------------------------------------------------------------- */ |
123 | | #elif (__GNUC__ && (__ARMEL__ || __ARM__)) |
124 | | /* ----------------------------------------------------------------------- */ |
125 | | |
126 | | #define bswap32(v) \ |
127 | | ({ uint32_t tmp,out; \ |
128 | | asm volatile( \ |
129 | | "eor %1, %2, %2, ror #16\n" \ |
130 | | "bic %1, %1, #0x00ff0000\n" \ |
131 | | "mov %0, %2, ror #8\n" \ |
132 | | "eor %0, %0, %1, lsr #8" \ |
133 | | : "=r" (out), "=&r" (tmp) \ |
134 | | : "r" (v)); \ |
135 | | out;}) |
136 | | |
137 | | /* ----------------------------------------------------------------------- */ |
138 | | #elif _MSC_VER |
139 | | /* ----------------------------------------------------------------------- */ |
140 | | |
141 | | #include <intrin.h> |
142 | | |
143 | | #if (_M_IA64 || _M_X64) && \ |
144 | | (!defined(__INTEL_COMPILER) || __INTEL_COMPILER >= 1000) |
145 | | #define MUL64(rh,rl,i1,i2) (rl) = _umul128(i1,i2,&(rh)); |
146 | | #pragma intrinsic(_umul128) |
147 | | #define PMUL64 MUL64 |
148 | | #endif |
149 | | |
150 | | /* MSVC uses add, adc in this version */ |
151 | | #define ADD128(rh,rl,ih,il) \ |
152 | | { uint64_t _il = (il); \ |
153 | | (rl) += (_il); \ |
154 | | (rh) += (ih) + ((rl) < (_il)); \ |
155 | | } |
156 | | |
157 | | #if _MSC_VER >= 1300 |
158 | | #define GET_REVERSED_64(p) _byteswap_uint64(*(uint64_t *)(p)) |
159 | | #pragma intrinsic(_byteswap_uint64) |
160 | | #endif |
161 | | |
162 | | #if _MSC_VER >= 1400 && \ |
163 | | (!defined(__INTEL_COMPILER) || __INTEL_COMPILER >= 1000) |
164 | | #define MUL32(i1,i2) (__emulu((uint32_t)(i1),(uint32_t)(i2))) |
165 | | #pragma intrinsic(__emulu) |
166 | | #endif |
167 | | |
168 | | /* ----------------------------------------------------------------------- */ |
169 | | #endif |
170 | | /* ----------------------------------------------------------------------- */ |
171 | | |
172 | | #if __GNUC__ |
173 | | #define ALIGN(n) __attribute__ ((aligned(n))) |
174 | | #define NOINLINE __attribute__ ((noinline)) |
175 | | #elif _MSC_VER |
176 | | #define ALIGN(n) __declspec(align(n)) |
177 | | #define NOINLINE __declspec(noinline) |
178 | | #else |
179 | | #define ALIGN(n) |
180 | | #define NOINLINE |
181 | | #endif |
182 | | |
183 | | /* ----------------------------------------------------------------------- */ |
184 | | /* Default implementations, if not defined above */ |
185 | | /* ----------------------------------------------------------------------- */ |
186 | | |
187 | | #ifndef ADD128 |
188 | | #define ADD128(rh,rl,ih,il) \ |
189 | | { uint64_t _il = (il); \ |
190 | | (rl) += (_il); \ |
191 | | if ((rl) < (_il)) (rh)++; \ |
192 | | (rh) += (ih); \ |
193 | | } |
194 | | #endif |
195 | | |
196 | | #ifndef MUL32 |
197 | | #define MUL32(i1,i2) ((uint64_t)(uint32_t)(i1)*(uint32_t)(i2)) |
198 | | #endif |
199 | | |
200 | | #ifndef PMUL64 /* rh may not be same as i1 or i2 */ |
201 | | #define PMUL64(rh,rl,i1,i2) /* Assumes m doesn't overflow */ \ |
202 | | { uint64_t _i1 = (i1), _i2 = (i2); \ |
203 | | uint64_t m = MUL32(_i1,_i2>>32) + MUL32(_i1>>32,_i2); \ |
204 | | rh = MUL32(_i1>>32,_i2>>32); \ |
205 | | rl = MUL32(_i1,_i2); \ |
206 | | ADD128(rh,rl,(m >> 32),(m << 32)); \ |
207 | | } |
208 | | #endif |
209 | | |
210 | | #ifndef MUL64 |
211 | | #define MUL64(rh,rl,i1,i2) \ |
212 | | { uint64_t _i1 = (i1), _i2 = (i2); \ |
213 | | uint64_t m1= MUL32(_i1,_i2>>32); \ |
214 | | uint64_t m2= MUL32(_i1>>32,_i2); \ |
215 | | rh = MUL32(_i1>>32,_i2>>32); \ |
216 | | rl = MUL32(_i1,_i2); \ |
217 | | ADD128(rh,rl,(m1 >> 32),(m1 << 32)); \ |
218 | | ADD128(rh,rl,(m2 >> 32),(m2 << 32)); \ |
219 | | } |
220 | | #endif |
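/* The portable MUL64 above is the schoolbook decomposition: with
 * x = xh*2^32 + xl and y = yh*2^32 + yl,
 *   x*y = (xh*yh)*2^64 + (xh*yl + xl*yh)*2^32 + xl*yl.
 * A self-check sketch against __int128 (an assumption; example only):
 */
#if 0
static int mul64_matches(uint64_t x, uint64_t y)
{
    uint64_t rh, rl;
    unsigned __int128 ref = (unsigned __int128)x * y;

    MUL64(rh, rl, x, y);
    return rh == (uint64_t)(ref >> 64) && rl == (uint64_t)ref;
}
#endif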
221 | | |
222 | | #ifndef GET_REVERSED_64 |
223 | | #ifndef bswap64 |
224 | | #ifndef bswap32 |
225 | | #define bswap32(x) \ |
226 | | ({ uint32_t bsx = (x); \ |
227 | | ((((bsx) & 0xff000000u) >> 24) | (((bsx) & 0x00ff0000u) >> 8) | \ |
228 | | (((bsx) & 0x0000ff00u) << 8) | (((bsx) & 0x000000ffu) << 24)); }) |
229 | | #endif |
230 | | #define bswap64(x) \ |
231 | | ({ union { uint64_t ll; uint32_t l[2]; } w, r; \ |
232 | | w.ll = (x); \ |
233 | | r.l[0] = bswap32 (w.l[1]); \ |
234 | | r.l[1] = bswap32 (w.l[0]); \ |
235 | | r.ll; }) |
236 | | #endif |
237 | | #define GET_REVERSED_64(p) bswap64(*(uint64_t *)(p)) |
238 | | #endif |
239 | | |
240 | | /* ----------------------------------------------------------------------- */ |
241 | | |
242 | | #if (VMAC_PREFER_BIG_ENDIAN) |
243 | | # define get64PE get64BE |
244 | | #else |
245 | | # define get64PE get64LE |
246 | | #endif |
247 | | |
248 | | #if (VMAC_ARCH_BIG_ENDIAN) |
249 | | # define get64BE(ptr) (*(uint64_t *)(ptr)) |
250 | | # define get64LE(ptr) GET_REVERSED_64(ptr) |
251 | | #else /* assume little-endian */ |
252 | 0 | # define get64BE(ptr) GET_REVERSED_64(ptr) |
253 | | # define get64LE(ptr) (*(uint64_t *)(ptr)) |
254 | | #endif |
255 | | |
256 | | |
257 | | /* --------------------------------------------------------------------- * |
258 | | * For highest performance the L1 NH and L2 polynomial hashes should be |
259 | | * carefully implemented to take advantage of one's target architecture. |
260 | | * Here these two hash functions are defined multiple times: once for |
261 | | * 64-bit architectures, once for 32-bit SSE2 architectures, and once |
262 | | * for all remaining (32-bit) architectures. |
263 | | * For each, nh_16 *must* be defined (works on multiples of 16 bytes). |
264 | | * Optionally, nh_vmac_nhbytes can be defined (for multiples of |
265 | | * VMAC_NHBYTES), and nh_16_2 and nh_vmac_nhbytes_2 (versions that do two |
266 | | * NH computations at once). |
267 | | * --------------------------------------------------------------------- */ |
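/* What every nh_16 variant below computes, written as a compact reference
 * (sketch assuming __int128; the real code carries the 128-bit sum in an
 * (rh,rl) pair and reads words with get64PE for endian neutrality):
 */
#if 0
static unsigned __int128 nh_ref(const uint64_t *m, const uint64_t *k, int nw)
{
    unsigned __int128 sum = 0;
    int i;

    for (i = 0; i < nw; i += 2)          /* nw = number of 64-bit words */
        sum += (unsigned __int128)(m[i] + k[i]) * (m[i+1] + k[i+1]);
    return sum;                          /* the sums m+k wrap mod 2^64 */
}
#endif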
268 | | |
269 | | /* ----------------------------------------------------------------------- */ |
270 | | #if VMAC_ARCH_64 |
271 | | /* ----------------------------------------------------------------------- */ |
272 | | |
273 | 0 | #define nh_16(mp, kp, nw, rh, rl) \ |
274 | 0 | { int i; uint64_t th, tl; \ |
275 | 0 | rh = rl = 0; \ |
276 | 0 | for (i = 0; i < nw; i+= 2) { \ |
277 | 0 | MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\ |
278 | 0 | ADD128(rh,rl,th,tl); \ |
279 | 0 | } \ |
280 | 0 | } |
281 | | #define nh_16_2(mp, kp, nw, rh, rl, rh1, rl1) \ |
282 | | { int i; uint64_t th, tl; \ |
283 | | rh1 = rl1 = rh = rl = 0; \ |
284 | | for (i = 0; i < nw; i+= 2) { \ |
285 | | MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\ |
286 | | ADD128(rh,rl,th,tl); \ |
287 | | MUL64(th,tl,get64PE((mp)+i )+(kp)[i+2],get64PE((mp)+i+1)+(kp)[i+3]);\ |
288 | | ADD128(rh1,rl1,th,tl); \ |
289 | | } \ |
290 | | } |
291 | | |
292 | | #if (VMAC_NHBYTES >= 64) /* These versions do 64-bytes of message at a time */ |
293 | 0 | #define nh_vmac_nhbytes(mp, kp, nw, rh, rl) \ |
294 | 0 | { int i; uint64_t th, tl; \ |
295 | 0 | rh = rl = 0; \ |
296 | 0 | for (i = 0; i < nw; i+= 8) { \ |
297 | 0 | MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\ |
298 | 0 | ADD128(rh,rl,th,tl); \ |
299 | 0 | MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+2],get64PE((mp)+i+3)+(kp)[i+3]);\ |
300 | 0 | ADD128(rh,rl,th,tl); \ |
301 | 0 | MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+4],get64PE((mp)+i+5)+(kp)[i+5]);\ |
302 | 0 | ADD128(rh,rl,th,tl); \ |
303 | 0 | MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+6],get64PE((mp)+i+7)+(kp)[i+7]);\ |
304 | 0 | ADD128(rh,rl,th,tl); \ |
305 | 0 | } \ |
306 | 0 | } |
307 | | #define nh_vmac_nhbytes_2(mp, kp, nw, rh, rl, rh1, rl1) \ |
308 | | { int i; uint64_t th, tl; \ |
309 | | rh1 = rl1 = rh = rl = 0; \ |
310 | | for (i = 0; i < nw; i+= 8) { \ |
311 | | MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\ |
312 | | ADD128(rh,rl,th,tl); \ |
313 | | MUL64(th,tl,get64PE((mp)+i )+(kp)[i+2],get64PE((mp)+i+1)+(kp)[i+3]);\ |
314 | | ADD128(rh1,rl1,th,tl); \ |
315 | | MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+2],get64PE((mp)+i+3)+(kp)[i+3]);\ |
316 | | ADD128(rh,rl,th,tl); \ |
317 | | MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+4],get64PE((mp)+i+3)+(kp)[i+5]);\ |
318 | | ADD128(rh1,rl1,th,tl); \ |
319 | | MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+4],get64PE((mp)+i+5)+(kp)[i+5]);\ |
320 | | ADD128(rh,rl,th,tl); \ |
321 | | MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+6],get64PE((mp)+i+5)+(kp)[i+7]);\ |
322 | | ADD128(rh1,rl1,th,tl); \ |
323 | | MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+6],get64PE((mp)+i+7)+(kp)[i+7]);\ |
324 | | ADD128(rh,rl,th,tl); \ |
325 | | MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+8],get64PE((mp)+i+7)+(kp)[i+9]);\ |
326 | | ADD128(rh1,rl1,th,tl); \ |
327 | | } \ |
328 | | } |
329 | | #endif |
330 | | |
331 | 0 | #define poly_step(ah, al, kh, kl, mh, ml) \ |
332 | 0 | { uint64_t t1h, t1l, t2h, t2l, t3h, t3l, z=0; \ |
333 | 0 | /* compute ab*cd, put bd into result registers */ \ |
334 | 0 | PMUL64(t3h,t3l,al,kh); \ |
335 | 0 | PMUL64(t2h,t2l,ah,kl); \ |
336 | 0 | PMUL64(t1h,t1l,ah,2*kh); \ |
337 | 0 | PMUL64(ah,al,al,kl); \ |
338 | 0 | /* add 2 * ac to result */ \ |
339 | 0 | ADD128(ah,al,t1h,t1l); \ |
340 | 0 | /* add together ad + bc */ \ |
341 | 0 | ADD128(t2h,t2l,t3h,t3l); \ |
342 | 0 | /* now (ah,al), (t2l,2*t2h) need summing */ \ |
343 | 0 | /* first add the high registers, carrying into t2h */ \ |
344 | 0 | ADD128(t2h,ah,z,t2l); \ |
345 | 0 | /* double t2h and add top bit of ah */ \ |
346 | 0 | t2h = 2 * t2h + (ah >> 63); \ |
347 | 0 | ah &= m63; \ |
348 | 0 | /* now add the low registers */ \ |
349 | 0 | ADD128(ah,al,mh,ml); \ |
350 | 0 | ADD128(ah,al,z,t2h); \ |
351 | 0 | } |
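/* poly_step is one Horner step a = a*k + m of a polynomial hash evaluated
 * modulo 2^127 - 1 (with lazy reduction).  Since 2^127 == 1 (mod 2^127-1),
 * high product bits are simply folded back into the low bits.  The same
 * trick at toy scale, modulo the Mersenne prime 2^31 - 1 (sketch only):
 */
#if 0
static uint32_t mul_mod_m31(uint32_t a, uint32_t b)   /* a, b < 2^31-1 */
{
    uint64_t p = (uint64_t)a * b;              /* at most 62 bits */
    p = (p & 0x7fffffff) + (p >> 31);          /* fold: 2^31 == 1 */
    p = (p & 0x7fffffff) + (p >> 31);          /* absorb the carry */
    return p == 0x7fffffff ? 0 : (uint32_t)p;  /* canonicalize */
}
#endif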
352 | | |
353 | | /* ----------------------------------------------------------------------- */ |
354 | | #elif VMAC_USE_SSE2 |
355 | | /* ----------------------------------------------------------------------- */ |
356 | | |
357 | | // macros from Crypto++ for sharing inline assembly code between MSVC and GNU C |
358 | | #if defined(__GNUC__) |
359 | | // define these in two steps to allow arguments to be expanded |
360 | | #define GNU_AS2(x, y) #x ", " #y ";" |
361 | | #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";" |
362 | | #define GNU_ASL(x) "\n" #x ":" |
363 | | #define GNU_ASJ(x, y, z) #x " " #y #z ";" |
364 | | #define AS2(x, y) GNU_AS2(x, y) |
365 | | #define AS3(x, y, z) GNU_AS3(x, y, z) |
366 | | #define ASS(x, y, a, b, c, d) #x ", " #y ", " #a "*64+" #b "*16+" #c "*4+" #d ";" |
367 | | #define ASL(x) GNU_ASL(x) |
368 | | #define ASJ(x, y, z) GNU_ASJ(x, y, z) |
369 | | #else |
370 | | #define AS2(x, y) __asm {x, y} |
371 | | #define AS3(x, y, z) __asm {x, y, z} |
372 | | #define ASS(x, y, a, b, c, d) __asm {x, y, _MM_SHUFFLE(a, b, c, d)} |
373 | | #define ASL(x) __asm {label##x:} |
374 | | #define ASJ(x, y, z) __asm {x label##y} |
375 | | #endif |
376 | | |
377 | | static void NOINLINE nh_16_func(const uint64_t *mp, const uint64_t *kp, size_t nw, uint64_t *rh, uint64_t *rl) |
378 | | { |
379 | | // This assembly version, using MMX registers, is just as fast as the |
380 | | // intrinsics version (which uses XMM registers) on the Intel Core 2, |
381 | | // but is much faster on the Pentium 4. In order to schedule multiplies |
382 | | // as early as possible, the loop interleaves operations for the current |
383 | | // block and the next block. To mask out high 32-bits, we use "movd" |
384 | | // to move the lower 32-bits to the stack and then back. Surprisingly, |
385 | | // this is faster than any other method. |
386 | | #ifdef __GNUC__ |
387 | | __asm__ __volatile__ |
388 | | ( |
389 | | ".intel_syntax noprefix;" |
390 | | #else |
391 | | AS2( mov esi, mp) |
392 | | AS2( mov edi, kp) |
393 | | AS2( mov ecx, nw) |
394 | | AS2( mov eax, rl) |
395 | | AS2( mov edx, rh) |
396 | | #endif |
397 | | AS2( sub esp, 12) |
398 | | AS2( movq mm6, [esi]) |
399 | | AS2( paddq mm6, [edi]) |
400 | | AS2( movq mm5, [esi+8]) |
401 | | AS2( paddq mm5, [edi+8]) |
402 | | AS2( add esi, 16) |
403 | | AS2( add edi, 16) |
404 | | AS2( movq mm4, mm6) |
405 | | ASS( pshufw mm2, mm6, 1, 0, 3, 2) |
406 | | AS2( pmuludq mm6, mm5) |
407 | | ASS( pshufw mm3, mm5, 1, 0, 3, 2) |
408 | | AS2( pmuludq mm5, mm2) |
409 | | AS2( pmuludq mm2, mm3) |
410 | | AS2( pmuludq mm3, mm4) |
411 | | AS2( pxor mm7, mm7) |
412 | | AS2( movd [esp], mm6) |
413 | | AS2( psrlq mm6, 32) |
414 | | AS2( movd [esp+4], mm5) |
415 | | AS2( psrlq mm5, 32) |
416 | | AS2( sub ecx, 2) |
417 | | ASJ( jz, 1, f) |
418 | | ASL(0) |
419 | | AS2( movq mm0, [esi]) |
420 | | AS2( paddq mm0, [edi]) |
421 | | AS2( movq mm1, [esi+8]) |
422 | | AS2( paddq mm1, [edi+8]) |
423 | | AS2( add esi, 16) |
424 | | AS2( add edi, 16) |
425 | | AS2( movq mm4, mm0) |
426 | | AS2( paddq mm5, mm2) |
427 | | ASS( pshufw mm2, mm0, 1, 0, 3, 2) |
428 | | AS2( pmuludq mm0, mm1) |
429 | | AS2( movd [esp+8], mm3) |
430 | | AS2( psrlq mm3, 32) |
431 | | AS2( paddq mm5, mm3) |
432 | | ASS( pshufw mm3, mm1, 1, 0, 3, 2) |
433 | | AS2( pmuludq mm1, mm2) |
434 | | AS2( pmuludq mm2, mm3) |
435 | | AS2( pmuludq mm3, mm4) |
436 | | AS2( movd mm4, [esp]) |
437 | | AS2( paddq mm7, mm4) |
438 | | AS2( movd mm4, [esp+4]) |
439 | | AS2( paddq mm6, mm4) |
440 | | AS2( movd mm4, [esp+8]) |
441 | | AS2( paddq mm6, mm4) |
442 | | AS2( movd [esp], mm0) |
443 | | AS2( psrlq mm0, 32) |
444 | | AS2( paddq mm6, mm0) |
445 | | AS2( movd [esp+4], mm1) |
446 | | AS2( psrlq mm1, 32) |
447 | | AS2( paddq mm5, mm1) |
448 | | AS2( sub ecx, 2) |
449 | | ASJ( jnz, 0, b) |
450 | | ASL(1) |
451 | | AS2( paddq mm5, mm2) |
452 | | AS2( movd [esp+8], mm3) |
453 | | AS2( psrlq mm3, 32) |
454 | | AS2( paddq mm5, mm3) |
455 | | AS2( movd mm4, [esp]) |
456 | | AS2( paddq mm7, mm4) |
457 | | AS2( movd mm4, [esp+4]) |
458 | | AS2( paddq mm6, mm4) |
459 | | AS2( movd mm4, [esp+8]) |
460 | | AS2( paddq mm6, mm4) |
461 | | |
462 | | ASS( pshufw mm0, mm7, 3, 2, 1, 0) |
463 | | AS2( psrlq mm7, 32) |
464 | | AS2( paddq mm6, mm7) |
465 | | AS2( punpckldq mm0, mm6) |
466 | | AS2( psrlq mm6, 32) |
467 | | AS2( paddq mm5, mm6) |
468 | | AS2( movq [eax], mm0) |
469 | | AS2( movq [edx], mm5) |
470 | | AS2( add esp, 12) |
471 | | #ifdef __GNUC__ |
472 | | ".att_syntax prefix;" |
473 | | : |
474 | | : "S" (mp), "D" (kp), "c" (nw), "a" (rl), "d" (rh) |
475 | | : "memory", "cc" |
476 | | ); |
477 | | #endif |
478 | | } |
479 | | #define nh_16(mp, kp, nw, rh, rl) nh_16_func(mp, kp, nw, &(rh), &(rl)); |
480 | | |
481 | | static void poly_step_func(uint64_t *ahi, uint64_t *alo, const uint64_t *kh, |
482 | | const uint64_t *kl, const uint64_t *mh, const uint64_t *ml) |
483 | | { |
484 | | // This code tries to schedule the multiplies as early as possible to overcome |
485 | | // the long latencies on the Pentium 4. It also minimizes "movq" instructions |
486 | | // which are very expensive on the P4. |
487 | | |
488 | | #define a0 [eax+0] |
489 | | #define a1 [eax+4] |
490 | | #define a2 [ebx+0] |
491 | | #define a3 [ebx+4] |
492 | | #define k0 [ecx+0] |
493 | | #define k1 [ecx+4] |
494 | | #define k2 [edx+0] |
495 | | #define k3 [edx+4] |
496 | | |
497 | | #ifdef __GNUC__ |
498 | | uint32_t temp; |
499 | | __asm__ __volatile__ |
500 | | ( |
501 | | "mov %%ebx, %0;" |
502 | | "mov %1, %%ebx;" |
503 | | ".intel_syntax noprefix;" |
504 | | #else |
505 | | AS2( mov ebx, ahi) |
506 | | AS2( mov edx, kh) |
507 | | AS2( mov eax, alo) |
508 | | AS2( mov ecx, kl) |
509 | | AS2( mov esi, mh) |
510 | | AS2( mov edi, ml) |
511 | | #endif |
512 | | |
513 | | AS2( movd mm0, a3) |
514 | | AS2( movq mm4, mm0) |
515 | | AS2( pmuludq mm0, k3) // a3*k3 |
516 | | AS2( movd mm1, a0) |
517 | | AS2( pmuludq mm1, k2) // a0*k2 |
518 | | AS2( movd mm2, a1) |
519 | | AS2( movd mm6, k1) |
520 | | AS2( pmuludq mm2, mm6) // a1*k1 |
521 | | AS2( movd mm3, a2) |
522 | | AS2( movq mm5, mm3) |
523 | | AS2( movd mm7, k0) |
524 | | AS2( pmuludq mm3, mm7) // a2*k0 |
525 | | AS2( pmuludq mm4, mm7) // a3*k0 |
526 | | AS2( pmuludq mm5, mm6) // a2*k1 |
527 | | AS2( psllq mm0, 1) |
528 | | AS2( paddq mm0, [esi]) |
529 | | AS2( paddq mm0, mm1) |
530 | | AS2( movd mm1, a1) |
531 | | AS2( paddq mm4, mm5) |
532 | | AS2( movq mm5, mm1) |
533 | | AS2( pmuludq mm1, k2) // a1*k2 |
534 | | AS2( paddq mm0, mm2) |
535 | | AS2( movd mm2, a0) |
536 | | AS2( paddq mm0, mm3) |
537 | | AS2( movq mm3, mm2) |
538 | | AS2( pmuludq mm2, k3) // a0*k3 |
539 | | AS2( pmuludq mm3, mm7) // a0*k0 |
540 | | AS2( movd esi, mm0) |
541 | | AS2( psrlq mm0, 32) |
542 | | AS2( pmuludq mm7, mm5) // a1*k0 |
543 | | AS2( pmuludq mm5, k3) // a1*k3 |
544 | | AS2( paddq mm0, mm1) |
545 | | AS2( movd mm1, a2) |
546 | | AS2( pmuludq mm1, k2) // a2*k2 |
547 | | AS2( paddq mm0, mm2) |
548 | | AS2( paddq mm0, mm4) |
549 | | AS2( movq mm4, mm0) |
550 | | AS2( movd mm2, a3) |
551 | | AS2( pmuludq mm2, mm6) // a3*k1 |
552 | | AS2( pmuludq mm6, a0) // a0*k1 |
553 | | AS2( psrlq mm0, 31) |
554 | | AS2( paddq mm0, mm3) |
555 | | AS2( movd mm3, [edi]) |
556 | | AS2( paddq mm0, mm3) |
557 | | AS2( movd mm3, a2) |
558 | | AS2( pmuludq mm3, k3) // a2*k3 |
559 | | AS2( paddq mm5, mm1) |
560 | | AS2( movd mm1, a3) |
561 | | AS2( pmuludq mm1, k2) // a3*k2 |
562 | | AS2( paddq mm5, mm2) |
563 | | AS2( movd mm2, [edi+4]) |
564 | | AS2( psllq mm5, 1) |
565 | | AS2( paddq mm0, mm5) |
566 | | AS2( movq mm5, mm0) |
567 | | AS2( psllq mm4, 33) |
568 | | AS2( psrlq mm0, 32) |
569 | | AS2( paddq mm6, mm7) |
570 | | AS2( movd mm7, esi) |
571 | | AS2( paddq mm0, mm6) |
572 | | AS2( paddq mm0, mm2) |
573 | | AS2( paddq mm3, mm1) |
574 | | AS2( psllq mm3, 1) |
575 | | AS2( paddq mm0, mm3) |
576 | | AS2( psrlq mm4, 1) |
577 | | AS2( punpckldq mm5, mm0) |
578 | | AS2( psrlq mm0, 32) |
579 | | AS2( por mm4, mm7) |
580 | | AS2( paddq mm0, mm4) |
581 | | AS2( movq a0, mm5) |
582 | | AS2( movq a2, mm0) |
583 | | #ifdef __GNUC__ |
584 | | ".att_syntax prefix;" |
585 | | "mov %0, %%ebx;" |
586 | | : "=m" (temp) |
587 | | : "m" (ahi), "D" (ml), "d" (kh), "a" (alo), "S" (mh), "c" (kl) |
588 | | : "memory", "cc" |
589 | | ); |
590 | | #endif |
591 | | |
592 | | |
593 | | #undef a0 |
594 | | #undef a1 |
595 | | #undef a2 |
596 | | #undef a3 |
597 | | #undef k0 |
598 | | #undef k1 |
599 | | #undef k2 |
600 | | #undef k3 |
601 | | } |
602 | | |
603 | | #define poly_step(ah, al, kh, kl, mh, ml) \ |
604 | | poly_step_func(&(ah), &(al), &(kh), &(kl), &(mh), &(ml)) |
605 | | |
606 | | /* ----------------------------------------------------------------------- */ |
607 | | #else /* not VMAC_ARCH_64 and not SSE2 */ |
608 | | /* ----------------------------------------------------------------------- */ |
609 | | |
610 | | #ifndef nh_16 |
611 | | #define nh_16(mp, kp, nw, rh, rl) \ |
612 | | { uint64_t t1,t2,m1,m2,t; \ |
613 | | int i; \ |
614 | | rh = rl = t = 0; \ |
615 | | for (i = 0; i < nw; i+=2) { \ |
616 | | t1 = get64PE(mp+i) + kp[i]; \ |
617 | | t2 = get64PE(mp+i+1) + kp[i+1]; \ |
618 | | m2 = MUL32(t1 >> 32, t2); \ |
619 | | m1 = MUL32(t1, t2 >> 32); \ |
620 | | ADD128(rh,rl,MUL32(t1 >> 32,t2 >> 32),MUL32(t1,t2)); \ |
621 | | rh += (uint64_t)(uint32_t)(m1 >> 32) + (uint32_t)(m2 >> 32); \ |
622 | | t += (uint64_t)(uint32_t)m1 + (uint32_t)m2; \ |
623 | | } \ |
624 | | ADD128(rh,rl,(t >> 32),(t << 32)); \ |
625 | | } |
626 | | #endif |
627 | | |
628 | | static void poly_step_func(uint64_t *ahi, uint64_t *alo, const uint64_t *kh, |
629 | | const uint64_t *kl, const uint64_t *mh, const uint64_t *ml) |
630 | | { |
631 | | |
632 | | #if VMAC_ARCH_BIG_ENDIAN |
633 | | #define INDEX_HIGH 0 |
634 | | #define INDEX_LOW 1 |
635 | | #else |
636 | | #define INDEX_HIGH 1 |
637 | | #define INDEX_LOW 0 |
638 | | #endif |
639 | | |
640 | | #define a0 *(((uint32_t*)alo)+INDEX_LOW) |
641 | | #define a1 *(((uint32_t*)alo)+INDEX_HIGH) |
642 | | #define a2 *(((uint32_t*)ahi)+INDEX_LOW) |
643 | | #define a3 *(((uint32_t*)ahi)+INDEX_HIGH) |
644 | | #define k0 *(((uint32_t*)kl)+INDEX_LOW) |
645 | | #define k1 *(((uint32_t*)kl)+INDEX_HIGH) |
646 | | #define k2 *(((uint32_t*)kh)+INDEX_LOW) |
647 | | #define k3 *(((uint32_t*)kh)+INDEX_HIGH) |
648 | | |
649 | | uint64_t p, q, t; |
650 | | uint32_t t2; |
651 | | |
652 | | p = MUL32(a3, k3); |
653 | | p += p; |
654 | | p += *(uint64_t *)mh; |
655 | | p += MUL32(a0, k2); |
656 | | p += MUL32(a1, k1); |
657 | | p += MUL32(a2, k0); |
658 | | t = (uint32_t)(p); |
659 | | p >>= 32; |
660 | | p += MUL32(a0, k3); |
661 | | p += MUL32(a1, k2); |
662 | | p += MUL32(a2, k1); |
663 | | p += MUL32(a3, k0); |
664 | | t |= ((uint64_t)((uint32_t)p & 0x7fffffff)) << 32; |
665 | | p >>= 31; |
666 | | p += (uint64_t)(((uint32_t*)ml)[INDEX_LOW]); |
667 | | p += MUL32(a0, k0); |
668 | | q = MUL32(a1, k3); |
669 | | q += MUL32(a2, k2); |
670 | | q += MUL32(a3, k1); |
671 | | q += q; |
672 | | p += q; |
673 | | t2 = (uint32_t)(p); |
674 | | p >>= 32; |
675 | | p += (uint64_t)(((uint32_t*)ml)[INDEX_HIGH]); |
676 | | p += MUL32(a0, k1); |
677 | | p += MUL32(a1, k0); |
678 | | q = MUL32(a2, k3); |
679 | | q += MUL32(a3, k2); |
680 | | q += q; |
681 | | p += q; |
682 | | *(uint64_t *)(alo) = (p << 32) | t2; |
683 | | p >>= 32; |
684 | | *(uint64_t *)(ahi) = p + t; |
685 | | |
686 | | #undef a0 |
687 | | #undef a1 |
688 | | #undef a2 |
689 | | #undef a3 |
690 | | #undef k0 |
691 | | #undef k1 |
692 | | #undef k2 |
693 | | #undef k3 |
694 | | } |
695 | | |
696 | | #define poly_step(ah, al, kh, kl, mh, ml) \ |
697 | | poly_step_func(&(ah), &(al), &(kh), &(kl), &(mh), &(ml)) |
698 | | |
699 | | /* ----------------------------------------------------------------------- */ |
700 | | #endif /* end of specialized NH and poly definitions */ |
701 | | /* ----------------------------------------------------------------------- */ |
702 | | |
703 | | /* At least nh_16 is defined. Define others as needed here */ |
704 | | #ifndef nh_16_2 |
705 | | #define nh_16_2(mp, kp, nw, rh, rl, rh2, rl2) \ |
706 | | nh_16(mp, kp, nw, rh, rl); \ |
707 | | nh_16(mp, ((kp)+2), nw, rh2, rl2); |
708 | | #endif |
709 | | #ifndef nh_vmac_nhbytes |
710 | | #define nh_vmac_nhbytes(mp, kp, nw, rh, rl) \ |
711 | | nh_16(mp, kp, nw, rh, rl) |
712 | | #endif |
713 | | #ifndef nh_vmac_nhbytes_2 |
714 | | #define nh_vmac_nhbytes_2(mp, kp, nw, rh, rl, rh2, rl2) \ |
715 | | nh_vmac_nhbytes(mp, kp, nw, rh, rl); \ |
716 | | nh_vmac_nhbytes(mp, ((kp)+2), nw, rh2, rl2); |
717 | | #endif |
718 | | |
719 | | /* ----------------------------------------------------------------------- */ |
720 | | |
721 | | static void vhash_abort(vmac_ctx_t *ctx) |
722 | 0 | { |
723 | 0 | ctx->polytmp[0] = ctx->polykey[0] ; |
724 | 0 | ctx->polytmp[1] = ctx->polykey[1] ; |
725 | 0 | #if (VMAC_TAG_LEN == 128) |
726 | | ctx->polytmp[2] = ctx->polykey[2] ; |
727 | | ctx->polytmp[3] = ctx->polykey[3] ; |
728 | | #endif |
729 | 0 | ctx->first_block_processed = 0; |
730 | 0 | } |
731 | | |
732 | | /* ----------------------------------------------------------------------- */ |
733 | | static uint64_t l3hash(uint64_t p1, uint64_t p2, |
734 | | uint64_t k1, uint64_t k2, uint64_t len) |
735 | 0 | { |
736 | 0 | uint64_t rh, rl, t, z=0; |
737 | 0 |
|
738 | 0 | /* fully reduce (p1,p2)+(len,0) mod p127 */ |
739 | 0 | t = p1 >> 63; |
740 | 0 | p1 &= m63; |
741 | 0 | ADD128(p1, p2, len, t); |
742 | 0 | /* At this point, (p1,p2) is at most 2^127+(len<<64) */ |
743 | 0 | t = (p1 > m63) + ((p1 == m63) && (p2 == m64)); |
744 | 0 | ADD128(p1, p2, z, t); |
745 | 0 | p1 &= m63; |
746 | 0 |
747 | 0 | /* compute (p1,p2)/(2^64-2^32) and (p1,p2)%(2^64-2^32) */ |
748 | 0 | t = p1 + (p2 >> 32); |
749 | 0 | t += (t >> 32); |
750 | 0 | t += (uint32_t)t > 0xfffffffeu; |
751 | 0 | p1 += (t >> 32); |
752 | 0 | p2 += (p1 << 32); |
753 | 0 |
754 | 0 | /* compute (p1+k1)%p64 and (p2+k2)%p64 */ |
755 | 0 | p1 += k1; |
756 | 0 | p1 += (0 - (p1 < k1)) & 257; |
757 | 0 | p2 += k2; |
758 | 0 | p2 += (0 - (p2 < k2)) & 257; |
759 | 0 |
760 | 0 | /* compute (p1+k1)*(p2+k2)%p64 */ |
761 | 0 | MUL64(rh, rl, p1, p2); |
762 | 0 | t = rh >> 56; |
763 | 0 | ADD128(t, rl, z, rh); |
764 | 0 | rh <<= 8; |
765 | 0 | ADD128(t, rl, z, rh); |
766 | 0 | t += t << 8; |
767 | 0 | rl += t; |
768 | 0 | rl += (0 - (rl < t)) & 257; |
769 | 0 | rl += (0 - (rl > p64-1)) & 257; |
770 | 0 | return rl; |
771 | 0 | } |
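/* l3hash works modulo p64 = 2^64 - 257, so 2^64 == 257 (mod p64) and a
 * 128-bit value hi*2^64 + lo reduces by folding: hi*257 + lo.  A compact
 * reference for that reduction (assumes __int128; example only):
 */
#if 0
static uint64_t mod_p64_ref(uint64_t hi, uint64_t lo)
{
    unsigned __int128 v = (unsigned __int128)hi * 257 + lo;  /* < 2^73 */
    v = (unsigned __int128)(uint64_t)(v >> 64) * 257 + (uint64_t)v;
    while (v >= p64)
        v -= p64;
    return (uint64_t)v;
}
#endif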
772 | | |
773 | | /* ----------------------------------------------------------------------- */ |
774 | | |
775 | | void vhash_update(unsigned char *m, |
776 | | unsigned int mbytes, /* Positive multiple of VMAC_NHBYTES */ |
777 | | vmac_ctx_t *ctx) |
778 | 0 | { |
779 | 0 | uint64_t rh, rl, *mptr; |
780 | 0 | const uint64_t *kptr = (uint64_t *)ctx->nhkey; |
781 | 0 | int i; |
782 | 0 | uint64_t ch, cl; |
783 | 0 | uint64_t pkh = ctx->polykey[0]; |
784 | 0 | uint64_t pkl = ctx->polykey[1]; |
785 | 0 | #if (VMAC_TAG_LEN == 128) |
786 | | uint64_t ch2, cl2, rh2, rl2; |
787 | | uint64_t pkh2 = ctx->polykey[2]; |
788 | | uint64_t pkl2 = ctx->polykey[3]; |
789 | | #endif |
790 | 0 |
791 | 0 | mptr = (uint64_t *)m; |
792 | 0 | i = mbytes / VMAC_NHBYTES; /* Must be non-zero */ |
793 | 0 |
794 | 0 | ch = ctx->polytmp[0]; |
795 | 0 | cl = ctx->polytmp[1]; |
796 | 0 | #if (VMAC_TAG_LEN == 128) |
797 | | ch2 = ctx->polytmp[2]; |
798 | | cl2 = ctx->polytmp[3]; |
799 | | #endif |
800 | 0 | |
801 | 0 | if ( ! ctx->first_block_processed) { |
802 | 0 | ctx->first_block_processed = 1; |
803 | 0 | #if (VMAC_TAG_LEN == 64) |
804 | 0 | nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl); |
805 | 0 | #else |
806 | | nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2); |
807 | | rh2 &= m62; |
808 | | ADD128(ch2,cl2,rh2,rl2); |
809 | | #endif |
810 | 0 | rh &= m62; |
811 | 0 | ADD128(ch,cl,rh,rl); |
812 | 0 | mptr += (VMAC_NHBYTES/sizeof(uint64_t)); |
813 | 0 | i--; |
814 | 0 | } |
815 | 0 |
816 | 0 | while (i--) { |
817 | 0 | #if (VMAC_TAG_LEN == 64) |
818 | 0 | nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl); |
819 | 0 | #else |
820 | | nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2); |
821 | | rh2 &= m62; |
822 | | poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2); |
823 | | #endif |
824 | 0 | rh &= m62; |
825 | 0 | poly_step(ch,cl,pkh,pkl,rh,rl); |
826 | 0 | mptr += (VMAC_NHBYTES/sizeof(uint64_t)); |
827 | 0 | } |
828 | 0 |
829 | 0 | ctx->polytmp[0] = ch; |
830 | 0 | ctx->polytmp[1] = cl; |
831 | 0 | #if (VMAC_TAG_LEN == 128) |
832 | | ctx->polytmp[2] = ch2; |
833 | | ctx->polytmp[3] = cl2; |
834 | | #endif |
835 | 0 | #if VMAC_USE_SSE2 |
836 | | _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */ |
837 | | #endif |
838 | 0 | } |
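/* Incremental usage sketch (hypothetical helper, mirroring the commented
 * test code at the bottom of this file): vhash_update() accepts only whole
 * multiples of VMAC_NHBYTES, so the final partial block goes to vmac(),
 * which also finalizes the tag.  Example only:
 */
#if 0
static uint64_t vmac_incremental(unsigned char *msg, unsigned int len,
                                 unsigned char nonce[16], vmac_ctx_t *ctx)
{
    uint64_t tagl;
    unsigned int whole = (len / VMAC_NHBYTES) * VMAC_NHBYTES;

    if (whole)
        vhash_update(msg, whole, ctx);
    return vmac(msg + whole, len - whole, nonce, &tagl, ctx);
}
#endif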
839 | | |
840 | | /* ----------------------------------------------------------------------- */ |
841 | | |
842 | | uint64_t vhash(unsigned char m[], |
843 | | unsigned int mbytes, |
844 | | uint64_t *tagl, |
845 | | vmac_ctx_t *ctx) |
846 | 0 | { |
847 | 0 | uint64_t rh, rl, *mptr; |
848 | 0 | const uint64_t *kptr = (uint64_t *)ctx->nhkey; |
849 | 0 | int i, remaining; |
850 | 0 | uint64_t ch, cl; |
851 | 0 | uint64_t pkh = ctx->polykey[0]; |
852 | 0 | uint64_t pkl = ctx->polykey[1]; |
853 | 0 | #if (VMAC_TAG_LEN == 128) |
854 | | uint64_t ch2, cl2, rh2, rl2; |
855 | | uint64_t pkh2 = ctx->polykey[2]; |
856 | | uint64_t pkl2 = ctx->polykey[3]; |
857 | | #endif |
858 | 0 |
859 | 0 | mptr = (uint64_t *)m; |
860 | 0 | i = mbytes / VMAC_NHBYTES; |
861 | 0 | remaining = mbytes % VMAC_NHBYTES; |
862 | 0 |
863 | 0 | if (ctx->first_block_processed) |
864 | 0 | { |
865 | 0 | ch = ctx->polytmp[0]; |
866 | 0 | cl = ctx->polytmp[1]; |
867 | 0 | #if (VMAC_TAG_LEN == 128) |
868 | | ch2 = ctx->polytmp[2]; |
869 | | cl2 = ctx->polytmp[3]; |
870 | | #endif |
871 | 0 | } |
872 | 0 | else if (i) |
873 | 0 | { |
874 | 0 | #if (VMAC_TAG_LEN == 64) |
875 | 0 | nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,ch,cl); |
876 | 0 | #else |
877 | | nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,ch,cl,ch2,cl2); |
878 | | ch2 &= m62; |
879 | | ADD128(ch2,cl2,pkh2,pkl2); |
880 | | #endif |
881 | 0 | ch &= m62; |
882 | 0 | ADD128(ch,cl,pkh,pkl); |
883 | 0 | mptr += (VMAC_NHBYTES/sizeof(uint64_t)); |
884 | 0 | i--; |
885 | 0 | } |
886 | 0 | else if (remaining) |
887 | 0 | { |
888 | 0 | #if (VMAC_TAG_LEN == 64) |
889 | 0 | nh_16(mptr,kptr,2*((remaining+15)/16),ch,cl); |
890 | 0 | #else |
891 | | nh_16_2(mptr,kptr,2*((remaining+15)/16),ch,cl,ch2,cl2); |
892 | | ch2 &= m62; |
893 | | ADD128(ch2,cl2,pkh2,pkl2); |
894 | | #endif |
895 | 0 | ch &= m62; |
896 | 0 | ADD128(ch,cl,pkh,pkl); |
897 | 0 | mptr += (VMAC_NHBYTES/sizeof(uint64_t)); |
898 | 0 | goto do_l3; |
899 | 0 | } |
900 | 0 | else /* Empty String */ |
901 | 0 | { |
902 | 0 | ch = pkh; cl = pkl; |
903 | 0 | #if (VMAC_TAG_LEN == 128) |
904 | | ch2 = pkh2; cl2 = pkl2; |
905 | | #endif |
906 | 0 | goto do_l3; |
907 | 0 | } |
908 | 0 |
909 | 0 | while (i--) { |
910 | 0 | #if (VMAC_TAG_LEN == 64) |
911 | 0 | nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl); |
912 | 0 | #else |
913 | | nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2); |
914 | | rh2 &= m62; |
915 | | poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2); |
916 | | #endif |
917 | 0 | rh &= m62; |
918 | 0 | poly_step(ch,cl,pkh,pkl,rh,rl); |
919 | 0 | mptr += (VMAC_NHBYTES/sizeof(uint64_t)); |
920 | 0 | } |
921 | 0 | if (remaining) { |
922 | 0 | #if (VMAC_TAG_LEN == 64) |
923 | 0 | nh_16(mptr,kptr,2*((remaining+15)/16),rh,rl); |
924 | 0 | #else |
925 | | nh_16_2(mptr,kptr,2*((remaining+15)/16),rh,rl,rh2,rl2); |
926 | | rh2 &= m62; |
927 | | poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2); |
928 | | #endif |
929 | 0 | rh &= m62; |
930 | 0 | poly_step(ch,cl,pkh,pkl,rh,rl); |
931 | 0 | } |
932 | 0 |
933 | 0 | do_l3: |
934 | 0 | #if VMAC_USE_SSE2 |
935 | | _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */ |
936 | | #endif |
937 | 0 | vhash_abort(ctx); |
938 | 0 | remaining *= 8; |
939 | 0 | #if (VMAC_TAG_LEN == 128) |
940 | | *tagl = l3hash(ch2, cl2, ctx->l3key[2], ctx->l3key[3],remaining); |
941 | | #endif |
942 | 0 | return l3hash(ch, cl, ctx->l3key[0], ctx->l3key[1],remaining); |
943 | 0 | } |
944 | | |
945 | | /* ----------------------------------------------------------------------- */ |
946 | | |
947 | | uint64_t vmac(unsigned char m[], |
948 | | unsigned int mbytes, |
949 | | unsigned char n[16], |
950 | | uint64_t *tagl, |
951 | | vmac_ctx_t *ctx) |
952 | 0 | { |
953 | 0 | #if (VMAC_TAG_LEN == 64) |
954 | 0 | uint64_t *in_n, *out_p; |
955 | 0 | uint64_t p, h; |
956 | 0 | int i; |
957 | 0 | |
958 | 0 | #if VMAC_CACHE_NONCES |
959 | 0 | in_n = ctx->cached_nonce; |
960 | 0 | out_p = ctx->cached_aes; |
961 | 0 | #else |
962 | | uint64_t tmp[2]; |
963 | | in_n = out_p = tmp; |
964 | | #endif |
965 | 0 |
966 | 0 | i = n[15] & 1; |
967 | 0 | #if VMAC_CACHE_NONCES |
968 | 0 | if ((*(uint64_t *)(n+8) != in_n[1]) || |
969 | 0 | (*(uint64_t *)(n ) != in_n[0])) { |
970 | 0 | #endif |
971 | 0 | |
972 | 0 | in_n[0] = *(uint64_t *)(n ); |
973 | 0 | in_n[1] = *(uint64_t *)(n+8); |
974 | 0 | ((unsigned char *)in_n)[15] &= 0xFE; |
975 | 0 | aes_encryption(in_n, out_p, &ctx->cipher_key); |
976 | 0 |
977 | 0 | #if VMAC_CACHE_NONCES |
978 | 0 | ((unsigned char *)in_n)[15] |= (unsigned char)(1-i); |
979 | 0 | } |
980 | 0 | #endif |
981 | 0 | p = get64BE(out_p + i); |
982 | 0 | h = vhash(m, mbytes, (uint64_t *)0, ctx); |
983 | 0 | return p + h; |
984 | 0 | #else |
985 | | uint64_t tmp[2]; |
986 | | uint64_t th,tl; |
987 | | aes_encryption(n, (unsigned char *)tmp, &ctx->cipher_key); |
988 | | th = vhash(m, mbytes, &tl, ctx); |
989 | | th += get64BE(tmp); |
990 | | *tagl = tl + get64BE(tmp+1); |
991 | | return th; |
992 | | #endif |
993 | 0 | } |
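/* One-shot usage sketch, modelled on the test harness below (example only;
 * the key and nonce here are arbitrary 16-byte buffers, and the nonce must
 * never repeat under one key):
 */
#if 0
static void vmac_example(void)
{
    static ALIGN(16) vmac_ctx_t ctx;
    unsigned char key[16]   = "abcdefghijklmno";   /* 15 chars + NUL */
    unsigned char nonce[16] = { 0 };
    unsigned char msg[64]   = { 0 };
    uint64_t tagl, tag;

    vmac_set_key(key, &ctx);
    tag = vmac(msg, sizeof(msg), nonce, &tagl, &ctx);
    (void)tag;  /* 64-bit tag; with VMAC_TAG_LEN == 128, tagl has the rest */
}
#endif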
994 | | |
995 | | /* ----------------------------------------------------------------------- */ |
996 | | |
997 | | void vmac_set_key(unsigned char user_key[], vmac_ctx_t *ctx) |
998 | 0 | { |
999 | 0 | uint64_t in[2] = {0}, out[2]; |
1000 | 0 | unsigned i; |
1001 | 0 | aes_key_setup(user_key, &ctx->cipher_key); |
1002 | 0 | |
1003 | 0 | /* Fill nh key */ |
1004 | 0 | ((unsigned char *)in)[0] = 0x80; |
1005 | 0 | for (i = 0; i < sizeof(ctx->nhkey)/8; i+=2) { |
1006 | 0 | aes_encryption((unsigned char *)in, (unsigned char *)out, |
1007 | 0 | &ctx->cipher_key); |
1008 | 0 | ctx->nhkey[i ] = get64BE(out); |
1009 | 0 | ctx->nhkey[i+1] = get64BE(out+1); |
1010 | 0 | ((unsigned char *)in)[15] += 1; |
1011 | 0 | } |
1012 | 0 |
1013 | 0 | /* Fill poly key */ |
1014 | 0 | ((unsigned char *)in)[0] = 0xC0; |
1015 | 0 | in[1] = 0; |
1016 | 0 | for (i = 0; i < sizeof(ctx->polykey)/8; i+=2) { |
1017 | 0 | aes_encryption((unsigned char *)in, (unsigned char *)out, |
1018 | 0 | &ctx->cipher_key); |
1019 | 0 | ctx->polytmp[i ] = ctx->polykey[i ] = get64BE(out) & mpoly; |
1020 | 0 | ctx->polytmp[i+1] = ctx->polykey[i+1] = get64BE(out+1) & mpoly; |
1021 | 0 | ((unsigned char *)in)[15] += 1; |
1022 | 0 | } |
1023 | 0 |
1024 | 0 | /* Fill ip key */ |
1025 | 0 | ((unsigned char *)in)[0] = 0xE0; |
1026 | 0 | in[1] = 0; |
1027 | 0 | for (i = 0; i < sizeof(ctx->l3key)/8; i+=2) { |
1028 | 0 | do { |
1029 | 0 | aes_encryption((unsigned char *)in, (unsigned char *)out, |
1030 | 0 | &ctx->cipher_key); |
1031 | 0 | ctx->l3key[i ] = get64BE(out); |
1032 | 0 | ctx->l3key[i+1] = get64BE(out+1); |
1033 | 0 | ((unsigned char *)in)[15] += 1; |
1034 | 0 | } while (ctx->l3key[i] >= p64 || ctx->l3key[i+1] >= p64); |
1035 | 0 | } |
1036 | 0 | |
1037 | 0 | /* Invalidate nonce/aes cache and reset other elements */ |
1038 | 0 | #if (VMAC_TAG_LEN == 64) && (VMAC_CACHE_NONCES) |
1039 | 0 | ctx->cached_nonce[0] = (uint64_t)-1; /* Ensure illegal nonce */ |
1040 | 0 | ctx->cached_nonce[1] = (uint64_t)0; /* Ensure illegal nonce */ |
1041 | 0 | #endif |
1042 | 0 | ctx->first_block_processed = 0; |
1043 | 0 | } |
1044 | | |
1045 | | /* ----------------------------------------------------------------------- */ |
1046 | | |
1047 | | |
1048 | | #if VMAC_RUN_TESTS |
1049 | | |
1050 | | #include <stdlib.h> |
1051 | | #include <stdio.h> |
1052 | | #include <time.h> |
1053 | | #include <string.h> |
1054 | | |
1055 | | unsigned prime(void) /* Wake variable-speed CPU, get rough speed estimate */ |
1056 | | { |
1057 | | volatile uint64_t i; |
1058 | | volatile uint64_t j=1; |
1059 | | unsigned cnt=0; |
1060 | | volatile clock_t ticks = clock(); |
1061 | | do { |
1062 | | for (i = 0; i < 500000; i++) { |
1063 | | uint64_t x = get64PE(&j); |
1064 | | j = x * x + (uint64_t)ticks; |
1065 | | } |
1066 | | cnt++; |
1067 | | } while (clock() - ticks < (CLOCKS_PER_SEC/2)); |
1068 | | return cnt; /* cnt is millions of iterations per second */ |
1069 | | } |
1070 | | |
1071 | | int main(void) |
1072 | | { |
1073 | | ALIGN(16) vmac_ctx_t ctx, ctx_aio, ctx_inc1, ctx_inc2; |
1074 | | uint64_t res, tagl; |
1075 | | void *p; |
1076 | | unsigned char *m; |
1077 | | ALIGN(4) unsigned char key[] = "abcdefghijklmnop"; |
1078 | | ALIGN(4) unsigned char nonce[] = "\0\0\0\0\0\0\0\0bcdefghi"; |
1079 | | unsigned int vector_lengths[] = {0,3,48,300,3000000}; |
1080 | | #if (VMAC_TAG_LEN == 64) |
1081 | | ALIGN(4) char *should_be[] = {"2576BE1C56D8B81B","2D376CF5B1813CE5", |
1082 | | "E8421F61D573D298","4492DF6C5CAC1BBE", |
1083 | | "09BA597DD7601113"}; |
1084 | | #else |
1085 | | ALIGN(4) char *should_be[] = {"472766C70F74ED23481D6D7DE4E80DAC", |
1086 | | "4EE815A06A1D71EDD36FC75D51188A42", |
1087 | | "09F2C80C8E1007A0C12FAE19FE4504AE", |
1088 | | "66438817154850C61D8A412164803BCB", |
1089 | | "2B6B02288FFC461B75485DE893C629DC"}; |
1090 | | #endif |
1091 | | unsigned speed_lengths[] = {16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; |
1092 | | unsigned i, j, *speed_iters; |
1093 | | clock_t ticks; |
1094 | | double cpb; |
1095 | | const unsigned int buf_len = 3 * (1 << 20); |
1096 | | |
1097 | | j = prime(); |
1098 | | i = sizeof(speed_lengths)/sizeof(speed_lengths[0]); |
1099 | | speed_iters = (unsigned *)malloc(i*sizeof(speed_iters[0])); |
1100 | | speed_iters[i-1] = j * (1 << 12); |
1101 | | while (--i) speed_iters[i-1] = (unsigned)(1.3 * speed_iters[i]); |
1102 | | |
1103 | | /* Initialize context and message buffer, all 16-byte aligned */ |
1104 | | p = malloc(buf_len + 32); |
1105 | | m = (unsigned char *)(((size_t)p + 16) & ~((size_t)15)); |
1106 | | memset(m, 0, buf_len + 16); |
1107 | | vmac_set_key(key, &ctx); |
1108 | | |
1109 | | /* Test incremental and all-in-one interfaces for correctness */ |
1110 | | vmac_set_key(key, &ctx_aio); |
1111 | | vmac_set_key(key, &ctx_inc1); |
1112 | | vmac_set_key(key, &ctx_inc2); |
1113 | | |
1114 | | |
1115 | | /* |
1116 | | for (i = 0; i <= 512; i++) { |
1117 | | vhash_update(m,(i/VMAC_NHBYTES)*VMAC_NHBYTES,&ctx_inc1); |
1118 | | tagh = vmac(m+(i/VMAC_NHBYTES)*VMAC_NHBYTES, i%VMAC_NHBYTES, |
1119 | | nonce, &tagl, &ctx); |
1120 | | vhash_update(m,(i/VMAC_NHBYTES)*VMAC_NHBYTES,&ctx_inc1); |
1121 | | for (j = 0; j < vector_lengths[i]; j++) |
1122 | | m[j] = (unsigned char)('a'+j%3); |
1123 | | |
1124 | | } |
1125 | | */ |
1126 | | |
1127 | | /* Generate vectors */ |
1128 | | for (i = 0; i < sizeof(vector_lengths)/sizeof(unsigned int); i++) { |
1129 | | for (j = 0; j < vector_lengths[i]; j++) |
1130 | | m[j] = (unsigned char)('a'+j%3); |
1131 | | res = vmac(m, vector_lengths[i], nonce, &tagl, &ctx); |
1132 | | #if (VMAC_TAG_LEN == 64) |
1133 | | printf("\'abc\' * %7u: %016llX Should be: %s\n", |
1134 | | vector_lengths[i]/3,res,should_be[i]); |
1135 | | #else |
1136 | | printf("\'abc\' * %7u: %016llX%016llX\nShould be : %s\n", |
1137 | | vector_lengths[i]/3,res,tagl,should_be[i]); |
1138 | | #endif |
1139 | | } |
1140 | | |
1141 | | /* Speed test */ |
1142 | | for (i = 0; i < sizeof(speed_lengths)/sizeof(unsigned int); i++) { |
1143 | | ticks = clock(); |
1144 | | for (j = 0; j < speed_iters[i]; j++) { |
1145 | | #if HASH_ONLY |
1146 | | res = vhash(m, speed_lengths[i], &tagl, &ctx); |
1147 | | #else |
1148 | | res = vmac(m, speed_lengths[i], nonce, &tagl, &ctx); |
1149 | | nonce[7]++; |
1150 | | #endif |
1151 | | } |
1152 | | ticks = clock() - ticks; |
1153 | | cpb = ((ticks*VMAC_HZ)/ |
1154 | | ((double)CLOCKS_PER_SEC*speed_lengths[i]*speed_iters[i])); |
1155 | | printf("%4u bytes, %2.2f cpb\n", speed_lengths[i], cpb); |
1156 | | } |
1157 | | return 1; |
1158 | | } |
1159 | | |
1160 | | #endif |