Coverage Report

Created: 2017-10-25 09:10

/root/src/xen/xen/crypto/vmac.c
Line
Count
Source
1
/* --------------------------------------------------------------------------
2
 * VMAC and VHASH Implementation by Ted Krovetz (tdk@acm.org) and Wei Dai.
3
 * This implementation is hereby placed in the public domain.
4
 * The authors offer no warranty. Use at your own risk.
5
 * Please send bug reports to the authors.
6
 * Last modified: 17 APR 08, 1700 PDT
7
 * ----------------------------------------------------------------------- */
8
9
/* start for Xen */
10
#include <xen/init.h>
11
#include <xen/types.h>
12
#include <xen/lib.h>
13
#include <crypto/vmac.h>
14
#define UINT64_C(x)  x##ULL
15
/* end for Xen */
16
17
/* Enable code tuned for 64-bit registers; otherwise tuned for 32-bit */
18
#ifndef VMAC_ARCH_64
19
#define VMAC_ARCH_64 (__x86_64__ || __ppc64__ || _M_X64)
20
#endif
21
22
/* Enable code tuned for Intel SSE2 instruction set                   */
23
#if ((__SSE2__ || (_M_IX86_FP >= 2)) && ( ! VMAC_ARCH_64))
24
#define VMAC_USE_SSE2    1
25
#include <emmintrin.h>
26
#endif
27
28
/* Native word reads. Update (or define via compiler) if incorrect */
29
#ifndef VMAC_ARCH_BIG_ENDIAN       /* Assume big-endian unless on the list */
30
#define VMAC_ARCH_BIG_ENDIAN \
31
    (!(__x86_64__ || __i386__ || _M_IX86 || \
32
       _M_X64 || __ARMEL__ || __MIPSEL__))
33
#endif
34
35
/* ----------------------------------------------------------------------- */
36
/* Constants and masks                                                     */
37
38
const uint64_t p64   = UINT64_C(0xfffffffffffffeff);  /* 2^64 - 257 prime  */
39
const uint64_t m62   = UINT64_C(0x3fffffffffffffff);  /* 62-bit mask       */
40
const uint64_t m63   = UINT64_C(0x7fffffffffffffff);  /* 63-bit mask       */
41
const uint64_t m64   = UINT64_C(0xffffffffffffffff);  /* 64-bit mask       */
42
const uint64_t mpoly = UINT64_C(0x1fffffff1fffffff);  /* Poly key mask     */
43
44
/* ----------------------------------------------------------------------- *
45
 * The following routines are used in this implementation. They are
46
 * written via macros to simulate zero-overhead call-by-reference.
47
 * All have default implementations for when they are not defined in an
48
 * architecture-specific manner.
49
 *
50
 * MUL64: 64x64->128-bit multiplication
51
 * PMUL64: assumes top bits cleared on inputs
52
 * ADD128: 128x128->128-bit addition
53
 * GET_REVERSED_64: load and byte-reverse 64-bit word  
54
 * ----------------------------------------------------------------------- */
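As an illustrative aside (not part of vmac.c), the sketch below shows how these macros compose, using the definitions that follow: MUL64 leaves a full 128-bit product in a high/low register pair, and ADD128 accumulates such pairs with carry propagation.

/* Hypothetical helper, for illustration only: accumulate sum(a[i]*b[i])
 * into a 128-bit (hi,lo) pair using the MUL64/ADD128 macros. */
static void sum_of_products_128(const uint64_t *a, const uint64_t *b, int n,
                                uint64_t *hi, uint64_t *lo)
{
    uint64_t sh = 0, sl = 0, th, tl;
    int i;
    for (i = 0; i < n; i++) {
        MUL64(th, tl, a[i], b[i]);  /* (th,tl) = full 128-bit product */
        ADD128(sh, sl, th, tl);     /* (sh,sl) += (th,tl), with carry */
    }
    *hi = sh;
    *lo = sl;
}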
55
56
/* ----------------------------------------------------------------------- */
57
#if (__GNUC__ && (__x86_64__ || __amd64__))
58
/* ----------------------------------------------------------------------- */
59
60
#define ADD128(rh,rl,ih,il)                                               \
61
0
    asm ("addq %3, %1 \n\t"                                               \
62
0
         "adcq %2, %0"                                                    \
63
0
    : "+r"(rh),"+r"(rl)                                                   \
64
0
    : "r"(ih),"r"(il) : "cc");
65
66
#define MUL64(rh,rl,i1,i2)                                                \
67
0
    asm ("mulq %3" : "=a"(rl), "=d"(rh) : "a"(i1), "r"(i2) : "cc")
68
69
0
#define PMUL64 MUL64
70
71
#define GET_REVERSED_64(p)                                                \
72
0
    ({uint64_t x;                                                         \
73
0
     asm ("bswapq %0" : "=r" (x) : "0"(*(uint64_t *)(p))); x;})
74
75
/* ----------------------------------------------------------------------- */
76
#elif (__GNUC__ && __i386__)
77
/* ----------------------------------------------------------------------- */
78
79
#define GET_REVERSED_64(p)                                                \
80
    ({ uint64_t x;                                                        \
81
    uint32_t *tp = (uint32_t *)(p);                                       \
82
    asm  ("bswap %%edx\n\t"                                               \
83
          "bswap %%eax"                                                   \
84
    : "=A"(x)                                                             \
85
    : "a"(tp[1]), "d"(tp[0]));                                            \
86
    x; })
87
88
/* ----------------------------------------------------------------------- */
89
#elif (__GNUC__ && __ppc64__)
90
/* ----------------------------------------------------------------------- */
91
92
#define ADD128(rh,rl,ih,il)                                               \
93
    asm volatile (  "addc %1, %1, %3 \n\t"                                \
94
                    "adde %0, %0, %2"                                     \
95
    : "+r"(rh),"+r"(rl)                                                   \
96
    : "r"(ih),"r"(il));
97
98
#define MUL64(rh,rl,i1,i2)                                                \
99
{ uint64_t _i1 = (i1), _i2 = (i2);                                        \
100
    rl = _i1 * _i2;                                                       \
101
    asm volatile ("mulhdu %0, %1, %2" : "=r" (rh) : "r" (_i1), "r" (_i2));\
102
}
103
104
#define PMUL64 MUL64
105
106
#define GET_REVERSED_64(p)                                                \
107
    ({ uint32_t hi, lo, *_p = (uint32_t *)(p);                            \
108
       asm volatile ("lwbrx %0, %1, %2" : "=r"(lo) : "b%"(0), "r"(_p) );  \
109
       asm volatile ("lwbrx %0, %1, %2" : "=r"(hi) : "b%"(4), "r"(_p) );  \
110
       ((uint64_t)hi << 32) | (uint64_t)lo; } )
111
112
/* ----------------------------------------------------------------------- */
113
#elif (__GNUC__ && (__ppc__ || __PPC__))
114
/* ----------------------------------------------------------------------- */
115
116
#define GET_REVERSED_64(p)                                                \
117
    ({ uint32_t hi, lo, *_p = (uint32_t *)(p);                            \
118
       asm volatile ("lwbrx %0, %1, %2" : "=r"(lo) : "b%"(0), "r"(_p) );  \
119
       asm volatile ("lwbrx %0, %1, %2" : "=r"(hi) : "b%"(4), "r"(_p) );  \
120
       ((uint64_t)hi << 32) | (uint64_t)lo; } )
121
122
/* ----------------------------------------------------------------------- */
123
#elif (__GNUC__ && (__ARMEL__ || __ARM__))
124
/* ----------------------------------------------------------------------- */
125
126
#define bswap32(v)                                                        \
127
({ uint32_t tmp,out;                                                      \
128
    asm volatile(                                                         \
129
        "eor    %1, %2, %2, ror #16\n"                                    \
130
        "bic    %1, %1, #0x00ff0000\n"                                    \
131
        "mov    %0, %2, ror #8\n"                                         \
132
        "eor    %0, %0, %1, lsr #8"                                       \
133
    : "=r" (out), "=&r" (tmp)                                             \
134
    : "r" (v));                                                           \
135
    out;})
136
137
/* ----------------------------------------------------------------------- */
138
#elif _MSC_VER
139
/* ----------------------------------------------------------------------- */
140
141
#include <intrin.h>
142
143
#if (_M_IA64 || _M_X64) && \
144
    (!defined(__INTEL_COMPILER) || __INTEL_COMPILER >= 1000)
145
#define MUL64(rh,rl,i1,i2)   (rl) = _umul128(i1,i2,&(rh));
146
#pragma intrinsic(_umul128)
147
#define PMUL64 MUL64
148
#endif
149
150
/* MSVC uses add, adc in this version */
151
#define ADD128(rh,rl,ih,il)                                          \
152
    {   uint64_t _il = (il);                                         \
153
        (rl) += (_il);                                               \
154
        (rh) += (ih) + ((rl) < (_il));                               \
155
    }
156
157
#if _MSC_VER >= 1300
158
#define GET_REVERSED_64(p) _byteswap_uint64(*(uint64_t *)(p))
159
#pragma intrinsic(_byteswap_uint64)
160
#endif
161
162
#if _MSC_VER >= 1400 && \
163
    (!defined(__INTEL_COMPILER) || __INTEL_COMPILER >= 1000)
164
#define MUL32(i1,i2)    (__emulu((uint32_t)(i1),(uint32_t)(i2)))
165
#pragma intrinsic(__emulu)
166
#endif
167
168
/* ----------------------------------------------------------------------- */
169
#endif
170
/* ----------------------------------------------------------------------- */
171
172
#if __GNUC__
173
#define ALIGN(n)      __attribute__ ((aligned(n))) 
174
#define NOINLINE      __attribute__ ((noinline))
175
#elif _MSC_VER
176
#define ALIGN(n)      __declspec(align(n))
177
#define NOINLINE      __declspec(noinline)
178
#else
179
#define ALIGN(n)
180
#define NOINLINE
181
#endif
182
183
/* ----------------------------------------------------------------------- */
184
/* Default implementations, if not defined above                           */
185
/* ----------------------------------------------------------------------- */
186
187
#ifndef ADD128
188
#define ADD128(rh,rl,ih,il)                                              \
189
    {   uint64_t _il = (il);                                             \
190
        (rl) += (_il);                                                   \
191
        if ((rl) < (_il)) (rh)++;                                        \
192
        (rh) += (ih);                                                    \
193
    }
194
#endif
195
196
#ifndef MUL32
197
#define MUL32(i1,i2)    ((uint64_t)(uint32_t)(i1)*(uint32_t)(i2))
198
#endif
199
200
#ifndef PMUL64              /* rh may not be same as i1 or i2 */
201
#define PMUL64(rh,rl,i1,i2) /* Assumes m doesn't overflow     */         \
202
    {   uint64_t _i1 = (i1), _i2 = (i2);                                 \
203
        uint64_t m = MUL32(_i1,_i2>>32) + MUL32(_i1>>32,_i2);            \
204
        rh         = MUL32(_i1>>32,_i2>>32);                             \
205
        rl         = MUL32(_i1,_i2);                                     \
206
        ADD128(rh,rl,(m >> 32),(m << 32));                               \
207
    }
208
#endif
209
210
#ifndef MUL64
211
#define MUL64(rh,rl,i1,i2)                                               \
212
    {   uint64_t _i1 = (i1), _i2 = (i2);                                 \
213
        uint64_t m1= MUL32(_i1,_i2>>32);                                 \
214
        uint64_t m2= MUL32(_i1>>32,_i2);                                 \
215
        rh         = MUL32(_i1>>32,_i2>>32);                             \
216
        rl         = MUL32(_i1,_i2);                                     \
217
        ADD128(rh,rl,(m1 >> 32),(m1 << 32));                             \
218
        ADD128(rh,rl,(m2 >> 32),(m2 << 32));                             \
219
    }
220
#endif
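As a sanity sketch (not part of vmac.c, and assuming a compiler that provides GCC's unsigned __int128 extension), the portable MUL64 above implements the schoolbook decomposition i1*i2 = (h1*h2)<<64 + (h1*l2 + l1*h2)<<32 + l1*l2, which can be checked against a native 128-bit multiply:

/* Hypothetical self-check, for illustration only. */
static int mul64_matches_int128(uint64_t i1, uint64_t i2)
{
    uint64_t rh, rl;
    unsigned __int128 ref = (unsigned __int128)i1 * i2;
    MUL64(rh, rl, i1, i2);
    return rh == (uint64_t)(ref >> 64) && rl == (uint64_t)ref;
}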
221
222
#ifndef GET_REVERSED_64
223
#ifndef bswap64
224
#ifndef bswap32
225
#define bswap32(x)                                                        \
226
  ({ uint32_t bsx = (x);                                                  \
227
      ((((bsx) & 0xff000000u) >> 24) | (((bsx) & 0x00ff0000u) >>  8) |    \
228
       (((bsx) & 0x0000ff00u) <<  8) | (((bsx) & 0x000000ffu) << 24)); })
229
#endif
230
#define bswap64(x)                                                        \
231
     ({ union { uint64_t ll; uint32_t l[2]; } w, r;                       \
232
         w.ll = (x);                                                      \
233
         r.l[0] = bswap32 (w.l[1]);                                       \
234
         r.l[1] = bswap32 (w.l[0]);                                       \
235
         r.ll; })
236
#endif
237
#define GET_REVERSED_64(p) bswap64(*(uint64_t *)(p)) 
238
#endif
239
240
/* ----------------------------------------------------------------------- */
241
242
#if (VMAC_PREFER_BIG_ENDIAN)
243
#  define get64PE get64BE
244
#else
245
#  define get64PE get64LE
246
#endif
247
248
#if (VMAC_ARCH_BIG_ENDIAN)
249
#  define get64BE(ptr) (*(uint64_t *)(ptr))
250
#  define get64LE(ptr) GET_REVERSED_64(ptr)
251
#else /* assume little-endian */
252
0
#  define get64BE(ptr) GET_REVERSED_64(ptr)
253
#  define get64LE(ptr) (*(uint64_t *)(ptr))
254
#endif
255
256
257
/* --------------------------------------------------------------------- *
258
 * For highest performance the L1 NH and L2 polynomial hashes should be
259
 * carefully implemented to take advantage of one's target architecture.
260
 * Here these two hash functions are defined multiple times: once for
261
 * 64-bit architectures, once for 32-bit SSE2 architectures, and once
262
 * for the remaining (32-bit) architectures.
263
 * For each, nh_16 *must* be defined (works on multiples of 16 bytes).
264
 * Optionally, nh_vmac_nhbytes can be defined (for multiples of
265
 * VMAC_NHBYTES), and nh_16_2 and nh_vmac_nhbytes_2 (versions that do two
266
 * NH computations at once).
267
 * --------------------------------------------------------------------- */
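For reference, an illustrative sketch (not part of vmac.c, assuming GCC's unsigned __int128) of what every nh_16 variant computes: a 128-bit sum over word pairs of (m[i]+k[i]) * (m[i+1]+k[i+1]), with the inner additions taken modulo 2^64.

/* Hypothetical reference version; the optimized macros below must agree with it. */
static void nh_16_reference(uint64_t *mp, const uint64_t *kp, int nw,
                            uint64_t *rh, uint64_t *rl)
{
    unsigned __int128 sum = 0;
    int i;
    for (i = 0; i < nw; i += 2) {
        uint64_t a = get64PE(mp + i)     + kp[i];      /* additions wrap mod 2^64 */
        uint64_t b = get64PE(mp + i + 1) + kp[i + 1];
        sum += (unsigned __int128)a * b;               /* 128-bit accumulation    */
    }
    *rh = (uint64_t)(sum >> 64);
    *rl = (uint64_t)sum;
}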
268
269
/* ----------------------------------------------------------------------- */
270
#if VMAC_ARCH_64
271
/* ----------------------------------------------------------------------- */
272
273
0
#define nh_16(mp, kp, nw, rh, rl)                                            \
274
0
{   int i; uint64_t th, tl;                                                  \
275
0
    rh = rl = 0;                                                             \
276
0
    for (i = 0; i < nw; i+= 2) {                                             \
277
0
        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i  ],get64PE((mp)+i+1)+(kp)[i+1]);\
278
0
        ADD128(rh,rl,th,tl);                                                 \
279
0
    }                                                                        \
280
0
}
281
#define nh_16_2(mp, kp, nw, rh, rl, rh1, rl1)                                \
282
{   int i; uint64_t th, tl;                                                  \
283
    rh1 = rl1 = rh = rl = 0;                                                 \
284
    for (i = 0; i < nw; i+= 2) {                                             \
285
        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i  ],get64PE((mp)+i+1)+(kp)[i+1]);\
286
        ADD128(rh,rl,th,tl);                                                 \
287
        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i+2],get64PE((mp)+i+1)+(kp)[i+3]);\
288
        ADD128(rh1,rl1,th,tl);                                               \
289
    }                                                                        \
290
}
291
292
#if (VMAC_NHBYTES >= 64) /* These versions do 64-bytes of message at a time */
293
0
#define nh_vmac_nhbytes(mp, kp, nw, rh, rl)                                  \
294
0
{   int i; uint64_t th, tl;                                                  \
295
0
    rh = rl = 0;                                                             \
296
0
    for (i = 0; i < nw; i+= 8) {                                             \
297
0
        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i  ],get64PE((mp)+i+1)+(kp)[i+1]);\
298
0
        ADD128(rh,rl,th,tl);                                                 \
299
0
        MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+2],get64PE((mp)+i+3)+(kp)[i+3]);\
300
0
        ADD128(rh,rl,th,tl);                                                 \
301
0
        MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+4],get64PE((mp)+i+5)+(kp)[i+5]);\
302
0
        ADD128(rh,rl,th,tl);                                                 \
303
0
        MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+6],get64PE((mp)+i+7)+(kp)[i+7]);\
304
0
        ADD128(rh,rl,th,tl);                                                 \
305
0
    }                                                                        \
306
0
}
307
#define nh_vmac_nhbytes_2(mp, kp, nw, rh, rl, rh1, rl1)                      \
308
{   int i; uint64_t th, tl;                                                  \
309
    rh1 = rl1 = rh = rl = 0;                                                 \
310
    for (i = 0; i < nw; i+= 8) {                                             \
311
        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i  ],get64PE((mp)+i+1)+(kp)[i+1]);\
312
        ADD128(rh,rl,th,tl);                                                 \
313
        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i+2],get64PE((mp)+i+1)+(kp)[i+3]);\
314
        ADD128(rh1,rl1,th,tl);                                               \
315
        MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+2],get64PE((mp)+i+3)+(kp)[i+3]);\
316
        ADD128(rh,rl,th,tl);                                                 \
317
        MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+4],get64PE((mp)+i+3)+(kp)[i+5]);\
318
        ADD128(rh1,rl1,th,tl);                                               \
319
        MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+4],get64PE((mp)+i+5)+(kp)[i+5]);\
320
        ADD128(rh,rl,th,tl);                                                 \
321
        MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+6],get64PE((mp)+i+5)+(kp)[i+7]);\
322
        ADD128(rh1,rl1,th,tl);                                               \
323
        MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+6],get64PE((mp)+i+7)+(kp)[i+7]);\
324
        ADD128(rh,rl,th,tl);                                                 \
325
        MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+8],get64PE((mp)+i+7)+(kp)[i+9]);\
326
        ADD128(rh1,rl1,th,tl);                                               \
327
    }                                                                        \
328
}
329
#endif
330
331
0
#define poly_step(ah, al, kh, kl, mh, ml)                   \
332
0
{   uint64_t t1h, t1l, t2h, t2l, t3h, t3l, z=0;             \
333
0
    /* compute ab*cd, put bd into result registers */       \
334
0
    PMUL64(t3h,t3l,al,kh);                                  \
335
0
    PMUL64(t2h,t2l,ah,kl);                                  \
336
0
    PMUL64(t1h,t1l,ah,2*kh);                                \
337
0
    PMUL64(ah,al,al,kl);                                    \
338
0
    /* add 2 * ac to result */                              \
339
0
    ADD128(ah,al,t1h,t1l);                                  \
340
0
    /* add together ad + bc */                              \
341
0
    ADD128(t2h,t2l,t3h,t3l);                                \
342
0
    /* now (ah,al), (t2l,2*t2h) need summing */             \
343
0
    /* first add the high registers, carrying into t2h */   \
344
0
    ADD128(t2h,ah,z,t2l);                                   \
345
0
    /* double t2h and add top bit of ah */                  \
346
0
    t2h = 2 * t2h + (ah >> 63);                             \
347
0
    ah &= m63;                                              \
348
0
    /* now add the low registers */                         \
349
0
    ADD128(ah,al,mh,ml);                                    \
350
0
    ADD128(ah,al,z,t2h);                                    \
351
0
}
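Informally (a restatement for orientation, not taken from the original comments), each poly_step is one Horner step of the L2 polynomial hash:

/*
 * Treating (ah,al), (kh,kl) and (mh,ml) as 128-bit values a, k and m,
 * with a, k < 2^127 and the key words pre-masked by mpoly so PMUL64's
 * no-overflow assumption holds, the step computes, partially reduced,
 *
 *     a  <-  a*k + m   (mod 2^127 - 1)
 *
 * using 2^128 == 2 (mod 2^127 - 1): the high half of the 256-bit product
 * is doubled and folded back into the low 128 bits.
 */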
352
353
/* ----------------------------------------------------------------------- */
354
#elif VMAC_USE_SSE2
355
/* ----------------------------------------------------------------------- */
356
357
// macros from Crypto++ for sharing inline assembly code between MSVC and GNU C
358
#if defined(__GNUC__)
359
  // define these in two steps to allow arguments to be expanded
360
  #define GNU_AS2(x, y) #x ", " #y ";"
361
  #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";"
362
  #define GNU_ASL(x) "\n" #x ":"
363
  #define GNU_ASJ(x, y, z) #x " " #y #z ";"
364
  #define AS2(x, y) GNU_AS2(x, y)
365
  #define AS3(x, y, z) GNU_AS3(x, y, z)
366
  #define ASS(x, y, a, b, c, d) #x ", " #y ", " #a "*64+" #b "*16+" #c "*4+" #d ";"
367
  #define ASL(x) GNU_ASL(x)
368
  #define ASJ(x, y, z) GNU_ASJ(x, y, z)
369
#else
370
  #define AS2(x, y) __asm {x, y}
371
  #define AS3(x, y, z) __asm {x, y, z}
372
  #define ASS(x, y, a, b, c, d) __asm {x, y, _MM_SHUFFLE(a, b, c, d)}
373
  #define ASL(x) __asm {label##x:}
374
  #define ASJ(x, y, z) __asm {x label##y}
375
#endif
376
377
static void NOINLINE nh_16_func(const uint64_t *mp, const uint64_t *kp, size_t nw, uint64_t *rh, uint64_t *rl)
378
{
379
  // This assembly version, using MMX registers, is just as fast as the
380
  // intrinsics version (which uses XMM registers) on the Intel Core 2,
381
  // but is much faster on the Pentium 4. In order to schedule multiplies
382
  // as early as possible, the loop interleaves operations for the current
383
  // block and the next block. To mask out high 32-bits, we use "movd"
384
  // to move the lower 32-bits to the stack and then back. Surprisingly,
385
  // this is faster than any other method.
386
#ifdef __GNUC__
387
  __asm__ __volatile__
388
  (
389
    ".intel_syntax noprefix;"
390
#else
391
    AS2(  mov   esi, mp)
392
    AS2(  mov   edi, kp)
393
    AS2(  mov   ecx, nw)
394
    AS2(  mov   eax, rl)
395
    AS2(  mov   edx, rh)
396
#endif
397
    AS2(  sub   esp, 12)
398
    AS2(  movq  mm6, [esi])
399
    AS2(  paddq mm6, [edi])
400
    AS2(  movq  mm5, [esi+8])
401
    AS2(  paddq mm5, [edi+8])
402
    AS2(  add   esi, 16)
403
    AS2(  add   edi, 16)
404
    AS2(  movq  mm4, mm6)
405
    ASS(  pshufw  mm2, mm6, 1, 0, 3, 2)
406
    AS2(  pmuludq mm6, mm5)
407
    ASS(  pshufw  mm3, mm5, 1, 0, 3, 2)
408
    AS2(  pmuludq mm5, mm2)
409
    AS2(  pmuludq mm2, mm3)
410
    AS2(  pmuludq mm3, mm4)
411
    AS2(  pxor  mm7, mm7)
412
    AS2(  movd  [esp], mm6)
413
    AS2(  psrlq mm6, 32)
414
    AS2(  movd  [esp+4], mm5)
415
    AS2(  psrlq mm5, 32)
416
    AS2(  sub   ecx, 2)
417
    ASJ(  jz,   1, f)
418
    ASL(0)
419
    AS2(  movq  mm0, [esi])
420
    AS2(  paddq mm0, [edi])
421
    AS2(  movq  mm1, [esi+8])
422
    AS2(  paddq mm1, [edi+8])
423
    AS2(  add   esi, 16)
424
    AS2(  add   edi, 16)
425
    AS2(  movq  mm4, mm0)
426
    AS2(  paddq mm5, mm2)
427
    ASS(  pshufw  mm2, mm0, 1, 0, 3, 2)
428
    AS2(  pmuludq mm0, mm1)
429
    AS2(  movd  [esp+8], mm3)
430
    AS2(  psrlq mm3, 32)
431
    AS2(  paddq mm5, mm3)
432
    ASS(  pshufw  mm3, mm1, 1, 0, 3, 2)
433
    AS2(  pmuludq mm1, mm2)
434
    AS2(  pmuludq mm2, mm3)
435
    AS2(  pmuludq mm3, mm4)
436
    AS2(  movd  mm4, [esp])
437
    AS2(  paddq mm7, mm4)
438
    AS2(  movd  mm4, [esp+4])
439
    AS2(  paddq mm6, mm4)
440
    AS2(  movd  mm4, [esp+8])
441
    AS2(  paddq mm6, mm4)
442
    AS2(  movd  [esp], mm0)
443
    AS2(  psrlq mm0, 32)
444
    AS2(  paddq mm6, mm0)
445
    AS2(  movd  [esp+4], mm1)
446
    AS2(  psrlq mm1, 32)
447
    AS2(  paddq mm5, mm1)
448
    AS2(  sub   ecx, 2)
449
    ASJ(  jnz,  0, b)
450
    ASL(1)
451
    AS2(  paddq mm5, mm2)
452
    AS2(  movd  [esp+8], mm3)
453
    AS2(  psrlq mm3, 32)
454
    AS2(  paddq mm5, mm3)
455
    AS2(  movd  mm4, [esp])
456
    AS2(  paddq mm7, mm4)
457
    AS2(  movd  mm4, [esp+4])
458
    AS2(  paddq mm6, mm4)
459
    AS2(  movd  mm4, [esp+8])
460
    AS2(  paddq mm6, mm4)
461
462
    ASS(  pshufw  mm0, mm7, 3, 2, 1, 0)
463
    AS2(  psrlq mm7, 32)
464
    AS2(  paddq mm6, mm7)
465
    AS2(  punpckldq mm0, mm6)
466
    AS2(  psrlq mm6, 32)
467
    AS2(  paddq mm5, mm6)
468
    AS2(  movq  [eax], mm0)
469
    AS2(  movq  [edx], mm5)
470
    AS2(  add   esp, 12)
471
#ifdef __GNUC__
472
    ".att_syntax prefix;"
473
    :
474
    : "S" (mp), "D" (kp), "c" (nw), "a" (rl), "d" (rh)
475
    : "memory", "cc"
476
  );
477
#endif
478
}
479
#define nh_16(mp, kp, nw, rh, rl)   nh_16_func(mp, kp, nw, &(rh), &(rl));
480
481
static void poly_step_func(uint64_t *ahi, uint64_t *alo, const uint64_t *kh,
482
               const uint64_t *kl, const uint64_t *mh, const uint64_t *ml)                  
483
{
484
  // This code tries to schedule the multiplies as early as possible to overcome
485
  // the long latencies on the Pentium 4. It also minimizes "movq" instructions
486
  // which are very expensive on the P4.
487
488
#define a0 [eax+0]
489
#define a1 [eax+4]
490
#define a2 [ebx+0]
491
#define a3 [ebx+4]
492
#define k0 [ecx+0]
493
#define k1 [ecx+4]
494
#define k2 [edx+0]
495
#define k3 [edx+4]
496
497
#ifdef __GNUC__
498
  uint32_t temp;
499
  __asm__ __volatile__
500
  (
501
    "mov %%ebx, %0;"
502
    "mov %1, %%ebx;"
503
    ".intel_syntax noprefix;"
504
#else
505
    AS2(  mov   ebx, ahi)
506
    AS2(  mov   edx, kh)
507
    AS2(  mov   eax, alo)
508
    AS2(  mov   ecx, kl)
509
    AS2(  mov   esi, mh)
510
    AS2(  mov   edi, ml)
511
#endif
512
513
    AS2(  movd  mm0, a3)
514
    AS2(  movq  mm4, mm0)
515
    AS2(  pmuludq mm0, k3)    // a3*k3
516
    AS2(  movd  mm1, a0)
517
    AS2(  pmuludq mm1, k2)    // a0*k2
518
    AS2(  movd  mm2, a1)
519
    AS2(  movd  mm6, k1)
520
    AS2(  pmuludq mm2, mm6)   // a1*k1
521
    AS2(  movd  mm3, a2)
522
    AS2(  movq  mm5, mm3)
523
    AS2(  movd  mm7, k0)
524
    AS2(  pmuludq mm3, mm7)   // a2*k0
525
    AS2(  pmuludq mm4, mm7)   // a3*k0
526
    AS2(  pmuludq mm5, mm6)   // a2*k1
527
    AS2(  psllq mm0, 1)
528
    AS2(  paddq mm0, [esi])
529
    AS2(  paddq mm0, mm1)
530
    AS2(  movd  mm1, a1)
531
    AS2(  paddq mm4, mm5)
532
    AS2(  movq  mm5, mm1)
533
    AS2(  pmuludq mm1, k2)    // a1*k2
534
    AS2(  paddq mm0, mm2)
535
    AS2(  movd  mm2, a0)
536
    AS2(  paddq mm0, mm3)
537
    AS2(  movq  mm3, mm2)
538
    AS2(  pmuludq mm2, k3)    // a0*k3
539
    AS2(  pmuludq mm3, mm7)   // a0*k0
540
    AS2(  movd  esi, mm0)
541
    AS2(  psrlq mm0, 32)
542
    AS2(  pmuludq mm7, mm5)   // a1*k0
543
    AS2(  pmuludq mm5, k3)    // a1*k3
544
    AS2(  paddq mm0, mm1)
545
    AS2(  movd  mm1, a2)
546
    AS2(  pmuludq mm1, k2)    // a2*k2
547
    AS2(  paddq mm0, mm2)
548
    AS2(  paddq mm0, mm4)
549
    AS2(  movq  mm4, mm0)
550
    AS2(  movd  mm2, a3)
551
    AS2(  pmuludq mm2, mm6)   // a3*k1
552
    AS2(  pmuludq mm6, a0)    // a0*k1
553
    AS2(  psrlq mm0, 31)
554
    AS2(  paddq mm0, mm3)
555
    AS2(  movd  mm3, [edi])
556
    AS2(  paddq mm0, mm3)
557
    AS2(  movd  mm3, a2)
558
    AS2(  pmuludq mm3, k3)    // a2*k3
559
    AS2(  paddq mm5, mm1)
560
    AS2(  movd  mm1, a3)
561
    AS2(  pmuludq mm1, k2)    // a3*k2
562
    AS2(  paddq mm5, mm2)
563
    AS2(  movd  mm2, [edi+4])
564
    AS2(  psllq mm5, 1)
565
    AS2(  paddq mm0, mm5)
566
    AS2(  movq  mm5, mm0)
567
    AS2(  psllq mm4, 33)
568
    AS2(  psrlq mm0, 32)
569
    AS2(  paddq mm6, mm7)
570
    AS2(  movd  mm7, esi)
571
    AS2(  paddq mm0, mm6)
572
    AS2(  paddq mm0, mm2)
573
    AS2(  paddq mm3, mm1)
574
    AS2(  psllq mm3, 1)
575
    AS2(  paddq mm0, mm3)
576
    AS2(  psrlq mm4, 1)
577
    AS2(  punpckldq mm5, mm0)
578
    AS2(  psrlq mm0, 32)
579
    AS2(  por   mm4, mm7)
580
    AS2(  paddq mm0, mm4)
581
    AS2(  movq  a0, mm5)
582
    AS2(  movq  a2, mm0)
583
#ifdef __GNUC__
584
    ".att_syntax prefix;"
585
    "mov %0, %%ebx;"
586
    : "=m" (temp)
587
    : "m" (ahi), "D" (ml), "d" (kh), "a" (alo), "S" (mh), "c" (kl)
588
    : "memory", "cc"
589
  );
590
#endif
591
592
593
#undef a0
594
#undef a1
595
#undef a2
596
#undef a3
597
#undef k0
598
#undef k1
599
#undef k2
600
#undef k3
601
}
602
603
#define poly_step(ah, al, kh, kl, mh, ml)   \
604
        poly_step_func(&(ah), &(al), &(kh), &(kl), &(mh), &(ml))
605
606
/* ----------------------------------------------------------------------- */
607
#else /* not VMAC_ARCH_64 and not SSE2 */
608
/* ----------------------------------------------------------------------- */
609
610
#ifndef nh_16
611
#define nh_16(mp, kp, nw, rh, rl)                                       \
612
{   uint64_t t1,t2,m1,m2,t;                                             \
613
    int i;                                                              \
614
    rh = rl = t = 0;                                                    \
615
    for (i = 0; i < nw; i+=2)  {                                        \
616
        t1  = get64PE(mp+i) + kp[i];                                    \
617
        t2  = get64PE(mp+i+1) + kp[i+1];                                \
618
        m2  = MUL32(t1 >> 32, t2);                                      \
619
        m1  = MUL32(t1, t2 >> 32);                                      \
620
        ADD128(rh,rl,MUL32(t1 >> 32,t2 >> 32),MUL32(t1,t2));            \
621
        rh += (uint64_t)(uint32_t)(m1 >> 32) + (uint32_t)(m2 >> 32);    \
622
        t  += (uint64_t)(uint32_t)m1 + (uint32_t)m2;                    \
623
    }                                                                   \
624
    ADD128(rh,rl,(t >> 32),(t << 32));                                  \
625
}
626
#endif
627
628
static void poly_step_func(uint64_t *ahi, uint64_t *alo, const uint64_t *kh,
629
               const uint64_t *kl, const uint64_t *mh, const uint64_t *ml)                  
630
{
631
632
#if VMAC_ARCH_BIG_ENDIAN
633
#define INDEX_HIGH 0
634
#define INDEX_LOW 1
635
#else
636
#define INDEX_HIGH 1
637
#define INDEX_LOW 0
638
#endif
639
640
#define a0 *(((uint32_t*)alo)+INDEX_LOW)
641
#define a1 *(((uint32_t*)alo)+INDEX_HIGH)
642
#define a2 *(((uint32_t*)ahi)+INDEX_LOW)
643
#define a3 *(((uint32_t*)ahi)+INDEX_HIGH)
644
#define k0 *(((uint32_t*)kl)+INDEX_LOW)
645
#define k1 *(((uint32_t*)kl)+INDEX_HIGH)
646
#define k2 *(((uint32_t*)kh)+INDEX_LOW)
647
#define k3 *(((uint32_t*)kh)+INDEX_HIGH)
648
649
    uint64_t p, q, t;
650
    uint32_t t2;
651
652
    p = MUL32(a3, k3);
653
    p += p;
654
  p += *(uint64_t *)mh;
655
    p += MUL32(a0, k2);
656
    p += MUL32(a1, k1);
657
    p += MUL32(a2, k0);
658
    t = (uint32_t)(p);
659
    p >>= 32;
660
    p += MUL32(a0, k3);
661
    p += MUL32(a1, k2);
662
    p += MUL32(a2, k1);
663
    p += MUL32(a3, k0);
664
    t |= ((uint64_t)((uint32_t)p & 0x7fffffff)) << 32;
665
    p >>= 31;
666
    p += (uint64_t)(((uint32_t*)ml)[INDEX_LOW]);
667
    p += MUL32(a0, k0);
668
    q =  MUL32(a1, k3);
669
    q += MUL32(a2, k2);
670
    q += MUL32(a3, k1);
671
    q += q;
672
    p += q;
673
    t2 = (uint32_t)(p);
674
    p >>= 32;
675
    p += (uint64_t)(((uint32_t*)ml)[INDEX_HIGH]);
676
    p += MUL32(a0, k1);
677
    p += MUL32(a1, k0);
678
    q =  MUL32(a2, k3);
679
    q += MUL32(a3, k2);
680
    q += q;
681
    p += q;
682
    *(uint64_t *)(alo) = (p << 32) | t2;
683
    p >>= 32;
684
    *(uint64_t *)(ahi) = p + t;
685
686
#undef a0
687
#undef a1
688
#undef a2
689
#undef a3
690
#undef k0
691
#undef k1
692
#undef k2
693
#undef k3
694
}
695
696
#define poly_step(ah, al, kh, kl, mh, ml)   \
697
        poly_step_func(&(ah), &(al), &(kh), &(kl), &(mh), &(ml))
698
699
/* ----------------------------------------------------------------------- */
700
#endif  /* end of specialized NH and poly definitions */
701
/* ----------------------------------------------------------------------- */
702
703
/* At least nh_16 is defined. Define others as needed here                  */
704
#ifndef nh_16_2
705
#define nh_16_2(mp, kp, nw, rh, rl, rh2, rl2)                           \
706
    nh_16(mp, kp, nw, rh, rl);                                          \
707
    nh_16(mp, ((kp)+2), nw, rh2, rl2);
708
#endif
709
#ifndef nh_vmac_nhbytes
710
#define nh_vmac_nhbytes(mp, kp, nw, rh, rl)                             \
711
    nh_16(mp, kp, nw, rh, rl)
712
#endif
713
#ifndef nh_vmac_nhbytes_2
714
#define nh_vmac_nhbytes_2(mp, kp, nw, rh, rl, rh2, rl2)                 \
715
    nh_vmac_nhbytes(mp, kp, nw, rh, rl);                                \
716
    nh_vmac_nhbytes(mp, ((kp)+2), nw, rh2, rl2);
717
#endif
718
719
/* ----------------------------------------------------------------------- */
720
721
static void vhash_abort(vmac_ctx_t *ctx)
722
0
{
723
0
    ctx->polytmp[0] = ctx->polykey[0] ;
724
0
    ctx->polytmp[1] = ctx->polykey[1] ;
725
0
    #if (VMAC_TAG_LEN == 128)
726
    ctx->polytmp[2] = ctx->polykey[2] ;
727
    ctx->polytmp[3] = ctx->polykey[3] ;
728
    #endif
729
0
    ctx->first_block_processed = 0;
730
0
}
731
732
/* ----------------------------------------------------------------------- */
733
static uint64_t l3hash(uint64_t p1, uint64_t p2,
734
                       uint64_t k1, uint64_t k2, uint64_t len)
735
0
{
736
0
    uint64_t rh, rl, t, z=0;
737
0
738
0
    /* fully reduce (p1,p2)+(len,0) mod p127 */
739
0
    t = p1 >> 63;
740
0
    p1 &= m63;
741
0
    ADD128(p1, p2, len, t);
742
0
    /* At this point, (p1,p2) is at most 2^127+(len<<64) */
743
0
    t = (p1 > m63) + ((p1 == m63) && (p2 == m64));
744
0
    ADD128(p1, p2, z, t);
745
0
    p1 &= m63;
746
0
747
0
    /* compute (p1,p2)/(2^64-2^32) and (p1,p2)%(2^64-2^32) */
748
0
    t = p1 + (p2 >> 32);
749
0
    t += (t >> 32);
750
0
    t += (uint32_t)t > 0xfffffffeu;
751
0
    p1 += (t >> 32);
752
0
    p2 += (p1 << 32);
753
0
754
0
    /* compute (p1+k1)%p64 and (p2+k2)%p64 */
755
0
    p1 += k1;
756
0
    p1 += (0 - (p1 < k1)) & 257;
757
0
    p2 += k2;
758
0
    p2 += (0 - (p2 < k2)) & 257;
759
0
760
0
    /* compute (p1+k1)*(p2+k2)%p64 */
761
0
    MUL64(rh, rl, p1, p2);
762
0
    t = rh >> 56;
763
0
    ADD128(t, rl, z, rh);
764
0
    rh <<= 8;
765
0
    ADD128(t, rl, z, rh);
766
0
    t += t << 8;
767
0
    rl += t;
768
0
    rl += (0 - (rl < t)) & 257;
769
0
    rl += (0 - (rl > p64-1)) & 257;
770
0
    return rl;
771
0
}
772
773
/* ----------------------------------------------------------------------- */
774
775
void vhash_update(unsigned char *m,
776
                  unsigned int   mbytes, /* Positive multiple of VMAC_NHBYTES */
777
                  vmac_ctx_t    *ctx)
778
0
{
779
0
    uint64_t rh, rl, *mptr;
780
0
    const uint64_t *kptr = (uint64_t *)ctx->nhkey;
781
0
    int i;
782
0
    uint64_t ch, cl;
783
0
    uint64_t pkh = ctx->polykey[0];
784
0
    uint64_t pkl = ctx->polykey[1];
785
0
    #if (VMAC_TAG_LEN == 128)
786
    uint64_t ch2, cl2, rh2, rl2;
787
    uint64_t pkh2 = ctx->polykey[2];
788
    uint64_t pkl2 = ctx->polykey[3];
789
    #endif
790
0
791
0
    mptr = (uint64_t *)m;
792
0
    i = mbytes / VMAC_NHBYTES;  /* Must be non-zero */
793
0
794
0
    ch = ctx->polytmp[0];
795
0
    cl = ctx->polytmp[1];
796
0
    #if (VMAC_TAG_LEN == 128)
797
    ch2 = ctx->polytmp[2];
798
    cl2 = ctx->polytmp[3];
799
    #endif
800
0
    
801
0
    if ( ! ctx->first_block_processed) {
802
0
        ctx->first_block_processed = 1;
803
0
        #if (VMAC_TAG_LEN == 64)
804
0
        nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
805
0
        #else
806
        nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
807
        rh2 &= m62;
808
        ADD128(ch2,cl2,rh2,rl2);
809
        #endif
810
0
        rh &= m62;
811
0
        ADD128(ch,cl,rh,rl);
812
0
        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
813
0
        i--;
814
0
    }
815
0
816
0
    while (i--) {
817
0
        #if (VMAC_TAG_LEN == 64)
818
0
        nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
819
0
        #else
820
        nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
821
        rh2 &= m62;
822
        poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
823
        #endif
824
0
        rh &= m62;
825
0
        poly_step(ch,cl,pkh,pkl,rh,rl);
826
0
        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
827
0
    }
828
0
829
0
    ctx->polytmp[0] = ch;
830
0
    ctx->polytmp[1] = cl;
831
0
    #if (VMAC_TAG_LEN == 128)
832
    ctx->polytmp[2] = ch2;
833
    ctx->polytmp[3] = cl2;
834
    #endif
835
0
    #if VMAC_USE_SSE2
836
    _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */
837
    #endif
838
0
}
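An illustrative caller of the incremental interface (hypothetical code, not part of vmac.c, mirroring the commented-out test in main() below): stream whole multiples of VMAC_NHBYTES through vhash_update, then let vmac absorb the remainder and finalize.

/* Hypothetical example; msg, len, nonce and ctx are the caller's. */
static uint64_t vmac_incremental_example(unsigned char *msg, unsigned int len,
                                         unsigned char nonce[16], vmac_ctx_t *ctx)
{
    unsigned int whole = (len / VMAC_NHBYTES) * VMAC_NHBYTES;
    uint64_t tagl;  /* second tag half, only used when VMAC_TAG_LEN == 128 */

    if (whole)
        vhash_update(msg, whole, ctx);       /* positive multiple of VMAC_NHBYTES */
    return vmac(msg + whole, len - whole, nonce, &tagl, ctx);
}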
839
840
/* ----------------------------------------------------------------------- */
841
842
uint64_t vhash(unsigned char m[],
843
          unsigned int mbytes,
844
          uint64_t *tagl,
845
          vmac_ctx_t *ctx)
846
0
{
847
0
    uint64_t rh, rl, *mptr;
848
0
    const uint64_t *kptr = (uint64_t *)ctx->nhkey;
849
0
    int i, remaining;
850
0
    uint64_t ch, cl;
851
0
    uint64_t pkh = ctx->polykey[0];
852
0
    uint64_t pkl = ctx->polykey[1];
853
0
    #if (VMAC_TAG_LEN == 128)
854
        uint64_t ch2, cl2, rh2, rl2;
855
        uint64_t pkh2 = ctx->polykey[2];
856
        uint64_t pkl2 = ctx->polykey[3];
857
    #endif
858
0
859
0
    mptr = (uint64_t *)m;
860
0
    i = mbytes / VMAC_NHBYTES;
861
0
    remaining = mbytes % VMAC_NHBYTES;
862
0
863
0
    if (ctx->first_block_processed)
864
0
    {
865
0
        ch = ctx->polytmp[0];
866
0
        cl = ctx->polytmp[1];
867
0
        #if (VMAC_TAG_LEN == 128)
868
        ch2 = ctx->polytmp[2];
869
        cl2 = ctx->polytmp[3];
870
        #endif
871
0
    }
872
0
    else if (i)
873
0
    {
874
0
        #if (VMAC_TAG_LEN == 64)
875
0
        nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,ch,cl);
876
0
        #else
877
        nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,ch,cl,ch2,cl2);
878
        ch2 &= m62;
879
        ADD128(ch2,cl2,pkh2,pkl2);
880
        #endif
881
0
        ch &= m62;
882
0
        ADD128(ch,cl,pkh,pkl);
883
0
        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
884
0
        i--;
885
0
    }
886
0
    else if (remaining)
887
0
    {
888
0
        #if (VMAC_TAG_LEN == 64)
889
0
        nh_16(mptr,kptr,2*((remaining+15)/16),ch,cl);
890
0
        #else
891
        nh_16_2(mptr,kptr,2*((remaining+15)/16),ch,cl,ch2,cl2);
892
        ch2 &= m62;
893
        ADD128(ch2,cl2,pkh2,pkl2);
894
        #endif
895
0
        ch &= m62;
896
0
        ADD128(ch,cl,pkh,pkl);
897
0
        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
898
0
        goto do_l3;
899
0
    }
900
0
    else /* Empty String */
901
0
    {
902
0
        ch = pkh; cl = pkl;
903
0
        #if (VMAC_TAG_LEN == 128)
904
        ch2 = pkh2; cl2 = pkl2;
905
        #endif
906
0
        goto do_l3;
907
0
    }
908
0
909
0
    while (i--) {
910
0
        #if (VMAC_TAG_LEN == 64)
911
0
        nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
912
0
        #else
913
        nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
914
        rh2 &= m62;
915
        poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
916
        #endif
917
0
        rh &= m62;
918
0
        poly_step(ch,cl,pkh,pkl,rh,rl);
919
0
        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
920
0
    }
921
0
    if (remaining) {
922
0
        #if (VMAC_TAG_LEN == 64)
923
0
        nh_16(mptr,kptr,2*((remaining+15)/16),rh,rl);
924
0
        #else
925
        nh_16_2(mptr,kptr,2*((remaining+15)/16),rh,rl,rh2,rl2);
926
        rh2 &= m62;
927
        poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
928
        #endif
929
0
        rh &= m62;
930
0
        poly_step(ch,cl,pkh,pkl,rh,rl);
931
0
    }
932
0
933
0
do_l3:
934
0
    #if VMAC_USE_SSE2
935
    _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */
936
    #endif
937
0
    vhash_abort(ctx);
938
0
    remaining *= 8;
939
0
#if (VMAC_TAG_LEN == 128)
940
    *tagl = l3hash(ch2, cl2, ctx->l3key[2], ctx->l3key[3],remaining);
941
#endif
942
0
    return l3hash(ch, cl, ctx->l3key[0], ctx->l3key[1],remaining);
943
0
}
944
945
/* ----------------------------------------------------------------------- */
946
947
uint64_t vmac(unsigned char m[],
948
         unsigned int mbytes,
949
         unsigned char n[16],
950
         uint64_t *tagl,
951
         vmac_ctx_t *ctx)
952
0
{
953
0
#if (VMAC_TAG_LEN == 64)
954
0
    uint64_t *in_n, *out_p;
955
0
    uint64_t p, h;
956
0
    int i;
957
0
    
958
0
    #if VMAC_CACHE_NONCES
959
0
    in_n = ctx->cached_nonce;
960
0
    out_p = ctx->cached_aes;
961
0
    #else
962
    uint64_t tmp[2];
963
    in_n = out_p = tmp;
964
    #endif
965
0
966
0
    i = n[15] & 1;
967
0
    #if VMAC_CACHE_NONCES
968
0
    if ((*(uint64_t *)(n+8) != in_n[1]) ||
969
0
        (*(uint64_t *)(n  ) != in_n[0])) {
970
0
    #endif
971
0
    
972
0
        in_n[0] = *(uint64_t *)(n  );
973
0
        in_n[1] = *(uint64_t *)(n+8);
974
0
        ((unsigned char *)in_n)[15] &= 0xFE;
975
0
        aes_encryption(in_n, out_p, &ctx->cipher_key);
976
0
977
0
    #if VMAC_CACHE_NONCES
978
0
        ((unsigned char *)in_n)[15] |= (unsigned char)(1-i);
979
0
    }
980
0
    #endif
981
0
    p = get64BE(out_p + i);
982
0
    h = vhash(m, mbytes, (uint64_t *)0, ctx);
983
0
    return p + h;
984
0
#else
985
    uint64_t tmp[2];
986
    uint64_t th,tl;
987
    aes_encryption(n, (unsigned char *)tmp, &ctx->cipher_key);
988
    th = vhash(m, mbytes, &tl, ctx);
989
    th += get64BE(tmp);
990
    *tagl = tl + get64BE(tmp+1);
991
    return th;
992
#endif
993
0
}
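To summarize the 64-bit tag path above (an informal restatement, not from the source comments):

/*
 * For VMAC_TAG_LEN == 64:
 *
 *     pad = AES(cipher_key, nonce with the low bit of byte 15 cleared)
 *     tag = VHASH(message) + pad[nonce[15] & 1]        (addition mod 2^64)
 *
 * pad[0] and pad[1] are the two big-endian 64-bit halves of the AES block,
 * so a nonce pair differing only in that low bit can share one AES call,
 * which is what the VMAC_CACHE_NONCES path exploits.
 */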
994
995
/* ----------------------------------------------------------------------- */
996
997
void vmac_set_key(unsigned char user_key[], vmac_ctx_t *ctx)
998
0
{
999
0
    uint64_t in[2] = {0}, out[2];
1000
0
    unsigned i;
1001
0
    aes_key_setup(user_key, &ctx->cipher_key);
1002
0
    
1003
0
    /* Fill nh key */
1004
0
    ((unsigned char *)in)[0] = 0x80; 
1005
0
    for (i = 0; i < sizeof(ctx->nhkey)/8; i+=2) {
1006
0
        aes_encryption((unsigned char *)in, (unsigned char *)out,
1007
0
                                                         &ctx->cipher_key);
1008
0
        ctx->nhkey[i  ] = get64BE(out);
1009
0
        ctx->nhkey[i+1] = get64BE(out+1);
1010
0
        ((unsigned char *)in)[15] += 1;
1011
0
    }
1012
0
1013
0
    /* Fill poly key */
1014
0
    ((unsigned char *)in)[0] = 0xC0; 
1015
0
    in[1] = 0;
1016
0
    for (i = 0; i < sizeof(ctx->polykey)/8; i+=2) {
1017
0
        aes_encryption((unsigned char *)in, (unsigned char *)out,
1018
0
                                                         &ctx->cipher_key);
1019
0
        ctx->polytmp[i  ] = ctx->polykey[i  ] = get64BE(out) & mpoly;
1020
0
        ctx->polytmp[i+1] = ctx->polykey[i+1] = get64BE(out+1) & mpoly;
1021
0
        ((unsigned char *)in)[15] += 1;
1022
0
    }
1023
0
1024
0
    /* Fill ip key */
1025
0
    ((unsigned char *)in)[0] = 0xE0;
1026
0
    in[1] = 0;
1027
0
    for (i = 0; i < sizeof(ctx->l3key)/8; i+=2) {
1028
0
        do {
1029
0
            aes_encryption((unsigned char *)in, (unsigned char *)out,
1030
0
                                                         &ctx->cipher_key);
1031
0
            ctx->l3key[i  ] = get64BE(out);
1032
0
            ctx->l3key[i+1] = get64BE(out+1);
1033
0
            ((unsigned char *)in)[15] += 1;
1034
0
        } while (ctx->l3key[i] >= p64 || ctx->l3key[i+1] >= p64);
1035
0
    }
1036
0
    
1037
0
    /* Invalidate nonce/aes cache and reset other elements */
1038
0
    #if (VMAC_TAG_LEN == 64) && (VMAC_CACHE_NONCES)
1039
0
    ctx->cached_nonce[0] = (uint64_t)-1; /* Ensure illegal nonce */
1040
0
    ctx->cached_nonce[1] = (uint64_t)0;  /* Ensure illegal nonce */
1041
0
    #endif
1042
0
    ctx->first_block_processed = 0;
1043
0
}
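A minimal one-shot usage sketch (hypothetical caller, not part of vmac.c, assuming VMAC_TAG_LEN == 64 and a 16-byte AES user key as in the test code below): derive the internal keys once, then tag each message under a distinct 16-byte nonce.

/* Hypothetical example; key, nonce and message contents are placeholders. */
static uint64_t vmac_tag_example(void)
{
    static ALIGN(16) vmac_ctx_t ctx;
    unsigned char key[16]   = "0123456789abcde";  /* 15 chars + NUL = 16 bytes  */
    unsigned char nonce[16] = { 0 };              /* must be unique per message */
    unsigned char msg[]     = "message to authenticate";
    uint64_t tagl;                                /* unused for 64-bit tags     */

    vmac_set_key(key, &ctx);
    nonce[15] = 1;                                /* e.g. a per-message counter */
    return vmac(msg, sizeof(msg) - 1, nonce, &tagl, &ctx);
}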
1044
1045
/* ----------------------------------------------------------------------- */
1046
1047
1048
#if VMAC_RUN_TESTS
1049
1050
#include <stdlib.h>
1051
#include <stdio.h>
1052
#include <time.h>
1053
#include <string.h>
1054
1055
unsigned prime(void)  /* Wake variable-speed CPU, get a rough speed estimate */
1056
{
1057
    volatile uint64_t i;
1058
    volatile uint64_t j=1;
1059
    unsigned cnt=0;
1060
    volatile clock_t ticks = clock();
1061
    do {
1062
        for (i = 0; i < 500000; i++) {
1063
            uint64_t x = get64PE(&j);
1064
            j = x * x + (uint64_t)ticks;
1065
        }
1066
        cnt++;
1067
    } while (clock() - ticks < (CLOCKS_PER_SEC/2));
1068
    return cnt;  /* cnt is millions of iterations per second */
1069
}
1070
1071
int main(void)
1072
{
1073
    ALIGN(16) vmac_ctx_t ctx, ctx_aio, ctx_inc1, ctx_inc2;
1074
    uint64_t res, tagl;
1075
    void *p;
1076
    unsigned char *m;
1077
    ALIGN(4) unsigned char key[] = "abcdefghijklmnop";
1078
    ALIGN(4) unsigned char nonce[] = "\0\0\0\0\0\0\0\0bcdefghi";
1079
    unsigned int  vector_lengths[] = {0,3,48,300,3000000};
1080
    #if (VMAC_TAG_LEN == 64)
1081
    ALIGN(4) char *should_be[] = {"2576BE1C56D8B81B","2D376CF5B1813CE5",
1082
                        "E8421F61D573D298","4492DF6C5CAC1BBE",
1083
                        "09BA597DD7601113"};
1084
    #else
1085
    ALIGN(4) char *should_be[] = {"472766C70F74ED23481D6D7DE4E80DAC",
1086
                         "4EE815A06A1D71EDD36FC75D51188A42",
1087
                         "09F2C80C8E1007A0C12FAE19FE4504AE",
1088
                         "66438817154850C61D8A412164803BCB",
1089
                         "2B6B02288FFC461B75485DE893C629DC"};
1090
    #endif
1091
    unsigned speed_lengths[] = {16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
1092
    unsigned i, j, *speed_iters;
1093
    clock_t ticks;
1094
    double cpb;
1095
    const unsigned int buf_len = 3 * (1 << 20);
1096
    
1097
    j = prime();
1098
    i = sizeof(speed_lengths)/sizeof(speed_lengths[0]);
1099
    speed_iters = (unsigned *)malloc(i*sizeof(speed_iters[0]));
1100
    speed_iters[i-1] = j * (1 << 12);
1101
    while (--i) speed_iters[i-1] = (unsigned)(1.3 * speed_iters[i]);
1102
    
1103
    /* Initialize context and message buffer, all 16-byte aligned */
1104
    p = malloc(buf_len + 32);
1105
    m = (unsigned char *)(((size_t)p + 16) & ~((size_t)15));
1106
    memset(m, 0, buf_len + 16);
1107
    vmac_set_key(key, &ctx);
1108
    
1109
    /* Test incremental and all-in-one interfaces for correctness */
1110
    vmac_set_key(key, &ctx_aio);
1111
    vmac_set_key(key, &ctx_inc1);
1112
    vmac_set_key(key, &ctx_inc2);
1113
    
1114
    
1115
    /*
1116
    for (i = 0; i <= 512; i++) {
1117
        vhash_update(m,(i/VMAC_NHBYTES)*VMAC_NHBYTES,&ctx_inc1);
1118
        tagh = vmac(m+(i/VMAC_NHBYTES)*VMAC_NHBYTES, i%VMAC_NHBYTES,
1119
                                                      nonce, &tagl, &ctx);
1120
        vhash_update(m,(i/VMAC_NHBYTES)*VMAC_NHBYTES,&ctx_inc1);
1121
        for (j = 0; j < vector_lengths[i]; j++)
1122
            m[j] = (unsigned char)('a'+j%3);
1123
        
1124
    }
1125
    */
1126
    
1127
    /* Generate vectors */
1128
    for (i = 0; i < sizeof(vector_lengths)/sizeof(unsigned int); i++) {
1129
        for (j = 0; j < vector_lengths[i]; j++)
1130
            m[j] = (unsigned char)('a'+j%3);
1131
        res = vmac(m, vector_lengths[i], nonce, &tagl, &ctx);
1132
        #if (VMAC_TAG_LEN == 64)
1133
        printf("\'abc\' * %7u: %016llX Should be: %s\n",
1134
              vector_lengths[i]/3,res,should_be[i]);
1135
        #else
1136
        printf("\'abc\' * %7u: %016llX%016llX\nShould be      : %s\n",
1137
              vector_lengths[i]/3,res,tagl,should_be[i]);
1138
        #endif
1139
    }
1140
1141
    /* Speed test */
1142
    for (i = 0; i < sizeof(speed_lengths)/sizeof(unsigned int); i++) {
1143
        ticks = clock();
1144
        for (j = 0; j < speed_iters[i]; j++) {
1145
            #if HASH_ONLY
1146
            res = vhash(m, speed_lengths[i], &tagl, &ctx);
1147
            #else
1148
            res = vmac(m, speed_lengths[i], nonce, &tagl, &ctx);
1149
            nonce[7]++;
1150
            #endif
1151
        }
1152
        ticks = clock() - ticks;
1153
        cpb = ((ticks*VMAC_HZ)/
1154
              ((double)CLOCKS_PER_SEC*speed_lengths[i]*speed_iters[i]));
1155
        printf("%4u bytes, %2.2f cpb\n", speed_lengths[i], cpb);
1156
    }
1157
    return 1;
1158
}
1159
1160
#endif