debuggers.hg

annotate xen/common/bunzip2.c @ 22855:1d1eec7e1fb4

xl: Perform minimal validation of virtual disk file while parsing config file

This patch performs some very basic validation on the virtual disk
file passed through the config file. This validation ensures that we
don't go too far with the initialization like spawn qemu and more
while there could be some potentially fundamental issues.

[ Patch fixed up to work with PHYSTYPE_EMPTY 22808:6ec61438713a -iwj ]

Signed-off-by: Kamala Narasimhan <kamala.narasimhan@citrix.com>
Acked-by: Ian Jackson <ian.jackson@eu.citrix.com>
Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
Committed-by: Ian Jackson <ian.jackson@eu.citrix.com>
author Kamala Narasimhan <kamala.narasimhan@gmail.com>
date Tue Jan 25 18:09:49 2011 +0000 (2011-01-25)
parents c4630f8f69cc
children
rev   line source
keir@20447 1 /* vi: set sw = 4 ts = 4: */
keir@20447 2 /* Small bzip2 deflate implementation, by Rob Landley (rob@landley.net).
keir@20447 3
keir@20447 4 Based on bzip2 decompression code by Julian R Seward (jseward@acm.org),
keir@20447 5 which also acknowledges contributions by Mike Burrows, David Wheeler,
keir@20447 6 Peter Fenwick, Alistair Moffat, Radford Neal, Ian H. Witten,
keir@20447 7 Robert Sedgewick, and Jon L. Bentley.
keir@20447 8
keir@20447 9 This code is licensed under the LGPLv2:
keir@20447 10 LGPL (http://www.gnu.org/copyleft/lgpl.html
keir@20447 11 */
keir@20447 12
keir@20447 13 /*
keir@20447 14 Size and speed optimizations by Manuel Novoa III (mjn3@codepoet.org).
keir@20447 15
keir@20447 16 More efficient reading of Huffman codes, a streamlined read_bunzip()
keir@20447 17 function, and various other tweaks. In (limited) tests, approximately
keir@20447 18 20% faster than bzcat on x86 and about 10% faster on arm.
keir@20447 19
keir@20447 20 Note that about 2/3 of the time is spent in read_unzip() reversing
keir@20447 21 the Burrows-Wheeler transformation. Much of that time is delay
keir@20447 22 resulting from cache misses.
keir@20447 23
keir@20447 24 I would ask that anyone benefiting from this work, especially those
keir@20447 25 using it in commercial products, consider making a donation to my local
keir@20447 26 non-profit hospice organization in the name of the woman I loved, who
keir@20447 27 passed away Feb. 12, 2003.
keir@20447 28
keir@20447 29 In memory of Toni W. Hagan
keir@20447 30
keir@20447 31 Hospice of Acadiana, Inc.
keir@20447 32 2600 Johnston St., Suite 200
keir@20447 33 Lafayette, LA 70503-3240
keir@20447 34
keir@20447 35 Phone (337) 232-1234 or 1-800-738-2226
keir@20447 36 Fax (337) 232-1297
keir@20447 37
keir@20447 38 http://www.hospiceacadiana.com/
keir@20447 39
keir@20447 40 Manuel
keir@20447 41 */
keir@20447 42
keir@20447 43 /*
keir@20447 44 Made it fit for running in Linux Kernel by Alain Knaff (alain@knaff.lu)
keir@20447 45 */
keir@20447 46
keir@20447 47 #include "decompress.h"
keir@20447 48
keir@20447 49 #ifndef INT_MAX
keir@20447 50 #define INT_MAX 0x7fffffff
keir@20447 51 #endif
keir@20447 52
keir@20447 53 /* Constants for Huffman coding */
keir@20447 54 #define MAX_GROUPS 6
keir@20447 55 #define GROUP_SIZE 50 /* 64 would have been more efficient */
keir@20447 56 #define MAX_HUFCODE_BITS 20 /* Longest Huffman code allowed */
keir@20447 57 #define MAX_SYMBOLS 258 /* 256 literals + RUNA + RUNB */
keir@20447 58 #define SYMBOL_RUNA 0
keir@20447 59 #define SYMBOL_RUNB 1
keir@20447 60
keir@20447 61 /* Status return values */
keir@20447 62 #define RETVAL_OK 0
keir@20447 63 #define RETVAL_LAST_BLOCK (-1)
keir@20447 64 #define RETVAL_NOT_BZIP_DATA (-2)
keir@20447 65 #define RETVAL_UNEXPECTED_INPUT_EOF (-3)
keir@20447 66 #define RETVAL_UNEXPECTED_OUTPUT_EOF (-4)
keir@20447 67 #define RETVAL_DATA_ERROR (-5)
keir@20447 68 #define RETVAL_OUT_OF_MEMORY (-6)
keir@20447 69 #define RETVAL_OBSOLETE_INPUT (-7)
keir@20447 70
keir@20447 71 /* Other housekeeping constants */
keir@20447 72 #define BZIP2_IOBUF_SIZE 4096
keir@20447 73
keir@20447 74 /* This is what we know about each Huffman coding group */
keir@20447 75 struct group_data {
keir@20447 76 /* We have an extra slot at the end of limit[] for a sentinal value. */
keir@20447 77 int limit[MAX_HUFCODE_BITS+1];
keir@20447 78 int base[MAX_HUFCODE_BITS];
keir@20447 79 int permute[MAX_SYMBOLS];
keir@20447 80 int minLen, maxLen;
keir@20447 81 };
keir@20447 82
keir@20447 83 /* Structure holding all the housekeeping data, including IO buffers and
keir@20447 84 memory that persists between calls to bunzip */
keir@20447 85 struct bunzip_data {
keir@20447 86 /* State for interrupting output loop */
keir@20447 87 int writeCopies, writePos, writeRunCountdown, writeCount, writeCurrent;
keir@20447 88 /* I/O tracking data (file handles, buffers, positions, etc.) */
keir@20447 89 int (*fill)(void*, unsigned int);
keir@20447 90 int inbufCount, inbufPos /*, outbufPos*/;
keir@20447 91 unsigned char *inbuf /*,*outbuf*/;
keir@20447 92 unsigned int inbufBitCount, inbufBits;
keir@20447 93 /* The CRC values stored in the block header and calculated from the
keir@20447 94 data */
keir@20447 95 unsigned int crc32Table[256], headerCRC, totalCRC, writeCRC;
keir@20447 96 /* Intermediate buffer and its size (in bytes) */
keir@20447 97 unsigned int *dbuf, dbufSize;
keir@20447 98 /* These things are a bit too big to go on the stack */
keir@20447 99 unsigned char selectors[32768]; /* nSelectors = 15 bits */
keir@20447 100 struct group_data groups[MAX_GROUPS]; /* Huffman coding tables */
keir@20447 101 int io_error; /* non-zero if we have IO error */
keir@20447 102 };
keir@20447 103
keir@20447 104
keir@20447 105 /* Return the next nnn bits of input. All reads from the compressed input
keir@20447 106 are done through this function. All reads are big endian */
keir@20447 107 static unsigned int INIT get_bits(struct bunzip_data *bd, char bits_wanted)
keir@20447 108 {
keir@20447 109 unsigned int bits = 0;
keir@20447 110
keir@20447 111 /* If we need to get more data from the byte buffer, do so.
keir@20447 112 (Loop getting one byte at a time to enforce endianness and avoid
keir@20447 113 unaligned access.) */
keir@20447 114 while (bd->inbufBitCount < bits_wanted) {
keir@20447 115 /* If we need to read more data from file into byte buffer, do
keir@20447 116 so */
keir@20447 117 if (bd->inbufPos == bd->inbufCount) {
keir@20447 118 if (bd->io_error)
keir@20447 119 return 0;
keir@20447 120 bd->inbufCount = bd->fill(bd->inbuf, BZIP2_IOBUF_SIZE);
keir@20447 121 if (bd->inbufCount <= 0) {
keir@20447 122 bd->io_error = RETVAL_UNEXPECTED_INPUT_EOF;
keir@20447 123 return 0;
keir@20447 124 }
keir@20447 125 bd->inbufPos = 0;
keir@20447 126 }
keir@20447 127 /* Avoid 32-bit overflow (dump bit buffer to top of output) */
keir@20447 128 if (bd->inbufBitCount >= 24) {
keir@20447 129 bits = bd->inbufBits&((1 << bd->inbufBitCount)-1);
keir@20447 130 bits_wanted -= bd->inbufBitCount;
keir@20447 131 bits <<= bits_wanted;
keir@20447 132 bd->inbufBitCount = 0;
keir@20447 133 }
keir@20447 134 /* Grab next 8 bits of input from buffer. */
keir@20447 135 bd->inbufBits = (bd->inbufBits << 8)|bd->inbuf[bd->inbufPos++];
keir@20447 136 bd->inbufBitCount += 8;
keir@20447 137 }
keir@20447 138 /* Calculate result */
keir@20447 139 bd->inbufBitCount -= bits_wanted;
keir@20447 140 bits |= (bd->inbufBits >> bd->inbufBitCount)&((1 << bits_wanted)-1);
keir@20447 141
keir@20447 142 return bits;
keir@20447 143 }
keir@20447 144
keir@20447 145 /* Unpacks the next block and sets up for the inverse burrows-wheeler step. */
keir@20447 146
keir@20447 147 static int INIT get_next_block(struct bunzip_data *bd)
keir@20447 148 {
keir@20447 149 struct group_data *hufGroup = NULL;
keir@20447 150 int *base = NULL;
keir@20447 151 int *limit = NULL;
keir@20447 152 int dbufCount, nextSym, dbufSize, groupCount, selector,
keir@20447 153 i, j, k, t, runPos, symCount, symTotal, nSelectors,
keir@20447 154 byteCount[256];
keir@20447 155 unsigned char uc, symToByte[256], mtfSymbol[256], *selectors;
keir@20447 156 unsigned int *dbuf, origPtr;
keir@20447 157
keir@20447 158 dbuf = bd->dbuf;
keir@20447 159 dbufSize = bd->dbufSize;
keir@20447 160 selectors = bd->selectors;
keir@20447 161
keir@20447 162 /* Read in header signature and CRC, then validate signature.
keir@20447 163 (last block signature means CRC is for whole file, return now) */
keir@20447 164 i = get_bits(bd, 24);
keir@20447 165 j = get_bits(bd, 24);
keir@20447 166 bd->headerCRC = get_bits(bd, 32);
keir@20447 167 if ((i == 0x177245) && (j == 0x385090))
keir@20447 168 return RETVAL_LAST_BLOCK;
keir@20447 169 if ((i != 0x314159) || (j != 0x265359))
keir@20447 170 return RETVAL_NOT_BZIP_DATA;
keir@20447 171 /* We can add support for blockRandomised if anybody complains.
keir@20447 172 There was some code for this in busybox 1.0.0-pre3, but nobody ever
keir@20447 173 noticed that it didn't actually work. */
keir@20447 174 if (get_bits(bd, 1))
keir@20447 175 return RETVAL_OBSOLETE_INPUT;
keir@20447 176 origPtr = get_bits(bd, 24);
keir@20447 177 if (origPtr > dbufSize)
keir@20447 178 return RETVAL_DATA_ERROR;
keir@20447 179 /* mapping table: if some byte values are never used (encoding things
keir@20447 180 like ascii text), the compression code removes the gaps to have fewer
keir@20447 181 symbols to deal with, and writes a sparse bitfield indicating which
keir@20447 182 values were present. We make a translation table to convert the
keir@20447 183 symbols back to the corresponding bytes. */
keir@20447 184 t = get_bits(bd, 16);
keir@20447 185 symTotal = 0;
keir@20447 186 for (i = 0; i < 16; i++) {
keir@20447 187 if (t&(1 << (15-i))) {
keir@20447 188 k = get_bits(bd, 16);
keir@20447 189 for (j = 0; j < 16; j++)
keir@20447 190 if (k&(1 << (15-j)))
keir@20447 191 symToByte[symTotal++] = (16*i)+j;
keir@20447 192 }
keir@20447 193 }
keir@20447 194 /* How many different Huffman coding groups does this block use? */
keir@20447 195 groupCount = get_bits(bd, 3);
keir@20447 196 if (groupCount < 2 || groupCount > MAX_GROUPS)
keir@20447 197 return RETVAL_DATA_ERROR;
keir@20447 198 /* nSelectors: Every GROUP_SIZE many symbols we select a new
keir@20447 199 Huffman coding group. Read in the group selector list,
keir@20447 200 which is stored as MTF encoded bit runs. (MTF = Move To
keir@20447 201 Front, as each value is used it's moved to the start of the
keir@20447 202 list.) */
keir@20447 203 nSelectors = get_bits(bd, 15);
keir@20447 204 if (!nSelectors)
keir@20447 205 return RETVAL_DATA_ERROR;
keir@20447 206 for (i = 0; i < groupCount; i++)
keir@20447 207 mtfSymbol[i] = i;
keir@20447 208 for (i = 0; i < nSelectors; i++) {
keir@20447 209 /* Get next value */
keir@20447 210 for (j = 0; get_bits(bd, 1); j++)
keir@20447 211 if (j >= groupCount)
keir@20447 212 return RETVAL_DATA_ERROR;
keir@20447 213 /* Decode MTF to get the next selector */
keir@20447 214 uc = mtfSymbol[j];
keir@20447 215 for (; j; j--)
keir@20447 216 mtfSymbol[j] = mtfSymbol[j-1];
keir@20447 217 mtfSymbol[0] = selectors[i] = uc;
keir@20447 218 }
keir@20447 219 /* Read the Huffman coding tables for each group, which code
keir@20447 220 for symTotal literal symbols, plus two run symbols (RUNA,
keir@20447 221 RUNB) */
keir@20447 222 symCount = symTotal+2;
keir@20447 223 for (j = 0; j < groupCount; j++) {
keir@20447 224 unsigned char length[MAX_SYMBOLS], temp[MAX_HUFCODE_BITS+1];
keir@20447 225 int minLen, maxLen, pp;
keir@20447 226 /* Read Huffman code lengths for each symbol. They're
keir@20447 227 stored in a way similar to mtf; record a starting
keir@20447 228 value for the first symbol, and an offset from the
keir@20447 229 previous value for everys symbol after that.
keir@20447 230 (Subtracting 1 before the loop and then adding it
keir@20447 231 back at the end is an optimization that makes the
keir@20447 232 test inside the loop simpler: symbol length 0
keir@20447 233 becomes negative, so an unsigned inequality catches
keir@20447 234 it.) */
keir@20447 235 t = get_bits(bd, 5)-1;
keir@20447 236 for (i = 0; i < symCount; i++) {
keir@20447 237 for (;;) {
keir@20447 238 if (((unsigned)t) > (MAX_HUFCODE_BITS-1))
keir@20447 239 return RETVAL_DATA_ERROR;
keir@20447 240
keir@20447 241 /* If first bit is 0, stop. Else
keir@20447 242 second bit indicates whether to
keir@20447 243 increment or decrement the value.
keir@20447 244 Optimization: grab 2 bits and unget
keir@20447 245 the second if the first was 0. */
keir@20447 246
keir@20447 247 k = get_bits(bd, 2);
keir@20447 248 if (k < 2) {
keir@20447 249 bd->inbufBitCount++;
keir@20447 250 break;
keir@20447 251 }
keir@20447 252 /* Add one if second bit 1, else
keir@20447 253 * subtract 1. Avoids if/else */
keir@20447 254 t += (((k+1)&2)-1);
keir@20447 255 }
keir@20447 256 /* Correct for the initial -1, to get the
keir@20447 257 * final symbol length */
keir@20447 258 length[i] = t+1;
keir@20447 259 }
keir@20447 260 /* Find largest and smallest lengths in this group */
keir@20447 261 minLen = maxLen = length[0];
keir@20447 262
keir@20447 263 for (i = 1; i < symCount; i++) {
keir@20447 264 if (length[i] > maxLen)
keir@20447 265 maxLen = length[i];
keir@20447 266 else if (length[i] < minLen)
keir@20447 267 minLen = length[i];
keir@20447 268 }
keir@20447 269
keir@20447 270 /* Calculate permute[], base[], and limit[] tables from
keir@20447 271 * length[].
keir@20447 272 *
keir@20447 273 * permute[] is the lookup table for converting
keir@20447 274 * Huffman coded symbols into decoded symbols. base[]
keir@20447 275 * is the amount to subtract from the value of a
keir@20447 276 * Huffman symbol of a given length when using
keir@20447 277 * permute[].
keir@20447 278 *
keir@20447 279 * limit[] indicates the largest numerical value a
keir@20447 280 * symbol with a given number of bits can have. This
keir@20447 281 * is how the Huffman codes can vary in length: each
keir@20447 282 * code with a value > limit[length] needs another
keir@20447 283 * bit.
keir@20447 284 */
keir@20447 285 hufGroup = bd->groups+j;
keir@20447 286 hufGroup->minLen = minLen;
keir@20447 287 hufGroup->maxLen = maxLen;
keir@20447 288 /* Note that minLen can't be smaller than 1, so we
keir@20447 289 adjust the base and limit array pointers so we're
keir@20447 290 not always wasting the first entry. We do this
keir@20447 291 again when using them (during symbol decoding).*/
keir@20447 292 base = hufGroup->base-1;
keir@20447 293 limit = hufGroup->limit-1;
keir@20447 294 /* Calculate permute[]. Concurently, initialize
keir@20447 295 * temp[] and limit[]. */
keir@20447 296 pp = 0;
keir@20447 297 for (i = minLen; i <= maxLen; i++) {
keir@20447 298 temp[i] = limit[i] = 0;
keir@20447 299 for (t = 0; t < symCount; t++)
keir@20447 300 if (length[t] == i)
keir@20447 301 hufGroup->permute[pp++] = t;
keir@20447 302 }
keir@20447 303 /* Count symbols coded for at each bit length */
keir@20447 304 for (i = 0; i < symCount; i++)
keir@20447 305 temp[length[i]]++;
keir@20447 306 /* Calculate limit[] (the largest symbol-coding value
keir@20447 307 *at each bit length, which is (previous limit <<
keir@20447 308 *1)+symbols at this level), and base[] (number of
keir@20447 309 *symbols to ignore at each bit length, which is limit
keir@20447 310 *minus the cumulative count of symbols coded for
keir@20447 311 *already). */
keir@20447 312 pp = t = 0;
keir@20447 313 for (i = minLen; i < maxLen; i++) {
keir@20447 314 pp += temp[i];
keir@20447 315 /* We read the largest possible symbol size
keir@20447 316 and then unget bits after determining how
keir@20447 317 many we need, and those extra bits could be
keir@20447 318 set to anything. (They're noise from
keir@20447 319 future symbols.) At each level we're
keir@20447 320 really only interested in the first few
keir@20447 321 bits, so here we set all the trailing
keir@20447 322 to-be-ignored bits to 1 so they don't
keir@20447 323 affect the value > limit[length]
keir@20447 324 comparison. */
keir@20447 325 limit[i] = (pp << (maxLen - i)) - 1;
keir@20447 326 pp <<= 1;
keir@20447 327 base[i+1] = pp-(t += temp[i]);
keir@20447 328 }
keir@20447 329 limit[maxLen+1] = INT_MAX; /* Sentinal value for
keir@20447 330 * reading next sym. */
keir@20447 331 limit[maxLen] = pp+temp[maxLen]-1;
keir@20447 332 base[minLen] = 0;
keir@20447 333 }
keir@20447 334 /* We've finished reading and digesting the block header. Now
keir@20447 335 read this block's Huffman coded symbols from the file and
keir@20447 336 undo the Huffman coding and run length encoding, saving the
keir@20447 337 result into dbuf[dbufCount++] = uc */
keir@20447 338
keir@20447 339 /* Initialize symbol occurrence counters and symbol Move To
keir@20447 340 * Front table */
keir@20447 341 for (i = 0; i < 256; i++) {
keir@20447 342 byteCount[i] = 0;
keir@20447 343 mtfSymbol[i] = (unsigned char)i;
keir@20447 344 }
keir@20447 345 /* Loop through compressed symbols. */
keir@20447 346 runPos = dbufCount = symCount = selector = 0;
keir@20447 347 for (;;) {
keir@20447 348 /* Determine which Huffman coding group to use. */
keir@20447 349 if (!(symCount--)) {
keir@20447 350 symCount = GROUP_SIZE-1;
keir@20447 351 if (selector >= nSelectors)
keir@20447 352 return RETVAL_DATA_ERROR;
keir@20447 353 hufGroup = bd->groups+selectors[selector++];
keir@20447 354 base = hufGroup->base-1;
keir@20447 355 limit = hufGroup->limit-1;
keir@20447 356 }
keir@20447 357 /* Read next Huffman-coded symbol. */
keir@20447 358 /* Note: It is far cheaper to read maxLen bits and
keir@20447 359 back up than it is to read minLen bits and then an
keir@20447 360 additional bit at a time, testing as we go.
keir@20447 361 Because there is a trailing last block (with file
keir@20447 362 CRC), there is no danger of the overread causing an
keir@20447 363 unexpected EOF for a valid compressed file. As a
keir@20447 364 further optimization, we do the read inline
keir@20447 365 (falling back to a call to get_bits if the buffer
keir@20447 366 runs dry). The following (up to got_huff_bits:) is
keir@20447 367 equivalent to j = get_bits(bd, hufGroup->maxLen);
keir@20447 368 */
keir@20447 369 while (bd->inbufBitCount < hufGroup->maxLen) {
keir@20447 370 if (bd->inbufPos == bd->inbufCount) {
keir@20447 371 j = get_bits(bd, hufGroup->maxLen);
keir@20447 372 goto got_huff_bits;
keir@20447 373 }
keir@20447 374 bd->inbufBits =
keir@20447 375 (bd->inbufBits << 8)|bd->inbuf[bd->inbufPos++];
keir@20447 376 bd->inbufBitCount += 8;
keir@20447 377 };
keir@20447 378 bd->inbufBitCount -= hufGroup->maxLen;
keir@20447 379 j = (bd->inbufBits >> bd->inbufBitCount)&
keir@20447 380 ((1 << hufGroup->maxLen)-1);
keir@20447 381 got_huff_bits:
keir@20447 382 /* Figure how how many bits are in next symbol and
keir@20447 383 * unget extras */
keir@20447 384 i = hufGroup->minLen;
keir@20447 385 while (j > limit[i])
keir@20447 386 ++i;
keir@20447 387 bd->inbufBitCount += (hufGroup->maxLen - i);
keir@20447 388 /* Huffman decode value to get nextSym (with bounds checking) */
keir@20447 389 if ((i > hufGroup->maxLen)
keir@20447 390 || (((unsigned)(j = (j>>(hufGroup->maxLen-i))-base[i]))
keir@20447 391 >= MAX_SYMBOLS))
keir@20447 392 return RETVAL_DATA_ERROR;
keir@20447 393 nextSym = hufGroup->permute[j];
keir@20447 394 /* We have now decoded the symbol, which indicates
keir@20447 395 either a new literal byte, or a repeated run of the
keir@20447 396 most recent literal byte. First, check if nextSym
keir@20447 397 indicates a repeated run, and if so loop collecting
keir@20447 398 how many times to repeat the last literal. */
keir@20447 399 if (((unsigned)nextSym) <= SYMBOL_RUNB) { /* RUNA or RUNB */
keir@20447 400 /* If this is the start of a new run, zero out
keir@20447 401 * counter */
keir@20447 402 if (!runPos) {
keir@20447 403 runPos = 1;
keir@20447 404 t = 0;
keir@20447 405 }
keir@20447 406 /* Neat trick that saves 1 symbol: instead of
keir@20447 407 or-ing 0 or 1 at each bit position, add 1
keir@20447 408 or 2 instead. For example, 1011 is 1 << 0
keir@20447 409 + 1 << 1 + 2 << 2. 1010 is 2 << 0 + 2 << 1
keir@20447 410 + 1 << 2. You can make any bit pattern
keir@20447 411 that way using 1 less symbol than the basic
keir@20447 412 or 0/1 method (except all bits 0, which
keir@20447 413 would use no symbols, but a run of length 0
keir@20447 414 doesn't mean anything in this context).
keir@20447 415 Thus space is saved. */
keir@20447 416 t += (runPos << nextSym);
keir@20447 417 /* +runPos if RUNA; +2*runPos if RUNB */
keir@20447 418
keir@20447 419 runPos <<= 1;
keir@20447 420 continue;
keir@20447 421 }
keir@20447 422 /* When we hit the first non-run symbol after a run,
keir@20447 423 we now know how many times to repeat the last
keir@20447 424 literal, so append that many copies to our buffer
keir@20447 425 of decoded symbols (dbuf) now. (The last literal
keir@20447 426 used is the one at the head of the mtfSymbol
keir@20447 427 array.) */
keir@20447 428 if (runPos) {
keir@20447 429 runPos = 0;
keir@20447 430 if (dbufCount+t >= dbufSize)
keir@20447 431 return RETVAL_DATA_ERROR;
keir@20447 432
keir@20447 433 uc = symToByte[mtfSymbol[0]];
keir@20447 434 byteCount[uc] += t;
keir@20447 435 while (t--)
keir@20447 436 dbuf[dbufCount++] = uc;
keir@20447 437 }
keir@20447 438 /* Is this the terminating symbol? */
keir@20447 439 if (nextSym > symTotal)
keir@20447 440 break;
keir@20447 441 /* At this point, nextSym indicates a new literal
keir@20447 442 character. Subtract one to get the position in the
keir@20447 443 MTF array at which this literal is currently to be
keir@20447 444 found. (Note that the result can't be -1 or 0,
keir@20447 445 because 0 and 1 are RUNA and RUNB. But another
keir@20447 446 instance of the first symbol in the mtf array,
keir@20447 447 position 0, would have been handled as part of a
keir@20447 448 run above. Therefore 1 unused mtf position minus 2
keir@20447 449 non-literal nextSym values equals -1.) */
keir@20447 450 if (dbufCount >= dbufSize)
keir@20447 451 return RETVAL_DATA_ERROR;
keir@20447 452 i = nextSym - 1;
keir@20447 453 uc = mtfSymbol[i];
keir@20447 454 /* Adjust the MTF array. Since we typically expect to
keir@20447 455 *move only a small number of symbols, and are bound
keir@20447 456 *by 256 in any case, using memmove here would
keir@20447 457 *typically be bigger and slower due to function call
keir@20447 458 *overhead and other assorted setup costs. */
keir@20447 459 do {
keir@20447 460 mtfSymbol[i] = mtfSymbol[i-1];
keir@20447 461 } while (--i);
keir@20447 462 mtfSymbol[0] = uc;
keir@20447 463 uc = symToByte[uc];
keir@20447 464 /* We have our literal byte. Save it into dbuf. */
keir@20447 465 byteCount[uc]++;
keir@20447 466 dbuf[dbufCount++] = (unsigned int)uc;
keir@20447 467 }
keir@20447 468 /* At this point, we've read all the Huffman-coded symbols
keir@20447 469 (and repeated runs) for this block from the input stream,
keir@20447 470 and decoded them into the intermediate buffer. There are
keir@20447 471 dbufCount many decoded bytes in dbuf[]. Now undo the
keir@20447 472 Burrows-Wheeler transform on dbuf. See
keir@20447 473 http://dogma.net/markn/articles/bwt/bwt.htm
keir@20447 474 */
keir@20447 475 /* Turn byteCount into cumulative occurrence counts of 0 to n-1. */
keir@20447 476 j = 0;
keir@20447 477 for (i = 0; i < 256; i++) {
keir@20447 478 k = j+byteCount[i];
keir@20447 479 byteCount[i] = j;
keir@20447 480 j = k;
keir@20447 481 }
keir@20447 482 /* Figure out what order dbuf would be in if we sorted it. */
keir@20447 483 for (i = 0; i < dbufCount; i++) {
keir@20447 484 uc = (unsigned char)(dbuf[i] & 0xff);
keir@20447 485 dbuf[byteCount[uc]] |= (i << 8);
keir@20447 486 byteCount[uc]++;
keir@20447 487 }
keir@20447 488 /* Decode first byte by hand to initialize "previous" byte.
keir@20447 489 Note that it doesn't get output, and if the first three
keir@20447 490 characters are identical it doesn't qualify as a run (hence
keir@20447 491 writeRunCountdown = 5). */
keir@20447 492 if (dbufCount) {
keir@20447 493 if (origPtr >= dbufCount)
keir@20447 494 return RETVAL_DATA_ERROR;
keir@20447 495 bd->writePos = dbuf[origPtr];
keir@20447 496 bd->writeCurrent = (unsigned char)(bd->writePos&0xff);
keir@20447 497 bd->writePos >>= 8;
keir@20447 498 bd->writeRunCountdown = 5;
keir@20447 499 }
keir@20447 500 bd->writeCount = dbufCount;
keir@20447 501
keir@20447 502 return RETVAL_OK;
keir@20447 503 }
keir@20447 504
keir@20447 505 /* Undo burrows-wheeler transform on intermediate buffer to produce output.
keir@20447 506 If start_bunzip was initialized with out_fd =-1, then up to len bytes of
keir@20447 507 data are written to outbuf. Return value is number of bytes written or
keir@20447 508 error (all errors are negative numbers). If out_fd!=-1, outbuf and len
keir@20447 509 are ignored, data is written to out_fd and return is RETVAL_OK or error.
keir@20447 510 */
keir@20447 511
keir@20447 512 static int INIT read_bunzip(struct bunzip_data *bd, unsigned char *outbuf, int len)
keir@20447 513 {
keir@20447 514 const unsigned int *dbuf;
keir@20447 515 int pos, xcurrent, previous, gotcount;
keir@20447 516
keir@20447 517 /* If last read was short due to end of file, return last block now */
keir@20447 518 if (bd->writeCount < 0)
keir@20447 519 return bd->writeCount;
keir@20447 520
keir@20447 521 gotcount = 0;
keir@20447 522 dbuf = bd->dbuf;
keir@20447 523 pos = bd->writePos;
keir@20447 524 xcurrent = bd->writeCurrent;
keir@20447 525
keir@20447 526 /* We will always have pending decoded data to write into the output
keir@20447 527 buffer unless this is the very first call (in which case we haven't
keir@20447 528 Huffman-decoded a block into the intermediate buffer yet). */
keir@20447 529
keir@20447 530 if (bd->writeCopies) {
keir@20447 531 /* Inside the loop, writeCopies means extra copies (beyond 1) */
keir@20447 532 --bd->writeCopies;
keir@20447 533 /* Loop outputting bytes */
keir@20447 534 for (;;) {
keir@20447 535 /* If the output buffer is full, snapshot
keir@20447 536 * state and return */
keir@20447 537 if (gotcount >= len) {
keir@20447 538 bd->writePos = pos;
keir@20447 539 bd->writeCurrent = xcurrent;
keir@20447 540 bd->writeCopies++;
keir@20447 541 return len;
keir@20447 542 }
keir@20447 543 /* Write next byte into output buffer, updating CRC */
keir@20447 544 outbuf[gotcount++] = xcurrent;
keir@20447 545 bd->writeCRC = (((bd->writeCRC) << 8)
keir@20447 546 ^bd->crc32Table[((bd->writeCRC) >> 24)
keir@20447 547 ^xcurrent]);
keir@20447 548 /* Loop now if we're outputting multiple
keir@20447 549 * copies of this byte */
keir@20447 550 if (bd->writeCopies) {
keir@20447 551 --bd->writeCopies;
keir@20447 552 continue;
keir@20447 553 }
keir@20447 554 decode_next_byte:
keir@20447 555 if (!bd->writeCount--)
keir@20447 556 break;
keir@20447 557 /* Follow sequence vector to undo
keir@20447 558 * Burrows-Wheeler transform */
keir@20447 559 previous = xcurrent;
keir@20447 560 pos = dbuf[pos];
keir@20447 561 xcurrent = pos&0xff;
keir@20447 562 pos >>= 8;
keir@20447 563 /* After 3 consecutive copies of the same
keir@20447 564 byte, the 4th is a repeat count. We count
keir@20447 565 down from 4 instead *of counting up because
keir@20447 566 testing for non-zero is faster */
keir@20447 567 if (--bd->writeRunCountdown) {
keir@20447 568 if (xcurrent != previous)
keir@20447 569 bd->writeRunCountdown = 4;
keir@20447 570 } else {
keir@20447 571 /* We have a repeated run, this byte
keir@20447 572 * indicates the count */
keir@20447 573 bd->writeCopies = xcurrent;
keir@20447 574 xcurrent = previous;
keir@20447 575 bd->writeRunCountdown = 5;
keir@20447 576 /* Sometimes there are just 3 bytes
keir@20447 577 * (run length 0) */
keir@20447 578 if (!bd->writeCopies)
keir@20447 579 goto decode_next_byte;
keir@20447 580 /* Subtract the 1 copy we'd output
keir@20447 581 * anyway to get extras */
keir@20447 582 --bd->writeCopies;
keir@20447 583 }
keir@20447 584 }
keir@20447 585 /* Decompression of this block completed successfully */
keir@20447 586 bd->writeCRC = ~bd->writeCRC;
keir@20447 587 bd->totalCRC = ((bd->totalCRC << 1) |
keir@20447 588 (bd->totalCRC >> 31)) ^ bd->writeCRC;
keir@20447 589 /* If this block had a CRC error, force file level CRC error. */
keir@20447 590 if (bd->writeCRC != bd->headerCRC) {
keir@20447 591 bd->totalCRC = bd->headerCRC+1;
keir@20447 592 return RETVAL_LAST_BLOCK;
keir@20447 593 }
keir@20447 594 }
keir@20447 595
keir@20447 596 /* Refill the intermediate buffer by Huffman-decoding next
keir@20447 597 * block of input */
keir@20447 598 /* (previous is just a convenient unused temp variable here) */
keir@20447 599 previous = get_next_block(bd);
keir@20447 600 if (previous) {
keir@20447 601 bd->writeCount = previous;
keir@20447 602 return (previous != RETVAL_LAST_BLOCK) ? previous : gotcount;
keir@20447 603 }
keir@20447 604 bd->writeCRC = 0xffffffffUL;
keir@20447 605 pos = bd->writePos;
keir@20447 606 xcurrent = bd->writeCurrent;
keir@20447 607 goto decode_next_byte;
keir@20447 608 }
keir@20447 609
keir@20447 610 static int INIT nofill(void *buf, unsigned int len)
keir@20447 611 {
keir@20447 612 return -1;
keir@20447 613 }
keir@20447 614
keir@20447 615 /* Allocate the structure, read file header. If in_fd ==-1, inbuf must contain
keir@20447 616 a complete bunzip file (len bytes long). If in_fd!=-1, inbuf and len are
keir@20447 617 ignored, and data is read from file handle into temporary buffer. */
keir@20447 618 static int INIT start_bunzip(struct bunzip_data **bdp, void *inbuf, int len,
keir@20447 619 int (*fill)(void*, unsigned int))
keir@20447 620 {
keir@20447 621 struct bunzip_data *bd;
keir@20447 622 unsigned int i, j, c;
keir@20447 623 const unsigned int BZh0 =
keir@20447 624 (((unsigned int)'B') << 24)+(((unsigned int)'Z') << 16)
keir@20447 625 +(((unsigned int)'h') << 8)+(unsigned int)'0';
keir@20447 626
keir@20447 627 /* Figure out how much data to allocate */
keir@20447 628 i = sizeof(struct bunzip_data);
keir@20447 629
keir@20447 630 /* Allocate bunzip_data. Most fields initialize to zero. */
keir@20447 631 bd = *bdp = malloc(i);
keir@20447 632 memset(bd, 0, sizeof(struct bunzip_data));
keir@20447 633 /* Setup input buffer */
keir@20447 634 bd->inbuf = inbuf;
keir@20447 635 bd->inbufCount = len;
keir@20447 636 if (fill != NULL)
keir@20447 637 bd->fill = fill;
keir@20447 638 else
keir@20447 639 bd->fill = nofill;
keir@20447 640
keir@20447 641 /* Init the CRC32 table (big endian) */
keir@20447 642 for (i = 0; i < 256; i++) {
keir@20447 643 c = i << 24;
keir@20447 644 for (j = 8; j; j--)
keir@20447 645 c = c&0x80000000 ? (c << 1)^0x04c11db7 : (c << 1);
keir@20447 646 bd->crc32Table[i] = c;
keir@20447 647 }
keir@20447 648
keir@20447 649 /* Ensure that file starts with "BZh['1'-'9']." */
keir@20447 650 i = get_bits(bd, 32);
keir@20447 651 if (((unsigned int)(i-BZh0-1)) >= 9)
keir@20447 652 return RETVAL_NOT_BZIP_DATA;
keir@20447 653
keir@20447 654 /* Fourth byte (ascii '1'-'9'), indicates block size in units of 100k of
keir@20447 655 uncompressed data. Allocate intermediate buffer for block. */
keir@20447 656 bd->dbufSize = 100000*(i-BZh0);
keir@20447 657
keir@20447 658 bd->dbuf = large_malloc(bd->dbufSize * sizeof(int));
keir@20447 659 return RETVAL_OK;
keir@20447 660 }
keir@20447 661
keir@20447 662 /* Example usage: decompress src_fd to dst_fd. (Stops at end of bzip2 data,
keir@20447 663 not end of file.) */
keir@20447 664 STATIC int INIT bunzip2(unsigned char *buf, unsigned int len,
keir@20447 665 int(*fill)(void*, unsigned int),
keir@20447 666 int(*flush)(void*, unsigned int),
keir@20447 667 unsigned char *outbuf,
keir@20447 668 unsigned int *pos,
keir@20447 669 void(*error_fn)(const char *x))
keir@20447 670 {
keir@20447 671 struct bunzip_data *bd;
keir@20447 672 int i = -1;
keir@20447 673 unsigned char *inbuf;
keir@20447 674
keir@20447 675 set_error_fn(error_fn);
keir@20447 676 if (flush)
keir@20447 677 outbuf = malloc(BZIP2_IOBUF_SIZE);
keir@20447 678
keir@20447 679 if (!outbuf) {
keir@20447 680 error("Could not allocate output bufer");
keir@20447 681 return -1;
keir@20447 682 }
keir@20447 683 if (buf)
keir@20447 684 inbuf = buf;
keir@20447 685 else
keir@20447 686 inbuf = malloc(BZIP2_IOBUF_SIZE);
keir@20447 687 if (!inbuf) {
keir@20447 688 error("Could not allocate input bufer");
keir@20447 689 goto exit_0;
keir@20447 690 }
keir@20447 691 i = start_bunzip(&bd, inbuf, len, fill);
keir@20447 692 if (!i) {
keir@20447 693 for (;;) {
keir@20447 694 i = read_bunzip(bd, outbuf, BZIP2_IOBUF_SIZE);
keir@20447 695 if (i <= 0)
keir@20447 696 break;
keir@20447 697 if (!flush)
keir@20447 698 outbuf += i;
keir@20447 699 else
keir@20447 700 if (i != flush(outbuf, i)) {
keir@20447 701 i = RETVAL_UNEXPECTED_OUTPUT_EOF;
keir@20447 702 break;
keir@20447 703 }
keir@20447 704 }
keir@20447 705 }
keir@20447 706 /* Check CRC and release memory */
keir@20447 707 if (i == RETVAL_LAST_BLOCK) {
keir@20447 708 if (bd->headerCRC != bd->totalCRC)
keir@20447 709 error("Data integrity error when decompressing.");
keir@20447 710 else
keir@20447 711 i = RETVAL_OK;
keir@20447 712 } else if (i == RETVAL_UNEXPECTED_OUTPUT_EOF) {
keir@20447 713 error("Compressed file ends unexpectedly");
keir@20447 714 }
keir@20447 715 if (bd->dbuf)
keir@20447 716 large_free(bd->dbuf);
keir@20447 717 if (pos)
keir@20447 718 *pos = bd->inbufPos;
keir@20447 719 free(bd);
keir@20447 720 if (!buf)
keir@20447 721 free(inbuf);
keir@20447 722 exit_0:
keir@20447 723 if (flush)
keir@20447 724 free(outbuf);
keir@20447 725 return i;
keir@20447 726 }