LZIP/smallz4cat.c

   1 // //////////////////////////////////////////////////////////
   2 // smallz4cat.c
   3 // Copyright (c) 2016-2019 Stephan Brumme. All rights reserved.
   4 // see https://create.stephan-brumme.com/smallz4/
   5 //
   6 // "MIT License":
   7 // Permission is hereby granted, free of charge, to any person obtaining a copy
   8 // of this software and associated documentation files (the "Software"),
   9 // to deal in the Software without restriction, including without limitation
  10 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11 // and/or sell copies of the Software, and to permit persons to whom the Software
  12 // is furnished to do so, subject to the following conditions:
  13 //
  14 // The above copyright notice and this permission notice shall be included
  15 // in all copies or substantial portions of the Software.
  16 //
  17 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
  18 // INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
  19 // PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  20 // HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  21 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  22 // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23
// This program is a shorter, more readable, albeit slower re-implementation of lz4cat ( https://github.com/lz4/lz4 )
  25
  26 // compile: gcc smallz4cat.c -O3 -o smallz4cat -Wall -pedantic -std=c99 -s
  27 // The static 8k binary was compiled using Clang and dietlibc (see https://www.fefe.de/dietlibc/ )
  28
  29 // Limitations:
// - skippable frames are not implemented (and most likely never will be); legacy frames (signature 0x184C2102) are accepted
  31 // - checksums are not verified (see https://create.stephan-brumme.com/xxhash/ for a simple implementation)
  32
// Replace getByteFromIn() and sendBytesToOut() by your own code if you need in-memory LZ4 decompression.
  34 // Corrupted data causes a call to unlz4error().
  35
  36 // suppress warnings when compiled by Visual C++
  37 #define _CRT_SECURE_NO_WARNINGS
  38
  39 #include <stdio.h>  // stdin/stdout/stderr, fopen, ...
  40 #include <stdlib.h> // exit()
  41 #include <string.h> // memcpy
  42
  43 #ifndef FALSE
  44 #define FALSE 0
  45 #define TRUE  1
  46 #endif
  47
  48 /// error handler
  49 static void unlz4error(const char* msg)
  50 {
  51   // smaller static binary than fprintf(stderr, "ERROR: %s\n", msg);
  52   fputs("ERROR: ", stderr);
  53   fputs(msg,       stderr);
  54   fputc('\n',      stderr);
  55   exit(1);
  56 }
  57
  58
  59 // ==================== I/O INTERFACE ====================
  60
  61
  62 // read one byte from input, see getByteFromIn()  for a basic implementation
  63 typedef unsigned char (*GET_BYTE)  (void* userPtr);
  64 // write several bytes,      see sendBytesToOut() for a basic implementation
  65 typedef void          (*SEND_BYTES)(const unsigned char*, unsigned int, void* userPtr);
  66
  67 struct UserPtr
  68 {
  69   // file handles
  70   FILE* in;
  71   FILE* out;
  72   // modify input buffer size as you like ... for most use cases, bigger buffer aren't faster anymore - and even reducing to 1 byte works !
  73 #define READ_BUFFER_SIZE 4*1024
  74   unsigned char readBuffer[READ_BUFFER_SIZE];
  75   unsigned int  pos;
  76   unsigned int  available;
  77 };
  78
  79 /// read a single byte (with simple buffering)
  80 static unsigned char getByteFromIn(void* userPtr) // parameter "userPtr" not needed
  81 {
  82   /// cast user-specific data
  83   struct UserPtr* user = (struct UserPtr*)userPtr;
  84
  85   // refill buffer
  86   if (user->pos == user->available)
  87   {
  88     user->pos = 0;
  89     user->available = fread(user->readBuffer, 1, READ_BUFFER_SIZE, user->in);
  90     if (user->available == 0)
  91       unlz4error("out of data");
  92   }
  93
  94   // return a byte
  95   return user->readBuffer[user->pos++];
  96 }
  97
  98 /// write a block of bytes
  99 static void sendBytesToOut(const unsigned char* data, unsigned int numBytes, void* userPtr)
 100 {
 101   /// cast user-specific data
 102   struct UserPtr* user = (struct UserPtr*)userPtr;
 103   if (data != NULL && numBytes > 0)
 104     fwrite(data, 1, numBytes, user->out);
 105 }
 106
 107
 108 // ==================== LZ4 DECOMPRESSOR ====================
 109
 110
 111 /// decompress everything in input stream (accessed via getByte) and write to output stream (via sendBytes)
 112 void unlz4_userPtr(GET_BYTE getByte, SEND_BYTES sendBytes, const char* dictionary, void* userPtr)
 113 {
 114   // signature
 115   unsigned char signature1 = getByte(userPtr);
 116   unsigned char signature2 = getByte(userPtr);
 117   unsigned char signature3 = getByte(userPtr);
 118   unsigned char signature4 = getByte(userPtr);
 119   unsigned int  signature  = (signature4 << 24) | (signature3 << 16) | (signature2 << 8) | signature1;
 120   unsigned char isModern   = (signature == 0x184D2204);
 121   unsigned char isLegacy   = (signature == 0x184C2102);
 122   if (!isModern && !isLegacy)
 123     unlz4error("invalid signature");
 124
 125   unsigned char hasBlockChecksum   = FALSE;
 126   unsigned char hasContentSize     = FALSE;
 127   unsigned char hasContentChecksum = FALSE;
 128   unsigned char hasDictionaryID    = FALSE;
 129   if (isModern)
 130   {
 131     // flags
 132     unsigned char flags = getByte(userPtr);
 133     hasBlockChecksum   = flags & 16;
 134     hasContentSize     = flags &  8;
 135     hasContentChecksum = flags &  4;
 136     hasDictionaryID    = flags &  1;
 137
 138     // only version 1 file format
 139     unsigned char version = flags >> 6;
 140     if (version != 1)
 141       unlz4error("only LZ4 file format version 1 supported");
 142
 143     // ignore blocksize
 144     char numIgnore = 1;
 145
 146     // ignore, skip 8 bytes
 147     if (hasContentSize)
 148       numIgnore += 8;
 149     // ignore, skip 4 bytes
 150     if (hasDictionaryID)
 151       numIgnore += 4;
 152
 153     // ignore header checksum (xxhash32 of everything up this point & 0xFF)
 154     numIgnore++;
 155
 156     // skip all those ignored bytes
 157     while (numIgnore--)
 158       getByte(userPtr);
 159   }
 160
 161   // don't lower this value, backreferences can be 64kb far away
 162 #define HISTORY_SIZE 64*1024
 163   // contains the latest decoded data
 164   unsigned char history[HISTORY_SIZE];
 165   // next free position in history[]
 166   unsigned int  pos = 0;
 167
 168   // dictionary compression is a recently introduced feature, just move its contents to the buffer
 169   if (dictionary != NULL)
 170   {
 171     // open dictionary
 172     FILE* dict = fopen(dictionary, "rb");
 173     if (!dict)
 174       unlz4error("cannot open dictionary");
 175
 176     // get dictionary's filesize
 177     fseek(dict, 0, SEEK_END);
 178     long dictSize = ftell(dict);
 179     // only the last 64k are relevant
 180     long relevant = dictSize < 65536 ? 0 : dictSize - 65536;
 181     fseek(dict, relevant, SEEK_SET);
 182     if (dictSize > 65536)
 183       dictSize = 65536;
 184     // read it and store it at the end of the buffer
 185     fread(history + HISTORY_SIZE - dictSize, 1, dictSize, dict);
 186     fclose(dict);
 187   }
 188
 189   // parse all blocks until blockSize == 0
 190   while (1)
 191   {
 192     // block size
 193     unsigned int blockSize = getByte(userPtr);
 194     blockSize |= (unsigned int)getByte(userPtr) <<  8;
 195     blockSize |= (unsigned int)getByte(userPtr) << 16;
 196     blockSize |= (unsigned int)getByte(userPtr) << 24;
 197
 198     // highest bit set ?
 199     unsigned char isCompressed = isLegacy || (blockSize & 0x80000000) == 0;
 200     if (isModern)
 201       blockSize &= 0x7FFFFFFF;
 202
 203     // stop after last block
 204     if (blockSize == 0)
 205       break;
 206
 207     if (isCompressed)
 208     {
 209       // decompress block
 210       unsigned int blockOffset = 0;
 211       unsigned int numWritten  = 0;
 212       while (blockOffset < blockSize)
 213       {
 214         // get a token
 215         unsigned char token = getByte(userPtr);
 216         blockOffset++;
 217
 218         // determine number of literals
 219         unsigned int numLiterals = token >> 4;
 220         if (numLiterals == 15)
 221         {
 222           // number of literals length encoded in more than 1 byte
 223           unsigned char current;
 224           do
 225           {
 226             current = getByte(userPtr);
 227             numLiterals += current;
 228             blockOffset++;
 229           } while (current == 255);
 230         }
 231
 232         blockOffset += numLiterals;
 233
 234         // copy all those literals
 235         if (pos + numLiterals < HISTORY_SIZE)
 236         {
 237           // fast loop
 238           while (numLiterals-- > 0)
 239             history[pos++] = getByte(userPtr);
 240         }
 241         else
 242         {
 243           // slow loop
 244           while (numLiterals-- > 0)
 245           {
 246             history[pos++] = getByte(userPtr);
 247
 248             // flush output buffer
 249             if (pos == HISTORY_SIZE)
 250             {
 251               sendBytes(history, HISTORY_SIZE, userPtr);
 252               numWritten += HISTORY_SIZE;
 253               pos = 0;
 254             }
 255           }
 256         }
 257
 258         // last token has only literals
 259         if (blockOffset == blockSize)
 260           break;
 261
 262         // match distance is encoded in two bytes (little endian)
 263         unsigned int delta = getByte(userPtr);
 264         delta |= (unsigned int)getByte(userPtr) << 8;
 265         // zero isn't allowed
 266         if (delta == 0)
 267           unlz4error("invalid offset");
 268         blockOffset += 2;
 269
 270         // match length (always >= 4, therefore length is stored minus 4)
 271         unsigned int matchLength = 4 + (token & 0x0F);
 272         if (matchLength == 4 + 0x0F)
 273         {
 274           unsigned char current;
 275           do // match length encoded in more than 1 byte
 276           {
 277             current = getByte(userPtr);
 278             matchLength += current;
 279             blockOffset++;
 280           } while (current == 255);
 281         }
 282
 283         // copy match
 284         unsigned int referencePos = (pos >= delta) ? (pos - delta) : (HISTORY_SIZE + pos - delta);
 285         // start and end within the current 64k block ?
 286         if (pos + matchLength < HISTORY_SIZE && referencePos + matchLength < HISTORY_SIZE)
 287         {
 288           // read/write continuous block (no wrap-around at the end of history[])
 289           // fast copy
 290           if (pos >= referencePos + matchLength || referencePos >= pos + matchLength)
 291           {
 292             // non-overlapping
 293             memcpy(history + pos, history + referencePos, matchLength);
 294             pos += matchLength;
 295           }
 296           else
 297           {
 298             // overlapping, slower byte-wise copy
 299             while (matchLength-- > 0)
 300               history[pos++] = history[referencePos++];
 301           }
 302         }
 303         else
 304         {
 305           // either read or write wraps around at the end of history[]
 306           while (matchLength-- > 0)
 307           {
 308             // copy single byte
 309             history[pos++] = history[referencePos++];
 310
 311             // cannot write anymore ? => wrap around
 312             if (pos == HISTORY_SIZE)
 313             {
 314               // flush output buffer
 315               sendBytes(history, HISTORY_SIZE, userPtr);
 316               numWritten += HISTORY_SIZE;
 317               pos = 0;
 318             }
 319             // wrap-around of read location
 320             referencePos %= HISTORY_SIZE;
 321           }
 322         }
 323       }
 324
 325       // all legacy blocks must be completely filled - except for the last one
 326       if (isLegacy && numWritten + pos < 8*1024*1024)
 327         break;
 328     }
 329     else
 330     {
 331       // copy uncompressed data and add to history, too (if next block is compressed and some matches refer to this block)
 332       while (blockSize-- > 0)
 333       {
 334         // copy a byte ...
 335         history[pos++] = getByte(userPtr);
 336         // ... until buffer is full => send to output
 337         if (pos == HISTORY_SIZE)
 338         {
 339           sendBytes(history, HISTORY_SIZE, userPtr);
 340           pos = 0;
 341         }
 342       }
 343     }
 344
 345     if (hasBlockChecksum)
 346     {
 347       // ignore checksum, skip 4 bytes
 348       getByte(userPtr); getByte(userPtr); getByte(userPtr); getByte(userPtr);
 349     }
 350   }
 351
 352   if (hasContentChecksum)
 353   {
 354     // ignore checksum, skip 4 bytes
 355     getByte(userPtr); getByte(userPtr); getByte(userPtr); getByte(userPtr);
 356   }
 357
 358   // flush output buffer
 359   sendBytes(history, pos, userPtr);
 360 }
 361
 362 /// old interface where getByte and sendBytes use global file handles
 363 void unlz4(GET_BYTE getByte, SEND_BYTES sendBytes, const char* dictionary)
 364 {
 365   unlz4_userPtr(getByte, sendBytes, dictionary, NULL);
 366 }
 367
 368
 369 // ==================== COMMAND-LINE HANDLING ====================
 370
 371
 372 /// parse command-line
 373 int main(int argc, const char* argv[])
 374 {
 375   // default input/output streams
 376   struct UserPtr user =
 377   {
 378     .in        = stdin,
 379     .out       = stdout,
 380     .pos       = 0, // initial input buffer is empty
 381     .available = 0
 382   };
 383
 384   const char* dictionary = NULL;
 385
 386   // first command-line parameter is our input filename / but ignore "-" which stands for STDIN
 387   int parameter;
 388   for (parameter = 1; parameter < argc; parameter++)
 389   {
 390     const char* current = argv[parameter];
 391     // dictionary
 392     if (current[0] == '-' && current[1] == 'D')
 393     {
 394       if (parameter + 1 >= argc)
 395         unlz4error("no dictionary filename found");
 396       dictionary = argv[++parameter];
 397       continue;
 398     }
 399
 400     // filename
 401     // read from STDIN, default behavior
 402     if (current[0] != '-' && current[1] != '\0')
 403     {
 404       // already have a filename - at most one filename is allowed (except for dictionary) ?
 405       if (user.in != stdin)
 406         unlz4error("can only decompress one file at a time");
 407       // get handle
 408       user.in = fopen(argv[1], "rb");
 409       if (!user.in)
 410         unlz4error("file not found");
 411     }
 412   }
 413
 414   // and go !
 415   unlz4_userPtr(getByteFromIn, sendBytesToOut, dictionary, &user);
 416   return 0;
 417 }