Implement payload compression

2021-05-12 16:47:06 -06:00 · 2021-05-12 16:47:06 -06:00 · 918b00ce07
commit 918b00ce07
parent a7712b173c
20 changed files with 2305 additions and 28 deletions
--- a/tools/lz/lz.c
+++ b/tools/lz/lz.c
@ -0,0 +1,546 @@
+//
+// Name:        lz.c
+// Author:      Marcus Geelnard
+// Description: LZ77 coder/decoder implementation.
+// Reentrant:   Yes
+// $ATH_LICENSE_NULL$
+//
+// The LZ77 compression scheme is a substitutional compression scheme
+// proposed by Abraham Lempel and Jakob Ziv in 1977. It is very simple in
+// its design, and uses no fancy bit level compression.
+//
+// This is my first attempt at an implementation of a LZ77 code/decoder.
+//
+// The principle of the LZ77 compression algorithm is to store repeated
+// occurrences of strings as references to previous occurrences of the same
+// string. The point is that the reference consumes less space than the
+// string itself, provided that the string is long enough (in this
+// implementation, the string has to be at least 4 bytes long, since the
+// minimum coded reference is 3 bytes long). Also note that the term
+// "string" refers to any kind of byte sequence (it does not have to be
+// an ASCII string, for instance).
+//
+// The coder uses a brute force approach to finding string matches in the
+// history buffer (or "sliding window", if you wish), which is very, very
+// slow. I recon the complexity is somewhere between O(n^2) and O(n^3),
+// depending on the input data.
+//
+// There is also a faster implementation that uses a large working buffer
+// in which a "jump table" is stored, which is used to quickly find
+// possible string matches (see the source code for LZ_CompressFast() for
+// more information). The faster method is an order of magnitude faster,
+// but still quite slow compared to other compression methods.
+//
+// The upside is that decompression is very fast, and the compression ratio
+// is often very good.
+//
+// The reference to a string is coded as a (length,offset) pair, where the
+// length indicates the length of the string, and the offset gives the
+// offset from the current data position. To distinguish between string
+// references and literal strings (uncompressed bytes), a string reference
+// is preceded by a marker byte, which is chosen as the least common byte
+// symbol in the input data stream (this marker byte is stored in the
+// output stream as the first byte).
+//
+// Occurrences of the marker byte in the stream are encoded as the marker
+// byte followed by a zero byte, which means that occurrences of the marker
+// byte have to be coded with two bytes.
+//
+// The lengths and offsets are coded in a variable length fashion, allowing
+// values of any magnitude (up to 4294967295 in this implementation).
+//
+// With this compression scheme, the worst case compression result is
+// (257/256)*insize + 1.
+//
+//------------------------------------------------------------------------
+// Copyright (c) 2003-2006 Marcus Geelnard
+//
+// This software is provided 'as-is', without any express or implied
+// warranty. In no event will the authors be held liable for any damages
+// arising from the use of this software.
+//
+// Permission is granted to anyone to use this software for any purpose,
+// including commercial applications, and to alter it and redistribute it
+// freely, subject to the following restrictions:
+//
+// 1. The origin of this software must not be misrepresented; you must not
+//    claim that you wrote the original software. If you use this software
+//    in a product, an acknowledgment in the product documentation would
+//    be appreciated but is not required.
+//
+// 2. Altered source versions must be plainly marked as such, and must not
+//    be misrepresented as being the original software.
+//
+// 3. This notice may not be removed or altered from any source
+//    distribution.
+//
+// Marcus Geelnard
+// marcus.geelnard at home.se
+//
+
+//
+// This file has been altered from the original version.
+//
+
+/*************************************************************************
+* Constants used for LZ77 coding
+*************************************************************************/
+
+/* Maximum offset (can be any size < 2^31). Lower values give faster
+   compression, while higher values gives better compression. The default
+   value of 100000 is quite high. Experiment to see what works best for
+   you. */
+#define LZ_MAX_OFFSET 100000
+
+
+
+/*************************************************************************
+*                           INTERNAL FUNCTIONS                           *
+*************************************************************************/
+
+
+/*************************************************************************
+* _LZ_StringCompare() - Return maximum length string match.
+*************************************************************************/
+
+static unsigned int _LZ_StringCompare( unsigned char * str1,
+  unsigned char * str2, unsigned int minlen, unsigned int maxlen )
+{
+    unsigned int len;
+
+    for( len = minlen; (len < maxlen) && (str1[len] == str2[len]); ++ len );
+
+    return len;
+}
+
+
+/*************************************************************************
+* _LZ_WriteVarSize() - Write unsigned integer with variable number of
+* bytes depending on value.
+*************************************************************************/
+
+static int _LZ_WriteVarSize( unsigned int x, unsigned char * buf )
+{
+    unsigned int y;
+    int num_bytes, i, b;
+
+    /* Determine number of bytes needed to store the number x */
+    y = x >> 3;
+    for( num_bytes = 5; num_bytes >= 2; -- num_bytes )
+    {
+        if( y & 0xfe000000 ) break;
+        y <<= 7;
+    }
+
+    /* Write all bytes, seven bits in each, with 8:th bit set for all */
+    /* but the last byte. */
+    for( i = num_bytes-1; i >= 0; -- i )
+    {
+        b = (x >> (i*7)) & 0x0000007f;
+        if( i > 0 )
+        {
+            b |= 0x00000080;
+        }
+        *buf ++ = (unsigned char) b;
+    }
+
+    /* Return number of bytes written */
+    return num_bytes;
+}
+
+
+/*************************************************************************
+* _LZ_ReadVarSize() - Read unsigned integer with variable number of
+* bytes depending on value.
+*************************************************************************/
+
+static int _LZ_ReadVarSize( unsigned int * x, unsigned char * buf )
+{
+    unsigned int y, b, num_bytes;
+
+    /* Read complete value (stop when byte contains zero in 8:th bit) */
+    y = 0;
+    num_bytes = 0;
+    do
+    {
+        b = (unsigned int) (*buf ++);
+        y = (y << 7) | (b & 0x0000007f);
+        ++ num_bytes;
+    }
+    while( b & 0x00000080 );
+
+    /* Store value in x */
+    *x = y;
+
+    /* Return number of bytes read */
+    return num_bytes;
+}
+
+
+
+/*************************************************************************
+*                            PUBLIC FUNCTIONS                            *
+*************************************************************************/
+
+
+/*************************************************************************
+* LZ_Compress() - Compress a block of data using an LZ77 coder.
+*  in     - Input (uncompressed) buffer.
+*  out    - Output (compressed) buffer. This buffer must be 0.4% larger
+*           than the input buffer, plus one byte.
+*  insize - Number of input bytes.
+* The function returns the size of the compressed data.
+*************************************************************************/
+
+int LZ_Compress( unsigned char *in, unsigned char *out,
+    unsigned int insize )
+{
+    unsigned char marker, symbol;
+    unsigned int  inpos, outpos, bytesleft, i;
+    unsigned int  maxoffset, offset, bestoffset;
+    unsigned int  maxlength, length, bestlength;
+    unsigned int  histogram[ 256 ];
+    unsigned char *ptr1, *ptr2;
+
+    /* Do we have anything to compress? */
+    if( insize < 1 )
+    {
+        return 0;
+    }
+
+    /* Create histogram */
+    for( i = 0; i < 256; ++ i )
+    {
+        histogram[ i ] = 0;
+    }
+    for( i = 0; i < insize; ++ i )
+    {
+        ++ histogram[ in[ i ] ];
+    }
+
+    /* Find the least common byte, and use it as the marker symbol */
+    marker = 0;
+    for( i = 1; i < 256; ++ i )
+    {
+        if( histogram[ i ] < histogram[ marker ] )
+        {
+            marker = i;
+        }
+    }
+
+    /* Remember the marker symbol for the decoder */
+    out[ 0 ] = marker;
+
+    /* Start of compression */
+    inpos = 0;
+    outpos = 1;
+
+    /* Main compression loop */
+    bytesleft = insize;
+    do
+    {
+        /* Determine most distant position */
+        if( inpos > LZ_MAX_OFFSET ) maxoffset = LZ_MAX_OFFSET;
+        else                        maxoffset = inpos;
+
+        /* Get pointer to current position */
+        ptr1 = &in[ inpos ];
+
+        /* Search history window for maximum length string match */
+        bestlength = 3;
+        bestoffset = 0;
+        for( offset = 3; offset <= maxoffset; ++ offset )
+        {
+            /* Get pointer to candidate string */
+            ptr2 = &ptr1[ -(int)offset ];
+
+            /* Quickly determine if this is a candidate (for speed) */
+            if( (ptr1[ 0 ] == ptr2[ 0 ]) &&
+                (ptr1[ bestlength ] == ptr2[ bestlength ]) )
+            {
+                /* Determine maximum length for this offset */
+                maxlength = (bytesleft < offset ? bytesleft : offset);
+
+                /* Count maximum length match at this offset */
+                length = _LZ_StringCompare( ptr1, ptr2, 0, maxlength );
+
+                /* Better match than any previous match? */
+                if( length > bestlength )
+                {
+                    bestlength = length;
+                    bestoffset = offset;
+                }
+            }
+        }
+
+        /* Was there a good enough match? */
+        if( (bestlength >= 8) ||
+            ((bestlength == 4) && (bestoffset <= 0x0000007f)) ||
+            ((bestlength == 5) && (bestoffset <= 0x00003fff)) ||
+            ((bestlength == 6) && (bestoffset <= 0x001fffff)) ||
+            ((bestlength == 7) && (bestoffset <= 0x0fffffff)) )
+        {
+            out[ outpos ++ ] = (unsigned char) marker;
+            outpos += _LZ_WriteVarSize( bestlength, &out[ outpos ] );
+            outpos += _LZ_WriteVarSize( bestoffset, &out[ outpos ] );
+            inpos += bestlength;
+            bytesleft -= bestlength;
+        }
+        else
+        {
+            /* Output single byte (or two bytes if marker byte) */
+            symbol = in[ inpos ++ ];
+            out[ outpos ++ ] = symbol;
+            if( symbol == marker )
+            {
+                out[ outpos ++ ] = 0;
+            }
+            -- bytesleft;
+        }
+    }
+    while( bytesleft > 3 );
+
+    /* Dump remaining bytes, if any */
+    while( inpos < insize )
+    {
+        if( in[ inpos ] == marker )
+        {
+            out[ outpos ++ ] = marker;
+            out[ outpos ++ ] = 0;
+        }
+        else
+        {
+            out[ outpos ++ ] = in[ inpos ];
+        }
+        ++ inpos;
+    }
+
+    return outpos;
+}
+
+
+/*************************************************************************
+* LZ_CompressFast() - Compress a block of data using an LZ77 coder.
+*  in     - Input (uncompressed) buffer.
+*  out    - Output (compressed) buffer. This buffer must be 0.4% larger
+*           than the input buffer, plus one byte.
+*  insize - Number of input bytes.
+*  work   - Pointer to a temporary buffer (internal working buffer), which
+*           must be able to hold (insize+65536) unsigned integers.
+* The function returns the size of the compressed data.
+*************************************************************************/
+
+int LZ_CompressFast( unsigned char *in, unsigned char *out,
+    unsigned int insize, unsigned int *work )
+{
+    unsigned char marker, symbol;
+    unsigned int  inpos, outpos, bytesleft, i, index, symbols;
+    unsigned int  offset, bestoffset;
+    unsigned int  maxlength, length, bestlength;
+    unsigned int  histogram[ 256 ], *lastindex, *jumptable;
+    unsigned char *ptr1, *ptr2;
+
+    /* Do we have anything to compress? */
+    if( insize < 1 )
+    {
+        return 0;
+    }
+
+    /* Assign arrays to the working area */
+    lastindex = work;
+    jumptable = &work[ 65536 ];
+
+    /* Build a "jump table". Here is how the jump table works:
+       jumptable[i] points to the nearest previous occurrence of the same
+       symbol pair as in[i]:in[i+1], so in[i] == in[jumptable[i]] and
+       in[i+1] == in[jumptable[i]+1], and so on... Following the jump table
+       gives a dramatic boost for the string search'n'match loop compared
+       to doing a brute force search. The jump table is built in O(n) time,
+       so it is a cheap operation in terms of time, but it is expensice in
+       terms of memory consumption. */
+    for( i = 0; i < 65536; ++ i )
+    {
+        lastindex[ i ] = 0xffffffff;
+    }
+    for( i = 0; i < insize-1; ++ i )
+    {
+        symbols = (((unsigned int)in[i]) << 8) | ((unsigned int)in[i+1]);
+        index = lastindex[ symbols ];
+        lastindex[ symbols ] = i;
+        jumptable[ i ] = index;
+    }
+    jumptable[ insize-1 ] = 0xffffffff;
+
+    /* Create histogram */
+    for( i = 0; i < 256; ++ i )
+    {
+        histogram[ i ] = 0;
+    }
+    for( i = 0; i < insize; ++ i )
+    {
+        ++ histogram[ in[ i ] ];
+    }
+
+    /* Find the least common byte, and use it as the marker symbol */
+    marker = 0;
+    for( i = 1; i < 256; ++ i )
+    {
+        if( histogram[ i ] < histogram[ marker ] )
+        {
+            marker = i;
+        }
+    }
+
+    /* Remember the marker symbol for the decoder */
+    out[ 0 ] = marker;
+
+    /* Start of compression */
+    inpos = 0;
+    outpos = 1;
+
+    /* Main compression loop */
+    bytesleft = insize;
+    do
+    {
+        /* Get pointer to current position */
+        ptr1 = &in[ inpos ];
+
+        /* Search history window for maximum length string match */
+        bestlength = 3;
+        bestoffset = 0;
+        index = jumptable[ inpos ];
+        while( (index != 0xffffffff) && ((inpos - index) < LZ_MAX_OFFSET) )
+        {
+            /* Get pointer to candidate string */
+            ptr2 = &in[ index ];
+
+            /* Quickly determine if this is a candidate (for speed) */
+            if( ptr2[ bestlength ] == ptr1[ bestlength ] )
+            {
+                /* Determine maximum length for this offset */
+                offset = inpos - index;
+                maxlength = (bytesleft < offset ? bytesleft : offset);
+
+                /* Count maximum length match at this offset */
+                length = _LZ_StringCompare( ptr1, ptr2, 2, maxlength );
+
+                /* Better match than any previous match? */
+                if( length > bestlength )
+                {
+                    bestlength = length;
+                    bestoffset = offset;
+                }
+            }
+
+            /* Get next possible index from jump table */
+            index = jumptable[ index ];
+        }
+
+        /* Was there a good enough match? */
+        if( (bestlength >= 8) ||
+            ((bestlength == 4) && (bestoffset <= 0x0000007f)) ||
+            ((bestlength == 5) && (bestoffset <= 0x00003fff)) ||
+            ((bestlength == 6) && (bestoffset <= 0x001fffff)) ||
+            ((bestlength == 7) && (bestoffset <= 0x0fffffff)) )
+        {
+            out[ outpos ++ ] = (unsigned char) marker;
+            outpos += _LZ_WriteVarSize( bestlength, &out[ outpos ] );
+            outpos += _LZ_WriteVarSize( bestoffset, &out[ outpos ] );
+            inpos += bestlength;
+            bytesleft -= bestlength;
+        }
+        else
+        {
+            /* Output single byte (or two bytes if marker byte) */
+            symbol = in[ inpos ++ ];
+            out[ outpos ++ ] = symbol;
+            if( symbol == marker )
+            {
+                out[ outpos ++ ] = 0;
+            }
+            -- bytesleft;
+        }
+    }
+    while( bytesleft > 3 );
+
+    /* Dump remaining bytes, if any */
+    while( inpos < insize )
+    {
+        if( in[ inpos ] == marker )
+        {
+            out[ outpos ++ ] = marker;
+            out[ outpos ++ ] = 0;
+        }
+        else
+        {
+            out[ outpos ++ ] = in[ inpos ];
+        }
+        ++ inpos;
+    }
+
+    return outpos;
+}
+
+
+/*************************************************************************
+* LZ_Uncompress() - Uncompress a block of data using an LZ77 decoder.
+*  in      - Input (compressed) buffer.
+*  out     - Output (uncompressed) buffer. This buffer must be large
+*            enough to hold the uncompressed data.
+*  insize  - Number of input bytes.
+*************************************************************************/
+
+int LZ_Uncompress( unsigned char *in, unsigned char *out,
+    unsigned int insize )
+{
+    unsigned char marker, symbol;
+    unsigned int  i, inpos, outpos, length, offset;
+
+    /* Do we have anything to uncompress? */
+    if( insize < 1 )
+    {
+        return 0;
+    }
+
+    /* Get marker symbol from input stream */
+    marker = in[ 0 ];
+    inpos = 1;
+
+    /* Main decompression loop */
+    outpos = 0;
+    do
+    {
+        symbol = in[ inpos ++ ];
+        if( symbol == marker )
+        {
+            /* We had a marker byte */
+            if( in[ inpos ] == 0 )
+            {
+                /* It was a single occurrence of the marker byte */
+                out[ outpos ++ ] = marker;
+                ++ inpos;
+            }
+            else
+            {
+                /* Extract true length and offset */
+                inpos += _LZ_ReadVarSize( &length, &in[ inpos ] );
+                inpos += _LZ_ReadVarSize( &offset, &in[ inpos ] );
+
+                /* Copy corresponding data from history window */
+                for( i = 0; i < length; ++ i )
+                {
+                    out[ outpos ] = out[ outpos - offset ];
+                    ++ outpos;
+                }
+            }
+        }
+        else
+        {
+            /* No marker, plain copy */
+            out[ outpos ++ ] = symbol;
+        }
+    }
+    while( inpos < insize );
+
+    return outpos;
+}