Digest diff compression

From: Henrik Nordstrom <hno@dont-contact.us>
Date: Sun, 05 Sep 1999 22:55:52 +0200

Hi fellow Squid hackers.

I have now completed and verified the prototype for my digest diff
compression.

Attached is a "simulator" which implements both compression and
decompression, and verifies that the decompressed output equals the
input. Program output compatible with the NLANR cd_diff.c program as
required by the task description. The program expects MIME input as
before. Remove the call to skipHttpHeader if you have data files without
a MIME header.

This code is mostly a prof of concept and not at all suitable for
inclusion in Squid. The code is about as poorly designed as the previous
one, very inefficient with both CPU and memory, and has a lot of extra
validations to verify that the operation produces correct output. The
good news is that the algorithms behind it is very easy to divide up in
small timeframes, and there is at least two different possible basic
approaches with different CPU/memory priority.

The encoding consists of three "independent" steps:

1. Calculate bit distances between changed bits, encoded as 16 bit
integers
2. Huffman encoding of these 16 bit integers.
3. A compact way to represent the histogram used to build the huffman
code.

If memory is a premium then steps 1 & 2 can be combined in two passes,
pass 1 to build the huffman code, pass 2 to produce the huffman output.

This version produces slightly better results than the previously
distributed simulator due to the more efficient coding of the histogram
information.

-------------------------------------------
%bits | My | NLANR | % |
changed | program | cd_diff | improvement |
--------+---------+---------+-------------|
 0.216% | 2.32% | 3.46% | 33.1% |
 0.330% | 3.28% | 5.22% | 37.2% |
 0.440% | 4.16% | 6.94% | 40.1% |
 0.553% | 5.04% | 8.67% | 41.8% |
 1.006% | 8.19% | 15.53% | 47.2% |
 2.020% | 14.34% | 30.10% | 52.3% |
 5.020% | 28.91% | 67.45% | 57.1% |
10.024% | 47.34% |114.11% | 58.5% |
20.950% | 74.63% |169.48% | 55.9% |
28.670% | 87.04% |186.52% | 53.3% |
-------------------------------------------

%bits changed is difference between two digests. Only first ones is
relevant for Squid, the higher values are included to show what happens
at extremes.

Detailed description of each step:

1. Bit distances encoded as 16 bit integers

The first step is to calculate the bit distance between each changed bit
and encode this with a simple encoding resulting in 16 bit integer
outputs. Each integer carries 15 bits of information and the 16'th bit
is a roll-over bit to allow encoding of arbitrary distances (0-32767 is
one word, 32768-1073741823 two words, ...). For the final implementation
it needs to be decided on word ordering (most significant word first or
least significant word first). Byte ordering dependencies within
individual words is removed by the huffman encoding if the output is
huffman encoded.

2. Huffman encoding of the 16 bit integers

Not much to say. Plain huffman encoding of 16 bit integers.

3. A compact way to represent the count histogram used to build the
huffman code.

This is perhaps the most complex part of the format, and utilizes some
"known facts" on the frequency distribution. It consists of counts for
each bit distance, and encoded as array of {bit distance size offset,
count offset} with a adaptive bit length encoding. The weights used in
the adaptive bit encoding is not fully optimized and it may be possible
to squeeze out a few bytes more here (perhaps in the range of 100bytes
more). The current selection is more a result of trial and error than
actual research.

I will work on documenting this in more detail.

For a implementation in Squid it would help if someone located (or
implemented) an efficient block based huffman encoder/decoder for 16 bit
integers with a known frequency table.

I am afraid I won't have much time to spend on this the next few weeks.
Good news it that I am now hired to do some Squid related work.

/Henrik

    [ Part 2: "Attached Text" ]

#include <fcntl.h>
#include <stdio.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h>
#include <assert.h>
#include <stdlib.h>
#include <sys/times.h>

typedef struct {
    int size;
    unsigned char *data;
} DataBuffer;

#define MIN(a,b) (a<b?a:b)
#define MAX(a,b) (a>b?a:b)
#define PERCENT(n,total) (((n)*100+50)/(total))
#define DB_BTEST(b,buf) (((buf).data[(b)/8]&(1<<((b)&7)))!=0)
#define DB_BSET(b,buf) ( (buf).data[(b)/8] |= (1<<((b)&7)))
#define DB_BXOR(b,buf) ( (buf).data[(b)/8] ^= (1<<((b)&7)))

/* Data handling function */
DataBuffer readFile(int fd)
{
    DataBuffer buf;
    int startpos = lseek(fd, 0, SEEK_CUR);
    int endpos = lseek(fd, 0, SEEK_END);
    int size = endpos - startpos;

    buf.size = size;
    buf.data = malloc(size);
    assert(buf.data);

    lseek(fd, startpos, SEEK_SET);
    if (read(fd, buf.data, size) != size) {
        perror("Unexpected read size");
        exit(1);
    }
    return buf;
}

int bufferEqual(DataBuffer *buf1, DataBuffer *buf2)
{
    if (buf1->size != buf2->size)
        return 0;
    return memcmp(buf1->data, buf2->data, buf1->size)==0;
}

/* Simple encoding of bit distances as 16 bit values. Each word
 * carries 15 significant bits of data, and the 16'th (most
 * significant) bit signals that additional word(s) follow for values
 * larger than 15 bits.
 * The data is stored with the least significant word first, with
 * byte order within each word most significant first.
 *
 * Some examples:
 * 0x1234:
 * 0x12, 0x34
 * 0x12345678:
 * 0xD6, 0x78, 0x24, 0x68
 *
 */
DataBuffer bitOffset15Encode(DataBuffer *buf1, DataBuffer *buf2)
{
    int i, j, len=0;
    unsigned char byte;
    DataBuffer out;
    int outp = 0;
    assert(buf1->size == buf2->size);
    out.size = buf1->size;
    out.data = calloc(1, out.size);
    assert(out.data);
#define OUTBYTE(b) {\
                    if (out.size <= outp) { \
                    out.size *= 2; \
                    out.data = realloc(out.data, out.size); \
                    assert(out.data); \
                } \
                out.data[outp++] = (b); \
            }
#define OUTWORD(w) {\
                    OUTBYTE(((w)>>8)&0xFF); \
                    OUTBYTE((w)&0xff); \
            }
    /* Simple encoding of bif offsets with 15 bits of significance per network
     * word. The 16th bit signals that more bits follows in the next word.
     */
    for(i=0; i< buf1->size; i++) {
        byte = buf1->data[i] ^ buf2->data[i];
        for (j=0 ; j<8; j++) {
            int bit = byte & ( 1<<j );
            if (bit) {
                do {
                    int o = len & 0x7FFF;
                    if (len > 0x7FFF)
                        o |= 0x8000;
                    OUTWORD(o);
                    len = len >> 15;
                } while (len > 0);
            } else {
                len++;
            }
        }
    }
    out.size = outp;
    out.data = realloc(out.data, out.size);
    assert(out.data);
    return out;
}

DataBuffer bitOffset15Decode(DataBuffer *orig, DataBuffer *diff)
{
    DataBuffer out;
    int i;
    int bitoffset = 0;

    /* Set up the output initially as a copy of the "old" input */
    out.size = orig->size;
    out.data = malloc(out.size);
    assert(out.data);
    memcpy(out.data, orig->data, out.size);

    /* Loop throught the diff */
    for (i=0 ; i<diff->size ; i+=2) {
        int nbits = 0;
        int morebits;
        int value = 0;
        do {
            value |= (((diff->data[i]<<8)+diff->data[i+1])&0x7fff) << nbits;
            nbits += 15;
            morebits = diff->data[i]&0x80;
        } while(morebits);
        bitoffset += value;
        assert(bitoffset < out.size*8);
        DB_BXOR(bitoffset, out);
        bitoffset++;
    }
    return out;
}

/*
 * Huffman encoding related stuff
 */

/* A simple support function for maintaining a pointer indirect heap */
void pqdownheapR(int *a, int *p, int N, int k)
{
    int j, v = p[k];
    while (k <= N/2) {
        j = k+k;
        if (j<N && a[p[j]] > a[p[j+1]]) j++;
        if (a[v] <= a[p[j]]) break;
        p[k] = p[j];
        k = j;
    }
    p[k] = v;
}

typedef struct {
    /* Code representation */
    unsigned long *bits;
    int *len;
    /* Count histogram */
    int *count;
    /* Binary tree */
    int *dad;
    int (*child)[2];
    /* Attribute cache */
    int treeSize;
} HuffmanCode;

/* Build the huffman code of a 16 bit data input */

HuffmanCode huffmanCode(int *count)
{
    static int heap[65536];
    int N=0;
    static long int code[65536];
    static int len[65536];
    static int dad[65536*2];
    static int child[65536][2];
    int i,k;
    HuffmanCode result;

    memset(dad,0,sizeof(dad));

    /* Build the initial index heap */
    for(i=0;i<65536;i++) if(count[i]) heap[++N]=i;
    for(i=N;i>0;i--) pqdownheapR(count,heap,N,i);

    /* Build a binary tree */
    while (N > 1) {
        int t = heap[1];
        heap[1] = heap[N--];
        pqdownheapR(count,heap,N,1);
        count[65536+N] = count[heap[1]]+count[t];
        dad[t] = 65536+N;
        dad[heap[1]] = -65536-N;
        child[N][0]=t;
        child[N][1]=heap[1];
        heap[1] = 65536+N;
        pqdownheapR(count,heap,N,1);
    }
    dad[65536+N]=0;

    /* Make huffman code from the heap based binary tree */
    for (k = 0; k<65536; k++) {
        if (count[k]) {
            int i = 0, t = dad[k], x=0;
            while (t) {
                x <<= 1;
                if (t<0) { x |= 1; t = -t; }
                t = dad[t]; i++;
            }
            code[k]=x; len[k]=i;
        } else {
            code[k] = len[k] = 0;
        }
    }

    result.bits = code;
    result.len = len;
    result.count = count;
    result.dad = dad;
    result.child = child;
    result.treeSize = -1;

    return result;
}

HuffmanCode huffmanCode16(DataBuffer *in)
{
    static int count[65536*2];
    int i;
    memset(count,0,sizeof(count));
    /* Build a frequency histogram of the input data */
    for(i=0; i<in->size; i+=2) {
        int value = (in->data[i]<<8)+in->data[i+1];
        count[value] += 1;
    }

    return huffmanCode(count);
}

int significantbits(long v)
{
    int n=0;
    while (v!=0) {
        n++;
        v=v>>1;
    };
    return n;
}

int significantbitssigned(long v)
{
    int lastv = 0;
    int n=0;
    if (v!=0) {
        do {
            n++;
            lastv = v;
            v=v>>1;
        } while ( v != lastv );
    }
    return n;
}

int bitencodesize(unsigned long int v,int startn)
{
    int size=0;
    int bitsize = significantbits(v);
    int n = startn;
    do {
        size += n+1;
        bitsize -= n;
        n = 2;
    } while(bitsize > 0);
    return size;
}

int bitencodesizesigned(unsigned long int v,int startn)
{
    int size=0;
    int bitsize = significantbitssigned(v);
    int n = startn;
    do {
        size += n+1;
        bitsize -= n;
        n = 2;
    } while(bitsize > 0);
    return size;
}

int huffmanTreeSize(HuffmanCode *huff)
{
    int i;
    int size = 0;

    int lasti = 0, lastcount = 0;
    int idiff, countdiff;
    int lastisize = 3, lastcountsize=9;
    int conti = 0;

    if (huff->treeSize != -1)
        return huff->treeSize;

    for (conti = 0 ; conti<65536 && huff->count[conti]; conti++);
    lasti = conti;

    size += 16; /* conti */
    size += 16; /* N codes */
    for(i=0;i<65536;i++) {
        if(huff->count[i]>0) {
            if (i >= conti) {
                idiff = i - lasti - 1;
#if 0
fprintf(stderr,"tree %d i=%d id=%d s=%d n=%d\n", size, i, idiff, significantbits(idiff), lastisize);
#endif
                size += bitencodesize(idiff, lastisize);
                lastisize = significantbits(idiff);
            }
            countdiff = lastcount - huff->count[i];
#if 0
fprintf(stderr,"tree %d i=%d cd=%d s=%d n=%d\n", size, i, countdiff, significantbitssigned(countdiff), lastcountsize);
#endif
            size += bitencodesizesigned(countdiff, lastcountsize);
            lastcount = huff->count[i];
            lastcountsize = significantbitssigned(countdiff);
            lasti = i;
        }
    }
    huff->treeSize = size;
    return size;
}

int huffmanSize(HuffmanCode *huff)
{
    int i;
    int size=0;
    for(i=0; i<65536; i++) {
        if (huff->count[i])
            size += huff->count[i]*huff->len[i];
    }
    size += huffmanTreeSize(huff);
    size = (size + 7) / 8;
    return size;
}

DataBuffer huffmanEncode16(DataBuffer *buf)
{
    int i;
    DataBuffer out;
    int bitpos = 0;
    int conti;
    int N;
    int lasti;
    int lastisize = 3, lastcountsize=9;
    int lastcount = 0;
    HuffmanCode huff = huffmanCode16(buf);
    out.size = huffmanSize(&huff);
    out.data = malloc(out.size);
    assert(out.data);
#define OUTBIT(value) { \
            assert(bitpos < out.size*8); \
        if ( value ) { \
            DB_BSET(bitpos, out); \
        } \
        bitpos++; \
    }
#define OUTBITS(bits, len) { \
        int i; \
        for(i=0; i<len; i++) { \
            int ob_value = bits & (1<<i); \
            assert(bitpos < out.size*8); \
            if ( ob_value ) { \
                DB_BSET(bitpos, out); \
            } \
            bitpos++; \
        } \
    }

    /* Output a compact represenation of the data histogram
     * used to build the huffman code
     */
    for (conti = 0 ; conti<65536 && huff.count[conti]; conti++);
    lasti = conti;
    for (N=i=0; i<65536; i++)
        if (huff.count[i]) {
#if 0
fprintf(stderr,"ci %d(%d) = %d\n", N, i, huff.count[i]);
#endif
            N++;
        }
    OUTBITS(conti, 16);
    OUTBITS(N, 16);
    for(i=0;i<65536;i++) {
        if(huff.count[i]>0) {
            if (i >= conti) {
                int v = i - lasti - 1;
                int n = lastisize;
                int bitsize = significantbits(v);
                lastisize = bitsize;
#if 0
fprintf(stderr,"out %d i=%d id=%d s=%d n=%d\n", bitpos, i, v, bitsize, n);
#endif
                do {
                    OUTBITS(v, n);
                    bitsize -= n;
                    v >>= n;
                    n = 2;
                    OUTBIT(bitsize > 0);
                } while(bitsize > 0);
            }
            {
                int v = lastcount - huff.count[i];
                int n = lastcountsize;
                int bitsize = significantbitssigned(v);
                lastcountsize = bitsize;
                lastcount = huff.count[i];
#if 0
fprintf(stderr,"out %d i=%d cd=%d s=%d n=%d\n", bitpos, i, v, bitsize, n);
#endif
                do {
                    OUTBITS(v, n);
                    bitsize -= n;
                    v >>= n;
                    n = 2;
                    OUTBIT(bitsize>0);
                } while(bitsize > 0);
            }
            lasti = i;
        }
    }
    assert(bitpos == huffmanTreeSize(&huff));
    /* Output the huffman encoded data */
    for(i=0; i<buf->size; i+=2) {
        int value = (buf->data[i]<<8)+buf->data[i+1];
        OUTBITS(huff.bits[value], huff.len[value]);
    }
    assert( (bitpos + 7) / 8 == huffmanSize(&huff) );
    return out;
}

long db_getbits(DataBuffer *buf, int pos, int len)
{
    int ret = 0;
    int i;
    for (i=0; i<len; i++, pos++) {
        ret |= DB_BTEST(pos, *buf)<<i;
    }
    return ret;
}

DataBuffer huffmanDecode16(DataBuffer *in)
{
    DataBuffer out;
    int outp;
    int lastisize = 3, lastcountsize=9;
    int conti, N;
    int bitpos = 0, pos=0;
    int i;
    int cnt = 0;
    static int count[65536*2];
    HuffmanCode huff;
    memset(count, 0, sizeof(count));
#define GETBITS(len) ( bitpos+=len, db_getbits(in, bitpos-len, len) )
#define GETBIT() ( bitpos+=1, DB_BTEST(bitpos-1, *in))

    conti = GETBITS(16);
    N = GETBITS(16);
    for (i=-1, pos=0; pos<N; pos++) {
        if (i >= conti - 1) {
            int n = lastisize;
            unsigned int v = 0;
            int nbits = 0;
            do {
                v |= GETBITS(n) << nbits;
                nbits += n;
                n = 2;
            } while(GETBIT());
            assert (v<65536);
            i += v;
            lastisize = significantbits(v);
        }
        i += 1;
        {
            int n = lastcountsize;
            int nbits = 0;
            int v = 0;
            do {
                v |= GETBITS(n) << nbits;
                nbits += n;
                n = 2;
            } while(GETBIT());
            if (nbits > 0 && v & (1<<(nbits-1)))
                v |= -1 << nbits;
            cnt -= v;
            lastcountsize = significantbitssigned(v);
        }
        assert(i<65536 && i>=0);
        count[i]=cnt;
#if 0
fprintf(stderr,"cd %d(%d) = %d\n", pos, i, cnt);
#endif
    }
    /* Find out the expected size of huffman decoded input */
    for(i=0, N=0; i<65536; i++)
        N += count[i];
    /* Dumb huffman decoder */
    huff = huffmanCode(count);
    outp = 0;
    i = 0;
    out.size = N*2;
    out.data = malloc(out.size);
    assert(out.data);
    pos = 65537;
    for(;outp<out.size;bitpos++) {
        assert(bitpos < in->size*8);
        if (DB_BTEST(bitpos, *in))
            pos=huff.child[pos-65536][1];
        else
            pos=huff.child[pos-65536][0];
        if (pos < 65536) {
            OUTWORD(pos);
            pos = 65537;
        }
    }
    assert(((bitpos+7)/8)==in->size);
    return out;
}

char *bits(unsigned long value, int len)
{
    static char res[256];
    int i,j;

    for(i=0,j=0;i<len;i++)
        res[j++]=(value&(1<<i))?'1':'0';
    res[j]='\0';
    return res;
}

void dump_huffman(HuffmanCode huffman)
{
    int i;
    for(i=0;i<65536;i++)
        if (huffman.len[i])
            fprintf(stderr,"%2x: %4d %d %s\n", i, huffman.count[i], huffman.len[i], bits(huffman.bits[i],huffman.len[i]));
}

void dump_huffman_stats(HuffmanCode huffman)
{
    int i,j;
    int codes[64],count[64];
    memset(codes,0,sizeof codes);
    memset(count,0,sizeof count);
    for(i=0;i<65536;i++) {
        j = huffman.len[i];
        codes[j]++;
        count[j]+=huffman.count[i];
    }
    for(i=0;i<64;i++) {
        if (count[i] > 0)
            fprintf(stderr,"%2d: %3d %d\n", i, codes[i], count[i]);
    }
}

void diff(int fd1, int fd2)
{
    DataBuffer buf1, buf2, bitencoded;
    DataBuffer encoded, decoded, result;

    buf1 = readFile(fd1);
    buf2 = readFile(fd2);
    assert(buf1.size == buf2.size);

    bitencoded = bitOffset15Encode(&buf1, &buf2);
    encoded = huffmanEncode16(&bitencoded);
    decoded = huffmanDecode16(&encoded);
    assert(bufferEqual(&bitencoded, &decoded));
    result = bitOffset15Decode(&buf1, &decoded);
    assert(bufferEqual(&buf2, &result));
    printf("sizes: old: %8d, new: %8d, diff: %6d (%6.2f%% : x%6.2f)\n",
            buf1.size, buf2.size, encoded.size, 100.*encoded.size/buf1.size,
            buf1.size/(double)encoded.size);
}

static int Open(const char *fname)
{
        int fd = open(fname, O_RDONLY);
        if (fd < 0) {
                perror(fname);
                exit(errno);
        }
        return fd;
}

void skipHttpHeader(int fd)
{
    char c, n;
    n = 0;
    while(n < 2 && read(fd, &c, 1) > 0) {
        switch(c) {
        case '\r':
            break;
        case '\n':
            n += 1;
            break;
        default:
            n = 0;
        }
    }
}

#if TEST_HUFFMAN
int main(int argc, char *argv[]) {
    int count[65536];
    unsigned char *encoded;
    unsigned short *input;
    unsigned short *output;
    int i;
    int insize = strlen(argv[1]), outsize;
    HuffmanCode huffman;

    memset(count,0,sizeof(count));
    for(i=0; i<strlen(argv[1]); i++)
        count[(unsigned char)argv[1][i]]++;

    huffman = huffmanCode(count);
    dump_huffman(huffman);
    dump_huffman_stats(huffman);
    for(outsize=i=0; i<insize; i++)
        outsize += huffman.len[(unsigned char)argv[1][i]];
    output = malloc((outsize+7)/8);
    assert(output);
    printf("Input %d bits, output %d bits\n", insize*8, outsize);
    return 0;
}
    
#else
int main(int argc, char *argv[]) {
        int fd1, fd2;
        if (argc != 3) {
                fprintf(stderr, "usage: %s <digest_old> <digest_new>\n", argv[0]);
                return -1;
        }

        fd1 = Open(argv[1]);
        skipHttpHeader(fd1);
        fd2 = Open(argv[2]);
        skipHttpHeader(fd2);

        diff(fd1, fd2);

        close(fd1);
        close(fd2);

        return 0;
}
#endif
Received on Tue Jul 29 2003 - 13:16:00 MDT

This archive was generated by hypermail pre-2.1.9 : Tue Dec 09 2003 - 16:12:17 MST