micropython/lib/uzlib/lz77.c


lib/uzlib: Add memory-efficient, streaming LZ77 compression support.

The compression algorithm implemented in this commit uses much less memory
than the standard implementation, which uses a hash table and a large
look-back window. In particular, the algorithm here doesn't allocate a hash
table to store indices into the history of previously seen text. Instead it
simply does a brute-force search of the history text to find a match for the
compressor. This is slower (linear search vs hash-table lookup), but with a
small enough history (eg 512 bytes) it's not that slow, and a small history
does not impact the compression too much.

To give some more concrete numbers comparing memory use between the
approaches:

- Standard approach: in-place compression; all text to compress must be in
  RAM (or at least memory addressable), plus an additional 16k bytes of RAM
  for hash-table pointers pointing into the text.

- The approach in this commit: streaming compression; only a limited amount
  of previous text must be in RAM (user selectable, defaults to 512 bytes).

To compress, say, 1k of data, the standard approach requires all of that data
to be in RAM, plus an additional 16k of RAM for the hash-table pointers. With
this commit, only the 1k of data needs to be in RAM. And if the data is
streamed from a file (or elsewhere), as little as 256 bytes of RAM for the
sliding history still gives very decent compression.

In summary: because compression takes such a large amount of RAM in the
standard algorithm, making it unsuitable for microcontrollers, the approach
taken in this commit is to minimise RAM usage as much as possible while still
having acceptable performance (speed and compression ratio).

Signed-off-by: Damien George <damien@micropython.org>

2023-01-18 04:46:23 +00:00
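Below is a minimal usage sketch of this compressor's streaming API. The
uzlib_lz77_init and uzlib_lz77_compress calls come from this file;
uzlib_start_block, uzlib_finish_block and the dest_write_cb / dest_write_data
output hook are assumed from the companion defl_static.c and uzlib.h, and the
write_byte sink and compress_stream wrapper are hypothetical names used only
for illustration.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>
#include "uzlib.h"

// Hypothetical output sink: receives each compressed byte as it is produced.
static void write_byte(void *data, uint8_t byte) {
    fputc(byte, (FILE *)data);
}

// Compress buf[0..len) to a file using a 512-byte sliding history.
void compress_stream(FILE *out, const uint8_t *buf, size_t len) {
    static uint8_t hist[512]; // history window; size must be a power of 2
    uzlib_lz77_state_t state;
    uzlib_lz77_init(&state, hist, sizeof(hist));
    state.dest_write_data = out;            // forwarded to write_byte
    state.dest_write_cb = write_byte;
    uzlib_start_block(&state);              // assumed: open the DEFLATE block
    uzlib_lz77_compress(&state, buf, len);  // may be called once per chunk
    uzlib_finish_block(&state);             // assumed: flush and close the block
}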
/*
* Simple LZ77 streaming compressor.
*
* The scheme implemented here doesn't use a hash table and instead does a brute
* force search in the history for a previous string. It is relatively slow
* (but still O(N)) but gives good compression and minimal memory usage. For a
* small history window (eg 256 bytes) it's not too slow and compresses well.
*
* MIT license; Copyright (c) 2021 Damien P. George
*/
#include "uzlib.h"
#include "defl_static.c"
#define MATCH_LEN_MIN (3)
#define MATCH_LEN_MAX (258)
// hist should be a preallocated buffer of hist_max size bytes.
// hist_max should be greater than 0 and a power of 2 (ie 1, 2, 4, 8, ...).
// It's possible to pass in hist=NULL, and then the history window will be taken from the
// src passed in to uzlib_lz77_compress (this is useful when not doing streaming compression).
void uzlib_lz77_init(uzlib_lz77_state_t *state, uint8_t *hist, size_t hist_max) {
memset(state, 0, sizeof(uzlib_lz77_state_t));
state->hist_buf = hist;
state->hist_max = hist_max;
state->hist_start = 0;
state->hist_len = 0;
}
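/*
 * Illustrative sketch: hist_max must be a power of 2 so the circular history
 * buffer can wrap with a cheap bit-mask instead of a modulo. The indexing
 * pattern used throughout this file is
 *
 *     (state->hist_start + i) & (state->hist_max - 1)
 *
 * and, for example, with hist_max = 512 the mask 0x1FF keeps exactly the low
 * 9 bits of the sum, giving the same result as "% 512" without a division.
 */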
// Search back in the history for the maximum match of the given src data,
// with support for searching beyond the end of the history and into the src buffer
// (effectively the history and src buffer are concatenated).
static size_t uzlib_lz77_search_max_match(uzlib_lz77_state_t *state, const uint8_t *src, size_t len, size_t *longest_offset) {
size_t longest_len = 0;
for (size_t hist_search = 0; hist_search < state->hist_len; ++hist_search) {
// Search for a match.
size_t match_len;
for (match_len = 0; match_len <= MATCH_LEN_MAX && match_len < len; ++match_len) {
uint8_t hist;
if (hist_search + match_len < state->hist_len) {
hist = state->hist_buf[(state->hist_start + hist_search + match_len) & (state->hist_max - 1)];
} else {
hist = src[hist_search + match_len - state->hist_len];
}
if (src[match_len] != hist) {
break;
}
}
// Take this match if its length is at least the minimum, and larger than previous matches.
// If the length is the same as the previous longest then take this match as well, because
// this match will be closer (more recent in the history) and takes fewer bits to encode.
if (match_len >= MATCH_LEN_MIN && match_len >= longest_len) {
longest_len = match_len;
*longest_offset = state->hist_len - hist_search;
}
}
return longest_len;
}
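/*
 * Worked example (sketch): because the search runs past the end of the
 * history into src, overlapping matches fall out naturally. If the history
 * ends with "ab" and src is "ababab", the candidate starting at that "a"
 * matches all 6 bytes of src: the first two comparisons read history bytes,
 * and the remaining four read back from src itself. The result is a match
 * with offset 2 and length 6, the classic LZ77 way to encode a run that is
 * longer than its source.
 */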
// Compress the given chunk of data.
void uzlib_lz77_compress(uzlib_lz77_state_t *state, const uint8_t *src, unsigned len) {
const uint8_t *top = src + len;
while (src < top) {
// Look for a match in the history window.
size_t match_offset = 0;
size_t match_len = uzlib_lz77_search_max_match(state, src, top - src, &match_offset);
// Encode the literal byte or the match.
if (match_len == 0) {
uzlib_literal(state, *src);
match_len = 1;
} else {
uzlib_match(state, match_offset, match_len);
}
// Push the bytes into the history buffer.
size_t mask = state->hist_max - 1;
while (match_len--) {
uint8_t b = *src++;
state->hist_buf[(state->hist_start + state->hist_len) & mask] = b;
if (state->hist_len == state->hist_max) {
state->hist_start = (state->hist_start + 1) & mask;
} else {
++state->hist_len;
}
}
}
}
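/*
 * Sketch of the history push above, with hist_max = 4 (mask = 3): writing
 * "abcde" stores a,b,c,d at indices 0..3 while hist_len grows to 4; the
 * fifth byte 'e' then wraps to index 0 via the mask and hist_start advances
 * to 1, so the logical window becomes "bcde" and the oldest byte 'a' is
 * dropped.
 */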