micropython/lib/uzlib/lz77.c

92 wiersze
3.5 KiB
C

/*
* Simple LZ77 streaming compressor.
*
* The scheme implemented here doesn't use a hash table and instead does a brute
* force search in the history for a previous string. It is relatively slow
* (but still O(N)) but gives good compression and minimal memory usage. For a
* small history window (eg 256 bytes) it's not too slow and compresses well.
*
* MIT license; Copyright (c) 2021 Damien P. George
*/
#include "uzlib.h"
#include "defl_static.c"
#define MATCH_LEN_MIN (3)
#define MATCH_LEN_MAX (258)
// hist should be a preallocated buffer of hist_max size bytes.
// hist_max should be greater than 0 a power of 2 (ie 1, 2, 4, 8, ...).
// It's possible to pass in hist=NULL, and then the history window will be taken from the
// src passed in to uzlib_lz77_compress (this is useful when not doing streaming compression).
void uzlib_lz77_init(uzlib_lz77_state_t *state, uint8_t *hist, size_t hist_max) {
memset(state, 0, sizeof(uzlib_lz77_state_t));
state->hist_buf = hist;
state->hist_max = hist_max;
state->hist_start = 0;
state->hist_len = 0;
}
// Search back in the history for the maximum match of the given src data,
// with support for searching beyond the end of the history and into the src buffer
// (effectively the history and src buffer are concatenated).
static size_t uzlib_lz77_search_max_match(uzlib_lz77_state_t *state, const uint8_t *src, size_t len, size_t *longest_offset) {
size_t longest_len = 0;
for (size_t hist_search = 0; hist_search < state->hist_len; ++hist_search) {
// Search for a match.
size_t match_len;
for (match_len = 0; match_len <= MATCH_LEN_MAX && match_len < len; ++match_len) {
uint8_t hist;
if (hist_search + match_len < state->hist_len) {
hist = state->hist_buf[(state->hist_start + hist_search + match_len) & (state->hist_max - 1)];
} else {
hist = src[hist_search + match_len - state->hist_len];
}
if (src[match_len] != hist) {
break;
}
}
// Take this match if its length is at least the minimum, and larger than previous matches.
// If the length is the same as the previous longest then take this match as well, because
// this match will be closer (more recent in the history) and take less bits to encode.
if (match_len >= MATCH_LEN_MIN && match_len >= longest_len) {
longest_len = match_len;
*longest_offset = state->hist_len - hist_search;
}
}
return longest_len;
}
// Compress the given chunk of data.
void uzlib_lz77_compress(uzlib_lz77_state_t *state, const uint8_t *src, unsigned len) {
const uint8_t *top = src + len;
while (src < top) {
// Look for a match in the history window.
size_t match_offset = 0;
size_t match_len = uzlib_lz77_search_max_match(state, src, top - src, &match_offset);
// Encode the literal byte or the match.
if (match_len == 0) {
uzlib_literal(state, *src);
match_len = 1;
} else {
uzlib_match(state, match_offset, match_len);
}
// Push the bytes into the history buffer.
size_t mask = state->hist_max - 1;
while (match_len--) {
uint8_t b = *src++;
state->hist_buf[(state->hist_start + state->hist_len) & mask] = b;
if (state->hist_len == state->hist_max) {
state->hist_start = (state->hist_start + 1) & mask;
} else {
++state->hist_len;
}
}
}
}