From e9a110bb1ef2a0133bc686b7c38ace21c8973bc9 Mon Sep 17 00:00:00 2001 From: NinjaCheetah <58050615+NinjaCheetah@users.noreply.github.com> Date: Thu, 23 Jan 2025 21:27:15 -0500 Subject: [PATCH] LZ77 compression is now fully functional! (But still very slow) --- src/libWiiPy/archive/lz77.py | 51 ++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/src/libWiiPy/archive/lz77.py b/src/libWiiPy/archive/lz77.py index 70161e8..0e168b3 100644 --- a/src/libWiiPy/archive/lz77.py +++ b/src/libWiiPy/archive/lz77.py @@ -7,10 +7,10 @@ import io from dataclasses import dataclass as _dataclass -LZ_MIN_DISTANCE = 0x01 # Minimum distance for each reference. -LZ_MAX_DISTANCE = 0x1000 # Maximum distance for each reference. -LZ_MIN_LENGTH = 0x03 # Minimum length for each reference. -LZ_MAX_LENGTH = 0x12 # Maximum length for each reference. +_LZ_MIN_DISTANCE = 0x01 # Minimum distance for each reference. +_LZ_MAX_DISTANCE = 0x1000 # Maximum distance for each reference. +_LZ_MIN_LENGTH = 0x03 # Minimum length for each reference. +_LZ_MAX_LENGTH = 0x12 # Maximum length for each reference. @_dataclass @@ -23,10 +23,8 @@ class _LZNode: def _compress_compare_bytes(byte1: bytes, offset1: int, byte2: bytes, offset2: int, abs_len_max: int) -> int: # Compare bytes up to the maximum length we can match. num_matched = 0 - mem1 = memoryview(byte1) - mem2 = memoryview(byte2) while num_matched < abs_len_max: - if mem1[offset1 + num_matched] != mem2[offset2 + num_matched]: + if byte1[offset1 + num_matched] != byte2[offset2 + num_matched]: break num_matched += 1 return num_matched @@ -34,18 +32,18 @@ def _compress_compare_bytes(byte1: bytes, offset1: int, byte2: bytes, offset2: i def _compress_search_matches(buffer: bytes, pos: int) -> (int, int): bytes_left = len(buffer) - pos - global LZ_MAX_DISTANCE, LZ_MAX_LENGTH, LZ_MIN_DISTANCE + global _LZ_MAX_DISTANCE, _LZ_MAX_LENGTH, _LZ_MIN_DISTANCE # Default to only looking back 4096 bytes, unless we've moved fewer than 4096 bytes, in which case we should # only look as far back as we've gone. - max_dist = min(LZ_MAX_DISTANCE, pos) + max_dist = min(_LZ_MAX_DISTANCE, pos) # Default to only matching up to 18 bytes, unless fewer than 18 bytes remain, in which case we can only match # up to that many bytes. - max_len = min(LZ_MAX_LENGTH, bytes_left) + max_len = min(_LZ_MAX_LENGTH, bytes_left) # Log the longest match we found and its offset. biggest_match = 0 biggest_match_pos = 0 # Search for matches. - for i in range(LZ_MIN_DISTANCE, max_dist + 1): + for i in range(_LZ_MIN_DISTANCE, max_dist + 1): num_matched = _compress_compare_bytes(buffer, pos - i, buffer, pos, max_len) if num_matched > biggest_match: biggest_match = num_matched @@ -56,11 +54,11 @@ def _compress_search_matches(buffer: bytes, pos: int) -> (int, int): def _compress_node_is_ref(node: _LZNode) -> bool: - return node.len >= LZ_MIN_LENGTH + return node.len >= _LZ_MIN_LENGTH def _compress_get_node_cost(length: int) -> int: - if length >= LZ_MAX_LENGTH: + if length >= _LZ_MIN_LENGTH: num_bytes = 2 else: num_bytes = 1 @@ -81,31 +79,33 @@ def compress_lz77(data: bytes) -> bytes: nodes = [_LZNode() for _ in range(len(data))] # Iterate over the uncompressed data, starting from the end. pos = len(data) - global LZ_MAX_LENGTH, LZ_MIN_LENGTH, LZ_MIN_DISTANCE + global _LZ_MAX_LENGTH, _LZ_MIN_LENGTH, _LZ_MIN_DISTANCE + iters = 0 while pos: + iters += 1 pos -= 1 node = nodes[pos] # Limit the maximum search length when we're near the end of the file. - max_search_len = LZ_MAX_LENGTH + max_search_len = _LZ_MAX_LENGTH if max_search_len > (len(data) - pos): max_search_len = len(data) - pos - if max_search_len < LZ_MIN_DISTANCE: + if max_search_len < _LZ_MIN_DISTANCE: max_search_len = 1 # Initialize as 1 for each, since that's all we could use if we weren't compressing. length, dist = 1, 1 - if max_search_len >= LZ_MIN_LENGTH: + if max_search_len >= _LZ_MIN_LENGTH: length, dist = _compress_search_matches(data, pos) # Treat as direct bytes if it's too short to copy. - if length == 0 or length < LZ_MIN_LENGTH: + if length == 0 or length < _LZ_MIN_LENGTH: length = 1 # If the node goes to the end of the file, the weight is the cost of the node. - if pos + length == len(data): + if (pos + length) == len(data): node.len = length node.dist = dist node.weight = _compress_get_node_cost(length) # Otherwise, search for possible matches and determine the one with the best cost. else: - weight_best = 0xFFFFFFFF # This was originally UINT_MAX, but that isn't a thing here. + weight_best = 0xFFFFFFFF # This was originally UINT_MAX, but that isn't a thing here so 32-bit it is! len_best = 1 while length: weight_next = nodes[pos + length].weight @@ -114,13 +114,11 @@ def compress_lz77(data: bytes) -> bytes: len_best = length weight_best = weight length -= 1 - if length != 0 and length < LZ_MIN_LENGTH: + if length != 0 and length < _LZ_MIN_LENGTH: length = 1 node.len = len_best node.dist = dist node.weight = weight_best - # Maximum size of the compressed file. - max_compressed_size = int(4 + len(data) + (len(data) + 7) / 8) # Write the header data. with io.BytesIO() as buffer: # Write the header data. @@ -131,6 +129,7 @@ def compress_lz77(data: bytes) -> bytes: while src_pos < len(data): head = 0 head_pos = buffer.tell() + buffer.write(b'\x00') # Reserve a byte for the chunk head. i = 0 while i < 8 and src_pos < len(data): @@ -139,14 +138,14 @@ def compress_lz77(data: bytes) -> bytes: dist = current_node.dist # This is a reference node. if _compress_node_is_ref(current_node): - encoded = ((dist - LZ_MIN_DISTANCE) | ((length - LZ_MAX_LENGTH) << 12)) & 0xFFFF # A uint16_t. - buffer.write(((encoded >> 8) & 0xFF).to_bytes(1)) - buffer.write(((encoded >> 0) & 0xFF).to_bytes(1)) + encoded = (((length - _LZ_MIN_LENGTH) & 0xF) << 12) | ((dist - _LZ_MIN_DISTANCE) & 0xFFF) + buffer.write(encoded.to_bytes(2)) head = (head | (1 << (7 - i))) & 0xFF # This is a direct copy node. else: buffer.write(data[src_pos:src_pos + 1]) src_pos += length + i += 1 pos = buffer.tell() buffer.seek(head_pos)