From 42fd523843e22e93607a7d6a300dc78d5b42d036 Mon Sep 17 00:00:00 2001 From: NinjaCheetah <58050615+NinjaCheetah@users.noreply.github.com> Date: Fri, 4 Apr 2025 13:33:33 -0400 Subject: [PATCH] Added LZ77 compression, added corresponding CLI command --- src/archive/lz77.rs | 151 ++++++++++++++++++++++++++++++++- src/bin/rustii/archive/lz77.rs | 16 +++- 2 files changed, 163 insertions(+), 4 deletions(-) diff --git a/src/archive/lz77.rs b/src/archive/lz77.rs index 88fb8e0..5acd727 100644 --- a/src/archive/lz77.rs +++ b/src/archive/lz77.rs @@ -3,8 +3,9 @@ // // Implements the compression and decompression routines used for the Wii's LZ77 compression scheme. -use std::io::{Cursor, Read, Seek, SeekFrom}; -use byteorder::{BigEndian, LittleEndian, ReadBytesExt}; +use std::cmp::min; +use std::io::{Cursor, Read, Write, Seek, SeekFrom}; +use byteorder::{BigEndian, LittleEndian, ReadBytesExt, WriteBytesExt}; use thiserror::Error; #[derive(Debug, Error)] @@ -15,6 +16,152 @@ pub enum LZ77Error { IO(#[from] std::io::Error), } +const LZ_MIN_DISTANCE: usize = 0x01; // Minimum distance for each reference. +const LZ_MAX_DISTANCE: usize = 0x1000; // Maximum distance for each reference. +const LZ_MIN_LENGTH: usize = 0x03; // Minimum length for each reference. +const LZ_MAX_LENGTH: usize = 0x12; // Maximum length for each reference. + +#[derive(Clone)] +struct LZNode { + dist: usize, + len: usize, + weight: usize, +} + +fn compress_compare_bytes(buf: &[u8], offset1: usize, offset2: usize, abs_len_max: usize) -> usize { + // Compare bytes up to the maximum length we can match. Start by comparing the first 3 bytes, + // since that's the minimum match length and this allows for a more optimized early exit. + let mut num_matched: usize = 0; + while num_matched < abs_len_max { + if buf[offset1 + num_matched] != buf[offset2 + num_matched] { + break + } + num_matched += 1 + } + num_matched +} + +fn compress_search_matches(buf: &[u8], pos: usize) -> (usize, usize) { + let bytes_left = buf.len() - pos; + // Default to only looking back 4096 bytes, unless we've moved fewer than 4096 bytes, in which + // case we should only look as far back as we've gone. + let max_dist = min(LZ_MAX_DISTANCE, pos); + // Default to only matching up to 18 bytes, unless fewer than 18 bytes remain, in which case + // we can only match up to that many bytes. + let max_len = min(LZ_MAX_LENGTH, bytes_left); + // Log the longest match we found and its offset. + let (mut biggest_match, mut biggest_match_pos) = (0, 0); + // Search for matches. + for i in LZ_MIN_DISTANCE..(max_dist + 1) { + let num_matched = compress_compare_bytes(buf, pos - i, pos, max_len); + if num_matched > biggest_match { + biggest_match = num_matched; + biggest_match_pos = i; + if biggest_match == max_len { + break; + } + } + } + (biggest_match, biggest_match_pos) +} + +fn compress_node_is_ref(node: LZNode) -> bool { + node.len >= LZ_MIN_LENGTH +} + +fn compress_get_node_cost(length: usize) -> usize { + let num_bytes = if length >= LZ_MIN_LENGTH { + 2 + } else { + 1 + }; + 1 + (num_bytes * 8) +} + +/// Compresses data using the Wii's LZ77 compression algorithm and returns the compressed result. +pub fn compress_lz77(data: &[u8]) -> Result, LZ77Error> { + // Optimized compressor based around a node graph that finds optimal string matches. + let mut nodes = vec![LZNode { dist: 0, len: 0, weight: 0 }; data.len()]; + // Iterate over the uncompressed data, starting from the end. + let mut pos = data.len(); + while pos > 0 { + pos -= 1; + // Limit the maximum search length when we're near the end of the file. + let mut max_search_len = min(LZ_MAX_LENGTH, data.len() - pos); + if max_search_len < LZ_MIN_DISTANCE { + max_search_len = 1; + } + // Initialize as 1 for each, since that's all we could use if we weren't compressing. + let (mut length, mut dist) = (1, 1); + if max_search_len >= LZ_MIN_LENGTH { + (length, dist) = compress_search_matches(data, pos); + } + // Treat as direct bytes if it's too short to copy. + if length == 0 || length < LZ_MIN_LENGTH { + length = 1; + } + // If the node goes to the end of the file, the weight is the cost of the node. + if (pos + length) == data.len() { + nodes[pos].len = length; + nodes[pos].dist = dist; + nodes[pos].weight = compress_get_node_cost(length); + } + // Otherwise, search for possible matches and determine the one with the best cost. + else { + let mut weight_best = u32::MAX as usize; + let mut len_best = 1; + while length > 0 { + let weight_next = nodes[pos + length].weight; + let weight = compress_get_node_cost(length) + weight_next; + if weight < weight_best { + len_best = length; + weight_best = weight; + } + length -= 1; + if length != 0 && length < LZ_MIN_LENGTH { + length = 1; + } + } + nodes[pos].len = len_best; + nodes[pos].dist = dist; + nodes[pos].weight = weight_best; + } + } + // Write out compressed data now that we've done our calculations. + let mut buf = Cursor::new(Vec::new()); + buf.write_all(b"LZ77\x10")?; + buf.write_u24::(data.len() as u32)?; + let mut src_pos = 0; + while src_pos < data.len() { + let mut flag = 0; + let flag_pos = buf.position(); + buf.write_u8(b'\x00')?; // Reserve a byte for the chunk head. + let mut i = 0; + while i < 8 && src_pos < data.len() { + let current_node = nodes[src_pos].clone(); + let length = current_node.len; + let dist = current_node.dist; + // This is a reference node. + if compress_node_is_ref(current_node) { + let encoded = ((((length - LZ_MIN_LENGTH) & 0xF) << 12) | ((dist - LZ_MIN_DISTANCE) & 0xFFF)) as u16; + buf.write_u16::(encoded)?; + flag |= 1 << (7 - i); + } + // This is a direct copy node. + else { + buf.write_all(&data[src_pos..src_pos + 1])?; + } + src_pos += length; + i += 1 + } + pos = buf.position() as usize; + buf.seek(SeekFrom::Start(flag_pos))?; + buf.write_u8(flag)?; + buf.seek(SeekFrom::Start(pos as u64))?; + } + Ok(buf.into_inner()) +} + /// Decompresses LZ77-compressed data and returns the decompressed result. pub fn decompress_lz77(data: &[u8]) -> Result, LZ77Error> { let mut buf = Cursor::new(data); diff --git a/src/bin/rustii/archive/lz77.rs b/src/bin/rustii/archive/lz77.rs index 796dd5a..02536d7 100644 --- a/src/bin/rustii/archive/lz77.rs +++ b/src/bin/rustii/archive/lz77.rs @@ -30,8 +30,20 @@ pub enum Commands { } } -pub fn compress_lz77(_input: &str, _output: &Option) -> Result<()> { - bail!("compression is not yet implemented"); +pub fn compress_lz77(input: &str, output: &Option) -> Result<()> { + let in_path = Path::new(input); + if !in_path.exists() { + bail!("Input file \"{}\" could not be found.", in_path.display()); + } + let decompressed = fs::read(in_path)?; + let compressed = lz77::compress_lz77(&decompressed).with_context(|| "An unknown error occurred while compressing the data.")?; + let out_path = if output.is_some() { + PathBuf::from(output.clone().unwrap()) + } else { + PathBuf::from(in_path).with_extension("lz77") + }; + fs::write(out_path, compressed)?; + Ok(()) } pub fn decompress_lz77(input: &str, output: &Option) -> Result<()> {