From 152a13fbe25b2e43d34b722a55fca6911e745480 Mon Sep 17 00:00:00 2001 From: NinjaCheetah <58050615+NinjaCheetah@users.noreply.github.com> Date: Wed, 7 Aug 2024 22:46:28 -0400 Subject: [PATCH] Rewrote most of U8 module, now extracts and packs all archives as expected --- src/libWiiPy/archive/u8.py | 162 ++++++++++++++++--------------------- 1 file changed, 70 insertions(+), 92 deletions(-) diff --git a/src/libWiiPy/archive/u8.py b/src/libWiiPy/archive/u8.py index 8b8541e..c178593 100644 --- a/src/libWiiPy/archive/u8.py +++ b/src/libWiiPy/archive/u8.py @@ -8,7 +8,7 @@ import os import pathlib from dataclasses import dataclass as _dataclass from typing import List -from ..shared import _align_value +from ..shared import _align_value, _pad_bytes @_dataclass @@ -47,7 +47,10 @@ class U8Archive: self.u8_node_list: List[_U8Node] = [] # All the nodes in the header of a U8 file. self.file_name_list: List[str] = [] self.file_data_list: List[bytes] = [] - self.u8_file_structure = dict + self.root_node_offset: int = 0 + self.header_size: int = 0 + self.data_offset: int = 0 + self.root_node: _U8Node = _U8Node(0, 0, 0, 0) def load(self, u8_data: bytes) -> None: """ @@ -64,26 +67,25 @@ class U8Archive: self.u8_magic = u8_data.read(4) if self.u8_magic != b'\x55\xAA\x38\x2D': raise TypeError("This is not a valid U8 archive!") - # The following code is all skipped because these values really don't matter for extraction. They honestly - # really only matter to my code when they get calculated and used for packing. - # Offset of the root node, which will always be 0x20. - # root_node_offset = int(binascii.hexlify(u8_data.read(4)), 16) + self.root_node_offset = int.from_bytes(u8_data.read(4)) # The size of the U8 header. - # header_size = int(binascii.hexlify(u8_data.read(4)), 16) + self.header_size = int.from_bytes(u8_data.read(4)) # The offset of the data, which is root_node_offset + header_size, aligned to 0x10. - # data_offset = int(binascii.hexlify(u8_data.read(4)), 16) - - # Seek ahead to the size defined in the root node, because it's the total number of nodes in the file. The - # rest of the data in the root node (not that it really matters) will get read when we read the whole list. - u8_data.seek(u8_data.tell() + 36) + self.data_offset = int.from_bytes(u8_data.read(4)) + # Seek past 16 bytes of padding, then load the root node. + u8_data.seek(u8_data.tell() + 16) + root_node_type = int.from_bytes(u8_data.read(1)) + root_node_name_offset = int.from_bytes(u8_data.read(3)) + root_node_data_offset = int.from_bytes(u8_data.read(4)) root_node_size = int.from_bytes(u8_data.read(4)) + self.root_node = _U8Node(root_node_type, root_node_name_offset, root_node_data_offset, root_node_size) # Seek back before the root node so that it gets read with all the rest. u8_data.seek(u8_data.tell() - 12) # Iterate over the number of nodes that the root node lists. for node in range(root_node_size): - node_type = int.from_bytes(u8_data.read(2)) - node_name_offset = int.from_bytes(u8_data.read(2)) + node_type = int.from_bytes(u8_data.read(1)) + node_name_offset = int.from_bytes(u8_data.read(3)) node_data_offset = int.from_bytes(u8_data.read(4)) node_size = int.from_bytes(u8_data.read(4)) self.u8_node_list.append(_U8Node(node_type, node_name_offset, node_data_offset, node_size)) @@ -120,15 +122,19 @@ class U8Archive: # Add the number of bytes used for each file/folder name in the string table. for file_name in self.file_name_list: header_size += len(file_name) + 1 - # The initial data offset is equal to the file header (32 bytes) + node data aligned to 16 bytes. - data_offset = _align_value(header_size + 32, 16) + # The initial data offset is equal to the file header (32 bytes) + node data aligned to 64 bytes. + data_offset = _align_value(header_size + 32, 64) # Adjust all nodes to place file data in the same order as the nodes. Why isn't it already like this? current_data_offset = data_offset + current_name_offset = 0 for node in range(len(self.u8_node_list)): if self.u8_node_list[node].type == 0: - self.u8_node_list[node].data_offset = current_data_offset - current_data_offset += self.u8_node_list[node].size - # Begin joining all the U8 archive data into one variable. + self.u8_node_list[node].data_offset = _align_value(current_data_offset, 32) + current_data_offset += _align_value(self.u8_node_list[node].size, 32) + # Calculate the name offsets, including the extra 1 for the NULL byte at the end of each name. + self.u8_node_list[node].name_offset = current_name_offset + current_name_offset += len(self.file_name_list[node]) + 1 + # Begin joining all the U8 archive data into bytes. u8_data = b'' # Magic number. u8_data += b'\x55\xAA\x38\x2D' @@ -142,19 +148,18 @@ class U8Archive: u8_data += (b'\x00' * 16) # Iterate over all the U8 nodes and dump them. for node in self.u8_node_list: - u8_data += int.to_bytes(node.type, 2) - u8_data += int.to_bytes(node.name_offset, 2) + u8_data += int.to_bytes(node.type, 1) + u8_data += int.to_bytes(node.name_offset, 3) u8_data += int.to_bytes(node.data_offset, 4) u8_data += int.to_bytes(node.size, 4) # Iterate over all file names and dump them. All file names are suffixed by a \x00 byte. for file_name in self.file_name_list: u8_data += str.encode(file_name) + b'\x00' # Apply the extra padding we calculated earlier by padding to where the data offset begins. - while len(u8_data) < data_offset: - u8_data += b'\x00' + u8_data = _pad_bytes(u8_data, 64) # Iterate all file data and dump it. for file in self.file_data_list: - u8_data += file + u8_data += _pad_bytes(file, 32) # Return the U8 archive. return u8_data @@ -185,69 +190,58 @@ def extract_u8(u8_data, output_folder) -> None: u8_archive.load(u8_data) # This variable stores the path of the directory we're currently processing. current_dir = output_folder - # This variable stores the final nodes for every directory we've entered, and is used to handle the recursion of - # those directories to ensure that everything gets where it belongs. - directory_recursion = [0] - # Iterate over every node and extract the files and folders. + # This variable stores the order of directory nodes leading to the current working directory, to make sure that + # things get where they belong. + parent_dirs = [0] for node in range(len(u8_archive.u8_node_list)): - # Code for a directory node. Second check just ensures we ignore the root node. - if u8_archive.u8_node_list[node].type == 256 and u8_archive.u8_node_list[node].name_offset != 0: - # The size value for a directory node is the position of the last node in this directory, with the root node - # counting as node 1. - # If the current node is below the end of the current directory, create this directory inside the previous - # current directory and make the current. - if node + 1 < directory_recursion[-1]: + # Code for a directory node (excluding the root node since that already exists). + if u8_archive.u8_node_list[node].type == 1 and u8_archive.u8_node_list[node].name_offset != 0: + if u8_archive.u8_node_list[node].data_offset == parent_dirs[-1]: current_dir = current_dir.joinpath(u8_archive.file_name_list[node]) - os.mkdir(current_dir) - # If the current node is beyond the end of the current directory, we've followed that path all the way down, - # so reset back to the root directory and put our new directory there. - elif node + 1 > directory_recursion[-1]: - current_dir = output_folder.joinpath(u8_archive.file_name_list[node]) - os.mkdir(current_dir) - # This check is here just in case a directory ever ends with an empty directory and not a file. - elif node + 1 == directory_recursion[-1]: - current_dir = current_dir.parent - directory_recursion.pop() - # If the last node for the directory we just processed is new (which is always should be), add it to the - # recursion array. - if u8_archive.u8_node_list[node].size not in directory_recursion: - directory_recursion.append(u8_archive.u8_node_list[node].size) + current_dir.mkdir(exist_ok=True) + parent_dirs.append(node) + else: + # Go up until we're back at the correct level. + while u8_archive.u8_node_list[node].data_offset != parent_dirs[-1]: + parent_dirs.pop() + parent_dirs.append(node) + current_dir = output_folder + # Rebuild current working directory, and make sure all directories in the path exist. + for directory in parent_dirs: + current_dir = current_dir.joinpath(u8_archive.file_name_list[directory]) + current_dir.mkdir(exist_ok=True) # Code for a file node. elif u8_archive.u8_node_list[node].type == 0: - # Write out the file to the current directory. - output_file = open(current_dir.joinpath(u8_archive.file_name_list[node]), "wb") - output_file.write(u8_archive.file_data_list[node]) - output_file.close() - # If this file is the final node for the current directory, pop() the recursion array and set the current - # directory to the parent of the previous current. - if node + 1 in directory_recursion: - current_dir = current_dir.parent - directory_recursion.pop() - # Code for a totally unrecognized node type, which should not happen. - elif u8_archive.u8_node_list[node].type != 0 and u8_archive.u8_node_list[node].type != 256: - raise ValueError("A node with an invalid type (" + str(u8_archive.u8_node_list[node].type) + ") was" - "found!") + open(current_dir.joinpath(u8_archive.file_name_list[node]), "wb").write(u8_archive.file_data_list[node]) + # Handle an invalid node type. + elif u8_archive.u8_node_list[node].type != 0 and u8_archive.u8_node_list[node].type != 1: + raise ValueError("A node with an invalid type (" + str(u8_archive.u8_node_list[node].type) + ") was found!") -def _pack_u8_dir(u8_archive: U8Archive, current_path, node_count, name_offset): +def _pack_u8_dir(u8_archive: U8Archive, current_path, node_count, parent_node): # First, get the list of everything in current path. root_list = os.listdir(current_path) file_list = [] dir_list = [] # Create separate lists of the files and directories in the current directory so that we can handle the files first. + # noinspection PyTypeChecker + root_list.sort(key=str.lower) for path in root_list: if os.path.isfile(current_path.joinpath(path)): file_list.append(path) elif os.path.isdir(current_path.joinpath(path)): dir_list.append(path) + # noinspection PyTypeChecker + file_list.sort(key=str.lower) + # noinspection PyTypeChecker + dir_list.sort(key=str.lower) # For files, read their data into the file data list, add their name into the file name list, then calculate the - # offset for their file name and create a new U8Node() for them. + # offset for their file name and create a new U8Node() for them. -1 values are temporary and are set during dumping. for file in file_list: node_count += 1 u8_archive.file_name_list.append(file) u8_archive.file_data_list.append(open(current_path.joinpath(file), "rb").read()) - u8_archive.u8_node_list.append(_U8Node(0, name_offset, 0, len(u8_archive.file_data_list[-1]))) - name_offset = name_offset + len(file) + 1 # Add 1 to accommodate the null byte at the end of the name. + u8_archive.u8_node_list.append(_U8Node(0, -1, -1, len(u8_archive.file_data_list[-1]))) # For directories, add their name to the file name list, add empty data to the file data list (since they obviously # wouldn't have any), find the total number of files and directories inside the directory to calculate the final # node included in it, then recursively call this function again on that directory to process it. @@ -256,12 +250,11 @@ def _pack_u8_dir(u8_archive: U8Archive, current_path, node_count, name_offset): u8_archive.file_name_list.append(directory) u8_archive.file_data_list.append(b'') max_node = node_count + sum(1 for _ in current_path.joinpath(directory).rglob('*')) - u8_archive.u8_node_list.append(_U8Node(256, name_offset, 0, max_node)) - name_offset = name_offset + len(directory) + 1 # Add 1 to accommodate the null byte at the end of the name. - u8_archive, node_count, name_offset = _pack_u8_dir(u8_archive, current_path.joinpath(directory), node_count, - name_offset) + u8_archive.u8_node_list.append(_U8Node(1, -1, parent_node, max_node)) + u8_archive, node_count = _pack_u8_dir(u8_archive, current_path.joinpath(directory), node_count, + u8_archive.u8_node_list.index(u8_archive.u8_node_list[-1])) # Return the U8Archive object, the current node we're on, and the current name offset. - return u8_archive, node_count, name_offset + return u8_archive, node_count def pack_u8(input_path) -> bytes: @@ -279,34 +272,19 @@ def pack_u8(input_path) -> bytes: The data for the packed U8 archive. """ input_path = pathlib.Path(input_path) - if os.path.isdir(input_path): + if input_path.is_dir(): # Append empty entries at the start for the root node, and then create the root U8Node() object, using rglob() # to read the total count of files and directories that will be packed so that we can add the total node count. u8_archive = U8Archive() u8_archive.file_name_list.append("") u8_archive.file_data_list.append(b'') - u8_archive.u8_node_list.append(_U8Node(256, 0, 0, sum(1 for _ in input_path.rglob('*')) + 1)) + u8_archive.u8_node_list.append(_U8Node(1, 0, 0, sum(1 for _ in input_path.rglob('*')) + 1)) # Call the private function _pack_u8_dir() on the root note, which will recursively call itself to pack every # subdirectory and file. Discard node_count and name_offset since we don't care about them here, as they're # really only necessary for the directory recursion. - u8_archive, _, _ = _pack_u8_dir(u8_archive, input_path, node_count=1, name_offset=1) + u8_archive, _ = _pack_u8_dir(u8_archive, input_path, node_count=1, parent_node=0) return u8_archive.dump() - elif os.path.isfile(input_path): - # Simple code to handle if a single file is provided as input. Not really sure *why* you'd do this, since the - # whole point of a U8 archive is to stitch files together, but it's here nonetheless. - with open(input_path, "rb") as f: - u8_archive = U8Archive() - file_name = input_path.name - file_data = f.read() - # Append blank file name for the root node. - u8_archive.file_name_list.append("") - u8_archive.file_name_list.append(file_name) - # Append blank data for the root node. - u8_archive.file_data_list.append(b'') - u8_archive.file_data_list.append(file_data) - # Append generic U8Node for the root, followed by the actual file's node. - u8_archive.u8_node_list.append(_U8Node(256, 0, 0, 2)) - u8_archive.u8_node_list.append(_U8Node(0, 1, 0, len(file_data))) - return u8_archive.dump() + elif input_path.is_file(): + raise ValueError("This does not appear to be a directory.") else: - raise FileNotFoundError("Input file/directory: \"" + str(input_path) + "\" does not exist!") + raise FileNotFoundError("Input directory: \"" + str(input_path) + "\" does not exist!")