#!/bin/bash
set -eu

# USAGE: ./package_partitions.sh firmware output_dir
#
# Stage 1 runs unblob on the firmware, extracting into a temporary directory.
# Stage 2 packages every potential rootfs directory (unblob's *_extract and
# *_carve directories) into a .tar.gz archive inside output_dir. Each archive
# is named after the first 8 hex digits of the SHA-256 of its file contents.

if [ "$#" -lt 2 ]; then
    echo "Usage: $0 [firmware] [output_dir]" >&2
    exit 1
fi

firmware="$1"
output_dir="$2"
mkdir -p "$output_dir"
chmod 777 "$output_dir"

extract_dir=$(mktemp -d)
temp_dir=$(mktemp -d)
log_scratch=$(mktemp) # Can't write unblob.log into /

# Remove every scratch location on all exit paths, including failures that
# abort the script via `set -e`.
cleanup() {
    rm -rf "$extract_dir" "$temp_dir"
    rm -f "$log_scratch"
}
trap cleanup EXIT

# Stage 1: Run unblob on the provided firmware
unblob -k "$firmware" -e "$extract_dir" --log "$log_scratch"

# Clean unblob output - delete unblob artifacts and debian packages

# Delete all .uncompressed, .unknown, *.padding, and carved.elf files
find "$extract_dir" -type f \( -name '*.uncompressed' -o -name '*.unknown' -o -name '*.padding' -o -name 'carved.elf' \) -delete

# Find and delete debian packages - look for files named `debian-binary` that
# are in a directory named *_extract
find "$extract_dir" -type f -name 'debian-binary' | while IFS= read -r debian_binary; do
    if [[ "$debian_binary" == *_extract/debian-binary ]]; then
        rm -rf "$(dirname "$debian_binary")"
    fi
done

# Also search for 'control' files that have a 'Package:' line and delete their
# parent directory. The file may already be gone if an enclosing directory was
# removed by an earlier iteration, hence the existence check.
find "$extract_dir" -type f -name 'control' | while IFS= read -r control_file; do
    if [ -f "$control_file" ] && grep -q "^Package:" "$control_file"; then
        rm -rf "$(dirname "$control_file")"
    fi
done

# Stage 2: Archive all potential rootfs directories into a temporary directory
find "$extract_dir" -type d \( -name "*_carve" -o -name "*_extract" \) | while IFS= read -r dir; do
    # Create a unique name for the archive; it is renamed by content hash below
    temp_archive="$temp_dir/$(uuidgen).tar.gz"

    # Create the archive, excluding nested '*_carve' / '*_extract' directories
    # and '*.uncompressed' files.
    # Also filter out ###-####.[ext] files, which are almost always unblob
    # artifacts (e.g., 0-100.lzma)
    tar -czf "$temp_archive" \
        --exclude='*_carve' --exclude='*_extract' --exclude '*.uncompressed' \
        --exclude='[0-9]*-[0-9]*.*' \
        -C "$dir" .

    # An archive that lists fewer than two entries holds nothing beyond the
    # top-level "." directory, so drop it. tar must be run to get the listing.
    if [ "$(tar -tf "$temp_archive" | wc -l)" -lt 2 ]; then
        rm "$temp_archive"
    fi
done

# Move the archives into the output directory, processing largest first.
# `xargs -r` skips du entirely when no archive was produced; without it du
# would run once with no operands and report the current directory.
while read -r size file; do
    # Get the number of entries in the archive
    nfiles=$(tar -tf "$file" | wc -l)

    # Generate a hash of the file contents
    file_hash=$(tar -xOf "$file" | sha256sum | cut -c1-8)

    # Name the archive after its content hash
    new_filename="${file_hash}.tar.gz"

    # Move and rename the file
    mv "$file" "$output_dir/$new_filename"

    echo "Packaged $new_filename (size: $size, nfiles: $nfiles)"
done < <(find "$temp_dir" -type f -name "*.tar.gz" -print0 | xargs -0 -r du -s | sort -rn)

# Don't create root-owned files that end users can't delete in mapped
# directories. nullglob keeps the pattern from being passed literally to chmod
# (which would fail under `set -e`) when the output directory is empty.
shopt -s nullglob
outputs=("$output_dir"/*)
shopt -u nullglob
if [ "${#outputs[@]}" -gt 0 ]; then
    chmod 777 "${outputs[@]}"
fi
@@ -122,34 +122,32 @@ fw2tar_run() { fi fi - # Check if last argument is a file, if so we'll need to map it - if [[ ${#cmd[@]} -gt 0 ]]; then - if [[ -f "${cmd[-1]}" ]]; then - local arg="${cmd[-1]}" - local abspath=$(realpath "$arg") - local host_path=$(dirname "$abspath") - local guest_path="/host_$(basename "$host_path")" - maps+=("$host_path:$guest_path") - cmd[-1]="/host_$(basename "$host_path")/$(basename "$arg")" - fi + # Second to last argument should be the firmware file. This file should already exist + if [[ -f "${cmd[-2]}" ]]; then + local arg="${cmd[-2]}" + local abspath=$(realpath "$arg") + local host_path=$(dirname "$abspath") + local guest_path="/host_$(basename "$host_path")" + maps+=("$host_path:$guest_path") + #cmd[-2]="/host_$(basename "$host_path")/$(basename "$arg")" + firmware_file="/host_$(basename "$host_path")/$(basename "$arg")" + else + echo "Fatal error: Firmware file not found: ${cmd[-2]}" fi - # Check for "--output" flag, create and map the directory as necessary - for ((i=0; i<${#cmd[@]}; i++)); do - if [[ "${cmd[$i]}" == "--output" && $((i+1)) -lt ${#cmd[@]} ]]; then - output_dir="${cmd[$i+1]}" - if [[ ! -d "$output_dir" ]]; then - mkdir -p "$output_dir" - fi - - # Add mapping for the output directory - local abspath=$(realpath "$output_dir") - local guest_path="/host_$(basename "$abspath")" - maps+=("$abspath:$guest_path") - cmd[$i+1]="/host_$(basename "$abspath")/$(basename "$output_dir")" + # Final argument should be the output directory + output_dir="${cmd[-1]}" + if [[ ! 
-d "$output_dir" ]]; then + mkdir -p "$output_dir" + fi - fi - done + local arg="${cmd[-1]}" + local abspath=$(realpath "$arg") + local host_path=$(dirname "$abspath") + local guest_path="/host_$(basename "$host_path")" + maps+=("$host_path:$guest_path") + #cmd[-1]="/host_$(basename "$host_path")/$(basename "$arg")" + output_dir="/host_$(basename "$host_path")/$(basename "$arg")" # Check for "--scratch-dir" flag, create and map the directory as necessary for ((i=0; i<${#cmd[@]}; i++)); do @@ -191,12 +189,15 @@ fw2tar_run() { docker_cmd+=("$image") - docker_cmd+=("fakeroot_fw2tar") - docker_cmd+=("${cmd[@]}") + #docker_cmd+=("fakeroot_fw2tar") + docker_cmd+=("fakeroot" "/usr/local/bin/package_partitions.sh") + docker_cmd+=("${firmware_file}") + docker_cmd+=("${output_dir}") if $verbose; then echo "${BOLD}Fw2tar command:${RESET}" - echo " fw2tar ${cmd[@]}" + #echo " fw2tar ${cmd[@]}" + echo " fakeroot /usr/local/bin/unblob_package.sh ${cmd[@]}" echo echo "${BOLD}Complete docker commands:${RESET}" From cf1952dce17616b7f38b78a5a3de96fd679b66c0 Mon Sep 17 00:00:00 2001 From: Andrew Fasano Date: Wed, 26 Jun 2024 17:27:13 -0400 Subject: [PATCH 03/22] Standalone unification logic Cherry-picked from https://github.com/rehosting/fw2tar/commit/1ffc07cf03a188ba6cf35fdc67c1e3c13b1d50e9 --- Dockerfile | 5 +- unifyroot/README.md | 40 +++ unifyroot/setup.py | 34 ++ unifyroot/unifyroot/__init__.py | 5 + unifyroot/unifyroot/cli.py | 31 ++ unifyroot/unifyroot/common.py | 206 +++++++++++ unifyroot/unifyroot/filesystemunifier.py | 427 +++++++++++++++++++++++ 7 files changed, 747 insertions(+), 1 deletion(-) create mode 100644 unifyroot/README.md create mode 100644 unifyroot/setup.py create mode 100644 unifyroot/unifyroot/__init__.py create mode 100644 unifyroot/unifyroot/cli.py create mode 100644 unifyroot/unifyroot/common.py create mode 100644 unifyroot/unifyroot/filesystemunifier.py diff --git a/Dockerfile b/Dockerfile index 413792f..15c5099 100644 --- a/Dockerfile +++ b/Dockerfile 
@@ -120,4 +120,7 @@ RUN echo '[ ! -z "$TERM" ] && [ -z "$NOBANNER" ] && /usr/local/bin/banner.sh' >> # fw2tar here is a simple shell wrapper to call fakeroot fw2tar.py COPY src/fw2tar src/fakeroot_fw2tar /usr/local/bin/ -CMD ["/usr/local/bin/banner.sh"] \ No newline at end of file +COPY unifyroot /tmp/unifyroot +RUN python3 -m pip install /tmp/unifyroot + +CMD ["/usr/local/bin/banner.sh"] diff --git a/unifyroot/README.md b/unifyroot/README.md new file mode 100644 index 0000000..47fc46b --- /dev/null +++ b/unifyroot/README.md @@ -0,0 +1,40 @@ +# UnifyRoot + +UnifyRoot is a powerful static analysis tool designed to reconstruct a full Linux filesystem from a corpus of extracted filesystems. UnifyRoot identifies which partition should be the root filesystem, +what unresolved references exist, and how mounting other filesystems can resolve these references. The resulting **partition map** is provided to a user and then used to generate a **unified filesystem archive**. + +### The Problem We Solve + +In the world of embedded systems, it's common to encounter multiple filesystem images extracted from a single device. These images often represent different partitions or overlays that, when combined, form the complete filesystem of the device. However, piecing these fragments together manually can be a time-consuming and error-prone process. + +UnifyRoot automates this reconstruction, intelligently combining multiple filesystem images into a single, coherent structure. By doing so, it provides a clear view of the entire filesystem, making it easier to analyze, understand, and work with embedded system software. + +### Key Features + +- **Intelligent Mount Point Detection**: Automatically determines the optimal mounting points for each filesystem image. +- **Reference Resolution**: Identifies and resolves file references across different filesystem images.
+- **Flexible Input Handling**: Works with multiple tar.gz archives, making it compatible with most filesystem extraction tools. +- **Optimized Unification**: Employs advanced algorithms to maximize resolved references while minimizing unnecessary mounts. +- **Preservation of Filesystem Integrity**: Ensures that the unified structure maintains the integrity and hierarchy of each individual filesystem. + +### Getting Started + +To start bringing order to your embedded filesystem chaos, install this package. + +Then, use the command-line interface to unify your filesystem images: + +```bash +unifyroot /path/to/input/directory [/path/to/output.tar.gz] +``` + +Or integrate it into your Python scripts: + +```python +from unifyroot import unify_filesystems + +unify_filesystems('/path/to/input/directory', '/path/to/output.tar.gz') +``` + +Note that your input directory should contain multiple extracted filesystems as `.tar.gz` archives with preserved permissions. The contents of each archive should be within a top-level directory `.`. + +UnifyRoot is a component of [fw2tar](https://github.com/rehosting/fw2tar) but may also provide value in isolation.
def _default_output_path(load_path: str) -> str:
    """Return the default archive location: ``unified.tar.gz`` inside *load_path*."""
    import os  # local import: only this helper needs ``os``

    return os.path.join(load_path, "unified.tar.gz")


def unify_filesystems(input_path: str, output_path: Optional[str] = None):
    '''
    Given a directory (or a path to a .tar.gz within such a directory),
    examine all the archives and find an optimal way to unify them into a single filesystem.
    Create the unified filesystem at output_path.

    Args:
        input_path (str): Directory of .tar.gz archives, or the path of one such archive.
        output_path (Optional[str]): Where to write the unified archive. When omitted,
            ``unified.tar.gz`` is created inside the directory the archives were loaded from.
    '''
    repository = FilesystemRepository()
    loader = FilesystemLoader(repository)
    loader.load_filesystems(input_path)
    unifier = FilesystemUnifier(repository)
    mount_points = unifier.unify()

    print(f"Best mount points: {mount_points}")

    if output_path is None:
        # Join against the load directory rather than appending to the raw
        # input string: plain concatenation only produced a sensible name when
        # the caller happened to pass a directory with a trailing slash.
        # NOTE(review): the default lands next to the input archives, so a
        # later run over the same directory will see it as an input too.
        output_path = _default_output_path(loader.load_path)
    unifier.create_archive(loader.load_path, mount_points, output_path)


def main():
    """Command-line entry point: ``unifyroot <input_path> [output_path]``."""
    import sys
    if len(sys.argv) < 2:
        print("Usage: unifyroot <input_path> [output_path]")
        sys.exit(1)
    unify_filesystems(sys.argv[1], sys.argv[2] if len(sys.argv) > 2 else None)


if __name__ == "__main__":
    main()
+ """ + assert " " not in reference, "References cannot contain spaces" + self.references.add(reference) + + def set_size(self, size: int) -> None: + """Set the total size of the filesystem.""" + self.size = size + +class FilesystemRepository: + """ + Manages a collection of FilesystemInfo objects. + + Attributes: + filesystems (Dict[str, FilesystemInfo]): A dictionary mapping filesystem names to FilesystemInfo objects. + """ + + def __init__(self): + self.filesystems: Dict[str, FilesystemInfo] = {} + + def add_filesystem(self, name: str) -> None: + """ + Add a new filesystem to the repository if it doesn't already exist. + + Args: + name (str): The name of the filesystem to add. + """ + if name not in self.filesystems: + self.filesystems[name] = FilesystemInfo(name) + + def get_filesystem(self, name: str) -> Optional[FilesystemInfo]: + """ + Retrieve a filesystem by name. + + Args: + name (str): The name of the filesystem to retrieve. + + Returns: + Optional[FilesystemInfo]: The FilesystemInfo object if found, None otherwise. + """ + return self.filesystems.get(name) + + def get_all_filesystems(self) -> Dict[str, FilesystemInfo]: + """ + Get all filesystems in the repository. + + Returns: + Dict[str, FilesystemInfo]: A dictionary of all filesystems. + """ + return self.filesystems + + def add_path_to_filesystem(self, name: str, path: str) -> None: + """ + Add a path to a specific filesystem. + + Args: + name (str): The name of the filesystem. + path (str): The path to add. + """ + if name in self.filesystems: + self.filesystems[name].add_path(path) + + def add_reference_to_filesystem(self, name: str, reference: str) -> None: + """ + Add a reference to a specific filesystem. + + Args: + name (str): The name of the filesystem. + reference (str): The reference to add. + """ + if name in self.filesystems: + self.filesystems[name].add_reference(reference) + + def set_filesystem_size(self, name: str, size: int) -> None: + """ + Set the size of a specific filesystem. 
def load_filesystems(self, input_path: str) -> None:
    """
    Load filesystems from a given input path.

    When given a directory, every ``*.tar.gz`` directly inside it is loaded.
    When given a ``.tar.gz`` path, its name minus the suffix is used as a
    prefix, so sibling archives sharing that prefix are loaded as well.
    Archives are processed in sorted order so results are reproducible.

    Args:
        input_path (str): Path to a directory containing tar.gz files or a single tar.gz file.

    Raises:
        ValueError: If the input path is neither a directory nor a tar.gz file.
    """
    # glob.escape keeps characters such as '[' or '*' in the user-supplied
    # path from being interpreted as wildcards.
    if input_path.endswith(".tar.gz"):
        glob_target = f"{glob.escape(input_path[:-7])}*.tar.gz"
        self.load_path = os.path.dirname(input_path)
    elif os.path.isdir(input_path):
        glob_target = f"{glob.escape(input_path)}/*.tar.gz"
        self.load_path = input_path
    else:
        raise ValueError(f"Input path must be a directory or a .tar.gz file. {input_path} is neither")

    for file in sorted(glob.glob(glob_target)):
        self._process_tar_file(file)
def _extract_references(self, fs_name: str, tar: tarfile.TarFile, member: tarfile.TarInfo) -> None:
    """
    Extract references from a file in the tar archive.

    The member's bytes are scanned for absolute-path-looking strings with at
    least two components. Each one that decodes as UTF-8 and passes
    ``_is_valid_reference`` is recorded against ``fs_name`` in the repository.

    Args:
        fs_name (str): Name of the filesystem.
        tar (tarfile.TarFile): The tar archive being processed.
        member (tarfile.TarInfo): The specific file in the tar archive to process.
    """
    path_regex = re.compile(rb'/[^/\0\n<>"\'! :\?]+(?:/[^/\0\n<>()%"\'! ;:\?]+)+')

    extracted = tar.extractfile(member)
    if extracted is None:
        # extractfile() yields None for members that carry no data
        # (directories, devices, ...): there is nothing to scan.
        return
    with extracted:
        file_content = extracted.read()

    for match in path_regex.finditer(file_content):
        try:
            decoded_path = match.group().decode('utf-8')
        except UnicodeDecodeError:
            # Binary noise that merely resembles a path; skip it.
            continue
        if self._is_valid_reference(decoded_path):
            self.repository.add_reference_to_filesystem(fs_name, decoded_path)
def unify(self) -> Dict[str, str]:
    """
    Pick the best overall mount layout across all plausible root filesystems.

    Every filesystem accepted by ``_could_be_root`` is tried as the root
    (mounted at ``./``) and expanded with ``_try_unify_from``; the layout
    with the strictly highest score wins.

    Returns:
        Dict[str, str]: A mapping of mount points to filesystem names, or an
        empty dict when no filesystem qualifies as a root.
    """
    winner: Dict[str, str] = {}
    top_score = float('-inf')

    candidates = self.repository.get_all_filesystems()
    for name, info in candidates.items():
        if self._could_be_root(info):
            # Seed the search with this filesystem as the root and see how
            # far the remaining filesystems can improve on it.
            layout, layout_score = self._try_unify_from({"./": name})
            if layout_score > top_score:
                winner, top_score = layout, layout_score

    return winner
def _find_best_mount_point(self, cur_mounts: Dict[str, str], fs_info: FilesystemInfo, unresolved_paths: Set[str]) -> Tuple[Optional[str], float]:
    """
    Finds the best mount point for a filesystem based on how many unresolved paths it can resolve.

    Scoring per candidate mount point:
      * more than 2 resolved paths: score is the number of resolved paths;
      * no resolved paths: score is -1;
      * 1-2 resolved paths: counted only if the names below the mount point
        are long (more than 10 characters in total) and contain at least one
        ASCII letter, or if the filesystem holds fewer than 10 paths.

    Args:
        cur_mounts (Dict[str, str]): Current mapping of mount points to filesystem names.
        fs_info (FilesystemInfo): Information about the filesystem to evaluate.
        unresolved_paths (Set[str]): Current set of unresolved paths.

    Returns:
        Tuple[Optional[str], float]: The best mount point and the score improvement, or (None, 0) if no suitable mount point is found.
    """
    best_mount_point = None
    best_score_improvement = 0
    visible_paths = self._get_visible_paths(cur_mounts)
    potential_mounts = self._find_potential_mount_points(cur_mounts, fs_info, unresolved_paths)
    total_files_in_mount = len(fs_info.paths)

    for potential_mount_point in potential_mounts:
        resolved_paths = self._get_resolved_paths(visible_paths, potential_mount_point, fs_info, unresolved_paths)

        # Mount maps hold only strings, so a shallow copy is sufficient.
        new_mounts = dict(cur_mounts)
        new_mounts[potential_mount_point] = fs_info.name
        # Combine all visible paths into a single set
        total_files_with_mount = set.union(*self._get_visible_paths(new_mounts).values())
        print(f"\t{cur_mounts} + {fs_info.name} @ {potential_mount_point} resolves {len(resolved_paths)} paths and adds {total_files_in_mount} files to get {len(total_files_with_mount)} total files")
        print(f"\t\t {' '.join(resolved_paths[:10])}")

        # XXX: is our improvement just the number of resolved paths?
        # What if this mount just resolves like 1 path and adds a bunch of broken references? On the other hand, what if it's just 1 path and we're fixing it
        if len(resolved_paths) > 2:
            # If we resolve more than 2 paths, we're probably doing well
            score_improvement = len(resolved_paths)
        elif len(resolved_paths) == 0:
            score_improvement = -1
        else:
            # If we only resolve a couple paths, things could be good. Or bad.
            # Judge by the names *below* the mount point. References are
            # stored without the leading '.', so it is added back before the
            # mount point prefix is stripped.
            remainders = []
            for ref in resolved_paths:
                dotted = "." + ref
                if dotted.startswith(potential_mount_point):
                    remainders.append(dotted[len(potential_mount_point):])
                else:
                    remainders.append(ref)
            relative_names = "".join(remainders)

            if len(relative_names) > 10:
                # The names are long -> more likely good
                score_improvement = len(resolved_paths)

                if not any(ch in string.ascii_letters for ch in relative_names):
                    # No ASCII letters at all in the names -> probably bad
                    score_improvement = 0

            elif total_files_in_mount < 10:
                # Only a few files are in this mount point, less alignment
                # is to be expected.
                score_improvement = len(resolved_paths)
            else:
                # Otherwise this is probably junk.
                score_improvement = 0

        if score_improvement > best_score_improvement:
            best_score_improvement = score_improvement
            best_mount_point = potential_mount_point

    return best_mount_point, best_score_improvement
@staticmethod
def _get_relative_path(path: str, mount_point: str) -> str:
    """
    Calculates a relative path based on a mount point.

    The mount point only applies when it matches whole path components:
    ``/mnt`` covers ``/mnt`` and ``/mnt/a`` but not ``/mntx/a``.

    Args:
        path (str): The full path.
        mount_point (str): The mount point.

    Returns:
        str: The path relative to the mount point, or ``path`` unchanged when
        it does not live under the mount point.
    """
    # Normalise away a trailing slash so "/mnt" and "/mnt/" behave the same.
    base = mount_point.rstrip('/')
    if path == base or path.startswith(base + '/'):
        return path[len(base):].lstrip('/')
    return path
def _find_potential_mount_points(self, cur_mounts: Dict[str, str], fs_info: FilesystemInfo, unresolved_paths: Set[str]) -> List[str]:
    """
    Finds potential mount points for a filesystem based on unresolved paths.

    Args:
        cur_mounts (Dict[str, str]): Current mapping of mount points to filesystem names.
        fs_info (FilesystemInfo): Information about the filesystem we're considering mounting.
        unresolved_paths (Set[str]): Current set of unresolved paths in the established filesystem.

    Returns:
        List[str]: List of potential mount points, sorted by the number of paths they would resolve.
    """
    # Step 1: Identify all potential mount points. A dict is used as an
    # ordered set so ties keep their first-seen order after sorting.
    candidates: Dict[str, None] = {}
    for unresolved_path in unresolved_paths:
        dotted_path = "." + unresolved_path  # Ensure path starts with .
        for fs_path in fs_info.paths:
            potential_mount_point = self._get_potential_mount_point(dotted_path, fs_path)
            if not potential_mount_point or potential_mount_point == '.':
                continue
            if potential_mount_point in candidates:
                continue
            if self._is_valid_new_mount_point(potential_mount_point, cur_mounts):
                candidates[potential_mount_point] = None

    # Step 2: Evaluate each potential mount point.
    # _get_resolved_paths expects the visible paths per mount point, not the
    # mount -> filesystem-name mapping, so derive them once up front.
    visible_paths = self._get_visible_paths(cur_mounts)
    potential_mount_points: Dict[str, int] = {
        mount_point: len(self._get_resolved_paths(visible_paths, mount_point, fs_info, unresolved_paths))
        for mount_point in candidates
    }

    # Step 3: Sort and return the results
    return sorted(potential_mount_points, key=potential_mount_points.get, reverse=True)
+ """ + for existing_mount in cur_mounts: + if new_mount == existing_mount: + return False # Prevent mounting at the same point + if existing_mount.startswith(new_mount + '/'): + return False # Prevent mounting a parent directory of an existing mount + return True + + @staticmethod + def _get_potential_mount_point(unresolved_path: str, fs_path: str) -> Optional[str]: + """ + Determines a potential mount point by comparing an unresolved path with a filesystem path. + + For example if we have an unresolved path of ./mnt/foo/zoo and fs_path is ./foo/zoo + we should return ./mnt as mounting the fs_path at ./mnt would resolve the unresolved path. + + Args: + unresolved_path (str): An unresolved path. + fs_path (str): A path in the filesystem being considered. + + Returns: + Optional[str]: A potential mount point, or None if no suitable mount point is found. + """ + + # Expect both to be ./something/... paths + if not unresolved_path.startswith("./") or not fs_path.startswith("./"): + raise ValueError(f"Paths must start with ./ but got {unresolved_path} and {fs_path}") + + # Check if unresolved_path ends with fs_path after dropping leading .s + + if unresolved_path.endswith(fs_path[1:]): + result = unresolved_path[:-len(fs_path) + 1] + if not result.startswith(("./proc", "./sys", "./dev", "./tmp")): + return result + + return None + + def _get_resolved_paths(self, visible_paths: Dict[str, Set[str]], mount_point: str, fs_info: FilesystemInfo, unresolved_paths: Set[str]) -> List[str]: + """ + Get the unresolved paths that would be resolved by mounting a filesystem at a given point. + + Args: + visible_paths (Dict[str, Set[str]]): Mapping of mount points to their visible paths. + mount_point (str): The potential mount point. + fs_info (FilesystemInfo): Information about the filesystem. + unresolved_paths (Set[str]): Current set of unresolved paths. + + Returns: + List[str]: List of paths that would be resolved. 
+ """ + return [x for x in unresolved_paths if self._path_would_be_resolved(visible_paths, "." + x, mount_point, fs_info)] + + def _path_would_be_resolved(self, visible_paths: Dict[str, Set[str]], unresolved_path: str, mount_point: str, fs_info: FilesystemInfo) -> bool: + """ + Checks if an unresolved path would be resolved by mounting a filesystem at a given point. + + Args: + visible_paths (Dict[str, Set[str]]): Mapping of mount points to their visible paths. + unresolved_path (str): The unresolved path to check. + mount_point (str): The potential mount point. + fs_info (FilesystemInfo): Information about the filesystem. + + Returns: + bool: True if the path would be resolved, False otherwise. + """ + # Check if the path is already resolved by existing visible paths + if any(unresolved_path in paths for paths in visible_paths.values()): + return False + + if unresolved_path.startswith(mount_point): + relative_path = unresolved_path[len(mount_point):].lstrip('/') + return any(fs_path.endswith(relative_path) for fs_path in fs_info.paths) + return False + + def _get_visible_paths(self, mount_points: Dict[str, str]) -> Dict[str, Set[str]]: + """ + Calculate the visible paths for each mount point based on the current filesystem structure. + + Args: + mount_points (Dict[str, str]): Current mapping of mount points to filesystem names. + + Returns: + Dict[str, Set[str]]: A mapping of mount points to their visible paths. 
+ """ + visible_paths = {} + sorted_mount_points = sorted(mount_points.items(), key=lambda x: len(x[0]), reverse=True) + + for mount_point, fs_name in sorted_mount_points: + fs_info = self.repository.get_filesystem(fs_name) + visible_paths[mount_point] = set() + for path in fs_info.paths: + full_path = os.path.join(mount_point, path.lstrip('./')) + if not any(full_path.startswith(other_mount) for other_mount in visible_paths if other_mount != mount_point): + visible_paths[mount_point].add(full_path) + + return visible_paths + + def create_archive(self, archive_dir, mounts, output): + # Create a temporary directory, then extract filesystems from self.repository at the mount + # points and package it up + with tempfile.TemporaryDirectory() as temp_dir: + for mount_point, fs_name in mounts.items(): + fs_info = self.repository.get_filesystem(fs_name) + src = os.path.join(archive_dir, fs_info.name) + dest = os.path.join(temp_dir, mount_point) + + # Ensure dest is within temp_dir + if not os.path.commonpath([temp_dir, dest]) == temp_dir: + raise ValueError(f"Destination {dest} is not within {temp_dir}") + + # Create the directory if it doesn't exist + os.makedirs(dest, exist_ok=True) + + # Extract + subprocess.check_output(["tar", "xf", src, "-C", dest]) + + + # All done - package it up + subprocess.check_output(["tar", "czf", output, "-C", temp_dir, "."]) From 1ea0a2a443f69f1d4e7c1ec1b65fe9d97cb582a8 Mon Sep 17 00:00:00 2001 From: Andrew Fasano Date: Mon, 21 Oct 2024 20:24:19 -0400 Subject: [PATCH 04/22] unifyroot common: parse elfs in a more structured way. 
Track symlinks within each fs --- unifyroot/unifyroot/common.py | 154 +++++++++++++++++++++++++++++++--- 1 file changed, 142 insertions(+), 12 deletions(-) diff --git a/unifyroot/unifyroot/common.py b/unifyroot/unifyroot/common.py index ad66a22..27002be 100644 --- a/unifyroot/unifyroot/common.py +++ b/unifyroot/unifyroot/common.py @@ -2,6 +2,9 @@ import os import re import tarfile + +from io import BytesIO +from elftools.elf.elffile import ELFFile from typing import Dict, Set, Optional class FilesystemInfo: @@ -20,17 +23,24 @@ def __init__(self, name: str): self.paths: Set[str] = set() self.references: Set[str] = set() self.size: int = 0 + self.links: Dict[str, str] = {} def add_path(self, path: str) -> None: """Add a path to the filesystem.""" self.paths.add(path) + def add_link(self, path: str, link: str) -> None: + """Add a link to the filesystem.""" + self.links[path] = link + + def add_reference(self, reference: str) -> None: """ Add a reference to the filesystem. Args: - reference (str): The reference to add. Must not contain spaces. + reference (str): The reference to add. Must not contain spaces as a sainity check + (Maybe drop that assertion?) """ assert " " not in reference, "References cannot contain spaces" self.references.add(reference) @@ -92,9 +102,21 @@ def add_path_to_filesystem(self, name: str, path: str) -> None: if name in self.filesystems: self.filesystems[name].add_path(path) + def add_link_to_filesystem(self, name: str, path: str, link: str) -> None: + """ + Add a link to a specific filesystem. + + Args: + name (str): The name of the filesystem. + path (str): The path to add. + link (str): The link to add. + """ + if name in self.filesystems: + self.filesystems[name].add_link(path, link) + def add_reference_to_filesystem(self, name: str, reference: str) -> None: """ - Add a reference to a specific filesystem. + Add a reference to a specific filesystem. But only if it's valid Args: name (str): The name of the filesystem. 
@@ -163,10 +185,14 @@ def _process_tar_file(self, file_path: str) -> None: for member in tar.getmembers(): if member.name == ".": continue - if member.isfile(): + if member.islnk() or member.issym(): + # Add as both link and path + self.repository.add_link_to_filesystem(fs_name, member.name, member.linkname) + self.repository.add_path_to_filesystem(fs_name, member.name) + elif member.isfile(): self.repository.add_path_to_filesystem(fs_name, member.name) self._extract_references(fs_name, tar, member) - elif member.isdir() or member.islnk(): + elif member.isdir(): self.repository.add_path_to_filesystem(fs_name, member.name) self.repository.set_filesystem_size(fs_name, sum(member.size for member in tar.getmembers())) @@ -180,17 +206,92 @@ def _extract_references(self, fs_name: str, tar: tarfile.TarFile, member: tarfil tar (tarfile.TarFile): The tar archive being processed. member (tarfile.TarInfo): The specific file in the tar archive to process. """ - path_regex = re.compile(rb'/[^/\0\n<>"\'! :\?]+(?:/[^/\0\n<>()%"\'! ;:\?]+)+') file_content = tar.extractfile(member).read() + path_regex = re.compile(r'/[^/\0\n<>"\'! :\?]{3,255}(?:/[^/\0\n<>()%"\'! ;:\?]+)*') - for match in re.findall(path_regex, file_content): + # If it's an elf try parsing and finding libraries it references + elf_magic = b"\x7fELF" + elf_references = None + if file_content.startswith(elf_magic): try: - decoded_path = match.decode('utf-8') - if self._is_valid_reference(decoded_path): - self.repository.add_reference_to_filesystem(fs_name, decoded_path) - except UnicodeDecodeError: + elf_references = self._parse_elf_references(file_content) + except Exception as e: + # Never seen an exception yet but maybe we'll get a malformed elf one day? 
+ print(e) pass + if elf_references is not None: + for reference in elf_references: + if self._is_valid_reference(reference) and path_regex.match(reference): + self.repository.add_reference_to_filesystem(fs_name, reference) + else: + + # ignore HTML like files as a source for information + if member.name.endswith((".html", ".htm", ".css", ".js")): + return + + try: + file_content = file_content.decode('utf-8') + except UnicodeDecodeError: + # Non-UTF-8 file, skip (?) should we try parsing other ways + # Goal here is to find config files and things like that + return + + # Fallback to regex for finding references + for match in re.findall(path_regex, file_content): + if self._is_valid_reference(match): + self.repository.add_reference_to_filesystem(fs_name, match) + + def _parse_elf_references(self, elf_content: bytes) -> Set[str]: + """ + Extract references from an ELF file in the tar archive. + """ + + lib_paths = ["/lib", "/usr/lib"] + + with ELFFile(BytesIO(memoryview(elf_content))) as elf: + references = set() + + dynamic = elf.get_section_by_name('.dynamic') + + # Find RPATH - influences library search path + rpath = None + if dynamic: + for tag in dynamic.iter_tags(): + if tag.entry.d_tag == 'DT_RPATH': + rpath = tag.rpath + lib_paths.append(rpath) + + # Find interpreter path + interp = elf.get_section_by_name('.interp') + if interp: + interp_data = interp.data().strip(b'\x00') + references.add(interp_data.decode('utf-8', errors='ignore')) + + # Parse the dynamic section for DT_NEEDED (shared libraries) + if dynamic: + for tag in dynamic.iter_tags(): + if tag.entry.d_tag == 'DT_NEEDED': + if not tag.needed: + continue + needed = tag.needed + if needed.startswith('/'): + references.add(needed) + else: + # XXX: We're adding multiple paths, but only one needs to work + for lib in lib_paths: + references.add(os.path.join(lib, needed)) + + + # XXX do we want this? 
+ strtab = elf.get_section_by_name('.strtab') + if strtab: + for match in re.findall(rb'^/([a-zA-Z0-9_\-./]+)*$', strtab.data()): + references.add(match.decode()) + return references + + + @staticmethod def _is_valid_reference(path: str) -> bool: """ @@ -202,5 +303,34 @@ def _is_valid_reference(path: str) -> bool: Returns: bool: True if the path is a valid reference, False otherwise. """ - invalid_chars = set(" \t\n^$%*") - return not (any(char in path for char in invalid_chars) or path.endswith(".c")) + if not (3 < len(path) < 255): + # Too short or too long + return False + + if path.replace("/", "").isnumeric(): + # Purely numeric? Probably don't want it, it's a date like 9/1992 + return False + + if path.endswith(".c"): + # Don't want source paths + return False + + if len(path.split("/")) < 3: + # Too short, probably not a reference + return False + + # Is it a website? + if path.startswith("/www.") or ".com/" in path: + return False + + # Does it start with an IP address + potential_ip = path.split("/")[1] + if len(potential_ip.split(".")) == 4 and all(part.isnumeric() for part in potential_ip.split(".")): + return False + + invalid_chars = set(" \t\n^$%*{}`\+,=\\") + if any(invalid_chars & set(path)): + # Invalid characters + return False + + return True From 1cce06cf6b2f4b03cea9c652a62111d65f27731d Mon Sep 17 00:00:00 2001 From: Andrew Fasano Date: Mon, 21 Oct 2024 20:29:22 -0400 Subject: [PATCH 05/22] Avoid clobbering more than 5 files with a mount point, don't remount over symlinks. 
Ignore common irrelevant artifacts --- unifyroot/unifyroot/filesystemunifier.py | 136 +++++++++++++---------- 1 file changed, 78 insertions(+), 58 deletions(-) diff --git a/unifyroot/unifyroot/filesystemunifier.py b/unifyroot/unifyroot/filesystemunifier.py index da3137f..95ccf8a 100644 --- a/unifyroot/unifyroot/filesystemunifier.py +++ b/unifyroot/unifyroot/filesystemunifier.py @@ -2,6 +2,7 @@ import os import tempfile import subprocess +import re from copy import deepcopy from collections import defaultdict @@ -9,6 +10,8 @@ from .common import FilesystemInfo, FilesystemRepository +INVALID_ROOTS = ("./proc", "./sys", "./dev", "./tmp") + class FilesystemUnifier: def __init__(self, repository: FilesystemRepository): self.repository = repository @@ -69,11 +72,32 @@ def _try_unify_from(self, mount_points: Dict[str, str]) -> Tuple[Dict[str, str], best_score = self._calculate_configuration_score(mount_points, unresolved_paths) print(f"{mount_points} has score {best_score}. Trying to improve with more filesystems...") + + # Collect symlinks that exist within the mount points we've defined + # We *should not* add a filesystem at a symlink, that wouldn't really make sense + # though we could resolve them and analyze it more + # For example if we have ./var -> ./tmp, we shouldn't place anything at ./var because + # we don't want to add things to /tmp. + # If we have ./etc -> ./etc/var we should place things at ./etc/var and not at ./etc + symlinks = {} + for mount_point, existing_name in mount_points.items(): + fs_info = self.repository.get_filesystem(existing_name) + for link, target in fs_info.links.items(): + link_dest = os.path.join(os.path.dirname(mount_point + link[2:]), target) + if not link_dest.startswith("."): + if link_dest.startswith("/"): + prefix = "." 
+ else: + prefix = "./" + link_dest = prefix + link_dest + + symlinks[mount_point + link[2:]] = link_dest + best_config = mount_points.copy() for fs_name in remaining_filesystems: fs_info = self.repository.get_filesystem(fs_name) - mount_point, score_improvement = self._find_best_mount_point(mount_points, fs_info, unresolved_paths) + mount_point, score_improvement = self._find_best_mount_point(mount_points, fs_info, unresolved_paths, symlinks) if mount_point and score_improvement > 0: new_mount_points = mount_points.copy() @@ -86,7 +110,7 @@ def _try_unify_from(self, mount_points: Dict[str, str]) -> Tuple[Dict[str, str], return best_config, best_score - def _find_best_mount_point(self, cur_mounts: Dict[str, str], fs_info: FilesystemInfo, unresolved_paths: Set[str]) -> Tuple[Optional[str], float]: + def _find_best_mount_point(self, cur_mounts: Dict[str, str], fs_info: FilesystemInfo, unresolved_paths: Set[str], symlinks: Dict[str, str]) -> Tuple[Optional[str], float]: """ Finds the best mount point for a filesystem based on how many unresolved paths it can resolve. 
@@ -101,7 +125,7 @@ def _find_best_mount_point(self, cur_mounts: Dict[str, str], fs_info: Filesystem best_mount_point = None best_score_improvement = 0 visible_paths = self._get_visible_paths(cur_mounts) - potential_mounts = self._find_potential_mount_points(cur_mounts, fs_info, unresolved_paths) + potential_mounts = self._find_potential_mount_points(cur_mounts, fs_info, unresolved_paths, symlinks) for potential_mount_point in potential_mounts: resolved_paths = self._get_resolved_paths(visible_paths, potential_mount_point, fs_info, unresolved_paths) @@ -111,12 +135,22 @@ def _find_best_mount_point(self, cur_mounts: Dict[str, str], fs_info: Filesystem new_mounts[potential_mount_point] = fs_info.name # Combine all visible paths into a single set total_files_with_mount = set.union(*self._get_visible_paths(new_mounts).values()) - print(f"\t{cur_mounts} + {fs_info.name} @ {potential_mount_point} resolves {len(resolved_paths)} paths and adds {total_files_in_mount} files to get {len(total_files_with_mount)} total files") + + # XXX: We don't want to lose/shadow too many files. Specifically we probably don't want to lose files + # from our root filesystem, but shadowing files is generally probably bad + lost_files = [] + for _, files in visible_paths.items(): + lost_files.extend([x for x in files if x.startswith(potential_mount_point)]) + + print(f"\t{cur_mounts} + {fs_info.name} @ {potential_mount_point} resolves {len(resolved_paths)} paths, adds {total_files_in_mount} files, loses {len(lost_files)} to get {len(total_files_with_mount)} total files") print(f"\t\t {' '.join(resolved_paths[:10])}") # XXX: is our improvement just the number of resolved paths? # What if this mount just resolves like 1 path and adds a bunch of broken references? 
On the other hand, what if it's just 1 path and we're fixing it - if len(resolved_paths) > 2: + if len(lost_files) > 5: + # Probably bad, we don't want to shadow too many files + score_improvement = 0 + elif len(resolved_paths) > 2: # If we resolve more than 2 paths, we're probably doing well score_improvement = len(resolved_paths) elif len(resolved_paths) == 0: @@ -166,6 +200,8 @@ def _get_unresolved_paths(self, mount_points: Dict[str, str]) -> Set[str]: """ Identifies unresolved paths in the context of currently mounted filesystems. + Filters out paths that are invalid linux paths or in /dev or /tmp + Args: mount_points (Dict[str, str]): Current mapping of mount points to filesystem names. @@ -212,58 +248,7 @@ def _get_relative_path(path: str, mount_point: str) -> str: return path[len(mount_point):].lstrip('/') return path - def _find_best_filesystem_to_mount(self, unresolved_paths: Set[str], remaining_filesystems: Set[str]) -> Tuple[Optional[FilesystemInfo], str]: - """ - Finds the best filesystem to mount next based on how many unresolved paths it can resolve. - - Args: - unresolved_paths (Set[str]): Current set of unresolved paths. - remaining_filesystems (Set[str]): Set of filesystems not yet mounted. - - Returns: - Tuple[Optional[FilesystemInfo], str]: The best filesystem to mount and its mount point, or (None, "") if no suitable filesystem is found. 
- """ - best_score = float('-inf') - best_fs = None - best_mount_point = "" - - for fs_name in remaining_filesystems: - fs_info = self.repository.get_filesystem(fs_name) - mount_point, score = self._evaluate_mount_point(fs_info, unresolved_paths) - if mount_point is None: - continue - print(f"\t Adding {fs_info.name} to {mount_point} yields score {score}") - if score > best_score: - best_score = score - best_fs = fs_info - best_mount_point = mount_point - - return best_fs, best_mount_point - - def _evaluate_mount_point(self, cur_mounts: Dict[str, str], fs_info: FilesystemInfo, unresolved_paths: Set[str]) -> Tuple[str, float]: - """ - Evaluates potential mount points for a filesystem and returns the best one with its score. - - Args: - cur_mounts (Dict[str, str]): Current mapping of mount points to filesystem names. - fs_info (FilesystemInfo): Information about the filesystem to evaluate. - unresolved_paths (Set[str]): Current set of unresolved paths. - - Returns: - Tuple[str, float]: The best mount point and its score. - """ - best_mount_point = None - best_score = float('-inf') - - for potential_mount_point in self._find_potential_mount_points(fs_info, unresolved_paths): - score = self._calculate_mount_point_score(potential_mount_point, fs_info, unresolved_paths) - if score > best_score: - best_score = score - best_mount_point = potential_mount_point - - return best_mount_point, best_score - - def _find_potential_mount_points(self, cur_mounts: Dict[str, str], fs_info: FilesystemInfo, unresolved_paths: Set[str]) -> List[str]: + def _find_potential_mount_points(self, cur_mounts: Dict[str, str], fs_info: FilesystemInfo, unresolved_paths: Set[str], symlinks: Dict[str, str]) -> List[str]: """ Finds potential mount points for a filesystem based on unresolved paths. 
@@ -282,6 +267,12 @@ def _find_potential_mount_points(self, cur_mounts: Dict[str, str], fs_info: File for fs_path in fs_info.paths: potential_mount_point = self._get_potential_mount_point(unresolved_path, fs_path) if potential_mount_point and potential_mount_point != '.': + while potential_mount_point in symlinks: + # "resolve" symlink + potential_mount_point = symlinks[potential_mount_point] + + if self._is_unlikely_mount(potential_mount_point): + continue if self._is_valid_new_mount_point(potential_mount_point, cur_mounts): mount_point_candidates[potential_mount_point].add(unresolved_path) @@ -296,6 +287,32 @@ def _find_potential_mount_points(self, cur_mounts: Dict[str, str], fs_info: File # Step 3: Sort and return the results return sorted(potential_mount_points, key=potential_mount_points.get, reverse=True) + def _is_unlikely_mount(self, potential_mount_point: str) -> bool: + ''' + We see some common patterns of invalid mount points - save some time by skipping them. + Shouldn't really make a difference in terms of end results, but simplifies debugging + ''' + # Domain names + if "www." in potential_mount_point or \ + potential_mount_point.endswith(".com") or \ + ".com/" in potential_mount_point: + return True + + # compiler directories + if "-none-" in potential_mount_point or \ + "-gcc-" in potential_mount_point or \ + "-clang-" in potential_mount_point or \ + "-gnu" in potential_mount_point: + return True + + # Long + if len(potential_mount_point) > 30: + return True + + # Otherwise it's probably fine + return False + + def _is_valid_new_mount_point(self, new_mount: str, cur_mounts: Dict[str, str]) -> bool: """ Checks if a new mount point is valid given the current mount points. @@ -307,6 +324,9 @@ def _is_valid_new_mount_point(self, new_mount: str, cur_mounts: Dict[str, str]) Returns: bool: True if the new mount point is valid, False otherwise. 
""" + if new_mount in INVALID_ROOTS: + return False + for existing_mount in cur_mounts: if new_mount == existing_mount: return False # Prevent mounting at the same point @@ -338,7 +358,7 @@ def _get_potential_mount_point(unresolved_path: str, fs_path: str) -> Optional[s if unresolved_path.endswith(fs_path[1:]): result = unresolved_path[:-len(fs_path) + 1] - if not result.startswith(("./proc", "./sys", "./dev", "./tmp")): + if not result.startswith(INVALID_ROOTS): return result return None From b1b5aa18905128cbee0c56ba4e0c6585dc69d7fb Mon Sep 17 00:00:00 2001 From: Andrew Fasano Date: Tue, 22 Oct 2024 01:34:00 -0400 Subject: [PATCH 06/22] Better arg handling in wrapper, allow users to keep partition dir. Specify result file on CLI --- fw2tar | 71 +++++++++++++++++------- src/package_partitions.sh | 33 ++++++----- unifyroot/unifyroot/cli.py | 10 ++-- unifyroot/unifyroot/filesystemunifier.py | 16 +++++- 4 files changed, 86 insertions(+), 44 deletions(-) diff --git a/fw2tar b/fw2tar index 72888f7..30db5a3 100755 --- a/fw2tar +++ b/fw2tar @@ -23,17 +23,21 @@ fw2tar_run() { local singularity=false local verbose=false local image="rehosting/fw2tar" # Name of container instance + local partition_dir="" + local force=false # Process each command-line argument while [[ $# -gt 0 ]]; do case "$1" in --wrapper-help) - echo "Usage: fw2tar [WRAPPER FLAGS] [FLAGS] FIRMWARE_FILE OUTPUT_DIR" + echo "Usage: fw2tar [WRAPPER FLAGS] [FLAGS] [FIRMWARE_BLOB] [FINAL_ARCHIVE.tar.gz]" echo "Wrapper script for running FW2TAR in a Docker container" echo "" echo "Wrapper-specific flags may be passed in *before* the fw2tar flags and args. If a value is required, it must be specified immediately after the flag with a space." echo " --build: Build the fw2tar container before running the specified command. If no other arguments are provided, the container will be built and the script will exit." echo " --build-singularity: Build the fw2tar container as a sif." 
+ echo " --force: Remove existing output file and scratch directory if they exists." + echo " --partition_dir: Specify a directory to produce partition archives into. The directory will be deleted if --force is set." echo " --image: Which image to run. Default: $image" echo " --verbose: Print verbose output for fw2tar wrapper (e.g., filesystem mappings, docker command)" echo " --wrapper-help: this message" @@ -53,6 +57,14 @@ fw2tar_run() { singularity=true shift ;; + --partition_dir) + partition_dir="$2" + shift 2 + ;; + --force) + force=true + shift + ;; --image) image="$2" shift 2 @@ -85,7 +97,7 @@ fw2tar_run() { if $build; then echo "Running with container build (--build). Entire container will be rebuilt and tagged as $image." - # Make sure we have Dockerfile and directory is namedfw2tar + # Make sure we have Dockerfile and directory is named fw2tar if [ ! -f "Dockerfile" ]; then echo "Dockerfile not found in current directory and you requested a container rebuild" exit 1 @@ -135,10 +147,18 @@ fw2tar_run() { echo "Fatal error: Firmware file not found: ${cmd[-2]}" fi - # Final argument should be the output directory + # Final argument should be the output directory path. XXX now a filename output_dir="${cmd[-1]}" - if [[ ! -d "$output_dir" ]]; then - mkdir -p "$output_dir" + + if [ -e "$output_dir" ]; then + if $force; then + echo "Removing existing output file: $output_dir" + rm -f "$output_dir" + else + echo "Error: Output file already exists: $output_dir" + echo "Re-run with --force to remove it or delete it yourself first." + exit 1 + fi fi local arg="${cmd[-1]}" @@ -149,22 +169,28 @@ fw2tar_run() { #cmd[-1]="/host_$(basename "$host_path")/$(basename "$arg")" output_dir="/host_$(basename "$host_path")/$(basename "$arg")" - # Check for "--scratch-dir" flag, create and map the directory as necessary - for ((i=0; i<${#cmd[@]}; i++)); do - if [[ "${cmd[$i]}" == "--scratch-dir" && $((i+1)) -lt ${#cmd[@]} ]]; then - scratch_dir="${cmd[$i+1]}" - if [[ ! 
-d "$scratch_dir" ]]; then - mkdir -p "$scratch_dir" - fi - - # Add mapping for the scratch directory - local abspath=$(realpath "$scratch_dir") - local guest_path="/scratch" - maps+=("$abspath:$guest_path") - cmd[$i+1]="/scratch" + # If we have a scratch dir set - make sure it exists and empty it + if [[ ! -z "$partition_dir" ]]; then + echo "Scratch directory set: $partition_dir" + # If it's already here, make sure it's empty + if [[ -e "$partition_dir" ]]; then + echo "Scratch directory exists force is $force" + if ! $force; then + echo "Error: Scratch directory is not empty: $partition_dir" + echo "Re-run with --force to remove it or delete it yourself first." + exit 1 + fi + if $force; then + echo "Removing scratch directory: $partition_dir" + rm -rf "$partition_dir" + fi fi - done + mkdir -p "$partition_dir" + local abspath=$(realpath "$partition_dir") + local guest_path="/scratch" + maps+=("$abspath:$guest_path") + fi # Sort mappings by path length IFS=$'\n' maps=($(sort -r <<<"${maps[*]}")) @@ -194,10 +220,15 @@ fw2tar_run() { docker_cmd+=("${firmware_file}") docker_cmd+=("${output_dir}") + # If we have partition_dir non-empty, add /scratch as final arg + if [[ ! 
-z "$partition_dir" ]]; then + docker_cmd+=("/scratch") + fi + if $verbose; then echo "${BOLD}Fw2tar command:${RESET}" #echo " fw2tar ${cmd[@]}" - echo " fakeroot /usr/local/bin/unblob_package.sh ${cmd[@]}" + echo " fakeroot /usr/local/bin/package_partitions.sh ${cmd[@]}" echo echo "${BOLD}Complete docker commands:${RESET}" diff --git a/src/package_partitions.sh b/src/package_partitions.sh index 8344988..65d7534 100755 --- a/src/package_partitions.sh +++ b/src/package_partitions.sh @@ -1,24 +1,22 @@ #!/bin/bash set -eu -# USAGE: ./unblob_package.sh firmware output_root - -#if [ "$#" -ne 2 ]; then -# echo "Usage: $0 [firmware] [output_dir]" -# exit 1 -#fi - +# USAGE: ./unblob_package.sh firmware result_archive [tmp] # First we run unblob with a temporary directory to extract the files # Then we package potential rootfs files into tar.gz archives firmware="$1" -output_dir="$2" -mkdir -p "$output_dir" -chmod 777 "$output_dir" +output="$2" +# third argument is optional +tmpbase=${3:-$(mktemp -d)} +mkdir -p "$tmpbase" +chmod 777 "$tmpbase" + +hashed_partitions_dir=$(mktemp -d -p "$tmpbase") # Stage 1: Run unblob on the provided firmware -extract_dir=$(mktemp -d) +extract_dir=$(mktemp -d -p "$tmpbase") log_scratch=$(mktemp) # Can't write unblob.log into / unblob -k "$firmware" -e "$extract_dir" --log "${log_scratch}" rm ${log_scratch} @@ -42,10 +40,9 @@ find "$extract_dir" -type f -name 'control' | while read -r control_file; do done # Archive all potential rootfs directories into a temporary directory -temp_dir=$(mktemp -d) find $extract_dir -type d \( -name "*_carve" -o -name "*_extract" \) | while read -r dir; do # Create a name for the archive - temp_archive="$temp_dir/$(uuidgen).tar.gz" + temp_archive="$hashed_partitions_dir/$(uuidgen).tar.gz" # Create the archive, excluding subdirectories ending with '_carve' '_extract' or '_uncompressed' # Also filter out ###-####.[ext] files, which are almost always unblob artifacts (e.g., 0-100.lzma) @@ -73,11 +70,13 @@ while 
read -r size file; do new_filename="${file_hash}.tar.gz" # Move and rename the file - mv "$file" "$output_dir/$new_filename" + mv "$file" "$hashed_partitions_dir/$new_filename" echo "Packaged $new_filename (size: $size, nfiles: $nfiles)" -done < <(find "$temp_dir" -type f -name "*.tar.gz" -print0 | xargs -0 du -s | sort -rn) +done < <(find "$hashed_partitions_dir" -type f -name "*.tar.gz" -print0 | xargs -0 du -s | sort -rn) # Don't create root-owned files that end users can't delete in mapped directories -chmod 777 "${output_dir}/"* -rm -rf "$temp_dir" \ No newline at end of file +chmod 777 "${hashed_partitions_dir}/"* + +# Now call unify +python3 -m unifyroot.cli "$hashed_partitions_dir" "$output" "$extract_dir" "$tmpbase" \ No newline at end of file diff --git a/unifyroot/unifyroot/cli.py b/unifyroot/unifyroot/cli.py index 428ea0c..faf9338 100644 --- a/unifyroot/unifyroot/cli.py +++ b/unifyroot/unifyroot/cli.py @@ -2,12 +2,13 @@ from .common import FilesystemRepository, FilesystemLoader from .filesystemunifier import FilesystemUnifier -def unify_filesystems(input_path: str, output_path: Optional[str] = None): +def unify_filesystems(input_path: str, output_path: str, tmp_dir: Optional[str] = None): ''' Given a directory (or a path to a .tar.gz within such a directory), examine all the archives and find an optimal way to unify them into a single filesystem. Create the unified filesystem at output_path. ''' + # TODO: should we integrate tmp_dir into any parts of analysis here? 
repository = FilesystemRepository() loader = FilesystemLoader(repository) loader.load_filesystems(input_path) @@ -18,14 +19,15 @@ def unify_filesystems(input_path: str, output_path: Optional[str] = None): if output_path is None: output_path = input_path + "unified.tar.gz" - unifier.create_archive(loader.load_path, mount_points, output_path) + + unifier.create_archive(loader.load_path, mount_points, output_path, tmp_dir) def main(): import sys if len(sys.argv) < 2: - print("Usage: unifyroot [output_path]") + print("Usage: unifyroot [tmp]") sys.exit(1) - unify_filesystems(sys.argv[1], sys.argv[2] if len(sys.argv) > 2 else None) + unify_filesystems(sys.argv[1], sys.argv[2], sys.argv[3] if len(sys.argv) > 3 else None) if __name__ == "__main__": main() \ No newline at end of file diff --git a/unifyroot/unifyroot/filesystemunifier.py b/unifyroot/unifyroot/filesystemunifier.py index 95ccf8a..3cd57ad 100644 --- a/unifyroot/unifyroot/filesystemunifier.py +++ b/unifyroot/unifyroot/filesystemunifier.py @@ -423,15 +423,20 @@ def _get_visible_paths(self, mount_points: Dict[str, str]) -> Dict[str, Set[str] return visible_paths - def create_archive(self, archive_dir, mounts, output): + def create_archive(self, archive_dir, mounts, output, tmp_base=None): # Create a temporary directory, then extract filesystems from self.repository at the mount # points and package it up - with tempfile.TemporaryDirectory() as temp_dir: + #with tempfile.TemporaryDirectory() as temp_dir: + + # If tmp_base is None, we'll use a temporary directory in the system's default location + # otherwise it's within the specified dir + with tempfile.TemporaryDirectory(dir=tmp_base) as temp_dir: for mount_point, fs_name in mounts.items(): fs_info = self.repository.get_filesystem(fs_name) src = os.path.join(archive_dir, fs_info.name) dest = os.path.join(temp_dir, mount_point) + # Ensure dest is within temp_dir if not os.path.commonpath([temp_dir, dest]) == temp_dir: raise ValueError(f"Destination {dest} is not 
within {temp_dir}") @@ -442,6 +447,11 @@ def create_archive(self, archive_dir, mounts, output): # Extract subprocess.check_output(["tar", "xf", src, "-C", dest]) + # Log the best mount points into a file + with open(os.path.join(temp_dir, ".mounts.csv"), "w") as f: + f.write("mount_point,archive\n") + for archive, mount_point in mounts.items(): + f.write(f"{archive},{mount_point}\n") # All done - package it up - subprocess.check_output(["tar", "czf", output, "-C", temp_dir, "."]) + subprocess.check_output(["tar", "czf", output, "-C", temp_dir, "."]) \ No newline at end of file From c3c475b138fa0b37fd14505882eda47aea0a13a8 Mon Sep 17 00:00:00 2001 From: Andrew Fasano Date: Tue, 22 Oct 2024 11:48:44 -0400 Subject: [PATCH 07/22] Cleanup prints --- fw2tar | 2 -- 1 file changed, 2 deletions(-) diff --git a/fw2tar b/fw2tar index 30db5a3..f17dbc7 100755 --- a/fw2tar +++ b/fw2tar @@ -172,10 +172,8 @@ fw2tar_run() { # If we have a scratch dir set - make sure it exists and empty it if [[ ! -z "$partition_dir" ]]; then - echo "Scratch directory set: $partition_dir" # If it's already here, make sure it's empty if [[ -e "$partition_dir" ]]; then - echo "Scratch directory exists force is $force" if ! $force; then echo "Error: Scratch directory is not empty: $partition_dir" echo "Re-run with --force to remove it or delete it yourself first." 
From ef58562f83040a76bc0f93f95bb370163d1a4fc1 Mon Sep 17 00:00:00 2001 From: Andrew Fasano Date: Tue, 22 Oct 2024 15:32:38 -0400 Subject: [PATCH 08/22] fix(unifyroot): ./tmp is a valid mount point even if we dislike it (unlike /proc/dev/sys) --- unifyroot/unifyroot/filesystemunifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unifyroot/unifyroot/filesystemunifier.py b/unifyroot/unifyroot/filesystemunifier.py index 3cd57ad..e4d90ce 100644 --- a/unifyroot/unifyroot/filesystemunifier.py +++ b/unifyroot/unifyroot/filesystemunifier.py @@ -10,7 +10,7 @@ from .common import FilesystemInfo, FilesystemRepository -INVALID_ROOTS = ("./proc", "./sys", "./dev", "./tmp") +INVALID_ROOTS = ("./proc", "./sys", "./dev") # Refuse to consider mounts at these points class FilesystemUnifier: def __init__(self, repository: FilesystemRepository): From 13f84d45e462215f14f74fd4a7fa91845838cfd8 Mon Sep 17 00:00:00 2001 From: Andrew Fasano Date: Tue, 22 Oct 2024 15:33:04 -0400 Subject: [PATCH 09/22] feat(cli): take arguments for scratch dir and partition output --- fw2tar | 26 ++++++++++++++++++++------ src/package_partitions.sh | 27 ++++++++++++++++++--------- unifyroot/unifyroot/cli.py | 8 ++++---- 3 files changed, 42 insertions(+), 19 deletions(-) diff --git a/fw2tar b/fw2tar index f17dbc7..452fe3a 100755 --- a/fw2tar +++ b/fw2tar @@ -25,6 +25,7 @@ fw2tar_run() { local image="rehosting/fw2tar" # Name of container instance local partition_dir="" local force=false + local scratch_dir="" # Process each command-line argument while [[ $# -gt 0 ]]; do @@ -37,6 +38,7 @@ fw2tar_run() { echo " --build: Build the fw2tar container before running the specified command. If no other arguments are provided, the container will be built and the script will exit." echo " --build-singularity: Build the fw2tar container as a sif." echo " --force: Remove existing output file and scratch directory if they exists." 
+ echo " --scratch_dir: Specify a directory to use as a scratch directory" echo " --partition_dir: Specify a directory to produce partition archives into. The directory will be deleted if --force is set." echo " --image: Which image to run. Default: $image" echo " --verbose: Print verbose output for fw2tar wrapper (e.g., filesystem mappings, docker command)" @@ -61,6 +63,10 @@ fw2tar_run() { partition_dir="$2" shift 2 ;; + --scratch_dir) + scratch_dir="$2" + shift 2 + ;; --force) force=true shift @@ -170,23 +176,31 @@ fw2tar_run() { output_dir="/host_$(basename "$host_path")/$(basename "$arg")" - # If we have a scratch dir set - make sure it exists and empty it + # If we have a partition_dir if [[ ! -z "$partition_dir" ]]; then # If it's already here, make sure it's empty if [[ -e "$partition_dir" ]]; then if ! $force; then - echo "Error: Scratch directory is not empty: $partition_dir" + echo "Error: Partition directory is not empty: $partition_dir" echo "Re-run with --force to remove it or delete it yourself first." exit 1 fi if $force; then - echo "Removing scratch directory: $partition_dir" + echo "Removing partition directory: $partition_dir" rm -rf "$partition_dir" fi fi mkdir -p "$partition_dir" local abspath=$(realpath "$partition_dir") - local guest_path="/scratch" + local guest_path="/partition_dir" + maps+=("$abspath:$guest_path") + fi + + # If we have a scratch dir map it in as /tmp + if [[ ! -z "$scratch_dir" ]]; then + mkdir -p "$scratch_dir" + local abspath=$(realpath "$partition_dir") + local guest_path="/tmp" maps+=("$abspath:$guest_path") fi @@ -218,9 +232,9 @@ fw2tar_run() { docker_cmd+=("${firmware_file}") docker_cmd+=("${output_dir}") - # If we have partition_dir non-empty, add /scratch as final arg + # If we have partition_dir non-empty, add /partition_dir as final arg if [[ ! 
-z "$partition_dir" ]]; then - docker_cmd+=("/scratch") + docker_cmd+=("/partition_dir") fi if $verbose; then diff --git a/src/package_partitions.sh b/src/package_partitions.sh index 65d7534..f234967 100755 --- a/src/package_partitions.sh +++ b/src/package_partitions.sh @@ -8,15 +8,13 @@ set -eu firmware="$1" output="$2" -# third argument is optional -tmpbase=${3:-$(mktemp -d)} -mkdir -p "$tmpbase" -chmod 777 "$tmpbase" +# third argument is optional partition_dir +partition_dir=${3:-} -hashed_partitions_dir=$(mktemp -d -p "$tmpbase") +hashed_partitions_dir=$(mktemp -d) +extract_dir=$(mktemp -d) # If user specified a partition_dir, we'll move files from here later # Stage 1: Run unblob on the provided firmware -extract_dir=$(mktemp -d -p "$tmpbase") log_scratch=$(mktemp) # Can't write unblob.log into / unblob -k "$firmware" -e "$extract_dir" --log "${log_scratch}" rm ${log_scratch} @@ -75,8 +73,19 @@ while read -r size file; do echo "Packaged $new_filename (size: $size, nfiles: $nfiles)" done < <(find "$hashed_partitions_dir" -type f -name "*.tar.gz" -print0 | xargs -0 du -s | sort -rn) -# Don't create root-owned files that end users can't delete in mapped directories -chmod 777 "${hashed_partitions_dir}/"* +# If we have a partition dir, move the files there +if [ -n "$partition_dir" ]; then + mv "$hashed_partitions_dir"/* "$partition_dir" + hashed_partitions_dir="$partition_dir" +fi # Now call unify -python3 -m unifyroot.cli "$hashed_partitions_dir" "$output" "$extract_dir" "$tmpbase" \ No newline at end of file +python3 -m unifyroot.cli "$hashed_partitions_dir" "$output" "$extract_dir" + +# Always delete extract dir +rm -rf "$extract_dir" + +# Delete partition dir if user didn't speciify one +if [ ! 
-n "$partition_dir" ]; then + rm -rf "$hashed_partitions_dir" +fi \ No newline at end of file diff --git a/unifyroot/unifyroot/cli.py b/unifyroot/unifyroot/cli.py index faf9338..0f1db62 100644 --- a/unifyroot/unifyroot/cli.py +++ b/unifyroot/unifyroot/cli.py @@ -2,7 +2,7 @@ from .common import FilesystemRepository, FilesystemLoader from .filesystemunifier import FilesystemUnifier -def unify_filesystems(input_path: str, output_path: str, tmp_dir: Optional[str] = None): +def unify_filesystems(input_path: str, output_path: str): ''' Given a directory (or a path to a .tar.gz within such a directory), examine all the archives and find an optimal way to unify them into a single filesystem. @@ -20,14 +20,14 @@ def unify_filesystems(input_path: str, output_path: str, tmp_dir: Optional[str] if output_path is None: output_path = input_path + "unified.tar.gz" - unifier.create_archive(loader.load_path, mount_points, output_path, tmp_dir) + unifier.create_archive(loader.load_path, mount_points, output_path) def main(): import sys if len(sys.argv) < 2: - print("Usage: unifyroot [tmp]") + print("Usage: unifyroot ") sys.exit(1) - unify_filesystems(sys.argv[1], sys.argv[2], sys.argv[3] if len(sys.argv) > 3 else None) + unify_filesystems(sys.argv[1], sys.argv[2]) if __name__ == "__main__": main() \ No newline at end of file From 0544c42ed79e4fe7ac3bf082c31f326a0a89f741 Mon Sep 17 00:00:00 2001 From: Andrew Fasano Date: Tue, 22 Oct 2024 15:52:56 -0400 Subject: [PATCH 10/22] fix(cli): silence import warning --- unifyroot/unifyroot/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unifyroot/unifyroot/__init__.py b/unifyroot/unifyroot/__init__.py index 57ce08b..f500b9e 100644 --- a/unifyroot/unifyroot/__init__.py +++ b/unifyroot/unifyroot/__init__.py @@ -1,5 +1,4 @@ from .common import FilesystemInfo, FilesystemRepository, FilesystemLoader from .filesystemunifier import FilesystemUnifier -from .cli import unify_filesystems __all__ = ['FilesystemInfo', 'FilesystemRepository', 
'FilesystemLoader', 'FilesystemUnifier', 'unify_filesystems'] \ No newline at end of file From c00adb91a12a508eaa5713a40771a9560688be0d Mon Sep 17 00:00:00 2001 From: Andrew Fasano Date: Tue, 22 Oct 2024 15:53:08 -0400 Subject: [PATCH 11/22] Cleanup wrapper scripts --- fw2tar | 72 ++++++++++++++++++++++++--------------- src/package_partitions.sh | 7 +++- 2 files changed, 51 insertions(+), 28 deletions(-) diff --git a/fw2tar b/fw2tar index 452fe3a..1345a67 100755 --- a/fw2tar +++ b/fw2tar @@ -26,6 +26,7 @@ fw2tar_run() { local partition_dir="" local force=false local scratch_dir="" + local output="" # Process each command-line argument while [[ $# -gt 0 ]]; do @@ -37,6 +38,7 @@ fw2tar_run() { echo "Wrapper-specific flags may be passed in *before* the fw2tar flags and args. If a value is required, it must be specified immediately after the flag with a space." echo " --build: Build the fw2tar container before running the specified command. If no other arguments are provided, the container will be built and the script will exit." echo " --build-singularity: Build the fw2tar container as a sif." + echo " --output: Specify the output file or directory to write the final archive to. If a directory, the archive will be named .tar.gz. If the file exists, it will be removed if --force is set." echo " --force: Remove existing output file and scratch directory if they exists." echo " --scratch_dir: Specify a directory to use as a scratch directory" echo " --partition_dir: Specify a directory to produce partition archives into. The directory will be deleted if --force is set." @@ -67,6 +69,10 @@ fw2tar_run() { scratch_dir="$2" shift 2 ;; + --output) + output="$2" + shift 2 + ;; --force) force=true shift @@ -140,41 +146,48 @@ fw2tar_run() { fi fi - # Second to last argument should be the firmware file. This file should already exist - if [[ -f "${cmd[-2]}" ]]; then - local arg="${cmd[-2]}" + # Last argument must be the firmware file. 
This file should already exist + if [[ -f "${cmd[-1]}" ]]; then + local arg="${cmd[-1]}" local abspath=$(realpath "$arg") local host_path=$(dirname "$abspath") local guest_path="/host_$(basename "$host_path")" maps+=("$host_path:$guest_path") - #cmd[-2]="/host_$(basename "$host_path")/$(basename "$arg")" firmware_file="/host_$(basename "$host_path")/$(basename "$arg")" else - echo "Fatal error: Firmware file not found: ${cmd[-2]}" + echo "Fatal error: Firmware file not found: ${cmd[-1]}" fi - # Final argument should be the output directory path. XXX now a filename - output_dir="${cmd[-1]}" + # If we have an output + if [[ ! -z "$output" ]]; then + # If it's a directory, we'll rename it to .tar.gz in the directory + if [[ -d "$output" ]]; then + output="$output/$(basename "$firmware_file").rootfs.tar.gz" + fi - if [ -e "$output_dir" ]; then - if $force; then - echo "Removing existing output file: $output_dir" - rm -f "$output_dir" - else - echo "Error: Output file already exists: $output_dir" - echo "Re-run with --force to remove it or delete it yourself first." - exit 1 + # Now it should be a file. If it exists, delete if force + if [[ -e "$output" ]]; then + if ! $force; then + echo "Error: Output file already exists: $output" + echo "Re-run with --force to remove it or delete it yourself first." 
+ exit 1 + fi + if $force; then + echo "Removing output file: $output" + rm -f "$output" + fi fi - fi - local arg="${cmd[-1]}" - local abspath=$(realpath "$arg") - local host_path=$(dirname "$abspath") - local guest_path="/host_$(basename "$host_path")" - maps+=("$host_path:$guest_path") - #cmd[-1]="/host_$(basename "$host_path")/$(basename "$arg")" - output_dir="/host_$(basename "$host_path")/$(basename "$arg")" + # If directory doesn't exist, create it + mkdir -p "$(dirname "$output")" + local abspath=$(realpath $(dirname "$output")) + local guest_path="/output" + maps+=("$abspath:$guest_path") + # Replace output with /output/ + # so it's valid in the container + output="/output/$(basename "$output")" + fi # If we have a partition_dir if [[ ! -z "$partition_dir" ]]; then @@ -227,20 +240,25 @@ fw2tar_run() { docker_cmd+=("$image") - #docker_cmd+=("fakeroot_fw2tar") + + local friendly_cmd="fakeroot /usr/local/bin/package_partitions.sh ${firmware_file}" docker_cmd+=("fakeroot" "/usr/local/bin/package_partitions.sh") docker_cmd+=("${firmware_file}") - docker_cmd+=("${output_dir}") + + if [[ ! -z "$output" ]]; then + docker_cmd+=("${output}") + friendly_cmd+=" ${output}" + fi # If we have partition_dir non-empty, add /partition_dir as final arg if [[ ! 
-z "$partition_dir" ]]; then docker_cmd+=("/partition_dir") + friendly_cmd+=" /partition_dir" fi if $verbose; then echo "${BOLD}Fw2tar command:${RESET}" - #echo " fw2tar ${cmd[@]}" - echo " fakeroot /usr/local/bin/package_partitions.sh ${cmd[@]}" + echo " ${friendly_cmd}" echo echo "${BOLD}Complete docker commands:${RESET}" diff --git a/src/package_partitions.sh b/src/package_partitions.sh index f234967..0f7351c 100755 --- a/src/package_partitions.sh +++ b/src/package_partitions.sh @@ -7,10 +7,15 @@ set -eu # Then we package potential rootfs files into tar.gz archives firmware="$1" -output="$2" +output="${2:-}" # Second argument, the output archive # third argument is optional partition_dir partition_dir=${3:-} +# if ouptut isn't set default is firmware+.rootfs.tar.gz +if [ -z "$output" ]; then + output="${firmware##*/}.rootfs.tar.gz" +fi + hashed_partitions_dir=$(mktemp -d) extract_dir=$(mktemp -d) # If user specified a partition_dir, we'll move files from here later From 01c17c4317e9939147906639fb6bd783920face8 Mon Sep 17 00:00:00 2001 From: Andrew Fasano Date: Tue, 22 Oct 2024 17:30:07 -0400 Subject: [PATCH 12/22] feat: Filter out extracted jars before unifyroot --- src/package_partitions.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/package_partitions.sh b/src/package_partitions.sh index 0f7351c..f4bdf08 100755 --- a/src/package_partitions.sh +++ b/src/package_partitions.sh @@ -29,6 +29,11 @@ rm ${log_scratch} # Delete all .uncompressed, .unknown, *.padding, and carved.elf files find "$extract_dir" -type f \( -name '*.uncompressed' -o -name '*.unknown' -o -name '*.padding' -o -name 'carved.elf' \) -delete +# Delete extractions we don't like: *.jar_extract (jar files) +find "$extract_dir" -type d \( -name '*.jar_extract' \) -exec rm -rf {} + +# Delete extracted debian packages +#find "$extract_dir" -type f -name 'debian-binary' -execdir sh -c 'test -f control.tar.xz -o -f control.tar.gz -o -f control.tar.bz2' \; -print -execdir rm -rf .. 
\; + # Find and delete debian packages - look for files named `debian_binary` that are in a directory named *_extract # Also search for 'control' files that have a 'Package:' line and delete their parent directory find "$extract_dir" -type f -name 'debian-binary' | while read -r debian_binary; do From 4010d60da1f6609f3d9b8fd3502f24a52ce4092b Mon Sep 17 00:00:00 2001 From: Andrew Fasano Date: Tue, 22 Oct 2024 17:30:29 -0400 Subject: [PATCH 13/22] Move check for shadowed files at a given mountpoint earlier --- unifyroot/unifyroot/filesystemunifier.py | 39 ++++++++++++++++-------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/unifyroot/unifyroot/filesystemunifier.py b/unifyroot/unifyroot/filesystemunifier.py index e4d90ce..a5d64c4 100644 --- a/unifyroot/unifyroot/filesystemunifier.py +++ b/unifyroot/unifyroot/filesystemunifier.py @@ -125,7 +125,7 @@ def _find_best_mount_point(self, cur_mounts: Dict[str, str], fs_info: Filesystem best_mount_point = None best_score_improvement = 0 visible_paths = self._get_visible_paths(cur_mounts) - potential_mounts = self._find_potential_mount_points(cur_mounts, fs_info, unresolved_paths, symlinks) + potential_mounts = self._find_potential_mount_points(cur_mounts, fs_info, unresolved_paths, symlinks, visible_paths) for potential_mount_point in potential_mounts: resolved_paths = self._get_resolved_paths(visible_paths, potential_mount_point, fs_info, unresolved_paths) @@ -138,19 +138,13 @@ def _find_best_mount_point(self, cur_mounts: Dict[str, str], fs_info: Filesystem # XXX: We don't want to lose/shadow too many files. 
Specifically we probably don't want to lose files # from our root filesystem, but shadowing files is generally probably bad - lost_files = [] - for _, files in visible_paths.items(): - lost_files.extend([x for x in files if x.startswith(potential_mount_point)]) - print(f"\t{cur_mounts} + {fs_info.name} @ {potential_mount_point} resolves {len(resolved_paths)} paths, adds {total_files_in_mount} files, loses {len(lost_files)} to get {len(total_files_with_mount)} total files") + print(f"\t{cur_mounts} + {fs_info.name} @ {potential_mount_point} resolves {len(resolved_paths)} paths, adds {total_files_in_mount} files to get {len(total_files_with_mount)} total files") print(f"\t\t {' '.join(resolved_paths[:10])}") # XXX: is our improvement just the number of resolved paths? # What if this mount just resolves like 1 path and adds a bunch of broken references? On the other hand, what if it's just 1 path and we're fixing it - if len(lost_files) > 5: - # Probably bad, we don't want to shadow too many files - score_improvement = 0 - elif len(resolved_paths) > 2: + if len(resolved_paths) > 2: # If we resolve more than 2 paths, we're probably doing well score_improvement = len(resolved_paths) elif len(resolved_paths) == 0: @@ -248,7 +242,21 @@ def _get_relative_path(path: str, mount_point: str) -> str: return path[len(mount_point):].lstrip('/') return path - def _find_potential_mount_points(self, cur_mounts: Dict[str, str], fs_info: FilesystemInfo, unresolved_paths: Set[str], symlinks: Dict[str, str]) -> List[str]: + def _count_shadowed(self, potential_mount_point: str, + visible_paths: Dict[str, Set[str]]) -> int: + ''' + If we mounted another filesystem at potential_mount_point, how many unique files would be shadowed (lost)? 
+ ''' + lost_files = [] + for _, files in visible_paths.items(): + lost_files.extend([x for x in files if x.startswith(potential_mount_point)]) + return len(set(lost_files)) + + def _find_potential_mount_points(self, cur_mounts: Dict[str, str], + fs_info: FilesystemInfo, + unresolved_paths: Set[str], + symlinks: Dict[str, str], + visible_paths: Dict[str, Set[str]]) -> List[str]: """ Finds potential mount points for a filesystem based on unresolved paths. @@ -273,8 +281,15 @@ def _find_potential_mount_points(self, cur_mounts: Dict[str, str], fs_info: File if self._is_unlikely_mount(potential_mount_point): continue - if self._is_valid_new_mount_point(potential_mount_point, cur_mounts): - mount_point_candidates[potential_mount_point].add(unresolved_path) + if not self._is_valid_new_mount_point(potential_mount_point, cur_mounts): + continue + + if self._count_shadowed(potential_mount_point, visible_paths) > 5: + # Hyperparameter: don't shadow too many files. Is 5 a good threshold? + # Too many files are shadowed by this mount point + continue + + mount_point_candidates[potential_mount_point].add(unresolved_path) # Step 2: Evaluate each potential mount point potential_mount_points: Dict[str, int] = {} From 1e2a8c25b9ca52a5b6c7413eb641d2b6b8b56dee Mon Sep 17 00:00:00 2001 From: Andrew Fasano Date: Tue, 22 Oct 2024 18:48:21 -0400 Subject: [PATCH 14/22] Fix: clobber fw2tar.sif when building singularity --- fw2tar | 1 + 1 file changed, 1 insertion(+) diff --git a/fw2tar b/fw2tar index 1345a67..d3255a5 100755 --- a/fw2tar +++ b/fw2tar @@ -133,6 +133,7 @@ fw2tar_run() { -v $(pwd):/output \ --privileged -t \ --rm quay.io/singularity/docker2singularity:v3.9.0 rehosting/fw2tar + rm -f fw2tar.sif mv rehosting_fw2tar*.sif fw2tar.sif echo "$image built. Exiting." 
From a4aa0fd118d6bd23db5e4211c9315715f34f8c5d Mon Sep 17 00:00:00 2001 From: Andrew Fasano Date: Tue, 22 Oct 2024 18:48:28 -0400 Subject: [PATCH 15/22] Do not generate empty archive on failure --- unifyroot/unifyroot/cli.py | 4 ++++ unifyroot/unifyroot/filesystemunifier.py | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/unifyroot/unifyroot/cli.py b/unifyroot/unifyroot/cli.py index 0f1db62..95a45a6 100644 --- a/unifyroot/unifyroot/cli.py +++ b/unifyroot/unifyroot/cli.py @@ -20,6 +20,10 @@ def unify_filesystems(input_path: str, output_path: str): if output_path is None: output_path = input_path + "unified.tar.gz" + if len(mount_points) == 0: + print("No mount points found, not creating archive.") + return + unifier.create_archive(loader.load_path, mount_points, output_path) def main(): diff --git a/unifyroot/unifyroot/filesystemunifier.py b/unifyroot/unifyroot/filesystemunifier.py index a5d64c4..4aebc8c 100644 --- a/unifyroot/unifyroot/filesystemunifier.py +++ b/unifyroot/unifyroot/filesystemunifier.py @@ -441,7 +441,6 @@ def _get_visible_paths(self, mount_points: Dict[str, str]) -> Dict[str, Set[str] def create_archive(self, archive_dir, mounts, output, tmp_base=None): # Create a temporary directory, then extract filesystems from self.repository at the mount # points and package it up - #with tempfile.TemporaryDirectory() as temp_dir: # If tmp_base is None, we'll use a temporary directory in the system's default location # otherwise it's within the specified dir From 2589d3897a1519733d308519770a31c05a2a518d Mon Sep 17 00:00:00 2001 From: Andrew Fasano Date: Fri, 25 Oct 2024 14:00:30 -0400 Subject: [PATCH 16/22] fix(filesystemunifier): resolve symlinks when creating outputs --- unifyroot/unifyroot/filesystemunifier.py | 46 ++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/unifyroot/unifyroot/filesystemunifier.py b/unifyroot/unifyroot/filesystemunifier.py index 4aebc8c..6eaf8eb 100644 --- 
a/unifyroot/unifyroot/filesystemunifier.py +++ b/unifyroot/unifyroot/filesystemunifier.py @@ -275,9 +275,13 @@ def _find_potential_mount_points(self, cur_mounts: Dict[str, str], for fs_path in fs_info.paths: potential_mount_point = self._get_potential_mount_point(unresolved_path, fs_path) if potential_mount_point and potential_mount_point != '.': + old = potential_mount_point while potential_mount_point in symlinks: # "resolve" symlink potential_mount_point = symlinks[potential_mount_point] + if old != potential_mount_point: + assert(unresolved_path.startswith(old)), f"Unresolved path {unresolved_path} doesn't start with {old}" + unresolved_path = potential_mount_point + unresolved_path[len(old):] if self._is_unlikely_mount(potential_mount_point): continue @@ -438,6 +442,44 @@ def _get_visible_paths(self, mount_points: Dict[str, str]) -> Dict[str, Set[str] return visible_paths + def _resolve_symlinks(self, dest: str, base_dir: str) -> str: + ''' + Given an absolute path like /tmp/extraction/usr/opt/bin with base_dir of /tmp/extraction, + walk through and replace symlinks but ensure we stay within base_dir. + ''' + if dest.startswith(base_dir): + dest = dest[len(base_dir):] + + if dest.startswith("/"): + dest = dest[1:] + + # Dest is relative to base_dir now. 
+ # Iteratively build up full_dest (relative to base_dir), replacing symlinks as we go + + safe_path = "" + for part in dest.split("/"): + this_path = os.path.join(*[base_dir, safe_path, part]) + print(f"Testing: {base_dir} / {safe_path} / {part}") + if os.path.islink(this_path): + # We need to resolve the base_dir/safe_path/part is a symlink, we need to resolve it, + # then make it relative to base_dir + link_dest = os.readlink(this_path) + #print("\tLink from",this_path, "to", link_dest) + if link_dest.startswith("/"): + # Absolute symlink, just drop the leading slash since it will be relative + safe_path = link_dest[1:] + #print("\tAbsolute symlink, new path is", safe_path) + else: + # Relative symlink, append it to the current path + safe_path = os.path.join(safe_path, link_dest) + #print("\tRelative symlink, new path is", safe_path) + # XXX: what about ../ in the symlink? We should probably resolve it? + else: + safe_path = os.path.join(safe_path, part) + + assert(not(safe_path.startswith("/"))) + return os.path.join(base_dir, safe_path) + def create_archive(self, archive_dir, mounts, output, tmp_base=None): # Create a temporary directory, then extract filesystems from self.repository at the mount # points and package it up @@ -455,6 +497,10 @@ def create_archive(self, archive_dir, mounts, output, tmp_base=None): if not os.path.commonpath([temp_dir, dest]) == temp_dir: raise ValueError(f"Destination {dest} is not within {temp_dir}") + # Resolve symlinks _before_ we create the archive, i.e., + # if /opt -> /tmp/opt and we (foolishly) said we wanted to mount this fs at /opt, + dest = self._resolve_symlinks(dest, temp_dir) + # Create the directory if it doesn't exist os.makedirs(dest, exist_ok=True) From 5252b051f11af7c59c66cb3bdcb5d943b079ddd5 Mon Sep 17 00:00:00 2001 From: Andrew Fasano Date: Fri, 25 Oct 2024 14:00:04 -0400 Subject: [PATCH 17/22] fw2tar: improved argument parsing and errors on invalid --- fw2tar | 44 +++++++++++++++++++++++++++----------------- 
1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/fw2tar b/fw2tar index d3255a5..32fe7be 100755 --- a/fw2tar +++ b/fw2tar @@ -27,12 +27,13 @@ fw2tar_run() { local force=false local scratch_dir="" local output="" + local fw="" # Process each command-line argument while [[ $# -gt 0 ]]; do case "$1" in --wrapper-help) - echo "Usage: fw2tar [WRAPPER FLAGS] [FLAGS] [FIRMWARE_BLOB] [FINAL_ARCHIVE.tar.gz]" + echo "Usage: fw2tar [WRAPPER FLAGS] [FLAGS] [FIRMWARE_BLOB]" echo "Wrapper script for running FW2TAR in a Docker container" echo "" echo "Wrapper-specific flags may be passed in *before* the fw2tar flags and args. If a value is required, it must be specified immediately after the flag with a space." @@ -49,7 +50,9 @@ fw2tar_run() { echo "All other arguments will be passed through to the main fw2tar command in the container." echo "For example try:" echo " fw2tar --help" - echo " fw2tar ./your_firmware.bin" + echo " fw2tar ./fws/your_firmware.bin" + echo " fw2tar --partition_dir partitions/your_firmware ./fws/your_firmware.bin" + echo " fw2tar --partition_dir partitions/your_firmware --output unified/your_firmware.tar.gz ./fws/your_firmware.bin" exit 0 ;; --build) @@ -85,17 +88,21 @@ fw2tar_run() { verbose=true shift ;; - *) # Default case: If no more known options, keep as part of command - cmd=("$@") + *) + # Default case: If no more known options, keep as part of command + fw="$1" + shift + # If there are any more args, it's an error + if [[ $# -gt 0 ]]; then + echo "Error: Unknown argument: $1. Parsed $fw as firmware file." 
+ exit 1 + fi break ;; esac done # If command is empty, parse any un-shifted args into array - probably means we were run with wrapper flags only (i.e., --build) - if [[ ${#cmd[@]} -eq 0 ]]; then - cmd=("$@") - fi # If verbose, log all wrapper args and command if $verbose; then @@ -103,7 +110,7 @@ fw2tar_run() { echo " build: $build" echo " image: $image" echo " verbose: $verbose" - echo " fw2tar cmd: ${cmd[*]}" + echo " fw: $fw" echo fi @@ -140,23 +147,21 @@ fw2tar_run() { exit 0 fi - # If we have no other args, exit 0 - if [[ ${#cmd[@]} -eq 0 ]]; then - echo "$image built. Exiting as no command was specified." + if [[ -z "$fw" ]]; then + echo "$image built. Exiting as no firmware file was specified." exit 0 fi fi # Last argument must be the firmware file. This file should already exist - if [[ -f "${cmd[-1]}" ]]; then - local arg="${cmd[-1]}" - local abspath=$(realpath "$arg") + if [[ -f "${fw}" ]]; then + local abspath=$(realpath "$fw") local host_path=$(dirname "$abspath") local guest_path="/host_$(basename "$host_path")" maps+=("$host_path:$guest_path") - firmware_file="/host_$(basename "$host_path")/$(basename "$arg")" + firmware_file="/host_$(basename "$host_path")/$(basename "$fw")" else - echo "Fatal error: Firmware file not found: ${cmd[-1]}" + echo "Fatal error: Firmware file not found: ${fw}" fi # If we have an output @@ -178,7 +183,12 @@ fw2tar_run() { rm -f "$output" fi fi + else + # No output specified, so we'll use the firmware file's directory + output=$(basename "$firmware_file").rootfs.tar.gz + fi + if [[ ! -z "$output" ]]; then # If directory doesn't exist, create it mkdir -p "$(dirname "$output")" local abspath=$(realpath $(dirname "$output")) @@ -213,7 +223,7 @@ fw2tar_run() { # If we have a scratch dir map it in as /tmp if [[ ! 
-z "$scratch_dir" ]]; then mkdir -p "$scratch_dir" - local abspath=$(realpath "$partition_dir") + local abspath=$(realpath "$scratch_dir") local guest_path="/tmp" maps+=("$abspath:$guest_path") fi From 77bb31b8dbcb2bc8f1db621f2149b5b393c9c49d Mon Sep 17 00:00:00 2001 From: Andrew Fasano Date: Fri, 25 Oct 2024 15:23:42 -0400 Subject: [PATCH 18/22] Sort archive and fix mtime to make deterministic --- src/package_partitions.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/package_partitions.sh b/src/package_partitions.sh index f4bdf08..24190bc 100755 --- a/src/package_partitions.sh +++ b/src/package_partitions.sh @@ -55,6 +55,8 @@ find $extract_dir -type d \( -name "*_carve" -o -name "*_extract" \) | while rea # Create the archive, excluding subdirectories ending with '_carve' '_extract' or '_uncompressed' # Also filter out ###-####.[ext] files, which are almost always unblob artifacts (e.g., 0-100.lzma) tar -czf "$temp_archive" \ + --sort=name \ + --mtime="UTC 2019-01-01" \ --exclude='*_carve' --exclude='*_extract' --exclude '*.uncompressed' \ --exclude='[0-9]*-[0-9]*.*' \ -C "$dir" . 
From 0a51a71f29aee70aeabb176dd0459328ced0b826 Mon Sep 17 00:00:00 2001 From: Andrew Fasano Date: Fri, 25 Oct 2024 15:27:48 -0400 Subject: [PATCH 19/22] Unifyroot: deterministic outputs --- unifyroot/unifyroot/filesystemunifier.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/unifyroot/unifyroot/filesystemunifier.py b/unifyroot/unifyroot/filesystemunifier.py index 6eaf8eb..4f5470c 100644 --- a/unifyroot/unifyroot/filesystemunifier.py +++ b/unifyroot/unifyroot/filesystemunifier.py @@ -459,7 +459,7 @@ def _resolve_symlinks(self, dest: str, base_dir: str) -> str: safe_path = "" for part in dest.split("/"): this_path = os.path.join(*[base_dir, safe_path, part]) - print(f"Testing: {base_dir} / {safe_path} / {part}") + #print(f"Testing: {base_dir} / {safe_path} / {part}") if os.path.islink(this_path): # We need to resolve the base_dir/safe_path/part is a symlink, we need to resolve it, # then make it relative to base_dir @@ -501,11 +501,18 @@ def create_archive(self, archive_dir, mounts, output, tmp_base=None): # if /opt -> /tmp/opt and we (foolishly) said we wanted to mount this fs at /opt, dest = self._resolve_symlinks(dest, temp_dir) + # Ensure it's _still_ within temp_dir + if not os.path.commonpath([temp_dir, dest]) == temp_dir: + raise ValueError(f"Destination {dest} is not within {temp_dir}") + # Create the directory if it doesn't exist os.makedirs(dest, exist_ok=True) # Extract - subprocess.check_output(["tar", "xf", src, "-C", dest]) + subprocess.check_output(["tar", "xf", src, "-C", dest, + "--keep-directory-symlink", #??? 
+ "--skip-old-files", + ]) # Log the best mount points into a file with open(os.path.join(temp_dir, ".mounts.csv"), "w") as f: @@ -514,4 +521,8 @@ def create_archive(self, archive_dir, mounts, output, tmp_base=None): f.write(f"{archive},{mount_point}\n") # All done - package it up - subprocess.check_output(["tar", "czf", output, "-C", temp_dir, "."]) \ No newline at end of file + subprocess.check_output(["tar", + "--sort=name", + "--mtime=UTC 2019-01-01", + "-czf", output, + "-C", temp_dir, "."]) \ No newline at end of file From 0c4993603ba978dacff3dcc6666b8a443f025713 Mon Sep 17 00:00:00 2001 From: Andrew Fasano Date: Fri, 25 Oct 2024 16:03:25 -0400 Subject: [PATCH 20/22] Prioritize larger rootfses in search --- unifyroot/unifyroot/filesystemunifier.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/unifyroot/unifyroot/filesystemunifier.py b/unifyroot/unifyroot/filesystemunifier.py index 4f5470c..81309b1 100644 --- a/unifyroot/unifyroot/filesystemunifier.py +++ b/unifyroot/unifyroot/filesystemunifier.py @@ -29,9 +29,16 @@ def unify(self) -> Dict[str, str]: # Try each filesystem as a potential root - with each consider how we could mount # others + potential_roots = [] # (root_fs_name, num_files) for root_fs_name, root_fs_info in self.repository.get_all_filesystems().items(): if not self._could_be_root(root_fs_info): continue + potential_roots.append((root_fs_name, len(root_fs_info.paths))) + + # Now, for each potential root from largest to smallest, try to unify + # It's important we go from biggest to smallest because in the case of ties + # the bigger root is more likely to be right + for root_fs_name, _ in sorted(potential_roots, key=lambda x: x[1], reverse=True): initial = {"./": root_fs_name} mount_points, score = self._try_unify_from(initial) if score > best_score: @@ -71,7 +78,8 @@ def _try_unify_from(self, mount_points: Dict[str, str]) -> Tuple[Dict[str, str], best_score = 
self._calculate_configuration_score(mount_points, unresolved_paths) - print(f"{mount_points} has score {best_score}. Trying to improve with more filesystems...") + mount_points_s = " & ".join(f"{k} -> {v.replace('.tar.gz','')}" for k,v in mount_points.items()) + print(f"{mount_points_s} has score {best_score}. Trying to improve with more filesystems...") # Collect symlinks that exist within the mount points we've defined # We *should not* add a filesystem at a symlink, that wouldn't really make sense @@ -139,8 +147,13 @@ def _find_best_mount_point(self, cur_mounts: Dict[str, str], fs_info: Filesystem # XXX: We don't want to lose/shadow too many files. Specifically we probably don't want to lose files # from our root filesystem, but shadowing files is generally probably bad - print(f"\t{cur_mounts} + {fs_info.name} @ {potential_mount_point} resolves {len(resolved_paths)} paths, adds {total_files_in_mount} files to get {len(total_files_with_mount)} total files") - print(f"\t\t {' '.join(resolved_paths[:10])}") + cur_mounts_s = " & ".join(f"{k} -> {v.replace('.tar.gz','')}" for k,v in cur_mounts.items()) + print(f"\t{cur_mounts_s} + {fs_info.name} @ {potential_mount_point}") + #print(f"\t\tResolves {len(resolved_paths)} paths") + #print(f"\t\tAdds {total_files_in_mount} files") + #print(f"\t\tTotal of {len(total_files_with_mount)}") + #print(f"\t\tResolved: {len(resolved_paths)}") + #print(f"\t\tUnresolved: {len(unresolved_paths)}") # XXX: is our improvement just the number of resolved paths? # What if this mount just resolves like 1 path and adds a bunch of broken references? 
On the other hand, what if it's just 1 path and we're fixing it From ade6e36e6c8e70711928e8345670511f48e553df Mon Sep 17 00:00:00 2001 From: Andrew Fasano Date: Fri, 25 Oct 2024 16:42:36 -0400 Subject: [PATCH 21/22] feat(unifyroot): Support running as non-root, but warn --- unifyroot/unifyroot/filesystemunifier.py | 44 +++++++++++++++++++++--- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/unifyroot/unifyroot/filesystemunifier.py b/unifyroot/unifyroot/filesystemunifier.py index 81309b1..2712c76 100644 --- a/unifyroot/unifyroot/filesystemunifier.py +++ b/unifyroot/unifyroot/filesystemunifier.py @@ -493,6 +493,35 @@ def _resolve_symlinks(self, dest: str, base_dir: str) -> str: assert(not(safe_path.startswith("/"))) return os.path.join(base_dir, safe_path) + @staticmethod + def get_device_files(archive_path): + # List all files in the tar archive and identify device files + device_files = [] + tar_list = subprocess.check_output(["tar", "tvf", archive_path]).decode("utf-8").splitlines() + for entry in tar_list: + # Device files typically have 'b' or 'c' as the first character in permissions + if entry.startswith(('b', 'c')): + # Extract the file path + file_path = entry.split()[-1] + device_files.append(file_path) + return device_files + + @staticmethod + def extract_tar(src, dest, exclude=None): + + if not exclude: + exclude_args = [] + else: + exclude_args = ["--exclude=" + x for x in exclude] + + # Run tar extraction with exclusion of device files + tar_command = ["tar", "xf", src, "-C", dest, + "--keep-directory-symlink", + "--skip-old-files", + ] + exclude_args + + subprocess.check_output(tar_command) + def create_archive(self, archive_dir, mounts, output, tmp_base=None): # Create a temporary directory, then extract filesystems from self.repository at the mount # points and package it up @@ -521,11 +550,16 @@ def create_archive(self, archive_dir, mounts, output, tmp_base=None): # Create the directory if it doesn't exist os.makedirs(dest, 
exist_ok=True) - # Extract - subprocess.check_output(["tar", "xf", src, "-C", dest, - "--keep-directory-symlink", #??? - "--skip-old-files", - ]) + # Extract. If non-root we have to filter out devices + # to avoid an error with tar trying to mknod + if os.getuid() != 0: + device_files = FilesystemUnifier.get_device_files(src) + if len(device_files) > 0: + print(f"Warning: Not running as (fake)root, dropping {len(device_files)} device files from {fs_info.name}") + print("THIS IS BAD YOU SHOULD RERUN WITH FAKEROOT") + self.extract_tar(src, dest, exclude=device_files) + else: + self.extract_tar(src, dest) # Log the best mount points into a file with open(os.path.join(temp_dir, ".mounts.csv"), "w") as f: From 1d3e06f1a2c14ae7482a331c168c7b6beefe7b61 Mon Sep 17 00:00:00 2001 From: Andrew Fasano Date: Fri, 25 Oct 2024 17:33:18 -0400 Subject: [PATCH 22/22] Raise ValueError on recursive symlinks --- unifyroot/unifyroot/filesystemunifier.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unifyroot/unifyroot/filesystemunifier.py b/unifyroot/unifyroot/filesystemunifier.py index 2712c76..d7ed59d 100644 --- a/unifyroot/unifyroot/filesystemunifier.py +++ b/unifyroot/unifyroot/filesystemunifier.py @@ -473,6 +473,7 @@ def _resolve_symlinks(self, dest: str, base_dir: str) -> str: for part in dest.split("/"): this_path = os.path.join(*[base_dir, safe_path, part]) #print(f"Testing: {base_dir} / {safe_path} / {part}") + old_path = safe_path if os.path.islink(this_path): # We need to resolve the base_dir/safe_path/part is a symlink, we need to resolve it, # then make it relative to base_dir @@ -486,6 +487,8 @@ def _resolve_symlinks(self, dest: str, base_dir: str) -> str: # Relative symlink, append it to the current path safe_path = os.path.join(safe_path, link_dest) #print("\tRelative symlink, new path is", safe_path) + if old_path == safe_path: + raise ValueError("Symlink loop detected") # TODO - something? # XXX: what about ../ in the symlink? We should probably resolve it? 
else: safe_path = os.path.join(safe_path, part)