Skip to content

Implement a native PAK decoder #406

Description

@rdw-software

Details:

I'm fairly sure the proof of concept (POC) worked, but I haven't tested it recently. I might as well dump the code here:

local jit = require("jit")

-- local LOGFILE_HANDLE = assert(io.open("jit.log", "w+"))

local ffi = require("ffi")

-- Namespace for the Arcturus PAK decoder: format constants plus the C declarations
-- used to reinterpret raw bytes read from the archive.
local ArcturusPAK = {
	-- Number of bytes read from the end of the file as the footer (4 + 4 + 1).
	-- This is NOT ffi.sizeof("ArcturusPAK"): the struct carries trailing padding.
	FOOTER_SIZE_IN_BYTES = 9,
	-- Value the footer's versionTag must have for the archive to be accepted
	MAGIC_VERSION_TAG = 18,
	RECORD_TYPE_FILE = 1,
	RECORD_TYPE_DIRECTORY = 2,
	-- Fixed-width types are used for the footer because `unsigned long` is 8 bytes
	-- on LP64 platforms, which would move versionTag beyond the 9 bytes actually read.
	-- NOTE(review): ArcturusFileRecord is not packed, so the compiler inserts padding
	-- after `type` and ffi.sizeof reports 16 bytes - confirm this matches the on-disk layout.
	cdefs = [[
		typedef struct ArcturusPAK {
			uint32_t offset;
			uint32_t numRecords;
			uint8_t versionTag;
		}  ArcturusPAK;

		typedef struct ArcturusFileRecord {
			unsigned char pathSize;
			unsigned char type;
			int offset;
			int compressedSize;
			int decompressedSize;
		} ArcturusFileRecord;
	]],
}

ffi.cdef(ArcturusPAK.cdefs)

-- Blocking load using standard Lua io library
-- Opens a PAK archive and parses the footer stored in its last FOOTER_SIZE_IN_BYTES bytes.
-- @param filePath Path of the archive to open (passed to io.open in binary read mode)
-- @return A pakInfo table with the fields handle (open file), offset, numRecords and versionTag
-- Raises a string error (level 0) if the file cannot be opened, is too small to hold a
-- footer, or carries an unexpected version tag. The handle is closed before raising, since
-- the caller never receives it in those cases.
function ArcturusPAK:Open(filePath)
	local pakFileHandle = io.open(filePath, "rb")
	if not pakFileHandle then
		error("Failed to open PAK file " .. filePath .. " (no such file exists)", 0)
	end

	-- Releases the file handle before propagating a validation failure
	local function fail(message)
		pakFileHandle:close()
		error(message, 0)
	end

	local EOF = pakFileHandle:seek("end")
	if not EOF or EOF < self.FOOTER_SIZE_IN_BYTES then
		fail("Failed to open PAK file " .. filePath .. " (not a valid PAK file)")
	end
	pakFileHandle:seek("set", EOF - self.FOOTER_SIZE_IN_BYTES)

	local metadata = pakFileHandle:read(self.FOOTER_SIZE_IN_BYTES)
	if not metadata or #metadata < self.FOOTER_SIZE_IN_BYTES then
		fail("Failed to open PAK file " .. filePath .. " (not a valid PAK file)")
	end

	-- The cast yields a pointer into the Lua string, so the fields are copied into
	-- plain Lua numbers right away instead of keeping the pointer around.
	local header = ffi.cast("ArcturusPAK*", metadata)
	local offset = tonumber(header.offset)
	local numRecords = tonumber(header.numRecords)
	local versionTag = tonumber(header.versionTag)

	if versionTag ~= ArcturusPAK.MAGIC_VERSION_TAG then
		fail("Invalid PAK version tag " .. versionTag .. " (" .. ArcturusPAK.MAGIC_VERSION_TAG .. " expected)")
	end

	return {
		handle = pakFileHandle,
		offset = offset,
		numRecords = numRecords,
		versionTag = versionTag,
	}
end

-- Reads all file records starting at pakInfo.offset and caches them in pakInfo.records.
-- @param pakInfo Table returned by ArcturusPAK:Open (needs an open handle, offset and numRecords)
-- @return The records table: entries are stored both by position (1..numRecords) and by path
-- string; every entry is a Lua table with pathSize, type, offset, compressedSize,
-- decompressedSize, pathString and cdata (an owned copy of the raw record struct).
-- Raises a string error (level 0) if the handle is not an open file or the file ends early.
function ArcturusPAK:ReadTableOfContents(pakInfo)
	if io.type(pakInfo.handle) ~= "file" then
		error("Failed to read table of contents (invalid PAK file handle)", 0)
	end

	local handle = pakInfo.handle
	handle:seek("set", pakInfo.offset)

	local recordSize = ffi.sizeof("ArcturusFileRecord")
	local records = {}

	-- Even for HUGE archives this should be small enough to keep in memory
	for index = 1, pakInfo.numRecords do
		local recordHeader = handle:read(recordSize)
		if not recordHeader or #recordHeader < recordSize then
			error("Failed to read table of contents (unexpected end of file)", 0)
		end

		-- Copy the bytes into a struct owned by the cdata object. A pointer cast would
		-- reference the temporary Lua string and dangle as soon as it is collected.
		local record = ffi.new("ArcturusFileRecord")
		ffi.copy(record, recordHeader, recordSize)

		-- The path is stored with one extra trailing byte that is not part of the name
		local pathSize = tonumber(record.pathSize)
		local rawPath = handle:read(pathSize + 1)
		if not rawPath or #rawPath < pathSize + 1 then
			error("Failed to read table of contents (unexpected end of file)", 0)
		end
		local pathString = rawPath:sub(1, pathSize)

		local recordData = {
			pathSize = pathSize,
			type = tonumber(record.type),
			offset = tonumber(record.offset),
			compressedSize = tonumber(record.compressedSize),
			decompressedSize = tonumber(record.decompressedSize),
			pathString = pathString,
			cdata = record,
		}

		-- Positional and path-based lookups share the same entry
		records[index] = recordData
		records[pathString] = recordData
	end

	pakInfo.records = records
	return records
end

-- Returns the raw (still compressed) bytes stored for the given record.
-- @param pakInfo Table returned by ArcturusPAK:Open, with the table of contents already read
-- @param pathString Path of the record inside the archive
-- @return The result of reading record.compressedSize bytes at record.offset
-- Raises a string error (level 0) if the handle is not an open file, the table of contents
-- is missing, the path is unknown, or the record is a directory.
function ArcturusPAK:GetCompressedFileContents(pakInfo, pathString)
	-- The message is only assembled once a check has actually failed
	local function fail(reason)
		error("Failed to get compressed file contents for record " .. pathString .. " (" .. reason .. ")", 0)
	end

	local archive = pakInfo.handle
	if io.type(archive) ~= "file" then
		fail("invalid PAK file handle")
	end

	local tableOfContents = pakInfo.records
	if not tableOfContents then
		fail("table of contents not read")
	end

	local entry = tableOfContents[pathString]
	if not entry then
		fail("invalid path string")
	end

	if entry.type == ArcturusPAK.RECORD_TYPE_DIRECTORY then
		fail("it's a directory")
	end

	archive:seek("set", entry.offset)
	local rawBytes = archive:read(entry.compressedSize)
	return rawBytes
end

-- Reads the compressed bytes of a record and hands them to DecompressFileContents.
-- @param pakInfo Table returned by ArcturusPAK:Open, with the table of contents already read
-- @param pathString Path of the record inside the archive
-- @return Whatever self:DecompressFileContents returns for the record's bytes
-- Raises a string error (level 0) if the handle is not an open file, the table of contents
-- is missing, the path is unknown, or the record is a directory.
function ArcturusPAK:GetDecompressedFileContents(pakInfo, pathString)
	-- Checks run in a fixed order; the first one that fails determines the reason
	local failureReason
	if io.type(pakInfo.handle) ~= "file" then
		failureReason = "invalid PAK file handle"
	elseif not pakInfo.records then
		failureReason = "table of contents not read"
	elseif not pakInfo.records[pathString] then
		failureReason = "invalid path string"
	elseif pakInfo.records[pathString].type == ArcturusPAK.RECORD_TYPE_DIRECTORY then
		failureReason = "it's a directory"
	end

	if failureReason then
		error(
			"Failed to get decompressed file contents for record " .. pathString .. " (" .. failureReason .. ")",
			0
		)
	end

	local entry = pakInfo.records[pathString]
	pakInfo.handle:seek("set", entry.offset)
	local rawBytes = pakInfo.handle:read(entry.compressedSize)

	return self:DecompressFileContents(rawBytes, entry.decompressedSize)
end

local bit = require("bit")

-- NOTE(review): not referenced anywhere in the visible code - confirm whether it is still needed
local CHUNK_SIZE = 1024

-- Pure Lua decoder for the compressed record format (an LZSS-like scheme).
-- The stream is a sequence of groups: one mask byte followed by up to eight items. For each
-- mask bit (lowest bit first) a 0 means "copy one literal byte", a 1 means "two-byte
-- back-reference": the upper nibble of the second byte plus 2 is the run length, the lower
-- nibble (as high bits) combined with the first byte is the distance back into the output.
-- @param size_compressed Number of bytes of compressed_data to consume
-- @param size_original Expected output size (unused here; kept for signature parity with decompressFFI)
-- @param compressed_data Lua string holding the compressed bytes
-- @return The decoded bytes as a Lua string
-- Raises a string error (level 0) on truncated input or a back-reference that points
-- outside of the bytes decoded so far.
local function decompress(size_compressed, size_original, compressed_data)
	-- string.char(unpack(...)) can only handle a limited number of values per call,
	-- so the output is converted to a string in slices of this size.
	local MAX_VALUES_PER_CALL = 1024

	local band, lshift, rshift = bit.band, bit.lshift, bit.rshift
	local byte_at = string.byte

	local output, output_size = {}, 0
	local read_pos = 1

	while read_pos <= size_compressed do
		local mask = byte_at(compressed_data, read_pos)
		if not mask then
			error("Failed to decompress (compressed data is truncated)", 0)
		end
		read_pos = read_pos + 1

		for _ = 1, 8 do
			-- The last group may contain fewer than eight items
			if read_pos > size_compressed then
				break
			end

			if band(mask, 1) == 1 then
				local low, high = byte_at(compressed_data, read_pos, read_pos + 1)
				if not high then
					error("Failed to decompress (compressed data is truncated)", 0)
				end
				read_pos = read_pos + 2

				local run_length = rshift(high, 4) + 2
				local distance = lshift(band(high, 0x0F), 8) + low

				-- Copy byte by byte: source and destination ranges may overlap
				for _ = 1, run_length do
					local value = output[output_size + 1 - distance]
					if not value then
						error("Failed to decompress (invalid back-reference)", 0)
					end
					output_size = output_size + 1
					output[output_size] = value
				end
			else
				local literal = byte_at(compressed_data, read_pos)
				if not literal then
					error("Failed to decompress (compressed data is truncated)", 0)
				end
				output_size = output_size + 1
				output[output_size] = literal
				read_pos = read_pos + 1
			end

			mask = rshift(mask, 1)
		end
	end

	local pieces = {}
	for first = 1, output_size, MAX_VALUES_PER_CALL do
		local last = math.min(first + MAX_VALUES_PER_CALL - 1, output_size)
		pieces[#pieces + 1] = string.char(unpack(output, first, last))
	end
	return table.concat(pieces)
end

-- FFI-based variant of the decoder: same stream format as decompress, but working on
-- uint8_t buffers instead of Lua tables. One mask byte precedes up to eight items; a 0 bit
-- selects a literal byte, a 1 bit a two-byte back-reference (run length = upper nibble of
-- the second byte + 2, distance = lower nibble as high bits combined with the first byte).
-- @param size_compressed Number of bytes of compressed_data to consume
-- @param size_original Size of the output buffer; the result always has exactly this length
-- @param compressed_data Lua string holding the compressed bytes
-- @return The decoded bytes as a Lua string of length size_original (unwritten bytes stay zero)
-- Raises a string error (level 0) on truncated input or a back-reference that points before
-- the start of the output. FFI arrays are not bounds-checked, so every access is validated.
local function decompressFFI(size_compressed, size_original, compressed_data)
	if size_compressed > #compressed_data then
		error("Failed to decompress (compressed data is truncated)", 0)
	end

	local compressed_buffer = ffi.new("uint8_t[?]", #compressed_data)
	ffi.copy(compressed_buffer, compressed_data, #compressed_data)

	local result_buffer = ffi.new("uint8_t[?]", size_original)

	local write_pos, read_pos = 0, 0

	-- Decoding stops once the output buffer is full: anything produced afterwards
	-- would not be part of the returned string anyway.
	while read_pos < size_compressed and write_pos < size_original do
		local mask = compressed_buffer[read_pos]
		read_pos = read_pos + 1

		for _ = 1, 8 do
			if read_pos >= size_compressed or write_pos >= size_original then
				break
			end

			if bit.band(mask, 1) == 1 then
				if read_pos + 1 >= size_compressed then
					error("Failed to decompress (compressed data is truncated)", 0)
				end
				local low, high = compressed_buffer[read_pos], compressed_buffer[read_pos + 1]
				read_pos = read_pos + 2

				local run_length = bit.rshift(high, 4) + 2
				local distance = bit.lshift(bit.band(high, 0x0F), 8) + low

				local source_pos = write_pos - distance
				if source_pos < 0 then
					error("Failed to decompress (invalid back-reference)", 0)
				end

				-- Clamp the run so it never writes past the end of the output buffer;
				-- copy byte by byte because source and destination may overlap.
				local copy_count = math.min(run_length, size_original - write_pos)
				for _ = 1, copy_count do
					result_buffer[write_pos] = result_buffer[source_pos]
					write_pos = write_pos + 1
					source_pos = source_pos + 1
				end
			else
				result_buffer[write_pos] = compressed_buffer[read_pos]
				write_pos = write_pos + 1
				read_pos = read_pos + 1
			end

			mask = bit.rshift(mask, 1)
		end
	end

	return ffi.string(result_buffer, size_original)
end
-- todo use string buffer as input, too - much easier to index?
-- TODO eliminate/move
-- Decodes a compressed record using the pure Lua decoder.
-- @param compressedFileContents Lua string holding the compressed bytes
-- @param decompressedSize Expected size of the decoded data, forwarded to the decoder
-- @return Whatever the decoder returns for the given input
function ArcturusPAK:DecompressFileContents(compressedFileContents, decompressedSize)
	local compressedSize = #compressedFileContents
	return decompress(compressedSize, decompressedSize, compressedFileContents)
end

-- Fixture archives used by the tests below. The paths are relative, so they presumably
-- depend on the directory the tests are started from - TODO confirm.
local pakFilePath = "../Fixtures/data.pak"
local ZERO_SIZE_PAK = "../Fixtures/zerosize.pak"

-- Test helpers are looked up in the global environment; the test runner is expected
-- to have defined them before this file is executed.
local describe = _G.describe
local it = _G.it
local assertEquals = _G.assertEquals
local assertThrows = _G.assertThrows

describe("ArcturusPAK", function()
	-- Covers ArcturusPAK:Open: the three failure modes and the parsed footer values
	describe("Open", function()
		it("should throw if passed an invalid file path", function()
			local function openNonexistentFile()
				ArcturusPAK:Open("meep.404")
			end
			assertThrows(openNonexistentFile, "Failed to open PAK file meep.404 (no such file exists)")
		end)

		it("should throw if the file is empty ", function()
			local function openEmptyFile()
				ArcturusPAK:Open(ZERO_SIZE_PAK) -- Should always exist
			end
			assertThrows(openEmptyFile, "Failed to open PAK file " .. ZERO_SIZE_PAK .. " (not a valid PAK file)")

			-- TODO assert fd is closed
		end)

		it("should throw if passed a valid non-PAK file path ", function()
			local function openNonPakFile()
				ArcturusPAK:Open("../Fixtures/invalid.pak") -- Should always exist
			end
			assertThrows(openNonPakFile, "Invalid PAK version tag 46 (18 expected)")

			-- TODO assert fd is closed
		end)

		-- Throw if not a valid PAK file

		it("should be able to read the archive metadata when given a valid PAK file path", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			assertEquals(pakInfo.versionTag, ArcturusPAK.MAGIC_VERSION_TAG)
			assertEquals(pakInfo.offset, 695226075)
			assertEquals(pakInfo.numRecords, 17743)
			assertEquals(type(pakInfo.handle), "userdata")

			-- Release the archive so repeated test runs don't accumulate open files
			pakInfo.handle:close()
		end)
	end)

	-- Covers ArcturusPAK:ReadTableOfContents: closed-handle failure, record contents, caching
	describe("ReadTableOfContents", function()
		it("should throw if the PAK file handle is already closed", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			pakInfo.handle:close()
			local function readFromClosedHandle()
				ArcturusPAK:ReadTableOfContents(pakInfo)
			end
			assertThrows(readFromClosedHandle, "Failed to read table of contents (invalid PAK file handle)")
		end)

		it("should return the table of file records when passed a valid PAK file handle", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			local records = ArcturusPAK:ReadTableOfContents(pakInfo)
			-- The table of contents is fully in memory now, so the archive can be released
			pakInfo.handle:close()

			assertEquals(#records, pakInfo.numRecords)

			local secondRecord = records["data/_tactics.scr"]

			-- TODO assert first record is data folder, last is X (TBD)

			assertEquals(secondRecord.pathSize, 17)
			assertEquals(secondRecord.type, ArcturusPAK.RECORD_TYPE_FILE)
			assertEquals(secondRecord.offset, 695224756)
			assertEquals(secondRecord.compressedSize, 1312)
			assertEquals(secondRecord.decompressedSize, 2472)
			-- assertEquals(records["data"], firstRecord)
		end)

		it("should cache the table of contents when passed a valid PAK file handle", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			local records = ArcturusPAK:ReadTableOfContents(pakInfo)
			pakInfo.handle:close()
			-- print(records, pakInfo.records)
			assertEquals(records, pakInfo.records)
		end)
	end)

	-- Covers ArcturusPAK:GetCompressedFileContents: every validation failure plus a successful read
	describe("GetCompressedFileContents", function()
		it("should throw if the PAK file handle is already closed", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			pakInfo.handle:close()
			local function readFromClosedHandle()
				ArcturusPAK:GetCompressedFileContents(pakInfo, "hello.world")
			end
			assertThrows(
				readFromClosedHandle,
				"Failed to get compressed file contents for record hello.world (invalid PAK file handle)"
			)
		end)

		-- TODO
		-- should throw if no handle was opened

		it("should throw if the table of contents wasn't yet read", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			local function readWithoutTOC()
				ArcturusPAK:GetCompressedFileContents(pakInfo, "hello.world")
			end
			assertThrows(
				readWithoutTOC,
				"Failed to get compressed file contents for record hello.world (table of contents not read)"
			)
			pakInfo.handle:close()
		end)

		it("should throw if an invalid path string was passed", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			local function readWithInvalidPathString()
				ArcturusPAK:ReadTableOfContents(pakInfo)
				ArcturusPAK:GetCompressedFileContents(pakInfo, "hello.world")
			end
			assertThrows(
				readWithInvalidPathString,
				"Failed to get compressed file contents for record hello.world (invalid path string)"
			)
			pakInfo.handle:close()
		end)

		it("should throw if a directory path string was passed", function()
			-- Opened outside of the throwing function so the handle can be closed afterwards
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			local function attemptToReadDirectoryRecord()
				ArcturusPAK:ReadTableOfContents(pakInfo)
				ArcturusPAK:GetCompressedFileContents(pakInfo, "data/diary")
			end
			assertThrows(
				attemptToReadDirectoryRecord,
				"Failed to get compressed file contents for record data/diary (it's a directory)"
			)
			pakInfo.handle:close()
		end)

		-- TODO move
		-- local zlib = require("zlib")

		it("should return the compressed buffer if a valid path string was passed", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			ArcturusPAK:ReadTableOfContents(pakInfo)

			local compressedFileContents = ArcturusPAK:GetCompressedFileContents(pakInfo, "data/global.ini")
			pakInfo.handle:close()
			-- print(#compressedFileContents, compressedFileContents)
			assertEquals(#compressedFileContents, 187)
			-- local crc = zlib.crc32()(compressedFileContents)
			-- local adler = zlib.adler32()(compressedFileContents)
			-- assertEquals(crc, "hello world123")
			-- assertEquals(adler, "hello world123")
			-- It's not a guarantee that the file contents are 100% correct, but it's good enough for now
			assertEquals(compressedFileContents:sub(6, 8), "gnd")
			assertEquals(compressedFileContents:sub(165, 169), "@load")
		end)
	end)
	-- Close: throw if no fd, success if fd

	-- Covers ArcturusPAK:GetDecompressedFileContents: every validation failure plus a successful read
	describe("GetDecompressedFileContents", function()
		it("should throw if the PAK file handle is already closed", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			pakInfo.handle:close()
			local function readFromClosedHandle()
				ArcturusPAK:GetDecompressedFileContents(pakInfo, "hello.world")
			end
			assertThrows(
				readFromClosedHandle,
				"Failed to get decompressed file contents for record hello.world (invalid PAK file handle)"
			)
		end)

		-- TODO
		-- should throw if no handle was opened

		it("should throw if the table of contents wasn't yet read", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			local function readWithoutTOC()
				ArcturusPAK:GetDecompressedFileContents(pakInfo, "hello.world")
			end
			assertThrows(
				readWithoutTOC,
				"Failed to get decompressed file contents for record hello.world (table of contents not read)"
			)
			pakInfo.handle:close()
		end)

		it("should throw if an invalid path string was passed", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			local function readWithInvalidPathString()
				ArcturusPAK:ReadTableOfContents(pakInfo)
				ArcturusPAK:GetDecompressedFileContents(pakInfo, "hello.world")
			end
			assertThrows(
				readWithInvalidPathString,
				"Failed to get decompressed file contents for record hello.world (invalid path string)"
			)
			pakInfo.handle:close()
		end)

		it("should throw if a directory path string was passed", function()
			-- Opened outside of the throwing function so the handle can be closed afterwards
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			local function attemptToDecompressDirectoryRecord()
				ArcturusPAK:ReadTableOfContents(pakInfo)
				ArcturusPAK:GetDecompressedFileContents(pakInfo, "data/bmp")
			end
			assertThrows(
				attemptToDecompressDirectoryRecord,
				"Failed to get decompressed file contents for record data/bmp (it's a directory)"
			)
			pakInfo.handle:close()
		end)

		-- TODO move
		-- local zlib = require("zlib")

		it("should return the decompressed buffer if a valid path string was passed", function()
			local pakInfo = ArcturusPAK:Open(pakFilePath)
			ArcturusPAK:ReadTableOfContents(pakInfo)

			local decompressedFileContents = ArcturusPAK:GetDecompressedFileContents(pakInfo, "data/global.ini")
			pakInfo.handle:close()
			-- print(#compressedFileContents, compressedFileContents)
			assertEquals(#decompressedFileContents, 275)
			-- local crc = zlib.crc32()(decompressedFileContents)
			-- local adler = zlib.adler32()(decompressedFileContents)
			-- assertEquals(crc, "hello world123")
			-- assertEquals(adler, "hello world123")
			-- It's not a guarantee that the file contents are 100% correct, but it's good enough for now
			assertEquals(decompressedFileContents:sub(5, 14), "gndopacity")
			assertEquals(decompressedFileContents:sub(145, 153), "wavewater")
		end)
	end)

	-- Benchmarks the pure Lua decoder against the FFI decoder on one record and checks
	-- that both produce the same output.
	describe("DecompressBytes", function()
		it("should do some magic (TBD)", function()
			-- Example usage and benchmark
			-- Number of decoder runs per variant (the loops below run exactly this many times)
			local NUM_ITERATIONS = 1000000

			local pakInfo = ArcturusPAK:Open(pakFilePath)
			ArcturusPAK:ReadTableOfContents(pakInfo)
			-- local compressedFileContents = ArcturusPAK:GetDecompressedFileContents(pakInfo, "data/arcfonth.dat")
			local compressedFileContents = ArcturusPAK:GetCompressedFileContents(pakInfo, "data/global.ini")
			local size_compressed = #compressedFileContents
			-- local size_original = pakInfo.records["data/arcfonth.dat"].decompressedSize
			local size_original = pakInfo.records["data/global.ini"].decompressedSize

			-- Everything the benchmark needs is in memory, so the archive can be released
			pakInfo.handle:close()

			local result
			local start = os.clock() -- uv.hrtime?
			for _ = 1, NUM_ITERATIONS do
				result = decompress(size_compressed, size_original, compressedFileContents)
			end
			local elapsed = os.clock() - start

			print("Decompressed using Lua code in", elapsed, "seconds")

			start = os.clock()
			local resultFFI
			for _ = 1, NUM_ITERATIONS do
				resultFFI = decompressFFI(size_compressed, size_original, compressedFileContents)
			end
			elapsed = os.clock() - start

			print("Decompressed using FFI in", elapsed, "seconds")
			-- print(result, resultFFI)
			assert(result == resultFFI)
		end)
	end)
end)


-- TODO: test that every file in the archive can be decompressed and extracted

Metadata

Metadata

Assignees

No one assigned

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions