diff --git a/src/Compression.cc b/src/Compression.cc index a340b8f5..06cc7d6a 100644 --- a/src/Compression.cc +++ b/src/Compression.cc @@ -13,6 +13,162 @@ using namespace std; +template <> +const char* name_for_enum(PRSCompressOptimalPhase v) { + switch (v) { + case PRSCompressOptimalPhase::INDEX_SHORT_COPIES: + return "INDEX_SHORT_COPIES"; + case PRSCompressOptimalPhase::INDEX_LONG_COPIES: + return "INDEX_LONG_COPIES"; + case PRSCompressOptimalPhase::INDEX_EXTENDED_COPIES: + return "INDEX_EXTENDED_COPIES"; + case PRSCompressOptimalPhase::CONSTRUCT_PATHS: + return "CONSTRUCT_PATHS"; + case PRSCompressOptimalPhase::BACKTRACE_OPTIMAL_PATH: + return "BACKTRACE_OPTIMAL_PATH"; + case PRSCompressOptimalPhase::GENERATE_RESULT: + return "GENERATE_RESULT"; + default: + return "__UNKNOWN__"; + } +} + +template <> +const char* name_for_enum(BC0CompressOptimalPhase v) { + switch (v) { + case BC0CompressOptimalPhase::INDEX: + return "INDEX"; + case BC0CompressOptimalPhase::CONSTRUCT_PATHS: + return "CONSTRUCT_PATHS"; + case BC0CompressOptimalPhase::BACKTRACE_OPTIMAL_PATH: + return "BACKTRACE_OPTIMAL_PATH"; + case BC0CompressOptimalPhase::GENERATE_RESULT: + return "GENERATE_RESULT"; + default: + return "__UNKNOWN__"; + } +} + +template +struct WindowIndex { + const uint8_t* data; + size_t size; + size_t offset; + multiset> index; + + WindowIndex(const void* data, size_t size) + : data(reinterpret_cast(data)), + size(size), + offset(0), + index(bind(&WindowIndex::set_comparator, this, placeholders::_1, placeholders::_2)) {} + + void advance() { + if (this->offset >= WindowLength) { + this->index.erase(this->offset - WindowLength); + } + this->index.emplace(this->offset); + this->offset++; + if (DebugLength) { + this->print_state(); + } + } + + size_t get_match_length(size_t match_offset) const { + size_t match_iter = match_offset; + size_t offset_iter = this->offset; + while ((match_iter < match_offset + MaxMatchLength) && + (match_iter < this->size) && + (offset_iter < this->size) && + (this->data[match_iter] == this->data[offset_iter])) { + match_iter++; + offset_iter++; + } + return match_iter - match_offset; + }; + + // The data structure we want is a binary-searchable set of all strings + // starting at all possible offsets within the sliding window, and we need + // to be able to search lexicographically but insert and delete by offset. + // A std::map would accomplish this, but would be + // horrendously inefficient: we'd have to copy strings far too much. We can + // solve this by instead storing the offset of each string as keys in a set + // and using a custom comparator to treat them as references to binary + // strings within the data. + bool set_comparator(size_t a, size_t b) const { + size_t max_length = min(MaxMatchLength, this->size - max(a, b)); + size_t end_a = a + max_length; + for (; a < end_a; a++, b++) { + uint8_t data_a = static_cast(this->data[a]); + uint8_t data_b = static_cast(this->data[b]); + if (data_a < data_b) { + return true; // a comes before b lexicographically + } else if (data_a > data_b) { + return false; // a comes after b lexicographically + } + } + return a < b; // Maximum-length match; order them by offset + }; + + pair get_best_match() const { + // Find the best match from the index. It's unlikely that we'll get an + // exact match, so check the entry before the upper_bound result too. + // Note: We use upper_bound rather than lower_bound because in PRS, a + // backreference can be encoded with fewer bits if it's close to the + // decompression offset, and this makes us pick the latest match by + // default. + if (DebugLength) { + string hex_str = format_data_string(&this->data[this->offset], min(this->size - this->offset, DebugLength)); + fprintf(stderr, "[%05zX] match SEARCH %s\n", this->offset, hex_str.c_str()); + } + size_t match_offset = 0; + size_t match_size = 0; + auto start_it = this->index.upper_bound(this->offset); + for (auto it = start_it; it != this->index.end(); it++) { + size_t new_match_offset = *it; + size_t new_match_size = this->get_match_length(new_match_offset); + if (DebugLength) { + fprintf(stderr, "[%05zX] match BEFORE %zX %zX\n", this->offset, new_match_offset, new_match_size); + } + if ((new_match_size > match_size) || (new_match_size == match_size && new_match_offset > match_offset)) { + match_offset = new_match_offset; + match_size = new_match_size; + } else if (!UseLatestBestMatch || (new_match_size < match_size)) { + // In PRS, using the latest of a set of equivalent matches may be + // advantageous because it may be possible to encode it with fewer bits. + // All backreferences are the same length in BC0, so this doesn't apply. + break; + } + } + for (auto it = start_it; it != this->index.begin();) { + it--; + size_t new_match_offset = *it; + size_t new_match_size = this->get_match_length(new_match_offset); + if (DebugLength) { + fprintf(stderr, "[%05zX] match BEFORE %zX %zX\n", this->offset, new_match_offset, new_match_size); + } + if ((new_match_size > match_size) || (new_match_size == match_size && new_match_offset > match_offset)) { + match_offset = new_match_offset; + match_size = new_match_size; + } else if (!UseLatestBestMatch || (new_match_size < match_size)) { + break; + } + } + if (DebugLength) { + fprintf(stderr, "[%05zX] match OVERALL %zX %zX\n", this->offset, match_offset, match_size); + } + return make_pair(match_offset, match_size); + } + + void print_state() const { + fprintf(stderr, "[%05zX] Window<0x%zX, 0x%zX> at 0x%zX contains 0x%zX entries:\n", + this->offset, WindowLength, MaxMatchLength, this->offset, this->index.size()); + for (size_t z : this->index) { + string hex_str = format_data_string(&this->data[z], min(this->size - z, DebugLength)); + fprintf(stderr, "[%05zX] %05zX => %s\n", this->offset, z, hex_str.c_str()); + } + } +}; + template struct LZSSInterleavedWriter { StringWriter w; @@ -51,16 +207,302 @@ struct LZSSInterleavedWriter { } this->next_control_bit <<= 1; } + void write_data(uint8_t v) { this->buf[this->buf_offset++] = v; } + size_t size() const { return this->w.size() + this->buf_offset; } }; +class ControlStreamReader { +public: + ControlStreamReader(StringReader& r) + : r(r), + bits(0x0000) {} + + bool read() { + if (!(this->bits & 0x0100)) { + this->bits = 0xFF00 | this->r.get_u8(); + } + bool ret = this->bits & 1; + this->bits >>= 1; + return ret; + } + + uint8_t buffered_bits() const { + uint16_t z = this->bits; + uint8_t ret = 0; + for (; z & 0x0100; z >>= 1, ret++) { + } + return ret; + } + +private: + StringReader& r; + uint16_t bits; +}; + +struct PRSPathNode { + enum class CommandType { + NONE = 0, + LITERAL, + SHORT_COPY, + LONG_COPY, + EXTENDED_COPY, + }; + + int16_t short_copy_offset = 0; + uint8_t max_short_copy_size = 0; + int16_t long_copy_offset = 0; + uint8_t max_long_copy_size = 0; + int16_t extended_copy_offset = 0; + uint16_t max_extended_copy_size = 0; + + // Pathfinding state + size_t from_offset = 0; + CommandType from_command_type = CommandType::NONE; + size_t bits_used = static_cast(-1); + + // Stream generation state + size_t to_offset = 0; + + std::string str() const { + const char* command_type_name; + switch (this->from_command_type) { + case CommandType::NONE: + command_type_name = "NONE"; + break; + case CommandType::LITERAL: + command_type_name = "LITERAL"; + break; + case CommandType::SHORT_COPY: + command_type_name = "SHORT_COPY"; + break; + case CommandType::LONG_COPY: + command_type_name = "LONG_COPY"; + break; + case CommandType::EXTENDED_COPY: + command_type_name = "EXTENDED_COPY"; + break; + default: + command_type_name = "__UNKNOWN__"; + } + return string_printf("[Node short=%hX %hhX long=%hX %hhX ext=%hX %hX from=%zX %s bits=%zX to=%zX]", + this->short_copy_offset, this->max_short_copy_size, + this->long_copy_offset, this->max_long_copy_size, this->extended_copy_offset, this->max_extended_copy_size, + this->from_offset, command_type_name, this->bits_used, this->to_offset); + } +}; + +string prs_compress_optimal( + const void* in_data_v, size_t in_size, function progress_fn) { + const uint8_t* in_data = reinterpret_cast(in_data_v); + + vector nodes; + nodes.resize(in_size + 1); + nodes[0].bits_used = 18; // Stop command: 2 control bits and 2 data bytes + + // Populate all possible short copies + { + WindowIndex<0x100, 5, true> window(in_data_v, in_size); + while (window.offset < in_size) { + if ((window.offset & 0xFFF) == 0) { + progress_fn(PRSCompressOptimalPhase::INDEX_SHORT_COPIES, window.offset, 0); + } + auto& node = nodes[window.offset]; + auto match = window.get_best_match(); + if (match.second >= 2) { + node.short_copy_offset = match.first - window.offset; + node.max_short_copy_size = match.second; + } + window.advance(); + } + } + + // Populate all possible long copies + { + WindowIndex<0x1FFF, 9, true> window(in_data_v, in_size); + while (window.offset < in_size) { + if ((window.offset & 0xFFF) == 0) { + progress_fn(PRSCompressOptimalPhase::INDEX_LONG_COPIES, window.offset, 0); + } + auto& node = nodes[window.offset]; + auto match = window.get_best_match(); + if (match.second >= 3) { + node.long_copy_offset = match.first - window.offset; + node.max_long_copy_size = match.second; + } + window.advance(); + } + } + + // Populate all possible extended copies + { + WindowIndex<0x1FFF, 0x100, true> window(in_data_v, in_size); + while (window.offset < in_size) { + if ((window.offset & 0xFFF) == 0) { + progress_fn(PRSCompressOptimalPhase::INDEX_EXTENDED_COPIES, window.offset, 0); + } + auto& node = nodes[window.offset]; + auto match = window.get_best_match(); + if (match.second >= 1) { + node.extended_copy_offset = match.first - window.offset; + node.max_extended_copy_size = match.second; + } + window.advance(); + } + } + + // For each node, populate the literal value, and the best ways to get to the + // following nodes + for (size_t z = 0; z < in_size; z++) { + if ((z & 0xFFF) == 0) { + progress_fn(PRSCompressOptimalPhase::CONSTRUCT_PATHS, z, 0); + } + + auto& node = nodes[z]; + + // Literal: 1 control bit + 1 data byte + size_t bits_used = node.bits_used + 9; + { + auto& next_node = nodes[z + 1]; + if (next_node.bits_used > bits_used) { + next_node.from_offset = z; + next_node.from_command_type = PRSPathNode::CommandType::LITERAL; + next_node.bits_used = bits_used; + } + } + + // Short copy: 4 control bits + 1 data byte + bits_used = node.bits_used + 12; + for (size_t x = 2; x <= node.max_short_copy_size; x++) { + auto& next_node = nodes[z + x]; + if (next_node.bits_used > bits_used) { + next_node.from_offset = z; + next_node.from_command_type = PRSPathNode::CommandType::SHORT_COPY; + next_node.bits_used = bits_used; + } + } + + // Long copy: 2 control bits + 2 data bytes + bits_used = node.bits_used + 18; + for (size_t x = 3; x <= node.max_long_copy_size; x++) { + auto& next_node = nodes[z + x]; + if (next_node.bits_used > bits_used) { + next_node.from_offset = z; + next_node.from_command_type = PRSPathNode::CommandType::LONG_COPY; + next_node.bits_used = bits_used; + } + } + + // Extended copy: 2 control bits + 3 data bytes + bits_used = node.bits_used + 26; + for (size_t x = 1; x <= node.max_extended_copy_size; x++) { + auto& next_node = nodes[z + x]; + if (next_node.bits_used > bits_used) { + next_node.from_offset = z; + next_node.from_command_type = PRSPathNode::CommandType::EXTENDED_COPY; + next_node.bits_used = bits_used; + } + } + } + + // Find the shortest path from the last node to the first node + size_t last_progress_fn_call = static_cast(-1); + for (size_t z = in_size; z > 0;) { + if ((z & ~0xFFF) != (last_progress_fn_call & ~0xFFF)) { + last_progress_fn_call = z; + progress_fn(PRSCompressOptimalPhase::BACKTRACE_OPTIMAL_PATH, z, 0); + } + size_t from_offset = nodes[z].from_offset; + nodes[from_offset].to_offset = z; + z = from_offset; + } + + // Produce the PRS command stream from the shortest path + LZSSInterleavedWriter<3> w; + last_progress_fn_call = static_cast(-1); + for (size_t offset = 0; offset < in_size;) { + if ((offset & ~0xFFF) != (last_progress_fn_call & ~0xFFF)) { + last_progress_fn_call = offset; + progress_fn(PRSCompressOptimalPhase::GENERATE_RESULT, offset, w.size()); + } + + const auto& node = nodes[offset]; + const auto& next_node = nodes[node.to_offset]; + + size_t copy_size = node.to_offset - offset; + switch (next_node.from_command_type) { + case PRSPathNode::CommandType::LITERAL: + if (copy_size != 1) { + throw logic_error("incorrect size for LITERAL copy type"); + } + w.write_control(true); + w.write_data(in_data[offset]); + break; + case PRSPathNode::CommandType::SHORT_COPY: { + if (copy_size < 2 || copy_size > 5) { + throw logic_error("incorrect size for SHORT_COPY copy type"); + } + uint8_t encoded_size = copy_size - 2; + w.write_control(false); + w.flush_if_ready(); + w.write_control(false); + w.flush_if_ready(); + w.write_control(encoded_size & 2); + w.flush_if_ready(); + w.write_control(encoded_size & 1); + w.write_data(node.short_copy_offset & 0xFF); + break; + } + case PRSPathNode::CommandType::LONG_COPY: { + if (copy_size < 2 || copy_size > 9) { + throw logic_error("incorrect size for LONG_COPY copy type"); + } + w.write_control(false); + w.flush_if_ready(); + w.write_control(true); + uint16_t a = (node.long_copy_offset << 3) | (copy_size - 2); + w.write_data(a & 0xFF); + w.write_data(a >> 8); + break; + } + case PRSPathNode::CommandType::EXTENDED_COPY: { + if (copy_size < 1 || copy_size > 0x100) { + throw logic_error("incorrect size for EXTENDED_COPY copy type"); + } + w.write_control(false); + w.flush_if_ready(); + w.write_control(true); + uint16_t a = (node.extended_copy_offset << 3); + w.write_data(a & 0xFF); + w.write_data(a >> 8); + w.write_data(copy_size - 1); + break; + } + default: + throw logic_error("invalid copy type in shortest path"); + } + w.flush_if_ready(); + + offset = node.to_offset; + } + + // Write stop command + w.write_control(false); + w.flush_if_ready(); + w.write_control(true); + w.write_data(0); + w.write_data(0); + + return std::move(w.close()); +} + PRSCompressor::PRSCompressor( - size_t compression_level, function progress_fn) + ssize_t compression_level, function progress_fn) : compression_level(compression_level), progress_fn(progress_fn), closed(false), @@ -98,8 +540,8 @@ void PRSCompressor::advance() { size_t best_match_size = 0; size_t best_match_offset = 0; size_t best_match_literals = 0; - for (size_t num_literals = 0; num_literals < this->compression_level; num_literals++) { - for (size_t z = 0; z < num_literals; z++) { + for (ssize_t num_literals = 0; num_literals <= this->compression_level; num_literals++) { + for (size_t z = 0; z < static_cast(num_literals); z++) { this->reverse_log.push_back(this->forward_log.at(this->reverse_log.end_offset())); } @@ -130,7 +572,7 @@ void PRSCompressor::advance() { best_match_literals = num_literals; } } - for (size_t z = 0; z < num_literals; z++) { + for (size_t z = 0; z < static_cast(num_literals); z++) { this->reverse_log.pop_back(); } } @@ -266,7 +708,7 @@ void PRSCompressor::flush_control() { string prs_compress( const void* vdata, size_t size, - size_t compression_level, + ssize_t compression_level, function progress_fn) { PRSCompressor prs(compression_level, progress_fn); prs.add(vdata, size); @@ -275,38 +717,107 @@ string prs_compress( string prs_compress( const string& data, - size_t compression_level, + ssize_t compression_level, function progress_fn) { return prs_compress(data.data(), data.size(), compression_level, progress_fn); } -class ControlStreamReader { -public: - ControlStreamReader(StringReader& r) - : r(r), - bits(0x0000) {} +string prs_compress(const void* in_data_v, size_t in_size, function progress_fn) { + const uint8_t* in_data = reinterpret_cast(in_data_v); - bool read() { - if (!(this->bits & 0x0100)) { - this->bits = 0xFF00 | this->r.get_u8(); + LZSSInterleavedWriter<3> w; + WindowIndex<0x1FFF, 0x100, true> window(in_data_v, in_size); + + size_t last_progress_fn_call_offset = 0; + while (window.offset < in_size) { + if (progress_fn && ((last_progress_fn_call_offset & ~0xFFF) != (window.offset & ~0xFFF))) { + last_progress_fn_call_offset = window.offset; + progress_fn(window.offset, w.size()); + } + + auto match = window.get_best_match(); + + // Look ahead by 1 literal to see if there's a significantly better match. + window.advance(); + auto advanced_match = window.get_best_match(); + if (advanced_match.second > match.second + 1) { + match.second = 1; + } + + // If there is a suitable match, write a backreference; otherwise, write a + // literal. The backreference should be encoded: + // - As a short copy if offset in [-0x100, -1] and size in [2, 5] + // - As a long copy if offset in [-0x1FFF, -1] and size in [3, 9] + // - As an extended copy if offset in [-0x1FFF, -1] and size in [10, 0x100] + // Technically an extended copy can be used for sizes 1-9 as well, but if + // size is 1 or 2, writing literals is better (since it uses fewer data + // bytes and control bits), and a long copy can cover sizes 3-9 (and also + // uses fewer data bytes and control bits). + ssize_t backreference_offset = match.first - (window.offset - 1); + if (match.second < 2) { + // The match is too small; a literal would use fewer bits + w.write_control(true); + w.write_data(in_data[window.offset - 1]); + match.second = 1; + + } else if ((backreference_offset >= -0x100) && (match.second <= 5)) { + uint8_t encoded_size = match.second - 2; + w.write_control(false); + w.flush_if_ready(); + w.write_control(false); + w.flush_if_ready(); + w.write_control(encoded_size & 2); + w.flush_if_ready(); + w.write_control(encoded_size & 1); + w.write_data(backreference_offset & 0xFF); + + } else if (match.second < 3) { + // We can't use a long copy for size 2, and it's not worth it to use an + // extended copy for this either (as noted above), so write a literal + w.write_control(true); + w.write_data(in_data[window.offset - 1]); + match.second = 1; + + } else if ((backreference_offset >= -0x1FFF) && (match.second <= 9)) { + w.write_control(false); + w.flush_if_ready(); + w.write_control(true); + uint16_t a = (backreference_offset << 3) | (match.second - 2); + w.write_data(a & 0xFF); + w.write_data(a >> 8); + + } else if ((backreference_offset >= -0x1FFF) && (match.second <= 0x100)) { + w.write_control(false); + w.flush_if_ready(); + w.write_control(true); + uint16_t a = (backreference_offset << 3); + w.write_data(a & 0xFF); + w.write_data(a >> 8); + w.write_data(match.second - 1); + + } else { + throw logic_error("invalid best match"); + } + w.flush_if_ready(); + + for (size_t z = 1; z < match.second; z++) { + window.advance(); } - bool ret = this->bits & 1; - this->bits >>= 1; - return ret; } - uint8_t buffered_bits() const { - uint16_t z = this->bits; - uint8_t ret = 0; - for (; z & 0x0100; z >>= 1, ret++) { - } - return ret; - } + // Write stop command + w.write_control(false); + w.flush_if_ready(); + w.write_control(true); + w.write_data(0); + w.write_data(0); -private: - StringReader& r; - uint16_t bits; -}; + return std::move(w.close()); +} + +string prs_compress(const string& data, function progress_fn) { + return prs_compress(data.data(), data.size(), progress_fn); +} string prs_decompress(const void* data, size_t size, size_t max_output_size) { // PRS is an LZ77-based compression algorithm. Compressed data is split into @@ -464,38 +975,40 @@ void prs_disassemble(FILE* stream, const void* data, size_t size) { ControlStreamReader cr(r); while (!r.eof()) { - size_t r_offset = r.where(); - uint8_t buffered_bits = cr.buffered_bits(); - size_t input_bits = 8 * r_offset + (buffered_bits ? (8 - buffered_bits) : 0); if (cr.read()) { - fprintf(stream, "[%zX / %zX => %zX] literal %02hhX\n", r_offset, input_bits, output_bytes, r.get_u8()); + fprintf(stream, "[%zX] literal %02hhX\n", output_bytes, r.get_u8()); output_bytes++; } else { ssize_t offset; size_t count; + const char* copy_type; - bool is_long_copy = cr.read(); - if (is_long_copy) { + if (cr.read()) { uint16_t a = r.get_u8(); a |= (r.get_u8() << 8); offset = (a >> 3) | (~0x1FFF); if (offset == ~0x1FFF) { - fprintf(stream, "[%zX / %zX => %zX] end\n", r_offset, input_bits, output_bytes); + fprintf(stream, "[%zX] end\n", output_bytes); break; } - count = (a & 7) ? ((a & 7) + 2) : (r.get_u8() + 1); + if (a & 7) { + copy_type = "long"; + count = (a & 7) + 2; + } else { + copy_type = "extended"; + count = r.get_u8() + 1; + } } else { + copy_type = "short"; count = cr.read() << 1; count = (count | cr.read()) + 2; offset = r.get_u8() | (~0xFF); } size_t read_offset = output_bytes + offset; - fprintf(stream, "[%zX / %zX => %zX] %s copy -%zX (from %zX) %zX\n", - r_offset, input_bits, output_bytes, is_long_copy ? "long" : "short", - -offset, read_offset, count); + fprintf(stream, "[%zX] %s copy %zX\n", output_bytes, copy_type, count); if (read_offset >= output_bytes) { throw runtime_error("backreference offset beyond beginning of output"); @@ -515,6 +1028,118 @@ void prs_disassemble(FILE* stream, const std::string& data) { // PRS, there is only one type of backreference. Also, there is no stop opcode; // the decompressor simply stops when there are no more input bytes to read. +struct BC0PathNode { + uint16_t memo_offset = 0; + uint8_t max_copy_size = 0; + + // Pathfinding state + size_t from_offset = 0; + size_t bits_used = static_cast(-1); + + // Stream generation state + size_t to_offset = 0; + + std::string str() const { + return string_printf("[Node ref=%04hX %hhX from=%zX bits=%zX to=%zX]", + this->memo_offset, this->max_copy_size, + this->from_offset, this->bits_used, this->to_offset); + } +}; + +string bc0_compress_optimal( + const void* in_data_v, size_t in_size, function progress_fn) { + const uint8_t* in_data = reinterpret_cast(in_data_v); + + vector nodes; + nodes.resize(in_size + 1); + nodes[0].bits_used = 0; + + // Populate all possible backreferences + { + WindowIndex<0x1000, 0x12> window(in_data_v, in_size); + while (window.offset < in_size) { + if ((window.offset & 0xFFF) == 0) { + progress_fn(BC0CompressOptimalPhase::INDEX, window.offset, 0); + } + auto& node = nodes[window.offset]; + auto match = window.get_best_match(); + if (match.second >= 3) { + node.memo_offset = (match.first - 0x12) & 0xFFF; + node.max_copy_size = match.second; + } + window.advance(); + } + } + + // For each node, populate the literal value, and the best ways to get to the + // following nodes + for (size_t z = 0; z < in_size; z++) { + if ((z & 0xFFF) == 0) { + progress_fn(BC0CompressOptimalPhase::CONSTRUCT_PATHS, z, 0); + } + + auto& node = nodes[z]; + + // Literal: 1 control bit + 1 data byte + size_t bits_used = node.bits_used + 9; + { + auto& next_node = nodes[z + 1]; + if (next_node.bits_used > bits_used) { + next_node.from_offset = z; + next_node.bits_used = bits_used; + } + } + + // Backreference: 1 control bit + 2 data bytes + bits_used = node.bits_used + 17; + for (size_t x = 3; x <= node.max_copy_size; x++) { + auto& next_node = nodes[z + x]; + if (next_node.bits_used > bits_used) { + next_node.from_offset = z; + next_node.bits_used = bits_used; + } + } + } + + // Find the shortest path from the last node to the first node + size_t last_progress_fn_call = static_cast(-1); + for (size_t z = in_size; z > 0;) { + if ((z & ~0xFFF) != (last_progress_fn_call & ~0xFFF)) { + last_progress_fn_call = z; + progress_fn(BC0CompressOptimalPhase::BACKTRACE_OPTIMAL_PATH, z, 0); + } + size_t from_offset = nodes[z].from_offset; + nodes[from_offset].to_offset = z; + z = from_offset; + } + + // Produce the BC0 command stream from the shortest path + LZSSInterleavedWriter<3> w; + last_progress_fn_call = static_cast(-1); + for (size_t offset = 0; offset < in_size;) { + if ((offset & ~0xFFF) != (last_progress_fn_call & ~0xFFF)) { + last_progress_fn_call = offset; + progress_fn(BC0CompressOptimalPhase::GENERATE_RESULT, offset, w.size()); + } + + const auto& node = nodes[offset]; + size_t copy_size = node.to_offset - offset; + if (copy_size >= 3 && copy_size <= 0x12) { + w.write_control(false); + w.write_data(node.memo_offset & 0xFF); + w.write_data(((node.memo_offset >> 4) & 0xF0) | (copy_size - 3)); + } else if (copy_size == 1) { + w.write_control(true); + w.write_data(in_data[offset]); + } + w.flush_if_ready(); + + offset = node.to_offset; + } + + return std::move(w.close()); +} + string bc0_compress(const string& data, function progress_fn) { return bc0_compress(data.data(), data.size(), progress_fn); } @@ -523,108 +1148,53 @@ string bc0_compress(const void* in_data_v, size_t in_size, function(in_data_v); LZSSInterleavedWriter<2> w; - size_t read_offset = 0; - - // The data structure we want is a binaary-searchable set of all strings - // starting at all possible offsets within the sliding window, and we need - // to be able to search lexicographically but insert and delete by offset. - // A std::map would accomplish this, but would be - // horrendously inefficient: we'd have to copy strings far too much. We can - // solve this by instead storing the offset of each string as keys in a set - // and using a custom comparator to treat them as references to binary - // strings within the data. - auto set_comparator = [&](size_t a, size_t b) -> bool { - size_t max_length = min(0x12, in_size - max(a, b)); - size_t end_a = a + max_length; - for (; a < end_a; a++, b++) { - uint8_t data_a = static_cast(in_data[a]); - uint8_t data_b = static_cast(in_data[b]); - if (data_a < data_b) { - return true; // a comes before b lexicographically - } else if (data_a > data_b) { - return false; // a comes after b lexicographically - } - } - return a < b; // Maximum-length match; order them by offset - }; - multiset> window_index(set_comparator); - - auto get_match_length = [&](size_t a, size_t b) -> size_t { - size_t ret = 0; - while ((ret < 0x12) && (a + ret < in_size) && (b + ret < in_size) && - (in_data[a + ret] == in_data[b + ret])) { - ret++; - } - return ret; - }; + WindowIndex<0x1000, 0x12> window(in_data_v, in_size); size_t last_progress_fn_call_offset = 0; - while (read_offset < in_size) { - if (progress_fn && ((last_progress_fn_call_offset & ~0xFFF) != (read_offset & ~0xFFF))) { - last_progress_fn_call_offset = read_offset; - progress_fn(read_offset, w.size()); + while (window.offset < in_size) { + if (progress_fn && ((last_progress_fn_call_offset & ~0xFFF) != (window.offset & ~0xFFF))) { + last_progress_fn_call_offset = window.offset; + progress_fn(window.offset, w.size()); } - // Find the best match from the index. It's unlikely that we'll get an - // exact match, so check the entry before the lower_bound result too. - size_t match_offset = 0; - size_t match_size = 0; - // string hex_search_data = format_data_string(data.substr(read_offset, 0x12)); - // fprintf(stderr, "[%zX] match SEARCH %s\n", read_offset, hex_search_data.c_str()); - auto match_it = window_index.lower_bound(read_offset); - if (match_it != window_index.end()) { - match_offset = *match_it; - match_size = get_match_length(read_offset, match_offset); - // fprintf(stderr, "[%zX] match AFTER %zX %zX\n", read_offset, match_offset, match_size); - } - if (match_it != window_index.begin()) { - match_it--; - size_t before_match_offset = *match_it; - size_t before_match_size = get_match_length(read_offset, before_match_offset); - // fprintf(stderr, "[%zX] match BEFORE %zX %zX\n", read_offset, before_match_offset, before_match_size); - if (before_match_size > match_size) { - match_offset = before_match_offset; - match_size = before_match_size; - } - } - // fprintf(stderr, "[%zX] match OVERALL %zX %zX\n", read_offset, match_offset, match_size); - - if (match_size < 3) { - match_size = 1; + auto match = window.get_best_match(); + if (match.second < 3) { + match.second = 1; } // Write a backreference if a match was found; otherwise, write a literal - if (match_size >= 3) { + if (match.second >= 3) { w.write_control(false); - size_t memo_offset = match_offset - 0x12; + size_t memo_offset = match.first - 0x12; w.write_data(memo_offset & 0xFF); - w.write_data(((memo_offset >> 4) & 0xF0) | (match_size - 3)); - // fprintf(stderr, "[%zX] backreference %03zX %zX\n", read_offset, memo_offset, match_size); + w.write_data(((memo_offset >> 4) & 0xF0) | (match.second - 3)); } else { w.write_control(true); - w.write_data(in_data[read_offset]); - // fprintf(stderr, "[%zX] literal %02hhX\n", read_offset, data[read_offset]); + w.write_data(in_data[window.offset]); } w.flush_if_ready(); - // Update the index and advance read_offset - for (size_t z = 0; z < match_size; z++, read_offset++) { - if (read_offset >= 0x1000) { - window_index.erase(read_offset - 0x1000); - } - window_index.emplace(read_offset); - // fprintf(stderr, "[%zX] Index state updated (%zX):\n", read_offset, window_index.size()); - // for (size_t it : window_index) { - // string index_data = data.substr(it, 0x12); - // string hex_data = format_data_string(index_data); - // fprintf(stderr, "[%zX] %05zX => %s\n", read_offset, it, hex_data.c_str()); - // } + for (size_t z = 0; z < match.second; z++) { + window.advance(); } } return std::move(w.close()); } +string bc0_encode(const void* in_data_v, size_t in_size) { + const uint8_t* in_data = reinterpret_cast(in_data_v); + + LZSSInterleavedWriter<1> w; + for (size_t z = 0; z < in_size; z++) { + w.write_control(true); + w.write_data(in_data[z]); + w.flush_if_ready(); + } + + return std::move(w.close()); +} + // The BC0 decompression implementation in PSO GC is vulnerable to overflow // attacks - there is no bounds checking on the output buffer. It is unlikely // that this can be usefully exploited (e.g. for RCE) because the output pointer diff --git a/src/Compression.hh b/src/Compression.hh index d44db866..e404d5bb 100644 --- a/src/Compression.hh +++ b/src/Compression.hh @@ -5,18 +5,56 @@ #include #include #include +#include #include #include "Text.hh" +enum class PRSCompressOptimalPhase { + INDEX_SHORT_COPIES = 0, + INDEX_LONG_COPIES, + INDEX_EXTENDED_COPIES, + CONSTRUCT_PATHS, + BACKTRACE_OPTIMAL_PATH, + GENERATE_RESULT, +}; + +template <> +const char* name_for_enum(PRSCompressOptimalPhase v); + +enum class BC0CompressOptimalPhase { + INDEX = 0, + CONSTRUCT_PATHS, + BACKTRACE_OPTIMAL_PATH, + GENERATE_RESULT, +}; + +template <> +const char* name_for_enum(BC0CompressOptimalPhase v); + +//////////////////////////////////////////////////////////////////////////////// +// PRS compression +//////////////////////////////////////////////////////////////////////////////// + // Use this class if you need to compress from multiple input buffers, or need // to compress multiple chunks and don't want to copy their contents -// unnecessarily. (For most common use cases, use prs_compress (below) instead.) +// unnecessarily. (For most common use cases, use prs_compress, below, instead.) +// To use this class, instantiate it, then call .add() one or more times, then +// call .close() and use the returned string as the compressed result. class PRSCompressor { public: - // To use this class, instantiate it, then call .add() one or more times, then - // call .close() and use the returned string as the compressed result. - explicit PRSCompressor(size_t compression_level = 1, std::function progress_fn = nullptr); + // compression_level specifies how aggressively to search for alternate paths: + // -1: Don't perform any compression at all, but produce output that can be + // understood by prs_decompress. The output will be about 9/8 the size + // of the input. + // 0: Greedily search for the longest backreference at every point. Don't + // consider any alternate paths. Generally offers a good balance between + // speed and output size. + // 1: Consider two paths at each point when a backreference is found: using + // the backreference or ignoring it. + // 2+: Consider further chains of paths at each point. Using values 2 or + // greater for compression_level generally yields diminishing returns. + explicit PRSCompressor(ssize_t compression_level = 0, std::function progress_fn = nullptr); ~PRSCompressor() = default; // Adds more input data to be compressed, which logically comes after all @@ -107,7 +145,7 @@ private: void write_control(bool z); void flush_control(); - size_t compression_level; + ssize_t compression_level; std::function progress_fn; bool closed; @@ -121,19 +159,27 @@ private: StringWriter output; }; -// Compresses data from a single input buffer using PRS and returns the -// compressed result. This is a shortcut for constructing a PRSCompressor, -// calling .add() once, and calling .close(). +// These functions use PRSCompressor to compress a buffer of data. This is +// essentially a shortcut for constructing a PRSCompressor, calling .add() on +// it once, then calling .close(). std::string prs_compress( const void* vdata, size_t size, - size_t compression_level = 1, + ssize_t compression_level = 0, std::function progress_fn = nullptr); std::string prs_compress( const std::string& data, - size_t compression_level = 1, + ssize_t compression_level = 0, std::function progress_fn = nullptr); +// Compresses data using PRS to the smallest possible output size. This function +// is slow, but produces results even significantly smaller than Sega's original +// compressor. +std::string prs_compress_optimal( + const void* vdata, + size_t size, + std::function progress_fn = nullptr); + // Decompresses PRS-compressed data. std::string prs_decompress(const void* data, size_t size, size_t max_output_size = 0); std::string prs_decompress(const std::string& data, size_t max_output_size = 0); @@ -147,10 +193,26 @@ size_t prs_decompress_size(const std::string& data, size_t max_output_size = 0); void prs_disassemble(FILE* stream, const void* data, size_t size); void prs_disassemble(FILE* stream, const std::string& data); -// Compresses and decompresses data using the BC0 algorithm. +//////////////////////////////////////////////////////////////////////////////// +// BC0 compression +//////////////////////////////////////////////////////////////////////////////// + +// Compresses data using the BC0 algorithm. +std::string bc0_compress_optimal( + const void* in_data_v, + size_t in_size, + std::function progress_fn = nullptr); std::string bc0_compress(const std::string& data, std::function progress_fn = nullptr); std::string bc0_compress(const void* in_data_v, size_t in_size, std::function progress_fn = nullptr); + +// Encodes data in a BC0-compatible format without compression (similar to using +// compression_level=-1 with prs_compress). +std::string bc0_encode(const void* in_data_v, size_t in_size); + +// Decompresses BC0-compressed data. std::string bc0_decompress(const std::string& data); std::string bc0_decompress(const void* data, size_t size); + +// Prints the command stream from a BC0-compressed buffer. void bc0_disassemble(FILE* stream, const std::string& data); void bc0_disassemble(FILE* stream, const void* data, size_t size); diff --git a/src/Main.cc b/src/Main.cc index 7eb3240e..644f9291 100644 --- a/src/Main.cc +++ b/src/Main.cc @@ -304,7 +304,8 @@ int main(int argc, char** argv) { size_t stride = 1; size_t num_threads = 0; size_t bytes = 0; - size_t prs_compression_level = 1; + ssize_t compression_level = 0; + bool compress_optimal = false; const char* find_decryption_seed_ciphertext = nullptr; vector find_decryption_seed_plaintexts; const char* input_filename = nullptr; @@ -335,7 +336,9 @@ int main(int argc, char** argv) { } else if (!strcmp(argv[x], "--bb")) { cli_version = GameVersion::BB; } else if (!strncmp(argv[x], "--compression-level=", 20)) { - prs_compression_level = strtoull(&argv[x][20], nullptr, 0); + compression_level = strtoll(&argv[x][20], nullptr, 0); + } else if (!strcmp(argv[x], "--optimal")) { + compress_optimal = true; } else if (!strcmp(argv[x], "--round2")) { round2 = true; } else if (!strncmp(argv[x], "--bytes=", 8)) { @@ -549,14 +552,31 @@ int main(int argc, char** argv) { fprintf(stderr, "... %zu/%zu (%g%%) => %zu (%g%%) \r", input_progress, input_bytes, progress, output_progress, size_ratio); }; + auto optimal_progress_fn = [&](auto phase, size_t input_progress, size_t output_progress) -> void { + const char* phase_name = name_for_enum(phase); + float progress = static_cast(input_progress * 100) / input_bytes; + float size_ratio = static_cast(output_progress * 100) / input_progress; + fprintf(stderr, "... [%s] %zu/%zu (%g%%) => %zu (%g%%) \r", + phase_name, input_progress, input_bytes, progress, output_progress, size_ratio); + }; uint64_t start = now(); if (behavior == Behavior::COMPRESS_PRS) { - data = prs_compress(data, prs_compression_level, progress_fn); + if (compress_optimal) { + data = prs_compress_optimal(data.data(), data.size(), optimal_progress_fn); + } else { + data = prs_compress(data, compression_level, progress_fn); + } } else if (behavior == Behavior::DECOMPRESS_PRS) { data = prs_decompress(data); } else if (behavior == Behavior::COMPRESS_BC0) { - data = bc0_compress(data, progress_fn); + if (compress_optimal) { + data = bc0_compress_optimal(data.data(), data.size(), optimal_progress_fn); + } else if (compression_level < 0) { + data = bc0_encode(data.data(), data.size()); + } else { + data = bc0_compress(data, progress_fn); + } } else if (behavior == Behavior::DECOMPRESS_BC0) { data = bc0_decompress(data); } else { diff --git a/tests/test-compression.sh b/tests/test-compression.sh index fb003eef..4bcc68ec 100755 --- a/tests/test-compression.sh +++ b/tests/test-compression.sh @@ -9,12 +9,44 @@ if [ "$EXECUTABLE" == "" ]; then EXECUTABLE="./newserv" fi -echo "... decompress card definitions" + +echo "... decompress" $EXECUTABLE decompress-prs system/ep3/card-definitions.mnr card-defs.mnrd -echo "... compress card definitions" -$EXECUTABLE compress-$SCHEME card-defs.mnrd card-defs.mnr.$SCHEME -echo "... check compressed card definitions" -$EXECUTABLE decompress-$SCHEME card-defs.mnr.$SCHEME - | diff card-defs.mnrd - + +echo "... compress with level=-1 (no compression)" +$EXECUTABLE compress-$SCHEME --compression-level=-1 card-defs.mnrd card-defs.mnrd.$SCHEME.lN +echo "... compress with level=0" +$EXECUTABLE compress-$SCHEME --compression-level=0 card-defs.mnrd card-defs.mnrd.$SCHEME.l0 +echo "... compress with level=1" +$EXECUTABLE compress-$SCHEME --compression-level=1 card-defs.mnrd card-defs.mnrd.$SCHEME.l1 +echo "... compress optimally" +$EXECUTABLE compress-$SCHEME --optimal card-defs.mnrd card-defs.mnrd.$SCHEME.opt + +echo "... decompress from level=-1 (no compression)" +$EXECUTABLE decompress-$SCHEME card-defs.mnrd.$SCHEME.lN card-defs.mnrd.$SCHEME.lN.dec +echo "... decompress from level=0" +$EXECUTABLE decompress-$SCHEME card-defs.mnrd.$SCHEME.l0 card-defs.mnrd.$SCHEME.l0.dec +echo "... decompress from level=1" +$EXECUTABLE decompress-$SCHEME card-defs.mnrd.$SCHEME.l1 card-defs.mnrd.$SCHEME.l1.dec +echo "... decompress from optimal" +$EXECUTABLE decompress-$SCHEME card-defs.mnrd.$SCHEME.opt card-defs.mnrd.$SCHEME.opt.dec + +echo "... check result from level=-1 (no compression)" +diff card-defs.mnrd card-defs.mnrd.$SCHEME.lN.dec +echo "... check result from level=0" +diff card-defs.mnrd card-defs.mnrd.$SCHEME.l0.dec +echo "... check result from level=1" +diff card-defs.mnrd card-defs.mnrd.$SCHEME.l1.dec +echo "... check result from optimal" +diff card-defs.mnrd card-defs.mnrd.$SCHEME.opt.dec echo "... clean up" -rm card-defs.mnrd card-defs.mnr.$SCHEME +rm card-defs.mnrd \ + card-defs.mnrd.$SCHEME.lN \ + card-defs.mnrd.$SCHEME.l0 \ + card-defs.mnrd.$SCHEME.l1 \ + card-defs.mnrd.$SCHEME.opt \ + card-defs.mnrd.$SCHEME.lN.dec \ + card-defs.mnrd.$SCHEME.l0.dec \ + card-defs.mnrd.$SCHEME.l1.dec \ + card-defs.mnrd.$SCHEME.opt.dec