#include "Compression.hh" #include #include #include #include #include #include #include #include "Text.hh" using namespace std; template <> const char* name_for_enum(CompressPhase v) { switch (v) { case CompressPhase::INDEX: return "INDEX"; case CompressPhase::CONSTRUCT_PATHS: return "CONSTRUCT_PATHS"; case CompressPhase::BACKTRACE_OPTIMAL_PATH: return "BACKTRACE_OPTIMAL_PATH"; case CompressPhase::GENERATE_RESULT: return "GENERATE_RESULT"; default: return "__UNKNOWN__"; } } template struct WindowIndex { const uint8_t* data; size_t size; size_t offset; set> index; WindowIndex(const void* data, size_t size) : data(reinterpret_cast(data)), size(size), offset(0), index(bind(&WindowIndex::set_comparator, this, placeholders::_1, placeholders::_2)) {} void advance() { if (this->offset >= WindowLength) { this->index.erase(this->offset - WindowLength); } this->index.emplace(this->offset); this->offset++; } size_t get_match_length(size_t match_offset) const { size_t match_iter = match_offset; size_t offset_iter = this->offset; while ((match_iter < match_offset + MaxMatchLength) && (match_iter < this->size) && (offset_iter < this->size) && (this->data[match_iter] == this->data[offset_iter])) { match_iter++; offset_iter++; } return match_iter - match_offset; }; // The data structure we want is a binary-searchable set of all strings // starting at all possible offsets within the sliding window, and we need // to be able to search lexicographically but insert and delete by offset. // A std::map would accomplish this, but would be // horrendously inefficient: we'd have to copy strings far too much. We can // solve this by instead storing the offset of each string as keys in a set // and using a custom comparator to treat them as references to binary // strings within the data. bool set_comparator(size_t a, size_t b) const { size_t max_length = min(MaxMatchLength, this->size - max(a, b)); size_t end_a = a + max_length; for (; a < end_a; a++, b++) { uint8_t data_a = static_cast(this->data[a]); uint8_t data_b = static_cast(this->data[b]); if (data_a < data_b) { return true; // a comes before b lexicographically } else if (data_a > data_b) { return false; // a comes after b lexicographically } } return a < b; // Maximum-length match; order them by offset }; pair get_best_match() const { // Find the best match from the index. It's unlikely that we'll get an // exact match, so check the entry before the upper_bound result too. // Note: We use upper_bound rather than lower_bound because in PRS, a // backreference can be encoded with fewer bits if it's close to the // decompression offset, and this makes us pick the latest match by // default. size_t match_offset = 0; size_t match_size = 0; auto it = this->index.upper_bound(this->offset); if (it != this->index.end()) { size_t new_match_offset = *it; size_t new_match_size = this->get_match_length(new_match_offset); if ((new_match_size > match_size) || (new_match_size == match_size && new_match_offset > match_offset)) { match_offset = new_match_offset; match_size = new_match_size; } } if (it != this->index.begin()) { it--; size_t new_match_offset = *it; size_t new_match_size = this->get_match_length(new_match_offset); if ((new_match_size > match_size) || (new_match_size == match_size && new_match_offset > match_offset)) { match_offset = new_match_offset; match_size = new_match_size; } } return make_pair(match_offset, match_size); } }; struct LZSSInterleavedWriter { StringWriter w; size_t buf_offset; uint8_t next_control_bit; uint8_t buf[0x19]; LZSSInterleavedWriter() : buf_offset(1), next_control_bit(1) { this->buf[0] = 0; } void flush_if_ready() { if (this->next_control_bit == 0) { this->w.write(this->buf, this->buf_offset); this->buf[0] = 0; this->buf_offset = 1; this->next_control_bit = 1; } } std::string&& close() { if (this->buf_offset > 1 || this->next_control_bit != 1) { this->w.write(this->buf, this->buf_offset); } return std::move(this->w.str()); } void write_control(bool v) { if (this->next_control_bit == 0) { throw logic_error("write_control called with no space to write"); } if (v) { this->buf[0] |= this->next_control_bit; } this->next_control_bit <<= 1; } void write_data(uint8_t v) { this->buf[this->buf_offset++] = v; } size_t size() const { return this->w.size() + this->buf_offset; } }; class ControlStreamReader { public: ControlStreamReader(StringReader& r) : r(r), bits(0x0000) {} bool read() { if (!(this->bits & 0x0100)) { this->bits = 0xFF00 | this->r.get_u8(); } bool ret = this->bits & 1; this->bits >>= 1; return ret; } uint8_t buffered_bits() const { uint16_t z = this->bits; uint8_t ret = 0; for (; z & 0x0100; z >>= 1, ret++) { } return ret; } private: StringReader& r; uint16_t bits; }; struct PRSPathNode { enum class CommandType { NONE = 0, LITERAL, SHORT_COPY, LONG_COPY, EXTENDED_COPY, }; int16_t short_copy_offset = 0; uint8_t max_short_copy_size = 0; int16_t long_copy_offset = 0; uint8_t max_long_copy_size = 0; int16_t extended_copy_offset = 0; uint16_t max_extended_copy_size = 0; // Pathfinding state size_t from_offset = 0; CommandType from_command_type = CommandType::NONE; size_t bits_used = static_cast(-1); // Stream generation state size_t to_offset = 0; }; string prs_compress_optimal( const void* in_data_v, size_t in_size, ProgressCallback progress_fn) { const uint8_t* in_data = reinterpret_cast(in_data_v); vector nodes; nodes.resize(in_size + 1); nodes[0].bits_used = 18; // Stop command: 2 control bits and 2 data bytes size_t copy_progress_max = 3 * in_size; atomic copy_progress = 0; // Populate all possible short copies std::thread short_window_thread([&]() -> void { WindowIndex<0x100, 5> window(in_data_v, in_size); while (window.offset < in_size) { if (window.offset && (window.offset & 0xFFF) == 0 && progress_fn) { size_t progress = copy_progress.fetch_add(0x1000) + 0x1000; progress_fn(CompressPhase::INDEX, progress, copy_progress_max, 0); } auto& node = nodes[window.offset]; auto match = window.get_best_match(); if (match.second >= 2) { node.short_copy_offset = match.first - window.offset; node.max_short_copy_size = match.second; } window.advance(); } }); // Populate all possible long copies std::thread long_window_thread([&]() -> void { WindowIndex<0x1FFF, 9> window(in_data_v, in_size); while (window.offset < in_size) { if (window.offset && (window.offset & 0xFFF) == 0 && progress_fn) { size_t progress = copy_progress.fetch_add(0x1000) + 0x1000; progress_fn(CompressPhase::INDEX, progress, copy_progress_max, 0); } auto& node = nodes[window.offset]; auto match = window.get_best_match(); if (match.second >= 3) { node.long_copy_offset = match.first - window.offset; node.max_long_copy_size = match.second; } window.advance(); } }); // Populate all possible extended copies std::thread extended_window_thread([&]() -> void { WindowIndex<0x1FFF, 0x100> window(in_data_v, in_size); while (window.offset < in_size) { if (window.offset && (window.offset & 0xFFF) == 0 && progress_fn) { size_t progress = copy_progress.fetch_add(0x1000) + 0x1000; progress_fn(CompressPhase::INDEX, progress, copy_progress_max, 0); } auto& node = nodes[window.offset]; auto match = window.get_best_match(); if (match.second >= 1) { node.extended_copy_offset = match.first - window.offset; node.max_extended_copy_size = match.second; } window.advance(); } }); short_window_thread.join(); long_window_thread.join(); extended_window_thread.join(); // For each node, populate the literal value, and the best ways to get to the // following nodes for (size_t z = 0; z < in_size; z++) { if ((z & 0xFFF) == 0 && progress_fn) { progress_fn(CompressPhase::CONSTRUCT_PATHS, z, in_size, 0); } auto& node = nodes[z]; // Literal: 1 control bit + 1 data byte size_t bits_used = node.bits_used + 9; { auto& next_node = nodes[z + 1]; if (next_node.bits_used > bits_used) { next_node.from_offset = z; next_node.from_command_type = PRSPathNode::CommandType::LITERAL; next_node.bits_used = bits_used; } } // Short copy: 4 control bits + 1 data byte bits_used = node.bits_used + 12; for (size_t x = 2; x <= node.max_short_copy_size; x++) { auto& next_node = nodes[z + x]; if (next_node.bits_used > bits_used) { next_node.from_offset = z; next_node.from_command_type = PRSPathNode::CommandType::SHORT_COPY; next_node.bits_used = bits_used; } } // Long copy: 2 control bits + 2 data bytes bits_used = node.bits_used + 18; for (size_t x = 3; x <= node.max_long_copy_size; x++) { auto& next_node = nodes[z + x]; if (next_node.bits_used > bits_used) { next_node.from_offset = z; next_node.from_command_type = PRSPathNode::CommandType::LONG_COPY; next_node.bits_used = bits_used; } } // Extended copy: 2 control bits + 3 data bytes bits_used = node.bits_used + 26; for (size_t x = 1; x <= node.max_extended_copy_size; x++) { auto& next_node = nodes[z + x]; if (next_node.bits_used > bits_used) { next_node.from_offset = z; next_node.from_command_type = PRSPathNode::CommandType::EXTENDED_COPY; next_node.bits_used = bits_used; } } } // Find the shortest path from the last node to the first node size_t last_progress_fn_call = static_cast(-1); for (size_t z = in_size; z > 0;) { if ((z & ~0xFFF) != (last_progress_fn_call & ~0xFFF)) { last_progress_fn_call = z; if (progress_fn) { progress_fn(CompressPhase::BACKTRACE_OPTIMAL_PATH, z, in_size, 0); } } size_t from_offset = nodes[z].from_offset; nodes[from_offset].to_offset = z; z = from_offset; } // Produce the PRS command stream from the shortest path LZSSInterleavedWriter w; last_progress_fn_call = static_cast(-1); for (size_t offset = 0; offset < in_size;) { if ((offset & ~0xFFF) != (last_progress_fn_call & ~0xFFF)) { last_progress_fn_call = offset; if (progress_fn) { progress_fn(CompressPhase::GENERATE_RESULT, offset, in_size, w.size()); } } const auto& node = nodes[offset]; const auto& next_node = nodes[node.to_offset]; size_t copy_size = node.to_offset - offset; switch (next_node.from_command_type) { case PRSPathNode::CommandType::LITERAL: if (copy_size != 1) { throw logic_error("incorrect size for LITERAL copy type"); } w.write_control(true); w.write_data(in_data[offset]); break; case PRSPathNode::CommandType::SHORT_COPY: { if (copy_size < 2 || copy_size > 5) { throw logic_error("incorrect size for SHORT_COPY copy type"); } uint8_t encoded_size = copy_size - 2; w.write_control(false); w.flush_if_ready(); w.write_control(false); w.flush_if_ready(); w.write_control(encoded_size & 2); w.flush_if_ready(); w.write_control(encoded_size & 1); w.write_data(node.short_copy_offset & 0xFF); break; } case PRSPathNode::CommandType::LONG_COPY: { if (copy_size < 2 || copy_size > 9) { throw logic_error("incorrect size for LONG_COPY copy type"); } w.write_control(false); w.flush_if_ready(); w.write_control(true); uint16_t a = (node.long_copy_offset << 3) | (copy_size - 2); w.write_data(a & 0xFF); w.write_data(a >> 8); break; } case PRSPathNode::CommandType::EXTENDED_COPY: { if (copy_size < 1 || copy_size > 0x100) { throw logic_error("incorrect size for EXTENDED_COPY copy type"); } w.write_control(false); w.flush_if_ready(); w.write_control(true); uint16_t a = (node.extended_copy_offset << 3); w.write_data(a & 0xFF); w.write_data(a >> 8); w.write_data(copy_size - 1); break; } default: throw logic_error("invalid copy type in shortest path"); } w.flush_if_ready(); offset = node.to_offset; } // Write stop command w.write_control(false); w.flush_if_ready(); w.write_control(true); w.write_data(0); w.write_data(0); return std::move(w.close()); } PRSCompressor::PRSCompressor( ssize_t compression_level, ProgressCallback progress_fn) : compression_level(compression_level), progress_fn(progress_fn), closed(false), control_byte_offset(0), pending_control_bits(0), input_bytes(0) { this->output.put_u8(0); } void PRSCompressor::add(const void* data, size_t size) { if (this->closed) { throw logic_error("compressor is closed"); } StringReader r(data, size); while (!r.eof()) { this->add_byte(r.get_u8()); } } void PRSCompressor::add(const string& data) { this->add(data.data(), data.size()); } void PRSCompressor::add_byte(uint8_t v) { if (this->reverse_log.end_offset() + this->forward_log.data.size() <= this->input_bytes) { this->advance(); } this->forward_log.at(this->input_bytes) = v; this->input_bytes++; } void PRSCompressor::advance() { // Search for a match in the decompressed data history size_t best_match_size = 0; size_t best_match_offset = 0; size_t best_match_literals = 0; for (ssize_t num_literals = 0; num_literals <= this->compression_level; num_literals++) { for (size_t z = 0; z < static_cast(num_literals); z++) { this->reverse_log.push_back(this->forward_log.at(this->reverse_log.end_offset())); } size_t compression_offset = reverse_log.end_offset(); uint8_t first_v = this->forward_log.at(compression_offset); const auto& start_offsets = this->reverse_log.find(first_v); for (auto it = start_offsets.begin(); (it != start_offsets.end()) && (best_match_size < 0x100); it++) { size_t match_offset = *it; if (match_offset + 0x2000 <= compression_offset) { continue; } size_t match_size = 0; size_t match_loop_bytes = compression_offset - match_offset; while ((match_size < 0x100) && (compression_offset + match_size < this->input_bytes) && (this->reverse_log.at(match_offset + (match_size % match_loop_bytes)) == this->forward_log.at(compression_offset + match_size))) { match_size++; } // If there are multiple matches of the longest length, use the latest one, // since it's more likely that it can be expressed as a short copy instead // of a long copy. if (match_size >= (best_match_size + best_match_literals)) { best_match_offset = match_offset; best_match_size = match_size; best_match_literals = num_literals; } } for (size_t z = 0; z < static_cast(num_literals); z++) { this->reverse_log.pop_back(); } } // If the best match has literals preceding it, write those literals for (size_t z = 0; z < best_match_literals; z++) { this->advance_literal(); } // If there is a suitable match, write a backreference; otherwise, write a // literal. The backreference should be encoded: // - As a short copy if offset in [-0x100, -1] and size in [2, 5] // - As a long copy if offset in [-0x1FFF, -1] and size in [3, 9] // - As an extended copy if offset in [-0x1FFF, -1] and size in [10, 0x100] // Technically an extended copy can be used for sizes 1-9 as well, but if // size is 1 or 2, writing literals is better (since it uses fewer data // bytes and control bits), and a long copy can cover sizes 3-9 (and also // uses fewer data bytes and control bits). ssize_t backreference_offset = best_match_offset - this->reverse_log.end_offset(); if (best_match_size < 2) { // The match is too small; a literal would use fewer bits this->advance_literal(); } else if ((backreference_offset >= -0x100) && (best_match_size <= 5)) { this->advance_short_copy(backreference_offset, best_match_size); } else if (best_match_size < 3) { // We can't use a long copy for size 2, and it's not worth it to use an // extended copy for this either (as noted above), so write a literal this->advance_literal(); } else if ((backreference_offset >= -0x1FFF) && (best_match_size <= 9)) { this->advance_long_copy(backreference_offset, best_match_size); } else if ((backreference_offset >= -0x1FFF) && (best_match_size <= 0x100)) { this->advance_extended_copy(backreference_offset, best_match_size); } else { throw logic_error("invalid best match"); } } void PRSCompressor::move_forward_data_to_reverse_log(size_t size) { for (; size > 0; size--) { this->reverse_log.push_back(this->forward_log.at(this->reverse_log.end_offset())); if (this->progress_fn && ((this->reverse_log.end_offset() & 0xFFF) == 0)) { this->progress_fn(CompressPhase::GENERATE_RESULT, this->reverse_log.end_offset(), this->input_bytes, this->output.size()); } } } void PRSCompressor::advance_literal() { this->write_control(true); this->output.put_u8(this->forward_log.at(this->reverse_log.end_offset())); this->move_forward_data_to_reverse_log(1); } void PRSCompressor::advance_short_copy(ssize_t offset, size_t size) { uint8_t encoded_size = size - 2; this->write_control(false); this->write_control(false); this->write_control(encoded_size & 2); this->write_control(encoded_size & 1); this->output.put_u8(offset & 0xFF); this->move_forward_data_to_reverse_log(size); } void PRSCompressor::advance_long_copy(ssize_t offset, size_t size) { this->write_control(false); this->write_control(true); uint16_t a = (offset << 3) | (size - 2); this->output.put_u8(a & 0xFF); this->output.put_u8(a >> 8); this->move_forward_data_to_reverse_log(size); } void PRSCompressor::advance_extended_copy(ssize_t offset, size_t size) { this->write_control(false); this->write_control(true); uint16_t a = (offset << 3); this->output.put_u8(a & 0xFF); this->output.put_u8(a >> 8); this->output.put_u8(size - 1); this->move_forward_data_to_reverse_log(size); } string& PRSCompressor::close() { if (!this->closed) { // Advance until all input is consumed while (this->reverse_log.end_offset() < this->input_bytes) { this->advance(); } // Write stop command this->write_control(false); this->write_control(true); this->output.put_u8(0); this->output.put_u8(0); // Write remaining control bits this->flush_control(); this->closed = true; } return this->output.str(); } void PRSCompressor::write_control(bool z) { if (this->pending_control_bits & 0x0100) { this->output.pput_u8( this->control_byte_offset, this->pending_control_bits & 0xFF); this->control_byte_offset = this->output.size(); this->output.put_u8(0); this->pending_control_bits = z ? 0x8080 : 0x8000; } else { this->pending_control_bits = (this->pending_control_bits >> 1) | (z ? 0x8080 : 0x8000); } } void PRSCompressor::flush_control() { if (this->pending_control_bits & 0xFF00) { while (!(this->pending_control_bits & 0x0100)) { this->pending_control_bits >>= 1; } this->output.pput_u8( this->control_byte_offset, this->pending_control_bits & 0xFF); } else { if (this->control_byte_offset != this->output.size() - 1) { throw logic_error("data written without control bits"); } this->output.str().resize(this->output.str().size() - 1); } } string prs_compress( const void* vdata, size_t size, ssize_t compression_level, ProgressCallback progress_fn) { PRSCompressor prs(compression_level, progress_fn); prs.add(vdata, size); return std::move(prs.close()); } string prs_compress( const string& data, ssize_t compression_level, ProgressCallback progress_fn) { return prs_compress(data.data(), data.size(), compression_level, progress_fn); } string prs_compress_indexed( const void* in_data_v, size_t in_size, ProgressCallback progress_fn) { const uint8_t* in_data = reinterpret_cast(in_data_v); LZSSInterleavedWriter w; WindowIndex<0x100, 5> w_short(in_data_v, in_size); WindowIndex<0x1FFF, 9> w_long(in_data_v, in_size); WindowIndex<0x1FFF, 0x100> w_extended(in_data_v, in_size); size_t last_progress_fn_call_offset = 0; while (w_short.offset < in_size) { if (progress_fn && ((last_progress_fn_call_offset & ~0xFFF) != (w_short.offset & ~0xFFF))) { last_progress_fn_call_offset = w_short.offset; progress_fn(CompressPhase::GENERATE_RESULT, w_short.offset, in_size, w.size()); } auto m_short = w_short.get_best_match(); auto m_long = w_long.get_best_match(); auto m_extended = w_extended.get_best_match(); // Write the match that achieves the best ratio of output bytes to // compressed bits used. To do this without floating-point math, we multiply // the output byte count for each type of command by 468 / (command_bits), // since 468 is the least common multiple of the number of bits for each // command type. The command type with the highest score is the one we'll // use, breaking ties by choosing the shorter command type. Note that the // size of any copy type can be zero if no match was found; if no matches // were found at all, then we can always write a literal. size_t score_literal = 52; size_t score_short = m_short.second * 39; size_t score_long = m_long.second * 26; size_t score_extended = m_extended.second * 18; PRSPathNode::CommandType command_type = PRSPathNode::CommandType::NONE; if (score_literal < score_short) { if (score_short < score_long) { if (score_long < score_extended) { command_type = PRSPathNode::CommandType::EXTENDED_COPY; } else { command_type = PRSPathNode::CommandType::LONG_COPY; } } else { if (score_short < score_extended) { command_type = PRSPathNode::CommandType::EXTENDED_COPY; } else { command_type = PRSPathNode::CommandType::SHORT_COPY; } } } else { if (score_literal < score_long) { if (score_long < score_extended) { command_type = PRSPathNode::CommandType::EXTENDED_COPY; } else { command_type = PRSPathNode::CommandType::LONG_COPY; } } else { if (score_literal < score_extended) { command_type = PRSPathNode::CommandType::EXTENDED_COPY; } else { command_type = PRSPathNode::CommandType::LITERAL; } } } size_t bytes_consumed = 0; switch (command_type) { case PRSPathNode::CommandType::LITERAL: w.write_control(true); w.write_data(in_data[w_short.offset]); bytes_consumed = 1; break; case PRSPathNode::CommandType::SHORT_COPY: { ssize_t backreference_offset = m_short.first - w_short.offset; uint8_t encoded_size = m_short.second - 2; w.write_control(false); w.flush_if_ready(); w.write_control(false); w.flush_if_ready(); w.write_control(encoded_size & 2); w.flush_if_ready(); w.write_control(encoded_size & 1); w.write_data(backreference_offset & 0xFF); bytes_consumed = m_short.second; break; } case PRSPathNode::CommandType::LONG_COPY: { ssize_t backreference_offset = m_long.first - w_long.offset; w.write_control(false); w.flush_if_ready(); w.write_control(true); uint16_t a = (backreference_offset << 3) | (m_long.second - 2); w.write_data(a & 0xFF); w.write_data(a >> 8); bytes_consumed = m_long.second; break; } case PRSPathNode::CommandType::EXTENDED_COPY: { ssize_t backreference_offset = m_extended.first - w_extended.offset; w.write_control(false); w.flush_if_ready(); w.write_control(true); uint16_t a = (backreference_offset << 3); w.write_data(a & 0xFF); w.write_data(a >> 8); w.write_data(m_extended.second - 1); bytes_consumed = m_extended.second; break; } case PRSPathNode::CommandType::NONE: default: throw logic_error("invalid command type"); } w.flush_if_ready(); if (bytes_consumed == 0) { throw logic_error("no input data was consumed"); } for (size_t z = 0; z < bytes_consumed; z++) { w_short.advance(); w_long.advance(); w_extended.advance(); } } // Write stop command w.write_control(false); w.flush_if_ready(); w.write_control(true); w.write_data(0); w.write_data(0); return std::move(w.close()); } string prs_compress_indexed(const string& data, ProgressCallback progress_fn) { return prs_compress_indexed(data.data(), data.size(), progress_fn); } PRSDecompressResult prs_decompress_with_meta( const void* data, size_t size, size_t max_output_size, bool allow_unterminated) { // PRS is an LZ77-based compression algorithm. Compressed data is split into // two streams: a control stream and a data stream. The control stream is read // one bit at a time, and the data stream is read one byte at a time. The // streams are interleaved such that the decompressor never has to move // backward in the input stream - when the decompressor needs a control bit // and there are no unused bits from the previous byte of the control stream, // it reads a byte from the input and treats it as the next 8 control bits. // There are 3 distinct commands in PRS, labeled here with their control bits: // 1 - Literal byte. The decompressor copies one byte from the input data // stream to the output. // 00 - Short backreference. The decompressor reads two control bits and adds // 2 to this value to determine the number of bytes to copy, then reads // one byte from the data stream to determine how far back in the output // to copy from. This byte is treated as an 8-bit negative number - so // 0xF7, for example, means to start copying data from 9 bytes before the // end of the output. The range must start before the end of the output, // but the end of the range may be beyond the end of the output. In this // case, the bytes between the beginning of the range and original end of // the output are simply repeated. // 01 - Long backreference. The decompressor reads two bytes from the data and // byteswaps the resulting 16-bit value (that is, the low byte is read // first). The start offset (again, as a negative number) is the top 13 // bits of this value; the size is the low 3 bits of this value, plus 2. // If the size bits are all zero, an additional byte is read from the // data stream and 1 is added to it to determine the backreference size // (we call this an extended backreference). Therefore, the maximum // backreference size is 256 bytes. // Decompression ends when either there are no more input bytes to read, or // when a long backreference is read with all zeroes in its offset field. The // original implementation stops decompression successfully when any attempt // to read from the input encounters the end of the stream, but newserv's // implementation only allows this at the end of an opcode - if end-of-stream // is encountered partway through an opcode, we throw instead, because it's // likely the input has been truncated or is malformed in some way. StringWriter w; StringReader r(data, size); ControlStreamReader cr(r); while (!r.eof()) { // Control 1 = literal byte if (cr.read()) { if (max_output_size && w.size() == max_output_size) { if (allow_unterminated) { return {std::move(w.str()), r.where()}; } else { throw runtime_error("maximum output size exceeded"); } } w.put_u8(r.get_u8()); } else { ssize_t offset; size_t count; // Control 01 = long backreference if (cr.read()) { // The bits stored in the data stream are AAAAABBBCCCCCCCC, which we // rearrange into offset = CCCCCCCCAAAAA and size = BBB. uint16_t a = r.get_u8(); a |= (r.get_u8() << 8); offset = (a >> 3) | (~0x1FFF); // If offset is zero, it's a stop opcode if (offset == ~0x1FFF) { break; } // If the size field is zero, it's an extended backreference (size comes // from another byte in the data stream) count = (a & 7) ? ((a & 7) + 2) : (r.get_u8() + 1); // Control 00 = short backreference } else { // Count comes from 2 bits in the control stream instead of from the // data stream (and 2 is added). Importantly, the control stream bits // are read first - this may involve reading another control stream // byte, which happens before the offset is read from the data stream. count = cr.read() << 1; count = (count | cr.read()) + 2; offset = r.get_u8() | (~0xFF); } // Copy bytes from the referenced location in the output. Importantly, // copy only one byte at a time, in order to support ranges that cover the // current end of the output. size_t read_offset = w.size() + offset; if (read_offset >= w.size()) { throw runtime_error("backreference offset beyond beginning of output"); } for (size_t z = 0; z < count; z++) { if (max_output_size && w.size() == max_output_size) { if (allow_unterminated) { return {std::move(w.str()), r.where()}; } else { throw out_of_range("maximum output size exceeded"); } } w.put_u8(w.str()[read_offset + z]); } } } return {std::move(w.str()), r.where()}; } PRSDecompressResult prs_decompress_with_meta(const string& data, size_t max_output_size, bool allow_unterminated) { return prs_decompress_with_meta(data.data(), data.size(), max_output_size, allow_unterminated); } string prs_decompress(const void* data, size_t size, size_t max_output_size, bool allow_unterminated) { auto ret = prs_decompress_with_meta(data, size, max_output_size, allow_unterminated); return std::move(ret.data); } string prs_decompress(const string& data, size_t max_output_size, bool allow_unterminated) { auto ret = prs_decompress_with_meta(data.data(), data.size(), max_output_size, allow_unterminated); return std::move(ret.data); } size_t prs_decompress_size(const void* data, size_t size, size_t max_output_size, bool allow_unterminated) { size_t ret = 0; StringReader r(data, size); ControlStreamReader cr(r); while (!r.eof()) { if (cr.read()) { ret++; r.get_u8(); } else { ssize_t offset; size_t count; if (cr.read()) { uint16_t a = r.get_u8(); a |= (r.get_u8() << 8); offset = (a >> 3) | (~0x1FFF); if (offset == ~0x1FFF) { break; } count = (a & 7) ? ((a & 7) + 2) : (r.get_u8() + 1); } else { count = cr.read() << 1; count = (count | cr.read()) + 2; offset = r.get_u8() | (~0xFF); } size_t read_offset = ret + offset; if (read_offset >= ret) { throw runtime_error("backreference offset beyond beginning of output"); } ret += count; } if (max_output_size && ret > max_output_size) { if (allow_unterminated) { return max_output_size; } else { throw out_of_range("maximum output size exceeded"); } } } return ret; } size_t prs_decompress_size(const string& data, size_t max_output_size, bool allow_unterminated) { return prs_decompress_size(data.data(), data.size(), max_output_size, allow_unterminated); } void prs_disassemble(FILE* stream, const void* data, size_t size) { size_t output_bytes = 0; StringReader r(data, size); ControlStreamReader cr(r); while (!r.eof()) { if (cr.read()) { fprintf(stream, "[%zX] literal %02hhX\n", output_bytes, r.get_u8()); output_bytes++; } else { ssize_t offset; size_t count; const char* copy_type; if (cr.read()) { uint16_t a = r.get_u8(); a |= (r.get_u8() << 8); offset = (a >> 3) | (~0x1FFF); if (offset == ~0x1FFF) { fprintf(stream, "[%zX] end\n", output_bytes); break; } if (a & 7) { copy_type = "long"; count = (a & 7) + 2; } else { copy_type = "extended"; count = r.get_u8() + 1; } } else { copy_type = "short"; count = cr.read() << 1; count = (count | cr.read()) + 2; offset = r.get_u8() | (~0xFF); } size_t read_offset = output_bytes + offset; fprintf(stream, "[%zX] %s copy %zX\n", output_bytes, copy_type, count); if (read_offset >= output_bytes) { throw runtime_error("backreference offset beyond beginning of output"); } output_bytes += count; } } } void prs_disassemble(FILE* stream, const std::string& data) { return prs_disassemble(stream, data.data(), data.size()); } // BC0 is a compression algorithm fairly similar to PRS, but with a simpler set // of commands. Like PRS, there is a control stream, indicating when to copy a // literal byte from the input and when to copy from a backreference; unlike // PRS, there is only one type of backreference. Also, there is no stop opcode; // the decompressor simply stops when there are no more input bytes to read. struct BC0PathNode { uint16_t memo_offset = 0; uint8_t max_copy_size = 0; // Pathfinding state size_t from_offset = 0; size_t bits_used = static_cast(-1); // Stream generation state size_t to_offset = 0; }; string bc0_compress_optimal( const void* in_data_v, size_t in_size, ProgressCallback progress_fn) { const uint8_t* in_data = reinterpret_cast(in_data_v); vector nodes; nodes.resize(in_size + 1); nodes[0].bits_used = 0; // Populate all possible backreferences { WindowIndex<0x1000, 0x12> window(in_data_v, in_size); while (window.offset < in_size) { if ((window.offset & 0xFFF) == 0 && progress_fn) { progress_fn(CompressPhase::INDEX, window.offset, in_size, 0); } auto& node = nodes[window.offset]; auto match = window.get_best_match(); if (match.second >= 3) { node.memo_offset = (match.first - 0x12) & 0xFFF; node.max_copy_size = match.second; } window.advance(); } } // For each node, populate the literal value, and the best ways to get to the // following nodes for (size_t z = 0; z < in_size; z++) { if ((z & 0xFFF) == 0 && progress_fn) { progress_fn(CompressPhase::CONSTRUCT_PATHS, z, in_size, 0); } auto& node = nodes[z]; // Literal: 1 control bit + 1 data byte size_t bits_used = node.bits_used + 9; { auto& next_node = nodes[z + 1]; if (next_node.bits_used > bits_used) { next_node.from_offset = z; next_node.bits_used = bits_used; } } // Backreference: 1 control bit + 2 data bytes bits_used = node.bits_used + 17; for (size_t x = 3; x <= node.max_copy_size; x++) { auto& next_node = nodes[z + x]; if (next_node.bits_used > bits_used) { next_node.from_offset = z; next_node.bits_used = bits_used; } } } // Find the shortest path from the last node to the first node size_t last_progress_fn_call = static_cast(-1); for (size_t z = in_size; z > 0;) { if ((z & ~0xFFF) != (last_progress_fn_call & ~0xFFF)) { last_progress_fn_call = z; if (progress_fn) { progress_fn(CompressPhase::BACKTRACE_OPTIMAL_PATH, z, in_size, 0); } } size_t from_offset = nodes[z].from_offset; nodes[from_offset].to_offset = z; z = from_offset; } // Produce the BC0 command stream from the shortest path LZSSInterleavedWriter w; last_progress_fn_call = static_cast(-1); for (size_t offset = 0; offset < in_size;) { if ((offset & ~0xFFF) != (last_progress_fn_call & ~0xFFF)) { last_progress_fn_call = offset; if (progress_fn) { progress_fn(CompressPhase::GENERATE_RESULT, offset, in_size, w.size()); } } const auto& node = nodes[offset]; size_t copy_size = node.to_offset - offset; if (copy_size >= 3 && copy_size <= 0x12) { w.write_control(false); w.write_data(node.memo_offset & 0xFF); w.write_data(((node.memo_offset >> 4) & 0xF0) | (copy_size - 3)); } else if (copy_size == 1) { w.write_control(true); w.write_data(in_data[offset]); } w.flush_if_ready(); offset = node.to_offset; } return std::move(w.close()); } string bc0_compress(const string& data, ProgressCallback progress_fn) { return bc0_compress(data.data(), data.size(), progress_fn); } string bc0_compress(const void* in_data_v, size_t in_size, ProgressCallback progress_fn) { const uint8_t* in_data = reinterpret_cast(in_data_v); LZSSInterleavedWriter w; WindowIndex<0x1000, 0x12> window(in_data_v, in_size); size_t last_progress_fn_call_offset = 0; while (window.offset < in_size) { if (progress_fn && ((last_progress_fn_call_offset & ~0xFFF) != (window.offset & ~0xFFF))) { last_progress_fn_call_offset = window.offset; progress_fn(CompressPhase::GENERATE_RESULT, window.offset, in_size, w.size()); } auto match = window.get_best_match(); // Write a backreference if a match was found; otherwise, write a literal if (match.second >= 3) { w.write_control(false); size_t memo_offset = match.first - 0x12; w.write_data(memo_offset & 0xFF); w.write_data(((memo_offset >> 4) & 0xF0) | (match.second - 3)); } else { w.write_control(true); w.write_data(in_data[window.offset]); match.second = 1; } w.flush_if_ready(); for (size_t z = 0; z < match.second; z++) { window.advance(); } } return std::move(w.close()); } string bc0_encode(const void* in_data_v, size_t in_size) { const uint8_t* in_data = reinterpret_cast(in_data_v); LZSSInterleavedWriter w; for (size_t z = 0; z < in_size; z++) { w.write_control(true); w.write_data(in_data[z]); w.flush_if_ready(); } return std::move(w.close()); } // The BC0 decompression implementation in PSO GC is vulnerable to overflow // attacks - there is no bounds checking on the output buffer. It is unlikely // that this can be usefully exploited (e.g. for RCE) because the output pointer // is loaded from memory before every byte is written, so we cannot change the // output pointer to any arbitrary address. string bc0_decompress(const string& data) { return bc0_decompress(data.data(), data.size()); } string bc0_decompress(const void* data, size_t size) { StringReader r(data, size); StringWriter w; // Unlike PRS, BC0 uses a memo which "rolls over" every 0x1000 bytes. The // boundaries of these "memo pages" are offset by -0x12 bytes for some reason, // so the first output byte corresponds to position 0xFEE on the first memo // page. Backreferences refer to offsets based on the start of memo pages; for // example, if the current output offset is 0x1234, a backreference with // offset 0x123 refers to the byte that was written at offset 0x1111 (because // that byte is at offset 0x111 in the memo, because the memo rolls over every // 0x1000 bytes and the first memo byte was 0x12 bytes before the beginning of // the next page). The memo is initially zeroed from 0 to 0xFEE; it seems PSO // GC doesn't initialize the last 0x12 bytes of the first memo page. parray memo; uint16_t memo_offset = 0x0FEE; // The low byte of this value contains the control stream data; the high bits // specify which low bits are valid. When the last 1 is shifted out of the // high byte, we need to read a new control stream byte to get the next set of // control bits. uint16_t control_stream_bits = 0x0000; while (!r.eof()) { // Read control stream bits if needed control_stream_bits >>= 1; if ((control_stream_bits & 0x100) == 0) { control_stream_bits = 0xFF00 | r.get_u8(); if (r.eof()) { break; } } // Control bit 0 means to perform a backreference copy. The offset and // size are stored in two bytes in the input stream, laid out as follows: // a1 = 0bBBBBBBBB // a2 = 0bAAAACCCC // The offset is the concatenation of bits AAAABBBBBBBB, which refers to a // position in the memo; the number of bytes to copy is (CCCC + 3). The // decompressor copies that many bytes from that offset in the memo, and // writes them to the output and to the current position in the memo. if ((control_stream_bits & 1) == 0) { uint8_t a1 = r.get_u8(); if (r.eof()) { break; } uint8_t a2 = r.get_u8(); size_t count = (a2 & 0x0F) + 3; size_t backreference_offset = a1 | ((a2 << 4) & 0xF00); for (size_t z = 0; z < count; z++) { uint8_t v = memo[(backreference_offset + z) & 0x0FFF]; w.put_u8(v); memo[memo_offset] = v; memo_offset = (memo_offset + 1) & 0x0FFF; } // Control bit 1 means to write a byte directly from the input to the // output. As above, the byte is also written to the memo. } else { uint8_t v = r.get_u8(); w.put_u8(v); memo[memo_offset] = v; memo_offset = (memo_offset + 1) & 0x0FFF; } } return std::move(w.str()); } void bc0_disassemble(FILE* stream, const string& data) { bc0_disassemble(stream, data.data(), data.size()); } void bc0_disassemble(FILE* stream, const void* data, size_t size) { StringReader r(data, size); uint16_t control_stream_bits = 0x0000; size_t output_bytes = 0; while (!r.eof()) { // size_t opcode_offset = r.where(); control_stream_bits >>= 1; if ((control_stream_bits & 0x100) == 0) { control_stream_bits = 0xFF00 | r.get_u8(); if (r.eof()) { break; } } if ((control_stream_bits & 1) == 0) { uint8_t a1 = r.get_u8(); if (r.eof()) { break; } (void)a1; uint8_t a2 = r.get_u8(); size_t count = (a2 & 0x0F) + 3; // size_t backreference_offset = a1 | ((a2 << 4) & 0xF00); fprintf(stream, "[%zX] backreference %02zX\n", output_bytes, count); output_bytes += count; } else { fprintf(stream, "[%zX] literal %02hhX\n", output_bytes, r.get_u8()); output_bytes++; } } }