support compression levels in prs

This commit is contained in:
Martin Michelsen
2023-05-27 13:39:57 -07:00
parent 54a734e049
commit afd93047c1
3 changed files with 224 additions and 114 deletions
+134 -105
View File
@@ -12,14 +12,14 @@
using namespace std;
PRSCompressor::PRSCompressor(function<void(size_t, size_t)> progress_fn)
: progress_fn(progress_fn),
PRSCompressor::PRSCompressor(
size_t compression_level, function<void(size_t, size_t)> progress_fn)
: compression_level(compression_level),
progress_fn(progress_fn),
closed(false),
control_byte_offset(0),
pending_control_bits(0),
input_bytes(0),
compression_offset(0),
reverse_log_index(0x100) {
input_bytes(0) {
this->output.put_u8(0);
}
@@ -39,10 +39,10 @@ void PRSCompressor::add(const string& data) {
}
void PRSCompressor::add_byte(uint8_t v) {
if (this->compression_offset + 0x100 <= this->input_bytes) {
if (this->reverse_log.end_offset() + this->forward_log.data.size() <= this->input_bytes) {
this->advance();
}
this->forward_log[this->input_bytes & 0xFF] = v;
this->forward_log.at(this->input_bytes) = v;
this->input_bytes++;
}
@@ -50,116 +50,130 @@ void PRSCompressor::advance() {
// Search for a match in the decompressed data history
size_t best_match_size = 0;
size_t best_match_offset = 0;
uint8_t first_v = this->forward_log[this->compression_offset & 0xFF];
const auto& start_offsets = this->reverse_log_index[first_v];
for (auto it = start_offsets.begin(); (it != start_offsets.end()) && (best_match_size < 0x100); it++) {
size_t match_offset = *it;
if (match_offset == this->compression_offset - 0x2000) {
continue;
size_t best_match_literals = 0;
for (size_t num_literals = 0; num_literals < this->compression_level; num_literals++) {
for (size_t z = 0; z < num_literals; z++) {
this->reverse_log.push_back(this->forward_log.at(this->reverse_log.end_offset()));
}
size_t match_size = 0;
size_t match_loop_bytes = this->compression_offset - match_offset;
while ((match_size < 0x100) &&
(this->compression_offset + match_size < this->input_bytes) &&
(this->reverse_log[(match_offset + (match_size % match_loop_bytes)) & 0x1FFF] == this->forward_log[(this->compression_offset + match_size) & 0xFF])) {
match_size++;
}
size_t compression_offset = reverse_log.end_offset();
uint8_t first_v = this->forward_log.at(compression_offset);
const auto& start_offsets = this->reverse_log.find(first_v);
// If there are multiple matches of the longest length, use the latest one,
// since it's more likely that it can be expressed as a short copy instead
// of a long copy.
if (match_size >= best_match_size) {
best_match_offset = match_offset;
best_match_size = match_size;
for (auto it = start_offsets.begin(); (it != start_offsets.end()) && (best_match_size < 0x100); it++) {
size_t match_offset = *it;
if (match_offset + 0x2000 <= compression_offset) {
continue;
}
size_t match_size = 0;
size_t match_loop_bytes = compression_offset - match_offset;
while ((match_size < 0x100) &&
(compression_offset + match_size < this->input_bytes) &&
(this->reverse_log.at(match_offset + (match_size % match_loop_bytes)) == this->forward_log.at(compression_offset + match_size))) {
match_size++;
}
// If there are multiple matches of the longest length, use the latest one,
// since it's more likely that it can be expressed as a short copy instead
// of a long copy.
if (match_size >= (best_match_size + best_match_literals)) {
best_match_offset = match_offset;
best_match_size = match_size;
best_match_literals = num_literals;
}
}
for (size_t z = 0; z < num_literals; z++) {
this->reverse_log.pop_back();
}
}
// If there is a suitable match, write a backreference
bool should_write_literal = false;
size_t advance_bytes = 0;
ssize_t backreference_offset = best_match_offset - this->compression_offset;
// If the best match has literals preceding it, write those literals
for (size_t z = 0; z < best_match_literals; z++) {
this->advance_literal();
}
// If there is a suitable match, write a backreference; otherwise, write a
// literal. The backreference should be encoded:
// - As a short copy if offset in [-0x100, -1] and size in [2, 5]
// - As a long copy if offset in [-0x1FFF, -1] and size in [3, 9]
// - As an extended copy if offset in [-0x1FFF, -1] and size in [10, 0x100]
// Technically an extended copy can be used for sizes 1-9 as well, but if
// size is 1 or 2, writing literals is better (since it uses fewer data
// bytes and control bits), and a long copy can cover sizes 3-9 (and also
// uses fewer data bytes and control bits).
ssize_t backreference_offset = best_match_offset - this->reverse_log.end_offset();
if (best_match_size < 2) {
should_write_literal = true;
// The match is too small; a literal would use fewer bits
this->advance_literal();
} else if ((backreference_offset >= -0x100) && (best_match_size <= 5)) {
this->advance_short_copy(backreference_offset, best_match_size);
} else if (best_match_size < 3) {
// We can't use a long copy for size 2, and it's not worth it to use an
// extended copy for this either (as noted above), so write a literal
this->advance_literal();
} else if ((backreference_offset >= -0x1FFF) && (best_match_size <= 9)) {
this->advance_long_copy(backreference_offset, best_match_size);
} else if ((backreference_offset >= -0x1FFF) && (best_match_size <= 0x100)) {
this->advance_extended_copy(backreference_offset, best_match_size);
} else {
// The backreference should be encoded:
// - As a short copy if offset in [-0x100, -1] and size in [2, 5]
// - As a long copy if offset in [-0x1FFF, -1] and size in [3, 9]
// - As an extended copy if offset in [-0x1FFF, -1] and size in [10, 0x100]
// Technically an extended copy can be used for sizes 1-9 as well, but if
// size is 1 or 2, writing literals is better (since it uses fewer data
// bytes and control bits), and a long copy can cover sizes 3-9 (and also
// uses fewer data bytes and control bits).
throw logic_error("invalid best match");
}
}
if ((backreference_offset >= -0x100) && (best_match_size <= 5)) {
// Write short copy
uint8_t size = best_match_size - 2;
this->write_control(false);
this->write_control(false);
this->write_control(size & 2);
this->write_control(size & 1);
this->output.put_u8(backreference_offset & 0xFF);
advance_bytes = best_match_size;
} else if (best_match_size < 3) {
// Can't use a long copy for size 2, and it's not worth it to use extended
// copy for this either (as noted above)
should_write_literal = true;
} else if ((backreference_offset >= -0x1FFF) && (best_match_size <= 9)) {
// Write long copy
this->write_control(false);
this->write_control(true);
uint16_t a = (backreference_offset << 3) | (best_match_size - 2);
this->output.put_u8(a & 0xFF);
this->output.put_u8(a >> 8);
advance_bytes = best_match_size;
} else if ((backreference_offset >= -0x1FFF) && (best_match_size <= 0x100)) {
// Write extended copy
this->write_control(false);
this->write_control(true);
uint16_t a = (backreference_offset << 3);
this->output.put_u8(a & 0xFF);
this->output.put_u8(a >> 8);
this->output.put_u8(best_match_size - 1);
advance_bytes = best_match_size;
} else {
throw logic_error("invalid best match");
void PRSCompressor::move_forward_data_to_reverse_log(size_t size) {
for (; size > 0; size--) {
this->reverse_log.push_back(this->forward_log.at(this->reverse_log.end_offset()));
if (this->progress_fn && ((this->reverse_log.end_offset() & 0xFFF) == 0)) {
this->progress_fn(this->reverse_log.end_offset(), this->output.size());
}
}
}
if (should_write_literal) {
this->write_control(true);
this->output.put_u8(this->forward_log[this->compression_offset & 0xFF]);
advance_bytes = 1;
}
void PRSCompressor::advance_literal() {
this->write_control(true);
this->output.put_u8(this->forward_log.at(this->reverse_log.end_offset()));
this->move_forward_data_to_reverse_log(1);
}
for (size_t z = 0; z < advance_bytes; z++) {
if ((this->compression_offset & 0x1000) && this->progress_fn) {
this->progress_fn(this->compression_offset, this->output.size());
}
size_t reverse_log_offset = this->compression_offset & 0x1FFF;
uint8_t existing_v = this->reverse_log[reverse_log_offset];
uint8_t new_v = this->forward_log[this->compression_offset & 0xFF];
void PRSCompressor::advance_short_copy(ssize_t offset, size_t size) {
uint8_t encoded_size = size - 2;
this->write_control(false);
this->write_control(false);
this->write_control(encoded_size & 2);
this->write_control(encoded_size & 1);
this->output.put_u8(offset & 0xFF);
this->move_forward_data_to_reverse_log(size);
}
if (this->compression_offset & (~0x1FFF)) {
this->reverse_log_index[existing_v].pop_front();
}
this->reverse_log[reverse_log_offset] = new_v;
this->reverse_log_index[new_v].emplace_back(this->compression_offset);
this->compression_offset++;
}
void PRSCompressor::advance_long_copy(ssize_t offset, size_t size) {
this->write_control(false);
this->write_control(true);
uint16_t a = (offset << 3) | (size - 2);
this->output.put_u8(a & 0xFF);
this->output.put_u8(a >> 8);
this->move_forward_data_to_reverse_log(size);
}
void PRSCompressor::advance_extended_copy(ssize_t offset, size_t size) {
this->write_control(false);
this->write_control(true);
uint16_t a = (offset << 3);
this->output.put_u8(a & 0xFF);
this->output.put_u8(a >> 8);
this->output.put_u8(size - 1);
this->move_forward_data_to_reverse_log(size);
}
string& PRSCompressor::close() {
if (!this->closed) {
// Advance until all input is consumed
while (this->compression_offset < this->input_bytes) {
while (this->reverse_log.end_offset() < this->input_bytes) {
this->advance();
}
// Write stop command
@@ -203,15 +217,20 @@ void PRSCompressor::flush_control() {
}
string prs_compress(
const void* vdata, size_t size, function<void(size_t, size_t)> progress_fn) {
PRSCompressor prs(progress_fn);
const void* vdata,
size_t size,
size_t compression_level,
function<void(size_t, size_t)> progress_fn) {
PRSCompressor prs(compression_level, progress_fn);
prs.add(vdata, size);
return std::move(prs.close());
}
string prs_compress(
const string& data, function<void(size_t, size_t)> progress_fn) {
return prs_compress(data.data(), data.size(), progress_fn);
const string& data,
size_t compression_level,
function<void(size_t, size_t)> progress_fn) {
return prs_compress(data.data(), data.size(), compression_level, progress_fn);
}
class ControlStreamReader {
@@ -229,6 +248,14 @@ public:
return ret;
}
uint8_t buffered_bits() const {
uint16_t z = this->bits;
uint8_t ret = 0;
for (; z & 0x0100; z >>= 1, ret++) {
}
return ret;
}
private:
StringReader& r;
uint16_t bits;
@@ -391,8 +418,10 @@ void prs_disassemble(FILE* stream, const void* data, size_t size) {
while (!r.eof()) {
size_t r_offset = r.where();
uint8_t buffered_bits = cr.buffered_bits();
size_t input_bits = 8 * r_offset + (buffered_bits ? (8 - buffered_bits) : 0);
if (cr.read()) {
fprintf(stream, "[%zX => %zX] literal %02hhX\n", r_offset, output_bytes, r.get_u8());
fprintf(stream, "[%zX / %zX => %zX] literal %02hhX\n", r_offset, input_bits, output_bytes, r.get_u8());
output_bytes++;
} else {
@@ -405,7 +434,7 @@ void prs_disassemble(FILE* stream, const void* data, size_t size) {
a |= (r.get_u8() << 8);
offset = (a >> 3) | (~0x1FFF);
if (offset == ~0x1FFF) {
fprintf(stream, "[%zX => %zX] end\n", r_offset, output_bytes);
fprintf(stream, "[%zX / %zX => %zX] end\n", r_offset, input_bits, output_bytes);
break;
}
count = (a & 7) ? ((a & 7) + 2) : (r.get_u8() + 1);
@@ -417,8 +446,8 @@ void prs_disassemble(FILE* stream, const void* data, size_t size) {
}
size_t read_offset = output_bytes + offset;
fprintf(stream, "[%zX => %zX] %s copy -%zX (from %zX) %zX\n",
r_offset, output_bytes, is_long_copy ? "long" : "short",
fprintf(stream, "[%zX / %zX => %zX] %s copy -%zX (from %zX) %zX\n",
r_offset, input_bits, output_bytes, is_long_copy ? "long" : "short",
-offset, read_offset, count);
if (read_offset >= output_bytes) {
+80 -7
View File
@@ -15,7 +15,7 @@ class PRSCompressor {
public:
// To use this class, instantiate it, then call .add() one or more times, then
// call .close() and use the returned string as the compressed result.
PRSCompressor(std::function<void(size_t, size_t)> progress_fn = nullptr);
explicit PRSCompressor(size_t compression_level = 1, std::function<void(size_t, size_t)> progress_fn = nullptr);
~PRSCompressor() = default;
// Adds more input data to be compressed, which logically comes after all
@@ -34,11 +34,79 @@ public:
}
private:
template <size_t Size>
struct WrappedLog {
parray<uint8_t, Size> data;
WrappedLog() : data(0) {}
~WrappedLog() = default;
inline uint8_t at(size_t offset) const {
return this->data[offset % this->data.size()];
}
inline uint8_t& at(size_t offset) {
return this->data[offset % this->data.size()];
}
};
template <size_t Size>
struct IndexedLog : WrappedLog<Size> {
size_t offset;
size_t size;
std::array<std::deque<size_t>, 0x100> index;
IndexedLog()
: WrappedLog<Size>(),
offset(0),
size(0) {}
~IndexedLog() = default;
inline size_t end_offset() const {
return this->offset + this->size;
}
void push_back(uint8_t v) {
if (this->size == Size) {
this->pop_front();
}
size_t write_offset = this->offset + this->size;
this->at(write_offset) = v;
this->index[v].push_back(write_offset);
this->size++;
}
uint8_t pop_back() {
if (!this->size) {
throw std::logic_error("pop_back called on empty IndexedLog");
}
this->size--;
size_t offset = this->offset + this->size;
uint8_t v = this->at(offset);
this->index[v].pop_back();
return v;
}
uint8_t pop_front() {
uint8_t v = this->at(this->offset);
this->index[v].pop_front();
this->offset++;
this->size--;
return v;
}
const std::deque<size_t>& find(uint8_t v) {
return this->index[v];
}
};
void add_byte(uint8_t v);
void advance();
void move_forward_data_to_reverse_log(size_t size);
void advance_literal();
void advance_short_copy(ssize_t offset, size_t size);
void advance_long_copy(ssize_t offset, size_t size);
void advance_extended_copy(ssize_t offset, size_t size);
void write_control(bool z);
void flush_control();
size_t compression_level;
std::function<void(size_t, size_t)> progress_fn;
bool closed;
@@ -46,10 +114,8 @@ private:
uint16_t pending_control_bits;
size_t input_bytes;
parray<uint8_t, 0x100> forward_log;
size_t compression_offset;
parray<uint8_t, 0x2000> reverse_log;
std::vector<std::deque<size_t>> reverse_log_index;
WrappedLog<0x101> forward_log;
IndexedLog<0x2000> reverse_log;
StringWriter output;
};
@@ -57,8 +123,15 @@ private:
// Compresses data from a single input buffer using PRS and returns the
// compressed result. This is a shortcut for constructing a PRSCompressor,
// calling .add() once, and calling .close().
std::string prs_compress(const void* vdata, size_t size, std::function<void(size_t, size_t)> progress_fn = nullptr);
std::string prs_compress(const std::string& data, std::function<void(size_t, size_t)> progress_fn = nullptr);
std::string prs_compress(
const void* vdata,
size_t size,
size_t compression_level = 1,
std::function<void(size_t, size_t)> progress_fn = nullptr);
std::string prs_compress(
const std::string& data,
size_t compression_level = 1,
std::function<void(size_t, size_t)> progress_fn = nullptr);
// Decompresses PRS-compressed data.
std::string prs_decompress(const void* data, size_t size, size_t max_output_size = 0);
+10 -2
View File
@@ -98,7 +98,12 @@ The actions are:\n\
decompress-prs [INPUT-FILENAME [OUTPUT-FILENAME]]\n\
compress-bc0 [INPUT-FILENAME [OUTPUT-FILENAME]]\n\
decompress-bc0 [INPUT-FILENAME [OUTPUT-FILENAME]]\n\
Compress or decompress data using the PRS or BC0 algorithms.\n\
Compress or decompress data using the PRS or BC0 algorithms. When\n\
compressing with PRS, the --compression-level=N option (default 1)\n\
specifies how aggressive the compressor should be in searching for literal\n\
sequences. A higher value generally means slower compression and a smaller\n\
output size. If 0 is given, the data is PRS-encoded but not actually\n\
compressed, resulting in valid PRS data which is larger than the input.\n\
prs-size [INPUT-FILENAME]\n\
Compute the decompressed size of the PRS-compressed input data, but don\'t\n\
write the decompressed data anywhere.\n\
@@ -292,6 +297,7 @@ int main(int argc, char** argv) {
size_t stride = 1;
size_t num_threads = 0;
size_t bytes = 0;
size_t prs_compression_level = 1;
const char* find_decryption_seed_ciphertext = nullptr;
vector<const char*> find_decryption_seed_plaintexts;
const char* input_filename = nullptr;
@@ -321,6 +327,8 @@ int main(int argc, char** argv) {
cli_version = GameVersion::XB;
} else if (!strcmp(argv[x], "--bb")) {
cli_version = GameVersion::BB;
} else if (!strncmp(argv[x], "--compression-level=", 20)) {
prs_compression_level = strtoull(&argv[x][20], nullptr, 0);
} else if (!strcmp(argv[x], "--round2")) {
round2 = true;
} else if (!strncmp(argv[x], "--bytes=", 8)) {
@@ -531,7 +539,7 @@ int main(int argc, char** argv) {
uint64_t start = now();
if (behavior == Behavior::COMPRESS_PRS) {
data = prs_compress(data, progress_fn);
data = prs_compress(data, prs_compression_level, progress_fn);
} else if (behavior == Behavior::DECOMPRESS_PRS) {
data = prs_decompress(data);
} else if (behavior == Behavior::COMPRESS_BC0) {