show progress during slow prs and bc0 compression
This commit is contained in:
+94
-85
@@ -14,8 +14,9 @@ using namespace std;
|
||||
|
||||
|
||||
|
||||
PRSCompressor::PRSCompressor()
|
||||
: closed(false),
|
||||
PRSCompressor::PRSCompressor(function<void(size_t, size_t)> progress_fn)
|
||||
: progress_fn(progress_fn),
|
||||
closed(false),
|
||||
control_byte_offset(0),
|
||||
pending_control_bits(0),
|
||||
input_bytes(0),
|
||||
@@ -132,6 +133,9 @@ void PRSCompressor::advance() {
|
||||
}
|
||||
|
||||
for (size_t z = 0; z < advance_bytes; z++) {
|
||||
if ((this->compression_offset & 0x1000) && this->progress_fn) {
|
||||
this->progress_fn(this->compression_offset, this->output.size());
|
||||
}
|
||||
this->reverse_log[this->compression_offset & 0x1FFF] = this->forward_log[this->compression_offset & 0xFF];
|
||||
this->compression_offset++;
|
||||
}
|
||||
@@ -185,14 +189,16 @@ void PRSCompressor::flush_control() {
|
||||
|
||||
|
||||
|
||||
string prs_compress(const void* vdata, size_t size) {
|
||||
PRSCompressor prs;
|
||||
string prs_compress(
|
||||
const void* vdata, size_t size, function<void(size_t, size_t)> progress_fn) {
|
||||
PRSCompressor prs(progress_fn);
|
||||
prs.add(vdata, size);
|
||||
return move(prs.close());
|
||||
}
|
||||
|
||||
string prs_compress(const string& data) {
|
||||
return prs_compress(data.data(), data.size());
|
||||
string prs_compress(
|
||||
const string& data, function<void(size_t, size_t)> progress_fn) {
|
||||
return prs_compress(data.data(), data.size(), progress_fn);
|
||||
}
|
||||
|
||||
|
||||
@@ -373,85 +379,8 @@ size_t prs_decompress_size(const string& data, size_t max_output_size) {
|
||||
// PRS, there is only one type of backreference. Also, there is no stop opcode;
|
||||
// the decompressor simply stops when there are no more input bytes to read.
|
||||
|
||||
// The BC0 decompression implementation in PSO GC is vulnerable to overflow
|
||||
// attacks - there is no bounds checking on the output buffer. It is unlikely
|
||||
// that this can be usefully exploited (e.g. for RCE) because the output pointer
|
||||
// is checked before every byte is written, so we cannot change the output
|
||||
// pointer to any arbitrary address.
|
||||
|
||||
string bc0_decompress(const string& data) {
|
||||
StringReader r(data);
|
||||
StringWriter w;
|
||||
|
||||
// Unlike PRS, BC0 uses a memo which "rolls over" every 0x1000 bytes. The
|
||||
// boundaries of these "memo pages" are offset by -0x12 bytes for some reason,
|
||||
// so the first output byte corresponds to position 0xFEE on the first memo
|
||||
// page. Backreferences refer to offsets based on the start of memo pages; for
|
||||
// example, if the current output offset is 0x1234, a backreference with
|
||||
// offset 0x123 refers to the byte that was written at offset 0x1112 (because
|
||||
// that byte is at offset 0x112 in the memo, because the memo rolls over every
|
||||
// 0x1000 bytes and the first memo byte was 0x12 bytes before the beginning of
|
||||
// the next page). The memo is initially zeroed from 0 to 0xFEE; it seems PSO
|
||||
// GC doesn't initialize the last 0x12 bytes of the first memo page. For this
|
||||
// reason, we avoid generating backreferences that refer to those bytes.
|
||||
parray<uint8_t, 0x1000> memo;
|
||||
uint16_t memo_offset = 0x0FEE;
|
||||
|
||||
// The low byte of this value contains the control stream data; the high bits
|
||||
// specify which low bits are valid. When the last 1 is shifted out of the
|
||||
// high bit, we need to read a new control stream byte to get the next set of
|
||||
// control bits.
|
||||
uint16_t control_stream_bits = 0x0000;
|
||||
|
||||
while (!r.eof()) {
|
||||
// Read control stream bits if needed
|
||||
control_stream_bits >>= 1;
|
||||
if ((control_stream_bits & 0x100) == 0) {
|
||||
control_stream_bits = 0xFF00 | r.get_u8();
|
||||
if (r.eof()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Control bit 0 means to perform a backreference copy. The offset and
|
||||
// size are stored in two bytes in the input stream, laid out as follows:
|
||||
// a1 = 0bBBBBBBBB
|
||||
// a2 = 0bAAAACCCC
|
||||
// The offset is the concatenation of bits AAAABBBBBBBB, which refers to a
|
||||
// position in the memo; the number of bytes to copy is (CCCC + 3). The
|
||||
// decompressor copies that many bytes from that offset in the memo, and
|
||||
// writes them to the output and to the current position in the memo.
|
||||
if ((control_stream_bits & 1) == 0) {
|
||||
uint8_t a1 = r.get_u8();
|
||||
if (r.eof()) {
|
||||
break;
|
||||
}
|
||||
uint8_t a2 = r.get_u8();
|
||||
size_t count = (a2 & 0x0F) + 3;
|
||||
size_t backreference_offset = a1 | ((a2 << 4) & 0xF00);
|
||||
for (size_t z = 0; z < count; z++) {
|
||||
uint8_t v = memo[(backreference_offset + z) & 0x0FFF];
|
||||
w.put_u8(v);
|
||||
memo[memo_offset] = v;
|
||||
memo_offset = (memo_offset + 1) & 0x0FFF;
|
||||
}
|
||||
|
||||
// Control bit 1 means to write a byte directly from the input to the
|
||||
// output. As above, the byte is also written to the memo.
|
||||
} else {
|
||||
uint8_t v = r.get_u8();
|
||||
w.put_u8(v);
|
||||
memo[memo_offset] = v;
|
||||
memo_offset = (memo_offset + 1) & 0x0FFF;
|
||||
}
|
||||
}
|
||||
|
||||
return move(w.str());
|
||||
}
|
||||
|
||||
|
||||
|
||||
string bc0_compress(const string& data) {
|
||||
string bc0_compress(
|
||||
const string& data, function<void(size_t, size_t)> progress_fn) {
|
||||
StringReader r(data);
|
||||
StringWriter w;
|
||||
|
||||
@@ -464,6 +393,10 @@ string bc0_compress(const string& data) {
|
||||
|
||||
parray<uint8_t, 17> match_buf;
|
||||
while (!r.eof()) {
|
||||
if ((r.where() & 0x1000) && progress_fn) {
|
||||
progress_fn(r.where(), w.size());
|
||||
}
|
||||
|
||||
// Search in the memo for the longest string matching the upcoming data, of
|
||||
// size 3-17 bytes
|
||||
size_t best_match_offset = 0;
|
||||
@@ -545,3 +478,79 @@ string bc0_compress(const string& data) {
|
||||
|
||||
return move(w.str());
|
||||
}
|
||||
|
||||
// The BC0 decompression implementation in PSO GC is vulnerable to overflow
|
||||
// attacks - there is no bounds checking on the output buffer. It is unlikely
|
||||
// that this can be usefully exploited (e.g. for RCE) because the output pointer
|
||||
// is checked before every byte is written, so we cannot change the output
|
||||
// pointer to any arbitrary address.
|
||||
|
||||
string bc0_decompress(const string& data) {
|
||||
StringReader r(data);
|
||||
StringWriter w;
|
||||
|
||||
// Unlike PRS, BC0 uses a memo which "rolls over" every 0x1000 bytes. The
|
||||
// boundaries of these "memo pages" are offset by -0x12 bytes for some reason,
|
||||
// so the first output byte corresponds to position 0xFEE on the first memo
|
||||
// page. Backreferences refer to offsets based on the start of memo pages; for
|
||||
// example, if the current output offset is 0x1234, a backreference with
|
||||
// offset 0x123 refers to the byte that was written at offset 0x1112 (because
|
||||
// that byte is at offset 0x112 in the memo, because the memo rolls over every
|
||||
// 0x1000 bytes and the first memo byte was 0x12 bytes before the beginning of
|
||||
// the next page). The memo is initially zeroed from 0 to 0xFEE; it seems PSO
|
||||
// GC doesn't initialize the last 0x12 bytes of the first memo page. For this
|
||||
// reason, we avoid generating backreferences that refer to those bytes.
|
||||
parray<uint8_t, 0x1000> memo;
|
||||
uint16_t memo_offset = 0x0FEE;
|
||||
|
||||
// The low byte of this value contains the control stream data; the high bits
|
||||
// specify which low bits are valid. When the last 1 is shifted out of the
|
||||
// high bit, we need to read a new control stream byte to get the next set of
|
||||
// control bits.
|
||||
uint16_t control_stream_bits = 0x0000;
|
||||
|
||||
while (!r.eof()) {
|
||||
// Read control stream bits if needed
|
||||
control_stream_bits >>= 1;
|
||||
if ((control_stream_bits & 0x100) == 0) {
|
||||
control_stream_bits = 0xFF00 | r.get_u8();
|
||||
if (r.eof()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Control bit 0 means to perform a backreference copy. The offset and
|
||||
// size are stored in two bytes in the input stream, laid out as follows:
|
||||
// a1 = 0bBBBBBBBB
|
||||
// a2 = 0bAAAACCCC
|
||||
// The offset is the concatenation of bits AAAABBBBBBBB, which refers to a
|
||||
// position in the memo; the number of bytes to copy is (CCCC + 3). The
|
||||
// decompressor copies that many bytes from that offset in the memo, and
|
||||
// writes them to the output and to the current position in the memo.
|
||||
if ((control_stream_bits & 1) == 0) {
|
||||
uint8_t a1 = r.get_u8();
|
||||
if (r.eof()) {
|
||||
break;
|
||||
}
|
||||
uint8_t a2 = r.get_u8();
|
||||
size_t count = (a2 & 0x0F) + 3;
|
||||
size_t backreference_offset = a1 | ((a2 << 4) & 0xF00);
|
||||
for (size_t z = 0; z < count; z++) {
|
||||
uint8_t v = memo[(backreference_offset + z) & 0x0FFF];
|
||||
w.put_u8(v);
|
||||
memo[memo_offset] = v;
|
||||
memo_offset = (memo_offset + 1) & 0x0FFF;
|
||||
}
|
||||
|
||||
// Control bit 1 means to write a byte directly from the input to the
|
||||
// output. As above, the byte is also written to the memo.
|
||||
} else {
|
||||
uint8_t v = r.get_u8();
|
||||
w.put_u8(v);
|
||||
memo[memo_offset] = v;
|
||||
memo_offset = (memo_offset + 1) & 0x0FFF;
|
||||
}
|
||||
}
|
||||
|
||||
return move(w.str());
|
||||
}
|
||||
|
||||
+7
-5
@@ -3,6 +3,7 @@
|
||||
#include <stddef.h>
|
||||
|
||||
#include <string>
|
||||
#include <functional>
|
||||
|
||||
#include "Text.hh"
|
||||
|
||||
@@ -15,7 +16,7 @@ class PRSCompressor {
|
||||
public:
|
||||
// To use this class, instantiate it, then call .add() one or more times, then
|
||||
// call .close() and use the returned string as the compressed result.
|
||||
PRSCompressor();
|
||||
PRSCompressor(std::function<void(size_t, size_t)> progress_fn = nullptr);
|
||||
~PRSCompressor() = default;
|
||||
|
||||
// Adds more input data to be compressed, which logically comes after all
|
||||
@@ -39,6 +40,7 @@ private:
|
||||
void write_control(bool z);
|
||||
void flush_control();
|
||||
|
||||
std::function<void(size_t, size_t)> progress_fn;
|
||||
bool closed;
|
||||
|
||||
size_t control_byte_offset;
|
||||
@@ -55,8 +57,8 @@ private:
|
||||
// Compresses data from a single input buffer using PRS and returns the
|
||||
// compressed result. This is a shortcut for constructing a PRSCompressor,
|
||||
// calling .add() once, and calling .close().
|
||||
std::string prs_compress(const void* vdata, size_t size);
|
||||
std::string prs_compress(const std::string& data);
|
||||
std::string prs_compress(const void* vdata, size_t size, std::function<void(size_t, size_t)> progress_fn = nullptr);
|
||||
std::string prs_compress(const std::string& data, std::function<void(size_t, size_t)> progress_fn = nullptr);
|
||||
|
||||
// Decompresses PRS-compressed data.
|
||||
std::string prs_decompress(const void* data, size_t size, size_t max_output_size = 0);
|
||||
@@ -67,6 +69,6 @@ std::string prs_decompress(const std::string& data, size_t max_output_size = 0);
|
||||
size_t prs_decompress_size(const void* data, size_t size, size_t max_output_size = 0);
|
||||
size_t prs_decompress_size(const std::string& data, size_t max_output_size = 0);
|
||||
|
||||
// Decompresses and compresses data using the BC0 algorithm.
|
||||
// Compresses and decompresses data using the BC0 algorithm.
|
||||
std::string bc0_compress(const std::string& data, std::function<void(size_t, size_t)> progress_fn = nullptr);
|
||||
std::string bc0_decompress(const std::string& data);
|
||||
std::string bc0_compress(const std::string& data);
|
||||
|
||||
+12
-4
@@ -590,19 +590,27 @@ int main(int argc, char** argv) {
|
||||
case Behavior::DECOMPRESS_BC0: {
|
||||
string data = read_input_data();
|
||||
size_t input_bytes = data.size();
|
||||
auto progress_fn = [&](size_t input_progress, size_t output_progress) -> void {
|
||||
float progress = static_cast<float>(input_progress * 100) / input_bytes;
|
||||
float size_ratio = static_cast<float>(output_progress * 100) / input_progress;
|
||||
fprintf(stderr, "... %zu (%g%%) <= %zu/%zu (%g%%) \r",
|
||||
output_progress, size_ratio, input_progress, input_bytes, progress);
|
||||
};
|
||||
|
||||
if (behavior == Behavior::COMPRESS_PRS) {
|
||||
data = prs_compress(data);
|
||||
data = prs_compress(data, progress_fn);
|
||||
} else if (behavior == Behavior::DECOMPRESS_PRS) {
|
||||
data = prs_decompress(data);
|
||||
} else if (behavior == Behavior::COMPRESS_BC0) {
|
||||
data = bc0_compress(data);
|
||||
data = bc0_compress(data, progress_fn);
|
||||
} else if (behavior == Behavior::DECOMPRESS_BC0) {
|
||||
data = bc0_decompress(data);
|
||||
} else {
|
||||
throw logic_error("invalid behavior");
|
||||
}
|
||||
log_info("%zu (0x%zX) bytes input => %zu (0x%zX) bytes output",
|
||||
input_bytes, input_bytes, data.size(), data.size());
|
||||
float size_ratio = static_cast<float>(data.size() * 100) / input_bytes;
|
||||
log_info("%zu (0x%zX) bytes input => %zu (0x%zX) bytes output (%g%%)",
|
||||
input_bytes, input_bytes, data.size(), data.size(), size_ratio);
|
||||
|
||||
write_output_data(data.data(), data.size());
|
||||
break;
|
||||
|
||||
Reference in New Issue
Block a user