diff --git a/src/Main.cc b/src/Main.cc
index a9c0adbe..ea1eafc8 100644
--- a/src/Main.cc
+++ b/src/Main.cc
@@ -249,6 +249,10 @@ The actions are:\n\
   encode-text-archive [INPUT-FILENAME [OUTPUT-FILENAME]]\n\
     Decode a text archive (e.g. TextEnglish.pr2) to JSON for easy editing, or\n\
     encode a JSON file to a text archive.\n\
+  decode-unicode-text-set [INPUT-FILENAME [OUTPUT-FILENAME]]\n\
+  encode-unicode-text-set [INPUT-FILENAME [OUTPUT-FILENAME]]\n\
+    Decode a Unicode text set (e.g. unitxt_e.prs) to JSON for easy editing, or\n\
+    encode a JSON file to a Unicode text set.\n\
   format-rare-item-set [--json] [INPUT-FILENAME]\n\
     Print the contents of a rare item table in a human-readable format. If\n\
     --json is given, the input is parsed as a JSON rare item set (see\n\
@@ -1504,10 +1508,13 @@ int main(int argc, char** argv) {
       break;
     }
     case Behavior::DECODE_UNICODE_TEXT_SET: {
-      auto strings = parse_unicode_text_set(read_input_data());
+      auto collections = parse_unicode_text_set(read_input_data());
       JSON j = JSON::list();
-      for (const string& s : strings) {
-        j.emplace_back(s);
+      for (const auto& collection : collections) {
+        JSON& coll_j = j.emplace_back(JSON::list());
+        for (const auto& s : collection) {
+          coll_j.emplace_back(s);
+        }
       }
       string out_data = j.serialize(JSON::SerializeOption::FORMAT);
       write_output_data(out_data.data(), out_data.size());
@@ -1515,11 +1522,14 @@ int main(int argc, char** argv) {
     }
     case Behavior::ENCODE_UNICODE_TEXT_SET: {
       auto json = JSON::parse(read_input_data());
-      vector<string> strings;
-      for (const auto& s_json : json.as_list()) {
-        strings.emplace_back(s_json->as_string());
+      vector<vector<string>> collections;
+      for (const auto& coll_json : json.as_list()) {
+        auto& collection = collections.emplace_back();
+        for (const auto& s_json : coll_json->as_list()) {
+          collection.emplace_back(std::move(s_json->as_string()));
+        }
       }
-      string encoded = serialize_unicode_text_set(strings);
+      string encoded = serialize_unicode_text_set(collections);
       write_output_data(encoded.data(), encoded.size());
       break;
     }
diff --git a/src/UnicodeTextSet.cc b/src/UnicodeTextSet.cc
index 27283815..3972dc4f 100644
--- a/src/UnicodeTextSet.cc
+++ b/src/UnicodeTextSet.cc
@@ -8,35 +8,79 @@
 
 using namespace std;
 
-vector<string> parse_unicode_text_set(const string& prs_data) {
+// Parses a PRS-compressed Unicode text set. The decompressed data holds a u32
+// collection count, one u32 string count per collection, and then one u32
+// per string that is passed to r.sub() to locate its zero-terminated
+// little-endian u16 text. Returns the converted strings for each collection.
+vector<vector<string>> parse_unicode_text_set(const string& prs_data) {
   string data = prs_decompress(prs_data);
   StringReader r(data);
 
-  r.skip(4);
-  uint32_t count = r.get_u32l();
-  vector<string> ret;
-  while (ret.size() < count) {
-    u16string s(&r.pget<char16_t>(r.get_u32l()));
-    ret.emplace_back(tt_utf16_to_utf8(s.data(), s.size() * 2));
+  uint32_t num_collections = r.get_u32l();
+  deque<uint32_t> collection_sizes;
+  while (collection_sizes.size() < num_collections) {
+    collection_sizes.emplace_back(r.get_u32l());
+  }
+
+  vector<vector<string>> ret;
+  ret.reserve(collection_sizes.size());
+  while (!collection_sizes.empty()) {
+    uint32_t num_strings = collection_sizes.front();
+    collection_sizes.pop_front();
+
+    auto& strings = ret.emplace_back();
+    strings.reserve(num_strings);
+    while (strings.size() < num_strings) {
+      StringReader sub_r = r.sub(r.get_u32l());
+      StringWriter w;
+      for (uint16_t ch = sub_r.get_u16l(); ch != 0; ch = sub_r.get_u16l()) {
+        w.put_u16l(ch);
+      }
+      strings.emplace_back(tt_utf16_to_utf8(w.str()));
+    }
   }
   return ret;
 }
 
-string serialize_unicode_text_set(const vector<string>& strings) {
-  StringWriter w;
-  w.put_u32l(strings.size());
-  size_t string_offset = (strings.size() * 4) + 4; // Header size
-  for (const auto& s : strings) {
-    w.put_u32l(string_offset);
-    string_offset = (((s.size() + 1) << 1) + 3) & (~3);
+// Serializes collections into the layout read above: a u32 collection count,
+// one u32 string count per collection, one u32 offset per string, then the
+// string data (each entry ends with a zero u16 and is padded to 4 bytes).
+// Identical strings are stored once; no PRS compression is applied here.
+string serialize_unicode_text_set(const vector<vector<string>>& collections) {
+  StringWriter header_w;
+  StringWriter data_w;
+
+  size_t total_num_strings = 0;
+  header_w.put_u32l(collections.size());
+  for (const auto& collection : collections) {
+    header_w.put_u32l(collection.size());
+    total_num_strings += collection.size();
   }
-  for (const auto& s : strings) {
-    string s_utf16 = tt_utf8_to_utf16(s);
-    w.write(s_utf16.data(), s_utf16.size());
-    w.put_u16(0);
-    while (w.size() & 3) {
-      w.put_u8(0);
+
+  unordered_map<string, uint32_t> encoded;
+
+  size_t data_base_offset = (total_num_strings * 4) + header_w.size();
+  for (const auto& collection : collections) {
+    for (const auto& s : collection) {
+      auto encoded_it = encoded.find(s);
+      if (encoded_it == encoded.end()) {
+        uint32_t offset = data_base_offset + data_w.size();
+        encoded.emplace(s, offset);
+        // First occurrence: write its offset to the header as well, so every
+        // string gets exactly one table entry (duplicates reuse it below).
+        header_w.put_u32l(offset);
+        string s_utf16 = tt_utf8_to_utf16(s);
+        data_w.write(s_utf16.data(), s_utf16.size());
+        data_w.put_u16(0);
+        while (data_w.size() & 3) {
+          data_w.put_u8(0);
+        }
+      } else {
+        header_w.put_u32l(encoded_it->second);
+      }
     }
   }
-  return std::move(w.str());
+
+  header_w.write(data_w.str());
+  return std::move(header_w.str());
 }
diff --git a/src/UnicodeTextSet.hh b/src/UnicodeTextSet.hh
index abc77134..99d4e0e0 100644
--- a/src/UnicodeTextSet.hh
+++ b/src/UnicodeTextSet.hh
@@ -3,5 +3,5 @@
 #include <string>
 #include <vector>
 
-std::vector<std::string> parse_unicode_text_set(const std::string& prs_data);
-std::string serialize_unicode_text_set(const std::vector<std::string>& strings);
+std::vector<std::vector<std::string>> parse_unicode_text_set(const std::string& prs_data);
+std::string serialize_unicode_text_set(const std::vector<std::vector<std::string>>& collections);