fix unicode text set codec

This commit is contained in:
Martin Michelsen
2023-10-25 20:55:33 -07:00
parent ee4dade7ad
commit 5c2564336e
3 changed files with 73 additions and 30 deletions
+54 -21
View File
@@ -8,35 +8,68 @@
using namespace std;
// Decodes a PRS-compressed unicode text set into its string collections.
//
// After decompression, the data is read as:
//   u32l               number of collections
//   u32l[collections]  number of strings in each collection
//   u32l[...]          one offset per string, in collection order
// Each offset is handed to StringReader::sub() to locate a UTF-16LE string
// terminated by a zero code unit (presumably the offset is relative to the
// start of the decompressed data - confirm against StringReader::sub).
//
// Returns one vector of UTF-8 strings per collection, in file order. Any
// exception thrown by prs_decompress or the reader (e.g. on truncated data)
// propagates to the caller.
vector<vector<string>> parse_unicode_text_set(const string& prs_data) {
  string data = prs_decompress(prs_data);
  StringReader r(data);

  // All collection sizes come first; the per-string offsets follow them.
  uint32_t num_collections = r.get_u32l();
  vector<uint32_t> collection_sizes;
  while (collection_sizes.size() < num_collections) {
    collection_sizes.emplace_back(r.get_u32l());
  }

  vector<vector<string>> ret;
  ret.reserve(collection_sizes.size());
  for (uint32_t num_strings : collection_sizes) {
    auto& strings = ret.emplace_back();
    strings.reserve(num_strings);
    while (strings.size() < num_strings) {
      // Follow the next offset and copy code units up to (not including) the
      // terminating zero.
      StringReader sub_r = r.sub(r.get_u32l());
      StringWriter w;
      for (uint16_t ch = sub_r.get_u16l(); ch != 0; ch = sub_r.get_u16l()) {
        w.put_u16l(ch);
      }
      strings.emplace_back(tt_utf16_to_utf8(w.str()));
    }
  }
  return ret;
}
// Encodes string collections into the (uncompressed) unicode text set layout:
//   u32l               number of collections
//   u32l[collections]  number of strings in each collection
//   u32l[...]          one offset per string, in collection order
//   data               UTF-16 strings, each followed by a zero code unit and
//                      padded with zero bytes to a 4-byte boundary
// Offsets are measured from the start of the returned buffer. Identical
// strings are written once and share a single offset.
//
// NOTE(review): this function does not compress its output, while
// parse_unicode_text_set decompresses its input - presumably the caller
// compresses the result; confirm.
string serialize_unicode_text_set(const vector<vector<string>>& collections) {
  StringWriter header_w;
  StringWriter data_w;

  size_t total_num_strings = 0;
  header_w.put_u32l(collections.size());
  for (const auto& collection : collections) {
    header_w.put_u32l(collection.size());
    total_num_strings += collection.size();
  }

  // The string data begins right after the offset table, which has exactly
  // one entry per string (duplicates included).
  unordered_map<string, uint32_t> encoded;
  size_t data_base_offset = (total_num_strings * 4) + header_w.size();
  for (const auto& collection : collections) {
    for (const auto& s : collection) {
      uint32_t next_offset = data_base_offset + data_w.size();
      auto [encoded_it, is_new] = encoded.try_emplace(s, next_offset);
      if (is_new) {
        string s_utf16 = tt_utf8_to_utf16(s);
        data_w.write(s_utf16.data(), s_utf16.size());
        data_w.put_u16(0);
        while (data_w.size() & 3) {
          data_w.put_u8(0);
        }
      }
      // Every string needs an offset entry, whether it was just written or
      // was already present in the data section.
      header_w.put_u32l(encoded_it->second);
    }
  }

  header_w.write(data_w.str());
  return std::move(header_w.str());
}