handle PSO font characters properly

This commit is contained in:
Martin Michelsen
2024-04-07 13:03:11 -07:00
parent e141642dd6
commit f05dc6d9f9
5 changed files with 142 additions and 12 deletions
+111 -12
View File
@@ -139,28 +139,127 @@ std::string TextTranscoder::on_untranslatable(const void**, size_t*) const {
TextTranscoderCustomSJISToUTF8::TextTranscoderCustomSJISToUTF8() : TextTranscoder("UTF-8", "SHIFT_JIS") {}
std::string TextTranscoderCustomSJISToUTF8::on_untranslatable(const void** src, size_t* size) const {
// Sega implemented a single nonstandard Shift-JIS character on PSO GC (and
// probably XB as well): the heart symbol, encoded as F040. Understandably,
// libiconv doesn't know what to do with it because it's not actually part of
// Shift-JIS, so we have to handle it manually here.
if ((*size >= 2) && !memcmp(*src, "\xF0\x40", 2)) {
*src = reinterpret_cast<const char*>(*src) + 2;
*size -= 2;
return "\xE2\x99\xA5";
std::string encode_utf8_char(uint32_t ch) {
string ret;
if (ch < 0x80) {
ret.push_back(ch);
} else if (ch < 0x800) {
ret.push_back(0xC0 | (ch >> 6));
ret.push_back(0x80 | (ch & 0x3F));
} else if (ch < 0x10000) {
ret.push_back(0xE0 | (ch >> 12));
ret.push_back(0x80 | ((ch >> 6) & 0x3F));
ret.push_back(0x80 | (ch & 0x3F));
} else if (ch < 0x110000) {
ret.push_back(0xF0 | (ch >> 18));
ret.push_back(0x80 | ((ch >> 12) & 0x3F));
ret.push_back(0x80 | ((ch >> 6) & 0x3F));
ret.push_back(0x80 | (ch & 0x3F));
} else {
throw runtime_error("unencodable Unicode code point");
}
return ret;
}
uint32_t decode_utf8_char(const void** vdata, size_t* size) {
if (*size == 0) {
throw runtime_error("incomplete UTF-8 character");
}
const uint8_t* data = reinterpret_cast<const uint8_t*>(*vdata);
if (!(data[0] & 0x80)) {
(*size)--;
*vdata = data + 1;
return *data;
} else if ((data[0] & 0xE0) == 0xC0) {
if ((*size < 2) || ((data[1] & 0xC0) != 0x80)) {
throw runtime_error("incomplete UTF-8 character");
}
(*size) -= 2;
*vdata = data + 2;
return ((data[0] & 0x1F) << 6) | (data[1] & 0x3F);
} else if ((data[0] & 0xF0) == 0xE0) {
if ((*size < 3) || ((data[1] & 0xC0) != 0x80) || ((data[2] & 0xC0) != 0x80)) {
throw runtime_error("incomplete UTF-8 character");
}
(*size) -= 3;
*vdata = data + 3;
return ((data[0] & 0x0F) << 12) | ((data[1] & 0x3F) << 6) | (data[2] & 0x3F);
} else if ((data[0] & 0xF8) == 0xF0) {
if ((*size < 4) || ((data[1] & 0xC0) != 0x80) || ((data[2] & 0xC0) != 0x80) || ((data[3] & 0xC0) != 0x80)) {
throw runtime_error("incomplete UTF-8 character");
}
(*size) -= 4;
*vdata = data + 4;
return ((data[0] & 0x07) << 18) | ((data[1] & 0x3F) << 12) | ((data[2] & 0x3F) << 6) | (data[3] & 0x3F);
} else {
throw runtime_error("invalid UTF-8 character");
}
}
std::string TextTranscoderCustomSJISToUTF8::on_untranslatable(const void** vsrc, size_t* size) const {
// Sega implemented some nonstandard Shift-JIS characters on PSO GC (and
// probably XB as well): the heart symbol, encoded as F040, and the PSO font,
// encoded as F041-F064. Understandably, libiconv doesn't know what to do
// with these because they're not actually part of Shift-JIS, so we have to
// handle them manually here. We convert them to actual UTF-8 symbols:
// F040 (heart symbol) -> U+2665 (heart suit symbol)
// F041 (PSO font number 0) -> 24EA (circled digit zero)
// F042-F04A (PSO font numbers 1-9) -> 2460-2468 (circled digits 1-9)
// F04B-F064 (PSO font letters) -> 1D4D0-1D4E9 (script letters A-Z)
const uint8_t* src = reinterpret_cast<const uint8_t*>(*vsrc);
if ((*size < 2) || (src[0] != 0xF0)) {
return "";
}
string ret;
if (src[1] < 0x40) {
return "";
} else if (src[1] == 0x40) { // F040 -> U+2665
ret = encode_utf8_char(0x2665);
} else if (src[1] == 0x41) { // F041 -> U+24EA
ret = encode_utf8_char(0x24EA);
} else if (src[1] <= 0x4A) { // F042-F04A -> U+2460-U+2468
ret = encode_utf8_char(0x2460 + (src[1] - 0x42));
} else if (src[1] <= 0x64) { // F04B-F064 -> U+1D4D0-U+1D4E9
ret = encode_utf8_char(0x1D4D0 + (src[1] - 0x4B));
} else {
return "";
}
*vsrc = src + 2;
(*size) -= 2;
return ret;
}
TextTranscoderUTF8ToCustomSJIS::TextTranscoderUTF8ToCustomSJIS() : TextTranscoder("SHIFT_JIS", "UTF-8") {}
std::string TextTranscoderUTF8ToCustomSJIS::on_untranslatable(const void** src, size_t* size) const {
if ((*size >= 3) && !memcmp(*src, "\xE2\x99\xA5", 3)) {
*src = reinterpret_cast<const char*>(*src) + 3;
*size -= 3;
const void* orig_src = *src;
size_t orig_size = *size;
uint32_t ch;
try {
ch = decode_utf8_char(src, size);
} catch (const runtime_error&) {
return "";
}
if (ch == 0x2665) { // U+2665 -> F040
return "\xF0\x40";
} else if (ch == 0x24EA) { // U+24EA -> F041
return "\xF0\x41";
} else if (ch >= 0x2460 && ch <= 0x2468) { // U+2460-U+2468 -> F042-F04A
string ret("\xF0");
ret.push_back(0x42 + (ch - 0x2460));
return ret;
} else if (ch >= 0x1D4D0 && ch <= 0x1D4E9) { // U+1D4D0-U+1D4E9 -> F04B-F064
string ret("\xF0");
ret.push_back(0x4B + (ch - 0x1D4D0));
return ret;
} else {
*src = orig_src;
*size = orig_size;
return "";
}
}