handle PSO font characters properly

This commit is contained in:
Martin Michelsen
2024-04-07 13:03:11 -07:00
parent e141642dd6
commit f05dc6d9f9
5 changed files with 142 additions and 12 deletions
+6
View File
@@ -1301,6 +1301,12 @@ Action a_extract_bml("extract-bml", "\
PC/BB format.\n",
a_extract_archive_fn);
// CLI action "encode-sjis": reads the input data, passes it through
// tt_utf8_to_sega_sjis, and writes the transcoded bytes to the output with a
// "txt" extension. No help text is registered for this action (nullptr).
Action a_encode_sjis(
"encode-sjis", nullptr, +[](Arguments& args) {
string utf8_text = read_input_data(args);
string sjis_text = tt_utf8_to_sega_sjis(utf8_text);
write_output_data(args, sjis_text.data(), sjis_text.size(), "txt");
});
Action a_decode_sjis(
"decode-sjis", nullptr, +[](Arguments& args) {
string data = read_input_data(args);
+111 -12
View File
@@ -139,28 +139,127 @@ std::string TextTranscoder::on_untranslatable(const void**, size_t*) const {
// Constructs a transcoder that converts Shift-JIS input to UTF-8 output. The
// base class constructor takes the destination encoding first, then the source
// encoding.
TextTranscoderCustomSJISToUTF8::TextTranscoderCustomSJISToUTF8() : TextTranscoder("UTF-8", "SHIFT_JIS") {}
std::string TextTranscoderCustomSJISToUTF8::on_untranslatable(const void** src, size_t* size) const {
// Sega implemented a single nonstandard Shift-JIS character on PSO GC (and
// probably XB as well): the heart symbol, encoded as F040. Understandably,
// libiconv doesn't know what to do with it because it's not actually part of
// Shift-JIS, so we have to handle it manually here.
if ((*size >= 2) && !memcmp(*src, "\xF0\x40", 2)) {
*src = reinterpret_cast<const char*>(*src) + 2;
*size -= 2;
return "\xE2\x99\xA5";
// Encodes a single Unicode code point as a UTF-8 byte sequence.
//
// Returns a string of 1 to 4 bytes depending on the magnitude of ch. Throws
// std::runtime_error if ch is 0x110000 or greater, since such values cannot be
// represented in UTF-8.
std::string encode_utf8_char(uint32_t ch) {
  // Builds a continuation byte (10xxxxxx) from the six bits of ch that start
  // at bit position `shift`.
  const auto continuation = [ch](unsigned shift) -> char {
    return static_cast<char>(0x80 | ((ch >> shift) & 0x3F));
  };

  if (ch >= 0x110000) {
    throw std::runtime_error("unencodable Unicode code point");
  }

  std::string encoded;
  if (ch >= 0x10000) {
    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    encoded.push_back(static_cast<char>(0xF0 | (ch >> 18)));
    encoded.push_back(continuation(12));
    encoded.push_back(continuation(6));
    encoded.push_back(continuation(0));
  } else if (ch >= 0x800) {
    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    encoded.push_back(static_cast<char>(0xE0 | (ch >> 12)));
    encoded.push_back(continuation(6));
    encoded.push_back(continuation(0));
  } else if (ch >= 0x80) {
    // Two bytes: 110xxxxx 10xxxxxx
    encoded.push_back(static_cast<char>(0xC0 | (ch >> 6)));
    encoded.push_back(continuation(0));
  } else {
    // Single byte: plain ASCII
    encoded.push_back(static_cast<char>(ch));
  }
  return encoded;
}
// Decodes one UTF-8 encoded code point from the front of a buffer.
//
// vdata: in/out pointer to the current read position. On success it is
//   advanced past the bytes that were consumed.
// size: in/out count of bytes available at *vdata. On success it is reduced by
//   the number of bytes consumed.
//
// Returns the decoded code point. Throws std::runtime_error, leaving *vdata
// and *size unchanged, if the buffer is empty, the sequence is truncated, a
// continuation byte is malformed, the lead byte is not a valid UTF-8 lead
// byte, the sequence is an overlong encoding (uses more bytes than the value
// requires), or the value is above U+10FFFF. Rejecting the last two keeps this
// function symmetric with encode_utf8_char, which never produces them.
// Surrogate code points (U+D800-U+DFFF) are not rejected, since
// encode_utf8_char also accepts them.
uint32_t decode_utf8_char(const void** vdata, size_t* size) {
  if (*size == 0) {
    throw std::runtime_error("incomplete UTF-8 character");
  }

  const uint8_t* data = reinterpret_cast<const uint8_t*>(*vdata);

  // Classify the lead byte: total sequence length, payload bits carried by the
  // lead byte, and the smallest code point that legitimately needs this length.
  size_t length;
  uint32_t ch;
  uint32_t min_value;
  if (!(data[0] & 0x80)) {
    length = 1;
    ch = data[0];
    min_value = 0;
  } else if ((data[0] & 0xE0) == 0xC0) {
    length = 2;
    ch = data[0] & 0x1F;
    min_value = 0x80;
  } else if ((data[0] & 0xF0) == 0xE0) {
    length = 3;
    ch = data[0] & 0x0F;
    min_value = 0x800;
  } else if ((data[0] & 0xF8) == 0xF0) {
    length = 4;
    ch = data[0] & 0x07;
    min_value = 0x10000;
  } else {
    throw std::runtime_error("invalid UTF-8 character");
  }

  // Check the length before touching any continuation byte so that we never
  // read past the end of the caller's buffer.
  if (*size < length) {
    throw std::runtime_error("incomplete UTF-8 character");
  }
  for (size_t z = 1; z < length; z++) {
    if ((data[z] & 0xC0) != 0x80) {
      throw std::runtime_error("incomplete UTF-8 character");
    }
    ch = (ch << 6) | (data[z] & 0x3F);
  }

  // Overlong forms and values beyond the Unicode range are not valid UTF-8.
  if ((ch < min_value) || (ch > 0x10FFFF)) {
    throw std::runtime_error("invalid UTF-8 character");
  }

  // Only commit the cursor movement once the whole sequence has been accepted.
  *vdata = data + length;
  (*size) -= length;
  return ch;
}
// Handles the nonstandard two-byte Shift-JIS sequences that Sega added on PSO
// GC (and probably XB as well), which libiconv cannot translate because they
// are not part of real Shift-JIS. Each is mapped to a genuine Unicode symbol:
//   F040 (heart symbol)              -> U+2665 (heart suit symbol)
//   F041 (PSO font number 0)         -> U+24EA (circled digit zero)
//   F042-F04A (PSO font numbers 1-9) -> U+2460-U+2468 (circled digits 1-9)
//   F04B-F064 (PSO font letters)     -> U+1D4D0-U+1D4E9 (script letters A-Z)
//
// On a match, returns the UTF-8 encoding of the mapped symbol and advances
// *vsrc / reduces *size by the two bytes consumed. Otherwise returns an empty
// string and leaves *vsrc and *size untouched.
std::string TextTranscoderCustomSJISToUTF8::on_untranslatable(const void** vsrc, size_t* size) const {
  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(*vsrc);

  // All custom characters are two bytes long and start with F0.
  const bool has_custom_lead = (*size >= 2) && (bytes[0] == 0xF0);
  if (!has_custom_lead) {
    return "";
  }

  // Only second bytes in 40-64 are assigned.
  const uint8_t trail = bytes[1];
  if ((trail < 0x40) || (trail > 0x64)) {
    return "";
  }

  uint32_t code_point;
  if (trail == 0x40) {
    code_point = 0x2665;
  } else if (trail == 0x41) {
    code_point = 0x24EA;
  } else if (trail < 0x4B) {
    code_point = 0x2460 + (trail - 0x42);
  } else {
    code_point = 0x1D4D0 + (trail - 0x4B);
  }

  std::string utf8 = encode_utf8_char(code_point);
  *vsrc = bytes + 2;
  (*size) -= 2;
  return utf8;
}
// Constructs a transcoder that converts UTF-8 input to Shift-JIS output. The
// base class constructor takes the destination encoding first, then the source
// encoding.
TextTranscoderUTF8ToCustomSJIS::TextTranscoderUTF8ToCustomSJIS() : TextTranscoder("SHIFT_JIS", "UTF-8") {}
// Handles Unicode symbols that have no standard Shift-JIS encoding but do have
// a Sega-specific two-byte encoding in the F0xx range:
//   U+2665 (heart suit symbol)              -> F040
//   U+24EA (circled digit zero)             -> F041
//   U+2460-U+2468 (circled digits 1-9)      -> F042-F04A
//   U+1D4D0-U+1D4E9 (script letters A-Z)    -> F04B-F064
//
// On a match, returns the two-byte custom sequence and advances *src / reduces
// *size by the length of the UTF-8 character consumed. If the input cannot be
// decoded as UTF-8, or decodes to a code point outside the ranges above,
// returns an empty string and leaves *src and *size untouched.
std::string TextTranscoderUTF8ToCustomSJIS::on_untranslatable(const void** src, size_t* size) const {
  // Decode through local copies so the caller's cursor only moves when the
  // character is actually one we translate.
  const void* cursor = *src;
  size_t remaining = *size;
  uint32_t ch;
  try {
    ch = decode_utf8_char(&cursor, &remaining);
  } catch (const std::runtime_error&) {
    return "";
  }

  char trail;
  if (ch == 0x2665) { // U+2665 -> F040
    trail = 0x40;
  } else if (ch == 0x24EA) { // U+24EA -> F041
    trail = 0x41;
  } else if (ch >= 0x2460 && ch <= 0x2468) { // U+2460-U+2468 -> F042-F04A
    trail = static_cast<char>(0x42 + (ch - 0x2460));
  } else if (ch >= 0x1D4D0 && ch <= 0x1D4E9) { // U+1D4D0-U+1D4E9 -> F04B-F064
    trail = static_cast<char>(0x4B + (ch - 0x1D4D0));
  } else {
    return "";
  }

  *src = cursor;
  *size = remaining;
  std::string ret("\xF0");
  ret.push_back(trail);
  return ret;
}
+3
View File
@@ -22,6 +22,9 @@
// Conversion functions
// Encodes a single Unicode code point as UTF-8 and returns the resulting
// bytes. Throws std::runtime_error if the code point is too large to encode.
std::string encode_utf8_char(uint32_t ch);
// Decodes a single UTF-8 character from the start of the buffer at *data,
// which holds *size bytes. On success, returns the code point, advances *data
// past the consumed bytes, and reduces *size accordingly. Throws
// std::runtime_error if the buffer is empty or the sequence is truncated or
// malformed.
uint32_t decode_utf8_char(const void** data, size_t* size);
class TextTranscoder {
public:
TextTranscoder(const char* to, const char* from);
+20
View File
@@ -0,0 +1,20 @@
#!/bin/sh

# Round-trip test for the custom Shift-JIS characters: decodes the fixture to
# UTF-8, re-encodes the result, and checks that the re-encoded bytes are
# identical to the original fixture.
#
# Usage: <this script> SCHEME [EXECUTABLE]
#   SCHEME      accepted as the first argument but not used by this test
#   EXECUTABLE  path to the newserv binary (default: ./newserv)

# Abort on the first failing command so a failed step fails the whole test.
set -e

SCHEME="$1"
# Fall back to the default binary when the second argument is missing or
# empty. The ${var:-default} form is POSIX, unlike `[ ... == ... ]`, which
# fails on shells such as dash.
EXECUTABLE="${2:-./newserv}"

echo "... decode-sjis"
"$EXECUTABLE" decode-sjis tests/custom-sjis.txt tests/custom-sjis.utf8.txt
echo "... encode-sjis"
"$EXECUTABLE" encode-sjis tests/custom-sjis.utf8.txt tests/custom-sjis.recoded.txt
# diff exits nonzero if the round trip changed any byte, failing the test.
diff tests/custom-sjis.txt tests/custom-sjis.recoded.txt

echo "... clean up"
rm tests/custom-sjis.utf8.txt tests/custom-sjis.recoded.txt
+2
View File
@@ -0,0 +1,2 @@
These are all the custom characters: ð@ðAðBðCðDðEðFðGðHðIðJðKðLðMðNðOðPðQðRðSðTðUðVðWðXðYðZð[ð\ð]ð^ð_ð`ðaðbðcðd
That's all of them.