From f05dc6d9f9c018882ad95d18900a19917b434266 Mon Sep 17 00:00:00 2001
From: Martin Michelsen
Date: Sun, 7 Apr 2024 13:03:11 -0700
Subject: [PATCH] handle PSO font characters properly

---
Reviewer notes (this free-form area is not part of the change itself):

* This patch reached review with its line breaks collapsed. The line
  structure below was rebuilt so that every hunk agrees with the counts in
  its "@@" header and with the diffstat. Leading indentation could not be
  recovered and follows the conventional layout of the code; confirm it
  against the tree before applying (or apply while ignoring whitespace).
* The template arguments of reinterpret_cast were missing. The added lines
  assign the result to "const uint8_t*" variables, so "<const uint8_t*>" was
  restored there. The removed lines use "<const char*>" as a best guess;
  TODO confirm against the pre-image of src/Text.cc.
* tests/custom-sjis.test.sh now compares strings with "=" instead of "==".
  The script runs under /bin/sh, and "=" is the operator POSIX specifies for
  "[", so the default-executable fallback also works when sh is not bash.
  The "index" hash recorded for that file no longer describes its content.
* The "ð" characters in tests/custom-sjis.txt presumably stand for the raw
  byte 0xF0 (the fixture lists the custom characters F040-F064); the file
  has to contain the raw bytes, not UTF-8 text. TODO confirm.
* The From: line carries no e-mail address; add one before mailing.

 src/Main.cc               |   6 ++
 src/Text.cc               | 123 ++++++++++++++++++++++++++++++++++----
 src/Text.hh               |   3 +
 tests/custom-sjis.test.sh |  20 +++++++
 tests/custom-sjis.txt     |   2 +
 5 files changed, 142 insertions(+), 12 deletions(-)
 create mode 100755 tests/custom-sjis.test.sh
 create mode 100755 tests/custom-sjis.txt

diff --git a/src/Main.cc b/src/Main.cc
index 5d3f94ea..97782bc8 100644
--- a/src/Main.cc
+++ b/src/Main.cc
@@ -1301,6 +1301,12 @@ Action a_extract_bml("extract-bml", "\
     PC/BB format.\n",
     a_extract_archive_fn);
 
+Action a_encode_sjis(
+    "encode-sjis", nullptr, +[](Arguments& args) {
+      string data = read_input_data(args);
+      string result = tt_utf8_to_sega_sjis(data);
+      write_output_data(args, result.data(), result.size(), "txt");
+    });
 Action a_decode_sjis(
     "decode-sjis", nullptr, +[](Arguments& args) {
       string data = read_input_data(args);
diff --git a/src/Text.cc b/src/Text.cc
index 52f94fea..1c7377c9 100644
--- a/src/Text.cc
+++ b/src/Text.cc
@@ -139,28 +139,127 @@ std::string TextTranscoder::on_untranslatable(const void**, size_t*) const {
 
 TextTranscoderCustomSJISToUTF8::TextTranscoderCustomSJISToUTF8() : TextTranscoder("UTF-8", "SHIFT_JIS") {}
 
-std::string TextTranscoderCustomSJISToUTF8::on_untranslatable(const void** src, size_t* size) const {
-  // Sega implemented a single nonstandard Shift-JIS character on PSO GC (and
-  // probably XB as well): the heart symbol, encoded as F040. Understandably,
-  // libiconv doesn't know what to do with it because it's not actually part of
-  // Shift-JIS, so we have to handle it manually here.
-  if ((*size >= 2) && !memcmp(*src, "\xF0\x40", 2)) {
-    *src = reinterpret_cast<const char*>(*src) + 2;
-    *size -= 2;
-    return "\xE2\x99\xA5";
+std::string encode_utf8_char(uint32_t ch) {
+  string ret;
+  if (ch < 0x80) {
+    ret.push_back(ch);
+  } else if (ch < 0x800) {
+    ret.push_back(0xC0 | (ch >> 6));
+    ret.push_back(0x80 | (ch & 0x3F));
+  } else if (ch < 0x10000) {
+    ret.push_back(0xE0 | (ch >> 12));
+    ret.push_back(0x80 | ((ch >> 6) & 0x3F));
+    ret.push_back(0x80 | (ch & 0x3F));
+  } else if (ch < 0x110000) {
+    ret.push_back(0xF0 | (ch >> 18));
+    ret.push_back(0x80 | ((ch >> 12) & 0x3F));
+    ret.push_back(0x80 | ((ch >> 6) & 0x3F));
+    ret.push_back(0x80 | (ch & 0x3F));
+  } else {
+    throw runtime_error("unencodable Unicode code point");
+  }
+  return ret;
+}
+
+uint32_t decode_utf8_char(const void** vdata, size_t* size) {
+  if (*size == 0) {
+    throw runtime_error("incomplete UTF-8 character");
+  }
+
+  const uint8_t* data = reinterpret_cast<const uint8_t*>(*vdata);
+  if (!(data[0] & 0x80)) {
+    (*size)--;
+    *vdata = data + 1;
+    return *data;
+  } else if ((data[0] & 0xE0) == 0xC0) {
+    if ((*size < 2) || ((data[1] & 0xC0) != 0x80)) {
+      throw runtime_error("incomplete UTF-8 character");
+    }
+    (*size) -= 2;
+    *vdata = data + 2;
+    return ((data[0] & 0x1F) << 6) | (data[1] & 0x3F);
+  } else if ((data[0] & 0xF0) == 0xE0) {
+    if ((*size < 3) || ((data[1] & 0xC0) != 0x80) || ((data[2] & 0xC0) != 0x80)) {
+      throw runtime_error("incomplete UTF-8 character");
+    }
+    (*size) -= 3;
+    *vdata = data + 3;
+    return ((data[0] & 0x0F) << 12) | ((data[1] & 0x3F) << 6) | (data[2] & 0x3F);
+  } else if ((data[0] & 0xF8) == 0xF0) {
+    if ((*size < 4) || ((data[1] & 0xC0) != 0x80) || ((data[2] & 0xC0) != 0x80) || ((data[3] & 0xC0) != 0x80)) {
+      throw runtime_error("incomplete UTF-8 character");
+    }
+    (*size) -= 4;
+    *vdata = data + 4;
+    return ((data[0] & 0x07) << 18) | ((data[1] & 0x3F) << 12) | ((data[2] & 0x3F) << 6) | (data[3] & 0x3F);
+  } else {
+    throw runtime_error("invalid UTF-8 character");
+  }
+}
+
+std::string TextTranscoderCustomSJISToUTF8::on_untranslatable(const void** vsrc, size_t* size) const {
+  // Sega implemented some nonstandard Shift-JIS characters on PSO GC (and
+  // probably XB as well): the heart symbol, encoded as F040, and the PSO font,
+  // encoded as F041-F064. Understandably, libiconv doesn't know what to do
+  // with these because they're not actually part of Shift-JIS, so we have to
+  // handle them manually here. We convert them to actual UTF-8 symbols:
+  //   F040 (heart symbol) -> U+2665 (heart suit symbol)
+  //   F041 (PSO font number 0) -> 24EA (circled digit zero)
+  //   F042-F04A (PSO font numbers 1-9) -> 2460-2468 (circled digits 1-9)
+  //   F04B-F064 (PSO font letters) -> 1D4D0-1D4E9 (script letters A-Z)
+
+  const uint8_t* src = reinterpret_cast<const uint8_t*>(*vsrc);
+  if ((*size < 2) || (src[0] != 0xF0)) {
+    return "";
+  }
+
+  string ret;
+  if (src[1] < 0x40) {
+    return "";
+  } else if (src[1] == 0x40) { // F040 -> U+2665
+    ret = encode_utf8_char(0x2665);
+  } else if (src[1] == 0x41) { // F041 -> U+24EA
+    ret = encode_utf8_char(0x24EA);
+  } else if (src[1] <= 0x4A) { // F042-F04A -> U+2460-U+2468
+    ret = encode_utf8_char(0x2460 + (src[1] - 0x42));
+  } else if (src[1] <= 0x64) { // F04B-F064 -> U+1D4D0-U+1D4E9
+    ret = encode_utf8_char(0x1D4D0 + (src[1] - 0x4B));
   } else {
     return "";
   }
+
+  *vsrc = src + 2;
+  (*size) -= 2;
+  return ret;
 }
 
 TextTranscoderUTF8ToCustomSJIS::TextTranscoderUTF8ToCustomSJIS() : TextTranscoder("SHIFT_JIS", "UTF-8") {}
 
 std::string TextTranscoderUTF8ToCustomSJIS::on_untranslatable(const void** src, size_t* size) const {
-  if ((*size >= 3) && !memcmp(*src, "\xE2\x99\xA5", 3)) {
-    *src = reinterpret_cast<const char*>(*src) + 3;
-    *size -= 3;
+  const void* orig_src = *src;
+  size_t orig_size = *size;
+  uint32_t ch;
+  try {
+    ch = decode_utf8_char(src, size);
+  } catch (const runtime_error&) {
+    return "";
+  }
+
+  if (ch == 0x2665) { // U+2665 -> F040
     return "\xF0\x40";
+  } else if (ch == 0x24EA) { // U+24EA -> F041
+    return "\xF0\x41";
+  } else if (ch >= 0x2460 && ch <= 0x2468) { // U+2460-U+2468 -> F042-F04A
+    string ret("\xF0");
+    ret.push_back(0x42 + (ch - 0x2460));
+    return ret;
+  } else if (ch >= 0x1D4D0 && ch <= 0x1D4E9) { // U+1D4D0-U+1D4E9 -> F04B-F064
+    string ret("\xF0");
+    ret.push_back(0x4B + (ch - 0x1D4D0));
+    return ret;
   } else {
+    *src = orig_src;
+    *size = orig_size;
     return "";
   }
 }
diff --git a/src/Text.hh b/src/Text.hh
index 923e9750..06dfcdba 100644
--- a/src/Text.hh
+++ b/src/Text.hh
@@ -22,6 +22,9 @@
 
 // Conversion functions
 
+std::string encode_utf8_char(uint32_t ch);
+uint32_t decode_utf8_char(const void** data, size_t* size);
+
 class TextTranscoder {
 public:
   TextTranscoder(const char* to, const char* from);
diff --git a/tests/custom-sjis.test.sh b/tests/custom-sjis.test.sh
new file mode 100755
index 00000000..c80e07ca
--- /dev/null
+++ b/tests/custom-sjis.test.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+
+set -e
+
+SCHEME=$1
+
+EXECUTABLE="$2"
+if [ "$EXECUTABLE" = "" ]; then
+  EXECUTABLE="./newserv"
+fi
+
+echo "... decode-sjis"
+$EXECUTABLE decode-sjis tests/custom-sjis.txt tests/custom-sjis.utf8.txt
+echo "... encode-sjis"
+$EXECUTABLE encode-sjis tests/custom-sjis.utf8.txt tests/custom-sjis.recoded.txt
+
+diff tests/custom-sjis.txt tests/custom-sjis.recoded.txt
+
+echo "... clean up"
+rm tests/custom-sjis.utf8.txt tests/custom-sjis.recoded.txt
diff --git a/tests/custom-sjis.txt b/tests/custom-sjis.txt
new file mode 100755
index 00000000..2ced9b78
--- /dev/null
+++ b/tests/custom-sjis.txt
@@ -0,0 +1,2 @@
+These are all the custom characters: ð@ðAðBðCðDðEðFðGðHðIðJðKðLðMðNðOðPðQðRðSðTðUðVðWðXðYðZð[ð\ð]ð^ð_ð`ðaðbðcðd
+That's all of them.