Files
psopeeps-newserv/src/Text.cc
T
2023-10-28 01:06:43 -07:00

307 lines
8.7 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#include "Text.hh"
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <phosg/Filesystem.hh>
#include <phosg/Strings.hh>
#include <vector>
using namespace std;
// From the iconv(3) documentation: a third case is when inbuf is NULL or *inbuf is NULL, and outbuf is NULL or *outbuf is NULL. In this case, the iconv function sets cd's conversion state to the initial state.
// Sentinel descriptor value. The constructor compares the result of
// iconv_open() against it to detect failure, and the move operations store it
// in a moved-from object's ic field.
const iconv_t TextTranscoder::INVALID_IC = (iconv_t)(-1);
// Value that the conversion operators compare iconv()'s return value against
// to detect a failed conversion.
const size_t TextTranscoder::FAILURE_RESULT = static_cast<size_t>(-1);
// Opens an iconv descriptor that converts text from the encoding named by
// `from` into the encoding named by `to`. Throws runtime_error (including the
// system error description) if iconv_open fails.
TextTranscoder::TextTranscoder(const char* to, const char* from)
    : ic(iconv_open(to, from)) {
  if (this->ic != INVALID_IC) {
    return;
  }
  const string reason = string_for_error(errno);
  throw runtime_error(string_printf("failed to initialize %s -> %s text converter: %s", from, to, reason.c_str()));
}
// Move constructor: takes over the source's conversion descriptor and leaves
// the source holding INVALID_IC.
TextTranscoder::TextTranscoder(TextTranscoder&& other)
    : ic(INVALID_IC) {
  this->ic = other.ic;
  other.ic = INVALID_IC;
}
// Move assignment: releases the descriptor this object currently owns (if
// any), then takes over the source's descriptor and leaves the source holding
// INVALID_IC. Self-assignment is a no-op.
TextTranscoder& TextTranscoder::operator=(TextTranscoder&& other) {
  if (this != &other) {
    // Close our own descriptor first; overwriting it would leak it.
    if (this->ic != INVALID_IC) {
      iconv_close(this->ic);
    }
    this->ic = other.ic;
    other.ic = INVALID_IC;
  }
  return *this;
}
// Closes the conversion descriptor. A moved-from object holds INVALID_IC,
// which must not be passed to iconv_close, so it is skipped.
TextTranscoder::~TextTranscoder() {
  if (this->ic != INVALID_IC) {
    iconv_close(this->ic);
  }
}
// Converts src_bytes bytes from src into the caller-provided buffer dest
// (capacity dest_size bytes) and reports how many bytes were consumed and
// produced.
//
// Throws runtime_error if the input contains an untranslatable character or an
// incomplete multibyte sequence, or if iconv fails for any other reason. If
// the output does not fit in dest, throws unless truncate_oversize_result is
// true, in which case the partial result is returned.
TextTranscoder::Result TextTranscoder::operator()(
    void* dest, size_t dest_size, const void* src, size_t src_bytes, bool truncate_oversize_result) {
  // Reset the descriptor so state from an earlier conversion can't leak in
  iconv(this->ic, nullptr, nullptr, nullptr, nullptr);

  char* const out_begin = reinterpret_cast<char*>(dest);
  const char* const in_begin = reinterpret_cast<const char*>(src);

  // iconv advances these cursors and decrements the two size counters
  char* in_cursor = const_cast<char*>(in_begin);
  char* out_cursor = out_begin;
  const size_t status = iconv(this->ic, &in_cursor, &src_bytes, &out_cursor, &dest_size);

  const size_t consumed = static_cast<size_t>(in_cursor - in_begin);
  if (status == FAILURE_RESULT) {
    const int err = errno;
    if (err == EILSEQ) {
      throw runtime_error(string_printf("untranslatable character at position 0x%zX", consumed));
    }
    if (err == EINVAL) {
      throw runtime_error(string_printf("incomplete multibyte sequence at position 0x%zX", consumed));
    }
    if (err != E2BIG) {
      throw runtime_error("transcoding failed: " + string_for_error(err));
    }
    // E2BIG: the output buffer is full; this is only OK if the caller opted in
    if (!truncate_oversize_result) {
      throw runtime_error("string does not fit in buffer");
    }
  }

  const size_t produced = static_cast<size_t>(out_cursor - out_begin);
  return Result{
      .bytes_read = consumed,
      .bytes_written = produced,
  };
}
// Converts src_size bytes from src and returns the result as a new string.
// Output is produced in a sequence of blocks which are joined at the end, so
// the result may be any length.
//
// Throws runtime_error if the input contains an untranslatable character or an
// incomplete multibyte sequence (the message includes the byte offset in the
// input), if iconv fails for any other reason, or if a conversion step cannot
// make progress even with a fresh output block.
string TextTranscoder::operator()(const void* src, size_t src_size) {
  // Clear any conversion state left over from the previous call
  iconv(this->ic, nullptr, nullptr, nullptr, nullptr);

  const void* orig_src = src;
  deque<string> blocks;
  while (src_size > 0) {
    // Allocate 4x the remaining input size, but always at least 4 bytes
    string& block = blocks.emplace_back(max<size_t>((src_size << 2), 4), '\0');
    char* dest = block.data();
    size_t dest_size = block.size();
    size_t ret = iconv(
        this->ic,
        reinterpret_cast<char**>(const_cast<void**>(&src)),
        &src_size,
        reinterpret_cast<char**>(&dest),
        &dest_size);
    // Capture errno immediately, before any other call can overwrite it
    const int conv_errno = (ret == this->FAILURE_RESULT) ? errno : 0;

    // Trim the block down to the bytes iconv actually wrote
    block.resize(block.size() - dest_size);

    size_t bytes_read = reinterpret_cast<const char*>(src) - reinterpret_cast<const char*>(orig_src);
    if (ret == this->FAILURE_RESULT) {
      // Report input errors before checking for lack of progress; otherwise a
      // bad character at the start of a block would be misreported as a
      // too-small block.
      switch (conv_errno) {
        case EILSEQ:
          throw runtime_error(string_printf("untranslatable character at position %zu", bytes_read));
        case EINVAL:
          throw runtime_error(string_printf("incomplete multibyte sequence at position %zu", bytes_read));
        case E2BIG:
          if (block.empty()) {
            // This should never happen because no character should be more
            // than 4 bytes long in any known encoding. Throwing here also
            // prevents looping forever without making progress.
            throw runtime_error("block size too small for conversion");
          }
          // Otherwise, continue with a new block for the remaining input
          break;
        default:
          throw runtime_error("transcoding failed: " + string_for_error(conv_errno));
      }
    }
  }

  return join(blocks, "");
}
// Convenience overload: converts the entire contents of data and returns the
// result as a new string.
string TextTranscoder::operator()(const string& data) {
  const char* const bytes = data.data();
  const size_t size = data.size();
  return (*this)(bytes, size);
}
// Shared transcoder instances. Note the constructor argument order: the first
// argument is the destination encoding and the second is the source encoding.
// The constructor throws if iconv_open fails, so an encoding name that the
// platform's iconv does not recognize causes an exception while these objects
// are being initialized.
TextTranscoder tt_8859_to_utf8("UTF-8", "ISO-8859-1");
TextTranscoder tt_utf8_to_8859("ISO-8859-1", "UTF-8");
TextTranscoder tt_sjis_to_utf8("UTF-8", "SHIFT_JIS");
TextTranscoder tt_utf8_to_sjis("SHIFT_JIS", "UTF-8");
TextTranscoder tt_utf16_to_utf8("UTF-8", "UTF-16LE");
TextTranscoder tt_utf8_to_utf16("UTF-16LE", "UTF-8");
TextTranscoder tt_ascii_to_utf8("UTF-8", "ASCII");
TextTranscoder tt_utf8_to_ascii("ASCII", "UTF-8");
string tt_encode_marked_optional(const string& utf8, uint8_t default_language, bool is_utf16) {
if (is_utf16) {
return tt_utf8_to_utf16(utf8);
} else {
if (default_language) {
try {
return tt_utf8_to_8859(utf8);
} catch (const exception& e) {
return "\tJ" + tt_utf8_to_sjis(utf8);
}
} else {
try {
return tt_utf8_to_sjis(utf8);
} catch (const exception& e) {
return "\tE" + tt_utf8_to_8859(utf8);
}
}
}
}
string tt_encode_marked(const string& utf8, uint8_t default_language, bool is_utf16) {
if (is_utf16) {
return tt_utf8_to_utf16((default_language ? "\tE" : "\tJ") + utf8);
} else {
if (default_language) {
try {
return "\tE" + tt_utf8_to_8859(utf8);
} catch (const exception& e) {
return "\tJ" + tt_utf8_to_sjis(utf8);
}
} else {
try {
return "\tJ" + tt_utf8_to_sjis(utf8);
} catch (const exception& e) {
return "\tE" + tt_utf8_to_8859(utf8);
}
}
}
}
// Decodes a possibly-marked encoded string into UTF-8.
//
// If is_utf16 is true, data is decoded as UTF-16LE and a leading "\tE" or
// "\tJ" marker is removed from the result. Otherwise, a leading "\tJ" selects
// Shift-JIS and a leading "\tE" selects ISO-8859-1 (the marker is removed
// before decoding); with no such marker, the whole string is decoded as
// ISO-8859-1 when default_language is nonzero and as Shift-JIS when it is
// zero.
string tt_decode_marked(const string& data, uint8_t default_language, bool is_utf16) {
  if (is_utf16) {
    string decoded = tt_utf16_to_utf8(data);
    const bool has_marker = (decoded.size() >= 2) &&
        (decoded[0] == '\t') &&
        ((decoded[1] == 'E') || (decoded[1] == 'J'));
    if (has_marker) {
      decoded.erase(0, 2);
    }
    return decoded;
  }

  if ((data.size() >= 2) && (data[0] == '\t')) {
    switch (data[1]) {
      case 'J':
        return tt_sjis_to_utf8(data.substr(2));
      case 'E':
        return tt_8859_to_utf8(data.substr(2));
      default:
        // Some other tab sequence; not a language marker
        break;
    }
  }

  if (default_language) {
    return tt_8859_to_utf8(data);
  }
  return tt_sjis_to_utf8(data);
}
// Returns s prefixed with a tab and the given marker character, unless s
// already begins with a tab sequence other than "\tC", in which case s is
// returned unchanged.
string add_language_marker(const string& s, char marker) {
  const bool already_marked = (s.size() >= 2) && (s[0] == '\t') && (s[1] != 'C');
  if (already_marked) {
    return s;
  }

  string marked;
  marked.reserve(s.size() + 2);
  marked += '\t';
  marked += marker;
  marked.append(s);
  return marked;
}
// Returns s without its first two characters if it begins with a tab sequence
// other than "\tC"; otherwise returns s unchanged.
string remove_language_marker(const string& s) {
  const bool has_marker = (s.size() >= 2) && (s[0] == '\t') && (s[1] != 'C');
  return has_marker ? s.substr(2) : s;
}
// Replaces every occurrence of f with r in the null-terminated string a,
// modifying the string in place.
void replace_char_inplace(char* a, char f, char r) {
  for (char* cursor = a; *cursor != '\0'; ++cursor) {
    if (*cursor == f) {
      *cursor = r;
    }
  }
}
// Rewrites the escape sequences in the string a in place and returns the
// length of the resulting string (not counting the null terminator).
//
// At most max_chars input characters are examined; processing also stops at
// the first null character. The translations are:
//   $  -> tab           #  -> newline
//   %s -> $             %n -> #             %% -> %
//   % followed by any other character -> that character
// A % that is the last character of the input (either because the string ends
// or because the max_chars limit is reached) is dropped.
//
// The output is never longer than the input, so it always fits in the same
// buffer. Note that the null terminator is written immediately after the
// output, which is at a[max_chars] if the input filled the limit and contained
// no two-character escapes; the buffer must have room for it.
size_t add_color_inplace(char* a, size_t max_chars) {
  char* d = a;
  char* orig_d = d;
  for (size_t x = 0; (x < max_chars) && *a; x++) {
    if (*a == '$') {
      *(d++) = '\t';
    } else if (*a == '#') {
      *(d++) = '\n';
    } else if (*a == '%') {
      // The escape's second character must also be within the limit; without
      // this check we would read a[max_chars], which the caller did not give
      // us permission to examine.
      if (x + 1 >= max_chars) {
        break;
      }
      a++;
      x++;
      if (*a == 's') {
        *(d++) = '$';
      } else if (*a == '%') {
        *(d++) = '%';
      } else if (*a == 'n') {
        *(d++) = '#';
      } else if (*a == '\0') {
        break;
      } else {
        *(d++) = *a;
      }
    } else {
      *(d++) = *a;
    }
    a++;
  }
  *d = 0;
  // TODO: we should clear the chars after the null if the new string is shorter
  // than the original
  return d - orig_d;
}
// Rewrites the escape sequences in s in place (see the char* overload for the
// translations), then shrinks s to the length of the rewritten text.
void add_color_inplace(string& s) {
  const size_t new_length = add_color_inplace(s.data(), s.size());
  s.resize(new_length);
}
// Translates the escape sequences in src and writes the result to w, followed
// by a single null character.
//
// At most max_input_chars input characters are examined; processing also stops
// at the first null character. The translations are:
//   $  -> tab           #  -> newline
//   %s -> $             %n -> #             %% -> %
//   % followed by any other character -> that character
// A % that is the last character of the input (either because the string ends
// or because the max_input_chars limit is reached) is dropped.
void add_color(StringWriter& w, const char* src, size_t max_input_chars) {
  for (size_t x = 0; (x < max_input_chars) && *src; x++) {
    if (*src == '$') {
      w.put<char>('\t');
    } else if (*src == '#') {
      w.put<char>('\n');
    } else if (*src == '%') {
      // The escape's second character must also be within the limit; without
      // this check we would read src[max_input_chars], which the caller did
      // not give us permission to examine.
      if (x + 1 >= max_input_chars) {
        break;
      }
      src++;
      x++;
      if (*src == 's') {
        w.put<char>('$');
      } else if (*src == '%') {
        w.put<char>('%');
      } else if (*src == 'n') {
        w.put<char>('#');
      } else if (*src == '\0') {
        break;
      } else {
        w.put<char>(*src);
      }
    } else {
      w.put<char>(*src);
    }
    src++;
  }
  w.put<char>(0);
}
string add_color(const string& s) {
StringWriter w;
add_color(w, s.data(), s.size());
return std::move(w.str());
}
// Returns a copy of s with all color sequences removed. A color sequence is
// three characters long: '$' or tab, then 'C', then a color code, which is a
// digit, 'G', or 'a'. All other text is copied unchanged.
string strip_color(const string& s) {
  auto is_color_code = [](char ch) -> bool {
    return ((ch >= '0') && (ch <= '9')) || (ch == 'G') || (ch == 'a');
  };

  string stripped;
  size_t pos = 0;
  while (pos < s.size()) {
    const char ch = s[pos];
    const bool starts_escape = (ch == '$') || (ch == '\t');
    // A full sequence needs two more characters after the escape character
    if (starts_escape && (pos + 2 < s.size()) && (s[pos + 1] == 'C') && is_color_code(s[pos + 2])) {
      pos += 3;
      continue;
    }
    stripped.push_back(ch);
    pos++;
  }
  return stripped;
}