use safe packed string types

2022-03-31 23:23:02 -07:00
parent 832135a505
commit 8a9e1a2049
22 changed files with 1040 additions and 879 deletions
@@ -30,20 +30,14 @@ int char16ncmp(const char16_t* s1, const char16_t* s2, size_t count) {
  return 0;
 }

-size_t char16len(const char16_t* s) {
-  size_t x;
-  for (x = 0; s[x] != 0; x++);
-  return x;
-}
-


 static vector<char16_t> unicode_to_sjis_table_data;
 static vector<char16_t> sjis_to_unicode_table_data;

 static void load_sjis_tables() {
-  unicode_to_sjis_table_data.resize(0x10000);
-  sjis_to_unicode_table_data.resize(0x10000);
+  unicode_to_sjis_table_data.resize(0x10000, 0);
+  sjis_to_unicode_table_data.resize(0x10000, 0);

  // TODO: this is inefficient; it makes multiple copies of the string
  auto file_contents = load_file("system/sjis-table.ini");
@@ -75,166 +69,107 @@ static const vector<char16_t>& unicode_to_sjis_table() {
  return unicode_to_sjis_table_data;
 }

-// TODO: It looks like these functions are probably wrong. Specifically, we
-// don't write the high byte when encoding non-ASCII chars, do we?

-void encode_sjis(char* dest, const char16_t* source, size_t max) {
-  const auto& table = unicode_to_sjis_table();
-  while (*source && (--max)) {
-    *(dest++) = table[*(source++)];
-  };
-  *dest = 0;
-}
-
-void decode_sjis(char16_t* dest, const char* source, size_t max) {
-  const auto& table = sjis_to_unicode_table();
-  while (*source && (--max)) {
-    char16_t src_char = *(source++);
-    if (src_char & 0x80) {
-      src_char = (src_char << 8) | *(source++);
-      if ((src_char & 0xFF) == 0) {
-        return;
-      }
-    }
-    *(dest++) = table[src_char];
-  };
-  *dest = 0;
-}

 std::string encode_sjis(const char16_t* src, size_t src_count) {
  const auto& table = unicode_to_sjis_table();
+
+  const char16_t* src_end = src + src_count;
  string ret;
-  for (; *src && (src_count > 0); src_count--) {
-    ret.push_back(table[*(src++)]);
+  while ((src != src_end) && *src) {
+    uint16_t ch = *(src++);
+    uint16_t translated_c = table[ch];
+    if (translated_c == 0) {
+      throw runtime_error("untranslatable unicode character");
+    } else if (translated_c & 0xFF00) {
+      ret.push_back((translated_c >> 8) & 0xFF);
+      ret.push_back(translated_c & 0xFF);
+    } else {
+      ret.push_back(translated_c & 0xFF);
+    }
  };
  return ret;
 }

+void encode_sjis(
+    char* dest,
+    size_t dest_count,
+    const char16_t* src,
+    size_t src_count) {
+  const auto& table = unicode_to_sjis_table();
+
+  if (dest_count == 0) {
+    throw logic_error("cannot encode into zero-length buffer");
+  }
+
+  const char16_t* src_end = src + src_count;
+  const char* dest_end = dest + (dest_count - 1);
+  while ((dest != dest_end) && (src != src_end) && *src) {
+    uint16_t ch = *(src++);
+    uint16_t translated_c = table[ch];
+    if (translated_c == 0) {
+      throw runtime_error("untranslatable unicode character");
+    } else if (translated_c & 0xFF00) {
+      *(dest++) = (translated_c >> 8) & 0xFF;
+      // If the second byte of this character would leave no room for the null
+      // terminator, erase the first byte instead; the loop then stops here
+      if (dest == dest_end) {
+        *(dest - 1) = 0;
+      } else {
+        *(dest++) = translated_c & 0xFF;
+      }
+    } else {
+      *(dest++) = translated_c & 0xFF;
+    }
+  }
+  *dest = 0;
+}
+
 std::u16string decode_sjis(const char* src, size_t src_count) {
  const auto& table = sjis_to_unicode_table();
+
+  const char* src_end = src + src_count;
  u16string ret;
-  while (*src && (src_count > 0)) {
-    char16_t src_char = *(src++);
-    src_count--;
+  while ((src != src_end) && *src) {
+    uint16_t src_char = *(src++);
    if (src_char & 0x80) {
-      if (src_count == 0) {
-        return ret;
+      if (src == src_end) {
+        throw runtime_error("incomplete extended character");
      }
      src_char = (src_char << 8) | *(src++);
      if ((src_char & 0xFF) == 0) {
-        return ret;
+        throw runtime_error("incomplete extended character");
      }
-      src_count--;
    }
    ret.push_back(table[src_char]);
  };
  return ret;
 }

-std::string encode_sjis(const std::u16string& source) {
-  const auto& table = unicode_to_sjis_table();
-  string ret;
-  for (char16_t ch : source) {
-    ret.push_back(table[ch]);
-  };
-  return ret;
-}
-
-std::u16string decode_sjis(const std::string& source) {
+void decode_sjis(
+    char16_t* dest,
+    size_t dest_count,
+    const char* src,
+    size_t src_count) {
  const auto& table = sjis_to_unicode_table();
-  u16string ret;
-  for (size_t x = 0; x < source.size();) {
-    char16_t src_char = source[x++];
+
+  if (dest_count == 0) {
+    throw logic_error("cannot decode into zero-length buffer");
+  }
+
+  const char* src_end = src + src_count;
+  const char16_t* dest_end = dest + (dest_count - 1);
+  while ((dest != dest_end) && (src != src_end) && *src) {
+    uint16_t src_char = *(src++);
    if (src_char & 0x80) {
-      if (x == source.size()) {
-        return ret;
+      if (src == src_end) {
+        throw runtime_error("incomplete extended character");
      }
-      src_char = (src_char << 8) | source[x++];
+      src_char = (src_char << 8) | *(src++);
      if ((src_char & 0xFF) == 0) {
-        return ret;
+        throw runtime_error("incomplete extended character");
      }
    }
-    ret.push_back(table[src_char]);
+    *(dest++) = table[src_char];
  };
-  return ret;
-}
-
-
-
-void add_language_marker_inplace(char* a, char e, size_t dest_count) {
-  if ((a[0] == '\t') && (a[1] != 'C')) {
-    return;
-  }
-
-  size_t existing_count = strlen(a);
-  if (existing_count > dest_count - 3) {
-    existing_count = dest_count - 3;
-  }
-  memmove(&a[2], a, (existing_count + 1) * sizeof(char));
-  a[0] = '\t';
-  a[1] = e;
-  a[existing_count + 2] = 0;
-}
-
-void add_language_marker_inplace(char16_t* a, char16_t e, size_t dest_count) {
-  if ((a[0] == '\t') && (a[1] != 'C')) {
-    return;
-  }
-
-  size_t existing_count = char16len(a);
-  if (existing_count > dest_count - 3) {
-    existing_count = dest_count - 3;
-  }
-  memmove(&a[2], a, (existing_count + 1) * sizeof(char16_t));
-  a[0] = '\t';
-  a[1] = e;
-  a[existing_count + 2] = 0;
-}
-
-void remove_language_marker_inplace(char* a) {
-  if ((a[0] == '\t') && (a[1] != 'C')) {
-    strcpy(a, &a[2]);
-  }
-}
-
-void remove_language_marker_inplace(char16_t* a) {
-  if ((a[0] == '\t') && (a[1] != 'C')) {
-    strcpy_z(a, &a[2], char16len(a) - 2);
-  }
-}
-
-std::string add_language_marker(const std::string& s, char marker) {
-  if ((s.size() >= 2) && (s[0] == '\t') && (s[1] != 'C')) {
-    return s;
-  }
-
-  string ret;
-  ret.push_back('\t');
-  ret.push_back(marker);
-  return ret + s;
-}
-
-std::u16string add_language_marker(const std::u16string& s, char16_t marker) {
-  if ((s.size() >= 2) && (s[0] == L'\t') && (s[1] != L'C')) {
-    return s;
-  }
-
-  u16string ret;
-  ret.push_back(L'\t');
-  ret.push_back(marker);
-  return ret + s;
-}
-
-std::string remove_language_marker(const std::string& s) {
-  if ((s.size() < 2) || (s[0] != '\t') || (s[1] == 'C')) {
-    return s;
-  }
-  return s.substr(2);
-}
-
-std::u16string remove_language_marker(const std::u16string& s) {
-  if ((s.size() < 2) || (s[0] != L'\t') || (s[1] == L'C')) {
-    return s;
-  }
-  return s.substr(2);
 }