charsets: add UTF-16BE as a possible modem charset

This is only an implementation detail, to be treated as an extension of UCS2BE; it is never really meant to be used as a real modem charset.
author: Aleksander Morgado <aleksander@aleksander.es> 2020-08-20 10:58:24 +0200
committer: Aleksander Morgado <aleksander@aleksander.es> 2020-08-20 18:13:18 +0200
commit: eb5443b197464e55c85d7a8af67a28f2088506a3 (patch)
tree: 938203db0b78602ff9a07fb7e97c8e4d3a871cee
parent: 93686510d737bc373100beaeeb3edb7ca091a3f0 (diff)
3 files changed, 26 insertions, 11 deletions
diff --git a/src/mm-charsets.c b/src/mm-charsets.c
index e48cec3e..46b3c68c 100644
--- a/src/mm-charsets.c
+++ b/src/mm-charsets.c
@@ -43,6 +43,7 @@ static CharsetEntry charset_map[] = {
     { "PCCP437", "CP437",  "CP437",     "CP437//TRANSLIT",     MM_MODEM_CHARSET_PCCP437 },
     { "PCDN",    "CP850",  "CP850",     "CP850//TRANSLIT",     MM_MODEM_CHARSET_PCDN },
     { "HEX",     NULL,     NULL,        NULL,                  MM_MODEM_CHARSET_HEX },
+    { "UTF-16",  "UTF16",  "UTF-16BE",  "UTF-16BE//TRANSLIT",  MM_MODEM_CHARSET_UTF16 },
     { NULL,      NULL,     NULL,        NULL,                  MM_MODEM_CHARSET_UNKNOWN }
 };
 
@@ -536,6 +537,14 @@ ucs2_is_subset (gunichar c, const char *utf8, gsize ulen)
 }
 
 static gboolean
+utf16_is_subset (gunichar     c,
+                 const gchar *utf8,
+                 gsize        ulen)
+{
+    return TRUE; /* UTF-16 covers the whole Unicode range: every character is accepted, arguments are unused */
+}
+
+static gboolean
 iso88591_is_subset (gunichar c, const char *utf8, gsize ulen)
 {
     return (c <= 0xFF);
@@ -613,6 +622,7 @@ SubsetEntry subset_table[] = {
     { MM_MODEM_CHARSET_GSM,     gsm_is_subset },
     { MM_MODEM_CHARSET_IRA,     ira_is_subset },
     { MM_MODEM_CHARSET_UCS2,    ucs2_is_subset },
+    { MM_MODEM_CHARSET_UTF16,   utf16_is_subset },
     { MM_MODEM_CHARSET_8859_1,  iso88591_is_subset },
     { MM_MODEM_CHARSET_PCCP437, pccp437_is_subset },
     { MM_MODEM_CHARSET_PCDN,    pcdn_is_subset },
@@ -786,7 +796,8 @@ mm_charset_take_and_convert_to_utf8 (gchar *str, MMModemCharset charset)
         break;
     }
 
-    case MM_MODEM_CHARSET_UCS2: {
+    case MM_MODEM_CHARSET_UCS2:
+    case MM_MODEM_CHARSET_UTF16: {
         gsize len;
         gboolean possibly_hex = TRUE;
         gsize bread = 0, bwritten = 0;
@@ -914,7 +925,8 @@ mm_utf8_take_and_convert_to_charset (gchar *str,
         break;
     }
 
-    case MM_MODEM_CHARSET_UCS2: {
+    case MM_MODEM_CHARSET_UCS2:
+    case MM_MODEM_CHARSET_UTF16: {
         const gchar *iconv_to;
         gsize encoded_len = 0;
         GError *error = NULL;
diff --git a/src/mm-charsets.h b/src/mm-charsets.h
index 9e9215d5..e81674c4 100644
--- a/src/mm-charsets.h
+++ b/src/mm-charsets.h
@@ -27,7 +27,8 @@ typedef enum {
     MM_MODEM_CHARSET_UCS2    = 0x00000010,
     MM_MODEM_CHARSET_PCCP437 = 0x00000020,
     MM_MODEM_CHARSET_PCDN    = 0x00000040,
-    MM_MODEM_CHARSET_HEX     = 0x00000080
+    MM_MODEM_CHARSET_HEX     = 0x00000080,
+    MM_MODEM_CHARSET_UTF16   = 0x00000100,
 } MMModemCharset;
 
 const char *mm_modem_charset_to_string (MMModemCharset charset);
diff --git a/src/tests/test-charsets.c b/src/tests/test-charsets.c
index 0931d7e8..a15e0332 100644
--- a/src/tests/test-charsets.c
+++ b/src/tests/test-charsets.c
@@ -369,6 +369,7 @@ struct charset_can_convert_to_test_s {
     gboolean    to_ira;
     gboolean    to_8859_1;
     gboolean    to_ucs2;
+    gboolean    to_utf16;
     gboolean    to_pccp437;
     gboolean    to_pcdn;
 };
@@ -379,35 +380,35 @@ test_charset_can_covert_to (void)
     static const struct charset_can_convert_to_test_s charset_can_convert_to_test[] = {
         {
             .utf8 = "",
-            .to_gsm = TRUE, .to_ira = TRUE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE,
+            .to_gsm = TRUE, .to_ira = TRUE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_utf16 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE,
         },
         {
             .utf8 = " ",
-            .to_gsm = TRUE, .to_ira = TRUE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE,
+            .to_gsm = TRUE, .to_ira = TRUE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_utf16 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE,
         },
         {
             .utf8 = "some basic ascii",
-            .to_gsm = TRUE, .to_ira = TRUE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE,
+            .to_gsm = TRUE, .to_ira = TRUE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_utf16 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE,
         },
         {
             .utf8 = "ホモ・サピエンス 喂人类 katakana, chinese, english: UCS2 takes it all",
-            .to_gsm = FALSE, .to_ira = FALSE, .to_8859_1 = FALSE, .to_ucs2 = TRUE, .to_pccp437 = FALSE, .to_pcdn = FALSE,
+            .to_gsm = FALSE, .to_ira = FALSE, .to_8859_1 = FALSE, .to_ucs2 = TRUE, .to_utf16 = TRUE, .to_pccp437 = FALSE, .to_pcdn = FALSE,
         },
         {
             .utf8 = "Some from the GSM7 basic set: a % Ψ Ω ñ ö è æ",
-            .to_gsm = TRUE, .to_ira = FALSE, .to_8859_1 = FALSE, .to_ucs2 = TRUE, .to_pccp437 = FALSE, .to_pcdn = FALSE,
+            .to_gsm = TRUE, .to_ira = FALSE, .to_8859_1 = FALSE, .to_ucs2 = TRUE, .to_utf16 = TRUE, .to_pccp437 = FALSE, .to_pcdn = FALSE,
         },
         {
             .utf8 = "More from the GSM7 extended set: {} [] ~ € |",
-            .to_gsm = TRUE, .to_ira = FALSE, .to_8859_1 = FALSE, .to_ucs2 = TRUE, .to_pccp437 = FALSE, .to_pcdn = FALSE,
+            .to_gsm = TRUE, .to_ira = FALSE, .to_8859_1 = FALSE, .to_ucs2 = TRUE, .to_utf16 = TRUE, .to_pccp437 = FALSE, .to_pcdn = FALSE,
         },
         {
             .utf8 = "patín cannot be encoded in GSM7 or IRA, but is valid UCS2, ISO-8859-1, CP437 and CP850",
-            .to_gsm = FALSE, .to_ira = FALSE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE,
+            .to_gsm = FALSE, .to_ira = FALSE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_utf16 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE,
         },
         {
             .utf8 = "ècole can be encoded in multiple ways, but not in IRA",
-            .to_gsm = TRUE, .to_ira = FALSE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE,
+            .to_gsm = TRUE, .to_ira = FALSE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_utf16 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE,
         },
     };
     guint i;
@@ -418,6 +419,7 @@ test_charset_can_covert_to (void)
         g_assert (mm_charset_can_convert_to (charset_can_convert_to_test[i].utf8, MM_MODEM_CHARSET_IRA)     == charset_can_convert_to_test[i].to_ira);
         g_assert (mm_charset_can_convert_to (charset_can_convert_to_test[i].utf8, MM_MODEM_CHARSET_8859_1)  == charset_can_convert_to_test[i].to_8859_1);
         g_assert (mm_charset_can_convert_to (charset_can_convert_to_test[i].utf8, MM_MODEM_CHARSET_UCS2)    == charset_can_convert_to_test[i].to_ucs2);
+        g_assert (mm_charset_can_convert_to (charset_can_convert_to_test[i].utf8, MM_MODEM_CHARSET_UTF16)   == charset_can_convert_to_test[i].to_utf16);
         g_assert (mm_charset_can_convert_to (charset_can_convert_to_test[i].utf8, MM_MODEM_CHARSET_PCCP437) == charset_can_convert_to_test[i].to_pccp437);
         g_assert (mm_charset_can_convert_to (charset_can_convert_to_test[i].utf8, MM_MODEM_CHARSET_PCDN)    == charset_can_convert_to_test[i].to_pcdn);
     }
author	Aleksander Morgado <aleksander@aleksander.es>	2020-08-20 10:58:24 +0200
committer	Aleksander Morgado <aleksander@aleksander.es>	2020-08-20 18:13:18 +0200
commit	eb5443b197464e55c85d7a8af67a28f2088506a3 (patch)
tree	938203db0b78602ff9a07fb7e97c8e4d3a871cee
parent	93686510d737bc373100beaeeb3edb7ca091a3f0 (diff)