charsets: new common APIs to convert from/to charsets and UTF-8

author: Aleksander Morgado <aleksander@aleksander.es> 2020-11-27 00:41:40 +0100
committer: Aleksander Morgado <aleksander@aleksander.es> 2021-02-23 11:35:11 +0000
commit: 9c613d33e1f60501cc8406f6429097d8bda87c59 (patch)
tree: e3ff469d0fac1d703dfbe9191d000993b2799140
parent: 6f32c8d38f2c7ad269c4ccf84190ad6e917293a9 (diff)
2 files changed, 352 insertions, 7 deletions
diff --git a/src/mm-charsets.c b/src/mm-charsets.c
index 1b7d3d7c..caf2abb5 100644
--- a/src/mm-charsets.c
+++ b/src/mm-charsets.c
@@ -11,6 +11,7 @@
  * GNU General Public License for more details:
  *
  * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (C) 2020 Aleksander Morgado <aleksander@aleksander.es>
  */
 
 #include <config.h>
@@ -184,6 +185,7 @@ mm_modem_charset_hex_to_utf8 (const gchar    *src,
     return g_steal_pointer (&converted);
 }
 
+/******************************************************************************/
 /* GSM 03.38 encoding conversion stuff */
 
 #define GSM_DEF_ALPHABET_SIZE 128
@@ -302,6 +304,23 @@ utf8_to_gsm_def_char (const gchar *utf8,
     return FALSE;
 }
 
+/* Transliterate every 0x00 byte in @gsm, letting utf8_to_gsm_def_char() write
+ * the encoding of "?" in its place. Returns TRUE if any byte was replaced. */
+static gboolean
+translit_gsm_nul_byte (GByteArray *gsm)
+{
+    gboolean found = FALSE;
+    guint    idx;
+
+    for (idx = 0; idx < gsm->len; idx++) {
+        if (gsm->data[idx] != 0x00)
+            continue;
+        utf8_to_gsm_def_char ("?", 1, &gsm->data[idx]);
+        found = TRUE;
+    }
+    return found;
+}
+
 
 #define EONE(a, g)        { {a, 0x00, 0x00}, 1, g }
 #define ETHR(a, b, c, g)  { {a, b,    c},    3, g }
@@ -488,6 +507,10 @@ mm_charset_utf8_to_unpacked_gsm (const gchar  *utf8,
     return g_byte_array_free (g_steal_pointer (&gsm), FALSE);
 }
 
+/******************************************************************************/
+/* Checks to see whether conversion to a target charset may be done without
+ * any loss. */
+
 static gboolean
 gsm_is_subset (gunichar     c,
                const gchar *utf8,
@@ -617,13 +640,6 @@ const SubsetEntry subset_table[] = {
     { MM_MODEM_CHARSET_PCDN,    pcdn_is_subset     },
 };
 
-/**
- * mm_charset_can_covert_to:
- * @utf8: UTF-8 valid string.
- * @charset: the #MMModemCharset to validate the conversion from @utf8.
- *
- * Returns: %TRUE if the conversion is possible without errors, %FALSE otherwise.
- */
 gboolean
 mm_charset_can_convert_to (const gchar    *utf8,
                            MMModemCharset  charset)
@@ -667,6 +683,9 @@ mm_charset_can_convert_to (const gchar    *utf8,
     return TRUE;
 }
 
+/******************************************************************************/
+/* GSM-7 pack/unpack operations */
+
 guint8 *
 mm_charset_gsm_unpack (const guint8 *gsm,
                        guint32       num_septets,
@@ -944,3 +963,268 @@ mm_utf8_take_and_convert_to_charset (gchar          *str,
 
     return encoded;
 }
+
+/*****************************************************************************/
+/* Main conversion functions */
+
+/* Encode the NUL-terminated @utf8 string into @charset with g_convert().
+ * If that fails and @translit is set, retry with g_convert_with_fallback()
+ * using "?" as fallback. On success returns the newly allocated buffer and,
+ * if @out_size is given, stores there the number of bytes written.
+ * Returns NULL with @error set on failure. */
+static guint8 *
+charset_iconv_from_utf8 (const gchar     *utf8,
+                         MMModemCharset   charset,
+                         gboolean         translit,
+                         guint           *out_size,
+                         GError         **error)
+{
+    const CharsetSettings *cs = lookup_charset_settings (charset);
+    g_autoptr(GError)      strict_error = NULL;
+    gsize                  n_bytes = 0;
+    guint8                *out;
+
+    /* First attempt: strict conversion */
+    out = (guint8 *) g_convert (utf8, -1, cs->iconv_name, "UTF-8",
+                                NULL, &n_bytes, &strict_error);
+
+    if (!out) {
+        if (!translit) {
+            g_propagate_error (error, g_steal_pointer (&strict_error));
+            g_prefix_error (error, "Couldn't convert from UTF-8 to %s: ", cs->gsm_name);
+            return NULL;
+        }
+
+        /* Second attempt: lossy conversion */
+        out = (guint8 *) g_convert_with_fallback (utf8, -1, cs->iconv_name, "UTF-8", "?",
+                                                  NULL, &n_bytes, error);
+        if (!out) {
+            g_prefix_error (error, "Couldn't convert from UTF-8 to %s with translit: ", cs->gsm_name);
+            return NULL;
+        }
+    }
+
+    if (out_size)
+        *out_size = (guint) n_bytes;
+    return out;
+}
+
+/* Encode @utf8 into @charset; see mm-charsets.h for the full contract.
+ * Returns NULL, with @error set by the failing step, if encoding fails. */
+GByteArray *
+mm_modem_charset_bytearray_from_utf8 (const gchar     *utf8,
+                                      MMModemCharset   charset,
+                                      gboolean         translit,
+                                      GError         **error)
+{
+    guint8 *encoded = NULL;
+    guint   encoded_size = 0;
+
+    switch (charset) {
+        case MM_MODEM_CHARSET_GSM:
+            encoded = mm_charset_utf8_to_unpacked_gsm (utf8, translit, &encoded_size, error);
+            break;
+        case MM_MODEM_CHARSET_IRA:
+        case MM_MODEM_CHARSET_8859_1:
+        case MM_MODEM_CHARSET_UTF8:
+        case MM_MODEM_CHARSET_UCS2:
+        case MM_MODEM_CHARSET_PCCP437:
+        case MM_MODEM_CHARSET_PCDN:
+        case MM_MODEM_CHARSET_UTF16:
+            encoded = charset_iconv_from_utf8 (utf8, charset, translit, &encoded_size, error);
+            break;
+        case MM_MODEM_CHARSET_UNKNOWN:
+            g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+                         "Cannot convert from UTF-8: unknown target charset");
+            return NULL;
+        default:
+            g_assert_not_reached ();
+    }
+
+    /* A NULL buffer means the helper failed: never wrap it in an empty array */
+    return encoded ? g_byte_array_new_take (encoded, encoded_size) : NULL;
+}
+
+/* Encode @utf8 into @charset and hand the result back as a C string; the
+ * UCS2 and UTF16 outputs are hex-encoded with mm_utils_bin2hexstr(). */
+gchar *
+mm_modem_charset_str_from_utf8 (const gchar     *utf8,
+                                MMModemCharset   charset,
+                                gboolean         translit,
+                                GError         **error)
+{
+    g_autoptr(GByteArray) encoded = NULL;
+
+    if (charset == MM_MODEM_CHARSET_UNKNOWN) {
+        g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+                     "Cannot convert from UTF-8: unknown target charset");
+        return NULL;
+    }
+
+    encoded = mm_modem_charset_bytearray_from_utf8 (utf8, charset, translit, error);
+    if (!encoded)
+        return NULL;
+
+    switch (charset) {
+        case MM_MODEM_CHARSET_UCS2:
+        case MM_MODEM_CHARSET_UTF16:
+            /* These charsets allow embedded NUL bytes: return them as hex */
+            return mm_utils_bin2hexstr (encoded->data, encoded->len);
+        case MM_MODEM_CHARSET_GSM:
+            /* Unpacked GSM-7 encodes '@' as 0x00, which cannot live inside a
+             * plain C string. Those bytes are always replaced here, and the
+             * replacement is an error unless transliteration was requested. */
+            if (translit_gsm_nul_byte (encoded) && !translit) {
+                g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+                             "Cannot convert to GSM-7 string: transliteration required for embedded '@'");
+                return NULL;
+            }
+            /* fall through */
+        case MM_MODEM_CHARSET_IRA:
+        case MM_MODEM_CHARSET_8859_1:
+        case MM_MODEM_CHARSET_UTF8:
+        case MM_MODEM_CHARSET_PCCP437:
+        case MM_MODEM_CHARSET_PCDN:
+            return (gchar *) g_byte_array_free (g_steal_pointer (&encoded), FALSE);
+        case MM_MODEM_CHARSET_UNKNOWN:
+        default:
+            g_assert_not_reached ();
+    }
+}
+
+/* Decode @len bytes of @data, encoded in @charset, into UTF-8 with
+ * g_convert(). If that fails and @translit is set, retry with
+ * g_convert_with_fallback() using "?" as fallback.
+ * Returns the newly allocated string, or NULL with @error set. */
+static gchar *
+charset_iconv_to_utf8 (const guint8           *data,
+                       guint32                 len,
+                       MMModemCharset          charset,
+                       gboolean                translit,
+                       GError                **error)
+{
+    const CharsetSettings *cs = lookup_charset_settings (charset);
+    const gchar           *input = (const gchar *) data;
+    g_autoptr(GError)      strict_error = NULL;
+    gchar                 *decoded;
+
+    /* First attempt: strict conversion */
+    decoded = g_convert (input, len, "UTF-8", cs->iconv_name,
+                         NULL, NULL, &strict_error);
+    if (decoded)
+        return decoded;
+
+    if (!translit) {
+        g_propagate_error (error, g_steal_pointer (&strict_error));
+        g_prefix_error (error, "Couldn't convert from %s to UTF-8: ", cs->gsm_name);
+        return NULL;
+    }
+
+    /* Second attempt: lossy conversion */
+    decoded = g_convert_with_fallback (input, len, "UTF-8", cs->iconv_name, "?",
+                                       NULL, NULL, error);
+    if (!decoded)
+        g_prefix_error (error, "Couldn't convert from %s to UTF-8 with translit: ", cs->gsm_name);
+    return decoded;
+}
+
+/* Decode @bytearray (encoded in @charset) into UTF-8; see mm-charsets.h.
+ * Returns NULL with @error set if decoding fails or yields invalid UTF-8. */
+gchar *
+mm_modem_charset_bytearray_to_utf8 (GByteArray      *bytearray,
+                                    MMModemCharset   charset,
+                                    gboolean         translit,
+                                    GError         **error)
+{
+    const CharsetSettings *settings;
+    g_autofree gchar      *utf8 = NULL;
+
+    if (charset == MM_MODEM_CHARSET_UNKNOWN) {
+        g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+                     "Cannot convert to UTF-8: unknown source charset");
+        return NULL;
+    }
+
+    settings = lookup_charset_settings (charset);
+    switch (charset) {
+        case MM_MODEM_CHARSET_GSM:
+            utf8 = (gchar *) mm_charset_gsm_unpacked_to_utf8 (bytearray->data, bytearray->len, translit, error);
+            break;
+        case MM_MODEM_CHARSET_IRA:
+        case MM_MODEM_CHARSET_UTF8:
+        case MM_MODEM_CHARSET_8859_1:
+        case MM_MODEM_CHARSET_PCCP437:
+        case MM_MODEM_CHARSET_PCDN:
+        case MM_MODEM_CHARSET_UCS2:
+        case MM_MODEM_CHARSET_UTF16:
+            utf8 = charset_iconv_to_utf8 (bytearray->data, bytearray->len, charset, translit, error);
+            break;
+        case MM_MODEM_CHARSET_UNKNOWN:
+        default:
+            g_assert_not_reached ();
+    }
+
+    if (utf8 && g_utf8_validate (utf8, -1, NULL))
+        return g_steal_pointer (&utf8);
+
+    /* g_prefix_error() is a no-op without an error: set one for invalid output */
+    if (utf8)
+        g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_FAILED,
+                     "Invalid conversion from %s to UTF-8: result is not valid UTF-8", settings->gsm_name);
+    else
+        g_prefix_error (error, "Invalid conversion from %s to UTF-8: ", settings->gsm_name);
+    return NULL;
+}
+
+/* Decode @str (@len bytes, or NUL-terminated if @len is negative), encoded in
+ * @charset, into UTF-8. UCS2/UTF16 input is expected as a hex string. */
+gchar *
+mm_modem_charset_str_to_utf8 (const gchar     *str,
+                              gssize           len,
+                              MMModemCharset   charset,
+                              gboolean         translit,
+                              GError         **error)
+{
+    g_autoptr(GByteArray) bytearray = NULL;
+
+    if (charset == MM_MODEM_CHARSET_UNKNOWN) {
+        g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+                     "Cannot convert to UTF-8: unknown source charset");
+        return NULL;
+    }
+
+    /* Note: GSM-7 encodes '@' as 0x00, so strlen() stops early on GSM-7 input
+     * containing that character. For GSM-7, give a proper len value whenever
+     * possible or otherwise use the bytearray_to_utf8() method instead. */
+    if (len < 0)
+        len = strlen (str);
+
+    switch (charset) {
+        case MM_MODEM_CHARSET_GSM:
+        case MM_MODEM_CHARSET_IRA:
+        case MM_MODEM_CHARSET_8859_1:
+        case MM_MODEM_CHARSET_UTF8:
+        case MM_MODEM_CHARSET_PCCP437:
+        case MM_MODEM_CHARSET_PCDN:
+            bytearray = g_byte_array_sized_new (len);
+            g_byte_array_append (bytearray, (const guint8 *)str, len);
+            break;
+        case MM_MODEM_CHARSET_UCS2:
+        case MM_MODEM_CHARSET_UTF16: {
+            guint8 *bin = NULL;
+            gsize   bin_len;
+
+            bin = (guint8 *) mm_utils_hexstr2bin (str, len, &bin_len, error);
+            if (!bin)
+                return NULL;
+
+            bytearray = g_byte_array_new_take (bin, bin_len);
+            break;
+        }
+        case MM_MODEM_CHARSET_UNKNOWN:
+        default:
+            g_assert_not_reached ();
+    }
+
+    return mm_modem_charset_bytearray_to_utf8 (bytearray, charset, translit, error);
+}
diff --git a/src/mm-charsets.h b/src/mm-charsets.h
index b59eeeaa..4d032f38 100644
--- a/src/mm-charsets.h
+++ b/src/mm-charsets.h
@@ -18,6 +18,8 @@
 
 #include <glib.h>
 
+/*****************************************************************************************/
+
 typedef enum {
     MM_MODEM_CHARSET_UNKNOWN = 0,
     MM_MODEM_CHARSET_GSM     = 1 << 0,
@@ -33,6 +35,8 @@ typedef enum {
 const gchar    *mm_modem_charset_to_string   (MMModemCharset  charset);
 MMModemCharset  mm_modem_charset_from_string (const gchar    *string);
 
+/*****************************************************************************************/
+
 /* Append the given string to the given byte array but re-encode it
  * into the given charset first.  The original string is assumed to be
  * UTF-8 encoded.
@@ -81,4 +85,61 @@ gchar *mm_charset_take_and_convert_to_utf8 (gchar          *str,
 gchar *mm_utf8_take_and_convert_to_charset (gchar          *str,
                                             MMModemCharset  charset);
 
+/*****************************************************************************************/
+
+/*
+ * Convert the given NUL-terminated UTF-8 string into the given charset.
+ *
+ * The output is given as a bytearray, because the target charset may allow
+ * embedded NUL bytes (e.g. UTF-16). The bytearray length gives the length of
+ * the encoded string, which is not guaranteed to be NUL-terminated.
+ *
+ * Returns NULL with error set if charset is MM_MODEM_CHARSET_UNKNOWN.
+ */
+GByteArray *mm_modem_charset_bytearray_from_utf8 (const gchar     *utf8,
+                                                  MMModemCharset   charset,
+                                                  gboolean         translit,
+                                                  GError         **error);
+
+/*
+ * Convert the given UTF-8 encoded string into the given charset.
+ *
+ * The output is given as a NUL-terminated C string, so no explicit output
+ * length is returned. UCS2 and UTF-16 output is hex-encoded, as those charsets
+ * allow embedded NUL bytes.
+ *
+ * In GSM-7 output the 0x00 bytes (character '@') are replaced, which makes
+ * the conversion fail unless translit is TRUE.
+ */
+gchar *mm_modem_charset_str_from_utf8 (const gchar     *utf8,
+                                       MMModemCharset   charset,
+                                       gboolean         translit,
+                                       GError         **error);
+
+/*
+ * Convert the input byte array, which is encoded in the given charset,
+ * into a UTF-8 encoded string.
+ *
+ * The output string is guaranteed to be valid UTF-8 and NUL-terminated.
+ */
+gchar *mm_modem_charset_bytearray_to_utf8 (GByteArray      *bytearray,
+                                           MMModemCharset   charset,
+                                           gboolean         translit,
+                                           GError         **error);
+
+/*
+ * Convert the input string, which is encoded in the given charset, into a
+ * UTF-8 encoded string. Input in those charsets that allow embedded NUL
+ * bytes (e.g. UTF-16) needs to be hex-encoded.
+ *
+ * If the input string is NUL-terminated, len may be given as -1; otherwise
+ * len needs to specify the number of valid bytes in the input string.
+ *
+ * The output string is guaranteed to be valid UTF-8 and NUL-terminated.
+ */
+gchar *mm_modem_charset_str_to_utf8 (const gchar     *str,
+                                     gssize           len,
+                                     MMModemCharset   charset,
+                                     gboolean         translit,
+                                     GError         **error);
+
 #endif /* MM_CHARSETS_H */
author	Aleksander Morgado <aleksander@aleksander.es>	2020-11-27 00:41:40 +0100
committer	Aleksander Morgado <aleksander@aleksander.es>	2021-02-23 11:35:11 +0000
commit	9c613d33e1f60501cc8406f6429097d8bda87c59 (patch)
tree	e3ff469d0fac1d703dfbe9191d000993b2799140
parent	6f32c8d38f2c7ad269c4ccf84190ad6e917293a9 (diff)