aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAleksander Morgado <aleksander@aleksander.es>2020-11-27 00:41:40 +0100
committerAleksander Morgado <aleksander@aleksander.es>2021-02-23 11:35:11 +0000
commit9c613d33e1f60501cc8406f6429097d8bda87c59 (patch)
treee3ff469d0fac1d703dfbe9191d000993b2799140
parent6f32c8d38f2c7ad269c4ccf84190ad6e917293a9 (diff)
charsets: new common APIs to convert from/to charsets and UTF-8
-rw-r--r--src/mm-charsets.c298
-rw-r--r--src/mm-charsets.h61
2 files changed, 352 insertions, 7 deletions
diff --git a/src/mm-charsets.c b/src/mm-charsets.c
index 1b7d3d7c..caf2abb5 100644
--- a/src/mm-charsets.c
+++ b/src/mm-charsets.c
@@ -11,6 +11,7 @@
* GNU General Public License for more details:
*
* Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (C) 2020 Aleksander Morgado <aleksander@aleksander.es>
*/
#include <config.h>
@@ -184,6 +185,7 @@ mm_modem_charset_hex_to_utf8 (const gchar *src,
return g_steal_pointer (&converted);
}
+/******************************************************************************/
/* GSM 03.38 encoding conversion stuff */
#define GSM_DEF_ALPHABET_SIZE 128
@@ -302,6 +304,23 @@ utf8_to_gsm_def_char (const gchar *utf8,
return FALSE;
}
+/*
+ * Walk the unpacked GSM-7 bytearray and, for every byte equal to 0x00, call
+ * utf8_to_gsm_def_char() with "?" so that the byte is overwritten in place.
+ *
+ * Returns: TRUE if at least one 0x00 byte was found, FALSE otherwise.
+ */
+static gboolean
+translit_gsm_nul_byte (GByteArray *gsm)
+{
+    static const gchar *fallback = "?";
+    gboolean            replaced = FALSE;
+    guint               idx;
+
+    for (idx = 0; idx < gsm->len; idx++) {
+        if (gsm->data[idx] != 0x00)
+            continue;
+
+        /* The result of the helper is deliberately not checked here */
+        utf8_to_gsm_def_char (fallback, 1, &gsm->data[idx]);
+        replaced = TRUE;
+    }
+
+    return replaced;
+}
+
+
#define EONE(a, g) { {a, 0x00, 0x00}, 1, g }
#define ETHR(a, b, c, g) { {a, b, c}, 3, g }
@@ -488,6 +507,10 @@ mm_charset_utf8_to_unpacked_gsm (const gchar *utf8,
return g_byte_array_free (g_steal_pointer (&gsm), FALSE);
}
+/******************************************************************************/
+/* Checks to see whether conversion to a target charset may be done without
+ * any loss. */
+
static gboolean
gsm_is_subset (gunichar c,
const gchar *utf8,
@@ -617,13 +640,6 @@ const SubsetEntry subset_table[] = {
{ MM_MODEM_CHARSET_PCDN, pcdn_is_subset },
};
-/**
- * mm_charset_can_covert_to:
- * @utf8: UTF-8 valid string.
- * @charset: the #MMModemCharset to validate the conversion from @utf8.
- *
- * Returns: %TRUE if the conversion is possible without errors, %FALSE otherwise.
- */
gboolean
mm_charset_can_convert_to (const gchar *utf8,
MMModemCharset charset)
@@ -667,6 +683,9 @@ mm_charset_can_convert_to (const gchar *utf8,
return TRUE;
}
+/******************************************************************************/
+/* GSM-7 pack/unpack operations */
+
guint8 *
mm_charset_gsm_unpack (const guint8 *gsm,
guint32 num_septets,
@@ -944,3 +963,268 @@ mm_utf8_take_and_convert_to_charset (gchar *str,
return encoded;
}
+
+/*****************************************************************************/
+/* Main conversion functions */
+
+/*
+ * Encode a NUL-terminated UTF-8 string into @charset using the iconv name
+ * found in the charset settings table.
+ *
+ * A strict g_convert() is attempted first. If it fails and @translit is
+ * TRUE, the conversion is retried with g_convert_with_fallback() using "?"
+ * as fallback string; if @translit is FALSE the strict error is reported.
+ *
+ * On success the encoded buffer is returned and, if @out_size is given, it
+ * is set to the number of bytes written. On failure NULL is returned and
+ * @error is set with a prefixed message.
+ */
+static guint8 *
+charset_iconv_from_utf8 (const gchar     *utf8,
+                         MMModemCharset   charset,
+                         gboolean         translit,
+                         guint           *out_size,
+                         GError         **error)
+{
+    const CharsetSettings *settings;
+    g_autoptr(GError)      strict_error = NULL;
+    guint8                *encoded;
+    gsize                  n_bytes = 0;
+
+    settings = lookup_charset_settings (charset);
+
+    /* First attempt: strict conversion, no replacements allowed */
+    encoded = (guint8 *) g_convert (utf8, -1,
+                                    settings->iconv_name, "UTF-8",
+                                    NULL, &n_bytes, &strict_error);
+
+    if (!encoded) {
+        if (!translit) {
+            g_propagate_error (error, g_steal_pointer (&strict_error));
+            g_prefix_error (error, "Couldn't convert from UTF-8 to %s: ", settings->gsm_name);
+            return NULL;
+        }
+
+        /* Second attempt: allow the "?" fallback; the strict error is
+         * dropped automatically when leaving the function */
+        encoded = (guint8 *) g_convert_with_fallback (utf8, -1,
+                                                      settings->iconv_name, "UTF-8", "?",
+                                                      NULL, &n_bytes, error);
+        if (!encoded) {
+            g_prefix_error (error, "Couldn't convert from UTF-8 to %s with translit: ", settings->gsm_name);
+            return NULL;
+        }
+    }
+
+    if (out_size)
+        *out_size = (guint) n_bytes;
+    return encoded;
+}
+
+/*
+ * Convert the given UTF-8 string into @charset and return the result as a
+ * newly allocated GByteArray.
+ *
+ * GSM-7 is handled by mm_charset_utf8_to_unpacked_gsm(); every other known
+ * charset goes through charset_iconv_from_utf8(). The @translit flag is
+ * forwarded unchanged to whichever helper is used.
+ *
+ * Returns: the encoded bytearray, or NULL if @charset is unknown or the
+ * helper failed (in which case the helper is expected to have set @error).
+ */
+GByteArray *
+mm_modem_charset_bytearray_from_utf8 (const gchar     *utf8,
+                                      MMModemCharset   charset,
+                                      gboolean         translit,
+                                      GError         **error)
+{
+    guint8 *encoded = NULL;
+    guint   encoded_size = 0;
+
+    if (charset == MM_MODEM_CHARSET_UNKNOWN) {
+        g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+                     "Cannot convert from UTF-8: unknown target charset");
+        return NULL;
+    }
+
+    switch (charset) {
+        case MM_MODEM_CHARSET_GSM:
+            encoded = mm_charset_utf8_to_unpacked_gsm (utf8, translit, &encoded_size, error);
+            break;
+        case MM_MODEM_CHARSET_IRA:
+        case MM_MODEM_CHARSET_8859_1:
+        case MM_MODEM_CHARSET_UTF8:
+        case MM_MODEM_CHARSET_UCS2:
+        case MM_MODEM_CHARSET_PCCP437:
+        case MM_MODEM_CHARSET_PCDN:
+        case MM_MODEM_CHARSET_UTF16:
+            encoded = charset_iconv_from_utf8 (utf8, charset, translit, &encoded_size, error);
+            break;
+        case MM_MODEM_CHARSET_UNKNOWN:
+        default:
+            g_assert_not_reached ();
+    }
+
+    /* A NULL buffer means the conversion failed. Wrapping it with
+     * g_byte_array_new_take() would still yield a non-NULL (empty) array,
+     * so callers testing the return value against NULL would take the
+     * failure for a success. Report the failure explicitly instead. */
+    if (!encoded)
+        return NULL;
+
+    return g_byte_array_new_take (encoded, encoded_size);
+}
+
+/*
+ * Convert the given UTF-8 string into @charset and return it as a C string.
+ *
+ * The conversion itself is done by mm_modem_charset_bytearray_from_utf8().
+ * For UCS2 and UTF-16 the resulting bytes are returned hex-encoded through
+ * mm_utils_bin2hexstr(); for the remaining charsets the raw bytes of the
+ * bytearray are handed over to the caller.
+ *
+ * Returns: the converted string, or NULL with @error set.
+ */
+gchar *
+mm_modem_charset_str_from_utf8 (const gchar     *utf8,
+                                MMModemCharset   charset,
+                                gboolean         translit,
+                                GError         **error)
+{
+    g_autoptr(GByteArray) bytearray = NULL;
+
+    if (charset == MM_MODEM_CHARSET_UNKNOWN) {
+        g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+                     "Cannot convert from UTF-8: unknown target charset");
+        return NULL;
+    }
+
+    bytearray = mm_modem_charset_bytearray_from_utf8 (utf8, charset, translit, error);
+    if (!bytearray)
+        return NULL;
+
+    if (charset == MM_MODEM_CHARSET_GSM) {
+        gboolean replaced;
+
+        /* Unpacked GSM-7 can only be used as a plain C string when it has
+         * no '@' character, because '@' is encoded as 0x00. The NUL bytes
+         * are always transliterated here; if any replacement happened and
+         * the caller did not ask for transliteration, the operation is
+         * failed. */
+        replaced = translit_gsm_nul_byte (bytearray);
+        if (replaced && !translit) {
+            g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+                         "Cannot convert to GSM-7 string: transliteration required for embedded '@'");
+            return NULL;
+        }
+    }
+
+    switch (charset) {
+        case MM_MODEM_CHARSET_UCS2:
+        case MM_MODEM_CHARSET_UTF16:
+            /* Hex-encode; the bytearray itself is released on return */
+            return mm_utils_bin2hexstr (bytearray->data, bytearray->len);
+        case MM_MODEM_CHARSET_GSM:
+        case MM_MODEM_CHARSET_IRA:
+        case MM_MODEM_CHARSET_8859_1:
+        case MM_MODEM_CHARSET_UTF8:
+        case MM_MODEM_CHARSET_PCCP437:
+        case MM_MODEM_CHARSET_PCDN:
+            /* Give the raw buffer to the caller, dropping only the wrapper */
+            return (gchar *) g_byte_array_free (g_steal_pointer (&bytearray), FALSE);
+        case MM_MODEM_CHARSET_UNKNOWN:
+        default:
+            g_assert_not_reached ();
+    }
+}
+
+/*
+ * Decode @len bytes of @data, encoded in @charset, into a UTF-8 string
+ * using the iconv name found in the charset settings table.
+ *
+ * A strict g_convert() is attempted first. If it fails and @translit is
+ * TRUE, the conversion is retried with g_convert_with_fallback() using "?"
+ * as fallback string; if @translit is FALSE the strict error is reported.
+ *
+ * Returns: the newly allocated string, or NULL with @error set and
+ * prefixed.
+ */
+static gchar *
+charset_iconv_to_utf8 (const guint8    *data,
+                       guint32          len,
+                       MMModemCharset   charset,
+                       gboolean         translit,
+                       GError         **error)
+{
+    const CharsetSettings *settings;
+    g_autoptr(GError)      strict_error = NULL;
+    gchar                 *converted;
+
+    settings = lookup_charset_settings (charset);
+
+    /* First attempt: strict conversion */
+    converted = g_convert ((const gchar *) data, len,
+                           "UTF-8",
+                           settings->iconv_name,
+                           NULL, NULL, &strict_error);
+    if (converted)
+        return converted;
+
+    if (translit) {
+        /* Second attempt: allow the "?" fallback; the strict error is
+         * dropped automatically when leaving the function */
+        converted = g_convert_with_fallback ((const gchar *) data, len,
+                                             "UTF-8", settings->iconv_name, "?",
+                                             NULL, NULL, error);
+        if (!converted)
+            g_prefix_error (error, "Couldn't convert from %s to UTF-8 with translit: ", settings->gsm_name);
+        return converted;
+    }
+
+    g_propagate_error (error, g_steal_pointer (&strict_error));
+    g_prefix_error (error, "Couldn't convert from %s to UTF-8: ", settings->gsm_name);
+    return NULL;
+}
+
+/*
+ * Convert the contents of @bytearray, encoded in @charset, into a UTF-8
+ * string.
+ *
+ * GSM-7 is handled by mm_charset_gsm_unpacked_to_utf8(); every other known
+ * charset goes through charset_iconv_to_utf8(). The result is additionally
+ * checked with g_utf8_validate() before being returned.
+ *
+ * Returns: the newly allocated UTF-8 string, or NULL with @error set if
+ * @charset is unknown, the conversion failed, or the converted output is
+ * not valid UTF-8.
+ */
+gchar *
+mm_modem_charset_bytearray_to_utf8 (GByteArray      *bytearray,
+                                    MMModemCharset   charset,
+                                    gboolean         translit,
+                                    GError         **error)
+{
+    const CharsetSettings *settings;
+    g_autofree gchar      *utf8 = NULL;
+
+    if (charset == MM_MODEM_CHARSET_UNKNOWN) {
+        g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+                     "Cannot convert to UTF-8: unknown source charset");
+        return NULL;
+    }
+
+    settings = lookup_charset_settings (charset);
+    switch (charset) {
+        case MM_MODEM_CHARSET_GSM:
+            utf8 = (gchar *) mm_charset_gsm_unpacked_to_utf8 (bytearray->data,
+                                                              bytearray->len,
+                                                              translit,
+                                                              error);
+            break;
+        case MM_MODEM_CHARSET_IRA:
+        case MM_MODEM_CHARSET_UTF8:
+        case MM_MODEM_CHARSET_8859_1:
+        case MM_MODEM_CHARSET_PCCP437:
+        case MM_MODEM_CHARSET_PCDN:
+        case MM_MODEM_CHARSET_UCS2:
+        case MM_MODEM_CHARSET_UTF16:
+            utf8 = charset_iconv_to_utf8 (bytearray->data,
+                                          bytearray->len,
+                                          charset,
+                                          translit,
+                                          error);
+            break;
+        case MM_MODEM_CHARSET_UNKNOWN:
+        default:
+            g_assert_not_reached ();
+    }
+
+    /* Conversion failed: the helper is expected to have set @error already,
+     * so just add context to it */
+    if (!utf8) {
+        g_prefix_error (error, "Invalid conversion from %s to UTF-8: ", settings->gsm_name);
+        return NULL;
+    }
+
+    /* Conversion produced output that is not valid UTF-8: no error has been
+     * set at this point and g_prefix_error() is a no-op on an unset error,
+     * so an explicit error is needed to avoid returning NULL silently. The
+     * invalid string is released automatically. */
+    if (!g_utf8_validate (utf8, -1, NULL)) {
+        g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_FAILED,
+                     "Invalid conversion from %s to UTF-8: output is not valid UTF-8",
+                     settings->gsm_name);
+        return NULL;
+    }
+
+    return g_steal_pointer (&utf8);
+}
+
+/*
+ * Convert the input string @str, encoded in @charset, into a UTF-8 string.
+ *
+ * For UCS2 and UTF-16 the input is taken as a hex string and decoded with
+ * mm_utils_hexstr2bin(); for the remaining charsets the input bytes are
+ * copied as-is. The resulting bytes are then converted by
+ * mm_modem_charset_bytearray_to_utf8().
+ *
+ * @len is the number of bytes in @str, or a negative value to have it
+ * computed with strlen().
+ *
+ * Returns: the newly allocated UTF-8 string, or NULL with @error set.
+ */
+gchar *
+mm_modem_charset_str_to_utf8 (const gchar     *str,
+                              gssize           len,
+                              MMModemCharset   charset,
+                              gboolean         translit,
+                              GError         **error)
+{
+    g_autoptr(GByteArray) bytearray = NULL;
+
+    if (charset == MM_MODEM_CHARSET_UNKNOWN) {
+        g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+                     "Cannot convert to UTF-8: unknown source charset");
+        return NULL;
+    }
+
+    /* Note: if the input string is GSM-7 encoded and it contains the '@'
+     * character, using -1 to indicate string length won't work properly,
+     * as '@' is encoded as 0x00. Whenever possible, if using GSM-7,
+     * give a proper len value or otherwise use the bytearray_to_utf8()
+     * method instead. */
+    if (len < 0)
+        len = strlen (str);
+
+    switch (charset) {
+        case MM_MODEM_CHARSET_GSM:
+        case MM_MODEM_CHARSET_IRA:
+        case MM_MODEM_CHARSET_8859_1:
+        case MM_MODEM_CHARSET_UTF8:
+        case MM_MODEM_CHARSET_PCCP437:
+        case MM_MODEM_CHARSET_PCDN:
+            /* Plain byte strings: copy the input bytes into the bytearray */
+            bytearray = g_byte_array_sized_new (len);
+            g_byte_array_append (bytearray, (const guint8 *)str, len);
+            break;
+        case MM_MODEM_CHARSET_UCS2:
+        case MM_MODEM_CHARSET_UTF16: {
+            guint8 *bin = NULL;
+            gsize   bin_len;
+
+            /* Hex-encoded input: decode it into raw bytes first */
+            bin = (guint8 *) mm_utils_hexstr2bin (str, len, &bin_len, error);
+            if (!bin)
+                return NULL;
+
+            /* The bytearray takes ownership of the decoded buffer */
+            bytearray = g_byte_array_new_take (bin, bin_len);
+            break;
+        }
+        case MM_MODEM_CHARSET_UNKNOWN:
+        default:
+            g_assert_not_reached ();
+    }
+
+    return mm_modem_charset_bytearray_to_utf8 (bytearray, charset, translit, error);
+}
diff --git a/src/mm-charsets.h b/src/mm-charsets.h
index b59eeeaa..4d032f38 100644
--- a/src/mm-charsets.h
+++ b/src/mm-charsets.h
@@ -18,6 +18,8 @@
#include <glib.h>
+/*****************************************************************************************/
+
typedef enum {
MM_MODEM_CHARSET_UNKNOWN = 0,
MM_MODEM_CHARSET_GSM = 1 << 0,
@@ -33,6 +35,8 @@ typedef enum {
const gchar *mm_modem_charset_to_string (MMModemCharset charset);
MMModemCharset mm_modem_charset_from_string (const gchar *string);
+/*****************************************************************************************/
+
/* Append the given string to the given byte array but re-encode it
* into the given charset first. The original string is assumed to be
* UTF-8 encoded.
@@ -81,4 +85,61 @@ gchar *mm_charset_take_and_convert_to_utf8 (gchar *str,
gchar *mm_utf8_take_and_convert_to_charset (gchar *str,
MMModemCharset charset);
+/*****************************************************************************************/
+
+/*
+ * Convert the given UTF-8 encoded string into the given charset.
+ *
+ * The output is given as a bytearray, because the target charset may allow
+ * embedded NUL bytes (e.g. UTF-16).
+ *
+ * The output encoded string is not guaranteed to be NUL-terminated, instead
+ * the bytearray length itself gives the correct string length.
+ */
+GByteArray *mm_modem_charset_bytearray_from_utf8 (const gchar *utf8,
+ MMModemCharset charset,
+ gboolean translit,
+ GError **error);
+
+/*
+ * Convert the given UTF-8 encoded string into the given charset.
+ *
+ * The output is given as a C string, and those charsets that allow
+ * embedded NUL bytes (e.g. UTF-16) will be hex-encoded.
+ *
+ * The output encoded string is guaranteed to be NUL-terminated, and so no
+ * explicit output length is returned.
+ */
+gchar *mm_modem_charset_str_from_utf8 (const gchar *utf8,
+ MMModemCharset charset,
+ gboolean translit,
+ GError **error);
+
+/*
+ * Convert the input byte array, which is encoded in the given charset,
+ * into a UTF-8 encoded string.
+ *
+ * The output string is guaranteed to be valid UTF-8 and NUL-terminated.
+ */
+gchar *mm_modem_charset_bytearray_to_utf8 (GByteArray *bytearray,
+ MMModemCharset charset,
+ gboolean translit,
+ GError **error);
+
+/*
+ * Convert the input string, which is encoded in the given charset, into a
+ * UTF-8 encoded string. Input in charsets that allow embedded NUL bytes
+ * (e.g. UTF-16) needs to be given hex-encoded.
+ *
+ * If the input string is NUL-terminated, len may be given as -1; otherwise
+ * len needs to specify the number of valid bytes in the input string.
+ *
+ * The output string is guaranteed to be valid UTF-8 and NUL-terminated.
+ */
+gchar *mm_modem_charset_str_to_utf8 (const gchar *str,
+ gssize len,
+ MMModemCharset charset,
+ gboolean translit,
+ GError **error);
+
#endif /* MM_CHARSETS_H */