charsets: make translit optional in gsm_unpacked_to_utf8()

Until now, this method would automatically apply transliteration, i.e. replace characters with '?' when no direct translation was available. We can attempt that transliteration on strings that are not critical, e.g. the operator name reported by the network. But we should not do it on other types of strings, e.g. SMS contents, which may serve purposes other than just being human-readable. This commit requires the caller to request transliteration explicitly.
author: Aleksander Morgado <aleksander@aleksander.es> 2020-11-26 23:07:11 +0100
committer: Aleksander Morgado <aleksander@aleksander.es> 2021-02-23 11:35:11 +0000
commit: 5ce97abd73da12b64393be798f2c294d29be2705 (patch)
tree: 48010cd5880f1dc07fd24e2156ecd7fd9290622b /src
parent: 5480cb67b283c078770b02766c37768cb0930d7b (diff)
6 files changed, 95 insertions, 64 deletions
diff --git a/src/mm-base-sim.c b/src/mm-base-sim.c
index 0e00342c..f50af820 100644
--- a/src/mm-base-sim.c
+++ b/src/mm-base-sim.c
@@ -1427,7 +1427,7 @@ parse_spn (const gchar *response,
             buflen--;
 
         /* First byte is metadata; remainder is GSM-7 unpacked into octets; convert to UTF8 */
-        return (gchar *)mm_charset_gsm_unpacked_to_utf8 (bin + 1, buflen - 1);
+        return (gchar *)mm_charset_gsm_unpacked_to_utf8 (bin + 1, buflen - 1, FALSE, error);
     }
 
     g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_FAILED,
diff --git a/src/mm-broadband-modem-mbim.c b/src/mm-broadband-modem-mbim.c
index 7c592fe4..c073bb29 100644
--- a/src/mm-broadband-modem-mbim.c
+++ b/src/mm-broadband-modem-mbim.c
@@ -4820,11 +4820,9 @@ ussd_decode (guint32      scheme,
         guint32             unpacked_len;
 
         unpacked = mm_charset_gsm_unpack ((const guint8 *)data->data, (data->len * 8) / 7, 0, &unpacked_len);
-        decoded = (gchar *) mm_charset_gsm_unpacked_to_utf8 (unpacked, unpacked_len);
+        decoded = (gchar *) mm_charset_gsm_unpacked_to_utf8 (unpacked, unpacked_len, FALSE, error);
         if (!decoded)
-            g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_UNSUPPORTED,
-                         "Error decoding USSD command in 0x%04x scheme (GSM7 charset)",
-                         scheme);
+            g_prefix_error (error, "Error decoding USSD command in 0x%04x scheme (GSM7 charset): ", scheme);
     } else if (scheme == MM_MODEM_GSM_USSD_SCHEME_UCS2) {
         decoded = mm_modem_charset_byte_array_to_utf8 (data, MM_MODEM_CHARSET_UCS2);
         if (!decoded)
diff --git a/src/mm-charsets.c b/src/mm-charsets.c
index bc33b2ae..3a8ea719 100644
--- a/src/mm-charsets.c
+++ b/src/mm-charsets.c
@@ -360,11 +360,13 @@ utf8_to_gsm_ext_char (const gchar *utf8,
 }
 
 guint8 *
-mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm,
-                                 guint32       len)
+mm_charset_gsm_unpacked_to_utf8 (const guint8  *gsm,
+                                 guint32        len,
+                                 gboolean       translit,
+                                 GError       **error)
 {
-    guint       i;
-    GByteArray *utf8;
+    g_autoptr(GByteArray) utf8 = NULL;
+    guint                 i;
 
     g_return_val_if_fail (gsm != NULL, NULL);
     g_return_val_if_fail (len < 4096, NULL);
@@ -410,13 +412,18 @@ mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm,
 
         if (ulen)
             g_byte_array_append (utf8, &uchars[0], ulen);
-        else
+        else if (translit)
             g_byte_array_append (utf8, (guint8 *) "?", 1);
+        else {
+            g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+                         "Invalid conversion from GSM7");
+            return NULL;
+        }
     }
 
     /* Always make sure returned string is NUL terminated */
     g_byte_array_append (utf8, (guint8 *) "\0", 1);
-    return g_byte_array_free (utf8, FALSE);
+    return g_byte_array_free (g_steal_pointer (&utf8), FALSE);
 }
 
 guint8 *
@@ -740,7 +747,7 @@ mm_charset_take_and_convert_to_utf8 (gchar          *str,
         break;
 
     case MM_MODEM_CHARSET_GSM:
-        utf8 = (gchar *) mm_charset_gsm_unpacked_to_utf8 ((const guint8 *) str, strlen (str));
+        utf8 = (gchar *) mm_charset_gsm_unpacked_to_utf8 ((const guint8 *) str, strlen (str), FALSE, NULL);
         g_free (str);
         break;
 
diff --git a/src/mm-charsets.h b/src/mm-charsets.h
index a84b7ac5..dc8613a5 100644
--- a/src/mm-charsets.h
+++ b/src/mm-charsets.h
@@ -53,10 +53,12 @@ gchar *mm_modem_charset_byte_array_to_utf8 (GByteArray     *array,
 gchar *mm_modem_charset_hex_to_utf8 (const gchar    *src,
                                     MMModemCharset  charset);
 
-guint8 *mm_charset_utf8_to_unpacked_gsm (const gchar  *utf8,
-                                         guint32      *out_len);
-guint8 *mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm,
-                                         guint32       len);
+guint8 *mm_charset_utf8_to_unpacked_gsm (const gchar   *utf8,
+                                         guint32       *out_len);
+guint8 *mm_charset_gsm_unpacked_to_utf8 (const guint8  *gsm,
+                                         guint32        len,
+                                         gboolean       translit,
+                                         GError       **error);
 
 /* Checks whether conversion to the given charset may be done without errors */
 gboolean mm_charset_can_convert_to (const gchar    *utf8,
diff --git a/src/mm-sms-part-3gpp.c b/src/mm-sms-part-3gpp.c
index 83181d3b..e7735e1d 100644
--- a/src/mm-sms-part-3gpp.c
+++ b/src/mm-sms-part-3gpp.c
@@ -120,23 +120,24 @@ sms_string_to_bcd_semi_octets (guint8 *buf, gsize buflen, const char *string)
 }
 
 /* len is in semi-octets */
-static char *
-sms_decode_address (const guint8 *address, int len)
+static gchar *
+sms_decode_address (const guint8  *address,
+                    gint           len,
+                    GError       **error)
 {
     guint8 addrtype, addrplan;
-    char *utf8;
+    gchar *utf8;
 
     addrtype = address[0] & SMS_NUMBER_TYPE_MASK;
     addrplan = address[0] & SMS_NUMBER_PLAN_MASK;
     address++;
 
     if (addrtype == SMS_NUMBER_TYPE_ALPHA) {
-        guint8 *unpacked;
-        guint32 unpacked_len;
+        g_autofree guint8 *unpacked = NULL;
+        guint32            unpacked_len;
+
         unpacked = mm_charset_gsm_unpack (address, (len * 4) / 7, 0, &unpacked_len);
-        utf8 = (char *)mm_charset_gsm_unpacked_to_utf8 (unpacked,
-                                                        unpacked_len);
-        g_free (unpacked);
+        utf8 = (gchar *) mm_charset_gsm_unpacked_to_utf8 (unpacked, unpacked_len, FALSE, error);
     } else if (addrtype == SMS_NUMBER_TYPE_INTL &&
                addrplan == SMS_NUMBER_PLAN_TELEPHONE) {
         /* International telphone number, format as "+1234567890" */
@@ -239,41 +240,45 @@ sms_encoding_type (int dcs)
     return scheme;
 }
 
-static char *
-sms_decode_text (const guint8 *text,
-                 int           len,
-                 MMSmsEncoding encoding,
-                 int           bit_offset,
-                 gpointer      log_object)
+static gchar *
+sms_decode_text (const guint8   *text,
+                 int             len,
+                 MMSmsEncoding   encoding,
+                 int             bit_offset,
+                 gpointer        log_object,
+                 GError        **error)
 {
-    gchar *utf8;
-
     if (encoding == MM_SMS_ENCODING_GSM7) {
         g_autofree guint8 *unpacked = NULL;
         guint32            unpacked_len;
+        gchar             *utf8;
 
-        mm_obj_dbg (log_object, "converting SMS part text from GSM-7 to UTF-8...");
         unpacked = mm_charset_gsm_unpack ((const guint8 *) text, len, bit_offset, &unpacked_len);
-        utf8 = (char *) mm_charset_gsm_unpacked_to_utf8 (unpacked, unpacked_len);
-        mm_obj_dbg (log_object, "   got UTF-8 text: '%s'", utf8);
-    } else if (encoding == MM_SMS_ENCODING_UCS2) {
-        g_autoptr(GByteArray) bytearray = NULL;
+        utf8 = (gchar *) mm_charset_gsm_unpacked_to_utf8 (unpacked, unpacked_len, FALSE, error);
+        if (utf8)
+            mm_obj_dbg (log_object, "converted SMS part text from GSM-7 to UTF-8: %s", utf8);
+        return utf8;
+    }
+
+    if (encoding == MM_SMS_ENCODING_UCS2) {
+        g_autoptr(GByteArray)  bytearray = NULL;
+        gchar                 *utf8;
 
-        mm_obj_dbg (log_object, "converting SMS part text from UTF-16BE to UTF-8...");
         bytearray = g_byte_array_append (g_byte_array_sized_new (len), (const guint8 *)text, len);
         /* Always assume UTF-16 instead of UCS-2! */
         utf8 = mm_modem_charset_byte_array_to_utf8 (bytearray, MM_MODEM_CHARSET_UTF16);
-        if (!utf8) {
-            mm_obj_warn (log_object, "couldn't convert SMS part contents from UTF-16BE to UTF-8: not decoding any text");
-            utf8 = g_strdup ("");
-        } else
-            mm_obj_dbg (log_object, "   got UTF-8 text: '%s'", utf8);
-    } else {
-        mm_obj_warn (log_object, "unexpected encoding: %s; not decoding any text", mm_sms_encoding_get_string (encoding));
-        utf8 = g_strdup ("");
+        if (!utf8)
+            g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_FAILED,
+                         "Couldn't convert SMS part contents from UTF-16BE to UTF-8: not decoding any text");
+        else
+            mm_obj_dbg (log_object, "converted SMS part text from UTF-16BE to UTF-8: %s", utf8);
+        return utf8;
     }
 
-    return utf8;
+    g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_FAILED,
+                 "Couldn't convert SMS part contents from %s to UTF-8",
+                 mm_sms_encoding_get_string (encoding));
+    return NULL;
 }
 
 static guint
@@ -373,6 +378,7 @@ mm_sms_part_3gpp_new_from_binary_pdu (guint         index,
     guint tp_dcs_offset = 0;
     guint tp_user_data_len_offset = 0;
     MMSmsEncoding user_data_encoding = MM_SMS_ENCODING_UNKNOWN;
+    gchar *address;
 
     /* Create the new MMSmsPart */
     sms_part = mm_sms_part_new (index, MM_SMS_PDU_TYPE_UNKNOWN);
@@ -405,8 +411,13 @@ mm_sms_part_3gpp_new_from_binary_pdu (guint         index,
     if (smsc_addr_size_bytes > 0) {
         PDU_SIZE_CHECK (offset + smsc_addr_size_bytes, "cannot read SMSC address");
         /* SMSC may not be given in DELIVER PDUs */
-        mm_sms_part_take_smsc (sms_part,
-                               sms_decode_address (&pdu[1], 2 * (smsc_addr_size_bytes - 1)));
+        address = sms_decode_address (&pdu[1], 2 * (smsc_addr_size_bytes - 1), error);
+        if (!address) {
+            g_prefix_error (error, "Couldn't read SMSC address: ");
+            mm_sms_part_free (sms_part);
+            return NULL;
+        }
+        mm_sms_part_take_smsc (sms_part, g_steal_pointer (&address));
         mm_obj_dbg (log_object, "  SMSC address parsed: '%s'", mm_sms_part_get_smsc (sms_part));
         offset += smsc_addr_size_bytes;
     } else
@@ -478,9 +489,13 @@ mm_sms_part_3gpp_new_from_binary_pdu (guint         index,
     tp_addr_size_bytes = (tp_addr_size_digits + 1) >> 1;
 
     PDU_SIZE_CHECK (offset + tp_addr_size_bytes, "cannot read number");
-    mm_sms_part_take_number (sms_part,
-                             sms_decode_address (&pdu[offset],
-                                                 tp_addr_size_digits));
+    address = sms_decode_address (&pdu[offset], tp_addr_size_digits, error);
+    if (!address) {
+        g_prefix_error (error, "Couldn't read address: ");
+        mm_sms_part_free (sms_part);
+        return NULL;
+    }
+    mm_sms_part_take_number (sms_part, g_steal_pointer (&address));
     mm_obj_dbg (log_object, "  number parsed: %s", mm_sms_part_get_number (sms_part));
     offset += (1 + tp_addr_size_bytes); /* +1 due to the Type of Address byte */
 
@@ -709,17 +724,24 @@ mm_sms_part_3gpp_new_from_binary_pdu (guint         index,
         switch (user_data_encoding) {
         case MM_SMS_ENCODING_GSM7:
         case MM_SMS_ENCODING_UCS2:
-            /* Otherwise if it's 7-bit or UCS2 we can decode it */
-            mm_obj_dbg (log_object, "decoding SMS text with %u elements", tp_user_data_size_elements);
-            mm_sms_part_take_text (sms_part,
-                                   sms_decode_text (&pdu[tp_user_data_offset],
-                                                    tp_user_data_size_elements,
-                                                    user_data_encoding,
-                                                    bit_offset,
-                                                    log_object));
-            g_warn_if_fail (mm_sms_part_get_text (sms_part) != NULL);
-            break;
-
+            {
+                gchar *text;
+
+                /* Otherwise if it's 7-bit or UCS2 we can decode it */
+                mm_obj_dbg (log_object, "decoding SMS text with %u elements", tp_user_data_size_elements);
+                text = sms_decode_text (&pdu[tp_user_data_offset],
+                                        tp_user_data_size_elements,
+                                        user_data_encoding,
+                                        bit_offset,
+                                        log_object,
+                                        error);
+                if (!text) {
+                    mm_sms_part_free (sms_part);
+                    return NULL;
+                }
+                mm_sms_part_take_text (sms_part, text);
+                break;
+            }
         case MM_SMS_ENCODING_8BIT:
         case MM_SMS_ENCODING_UNKNOWN:
         default:
diff --git a/src/tests/test-charsets.c b/src/tests/test-charsets.c
index a15e0332..b18c11b1 100644
--- a/src/tests/test-charsets.c
+++ b/src/tests/test-charsets.c
@@ -30,6 +30,7 @@ common_test_gsm7 (const gchar *in_utf8)
     g_autofree guint8 *packed_gsm = NULL;
     g_autofree guint8 *unpacked_gsm_2 = NULL;
     g_autofree gchar *built_utf8 = NULL;
+    g_autoptr(GError) error = NULL;
 
     /* Convert to GSM */
     unpacked_gsm = mm_charset_utf8_to_unpacked_gsm (in_utf8, &unpacked_gsm_len);
@@ -58,8 +59,9 @@ common_test_gsm7 (const gchar *in_utf8)
     g_assert_nonnull (unpacked_gsm_2);
 
     /* And back to UTF-8 */
-    built_utf8 = (gchar *) mm_charset_gsm_unpacked_to_utf8 (unpacked_gsm_2, unpacked_gsm_len_2);
+    built_utf8 = (gchar *) mm_charset_gsm_unpacked_to_utf8 (unpacked_gsm_2, unpacked_gsm_len_2, FALSE, &error);
     g_assert_nonnull (built_utf8);
+    g_assert_no_error (error);
     g_assert_cmpstr (built_utf8, ==, in_utf8);
 }
author	Aleksander Morgado <aleksander@aleksander.es>	2020-11-26 23:07:11 +0100
committer	Aleksander Morgado <aleksander@aleksander.es>	2021-02-23 11:35:11 +0000
commit	5ce97abd73da12b64393be798f2c294d29be2705 (patch)
tree	48010cd5880f1dc07fd24e2156ecd7fd9290622b /src
parent	5480cb67b283c078770b02766c37768cb0930d7b (diff)