sms-part-3gpp: allow sending UTF-16 as if it were UCS-2

Although 3GPP TS 23.038 specifies that Unicode SMS messages are encoded in UCS-2, UTF-16 encoding is commonly used instead on many modern platforms to allow encoding code points that fall outside the Basic Multilingual Plane (BMP), such as Emoji. Update the logic to always use UTF-16 instead of UCS-2 when creating or parsing PDUs (even though we still always report the encoding as UCS-2 when sending or receiving). For all purposes, UCS-2 is considered a subset of UTF-16 (assuming that code points outside the [U+0000,U+D7FF] and [U+E000,U+FFFF] ranges are not applicable in UCS-2). Fixes https://gitlab.freedesktop.org/mobile-broadband/ModemManager/-/issues/250
author: Aleksander Morgado <aleksander@aleksander.es> 2020-08-20 12:18:05 +0200
committer: Aleksander Morgado <aleksander@aleksander.es> 2020-08-20 18:15:37 +0200
commit: 599f545c0d905505516c6546ff77caced2aa14f1 (patch)
tree: a06b77c6dce87ec3aca97ea3c88037c6e7653d67 /src/mm-sms-part-3gpp.c
parent: 81162df15dc9a409d0979ff8d472a026f31ed883 (diff)
1 files changed, 82 insertions, 77 deletions
diff --git a/src/mm-sms-part-3gpp.c b/src/mm-sms-part-3gpp.c
index 9e1862d1..c18aaa75 100644
--- a/src/mm-sms-part-3gpp.c
+++ b/src/mm-sms-part-3gpp.c
@@ -246,38 +246,25 @@ sms_decode_text (const guint8 *text,
                  int           bit_offset,
                  gpointer      log_object)
 {
-    char *utf8;
-    guint8 *unpacked;
-    guint32 unpacked_len;
+    gchar *utf8;
 
     if (encoding == MM_SMS_ENCODING_GSM7) {
+        g_autofree guint8 *unpacked = NULL;
+        guint32            unpacked_len;
+
         mm_obj_dbg (log_object, "converting SMS part text from GSM-7 to UTF-8...");
         unpacked = mm_charset_gsm_unpack ((const guint8 *) text, len, bit_offset, &unpacked_len);
         utf8 = (char *) mm_charset_gsm_unpacked_to_utf8 (unpacked, unpacked_len);
         mm_obj_dbg (log_object, "   got UTF-8 text: '%s'", utf8);
-        g_free (unpacked);
     } else if (encoding == MM_SMS_ENCODING_UCS2) {
-        /* Despite 3GPP TS 23.038 specifies that Unicode SMS messages are
-         * encoded in UCS-2, UTF-16 encoding is commonly used instead on many
-         * modern platforms to allow encoding code points that fall outside the
-         * Basic Multilingual Plane (BMP), such as Emoji. Most of the UCS-2
-         * code points are identical to their equivalent UTF-16 code points.
-         * In UTF-16, non-BMP code points are encoded in a pair of surrogate
-         * code points (i.e. a high surrogate in 0xD800..0xDBFF, followed by a
-         * low surrogate in 0xDC00..0xDFFF). An isolated surrogate code point
-         * has no general interpretation in UTF-16, but could be a valid
-         * (though unmapped) code point in UCS-2. Here we first try to decode
-         * the SMS message in UTF-16BE, and if that fails, fall back to decode
-         * in UCS-2BE.
-         */
+        g_autoptr(GByteArray) bytearray = NULL;
+
         mm_obj_dbg (log_object, "converting SMS part text from UTF-16BE to UTF-8...");
-        utf8 = g_convert ((const gchar *) text, len, "UTF-8", "UTF-16BE", NULL, NULL, NULL);
-        if (!utf8) {
-            mm_obj_dbg (log_object, "converting SMS part text from UCS-2BE to UTF-8...");
-            utf8 = g_convert ((const gchar *) text, len, "UTF-8", "UCS-2BE", NULL, NULL, NULL);
-        }
+        bytearray = g_byte_array_append (g_byte_array_sized_new (len), (const guint8 *)text, len);
+        /* Always assume UTF-16 instead of UCS-2! */
+        utf8 = mm_modem_charset_byte_array_to_utf8 (bytearray, MM_MODEM_CHARSET_UTF16);
         if (!utf8) {
-            mm_obj_warn (log_object, "couldn't convert SMS part contents from UTF-16BE/UCS-2BE to UTF-8: not decoding any text");
+            mm_obj_warn (log_object, "couldn't convert SMS part contents from UTF-16BE to UTF-8: not decoding any text");
             utf8 = g_strdup ("");
         } else
             mm_obj_dbg (log_object, "   got UTF-8 text: '%s'", utf8);
@@ -625,9 +612,11 @@ mm_sms_part_3gpp_new_from_binary_pdu (guint         index,
             mm_obj_dbg (log_object, "  user data encoding is 8bit");
             break;
         case MM_SMS_ENCODING_UNKNOWN:
-        default:
             mm_obj_dbg (log_object, "  user data encoding is unknown");
             break;
+        default:
+            g_assert_not_reached ();
+
         }
         mm_sms_part_set_encoding (sms_part, user_data_encoding);
 
@@ -829,6 +818,7 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part,
     guint len, offset = 0;
     guint shift = 0;
     guint8 *udl_ptr;
+    MMSmsEncoding encoding;
 
     g_return_val_if_fail (mm_sms_part_get_number (part) != NULL, NULL);
     g_return_val_if_fail (mm_sms_part_get_text (part) != NULL || mm_sms_part_get_data (part) != NULL, NULL);
@@ -923,7 +913,9 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part,
         pdu[offset] |= mm_sms_part_get_class (part);
     }
 
-    switch (mm_sms_part_get_encoding (part)) {
+    encoding = mm_sms_part_get_encoding (part);
+
+    switch (encoding) {
     case MM_SMS_ENCODING_UCS2:
         mm_obj_dbg (log_object, "  using UCS2 encoding...");
         pdu[offset] |= SMS_DCS_CODING_UCS2;
@@ -976,7 +968,7 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part,
         shift = 1;
     }
 
-    if (mm_sms_part_get_encoding (part) == MM_SMS_ENCODING_GSM7) {
+    if (encoding == MM_SMS_ENCODING_GSM7) {
         guint8 *unpacked, *packed;
         guint32 unlen = 0, packlen = 0;
 
@@ -1012,17 +1004,19 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part,
         memcpy (&pdu[offset], packed, packlen);
         g_free (packed);
         offset += packlen;
-    } else if (mm_sms_part_get_encoding (part) == MM_SMS_ENCODING_UCS2) {
+    } else if (encoding == MM_SMS_ENCODING_UCS2) {
         g_autoptr(GByteArray) array = NULL;
         g_autoptr(GError)     inner_error = NULL;
 
         /* Try to guess a good value for the array */
         array = g_byte_array_sized_new (strlen (mm_sms_part_get_text (part)) * 2);
-        if (!mm_modem_charset_byte_array_append (array, mm_sms_part_get_text (part), FALSE, MM_MODEM_CHARSET_UCS2, &inner_error)) {
+        /* Always assume UTF-16 instead of UCS-2! */
+        if (!mm_modem_charset_byte_array_append (array, mm_sms_part_get_text (part), FALSE, MM_MODEM_CHARSET_UTF16, &inner_error)) {
             g_set_error (error,
                          MM_MESSAGE_ERROR,
                          MM_MESSAGE_ERROR_INVALID_PDU_PARAMETER,
-                         "Failed to convert message text to UCS2: %s", inner_error->message);
+                         "Failed to convert message text to UTF-16: %s",
+                         inner_error->message);
             goto error;
         }
 
@@ -1094,62 +1088,68 @@ util_split_text_gsm7 (const gchar *text,
 }
 
 static gchar **
-util_split_text_ucs2 (const gchar *text,
-                      gsize        text_len,
-                      gpointer     log_object)
+util_split_text_utf16_or_ucs2 (const gchar *text,
+                               gsize        text_len,
+                               gpointer     log_object)
 {
-    g_autoptr(GByteArray)   array = NULL;
-    g_autoptr(GError)       error = NULL;
-    gchar                 **out;
-    guint                   n_chunks;
-    guint                   i;
-    guint                   j;
-
-    /* Guess the size of the output array to avoid multiple allocations */
-    array = g_byte_array_sized_new (text_len * 2);
-    if (!mm_modem_charset_byte_array_append (array,
-                                             text,
-                                             FALSE,
-                                             MM_MODEM_CHARSET_UCS2,
-                                             &error)) {
-        mm_obj_warn (log_object, "failed to append UCS2: %s", error->message);
-        return NULL;
+    g_autoptr(GPtrArray)  chunks = NULL;
+    const gchar          *walker;
+    const gchar          *chunk_start;
+    glong                 encoded_chunk_length;
+    glong                 total_encoded_chunk_length;
+
+    chunks = g_ptr_array_new_with_free_func ((GDestroyNotify)g_free);
+
+    walker = text;
+    chunk_start = text;
+    encoded_chunk_length = 0;
+    total_encoded_chunk_length = 0;
+    while (walker && *walker) {
+        g_autofree gunichar2 *unichar2 = NULL;
+        glong                 unichar2_written = 0;
+        glong                 unichar2_written_bytes = 0;
+        gunichar              single;
+
+        single = g_utf8_get_char (walker);
+        unichar2 = g_ucs4_to_utf16 (&single, 1, NULL, &unichar2_written, NULL);
+        g_assert (unichar2_written > 0);
+
+        /* When splitting for UCS-2 encoding, only one single unichar2 will be
+         * written, because all codepoints represented in UCS2 fit in the BMP.
+         * When splitting for UTF-16, though, we may end up writing one or two
+         * unichar2 (without or with surrogate pairs), because UTF-16 covers the
+         * whole Unicode spectrum. */
+        unichar2_written_bytes = (unichar2_written * sizeof (gunichar2));
+        if ((encoded_chunk_length + unichar2_written_bytes) > 134) {
+            g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start));
+            chunk_start = walker;
+            encoded_chunk_length = unichar2_written_bytes;
+        } else
+            encoded_chunk_length += unichar2_written_bytes;
+
+        total_encoded_chunk_length += unichar2_written_bytes;
+        walker = g_utf8_next_char (walker);
     }
 
-    /* Our bytearray has it in UCS-2 now.
-     * UCS-2 is a fixed-size encoding, which means that the text has exactly
-     * 2 bytes for each unicode point. We can now split this array into
-     * chunks of 67 UCS-2 characters (134 bytes).
-     *
-     * Note that UCS-2 covers unicode points between U+0000 and U+FFFF, which
-     * means that there is no direct relationship between the size of the
-     * input text in UTF-8 and the size of the text in UCS-2. A 3-byte UTF-8
-     * encoded character will still be represented with 2 bytes in UCS-2.
-     */
+    /* We have split the original string in chunks, where each chunk
+     * does not require more than 134 bytes when encoded in UTF-16.
+     * As a special case, no splitting is necessary if the total
+     * number of bytes after encoding in UTF-16 is less than or
+     * equal to 140. */
+    if (total_encoded_chunk_length <= 140) {
+        gchar **out;
 
-    /* No splitting needed? */
-    if (array->len <= 140) {
         out = g_new0 (gchar *, 2);
         out[0] = g_strdup (text);
         return out;
     }
 
-    /* Compute number of chunks needed */
-    n_chunks = array->len / 134;
-    if (array->len % 134 != 0)
-        n_chunks++;
+    /* Otherwise, we do need the split chunks. Add the last one
+     * with contents, plus the trailing NULL terminator */
+    g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start));
+    g_ptr_array_add (chunks, NULL);
 
-    /* Fill in all chunks */
-    out = g_new0 (gchar *, n_chunks + 1);
-    for (i = 0, j = 0; i < n_chunks; i++, j += 134) {
-        out[i] = sms_decode_text (&array->data[j],
-                                  MIN (array->len - j, 134),
-                                  MM_SMS_ENCODING_UCS2,
-                                  0,
-                                  log_object);
-    }
-
-    return out;
+    return (gchar **) g_ptr_array_free (g_steal_pointer (&chunks), FALSE);
 }
 
 gchar **
@@ -1174,6 +1174,11 @@ mm_sms_part_3gpp_util_split_text (const gchar   *text,
      *      134 * 8 = 1072; 1072/7=153.14
      *  2) If we're using UCS2 encoding, we can pack up to 70 characters in
      *     140 bytes (each with 2 bytes), or up to 67 characters in 134 bytes.
+     *  3) If we're using UTF-16 encoding (instead of UCS2), the number of
+     *     characters we can pack is variable: it depends on how the characters
+     *     are encoded in UTF-16 (e.g. if there are characters outside the BMP
+     *     we'll need surrogate pairs, and a single character will need 4 bytes
+     *     instead of 2).
      *
      * This method does the split of the input string into N strings, so that
      * each of the strings can be placed in a SMS part.
@@ -1185,9 +1190,9 @@ mm_sms_part_3gpp_util_split_text (const gchar   *text,
         return util_split_text_gsm7 (text, strlen (text), log_object);
     }
 
-    /* Otherwise, fallback to UCS2 encoding */
+    /* Otherwise fallback to report UCS-2 and split supporting UTF-16 */
     *encoding = MM_SMS_ENCODING_UCS2;
-    return util_split_text_ucs2 (text, strlen (text), log_object);
+    return util_split_text_utf16_or_ucs2 (text, strlen (text), log_object);
 }
 
 GByteArray **
author	Aleksander Morgado <aleksander@aleksander.es>	2020-08-20 12:18:05 +0200
committer	Aleksander Morgado <aleksander@aleksander.es>	2020-08-20 18:15:37 +0200
commit	599f545c0d905505516c6546ff77caced2aa14f1 (patch)
tree	a06b77c6dce87ec3aca97ea3c88037c6e7653d67 /src/mm-sms-part-3gpp.c
parent	81162df15dc9a409d0979ff8d472a026f31ed883 (diff)