sms-part-3gpp: allow sending UTF-16 as if it were UCS-2

Despite 3GPP TS 23.038 specifies that Unicode SMS messages are encoded in UCS-2, UTF-16 encoding is commonly used instead on many modern platforms to allow encoding code points that fall outside the Basic Multilingual Plane (BMP), such as Emoji. Update the logic to always use UTF-16 instead of UCS-2 when creating or parsing PDUs (even if we always report as sending or receiving UCS-2). For all purposes, UCS-2 is considered a subset of UTF-16 (assuming that code points out of the [U+0000,U+D7FF] and [U+E000,U+FFFF] ranges are not applicable in UCS-2). Fixes https://gitlab.freedesktop.org/mobile-broadband/ModemManager/-/issues/250
author: Aleksander Morgado <aleksander@aleksander.es> 2020-08-20 12:18:05 +0200
committer: Aleksander Morgado <aleksander@aleksander.es> 2020-08-20 18:15:37 +0200
commit: 599f545c0d905505516c6546ff77caced2aa14f1 (patch)
tree: a06b77c6dce87ec3aca97ea3c88037c6e7653d67 /src
parent: 81162df15dc9a409d0979ff8d472a026f31ed883 (diff)
3 files changed, 165 insertions, 90 deletions
diff --git a/src/mm-sms-part-3gpp.c b/src/mm-sms-part-3gpp.c
index 9e1862d1..c18aaa75 100644
--- a/src/mm-sms-part-3gpp.c
+++ b/src/mm-sms-part-3gpp.c
@@ -246,38 +246,25 @@ sms_decode_text (const guint8 *text,
                  int           bit_offset,
                  gpointer      log_object)
 {
-    char *utf8;
-    guint8 *unpacked;
-    guint32 unpacked_len;
+    gchar *utf8;
 
     if (encoding == MM_SMS_ENCODING_GSM7) {
+        g_autofree guint8 *unpacked = NULL;
+        guint32            unpacked_len;
+
         mm_obj_dbg (log_object, "converting SMS part text from GSM-7 to UTF-8...");
         unpacked = mm_charset_gsm_unpack ((const guint8 *) text, len, bit_offset, &unpacked_len);
         utf8 = (char *) mm_charset_gsm_unpacked_to_utf8 (unpacked, unpacked_len);
         mm_obj_dbg (log_object, "   got UTF-8 text: '%s'", utf8);
-        g_free (unpacked);
     } else if (encoding == MM_SMS_ENCODING_UCS2) {
-        /* Despite 3GPP TS 23.038 specifies that Unicode SMS messages are
-         * encoded in UCS-2, UTF-16 encoding is commonly used instead on many
-         * modern platforms to allow encoding code points that fall outside the
-         * Basic Multilingual Plane (BMP), such as Emoji. Most of the UCS-2
-         * code points are identical to their equivalent UTF-16 code points.
-         * In UTF-16, non-BMP code points are encoded in a pair of surrogate
-         * code points (i.e. a high surrogate in 0xD800..0xDBFF, followed by a
-         * low surrogate in 0xDC00..0xDFFF). An isolated surrogate code point
-         * has no general interpretation in UTF-16, but could be a valid
-         * (though unmapped) code point in UCS-2. Here we first try to decode
-         * the SMS message in UTF-16BE, and if that fails, fall back to decode
-         * in UCS-2BE.
-         */
+        g_autoptr(GByteArray) bytearray = NULL;
+
         mm_obj_dbg (log_object, "converting SMS part text from UTF-16BE to UTF-8...");
-        utf8 = g_convert ((const gchar *) text, len, "UTF-8", "UTF-16BE", NULL, NULL, NULL);
-        if (!utf8) {
-            mm_obj_dbg (log_object, "converting SMS part text from UCS-2BE to UTF-8...");
-            utf8 = g_convert ((const gchar *) text, len, "UTF-8", "UCS-2BE", NULL, NULL, NULL);
-        }
+        bytearray = g_byte_array_append (g_byte_array_sized_new (len), (const guint8 *)text, len);
+        /* Always assume UTF-16 instead of UCS-2! */
+        utf8 = mm_modem_charset_byte_array_to_utf8 (bytearray, MM_MODEM_CHARSET_UTF16);
         if (!utf8) {
-            mm_obj_warn (log_object, "couldn't convert SMS part contents from UTF-16BE/UCS-2BE to UTF-8: not decoding any text");
+            mm_obj_warn (log_object, "couldn't convert SMS part contents from UTF-16BE to UTF-8: not decoding any text");
             utf8 = g_strdup ("");
         } else
             mm_obj_dbg (log_object, "   got UTF-8 text: '%s'", utf8);
@@ -625,9 +612,11 @@ mm_sms_part_3gpp_new_from_binary_pdu (guint         index,
             mm_obj_dbg (log_object, "  user data encoding is 8bit");
             break;
         case MM_SMS_ENCODING_UNKNOWN:
-        default:
             mm_obj_dbg (log_object, "  user data encoding is unknown");
             break;
+        default:
+            g_assert_not_reached ();
+
         }
         mm_sms_part_set_encoding (sms_part, user_data_encoding);
 
@@ -829,6 +818,7 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part,
     guint len, offset = 0;
     guint shift = 0;
     guint8 *udl_ptr;
+    MMSmsEncoding encoding;
 
     g_return_val_if_fail (mm_sms_part_get_number (part) != NULL, NULL);
     g_return_val_if_fail (mm_sms_part_get_text (part) != NULL || mm_sms_part_get_data (part) != NULL, NULL);
@@ -923,7 +913,9 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part,
         pdu[offset] |= mm_sms_part_get_class (part);
     }
 
-    switch (mm_sms_part_get_encoding (part)) {
+    encoding = mm_sms_part_get_encoding (part);
+
+    switch (encoding) {
     case MM_SMS_ENCODING_UCS2:
         mm_obj_dbg (log_object, "  using UCS2 encoding...");
         pdu[offset] |= SMS_DCS_CODING_UCS2;
@@ -976,7 +968,7 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part,
         shift = 1;
     }
 
-    if (mm_sms_part_get_encoding (part) == MM_SMS_ENCODING_GSM7) {
+    if (encoding == MM_SMS_ENCODING_GSM7) {
         guint8 *unpacked, *packed;
         guint32 unlen = 0, packlen = 0;
 
@@ -1012,17 +1004,19 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part,
         memcpy (&pdu[offset], packed, packlen);
         g_free (packed);
         offset += packlen;
-    } else if (mm_sms_part_get_encoding (part) == MM_SMS_ENCODING_UCS2) {
+    } else if (encoding == MM_SMS_ENCODING_UCS2) {
         g_autoptr(GByteArray) array = NULL;
         g_autoptr(GError)     inner_error = NULL;
 
         /* Try to guess a good value for the array */
         array = g_byte_array_sized_new (strlen (mm_sms_part_get_text (part)) * 2);
-        if (!mm_modem_charset_byte_array_append (array, mm_sms_part_get_text (part), FALSE, MM_MODEM_CHARSET_UCS2, &inner_error)) {
+        /* Always assume UTF-16 instead of UCS-2! */
+        if (!mm_modem_charset_byte_array_append (array, mm_sms_part_get_text (part), FALSE, MM_MODEM_CHARSET_UTF16, &inner_error)) {
             g_set_error (error,
                          MM_MESSAGE_ERROR,
                          MM_MESSAGE_ERROR_INVALID_PDU_PARAMETER,
-                         "Failed to convert message text to UCS2: %s", inner_error->message);
+                         "Failed to convert message text to UTF-16: %s",
+                         inner_error->message);
             goto error;
         }
 
@@ -1094,62 +1088,68 @@ util_split_text_gsm7 (const gchar *text,
 }
 
 static gchar **
-util_split_text_ucs2 (const gchar *text,
-                      gsize        text_len,
-                      gpointer     log_object)
+util_split_text_utf16_or_ucs2 (const gchar *text,
+                               gsize        text_len,
+                               gpointer     log_object)
 {
-    g_autoptr(GByteArray)   array = NULL;
-    g_autoptr(GError)       error = NULL;
-    gchar                 **out;
-    guint                   n_chunks;
-    guint                   i;
-    guint                   j;
-
-    /* Guess the size of the output array to avoid multiple allocations */
-    array = g_byte_array_sized_new (text_len * 2);
-    if (!mm_modem_charset_byte_array_append (array,
-                                             text,
-                                             FALSE,
-                                             MM_MODEM_CHARSET_UCS2,
-                                             &error)) {
-        mm_obj_warn (log_object, "failed to append UCS2: %s", error->message);
-        return NULL;
+    g_autoptr(GPtrArray)  chunks = NULL;
+    const gchar          *walker;
+    const gchar          *chunk_start;
+    glong                 encoded_chunk_length;
+    glong                 total_encoded_chunk_length;
+
+    chunks = g_ptr_array_new_with_free_func ((GDestroyNotify)g_free);
+
+    walker = text;
+    chunk_start = text;
+    encoded_chunk_length = 0;
+    total_encoded_chunk_length = 0;
+    while (walker && *walker) {
+        g_autofree gunichar2 *unichar2 = NULL;
+        glong                 unichar2_written = 0;
+        glong                 unichar2_written_bytes = 0;
+        gunichar              single;
+
+        single = g_utf8_get_char (walker);
+        unichar2 = g_ucs4_to_utf16 (&single, 1, NULL, &unichar2_written, NULL);
+        g_assert (unichar2_written > 0);
+
+        /* When splitting for UCS-2 encoding, only one single unichar2 will be
+         * written, because all codepoints represented in UCS2 fit in the BMP.
+         * When splitting for UTF-16, though, we may end up writing one or two
+         * unichar2 (without or with surrogate pairs), because UTF-16 covers the
+         * whole Unicode spectrum. */
+        unichar2_written_bytes = (unichar2_written * sizeof (gunichar2));
+        if ((encoded_chunk_length + unichar2_written_bytes) > 134) {
+            g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start));
+            chunk_start = walker;
+            encoded_chunk_length = unichar2_written_bytes;
+        } else
+            encoded_chunk_length += unichar2_written_bytes;
+
+        total_encoded_chunk_length += unichar2_written_bytes;
+        walker = g_utf8_next_char (walker);
     }
 
-    /* Our bytearray has it in UCS-2 now.
-     * UCS-2 is a fixed-size encoding, which means that the text has exactly
-     * 2 bytes for each unicode point. We can now split this array into
-     * chunks of 67 UCS-2 characters (134 bytes).
-     *
-     * Note that UCS-2 covers unicode points between U+0000 and U+FFFF, which
-     * means that there is no direct relationship between the size of the
-     * input text in UTF-8 and the size of the text in UCS-2. A 3-byte UTF-8
-     * encoded character will still be represented with 2 bytes in UCS-2.
-     */
+    /* We have split the original string in chunks, where each chunk
+     * does not require more than 134 bytes when encoded in UTF-16.
+     * As a special case now, we consider the case that no splitting
+     * is necessary, i.e. if the total amount of bytes after encoding
+     * in UTF-16 is less or equal than 140. */
+    if (total_encoded_chunk_length <= 140) {
+        gchar **out;
 
-    /* No splitting needed? */
-    if (array->len <= 140) {
         out = g_new0 (gchar *, 2);
         out[0] = g_strdup (text);
         return out;
     }
 
-    /* Compute number of chunks needed */
-    n_chunks = array->len / 134;
-    if (array->len % 134 != 0)
-        n_chunks++;
+    /* Otherwise, we do need the splitted chunks. Add the last one
+     * with contents plus the last trailing NULL */
+    g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start));
+    g_ptr_array_add (chunks, NULL);
 
-    /* Fill in all chunks */
-    out = g_new0 (gchar *, n_chunks + 1);
-    for (i = 0, j = 0; i < n_chunks; i++, j += 134) {
-        out[i] = sms_decode_text (&array->data[j],
-                                  MIN (array->len - j, 134),
-                                  MM_SMS_ENCODING_UCS2,
-                                  0,
-                                  log_object);
-    }
-
-    return out;
+    return (gchar **) g_ptr_array_free (g_steal_pointer (&chunks), FALSE);
 }
 
 gchar **
@@ -1174,6 +1174,11 @@ mm_sms_part_3gpp_util_split_text (const gchar   *text,
      *      134 * 8 = 1072; 1072/7=153.14
      *  2) If we're using UCS2 encoding, we can pack up to 70 characters in
      *     140 bytes (each with 2 bytes), or up to 67 characters in 134 bytes.
+     *  3) If we're using UTF-16 encoding (instead of UCS2), the amount of
+     *     characters we can pack is variable, depends on how the characters
+     *     are encoded in UTF-16 (e.g. if there are characters out of the BMP
+     *     we'll need surrogate pairs and a single character will need 4 bytes
+     *     instead of 2).
      *
      * This method does the split of the input string into N strings, so that
      * each of the strings can be placed in a SMS part.
@@ -1185,9 +1190,9 @@ mm_sms_part_3gpp_util_split_text (const gchar   *text,
         return util_split_text_gsm7 (text, strlen (text), log_object);
     }
 
-    /* Otherwise, fallback to UCS2 encoding */
+    /* Otherwise fallback to report UCS-2 and split supporting UTF-16 */
     *encoding = MM_SMS_ENCODING_UCS2;
-    return util_split_text_ucs2 (text, strlen (text), log_object);
+    return util_split_text_utf16_or_ucs2 (text, strlen (text), log_object);
 }
 
 GByteArray **
diff --git a/src/mm-sms-part.h b/src/mm-sms-part.h
index 92f39b11..2ee7f308 100644
--- a/src/mm-sms-part.h
+++ b/src/mm-sms-part.h
@@ -20,11 +20,27 @@
 #include <glib.h>
 #include <ModemManager.h>
 
+/* Despite 3GPP TS 23.038 specifies that Unicode SMS messages are
+ * encoded in UCS-2, UTF-16 encoding is commonly used instead on many
+ * modern platforms to allow encoding code points that fall outside the
+ * Basic Multilingual Plane (BMP), such as Emoji. Most of the UCS-2
+ * code points are identical to their equivalent UTF-16 code points.
+ * In UTF-16, non-BMP code points are encoded in a pair of surrogate
+ * code points (i.e. a high surrogate in 0xD800..0xDBFF, followed by a
+ * low surrogate in 0xDC00..0xDFFF). An isolated surrogate code point
+ * has no general interpretation in UTF-16, but could be a valid
+ * (though unmapped) code point in UCS-2.
+ *
+ * The current implementation in ModemManager just assumes that whenever
+ * possible (i.e. when parsing received PDUs or when creating submit
+ * PDUs) UTF-16 will be used instead of plain UCS-2 (even if the PDUs
+ * report the encoding as UCS-2).
+ */
 typedef enum { /*< underscore_name=mm_sms_encoding >*/
     MM_SMS_ENCODING_UNKNOWN = 0x0,
     MM_SMS_ENCODING_GSM7,
     MM_SMS_ENCODING_8BIT,
-    MM_SMS_ENCODING_UCS2
+    MM_SMS_ENCODING_UCS2,
 } MMSmsEncoding;
 
 typedef struct _MMSmsPart MMSmsPart;
diff --git a/src/tests/test-sms-part-3gpp.c b/src/tests/test-sms-part-3gpp.c
index c3d59d87..db6aa7a0 100644
--- a/src/tests/test-sms-part-3gpp.c
+++ b/src/tests/test-sms-part-3gpp.c
@@ -553,8 +553,7 @@ common_test_create_pdu (const gchar *smsc,
 
     g_assert_no_error (error);
     g_assert (pdu != NULL);
-    g_assert_cmpuint (len, ==, expected_size);
-    g_assert_cmpint (memcmp (pdu, expected, len), ==, 0);
+    g_assert_cmpmem (pdu, len, expected, expected_size);
     g_assert_cmpint (msgstart, ==, expected_msgstart);
 
     g_free (pdu);
@@ -735,7 +734,7 @@ common_test_text_split (const gchar *text,
 }
 
 static void
-test_text_split_short (void)
+test_text_split_short_gsm7 (void)
 {
     const gchar *text = "Hello";
     const gchar *expected [] = {
@@ -749,7 +748,7 @@ test_text_split_short (void)
 static void
 test_text_split_short_ucs2 (void)
 {
-    const gchar *text = "你好";
+    const gchar *text = "你好"; /* (UTF-8) e4 bd a0 e5 a5 bd */
     const gchar *expected [] = {
         "你好",
         NULL
@@ -759,7 +758,19 @@ test_text_split_short_ucs2 (void)
 }
 
 static void
-test_text_split_max_single_pdu (void)
+test_text_split_short_utf16 (void)
+{
+    const gchar *text = "😉"; /* U+1F609, winking face */
+    const gchar *expected [] = {
+        "😉",
+        NULL
+    };
+
+    common_test_text_split (text, expected, MM_SMS_ENCODING_UCS2);
+}
+
+static void
+test_text_split_max_single_pdu_gsm7 (void)
 {
     const gchar *text =
         "0123456789012345678901234567890123456789"
@@ -798,7 +809,23 @@ test_text_split_max_single_pdu_ucs2 (void)
 }
 
 static void
-test_text_split_two_pdu (void)
+test_text_split_max_single_pdu_utf16 (void)
+{
+    /* NOTE: this string contains 35 Bhaiksuki characters, each of
+     * them requiring 4 bytes both in UTF-8 and in UTF-16 (140 bytes
+     * in total). */
+    const gchar *text =
+        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡𑰢𑰣";
+    const gchar *expected [] = {
+        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡𑰢𑰣",
+        NULL
+    };
+
+    common_test_text_split (text, expected, MM_SMS_ENCODING_UCS2);
+}
+
+static void
+test_text_split_two_pdu_gsm7 (void)
 {
     const gchar *text =
         "0123456789012345678901234567890123456789"
@@ -839,6 +866,30 @@ test_text_split_two_pdu_ucs2 (void)
     common_test_text_split (text, expected, MM_SMS_ENCODING_UCS2);
 }
 
+static void
+test_text_split_two_pdu_utf16 (void)
+{
+    /* NOTE: this string contains 35 Bhaiksuki characters, each of
+     * them requiring 4 bytes both in UTF-8 and in UTF-16 (140 bytes
+     * in total) plus one ASCII char (encoded with 1 byte in UTF-8 and
+     * 2 bytes in UTF-16), making it a total of 142 bytes when in
+     * UTF-16 (so not fitting in one single PDU)
+     *
+     * When split in chunks, the last chunk will hold 2 Bhaiksuki
+     * characters plus the last ASCII one (9 bytes in UTF-16) so that
+     * the first chunk contains the leading 33 Bhaiksuki characters
+     * (132 characters, less than 134) */
+    const gchar *text =
+        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡𑰢𑰣a";
+    const gchar *expected [] = {
+        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡",
+        "𑰢𑰣a",
+        NULL
+    };
+
+    common_test_text_split (text, expected, MM_SMS_ENCODING_UCS2);
+}
+
 /************************************************************/
 
 int main (int argc, char **argv)
@@ -874,12 +925,15 @@ int main (int argc, char **argv)
     g_test_add_func ("/MM/SMS/3GPP/PDU-Creator/GSM-3", test_create_pdu_gsm_3);
     g_test_add_func ("/MM/SMS/3GPP/PDU-Creator/GSM-no-validity", test_create_pdu_gsm_no_validity);
 
-    g_test_add_func ("/MM/SMS/3GPP/Text-Split/short", test_text_split_short);
-    g_test_add_func ("/MM/SMS/3GPP/Text-Split/short-UCS2", test_text_split_short_ucs2);
-    g_test_add_func ("/MM/SMS/3GPP/Text-Split/max-single-pdu", test_text_split_max_single_pdu);
-    g_test_add_func ("/MM/SMS/3GPP/Text-Split/max-single-pdu-UCS2", test_text_split_max_single_pdu_ucs2);
-    g_test_add_func ("/MM/SMS/3GPP/Text-Split/two-pdu", test_text_split_two_pdu);
-    g_test_add_func ("/MM/SMS/3GPP/Text-Split/two-pdu-UCS2", test_text_split_two_pdu_ucs2);
+    g_test_add_func ("/MM/SMS/3GPP/Text-Split/gsm7/short",           test_text_split_short_gsm7);
+    g_test_add_func ("/MM/SMS/3GPP/Text-Split/ucs2/short",           test_text_split_short_ucs2);
+    g_test_add_func ("/MM/SMS/3GPP/Text-Split/utf16/short",          test_text_split_short_utf16);
+    g_test_add_func ("/MM/SMS/3GPP/Text-Split/gsm7/max-single-pdu",  test_text_split_max_single_pdu_gsm7);
+    g_test_add_func ("/MM/SMS/3GPP/Text-Split/ucs2/max-single-pdu",  test_text_split_max_single_pdu_ucs2);
+    g_test_add_func ("/MM/SMS/3GPP/Text-Split/utf16/max-single-pdu", test_text_split_max_single_pdu_utf16);
+    g_test_add_func ("/MM/SMS/3GPP/Text-Split/gsm7/two-pdu",         test_text_split_two_pdu_gsm7);
+    g_test_add_func ("/MM/SMS/3GPP/Text-Split/ucs2/two-pdu",         test_text_split_two_pdu_ucs2);
+    g_test_add_func ("/MM/SMS/3GPP/Text-Split/utf16/two-pdu",        test_text_split_two_pdu_utf16);
 
     return g_test_run ();
 }
author	Aleksander Morgado <aleksander@aleksander.es>	2020-08-20 12:18:05 +0200
committer	Aleksander Morgado <aleksander@aleksander.es>	2020-08-20 18:15:37 +0200
commit	599f545c0d905505516c6546ff77caced2aa14f1 (patch)
tree	a06b77c6dce87ec3aca97ea3c88037c6e7653d67 /src
parent	81162df15dc9a409d0979ff8d472a026f31ed883 (diff)