3 files changed, 165 insertions, 90 deletions
diff --git a/src/mm-sms-part-3gpp.c b/src/mm-sms-part-3gpp.c
index 9e1862d1..c18aaa75 100644
--- a/src/mm-sms-part-3gpp.c
+++ b/src/mm-sms-part-3gpp.c
@@ -246,38 +246,25 @@ sms_decode_text (const guint8 *text,
                  int           bit_offset,
                  gpointer      log_object)
 {
-    char *utf8;
-    guint8 *unpacked;
-    guint32 unpacked_len;
+    gchar *utf8;
 
     if (encoding == MM_SMS_ENCODING_GSM7) {
+        g_autofree guint8 *unpacked = NULL;
+        guint32            unpacked_len;
+
         mm_obj_dbg (log_object, "converting SMS part text from GSM-7 to UTF-8...");
         unpacked = mm_charset_gsm_unpack ((const guint8 *) text, len, bit_offset, &unpacked_len);
         utf8 = (char *) mm_charset_gsm_unpacked_to_utf8 (unpacked, unpacked_len);
         mm_obj_dbg (log_object, "   got UTF-8 text: '%s'", utf8);
-        g_free (unpacked);
     } else if (encoding == MM_SMS_ENCODING_UCS2) {
-        /* Despite 3GPP TS 23.038 specifies that Unicode SMS messages are
-         * encoded in UCS-2, UTF-16 encoding is commonly used instead on many
-         * modern platforms to allow encoding code points that fall outside the
-         * Basic Multilingual Plane (BMP), such as Emoji. Most of the UCS-2
-         * code points are identical to their equivalent UTF-16 code points.
-         * In UTF-16, non-BMP code points are encoded in a pair of surrogate
-         * code points (i.e. a high surrogate in 0xD800..0xDBFF, followed by a
-         * low surrogate in 0xDC00..0xDFFF). An isolated surrogate code point
-         * has no general interpretation in UTF-16, but could be a valid
-         * (though unmapped) code point in UCS-2. Here we first try to decode
-         * the SMS message in UTF-16BE, and if that fails, fall back to decode
-         * in UCS-2BE.
-         */
+        g_autoptr(GByteArray) bytearray = NULL;
+
         mm_obj_dbg (log_object, "converting SMS part text from UTF-16BE to UTF-8...");
-        utf8 = g_convert ((const gchar *) text, len, "UTF-8", "UTF-16BE", NULL, NULL, NULL);
-        if (!utf8) {
-            mm_obj_dbg (log_object, "converting SMS part text from UCS-2BE to UTF-8...");
-            utf8 = g_convert ((const gchar *) text, len, "UTF-8", "UCS-2BE", NULL, NULL, NULL);
-        }
+        bytearray = g_byte_array_append (g_byte_array_sized_new (len), (const guint8 *)text, len);
+        /* Always assume UTF-16 instead of UCS-2! */
+        utf8 = mm_modem_charset_byte_array_to_utf8 (bytearray, MM_MODEM_CHARSET_UTF16);
         if (!utf8) {
-            mm_obj_warn (log_object, "couldn't convert SMS part contents from UTF-16BE/UCS-2BE to UTF-8: not decoding any text");
+            mm_obj_warn (log_object, "couldn't convert SMS part contents from UTF-16BE to UTF-8: not decoding any text");
             utf8 = g_strdup ("");
         } else
             mm_obj_dbg (log_object, "   got UTF-8 text: '%s'", utf8);
@@ -625,9 +612,11 @@ mm_sms_part_3gpp_new_from_binary_pdu (guint         index,
             mm_obj_dbg (log_object, "  user data encoding is 8bit");
             break;
         case MM_SMS_ENCODING_UNKNOWN:
-        default:
             mm_obj_dbg (log_object, "  user data encoding is unknown");
             break;
+        default:
+            g_assert_not_reached ();
+
         }
         mm_sms_part_set_encoding (sms_part, user_data_encoding);
 
@@ -829,6 +818,7 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part,
     guint len, offset = 0;
     guint shift = 0;
     guint8 *udl_ptr;
+    MMSmsEncoding encoding;
 
     g_return_val_if_fail (mm_sms_part_get_number (part) != NULL, NULL);
     g_return_val_if_fail (mm_sms_part_get_text (part) != NULL || mm_sms_part_get_data (part) != NULL, NULL);
@@ -923,7 +913,9 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part,
         pdu[offset] |= mm_sms_part_get_class (part);
     }
 
-    switch (mm_sms_part_get_encoding (part)) {
+    encoding = mm_sms_part_get_encoding (part);
+
+    switch (encoding) {
     case MM_SMS_ENCODING_UCS2:
         mm_obj_dbg (log_object, "  using UCS2 encoding...");
         pdu[offset] |= SMS_DCS_CODING_UCS2;
@@ -976,7 +968,7 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part,
         shift = 1;
     }
 
-    if (mm_sms_part_get_encoding (part) == MM_SMS_ENCODING_GSM7) {
+    if (encoding == MM_SMS_ENCODING_GSM7) {
         guint8 *unpacked, *packed;
         guint32 unlen = 0, packlen = 0;
 
@@ -1012,17 +1004,19 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part,
         memcpy (&pdu[offset], packed, packlen);
         g_free (packed);
         offset += packlen;
-    } else if (mm_sms_part_get_encoding (part) == MM_SMS_ENCODING_UCS2) {
+    } else if (encoding == MM_SMS_ENCODING_UCS2) {
         g_autoptr(GByteArray) array = NULL;
         g_autoptr(GError)     inner_error = NULL;
 
         /* Try to guess a good value for the array */
         array = g_byte_array_sized_new (strlen (mm_sms_part_get_text (part)) * 2);
-        if (!mm_modem_charset_byte_array_append (array, mm_sms_part_get_text (part), FALSE, MM_MODEM_CHARSET_UCS2, &inner_error)) {
+        /* Always assume UTF-16 instead of UCS-2! */
+        if (!mm_modem_charset_byte_array_append (array, mm_sms_part_get_text (part), FALSE, MM_MODEM_CHARSET_UTF16, &inner_error)) {
             g_set_error (error,
                          MM_MESSAGE_ERROR,
                          MM_MESSAGE_ERROR_INVALID_PDU_PARAMETER,
-                         "Failed to convert message text to UCS2: %s", inner_error->message);
+                         "Failed to convert message text to UTF-16: %s",
+                         inner_error->message);
             goto error;
         }
 
@@ -1094,62 +1088,68 @@ util_split_text_gsm7 (const gchar *text,
 }
 
 static gchar **
-util_split_text_ucs2 (const gchar *text,
-                      gsize        text_len,
-                      gpointer     log_object)
+util_split_text_utf16_or_ucs2 (const gchar *text,
+                               gsize        text_len,
+                               gpointer     log_object)
 {
-    g_autoptr(GByteArray)   array = NULL;
-    g_autoptr(GError)       error = NULL;
-    gchar                 **out;
-    guint                   n_chunks;
-    guint                   i;
-    guint                   j;
-
-    /* Guess the size of the output array to avoid multiple allocations */
-    array = g_byte_array_sized_new (text_len * 2);
-    if (!mm_modem_charset_byte_array_append (array,
-                                             text,
-                                             FALSE,
-                                             MM_MODEM_CHARSET_UCS2,
-                                             &error)) {
-        mm_obj_warn (log_object, "failed to append UCS2: %s", error->message);
-        return NULL;
+    g_autoptr(GPtrArray)  chunks = NULL;
+    const gchar          *walker;
+    const gchar          *chunk_start;
+    glong                 encoded_chunk_length;
+    glong                 total_encoded_chunk_length;
+
+    chunks = g_ptr_array_new_with_free_func ((GDestroyNotify)g_free);
+
+    walker = text;
+    chunk_start = text;
+    encoded_chunk_length = 0;
+    total_encoded_chunk_length = 0;
+    while (walker && *walker) {
+        g_autofree gunichar2 *unichar2 = NULL;
+        glong                 unichar2_written = 0;
+        glong                 unichar2_written_bytes = 0;
+        gunichar              single;
+
+        single = g_utf8_get_char (walker);
+        unichar2 = g_ucs4_to_utf16 (&single, 1, NULL, &unichar2_written, NULL);
+        g_assert (unichar2_written > 0);
+
+        /* When splitting for UCS-2 encoding, only one single unichar2 will be
+         * written, because all codepoints represented in UCS2 fit in the BMP.
+         * When splitting for UTF-16, though, we may end up writing one or two
+         * unichar2 (without or with surrogate pairs), because UTF-16 covers the
+         * whole Unicode spectrum. */
+        unichar2_written_bytes = (unichar2_written * sizeof (gunichar2));
+        if ((encoded_chunk_length + unichar2_written_bytes) > 134) {
+            g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start));
+            chunk_start = walker;
+            encoded_chunk_length = unichar2_written_bytes;
+        } else
+            encoded_chunk_length += unichar2_written_bytes;
+
+        total_encoded_chunk_length += unichar2_written_bytes;
+        walker = g_utf8_next_char (walker);
     }
 
-    /* Our bytearray has it in UCS-2 now.
-     * UCS-2 is a fixed-size encoding, which means that the text has exactly
-     * 2 bytes for each unicode point. We can now split this array into
-     * chunks of 67 UCS-2 characters (134 bytes).
-     *
-     * Note that UCS-2 covers unicode points between U+0000 and U+FFFF, which
-     * means that there is no direct relationship between the size of the
-     * input text in UTF-8 and the size of the text in UCS-2. A 3-byte UTF-8
-     * encoded character will still be represented with 2 bytes in UCS-2.
-     */
+    /* We have split the original string into chunks, where each chunk
+     * does not require more than 134 bytes when encoded in UTF-16.
+     * As a special case now, we consider the case that no splitting
+     * is necessary, i.e. if the total number of bytes after encoding
+     * in UTF-16 is less than or equal to 140. */
+    if (total_encoded_chunk_length <= 140) {
+        gchar **out;
 
-    /* No splitting needed? */
-    if (array->len <= 140) {
         out = g_new0 (gchar *, 2);
         out[0] = g_strdup (text);
         return out;
     }
 
-    /* Compute number of chunks needed */
-    n_chunks = array->len / 134;
-    if (array->len % 134 != 0)
-        n_chunks++;
+    /* Otherwise, we do need the split chunks. Add the last one
+     * with contents, followed by the terminating NULL. */
+    g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start));
+    g_ptr_array_add (chunks, NULL);
 
-    /* Fill in all chunks */
-    out = g_new0 (gchar *, n_chunks + 1);
-    for (i = 0, j = 0; i < n_chunks; i++, j += 134) {
-        out[i] = sms_decode_text (&array->data[j],
-                                  MIN (array->len - j, 134),
-                                  MM_SMS_ENCODING_UCS2,
-                                  0,
-                                  log_object);
-    }
-
-    return out;
+    return (gchar **) g_ptr_array_free (g_steal_pointer (&chunks), FALSE);
 }
 
 gchar **
@@ -1174,6 +1174,11 @@ mm_sms_part_3gpp_util_split_text (const gchar   *text,
      *      134 * 8 = 1072; 1072/7=153.14
      *  2) If we're using UCS2 encoding, we can pack up to 70 characters in
      *     140 bytes (each with 2 bytes), or up to 67 characters in 134 bytes.
+     *  3) If we're using UTF-16 encoding (instead of UCS2), the number of
+     *     characters we can pack is variable: it depends on how the characters
+     *     are encoded in UTF-16 (e.g. if there are characters out of the BMP
+     *     we'll need surrogate pairs and a single character will need 4 bytes
+     *     instead of 2).
      *
      * This method does the split of the input string into N strings, so that
      * each of the strings can be placed in a SMS part.
@@ -1185,9 +1190,9 @@ mm_sms_part_3gpp_util_split_text (const gchar   *text,
         return util_split_text_gsm7 (text, strlen (text), log_object);
     }
 
-    /* Otherwise, fallback to UCS2 encoding */
+    /* Otherwise fallback to report UCS-2 and split supporting UTF-16 */
     *encoding = MM_SMS_ENCODING_UCS2;
-    return util_split_text_ucs2 (text, strlen (text), log_object);
+    return util_split_text_utf16_or_ucs2 (text, strlen (text), log_object);
 }
 
 GByteArray **
diff --git a/src/mm-sms-part.h b/src/mm-sms-part.h
index 92f39b11..2ee7f308 100644
--- a/src/mm-sms-part.h
+++ b/src/mm-sms-part.h
@@ -20,11 +20,27 @@
 #include <glib.h>
 #include <ModemManager.h>
 
+/* Although 3GPP TS 23.038 specifies that Unicode SMS messages are
+ * encoded in UCS-2, UTF-16 encoding is commonly used instead on many
+ * modern platforms to allow encoding code points that fall outside the
+ * Basic Multilingual Plane (BMP), such as Emoji. Most of the UCS-2
+ * code points are identical to their equivalent UTF-16 code points.
+ * In UTF-16, non-BMP code points are encoded in a pair of surrogate
+ * code points (i.e. a high surrogate in 0xD800..0xDBFF, followed by a
+ * low surrogate in 0xDC00..0xDFFF). An isolated surrogate code point
+ * has no general interpretation in UTF-16, but could be a valid
+ * (though unmapped) code point in UCS-2.
+ *
+ * The current implementation in ModemManager just assumes that whenever
+ * possible (i.e. when parsing received PDUs or when creating submit
+ * PDUs) UTF-16 will be used instead of plain UCS-2 (even if the PDUs
+ * report the encoding as UCS-2).
+ */
 typedef enum { /*< underscore_name=mm_sms_encoding >*/
     MM_SMS_ENCODING_UNKNOWN = 0x0,
     MM_SMS_ENCODING_GSM7,
     MM_SMS_ENCODING_8BIT,
-    MM_SMS_ENCODING_UCS2
+    MM_SMS_ENCODING_UCS2,
 } MMSmsEncoding;
 
 typedef struct _MMSmsPart MMSmsPart;
diff --git a/src/tests/test-sms-part-3gpp.c b/src/tests/test-sms-part-3gpp.c
index c3d59d87..db6aa7a0 100644
--- a/src/tests/test-sms-part-3gpp.c
+++ b/src/tests/test-sms-part-3gpp.c
@@ -553,8 +553,7 @@ common_test_create_pdu (const gchar *smsc,
 
     g_assert_no_error (error);
     g_assert (pdu != NULL);
-    g_assert_cmpuint (len, ==, expected_size);
-    g_assert_cmpint (memcmp (pdu, expected, len), ==, 0);
+    g_assert_cmpmem (pdu, len, expected, expected_size);
     g_assert_cmpint (msgstart, ==, expected_msgstart);
 
     g_free (pdu);
@@ -735,7 +734,7 @@ common_test_text_split (const gchar *text,
 }
 
 static void
-test_text_split_short (void)
+test_text_split_short_gsm7 (void)
 {
     const gchar *text = "Hello";
     const gchar *expected [] = {
@@ -749,7 +748,7 @@ test_text_split_short (void)
 static void
 test_text_split_short_ucs2 (void)
 {
-    const gchar *text = "你好";
+    const gchar *text = "你好"; /* (UTF-8) e4 bd a0 e5 a5 bd */
     const gchar *expected [] = {
         "你好",
         NULL
@@ -759,7 +758,19 @@ test_text_split_short_ucs2 (void)
 }
 
 static void
-test_text_split_max_single_pdu (void)
+test_text_split_short_utf16 (void)
+{
+    const gchar *text = "😉"; /* U+1F609, winking face */
+    const gchar *expected [] = {
+        "😉",
+        NULL
+    };
+
+    common_test_text_split (text, expected, MM_SMS_ENCODING_UCS2);
+}
+
+static void
+test_text_split_max_single_pdu_gsm7 (void)
 {
     const gchar *text =
         "0123456789012345678901234567890123456789"
@@ -798,7 +809,23 @@ test_text_split_max_single_pdu_ucs2 (void)
 }
 
 static void
-test_text_split_two_pdu (void)
+test_text_split_max_single_pdu_utf16 (void)
+{
+    /* NOTE: this string contains 35 Bhaiksuki characters, each of
+     * them requiring 4 bytes both in UTF-8 and in UTF-16 (140 bytes
+     * in total). */
+    const gchar *text =
+        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡𑰢𑰣";
+    const gchar *expected [] = {
+        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡𑰢𑰣",
+        NULL
+    };
+
+    common_test_text_split (text, expected, MM_SMS_ENCODING_UCS2);
+}
+
+static void
+test_text_split_two_pdu_gsm7 (void)
 {
     const gchar *text =
         "0123456789012345678901234567890123456789"
@@ -839,6 +866,30 @@ test_text_split_two_pdu_ucs2 (void)
     common_test_text_split (text, expected, MM_SMS_ENCODING_UCS2);
 }
 
+static void
+test_text_split_two_pdu_utf16 (void)
+{
+    /* NOTE: this string contains 35 Bhaiksuki characters, each of
+     * them requiring 4 bytes both in UTF-8 and in UTF-16 (140 bytes
+     * in total) plus one ASCII char (encoded with 1 byte in UTF-8 and
+     * 2 bytes in UTF-16), making it a total of 142 bytes when in
+     * UTF-16 (so not fitting in one single PDU)
+     *
+     * When split in chunks, the last chunk will hold 2 Bhaiksuki
+     * characters plus the last ASCII one (10 bytes in UTF-16) so that
+     * the first chunk contains the leading 33 Bhaiksuki characters
+     * (132 bytes, less than 134) */
+    const gchar *text =
+        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡𑰢𑰣a";
+    const gchar *expected [] = {
+        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡",
+        "𑰢𑰣a",
+        NULL
+    };
+
+    common_test_text_split (text, expected, MM_SMS_ENCODING_UCS2);
+}
+
 /************************************************************/
 
 int main (int argc, char **argv)
@@ -874,12 +925,15 @@ int main (int argc, char **argv)
     g_test_add_func ("/MM/SMS/3GPP/PDU-Creator/GSM-3", test_create_pdu_gsm_3);
     g_test_add_func ("/MM/SMS/3GPP/PDU-Creator/GSM-no-validity", test_create_pdu_gsm_no_validity);
 
-    g_test_add_func ("/MM/SMS/3GPP/Text-Split/short", test_text_split_short);
-    g_test_add_func ("/MM/SMS/3GPP/Text-Split/short-UCS2", test_text_split_short_ucs2);
-    g_test_add_func ("/MM/SMS/3GPP/Text-Split/max-single-pdu", test_text_split_max_single_pdu);
-    g_test_add_func ("/MM/SMS/3GPP/Text-Split/max-single-pdu-UCS2", test_text_split_max_single_pdu_ucs2);
-    g_test_add_func ("/MM/SMS/3GPP/Text-Split/two-pdu", test_text_split_two_pdu);
-    g_test_add_func ("/MM/SMS/3GPP/Text-Split/two-pdu-UCS2", test_text_split_two_pdu_ucs2);
+    g_test_add_func ("/MM/SMS/3GPP/Text-Split/gsm7/short",           test_text_split_short_gsm7);
+    g_test_add_func ("/MM/SMS/3GPP/Text-Split/ucs2/short",           test_text_split_short_ucs2);
+    g_test_add_func ("/MM/SMS/3GPP/Text-Split/utf16/short",          test_text_split_short_utf16);
+    g_test_add_func ("/MM/SMS/3GPP/Text-Split/gsm7/max-single-pdu",  test_text_split_max_single_pdu_gsm7);
+    g_test_add_func ("/MM/SMS/3GPP/Text-Split/ucs2/max-single-pdu",  test_text_split_max_single_pdu_ucs2);
+    g_test_add_func ("/MM/SMS/3GPP/Text-Split/utf16/max-single-pdu", test_text_split_max_single_pdu_utf16);
+    g_test_add_func ("/MM/SMS/3GPP/Text-Split/gsm7/two-pdu",         test_text_split_two_pdu_gsm7);
+    g_test_add_func ("/MM/SMS/3GPP/Text-Split/ucs2/two-pdu",         test_text_split_two_pdu_ucs2);
+    g_test_add_func ("/MM/SMS/3GPP/Text-Split/utf16/two-pdu",        test_text_split_two_pdu_utf16);
 
     return g_test_run ();
 }