diff options
Diffstat (limited to 'src/mm-sms-part-3gpp.c')
-rw-r--r-- | src/mm-sms-part-3gpp.c | 159 |
1 files changed, 82 insertions, 77 deletions
diff --git a/src/mm-sms-part-3gpp.c b/src/mm-sms-part-3gpp.c index 9e1862d1..c18aaa75 100644 --- a/src/mm-sms-part-3gpp.c +++ b/src/mm-sms-part-3gpp.c @@ -246,38 +246,25 @@ sms_decode_text (const guint8 *text, int bit_offset, gpointer log_object) { - char *utf8; - guint8 *unpacked; - guint32 unpacked_len; + gchar *utf8; if (encoding == MM_SMS_ENCODING_GSM7) { + g_autofree guint8 *unpacked = NULL; + guint32 unpacked_len; + mm_obj_dbg (log_object, "converting SMS part text from GSM-7 to UTF-8..."); unpacked = mm_charset_gsm_unpack ((const guint8 *) text, len, bit_offset, &unpacked_len); utf8 = (char *) mm_charset_gsm_unpacked_to_utf8 (unpacked, unpacked_len); mm_obj_dbg (log_object, " got UTF-8 text: '%s'", utf8); - g_free (unpacked); } else if (encoding == MM_SMS_ENCODING_UCS2) { - /* Despite 3GPP TS 23.038 specifies that Unicode SMS messages are - * encoded in UCS-2, UTF-16 encoding is commonly used instead on many - * modern platforms to allow encoding code points that fall outside the - * Basic Multilingual Plane (BMP), such as Emoji. Most of the UCS-2 - * code points are identical to their equivalent UTF-16 code points. - * In UTF-16, non-BMP code points are encoded in a pair of surrogate - * code points (i.e. a high surrogate in 0xD800..0xDBFF, followed by a - * low surrogate in 0xDC00..0xDFFF). An isolated surrogate code point - * has no general interpretation in UTF-16, but could be a valid - * (though unmapped) code point in UCS-2. Here we first try to decode - * the SMS message in UTF-16BE, and if that fails, fall back to decode - * in UCS-2BE. - */ + g_autoptr(GByteArray) bytearray = NULL; + mm_obj_dbg (log_object, "converting SMS part text from UTF-16BE to UTF-8..."); - utf8 = g_convert ((const gchar *) text, len, "UTF-8", "UTF-16BE", NULL, NULL, NULL); - if (!utf8) { - mm_obj_dbg (log_object, "converting SMS part text from UCS-2BE to UTF-8..."); - utf8 = g_convert ((const gchar *) text, len, "UTF-8", "UCS-2BE", NULL, NULL, NULL); - } + bytearray = g_byte_array_append (g_byte_array_sized_new (len), (const guint8 *)text, len); + /* Always assume UTF-16 instead of UCS-2! */ + utf8 = mm_modem_charset_byte_array_to_utf8 (bytearray, MM_MODEM_CHARSET_UTF16); if (!utf8) { - mm_obj_warn (log_object, "couldn't convert SMS part contents from UTF-16BE/UCS-2BE to UTF-8: not decoding any text"); + mm_obj_warn (log_object, "couldn't convert SMS part contents from UTF-16BE to UTF-8: not decoding any text"); utf8 = g_strdup (""); } else mm_obj_dbg (log_object, " got UTF-8 text: '%s'", utf8); @@ -625,9 +612,11 @@ mm_sms_part_3gpp_new_from_binary_pdu (guint index, mm_obj_dbg (log_object, " user data encoding is 8bit"); break; case MM_SMS_ENCODING_UNKNOWN: - default: mm_obj_dbg (log_object, " user data encoding is unknown"); break; + default: + g_assert_not_reached (); + } mm_sms_part_set_encoding (sms_part, user_data_encoding); @@ -829,6 +818,7 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part, guint len, offset = 0; guint shift = 0; guint8 *udl_ptr; + MMSmsEncoding encoding; g_return_val_if_fail (mm_sms_part_get_number (part) != NULL, NULL); g_return_val_if_fail (mm_sms_part_get_text (part) != NULL || mm_sms_part_get_data (part) != NULL, NULL); @@ -923,7 +913,9 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part, pdu[offset] |= mm_sms_part_get_class (part); } - switch (mm_sms_part_get_encoding (part)) { + encoding = mm_sms_part_get_encoding (part); + + switch (encoding) { case MM_SMS_ENCODING_UCS2: mm_obj_dbg (log_object, " using UCS2 encoding..."); pdu[offset] |= SMS_DCS_CODING_UCS2; @@ -976,7 +968,7 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part, shift = 1; } - if (mm_sms_part_get_encoding (part) == MM_SMS_ENCODING_GSM7) { + if (encoding == MM_SMS_ENCODING_GSM7) { guint8 *unpacked, *packed; guint32 unlen = 0, packlen = 0; @@ -1012,17 +1004,19 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part, memcpy (&pdu[offset], packed, packlen); g_free (packed); offset += packlen; - } else if (mm_sms_part_get_encoding (part) == MM_SMS_ENCODING_UCS2) { + } else if (encoding == MM_SMS_ENCODING_UCS2) { g_autoptr(GByteArray) array = NULL; g_autoptr(GError) inner_error = NULL; /* Try to guess a good value for the array */ array = g_byte_array_sized_new (strlen (mm_sms_part_get_text (part)) * 2); - if (!mm_modem_charset_byte_array_append (array, mm_sms_part_get_text (part), FALSE, MM_MODEM_CHARSET_UCS2, &inner_error)) { + /* Always assume UTF-16 instead of UCS-2! */ + if (!mm_modem_charset_byte_array_append (array, mm_sms_part_get_text (part), FALSE, MM_MODEM_CHARSET_UTF16, &inner_error)) { g_set_error (error, MM_MESSAGE_ERROR, MM_MESSAGE_ERROR_INVALID_PDU_PARAMETER, - "Failed to convert message text to UCS2: %s", inner_error->message); + "Failed to convert message text to UTF-16: %s", + inner_error->message); goto error; } @@ -1094,62 +1088,68 @@ util_split_text_gsm7 (const gchar *text, } static gchar ** -util_split_text_ucs2 (const gchar *text, - gsize text_len, - gpointer log_object) +util_split_text_utf16_or_ucs2 (const gchar *text, + gsize text_len, + gpointer log_object) { - g_autoptr(GByteArray) array = NULL; - g_autoptr(GError) error = NULL; - gchar **out; - guint n_chunks; - guint i; - guint j; - - /* Guess the size of the output array to avoid multiple allocations */ - array = g_byte_array_sized_new (text_len * 2); - if (!mm_modem_charset_byte_array_append (array, - text, - FALSE, - MM_MODEM_CHARSET_UCS2, - &error)) { - mm_obj_warn (log_object, "failed to append UCS2: %s", error->message); - return NULL; + g_autoptr(GPtrArray) chunks = NULL; + const gchar *walker; + const gchar *chunk_start; + glong encoded_chunk_length; + glong total_encoded_chunk_length; + + chunks = g_ptr_array_new_with_free_func ((GDestroyNotify)g_free); + + walker = text; + chunk_start = text; + encoded_chunk_length = 0; + total_encoded_chunk_length = 0; + while (walker && *walker) { + g_autofree gunichar2 *unichar2 = NULL; + glong unichar2_written = 0; + glong unichar2_written_bytes = 0; + gunichar single; + + single = g_utf8_get_char (walker); + unichar2 = g_ucs4_to_utf16 (&single, 1, NULL, &unichar2_written, NULL); + g_assert (unichar2_written > 0); + + /* When splitting for UCS-2 encoding, only one single unichar2 will be + * written, because all codepoints represented in UCS2 fit in the BMP. + * When splitting for UTF-16, though, we may end up writing one or two + * unichar2 (without or with surrogate pairs), because UTF-16 covers the + * whole Unicode spectrum. */ + unichar2_written_bytes = (unichar2_written * sizeof (gunichar2)); + if ((encoded_chunk_length + unichar2_written_bytes) > 134) { + g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start)); + chunk_start = walker; + encoded_chunk_length = unichar2_written_bytes; + } else + encoded_chunk_length += unichar2_written_bytes; + + total_encoded_chunk_length += unichar2_written_bytes; + walker = g_utf8_next_char (walker); } - /* Our bytearray has it in UCS-2 now. - * UCS-2 is a fixed-size encoding, which means that the text has exactly - * 2 bytes for each unicode point. We can now split this array into - * chunks of 67 UCS-2 characters (134 bytes). - * - * Note that UCS-2 covers unicode points between U+0000 and U+FFFF, which - * means that there is no direct relationship between the size of the - * input text in UTF-8 and the size of the text in UCS-2. A 3-byte UTF-8 - * encoded character will still be represented with 2 bytes in UCS-2. - */ + /* We have split the original string in chunks, where each chunk + * does not require more than 134 bytes when encoded in UTF-16. + * As a special case now, we consider the case that no splitting + * is necessary, i.e. if the total amount of bytes after encoding + * in UTF-16 is less or equal than 140. */ + if (total_encoded_chunk_length <= 140) { + gchar **out; - /* No splitting needed? */ - if (array->len <= 140) { out = g_new0 (gchar *, 2); out[0] = g_strdup (text); return out; } - /* Compute number of chunks needed */ - n_chunks = array->len / 134; - if (array->len % 134 != 0) - n_chunks++; + /* Otherwise, we do need the splitted chunks. Add the last one + * with contents plus the last trailing NULL */ + g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start)); + g_ptr_array_add (chunks, NULL); - /* Fill in all chunks */ - out = g_new0 (gchar *, n_chunks + 1); - for (i = 0, j = 0; i < n_chunks; i++, j += 134) { - out[i] = sms_decode_text (&array->data[j], - MIN (array->len - j, 134), - MM_SMS_ENCODING_UCS2, - 0, - log_object); - } - - return out; + return (gchar **) g_ptr_array_free (g_steal_pointer (&chunks), FALSE); } gchar ** @@ -1174,6 +1174,11 @@ mm_sms_part_3gpp_util_split_text (const gchar *text, * 134 * 8 = 1072; 1072/7=153.14 * 2) If we're using UCS2 encoding, we can pack up to 70 characters in * 140 bytes (each with 2 bytes), or up to 67 characters in 134 bytes. + * 3) If we're using UTF-16 encoding (instead of UCS2), the amount of + * characters we can pack is variable, depends on how the characters + * are encoded in UTF-16 (e.g. if there are characters out of the BMP + * we'll need surrogate pairs and a single character will need 4 bytes + * instead of 2). * * This method does the split of the input string into N strings, so that * each of the strings can be placed in a SMS part. @@ -1185,9 +1190,9 @@ mm_sms_part_3gpp_util_split_text (const gchar *text, return util_split_text_gsm7 (text, strlen (text), log_object); } - /* Otherwise, fallback to UCS2 encoding */ + /* Otherwise fallback to report UCS-2 and split supporting UTF-16 */ *encoding = MM_SMS_ENCODING_UCS2; - return util_split_text_ucs2 (text, strlen (text), log_object); + return util_split_text_utf16_or_ucs2 (text, strlen (text), log_object); } GByteArray ** |