aboutsummaryrefslogtreecommitdiff
path: root/src/mm-sms-part-3gpp.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/mm-sms-part-3gpp.c')
-rw-r--r--src/mm-sms-part-3gpp.c159
1 files changed, 82 insertions, 77 deletions
diff --git a/src/mm-sms-part-3gpp.c b/src/mm-sms-part-3gpp.c
index 9e1862d1..c18aaa75 100644
--- a/src/mm-sms-part-3gpp.c
+++ b/src/mm-sms-part-3gpp.c
@@ -246,38 +246,25 @@ sms_decode_text (const guint8 *text,
int bit_offset,
gpointer log_object)
{
- char *utf8;
- guint8 *unpacked;
- guint32 unpacked_len;
+ gchar *utf8;
if (encoding == MM_SMS_ENCODING_GSM7) {
+ g_autofree guint8 *unpacked = NULL;
+ guint32 unpacked_len;
+
mm_obj_dbg (log_object, "converting SMS part text from GSM-7 to UTF-8...");
unpacked = mm_charset_gsm_unpack ((const guint8 *) text, len, bit_offset, &unpacked_len);
utf8 = (char *) mm_charset_gsm_unpacked_to_utf8 (unpacked, unpacked_len);
mm_obj_dbg (log_object, " got UTF-8 text: '%s'", utf8);
- g_free (unpacked);
} else if (encoding == MM_SMS_ENCODING_UCS2) {
- /* Despite 3GPP TS 23.038 specifies that Unicode SMS messages are
- * encoded in UCS-2, UTF-16 encoding is commonly used instead on many
- * modern platforms to allow encoding code points that fall outside the
- * Basic Multilingual Plane (BMP), such as Emoji. Most of the UCS-2
- * code points are identical to their equivalent UTF-16 code points.
- * In UTF-16, non-BMP code points are encoded in a pair of surrogate
- * code points (i.e. a high surrogate in 0xD800..0xDBFF, followed by a
- * low surrogate in 0xDC00..0xDFFF). An isolated surrogate code point
- * has no general interpretation in UTF-16, but could be a valid
- * (though unmapped) code point in UCS-2. Here we first try to decode
- * the SMS message in UTF-16BE, and if that fails, fall back to decode
- * in UCS-2BE.
- */
+ g_autoptr(GByteArray) bytearray = NULL;
+
mm_obj_dbg (log_object, "converting SMS part text from UTF-16BE to UTF-8...");
- utf8 = g_convert ((const gchar *) text, len, "UTF-8", "UTF-16BE", NULL, NULL, NULL);
- if (!utf8) {
- mm_obj_dbg (log_object, "converting SMS part text from UCS-2BE to UTF-8...");
- utf8 = g_convert ((const gchar *) text, len, "UTF-8", "UCS-2BE", NULL, NULL, NULL);
- }
+ bytearray = g_byte_array_append (g_byte_array_sized_new (len), (const guint8 *)text, len);
+ /* Always assume UTF-16 instead of UCS-2! */
+ utf8 = mm_modem_charset_byte_array_to_utf8 (bytearray, MM_MODEM_CHARSET_UTF16);
if (!utf8) {
- mm_obj_warn (log_object, "couldn't convert SMS part contents from UTF-16BE/UCS-2BE to UTF-8: not decoding any text");
+ mm_obj_warn (log_object, "couldn't convert SMS part contents from UTF-16BE to UTF-8: not decoding any text");
utf8 = g_strdup ("");
} else
mm_obj_dbg (log_object, " got UTF-8 text: '%s'", utf8);
@@ -625,9 +612,11 @@ mm_sms_part_3gpp_new_from_binary_pdu (guint index,
mm_obj_dbg (log_object, " user data encoding is 8bit");
break;
case MM_SMS_ENCODING_UNKNOWN:
- default:
mm_obj_dbg (log_object, " user data encoding is unknown");
break;
+ default:
+ g_assert_not_reached ();
+
}
mm_sms_part_set_encoding (sms_part, user_data_encoding);
@@ -829,6 +818,7 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part,
guint len, offset = 0;
guint shift = 0;
guint8 *udl_ptr;
+ MMSmsEncoding encoding;
g_return_val_if_fail (mm_sms_part_get_number (part) != NULL, NULL);
g_return_val_if_fail (mm_sms_part_get_text (part) != NULL || mm_sms_part_get_data (part) != NULL, NULL);
@@ -923,7 +913,9 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part,
pdu[offset] |= mm_sms_part_get_class (part);
}
- switch (mm_sms_part_get_encoding (part)) {
+ encoding = mm_sms_part_get_encoding (part);
+
+ switch (encoding) {
case MM_SMS_ENCODING_UCS2:
mm_obj_dbg (log_object, " using UCS2 encoding...");
pdu[offset] |= SMS_DCS_CODING_UCS2;
@@ -976,7 +968,7 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part,
shift = 1;
}
- if (mm_sms_part_get_encoding (part) == MM_SMS_ENCODING_GSM7) {
+ if (encoding == MM_SMS_ENCODING_GSM7) {
guint8 *unpacked, *packed;
guint32 unlen = 0, packlen = 0;
@@ -1012,17 +1004,19 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part,
memcpy (&pdu[offset], packed, packlen);
g_free (packed);
offset += packlen;
- } else if (mm_sms_part_get_encoding (part) == MM_SMS_ENCODING_UCS2) {
+ } else if (encoding == MM_SMS_ENCODING_UCS2) {
g_autoptr(GByteArray) array = NULL;
g_autoptr(GError) inner_error = NULL;
/* Try to guess a good value for the array */
array = g_byte_array_sized_new (strlen (mm_sms_part_get_text (part)) * 2);
- if (!mm_modem_charset_byte_array_append (array, mm_sms_part_get_text (part), FALSE, MM_MODEM_CHARSET_UCS2, &inner_error)) {
+ /* Always assume UTF-16 instead of UCS-2! */
+ if (!mm_modem_charset_byte_array_append (array, mm_sms_part_get_text (part), FALSE, MM_MODEM_CHARSET_UTF16, &inner_error)) {
g_set_error (error,
MM_MESSAGE_ERROR,
MM_MESSAGE_ERROR_INVALID_PDU_PARAMETER,
- "Failed to convert message text to UCS2: %s", inner_error->message);
+ "Failed to convert message text to UTF-16: %s",
+ inner_error->message);
goto error;
}
@@ -1094,62 +1088,68 @@ util_split_text_gsm7 (const gchar *text,
}
static gchar **
-util_split_text_ucs2 (const gchar *text,
- gsize text_len,
- gpointer log_object)
+util_split_text_utf16_or_ucs2 (const gchar *text,
+ gsize text_len,
+ gpointer log_object)
{
- g_autoptr(GByteArray) array = NULL;
- g_autoptr(GError) error = NULL;
- gchar **out;
- guint n_chunks;
- guint i;
- guint j;
-
- /* Guess the size of the output array to avoid multiple allocations */
- array = g_byte_array_sized_new (text_len * 2);
- if (!mm_modem_charset_byte_array_append (array,
- text,
- FALSE,
- MM_MODEM_CHARSET_UCS2,
- &error)) {
- mm_obj_warn (log_object, "failed to append UCS2: %s", error->message);
- return NULL;
+ g_autoptr(GPtrArray) chunks = NULL;
+ const gchar *walker;
+ const gchar *chunk_start;
+ glong encoded_chunk_length;
+ glong total_encoded_chunk_length;
+
+ chunks = g_ptr_array_new_with_free_func ((GDestroyNotify)g_free);
+
+ walker = text;
+ chunk_start = text;
+ encoded_chunk_length = 0;
+ total_encoded_chunk_length = 0;
+ while (walker && *walker) {
+ g_autofree gunichar2 *unichar2 = NULL;
+ glong unichar2_written = 0;
+ glong unichar2_written_bytes = 0;
+ gunichar single;
+
+ single = g_utf8_get_char (walker);
+ unichar2 = g_ucs4_to_utf16 (&single, 1, NULL, &unichar2_written, NULL);
+ g_assert (unichar2_written > 0);
+
+ /* When splitting for UCS-2 encoding, only one single unichar2 will be
+ * written, because all codepoints represented in UCS2 fit in the BMP.
+ * When splitting for UTF-16, though, we may end up writing one or two
+ * unichar2 (without or with surrogate pairs), because UTF-16 covers the
+ * whole Unicode spectrum. */
+ unichar2_written_bytes = (unichar2_written * sizeof (gunichar2));
+ if ((encoded_chunk_length + unichar2_written_bytes) > 134) {
+ g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start));
+ chunk_start = walker;
+ encoded_chunk_length = unichar2_written_bytes;
+ } else
+ encoded_chunk_length += unichar2_written_bytes;
+
+ total_encoded_chunk_length += unichar2_written_bytes;
+ walker = g_utf8_next_char (walker);
}
- /* Our bytearray has it in UCS-2 now.
- * UCS-2 is a fixed-size encoding, which means that the text has exactly
- * 2 bytes for each unicode point. We can now split this array into
- * chunks of 67 UCS-2 characters (134 bytes).
- *
- * Note that UCS-2 covers unicode points between U+0000 and U+FFFF, which
- * means that there is no direct relationship between the size of the
- * input text in UTF-8 and the size of the text in UCS-2. A 3-byte UTF-8
- * encoded character will still be represented with 2 bytes in UCS-2.
- */
+ /* We have split the original string in chunks, where each chunk
+ * does not require more than 134 bytes when encoded in UTF-16.
+ * As a special case now, we consider the case that no splitting
+ * is necessary, i.e. if the total amount of bytes after encoding
+ * in UTF-16 is less than or equal to 140. */
+ if (total_encoded_chunk_length <= 140) {
+ gchar **out;
- /* No splitting needed? */
- if (array->len <= 140) {
out = g_new0 (gchar *, 2);
out[0] = g_strdup (text);
return out;
}
- /* Compute number of chunks needed */
- n_chunks = array->len / 134;
- if (array->len % 134 != 0)
- n_chunks++;
+ /* Otherwise, we do need the split chunks. Add the last one
+ * with contents, plus the trailing NULL terminator */
+ g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start));
+ g_ptr_array_add (chunks, NULL);
- /* Fill in all chunks */
- out = g_new0 (gchar *, n_chunks + 1);
- for (i = 0, j = 0; i < n_chunks; i++, j += 134) {
- out[i] = sms_decode_text (&array->data[j],
- MIN (array->len - j, 134),
- MM_SMS_ENCODING_UCS2,
- 0,
- log_object);
- }
-
- return out;
+ return (gchar **) g_ptr_array_free (g_steal_pointer (&chunks), FALSE);
}
gchar **
@@ -1174,6 +1174,11 @@ mm_sms_part_3gpp_util_split_text (const gchar *text,
* 134 * 8 = 1072; 1072/7=153.14
* 2) If we're using UCS2 encoding, we can pack up to 70 characters in
* 140 bytes (each with 2 bytes), or up to 67 characters in 134 bytes.
+ * 3) If we're using UTF-16 encoding (instead of UCS2), the number of
+ * characters we can pack is variable: it depends on how the characters
+ * are encoded in UTF-16 (e.g. if there are characters outside the BMP
+ * we'll need surrogate pairs, and a single character will need 4 bytes
+ * instead of 2).
*
* This method does the split of the input string into N strings, so that
* each of the strings can be placed in a SMS part.
@@ -1185,9 +1190,9 @@ mm_sms_part_3gpp_util_split_text (const gchar *text,
return util_split_text_gsm7 (text, strlen (text), log_object);
}
- /* Otherwise, fallback to UCS2 encoding */
+ /* Otherwise, fall back to reporting UCS-2, and split with UTF-16 support */
*encoding = MM_SMS_ENCODING_UCS2;
- return util_split_text_ucs2 (text, strlen (text), log_object);
+ return util_split_text_utf16_or_ucs2 (text, strlen (text), log_object);
}
GByteArray **