aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/mm-sms-part-3gpp.c159
-rw-r--r--src/mm-sms-part.h18
-rw-r--r--src/tests/test-sms-part-3gpp.c78
3 files changed, 165 insertions, 90 deletions
diff --git a/src/mm-sms-part-3gpp.c b/src/mm-sms-part-3gpp.c
index 9e1862d1..c18aaa75 100644
--- a/src/mm-sms-part-3gpp.c
+++ b/src/mm-sms-part-3gpp.c
@@ -246,38 +246,25 @@ sms_decode_text (const guint8 *text,
int bit_offset,
gpointer log_object)
{
- char *utf8;
- guint8 *unpacked;
- guint32 unpacked_len;
+ gchar *utf8;
if (encoding == MM_SMS_ENCODING_GSM7) {
+ g_autofree guint8 *unpacked = NULL;
+ guint32 unpacked_len;
+
mm_obj_dbg (log_object, "converting SMS part text from GSM-7 to UTF-8...");
unpacked = mm_charset_gsm_unpack ((const guint8 *) text, len, bit_offset, &unpacked_len);
utf8 = (char *) mm_charset_gsm_unpacked_to_utf8 (unpacked, unpacked_len);
mm_obj_dbg (log_object, " got UTF-8 text: '%s'", utf8);
- g_free (unpacked);
} else if (encoding == MM_SMS_ENCODING_UCS2) {
- /* Despite 3GPP TS 23.038 specifies that Unicode SMS messages are
- * encoded in UCS-2, UTF-16 encoding is commonly used instead on many
- * modern platforms to allow encoding code points that fall outside the
- * Basic Multilingual Plane (BMP), such as Emoji. Most of the UCS-2
- * code points are identical to their equivalent UTF-16 code points.
- * In UTF-16, non-BMP code points are encoded in a pair of surrogate
- * code points (i.e. a high surrogate in 0xD800..0xDBFF, followed by a
- * low surrogate in 0xDC00..0xDFFF). An isolated surrogate code point
- * has no general interpretation in UTF-16, but could be a valid
- * (though unmapped) code point in UCS-2. Here we first try to decode
- * the SMS message in UTF-16BE, and if that fails, fall back to decode
- * in UCS-2BE.
- */
+ g_autoptr(GByteArray) bytearray = NULL;
+
mm_obj_dbg (log_object, "converting SMS part text from UTF-16BE to UTF-8...");
- utf8 = g_convert ((const gchar *) text, len, "UTF-8", "UTF-16BE", NULL, NULL, NULL);
- if (!utf8) {
- mm_obj_dbg (log_object, "converting SMS part text from UCS-2BE to UTF-8...");
- utf8 = g_convert ((const gchar *) text, len, "UTF-8", "UCS-2BE", NULL, NULL, NULL);
- }
+ bytearray = g_byte_array_append (g_byte_array_sized_new (len), (const guint8 *)text, len);
+ /* Always assume UTF-16 instead of UCS-2! */
+ utf8 = mm_modem_charset_byte_array_to_utf8 (bytearray, MM_MODEM_CHARSET_UTF16);
if (!utf8) {
- mm_obj_warn (log_object, "couldn't convert SMS part contents from UTF-16BE/UCS-2BE to UTF-8: not decoding any text");
+ mm_obj_warn (log_object, "couldn't convert SMS part contents from UTF-16BE to UTF-8: not decoding any text");
utf8 = g_strdup ("");
} else
mm_obj_dbg (log_object, " got UTF-8 text: '%s'", utf8);
@@ -625,9 +612,11 @@ mm_sms_part_3gpp_new_from_binary_pdu (guint index,
mm_obj_dbg (log_object, " user data encoding is 8bit");
break;
case MM_SMS_ENCODING_UNKNOWN:
- default:
mm_obj_dbg (log_object, " user data encoding is unknown");
break;
+ default:
+ g_assert_not_reached ();
+
}
mm_sms_part_set_encoding (sms_part, user_data_encoding);
@@ -829,6 +818,7 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part,
guint len, offset = 0;
guint shift = 0;
guint8 *udl_ptr;
+ MMSmsEncoding encoding;
g_return_val_if_fail (mm_sms_part_get_number (part) != NULL, NULL);
g_return_val_if_fail (mm_sms_part_get_text (part) != NULL || mm_sms_part_get_data (part) != NULL, NULL);
@@ -923,7 +913,9 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part,
pdu[offset] |= mm_sms_part_get_class (part);
}
- switch (mm_sms_part_get_encoding (part)) {
+ encoding = mm_sms_part_get_encoding (part);
+
+ switch (encoding) {
case MM_SMS_ENCODING_UCS2:
mm_obj_dbg (log_object, " using UCS2 encoding...");
pdu[offset] |= SMS_DCS_CODING_UCS2;
@@ -976,7 +968,7 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part,
shift = 1;
}
- if (mm_sms_part_get_encoding (part) == MM_SMS_ENCODING_GSM7) {
+ if (encoding == MM_SMS_ENCODING_GSM7) {
guint8 *unpacked, *packed;
guint32 unlen = 0, packlen = 0;
@@ -1012,17 +1004,19 @@ mm_sms_part_3gpp_get_submit_pdu (MMSmsPart *part,
memcpy (&pdu[offset], packed, packlen);
g_free (packed);
offset += packlen;
- } else if (mm_sms_part_get_encoding (part) == MM_SMS_ENCODING_UCS2) {
+ } else if (encoding == MM_SMS_ENCODING_UCS2) {
g_autoptr(GByteArray) array = NULL;
g_autoptr(GError) inner_error = NULL;
/* Try to guess a good value for the array */
array = g_byte_array_sized_new (strlen (mm_sms_part_get_text (part)) * 2);
- if (!mm_modem_charset_byte_array_append (array, mm_sms_part_get_text (part), FALSE, MM_MODEM_CHARSET_UCS2, &inner_error)) {
+ /* Always assume UTF-16 instead of UCS-2! */
+ if (!mm_modem_charset_byte_array_append (array, mm_sms_part_get_text (part), FALSE, MM_MODEM_CHARSET_UTF16, &inner_error)) {
g_set_error (error,
MM_MESSAGE_ERROR,
MM_MESSAGE_ERROR_INVALID_PDU_PARAMETER,
- "Failed to convert message text to UCS2: %s", inner_error->message);
+ "Failed to convert message text to UTF-16: %s",
+ inner_error->message);
goto error;
}
@@ -1094,62 +1088,68 @@ util_split_text_gsm7 (const gchar *text,
}
static gchar **
-util_split_text_ucs2 (const gchar *text,
- gsize text_len,
- gpointer log_object)
+util_split_text_utf16_or_ucs2 (const gchar *text,
+ gsize text_len,
+ gpointer log_object)
{
- g_autoptr(GByteArray) array = NULL;
- g_autoptr(GError) error = NULL;
- gchar **out;
- guint n_chunks;
- guint i;
- guint j;
-
- /* Guess the size of the output array to avoid multiple allocations */
- array = g_byte_array_sized_new (text_len * 2);
- if (!mm_modem_charset_byte_array_append (array,
- text,
- FALSE,
- MM_MODEM_CHARSET_UCS2,
- &error)) {
- mm_obj_warn (log_object, "failed to append UCS2: %s", error->message);
- return NULL;
+ g_autoptr(GPtrArray) chunks = NULL;
+ const gchar *walker;
+ const gchar *chunk_start;
+ glong encoded_chunk_length;
+ glong total_encoded_chunk_length;
+
+ chunks = g_ptr_array_new_with_free_func ((GDestroyNotify)g_free);
+
+ walker = text;
+ chunk_start = text;
+ encoded_chunk_length = 0;
+ total_encoded_chunk_length = 0;
+ while (walker && *walker) {
+ g_autofree gunichar2 *unichar2 = NULL;
+ glong unichar2_written = 0;
+ glong unichar2_written_bytes = 0;
+ gunichar single;
+
+ single = g_utf8_get_char (walker);
+ unichar2 = g_ucs4_to_utf16 (&single, 1, NULL, &unichar2_written, NULL);
+ g_assert (unichar2_written > 0);
+
+ /* When splitting for UCS-2 encoding, only one single unichar2 will be
+ * written, because all codepoints represented in UCS2 fit in the BMP.
+ * When splitting for UTF-16, though, we may end up writing one or two
+ * unichar2 (without or with surrogate pairs), because UTF-16 covers the
+ * whole Unicode spectrum. */
+ unichar2_written_bytes = (unichar2_written * sizeof (gunichar2));
+ if ((encoded_chunk_length + unichar2_written_bytes) > 134) {
+ g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start));
+ chunk_start = walker;
+ encoded_chunk_length = unichar2_written_bytes;
+ } else
+ encoded_chunk_length += unichar2_written_bytes;
+
+ total_encoded_chunk_length += unichar2_written_bytes;
+ walker = g_utf8_next_char (walker);
}
- /* Our bytearray has it in UCS-2 now.
- * UCS-2 is a fixed-size encoding, which means that the text has exactly
- * 2 bytes for each unicode point. We can now split this array into
- * chunks of 67 UCS-2 characters (134 bytes).
- *
- * Note that UCS-2 covers unicode points between U+0000 and U+FFFF, which
- * means that there is no direct relationship between the size of the
- * input text in UTF-8 and the size of the text in UCS-2. A 3-byte UTF-8
- * encoded character will still be represented with 2 bytes in UCS-2.
- */
+ /* We have split the original string in chunks, where each chunk
+ * does not require more than 134 bytes when encoded in UTF-16.
+ * As a special case now, we consider the case that no splitting
+ * is necessary, i.e. if the total amount of bytes after encoding
+ * in UTF-16 is less than or equal to 140. */
+ if (total_encoded_chunk_length <= 140) {
+ gchar **out;
- /* No splitting needed? */
- if (array->len <= 140) {
out = g_new0 (gchar *, 2);
out[0] = g_strdup (text);
return out;
}
- /* Compute number of chunks needed */
- n_chunks = array->len / 134;
- if (array->len % 134 != 0)
- n_chunks++;
+ /* Otherwise, we do need the split chunks. Add the last one
+ * with contents plus the last trailing NULL */
+ g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start));
+ g_ptr_array_add (chunks, NULL);
- /* Fill in all chunks */
- out = g_new0 (gchar *, n_chunks + 1);
- for (i = 0, j = 0; i < n_chunks; i++, j += 134) {
- out[i] = sms_decode_text (&array->data[j],
- MIN (array->len - j, 134),
- MM_SMS_ENCODING_UCS2,
- 0,
- log_object);
- }
-
- return out;
+ return (gchar **) g_ptr_array_free (g_steal_pointer (&chunks), FALSE);
}
gchar **
@@ -1174,6 +1174,11 @@ mm_sms_part_3gpp_util_split_text (const gchar *text,
* 134 * 8 = 1072; 1072/7=153.14
* 2) If we're using UCS2 encoding, we can pack up to 70 characters in
* 140 bytes (each with 2 bytes), or up to 67 characters in 134 bytes.
+ * 3) If we're using UTF-16 encoding (instead of UCS2), the number of
+ * characters we can pack is variable and depends on how the characters
+ * are encoded in UTF-16 (e.g. if there are characters out of the BMP
+ * we'll need surrogate pairs and a single character will need 4 bytes
+ * instead of 2).
*
* This method does the split of the input string into N strings, so that
* each of the strings can be placed in a SMS part.
@@ -1185,9 +1190,9 @@ mm_sms_part_3gpp_util_split_text (const gchar *text,
return util_split_text_gsm7 (text, strlen (text), log_object);
}
- /* Otherwise, fallback to UCS2 encoding */
+ /* Otherwise fall back to reporting UCS-2, splitting with UTF-16 support */
*encoding = MM_SMS_ENCODING_UCS2;
- return util_split_text_ucs2 (text, strlen (text), log_object);
+ return util_split_text_utf16_or_ucs2 (text, strlen (text), log_object);
}
GByteArray **
diff --git a/src/mm-sms-part.h b/src/mm-sms-part.h
index 92f39b11..2ee7f308 100644
--- a/src/mm-sms-part.h
+++ b/src/mm-sms-part.h
@@ -20,11 +20,27 @@
#include <glib.h>
#include <ModemManager.h>
+/* Although 3GPP TS 23.038 specifies that Unicode SMS messages are
+ * encoded in UCS-2, UTF-16 encoding is commonly used instead on many
+ * modern platforms to allow encoding code points that fall outside the
+ * Basic Multilingual Plane (BMP), such as Emoji. Most of the UCS-2
+ * code points are identical to their equivalent UTF-16 code points.
+ * In UTF-16, non-BMP code points are encoded in a pair of surrogate
+ * code points (i.e. a high surrogate in 0xD800..0xDBFF, followed by a
+ * low surrogate in 0xDC00..0xDFFF). An isolated surrogate code point
+ * has no general interpretation in UTF-16, but could be a valid
+ * (though unmapped) code point in UCS-2.
+ *
+ * The current implementation in ModemManager just assumes that whenever
+ * possible (i.e. when parsing received PDUs or when creating submit
+ * PDUs) UTF-16 will be used instead of plain UCS-2 (even if the PDUs
+ * report the encoding as UCS-2).
+ */
typedef enum { /*< underscore_name=mm_sms_encoding >*/
MM_SMS_ENCODING_UNKNOWN = 0x0,
MM_SMS_ENCODING_GSM7,
MM_SMS_ENCODING_8BIT,
- MM_SMS_ENCODING_UCS2
+ MM_SMS_ENCODING_UCS2,
} MMSmsEncoding;
typedef struct _MMSmsPart MMSmsPart;
diff --git a/src/tests/test-sms-part-3gpp.c b/src/tests/test-sms-part-3gpp.c
index c3d59d87..db6aa7a0 100644
--- a/src/tests/test-sms-part-3gpp.c
+++ b/src/tests/test-sms-part-3gpp.c
@@ -553,8 +553,7 @@ common_test_create_pdu (const gchar *smsc,
g_assert_no_error (error);
g_assert (pdu != NULL);
- g_assert_cmpuint (len, ==, expected_size);
- g_assert_cmpint (memcmp (pdu, expected, len), ==, 0);
+ g_assert_cmpmem (pdu, len, expected, expected_size);
g_assert_cmpint (msgstart, ==, expected_msgstart);
g_free (pdu);
@@ -735,7 +734,7 @@ common_test_text_split (const gchar *text,
}
static void
-test_text_split_short (void)
+test_text_split_short_gsm7 (void)
{
const gchar *text = "Hello";
const gchar *expected [] = {
@@ -749,7 +748,7 @@ test_text_split_short (void)
static void
test_text_split_short_ucs2 (void)
{
- const gchar *text = "你好";
+ const gchar *text = "你好"; /* (UTF-8) e4 bd a0 e5 a5 bd */
const gchar *expected [] = {
"你好",
NULL
@@ -759,7 +758,19 @@ test_text_split_short_ucs2 (void)
}
static void
-test_text_split_max_single_pdu (void)
+test_text_split_short_utf16 (void)
+{
+ const gchar *text = "😉"; /* U+1F609, winking face */
+ const gchar *expected [] = {
+ "😉",
+ NULL
+ };
+
+ common_test_text_split (text, expected, MM_SMS_ENCODING_UCS2);
+}
+
+static void
+test_text_split_max_single_pdu_gsm7 (void)
{
const gchar *text =
"0123456789012345678901234567890123456789"
@@ -798,7 +809,23 @@ test_text_split_max_single_pdu_ucs2 (void)
}
static void
-test_text_split_two_pdu (void)
+test_text_split_max_single_pdu_utf16 (void)
+{
+ /* NOTE: this string contains 35 Bhaiksuki characters, each of
+ * them requiring 4 bytes both in UTF-8 and in UTF-16 (140 bytes
+ * in total). */
+ const gchar *text =
+ "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡𑰢𑰣";
+ const gchar *expected [] = {
+ "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡𑰢𑰣",
+ NULL
+ };
+
+ common_test_text_split (text, expected, MM_SMS_ENCODING_UCS2);
+}
+
+static void
+test_text_split_two_pdu_gsm7 (void)
{
const gchar *text =
"0123456789012345678901234567890123456789"
@@ -839,6 +866,30 @@ test_text_split_two_pdu_ucs2 (void)
common_test_text_split (text, expected, MM_SMS_ENCODING_UCS2);
}
+static void
+test_text_split_two_pdu_utf16 (void)
+{
+ /* NOTE: this string contains 35 Bhaiksuki characters, each of
+ * them requiring 4 bytes both in UTF-8 and in UTF-16 (140 bytes
+ * in total) plus one ASCII char (encoded with 1 byte in UTF-8 and
+ * 2 bytes in UTF-16), making it a total of 142 bytes when in
+ * UTF-16 (so not fitting in one single PDU)
+ *
+ * When split in chunks, the last chunk will hold 2 Bhaiksuki
+ * characters plus the last ASCII one (10 bytes in UTF-16) so that
+ * the first chunk contains the leading 33 Bhaiksuki characters
+ * (132 bytes, less than 134) */
+ const gchar *text =
+ "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡𑰢𑰣a";
+ const gchar *expected [] = {
+ "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡",
+ "𑰢𑰣a",
+ NULL
+ };
+
+ common_test_text_split (text, expected, MM_SMS_ENCODING_UCS2);
+}
+
/************************************************************/
int main (int argc, char **argv)
@@ -874,12 +925,15 @@ int main (int argc, char **argv)
g_test_add_func ("/MM/SMS/3GPP/PDU-Creator/GSM-3", test_create_pdu_gsm_3);
g_test_add_func ("/MM/SMS/3GPP/PDU-Creator/GSM-no-validity", test_create_pdu_gsm_no_validity);
- g_test_add_func ("/MM/SMS/3GPP/Text-Split/short", test_text_split_short);
- g_test_add_func ("/MM/SMS/3GPP/Text-Split/short-UCS2", test_text_split_short_ucs2);
- g_test_add_func ("/MM/SMS/3GPP/Text-Split/max-single-pdu", test_text_split_max_single_pdu);
- g_test_add_func ("/MM/SMS/3GPP/Text-Split/max-single-pdu-UCS2", test_text_split_max_single_pdu_ucs2);
- g_test_add_func ("/MM/SMS/3GPP/Text-Split/two-pdu", test_text_split_two_pdu);
- g_test_add_func ("/MM/SMS/3GPP/Text-Split/two-pdu-UCS2", test_text_split_two_pdu_ucs2);
+ g_test_add_func ("/MM/SMS/3GPP/Text-Split/gsm7/short", test_text_split_short_gsm7);
+ g_test_add_func ("/MM/SMS/3GPP/Text-Split/ucs2/short", test_text_split_short_ucs2);
+ g_test_add_func ("/MM/SMS/3GPP/Text-Split/utf16/short", test_text_split_short_utf16);
+ g_test_add_func ("/MM/SMS/3GPP/Text-Split/gsm7/max-single-pdu", test_text_split_max_single_pdu_gsm7);
+ g_test_add_func ("/MM/SMS/3GPP/Text-Split/ucs2/max-single-pdu", test_text_split_max_single_pdu_ucs2);
+ g_test_add_func ("/MM/SMS/3GPP/Text-Split/utf16/max-single-pdu", test_text_split_max_single_pdu_utf16);
+ g_test_add_func ("/MM/SMS/3GPP/Text-Split/gsm7/two-pdu", test_text_split_two_pdu_gsm7);
+ g_test_add_func ("/MM/SMS/3GPP/Text-Split/ucs2/two-pdu", test_text_split_two_pdu_ucs2);
+ g_test_add_func ("/MM/SMS/3GPP/Text-Split/utf16/two-pdu", test_text_split_two_pdu_utf16);
return g_test_run ();
}