charsets: move mm_sms_part_3gpp_util_split_text to mm_charset_util_split_text

author: Andrey Skvortsov <andrej.skvortzov@gmail.com> 2022-08-30 01:35:18 +0300
committer: Aleksander Morgado <aleksander@aleksander.es> 2022-09-13 20:49:01 +0000
commit: 2ece78c80f09c919cbbbc825fab2fc63f864147d (patch)
tree: 292734f8f38c1128de7cd7f3fbeb30167f1883ff /src
parent: 9f5a84f777be651524115d82e5a32a0f7d00ad0a (diff)
7 files changed, 350 insertions, 335 deletions
diff --git a/src/mm-base-sms.c b/src/mm-base-sms.c
index 90dda3d2..db36c7b9 100644
--- a/src/mm-base-sms.c
+++ b/src/mm-base-sms.c
@@ -112,6 +112,7 @@ generate_3gpp_submit_pdus (MMBaseSms *self,
     gsize data_len = 0;
 
     MMSmsEncoding encoding;
+    MMModemCharset charset;
     gchar **split_text = NULL;
     GByteArray **split_data = NULL;
 
@@ -129,7 +130,7 @@ generate_3gpp_submit_pdus (MMBaseSms *self,
     g_assert (!(text != NULL && data != NULL));
 
     if (text) {
-        split_text = mm_sms_part_3gpp_util_split_text (text, &encoding, self);
+        split_text = mm_charset_util_split_text (text, &charset, self);
         if (!split_text) {
             g_set_error (error,
                          MM_CORE_ERROR,
@@ -137,6 +138,7 @@ generate_3gpp_submit_pdus (MMBaseSms *self,
                          "Cannot generate PDUs: Error processing input text");
             return FALSE;
         }
+        encoding = (charset == MM_MODEM_CHARSET_GSM) ? MM_SMS_ENCODING_GSM7 : MM_SMS_ENCODING_UCS2;
         n_parts = g_strv_length (split_text);
     } else if (data) {
         encoding = MM_SMS_ENCODING_8BIT;
diff --git a/src/mm-charsets.c b/src/mm-charsets.c
index 9a7bbb95..1bc6e87e 100644
--- a/src/mm-charsets.c
+++ b/src/mm-charsets.c
@@ -974,3 +974,142 @@ mm_modem_charsets_init (void)
         mm_obj_dbg (NULL, "[charsets]   %s: iconv conversion to/from charset is supported", charset_settings[i].iconv_name);
     }
 }
+
+static gchar **
+util_split_text_gsm7 (const gchar *text,
+                      gsize        text_len,
+                      gpointer     log_object)
+{
+    gchar **out;
+    guint   n_chunks;
+    guint   i;
+    guint   j;
+
+    /* No splitting needed? NOTE(review): the caller passes strlen (text), i.e. UTF-8 bytes, not GSM-7 septets */
+    if (text_len <= 160) {
+        out = g_new0 (gchar *, 2);
+        out[0] = g_strdup (text);
+        return out;
+    }
+
+    /* Compute number of chunks needed: 153 units per chunk, rounding up for the remainder */
+    n_chunks = text_len / 153;
+    if (text_len % 153 != 0)
+        n_chunks++;
+
+    /* Fill in all chunks. NOTE(review): cuts at byte offsets, so a multi-byte UTF-8 character may be split; confirm */
+    out = g_new0 (gchar *, n_chunks + 1);
+    for (i = 0, j = 0; i < n_chunks; i++, j += 153)
+        out[i] = g_strndup (&text[j], 153);
+
+    return out;
+}
+
+static gchar **
+util_split_text_utf16_or_ucs2 (const gchar *text,
+                               gsize        text_len,
+                               gpointer     log_object)
+{
+    g_autoptr(GPtrArray)  chunks = NULL;
+    const gchar          *walker;
+    const gchar          *chunk_start;
+    glong                 encoded_chunk_length;
+    glong                 total_encoded_chunk_length;
+
+    chunks = g_ptr_array_new_with_free_func ((GDestroyNotify)g_free);
+    /* Walk the text one Unicode character at a time, tracking its UTF-16 size */
+    walker = text;
+    chunk_start = text;
+    encoded_chunk_length = 0;
+    total_encoded_chunk_length = 0;
+    while (walker && *walker) {
+        g_autofree gunichar2 *unichar2 = NULL;
+        glong                 unichar2_written = 0;
+        glong                 unichar2_written_bytes = 0;
+        gunichar              single;
+
+        single = g_utf8_get_char (walker);
+        unichar2 = g_ucs4_to_utf16 (&single, 1, NULL, &unichar2_written, NULL);
+        g_assert (unichar2_written > 0);
+
+        /* When splitting for UCS-2 encoding, only one single unichar2 will be
+         * written, because all codepoints represented in UCS2 fit in the BMP.
+         * When splitting for UTF-16, though, we may end up writing one or two
+         * unichar2 (without or with surrogate pairs), because UTF-16 covers the
+         * whole Unicode spectrum. */
+        unichar2_written_bytes = (unichar2_written * sizeof (gunichar2));
+        if ((encoded_chunk_length + unichar2_written_bytes) > 134) {
+            g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start));
+            chunk_start = walker;
+            encoded_chunk_length = unichar2_written_bytes;
+        } else
+            encoded_chunk_length += unichar2_written_bytes;
+        /* The running total decides below whether a single part is enough */
+        total_encoded_chunk_length += unichar2_written_bytes;
+        walker = g_utf8_next_char (walker);
+    }
+
+    /* We have split the original string into chunks, where each chunk
+     * does not require more than 134 bytes when encoded in UTF-16.
+     * As a special case now, we consider the case that no splitting
+     * is necessary, i.e. if the total amount of bytes after encoding
+     * in UTF-16 is less than or equal to 140. */
+    if (total_encoded_chunk_length <= 140) {
+        gchar **out;
+
+        out = g_new0 (gchar *, 2);
+        out[0] = g_strdup (text);
+        return out;
+    }
+
+    /* Otherwise, we do need the split chunks. Add the last one
+     * with contents plus the trailing NULL that terminates the array */
+    g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start));
+    g_ptr_array_add (chunks, NULL);
+
+    return (gchar **) g_ptr_array_free (g_steal_pointer (&chunks), FALSE);
+}
+
+
+gchar **
+mm_charset_util_split_text (const gchar   *text,
+                            MMModemCharset *charset,
+                            gpointer       log_object)
+{
+    if (!text)
+        return NULL;
+
+    /* Some info about the rules for splitting.
+     *
+     * The User Data can be up to 140 bytes in the SMS part:
+     *  0) If we only need one chunk, it can be of up to 140 bytes.
+     *     If we need more than one chunk, each has to be of 140 - 6 = 134
+     *     bytes at most, as we need room for the UDH header.
+     *  1) If we're using GSM7 encoding, this gives us up to 160 characters,
+     *     as we can pack 160 characters of 7bits each into 140 bytes.
+     *      160 * 7 = 140 * 8 = 1120.
+     *     If we only have 134 bytes allowed, that would mean that we can pack
+     *     up to 153 input characters:
+     *      134 * 8 = 1072; 1072/7=153.14
+     *  2) If we're using UCS2 encoding, we can pack up to 70 characters in
+     *     140 bytes (each with 2 bytes), or up to 67 characters in 134 bytes.
+     *  3) If we're using UTF-16 encoding (instead of UCS2), the amount of
+     *     characters we can pack is variable, depends on how the characters
+     *     are encoded in UTF-16 (e.g. if there are characters out of the BMP
+     *     we'll need surrogate pairs and a single character will need 4 bytes
+     *     instead of 2).
+     *
+     * This method does the split of the input string into N strings, so that
+     * each of the strings can be placed in a SMS part.
+     */
+
+    /* Check if we can do GSM encoding */
+    if (mm_charset_can_convert_to (text, MM_MODEM_CHARSET_GSM)) {
+        *charset = MM_MODEM_CHARSET_GSM;
+        return util_split_text_gsm7 (text, strlen (text), log_object);
+    }
+
+    /* Otherwise fall back to reporting UTF-16, splitting on UTF-16 encoded sizes */
+    *charset = MM_MODEM_CHARSET_UTF16;
+    return util_split_text_utf16_or_ucs2 (text, strlen (text), log_object);
+}
diff --git a/src/mm-charsets.h b/src/mm-charsets.h
index 3071f6be..8de7140e 100644
--- a/src/mm-charsets.h
+++ b/src/mm-charsets.h
@@ -112,4 +112,15 @@ gchar *mm_modem_charset_str_to_utf8 (const gchar     *str,
 
 void mm_modem_charsets_init (void);
 
+
+/*
+ * Select the appropriate charset and split a UTF-8 encoded input string
+ * into a NULL-terminated array of N UTF-8 strings, so that each of them
+ * can be encoded into 'charset' and placed in an SMS part.
+ */
+gchar **mm_charset_util_split_text (const gchar    *text,
+                                    MMModemCharset *charset,
+                                    gpointer        log_object);
+
+
 #endif /* MM_CHARSETS_H */
diff --git a/src/mm-sms-part-3gpp.c b/src/mm-sms-part-3gpp.c
index 1bbe1a0a..07d3ef8a 100644
--- a/src/mm-sms-part-3gpp.c
+++ b/src/mm-sms-part-3gpp.c
@@ -1101,144 +1101,6 @@ error:
     return NULL;
 }
 
-static gchar **
-util_split_text_gsm7 (const gchar *text,
-                      gsize        text_len,
-                      gpointer     log_object)
-{
-    gchar **out;
-    guint   n_chunks;
-    guint   i;
-    guint   j;
-
-    /* No splitting needed? */
-    if (text_len <= 160) {
-        out = g_new0 (gchar *, 2);
-        out[0] = g_strdup (text);
-        return out;
-    }
-
-    /* Compute number of chunks needed */
-    n_chunks = text_len / 153;
-    if (text_len % 153 != 0)
-        n_chunks++;
-
-    /* Fill in all chunks */
-    out = g_new0 (gchar *, n_chunks + 1);
-    for (i = 0, j = 0; i < n_chunks; i++, j += 153)
-        out[i] = g_strndup (&text[j], 153);
-
-    return out;
-}
-
-static gchar **
-util_split_text_utf16_or_ucs2 (const gchar *text,
-                               gsize        text_len,
-                               gpointer     log_object)
-{
-    g_autoptr(GPtrArray)  chunks = NULL;
-    const gchar          *walker;
-    const gchar          *chunk_start;
-    glong                 encoded_chunk_length;
-    glong                 total_encoded_chunk_length;
-
-    chunks = g_ptr_array_new_with_free_func ((GDestroyNotify)g_free);
-
-    walker = text;
-    chunk_start = text;
-    encoded_chunk_length = 0;
-    total_encoded_chunk_length = 0;
-    while (walker && *walker) {
-        g_autofree gunichar2 *unichar2 = NULL;
-        glong                 unichar2_written = 0;
-        glong                 unichar2_written_bytes = 0;
-        gunichar              single;
-
-        single = g_utf8_get_char (walker);
-        unichar2 = g_ucs4_to_utf16 (&single, 1, NULL, &unichar2_written, NULL);
-        g_assert (unichar2_written > 0);
-
-        /* When splitting for UCS-2 encoding, only one single unichar2 will be
-         * written, because all codepoints represented in UCS2 fit in the BMP.
-         * When splitting for UTF-16, though, we may end up writing one or two
-         * unichar2 (without or with surrogate pairs), because UTF-16 covers the
-         * whole Unicode spectrum. */
-        unichar2_written_bytes = (unichar2_written * sizeof (gunichar2));
-        if ((encoded_chunk_length + unichar2_written_bytes) > 134) {
-            g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start));
-            chunk_start = walker;
-            encoded_chunk_length = unichar2_written_bytes;
-        } else
-            encoded_chunk_length += unichar2_written_bytes;
-
-        total_encoded_chunk_length += unichar2_written_bytes;
-        walker = g_utf8_next_char (walker);
-    }
-
-    /* We have split the original string in chunks, where each chunk
-     * does not require more than 134 bytes when encoded in UTF-16.
-     * As a special case now, we consider the case that no splitting
-     * is necessary, i.e. if the total amount of bytes after encoding
-     * in UTF-16 is less or equal than 140. */
-    if (total_encoded_chunk_length <= 140) {
-        gchar **out;
-
-        out = g_new0 (gchar *, 2);
-        out[0] = g_strdup (text);
-        return out;
-    }
-
-    /* Otherwise, we do need the splitted chunks. Add the last one
-     * with contents plus the last trailing NULL */
-    g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start));
-    g_ptr_array_add (chunks, NULL);
-
-    return (gchar **) g_ptr_array_free (g_steal_pointer (&chunks), FALSE);
-}
-
-gchar **
-mm_sms_part_3gpp_util_split_text (const gchar   *text,
-                                  MMSmsEncoding *encoding,
-                                  gpointer       log_object)
-{
-    if (!text)
-        return NULL;
-
-    /* Some info about the rules for splitting.
-     *
-     * The User Data can be up to 140 bytes in the SMS part:
-     *  0) If we only need one chunk, it can be of up to 140 bytes.
-     *     If we need more than one chunk, these have to be of 140 - 6 = 134
-     *     bytes each, as we need place for the UDH header.
-     *  1) If we're using GSM7 encoding, this gives us up to 160 characters,
-     *     as we can pack 160 characters of 7bits each into 140 bytes.
-     *      160 * 7 = 140 * 8 = 1120.
-     *     If we only have 134 bytes allowed, that would mean that we can pack
-     *     up to 153 input characters:
-     *      134 * 8 = 1072; 1072/7=153.14
-     *  2) If we're using UCS2 encoding, we can pack up to 70 characters in
-     *     140 bytes (each with 2 bytes), or up to 67 characters in 134 bytes.
-     *  3) If we're using UTF-16 encoding (instead of UCS2), the amount of
-     *     characters we can pack is variable, depends on how the characters
-     *     are encoded in UTF-16 (e.g. if there are characters out of the BMP
-     *     we'll need surrogate pairs and a single character will need 4 bytes
-     *     instead of 2).
-     *
-     * This method does the split of the input string into N strings, so that
-     * each of the strings can be placed in a SMS part.
-     */
-
-    /* Check if we can do GSM encoding */
-    if (mm_charset_can_convert_to (text, MM_MODEM_CHARSET_GSM)) {
-        *encoding = MM_SMS_ENCODING_GSM7;
-        return util_split_text_gsm7 (text, strlen (text), log_object);
-    }
-
-    /* Otherwise fallback to report UCS-2 and split supporting UTF-16 */
-    *encoding = MM_SMS_ENCODING_UCS2;
-    return util_split_text_utf16_or_ucs2 (text, strlen (text), log_object);
-}
-
 GByteArray **
 mm_sms_part_3gpp_util_split_data (const guint8 *data,
                                   gsize data_len)
diff --git a/src/mm-sms-part-3gpp.h b/src/mm-sms-part-3gpp.h
index c6f4cf3f..d2ee5114 100644
--- a/src/mm-sms-part-3gpp.h
+++ b/src/mm-sms-part-3gpp.h
@@ -44,9 +44,6 @@ guint       mm_sms_part_3gpp_encode_address   (const gchar   *address,
                                                guint8        *buf,
                                                gsize          buflen,
                                                gboolean       is_smsc);
-gchar      **mm_sms_part_3gpp_util_split_text (const gchar   *text,
-                                               MMSmsEncoding *encoding,
-                                               gpointer       log_object);
 GByteArray **mm_sms_part_3gpp_util_split_data (const guint8  *data,
                                                gsize          data_len);
 
diff --git a/src/tests/test-charsets.c b/src/tests/test-charsets.c
index 8735fd22..9f616842 100644
--- a/src/tests/test-charsets.c
+++ b/src/tests/test-charsets.c
@@ -446,6 +446,188 @@ test_charset_can_covert_to (void)
     }
 }
 
+/********************* TEXT SPLIT TESTS *********************/
+
+static void
+common_test_text_split (const gchar *text,
+                        const gchar **expected,
+                        MMModemCharset expected_charset)
+{
+    gchar **out;
+    MMModemCharset out_charset = MM_MODEM_CHARSET_UNKNOWN;
+    guint i;
+
+    out = mm_charset_util_split_text (text, &out_charset, NULL);
+    /* Splitting must succeed and select exactly the charset the test case expects */
+    g_assert (out != NULL);
+    g_assert_cmpuint (out_charset, ==, expected_charset);
+
+    g_assert_cmpuint (g_strv_length (out), ==, g_strv_length ((gchar **)expected));
+    /* Chunk counts match (checked above); now compare the contents of every chunk */
+    for (i = 0; out[i]; i++) {
+        g_assert_cmpstr (out[i], ==, expected[i]);
+    }
+
+    g_strfreev (out);
+}
+
+static void
+test_text_split_short_gsm7 (void)
+{
+    const gchar *text = "Hello";
+    const gchar *expected [] = {
+        "Hello",
+        NULL
+    };
+
+    common_test_text_split (text, expected, MM_MODEM_CHARSET_GSM);
+}
+
+static void
+test_text_split_short_ucs2 (void)
+{
+    const gchar *text = "你好"; /* (UTF-8) e4 bd a0 e5 a5 bd */
+    const gchar *expected [] = {
+        "你好",
+        NULL
+    };
+
+    common_test_text_split (text, expected, MM_MODEM_CHARSET_UTF16);
+}
+
+static void
+test_text_split_short_utf16 (void)
+{
+    const gchar *text = "😉"; /* U+1F609, winking face */
+    const gchar *expected [] = {
+        "😉",
+        NULL
+    };
+
+    common_test_text_split (text, expected, MM_MODEM_CHARSET_UTF16);
+}
+
+static void
+test_text_split_max_single_pdu_gsm7 (void)
+{
+    const gchar *text =
+        "0123456789012345678901234567890123456789"
+        "0123456789012345678901234567890123456789"
+        "0123456789012345678901234567890123456789"
+        "0123456789012345678901234567890123456789";
+    const gchar *expected [] = {
+        "0123456789012345678901234567890123456789"
+        "0123456789012345678901234567890123456789"
+        "0123456789012345678901234567890123456789"
+        "0123456789012345678901234567890123456789",
+        NULL
+    };
+
+    common_test_text_split (text, expected, MM_MODEM_CHARSET_GSM);
+}
+
+static void
+test_text_split_max_single_pdu_ucs2 (void)
+{
+    /* NOTE: This chinese string contains 210 bytes when encoded in
+     * UTF-8! But still, it can be placed into 140 bytes when in UCS-2
+     */
+    const gchar *text =
+        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
+        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
+        "你好你好你好";
+    const gchar *expected [] = {
+        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
+        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
+        "你好你好你好",
+        NULL
+    };
+
+    common_test_text_split (text, expected, MM_MODEM_CHARSET_UTF16);
+}
+
+static void
+test_text_split_max_single_pdu_utf16 (void)
+{
+    /* NOTE: this string contains 35 Bhaiksuki characters, each of
+     * them requiring 4 bytes both in UTF-8 and in UTF-16 (140 bytes
+     * in total). */
+    const gchar *text =
+        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡𑰢𑰣";
+    const gchar *expected [] = {
+        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡𑰢𑰣",
+        NULL
+    };
+
+    common_test_text_split (text, expected, MM_MODEM_CHARSET_UTF16);
+}
+
+static void
+test_text_split_two_pdu_gsm7 (void)
+{
+    const gchar *text =
+        "0123456789012345678901234567890123456789"
+        "0123456789012345678901234567890123456789"
+        "0123456789012345678901234567890123456789"
+        "01234567890123456789012345678901234567890";
+    const gchar *expected [] = {
+        /* First chunk */
+        "0123456789012345678901234567890123456789"
+        "0123456789012345678901234567890123456789"
+        "0123456789012345678901234567890123456789"
+        "012345678901234567890123456789012",
+        /* Second chunk */
+        "34567890",
+        NULL
+    };
+
+    common_test_text_split (text, expected, MM_MODEM_CHARSET_GSM);
+}
+
+static void
+test_text_split_two_pdu_ucs2 (void)
+{
+    const gchar *text =
+        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
+        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
+        "你好你好你好好";
+    const gchar *expected [] = {
+        /* First chunk */
+        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
+        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
+        "你好你",
+        /* Second chunk */
+        "好你好好",
+        NULL
+    };
+
+    common_test_text_split (text, expected, MM_MODEM_CHARSET_UTF16);
+}
+
+static void
+test_text_split_two_pdu_utf16 (void)
+{
+    /* NOTE: this string contains 35 Bhaiksuki characters, each of
+     * them requiring 4 bytes both in UTF-8 and in UTF-16 (140 bytes
+     * in total) plus one ASCII char (encoded with 1 byte in UTF-8 and
+     * 2 bytes in UTF-16), making it a total of 142 bytes when in
+     * UTF-16 (so not fitting in one single PDU)
+     *
+     * When split in chunks, the last chunk will hold 2 Bhaiksuki
+     * characters plus the last ASCII one (10 bytes in UTF-16) so that
+     * the first chunk contains the leading 33 Bhaiksuki characters
+     * (132 bytes, less than 134) */
+    const gchar *text =
+        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡𑰢𑰣a";
+    const gchar *expected [] = {
+        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡",
+        "𑰢𑰣a",
+        NULL
+    };
+
+    common_test_text_split (text, expected, MM_MODEM_CHARSET_UTF16);
+}
+
 int main (int argc, char **argv)
 {
     setlocale (LC_ALL, "");
@@ -471,5 +653,15 @@ int main (int argc, char **argv)
 
     g_test_add_func ("/MM/charsets/can-convert-to", test_charset_can_covert_to);
 
+    g_test_add_func ("/MM/charsets/text-split/gsm7/short",           test_text_split_short_gsm7);
+    g_test_add_func ("/MM/charsets/text-split/ucs2/short",           test_text_split_short_ucs2);
+    g_test_add_func ("/MM/charsets/text-split/utf16/short",          test_text_split_short_utf16);
+    g_test_add_func ("/MM/charsets/text-split/gsm7/max-single-pdu",  test_text_split_max_single_pdu_gsm7);
+    g_test_add_func ("/MM/charsets/text-split/ucs2/max-single-pdu",  test_text_split_max_single_pdu_ucs2);
+    g_test_add_func ("/MM/charsets/text-split/utf16/max-single-pdu", test_text_split_max_single_pdu_utf16);
+    g_test_add_func ("/MM/charsets/text-split/gsm7/two-pdu",         test_text_split_two_pdu_gsm7);
+    g_test_add_func ("/MM/charsets/text-split/ucs2/two-pdu",         test_text_split_two_pdu_ucs2);
+    g_test_add_func ("/MM/charsets/text-split/utf16/two-pdu",        test_text_split_two_pdu_utf16);
+
     return g_test_run ();
 }
diff --git a/src/tests/test-sms-part-3gpp.c b/src/tests/test-sms-part-3gpp.c
index 4da299e7..cfd7a856 100644
--- a/src/tests/test-sms-part-3gpp.c
+++ b/src/tests/test-sms-part-3gpp.c
@@ -24,6 +24,7 @@
 #include <libmm-glib.h>
 
 #include "mm-sms-part-3gpp.h"
+#include "mm-charsets.h"
 #include "mm-log-test.h"
 
 /********************* PDU PARSER TESTS *********************/
@@ -529,9 +530,12 @@ common_test_create_pdu (const gchar *smsc,
     if (text) {
         gchar **out;
         MMSmsEncoding encoding = MM_SMS_ENCODING_UNKNOWN;
+        MMModemCharset charset = MM_MODEM_CHARSET_UNKNOWN;
 
         /* Detect best encoding */
-        out = mm_sms_part_3gpp_util_split_text (text, &encoding, NULL);
+        out = mm_charset_util_split_text (text, &charset, NULL);
+        if (out)
+            encoding = (charset == MM_MODEM_CHARSET_GSM) ? MM_SMS_ENCODING_GSM7 : MM_SMS_ENCODING_UCS2;
         g_strfreev (out);
         mm_sms_part_set_text (part, text);
         mm_sms_part_set_encoding (part, encoding);
@@ -708,188 +712,6 @@ test_create_pdu_gsm_no_validity (void)
                             1); /* expected_msgstart */
 }
 
-/********************* TEXT SPLIT TESTS *********************/
-
-static void
-common_test_text_split (const gchar *text,
-                        const gchar **expected,
-                        MMSmsEncoding expected_encoding)
-{
-    gchar **out;
-    MMSmsEncoding out_encoding = MM_SMS_ENCODING_UNKNOWN;
-    guint i;
-
-    out = mm_sms_part_3gpp_util_split_text (text, &out_encoding, NULL);
-
-    g_assert (out != NULL);
-    g_assert (out_encoding != MM_SMS_ENCODING_UNKNOWN);
-
-    g_assert_cmpuint (g_strv_length (out), ==, g_strv_length ((gchar **)expected));
-
-    for (i = 0; out[i]; i++) {
-        g_assert_cmpstr (out[i], ==, expected[i]);
-    }
-
-    g_strfreev (out);
-}
-
-static void
-test_text_split_short_gsm7 (void)
-{
-    const gchar *text = "Hello";
-    const gchar *expected [] = {
-        "Hello",
-        NULL
-    };
-
-    common_test_text_split (text, expected, MM_SMS_ENCODING_GSM7);
-}
-
-static void
-test_text_split_short_ucs2 (void)
-{
-    const gchar *text = "你好"; /* (UTF-8) e4 bd a0 e5 a5 bd */
-    const gchar *expected [] = {
-        "你好",
-        NULL
-    };
-
-    common_test_text_split (text, expected, MM_SMS_ENCODING_UCS2);
-}
-
-static void
-test_text_split_short_utf16 (void)
-{
-    const gchar *text = "😉"; /* U+1F609, winking face */
-    const gchar *expected [] = {
-        "😉",
-        NULL
-    };
-
-    common_test_text_split (text, expected, MM_SMS_ENCODING_UCS2);
-}
-
-static void
-test_text_split_max_single_pdu_gsm7 (void)
-{
-    const gchar *text =
-        "0123456789012345678901234567890123456789"
-        "0123456789012345678901234567890123456789"
-        "0123456789012345678901234567890123456789"
-        "0123456789012345678901234567890123456789";
-    const gchar *expected [] = {
-        "0123456789012345678901234567890123456789"
-        "0123456789012345678901234567890123456789"
-        "0123456789012345678901234567890123456789"
-        "0123456789012345678901234567890123456789",
-        NULL
-    };
-
-    common_test_text_split (text, expected, MM_SMS_ENCODING_GSM7);
-}
-
-static void
-test_text_split_max_single_pdu_ucs2 (void)
-{
-    /* NOTE: This chinese string contains 210 bytes when encoded in
-     * UTF-8! But still, it can be placed into 140 bytes when in UCS-2
-     */
-    const gchar *text =
-        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
-        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
-        "你好你好你好";
-    const gchar *expected [] = {
-        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
-        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
-        "你好你好你好",
-        NULL
-    };
-
-    common_test_text_split (text, expected, MM_SMS_ENCODING_UCS2);
-}
-
-static void
-test_text_split_max_single_pdu_utf16 (void)
-{
-    /* NOTE: this string contains 35 Bhaiksuki characters, each of
-     * them requiring 4 bytes both in UTF-8 and in UTF-16 (140 bytes
-     * in total). */
-    const gchar *text =
-        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡𑰢𑰣";
-    const gchar *expected [] = {
-        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡𑰢𑰣",
-        NULL
-    };
-
-    common_test_text_split (text, expected, MM_SMS_ENCODING_UCS2);
-}
-
-static void
-test_text_split_two_pdu_gsm7 (void)
-{
-    const gchar *text =
-        "0123456789012345678901234567890123456789"
-        "0123456789012345678901234567890123456789"
-        "0123456789012345678901234567890123456789"
-        "01234567890123456789012345678901234567890";
-    const gchar *expected [] = {
-        /* First chunk */
-        "0123456789012345678901234567890123456789"
-        "0123456789012345678901234567890123456789"
-        "0123456789012345678901234567890123456789"
-        "012345678901234567890123456789012",
-        /* Second chunk */
-        "34567890",
-        NULL
-    };
-
-    common_test_text_split (text, expected, MM_SMS_ENCODING_GSM7);
-}
-
-static void
-test_text_split_two_pdu_ucs2 (void)
-{
-    const gchar *text =
-        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
-        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
-        "你好你好你好好";
-    const gchar *expected [] = {
-        /* First chunk */
-        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
-        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
-        "你好你",
-        /* Second chunk */
-        "好你好好",
-        NULL
-    };
-
-    common_test_text_split (text, expected, MM_SMS_ENCODING_UCS2);
-}
-
-static void
-test_text_split_two_pdu_utf16 (void)
-{
-    /* NOTE: this string contains 35 Bhaiksuki characters, each of
-     * them requiring 4 bytes both in UTF-8 and in UTF-16 (140 bytes
-     * in total) plus one ASCII char (encoded with 1 byte in UTF-8 and
-     * 2 bytes in UTF-16), making it a total of 142 bytes when in
-     * UTF-16 (so not fitting in one single PDU)
-     *
-     * When split in chunks, the last chunk will hold 2 Bhaiksuki
-     * characters plus the last ASCII one (9 bytes in UTF-16) so that
-     * the first chunk contains the leading 33 Bhaiksuki characters
-     * (132 characters, less than 134) */
-    const gchar *text =
-        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡𑰢𑰣a";
-    const gchar *expected [] = {
-        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡",
-        "𑰢𑰣a",
-        NULL
-    };
-
-    common_test_text_split (text, expected, MM_SMS_ENCODING_UCS2);
-}
-
 /************************************************************/
 
 int main (int argc, char **argv)
@@ -925,15 +747,5 @@ int main (int argc, char **argv)
     g_test_add_func ("/MM/SMS/3GPP/PDU-Creator/GSM-3", test_create_pdu_gsm_3);
     g_test_add_func ("/MM/SMS/3GPP/PDU-Creator/GSM-no-validity", test_create_pdu_gsm_no_validity);
 
-    g_test_add_func ("/MM/SMS/3GPP/Text-Split/gsm7/short",           test_text_split_short_gsm7);
-    g_test_add_func ("/MM/SMS/3GPP/Text-Split/ucs2/short",           test_text_split_short_ucs2);
-    g_test_add_func ("/MM/SMS/3GPP/Text-Split/utf16/short",          test_text_split_short_utf16);
-    g_test_add_func ("/MM/SMS/3GPP/Text-Split/gsm7/max-single-pdu",  test_text_split_max_single_pdu_gsm7);
-    g_test_add_func ("/MM/SMS/3GPP/Text-Split/ucs2/max-single-pdu",  test_text_split_max_single_pdu_ucs2);
-    g_test_add_func ("/MM/SMS/3GPP/Text-Split/utf16/max-single-pdu", test_text_split_max_single_pdu_utf16);
-    g_test_add_func ("/MM/SMS/3GPP/Text-Split/gsm7/two-pdu",         test_text_split_two_pdu_gsm7);
-    g_test_add_func ("/MM/SMS/3GPP/Text-Split/ucs2/two-pdu",         test_text_split_two_pdu_ucs2);
-    g_test_add_func ("/MM/SMS/3GPP/Text-Split/utf16/two-pdu",        test_text_split_two_pdu_utf16);
-
     return g_test_run ();
 }
author	Andrey Skvortsov <andrej.skvortzov@gmail.com>	2022-08-30 01:35:18 +0300
committer	Aleksander Morgado <aleksander@aleksander.es>	2022-09-13 20:49:01 +0000
commit	2ece78c80f09c919cbbbc825fab2fc63f864147d (patch)
tree	292734f8f38c1128de7cd7f3fbeb30167f1883ff /src
parent	9f5a84f777be651524115d82e5a32a0f7d00ad0a (diff)