diff options
author | Andrey Skvortsov <andrej.skvortzov@gmail.com> | 2022-08-30 01:35:18 +0300 |
---|---|---|
committer | Aleksander Morgado <aleksander@aleksander.es> | 2022-09-13 20:49:01 +0000 |
commit | 2ece78c80f09c919cbbbc825fab2fc63f864147d (patch) | |
tree | 292734f8f38c1128de7cd7f3fbeb30167f1883ff /src/mm-charsets.c | |
parent | 9f5a84f777be651524115d82e5a32a0f7d00ad0a (diff) |
charsets: move mm_sms_part_3gpp_util_split_text to mm_charset_util_split_text
Diffstat (limited to 'src/mm-charsets.c')
-rw-r--r-- | src/mm-charsets.c | 139 |
1 files changed, 139 insertions, 0 deletions
diff --git a/src/mm-charsets.c b/src/mm-charsets.c index 9a7bbb95..1bc6e87e 100644 --- a/src/mm-charsets.c +++ b/src/mm-charsets.c @@ -974,3 +974,142 @@ mm_modem_charsets_init (void) mm_obj_dbg (NULL, "[charsets] %s: iconv conversion to/from charset is supported", charset_settings[i].iconv_name); } } + +static gchar ** +util_split_text_gsm7 (const gchar *text, + gsize text_len, + gpointer log_object) +{ + gchar **out; + guint n_chunks; + guint i; + guint j; + + /* No splitting needed? */ + if (text_len <= 160) { + out = g_new0 (gchar *, 2); + out[0] = g_strdup (text); + return out; + } + + /* Compute number of chunks needed */ + n_chunks = text_len / 153; + if (text_len % 153 != 0) + n_chunks++; + + /* Fill in all chunks */ + out = g_new0 (gchar *, n_chunks + 1); + for (i = 0, j = 0; i < n_chunks; i++, j += 153) + out[i] = g_strndup (&text[j], 153); + + return out; +} + +static gchar ** +util_split_text_utf16_or_ucs2 (const gchar *text, + gsize text_len, + gpointer log_object) +{ + g_autoptr(GPtrArray) chunks = NULL; + const gchar *walker; + const gchar *chunk_start; + glong encoded_chunk_length; + glong total_encoded_chunk_length; + + chunks = g_ptr_array_new_with_free_func ((GDestroyNotify)g_free); + + walker = text; + chunk_start = text; + encoded_chunk_length = 0; + total_encoded_chunk_length = 0; + while (walker && *walker) { + g_autofree gunichar2 *unichar2 = NULL; + glong unichar2_written = 0; + glong unichar2_written_bytes = 0; + gunichar single; + + single = g_utf8_get_char (walker); + unichar2 = g_ucs4_to_utf16 (&single, 1, NULL, &unichar2_written, NULL); + g_assert (unichar2_written > 0); + + /* When splitting for UCS-2 encoding, only one single unichar2 will be + * written, because all codepoints represented in UCS2 fit in the BMP. + * When splitting for UTF-16, though, we may end up writing one or two + * unichar2 (without or with surrogate pairs), because UTF-16 covers the + * whole Unicode spectrum. */ + unichar2_written_bytes = (unichar2_written * sizeof (gunichar2)); + if ((encoded_chunk_length + unichar2_written_bytes) > 134) { + g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start)); + chunk_start = walker; + encoded_chunk_length = unichar2_written_bytes; + } else + encoded_chunk_length += unichar2_written_bytes; + + total_encoded_chunk_length += unichar2_written_bytes; + walker = g_utf8_next_char (walker); + } + + /* We have split the original string in chunks, where each chunk + * does not require more than 134 bytes when encoded in UTF-16. + * As a special case now, we consider the case that no splitting + * is necessary, i.e. if the total amount of bytes after encoding + * in UTF-16 is less or equal than 140. */ + if (total_encoded_chunk_length <= 140) { + gchar **out; + + out = g_new0 (gchar *, 2); + out[0] = g_strdup (text); + return out; + } + + /* Otherwise, we do need the splitted chunks. Add the last one + * with contents plus the last trailing NULL */ + g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start)); + g_ptr_array_add (chunks, NULL); + + return (gchar **) g_ptr_array_free (g_steal_pointer (&chunks), FALSE); +} + + +gchar ** +mm_charset_util_split_text (const gchar *text, + MMModemCharset *charset, + gpointer log_object) +{ + if (!text) + return NULL; + + /* Some info about the rules for splitting. + * + * The User Data can be up to 140 bytes in the SMS part: + * 0) If we only need one chunk, it can be of up to 140 bytes. + * If we need more than one chunk, these have to be of 140 - 6 = 134 + * bytes each, as we need place for the UDH header. + * 1) If we're using GSM7 encoding, this gives us up to 160 characters, + * as we can pack 160 characters of 7bits each into 140 bytes. + * 160 * 7 = 140 * 8 = 1120. + * If we only have 134 bytes allowed, that would mean that we can pack + * up to 153 input characters: + * 134 * 8 = 1072; 1072/7=153.14 + * 2) If we're using UCS2 encoding, we can pack up to 70 characters in + * 140 bytes (each with 2 bytes), or up to 67 characters in 134 bytes. + * 3) If we're using UTF-16 encoding (instead of UCS2), the amount of + * characters we can pack is variable, depends on how the characters + * are encoded in UTF-16 (e.g. if there are characters out of the BMP + * we'll need surrogate pairs and a single character will need 4 bytes + * instead of 2). + * + * This method does the split of the input string into N strings, so that + * each of the strings can be placed in a SMS part. + */ + + /* Check if we can do GSM encoding */ + if (mm_charset_can_convert_to (text, MM_MODEM_CHARSET_GSM)) { + *charset = MM_MODEM_CHARSET_GSM; + return util_split_text_gsm7 (text, strlen (text), log_object); + } + + /* Otherwise fallback to report UCS-2 and split supporting UTF-16 */ + *charset = MM_MODEM_CHARSET_UTF16; + return util_split_text_utf16_or_ucs2 (text, strlen (text), log_object); +} |