diff options
-rw-r--r-- | src/mm-charsets.c | 66 | ||||
-rw-r--r-- | src/mm-charsets.h | 7 | ||||
-rw-r--r-- | src/mm-sms-part-3gpp.c | 6 | ||||
-rw-r--r-- | src/mm-sms-part-cdma.c | 6 | ||||
-rw-r--r-- | src/tests/test-charsets.c | 68 |
5 files changed, 97 insertions, 56 deletions
diff --git a/src/mm-charsets.c b/src/mm-charsets.c index 56a745dd..55604182 100644 --- a/src/mm-charsets.c +++ b/src/mm-charsets.c @@ -463,43 +463,37 @@ mm_charset_utf8_to_unpacked_gsm (const char *utf8, guint32 *out_len) } static gboolean -gsm_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) +gsm_is_subset (gunichar c, const char *utf8, gsize ulen) { guint8 gsm; - *out_clen = 1; if (utf8_to_gsm_def_char (utf8, ulen, &gsm)) return TRUE; - if (utf8_to_gsm_ext_char (utf8, ulen, &gsm)) { - *out_clen = 2; + if (utf8_to_gsm_ext_char (utf8, ulen, &gsm)) return TRUE; - } return FALSE; } static gboolean -ira_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) +ira_is_subset (gunichar c, const char *utf8, gsize ulen) { - *out_clen = 1; return (ulen == 1); } static gboolean -ucs2_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) +ucs2_is_subset (gunichar c, const char *utf8, gsize ulen) { - *out_clen = 2; return (c <= 0xFFFF); } static gboolean -iso88591_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) +iso88591_is_subset (gunichar c, const char *utf8, gsize ulen) { - *out_clen = 1; return (c <= 0xFF); } static gboolean -pccp437_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) +pccp437_is_subset (gunichar c, const char *utf8, gsize ulen) { static const gunichar t[] = { 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea, @@ -520,8 +514,6 @@ pccp437_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) }; int i; - *out_clen = 1; - if (c <= 0x7F) return TRUE; for (i = 0; i < sizeof (t) / sizeof (t[0]); i++) { @@ -532,7 +524,7 @@ pccp437_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) } static gboolean -pcdn_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) +pcdn_is_subset (gunichar c, const char *utf8, gsize ulen) { static const gunichar t[] = { 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 
0x00e7, 0x00ea, @@ -553,8 +545,6 @@ pcdn_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) }; int i; - *out_clen = 1; - if (c <= 0x7F) return TRUE; for (i = 0; i < sizeof (t) / sizeof (t[0]); i++) { @@ -566,7 +556,7 @@ pcdn_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) typedef struct { MMModemCharset cs; - gboolean (*func) (gunichar c, const char *utf8, gsize ulen, guint *out_clen); + gboolean (*func) (gunichar c, const char *utf8, gsize ulen); guint charsize; } SubsetEntry; @@ -581,40 +571,34 @@ SubsetEntry subset_table[] = { }; /** - * mm_charset_get_encoded_len: + * mm_charset_can_convert_to: + * @utf8: UTF-8 valid string. + * @charset: the #MMModemCharset to validate the conversion from @utf8. * - * @utf8: UTF-8 valid string - * @charset: the #MMModemCharset to check the length of @utf8 in - * @out_unsupported: on return, number of characters of @utf8 that are not fully - * representable in @charset - * - * Returns: the size in bytes of the string if converted from UTF-8 into @charset. - **/ -guint -mm_charset_get_encoded_len (const char *utf8, - MMModemCharset charset, - guint *out_unsupported) + * Returns: %TRUE if the conversion is possible without errors, %FALSE otherwise. 
+ */ +gboolean +mm_charset_can_convert_to (const char *utf8, + MMModemCharset charset) { const char *p = utf8; - guint len = 0, unsupported = 0; SubsetEntry *e; - g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, 0); - g_return_val_if_fail (utf8 != NULL, 0); + g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, FALSE); + g_return_val_if_fail (utf8 != NULL, FALSE); if (charset == MM_MODEM_CHARSET_UTF8) - return strlen (utf8); + return TRUE; /* Find the charset in our subset table */ for (e = &subset_table[0]; e->cs != charset && e->cs != MM_MODEM_CHARSET_UNKNOWN; e++); - g_return_val_if_fail (e->cs != MM_MODEM_CHARSET_UNKNOWN, 0); + g_return_val_if_fail (e->cs != MM_MODEM_CHARSET_UNKNOWN, FALSE); while (*p) { gunichar c; const char *end; - guint clen = 0; c = g_utf8_get_char_validated (p, -1); g_return_val_if_fail (c != (gunichar) -1, 0); @@ -625,15 +609,13 @@ mm_charset_get_encoded_len (const char *utf8, while (*++end); } - if (!e->func (c, p, (end - p), &clen)) - unsupported++; - len += clen; + if (!e->func (c, p, (end - p))) + return FALSE; + p = end; } - if (out_unsupported) - *out_unsupported = unsupported; - return len; + return TRUE; } guint8 * diff --git a/src/mm-charsets.h b/src/mm-charsets.h index c0b309e3..340ae95b 100644 --- a/src/mm-charsets.h +++ b/src/mm-charsets.h @@ -57,10 +57,9 @@ guint8 *mm_charset_utf8_to_unpacked_gsm (const char *utf8, guint32 *out_len); guint8 *mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm, guint32 len); -/* Returns the size in bytes required to hold the UTF-8 string in the given charset */ -guint mm_charset_get_encoded_len (const char *utf8, - MMModemCharset charset, - guint *out_unsupported); +/* Checks whether conversion to the given charset may be done without errors */ +gboolean mm_charset_can_convert_to (const char *utf8, + MMModemCharset charset); guint8 *mm_charset_gsm_unpack (const guint8 *gsm, guint32 num_septets, diff --git a/src/mm-sms-part-3gpp.c b/src/mm-sms-part-3gpp.c index 
8fd255ea..0b59b247 100644 --- a/src/mm-sms-part-3gpp.c +++ b/src/mm-sms-part-3gpp.c @@ -1026,7 +1026,6 @@ gchar ** mm_sms_part_3gpp_util_split_text (const gchar *text, MMSmsEncoding *encoding) { - guint gsm_unsupported = 0; gchar **out; guint n_chunks; guint i; @@ -1058,10 +1057,7 @@ mm_sms_part_3gpp_util_split_text (const gchar *text, */ /* Check if we can do GSM encoding */ - mm_charset_get_encoded_len (text, - MM_MODEM_CHARSET_GSM, - &gsm_unsupported); - if (gsm_unsupported > 0) { + if (!mm_charset_can_convert_to (text, MM_MODEM_CHARSET_GSM)) { /* If cannot do it in GSM encoding, do it in UCS-2 */ GByteArray *array; diff --git a/src/mm-sms-part-cdma.c b/src/mm-sms-part-cdma.c index 8d76bcec..167eda83 100644 --- a/src/mm-sms-part-cdma.c +++ b/src/mm-sms-part-cdma.c @@ -1365,7 +1365,6 @@ decide_best_encoding (const gchar *text, guint *num_bits_per_field, Encoding *encoding) { - guint latin_unsupported = 0; guint ascii_unsupported = 0; guint i; guint len; @@ -1391,10 +1390,7 @@ decide_best_encoding (const gchar *text, } /* Check if we can do Latin encoding */ - mm_charset_get_encoded_len (text, - MM_MODEM_CHARSET_8859_1, - &latin_unsupported); - if (!latin_unsupported) { + if (mm_charset_can_convert_to (text, MM_MODEM_CHARSET_8859_1)) { *out = g_byte_array_sized_new (len); mm_modem_charset_byte_array_append (*out, text, diff --git a/src/tests/test-charsets.c b/src/tests/test-charsets.c index 9ae23faf..e7b47da0 100644 --- a/src/tests/test-charsets.c +++ b/src/tests/test-charsets.c @@ -20,6 +20,12 @@ #include "mm-modem-helpers.h" #include "mm-log.h" +#if defined ENABLE_TEST_MESSAGE_TRACES +#define trace(message, ...) g_print (message, ##__VA_ARGS__) +#else +#define trace(...) 
+#endif + static void test_gsm7_default_chars (void) { @@ -344,6 +350,66 @@ test_take_convert_ucs2_bad_ascii2 (void) g_assert (converted == NULL); } +struct charset_can_convert_to_test_s { + const char *utf8; + gboolean to_gsm; + gboolean to_ira; + gboolean to_8859_1; + gboolean to_ucs2; + gboolean to_pccp437; + gboolean to_pcdn; +}; + +static void +test_charset_can_covert_to (void) +{ + static const struct charset_can_convert_to_test_s charset_can_convert_to_test[] = { + { + .utf8 = "", + .to_gsm = TRUE, .to_ira = TRUE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE, + }, + { + .utf8 = " ", + .to_gsm = TRUE, .to_ira = TRUE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE, + }, + { + .utf8 = "some basic ascii", + .to_gsm = TRUE, .to_ira = TRUE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE, + }, + { + .utf8 = "ホモ・サピエンス 喂人类 katakana, chinese, english: UCS2 takes it all", + .to_gsm = FALSE, .to_ira = FALSE, .to_8859_1 = FALSE, .to_ucs2 = TRUE, .to_pccp437 = FALSE, .to_pcdn = FALSE, + }, + { + .utf8 = "Some from the GSM7 basic set: a % Ψ Ω ñ ö è æ", + .to_gsm = TRUE, .to_ira = FALSE, .to_8859_1 = FALSE, .to_ucs2 = TRUE, .to_pccp437 = FALSE, .to_pcdn = FALSE, + }, + { + .utf8 = "More from the GSM7 extended set: {} [] ~ € |", + .to_gsm = TRUE, .to_ira = FALSE, .to_8859_1 = FALSE, .to_ucs2 = TRUE, .to_pccp437 = FALSE, .to_pcdn = FALSE, + }, + { + .utf8 = "patín cannot be encoded in GSM7 or IRA, but is valid UCS2, ISO-8859-1, CP437 and CP850", + .to_gsm = FALSE, .to_ira = FALSE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE, + }, + { + .utf8 = "ècole can be encoded in multiple ways, but not in IRA", + .to_gsm = TRUE, .to_ira = FALSE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE, + }, + }; + guint i; + + for (i = 0; i < G_N_ELEMENTS (charset_can_convert_to_test); i++) { + trace ("testing charset conversion: '%s'\n", 
charset_can_convert_to_test[i].utf8); + g_assert (mm_charset_can_convert_to (charset_can_convert_to_test[i].utf8, MM_MODEM_CHARSET_GSM) == charset_can_convert_to_test[i].to_gsm); + g_assert (mm_charset_can_convert_to (charset_can_convert_to_test[i].utf8, MM_MODEM_CHARSET_IRA) == charset_can_convert_to_test[i].to_ira); + g_assert (mm_charset_can_convert_to (charset_can_convert_to_test[i].utf8, MM_MODEM_CHARSET_8859_1) == charset_can_convert_to_test[i].to_8859_1); + g_assert (mm_charset_can_convert_to (charset_can_convert_to_test[i].utf8, MM_MODEM_CHARSET_UCS2) == charset_can_convert_to_test[i].to_ucs2); + g_assert (mm_charset_can_convert_to (charset_can_convert_to_test[i].utf8, MM_MODEM_CHARSET_PCCP437) == charset_can_convert_to_test[i].to_pccp437); + g_assert (mm_charset_can_convert_to (charset_can_convert_to_test[i].utf8, MM_MODEM_CHARSET_PCDN) == charset_can_convert_to_test[i].to_pcdn); + } +} + void _mm_log (const char *loc, const char *func, @@ -387,5 +453,7 @@ int main (int argc, char **argv) g_test_add_func ("/MM/charsets/take-convert/ucs2/bad-ascii", test_take_convert_ucs2_bad_ascii); g_test_add_func ("/MM/charsets/take-convert/ucs2/bad-ascii-2", test_take_convert_ucs2_bad_ascii2); + g_test_add_func ("/MM/charsets/can-convert-to", test_charset_can_covert_to); + return g_test_run (); } |