core: better handling of non-UCS2 conversions that should be UCS2 (bgo #683817)

Some modems return the +COPS operator name hex-encoded in the current character set (as set with +CSCS). Others return the operator name in ASCII even when set to UCS2, while yet others return the ASCII name with trash at the end (*cough* Huawei *cough*). Handle these cases better by not crashing.
author: Dan Williams <dcbw@redhat.com> 2012-09-11 16:36:16 -0500
committer: Dan Williams <dcbw@redhat.com> 2012-09-12 23:03:58 -0500
commit: c524734d9fd897add850391e7db0a1060e2f6c37 (patch)
tree: 26bb44bdfbabe22abefc098376220a1870358d5c /src/mm-charsets.c
parent: 73ced242da75abf63a1b5be47ad95123a9e53a3f (diff)
1 files changed, 30 insertions, 12 deletions
diff --git a/src/mm-charsets.c b/src/mm-charsets.c
index f88c0c7a..5f41a7c0 100644
--- a/src/mm-charsets.c
+++ b/src/mm-charsets.c
@@ -711,8 +711,7 @@ gsm_pack (const guint8 *src,
  * the hex representation of the charset-encoded string, so we need to cope with
  * that case. */
 gchar *
-mm_charset_take_and_convert_to_utf8 (gchar *str,
-                                     MMModemCharset charset)
+mm_charset_take_and_convert_to_utf8 (gchar *str, MMModemCharset charset)
 {
     gchar *utf8 = NULL;
 
@@ -753,6 +752,7 @@ mm_charset_take_and_convert_to_utf8 (gchar *str,
     case MM_MODEM_CHARSET_UCS2: {
         gsize len;
         gboolean possibly_hex = TRUE;
+        gsize bread = 0, bwritten = 0;
 
         /* If the string comes in hex-UCS-2, len needs to be a multiple of 4 */
         len = strlen (str);
@@ -766,19 +766,37 @@ mm_charset_take_and_convert_to_utf8 (gchar *str,
                 possibly_hex = isxdigit (*p++);
         }
 
-        /* If we get UCS-2, we expect the HEX representation of the string */
+        /* If hex, then we expect hex-encoded UCS-2 */
         if (possibly_hex) {
             utf8 = mm_modem_charset_hex_to_utf8 (str, charset);
-            if (!utf8) {
-                /* If we couldn't convert the string as HEX-UCS-2, try to see if
-                 * the string is valid UTF-8 itself. */
-                utf8 = str;
-            } else
+            if (utf8) {
                 g_free (str);
-        } else
-            /* If we already know it's not hex, try to use the string as it is */
-            utf8 = str;
+                break;
+            }
+        }
+
+        /* If not hex, then it might be raw UCS-2 (very unlikely) or ASCII/UTF-8
+         * (much more likely).  Try to convert to UTF-8 and if that fails, use
+         * the partial conversion length to re-convert the part of the string
+         * that is UTF-8, if any.
+         */
+        utf8 = g_convert (str, strlen (str),
+                          "UTF-8//TRANSLIT", "UTF-8//TRANSLIT",
+                          &bread, &bwritten, NULL);
+
+        /* Valid conversion, or we didn't get enough valid UTF-8 */
+        if (utf8 || (bwritten <= 2)) {
+            g_free (str);
+            break;
+        }
 
+        /* Last try; chop off the original string at the conversion failure
+         * location and get what we can.
+         */
+        str[bread] = '\0';
+        utf8 = g_convert (str, strlen (str),
+                          "UTF-8//TRANSLIT", "UTF-8//TRANSLIT",
+                          NULL, NULL, NULL);
         break;
     }
 
@@ -792,7 +810,7 @@ mm_charset_take_and_convert_to_utf8 (gchar *str,
 
     /* Validate UTF-8 always before returning. This result will be exposed in DBus
      * very likely... */
-    if (!g_utf8_validate (utf8, -1, NULL)) {
+    if (utf8 && !g_utf8_validate (utf8, -1, NULL)) {
         /* Better return NULL than an invalid UTF-8 string */
         g_free (utf8);
         utf8 = NULL;
author	Dan Williams <dcbw@redhat.com>	2012-09-11 16:36:16 -0500
committer	Dan Williams <dcbw@redhat.com>	2012-09-12 23:03:58 -0500
commit	c524734d9fd897add850391e7db0a1060e2f6c37 (patch)
tree	26bb44bdfbabe22abefc098376220a1870358d5c /src/mm-charsets.c
parent	73ced242da75abf63a1b5be47ad95123a9e53a3f (diff)