gsm: add GSM 03.38 encoding/decoding functions and testcases

author: Dan Williams <dcbw@redhat.com> 2010-09-02 19:29:05 -0500
committer: Dan Williams <dcbw@redhat.com> 2010-09-02 19:29:05 -0500
commit: 9e94dd5b6124d00cf10d6296c7c9aa80f8f68d80 (patch)
tree: 78be39d22a363099512a36ebf8b71756bd559162 /src
parent: 85ce5446759092968c6540b9d842c5bc777abb74 (diff)
4 files changed, 388 insertions, 12 deletions
diff --git a/src/mm-charsets.c b/src/mm-charsets.c
index e61e56ea..8fea400e 100644
--- a/src/mm-charsets.c
+++ b/src/mm-charsets.c
@@ -109,7 +109,7 @@ charset_iconv_from (MMModemCharset charset)
 
 gboolean
 mm_modem_charset_byte_array_append (GByteArray *array,
-                                    const char *string,
+                                    const char *utf8,
                                     gboolean quoted,
                                     MMModemCharset charset)
 {
@@ -119,22 +119,16 @@ mm_modem_charset_byte_array_append (GByteArray *array,
     gsize written = 0;
 
     g_return_val_if_fail (array != NULL, FALSE);
-    g_return_val_if_fail (string != NULL, FALSE);
+    g_return_val_if_fail (utf8 != NULL, FALSE);
 
     iconv_to = charset_iconv_to (charset);
     g_return_val_if_fail (iconv_to != NULL, FALSE);
 
-    converted = g_convert (string,
-                           g_utf8_strlen (string, -1),
-                           iconv_to,
-                           "UTF-8",
-                           NULL,
-                           &written,
-                           &error);
+    converted = g_convert (utf8, -1, iconv_to, "UTF-8", NULL, &written, &error);
     if (!converted) {
         if (error) {
             g_warning ("%s: failed to convert '%s' to %s character set: (%d) %s",
-                       __func__, string, iconv_to,
+                       __func__, utf8, iconv_to,
                        error->code, error->message);
             g_error_free (error);
         }
@@ -183,3 +177,251 @@ mm_modem_charset_hex_to_utf8 (const char *src, MMModemCharset charset)
     return converted;
 }
 
+
+/* GSM 03.38 encoding conversion stuff */
+
+#define GSM_DEF_ALPHABET_SIZE 128  /* entries in the default alphabet table */
+#define GSM_EXT_ALPHABET_SIZE 10   /* entries in the extended alphabet table */
+
+typedef struct GsmUtf8Mapping {
+    guint8 chars[3];  /* UTF-8 bytes; unsigned so values >= 0x80 fit without conversion */
+    guint8 len;       /* number of bytes of chars[] in use */
+    guint8 gsm;       /* only used for extended GSM charset */
+} GsmUtf8Mapping;
+
+#define ONE(a)     { {a, 0x00, 0x00}, 1, 0 }  /* one-byte UTF-8 entry */
+#define TWO(a, b)  { {a, b,    0x00}, 2, 0 }  /* two-byte UTF-8 entry */
+
+/**
+ * gsm_def_utf8_alphabet:
+ *
+ * Mapping from GSM default alphabet to UTF-8, indexed by GSM character value.
+ * Each entry holds the UTF-8 byte sequence of the character and its length.
+ * ETSI GSM 03.38, version 6.0.1, section 6.2.1; Default alphabet.
+ * Mapping according to http://unicode.org/Public/MAPPINGS/ETSI/GSM0338.TXT
+ */
+static const GsmUtf8Mapping gsm_def_utf8_alphabet[GSM_DEF_ALPHABET_SIZE] = {
+	/* @             £                $                ¥   */
+    ONE(0x40),       TWO(0xc2, 0xa3), ONE(0x24),       TWO(0xc2, 0xa5),
+    /* è             é                ù                ì   */
+	TWO(0xc3, 0xa8), TWO(0xc3, 0xa9), TWO(0xc3, 0xb9), TWO(0xc3, 0xac),
+	/* ò             Ç                \n               Ø   */
+    TWO(0xc3, 0xb2), TWO(0xc3, 0x87), ONE(0x0a),       TWO(0xc3, 0x98),
+    /* ø             \r               Å                å   */
+    TWO(0xc3, 0xb8), ONE(0x0d),       TWO(0xc3, 0x85), TWO(0xc3, 0xa5),
+	/* Δ             _                Φ                Γ   */
+    TWO(0xce, 0x94), ONE(0x5f),       TWO(0xce, 0xa6), TWO(0xce, 0x93),
+    /* Λ             Ω                Π                Ψ   */
+    TWO(0xce, 0x9b), TWO(0xce, 0xa9), TWO(0xce, 0xa0), TWO(0xce, 0xa8),
+	/* Σ             Θ                Ξ                Escape Code (0x1b) */
+    TWO(0xce, 0xa3), TWO(0xce, 0x98), TWO(0xce, 0x9e), ONE(0xa0),
+    /* Æ             æ                ß                É   */
+    TWO(0xc3, 0x86), TWO(0xc3, 0xa6), TWO(0xc3, 0x9f), TWO(0xc3, 0x89),
+	/* ' '           !                "                #   */
+    ONE(0x20),       ONE(0x21),       ONE(0x22),       ONE(0x23),
+    /* ¤             %                &                '   */
+    TWO(0xc2, 0xa4), ONE(0x25),       ONE(0x26),       ONE(0x27),
+	/* (             )                *                +   */
+    ONE(0x28),       ONE(0x29),       ONE(0x2a),       ONE(0x2b),
+    /* ,             -                .                /   */
+    ONE(0x2c),       ONE(0x2d),       ONE(0x2e),       ONE(0x2f),
+	/* 0             1                2                3   */
+	ONE(0x30),       ONE(0x31),       ONE(0x32),       ONE(0x33),
+    /* 4             5                6                7   */
+	ONE(0x34),       ONE(0x35),       ONE(0x36),       ONE(0x37),
+	/* 8             9                :                ;   */
+	ONE(0x38),       ONE(0x39),       ONE(0x3a),       ONE(0x3b),
+	/* <             =                >                ?   */
+	ONE(0x3c),       ONE(0x3d),       ONE(0x3e),       ONE(0x3f),
+	/* ¡             A                B                C   */
+	TWO(0xc2, 0xa1), ONE(0x41),       ONE(0x42),       ONE(0x43),
+	/* D             E                F                G   */
+	ONE(0x44),       ONE(0x45),       ONE(0x46),       ONE(0x47),
+	/* H             I                J                K   */
+	ONE(0x48),       ONE(0x49),       ONE(0x4a),       ONE(0x4b),
+	/* L             M                N                O   */
+	ONE(0x4c),       ONE(0x4d),       ONE(0x4e),       ONE(0x4f),
+	/* P             Q                R                S   */
+	ONE(0x50),       ONE(0x51),       ONE(0x52),       ONE(0x53),
+	/* T             U                V                W   */
+	ONE(0x54),       ONE(0x55),       ONE(0x56),       ONE(0x57),
+	/* X             Y                Z                Ä   */
+	ONE(0x58),       ONE(0x59),       ONE(0x5a),       TWO(0xc3, 0x84),
+	/* Ö             Ñ                Ü                §   */
+    TWO(0xc3, 0x96), TWO(0xc3, 0x91), TWO(0xc3, 0x9c), TWO(0xc2, 0xa7),
+	/* ¿             a                b                c   */
+	TWO(0xc2, 0xbf), ONE(0x61),       ONE(0x62),       ONE(0x63),
+	/* d             e                f                g   */
+	ONE(0x64),       ONE(0x65),       ONE(0x66),       ONE(0x67),
+	/* h             i                j                k   */
+	ONE(0x68),       ONE(0x69),       ONE(0x6a),       ONE(0x6b),
+	/* l             m                n                o   */
+	ONE(0x6c),       ONE(0x6d),       ONE(0x6e),       ONE(0x6f),
+	/* p             q                r                s   */
+	ONE(0x70),       ONE(0x71),       ONE(0x72),       ONE(0x73),
+	/* t             u                v                w   */
+	ONE(0x74),       ONE(0x75),       ONE(0x76),       ONE(0x77),
+	/* x             y                z                ä   */
+	ONE(0x78),       ONE(0x79),       ONE(0x7a),       TWO(0xc3, 0xa4),
+    /* ö             ñ                ü                à   */
+    TWO(0xc3, 0xb6), TWO(0xc3, 0xb1), TWO(0xc3, 0xbc), TWO(0xc3, 0xa0)
+};
+
+static guint8  /* number of UTF-8 bytes written; 0 when gsm is out of range */
+gsm_def_char_to_utf8 (const guint8 gsm, guint8 out_utf8[2])
+{
+    g_return_val_if_fail (gsm < GSM_DEF_ALPHABET_SIZE, 0);
+    memcpy (out_utf8, gsm_def_utf8_alphabet[gsm].chars, gsm_def_utf8_alphabet[gsm].len);
+    return gsm_def_utf8_alphabet[gsm].len;
+}
+
+static gboolean
+utf8_to_gsm_def_char (const char *utf8, guint32 len, guint8 *out_gsm)
+{
+    guint idx;
+
+    /* A mapping entry holds 1..3 bytes; any other length cannot match */
+    if (len == 0 || len >= 4)
+        return FALSE;
+    for (idx = 0; idx < GSM_DEF_ALPHABET_SIZE; idx++) {
+        const GsmUtf8Mapping *map = &gsm_def_utf8_alphabet[idx];
+        if (map->len == len && memcmp (map->chars, utf8, len) == 0) {
+            *out_gsm = idx;   /* the table index is the GSM character value */
+            return TRUE;
+        }
+    }
+    return FALSE;
+}
+
+
+#define EONE(a, g)        { {a, 0x00, 0x00}, 1, g }  /* one-byte UTF-8 entry with GSM code g */
+#define ETHR(a, b, c, g)  { {a, b,    c},    3, g }  /* three-byte UTF-8 entry with GSM code g */
+
+/**
+ * gsm_ext_utf8_alphabet:
+ *
+ * Mapping from GSM extended alphabet to UTF-8.
+ * Each entry stores the UTF-8 bytes plus the GSM code that follows the escape char.
+ */
+static const GsmUtf8Mapping gsm_ext_utf8_alphabet[GSM_EXT_ALPHABET_SIZE] = {
+    /* form feed      ^                 {                 }  */
+    EONE(0x0c, 0x0a), EONE(0x5e, 0x14), EONE(0x7b, 0x28), EONE(0x7d, 0x29),
+    /* \              [                 ~                 ]  */
+    EONE(0x5c, 0x2f), EONE(0x5b, 0x3c), EONE(0x7e, 0x3d), EONE(0x5d, 0x3e),
+    /* |              €                                      */
+    EONE(0x7c, 0x40), ETHR(0xe2, 0x82, 0xac, 0x65)
+};
+
+#define GSM_ESCAPE_CHAR 0x1b  /* marks the next byte as an extended-alphabet code */
+
+static guint8
+gsm_ext_char_to_utf8 (const guint8 gsm, guint8 out_utf8[3])
+{
+    const GsmUtf8Mapping *map = gsm_ext_utf8_alphabet;
+    const GsmUtf8Mapping *const end = map + GSM_EXT_ALPHABET_SIZE;
+    for (; map < end; map++) {   /* linear search by GSM code */
+        if (map->gsm == gsm) {
+            memcpy (out_utf8, map->chars, map->len);
+            return map->len;     /* number of UTF-8 bytes copied */
+        }
+    }
+    return 0;   /* not an extended-alphabet code */
+}
+
+static gboolean
+utf8_to_gsm_ext_char (const char *utf8, guint32 len, guint8 *out_gsm)
+{
+    guint idx;
+
+    /* A mapping entry holds 1..3 bytes; any other length cannot match */
+    if (len == 0 || len >= 4)
+        return FALSE;
+    for (idx = 0; idx < GSM_EXT_ALPHABET_SIZE; idx++) {
+        const GsmUtf8Mapping *map = &gsm_ext_utf8_alphabet[idx];
+        if (map->len == len && memcmp (map->chars, utf8, len) == 0) {
+            *out_gsm = map->gsm;   /* code to send after the escape char */
+            return TRUE;
+        }
+    }
+    return FALSE;
+}
+
+guint8 *
+mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm, guint32 len)
+{
+    guint32 i;
+    GByteArray *utf8;
+    /* Returns a newly allocated, NUL-terminated UTF-8 string (NULL on bad args) */
+    g_return_val_if_fail (gsm != NULL, NULL);
+    g_return_val_if_fail (len < 4096, NULL);
+
+    /* worst case initial length: two UTF-8 bytes per GSM byte, plus the NUL */
+    utf8 = g_byte_array_sized_new (len * 2 + 1);
+
+    for (i = 0; i < len; i++) {
+        guint8 uchars[4];
+        guint8 ulen;
+
+        if (gsm[i] == GSM_ESCAPE_CHAR) {
+            /* Extended alphabet: decode the next byte, never reading past the input */
+            ulen = (i + 1 < len) ? gsm_ext_char_to_utf8 (gsm[i+1], uchars) : 0;
+            if (ulen)
+                i += 1;
+        } else {
+            /* Default alphabet */
+            ulen = gsm_def_char_to_utf8 (gsm[i], uchars);
+        }
+
+        if (ulen)
+            g_byte_array_append (utf8, &uchars[0], ulen);
+        else
+            g_byte_array_append (utf8, (guint8 *) "?", 1);  /* undecodable input */
+    }
+
+    g_byte_array_append (utf8, (guint8 *) "\0", 1);  /* NULL terminator */
+    return g_byte_array_free (utf8, FALSE);
+}
+
+guint8 *
+mm_charset_utf8_to_unpacked_gsm (const char *utf8, guint32 *out_len)
+{
+    GByteArray *gsm;
+    const char *c = utf8, *next = c;
+    static const guint8 gesc = GSM_ESCAPE_CHAR;
+
+    /* Returns a newly allocated buffer of unpacked GSM bytes and stores the
+     * number of bytes in out_len; returns NULL on NULL or invalid UTF-8 input. */
+    g_return_val_if_fail (utf8 != NULL, NULL);
+    g_return_val_if_fail (out_len != NULL, NULL);
+    g_return_val_if_fail (g_utf8_validate (utf8, -1, NULL), NULL);
+
+    /* worst case initial length: escape char plus code for every character */
+    gsm = g_byte_array_sized_new (g_utf8_strlen (utf8, -1) * 2 + 1);
+
+    if (*utf8 == 0x00) {
+        /* Zero-length string */
+        g_byte_array_append (gsm, (guint8 *) "\0", 1);
+        *out_len = 0;
+        return g_byte_array_free (gsm, FALSE);
+    }
+
+    while (next && *next) {
+        guint8 gch = 0x3f;  /* 0x3f == '?' */
+
+        next = g_utf8_next_char (c);
+
+        /* Try escaped chars first, then default alphabet */
+        if (utf8_to_gsm_ext_char (c, next - c, &gch)) {
+            /* Add the escape char */
+            g_byte_array_append (gsm, &gesc, 1);
+        } else if (!utf8_to_gsm_def_char (c, next - c, &gch))
+            gch = 0x3f;  /* in neither alphabet: emit '?' instead of dropping it */
+        g_byte_array_append (gsm, &gch, 1);
+
+        c = next;
+    }
+
+    *out_len = gsm->len;
+    return g_byte_array_free (gsm, FALSE);
+}
+
diff --git a/src/mm-charsets.h b/src/mm-charsets.h
index 5fa34065..ff39400e 100644
--- a/src/mm-charsets.h
+++ b/src/mm-charsets.h
@@ -39,7 +39,7 @@ MMModemCharset mm_modem_charset_from_string (const char *string);
  * UTF-8 encoded.
  */
 gboolean mm_modem_charset_byte_array_append (GByteArray *array,
-                                             const char *string,
+                                             const char *utf8,
                                              gboolean quoted,
                                              MMModemCharset charset);
 
@@ -48,5 +48,9 @@ gboolean mm_modem_charset_byte_array_append (GByteArray *array,
  */
 char *mm_modem_charset_hex_to_utf8 (const char *src, MMModemCharset charset);
 
+guint8 *mm_charset_utf8_to_unpacked_gsm (const char *utf8, guint32 *out_len);  /* UTF-8 -> unpacked GSM 03.38; byte count stored in out_len */
+
+guint8 *mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm, guint32 len);  /* unpacked GSM 03.38 -> NUL-terminated UTF-8 */
+
 #endif /* MM_CHARSETS_H */
 
diff --git a/src/tests/Makefile.am b/src/tests/Makefile.am
index 74255dbc..77bd4775 100644
--- a/src/tests/Makefile.am
+++ b/src/tests/Makefile.am
@@ -1,7 +1,7 @@
 INCLUDES = \
 	-I$(top_srcdir)/src
 
-noinst_PROGRAMS = test-modem-helpers
+noinst_PROGRAMS = test-modem-helpers test-charsets
 
 test_modem_helpers_SOURCES = \
 	test-modem-helpers.c
@@ -13,10 +13,21 @@ test_modem_helpers_LDADD = \
 	$(top_builddir)/src/libmodem-helpers.la \
 	$(MM_LIBS)
 
+test_charsets_SOURCES = \
+	test-charsets.c
+
+test_charsets_CPPFLAGS = \
+	$(MM_CFLAGS)
+
+test_charsets_LDADD = \
+	$(top_builddir)/src/libmodem-helpers.la \
+	$(MM_LIBS)
+
 if WITH_TESTS
 
 check-local: test-modem-helpers
 	$(abs_builddir)/test-modem-helpers
+	$(abs_builddir)/test-charsets
 
 endif
 
diff --git a/src/tests/test-charsets.c b/src/tests/test-charsets.c
new file mode 100644
index 00000000..80518dc5
--- /dev/null
+++ b/src/tests/test-charsets.c
@@ -0,0 +1,119 @@
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details:
+ *
+ * Copyright (C) 2010 Red Hat, Inc.
+ */
+
+#include <glib.h>
+#include <string.h>
+
+#include "mm-modem-helpers.h"
+
+static void
+test_def_chars (void *f, gpointer d)
+{
+    /* Round-trip every character of the GSM 03.38 default alphabet:
+     * UTF-8 -> unpacked GSM -> UTF-8 must reproduce the input exactly.
+     */
+    static const char *input = "@£$¥èéùìòÇ\nØø\rÅåΔ_ΦΓΛΩΠΨΣΘΞÆæßÉ !\"#¤%&'()*+,-./0123456789:;<=>?¡ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÑÜ§¿abcdefghijklmnopqrstuvwxyzäöñüà";
+    guint32 gsm_len = 0;
+    guint8 *encoded, *decoded;
+
+    /* UTF-8 to GSM: the expected length is checked below */
+    encoded = mm_charset_utf8_to_unpacked_gsm (input, &gsm_len);
+    g_assert (encoded);
+    g_assert_cmpint (gsm_len, ==, 127);
+
+    /* GSM back to UTF-8: must equal the original string */
+    decoded = mm_charset_gsm_unpacked_to_utf8 (encoded, gsm_len);
+    g_assert (decoded);
+    g_assert_cmpstr (input, ==, (const char *) decoded);
+
+    g_free (decoded);
+    g_free (encoded);
+}
+
+static void
+test_esc_chars (void *f, gpointer d)
+{
+    /* Round-trip every character of the GSM 03.38 extended alphabet:
+     * UTF-8 -> unpacked GSM -> UTF-8 must reproduce the input exactly.
+     */
+    static const char *input = "\f^{}\\[~]|€";
+    guint32 gsm_len = 0;
+    guint8 *encoded, *decoded;
+
+    /* UTF-8 to GSM: each character becomes escape char + code */
+    encoded = mm_charset_utf8_to_unpacked_gsm (input, &gsm_len);
+    g_assert (encoded);
+    g_assert_cmpint (gsm_len, ==, 20);
+
+    /* GSM back to UTF-8: must equal the original string */
+    decoded = mm_charset_gsm_unpacked_to_utf8 (encoded, gsm_len);
+    g_assert (decoded);
+    g_assert_cmpstr (input, ==, (const char *) decoded);
+
+    g_free (decoded);
+    g_free (encoded);
+}
+
+static void
+test_mixed_chars (void *f, gpointer d)
+{
+    /* Round-trip a string mixing default and extended GSM 03.38 characters:
+     * UTF-8 -> unpacked GSM -> UTF-8 must reproduce the input exactly.
+     */
+    static const char *input = "@£$¥èéùìø\fΩΠΨΣΘ{ΞÆæß(})789\\:;<=>[?¡QRS]TUÖ|ÑÜ§¿abpqrstuvöñüà€";
+    guint32 gsm_len = 0;
+    guint8 *encoded, *decoded;
+
+    /* UTF-8 to GSM: the expected length is checked below */
+    encoded = mm_charset_utf8_to_unpacked_gsm (input, &gsm_len);
+    g_assert (encoded);
+    g_assert_cmpint (gsm_len, ==, 69);
+
+    /* GSM back to UTF-8: must equal the original string */
+    decoded = mm_charset_gsm_unpacked_to_utf8 (encoded, gsm_len);
+    g_assert (decoded);
+    g_assert_cmpstr (input, ==, (const char *) decoded);
+
+    g_free (decoded);
+    g_free (encoded);
+}
+
+
+#if GLIB_CHECK_VERSION(2,25,12)  /* pick the test-function type by GLib version */
+typedef GTestFixtureFunc TCFunc;
+#else
+typedef void (*TCFunc)(void);
+#endif
+
+#define TESTCASE(t, d) g_test_create_case (#t, 0, d, NULL, (TCFunc) t, NULL)  /* case named after function t, with test data d */
+
+int
+main (int argc, char **argv)
+{
+    GTestSuite *root;
+
+    g_test_init (&argc, &argv, NULL);
+
+    /* Register every charset round-trip test on the root suite */
+    root = g_test_get_root ();
+
+    g_test_suite_add (root, TESTCASE (test_def_chars, NULL));
+    g_test_suite_add (root, TESTCASE (test_esc_chars, NULL));
+    g_test_suite_add (root, TESTCASE (test_mixed_chars, NULL));
+
+    /* The process exit status is whatever g_test_run() reports */
+    return g_test_run ();
+}
+
author	Dan Williams <dcbw@redhat.com>	2010-09-02 19:29:05 -0500
committer	Dan Williams <dcbw@redhat.com>	2010-09-02 19:29:05 -0500
commit	9e94dd5b6124d00cf10d6296c7c9aa80f8f68d80 (patch)
tree	78be39d22a363099512a36ebf8b71756bd559162 /src
parent	85ce5446759092968c6540b9d842c5bc777abb74 (diff)