From be9157f3e9b156dd62c73637ce20e2908ab00eda Mon Sep 17 00:00:00 2001 From: "Dirk-Jan C. Binnema" Date: Fri, 12 Nov 2010 22:36:13 +0200 Subject: [PATCH] * mu-msg-str.c: add Latin-Extended-A for normalization, update unit test --- src/mu-msg-str.c | 331 ++++++++++++++++++++++++++++-------- src/tests/test-mu-msg-str.c | 1 + 2 files changed, 261 insertions(+), 71 deletions(-) diff --git a/src/mu-msg-str.c b/src/mu-msg-str.c index 65d46fa0..c75f23ef 100644 --- a/src/mu-msg-str.c +++ b/src/mu-msg-str.c @@ -26,6 +26,8 @@ + + char* mu_msg_str_normalize (const char *str, gboolean downcase) { @@ -41,92 +43,279 @@ mu_msg_str_normalize (const char *str, gboolean downcase) output = g_new0 (char, 2 * strlen(str)); for (i = 0, cur = (const guchar*)str; *cur; ++cur) { - if (*cur != 0xc3) { /* != latin-1 supplement? */ + if (*cur < 0xc3 || *cur > 0xc5) { output[i++] = *cur; continue; } - ++cur; - switch (*cur) { + if (*cur == 0xc3) { /* latin-1 supplement */ + ++cur; + switch (*cur) { + + case 0x80: + case 0x81: + case 0x82: + case 0x83: + case 0x84: + case 0x85: output[i++] = 'A'; break; - case 0x80: - case 0x81: - case 0x82: - case 0x83: - case 0x84: - case 0x85: output[i++] = 'A'; break; - - case 0x86: output[i++] = 'A'; output[i++] = 'e'; break; - case 0x87: output[i++] = 'C'; break; + case 0x86: output[i++] = 'A'; output[i++] = 'e'; break; + case 0x87: output[i++] = 'C'; break; + + case 0x88: + case 0x89: + case 0x8a: + case 0x8b: output[i++] = 'E'; break; + + case 0x8c: + case 0x8d: + case 0x8e: + case 0x8f: output[i++] = 'I'; break; + + case 0x90: output[i++] = 'D'; break; + case 0x91: output[i++] = 'N'; break; + + case 0x92: + case 0x93: + case 0x94: + case 0x95: + case 0x96: output[i++] = 'O'; break; + + case 0x99: + case 0x9a: + case 0x9b: + case 0x9c: output[i++] = 'U'; break; - case 0x88: - case 0x89: - case 0x8a: - case 0x8b: output[i++] = 'E'; break; + case 0x9d: output[i++] = 'Y'; break; + case 0x9e: output[i++] = 'T'; output[i++] = 'h'; break; + case 0x9f: 
output[i++] = 's'; output[i++] = 's'; break; + + case 0xa0: + case 0xa1: + case 0xa2: + case 0xa3: + case 0xa4: + case 0xa5: output[i++] = 'a'; break; + + case 0xa6: output[i++] = 'a'; output[i++] = 'e'; break; + case 0xa7: output[i++] = 'c'; break; + + case 0xa8: + case 0xa9: + case 0xaa: + case 0xab: output[i++] = 'e'; break; + + case 0xac: + case 0xad: + case 0xae: + case 0xaf: output[i++] = 'i'; break; + + case 0xb0: output[i++] = 'd'; break; + case 0xb1: output[i++] = 'n'; break; + + case 0xb2: + case 0xb3: + case 0xb4: + case 0xb5: + case 0xb6: output[i++] = 'o'; break; + + case 0xb9: + case 0xba: + case 0xbb: + case 0xbc: output[i++] = 'u'; break; + + case 0xbd: output[i++] = 'y'; break; + case 0xbe: output[i++] = 't'; output[i++] = 'h'; break; + case 0xbf: output[i++] = 'y'; break; + + default: + output[i++] = *cur; + } + } else if (*cur == 0xc4) { /* Latin Extended-A (0xc4) */ + ++cur; + switch (*cur) { + case 0x80: + case 0x82: + case 0x84: output[i++] = 'A'; break; - case 0x8c: - case 0x8d: - case 0x8e: - case 0x8f: output[i++] = 'I'; break; + case 0x86: + case 0x88: + case 0x8a: + case 0x8c: output[i++] = 'C'; break; - case 0x90: output[i++] = 'D'; break; - case 0x91: output[i++] = 'N'; break; - - case 0x92: - case 0x93: - case 0x94: - case 0x95: - case 0x96: output[i++] = 'O'; break; + case 0x8e: + case 0x90: output[i++] = 'D'; break; - case 0x99: - case 0x9a: - case 0x9b: - case 0x9c: output[i++] = 'U'; break; - - case 0x9d: output[i++] = 'Y'; break; - case 0x9e: output[i++] = 'T'; output[i++] = 'h'; break; - case 0x9f: output[i++] = 's'; output[i++] = 's'; break; - - case 0xa0: - case 0xa1: - case 0xa2: - case 0xa3: - case 0xa4: - case 0xa5: output[i++] = 'a'; break; + case 0x92: + case 0x94: + case 0x96: + case 0x98: + case 0x9a: output[i++] = 'E'; break; - case 0xa6: output[i++] = 'a'; output[i++] = 'e'; break; - case 0xa7: output[i++] = 'c'; break; - - case 0xa8: - case 0xa9: - case 0xaa: - case 0xab: output[i++] = 'e'; break; + case 0x9c: + case 
0x9e: + case 0xa0: + case 0xa2: output[i++] = 'G'; break; - case 0xac: - case 0xad: - case 0xae: - case 0xaf: output[i++] = 'i'; break; + case 0xa4: + case 0xa6: output[i++] = 'H'; break; - case 0xb0: output[i++] = 'd'; break; - case 0xb1: output[i++] = 'n'; break; - - case 0xb2: - case 0xb3: - case 0xb4: - case 0xb5: - case 0xb6: output[i++] = 'o'; break; + case 0xa8: + case 0xaa: + case 0xac: + case 0xae: + case 0xb0: output[i++] = 'I'; break; + + case 0xb2: output[i++] = 'I'; output[i++] = 'J'; break; - case 0xb9: - case 0xba: - case 0xbb: - case 0xbc: output[i++] = 'u'; break; + case 0xb4: output[i++] = 'J'; break; - case 0xbd: output[i++] = 'y'; break; - case 0xbe: output[i++] = 't'; output[i++] = 'h'; break; - case 0xbf: output[i++] = 'y'; break; + case 0xb6: output[i++] = 'K'; break; - default: - output[i++] = *cur; + case 0xb9: + case 0xbb: + case 0xbd: + case 0xbf: output[i++] = 'L'; break; + + case 0x81: + case 0x83: + case 0x85: output[i++] = 'a'; break; + + case 0x87: + case 0x89: + case 0x8b: + case 0x8d: output[i++] = 'c'; break; + + case 0x8f: + case 0x91: output[i++] = 'd'; break; + + case 0x93: + case 0x95: + case 0x97: + case 0x99: + case 0x9b: output[i++] = 'e'; break; + + case 0x9d: + case 0x9f: + case 0xa1: + case 0xa3: output[i++] = 'g'; break; + + case 0xa5: + case 0xa7: output[i++] = 'h'; break; + + case 0xa9: + case 0xab: + case 0xad: + case 0xaf: + case 0xb1: output[i++] = 'i'; break; + + case 0xb3: output[i++] = 'i'; output[i++] = 'j'; break; + + case 0xb5: output[i++] = 'j'; break; + + case 0xb7: + case 0xb8: output[i++] = 'k'; break; + + case 0xba: + case 0xbc: + case 0xbe: output[i++] = 'l'; break; + + default: + output[i++] = *cur; + } + + } else { /* Latin Extended-A (0xc5) */ + ++cur; + switch (*cur) { + case 0x81: output[i++] = 'L'; break; + + case 0x83: + case 0x85: + case 0x87: output[i++] = 'N'; break; + + case 0x8c: + case 0x8e: + case 0x90: output[i++] = 'O'; break; + + case 0x92: output[i++] = 'O'; output[i++] = 'e'; break; + 
+ case 0x94: + case 0x96: + case 0x98: output[i++] = 'R'; break; + + case 0x9a: + case 0x9c: + case 0x9e: + case 0xa0: output[i++] = 'S'; break; + + case 0xa2: + case 0xa4: + case 0xa6: output[i++] = 'T'; break; + + case 0xa8: + case 0xaa: + case 0xac: + case 0xae: + case 0xb0: + case 0xb2: output[i++] = 'U'; break; + + case 0xb4: output[i++] = 'W'; break; + + case 0xb6: + case 0xb8: + output[i++] = 'Y'; break; + + case 0xb9: + case 0xbb: + case 0xbd: output[i++] = 'Z'; break; + + case 0x80: + case 0x82: output[i++] = 'l'; break; + + case 0x84: + case 0x86: + case 0x88: + case 0x89: + case 0x8a: + case 0x8b: output[i++] = 'n'; break; + + case 0x8d: + case 0x8f: + case 0x91: output[i++] = 'o'; break; + + case 0x93: output[i++] = 'o'; output[i++] = 'e'; break; + + case 0x95: + case 0x97: + case 0x99: output[i++] = 'r'; break; + + case 0x9b: + case 0x9d: + case 0x9f: + case 0xa1: output[i++] = 's'; break; + + case 0xa3: + case 0xa5: + case 0xa7: output[i++] = 't'; break; + + case 0xa9: + case 0xab: + case 0xad: + case 0xaf: + case 0xb1: + case 0xb3: output[i++] = 'u'; break; + + case 0xb5: output[i++] = 'w'; break; + + case 0xb7: output[i++] = 'y'; break; + + case 0xba: + case 0xbc: + case 0xbe: output[i++] = 'z'; break; + + case 0xbf: output[i++] = 's'; break; + + } } } diff --git a/src/tests/test-mu-msg-str.c b/src/tests/test-mu-msg-str.c index 70e7794e..691606a0 100644 --- a/src/tests/test-mu-msg-str.c +++ b/src/tests/test-mu-msg-str.c @@ -137,6 +137,7 @@ test_mu_msg_str_normalize_01 (void) { "dantès", "dantes"}, { "foo", "foo" }, { "Föö", "foo" }, + { "číslo", "cislo" }, { "hÆvý mëÐal ümláõt", "haevy medal umlaot"} };