diff --git a/src/mu-msg-str.c b/src/mu-msg-str.c
index c75f23ef..84be4473 100644
--- a/src/mu-msg-str.c
+++ b/src/mu-msg-str.c
@@ -16,6 +16,10 @@
 ** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 **
 */
+#if HAVE_CONFIG_H
+#include "config.h"
+#endif /*HAVE_CONFIG_H*/
+
 
 #include
 #include
@@ -24,27 +28,50 @@
 #include "mu-msg-str.h"
 #include "mu-msg-flags.h"
 
-
-
-
+static char* normalize_in_place_downcase (char *str);
 
 char*
 mu_msg_str_normalize (const char *str, gboolean downcase)
+{
+	g_return_val_if_fail (str, NULL);
+
+	return mu_msg_str_normalize_in_place (g_strdup(str), downcase);
+}
+
+/* we can normalize in-place, as the normalized string will never be
+ * longer than the original. even for replacements that are 2 chars
+ * wide (e.g. German ß => ss), the replacement is 2 bytes, like the
+ * original 0xc3 0x9f
+ */
+char*
+mu_msg_str_normalize_in_place (char *str, gboolean downcase)
 {
 	const guchar *cur;
-	gchar *output;
 	int i;
 
 	g_return_val_if_fail (str, NULL);
-
-	if (*str == '\0')
-		return g_strdup ("");
 
-	output = g_new0 (char, 2 * strlen(str));
+	if (*str == '\0')
+		return str;
+
+#ifdef MU_DISABLE_NORMALIZATION
+	{
+		if (downcase) {
+			gchar *c;
+			for (c = str; *c; ++c)
+				*c = g_ascii_tolower (*c); /* ASCII only; leaves utf8 bytes alone */
+		}
+		return str;
+	}
+#endif /*MU_DISABLE_NORMALIZATION*/
+
+
+	if (downcase)
+		return normalize_in_place_downcase (str);
 
 	for (i = 0, cur = (const guchar*)str; *cur; ++cur) {
-		if (*cur < 0xc3 || *cur > 0xc5) {
-			output[i++] = *cur;
+		if (*cur < 0xc3 || *cur > 0xc5 || cur[1] == '\0') { /* plain byte, or truncated sequence */
+			str[i++] = *cur;
 			continue;
 		}
 
@@ -57,278 +84,566 @@ mu_msg_str_normalize (const char *str, gboolean downcase)
 			case 0x82:
 			case 0x83:
 			case 0x84:
-			case 0x85: output[i++] = 'A'; break;
+			case 0x85: str[i++] = 'A'; break;
 
-			case 0x86: output[i++] = 'A'; output[i++] = 'e'; break;
-			case 0x87: output[i++] = 'C'; break;
+			case 0x86: str[i++] = 'A'; str[i++] = 'e'; break;
+			case 0x87: str[i++] = 'C'; break;
 
 			case 0x88:
 			case 0x89:
 			case 0x8a:
-			case 0x8b: output[i++] = 'E'; break;
+			case 0x8b: str[i++] = 'E'; break;
 
 			case 0x8c:
 			case 0x8d:
 			case 0x8e:
-			case 0x8f: output[i++] = 'I'; break;
+			case 0x8f: str[i++] = 'I'; break;
 
-			case 0x90: output[i++] = 'D'; break;
-			case 0x91: output[i++] = 'N'; break;
+			case 0x90: str[i++] = 'D'; break;
+			case 0x91: str[i++] = 'N'; break;
 
 			case 0x92:
 			case 0x93:
 			case 0x94:
 			case 0x95:
-			case 0x96: output[i++] = 'O'; break;
+			case 0x96: str[i++] = 'O'; break;
 
 			case 0x99:
 			case 0x9a:
 			case 0x9b:
-			case 0x9c: output[i++] = 'U'; break;
+			case 0x9c: str[i++] = 'U'; break;
 
-			case 0x9d: output[i++] = 'Y'; break;
-			case 0x9e: output[i++] = 'T'; output[i++] = 'h'; break;
-			case 0x9f: output[i++] = 's'; output[i++] = 's'; break;
+			case 0x9d: str[i++] = 'Y'; break;
+			case 0x9e: str[i++] = 'T'; str[i++] = 'h'; break;
+			case 0x9f: str[i++] = 's'; str[i++] = 's'; break;
 
 			case 0xa0:
 			case 0xa1:
 			case 0xa2:
 			case 0xa3:
 			case 0xa4:
-			case 0xa5: output[i++] = 'a'; break;
+			case 0xa5: str[i++] = 'a'; break;
 
-			case 0xa6: output[i++] = 'a'; output[i++] = 'e'; break;
-			case 0xa7: output[i++] = 'c'; break;
+			case 0xa6: str[i++] = 'a'; str[i++] = 'e'; break;
+			case 0xa7: str[i++] = 'c'; break;
 
 			case 0xa8:
 			case 0xa9:
 			case 0xaa:
-			case 0xab: output[i++] = 'e'; break;
+			case 0xab: str[i++] = 'e'; break;
 
 			case 0xac:
 			case 0xad:
 			case 0xae:
-			case 0xaf: output[i++] = 'i'; break;
+			case 0xaf: str[i++] = 'i'; break;
 
-			case 0xb0: output[i++] = 'd'; break;
-			case 0xb1: output[i++] = 'n'; break;
+			case 0xb0: str[i++] = 'd'; break;
+			case 0xb1: str[i++] = 'n'; break;
 
 			case 0xb2:
 			case 0xb3:
 			case 0xb4:
 			case 0xb5:
-			case 0xb6: output[i++] = 'o'; break;
+			case 0xb6: str[i++] = 'o'; break;
 
 			case 0xb9:
 			case 0xba:
 			case 0xbb:
-			case 0xbc: output[i++] = 'u'; break;
+			case 0xbc: str[i++] = 'u'; break;
 
-			case 0xbd: output[i++] = 'y'; break;
-			case 0xbe: output[i++] = 't'; output[i++] = 'h'; break;
-			case 0xbf: output[i++] = 'y'; break;
+			case 0xbd: str[i++] = 'y'; break;
+			case 0xbe: str[i++] = 't'; str[i++] = 'h'; break;
+			case 0xbf: str[i++] = 'y'; break;
 
 			default:
-				output[i++] = *cur;
+				str[i++] = cur[-1]; str[i++] = *cur; /* unmapped: keep both bytes, stay valid utf8 */
 			}
 		} else if (*cur == 0xc4) { /* Latin Extended-A (0x04) */
 			++cur;
 			switch (*cur) {
 			case 0x80:
 			case 0x82:
-			case 0x84: output[i++] = 'A'; break;
+			case 0x84: str[i++] = 'A'; break;
 
 			case 0x86:
 			case 0x88:
 			case 0x8a:
-			case 0x8c: output[i++] = 'C'; break;
+			case 0x8c: str[i++] = 'C'; break;
 
 			case 0x8e:
-			case 0x90: output[i++] = 'D'; break;
+			case 0x90: str[i++] = 'D'; break;
 
 			case 0x92:
 			case 0x94:
 			case 0x96:
 			case 0x98:
-			case 0x9a: output[i++] = 'E'; break;
+			case 0x9a: str[i++] = 'E'; break;
 
 			case 0x9c:
 			case 0x9e:
 			case 0xa0:
-			case 0xa2: output[i++] = 'G'; break;
+			case 0xa2: str[i++] = 'G'; break;
 
 			case 0xa4:
-			case 0xa6: output[i++] = 'H'; break;
+			case 0xa6: str[i++] = 'H'; break;
 
 			case 0xa8:
 			case 0xaa:
 			case 0xac:
 			case 0xae:
-			case 0xb0: output[i++] = 'I'; break;
+			case 0xb0: str[i++] = 'I'; break;
 
-			case 0xb2: output[i++] = 'I'; output[i++] = 'J'; break;
+			case 0xb2: str[i++] = 'I'; str[i++] = 'J'; break;
 
-			case 0xb4: output[i++] = 'J'; break;
+			case 0xb4: str[i++] = 'J'; break;
 
-			case 0xb6: output[i++] = 'K'; break;
+			case 0xb6: str[i++] = 'K'; break;
 
 			case 0xb9:
 			case 0xbb:
 			case 0xbd:
-			case 0xbf: output[i++] = 'L'; break;
+			case 0xbf: str[i++] = 'L'; break;
 
 			case 0x81:
 			case 0x83:
-			case 0x85: output[i++] = 'a'; break;
+			case 0x85: str[i++] = 'a'; break;
 
 			case 0x87:
 			case 0x89:
 			case 0x8b:
-			case 0x8d: output[i++] = 'c'; break;
+			case 0x8d: str[i++] = 'c'; break;
 
 			case 0x8f:
-			case 0x91: output[i++] = 'd'; break;
+			case 0x91: str[i++] = 'd'; break;
 
 			case 0x93:
 			case 0x95:
 			case 0x97:
 			case 0x99:
-			case 0x9b: output[i++] = 'e'; break;
+			case 0x9b: str[i++] = 'e'; break;
 
 			case 0x9d:
 			case 0x9f:
 			case 0xa1:
-			case 0xa: output[i++] = 'g'; break;
+			case 0xa3: str[i++] = 'g'; break; /* was 0xa (typo): g-cedilla is 0xc4 0xa3 */
 
 			case 0xa5:
-			case 0xa7: output[i++] = 'h'; break;
+			case 0xa7: str[i++] = 'h'; break;
 
 			case 0xa9:
 			case 0xab:
 			case 0xad:
 			case 0xaf:
-			case 0xb1: output[i++] = 'i'; break;
+			case 0xb1: str[i++] = 'i'; break;
 
-			case 0xb3: output[i++] = 'i'; output[i++] = 'j'; break;
+			case 0xb3: str[i++] = 'i'; str[i++] = 'j'; break;
 
-			case 0xb5: output[i++] = 'j'; break;
+			case 0xb5: str[i++] = 'j'; break;
 
 			case 0xb7:
-			case 0xb8: output[i++] = 'k'; break;
+			case 0xb8: str[i++] = 'k'; break;
 
 			case 0xba:
 			case 0xbc:
-			case 0xbe: output[i++] = 'l'; break;
+			case 0xbe: str[i++] = 'l'; break;
 
 			default:
-				output[i++] = *cur;
+				str[i++] = cur[-1]; str[i++] = *cur; /* unmapped: keep both bytes, stay valid utf8 */
 			}
 
 		} else { /* Latin Extended-A (0xc5) */
 			++cur;
 			switch (*cur) {
-			case 0x81: output[i++] = 'L'; break;
+			case 0x81: str[i++] = 'L'; break;
 
 			case 0x83:
 			case 0x85:
-			case 0x87: output[i++] = 'N'; break;
+			case 0x87: str[i++] = 'N'; break;
 
 			case 0x8c:
 			case 0x8e:
-			case 0x90: output[i++] = 'O'; break;
+			case 0x90: str[i++] = 'O'; break;
 
-			case 0x92: output[i++] = 'O'; output[i++] = 'e'; break;
+			case 0x92: str[i++] = 'O'; str[i++] = 'e'; break;
 
 			case 0x94:
 			case 0x96:
-			case 0x98: output[i++] = 'R'; break;
+			case 0x98: str[i++] = 'R'; break;
 
 			case 0x9a:
 			case 0x9c:
 			case 0x9e:
-			case 0xa0: output[i++] = 'S'; break;
+			case 0xa0: str[i++] = 'S'; break;
 
 			case 0xa2:
 			case 0xa4:
-			case 0xa6: output[i++] = 'T'; break;
+			case 0xa6: str[i++] = 'T'; break;
 
 			case 0xa8:
 			case 0xaa:
 			case 0xac:
 			case 0xae:
 			case 0xb0:
-			case 0xb2: output[i++] = 'U'; break;
+			case 0xb2: str[i++] = 'U'; break;
 
-			case 0xb4: output[i++] = 'W'; break;
+			case 0xb4: str[i++] = 'W'; break;
 
 			case 0xb6:
 			case 0xb8:
-				output[i++] = 'Y'; break;
+				str[i++] = 'Y'; break;
 
 			case 0xb9:
 			case 0xbb:
-			case 0xbd: output[i++] = 'Z'; break;
+			case 0xbd: str[i++] = 'Z'; break;
 
 			case 0x80:
-			case 0x82: output[i++] = 'l'; break;
+			case 0x82: str[i++] = 'l'; break;
 
 			case 0x84:
 			case 0x86:
 			case 0x88:
 			case 0x89:
 			case 0x8a:
-			case 0x8b: output[i++] = 'n'; break;
+			case 0x8b: str[i++] = 'n'; break;
 
 			case 0x8d:
 			case 0x8f:
-			case 0x91: output[i++] = 'o'; break;
+			case 0x91: str[i++] = 'o'; break;
 
-			case 0x93: output[i++] = 'o'; output[i++] = 'e'; break;
+			case 0x93: str[i++] = 'o'; str[i++] = 'e'; break;
 
 			case 0x95:
 			case 0x97:
-			case 0x99: output[i++] = 'r'; break;
+			case 0x99: str[i++] = 'r'; break;
 
 			case 0x9b:
 			case 0x9d:
 			case 0x9f:
-			case 0xa1: output[i++] = 's'; break;
+			case 0xa1: str[i++] = 's'; break;
 
 			case 0xa3:
 			case 0xa5:
-			case 0xa7: output[i++] = 't'; break;
+			case 0xa7: str[i++] = 't'; break;
 
 			case 0xa9:
 			case 0xab:
 			case 0xad:
 			case 0xaf:
 			case 0xb1:
-			case 0xb3: output[i++] = 'u'; break;
+			case 0xb3: str[i++] = 'u'; break;
 
-			case 0xb5: output[i++] = 'w'; break;
+			case 0xb5: str[i++] = 'w'; break;
 
-			case 0xb7: output[i++] = 'y'; break;
+			case 0xb7: str[i++] = 'y'; break;
 
 			case 0xba:
 			case 0xbc:
-			case 0xbe: output[i++] = 'z'; break;
+			case 0xbe: str[i++] = 'z'; break;
 
-			case 0xbf: output[i++] = 's'; break;
+			case 0xbf: str[i++] = 's'; break;
 
 			}
 		}
 	}
 
-	output [i] = '\0';
-
-	/* for utf8, this should not interfere with anything it shouldn't... */
-	if (downcase) {
-		gchar *c;
-		for (c = output; *c; ++c)
-			*c = tolower (*c);
-	}
+	str [i] = '\0';
 
-	return output;
+	return str;
+}
+
+
+/* same as the non-downcasing loop in mu_msg_str_normalize_in_place,
+ * but every replacement is lowercase and all other bytes are
+ * ASCII-downcased on the fly; rewrites 'str' in place and returns it
+ */
+static char*
+normalize_in_place_downcase (char *str)
+{
+	const guchar *cur;
+	int i;
+
+	if (*str == '\0')
+		return str;
+
+	for (i = 0, cur = (const guchar*)str; *cur; ++cur) {
+
+		if (G_UNLIKELY (*cur == 0xc3 && cur[1] != '\0')) { /* latin-1 supplement */
+			++cur;
+			switch (*cur) {
+
+			case 0x80:
+			case 0x81:
+			case 0x82:
+			case 0x83:
+			case 0x84:
+			case 0x85: str[i++] = 'a'; break;
+
+			case 0x86: str[i++] = 'a'; str[i++] = 'e'; break;
+			case 0x87: str[i++] = 'c'; break;
+
+			case 0x88:
+			case 0x89:
+			case 0x8a:
+			case 0x8b: str[i++] = 'e'; break;
+
+			case 0x8c:
+			case 0x8d:
+			case 0x8e:
+			case 0x8f: str[i++] = 'i'; break;
+
+			case 0x90: str[i++] = 'd'; break;
+			case 0x91: str[i++] = 'n'; break;
+
+			case 0x92:
+			case 0x93:
+			case 0x94:
+			case 0x95:
+			case 0x96: str[i++] = 'o'; break;
+
+			case 0x99:
+			case 0x9a:
+			case 0x9b:
+			case 0x9c: str[i++] = 'u'; break;
+
+			case 0x9d: str[i++] = 'y'; break;
+			case 0x9e: str[i++] = 't'; str[i++] = 'h'; break;
+			case 0x9f: str[i++] = 's'; str[i++] = 's'; break; /* ß => ss (was "ts") */
+
+			case 0xa0:
+			case 0xa1:
+			case 0xa2:
+			case 0xa3:
+			case 0xa4:
+			case 0xa5: str[i++] = 'a'; break;
+
+			case 0xa6: str[i++] = 'a'; str[i++] = 'e'; break;
+			case 0xa7: str[i++] = 'c'; break;
+
+			case 0xa8:
+			case 0xa9:
+			case 0xaa:
+			case 0xab: str[i++] = 'e'; break;
+
+			case 0xac:
+			case 0xad:
+			case 0xae:
+			case 0xaf: str[i++] = 'i'; break;
+
+			case 0xb0: str[i++] = 'd'; break;
+			case 0xb1: str[i++] = 'n'; break;
+
+			case 0xb2:
+			case 0xb3:
+			case 0xb4:
+			case 0xb5:
+			case 0xb6: str[i++] = 'o'; break;
+
+			case 0xb9:
+			case 0xba:
+			case 0xbb:
+			case 0xbc: str[i++] = 'u'; break;
+
+			case 0xbd: str[i++] = 'y'; break;
+			case 0xbe: str[i++] = 't'; str[i++] = 'h'; break;
+			case 0xbf: str[i++] = 'y'; break;
+
+			default:
+				str[i++] = cur[-1]; str[i++] = *cur; /* unmapped: keep both bytes, stay valid utf8 */
+			}
+		} else if (G_UNLIKELY (*cur == 0xc4 && cur[1] != '\0')) { /* Latin Extended-A (0x04) */
+			++cur;
+			switch (*cur) {
+			case 0x80:
+			case 0x82:
+			case 0x84: str[i++] = 'a'; break;
+
+			case 0x86:
+			case 0x88:
+			case 0x8a:
+			case 0x8c: str[i++] = 'c'; break;
+
+			case 0x8e:
+			case 0x90: str[i++] = 'd'; break;
+
+			case 0x92:
+			case 0x94:
+			case 0x96:
+			case 0x98:
+			case 0x9a: str[i++] = 'e'; break;
+
+			case 0x9c:
+			case 0x9e:
+			case 0xa0:
+			case 0xa2: str[i++] = 'g'; break;
+
+			case 0xa4:
+			case 0xa6: str[i++] = 'h'; break;
+
+			case 0xa8:
+			case 0xaa:
+			case 0xac:
+			case 0xae:
+			case 0xb0: str[i++] = 'i'; break;
+
+			case 0xb2: str[i++] = 'i'; str[i++] = 'j'; break;
+
+			case 0xb4: str[i++] = 'j'; break;
+
+			case 0xb6: str[i++] = 'k'; break;
+
+			case 0xb9:
+			case 0xbb:
+			case 0xbd:
+			case 0xbf: str[i++] = 'l'; break;
+
+			case 0x81:
+			case 0x83:
+			case 0x85: str[i++] = 'a'; break;
+
+			case 0x87:
+			case 0x89:
+			case 0x8b:
+			case 0x8d: str[i++] = 'c'; break;
+
+			case 0x8f:
+			case 0x91: str[i++] = 'd'; break;
+
+			case 0x93:
+			case 0x95:
+			case 0x97:
+			case 0x99:
+			case 0x9b: str[i++] = 'e'; break;
+
+			case 0x9d:
+			case 0x9f:
+			case 0xa1:
+			case 0xa3: str[i++] = 'g'; break; /* was 0xa (typo): g-cedilla is 0xc4 0xa3 */
+
+			case 0xa5:
+			case 0xa7: str[i++] = 'h'; break;
+
+			case 0xa9:
+			case 0xab:
+			case 0xad:
+			case 0xaf:
+			case 0xb1: str[i++] = 'i'; break;
+
+			case 0xb3: str[i++] = 'i'; str[i++] = 'j'; break;
+
+			case 0xb5: str[i++] = 'j'; break;
+
+			case 0xb7:
+			case 0xb8: str[i++] = 'k'; break;
+
+			case 0xba:
+			case 0xbc:
+			case 0xbe: str[i++] = 'l'; break;
+
+			default:
+				str[i++] = cur[-1]; str[i++] = *cur; /* unmapped: keep both bytes, stay valid utf8 */
+			}
+
+		} else if (G_UNLIKELY (*cur == 0xc5 && cur[1] != '\0')) { /* Latin Extended-A (0xc5) */
+			++cur;
+			switch (*cur) {
+			case 0x81: str[i++] = 'l'; break;
+
+			case 0x83:
+			case 0x85:
+			case 0x87: str[i++] = 'n'; break;
+
+			case 0x8c:
+			case 0x8e:
+			case 0x90: str[i++] = 'o'; break;
+
+			case 0x92: str[i++] = 'o'; str[i++] = 'e'; break;
+
+			case 0x94:
+			case 0x96:
+			case 0x98: str[i++] = 'r'; break;
+
+			case 0x9a:
+			case 0x9c:
+			case 0x9e:
+			case 0xa0: str[i++] = 's'; break;
+
+			case 0xa2:
+			case 0xa4:
+			case 0xa6: str[i++] = 't'; break;
+
+			case 0xa8:
+			case 0xaa:
+			case 0xac:
+			case 0xae:
+			case 0xb0:
+			case 0xb2: str[i++] = 'u'; break;
+
+			case 0xb4: str[i++] = 'w'; break;
+
+			case 0xb6:
+			case 0xb8:
+				str[i++] = 'y'; break;
+
+			case 0xb9:
+			case 0xbb:
+			case 0xbd: str[i++] = 'z'; break;
+
+			case 0x80:
+			case 0x82: str[i++] = 'l'; break;
+
+			case 0x84:
+			case 0x86:
+			case 0x88:
+			case 0x89:
+			case 0x8a:
+			case 0x8b: str[i++] = 'n'; break;
+
+			case 0x8d:
+			case 0x8f:
+			case 0x91: str[i++] = 'o'; break;
+
+			case 0x93: str[i++] = 'o'; str[i++] = 'e'; break;
+
+			case 0x95:
+			case 0x97:
+			case 0x99: str[i++] = 'r'; break;
+
+			case 0x9b:
+			case 0x9d:
+			case 0x9f:
+			case 0xa1: str[i++] = 's'; break;
+
+			case 0xa3:
+			case 0xa5:
+			case 0xa7: str[i++] = 't'; break;
+
+			case 0xa9:
+			case 0xab:
+			case 0xad:
+			case 0xaf:
+			case 0xb1:
+			case 0xb3: str[i++] = 'u'; break;
+
+			case 0xb5: str[i++] = 'w'; break;
+
+			case 0xb7: str[i++] = 'y'; break;
+
+			case 0xba:
+			case 0xbc:
+			case 0xbe: str[i++] = 'z'; break;
+
+			case 0xbf: str[i++] = 's'; break;
+
+			default:
+				str[i++] = cur[-1]; str[i++] = *cur; /* unmapped: keep both bytes, stay valid utf8 */
+			}
+		} else
+			str[i++] = g_ascii_tolower (*cur); /* locale-independent; never touches bytes >= 0x80 */
+	}
+
+	str [i] = '\0';
+
+	return str;
 }
 
 
@@ -369,10 +684,6 @@ mu_msg_str_display_date_s (time_t t)
 	return mu_msg_str_date_s ("%X", t);
 }
 
-
-
-
-
 const char*
 mu_msg_str_size_s (size_t s)
 {
diff --git a/src/mu-msg-str.h b/src/mu-msg-str.h
index 28b184af..14068371 100644
--- a/src/mu-msg-str.h
+++ b/src/mu-msg-str.h
@@ -61,7 +61,7 @@ char* mu_msg_str_date (const char* frm, time_t t) G_GNUC_WARN_UNUSED_RES
 const char* mu_msg_str_display_date_s (time_t t);
 
 
-/** 
+/**
  * create a 'display contact' from an email header To/Cc/Bcc/From-type address
  * ie., turn
  * "Foo Bar"
@@ -144,9 +144,10 @@ char* mu_msg_str_summarize (const char* str,
 
 
 
-/** 
+/**
  * normalize a string (ie., collapse accented characters etc.), and
  * optionally, downcase it
+ *
  *
  * @param str a valid utf8 string or NULL
  * @param downcase if TRUE, convert the string to lowercase
@@ -155,6 +156,20 @@ char* mu_msg_str_summarize (const char* str,
  */
 char* mu_msg_str_normalize (const char *str, gboolean downcase);
 
+
+/**
+ * normalize a string (ie., collapse accented characters etc.), and
+ * optionally, downcase it. this happens by modifying the string in place;
+ * if that is not desired, use mu_msg_str_normalize
+ *
+ * @param str a valid utf8 string or NULL
+ * @param downcase if TRUE, convert the string to lowercase
+ *
+ * @return the normalized string, or NULL in case of error or str was NULL
+ */
+char* mu_msg_str_normalize_in_place (char *str, gboolean downcase);
+
+
 G_END_DECLS
 
 #endif /*__MU_MSG_STR_H__*/