diff --git a/lib/mu-str-normalize.c b/lib/mu-str-normalize.c deleted file mode 100644 index d240161a..00000000 --- a/lib/mu-str-normalize.c +++ /dev/null @@ -1,402 +0,0 @@ -/* -** Copyright (C) 2012-2013 Dirk-Jan C. Binnema -** -** This program is free software; you can redistribute it and/or modify -** it under the terms of the GNU General Public License as published by -** the Free Software Foundation; either version 3 of the License, or -** (at your option) any later version. -** -** This program is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -** GNU General Public License for more details. -** -** You should have received a copy of the GNU General Public License -** along with this program; if not, write to the Free Software Foundation, -** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -** -*/ - -#if HAVE_CONFIG_H -#include "config.h" -#endif /*HAVE_CONFIG_H*/ - - -#include -#include -#include - -#include "mu-str.h" - - -char* -mu_str_normalize (const char *str, gboolean downcase, GStringChunk *strchunk) -{ - char *mystr; - - g_return_val_if_fail (str, NULL); - - if (strchunk) - mystr = g_string_chunk_insert (strchunk, str); - else - mystr = g_strdup (str); - - return mu_str_normalize_in_place (mystr, downcase, strchunk); -} - - - -/* - * this implementation works for accented chars in Unicode Blocks - * 'Latin-1 Supplement' and 'Latin Extended-A'. An alternative (slower - * but much simpler) implementation would be to use g_utf8_normalize - * to decompose characters in the accent part and the character part, - * and then get rid of the former. That would be slower than what we - * do here, but also more *complete*. It's unclear whether it would - * be slower *in practice* => needs checking - */ - -/* we can normalize in-place, as the normalized string will never be - * longer than the original. 
even for replacements that are 2 chars - * wide (e.g. German ß => ss), the replacement is 2 bytes, like the - * original 0xc3 0x9f - * - * note-to-self: http://www.geertvanderploeg.com/unicode-gen/ - */ -char* -mu_str_normalize_in_place (char *str, gboolean downcase, GStringChunk *strchunk) -{ - const guchar *cur; - int i; - - g_return_val_if_fail (str, NULL); - - if (*str == '\0') - return str; - - for (i = 0, cur = (const guchar*)str; *cur; ++cur) { - - /* special case for plain-old ascii */ - if ((*cur < 0x80)) { - str[i++] = downcase ? tolower (*cur) : *cur; - continue; - } - - if (*cur == 0xc3) { /* latin-1 supplement */ - ++cur; - switch (*cur) { - - case 0x80: - case 0x81: - case 0x82: - case 0x83: - case 0x84: - case 0x85: str[i++] = downcase ? 'a' : 'A' ; break; - - case 0x86: - str[i++] = downcase ? 'a' : 'A' ; - str[i++] = 'e'; - break; - - case 0x87: str[i++] = downcase ? 'c' : 'C'; break; - - case 0x88: - case 0x89: - case 0x8a: - case 0x8b: - str[i++] = downcase ? 'e' : 'E'; - break; - - case 0x8c: - case 0x8d: - case 0x8e: - case 0x8f: str[i++] = downcase ? 'i': 'I'; break; - - case 0x90: str[i++] = downcase ? 'd' : 'D'; break; - case 0x91: str[i++] = downcase ? 'n' : 'N'; break; - - case 0x92: - case 0x93: - case 0x94: - case 0x95: - case 0x96: str[i++] = downcase ? 'o' : 'O'; break; - - case 0x99: - case 0x9a: - case 0x9b: - case 0x9c: str[i++] = downcase ? 'u' : 'U'; break; - - case 0x9d: str[i++] = downcase ? 'y' : 'Y'; break; - - case 0x9e: - str[i++] = downcase ? 
't' : 'T'; - str[i++] = 'h'; - break; - - case 0x9f: str[i++] = 's'; str[i++] = 's'; break; - - case 0xa0: - case 0xa1: - case 0xa2: - case 0xa3: - case 0xa4: - case 0xa5: str[i++] = 'a'; break; - - case 0xa6: str[i++] = 'a'; str[i++] = 'e'; break; - case 0xa7: str[i++] = 'c'; break; - - case 0xa8: - case 0xa9: - case 0xaa: - case 0xab: str[i++] = 'e'; break; - - case 0xac: - case 0xad: - case 0xae: - case 0xaf: str[i++] = 'i'; break; - - case 0xb0: str[i++] = 'd'; break; - case 0xb1: str[i++] = 'n'; break; - - case 0xb2: - case 0xb3: - case 0xb4: - case 0xb5: - case 0xb6: str[i++] = 'o'; break; - - case 0xb9: - case 0xba: - case 0xbb: - case 0xbc: str[i++] = 'u'; break; - - case 0xbd: str[i++] = 'y'; break; - case 0xbe: str[i++] = 't'; str[i++] = 'h'; break; - case 0xbf: str[i++] = 'y'; break; - - default: - str[i++] = *cur; - } - - } else if (*cur == 0xc4) { /* Latin Extended-A (0x04) */ - ++cur; - switch (*cur) { - case 0x80: - case 0x82: - case 0x84: str[i++] = downcase ? 'a' : 'A'; break; - - case 0x86: - case 0x88: - case 0x8a: - case 0x8c: str[i++] = downcase ? 'c' : 'C'; break; - - case 0x8e: - case 0x90: str[i++] = downcase ? 'd' : 'D'; break; - - case 0x92: - case 0x94: - case 0x96: - case 0x98: - case 0x9a: str[i++] = downcase ? 'e' : 'E'; break; - - case 0x9c: - case 0x9e: - case 0xa0: - case 0xa2: str[i++] = downcase ? 'g' : 'G'; break; - - case 0xa4: - case 0xa6: str[i++] = downcase ? 'h' : 'H'; break; - - case 0xa8: - case 0xaa: - case 0xac: - case 0xae: - case 0xb0: str[i++] = downcase ? 'i' : 'I'; break; - - case 0xb2: - str[i++] = downcase ? 'i' : 'I'; - str[i++] = downcase ? 'j' : 'J'; - break; - - - case 0xb4: str[i++] = downcase ? 'j' : 'J'; break; - - case 0xb6: str[i++] = downcase ? 'k' : 'K'; break; - - case 0xb9: - case 0xbb: - case 0xbd: - case 0xbf: str[i++] = downcase ? 
'l': 'L'; break; - - case 0x81: - case 0x83: - case 0x85: str[i++] = 'a'; break; - - case 0x87: - case 0x89: - case 0x8b: - case 0x8d: str[i++] = 'c'; break; - - case 0x8f: - case 0x91: str[i++] = 'd'; break; - - case 0x93: - case 0x95: - case 0x97: - case 0x99: - case 0x9b: str[i++] = 'e'; break; - - case 0x9d: - case 0x9f: - case 0xa1: - case 0xa: str[i++] = 'g'; break; - - case 0xa5: - case 0xa7: str[i++] = 'h'; break; - - case 0xa9: - case 0xab: - case 0xad: - case 0xaf: - case 0xb1: str[i++] = 'i'; break; - - case 0xb3: str[i++] = 'i'; str[i++] = 'j'; break; - - case 0xb5: str[i++] = 'j'; break; - - case 0xb7: - case 0xb8: str[i++] = 'k'; break; - - case 0xba: - case 0xbc: - case 0xbe: str[i++] = 'l'; break; - - default: str[i++] = *cur; break; - - } - - } else if (*cur == 0xc5) { /* Latin Extended-A (0xc5) */ - ++cur; - switch (*cur) { - case 0x81: str[i++] = downcase ? 'l': 'L'; break; - - case 0x83: - case 0x85: - case 0x87: str[i++] = downcase ? 'n': 'N'; break; - - case 0x8c: - case 0x8e: - case 0x90: str[i++] = downcase ? 'o': 'O'; break; - - case 0x92: - str[i++] = downcase ? 'o': 'O'; - str[i++] = 'e'; - break; - - case 0x94: - case 0x96: - case 0x98: str[i++] = downcase ? 'r': 'R'; break; - - case 0x9a: - case 0x9c: - case 0x9e: - case 0xa0: str[i++] = downcase ? 's': 'S'; break; - - case 0xa2: - case 0xa4: - case 0xa6: str[i++] = downcase ? 't': 'T'; break; - - case 0xa8: - case 0xaa: - case 0xac: - case 0xae: - case 0xb0: - case 0xb2: str[i++] = downcase ? 'u': 'U'; break; - case 0xb4: str[i++] = downcase ? 'w': 'W'; break; - - case 0xb6: - case 0xb8: str[i++] = downcase ? 'y': 'Y'; break; - - case 0xb9: - case 0xbb: - case 0xbd: str[i++] = downcase ? 
'z': 'Z'; break; - - case 0x80: - case 0x82: str[i++] = 'l'; break; - - case 0x84: - case 0x86: - case 0x88: - case 0x89: - case 0x8a: - case 0x8b: str[i++] = 'n'; break; - - case 0x8d: - case 0x8f: - case 0x91: str[i++] = 'o'; break; - - case 0x93: str[i++] = 'o'; str[i++] = 'e'; break; - - case 0x95: - case 0x97: - case 0x99: str[i++] = 'r'; break; - - case 0x9b: - case 0x9d: - case 0x9f: - case 0xa1: str[i++] = 's'; break; - - case 0xa3: - case 0xa5: - case 0xa7: str[i++] = 't'; break; - - case 0xa9: - case 0xab: - case 0xad: - case 0xaf: - case 0xb1: - case 0xb3: str[i++] = 'u'; break; - - case 0xb5: str[i++] = 'w'; break; - - case 0xb7: str[i++] = 'y'; break; - - case 0xba: - case 0xbc: - case 0xbe: str[i++] = 'z'; break; - - case 0xbf: str[i++] = 's'; break; - - default: str[i++] = *cur; break; - } - - } else { - /* our fast-path for latin-utf8 does not work - * -- bummer! just append the character then - * */ - gunichar uc; - char buf[7]; - size_t len1, len2; - - len1 = g_utf8_next_char ((char*)cur) - (char*)cur; - uc = g_utf8_get_char ((char*)cur); - - if (downcase) - uc = g_unichar_tolower (uc); - - len2 = g_unichar_to_utf8 (uc, buf); - - /* if the new char fits where the old char was, - * change it. otherwise, don't bother. 
*/
-
-			if (len1 == len2) {
-				memcpy (str + i, buf, len2);
-				i += len2;
-			}
-		}
-
-	}
-
-	str[i] = '\0';
-
-	return str;
-}
diff --git a/lib/mu-str.c b/lib/mu-str.c
index 276f0557..52b6c2f0 100644
--- a/lib/mu-str.c
+++ b/lib/mu-str.c
@@ -257,78 +257,62 @@ mu_str_to_list (const char *str, char sepa, gboolean strip)
 
 	return lst;
 }
-
-static gchar*
-eat_esc_string (char **strlst, GError **err)
-{
-	char *str;
-	gboolean quoted;
-	GString *gstr;
-
-	str = g_strchug (*strlst);
-	gstr = g_string_sized_new (strlen(str));
-
-	for (quoted = FALSE; *str; ++str) {
-
-		if (*str == '"') {
-			quoted = !quoted;
-			continue;
-		} else if (*str == '\\') {
-			if (str[1] != ' ' && str[1] != '"' && str[1] != '\\')
-				goto err; /* invalid escaping */
-			g_string_append_c (gstr, str[1]);
-			++str;
-			continue;
-		} else if (*str == ' ' && !quoted) {
-			++str;
-			goto leave;
-		} else
-			g_string_append_c (gstr, *str);
-	}
-leave:
-	*strlst = str;
-	return g_string_free (gstr, FALSE);
-err:
-	g_set_error (err, MU_ERROR_DOMAIN, MU_ERROR_IN_PARAMETERS,
-		     "error parsing string '%s'", g_strchug(*strlst));
-	*strlst = NULL;
-	return g_string_free (gstr, TRUE);
-}
-
-
 
+/*
+ * Split 'strings' on spaces into a newly allocated list of strings.
+ * A double-quoted part is not split on the spaces inside it; the
+ * '"' characters themselves stay in the resulting item. Returns NULL
+ * when there are no items; free the list with mu_str_free_list.
+ */
 GSList*
-mu_str_esc_to_list (const char *strings, GError **err)
+mu_str_esc_to_list (const char *strings)
 {
 	GSList *lst;
-	char *mystrings, *freeme;
-	const char* cur;
+	GString *part;
+	const char *cur;
+	gboolean quoted;
 
 	g_return_val_if_fail (strings, NULL);
 
-	for (cur = strings; *cur && (*cur == ' ' || *cur == '\t'); ++cur);
-	freeme = mystrings = g_strdup (cur);
+	part = g_string_new (NULL);
 
-	lst = NULL;
-	do {
-		gchar *str;
-		str = eat_esc_string (&mystrings, err);
-		if (str)
-			lst = g_slist_prepend (lst, str);
-		else {
-			g_free (freeme);
-			mu_str_free_list (lst);
-			return NULL;
+	/* stop at the terminating NUL rather than calling strlen ()
+	 * again for every character */
+	for (cur = strings, lst = NULL, quoted = FALSE; *cur; ++cur) {
+
+		char kar;
+		kar = *cur;
+
+		if (quoted && kar != '"') {
+			g_string_append_c (part, kar);
+			continue;
 		}
-	} while (mystrings && *mystrings);
 
+		switch (kar) {
+		case '"':
+			quoted = !quoted;
+			g_string_append_c (part, kar);
+			continue;
+		case ' ':
+			if (part->len > 0) {
+				lst = g_slist_prepend
+					(lst, g_string_free (part, FALSE));
+				part = g_string_new (NULL);
+			}
+			continue;
+		default:
+			g_string_append_c (part, kar);
+		}
+	}
+
+	if (part->len)
+		lst = g_slist_prepend (lst, g_string_free (part, FALSE));
+	else
+		g_string_free (part, TRUE); /* nothing pending; don't leak it */
 
-	g_free (freeme);
 	return g_slist_reverse (lst);
 }
 
-
-
 void
 mu_str_free_list (GSList *lst)
 {
@@ -451,147 +435,129 @@ check_for_field (const char *str, gboolean *is_field,
 
 
+/*
+ * Escaping helper for process_str. Looks at the character at *cur
+ * (uc is the same character, decoded) and, if it needs special
+ * treatment, appends its replacement to gstr. With query_esc set, the
+ * query metachars ':', '(', ')', '*' and '"' are copied as they are,
+ * and so is a '..' that is not followed by a third '.' (*cur is then
+ * moved to the second '.'). All other punctuation and blanks become '_'.
+ *
+ * Returns TRUE when the character was handled here, so the caller has
+ * to skip it; FALSE when the caller should process it itself.
+ */
 static gboolean
-is_xapian_special_char (char c)
+handle_esc_maybe (GString *gstr, char **cur, gunichar uc,
+		  gboolean query_esc)
 {
-	switch (c) {
+	char kar;
 
-	case '@':
-	case '.':
-	case ',':
-	case '/':
-	case '[':
-	case ']':
-	case '+':
-	case '-':
-	case ' ':
-	case ':':
-	case '(':
-	case ')':
-	case '$':
-	case '"':
-	case '\\':
-	case '\'':
-	case '*':
-		return TRUE;
-	default:
-		return FALSE;
-	}
-}
+	kar = (*cur)[0];
 
-#define ESC_CHAR '_'
-
-/*
- * Xapian treats various characters such as '@', '-', ':' and '.'
- * specially; function below is an ugly hack to make it DWIM in most
- * cases...
- *
- * function expects search terms (not complete queries)
- * */
-char*
-mu_str_xapian_escape_in_place_try (char *term, gboolean esc_space, GStringChunk *strchunk)
-{
-	unsigned char *cur;
-	char lookback;
-	gboolean is_field, is_range_field, quoted;
-	unsigned colon;
-
-	g_return_val_if_fail (term, NULL);
-
-	check_for_field (term, &is_field, &is_range_field);
-
-	for (colon = 0, lookback = 0, quoted=FALSE, cur = (unsigned char*)term;
-	     *cur; ++cur) {
-
-		if (*cur == '\\')
-			quoted = !quoted;
-
-		switch (*cur) {
-
-		case '.': /* escape '..' if it's not a range field */
-			if (cur[1] == '.') {
-				if (!is_range_field) {
-					*cur = ESC_CHAR;
-					*(cur + 1) = ESC_CHAR;
-				}
-				++cur;
-			} else if (isblank(lookback) || isblank(cur[1]) ||
-				   cur[1] == '\0')
-				*cur = ' ';
-			else
-				*cur = ESC_CHAR;
-			break;
+	if (query_esc) {
+		switch (kar) {
 		case ':':
-			/* if there's a registered xapian prefix
-			 * before the *first* ':', don't touch
-			 * it. Otherwise replace ':' with ' '... ugh
-			 * yuck ugly...
-			 */
-			if (colon != 0 || !is_field)
-				*cur = ' ';
-			++colon;
-			break;
-		case '@':
-		case '/':
-		case '[':
-		case ']':
-		case '+':
-		case '$':
-		case '\\':
-		case '-':
-			*cur = ESC_CHAR;
-			break;
-		case ' ':
-		case '_':
 		case '(':
 		case ')':
+		case '*':
 		case '"':
-		case '\'':
-		case '*': /* wildcard */
-			break; /* leave as they are */
-		default:
-			/* turn other stuff into spaces */
-			if (*cur < 0x80 && !isalnum (*cur))
-				*cur = ' ';
+			g_string_append_c (gstr, kar);
+			return TRUE;
+		case '.':
+			if ((*cur)[1] == '.' && (*cur)[2] != '.') {
+				g_string_append (gstr, "..");
+				*cur = g_utf8_next_char (*cur);
+				return TRUE;
+			}
+			/* fallthrough */
+		default:
 			break;
 		}
-
-		lookback = *cur;
 	}
 
-	/* downcase try to remove accents etc. */
-	return mu_str_normalize_in_place (term, TRUE, strchunk);
+	if (g_unichar_ispunct(uc) || isblank((unsigned char)kar)) {
+		g_string_append_c (gstr, '_');
+		return TRUE;
+	}
+
+	return FALSE;
 }
 
-char*
-mu_str_xapian_escape (const char *query, gboolean esc_space, GStringChunk *strchunk)
+
+/*
+ * Flatten str: normalize it with G_NORMALIZE_ALL, drop all combining
+ * marks (accents) and convert the rest to lowercase. With xapian_esc
+ * set, each character is first offered to handle_esc_maybe, which
+ * also receives query_esc.
+ *
+ * Returns a newly allocated string (free with g_free), or NULL when
+ * str is not valid UTF-8 and thus cannot be normalized.
+ */
+static char*
+process_str (const char *str, gboolean xapian_esc, gboolean query_esc)
 {
-	char *mystr;
+	GString *gstr;
+	char *norm, *cur;
 
-	g_return_val_if_fail (query, NULL);
+	norm = g_utf8_normalize (str, -1, G_NORMALIZE_ALL);
+	if (!norm) /* g_utf8_normalize fails on invalid UTF-8 */
+		return NULL;
+
+	gstr = g_string_sized_new (strlen (norm));
 
-	if (strchunk)
-		mystr = g_string_chunk_insert (strchunk, query);
-	else
-		mystr = g_strdup (query);
+	for (cur = norm; *cur; cur = g_utf8_next_char (cur)) {
 
-	return mu_str_xapian_escape_in_place_try (mystr, esc_space, strchunk);
+		gunichar uc;
+
+		uc = g_utf8_get_char (cur);
+
+		if (xapian_esc)
+			if (handle_esc_maybe (gstr, &cur, uc, query_esc))
+				continue;
+
+		if (g_unichar_ismark(uc))
+			continue;
+
+		/* maybe add some special cases, such as Spaß->spass ?
+		 */
+
+		uc = g_unichar_tolower (uc);
+		g_string_append_unichar (gstr, uc);
+	}
+
+	g_free (norm);
+
+	/* g_print ("-->%s\n", gstr->str); */
+
+	return g_string_free (gstr, FALSE);
 }
 
 char*
-mu_str_xapian_escape_term (const char *term, GStringChunk *strchunk)
+mu_str_process_text (const char *str)
 {
-	char *cur, *esc;
+	g_return_val_if_fail (str, NULL);
 
-	g_return_val_if_fail (term, NULL);
-	g_return_val_if_fail (strchunk, NULL);
+	return process_str (str, FALSE, FALSE);
 
-	for (cur = esc = mu_str_normalize (term, TRUE, strchunk);
-	     *cur; ++cur) {
-		if (is_xapian_special_char (*cur))
-			*cur = ESC_CHAR;
-	}
+}
+
+
+char*
+mu_str_process_term (const char *str)
+{
+	g_return_val_if_fail (str, NULL);
+
+	return process_str (str, TRUE, FALSE);
+
+}
+
+
+char*
+mu_str_process_query_term (const char *str)
+{
+	g_return_val_if_fail (str, NULL);
+
+	return process_str (str, TRUE, TRUE);
 
-	return esc;
 }
 
 
diff --git a/lib/mu-str.h b/lib/mu-str.h
index 88a82095..73186398 100644
--- a/lib/mu-str.h
+++ b/lib/mu-str.h
@@ -106,86 +106,42 @@ char* mu_str_flags (MuFlags flags)
 char* mu_str_summarize (const char* str, size_t max_lines)
 	G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;
 
-/**
- * normalize a string (ie., collapse accented characters etc.), and
- * optionally, downcase it. Works for accented chars in Unicode Blocks
- * 'Latin-1 Supplement' and 'Latin Extended-A'
- *
- * @param str a valid utf8 string or NULL
- * @param downcase if TRUE, convert the string to lowercase
- * @param strchunk (optional) if non-NULL, allocate strings on strchunk
- *
- * @return the normalized string, or NULL in case of error or str was
- * NULL. Unless strchunk was provided, user must g_free the string when
- * no longer needed
- */
-char* mu_str_normalize (const char *str, gboolean downcase,
-			GStringChunk *strchunk)
-	G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;
 
 /**
- * normalize a string (ie., collapse accented characters etc.), and
- * optionally, downcase it.
this happen by changing the string; if
- * that is not desired, use mu_str_normalize. Works for accented chars
- * in Unicode Blocks 'Latin-1 Supplement' and 'Latin Extended-A'
+ * Process some text (e.g. message bodies): flatten it (remove accents
+ * etc.) and convert it to lowercase; punctuation is left as it is.
  *
- * @param str a valid utf8 string or NULL
- * @param downcase if TRUE, convert the string to lowercase
- * @param strchunk (optional) if non-NULL, allocate strings on strchunk
+ * @param text some text; must not be NULL
  *
- * @return the normalized string, or NULL in case of error or str was
- * NULL. User only needs to free the returned string if a) return
- * value != str and b) strchunk was not provided.
+ * @return the processed text or NULL in case of error, free with g_free
  */
-char* mu_str_normalize_in_place (char *str, gboolean downcase,
-				 GStringChunk *strchunk);
+char* mu_str_process_text (const char *text)
+	G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;
 
 /**
- * escape the string for use with xapian matching. in practice, if the
- * string contains an '@', replace '@', single-'.' with '_'. Also,
- * replace ':' with '_', if it's not following a xapian-prefix (such
- * as 'subject:', 't:' etc, as defined in mu-msg-fields.[ch]).
- * changing is done in-place (by changing the argument string). in any
- * case, the string will be downcased.
+ * Process some term (e.g., an e-mail address, subject field): remove
+ * accents, convert to lowercase, replace punctuation and blanks by _
  *
- * @param query a query string
- * @param esc_space escape space characters as well
- * @param strchunk (optional) if non-NULL, allocate strings on strchunk
- *
- * @return the escaped string or NULL in case of error. User only
- * needs to free the returned string if a) return value != query and b)
- * strchunk was not provided.
+ * @param term some term; must not be NULL
  *
+ * @return the processed term or NULL in case of error, free with g_free
  */
-char* mu_str_xapian_escape_in_place_try (char *query, gboolean esc_space,
-					 GStringChunk *strchunk);
+char* mu_str_process_term (const char *term)
+	G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;
+
 
 /**
- * escape the string for use with xapian matching. in practice, if the
- * string contains an '@', replace '@', single-'.' with '_'. Also,
- * replace ':' with '_', if it's not following a xapian-prefix (such
- * as 'subject:', 't:' etc, as defined in mu-msg-fields.[ch]).
+ * Process some query term (e.g., an e-mail address, subject field):
+ * like mu_str_process_term, but the query metachars ':', '(', ')', '*'
+ * and '"', and a '..' not followed by another '.', are left alone.
  *
- * @param str a string
- * @param esc_space escape space characters as well
- * @param strchunk (optional) if non-NULL, allocate strings on strchunk
+ * @param qterm some query term; must not be NULL
  *
- * @return the escaped string (free with g_free) or NULL in case of error
- * Unless strchunk was provided, user must g_free the string when
- * no longer needed
+ * @return the processed term or NULL in case of error, free with g_free
  */
-char* mu_str_xapian_escape (const char *str, gboolean esc_space,
-			    GStringChunk *strchunk) G_GNUC_WARN_UNUSED_RESULT;
+char* mu_str_process_query_term (const char *qterm)
+	G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;
 
-/**
- * escape the xapian term
- *
- * @param str a string
- * @param strchunk allocate strings on strchunk
- *
- * @return the escaped string, which is allocated in the strchunk
- */
-char* mu_str_xapian_escape_term (const char *term, GStringChunk *strchunk);
 
 /**
  * Fixup values for some fields in the DWIM manner:
@@ -315,14 +271,14 @@ GSList* mu_str_to_list (const char *str, char sepa, gboolean strip);
 /**
  * convert a string (with possible escaping) to a list. list items are
  * separated by one or more spaces. list items can be quoted (using
- * '"'), and '"', ' ' and '\' use their special meaning when prefixed
- * with \.
+ * '"'); the quote characters are kept in the list items.
  *
  * @param str a string
  *
- * @return a list of elements or NULL in case of error
+ * @return a list of elements or NULL in case of error, free with
+ * mu_str_free_list
  */
-GSList* mu_str_esc_to_list (const char *str, GError **err);
+GSList* mu_str_esc_to_list (const char *str);
 
 
 /**