* mu-str: simplify, cleanup string pre-processing functions

This commit is contained in:
djcb 2013-05-13 00:01:49 +03:00
parent 2f60f33dc8
commit d26f3c0bae
3 changed files with 132 additions and 645 deletions

View File

@ -1,402 +0,0 @@
/*
** Copyright (C) 2012-2013 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation; either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation,
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/
#if HAVE_CONFIG_H
#include "config.h"
#endif /*HAVE_CONFIG_H*/
#include <glib.h>
#include <string.h>
#include <ctype.h>
#include "mu-str.h"
/*
 * Normalize a copy of @str, leaving the original untouched.
 *
 * The copy is inserted into @strchunk when one is given; otherwise it
 * is duplicated with g_strdup. The copy is then handed to
 * mu_str_normalize_in_place together with @downcase and @strchunk,
 * and that function's result is returned.
 *
 * Returns NULL when @str is NULL.
 */
char*
mu_str_normalize (const char *str, gboolean downcase, GStringChunk *strchunk)
{
	char *copy;

	g_return_val_if_fail (str, NULL);

	copy = strchunk ? g_string_chunk_insert (strchunk, str)
			: g_strdup (str);

	return mu_str_normalize_in_place (copy, downcase, strchunk);
}
/*
* this implementation works for accented chars in Unicode Blocks
* 'Latin-1 Supplement' and 'Latin Extended-A'. An alternative (slower
* but much simpler) implementation would be to use g_utf8_normalize
* to decompose characters in the accent part and the character part,
* and then get rid of the former. That would be slower than what we
* do here, but also more *complete*. It's unclear whether it would
* be slower *in practice* => needs checking
*/
/* we can normalize in-place, as the normalized string will never be
* longer than the original. even for replacements that are 2 chars
* wide (e.g. German ß => ss), the replacement is 2 bytes, like the
* original 0xc3 0x9f
*
* note-to-self: http://www.geertvanderploeg.com/unicode-gen/
*/
/*
 * Normalize @str in place: replace accented characters from the
 * 'Latin-1 Supplement' (lead byte 0xc3) and 'Latin Extended-A' (lead
 * bytes 0xc4 and 0xc5) blocks by plain ASCII letters and, when
 * @downcase is TRUE, convert to lowercase.
 *
 * @str      a NUL-terminated, writable string; expected to be UTF-8
 * @downcase if TRUE, lowercase the result
 * @strchunk not used by this function
 *
 * Returns @str (NULL if @str is NULL). The result is never longer than
 * the input, because every replacement is at most as many bytes as the
 * sequence it replaces.
 *
 * Notes on the handling of odd input:
 *  - a 0xc3/0xc4/0xc5 lead byte that is not followed by a UTF-8
 *    continuation byte (including one at the very end of the string)
 *    is dropped, so we never step past the terminating NUL;
 *  - characters in these blocks without an ASCII replacement (e.g.
 *    U+00D7 MULTIPLICATION SIGN) are copied unchanged, both bytes.
 */
char*
mu_str_normalize_in_place (char *str, gboolean downcase, GStringChunk *strchunk)
{
	const guchar *cur;
	guchar *out;
	size_t i;

	g_return_val_if_fail (str, NULL);

	if (*str == '\0')
		return str;

	/* write through an unsigned view, so bytes >= 0x80 are stored
	 * without signed-char conversions */
	out = (guchar*)str;

	for (i = 0, cur = (const guchar*)str; *cur; ++cur) {

		/* special case for plain-old ascii */
		if (*cur < 0x80) {
			out[i++] = downcase ? tolower (*cur) : *cur;
			continue;
		}

		/* a lead byte for one of our fast-path blocks must be
		 * followed by a continuation byte (10xxxxxx); if not,
		 * drop the stray lead byte and handle the next byte on
		 * its own. This also covers a lead byte right before
		 * the terminating NUL. */
		if (*cur >= 0xc3 && *cur <= 0xc5 && (cur[1] & 0xc0) != 0x80)
			continue;

		if (*cur == 0xc3) { /* latin-1 supplement */
			++cur;
			switch (*cur) {
			case 0x80:
			case 0x81:
			case 0x82:
			case 0x83:
			case 0x84:
			case 0x85: out[i++] = downcase ? 'a' : 'A'; break;
			case 0x86:
				out[i++] = downcase ? 'a' : 'A';
				out[i++] = 'e';
				break;
			case 0x87: out[i++] = downcase ? 'c' : 'C'; break;
			case 0x88:
			case 0x89:
			case 0x8a:
			case 0x8b: out[i++] = downcase ? 'e' : 'E'; break;
			case 0x8c:
			case 0x8d:
			case 0x8e:
			case 0x8f: out[i++] = downcase ? 'i' : 'I'; break;
			case 0x90: out[i++] = downcase ? 'd' : 'D'; break;
			case 0x91: out[i++] = downcase ? 'n' : 'N'; break;
			case 0x92:
			case 0x93:
			case 0x94:
			case 0x95:
			case 0x96: out[i++] = downcase ? 'o' : 'O'; break;
			case 0x99:
			case 0x9a:
			case 0x9b:
			case 0x9c: out[i++] = downcase ? 'u' : 'U'; break;
			case 0x9d: out[i++] = downcase ? 'y' : 'Y'; break;
			case 0x9e:
				out[i++] = downcase ? 't' : 'T';
				out[i++] = 'h';
				break;
			case 0x9f: out[i++] = 's'; out[i++] = 's'; break;
			case 0xa0:
			case 0xa1:
			case 0xa2:
			case 0xa3:
			case 0xa4:
			case 0xa5: out[i++] = 'a'; break;
			case 0xa6: out[i++] = 'a'; out[i++] = 'e'; break;
			case 0xa7: out[i++] = 'c'; break;
			case 0xa8:
			case 0xa9:
			case 0xaa:
			case 0xab: out[i++] = 'e'; break;
			case 0xac:
			case 0xad:
			case 0xae:
			case 0xaf: out[i++] = 'i'; break;
			case 0xb0: out[i++] = 'd'; break;
			case 0xb1: out[i++] = 'n'; break;
			case 0xb2:
			case 0xb3:
			case 0xb4:
			case 0xb5:
			case 0xb6: out[i++] = 'o'; break;
			case 0xb9:
			case 0xba:
			case 0xbb:
			case 0xbc: out[i++] = 'u'; break;
			case 0xbd: out[i++] = 'y'; break;
			case 0xbe: out[i++] = 't'; out[i++] = 'h'; break;
			case 0xbf: out[i++] = 'y'; break;
			default:
				/* no replacement: keep the complete
				 * 2-byte sequence, not just its tail */
				out[i++] = 0xc3;
				out[i++] = *cur;
				break;
			}

		} else if (*cur == 0xc4) { /* Latin Extended-A (0xc4) */
			++cur;
			switch (*cur) {
			case 0x80:
			case 0x82:
			case 0x84: out[i++] = downcase ? 'a' : 'A'; break;
			case 0x86:
			case 0x88:
			case 0x8a:
			case 0x8c: out[i++] = downcase ? 'c' : 'C'; break;
			case 0x8e:
			case 0x90: out[i++] = downcase ? 'd' : 'D'; break;
			case 0x92:
			case 0x94:
			case 0x96:
			case 0x98:
			case 0x9a: out[i++] = downcase ? 'e' : 'E'; break;
			case 0x9c:
			case 0x9e:
			case 0xa0:
			case 0xa2: out[i++] = downcase ? 'g' : 'G'; break;
			case 0xa4:
			case 0xa6: out[i++] = downcase ? 'h' : 'H'; break;
			case 0xa8:
			case 0xaa:
			case 0xac:
			case 0xae:
			case 0xb0: out[i++] = downcase ? 'i' : 'I'; break;
			case 0xb2:
				out[i++] = downcase ? 'i' : 'I';
				out[i++] = downcase ? 'j' : 'J';
				break;
			case 0xb4: out[i++] = downcase ? 'j' : 'J'; break;
			case 0xb6: out[i++] = downcase ? 'k' : 'K'; break;
			case 0xb9:
			case 0xbb:
			case 0xbd:
			case 0xbf: out[i++] = downcase ? 'l' : 'L'; break;
			case 0x81:
			case 0x83:
			case 0x85: out[i++] = 'a'; break;
			case 0x87:
			case 0x89:
			case 0x8b:
			case 0x8d: out[i++] = 'c'; break;
			case 0x8f:
			case 0x91: out[i++] = 'd'; break;
			case 0x93:
			case 0x95:
			case 0x97:
			case 0x99:
			case 0x9b: out[i++] = 'e'; break;
			case 0x9d:
			case 0x9f:
			case 0xa1:
			case 0xa3: out[i++] = 'g'; break; /* U+0123; was the typo '0xa' */
			case 0xa5:
			case 0xa7: out[i++] = 'h'; break;
			case 0xa9:
			case 0xab:
			case 0xad:
			case 0xaf:
			case 0xb1: out[i++] = 'i'; break;
			case 0xb3: out[i++] = 'i'; out[i++] = 'j'; break;
			case 0xb5: out[i++] = 'j'; break;
			case 0xb7:
			case 0xb8: out[i++] = 'k'; break;
			case 0xba:
			case 0xbc:
			case 0xbe: out[i++] = 'l'; break;
			default:
				/* keep the complete 2-byte sequence */
				out[i++] = 0xc4;
				out[i++] = *cur;
				break;
			}

		} else if (*cur == 0xc5) { /* Latin Extended-A (0xc5) */
			++cur;
			switch (*cur) {
			case 0x81: out[i++] = downcase ? 'l' : 'L'; break;
			case 0x83:
			case 0x85:
			case 0x87: out[i++] = downcase ? 'n' : 'N'; break;
			case 0x8c:
			case 0x8e:
			case 0x90: out[i++] = downcase ? 'o' : 'O'; break;
			case 0x92:
				out[i++] = downcase ? 'o' : 'O';
				out[i++] = 'e';
				break;
			case 0x94:
			case 0x96:
			case 0x98: out[i++] = downcase ? 'r' : 'R'; break;
			case 0x9a:
			case 0x9c:
			case 0x9e:
			case 0xa0: out[i++] = downcase ? 's' : 'S'; break;
			case 0xa2:
			case 0xa4:
			case 0xa6: out[i++] = downcase ? 't' : 'T'; break;
			case 0xa8:
			case 0xaa:
			case 0xac:
			case 0xae:
			case 0xb0:
			case 0xb2: out[i++] = downcase ? 'u' : 'U'; break;
			case 0xb4: out[i++] = downcase ? 'w' : 'W'; break;
			case 0xb6:
			case 0xb8: out[i++] = downcase ? 'y' : 'Y'; break;
			case 0xb9:
			case 0xbb:
			case 0xbd: out[i++] = downcase ? 'z' : 'Z'; break;
			case 0x80:
			case 0x82: out[i++] = 'l'; break;
			case 0x84:
			case 0x86:
			case 0x88:
			case 0x89:
			case 0x8a:
			case 0x8b: out[i++] = 'n'; break;
			case 0x8d:
			case 0x8f:
			case 0x91: out[i++] = 'o'; break;
			case 0x93: out[i++] = 'o'; out[i++] = 'e'; break;
			case 0x95:
			case 0x97:
			case 0x99: out[i++] = 'r'; break;
			case 0x9b:
			case 0x9d:
			case 0x9f:
			case 0xa1: out[i++] = 's'; break;
			case 0xa3:
			case 0xa5:
			case 0xa7: out[i++] = 't'; break;
			case 0xa9:
			case 0xab:
			case 0xad:
			case 0xaf:
			case 0xb1:
			case 0xb3: out[i++] = 'u'; break;
			case 0xb5: out[i++] = 'w'; break;
			case 0xb7: out[i++] = 'y'; break;
			case 0xba:
			case 0xbc:
			case 0xbe: out[i++] = 'z'; break;
			case 0xbf: out[i++] = 's'; break;
			default:
				/* keep the complete 2-byte sequence */
				out[i++] = 0xc5;
				out[i++] = *cur;
				break;
			}

		} else {
			/* our fast-path for latin-utf8 does not work
			 * -- bummer! just append the character then
			 * */
			gunichar uc;
			char buf[7];
			size_t len1, len2;

			len1 = g_utf8_next_char ((char*)cur) - (char*)cur;
			uc   = g_utf8_get_char ((char*)cur);
			if (downcase)
				uc = g_unichar_tolower (uc);
			len2 = g_unichar_to_utf8 (uc, buf);

			/* if the new char fits where the old char was,
			 * change it. otherwise, don't bother. */
			if (len1 == len2) {
				memcpy (out + i, buf, len2);
				i += len2;
			}
		}
	}

	out[i] = '\0';

	return str;
}

View File

@ -257,78 +257,53 @@ mu_str_to_list (const char *str, char sepa, gboolean strip)
return lst;
}
/*
 * Take the first item off the front of *strlst and return it as a
 * newly allocated string.
 *
 * Leading whitespace of *strlst is removed in place (g_strchug).
 * Within the item:
 *   - '"' toggles quoting and is not copied;
 *   - '\' must be followed by ' ', '"' or '\'; that following
 *     character is copied literally;
 *   - an unquoted ' ' ends the item.
 *
 * On success, *strlst is moved to just after the item (past the
 * terminating space, if there was one). On an invalid escape, @err is
 * set, *strlst is set to NULL and the result of
 * g_string_free (.., TRUE) is returned.
 */
static gchar*
eat_esc_string (char **strlst, GError **err)
{
	char *cur;
	gboolean in_quotes;
	GString *item;

	cur       = g_strchug (*strlst);
	item      = g_string_sized_new (strlen(cur));
	in_quotes = FALSE;

	while (*cur) {

		if (*cur == '"') {
			in_quotes = !in_quotes;
			++cur;
			continue;
		}

		if (*cur == '\\') {
			const char next = cur[1];

			if (next != ' ' && next != '"' && next != '\\') {
				/* invalid escaping */
				g_set_error (err, MU_ERROR_DOMAIN,
					     MU_ERROR_IN_PARAMETERS,
					     "error parsing string '%s'",
					     g_strchug(*strlst));
				*strlst = NULL;
				return g_string_free (item, TRUE);
			}

			g_string_append_c (item, next);
			cur += 2; /* the backslash and the escaped char */
			continue;
		}

		if (*cur == ' ' && !in_quotes) {
			++cur; /* step over the separator */
			break;
		}

		g_string_append_c (item, *cur);
		++cur;
	}

	*strlst = cur;

	return g_string_free (item, FALSE);
}
GSList*
mu_str_esc_to_list (const char *strings, GError **err)
mu_str_esc_to_list (const char *strings)
{
GSList *lst;
char *mystrings, *freeme;
const char* cur;
GString *part;
unsigned u;
gboolean quoted;
g_return_val_if_fail (strings, NULL);
for (cur = strings; *cur && (*cur == ' ' || *cur == '\t'); ++cur);
freeme = mystrings = g_strdup (cur);
part = g_string_new (NULL);
lst = NULL;
do {
gchar *str;
str = eat_esc_string (&mystrings, err);
if (str)
lst = g_slist_prepend (lst, str);
else {
g_free (freeme);
mu_str_free_list (lst);
return NULL;
for (u = 0, lst = NULL, quoted = FALSE;
u != strlen (strings); ++u) {
char kar;
kar = strings[u];
if (quoted && kar != '"') {
g_string_append_c (part, kar);
continue;
}
} while (mystrings && *mystrings);
switch (kar) {
case '"':
quoted = !quoted;
g_string_append_c (part, kar);
continue;
case ' ':
if (part->len > 0) {
lst = g_slist_prepend
(lst, g_string_free (part, FALSE));
part = g_string_new (NULL);
}
continue;
default:
g_string_append_c (part, kar);
}
}
if (part->len)
lst = g_slist_prepend (lst, g_string_free (part, FALSE));
g_free (freeme);
return g_slist_reverse (lst);
}
void
mu_str_free_list (GSList *lst)
{
@ -451,147 +426,105 @@ check_for_field (const char *str, gboolean *is_field,
static gboolean
is_xapian_special_char (char c)
handle_esc_maybe (GString *gstr, char **cur, gunichar uc,
gboolean query_esc)
{
switch (c) {
char kar;
case '@':
case '.':
case ',':
case '/':
case '[':
case ']':
case '+':
case '-':
case ' ':
case ':':
case '(':
case ')':
case '$':
case '"':
case '\\':
case '\'':
case '*':
return TRUE;
default:
return FALSE;
}
}
kar = *cur[0];
#define ESC_CHAR '_'
/*
* Xapian treats various characters such as '@', '-', ':' and '.'
* specially; function below is an ugly hack to make it DWIM in most
* cases...
*
* function expects search terms (not complete queries)
* */
char*
mu_str_xapian_escape_in_place_try (char *term, gboolean esc_space, GStringChunk *strchunk)
{
unsigned char *cur;
char lookback;
gboolean is_field, is_range_field, quoted;
unsigned colon;
g_return_val_if_fail (term, NULL);
check_for_field (term, &is_field, &is_range_field);
for (colon = 0, lookback = 0, quoted=FALSE, cur = (unsigned char*)term;
*cur; ++cur) {
if (*cur == '\\')
quoted = !quoted;
switch (*cur) {
case '.': /* escape '..' if it's not a range field */
if (cur[1] == '.') {
if (!is_range_field) {
*cur = ESC_CHAR;
*(cur + 1) = ESC_CHAR;
}
++cur;
} else if (isblank(lookback) || isblank(cur[1]) ||
cur[1] == '\0')
*cur = ' ';
else
*cur = ESC_CHAR;
break;
if (query_esc) {
switch (kar) {
case ':':
/* if there's a registered xapian prefix
* before the *first* ':', don't touch
* it. Otherwise replace ':' with ' '... ugh
* yuck ugly...
*/
if (colon != 0 || !is_field)
*cur = ' ';
++colon;
break;
case '@':
case '/':
case '[':
case ']':
case '+':
case '$':
case '\\':
case '-':
*cur = ESC_CHAR;
break;
case ' ':
case '_':
case '(':
case ')':
case '*':
case '"':
case '\'':
case '*': /* wildcard */
break; /* leave as they are */
default:
/* turn other stuff into spaces */
if (*cur < 0x80 && !isalnum (*cur))
*cur = ' ';
g_string_append_c (gstr, kar);
return TRUE;
case '.':
if ((*cur)[1] == '.' && (*cur)[2] != '.') {
g_string_append (gstr, "..");
*cur = g_utf8_next_char (*cur);
return TRUE;
}
default: break;
}
lookback = *cur;
}
/* downcase try to remove accents etc. */
return mu_str_normalize_in_place (term, TRUE, strchunk);
if (g_unichar_ispunct(uc) || isblank(kar)) {
g_string_append_c (gstr, '_');
return TRUE;
}
return FALSE;
}
char*
mu_str_xapian_escape (const char *query, gboolean esc_space, GStringChunk *strchunk)
static char*
process_str (const char *str, gboolean xapian_esc, gboolean query_esc)
{
char *mystr;
GString *gstr;
char *norm, *cur;
g_return_val_if_fail (query, NULL);
norm = g_utf8_normalize (str, -1, G_NORMALIZE_ALL);
gstr = g_string_sized_new (strlen (norm));
if (strchunk)
mystr = g_string_chunk_insert (strchunk, query);
else
mystr = g_strdup (query);
for (cur = norm; cur && *cur; cur = g_utf8_next_char (cur)) {
return mu_str_xapian_escape_in_place_try (mystr, esc_space, strchunk);
gunichar uc;
uc = g_utf8_get_char (cur);
if (xapian_esc)
if (handle_esc_maybe (gstr, &cur, uc, query_esc))
continue;
if (g_unichar_ismark(uc))
continue;
/* maybe add some special cases, such as Spaß->spass ?
*/
uc = g_unichar_tolower (uc);
g_string_append_unichar (gstr, uc);
}
g_free (norm);
/* g_print ("-->%s\n", gstr->str); */
return g_string_free (gstr, FALSE);
}
char*
mu_str_xapian_escape_term (const char *term, GStringChunk *strchunk)
mu_str_process_text (const char *str)
{
char *cur, *esc;
g_return_val_if_fail (str, NULL);
g_return_val_if_fail (term, NULL);
g_return_val_if_fail (strchunk, NULL);
return process_str (str, FALSE, FALSE);
for (cur = esc = mu_str_normalize (term, TRUE, strchunk);
*cur; ++cur) {
if (is_xapian_special_char (*cur))
*cur = ESC_CHAR;
}
}
char*
mu_str_process_term (const char *str)
{
g_return_val_if_fail (str, NULL);
return process_str (str, TRUE, FALSE);
}
char*
mu_str_process_query_term (const char *str)
{
g_return_val_if_fail (str, NULL);
return process_str (str, TRUE, TRUE);
return esc;
}

View File

@ -106,86 +106,42 @@ char* mu_str_flags (MuFlags flags)
char* mu_str_summarize (const char* str, size_t max_lines)
G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;
/**
* normalize a string (ie., collapse accented characters etc.), and
* optionally, downcase it. Works for accented chars in Unicode Blocks
* 'Latin-1 Supplement' and 'Latin Extended-A'
*
* @param str a valid utf8 string or NULL
* @param downcase if TRUE, convert the string to lowercase
* @param strchunk (optional) if non-NULL, allocate strings on strchunk
*
* @return the normalized string, or NULL in case of error or str was
* NULL. Unless strchunk was provided, user must g_free the string when
* no longer needed
*/
char* mu_str_normalize (const char *str, gboolean downcase,
GStringChunk *strchunk)
G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;
/**
* normalize a string (ie., collapse accented characters etc.), and
* optionally, downcase it. this happen by changing the string; if
* that is not desired, use mu_str_normalize. Works for accented chars
* in Unicode Blocks 'Latin-1 Supplement' and 'Latin Extended-A'
* Process some text (e.g. message bodies) -- flatten (remove accents
* etc.), and remove some punctuation.
*
* @param str a valid utf8 string or NULL
* @param downcase if TRUE, convert the string to lowercase
* @param strchunk (optional) if non-NULL, allocate strings on strchunk
* @param text some text
*
* @return the normalized string, or NULL in case of error or str was
* NULL. User only needs to free the returned string if a) return
* value != str and b) strchunk was not provided.
* @return the processed text, free with g_free
*/
char* mu_str_normalize_in_place (char *str, gboolean downcase,
GStringChunk *strchunk);
char* mu_str_process_text (const char *text)
G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;
/**
* escape the string for use with xapian matching. in practice, if the
* string contains an '@', replace '@', single-'.' with '_'. Also,
* replace ':' with '_', if it's not following a xapian-prefix (such
* as 'subject:', 't:' etc, as defined in mu-msg-fields.[ch]).
* changing is done in-place (by changing the argument string). in any
* case, the string will be downcased.
* Process some term (e.g., an e-mail address, subject field):
* remove accents, replace some punctuation by _
*
* @param query a query string
* @param esc_space escape space characters as well
* @param strchunk (optional) if non-NULL, allocate strings on strchunk
*
* @return the escaped string or NULL in case of error. User only
* needs to free the returned string if a) return value != query and b)
* strchunk was not provided.
* @param term some term
*
* @return the processed text, free with g_free
*/
char* mu_str_xapian_escape_in_place_try (char *query, gboolean esc_space,
GStringChunk *strchunk);
char* mu_str_process_term (const char *term)
G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;
/**
* escape the string for use with xapian matching. in practice, if the
* string contains an '@', replace '@', single-'.' with '_'. Also,
* replace ':' with '_', if it's not following a xapian-prefix (such
* as 'subject:', 't:' etc, as defined in mu-msg-fields.[ch]).
* Process some query term (e.g., an e-mail address, subject field):
* remove accents, replace some punctuation by _, but leave some query
* metachars alone.
*
* @param str a string
* @param esc_space escape space characters as well
* @param strchunk (optional) if non-NULL, allocate strings on strchunk
* @param qterm some query term
*
* @return the escaped string (free with g_free) or NULL in case of error
* Unless strchunk was provided, user must g_free the string when
* no longer needed
* @return the processed text, free with g_free
*/
char* mu_str_xapian_escape (const char *str, gboolean esc_space,
GStringChunk *strchunk) G_GNUC_WARN_UNUSED_RESULT;
char* mu_str_process_query_term (const char *qterm)
G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;
/**
* escape the xapian term
*
* @param str a string
* @param strchunk allocate strings on strchunk
*
* @return the escaped string, which is allocated in the strchunk
*/
char* mu_str_xapian_escape_term (const char *term, GStringChunk *strchunk);
/**
* Fixup values for some fields in the DWIM manner:
@ -315,14 +271,14 @@ GSList* mu_str_to_list (const char *str, char sepa, gboolean strip);
/**
* convert a string (with possible escaping) to a list. list items are
* separated by one or more spaces. list items can be quoted (using
* '"'), and '"', ' ' and '\' use their special meaning when prefixed
* with \.
* '"').
*
* @param str a string
*
* @return a list of elements or NULL in case of error
* @return a list of elements or NULL in case of error, free with
* mu_str_free_list
*/
GSList* mu_str_esc_to_list (const char *str, GError **err);
GSList* mu_str_esc_to_list (const char *str);
/**