* mu-str: simplify, cleanup string pre-processing functions

2013-05-13 00:01:49 +03:00 · 2013-05-13 00:01:49 +03:00 · d26f3c0bae
parent 2f60f33dc8
commit d26f3c0bae
3 changed files with 132 additions and 645 deletions
--- a/lib/mu-str-normalize.c
+++ b/lib/mu-str-normalize.c
@ -1,402 +0,0 @@
-/*
-** Copyright (C) 2012-2013 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
-**
-** This program is free software; you can redistribute it and/or modify
-** it under the terms of the GNU General Public License as published by
-** the Free Software Foundation; either version 3 of the License, or
-** (at your option) any later version.
-**
-** This program is distributed in the hope that it will be useful,
-** but WITHOUT ANY WARRANTY; without even the implied warranty of
-** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-** GNU General Public License for more details.
-**
-** You should have received a copy of the GNU General Public License
-** along with this program; if not, write to the Free Software Foundation,
-** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-**
-*/
-
-#if HAVE_CONFIG_H
-#include "config.h"
-#endif /*HAVE_CONFIG_H*/
-
-
-#include <glib.h>
-#include <string.h>
-#include <ctype.h>
-
-#include "mu-str.h"
-
-
-char*
-mu_str_normalize (const char *str, gboolean downcase, GStringChunk *strchunk)
-{
-	char *mystr;
-
-	g_return_val_if_fail (str, NULL);
-
-	if (strchunk)
-		mystr = g_string_chunk_insert (strchunk, str);
-	else
-		mystr = g_strdup (str);
-
-	return mu_str_normalize_in_place (mystr, downcase, strchunk);
-}
-
-
-
-/*
- * this implementation works for accented chars in Unicode Blocks
- * 'Latin-1 Supplement' and 'Latin Extended-A'. An alternative (slower
- * but much simpler) implementation would be to use g_utf8_normalize
- * to decompose characters in the accent part and the character part,
- * and then get rid of the former. That would be slower than what we
- * do here, but also more *complete*.  It's unclear whether it would
- * be slower *in practice* => needs checking
- */
-
-/* we can normalize in-place, as the normalized string will never be
- * longer than the original.  even for replacements that are 2 chars
- * wide (e.g. German ß => ss), the replacement is 2 bytes, like the
- * original 0xc3 0x9f
- *
- * note-to-self: http://www.geertvanderploeg.com/unicode-gen/
- */
-char*
-mu_str_normalize_in_place (char *str, gboolean downcase, GStringChunk *strchunk)
-{
-	const guchar *cur;
-	int i;
-
-	g_return_val_if_fail (str, NULL);
-
-	if (*str == '\0')
-		return str;
-
-	for (i = 0, cur = (const guchar*)str; *cur; ++cur) {
-
-		/* special case for plain-old ascii */
-		if ((*cur < 0x80)) {
-			str[i++] = downcase ? tolower (*cur) : *cur;
-			continue;
-		}
-
-		if (*cur == 0xc3) { /* latin-1 supplement */
-			++cur;
-			switch (*cur) {
-
-			case 0x80:
-			case 0x81:
-			case 0x82:
-			case 0x83:
-			case 0x84:
-			case 0x85: str[i++] = downcase ? 'a' : 'A' ; break;
-
-			case 0x86:
-				str[i++] = downcase ? 'a' : 'A' ;
-				str[i++] = 'e';
-				break;
-
-			case 0x87: str[i++] = downcase ? 'c' : 'C'; break;
-
-			case 0x88:
-			case 0x89:
-			case 0x8a:
-			case 0x8b:
-				str[i++] = downcase ? 'e' : 'E';
-				break;
-
-			case 0x8c:
-			case 0x8d:
-			case 0x8e:
-			case 0x8f: str[i++] = downcase ? 'i': 'I'; break;
-
-			case 0x90: str[i++] = downcase ? 'd' : 'D'; break;
-			case 0x91: str[i++] = downcase ? 'n' : 'N'; break;
-
-			case 0x92:
-			case 0x93:
-			case 0x94:
-			case 0x95:
-			case 0x96: str[i++] = downcase ? 'o' : 'O'; break;
-
-			case 0x99:
-			case 0x9a:
-			case 0x9b:
-			case 0x9c: str[i++] = downcase ? 'u' : 'U'; break;
-
-			case 0x9d: str[i++] = downcase ? 'y' : 'Y'; break;
-
-			case 0x9e:
-				str[i++] = downcase ? 't' : 'T';
-				str[i++] = 'h';
-				break;
-
-			case 0x9f: str[i++] = 's'; str[i++] = 's'; break;
-
-			case 0xa0:
-			case 0xa1:
-			case 0xa2:
-			case 0xa3:
-			case 0xa4:
-			case 0xa5: str[i++] = 'a'; break;
-
-			case 0xa6: str[i++] = 'a'; str[i++] = 'e'; break;
-			case 0xa7: str[i++] = 'c'; break;
-
-			case 0xa8:
-			case 0xa9:
-			case 0xaa:
-			case 0xab: str[i++] = 'e'; break;
-
-			case 0xac:
-			case 0xad:
-			case 0xae:
-			case 0xaf: str[i++] = 'i'; break;
-
-			case 0xb0: str[i++] = 'd'; break;
-			case 0xb1: str[i++] = 'n'; break;
-
-			case 0xb2:
-			case 0xb3:
-			case 0xb4:
-			case 0xb5:
-			case 0xb6: str[i++] = 'o'; break;
-
-			case 0xb9:
-			case 0xba:
-			case 0xbb:
-			case 0xbc: str[i++] = 'u'; break;
-
-			case 0xbd: str[i++] = 'y'; break;
-			case 0xbe: str[i++] = 't'; str[i++] = 'h'; break;
-			case 0xbf: str[i++] = 'y'; break;
-
-			default:
-				str[i++] = *cur;
-			}
-
-		} else if (*cur == 0xc4) {  /* Latin Extended-A (0x04) */
-			++cur;
-			switch (*cur) {
-			case 0x80:
-			case 0x82:
-			case 0x84: str[i++] = downcase ? 'a' : 'A'; break;
-
-			case 0x86:
-			case 0x88:
-			case 0x8a:
-			case 0x8c: str[i++] = downcase ? 'c' : 'C'; break;
-
-			case 0x8e:
-			case 0x90: str[i++] = downcase ? 'd' : 'D'; break;
-
-			case 0x92:
-			case 0x94:
-			case 0x96:
-			case 0x98:
-			case 0x9a: str[i++] = downcase ? 'e' : 'E'; break;
-
-			case 0x9c:
-			case 0x9e:
-			case 0xa0:
-			case 0xa2: str[i++] = downcase ? 'g' : 'G'; break;
-
-			case 0xa4:
-			case 0xa6: str[i++] = downcase ? 'h' : 'H'; break;
-
-			case 0xa8:
-			case 0xaa:
-			case 0xac:
-			case 0xae:
-			case 0xb0: str[i++] = downcase ? 'i' : 'I'; break;
-
-			case 0xb2:
-				str[i++] = downcase ? 'i' : 'I';
-				str[i++] = downcase ? 'j' : 'J';
-				break;
-
-
-			case 0xb4: str[i++] = downcase ? 'j' : 'J'; break;
-
-			case 0xb6: str[i++] = downcase ? 'k' : 'K'; break;
-
-			case 0xb9:
-			case 0xbb:
-			case 0xbd:
-			case 0xbf: str[i++] = downcase ? 'l': 'L'; break;
-
-			case 0x81:
-			case 0x83:
-			case 0x85: str[i++] = 'a'; break;
-
-			case 0x87:
-			case 0x89:
-			case 0x8b:
-			case 0x8d: str[i++] = 'c'; break;
-
-			case 0x8f:
-			case 0x91: str[i++] = 'd'; break;
-
-			case 0x93:
-			case 0x95:
-			case 0x97:
-			case 0x99:
-			case 0x9b: str[i++] = 'e'; break;
-
-			case 0x9d:
-			case 0x9f:
-			case 0xa1:
-			case 0xa: str[i++] = 'g'; break;
-
-			case 0xa5:
-			case 0xa7: str[i++] = 'h'; break;
-
-			case 0xa9:
-			case 0xab:
-			case 0xad:
-			case 0xaf:
-			case 0xb1: str[i++] = 'i'; break;
-
-			case 0xb3: str[i++] = 'i'; str[i++] = 'j'; break;
-
-			case 0xb5: str[i++] = 'j'; break;
-
-			case 0xb7:
-			case 0xb8: str[i++] = 'k'; break;
-
-			case 0xba:
-			case 0xbc:
-			case 0xbe: str[i++] = 'l'; break;
-
-			default:   str[i++] = *cur; break;
-
-			}
-
-		} else if (*cur == 0xc5) { /* Latin Extended-A (0xc5) */
-			++cur;
-			switch (*cur) {
-			case 0x81: str[i++] = downcase ? 'l': 'L'; break;
-
-			case 0x83:
-			case 0x85:
-			case 0x87: str[i++] = downcase ? 'n': 'N'; break;
-
-			case 0x8c:
-			case 0x8e:
-			case 0x90: str[i++] = downcase ? 'o': 'O'; break;
-
-			case 0x92:
-				str[i++] = downcase ? 'o':  'O';
-				str[i++] = 'e';
-				break;
-
-			case 0x94:
-			case 0x96:
-			case 0x98: str[i++] = downcase ? 'r': 'R'; break;
-
-			case 0x9a:
-			case 0x9c:
-			case 0x9e:
-			case 0xa0: str[i++] = downcase ? 's': 'S'; break;
-
-			case 0xa2:
-			case 0xa4:
-			case 0xa6: str[i++] = downcase ? 't': 'T'; break;
-
-			case 0xa8:
-			case 0xaa:
-			case 0xac:
-			case 0xae:
-			case 0xb0:
-			case 0xb2: str[i++] = downcase ? 'u': 'U'; break;
-			case 0xb4: str[i++] = downcase ? 'w': 'W'; break;
-
-			case 0xb6:
-			case 0xb8: str[i++] = downcase ? 'y': 'Y'; break;
-
-			case 0xb9:
-			case 0xbb:
-			case 0xbd: str[i++] = downcase ? 'z': 'Z'; break;
-
-			case 0x80:
-			case 0x82: str[i++] = 'l'; break;
-
-			case 0x84:
-			case 0x86:
-			case 0x88:
-			case 0x89:
-			case 0x8a:
-			case 0x8b: str[i++] = 'n'; break;
-
-			case 0x8d:
-			case 0x8f:
-			case 0x91: str[i++] = 'o'; break;
-
-			case 0x93: str[i++] = 'o'; str[i++] = 'e'; break;
-
-			case 0x95:
-			case 0x97:
-			case 0x99: str[i++] = 'r'; break;
-
-			case 0x9b:
-			case 0x9d:
-			case 0x9f:
-			case 0xa1: str[i++] = 's'; break;
-
-			case 0xa3:
-			case 0xa5:
-			case 0xa7: str[i++] = 't'; break;
-
-			case 0xa9:
-			case 0xab:
-			case 0xad:
-			case 0xaf:
-			case 0xb1:
-			case 0xb3: str[i++] = 'u'; break;
-
-			case 0xb5: str[i++] = 'w'; break;
-
-			case 0xb7: str[i++] = 'y'; break;
-
-			case 0xba:
-			case 0xbc:
-			case 0xbe: str[i++] = 'z'; break;
-
-			case 0xbf: str[i++] = 's'; break;
-
-			default:   str[i++] = *cur; break;
-			}
-
-		} else {
-			/* our fast-path for latin-utf8 does not work
-			 * -- bummer! just append the character then
-			 * */
-			gunichar uc;
-			char buf[7];
-			size_t len1, len2;
-
-			len1 = g_utf8_next_char ((char*)cur) - (char*)cur;
-			uc = g_utf8_get_char ((char*)cur);
-
-			if (downcase)
-				uc = g_unichar_tolower (uc);
-
-			len2 = g_unichar_to_utf8 (uc, buf);
-
-			/* if the new char fits where the old char was,
-			 * change it. otherwise, don't bother. */
-
-			if (len1 == len2) {
-				memcpy (str + i, buf, len2);
-				i += len2;
-			}
-		}
-
-	}
-
-	str[i] = '\0';
-
-	return str;
-}
--- a/lib/mu-str.c
+++ b/lib/mu-str.c
@ -257,78 +257,53 @@ mu_str_to_list (const char *str, char sepa, gboolean strip)
 	return lst;
 }

-
-static gchar*
-eat_esc_string (char **strlst, GError **err)
-{
-	char *str;
-	gboolean quoted;
-	GString *gstr;
-
-	str  = g_strchug (*strlst);
-	gstr = g_string_sized_new (strlen(str));
-
-	for (quoted = FALSE; *str; ++str) {
-
-		if (*str == '"') {
-			quoted = !quoted;
-			continue;
-		} else if (*str == '\\') {
-			if (str[1] != ' ' && str[1] != '"' && str[1] != '\\')
-				goto err; /* invalid escaping */
-			g_string_append_c (gstr, str[1]);
-			++str;
-			continue;
-		} else if (*str == ' ' && !quoted) {
-			++str;
-			goto leave;
-		} else
-			g_string_append_c (gstr, *str);
-	}
-leave:
-	*strlst = str;
-	return g_string_free (gstr, FALSE);
-err:
-	g_set_error (err, MU_ERROR_DOMAIN, MU_ERROR_IN_PARAMETERS,
-		     "error parsing string '%s'", g_strchug(*strlst));
-	*strlst = NULL;
-	return g_string_free (gstr, TRUE);
-}
-
-
 GSList*
-mu_str_esc_to_list (const char *strings, GError **err)
+mu_str_esc_to_list (const char *strings)
 {
 	GSList *lst;
-	char *mystrings, *freeme;
-	const char* cur;
+	GString *part;
+	unsigned u;
+	gboolean quoted;

 	g_return_val_if_fail (strings, NULL);

-	for (cur = strings; *cur && (*cur == ' ' || *cur == '\t'); ++cur);
-	freeme = mystrings = g_strdup (cur);
+	part = g_string_new (NULL);

-	lst = NULL;
-	do {
-		gchar *str;
-		str = eat_esc_string (&mystrings, err);
-		if (str)
-			lst = g_slist_prepend (lst, str);
-		else {
-			g_free (freeme);
-			mu_str_free_list (lst);
-			return NULL;
+	for (u = 0, lst = NULL, quoted = FALSE;
+	     u != strlen (strings); ++u) {
+
+		char kar;
+		kar = strings[u];
+
+		if (quoted && kar != '"') {
+			g_string_append_c (part, kar);
+			continue;
 		}

-	} while (mystrings && *mystrings);
+		switch (kar) {
+		case '"':
+			quoted = !quoted;
+			g_string_append_c (part, kar);
+			continue;
+		case ' ':
+ 			if (part->len > 0) {
+				lst = g_slist_prepend
+					(lst, g_string_free (part, FALSE));
+				part = g_string_new (NULL);
+			}
+			continue;
+		default:
+			g_string_append_c (part, kar);
+		}
+	}
+
+	if (part->len)
+		lst = g_slist_prepend (lst, g_string_free (part, FALSE));

-	g_free (freeme);
 	return g_slist_reverse (lst);
 }


-
-
 void
 mu_str_free_list (GSList *lst)
 {
@ -451,147 +426,105 @@ check_for_field (const char *str, gboolean *is_field,


 static gboolean
-is_xapian_special_char (char c)
+handle_esc_maybe (GString *gstr, char **cur, gunichar uc,
+		  gboolean query_esc)
 {
-	switch (c) {
+	char kar;

-	case '@':
-	case '.':
-	case ',':
-	case '/':
-	case '[':
-	case ']':
-	case '+':
-	case '-':
-	case ' ':
-	case ':':
-	case '(':
-	case ')':
-	case '$':
-	case '"':
-	case '\\':
-	case '\'':
-	case '*':
-		return TRUE;
-	default:
-		return FALSE;
-	}
-}
+	kar = *cur[0];

-#define ESC_CHAR '_'
-
-/*
- * Xapian treats various characters such as '@', '-', ':' and '.'
- * specially; function below is an ugly hack to make it DWIM in most
- * cases...
- *
- * function expects search terms (not complete queries)
- * */
-char*
-mu_str_xapian_escape_in_place_try (char *term, gboolean esc_space, GStringChunk *strchunk)
-{
-	unsigned char *cur;
-	char lookback;
-	gboolean is_field, is_range_field, quoted;
-	unsigned colon;
-
-	g_return_val_if_fail (term, NULL);
-
-	check_for_field (term, &is_field, &is_range_field);
-
-	for (colon = 0, lookback = 0, quoted=FALSE, cur = (unsigned char*)term;
-	     *cur; ++cur) {
-
-		if (*cur == '\\')
-			quoted = !quoted;
-
-		switch (*cur) {
-
-		case '.': /* escape '..' if it's not a range field */
-			if (cur[1] == '.') {
-				if (!is_range_field) {
-					*cur	   = ESC_CHAR;
-					*(cur + 1) = ESC_CHAR;
-				}
-				++cur;
-			} else if (isblank(lookback) || isblank(cur[1]) ||
-				   cur[1] == '\0')
-				*cur = ' ';
-			else
-				*cur = ESC_CHAR;
-			break;
+	if (query_esc) {
+		switch (kar) {
 		case ':':
-			/* if there's a registered xapian prefix
-			 * before the *first* ':', don't touch
-			 * it. Otherwise replace ':' with ' '... ugh
-			 * yuck ugly...
-			 */
-			if (colon != 0 || !is_field)
-				*cur = ' ';
-			++colon;
-			break;
-		case '@':
-		case '/':
-		case '[':
-		case ']':
-		case '+':
-		case '$':
-		case '\\':
-		case '-':
-			*cur = ESC_CHAR;
-			break;
-		case ' ':
-		case '_':
 		case '(':
 		case ')':
+		case '*':
 		case '"':
-		case '\'':
-		case '*':   /* wildcard */
-			break; /* leave as they are */
-		default:
-			/* turn other stuff into spaces */
-			if (*cur < 0x80 && !isalnum (*cur))
-				*cur = ' ';
+			g_string_append_c (gstr, kar);
+			return TRUE;
+		case '.':
+			if ((*cur)[1] == '.' && (*cur)[2] != '.') {
+				g_string_append (gstr, "..");
+				*cur = g_utf8_next_char (*cur);
+				return TRUE;
+			}
+		default: break;
 		}
-
-		lookback = *cur;
 	}

-	/* downcase try to remove accents etc. */
-	return mu_str_normalize_in_place (term, TRUE, strchunk);
+	if (g_unichar_ispunct(uc) || isblank(kar)) {
+		g_string_append_c (gstr, '_');
+		return TRUE;
+	}
+
+	return FALSE;
 }

-char*
-mu_str_xapian_escape (const char *query, gboolean esc_space, GStringChunk *strchunk)
+
+static char*
+process_str (const char *str, gboolean xapian_esc, gboolean query_esc)
 {
-	char *mystr;
+	GString *gstr;
+	char *norm, *cur;

-	g_return_val_if_fail (query, NULL);
+	norm = g_utf8_normalize (str, -1, G_NORMALIZE_ALL);
+	gstr = g_string_sized_new (strlen (norm));

-	if (strchunk)
-		mystr = g_string_chunk_insert (strchunk, query);
-	else
-		mystr = g_strdup (query);
+	for (cur = norm; cur && *cur; cur = g_utf8_next_char (cur)) {

-	return mu_str_xapian_escape_in_place_try (mystr, esc_space, strchunk);
+		gunichar uc;
+
+		uc = g_utf8_get_char (cur);
+
+		if (xapian_esc)
+			if (handle_esc_maybe (gstr, &cur, uc, query_esc))
+				continue;
+
+		if (g_unichar_ismark(uc))
+			continue;
+
+		/* maybe add some special cases, such as Spaß->spass ?
+		 */
+
+		uc = g_unichar_tolower (uc);
+		g_string_append_unichar (gstr, uc);
+	}
+
+	g_free (norm);
+
+	/* g_print ("-->%s\n", gstr->str); */
+
+	return g_string_free (gstr, FALSE);
 }


 char*
-mu_str_xapian_escape_term (const char *term, GStringChunk *strchunk)
+mu_str_process_text (const char *str)
 {
-	char *cur, *esc;
+	g_return_val_if_fail (str, NULL);

-	g_return_val_if_fail (term, NULL);
-	g_return_val_if_fail (strchunk, NULL);
+	return process_str (str, FALSE, FALSE);

-	for (cur = esc = mu_str_normalize (term, TRUE, strchunk);
-	     *cur; ++cur) {
-		if (is_xapian_special_char (*cur))
-			*cur = ESC_CHAR;
-	}
+}
+
+
+char*
+mu_str_process_term (const char *str)
+{
+	g_return_val_if_fail (str, NULL);
+
+	return process_str (str, TRUE, FALSE);
+
+}
+
+
+char*
+mu_str_process_query_term (const char *str)
+{
+	g_return_val_if_fail (str, NULL);
+
+	return process_str (str, TRUE, TRUE);

-	return esc;
 }


--- a/lib/mu-str.h
+++ b/lib/mu-str.h
@ -106,86 +106,42 @@ char*       mu_str_flags    (MuFlags flags)
 char* mu_str_summarize (const char* str, size_t max_lines)
    G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;

-/**
- * normalize a string (ie., collapse accented characters etc.), and
- * optionally, downcase it. Works for accented chars in Unicode Blocks
- * 'Latin-1 Supplement' and 'Latin Extended-A'
- *
- * @param str a valid utf8 string or NULL
- * @param downcase if TRUE, convert the string to lowercase
- * @param strchunk (optional) if non-NULL, allocate strings on strchunk
- *
- * @return the normalized string, or NULL in case of error or str was
- * NULL. Unless strchunk was provided, user must g_free the string when
- * no longer needed
- */
-char* mu_str_normalize (const char *str, gboolean downcase,
-			GStringChunk *strchunk)
-    G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;

 /**
- * normalize a string (ie., collapse accented characters etc.), and
- * optionally, downcase it. this happen by changing the string; if
- * that is not desired, use mu_str_normalize. Works for accented chars
- * in Unicode Blocks 'Latin-1 Supplement' and 'Latin Extended-A'
+ * Process some text (e.g. message bodies) -- flatten it (remove accents
+ * etc.) and convert it to lowercase.
 *
- * @param str a valid utf8 string or NULL
- * @param downcase if TRUE, convert the string to lowercase
- * @param strchunk (optional) if non-NULL, allocate strings on strchunk
+ * @param text some text
 *
- * @return the normalized string, or NULL in case of error or str was
- * NULL. User only needs to free the returned string if a) return
- * value != str and b) strchunk was not provided.
+ * @return the processed text, free with g_free
 */
-char* mu_str_normalize_in_place (char *str, gboolean downcase,
-				 GStringChunk *strchunk);
+char* mu_str_process_text (const char *text)
+	 G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;

 /**
- * escape the string for use with xapian matching. in practice, if the
- * string contains an '@', replace '@', single-'.' with '_'. Also,
- * replace ':' with '_', if it's not following a xapian-prefix (such
- * as 'subject:', 't:' etc, as defined in mu-msg-fields.[ch]).
- * changing is done in-place (by changing the argument string). in any
- * case, the string will be downcased.
+ * Process some term (e.g., an e-mail address, subject field):
+ * remove accents, downcase, and replace punctuation and blanks by _
 *
- * @param query a query string
- * @param esc_space escape space characters as well
- * @param strchunk (optional) if non-NULL, allocate strings on strchunk
- *
- * @return the escaped string or NULL in case of error. User only
- * needs to free the returned string if a) return value != query and b)
- * strchunk was not provided.
+ * @param term some term
 *
+ * @return the processed text, free with g_free
 */
-char* mu_str_xapian_escape_in_place_try (char *query, gboolean esc_space,
-					 GStringChunk *strchunk);
+char* mu_str_process_term (const char *term)
+	 G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;
+

 /**
- * escape the string for use with xapian matching. in practice, if the
- * string contains an '@', replace '@', single-'.' with '_'. Also,
- * replace ':' with '_', if it's not following a xapian-prefix (such
- * as 'subject:', 't:' etc, as defined in mu-msg-fields.[ch]).
+ * Process some query term (e.g., an e-mail address, subject field):
+ * remove accents, downcase, replace punctuation and blanks by _, but
+ * leave the query metachars ':', '(', ')', '*', '"' and '..' alone.
 *
- * @param str a string
- * @param esc_space escape space characters as well
- * @param strchunk (optional) if non-NULL, allocate strings on strchunk
+ * @param qterm some query term
 *
- * @return the escaped string (free with g_free) or NULL in case of error
- * Unless strchunk was provided, user must g_free the string when
- * no longer needed
+ * @return the processed text, free with g_free
 */
-char* mu_str_xapian_escape (const char *str, gboolean esc_space,
-			    GStringChunk *strchunk)  G_GNUC_WARN_UNUSED_RESULT;
+char* mu_str_process_query_term (const char *qterm)
+	 G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;

-/**
- * escape the xapian term
- *
- * @param str a string
- * @param strchunk allocate strings on strchunk
- *
- * @return the escaped string, which is allocated in the strchunk
- */
-char* mu_str_xapian_escape_term (const char *term, GStringChunk *strchunk);

 /**
 * Fixup values for some fields in the DWIM manner:
@ -315,14 +271,14 @@ GSList* mu_str_to_list (const char *str, char sepa, gboolean strip);
 /**
 * convert a string (with possible escaping) to a list. list items are
 * separated by one or more spaces. list items can be quoted (using
- * '"'), and '"', ' ' and '\' use their special meaning when prefixed
- * with \.
+ * '"'); the quote characters are kept as part of the item.
 *
 * @param str a string
 *
- * @return a list of elements or NULL in case of error
+ * @return a list of elements, or NULL if str is NULL or contains no
+ * elements; free with mu_str_free_list
 */
-GSList* mu_str_esc_to_list (const char *str, GError **err);
+GSList* mu_str_esc_to_list (const char *str);


 /**