* mu-str.[ch]: add mu_str_ascii_xapian_escape_in_place, for escaping some

Xapian fields; also add some tests
2010-11-29 21:21:55 +02:00 · 2010-11-29 21:21:55 +02:00 · bb5b1304e5
parent c6dadad978
commit bb5b1304e5
3 changed files with 180 additions and 4 deletions
--- a/src/mu-str.c
+++ b/src/mu-str.c
@ -30,7 +30,7 @@

 #include "mu-str.h"
 #include "mu-msg-flags.h"
-
+#include "mu-msg-fields.h"

 const char* 
 mu_str_date_s (const char* frm, time_t t)
@ -226,3 +226,104 @@ mu_date_parse_hdwmy (const char* str)
 	return delta <= now ? now - delta : never;  
 }

+
+/* state handed to each_check_prefix while walking the message fields */
+typedef struct _CheckPrefix {
+	const char	*pfx;	/* start of the candidate prefix inside the query */
+	guint		 len;	/* number of characters between pfx and the ':' */
+	gboolean	 match;	/* becomes TRUE as soon as one field matches */
+} CheckPrefix;
+
+/* callback for mu_msg_field_foreach: set cpfx->match to TRUE when the
+ * candidate prefix (the cpfx->len characters starting at cpfx->pfx)
+ * equals either the one-character shortcut or the complete name of
+ * field mfid. does nothing when cpfx is NULL or when an earlier field
+ * already matched. */
+static void
+each_check_prefix (MuMsgFieldId mfid, CheckPrefix *cpfx)
+{
+	const char *field_name;
+	char field_shortcut;
+
+	if (!cpfx || cpfx->match)
+		return;
+
+	/* single-letter form, e.g. 's:' */
+	field_shortcut = mu_msg_field_shortcut (mfid);
+	if (field_shortcut == cpfx->pfx[0] && cpfx->pfx[1] == ':') {
+		cpfx->match = TRUE;
+		return;
+	}
+
+	/* long form: the candidate must be the *whole* field name.
+	 * comparing only the first cpfx->len characters would also
+	 * accept truncated names (say, 'sub:' for a field called
+	 * 'subject'), so additionally require that the field name ends
+	 * right where the candidate does */
+	field_name = mu_msg_field_name (mfid);
+	if (field_name &&
+	    strncmp (cpfx->pfx, field_name, cpfx->len) == 0 &&
+	    field_name[cpfx->len] == '\0')
+		cpfx->match = TRUE;
+}
+
+/* colon is a position inside q pointing at a ':' character. function
+ * determines whether the word directly in front of that ':' is a
+ * registered prefix (like 'subject' or 'from' or 's'). the word starts
+ * at the beginning of q or right after the nearest preceding
+ * non-alphabetic character. returns TRUE if some message field
+ * matches, FALSE otherwise. */
+static gboolean
+is_xapian_prefix (const char *q, const char *colon)
+{
+	const char *cur;
+	CheckPrefix cpfx;
+
+	if (colon <= q)
+		return FALSE; /* : at beginning, not a prefix */
+
+	/* track back from colon until a boundary or beginning of the
+	 * str; the cast keeps isalpha well-defined for bytes >= 0x80,
+	 * which are negative when plain char is signed */
+	for (cur = colon - 1;
+	     cur != q && isalpha ((unsigned char)cur[-1]);
+	     --cur)
+		;
+
+	cpfx.pfx   = cur;
+	cpfx.len   = (guint)(colon - cur);
+	cpfx.match = FALSE;
+
+	mu_msg_field_foreach ((MuMsgFieldForEachFunc)each_check_prefix,
+			      &cpfx);
+
+	return cpfx.match;
+}
+
+/* escape query in place for xapian matching: '@' becomes '_'; when the
+ * string contains an '@', a single '.' becomes '_' as well while a
+ * '..' pair is kept; a ':' that does not follow a registered field
+ * prefix becomes '_'; every other character is lowercased. returns
+ * query, or NULL when query is NULL. */
+char*
+mu_str_ascii_xapian_escape_in_place (char *query)
+{
+	gchar *cur;
+	gboolean replace_dot;
+
+	g_return_val_if_fail (query, NULL);
+
+	/* only replace the '.' if the string looks like an e-mail
+	 * address or msg-id */
+	replace_dot = (g_strstr_len(query, -1, "@") != NULL);
+
+	for (cur = query; *cur; ++cur) {
+		if (*cur == '@') {
+			*cur = '_';
+		} else if (replace_dot && *cur == '.') {
+			if (cur[1] == '.') {
+				/* don't replace '..': step onto the second
+				 * dot only, the loop increment then moves
+				 * past it. advancing any further would skip
+				 * the next character and, for a string
+				 * ending in '..', run past the terminating
+				 * NUL */
+				++cur;
+			} else {
+				*cur = '_';
+			}
+		} else if (*cur == ':') {
+			/* if there's a registered xapian prefix before the
+			 * ':', don't touch it. Otherwise replace ':' with
+			 * an underscore... ugly...
+			 */
+			if (!is_xapian_prefix (query, cur))
+				*cur = '_';
+		} else {
+			/* cast: tolower is undefined for negative values,
+			 * which plain char yields for bytes >= 0x80 */
+			*cur = tolower ((unsigned char)*cur);
+		}
+	}
+
+	return query;
+}
+
--- a/src/mu-str.h
+++ b/src/mu-str.h
@ -138,7 +138,7 @@ char* mu_str_summarize (const char* str,
 * 'Latin-1 Supplement' and 'Latin Extended-A'
 *
 * @param str a valid utf8 string or NULL
- * @param downcase if TRUE, convert the string to lowercase
+ * @param downcase if TRUE, convert the string to lowercase 
 * 
 * @return the normalize string, or NULL in case of error or str was NULL
 */
@ -153,12 +153,29 @@ char* mu_str_normalize (const char *str, gboolean downcase);
 * 
 * @param str a valid utf8 string or NULL
 * @param downcase if TRUE, convert the string to lowercase
- * 
- * @return the normalize string, or NULL in case of error or str was NULL
+ *  
+ * @return the normalized string, or NULL in case of error or str was
+ * NULL
 */
 char* mu_str_normalize_in_place (char *str, gboolean downcase);


+/**
+ * escape the string for use with xapian matching. in practice, if the
+ * string contains an '@', replace '@' and each single '.' with '_' (a
+ * '..' sequence is not replaced). Also, replace ':' with '_', if it's
+ * not following a xapian-prefix (such as 'subject:', 't:' etc, as
+ * defined in mu-msg-fields.[ch]). changing is done in-place (by
+ * changing the argument string). in any case, the string will be
+ * downcased.
+ *
+ * works for ascii strings, like e-mail addresses and message-ids.
+ * 
+ * @param query a query string; it is modified by this function
+ * 
+ * @return the escaped string (the same pointer as query), or NULL in
+ * case of error
+ */
+char* mu_str_ascii_xapian_escape_in_place (char *query);
+
 /**
 * 
 * parse strings like 1h, 3w, 2m to mean '1 hour before now', '3 weeks
--- a/src/tests/test-mu-str.c
+++ b/src/tests/test-mu-str.c
@ -152,6 +152,59 @@ test_mu_str_normalize_01 (void)
 }


+/* run mu_str_normalize with downcase == FALSE over a table of words
+ * and compare each result with the expected string */
+static void
+test_mu_str_normalize_02 (void)
+{
+	struct {
+		const char	*input;
+		const char	*expected;
+	} cases [] = {
+		{ "DantèS", "DanteS"},
+		{ "foo", "foo" },
+		{ "Föö", "Foo" },
+		{ "číslO", "cislO" },
+		{ "hÆvý mëÐal ümláõt", "hAevy meDal umlaot"}
+	};
+	guint idx;
+
+	for (idx = 0; idx < G_N_ELEMENTS(cases); ++idx) {
+		gchar *normalized;
+
+		normalized = mu_str_normalize (cases[idx].input, FALSE);
+		g_assert_cmpstr (normalized, ==, cases[idx].expected);
+		g_free (normalized);
+	}
+}
+
+
+/* escape a copy of every input in place and compare the outcome with
+ * the expected string */
+static void
+test_mu_str_ascii_xapian_escape (void)
+{
+	struct {
+		const char	*input;
+		const char	*expected;
+	} cases [] = {
+		{ "aap@noot.mies", "aap_noot_mies"},
+		{ "Foo..Bar", "foo..bar" },
+		{ "subject:test@foo", "subject:test_foo" },
+		{ "xxx:test@bar", "xxx_test_bar" },
+	};
+	guint idx;
+
+	for (idx = 0; idx < G_N_ELEMENTS(cases); ++idx) {
+		/* the function writes into its argument, so hand it a
+		 * private copy of the literal */
+		gchar *copy = g_strdup (cases[idx].input);
+
+		mu_str_ascii_xapian_escape_in_place (copy);
+		g_assert_cmpstr (copy, ==, cases[idx].expected);
+		g_free (copy);
+	}
+}
+
+
+
+
+
+
+
 #if 0

 static void
@ -233,6 +286,11 @@ main (int argc, char *argv[])
 	/* mu_str_normalize */
 	g_test_add_func ("/mu-str/mu-str-normalize-01",
 			 test_mu_str_normalize_01);
+	g_test_add_func ("/mu-str/mu-str-normalize-02",
+					 test_mu_str_normalize_02);
+
+	g_test_add_func ("/mu-str/mu-str-ascii-xapian-escape",
+					 test_mu_str_ascii_xapian_escape);
 	
 	/* mu_str_complete_iso_date_(begin|end) */
 	/* g_test_add_func ("/mu-str/mu-str-complete-iso-date-begin", */