* mu: more 'fixing'/'massaging' of queries

This commit is contained in:
djcb 2013-04-17 00:09:08 +03:00
parent 9489370eb9
commit a5001acff0
6 changed files with 123 additions and 40 deletions

View File

@ -302,6 +302,8 @@ add_terms_values_number (Xapian::Document& doc, MuMsg *msg, MuMsgFieldId mfid)
}
/* for string and string-list */
static void
add_terms_values_str (Xapian::Document& doc, char *val,
@ -317,11 +319,11 @@ add_terms_values_str (Xapian::Document& doc, char *val,
termgen.index_text_without_positions (val, 1, prefix(mfid));
}
if (mu_msg_field_xapian_escape (mfid))
val= mu_str_xapian_escape_in_place_try (val, TRUE /*esc_space*/,
strchunk);
val = mu_str_xapian_escape_term (val, strchunk);
if (mu_msg_field_xapian_term(mfid))
doc.add_term (prefix(mfid) +
std::string(val, 0, _MuStore::MAX_TERM_LENGTH));
std::string(val, 0,
_MuStore::MAX_TERM_LENGTH));
}
@ -440,8 +442,7 @@ each_part (MuMsg *msg, MuMsgPart *part, PartData *pdata)
* on strchunk, no need to free*/
if ((fname = mu_msg_part_get_filename (part, FALSE))) {
char *val;
val = mu_str_xapian_escape (fname, TRUE /*esc space*/,
pdata->_strchunk);
val = mu_str_xapian_escape_term (fname, pdata->_strchunk);
g_free (fname);
pdata->_doc.add_term
(file + std::string(val, 0, MuStore::MAX_TERM_LENGTH));
@ -598,7 +599,7 @@ add_address_subfields (Xapian::Document& doc, const char *addr,
const std::string& pfx, GStringChunk *strchunk)
{
const char *at;
char *p1, *p2;
const char *p1, *p2;
/* add "foo" and "bar.com" as terms as well for
* "foo@bar.com" */
@ -608,14 +609,11 @@ add_address_subfields (Xapian::Document& doc, const char *addr,
p1 = g_strndup(addr, at - addr); // foo
p2 = g_strdup (at + 1);
p1 = mu_str_xapian_escape_in_place_try (p1, TRUE, strchunk);
p2 = mu_str_xapian_escape_in_place_try (p2, TRUE, strchunk);
p1 = mu_str_xapian_escape_term (p1, strchunk);
p2 = mu_str_xapian_escape_term (p2, strchunk);
doc.add_term (pfx + std::string(p1, 0, _MuStore::MAX_TERM_LENGTH));
doc.add_term (pfx + std::string(p2, 0, _MuStore::MAX_TERM_LENGTH));
g_free (p1);
g_free (p2);
}
static void
@ -643,8 +641,8 @@ each_contact_info (MuMsgContact *contact, MsgDoc *msgdoc)
char *escaped;
/* note: escaped is added to stringchunk, no need for
* freeing */
escaped = mu_str_xapian_escape (contact->address, FALSE,
msgdoc->_strchunk);
escaped = mu_str_xapian_escape_term (contact->address,
msgdoc->_strchunk);
msgdoc->_doc->add_term
(std::string (pfx + escaped, 0, MuStore::MAX_TERM_LENGTH));
add_address_subfields (*msgdoc->_doc, contact->address, pfx,

View File

@ -449,6 +449,35 @@ check_for_field (const char *str, gboolean *is_field,
*is_range_field = pfx.range_field;
}
/*
 * Tell whether 'c' is one of the characters that get replaced when
 * escaping a term (see the list below); returns TRUE if so, FALSE for
 * anything else, including the terminating '\0'.
 */
static gboolean
is_xapian_special_char (char c)
{
	/* the full set of characters considered special */
	static const char specials[] = "@.,/[]+- :()\"'*";
	const char *candidate;

	/* the loop stops at the terminator, so '\0' itself never matches */
	for (candidate = specials; *candidate != '\0'; ++candidate)
		if (*candidate == c)
			return TRUE;

	return FALSE;
}
/* replacement character written over characters that need escaping */
#define ESC_CHAR '_'
/*
* Xapian treats various characters such as '@', '-', ':' and '.'
* specially; function below is an ugly hack to make it DWIM in most
@ -460,44 +489,68 @@ char*
mu_str_xapian_escape_in_place_try (char *term, gboolean esc_space, GStringChunk *strchunk)
{
unsigned char *cur;
const char escchar = '_';
gboolean is_field, is_range_field;
char lookback;
gboolean is_field, is_range_field, quoted;
unsigned colon;
g_return_val_if_fail (term, NULL);
check_for_field (term, &is_field, &is_range_field);
for (colon = 0, cur = (unsigned char*)term; *cur; ++cur) {
for (colon = 0, lookback = 0, quoted=FALSE, cur = (unsigned char*)term;
*cur; ++cur) {
if (*cur == '\\')
quoted = !quoted;
switch (*cur) {
case '.': /* escape '..' if it's not a range field*/
if (is_range_field && cur[1] == '.')
cur += 1;
case '.': /* escape '..' if it's not a range field */
if (cur[1] == '.') {
if (!is_range_field) {
*cur = ESC_CHAR;
*(cur + 1) = ESC_CHAR;
}
++cur;
} else if (isblank(lookback) || isblank(cur[1]) ||
cur[1] == '\0')
*cur = ' ';
else
*cur = escchar;
*cur = ESC_CHAR;
break;
case ':':
/* if there's a registered xapian prefix
* before the *first* ':', don't touch
* it. Otherwise replace ':' with '_'... ugh
* it. Otherwise replace ':' with ' '... ugh
* yuck ugly...
*/
if (colon != 0 || !is_field)
*cur = escchar;
*cur = ' ';
++colon;
break;
case '@':
case '/':
case '[':
case ']':
case '+':
case '-':
*cur = ESC_CHAR;
break;
case ' ':
case '_':
case '(':
case ')':
case '"':
case '\'':
case '*': /* wildcard */
break;
break; /* leave as they are */
default:
/* escape all other special stuff */
/* turn other stuff into spaces */
if (*cur < 0x80 && !isalnum (*cur))
*cur = escchar;
*cur = ' ';
}
lookback = *cur;
}
/* downcase try to remove accents etc. */
@ -519,6 +572,26 @@ mu_str_xapian_escape (const char *query, gboolean esc_space, GStringChunk *strch
return mu_str_xapian_escape_in_place_try (mystr, esc_space, strchunk);
}
/*
 * Escape a single term: obtain a normalized copy of 'term' through
 * mu_str_normalize (passing TRUE as its second argument -- presumably
 * "downcase"; confirm against mu_str_normalize) and overwrite every
 * character for which is_xapian_special_char() holds with ESC_CHAR.
 *
 * Returns the escaped string, or NULL when 'term' or 'strchunk' is
 * NULL.
 */
char*
mu_str_xapian_escape_term (const char *term, GStringChunk *strchunk)
{
	char *escaped, *pos;

	g_return_val_if_fail (term, NULL);
	g_return_val_if_fail (strchunk, NULL);

	escaped = mu_str_normalize (term, TRUE, strchunk);

	/* replace the special characters in place */
	pos = escaped;
	while (*pos) {
		if (is_xapian_special_char (*pos))
			*pos = ESC_CHAR;
		++pos;
	}

	return escaped;
}
/*
* Split simple search term into prefix, expression and suffix.
* Meant to handle cases like "(maildir:/abc)", prefix and
@ -533,7 +606,7 @@ mu_str_xapian_escape (const char *query, gboolean esc_space, GStringChunk *strch
*/
static gboolean
split_term (const gchar *term,
const gchar **pfx, const gchar **cond, const gchar **sfx)
const gchar **pfx, const gchar **cond, const gchar **sfx)
{
size_t l;
const gchar *start, *tail;

View File

@ -177,6 +177,15 @@ char* mu_str_xapian_escape_in_place_try (char *query, gboolean esc_space,
char* mu_str_xapian_escape (const char *str, gboolean esc_space,
GStringChunk *strchunk) G_GNUC_WARN_UNUSED_RESULT;
/**
* escape a xapian term: the term is normalized, and every character
* that is treated as special (such as '@', '.', '-', ':' or ' ') is
* replaced with '_'
*
* @param term the term to escape
* @param strchunk allocate strings on strchunk; must not be NULL
*
* @return the escaped string, which is allocated in the strchunk, or
* NULL if term or strchunk is NULL
*/
char* mu_str_xapian_escape_term (const char *term, GStringChunk *strchunk);
/**
* Fixup values for some fields in the DWIM manner:

View File

@ -197,15 +197,15 @@ test_mu_str_xapian_escape (void)
{ "aap@noot.mies", "aap_noot_mies"},
{ "Foo..Bar", "foo__bar" },
{ "Foo.Bar", "foo_bar" },
{ "Foo. Bar", "foo__bar" },
{ "Foo. Bar", "foo bar" },
{ "subject:test@foo", "subject:test_foo" },
{ "xxx:test@bar", "xxx_test_bar" },
{ "aa$bb$cc", "aa_bb_cc" },
{ "xxx:test@bar", "xxx test_bar" },
{ "aa$bb$cc", "aa bb cc" },
{ "date:2010..2012", "date:2010..2012"},
{ "d:2010..2012", "d:2010..2012"},
{ "size:10..20", "size:10..20"},
{ "x:2010..2012", "x:2010__2012"},
{ "q:2010..2012", "q_2010__2012"},
{ "q:2010..2012", "q 2010__2012"},
{ "subject:2010..2012", "subject:2010__2012"},
{ "(maildir:foo)", "(maildir:foo)"},
{ "£300", "£300" }
@ -233,10 +233,10 @@ test_mu_str_xapian_escape_non_ascii (void)
const char* word;
const char* esc;
} words [] = {
{ "Тесла, Никола", "тесла__никола"},
{ "Тесла, Никола", "тесла никола"},
{ "Masha@Аркона.ru", "masha_аркона_ru" },
{ "foo:ελληνικά", "foo_ελληνικά" },
{ "日本語!!", "日本語__" },
{ "foo:ελληνικά", "foo ελληνικά" },
{ "日本語!!", "日本語 " },
{ "", "" }
};

View File

@ -24,6 +24,8 @@
#include <glib.h>
#include <glib/gstdio.h>
#include <string.h>
#include <errno.h>
#include "../mu-query.h"
@ -298,12 +300,11 @@ test_mu_find_links (void)
/* NOTE(review): the second argument to search() looks like the
 * expected number of matches -- confirm against search() */
static void
test_mu_find_maildir_special (void)
{
/* ensure that maildirs with spaces in their names work... */
search ("\"maildir:/wom bat\" subject:atoms", 1);
search ("\"maildir:/wom_bat\" subject:atoms", 1);
search ("\"maildir:/wOm_bàT\"", 3);
search ("\"maildir:/wOm*\"", 3);
search ("\"maildir:/wOm *\"", 3);
search ("\"maildir:wom bat\"", 0);
search ("\"maildir:/wOm_*\"", 3);
search ("\"maildir:wom_bat\"", 0);
search ("\"maildir:/wombat\"", 0);
search ("subject:atoms", 1);
}
@ -366,8 +367,10 @@ get_file_size (const char* path)
struct stat statbuf;
rv = stat (path, &statbuf);
if (rv != 0)
if (rv != 0) {
/* g_warning ("error: %s", strerror (errno)); */
return -1;
}
return (gint64)statbuf.st_size;
}

View File

@ -213,7 +213,7 @@ test_mu_query_03 (void)
{ "s:LISP", 1},
{ "s:\"Re: Learning LISP; Scheme vs elisp.\"", 1},
{ "subject:Re: Learning LISP; Scheme vs elisp.", 0},
{ "subject:Re: Learning LISP; Scheme vs elisp.", 1},
{ "subject:\"Re: Learning LISP; Scheme vs elisp.\"", 1},
{ "to:help-gnu-emacs@gnu.org", 4},
{ "t:help-gnu-emacs", 4},
@ -530,7 +530,7 @@ test_mu_query_tags (void)
{ "tag:lost tag:paradise", 1},
{ "tag:lost tag:horizon", 0},
{ "tag:lost OR tag:horizon", 1},
{ "x:paradise,lost", 0},
{ "x:paradise,lost", 1},
};
for (i = 0; i != G_N_ELEMENTS(queries); ++i)