mirror of https://github.com/djcb/mu.git
* mu: more 'fixing'/'massaging' of queries
This commit is contained in:
parent
9489370eb9
commit
a5001acff0
|
@ -302,6 +302,8 @@ add_terms_values_number (Xapian::Document& doc, MuMsg *msg, MuMsgFieldId mfid)
|
|||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/* for string and string-list */
|
||||
static void
|
||||
add_terms_values_str (Xapian::Document& doc, char *val,
|
||||
|
@ -317,11 +319,11 @@ add_terms_values_str (Xapian::Document& doc, char *val,
|
|||
termgen.index_text_without_positions (val, 1, prefix(mfid));
|
||||
}
|
||||
if (mu_msg_field_xapian_escape (mfid))
|
||||
val= mu_str_xapian_escape_in_place_try (val, TRUE /*esc_space*/,
|
||||
strchunk);
|
||||
val = mu_str_xapian_escape_term (val, strchunk);
|
||||
if (mu_msg_field_xapian_term(mfid))
|
||||
doc.add_term (prefix(mfid) +
|
||||
std::string(val, 0, _MuStore::MAX_TERM_LENGTH));
|
||||
std::string(val, 0,
|
||||
_MuStore::MAX_TERM_LENGTH));
|
||||
}
|
||||
|
||||
|
||||
|
@ -440,8 +442,7 @@ each_part (MuMsg *msg, MuMsgPart *part, PartData *pdata)
|
|||
* on strchunk, no need to free*/
|
||||
if ((fname = mu_msg_part_get_filename (part, FALSE))) {
|
||||
char *val;
|
||||
val = mu_str_xapian_escape (fname, TRUE /*esc space*/,
|
||||
pdata->_strchunk);
|
||||
val = mu_str_xapian_escape_term (fname, pdata->_strchunk);
|
||||
g_free (fname);
|
||||
pdata->_doc.add_term
|
||||
(file + std::string(val, 0, MuStore::MAX_TERM_LENGTH));
|
||||
|
@ -598,7 +599,7 @@ add_address_subfields (Xapian::Document& doc, const char *addr,
|
|||
const std::string& pfx, GStringChunk *strchunk)
|
||||
{
|
||||
const char *at;
|
||||
char *p1, *p2;
|
||||
const char *p1, *p2;
|
||||
|
||||
/* add "foo" and "bar.com" as terms as well for
|
||||
* "foo@bar.com" */
|
||||
|
@ -608,14 +609,11 @@ add_address_subfields (Xapian::Document& doc, const char *addr,
|
|||
p1 = g_strndup(addr, at - addr); // foo
|
||||
p2 = g_strdup (at + 1);
|
||||
|
||||
p1 = mu_str_xapian_escape_in_place_try (p1, TRUE, strchunk);
|
||||
p2 = mu_str_xapian_escape_in_place_try (p2, TRUE, strchunk);
|
||||
p1 = mu_str_xapian_escape_term (p1, strchunk);
|
||||
p2 = mu_str_xapian_escape_term (p2, strchunk);
|
||||
|
||||
doc.add_term (pfx + std::string(p1, 0, _MuStore::MAX_TERM_LENGTH));
|
||||
doc.add_term (pfx + std::string(p2, 0, _MuStore::MAX_TERM_LENGTH));
|
||||
|
||||
g_free (p1);
|
||||
g_free (p2);
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -643,8 +641,8 @@ each_contact_info (MuMsgContact *contact, MsgDoc *msgdoc)
|
|||
char *escaped;
|
||||
/* note: escaped is added to stringchunk, no need for
|
||||
* freeing */
|
||||
escaped = mu_str_xapian_escape (contact->address, FALSE,
|
||||
msgdoc->_strchunk);
|
||||
escaped = mu_str_xapian_escape_term (contact->address,
|
||||
msgdoc->_strchunk);
|
||||
msgdoc->_doc->add_term
|
||||
(std::string (pfx + escaped, 0, MuStore::MAX_TERM_LENGTH));
|
||||
add_address_subfields (*msgdoc->_doc, contact->address, pfx,
|
||||
|
|
99
lib/mu-str.c
99
lib/mu-str.c
|
@ -449,6 +449,35 @@ check_for_field (const char *str, gboolean *is_field,
|
|||
*is_range_field = pfx.range_field;
|
||||
}
|
||||
|
||||
|
||||
static gboolean
|
||||
is_xapian_special_char (char c)
|
||||
{
|
||||
switch (c) {
|
||||
|
||||
case '@':
|
||||
case '.':
|
||||
case ',':
|
||||
case '/':
|
||||
case '[':
|
||||
case ']':
|
||||
case '+':
|
||||
case '-':
|
||||
case ' ':
|
||||
case ':':
|
||||
case '(':
|
||||
case ')':
|
||||
case '"':
|
||||
case '\'':
|
||||
case '*':
|
||||
return TRUE;
|
||||
default:
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
#define ESC_CHAR '_'
|
||||
|
||||
/*
|
||||
* Xapian treats various characters such as '@', '-', ':' and '.'
|
||||
* specially; function below is an ugly hack to make it DWIM in most
|
||||
|
@ -460,44 +489,68 @@ char*
|
|||
mu_str_xapian_escape_in_place_try (char *term, gboolean esc_space, GStringChunk *strchunk)
|
||||
{
|
||||
unsigned char *cur;
|
||||
const char escchar = '_';
|
||||
gboolean is_field, is_range_field;
|
||||
char lookback;
|
||||
gboolean is_field, is_range_field, quoted;
|
||||
unsigned colon;
|
||||
|
||||
g_return_val_if_fail (term, NULL);
|
||||
|
||||
check_for_field (term, &is_field, &is_range_field);
|
||||
|
||||
for (colon = 0, cur = (unsigned char*)term; *cur; ++cur) {
|
||||
for (colon = 0, lookback = 0, quoted=FALSE, cur = (unsigned char*)term;
|
||||
*cur; ++cur) {
|
||||
|
||||
if (*cur == '\\')
|
||||
quoted = !quoted;
|
||||
|
||||
switch (*cur) {
|
||||
|
||||
case '.': /* escape '..' if it's not a range field*/
|
||||
if (is_range_field && cur[1] == '.')
|
||||
cur += 1;
|
||||
case '.': /* escape '..' if it's not a range field */
|
||||
if (cur[1] == '.') {
|
||||
if (!is_range_field) {
|
||||
*cur = ESC_CHAR;
|
||||
*(cur + 1) = ESC_CHAR;
|
||||
}
|
||||
++cur;
|
||||
} else if (isblank(lookback) || isblank(cur[1]) ||
|
||||
cur[1] == '\0')
|
||||
*cur = ' ';
|
||||
else
|
||||
*cur = escchar;
|
||||
*cur = ESC_CHAR;
|
||||
break;
|
||||
case ':':
|
||||
/* if there's a registered xapian prefix
|
||||
* before the *first* ':', don't touch
|
||||
* it. Otherwise replace ':' with '_'... ugh
|
||||
* it. Otherwise replace ':' with ' '... ugh
|
||||
* yuck ugly...
|
||||
*/
|
||||
if (colon != 0 || !is_field)
|
||||
*cur = escchar;
|
||||
*cur = ' ';
|
||||
++colon;
|
||||
break;
|
||||
case '@':
|
||||
case '/':
|
||||
case '[':
|
||||
case ']':
|
||||
case '+':
|
||||
case '-':
|
||||
*cur = ESC_CHAR;
|
||||
break;
|
||||
case ' ':
|
||||
case '_':
|
||||
case '(':
|
||||
case ')':
|
||||
case '"':
|
||||
case '\'':
|
||||
case '*': /* wildcard */
|
||||
break;
|
||||
break; /* leave as they are */
|
||||
default:
|
||||
/* escape all other special stuff */
|
||||
/* turn other stuff into spaces */
|
||||
if (*cur < 0x80 && !isalnum (*cur))
|
||||
*cur = escchar;
|
||||
*cur = ' ';
|
||||
}
|
||||
|
||||
lookback = *cur;
|
||||
}
|
||||
|
||||
/* downcase, try to remove accents etc. */
|
||||
|
@ -519,6 +572,26 @@ mu_str_xapian_escape (const char *query, gboolean esc_space, GStringChunk *strch
|
|||
return mu_str_xapian_escape_in_place_try (mystr, esc_space, strchunk);
|
||||
}
|
||||
|
||||
|
||||
char*
|
||||
mu_str_xapian_escape_term (const char *term, GStringChunk *strchunk)
|
||||
{
|
||||
char *cur, *esc;
|
||||
|
||||
g_return_val_if_fail (term, NULL);
|
||||
g_return_val_if_fail (strchunk, NULL);
|
||||
|
||||
for (cur = esc = mu_str_normalize (term, TRUE, strchunk);
|
||||
*cur; ++cur) {
|
||||
if (is_xapian_special_char (*cur))
|
||||
*cur = ESC_CHAR;
|
||||
}
|
||||
|
||||
return esc;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Split simple search term into prefix, expression and suffix.
|
||||
* Meant to handle cases like "(maildir:/abc)", prefix and
|
||||
|
@ -533,7 +606,7 @@ mu_str_xapian_escape (const char *query, gboolean esc_space, GStringChunk *strch
|
|||
*/
|
||||
static gboolean
|
||||
split_term (const gchar *term,
|
||||
const gchar **pfx, const gchar **cond, const gchar **sfx)
|
||||
const gchar **pfx, const gchar **cond, const gchar **sfx)
|
||||
{
|
||||
size_t l;
|
||||
const gchar *start, *tail;
|
||||
|
|
|
@ -177,6 +177,15 @@ char* mu_str_xapian_escape_in_place_try (char *query, gboolean esc_space,
|
|||
char* mu_str_xapian_escape (const char *str, gboolean esc_space,
|
||||
GStringChunk *strchunk) G_GNUC_WARN_UNUSED_RESULT;
|
||||
|
||||
/**
|
||||
* escape the xapian term
|
||||
*
|
||||
* @param term the term to escape
|
||||
* @param strchunk allocate strings on strchunk
|
||||
*
|
||||
* @return the escaped string, which is allocated in the strchunk
|
||||
*/
|
||||
char* mu_str_xapian_escape_term (const char *term, GStringChunk *strchunk);
|
||||
|
||||
/**
|
||||
* Fixup values for some fields in the DWIM manner:
|
||||
|
|
|
@ -197,15 +197,15 @@ test_mu_str_xapian_escape (void)
|
|||
{ "aap@noot.mies", "aap_noot_mies"},
|
||||
{ "Foo..Bar", "foo__bar" },
|
||||
{ "Foo.Bar", "foo_bar" },
|
||||
{ "Foo. Bar", "foo__bar" },
|
||||
{ "Foo. Bar", "foo bar" },
|
||||
{ "subject:test@foo", "subject:test_foo" },
|
||||
{ "xxx:test@bar", "xxx_test_bar" },
|
||||
{ "aa$bb$cc", "aa_bb_cc" },
|
||||
{ "xxx:test@bar", "xxx test_bar" },
|
||||
{ "aa$bb$cc", "aa bb cc" },
|
||||
{ "date:2010..2012", "date:2010..2012"},
|
||||
{ "d:2010..2012", "d:2010..2012"},
|
||||
{ "size:10..20", "size:10..20"},
|
||||
{ "x:2010..2012", "x:2010__2012"},
|
||||
{ "q:2010..2012", "q_2010__2012"},
|
||||
{ "q:2010..2012", "q 2010__2012"},
|
||||
{ "subject:2010..2012", "subject:2010__2012"},
|
||||
{ "(maildir:foo)", "(maildir:foo)"},
|
||||
{ "£300", "£300" }
|
||||
|
@ -233,10 +233,10 @@ test_mu_str_xapian_escape_non_ascii (void)
|
|||
const char* word;
|
||||
const char* esc;
|
||||
} words [] = {
|
||||
{ "Тесла, Никола", "тесла__никола"},
|
||||
{ "Тесла, Никола", "тесла никола"},
|
||||
{ "Masha@Аркона.ru", "masha_аркона_ru" },
|
||||
{ "foo:ελληνικά", "foo_ελληνικά" },
|
||||
{ "日本語!!", "日本語__" },
|
||||
{ "foo:ελληνικά", "foo ελληνικά" },
|
||||
{ "日本語!!", "日本語 " },
|
||||
{ "£", "£" }
|
||||
};
|
||||
|
||||
|
|
|
@ -24,6 +24,8 @@
|
|||
|
||||
#include <glib.h>
|
||||
#include <glib/gstdio.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include "../mu-query.h"
|
||||
|
||||
|
@ -298,12 +300,11 @@ test_mu_find_links (void)
|
|||
static void
|
||||
test_mu_find_maildir_special (void)
|
||||
{
|
||||
/* ensure that maildirs with spaces in their names work... */
|
||||
search ("\"maildir:/wom bat\" subject:atoms", 1);
|
||||
search ("\"maildir:/wom_bat\" subject:atoms", 1);
|
||||
search ("\"maildir:/wOm_bàT\"", 3);
|
||||
search ("\"maildir:/wOm*\"", 3);
|
||||
search ("\"maildir:/wOm *\"", 3);
|
||||
search ("\"maildir:wom bat\"", 0);
|
||||
search ("\"maildir:/wOm_*\"", 3);
|
||||
search ("\"maildir:wom_bat\"", 0);
|
||||
search ("\"maildir:/wombat\"", 0);
|
||||
search ("subject:atoms", 1);
|
||||
}
|
||||
|
@ -366,8 +367,10 @@ get_file_size (const char* path)
|
|||
struct stat statbuf;
|
||||
|
||||
rv = stat (path, &statbuf);
|
||||
if (rv != 0)
|
||||
if (rv != 0) {
|
||||
/* g_warning ("error: %s", strerror (errno)); */
|
||||
return -1;
|
||||
}
|
||||
|
||||
return (gint64)statbuf.st_size;
|
||||
}
|
||||
|
|
|
@ -213,7 +213,7 @@ test_mu_query_03 (void)
|
|||
{ "s:LISP", 1},
|
||||
|
||||
{ "s:\"Re: Learning LISP; Scheme vs elisp.\"", 1},
|
||||
{ "subject:Re: Learning LISP; Scheme vs elisp.", 0},
|
||||
{ "subject:Re: Learning LISP; Scheme vs elisp.", 1},
|
||||
{ "subject:\"Re: Learning LISP; Scheme vs elisp.\"", 1},
|
||||
{ "to:help-gnu-emacs@gnu.org", 4},
|
||||
{ "t:help-gnu-emacs", 4},
|
||||
|
@ -530,7 +530,7 @@ test_mu_query_tags (void)
|
|||
{ "tag:lost tag:paradise", 1},
|
||||
{ "tag:lost tag:horizon", 0},
|
||||
{ "tag:lost OR tag:horizon", 1},
|
||||
{ "x:paradise,lost", 0},
|
||||
{ "x:paradise,lost", 1},
|
||||
};
|
||||
|
||||
for (i = 0; i != G_N_ELEMENTS(queries); ++i)
|
||||
|
|
Loading…
Reference in New Issue