diff --git a/lib/mu-store-write.cc b/lib/mu-store-write.cc index 3366cd7f..4d5a02b2 100644 --- a/lib/mu-store-write.cc +++ b/lib/mu-store-write.cc @@ -302,6 +302,8 @@ add_terms_values_number (Xapian::Document& doc, MuMsg *msg, MuMsgFieldId mfid) } + + /* for string and string-list */ static void add_terms_values_str (Xapian::Document& doc, char *val, @@ -317,11 +319,11 @@ add_terms_values_str (Xapian::Document& doc, char *val, termgen.index_text_without_positions (val, 1, prefix(mfid)); } if (mu_msg_field_xapian_escape (mfid)) - val= mu_str_xapian_escape_in_place_try (val, TRUE /*esc_space*/, - strchunk); + val = mu_str_xapian_escape_term (val, strchunk); if (mu_msg_field_xapian_term(mfid)) doc.add_term (prefix(mfid) + - std::string(val, 0, _MuStore::MAX_TERM_LENGTH)); + std::string(val, 0, + _MuStore::MAX_TERM_LENGTH)); } @@ -440,8 +442,7 @@ each_part (MuMsg *msg, MuMsgPart *part, PartData *pdata) * on strchunk, no need to free*/ if ((fname = mu_msg_part_get_filename (part, FALSE))) { char *val; - val = mu_str_xapian_escape (fname, TRUE /*esc space*/, - pdata->_strchunk); + val = mu_str_xapian_escape_term (fname, pdata->_strchunk); g_free (fname); pdata->_doc.add_term (file + std::string(val, 0, MuStore::MAX_TERM_LENGTH)); @@ -598,7 +599,7 @@ add_address_subfields (Xapian::Document& doc, const char *addr, const std::string& pfx, GStringChunk *strchunk) { const char *at; - char *p1, *p2; + const char *p1, *p2; /* add "foo" and "bar.com" as terms as well for * "foo@bar.com" */ @@ -608,14 +609,11 @@ add_address_subfields (Xapian::Document& doc, const char *addr, p1 = g_strndup(addr, at - addr); // foo p2 = g_strdup (at + 1); - p1 = mu_str_xapian_escape_in_place_try (p1, TRUE, strchunk); - p2 = mu_str_xapian_escape_in_place_try (p2, TRUE, strchunk); + p1 = mu_str_xapian_escape_term (p1, strchunk); + p2 = mu_str_xapian_escape_term (p2, strchunk); doc.add_term (pfx + std::string(p1, 0, _MuStore::MAX_TERM_LENGTH)); doc.add_term (pfx + std::string(p2, 0, _MuStore::MAX_TERM_LENGTH)); - - g_free (p1); - g_free (p2); } static void @@ -643,8 +641,8 @@ each_contact_info (MuMsgContact *contact, MsgDoc *msgdoc) char *escaped; /* note: escaped is added to stringchunk, no need for * freeing */ - escaped = mu_str_xapian_escape (contact->address, FALSE, - msgdoc->_strchunk); + escaped = mu_str_xapian_escape_term (contact->address, + msgdoc->_strchunk); msgdoc->_doc->add_term (std::string (pfx + escaped, 0, MuStore::MAX_TERM_LENGTH)); add_address_subfields (*msgdoc->_doc, contact->address, pfx, diff --git a/lib/mu-str.c b/lib/mu-str.c index c353616c..343f41de 100644 --- a/lib/mu-str.c +++ b/lib/mu-str.c @@ -449,6 +449,35 @@ check_for_field (const char *str, gboolean *is_field, *is_range_field = pfx.range_field; } + +static gboolean +is_xapian_special_char (char c) +{ + switch (c) { + + case '@': + case '.': + case ',': + case '/': + case '[': + case ']': + case '+': + case '-': + case ' ': + case ':': + case '(': + case ')': + case '"': + case '\'': + case '*': + return TRUE; + default: + return FALSE; + } +} + +#define ESC_CHAR '_' + /* * Xapian treats various characters such as '@', '-', ':' and '.' * specially; function below is an ugly hack to make it DWIM in most @@ -460,44 +489,68 @@ char* mu_str_xapian_escape_in_place_try (char *term, gboolean esc_space, GStringChunk *strchunk) { unsigned char *cur; - const char escchar = '_'; - gboolean is_field, is_range_field; + char lookback; + gboolean is_field, is_range_field, quoted; unsigned colon; g_return_val_if_fail (term, NULL); check_for_field (term, &is_field, &is_range_field); - for (colon = 0, cur = (unsigned char*)term; *cur; ++cur) { + for (colon = 0, lookback = 0, quoted=FALSE, cur = (unsigned char*)term; + *cur; ++cur) { + + if (*cur == '\\') + quoted = !quoted; switch (*cur) { - case '.': /* escape '..' if it's not a range field*/ - if (is_range_field && cur[1] == '.') - cur += 1; + case '.': /* escape '..' if it's not a range field */ + if (cur[1] == '.') { + if (!is_range_field) { + *cur = ESC_CHAR; + *(cur + 1) = ESC_CHAR; + } + ++cur; + } else if (isblank(lookback) || isblank(cur[1]) || + cur[1] == '\0') + *cur = ' '; else - *cur = escchar; + *cur = ESC_CHAR; break; case ':': /* if there's a registered xapian prefix * before the *first* ':', don't touch - * it. Otherwise replace ':' with '_'... ugh + * it. Otherwise replace ':' with ' '... ugh * yuck ugly... */ if (colon != 0 || !is_field) - *cur = escchar; + *cur = ' '; ++colon; break; + case '@': + case '/': + case '[': + case ']': + case '+': + case '-': + *cur = ESC_CHAR; + break; + case ' ': + case '_': case '(': case ')': + case '"': case '\'': case '*': /* wildcard */ - break; + break; /* leave as they are */ default: - /* escape all other special stuff */ + /* turn other stuff into spaces */ if (*cur < 0x80 && !isalnum (*cur)) - *cur = escchar; + *cur = ' '; } + + lookback = *cur; } /* downcase try to remove accents etc. */ @@ -519,6 +572,26 @@ mu_str_xapian_escape (const char *query, gboolean esc_space, GStringChunk *strch return mu_str_xapian_escape_in_place_try (mystr, esc_space, strchunk); } + +char* +mu_str_xapian_escape_term (const char *term, GStringChunk *strchunk) +{ + char *cur, *esc; + + g_return_val_if_fail (term, NULL); + g_return_val_if_fail (strchunk, NULL); + + for (cur = esc = mu_str_normalize (term, TRUE, strchunk); + *cur; ++cur) { + if (is_xapian_special_char (*cur)) + *cur = ESC_CHAR; + } + + return esc; +} + + + /* * Split simple search term into prefix, expression and suffix. * Meant to handle cases like "(maildir:/abc)", prefix and @@ -533,7 +606,7 @@ mu_str_xapian_escape (const char *query, gboolean esc_space, GStringChunk *strch */ static gboolean split_term (const gchar *term, - const gchar **pfx, const gchar **cond, const gchar **sfx) + const gchar **pfx, const gchar **cond, const gchar **sfx) { size_t l; const gchar *start, *tail; diff --git a/lib/mu-str.h b/lib/mu-str.h index 38fa8ade..88a82095 100644 --- a/lib/mu-str.h +++ b/lib/mu-str.h @@ -177,6 +177,15 @@ char* mu_str_xapian_escape_in_place_try (char *query, gboolean esc_space, char* mu_str_xapian_escape (const char *str, gboolean esc_space, GStringChunk *strchunk) G_GNUC_WARN_UNUSED_RESULT; +/** + * escape the xapian term + * + * @param str a string + * @param strchunk allocate strings on strchunk + * + * @return the escaped string, which is allocated in the strchunk + */ +char* mu_str_xapian_escape_term (const char *term, GStringChunk *strchunk); /** * Fixup values for some fields in the DWIM manner: diff --git a/lib/tests/test-mu-str.c b/lib/tests/test-mu-str.c index 04901025..680423b0 100644 --- a/lib/tests/test-mu-str.c +++ b/lib/tests/test-mu-str.c @@ -197,15 +197,15 @@ test_mu_str_xapian_escape (void) { "aap@noot.mies", "aap_noot_mies"}, { "Foo..Bar", "foo__bar" }, { "Foo.Bar", "foo_bar" }, - { "Foo. Bar", "foo__bar" }, + { "Foo. Bar", "foo bar" }, { "subject:test@foo", "subject:test_foo" }, - { "xxx:test@bar", "xxx_test_bar" }, - { "aa$bb$cc", "aa_bb_cc" }, + { "xxx:test@bar", "xxx test_bar" }, + { "aa$bb$cc", "aa bb cc" }, { "date:2010..2012", "date:2010..2012"}, { "d:2010..2012", "d:2010..2012"}, { "size:10..20", "size:10..20"}, { "x:2010..2012", "x:2010__2012"}, - { "q:2010..2012", "q_2010__2012"}, + { "q:2010..2012", "q 2010__2012"}, { "subject:2010..2012", "subject:2010__2012"}, { "(maildir:foo)", "(maildir:foo)"}, { "£300", "£300" } @@ -233,10 +233,10 @@ test_mu_str_xapian_escape_non_ascii (void) const char* word; const char* esc; } words [] = { - { "Тесла, Никола", "тесла__никола"}, + { "Тесла, Никола", "тесла никола"}, { "Masha@Аркона.ru", "masha_аркона_ru" }, - { "foo:ελληνικά", "foo_ελληνικά" }, - { "日本語!!", "日本語__" }, + { "foo:ελληνικά", "foo ελληνικά" }, + { "日本語!!", "日本語 " }, { "£", "£" } }; diff --git a/mu/tests/test-mu-cmd.c b/mu/tests/test-mu-cmd.c index 58fb1918..2f60373f 100644 --- a/mu/tests/test-mu-cmd.c +++ b/mu/tests/test-mu-cmd.c @@ -24,6 +24,8 @@ #include #include +#include +#include #include "../mu-query.h" @@ -298,12 +300,11 @@ test_mu_find_links (void) static void test_mu_find_maildir_special (void) { - /* ensure that maldirs with spaces in their names work... */ - search ("\"maildir:/wom bat\" subject:atoms", 1); + search ("\"maildir:/wom_bat\" subject:atoms", 1); search ("\"maildir:/wOm_bàT\"", 3); search ("\"maildir:/wOm*\"", 3); - search ("\"maildir:/wOm *\"", 3); - search ("\"maildir:wom bat\"", 0); + search ("\"maildir:/wOm_*\"", 3); + search ("\"maildir:wom_bat\"", 0); search ("\"maildir:/wombat\"", 0); search ("subject:atoms", 1); } @@ -366,8 +367,10 @@ get_file_size (const char* path) struct stat statbuf; rv = stat (path, &statbuf); - if (rv != 0) + if (rv != 0) { + /* g_warning ("error: %s", strerror (errno)); */ return -1; + } return (gint64)statbuf.st_size; } diff --git a/mu/tests/test-mu-query.c b/mu/tests/test-mu-query.c index 47d049ba..e76070ed 100644 --- a/mu/tests/test-mu-query.c +++ b/mu/tests/test-mu-query.c @@ -213,7 +213,7 @@ test_mu_query_03 (void) { "s:LISP", 1}, { "s:\"Re: Learning LISP; Scheme vs elisp.\"", 1}, - { "subject:Re: Learning LISP; Scheme vs elisp.", 0}, + { "subject:Re: Learning LISP; Scheme vs elisp.", 1}, { "subject:\"Re: Learning LISP; Scheme vs elisp.\"", 1}, { "to:help-gnu-emacs@gnu.org", 4}, { "t:help-gnu-emacs", 4}, @@ -530,7 +530,7 @@ test_mu_query_tags (void) { "tag:lost tag:paradise", 1}, { "tag:lost tag:horizon", 0}, { "tag:lost OR tag:horizon", 1}, - { "x:paradise,lost", 0}, + { "x:paradise,lost", 1}, }; for (i = 0; i != G_N_ELEMENTS(queries); ++i)