From 5d069e786f6d84e22300423bc27f78d304f3e838 Mon Sep 17 00:00:00 2001 From: djcb Date: Mon, 13 May 2013 00:03:47 +0300 Subject: [PATCH] * lib: updates for mu-msg-field / mu-str updates --- lib/mu-query.cc | 9 +- lib/mu-store-write.cc | 176 ++++++++++++++++------------------------ lib/tests/test-mu-str.c | 144 ++++++++++++++------------------ mu/mu-cmd-server.c | 2 +- 4 files changed, 135 insertions(+), 196 deletions(-) diff --git a/lib/mu-query.cc b/lib/mu-query.cc index 85606d0e..ca973fb2 100644 --- a/lib/mu-query.cc +++ b/lib/mu-query.cc @@ -313,17 +313,14 @@ mu_query_preprocess (const char *query, GError **err) /* convert the query to a list of query terms, and escape them * separately */ - parts = mu_str_esc_to_list (query, err); + parts = mu_str_esc_to_list (query); if (!parts) return NULL; for (cur = parts; cur; cur = g_slist_next(cur)) { char *data; data = (gchar*)cur->data; - /* remove accents and turn to lower-case */ - /* escape '@', single '_' and ':' if it's not following a - * xapian-pfx with '_' */ - cur->data = mu_str_xapian_escape (data, TRUE, NULL); + cur->data = mu_str_process_query_term (data); g_free (data); /* run term fixups */ data = (gchar*)cur->data; @@ -334,7 +331,7 @@ mu_query_preprocess (const char *query, GError **err) myquery = mu_str_from_list (parts, ' '); mu_str_free_list (parts); - return myquery; + return myquery ? myquery : g_strdup (""); } diff --git a/lib/mu-store-write.cc b/lib/mu-store-write.cc index 95e7cdb2..500beb7e 100644 --- a/lib/mu-store-write.cc +++ b/lib/mu-store-write.cc @@ -306,12 +306,14 @@ add_terms_values_number (Xapian::Document& doc, MuMsg *msg, MuMsgFieldId mfid) /* for string and string-list */ static void -add_terms_values_str (Xapian::Document& doc, char *val, - MuMsgFieldId mfid, GStringChunk *strchunk) +add_terms_values_str (Xapian::Document& doc, const char *val, MuMsgFieldId mfid) { - /* now, let's create some search terms... */ - if (mu_msg_field_normalize (mfid)) - val = mu_str_normalize_in_place (val, TRUE, strchunk); + char *str; + + if (mu_msg_field_preprocess (mfid)) + str = mu_str_process_term (val); + else + str = g_strdup (val); if (mu_msg_field_xapian_index (mfid)) { Xapian::TermGenerator termgen; @@ -319,46 +321,37 @@ add_terms_values_str (Xapian::Document& doc, char *val, termgen.index_text_without_positions (val, 1, prefix(mfid)); } - if (mu_msg_field_xapian_term(mfid)) { - - if (mu_msg_field_xapian_escape (mfid)) - val = mu_str_xapian_escape_term (val, strchunk); - - // if (mfid == MU_MSG_FIELD_ID_TAGS) - // g_print ("tag:'%s'\n", val); + // g_print ("%s --> '%s'\n", mu_msg_field_name (mfid), str); + if (mu_msg_field_xapian_term(mfid)) doc.add_term (prefix(mfid) + - std::string(val, 0, - _MuStore::MAX_TERM_LENGTH)); - } + std::string(str, 0, _MuStore::MAX_TERM_LENGTH)); + + g_free (str); } static void -add_terms_values_string (Xapian::Document& doc, MuMsg *msg, - MuMsgFieldId mfid, GStringChunk *strchunk) +add_terms_values_string (Xapian::Document& doc, MuMsg *msg, MuMsgFieldId mfid) { const char *orig; - char *val; if (!(orig = mu_msg_get_field_string (msg, mfid))) return; /* nothing to do */ - val = g_string_chunk_insert (strchunk, orig); - /* the value is what we display in search results; the * unchanged original */ if (mu_msg_field_xapian_value(mfid)) - doc.add_value ((Xapian::valueno)mfid, val); + doc.add_value ((Xapian::valueno)mfid, orig); - add_terms_values_str (doc, val, mfid, strchunk); + add_terms_values_str (doc, orig, mfid); } static void add_terms_values_string_list (Xapian::Document& doc, MuMsg *msg, - MuMsgFieldId mfid, GStringChunk *strchunk) + MuMsgFieldId mfid) { const GSList *lst; @@ -375,30 +368,25 @@ add_terms_values_string_list (Xapian::Document& doc, MuMsg *msg, } if (mu_msg_field_xapian_term (mfid)) { - for (; lst; lst = g_slist_next ((GSList*)lst)) { - char *val; - val = g_string_chunk_insert - (strchunk, (const gchar*)lst->data); - add_terms_values_str (doc, val, mfid, strchunk); - } + for (; lst; lst = g_slist_next ((GSList*)lst)) + add_terms_values_str (doc, (const gchar*)lst->data, + mfid); } } struct PartData { - PartData (Xapian::Document& doc, MuMsgFieldId mfid, - GStringChunk *strchunk): - _doc (doc), _mfid(mfid), _strchunk(strchunk) {} + PartData (Xapian::Document& doc, MuMsgFieldId mfid): + _doc (doc), _mfid(mfid) {} Xapian::Document _doc; MuMsgFieldId _mfid; - GStringChunk *_strchunk; }; /* index non-body text parts */ static void maybe_index_text_part (MuMsg *msg, MuMsgPart *part, PartData *pdata) { - char *txt, *norm; + char *txt, *str; Xapian::TermGenerator termgen; /* only deal with attachments/messages; inlines are indexed as @@ -413,13 +401,13 @@ maybe_index_text_part (MuMsg *msg, MuMsgPart *part, PartData *pdata) termgen.set_document(pdata->_doc); - /* allocated on strchunk, no need to free */ - norm = mu_str_normalize (txt, TRUE, pdata->_strchunk); + str = mu_str_process_text (txt); termgen.index_text_without_positions - (norm, 1, prefix(MU_MSG_FIELD_ID_EMBEDDED_TEXT)); + (str, 1, prefix(MU_MSG_FIELD_ID_EMBEDDED_TEXT)); g_free (txt); + g_free (str); } @@ -445,14 +433,13 @@ each_part (MuMsg *msg, MuMsgPart *part, PartData *pdata) (mime + std::string(ctype, 0, MuStore::MAX_TERM_LENGTH)); } - /* now, let's create a term it there's a filename. allocated - * on strchunk, no need to free*/ if ((fname = mu_msg_part_get_filename (part, FALSE))) { - char *val; - val = mu_str_xapian_escape_term (fname, pdata->_strchunk); + char *str; + str = mu_str_process_term (fname); g_free (fname); pdata->_doc.add_term - (file + std::string(val, 0, MuStore::MAX_TERM_LENGTH)); + (file + std::string(str, 0, MuStore::MAX_TERM_LENGTH)); + g_free (str); } maybe_index_text_part (msg, part, pdata); @@ -461,60 +448,44 @@ each_part (MuMsg *msg, MuMsgPart *part, PartData *pdata) static void add_terms_values_attach (Xapian::Document& doc, MuMsg *msg, - MuMsgFieldId mfid, GStringChunk *strchunk) + MuMsgFieldId mfid) { - PartData pdata (doc, mfid, strchunk); + PartData pdata (doc, mfid); mu_msg_part_foreach (msg, MU_MSG_OPTION_RECURSE_RFC822, (MuMsgPartForeachFunc)each_part, &pdata); } -/* escape the body -- for now, only replace '-' with '_' */ -static void -body_escape_in_place (char *body) -{ - while (*body) { - switch (*body) { - case '-': *body = '_'; - default: break; - } - ++body; - } -} - - static void add_terms_values_body (Xapian::Document& doc, MuMsg *msg, - MuMsgFieldId mfid, GStringChunk *strchunk) + MuMsgFieldId mfid) { const char *str; - char *norm; + char *flat; if (mu_msg_get_flags(msg) & MU_FLAG_ENCRYPTED) return; /* ignore encrypted bodies */ str = mu_msg_get_body_text (msg, MU_MSG_OPTION_NONE); if (!str) /* FIXME: html->txt fallback needed */ - str = mu_msg_get_body_html (msg, - MU_MSG_OPTION_NONE); + str = mu_msg_get_body_html (msg, MU_MSG_OPTION_NONE); if (!str) return; /* no body... */ Xapian::TermGenerator termgen; termgen.set_document(doc); - /* norm is allocated on strchunk, no need for freeing */ - norm = mu_str_normalize (str, TRUE, strchunk); - body_escape_in_place (norm); + flat = mu_str_process_text (str); - termgen.index_text_without_positions (norm, 1, prefix(mfid)); + // g_print ("\n--\n%s\n--\n", flat); + termgen.index_text_without_positions (flat, 1, prefix(mfid)); + g_free (flat); } struct _MsgDoc { Xapian::Document *_doc; MuMsg *_msg; MuStore *_store; - GStringChunk *_strchunk; /* callback data, to determine whether this message is 'personal' */ gboolean _personal; @@ -531,10 +502,10 @@ add_terms_values_default (MuMsgFieldId mfid, MsgDoc *msgdoc) (*msgdoc->_doc, msgdoc->_msg, mfid); else if (mu_msg_field_is_string (mfid)) add_terms_values_string - (*msgdoc->_doc, msgdoc->_msg, mfid, msgdoc->_strchunk); + (*msgdoc->_doc, msgdoc->_msg, mfid); else if (mu_msg_field_is_string_list(mfid)) add_terms_values_string_list - (*msgdoc->_doc, msgdoc->_msg, mfid, msgdoc->_strchunk); + (*msgdoc->_doc, msgdoc->_msg, mfid); else g_return_if_reached (); @@ -550,20 +521,20 @@ add_terms_values (MuMsgFieldId mfid, MsgDoc* msgdoc) !mu_msg_field_xapian_value(mfid)) return; + // if (mu_msg_field_xapian_contact (mfid)) + // return; /* handled in new_doc_from_message */ + switch (mfid) { case MU_MSG_FIELD_ID_DATE: add_terms_values_date (*msgdoc->_doc, msgdoc->_msg, mfid); break; case MU_MSG_FIELD_ID_BODY_TEXT: - add_terms_values_body (*msgdoc->_doc, msgdoc->_msg, mfid, - msgdoc->_strchunk); + add_terms_values_body (*msgdoc->_doc, msgdoc->_msg, mfid); break; - /* note: add_terms_values_attach handles _FILE, _MIME and * _ATTACH_TEXT msgfields */ case MU_MSG_FIELD_ID_FILE: - add_terms_values_attach (*msgdoc->_doc, msgdoc->_msg, mfid, - msgdoc->_strchunk); + add_terms_values_attach (*msgdoc->_doc, msgdoc->_msg, mfid); break; case MU_MSG_FIELD_ID_MIME: case MU_MSG_FIELD_ID_EMBEDDED_TEXT: @@ -603,28 +574,29 @@ xapian_pfx (MuMsgContact *contact) static void add_address_subfields (Xapian::Document& doc, const char *addr, - const std::string& pfx, GStringChunk *strchunk) + const std::string& pfx) { - const char *at; - char *s1, *s2; - const char *p1, *p2; + const char *at, *domain_part; + char *name_part, *f1, *f2; /* add "foo" and "bar.com" as terms as well for * "foo@bar.com" */ if (G_UNLIKELY(!(at = (g_strstr_len (addr, -1, "@"))))) return; - s1 = g_strndup(addr, at - addr); // foo - s2 = g_strdup (at + 1); + name_part = g_strndup(addr, at - addr); // foo + domain_part = at + 1; - p1 = mu_str_xapian_escape_term (s1, strchunk); - p2 = mu_str_xapian_escape_term (s2, strchunk); + f1 = mu_str_process_term (name_part); + f2 = mu_str_process_term (domain_part); - g_free (s1); - g_free (s2); + g_free (name_part); - doc.add_term (pfx + std::string(p1, 0, _MuStore::MAX_TERM_LENGTH)); - doc.add_term (pfx + std::string(p2, 0, _MuStore::MAX_TERM_LENGTH)); + doc.add_term (pfx + std::string(f1, 0, _MuStore::MAX_TERM_LENGTH)); + doc.add_term (pfx + std::string(f2, 0, _MuStore::MAX_TERM_LENGTH)); + + g_free (f1); + g_free (f2); } static void @@ -641,23 +613,18 @@ each_contact_info (MuMsgContact *contact, MsgDoc *msgdoc) if (!mu_str_is_empty(contact->name)) { Xapian::TermGenerator termgen; termgen.set_document (*msgdoc->_doc); - /* note: norm is added to stringchunk, no need for freeing */ - char *norm = mu_str_normalize (contact->name, TRUE, - msgdoc->_strchunk); - termgen.index_text_without_positions (norm, 1, pfx); + char *flat = mu_str_process_text (contact->name); + termgen.index_text_without_positions (flat, 1, pfx); + g_free (flat); } - /* don't normalize e-mail address, but do lowercase it */ if (!mu_str_is_empty(contact->address)) { - char *escaped; - /* note: escaped is added to stringchunk, no need for - * freeing */ - escaped = mu_str_xapian_escape_term (contact->address, - msgdoc->_strchunk); + char *flat; + flat = mu_str_process_term (contact->address); msgdoc->_doc->add_term - (std::string (pfx + escaped, 0, MuStore::MAX_TERM_LENGTH)); - add_address_subfields (*msgdoc->_doc, contact->address, pfx, - msgdoc->_strchunk); + (std::string (pfx + flat, 0, MuStore::MAX_TERM_LENGTH)); + g_free (flat); + add_address_subfields (*msgdoc->_doc, contact->address, pfx); /* store it also in our contacts cache */ if (msgdoc->_store->contacts()) @@ -684,16 +651,11 @@ each_contact_check_if_personal (MuMsgContact *contact, MsgDoc *msgdoc) } } - - -#define MU_STRING_CHUNK_SIZE 8192 - Xapian::Document new_doc_from_message (MuStore *store, MuMsg *msg) { Xapian::Document doc; - MsgDoc docinfo = {&doc, msg, store, 0, FALSE, NULL}; - docinfo._strchunk = g_string_chunk_new (MU_STRING_CHUNK_SIZE); + MsgDoc docinfo = {&doc, msg, store, 0, FALSE}; mu_msg_field_foreach ((MuMsgFieldForeachFunc)add_terms_values, &docinfo); @@ -713,7 +675,7 @@ new_doc_from_message (MuStore *store, MuMsg *msg) mu_msg_contact_foreach (msg, (MuMsgContactForeachFunc)each_contact_info, &docinfo); - g_string_chunk_free (docinfo._strchunk); + // g_printerr ("\n--%s\n--\n", doc.serialise().c_str()); return doc; } @@ -783,8 +745,6 @@ add_or_update_msg (MuStore *store, unsigned docid, MuMsg *msg, GError **err) return MU_STORE_INVALID_DOCID; } - - unsigned mu_store_add_msg (MuStore *store, MuMsg *msg, GError **err) { diff --git a/lib/tests/test-mu-str.c b/lib/tests/test-mu-str.c index fa41ff53..9bf7d1e4 100644 --- a/lib/tests/test-mu-str.c +++ b/lib/tests/test-mu-str.c @@ -35,6 +35,17 @@ #include "mu-msg-prio.h" +#define ASSERT_EQL(S1,S2) \ +do { \ + const char *s1 = (S1); \ + const char *s2 = (S2); \ + if (g_strcmp0 (s1,s2) != 0) { \ + g_printerr ("error: '%s' != '%s'\n", s1, s2); \ + g_assert (0); \ + } \ +} while (0) + + static void test_mu_str_size_01 (void) @@ -105,7 +116,7 @@ test_mu_str_prio_02 (void) static void -test_mu_str_normalize_01 (void) +test_mu_str_flatten (void) { int i; struct { @@ -116,43 +127,13 @@ test_mu_str_normalize_01 (void) { "foo", "foo" }, { "Föö", "foo" }, { "číslo", "cislo" }, - { "hÆvý mëÐal ümláõt", "haevy medal umlaot"} + { "hÆvý mëÐal ümláõt", "hævy_meðal_umlaot"} }; for (i = 0; i != G_N_ELEMENTS(words); ++i) { gchar *str; - str = mu_str_normalize (words[i].word, TRUE, NULL); - g_assert_cmpstr (str, ==, words[i].norm); - g_free (str); - } -} - - -static void -test_mu_str_normalize_02 (void) -{ - int i; - struct { - const char* word; - const char* norm; - } words [] = { - { "DantèS", "DanteS"}, - { "foo", "foo" }, - { "Föö", "Foo" }, - { "číslO", "cislO" }, - { "hÆvý mëÐal ümláõt", "hAevy meDal umlaot"}, - { "£300", "£300" } - - }; - - - for (i = 0; i != G_N_ELEMENTS(words); ++i) { - gchar *str; - if (g_test_verbose()) - g_print ("[%s] <=> [%s] <=> [%s]\n", words[i].word, words[i].norm, - mu_str_normalize (words[i].word, FALSE, NULL)); - str = mu_str_normalize (words[i].word, FALSE, NULL); + str = mu_str_process_term (words[i].word); g_assert_cmpstr (str, ==, words[i].norm); g_free (str); } @@ -173,13 +154,13 @@ test_mu_str_esc_to_list (void) { "maildir:sent items", {"maildir:sent", "items", NULL}}, { "\"maildir:sent items\"", - {"maildir:sent items", NULL, NULL}}, + {"\"maildir:sent items\"", NULL, NULL}}, }; for (i = 0; i != G_N_ELEMENTS(strings); ++i) { GSList *lst, *cur; unsigned u; - lst = mu_str_esc_to_list (strings[i].str, NULL); + lst = mu_str_esc_to_list (strings[i].str); for (cur = lst, u = 0; cur; cur = g_slist_next(cur), ++u) g_assert_cmpstr ((const char*)cur->data,==,strings[i].strs[u]); mu_str_free_list (lst); @@ -187,69 +168,79 @@ test_mu_str_esc_to_list (void) } static void -test_mu_str_xapian_escape (void) +test_mu_str_process_query_term (void) { int i; struct { const char* word; const char* esc; } words [] = { - { "aap@noot.mies", "aap_noot_mies"}, - { "Foo..Bar", "foo__bar" }, + { "aap@noot.mies", "aap_noot_mies" }, + { "Foo..Bar", "foo..bar" }, { "Foo.Bar", "foo_bar" }, - { "Foo. Bar", "foo bar" }, + { "Foo Bar", "foo_bar" }, { "subject:test@foo", "subject:test_foo" }, - { "xxx:test@bar", "xxx test_bar" }, + { "xxx:test@bar", "xxx:test_bar" }, { "aa$bb$cc", "aa_bb_cc" }, { "date:2010..2012", "date:2010..2012"}, { "d:2010..2012", "d:2010..2012"}, { "size:10..20", "size:10..20"}, - { "x:2010..2012", "x:2010__2012"}, - { "q:2010..2012", "q 2010__2012"}, - { "subject:2010..2012", "subject:2010__2012"}, + { "x:2010..2012", "x:2010..2012"}, + { "q:2010..2012", "q:2010..2012"}, + { "subject:2010..2012", "subject:2010..2012"}, { "(maildir:foo)", "(maildir:foo)"}, - { "£300", "£300" } + { "Тесла, Никола", "тесла__никола"}, + { "Masha@Аркона.ru", "masha_аркона_ru" }, + { "foo:ελληνικά", "foo:ελληνικα" }, + { "日本語!!", "日本語__" }, + { "£", "_" } }; for (i = 0; i != G_N_ELEMENTS(words); ++i) { - gchar *a = g_strdup (words[i].word); - mu_str_xapian_escape_in_place_try (a, FALSE, NULL); - + gchar *s; + s = mu_str_process_query_term (words[i].word); if (g_test_verbose()) g_print ("expected: '%s' <=> got: '%s'\n", - words[i].esc, a); - - g_assert_cmpstr (a, ==, words[i].esc); - g_free (a); + words[i].esc, s); + g_assert_cmpstr (s, ==, words[i].esc); + g_free (s); } } static void -test_mu_str_xapian_escape_non_ascii (void) +test_mu_str_process_term (void) { int i; struct { const char* word; const char* esc; } words [] = { - { "Тесла, Никола", "тесла никола"}, + { "aap@noot.mies", "aap_noot_mies" }, + { "Foo..Bar", "foo__bar" }, + { "Foo.Bar", "foo_bar" }, + { "Foo Bar", "foo_bar" }, + { "subject:test@foo", "subject_test_foo" }, + { "xxx:test@bar", "xxx_test_bar" }, + { "aa$bb$cc", "aa_bb_cc" }, + { "date:2010..2012", "date_2010__2012"}, + { "subject:2010..2012", "subject_2010__2012"}, + { "(maildir:foo)", "_maildir_foo_"}, + { "Тесла, Никола", "тесла__никола"}, { "Masha@Аркона.ru", "masha_аркона_ru" }, - { "foo:ελληνικά", "foo ελληνικά" }, - { "日本語!!", "日本語 " }, - { "£", "£" } + { "foo:ελληνικά", "foo_ελληνικα" }, + { "日本語!!", "日本語__" }, + { "£", "_" } }; for (i = 0; i != G_N_ELEMENTS(words); ++i) { - gchar *a = g_strdup (words[i].word); - mu_str_xapian_escape_in_place_try (a, FALSE, NULL); - + gchar *s; + s = mu_str_process_term (words[i].word); if (g_test_verbose()) - g_print ("(%s) expected: '%s' <=> got: '%s'\n", - words[i].word, words[i].esc, a); - - g_assert_cmpstr (a, ==, words[i].esc); - g_free (a); + g_print ("expected: '%s' <=> got: '%s'\n", + words[i].esc, s); + g_assert_cmpstr (s, ==, words[i].esc); + g_free (s); } } @@ -425,8 +416,9 @@ test_mu_term_fixups (void) int main (int argc, char *argv[]) { - g_test_init (&argc, &argv, NULL); + setlocale (LC_ALL, ""); + g_test_init (&argc, &argv, NULL); /* mu_str_size */ g_test_add_func ("/mu-str/mu-str-size-01", @@ -440,16 +432,13 @@ main (int argc, char *argv[]) g_test_add_func ("/mu-str/mu-str-prio-02", test_mu_str_prio_02); - /* mu_str_normalize */ - g_test_add_func ("/mu-str/mu-str-normalize-01", - test_mu_str_normalize_01); - g_test_add_func ("/mu-str/mu-str-normalize-02", - test_mu_str_normalize_02); + g_test_add_func ("/mu-str/mu-str-flatten", + test_mu_str_flatten); - g_test_add_func ("/mu-str/mu-str-xapian-escape", - test_mu_str_xapian_escape); - g_test_add_func ("/mu-str/mu-str-xapian-escape-non-ascii", - test_mu_str_xapian_escape_non_ascii); + g_test_add_func ("/mu-str/process-query-term", + test_mu_str_process_query_term); + g_test_add_func ("/mu-str/process-term", + test_mu_str_process_term); g_test_add_func ("/mu-str/mu-str-display_contact", test_mu_str_display_contact); @@ -464,13 +453,6 @@ main (int argc, char *argv[]) g_test_add_func ("/mu-str/mu-str-esc-to-list", test_mu_str_esc_to_list); - /* g_test_add_func ("/mu-str/mu_str_guess_first_name", */ - /* test_mu_str_guess_first_name); */ - /* g_test_add_func ("/mu-str/mu_str_guess_last_name", */ - /* test_mu_str_guess_last_name); */ - /* g_test_add_func ("/mu-str/mu_str_guess_nick", */ - /* test_mu_str_guess_nick); */ - g_test_add_func ("/mu-str/mu_str_subject_normalize", test_mu_str_subject_normalize); diff --git a/mu/mu-cmd-server.c b/mu/mu-cmd-server.c index ded44068..5626f3dc 100644 --- a/mu/mu-cmd-server.c +++ b/mu/mu-cmd-server.c @@ -190,7 +190,7 @@ read_line_as_list (GError **err) } while (1); line = g_string_free (gstr, FALSE); - lst = mu_str_esc_to_list (line, err); + lst = mu_str_esc_to_list (line); g_free (line);