document: index some sub-parts as well

1. Also add 'normal' terms for some indexable fields
2. Add terms for e-mail address components

And add some tests.

This helps for some corner-case queries (see tests).

Fixes #2278
Fixes #2281
This commit is contained in:
Dirk-Jan C. Binnema 2022-06-26 21:55:14 +03:00
parent 6cb38c8125
commit df80935c2e
2 changed files with 63 additions and 6 deletions

View File

@ -30,7 +30,6 @@
#include <string>
#include <utils/mu-utils.hh>
using namespace Mu;
constexpr uint8_t SepaChar1 = 0xfe;
@ -46,7 +45,13 @@ add_search_term(Xapian::Document& doc, const Field& field, const std::string& va
} else if (field.is_indexable_term()) {
Xapian::TermGenerator termgen;
termgen.set_document(doc);
termgen.index_text(utf8_flatten(val),1,field.xapian_term());
termgen.index_text(utf8_flatten(val), 1, field.xapian_term());
/* also add as 'normal' term, so some queries where the indexer
* eats special chars also match */
if (field.id != Field::Id::BodyText &&
field.id != Field::Id::EmbeddedText) {
doc.add_term(field.xapian_term(val));
}
} else
throw std::logic_error("not a search term");
}
@ -143,12 +148,19 @@ Document::add(Field::Id id, const Contacts& contacts)
if (!cfield_id || *cfield_id != id)
continue;
xdoc_.add_term(field.xapian_term(contact.email));
const auto e{contact.email};
xdoc_.add_term(field.xapian_term(e));
/* allow searching for address components, too */
const auto atpos = e.find('@');
if (atpos != std::string::npos && atpos < e.size() - 1) {
xdoc_.add_term(field.xapian_term(e.substr(0, atpos)));
xdoc_.add_term(field.xapian_term(e.substr(atpos + 1)));
}
if (!contact.name.empty())
termgen.index_text(utf8_flatten(contact.name), 1,
field.xapian_term());
cvec.emplace_back(contact.email + sepa2 + contact.name);
}

View File

@ -81,7 +81,6 @@ make_test_store(const std::string& test_path, const TestMap& test_map,
static void
test_simple()
{
const TestMap test_msgs = {{
// "sqlite-msg" "Simple mailing list message.
@ -157,13 +156,59 @@ I said: "Aujourd'hui!"
//g_assert_cmpuint(qr->begin().date().value_or(0), ==, 123454);
}
static void
test_spam_address_components()
{
const TestMap test_msgs = {{
// "sqlite-msg" "Simple mailing list message.
{
"spam/cur/spam-msg:2,S",
R"(Message-Id: <abcde@foo.bar>
From: "Foo Example" <bar@example.com>
To: example@example.com
Subject: ***SPAM*** this is a test
Boo!
)"},
}};
TempDir tdir;
auto store{make_test_store(tdir.path(), test_msgs, {})};
g_test_bug("2278");
g_test_bug("2281");
// matches both
for (auto&& expr: {
"SPAM",
"spam",
"/.*SPAM.*/",
"subject:SPAM",
"from:bar@example.com",
"subject:\\*\\*\\*SPAM\\*\\*\\*",
"bar",
"example.com"
}) {
if (g_test_verbose())
g_message("query: '%s'", expr);
auto qr = store.run_query(expr);
assert_valid_result(qr);
g_assert_false(qr->empty());
g_assert_cmpuint(qr->size(), ==, 1);
}
}
/*
 * Test-program entry point: initialise the GLib test framework, set the
 * base URL used to expand the issue numbers passed to g_test_bug(), register
 * the test cases and run them.
 *
 * Returns the result of g_test_run().
 */
int
main(int argc, char* argv[])
{
	// nullptr is the sentinel terminating g_test_init's variadic arguments
	g_test_init(&argc, &argv, nullptr);

	g_test_bug_base("https://github.com/djcb/mu/issues/");

	// each test path is registered exactly once; registering
	// "/store/query/simple" a second time would create a duplicate test case
	g_test_add_func("/store/query/simple", test_simple);
	g_test_add_func("/store/query/spam-address-components",
			test_spam_address_components);

	return g_test_run();
}