mirror of
https://github.com/djcb/mu.git
synced 2024-06-29 07:51:04 +02:00
522 lines
14 KiB
C++
522 lines
14 KiB
C++
/*
** Copyright (C) 2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
** Free Software Foundation; either version 3, or (at your option) any
** later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation,
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/
|
|
|
|
#include "config.h"
|
|
#include "mu-query-parser.hh"
|
|
|
|
#include <string_view>
|
|
#include <variant>
|
|
#include <array>
|
|
#include <type_traits>
|
|
|
|
#include "utils/mu-option.hh"
|
|
#include <glib.h>
|
|
#include "utils/mu-utils-file.hh"
|
|
|
|
using namespace Mu;
|
|
|
|
// backward compat
|
|
#ifndef HAVE_XAPIAN_FLAG_NGRAMS
|
|
#define FLAG_NGRAMS FLAG_CJK_NGRAM
|
|
#endif /*HAVE_XAPIAN_FLAG_NGRAMS*/
|
|
|
|
/**
|
|
* Expand terms for scripts without explicit word-breaks (e.g.
|
|
* Chinese/Japanese/Korean) in the way that Xapian expects it -
|
|
* use Xapian's built-in QueryParser just for that.
|
|
*/
|
|
static Result<Xapian::Query>
|
|
ngram_expand(const Field& field, const std::string& str)
|
|
{
|
|
Xapian::QueryParser qp;
|
|
const auto pfx{std::string(1U, field.xapian_prefix())};
|
|
|
|
qp.set_default_op(Xapian::Query::OP_OR);
|
|
|
|
return qp.parse_query(str, Xapian::QueryParser::FLAG_NGRAMS, pfx);
|
|
}
|
|
|
|
|
|
/**
 * Get the list without its first element.
 *
 * @param s some s-expression; its first element is erased in-place when it
 * is a non-empty list.
 *
 * @return the remaining list, or Nothing if s is not a list or is empty.
 */
static Option<Sexp>
tail(Sexp&& s)
{
	const auto usable{s.listp() && !s.empty()};
	if (!usable)
		return Nothing;

	auto& items{s.list()};
	items.erase(items.begin(), items.begin() + 1);

	return s;
}
|
|
|
|
/**
 * Get the name of the symbol at the head of a list.
 *
 * @param s some s-expression
 *
 * @return the symbol's name, or Nothing if s is not a non-empty list
 * starting with a symbol.
 */
Option<std::string>
head_symbol(const Sexp& s)
{
	if (s.listp() && !s.empty()) {
		if (auto&& first{s.head()}; first && first->symbolp())
			return first->symbol().name;
	}

	return Nothing;
}
|
|
|
|
|
|
/**
 * Get the string at position n (0-based) of a list.
 *
 * @param args some s-expression
 * @param n index of the wanted element
 *
 * @return the string, or Nothing if args is not a list, has fewer than
 * n + 1 elements, or the element at n is not a string.
 */
Option<std::string>
string_nth(const Sexp& args, size_t n)
{
	if (!args.listp() || args.size() < n + 1)
		return Nothing;

	const auto& elm{args.list().at(n)};
	if (elm.stringp())
		return elm.string();

	return Nothing;
}
|
|
|
|
/**
 * Build a phrase query from a single space-separated string.
 *
 * @param field the field to search; must support phrases
 * @param s the matcher arguments; expected to hold exactly one string
 *
 * @return an OP_PHRASE query with one term per word, or an InvalidArgument
 * error if the field is not phrasable or the arguments have another shape.
 */
static Result<Xapian::Query>
phrase(const Field& field, Sexp&& s)
{
	if (!field.is_phrasable_term())
		return Err(Error::Code::InvalidArgument,
			   "field {} does not support phrases", field.name);

	if (s.size() != 1 || !s.front().stringp())
		return Err(Error::Code::InvalidArgument,
			   "invalid phrase for field {}: '{}'", field.name, s.to_string());

	// one term-query per word, in order.
	auto&& parts{split(s.front().string(), " ")};
	std::vector<Xapian::Query> queries;
	queries.reserve(parts.size());
	for (auto&& part: parts)
		queries.emplace_back(Xapian::Query{field.xapian_term(std::move(part))});

	return Xapian::Query{Xapian::Query::OP_PHRASE, queries.begin(), queries.end()};
}
|
|
|
|
/**
 * Build a query matching all terms of a field that match a regular
 * expression.
 *
 * @param store the store whose terms are inspected
 * @param field the field whose terms are considered
 * @param rx_str the regular expression; it is passed through utf8_flatten
 * before compilation
 *
 * @return an OR of all matching terms; an invalid regular expression is
 * logged as a warning and yields a query that matches nothing.
 */
static Result<Xapian::Query>
regex(const Store& store, const Field& field, const std::string& rx_str)
{
	auto&& flat{utf8_flatten(rx_str)};
	auto&& compiled{Regex::make(flat, G_REGEX_OPTIMIZE)};
	if (!compiled) {
		mu_warning("invalid regexp: '{}': {}", flat, compiled.error().what());
		return Xapian::Query::MatchNothing;
	}

	std::vector<Xapian::Query> matches;
	store.for_each_term(field.id, [&](auto&& term) {
		// skip the first character of the term (presumably the field
		// prefix) before matching.
		const auto value{term.data() + 1};
		if (compiled->matches(value))
			matches.emplace_back(field.xapian_term(std::string_view{value}));
		return true; // visit the next term as well.
	});

	return Xapian::Query(Xapian::Query::OP_OR, matches.begin(), matches.end());
}
|
|
|
|
|
|
|
|
/**
 * Build a value-range query for a field.
 *
 * The first two elements of s must be strings: the lower and upper bound.
 * An empty string means the range is open on that side. For the Date and
 * Changed fields the bounds are date/time strings that are converted with
 * parse_date_time and to_lexnum; for the Size field they are converted with
 * atoll and to_lexnum.
 *
 * @param field the field to search; must be Date, Changed or Size
 * @param s the matcher arguments
 *
 * @return the range query (matching nothing if both bounds are empty), or
 * an InvalidArgument error if the bounds are missing, a date/time cannot be
 * parsed or the field does not support ranges.
 */
static Result<Xapian::Query>
range(const Field& field, Sexp&& s)
{
	auto&& r0{string_nth(s, 0)};
	auto&& r1{string_nth(s, 1)};
	if (!r0 || !r1)
		return Err(Error::Code::InvalidArgument, "expected 2 range values");

	// in the sexp, we use iso date/time for human readability; now convert
	// to the lexnum representation. An empty bound stays empty; Nothing
	// means the date/time could not be parsed.
	const auto iso_to_lexnum=[](const std::string& iso)->Option<std::string> {
		if (iso.empty())
			return iso;
		if (auto&& t{parse_date_time(iso, true, true/*utc*/)}; !t)
			return Nothing;
		else
			return to_lexnum(*t);
	};

	if (field == Field::Id::Date || field == Field::Id::Changed) {
		auto lower{iso_to_lexnum(*r0)};
		auto upper{iso_to_lexnum(*r1)};
		// never store Nothing in r0/r1: they are dereferenced below.
		if (!lower || !upper)
			return Err(Error::Code::InvalidArgument,
				   "invalid date/time in range '{}'..'{}' for field {}",
				   *r0, *r1, field.name);
		r0 = std::move(lower);
		r1 = std::move(upper);
	} else if (field == Field::Id::Size) {
		if (!r0->empty())
			r0 = to_lexnum(::atoll(r0->c_str()));
		if (!r1->empty())
			r1 = to_lexnum(::atoll(r1->c_str()));
	} else
		return Err(Error::Code::InvalidArgument,
			   "unsupported range field {}", field.name);

	const auto open_lower{r0->empty()};
	const auto open_upper{r1->empty()};

	if (open_lower && open_upper)
		return Xapian::Query::MatchNothing; // empty range matches nothing.
	if (open_lower)
		return Xapian::Query(Xapian::Query::OP_VALUE_LE,
				     field.value_no(), *r1);
	if (open_upper)
		return Xapian::Query(Xapian::Query::OP_VALUE_GE,
				     field.value_no(), *r0);

	return Xapian::Query(Xapian::Query::OP_VALUE_RANGE,
			     field.value_no(), *r0, *r1);
}
|
|
|
|
|
|
|
|
using OpPair = std::pair<const std::string_view, Xapian::Query::op>;
|
|
static constexpr std::array<OpPair, 4> LogOpPairs = {{
|
|
{ "and", Xapian::Query::OP_AND },
|
|
{ "or", Xapian::Query::OP_OR },
|
|
{ "xor", Xapian::Query::OP_XOR },
|
|
{ "not", Xapian::Query::OP_AND_NOT }
|
|
}};
|
|
|
|
/**
 * Look up the Xapian operator for a logical-operator name.
 *
 * @param opname the name to look up; compared exactly against the names
 * in LogOpPairs
 *
 * @return the operator, or Nothing if opname is not in LogOpPairs.
 */
static Option<Xapian::Query::op>
find_log_op(const std::string& opname)
{
	for (const auto& [name, xop]: LogOpPairs) {
		if (name == opname)
			return xop;
	}

	return Nothing;
}
|
|
|
|
// parse and parse_logop call each other, so parse is declared up-front.
static Result<Xapian::Query> parse(const Store& store, Sexp&& s, Mu::ParserFlags flags);

/**
 * Build the query for a logical operator and its operands.
 *
 * @param store the store, passed on to parse for the operands
 * @param op the Xapian operator; one of OP_AND, OP_OR, OP_XOR or OP_AND_NOT
 * @param args the operands: a non-empty list of s-expressions
 * @param flags parser flags, passed on to parse for the operands
 *
 * @return the combined query, or an error if args is not a non-empty list,
 * an operand fails to parse, OP_AND_NOT does not get exactly one operand or
 * op is not one of the supported operators.
 */
static Result<Xapian::Query>
parse_logop(const Store& store, Xapian::Query::op op, Sexp&& args, Mu::ParserFlags flags)
{
	if (!args.listp() || args.empty())
		return Err(Error::Code::InvalidArgument,
			   "expected non-empty list but got {}", args.to_string());

	std::vector<Xapian::Query> qs;
	qs.reserve(args.size());
	for (auto&& elm: args.list()) {
		if (auto&& q{parse(store, std::move(elm), flags)}; !q)
			return Err(std::move(q.error()));
		else
			qs.emplace_back(std::move(*q));
	}

	switch(op) {
	case Xapian::Query::OP_AND_NOT:
		// TODO: optimize AND_NOT
		// "not" is unary: express it as "everything, except for the operand".
		if (qs.size() != 1)
			return Err(Error::Code::InvalidArgument,
				   "expected single argument for NOT");
		else
			return Xapian::Query{op, Xapian::Query::MatchAll, qs.at(0)};

	case Xapian::Query::OP_AND:
	case Xapian::Query::OP_OR:
	case Xapian::Query::OP_XOR:
		return Xapian::Query(op, qs.begin(), qs.end());

	default:
		return Err(Error::Code::InvalidArgument, "unexpected xapian op");
	}
}
|
|
|
|
|
|
/**
 * Build the query for a field-matcher, i.e. the (matcher ...) part of
 * something like (field (matcher ...)).
 *
 * @param store the store, needed for regex matchers
 * @param field the field to search
 * @param match_sym the name of the matcher symbol
 * @param args the arguments of the matcher
 *
 * @return the query for a wildcard, range, regex or phrase matcher; an
 * InvalidArgument error for any other matcher, or when a wildcard, range or
 * regex matcher does not have a string as its first argument.
 */
static Result<Xapian::Query>
parse_field_matcher(const Store& store, const Field& field,
		    const std::string& match_sym, Sexp&& args)
{
	auto&& first_str{string_nth(args, 0)};

	// these matchers all need a leading string argument.
	if (first_str) {
		if (match_sym == wildcard_sym.name)
			return Xapian::Query{Xapian::Query::OP_WILDCARD,
					     field.xapian_term(*first_str)};
		if (match_sym == range_sym.name)
			return range(field, std::move(args));
		if (match_sym == regex_sym.name)
			return regex(store, field, *first_str);
	}

	// phrase validates its own arguments.
	if (match_sym == phrase_sym.name)
		return phrase(field, std::move(args));

	return Err(Error::Code::InvalidArgument,
		   "invalid field '{}'/'{}' matcher: {}",
		   field.name, match_sym, args.to_string());
}
|
|
|
|
/**
 * Build the query for a plain field-value, i.e. (field "value").
 *
 * @param field the field to search
 * @param vals the value; must be a string
 * @param flags parser flags; SupportNgrams enables n-gram expansion
 *
 * @return the term query. For the Flags and Priority fields the value is
 * first translated through flag_info resp. priority_from_name; an
 * InvalidArgument error is returned if that fails or if vals is not a
 * string.
 */
static Result<Xapian::Query>
parse_basic(const Field &field, Sexp &&vals, Mu::ParserFlags flags)
{
	const auto use_ngrams = any_of(flags & ParserFlags::SupportNgrams);
	if (!vals.stringp())
		return Err(Error::Code::InvalidArgument, "expected string");

	auto&& text{vals.string()};

	if (field.id == Field::Id::Flags) {
		auto&& finfo{flag_info(text)};
		if (!finfo)
			return Err(Error::Code::InvalidArgument, "invalid flag '{}'", text);
		return Xapian::Query{field.xapian_term(finfo->shortcut_lower())};
	}

	if (field.id == Field::Id::Priority) {
		auto&& prio{priority_from_name(text)};
		if (!prio)
			return Err(Error::Code::InvalidArgument, "invalid priority '{}'", text);
		return Xapian::Query{field.xapian_term(to_char(*prio))};
	}

	auto plain{Xapian::Query{field.xapian_term(text)}};

	// special case: cjk; see if we can create an expanded query. If not,
	// use the plain term query.
	if (use_ngrams && field.is_phrasable_term() && contains_unbroken_script(text)) {
		if (auto&& expanded{ngram_expand(field, text)}; expanded)
			return expanded;
	}

	return plain;
}
|
|
|
|
/**
 * Turn a query s-expression into a Xapian query.
 *
 * The s-expression must be a list starting with a symbol; the symbol is
 * either a logical operator, as in (and ...), or the name of a field, as in
 * (field "value") or (field (matcher ...)).
 *
 * @param store the store, needed by some matchers
 * @param s the s-expression
 * @param flags parser flags
 *
 * @return the query, or an InvalidArgument error if the s-expression does
 * not have one of the expected shapes.
 */
static Result<Xapian::Query>
parse(const Store& store, Sexp&& s, Mu::ParserFlags flags)
{
	auto&& headsym{head_symbol(s)};
	if (!headsym)
		return Err(Error::Code::InvalidArgument,
			   "expected (symbol ...) but got {}", s.to_string());

	// ie., something like (or|and| ... ....)
	auto&& logop{find_log_op(*headsym)};
	if (logop) {
		auto&& operands{tail(std::move(s))};
		if (operands)
			return parse_logop(store, *logop, std::move(*operands), flags);

		return Err(Error::Code::InvalidArgument,
			   "expected (logop ...) but got {}",
			   s.to_string());
	}

	// otherwise, it must be something like (field ...)
	auto&& field{field_from_name(*headsym)};
	if (!field)
		return Err(Error::Code::InvalidArgument, "unexpected sexp {}", s.to_string());

	auto&& rest{tail(std::move(s))};
	if (!rest || rest->empty())
		return Err(Error::Code::InvalidArgument,
			   "expected field-value or field-matcher");

	// field-value: (field "value")
	auto&& matcher{rest->front()};
	if (matcher.stringp())
		return parse_basic(*field, std::move(matcher), flags);

	// field-matcher, e.g. (field (phrase "a b c")): the matcher must be a
	// list starting with a symbol.
	auto&& match_sym{head_symbol(matcher)};
	if (!match_sym)
		return Err(Error::Code::InvalidArgument,
			   "expected field-matcher");

	auto&& match_args{tail(std::move(matcher))};
	if (!match_args)
		return Err(Error::Code::InvalidArgument, "expected matcher arguments");

	return parse_field_matcher(store, *field,
				   *match_sym, std::move(*match_args));
}
|
|
|
|
/* LCOV_EXCL_START*/
|
|
// parse the way Xapian's internal parser does it; for testing.
|
|
static Xapian::Query
|
|
xapian_query_classic(const std::string& expr, Mu::ParserFlags flags)
|
|
{
|
|
Xapian::QueryParser xqp;
|
|
|
|
// add prefixes
|
|
field_for_each([&](auto&& field){
|
|
|
|
if (!field.is_searchable())
|
|
return;
|
|
|
|
const auto prefix{std::string(1U, field.xapian_prefix())};
|
|
std::vector<std::string> names = {
|
|
std::string{field.name},
|
|
std::string(1U, field.shortcut)
|
|
};
|
|
if (!field.alias.empty())
|
|
names.emplace_back(std::string{field.alias});
|
|
|
|
for (auto&& name: names)
|
|
xqp.add_prefix(name, prefix);
|
|
});
|
|
|
|
auto xflags = Xapian::QueryParser::FLAG_PHRASE |
|
|
Xapian::QueryParser::FLAG_BOOLEAN |
|
|
Xapian::QueryParser::FLAG_WILDCARD;
|
|
|
|
if (any_of(flags & ParserFlags::SupportNgrams))
|
|
xflags |= Xapian::QueryParser::FLAG_NGRAMS;
|
|
|
|
xqp.set_default_op(Xapian::Query::OP_AND);
|
|
return xqp.parse_query(expr, xflags);
|
|
}
|
|
/* LCOV_EXCL_STOP*/
|
|
|
|
/*
 * Turn a query expression into a Xapian query.
 *
 * With ParserFlags::XapianParser, the expression is handled by Xapian's own
 * query parser; otherwise it is parsed with Mu::parse_query and the
 * resulting s-expression is converted.
 *
 * This function is noexcept, but the code it calls can throw; e.g., Xapian's
 * query parser raises an exception for expressions it cannot parse. Such
 * exceptions are caught here and turned into an error result, since letting
 * them escape would terminate the program.
 */
Result<Xapian::Query>
Mu::make_xapian_query(const Store& store, const std::string& expr, Mu::ParserFlags flags) noexcept
{
	try {
		if (any_of(flags & Mu::ParserFlags::XapianParser))
			return xapian_query_classic(expr, flags);

		return parse(store, Mu::parse_query(expr, true/*expand*/), flags);

	} catch (const Xapian::Error& xerr) {
		return Err(Error::Code::InvalidArgument,
			   "failed to parse query '{}': {}",
			   expr, xerr.get_description());
	} catch (...) {
		return Err(Error::Code::InvalidArgument,
			   "failed to parse query '{}'", expr);
	}
}
|
|
|
|
|
|
#ifdef BUILD_XAPIANIZE_QUERY
|
|
int
|
|
main (int argc, char *argv[])
|
|
{
|
|
if (argc < 2) {
|
|
mu_printerrln("expected: parse-query <query>");
|
|
return 1;
|
|
}
|
|
|
|
auto store = Store::make(runtime_path(Mu::RuntimePath::XapianDb));
|
|
if (!store) {
|
|
mu_printerrln("error: {}", store.error());
|
|
return 2;
|
|
}
|
|
|
|
std::string expr;
|
|
for (auto i = 1; i < argc; ++i) {
|
|
expr += argv[i];
|
|
expr += " ";
|
|
}
|
|
|
|
if (auto&& query{make_xapian_query(*store, expr)}; !query) {
|
|
mu_printerrln("error: {}", query.error());
|
|
return 1;
|
|
} else
|
|
mu_println("mu: {}", query->get_description());
|
|
|
|
if (auto&& query{make_xapian_query(*store, expr, ParserFlags::XapianParser)}; !query) {
|
|
mu_printerrln("error: {}", query.error());
|
|
return 2;
|
|
} else
|
|
mu_println("xp: {}", query->get_description());
|
|
|
|
return 0;
|
|
|
|
|
|
}
|
|
#endif /*BUILD_XAPIANIZE_QUERY*/
|
|
|
|
#if BUILD_TESTS
|
|
/*
|
|
* Tests.
|
|
*
|
|
*/
|
|
|
|
#include "utils/mu-test-utils.hh"
|
|
|
|
using TestCase = std::pair<std::string, std::string>;
|
|
|
|
static void
|
|
test_sexp()
|
|
{
|
|
/* tail */
|
|
g_assert_false(!!tail(Sexp{}));
|
|
auto t = tail(Sexp{1,2,3});
|
|
g_assert_true(!!t && t->listp() && t->size() == 2);
|
|
|
|
/* head_symbol */
|
|
g_assert_false(!!head_symbol(Sexp{}));
|
|
assert_equal(head_symbol(Sexp{"foo"_sym, 1, 2}).value_or("bar"), "foo");
|
|
|
|
/* string_nth */
|
|
g_assert_false(!!string_nth(Sexp{}, 123));
|
|
g_assert_false(!!string_nth(Sexp{1, 2, 3}, 1));
|
|
assert_equal(string_nth(Sexp{"aap", "noot", "mies"}, 2).value_or("wim"), "mies");
|
|
}
|
|
|
|
|
|
static void
|
|
test_xapian()
|
|
{
|
|
allow_warnings();
|
|
|
|
auto&& testhome{unwrap(make_temp_dir())};
|
|
auto&& dbpath{runtime_path(RuntimePath::XapianDb, testhome)};
|
|
auto&& store{unwrap(Store::make_new(dbpath, join_paths(testhome, "test-maildir")))};
|
|
|
|
// Xapian internal format (get_description()) is _not_ guaranteed
|
|
// to be the same between versions
|
|
auto&& zz{make_xapian_query(store, R"(subject:"hello world")")};
|
|
assert_valid_result(zz);
|
|
/* LCOV_EXCL_START*/
|
|
if (zz->get_description() != R"(Query((Shello world OR (Shello PHRASE 2 Sworld))))") {
|
|
mu_println("{}", zz->get_description());
|
|
if (mu_test_mu_hacker()) {
|
|
// in the mu hacker case, we want to be warned if Xapian changed.
|
|
g_critical("xapian version mismatch");
|
|
g_assert_true(false);
|
|
} else {
|
|
g_test_skip("incompatible xapian descriptions");
|
|
return;
|
|
}
|
|
}
|
|
/* LCOV_EXCL_STOP*/
|
|
|
|
std::vector<TestCase> cases = {
|
|
|
|
TestCase{R"(i:87h766tzzz.fsf@gnus.org)", R"(Query(I87h766tzzz.fsf@gnus.org))"},
|
|
TestCase{R"(subject:foo to:bar)", R"(Query((Sfoo AND Tbar)))"},
|
|
TestCase{R"(subject:"cuux*")", R"(Query(WILDCARD SYNONYM Scuux))"},
|
|
TestCase{R"(subject:"hello world")",
|
|
R"(Query((Shello world OR (Shello PHRASE 2 Sworld))))"},
|
|
TestCase{R"(subject:/boo/")", R"(Query())"},
|
|
|
|
// logic
|
|
TestCase{R"(not)", R"(Query((Tnot OR Cnot OR Hnot OR Fnot OR Snot OR Bnot OR Enot)))"},
|
|
TestCase{R"(from:a and (from:b or from:c))", R"(Query((Fa AND (Fb OR Fc))))"},
|
|
// optimize?
|
|
TestCase{R"(not from:a and to:b)", R"(Query(((<alldocuments> AND_NOT Fa) AND Tb)))"},
|
|
TestCase{R"(cc:a not bcc:b)", R"(Query((Ca AND (<alldocuments> AND_NOT Hb))))"},
|
|
|
|
// ranges.
|
|
TestCase{R"(size:1..10")", R"(Query(VALUE_RANGE 17 g1 ga))"},
|
|
TestCase{R"(size:10..1")", R"(Query(VALUE_RANGE 17 g1 ga))"},
|
|
TestCase{R"(size:10..")", R"(Query(VALUE_GE 17 ga))"},
|
|
TestCase{R"(size:..10")", R"(Query(VALUE_LE 17 ga))"},
|
|
TestCase{R"(size:10")", R"(Query(VALUE_RANGE 17 ga ga))"}, // change?
|
|
TestCase{R"(size:..")", R"(Query())"},
|
|
};
|
|
|
|
for (auto&& test: cases) {
|
|
auto&& xq{make_xapian_query(store, test.first)};
|
|
assert_valid_result(xq);
|
|
assert_equal(xq->get_description(), test.second);
|
|
}
|
|
|
|
remove_directory(testhome);
|
|
}
|
|
|
|
int
|
|
main(int argc, char* argv[])
|
|
{
|
|
mu_test_init(&argc, &argv);
|
|
|
|
Xapian::QueryParser qp;
|
|
|
|
g_test_add_func("/query-parser/sexp", test_sexp);
|
|
g_test_add_func("/query-parser/xapianizer", test_xapian);
|
|
|
|
return g_test_run();
|
|
}
|
|
|
|
#endif /*BUILD_TESTS*/
|