mirror of https://github.com/djcb/mu.git
485 lines
13 KiB
C++
485 lines
13 KiB
C++
/*
|
||
** Copyright (C) 2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||
**
|
||
** This program is free software; you can redistribute it and/or modify it
|
||
** under the terms of the GNU General Public License as published by the
|
||
** Free Software Foundation; either version 3, or (at your option) any
|
||
** later version.
|
||
**
|
||
** This program is distributed in the hope that it will be useful,
|
||
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
** GNU General Public License for more details.
|
||
**
|
||
** You should have received a copy of the GNU General Public License
|
||
** along with this program; if not, write to the Free Software Foundation,
|
||
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||
**
|
||
*/
|
||
|
||
#include "config.h"
|
||
#include "mu-query-parser.hh"
|
||
|
||
#include <string_view>
|
||
#include <variant>
|
||
#include <array>
|
||
#include <type_traits>
|
||
#include <iostream>
|
||
|
||
#include "utils/mu-option.hh"
|
||
#include <glib.h>
|
||
#include "utils/mu-utils-file.hh"
|
||
|
||
using namespace Mu;
|
||
|
||
|
||
|
||
/**
|
||
* Expand terms for scripts without explicit word-breaks (e.g.
|
||
* Chinese/Japanese/Korean) in the way that Xapian expects it -
|
||
* use Xapian's built-in QueryParser just for that.
|
||
*/
|
||
static Result<Xapian::Query>
|
||
ngram_expand(const Field& field, const std::string& str)
|
||
{
|
||
mu_println("ng: '{}'", str);
|
||
|
||
Xapian::QueryParser qp;
|
||
const auto pfx{std::string(1U, field.xapian_prefix())};
|
||
|
||
qp.set_default_op(Xapian::Query::OP_OR);
|
||
|
||
return qp.parse_query(
|
||
str,
|
||
#if HAVE_XAPIAN_FLAG_NGRAMS
|
||
Xapian::QueryParser::FLAG_NGRAMS,
|
||
#else
|
||
Xapian::QueryParser::FLAG_CJK_NGRAM,
|
||
#endif /*HAVE_XAPIAN_FLAG_NGRAMS*/
|
||
pfx);
|
||
}
|
||
|
||
|
||
|
||
/**
 * Get the tail of a list sexp, i.e. the list without its first element.
 *
 * @param s some sexp; when it is a non-empty list, its first element is
 * erased and the remainder is moved into the result
 *
 * @return the remaining list, or Nothing if @p s is not a list or is
 * empty (in that case @p s is not modified)
 */
static Option<Sexp>
tail(Sexp&& s)
{
	if (!s.listp() || s.empty())
		return Nothing;

	auto& items{s.list()};
	items.erase(items.begin());

	// 's' is a named rvalue-reference; before C++20 it is not guaranteed
	// to be moved-from implicitly on return, so move explicitly rather
	// than risk a deep copy of the whole list.
	return std::move(s);
}
|
||
|
||
/**
 * Get the name of the symbol at the head of a list sexp.
 *
 * @param s some sexp
 *
 * @return the name of the head symbol, or Nothing if @p s is not a
 * non-empty list whose head is a symbol.
 */
Option<std::string>
head_symbol(const Sexp& s)
{
	if (s.listp() && !s.empty()) {
		if (auto&& first{s.head()}; first && first->symbolp())
			return first->symbol().name;
	}

	return Nothing;
}
|
||
|
||
|
||
/**
 * Get the string value of the nth element of a list sexp.
 *
 * @param args some sexp
 * @param n 0-based index of the requested element
 *
 * @return the string, or Nothing if @p args is not a list, has fewer
 * than n + 1 elements, or its nth element is not a string.
 */
Option<std::string>
string_nth(const Sexp& args, size_t n)
{
	const bool has_nth = args.listp() && args.size() >= n + 1;
	if (!has_nth)
		return Nothing;

	auto&& elm{args.list().at(n)};
	if (elm.stringp())
		return elm.string();

	return Nothing;
}
|
||
|
||
/**
 * Turn the arguments of a phrase matcher into a Xapian phrase query.
 *
 * @param field the field to match; must be an indexable-term field
 * @param s the matcher arguments; expected to hold exactly one string,
 * which is split on spaces into the words of the phrase
 *
 * @return an OP_PHRASE query over the field terms for the words, or an
 * InvalidArgument error.
 */
static Result<Xapian::Query>
phrase(const Field& field, Sexp&& s)
{
	if (!field.is_indexable_term())
		return Err(Error::Code::InvalidArgument,
			   "field {} does not support phrases", field.name);

	const bool single_string = s.size() == 1 && s.front().stringp();
	if (!single_string)
		return Err(Error::Code::InvalidArgument,
			   "invalid phrase for field {}: '{}'", field.name, s.to_string());

	auto&& parts{split(s.front().string(), " ")};

	std::vector<Xapian::Query> terms;
	terms.reserve(parts.size());
	for (auto&& part : parts)
		terms.emplace_back(Xapian::Query{field.xapian_term(std::move(part))});

	return Xapian::Query{Xapian::Query::OP_PHRASE,
			     terms.begin(), terms.end()};
}
|
||
|
||
/**
 * Build a query that ORs together all terms of a field which match a
 * regular expression.
 *
 * @param store the store whose terms are inspected
 * @param field the field to match
 * @param rx_str the regular expression; it is passed through
 * utf8_flatten before being compiled
 *
 * @return an OP_OR query over the matching terms; if the regular
 * expression cannot be compiled, a warning is logged and a query
 * matching nothing is returned.
 */
static Result<Xapian::Query>
regex(const Store& store, const Field& field, const std::string& rx_str)
{
	auto&& flat{utf8_flatten(rx_str)};
	auto&& rx{Regex::make(flat, G_REGEX_OPTIMIZE)};
	if (!rx) {
		mu_warning("invalid regexp: '{}': {}", flat, rx.error().what());
		return Xapian::Query::MatchNothing;
	}

	std::vector<Xapian::Query> matches;
	store.for_each_term(field.id, [&](auto&& term) {
		// match against the term minus its first character
		// (presumably the field's Xapian prefix)
		auto&& val{term.data() + 1};
		if (rx->matches(val))
			matches.emplace_back(field.xapian_term(std::string_view{val}));
		return true;
	});

	return Xapian::Query(Xapian::Query::OP_OR, matches.begin(), matches.end());
}
|
||
|
||
|
||
|
||
/**
 * Build a value-range query for a field.
 *
 * Supported are the Date and Changed fields (bounds are ISO date/times)
 * and the Size field (bounds are numbers). An empty bound leaves the
 * range open on that side; two empty bounds match everything.
 *
 * @param field the field to match
 * @param s the matcher arguments; its first two elements must be the
 * lower and upper bound strings
 *
 * @return the range query, or an InvalidArgument error if a bound is
 * missing, a date/time cannot be parsed, or the field does not support
 * ranges.
 */
static Result<Xapian::Query>
range(const Field& field, Sexp&& s)
{
	auto&& r0{string_nth(s, 0)};
	auto&& r1{string_nth(s, 1)};
	if (!r0 || !r1)
		return Err(Error::Code::InvalidArgument, "expected 2 range values");

	// in the sexp, we use iso date/time for human readability; now convert to
	// time_t
	auto iso_to_lexnum = [](const std::string& iso) -> Option<std::string> {
		if (iso.empty())
			return iso;
		if (auto&& t{parse_date_time(iso, true, true/*utc*/)}; !t)
			return Nothing;
		else
			return to_lexnum(*t);
	};

	if (field == Field::Id::Date || field == Field::Id::Changed) {
		// iso -> time_t
		auto&& lex0{iso_to_lexnum(*r0)};
		auto&& lex1{iso_to_lexnum(*r1)};
		// the conversion fails for unparsable input; bail out here
		// rather than dereferencing an empty Option below
		if (!lex0 || !lex1)
			return Err(Error::Code::InvalidArgument,
				   "invalid date/time range for field {}: '{}'..'{}'",
				   field.name, *r0, *r1);
		r0 = std::move(lex0);
		r1 = std::move(lex1);
	} else if (field == Field::Id::Size) {
		if (!r0->empty())
			r0 = to_lexnum(::atoll(r0->c_str()));
		if (!r1->empty())
			r1 = to_lexnum(::atoll(r1->c_str()));
	} else
		return Err(Error::Code::InvalidArgument,
			   "unsupported range field {}", field.name);

	if (r0->empty() && r1->empty())
		return Xapian::Query::MatchAll;
	else if (r0->empty() && !r1->empty())
		return Xapian::Query(Xapian::Query::OP_VALUE_LE,
				     field.value_no(), *r1);
	else if (!r0->empty() && r1->empty())
		return Xapian::Query(Xapian::Query::OP_VALUE_GE,
				     field.value_no(), *r0);
	else
		return Xapian::Query(Xapian::Query::OP_VALUE_RANGE,
				     field.value_no(), *r0, *r1);
}
|
||
|
||
|
||
|
||
using OpPair = std::pair<const std::string_view, Xapian::Query::op>;
|
||
static constexpr std::array<OpPair, 4> LogOpPairs = {{
|
||
{ "and", Xapian::Query::OP_AND },
|
||
{ "or", Xapian::Query::OP_OR },
|
||
{ "xor", Xapian::Query::OP_XOR },
|
||
{ "not", Xapian::Query::OP_AND_NOT }
|
||
}};
|
||
|
||
/**
 * Look up the Xapian operator for a logical-operator name.
 *
 * @param opname the name to look up in LogOpPairs
 *
 * @return the matching Xapian operator, or Nothing if @p opname is not
 * one of the known names.
 */
static Option<Xapian::Query::op>
find_log_op(const std::string& opname)
{
	for (const auto& [name, xop] : LogOpPairs) {
		if (name == opname)
			return xop;
	}

	return Nothing;
}
|
||
|
||
static Result<Xapian::Query> parse(const Store& store, Sexp&& s, Mu::ParserFlags flags);

/**
 * Turn the operands of a logical operator into a Xapian query.
 *
 * Every operand is parsed recursively; the first operand that fails to
 * parse aborts the whole operation with that error.
 *
 * @param store the store to query
 * @param op the Xapian operator; OP_AND, OP_OR, OP_XOR or OP_AND_NOT
 * @param args the operands; must be a non-empty list
 * @param flags parser flags, passed on to the operand parser
 *
 * @return the combined query, or an error. OP_AND_NOT ("not") takes
 * exactly one operand and yields "everything AND NOT operand".
 */
static Result<Xapian::Query>
parse_logop(const Store& store, Xapian::Query::op op, Sexp&& args, Mu::ParserFlags flags)
{
	if (!args.listp() || args.empty())
		return Err(Error::Code::InvalidArgument,
			   "expected non-empty list but got {}", args.to_string());

	std::vector<Xapian::Query> qs;
	qs.reserve(args.size());
	for (auto&& elm: args.list()) {
		if (auto&& q{parse(store, std::move(elm), flags)}; !q)
			return Err(std::move(q.error()));
		else
			qs.emplace_back(std::move(*q));
	}

	switch(op) {
	case Xapian::Query::OP_AND_NOT:
		if (qs.size() != 1)
			return Err(Error::Code::InvalidArgument,
				   "expected single argument for NOT");
		else
			return Xapian::Query{op, Xapian::Query::MatchAll, qs.at(0)};

	case Xapian::Query::OP_AND:
	case Xapian::Query::OP_OR:
	case Xapian::Query::OP_XOR:
		return Xapian::Query(op, qs.begin(), qs.end());

	default:
		return Err(Error::Code::InvalidArgument, "unexpected xapian op");
	}
}
|
||
|
||
|
||
/**
 * Turn a field-matcher, such as (field (phrase "a b c")), into a query.
 *
 * @param store the store to query (used by the regex matcher)
 * @param field the field to match
 * @param match_sym the name of the matcher symbol
 * @param args the arguments to the matcher
 *
 * @return the query for the matcher, or an InvalidArgument error for an
 * unknown matcher. The wildcard, range and regex matchers are only
 * recognized when the first argument is a string.
 */
static Result<Xapian::Query>
parse_field_matcher(const Store& store, const Field& field,
		    const std::string& match_sym, Sexp&& args)
{
	auto&& arg0{string_nth(args, 0)};

	if (arg0) {
		if (match_sym == wildcard_sym.name)
			return Xapian::Query{Xapian::Query::OP_WILDCARD,
					     field.xapian_term(*arg0)};
		if (match_sym == range_sym.name)
			return range(field, std::move(args));
		if (match_sym == regex_sym.name)
			return regex(store, field, *arg0);
	}

	// the phrase matcher validates its own arguments
	if (match_sym == phrase_sym.name)
		return phrase(field, std::move(args));

	return Err(Error::Code::InvalidArgument,
		   "invalid field '{}'/'{}' matcher: {}",
		   field.name, match_sym, args.to_string());
}
|
||
|
||
|
||
/**
 * Turn a basic field-value, such as (field "value"), into a query.
 *
 * Flags and priorities are validated and translated into their term
 * form. For any other field the value becomes a plain field term,
 * except when n-gram support is requested, the field is an
 * indexable-term field and the value contains unbroken script; then an
 * n-gram expanded query is attempted first.
 *
 * @param field the field to match
 * @param vals the value; must be a string
 * @param flags parser flags; ParserFlags::SupportNgrams enables n-gram
 * expansion
 *
 * @return the query, or an InvalidArgument error.
 */
static Result<Xapian::Query>
parse_basic(const Field& field, Sexp&& vals, Mu::ParserFlags flags)
{
	// evaluate this on every call; a function-local static would be
	// initialized from the flags of the very first call only, silently
	// ignoring the flags of all later calls.
	const auto ngrams = any_of(flags & ParserFlags::SupportNgrams);

	if (!vals.stringp())
		return Err(Error::Code::InvalidArgument, "expected string");

	auto&& val{vals.string()};

	switch (field.id) {
	case Field::Id::Flags:
		if (auto&& finfo{flag_info(val)}; finfo)
			return Xapian::Query{field.xapian_term(finfo->shortcut_lower())};
		else
			return Err(Error::Code::InvalidArgument,
				   "invalid flag '{}'", val);
	case Field::Id::Priority:
		if (auto&& prio{priority_from_name(val)}; prio)
			return Xapian::Query{field.xapian_term(to_char(*prio))};
		else
			return Err(Error::Code::InvalidArgument,
				   "invalid priority '{}'", val);
	default: {
		auto q{Xapian::Query{field.xapian_term(val)}};
		if (ngrams) { // special case: cjk; see if we can create an expanded query.
			if (field.is_indexable_term() && contains_unbroken_script(val))
				if (auto&& ng{ngram_expand(field, val)}; ng)
					return ng;
		}
		return q;
	}}

}
|
||
|
||
/**
 * Turn a query sexp into a Xapian query.
 *
 * The sexp must be a list starting with a symbol, which is either a
 * logical operator -- (and|or|xor|not ...) -- or a field name followed
 * by a field-value -- (field "value") -- or a field-matcher, e.g.
 * (field (phrase "a b c")).
 *
 * @param store the store to query
 * @param s the sexp to parse
 * @param flags parser flags
 *
 * @return the query, or an InvalidArgument error.
 */
static Result<Xapian::Query>
parse(const Store& store, Sexp&& s, Mu::ParserFlags flags)
{
	auto&& sym{head_symbol(s)};
	if (!sym)
		return Err(Error::Code::InvalidArgument,
			   "expected (symbol ...) but got {}", s.to_string());

	// ie., something like (or|and| ... ....)
	if (auto&& logop{find_log_op(*sym)}; logop) {
		auto&& operands{tail(std::move(s))};
		if (operands)
			return parse_logop(store, *logop, std::move(*operands), flags);

		return Err(Error::Code::InvalidArgument,
			   "expected (logop ...) but got {}",
			   s.to_string());
	}

	// something like (field ...)
	if (auto&& field{field_from_name(*sym)}; field) {
		auto&& rest{tail(std::move(s))};
		if (!rest || rest->empty())
			return Err(Error::Code::InvalidArgument,
				   "expected field-value or field-matcher");

		auto&& matcher{rest->front()};

		// field-value: (field "value"); ensure "value" is there
		if (matcher.stringp())
			return parse_basic(*field, std::move(matcher), flags);

		// otherwise, we expect a field-matcher, e.g. (field (phrase "a b c"))
		// ensure the matcher is a list starting with a symbol
		auto&& match_sym{head_symbol(matcher)};
		if (!match_sym)
			return Err(Error::Code::InvalidArgument,
				   "expected field-matcher");

		auto&& match_args{tail(std::move(matcher))};
		if (!match_args)
			return Err(Error::Code::InvalidArgument, "expected matcher arguments");

		return parse_field_matcher(store, *field,
					   *match_sym, std::move(*match_args));
	}

	return Err(Error::Code::InvalidArgument,
		   "unexpected sexp {}", s.to_string());
}
|
||
|
||
|
||
// parse the way Xapian's internal parser does it; for testing.
|
||
static Xapian::Query
|
||
xapian_query_classic(const std::string& expr, Mu::ParserFlags flags)
|
||
{
|
||
Xapian::QueryParser xqp;
|
||
|
||
// add prefixes
|
||
field_for_each([&](auto&& field){
|
||
|
||
if (!field.is_searchable())
|
||
return;
|
||
|
||
const auto prefix{std::string(1U, field.xapian_prefix())};
|
||
std::vector<std::string> names = {
|
||
std::string{field.name},
|
||
std::string(1U, field.shortcut)
|
||
};
|
||
if (!field.alias.empty())
|
||
names.emplace_back(std::string{field.alias});
|
||
|
||
for (auto&& name: names)
|
||
xqp.add_prefix(name, prefix);
|
||
});
|
||
|
||
const auto xflags = std::invoke([&]() {
|
||
unsigned f = Xapian::QueryParser::FLAG_PHRASE |
|
||
Xapian::QueryParser::FLAG_BOOLEAN |
|
||
Xapian::QueryParser::FLAG_WILDCARD;
|
||
if (any_of(flags & ParserFlags::SupportNgrams)) {
|
||
#if HAVE_XAPIAN_FLAG_NGRAMS
|
||
f |= Xapian::QueryParser::FLAG_NGRAMS;
|
||
#else
|
||
f |= Xapian::QueryParser::FLAG_CJK_NGRAM;
|
||
#endif
|
||
}
|
||
return f;
|
||
});
|
||
|
||
xqp.set_default_op(Xapian::Query::OP_AND);
|
||
return xqp.parse_query(expr, xflags);
|
||
}
|
||
|
||
/*
 * Create a Xapian query from a query expression.
 *
 * With ParserFlags::XapianParser set, the expression is handed to
 * Xapian's own query parser; otherwise it is first parsed (and
 * expanded) into a sexp, which is then turned into a Xapian query.
 */
Result<Xapian::Query>
Mu::make_xapian_query(const Store& store, const std::string& expr, Mu::ParserFlags flags) noexcept
{
	const auto use_xapian_parser = any_of(flags & Mu::ParserFlags::XapianParser);

	if (!use_xapian_parser)
		return parse(store, Mu::parse_query(expr, true/*expand*/), flags);

	return xapian_query_classic(expr, flags);
}
|
||
|
||
|
||
#ifdef BUILD_XAPIANIZE_QUERY
/*
 * Small command-line tool: join all arguments into a single query
 * expression, turn it into a Xapian query and print its description.
 *
 * Exit code: 0 on success, 1 for a missing or invalid query, 2 if the
 * store cannot be opened.
 */
int
main (int argc, char *argv[])
{
	if (argc < 2) {
		mu_printerrln("expected: parse-query <query>");
		return 1;
	}

	auto store = Store::make(runtime_path(Mu::RuntimePath::XapianDb));
	if (!store) {
		mu_printerrln("error: {}", store.error());
		return 2;
	}

	// every argument is followed by a space, including the last one
	std::string expr;
	for (char **arg = argv + 1; arg != argv + argc; ++arg)
		expr.append(*arg).append(" ");

	auto&& query{make_xapian_query(*store, expr)};
	if (query) {
		mu_println("{}", query->get_description());
		return 0;
	}

	mu_printerrln("error: {}", query.error());
	return 1;
}
#endif /*BUILD_XAPIANIZE_QUERY*/
|
||
|
||
#if BUILD_TESTS
|
||
/*
|
||
* Tests.
|
||
*
|
||
*/
|
||
|
||
#include "utils/mu-test-utils.hh"
|
||
|
||
using TestCase = std::pair<std::string, std::string>;
|
||
|
||
static void
|
||
test_xapian()
|
||
{
|
||
auto&& testhome{unwrap(make_temp_dir())};
|
||
auto&& dbpath{runtime_path(RuntimePath::XapianDb, testhome)};
|
||
auto&& store{unwrap(Store::make_new(dbpath, join_paths(testhome, "test-maildir")))};
|
||
|
||
std::vector<TestCase> cases = {
|
||
TestCase{R"(i:87h766tzzz.fsf@gnus.org)", R"(Query(I87h766tzzz.fsf@gnus.org))"},
|
||
TestCase{R"(subject:foo to:bar)", R"(Query((Sfoo AND Tbar)))"},
|
||
TestCase{R"(subject:"cuux*")", R"(Query(WILDCARD SYNONYM Scuux))"},
|
||
TestCase{R"(subject:"hello world")", R"(Query((Shello PHRASE 2 Sworld)))"},
|
||
TestCase{R"(subject:/boo/")", R"(Query())"},
|
||
};
|
||
|
||
for (auto&& test: cases) {
|
||
auto&& xq{make_xapian_query(store, test.first)};
|
||
assert_valid_result(xq);
|
||
|
||
mu_println("'{}' <=> '{}'", xq->get_description(), test.second);
|
||
assert_equal(xq->get_description(), test.second);
|
||
}
|
||
|
||
remove_directory(testhome);
|
||
}
|
||
|
||
int
|
||
main(int argc, char* argv[])
|
||
{
|
||
mu_test_init(&argc, &argv);
|
||
|
||
|
||
Xapian::QueryParser qp;
|
||
// mu_println("{}", qp.parse_query("スポンサーシップ募集").get_description());
|
||
// mu_println("{}", qp.parse_query("スポンサーシップ募集", Xapian::QueryParser::FLAG_NGRAMS).get_description());
|
||
|
||
// mu_println("{}", qp.parse_query("hello world").get_description());
|
||
// mu_println("{}", qp.parse_query("hello world", Xapian::QueryParser::FLAG_NGRAMS).get_description());
|
||
|
||
g_test_add_func("/query-parser/xapianizer", test_xapian);
|
||
|
||
return g_test_run();
|
||
}
|
||
|
||
#endif /*BUILD_TESTS*/
|