lib: implement new query parser

Implement a new query parser. Its results should be very similar to those
of the old one, but it adds an intermediate S-expression (Sexp)
representation, so users can see how a query is interpreted.
This commit is contained in:
Dirk-Jan C. Binnema 2023-09-09 11:43:28 +03:00
parent 9c28c65d45
commit a9bd6e69d3
18 changed files with 1702 additions and 1632 deletions

View File

@ -1,4 +1,4 @@
## Copyright (C) 2021-2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
## Copyright (C) 2021-2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
@ -26,16 +26,17 @@ lib_mu=static_library(
'mu-config.cc',
'mu-contacts-cache.cc',
'mu-maildir.cc',
'mu-parser.cc',
'mu-query-match-deciders.cc',
'mu-query-threads.cc',
'mu-query.cc',
'mu-script.cc',
'mu-server.cc',
'mu-store.cc',
'mu-tokenizer.cc',
'mu-xapian.cc',
'mu-xapian-db.cc'
'mu-xapian-db.cc',
# query-parser
'mu-query-processor.cc',
'mu-query-parser.cc',
'mu-query-xapianizer.cc'
],
dependencies: [
glib_dep,
@ -46,8 +47,7 @@ lib_mu=static_library(
config_h_dep,
lib_mu_utils_dep,
lib_mu_message_dep,
lib_mu_index_dep
],
lib_mu_index_dep],
install: false)
@ -57,14 +57,32 @@ lib_mu_dep = declare_dependency(
include_directories:
include_directories(['.', '..']))
# dev helpers
# 'tokenize' helper program, built from mu-tokenizer.cc and tokenize.cc;
# it is not installed.
tokenize = executable(
'tokenize',
[ 'mu-tokenizer.cc', 'tokenize.cc' ],
dependencies: [ lib_mu_utils_dep, glib_dep ],
install: false)
#
# query parser dev helpers
#
# 'process-query': mu-query-processor.cc compiled stand-alone with
# BUILD_PROCESS_QUERY defined; not installed.
process_query = executable(
  'process-query',
  ['mu-query-processor.cc'],
  cpp_args: ['-DBUILD_PROCESS_QUERY'],
  dependencies: [glib_dep, lib_mu_dep],
  install: false)
# actual tests
# 'parse-query': mu-query-parser.cc with BUILD_PARSE_QUERY defined
# (parses the query without expanding meta-fields); not installed.
parse_query = executable(
  'parse-query',
  ['mu-query-parser.cc'],
  cpp_args: ['-DBUILD_PARSE_QUERY'],
  dependencies: [glib_dep, lib_mu_dep],
  install: false)

# 'parse-query-expand': the same source with BUILD_PARSE_QUERY_EXPAND defined
# (parses the query with meta-field expansion turned on); not installed.
parse_query_expand = executable(
  'parse-query-expand',
  ['mu-query-parser.cc'],
  cpp_args: ['-DBUILD_PARSE_QUERY_EXPAND'],
  dependencies: [glib_dep, lib_mu_dep],
  install: false)

# 'xapianize-query': mu-query-xapianizer.cc with BUILD_XAPIANIZE_QUERY
# defined; not installed.
xapian_query = executable(
  'xapianize-query',
  ['mu-query-xapianizer.cc'],
  cpp_args: ['-DBUILD_XAPIANIZE_QUERY'],
  dependencies: [glib_dep, lib_mu_dep],
  install: false)
#
# unit tests
#
test('test-threads',
executable('test-threads',
@ -86,4 +104,25 @@ test('test-config',
cpp_args: ['-DBUILD_TESTS'],
dependencies: [glib_dep, lib_mu_dep]))
# Self-tests for the three query-parser stages: each 'mu-query-<stage>.cc' is
# compiled on its own with BUILD_TESTS defined into 'test-query-<stage>' and
# registered as a test of the same name.
foreach stage : ['processor', 'parser', 'xapianizer']
  test('test-query-' + stage,
       executable('test-query-' + stage,
                  'mu-query-' + stage + '.cc',
                  install: false,
                  cpp_args: ['-DBUILD_TESTS'],
                  dependencies: [lib_mu_dep]))
endforeach
subdir('tests')

View File

@ -1,508 +0,0 @@
/*
** Copyright (C) 2020 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#include "mu-parser.hh"
#include <algorithm>
#include <limits>
#include "mu-tokenizer.hh"
#include "utils/mu-utils.hh"
#include "utils/mu-error.hh"
#include "utils/mu-regex.hh"
#include "message/mu-message.hh"
using namespace Mu;
// 3 precedence levels: units (NOT,()) > factors (AND) > terms (OR, XOR)
// query -> <term-1> | ε
// <term-1> -> <factor-1> <term-2> | ε
// <term-2> -> OR|XOR <term-1> | ε
// <factor-1> -> <unit> <factor-2> | ε
// <factor-2> -> [AND]|AND NOT <factor-1> | ε
// <unit> -> NOT <unit> | ( <term-1> ) | <data>
// <data> -> <value> | <range> | <regex>
// <value> -> [field:]value
// <range> -> [field:][lower]..[upper]
// <regex> -> [field:]/regex/
/**
 * Create a Mu::Error with code Error::Code::Internal for a "this cannot
 * happen" situation; meant to be used as `throw BUG(...);`
 *
 * The macro arguments are accepted but not used: only the source line number
 * ends up in the error message.
 *
 * There is deliberately no trailing semicolon, so the macro expands to a plain
 * expression and the call site supplies its own terminator.
 */
#define BUG(...) \
	Mu::Error(Error::Code::Internal, "BUG @ line {}", __LINE__)
/**
* Get the "shortcut"/internal fields for the given fieldstr, or empty if there are none
*
* @param fieldstr a fieldstr, e.g. "subject" or "s" for the subject field
*
* @return a vector with "exploded" values, each with a full name and a code. E.g. "s" might map
* to [<"subject", "S">], while "recip" could map to [<"to", "T">, <"cc", "C">, <"bcc", "B">]
*/
/* Describes one concrete field that a query term is matched against; filled
 * in by add_field() from the corresponding Field. */
struct FieldInfo {
const std::string field; /* the field's name */
const std::string prefix; /* the field's xapian_term() */
bool supports_phrase; /* the field's is_indexable_term() */
Field::Id id; /* the field's id */
};
/* A list of FieldInfo; holds more than one element for multi-fields such as
 * "recip" or "contact". */
using FieldInfoVec = std::vector<FieldInfo>;
/* Private implementation of Parser: keeps the store and the flags, and
 * declares one member function per grammar rule plus the helpers that turn a
 * data token into a tree node. */
struct Parser::Private {
Private(const Store& store, Parser::Flags flags) : store_{store}, flags_{flags} {}
/* collect the store's terms for the given field that match rx */
std::vector<std::string> process_regex(const std::string& field,
const Regex& rx) const;
/* grammar rules; each one consumes tokens from the front of 'tokens' and
 * may append to 'warnings'. term_2 / factor_2 report the operator they
 * found through 'op'. */
Mu::Tree term_1(Mu::Tokens& tokens, WarningVec& warnings) const;
Mu::Tree term_2(Mu::Tokens& tokens, Node::Type& op, WarningVec& warnings) const;
Mu::Tree factor_1(Mu::Tokens& tokens, WarningVec& warnings) const;
Mu::Tree factor_2(Mu::Tokens& tokens, Node::Type& op, WarningVec& warnings) const;
Mu::Tree unit(Mu::Tokens& tokens, WarningVec& warnings) const;
Mu::Tree data(Mu::Tokens& tokens, WarningVec& warnings) const;
/* node builders for the three kinds of data: range, regex and plain
 * value; 'pos' is the token position used when reporting warnings */
Mu::Tree range(const FieldInfoVec& fields,
const std::string& lower,
const std::string& upper,
size_t pos,
WarningVec& warnings) const;
Mu::Tree regex(const FieldInfoVec& fields,
const std::string& v,
size_t pos,
WarningVec& warnings) const;
Mu::Tree value(const FieldInfoVec& fields,
const std::string& v,
size_t pos,
WarningVec& warnings) const;
private:
const Store& store_; /* used by process_regex() to enumerate terms */
const Parser::Flags flags_; /* passed on to process_field() */
};
/**
 * Normalize the value for some special fields.
 *
 * - Priority: a non-empty value is reduced to its first character
 * - Flags: a value known to flag_info() is replaced by that flag's lower-case
 *   shortcut character
 *
 * Everything else (including unknown field names) is returned unchanged.
 *
 * @param field a field name
 * @param value the value as found in the query
 *
 * @return the (possibly rewritten) value
 */
static std::string
process_value(const std::string& field, const std::string& value)
{
	const auto field_opt{field_from_name(field)};
	if (!field_opt)
		return value;

	if (field_opt->id == Field::Id::Priority) {
		if (!value.empty())
			return std::string(1, value[0]);
	} else if (field_opt->id == Field::Id::Flags) {
		if (const auto info{flag_info(value)}; info)
			return std::string(1, info->shortcut_lower());
	}

	return value; // XXX prio/flags, etc. alias
}
/**
 * Append the FieldInfo for the given field to the list, unless the field has
 * no shortcut (such fields cannot be searched).
 *
 * @param fields list to append to
 * @param field_id id of the field to add
 */
static void
add_field(std::vector<FieldInfo>& fields, Field::Id field_id)
{
	const auto field{field_from_id(field_id)};

	if (field.shortcut)
		fields.emplace_back(FieldInfo{std::string{field.name},
					      field.xapian_term(),
					      field.is_indexable_term(),
					      field_id});
}
static std::vector<FieldInfo>
process_field(const std::string& field_str, Parser::Flags flags)
{
std::vector<FieldInfo> fields;
if (any_of(flags & Parser::Flags::UnitTest)) {
add_field(fields, Field::Id::MessageId);
return fields;
}
if (field_str == "contact" || field_str == "recip") { // multi fields
add_field(fields, Field::Id::To);
add_field(fields, Field::Id::Cc);
add_field(fields, Field::Id::Bcc);
if (field_str == "contact")
add_field(fields, Field::Id::From);
} else if (field_str.empty()) {
add_field(fields, Field::Id::To);
add_field(fields, Field::Id::Cc);
add_field(fields, Field::Id::Bcc);
add_field(fields, Field::Id::From);
add_field(fields, Field::Id::Subject);
add_field(fields, Field::Id::BodyText);
} else if (const auto field_opt{field_from_name(field_str)}; field_opt)
add_field(fields, field_opt->id);
return fields;
}
/**
 * Does the field string name a known field for which is_range() holds?
 *
 * @param field_str a field name
 *
 * @return true if so; false otherwise (including for unknown field names)
 */
static bool
is_range_field(const std::string& field_str)
{
	const auto field_opt{field_from_name(field_str)};

	return field_opt && field_opt->is_range();
}
/* The two bounds of a range, as strings; this is what process_range() returns. */
struct MyRange {
std::string lower; /* lower bound */
std::string upper; /* upper bound */
};
/**
 * Convert the bounds of a range to the form used for the given field.
 *
 * For the Date and Changed fields the bounds go through parse_date_time(), for
 * the Size field through parse_size(); in both cases the result is encoded
 * with to_lexnum(). A bound that cannot be parsed becomes 0 (lower) or the
 * maximum int64_t value (upper). For any other field, and for unknown field
 * names, the bounds are returned as they are.
 *
 * @param field_str a field name
 * @param lower lower bound as found in the query
 * @param upper upper bound as found in the query
 *
 * @return the processed bounds
 */
static MyRange
process_range(const std::string& field_str,
	      const std::string& lower, const std::string& upper)
{
	constexpr auto upper_limit = std::numeric_limits<int64_t>::max();

	const auto field_opt{field_from_name(field_str)};
	if (!field_opt)
		return {lower, upper};

	const auto id{field_opt->id};

	if (id == Field::Id::Date || id == Field::Id::Changed)
		return {to_lexnum(parse_date_time(lower, true).value_or(0)),
			to_lexnum(parse_date_time(upper, false).value_or(upper_limit))};

	if (id == Field::Id::Size)
		return {to_lexnum(parse_size(lower, true).value_or(0)),
			to_lexnum(parse_size(upper, false).value_or(upper_limit))};

	return {lower, upper};
}
/**
 * Collect all terms in the store for the given field that match a regular
 * expression.
 *
 * @param field_str a field name
 * @param rx the regular expression to match against
 *
 * @return the matching terms, with their first character (the Xapian prefix)
 * removed; empty if field_str does not name a known field
 */
std::vector<std::string>
Parser::Private::process_regex(const std::string& field_str,
			       const Regex& rx) const
{
	const auto field_opt{field_from_name(field_str)};
	if (!field_opt)
		return {};

	std::vector<std::string> terms;
	store_.for_each_term(field_opt->id, [&](auto&& str) {
		// an empty term has no prefix to strip; skipping it avoids
		// stepping past the string's terminator below.
		if (str.empty())
			return true;
		const auto val{str.c_str() + 1}; // strip off the Xapian prefix.
		if (rx.matches(val))
			terms.emplace_back(val);
		return true; // always continue with the next term
	});

	return terms;
}
/* Get a copy of the first token without consuming it.
 * NOTE(review): calls front() unconditionally, so this presumably requires a
 * non-empty token list -- confirm against the definition of Mu::Tokens. */
static Token
look_ahead(const Mu::Tokens& tokens)
{
return tokens.front();
}
/* Make a tree consisting of a single node of type Node::Type::Empty; the
 * parser functions return this when there is nothing to parse. */
static Mu::Tree
empty()
{
return {{Node::Type::Empty}};
}
/**
 * Make the tree for a plain value.
 *
 * The value goes through utf8_flatten() and, per field, process_value(). With
 * a single field the result is one Value node; with several fields (such as
 * for "recip:") it is an OpOr node with one Value child for each field.
 *
 * @param fields the fields to match; must not be empty
 * @param v the value
 * @param pos position of the token (not used here)
 * @param warnings receives warnings (not used here)
 *
 * @return the tree; throws if fields is empty
 */
Mu::Tree
Parser::Private::value(const FieldInfoVec& fields,
		       const std::string& v,
		       size_t pos,
		       WarningVec& warnings) const
{
	auto flat = utf8_flatten(v);

	if (fields.empty())
		throw BUG("expected one or more fields");

	const auto leaf = [&](const FieldInfo& item) {
		return Tree({Node::Type::Value,
			     FieldValue{item.id, process_value(item.field, flat)}});
	};

	if (fields.size() == 1)
		return leaf(fields.front());

	// a 'multi-field' such as "recip:"
	Tree alternatives(Node{Node::Type::OpOr});
	for (const auto& item : fields)
		alternatives.add_child(leaf(item));

	return alternatives;
}
/**
 * Make the tree for a regular-expression value.
 *
 * The first and last character of v (the enclosing slashes) are dropped and
 * the remainder is compiled as a regular expression. Every term that
 * process_regex() finds for each of the fields becomes a ValueAtomic child of
 * an OpOr node; when there are no matching terms at all, the result is the
 * empty tree.
 *
 * If anything in that process throws (including an invalid regular
 * expression), a warning is recorded and v is treated as a plain value
 * instead.
 *
 * @param fields the fields to match
 * @param v the regular expression, including the enclosing slashes
 * @param pos position of the token, used for the warning
 * @param warnings receives warnings
 *
 * @return the tree; throws if v is shorter than two characters
 */
Mu::Tree
Parser::Private::regex(const FieldInfoVec& fields,
		       const std::string& v,
		       size_t pos,
		       WarningVec& warnings) const
{
	if (v.length() < 2)
		throw BUG("expected regexp, got '%s'", v.c_str());

	const auto pattern = utf8_flatten(v.substr(1, v.length() - 2));

	try {
		Tree alternatives(Node{Node::Type::OpOr});

		const auto rx = Regex::make(pattern, G_REGEX_OPTIMIZE);
		if (!rx)
			throw rx.error();

		for (const auto& field : fields)
			for (const auto& term : process_regex(field.field, *rx))
				alternatives.add_child(
					Tree({Node::Type::ValueAtomic,
					      FieldValue{field.id, term}}));

		if (!alternatives.children.empty())
			return alternatives;

		return empty();

	} catch (...) {
		// fallback
		warnings.push_back({pos, "invalid regexp"});
		return value(fields, v, pos, warnings);
	}
}
/**
 * Make the tree for a range.
 *
 * Only the first of the fields is considered. If that is not a range field,
 * "<lower>..<upper>" is handled as a plain value for all fields. Otherwise,
 * the bounds are converted with process_range(); if the converted lower bound
 * compares greater than the upper one, the conversion is redone with the two
 * bounds swapped.
 *
 * @param fields the fields; must not be empty
 * @param lower lower bound
 * @param upper upper bound
 * @param pos position of the token
 * @param warnings receives warnings
 *
 * @return the tree; throws if fields is empty
 */
Mu::Tree
Parser::Private::range(const FieldInfoVec& fields,
		       const std::string& lower,
		       const std::string& upper,
		       size_t pos,
		       WarningVec& warnings) const
{
	if (fields.empty())
		throw BUG("expected field");

	const auto& first = fields.front();
	if (!is_range_field(first.field))
		return value(fields, lower + ".." + upper, pos, warnings);

	auto bounds = process_range(first.field, lower, upper);
	if (bounds.lower > bounds.upper) // wrong way around; swap
		bounds = process_range(first.field, upper, lower);

	return Tree({Node::Type::Range,
		     FieldValue{first.id, bounds.lower, bounds.upper}});
}
/**
 * Parse a data token: [field:]value, where the value can be a regular
 * expression, a range or a plain value.
 *
 * The first token is always consumed; if it is not of type Data, a warning is
 * recorded but it is processed anyway.
 *
 * The token is split at its first colon, unless that colon is the very first
 * or the very last character; without a usable colon the whole token is the
 * value and the field is empty. When the field is not valid, a warning is
 * recorded and "field:value" as a whole is matched as a value against the
 * default fields.
 *
 * @param tokens the token list
 * @param warnings receives warnings
 *
 * @return the tree for this token
 */
Mu::Tree
Parser::Private::data(Mu::Tokens& tokens, WarningVec& warnings) const
{
	const auto token = look_ahead(tokens);
	if (token.type != Token::Type::Data)
		warnings.push_back({token.pos, "expected: value"});

	tokens.pop_front();

	const auto& str = token.str;
	const auto colon = str.find(":");
	const bool has_field = colon != 0 && colon != std::string::npos &&
		colon != str.length() - 1;

	const std::string field = has_field ? str.substr(0, colon) : std::string{};
	const std::string val = has_field ? str.substr(colon + 1) : str;

	auto fields = process_field(field, flags_);
	if (fields.empty()) { // not valid field...
		warnings.push_back({token.pos, mu_format("invalid field '{}'", field)});
		fields = process_field("", flags_);
		// fallback, treat the whole of foo:bar as a value
		return value(fields, field + ":" + val, token.pos, warnings);
	}

	// does it look like a regexp?
	const bool slashed = val.length() >= 2 &&
		val[0] == '/' && val[val.length() - 1] == '/';
	if (slashed)
		return regex(fields, val, token.pos, warnings);

	// does it look like a range?
	const auto dotdot = val.find("..");
	if (dotdot != std::string::npos)
		return range(fields,
			     val.substr(0, dotdot),
			     val.substr(dotdot + 2),
			     token.pos,
			     warnings);

	// range field without a range - treat as field:val..val
	if (is_range_field(fields.front().field))
		return range(fields, val, val, token.pos, warnings);

	// if nothing else, it's a value.
	return value(fields, val, token.pos, warnings);
}
/**
 * Parse a unit: a negated unit, a parenthesized sub-expression or a data
 * token.
 *
 * For a sub-expression, a missing or unexpected token where the closing
 * parenthesis should be only leads to a warning; the sub-expression parsed so
 * far is returned regardless.
 *
 * @param tokens the token list
 * @param warnings receives warnings
 *
 * @return the tree; the empty tree (plus a warning) if there are no tokens
 */
Mu::Tree
Parser::Private::unit(Mu::Tokens& tokens, WarningVec& warnings) const
{
	if (tokens.empty()) {
		warnings.push_back({0, "expected: unit"});
		return empty();
	}

	const auto head = look_ahead(tokens);

	if (head.type == Token::Type::Not) {
		tokens.pop_front();
		Tree negation{{Node::Type::OpNot}};
		negation.add_child(unit(tokens, warnings));
		return negation;
	}

	if (head.type != Token::Type::Open)
		return data(tokens, warnings);

	// parenthesized sub-expression
	tokens.pop_front();
	auto sub = term_1(tokens, warnings);

	if (tokens.empty()) {
		warnings.push_back({head.pos, "expected: ')'"});
		return sub;
	}

	const auto next = look_ahead(tokens);
	if (next.type == Token::Type::Close)
		tokens.pop_front();
	else
		warnings.push_back(
			{next.pos,
			 std::string("expected: ')' but got ") + next.str});

	return sub;
}
/**
 * Parse the optional right-hand side of an AND.
 *
 * An explicit AND token is consumed. An Open, Data or Not token counts as an
 * implicit AND and is left in place for factor_1(). For any other token, or
 * when there are no tokens left, the result is the empty tree and op is left
 * untouched.
 *
 * @param tokens the token list
 * @param op set to OpAnd if there is a right-hand side
 * @param warnings receives warnings
 *
 * @return the tree for the right-hand side, or the empty tree
 */
Mu::Tree
Parser::Private::factor_2(Mu::Tokens& tokens, Node::Type& op, WarningVec& warnings) const
{
	if (tokens.empty())
		return empty();

	const auto type = look_ahead(tokens).type;

	if (type == Token::Type::And)
		tokens.pop_front();
	else if (type != Token::Type::Open &&
		 type != Token::Type::Data &&
		 type != Token::Type::Not)
		return empty();
	/* else: implicit AND */

	op = Node::Type::OpAnd;

	return factor_1(tokens, warnings);
}
/**
 * Parse a factor: a unit, optionally followed by (explicitly or implicitly)
 * AND-ed factors.
 *
 * @param tokens the token list
 * @param warnings receives warnings
 *
 * @return just the unit's tree when nothing follows it; otherwise a node for
 * the operator reported by factor_2() with the unit and the rest as children
 */
Mu::Tree
Parser::Private::factor_1(Mu::Tokens& tokens, WarningVec& warnings) const
{
	Node::Type op{Node::Type::Invalid};

	auto lhs = unit(tokens, warnings);
	auto rhs = factor_2(tokens, op, warnings);

	if (!rhs.empty()) {
		Tree tree{{op}};
		tree.add_child(std::move(lhs));
		tree.add_child(std::move(rhs));
		return tree;
	}

	return lhs;
}
/**
 * Parse the optional right-hand side of an OR or XOR.
 *
 * If the first token is OR or XOR, it is consumed and the term after it is
 * parsed. Otherwise the result is the empty tree; a warning is recorded for
 * that unless the token is a closing parenthesis.
 *
 * @param tokens the token list
 * @param op set to OpOr or OpXor if there is a right-hand side
 * @param warnings receives warnings
 *
 * @return the tree for the right-hand side, or the empty tree
 */
Mu::Tree
Parser::Private::term_2(Mu::Tokens& tokens, Node::Type& op, WarningVec& warnings) const
{
	if (tokens.empty())
		return empty();

	const auto token = look_ahead(tokens);

	if (token.type == Token::Type::Or)
		op = Node::Type::OpOr;
	else if (token.type == Token::Type::Xor)
		op = Node::Type::OpXor;
	else {
		if (token.type != Token::Type::Close)
			warnings.push_back({token.pos, "expected OR|XOR"});
		return empty();
	}

	tokens.pop_front();

	return term_1(tokens, warnings);
}
/**
 * Parse a term: a factor, optionally followed by OR/XOR and another term.
 *
 * @param tokens the token list
 * @param warnings receives warnings
 *
 * @return just the factor's tree when nothing follows it; otherwise a node
 * for the operator reported by term_2() with the factor and the rest as
 * children
 */
Mu::Tree
Parser::Private::term_1(Mu::Tokens& tokens, WarningVec& warnings) const
{
	Node::Type op{Node::Type::Invalid};

	auto lhs = factor_1(tokens, warnings);
	auto rhs = term_2(tokens, op, warnings);

	if (rhs.empty())
		return lhs;

	Tree tree{{op}};
	tree.add_child(std::move(lhs));
	tree.add_child(std::move(rhs));

	return tree;
}
/* Construct a parser; all state lives in the Private object, which keeps a
 * reference to the store and a copy of the flags. */
Mu::Parser::Parser(const Store& store, Parser::Flags flags) :
priv_{std::make_unique<Private>(store, flags)}
{
}
/* Defined in this file, where Private is a complete type, as the destructor
 * of the unique_ptr member requires. */
Mu::Parser::~Parser() = default;
/**
 * Parse a query expression into a tree.
 *
 * A std::runtime_error thrown while tokenizing or parsing is not propagated:
 * its message is written to stderr and the empty tree is returned.
 *
 * @param expr the query expression
 * @param warnings receives the warnings found while parsing
 *
 * @return the parse tree; the empty tree if there are no tokens
 */
Mu::Tree
Mu::Parser::parse(const std::string& expr, WarningVec& warnings) const
{
	try {
		auto tokens = tokenize(expr);

		return tokens.empty() ? empty() : priv_->term_1(tokens, warnings);

	} catch (const std::runtime_error& ex) {
		std::cerr << ex.what() << std::endl;
		return empty();
	}
}

View File

@ -1,106 +0,0 @@
/*
** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#ifndef __PARSER_HH__
#define __PARSER_HH__
#include "utils/mu-utils.hh"
#include <string>
#include <vector>
#include <memory>
#include <mu-tree.hh>
#include <mu-store.hh>
// A simple recursive-descent parser for queries. Follows the Xapian syntax,
// but better handles non-alphanum; also implements regexp
namespace Mu {
/**
* A parser warning
*
*/
/**
 * A parser warning: a position in the query string and a message.
 *
 * The members are deliberately not const, so that warnings can be assigned
 * and containers of them can be erased from, sorted, and so on.
 */
struct Warning {
	size_t	    pos{}; /**< pos in string */
	std::string msg;   /**< warning message */

	/**
	 * operator==
	 *
	 * @param rhs right-hand side
	 *
	 * @return true if rhs is equal to this (same position and same
	 * message); false otherwise
	 */
	bool operator==(const Warning& rhs) const { return pos == rhs.pos && msg == rhs.msg; }
};

/** A list of warnings */
using WarningVec = std::vector<Warning>;

/**
 * operator<<
 *
 * Writes the warning as "<pos>:<msg>".
 *
 * @param os an output stream
 * @param w a warning
 *
 * @return the updated output stream
 */
inline std::ostream&
operator<<(std::ostream& os, const Warning& w)
{
	os << w.pos << ":" << w.msg;
	return os;
}
/**
 * The query parser: turns a query string into a parse tree.
 */
class Parser {
public:
/** Parser flags. With UnitTest set, every term is matched against the
 * message-id field only, whatever field the query names. */
enum struct Flags { None = 0, UnitTest = 1 << 0 };
/**
* Construct a query parser object
*
* @param store the store; the parser keeps a reference to it
* @param flags parser flags
*/
Parser(const Store& store, Flags = Flags::None);
/**
* DTOR
*
*/
~Parser();
/**
* Parse a query string
*
* @param query a query string
* @param warnings vec to receive warnings
*
* @return a parse-tree
*/
Tree parse(const std::string& query, WarningVec& warnings) const;
private:
struct Private;
std::unique_ptr<Private> priv_; /* the implementation */
};
/* allow for combining and testing Parser::Flags values */
MU_ENABLE_BITOPS(Parser::Flags);
} // namespace Mu
#endif /* __PARSER_HH__ */

428
lib/mu-query-parser.cc Normal file
View File

@ -0,0 +1,428 @@
/*
** Copyright (C) 2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
** Free Software Foundation; either version 3, or (at your option) any
** later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation,
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/
#include "mu-query-parser.hh"
#include <string_view>
#include <variant>
#include <type_traits>
#include <iostream>
#include "utils/mu-utils.hh"
#include "utils/mu-sexp.hh"
#include "utils/mu-option.hh"
#include <glib.h>
#include "utils/mu-utils-file.hh"
using namespace Mu;
// Sexp extensions...
/**
 * Insert an element at the front of a list-sexp.
 *
 * @param s a list-sexp
 * @param e the element to insert
 *
 * @return s, after the insertion
 */
static Sexp&
prepend(Sexp& s, Sexp&& e)
{
	auto& items{s.list()};
	items.insert(items.begin(), std::move(e));

	return s;
}
/**
 * Get the second element of a list-sexp.
 *
 * @param s some sexp
 *
 * @return a reference to the second element, or Nothing if s is not a list
 * or has fewer than two elements
 */
static Option<Sexp&>
second(Sexp& s)
{
	const bool has_second{s.listp() && !s.empty() && s.cbegin() + 1 != s.cend()};
	if (!has_second)
		return Nothing;

	return *(s.begin()+1);
}
static bool
looks_like_matcher(const Sexp& sexp)
{
// all the "terminal values" (from the Mu parser's pov)
const std::array<Sexp::Symbol, 5> value_syms = {
placeholder_sym, phrase_sym, regex_sym, range_sym, wildcard_sym
};
if (!sexp.listp() || sexp.empty() || !sexp.front().symbolp())
return false;
const auto symbol{sexp.front().symbol()};
if (seq_some(value_syms, [&](auto &&sym) { return symbol == sym; }))
return true;
else if (!!field_from_name(symbol.name) || field_is_combi(symbol.name))
return true;
else
return false;
}
/**
 * State that is passed along through the parsing functions while parsing a
 * single query.
 */
struct ParseContext {
	bool			 expand{false}; /**< expand meta-fields? off unless set */
	std::vector<std::string> warnings{};	/**< warnings found while parsing */
};
/*
* Grammar
*
* query -> factor { (<OR> | <XOR>) factor }
* factor -> unit { [<AND>] unit }
* unit -> matcher | <NOT> query | <(> query <)>
* matcher
*/
static Sexp query(Sexp& tokens, ParseContext& ctx);
/**
 * Parse a matcher: consume the first token and turn it into a matcher sexp.
 *
 * A token that does not look like a matcher (e.g. an operator symbol in a
 * place where a value is expected) is turned into a placeholder matcher with
 * the symbol's name as its value.
 *
 * With ctx.expand set, a matcher for which fields_from_name() yields fields is
 * replaced by an "or" over those fields, each with the original matcher's
 * value.
 *
 * @param tokens the token list
 * @param ctx the parse context
 *
 * @return the matcher, or an empty sexp if there are no tokens
 */
static Sexp
matcher(Sexp& tokens, ParseContext& ctx)
{
	if (tokens.empty())
		return {};

	auto val{*tokens.head()};
	tokens.pop_front();

	/* special case: if we find some non-matcher type here, we need to
	 * second-guess the tokenizer */
	if (!looks_like_matcher(val))
		val = Sexp{placeholder_sym, val.symbol().name};

	if (!ctx.expand) /* no need to expand meta-fields */
		return val;

	const auto symbol{val.front().symbol()};
	const auto fields = fields_from_name(symbol == placeholder_sym ? "" : symbol.name);
	if (fields.empty())
		return val;

	Sexp alternatives{};
	alternatives.add(or_sym);
	for (auto&& field: fields)
		alternatives.add(Sexp{Sexp::Symbol{field.name}, Sexp{*second(val)}});

	return alternatives;
}
/**
 * Parse a unit: a sub-expression in parentheses, a negated query or a matcher.
 *
 * Special cases:
 * - a missing closing parenthesis is silently accepted
 * - a "not" without anything after it is taken to be a search for the word
 *   "not"
 * - the negation of a negation is replaced by what was negated
 *
 * @param tokens the token list
 * @param ctx the parse context
 *
 * @return the sexp for the unit
 */
static Sexp
unit(Sexp& tokens, ParseContext& ctx)
{
	if (tokens.head_symbolp(open_sym)) { /* ( sub) */
		tokens.pop_front();
		Sexp sub{query(tokens, ctx)};
		/* a missing <)> is tolerated without a warning */
		if (tokens.head_symbolp(close_sym))
			tokens.pop_front();
		return sub;
	}

	if (!tokens.head_symbolp(not_sym))
		return matcher(tokens, ctx); /* matcher */

	/* NOT */
	tokens.pop_front();
	Sexp sub{query(tokens, ctx)};

	/* special case: interpret "not" as a matcher instead; */
	if (sub.empty())
		return Sexp{placeholder_sym, not_sym.name};

	/* we try to optimize: double negations are removed */
	if (sub.head_symbolp(not_sym))
		return *second(sub);

	return Sexp(not_sym, std::move(sub));
}
/**
 * Parse a factor: one or more units, combined with AND.
 *
 * The AND can be explicit or implicit: the query 'a b' is interpreted as
 * 'a AND b'. An implicit AND is assumed when the next token is the start of a
 * sub-expression or looks like a matcher.
 *
 * @param tokens the token list
 * @param ctx the parse context
 *
 * @return the first unit as-is if it is the only one; otherwise a flat
 * (and <unit>...) list
 */
static Sexp
factor(Sexp& tokens, ParseContext& ctx)
{
	/* does the head token start something to be AND-ed implicitly? */
	const auto starts_operand = [&]() -> bool {
		if (tokens.head_symbolp(open_sym))
			return true;
		if (auto&& head{tokens.head()}; head)
			return looks_like_matcher(*head);
		return false;
	};

	Sexp first = unit(tokens, ctx);
	Sexp rest;

	for (;;) {
		if (tokens.head_symbolp(and_sym))
			tokens.pop_front();
		else if (!starts_operand())
			break;

		auto&& next = unit(tokens, ctx);
		if (next.empty())
			break;

		rest.add(std::move(next));
	}

	if (rest.empty())
		return first;

	first = Sexp{and_sym, std::move(first)};
	first.add_list(std::move(rest));

	return first;
}
/**
 * Parse a query: one or more factors, combined with OR and/or XOR.
 *
 * The factors that follow an OR and the ones that follow an XOR are collected
 * separately, so that the result is flat rather than nested (Xapian likes flat
 * trees):
 * - only ORs: (or <first> <or-factors>...)
 * - only XORs: (xor <first> <xor-factors>...)
 * - both: (or <first> <or-factors>... (xor <xor-factors>...))
 *
 * @param tokens the token list
 * @param ctx the parse context
 *
 * @return the sexp for the query; just the first factor if there are no
 * OR/XOR operators
 */
static Sexp
query(Sexp& tokens, ParseContext& ctx)
{
	Sexp first = factor(tokens, ctx);
	Sexp ors, xors;

	for (;;) {
		/* where does the next factor go? */
		Sexp* target{};
		if (tokens.head_symbolp(or_sym))
			target = &ors;
		else if (tokens.head_symbolp(xor_sym))
			target = &xors;

		if (!target)
			break;

		tokens.pop_front();
		target->add(factor(tokens, ctx));
	}

	const bool have_or{!ors.empty()};
	const bool have_xor{!xors.empty()};

	if (have_or) {
		first = Sexp{or_sym, std::move(first)};
		first.add_list(std::move(ors));
		if (have_xor) { /* the XORs become a single extra operand */
			prepend(xors, xor_sym);
			first.add(std::move(xors));
		}
	} else if (have_xor) {
		first = Sexp{xor_sym, std::move(first)};
		first.add_list(std::move(xors));
	}

	return first;
}
/*
 * Parse a query expression: turn it into a list of tokens with
 * process_query(), and then build the parse tree from those. Throws a
 * std::runtime_error if process_query() does not yield a list.
 */
Sexp
Mu::parse_query(const std::string& expr, bool expand)
{
	ParseContext context{expand, {}};

	auto&& items = process_query(expr);
	if (!items.listp())
		throw std::runtime_error("tokens must be a list-sexp");

	return query(items, context);
}
#if defined(BUILD_PARSE_QUERY)||defined(BUILD_PARSE_QUERY_EXPAND)
/*
 * Development helper: join the command-line arguments into one query
 * expression (each followed by a space), parse it and print the resulting
 * s-expression. Meta-fields are expanded only when this is built with
 * BUILD_PARSE_QUERY_EXPAND.
 */
int
main (int argc, char *argv[])
{
#ifdef BUILD_PARSE_QUERY_EXPAND
	constexpr bool expand{true};
#else
	constexpr bool expand{false};
#endif
	if (argc < 2) {
		mu_printerrln("expected: {} <query>", argv[0]);
		return 1;
	}

	std::string expr;
	for (auto i = 1; i < argc; ++i) {
		expr += argv[i];
		expr += " ";
	}

	auto&& sexp = parse_query(expr, expand);
	mu_println("{}", sexp.to_string());

	return 0;
}
#endif // BUILD_PARSE_QUERY || BUILD_PARSE_QUERY_EXPAND
#if BUILD_TESTS
/*
* Tests.
*
*/
#include "utils/mu-test-utils.hh"
/* a test-case: a query (first) and the expected printed parse tree (second) */
using TestCase = std::pair<std::string, std::string>;

/*
 * Parse the query of each of the test-cases, and check that the resulting
 * s-expression, as a string, is equal to what was expected.
 */
static void
check_cases(const std::vector<TestCase>& cases, bool expand = false)
{
	for (const auto& [expr, expected]: cases) {
		auto&& sexp{parse_query(expr, expand)};
		//mu_message ("'{}' <=> '{}'", sexp.to_string(), expected);
		assert_equal(sexp.to_string(), expected);
	}
}

/* operators and their precedence */
static void
test_parser_basic()
{
	check_cases({
		// single term
		TestCase{R"(a)", R"((_ "a"))"},
		// a and b
		TestCase{R"(a and b)", R"((and (_ "a") (_ "b")))"},
		// a and b and c
		TestCase{R"(a and b and c)", R"((and (_ "a") (_ "b") (_ "c")))"},
		// a or b
		TestCase{R"(a or b)", R"((or (_ "a") (_ "b")))"},
		// a or b and c
		TestCase{R"(a or b and c)", R"((or (_ "a") (and (_ "b") (_ "c"))))"},
		// a and b or c
		TestCase{R"(a and b or c)", R"((or (and (_ "a") (_ "b")) (_ "c")))"},
		// not a
		TestCase{R"(not a)", R"((not (_ "a")))"},
		// lone not
		TestCase{R"(not)", R"((_ "not"))"},
		// a and (b or c)
		TestCase{R"(a and (b or c))", R"((and (_ "a") (or (_ "b") (_ "c"))))"},
		// TODO: add more...
	});
}

/* queries that are not quite well-formed */
static void
test_parser_recover()
{
	check_cases({
		// implicit AND
		TestCase{R"(a b)", R"((and (_ "a") (_ "b")))"},
		// operator where a value is expected ("and" is used as a value)
		TestCase{R"(a or and)", R"((or (_ "a") (_ "and")))"},
		// missing end ), empty sub-expression
		TestCase{R"(a and ()", R"((_ "a"))"},
		// missing end )
		TestCase{R"(a and (b)", R"((and (_ "a") (_ "b")))"},
	});
}

/* matchers with fields */
static void
test_parser_fields()
{
	check_cases({
		// simple field
		TestCase{R"(s:hello)", R"((subject "hello"))"},
		// field, wildcard, regexp
		TestCase{R"(subject:a* recip:/b/)",
			 R"((and (subject (wildcard "a")) (recip (regex "b"))))"},
		TestCase{R"(from:hello or subject:world)",
			 R"((or (from "hello") (subject "world")))"},
	});
}

/* expansion of meta-fields */
static void
test_parser_expand()
{
	check_cases({
		// combi-field
		TestCase{R"(recip:a)", R"((or (to "a") (cc "a") (bcc "a")))"},
		// wildcard without a field
		TestCase{R"(a*)",
			 R"((or (to (wildcard "a")) (cc (wildcard "a")) (bcc (wildcard "a")) (from (wildcard "a")) (subject (wildcard "a")) (body (wildcard "a")) (embed (wildcard "a"))))"},
		TestCase{R"(a xor contact:b)",
			 R"((xor (or (to "a") (cc "a") (bcc "a") (from "a") (subject "a") (body "a") (embed "a")) (or (to "b") (cc "b") (bcc "b") (from "b"))))"}
	}, true/*expand*/);
}

/* ranges, open and closed */
static void
test_parser_range()
{
	check_cases({
		TestCase{R"(size:1)", R"((size (range "1" "1")))"},
		TestCase{R"(size:2..)", R"((size (range "2" "")))"},
		TestCase{R"(size:..1k)", R"((size (range "" "1024")))"},
		TestCase{R"(size:..)", R"((size (range "" "")))"},
	}, true/*expand*/);
}

/* removal of double negations */
static void
test_parser_optimize()
{
	check_cases({
		TestCase{R"(not a)", R"((not (_ "a")))"},
		TestCase{R"(not not a)", R"((_ "a"))"},
		TestCase{R"(not not not a)", R"((not (_ "a")))"},
		TestCase{R"(not not not not a)", R"((_ "a"))"},
	});
}
/* Test entry point: initialize the test environment, register the parser
 * test functions under /query-parser/ and run them; the return value is that
 * of g_test_run(). */
int
main(int argc, char* argv[])
{
mu_test_init(&argc, &argv);
g_test_add_func("/query-parser/basic", test_parser_basic);
g_test_add_func("/query-parser/recover", test_parser_recover);
g_test_add_func("/query-parser/fields", test_parser_fields);
g_test_add_func("/query-parser/range", test_parser_range);
g_test_add_func("/query-parser/expand", test_parser_expand);
g_test_add_func("/query-parser/optimize", test_parser_optimize);
return g_test_run();
}
#endif /*BUILD_TESTS*/

116
lib/mu-query-parser.hh Normal file
View File

@ -0,0 +1,116 @@
/*
** Copyright (C) 2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
** Free Software Foundation; either version 3, or (at your option) any
** later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation,
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/
#include <memory>
#include <string>
#include <array>
#include <xapian.h>
#include "utils/mu-sexp.hh"
#include "utils/mu-result.hh"
#include "mu-store.hh"
namespace Mu {
/*
* Some useful symbol-sexps
*/
/* matcher for a value without an explicit field */
static inline const auto placeholder_sym = "_"_sym;
/* kinds of values */
static inline const auto phrase_sym = "phrase"_sym;
static inline const auto regex_sym = "regex"_sym;
static inline const auto range_sym = "range"_sym;
static inline const auto wildcard_sym = "wildcard"_sym;
/* brackets around a sub-expression */
static inline const auto open_sym = "("_sym;
static inline const auto close_sym = ")"_sym;
/* operators */
static inline const auto and_sym = "and"_sym;
static inline const auto or_sym = "or"_sym;
static inline const auto xor_sym = "xor"_sym;
static inline const auto not_sym = "not"_sym;
static inline const auto and_not_sym = "and-not"_sym;
/*
* We take a query, then parse it into a human-readable s-expression and then
* turn that s-expression into a Xapian query
*
* some query:
* "from:hello or subject:world"
*
* 1. tokenize-query
* => ((from "hello") or (subject "world"))
*
* 2. parse-query
* => (or (from "hello") (subject "world"))
*
* 3. xapian-query
* => Query((Fhello OR Sworld))
* *
*/
/**
* Analyze the query expression and express it as a Sexp-list with the sequence
* of elements.
*
* @param expr a search expression
*
* @return Sexp with the sequence of elements
*/
Sexp process_query(const std::string& expr);
/**
* Parse the query expression and create a parse-tree expressed as an Sexp
* object (tree).
*
* Internally, this processes the stream into element (see process_query()) and
* processes the tokens into a Sexp. This sexp is meant to be human-readable.
*
* @param expr a search expression
* @param expand whether to expand meta-fields (such as '_', 'recip', 'contact')
*
* @return Sexp with the parse tree
*/
Sexp parse_query(const std::string& expr, bool expand=false);
/**
* Make a Xapian Query for the given string expression.
*
* This uses parse_query() and turns the S-expression into a Xapian::Query.
* Unlike mere parsing, this uses the information in the store to resolve
* wildcard / regex queries.
*
* @param store the message store
* @param expr a string expression
* @param flag flags to influence the parser, see ParserFlags
*
* @return a Xapian query result or an error.
*/
/**
 * Flags to influence how make_xapian_query() turns an expression into a query.
 */
enum struct ParserFlags {
None = 0 << 0,
SupportNgrams = 1 << 0, /**< Support Xapian's Ngrams for CJK etc. handling */
XapianParser = 1 << 1, /**< For testing only, use Xapian's
* built-in QueryParser; this is not
* fully compatible with mu, only useful
* for debugging. */
};
/* Make a Xapian query for a string expression, using the given store; this
 * does not throw (noexcept) but returns either the query or an error. */
Result<Xapian::Query> make_xapian_query(const Store& store, const std::string& expr,
ParserFlags flag=ParserFlags::None) noexcept;
/* allow for combining and testing ParserFlags values */
MU_ENABLE_BITOPS(ParserFlags);
} // namespace Mu

548
lib/mu-query-processor.cc Normal file
View File

@ -0,0 +1,548 @@
/*
** Copyright (C) 2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
** Free Software Foundation; either version 3, or (at your option) any
** later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation,
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/
#include "mu-query-parser.hh"
#include <string_view>
#include <variant>
#include <type_traits>
#include <iostream>
#include "utils/mu-option.hh"
#include <glib.h>
#include "utils/mu-utils-file.hh"
using namespace Mu;
/**
* An 'Element' here is a rather rich version of what is traditionally
* considered a (lexical) token.
*
* We try to determine as much as possible during the analysis phase, which is
* quite a bit (given the fairly simple query language), so the parsing phase
* only has to deal with putting these elements in a tree.
*
* During analysis:
* 1) separate the query into a sequence of strings
* 2) for each of these strings
* - Does it look like an Op? ('or', 'and' etc.) --> Op
* - Otherwise: treat as a Basic field ([field]:value)
* - Whitespace in value? -> promote to Phrase
* - otherwise:
* - Is value a regex (in /<regex>/) -> promote to Regex
* - Is value a wildcard (ends in '*') -> promote to Wildcard
* - is value a range (a..b) -> promote to Range
*
* After analysis, we have the sequence of elements as a Sexp, which can then be
* fed to the parser. We attempt to make the Sexp as human-readable as possible.
*/
struct Element {
enum struct Bracket { Open, Close} ;
enum struct Op { And, Or, Xor, Not, AndNot };
/**
* A value together with an optional field name; the common base of all
* the field-matching element kinds below.
*/
template<typename ValueType>
struct FieldValue {
// value without a field
FieldValue(const ValueType& v): field{}, value{v}{}
// value with a field name given as something convertible to std::string
template<typename StringType>
FieldValue(const StringType& fname, const ValueType& v):
field{std::string{fname}}, value{v}{}
// value with an optional field name; field stays unset if fname is empty
template<typename StringType>
FieldValue(const Option<StringType>& fname, const ValueType& v) {
if (fname)
field = std::string{*fname};
value = v;
}
Option<std::string> field{};
ValueType value{};
};
// the element kinds that carry a (field, value) pair
struct Basic: public FieldValue<std::string> {using FieldValue::FieldValue;};
struct Phrase: public FieldValue<std::string> {using FieldValue::FieldValue;};
struct Regex: public FieldValue<std::string> {using FieldValue::FieldValue;};
struct Wildcard: public FieldValue<std::string> {using FieldValue::FieldValue;};
struct Range: public FieldValue<std::pair<std::string, std::string>> {
using FieldValue::FieldValue; };
using ValueType = std::variant<
/* */
Bracket,
/* op */
Op,
/* string values */
std::string,
/* value types */
Basic,
Phrase,
Regex,
Wildcard,
Range
>;
// helper
// NOTE(review): decay_equiv is not used anywhere in this struct.
template <typename T, typename U>
struct decay_equiv:
std::is_same<typename std::decay<T>::type, U>::type {};
Element(Bracket b): value{b} {}
Element(Op op): value{op} {}
// NOTE(review): when the enable_if condition holds, its ::type is void, so
// the defaulted parameter below reads as `void = 0`; this overload is
// presumably never selected -- confirm.
template<typename T,
typename std::enable_if<std::is_base_of<class FieldValue<T>, T>::value>::type = 0>
Element(const std::string& field, const T& val): value{T{field, val}} {}
Element(const std::string& val): value{val} {}
/**
* Get a reference to the held alternative of type T, or Nothing if the
* element currently holds some other alternative.
*/
template<typename T>
Option<T&> get_opt() {
if (std::holds_alternative<T>(value))
return std::get<T>(value);
else
return Nothing;
}
/**
* Turn this element into a Sexp.
*
* Brackets and operators map to their symbols; field-values map to a
* list starting with the field symbol (or the placeholder symbol when
* there is no field), followed by either the plain value (Basic) or a
* nested (phrase|regex|wildcard|range ...) list.
*
* Throws std::logic_error when the element still holds a bare string.
*/
Sexp sexp() const {
return std::visit([](auto&& arg)->Sexp {
// field name as a symbol, or the placeholder if there is none
auto field_sym = [](const Option<std::string>& field) {
return field ? Sexp::Symbol{*field} : placeholder_sym;
};
using T = std::decay_t<decltype(arg)>;
if constexpr (std::is_same_v<T, Bracket>) {
switch(arg) {
case Bracket::Open:
return open_sym;
case Bracket::Close:
return close_sym;
default:
throw std::logic_error("invalid bracket type");
}
} else if constexpr (std::is_same_v<T, Op>) {
switch(arg) {
case Op::And:
return and_sym;
case Op::Or:
return or_sym;
case Op::Xor:
return xor_sym;
case Op::Not:
return not_sym;
case Op::AndNot:
return and_not_sym;
default:
throw std::logic_error("invalid op type");
}
} else if constexpr (std::is_same_v<T, Basic>) {
return Sexp { field_sym(arg.field), arg.value };
} else if constexpr (std::is_same_v<T, Phrase>) {
return Sexp {field_sym(arg.field),
Sexp{ phrase_sym, arg.value }};
} else if constexpr (std::is_same_v<T, Regex>) {
return Sexp { field_sym(arg.field), Sexp{ regex_sym, arg.value}};
} else if constexpr (std::is_same_v<T, Wildcard>) {
return Sexp { field_sym(arg.field), Sexp{ wildcard_sym, arg.value}};
} else if constexpr (std::is_same_v<T, Range>) {
return Sexp {field_sym(arg.field),
Sexp{ range_sym, arg.value.first, arg.value.second }};
} else if constexpr (std::is_same_v<T, std::string>) {
// bare strings are expected to have been promoted before this point
throw std::logic_error("no bare strings should be here");
} else
throw std::logic_error("uninvited visitor");
}, value);
}
ValueType value;
};
using Elements = std::vector<Element>;
/**
 * Pop the first character off a string.
 *
 * @param[in,out] str a string; loses its first character (if any)
 * @param[in,out] pos position in the _original_ string; advanced by one
 *                    when a character was consumed
 *
 * @return the removed character, or 0 if the string was empty.
 */
static char
read_char(std::string& str, size_t& pos)
{
	if (!str.empty()) {
		const char first{str.front()};
		str.erase(str.begin());
		pos += 1;
		return first;
	}

	return '\0';
}
/**
 * Push a character back onto the front of the string; the inverse of
 * read_char().
 *
 * @param[in,out] str a string; gets kar prepended
 * @param[in,out] pos position in the _original_ string; moved back by one
 * @param kar the character to put back
 */
static void
unread_char(std::string& str, size_t& pos, char kar)
{
	str.insert(str.begin(), kar);
	pos -= 1;
}
/**
* Remove the next element from the string and return it.
*
* The result is either a Bracket element (for an unquoted, unescaped '(' or
* ')') or an element holding a bare string; the bare string still needs to be
* promoted by the later processing steps.
*
* @param[in,out] str a string; the consumed characters are removed
* @param[in,out] pos position in _original_ string
*
* @return an Element or Nothing
*/
static Option<Element>
next_element(std::string& str, size_t& pos)
{
bool quoted{}, escaped{};
std::string value{};
auto is_separator = [](char c) { return c == ' '|| c == '(' || c == ')'; };
while (!str.empty()) {
auto kar = read_char(str, pos);
// a backslash toggles the escape state; the escaping backslash itself
// is dropped, while an escaped backslash falls through and is kept
if (kar == '\\') {
escaped = !escaped;
if (escaped)
continue;
}
// an unescaped double-quote either opens a quoted section or, when
// already inside one, ends the element
if (kar == '"' && !escaped) {
if (!escaped && quoted)
return Element{value};
else {
quoted = true;
continue;
}
}
if (!quoted && !escaped && is_separator(kar)) {
// a separator ends the value collected so far; put it back so
// the next call sees it
if (!value.empty()) {
unread_char(str, pos, kar);
return Element{value};
}
// spaces between elements are skipped
if (quoted || kar == ' ')
continue;
switch (kar) {
case '(':
return Element{Element::Bracket::Open};
case ')':
return Element{Element::Bracket::Close};
default:
break;
}
}
value += kar;
escaped = false;
}
// input exhausted: return what was collected, if anything (this also covers
// a quoted section that was never closed)
if (value.empty())
return Nothing;
else
return Element{value};
}
/**
 * Promote a bare-string element to an Op if it spells one of the logical
 * operators (compared after utf8_flatten()); anything else passes through
 * unchanged.
 */
static Option<Element>
opify(Element&& element)
{
	auto&& word{element.get_opt<std::string>()};
	if (word) {
		// AndNot only appears during parsing.
		static const std::unordered_map<std::string, Element::Op> op_names = {
			{ "and", Element::Op::And },
			{ "or",  Element::Op::Or},
			{ "xor", Element::Op::Xor },
			{ "not", Element::Op::Not },
		};

		const auto found{op_names.find(utf8_flatten(*word))};
		if (found != op_names.end())
			element.value = found->second;
	}

	return element;
}
/**
 * Promote a bare-string element to a Basic element.
 *
 * The string is split at the first ':'. If the part before it names a known
 * field (or a combi-field), the element becomes Basic{field, value};
 * otherwise the whole string becomes a field-less Basic value.
 *
 * For the Flags and Priority fields, the value is normalized to the canonical
 * flag / priority name; when the value is not a valid flag or priority, the
 * whole string is treated as a field-less value.
 *
 * @param element an element; only elements holding a bare string are changed
 *
 * @return the (possibly promoted) element
 */
static Option<Element>
basify(Element&& element)
{
	auto&& str{element.get_opt<std::string>()};
	if (!str)
		return element;

	const auto pos = str->find(':');
	if (pos == std::string::npos) {
		element.value = Element::Basic{*str};
		return element;
	}

	const auto fname{str->substr(0, pos)};
	if (auto&& field{field_from_name(fname)}; field) {
		auto val{str->substr(pos + 1)};
		if (field == Field::Id::Flags) {
			if (auto&& finfo{flag_info(val)}; finfo)
				element.value = Element::Basic{field->name,
							       std::string{finfo->name}};
			else
				// an unknown flag must still be promoted to Basic; a
				// bare string left here would make Element::sexp() throw.
				element.value = Element::Basic{*str};
		} else if (field == Field::Id::Priority) {
			if (auto&& prio{priority_from_name(val)}; prio)
				element.value = Element::Basic{field->name,
							       std::string{priority_name(*prio)}};
			else
				element.value = Element::Basic{*str};
		} else
			element.value = Element::Basic{std::string{field->name}, val};
	} else if (field_is_combi(fname))
		element.value = Element::Basic{fname, str->substr(pos +1)};
	else
		element.value = Element::Basic{*str};

	return element;
}
/**
 * Promote a Basic element whose value contains a space to a Phrase; all other
 * elements pass through unchanged.
 */
static Option<Element>
phrasify(Element&& element)
{
	auto&& basic{element.get_opt<Element::Basic>()};
	if (basic && basic->value.find(' ') != std::string::npos)
		element.value = Element::Phrase{basic->field, basic->value};

	return element;
}
/**
 * Promote a Basic element whose value ends in '*' (and has at least one
 * character before it) to a Wildcard; the trailing '*' is dropped from the
 * value. All other elements pass through unchanged.
 */
static Option<Element>
wildcardify(Element&& element)
{
	auto&& basic{element.get_opt<Element::Basic>()};
	if (!basic)
		return element;

	auto& text{basic->value};
	if (text.size() >= 2 && text.back() == '*') {
		text.pop_back(); // drop the '*'
		element.value = Element::Wildcard{basic->field, text};
	}

	return element;
}
/**
 * Promote a Basic element whose value looks like /<regex>/ (with at least one
 * character between the slashes) to a Regex; the surrounding slashes are
 * dropped from the value. All other elements pass through unchanged.
 */
static Option<Element>
regexpify(Element&& element)
{
	auto&& basic{element.get_opt<Element::Basic>()};
	if (!basic)
		return element;

	const auto& text{basic->value};
	const bool slashed{text.size() >= 3 && text.front() == '/' && text.back() == '/'};
	if (slashed)
		element.value = Element::Regex{basic->field,
					       text.substr(1, text.size() - 2)};

	return element;
}
/**
 * Promote a Basic element for one of the range-fields (Size, Date, Changed)
 * to a Range.
 *
 * The value is split at the first ".."; without "..", both ends of the range
 * are the value itself. Sizes are converted to numbers, dates/times to
 * ISO-8601 "Zulu" timestamps. An empty or unparseable end becomes an empty
 * string. Ends given in the wrong order are put in the right order.
 *
 * Elements that are not Basic, have no field, or whose field is not a range
 * field pass through unchanged.
 */
static Option<Element>
rangify(Element&& element)
{
	auto&& basic{element.get_opt<Element::Basic>()};
	if (!basic || !basic->field)
		return element;

	auto&& field = field_from_name(*basic->field);
	if (!field || !field->is_range())
		return element;

	/* yes: get the range */
	const std::string text{basic->value};
	std::string first{text};
	std::string second{text};
	if (const auto dots{text.find("..")}; dots != std::string::npos) {
		first  = text.substr(0, dots);
		second = text.substr(dots + 2);
	}

	switch (field->id) {
	case Field::Id::Size: {
		const auto bytes = [](const std::string& s, bool is_last)->int64_t {
			return s.empty() ? -1 : parse_size(s, is_last).value_or(-1);
		};
		int64_t low{bytes(first, false/*first*/)};
		int64_t high{bytes(second, true/*last*/)};
		if (high >= 0 && low > high)
			std::swap(low, high);

		element.value = Element::Range{basic->field,
			{low < 0 ? "" : std::to_string(low),
			 high < 0 ? "" : std::to_string(high)}};
		break;
	}
	case Field::Id::Date:
	case Field::Id::Changed: {
		const auto stamp = [](const std::string& s, bool is_first)->int64_t {
			return s.empty() ? -1 :
				parse_date_time(s, is_first, false/*local*/).value_or(-1);
		};
		int64_t lower{stamp(first, true/*lower*/)};
		int64_t upper{stamp(second, false/*upper*/)};
		if (lower >= 0 && upper >= 0 && lower > upper) {
			// can't simply swap due to rounding up/down
			lower = stamp(second, true/*lower*/);
			upper = stamp(first, false/*upper*/);
		}

		// use "Zulu" time.
		element.value = Element::Range{
			basic->field,
			{lower < 0 ? "" :
			 mu_format("{:%FT%TZ}", mu_time(lower, true/*utc*/)),
			 upper < 0 ? "" :
			 mu_format("{:%FT%TZ}", mu_time(upper, true/*utc*/))}};
		break;
	}
	default:
		break;
	}

	return element;
}
/**
 * Turn a query expression into a sequence of elements.
 *
 * Control characters in the expression are first replaced by spaces; then
 * elements are taken off the front one by one, and each is run through the
 * promotion steps (op, basic, regex, phrase, wildcard, range -- in that
 * order).
 *
 * @param expr a query expression
 *
 * @return the elements
 */
static Elements
process(const std::string& expr)
{
	Elements elements{};
	size_t offset{0};

	/* all control chars become SPC */
	std::string str{expr};
	for (auto& c: str) {
		// <cctype> functions require a value representable as unsigned
		// char; a plain (possibly negative) char from a UTF-8 sequence
		// would be undefined behavior.
		if (::iscntrl(static_cast<unsigned char>(c)))
			c = ' ';
	}

	while(!str.empty()) {
		auto&& element = next_element(str, offset)
			.and_then(opify)
			.and_then(basify)
			.and_then(regexpify)
			.and_then(phrasify)
			.and_then(wildcardify)
			.and_then(rangify);
		if (element)
			elements.emplace_back(std::move(element.value()));
	}

	return elements;
}
/*
 * Process the expression into elements and collect the Sexp of each element,
 * in order, into a single list.
 */
Sexp
Mu::process_query(const std::string& expr)
{
	Sexp result{};

	for (const auto& element: ::process(expr))
		result.add(element.sexp());

	return result;
}
#ifdef BUILD_PROCESS_QUERY
/*
 * Development helper: join all command-line arguments into one query
 * expression and print the processed elements as a Sexp.
 */
int
main (int argc, char *argv[])
{
	if (argc < 2) {
		mu_printerrln("expected: process-query <query>");
		return 1;
	}

	// glue the arguments together, each followed by a space
	std::string expr;
	for (int idx{1}; idx != argc; ++idx)
		expr.append(argv[idx]).append(" ");

	mu_println("{}", process_query(expr).to_string());

	return 0;
}
#endif /*BUILD_PROCESS_QUERY*/
#if BUILD_TESTS
/*
* Tests.
*
*/
#include "utils/mu-test-utils.hh"
// a query expression paired with the expected string form of its Sexp
using TestCase = std::pair<std::string, std::string>;

/* check that process_query() yields the expected Sexp for some expressions */
static void
test_processor()
{
	const std::vector<TestCase> cases = {
		TestCase{R"(hello world)", R"(((_ "hello") (_ "world")))"},
		TestCase{R"("hello world")", R"(((_ (phrase "hello world"))))"},
		TestCase{R"(subject:"hello world")", R"(((subject (phrase "hello world"))))"},
		// TODO: add more...
	};

	for (const auto& [expr, expected]: cases) {
		auto&& sexp{process_query(expr)};
		assert_equal(sexp.to_string(), expected);
	}
}
/* test entry point (BUILD_TESTS only): register and run the processor test */
int
main(int argc, char* argv[])
{
mu_test_init(&argc, &argv);
// register test_processor under its GLib test path
g_test_add_func("/query-parser/processor", test_processor);
return g_test_run();
}
#endif /*BUILD_TESTS*/

484
lib/mu-query-xapianizer.cc Normal file
View File

@ -0,0 +1,484 @@
/*
** Copyright (C) 2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
** Free Software Foundation; either version 3, or (at your option) any
** later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation,
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/
#include "config.h"
#include "mu-query-parser.hh"
#include <string_view>
#include <variant>
#include <array>
#include <type_traits>
#include <iostream>
#include "utils/mu-option.hh"
#include <glib.h>
#include "utils/mu-utils-file.hh"
using namespace Mu;
/**
 * Expand terms for scripts without explicit word-breaks (e.g.
 * Chinese/Japanese/Korean) in the way that Xapian expects it -
 * use Xapian's built-in QueryParser just for that.
 *
 * @param field the field to search; its Xapian prefix is used as the
 *              default prefix for the generated terms
 * @param str the string to expand
 *
 * @return the expanded query
 */
static Result<Xapian::Query>
ngram_expand(const Field& field, const std::string& str)
{
	// NOTE: no printing to stdout here; this is library code and a
	// leftover debug print would end up in the output of every caller.
	Xapian::QueryParser qp;
	const auto pfx{std::string(1U, field.xapian_prefix())};

	qp.set_default_op(Xapian::Query::OP_OR);

	return qp.parse_query(
		str,
#if HAVE_XAPIAN_FLAG_NGRAMS
		Xapian::QueryParser::FLAG_NGRAMS,
#else
		Xapian::QueryParser::FLAG_CJK_NGRAM,
#endif /*HAVE_XAPIAN_FLAG_NGRAMS*/
		pfx);
}
/**
 * Get the sexp without its first item.
 *
 * @return the remaining list, or Nothing if s is not a non-empty list.
 */
static Option<Sexp>
tail(Sexp&& s)
{
	const bool has_head{s.listp() && !s.empty()};
	if (!has_head)
		return Nothing;

	auto& items{s.list()};
	items.erase(items.begin());

	return s;
}
/**
 * Get the name of the symbol at the head of a list.
 *
 * @return the symbol name, or Nothing if s is not a non-empty list that
 * starts with a symbol.
 */
Option<std::string>
head_symbol(const Sexp& s)
{
	if (s.listp() && !s.empty() && s.head() && s.head()->symbolp())
		return s.head()->symbol().name;

	return Nothing;
}
/**
 * Get the string at position n of a list.
 *
 * @return the string, or Nothing if args is not a list, is too short, or the
 * item at position n is not a string.
 */
Option<std::string>
string_nth(const Sexp& args, size_t n)
{
	if (!args.listp() || args.size() < n + 1)
		return Nothing;

	auto&& item{args.list().at(n)};
	if (item.stringp())
		return item.string();

	return Nothing;
}
/**
 * Make a phrase query for a field.
 *
 * @param field the field; must be an indexable-term field
 * @param s the phrase arguments; expected to be a single string, which is
 *          split on spaces into the words of the phrase
 *
 * @return an OP_PHRASE query or an error
 */
static Result<Xapian::Query>
phrase(const Field& field, Sexp&& s)
{
	if (!field.is_indexable_term())
		return Err(Error::Code::InvalidArgument,
			   "field {} does not support phrases", field.name);

	const bool single_string{s.size() == 1 && s.front().stringp()};
	if (!single_string)
		return Err(Error::Code::InvalidArgument,
			   "invalid phrase for field {}: '{}'", field.name, s.to_string());

	auto&& words{split(s.front().string(), " ")};

	std::vector<Xapian::Query> terms;
	terms.reserve(words.size());
	for (auto&& word: words)
		terms.emplace_back(Xapian::Query{field.xapian_term(std::move(word))});

	return Xapian::Query{Xapian::Query::OP_PHRASE,
			     terms.begin(), terms.end()};
}
/**
 * Make a query for a regular expression: walk all terms the store has for
 * the field, and OR together those whose text (after the first character of
 * the term) matches.
 *
 * An invalid regular expression is logged and yields a query matching nothing.
 */
static Result<Xapian::Query>
regex(const Store& store, const Field& field, const std::string& rx_str)
{
	auto&& pattern{utf8_flatten(rx_str)};
	auto&& rx{Regex::make(pattern, G_REGEX_OPTIMIZE)};
	if (!rx) {
		mu_warning("invalid regexp: '{}': {}", pattern, rx.error().what());
		return Xapian::Query::MatchNothing;
	}

	std::vector<Xapian::Query> matching;
	store.for_each_term(field.id, [&](auto&& term) {
		// skip the first character of the term
		auto&& body{term.data() + 1};
		if (rx->matches(body))
			matching.emplace_back(field.xapian_term(std::string_view{body}));
		return true;
	});

	return Xapian::Query(Xapian::Query::OP_OR, matching.begin(), matching.end());
}
/**
 * Make a value-range query for one of the range fields.
 *
 * @param field the field; Date, Changed and Size are supported
 * @param s the range arguments: two strings, the lower and upper bound; an
 *          empty string means that side is unbounded
 *
 * @return a query (match-all if both sides are unbounded), or an error if
 * there are no two string values, a date/time cannot be parsed, or the
 * field does not support ranges.
 */
static Result<Xapian::Query>
range(const Field& field, Sexp&& s)
{
	auto&& r0{string_nth(s, 0)};
	auto&& r1{string_nth(s, 1)};

	if (!r0 || !r1)
		return Err(Error::Code::InvalidArgument, "expected 2 range values");

	// in the sexp, we use iso date/time for human readability; now convert to
	// time_t
	auto iso_to_lexnum=[](const std::string& iso)->Option<std::string> {
		if (iso.empty())
			return iso;
		if (auto&& t{parse_date_time(iso, true, true/*utc*/)}; !t)
			return Nothing;
		else
			return to_lexnum(*t);
	};

	if (field == Field::Id::Date || field == Field::Id::Changed) {
		// iso -> time_t
		r0 = iso_to_lexnum(*r0);
		r1 = iso_to_lexnum(*r1);
		// the conversion yields Nothing for an unparseable date/time; bail
		// out instead of dereferencing an empty Option below.
		if (!r0 || !r1)
			return Err(Error::Code::InvalidArgument,
				   "invalid date/time in range for field {}", field.name);
	} else if (field == Field::Id::Size) {
		if (!r0->empty())
			r0 = to_lexnum(::atoll(r0->c_str()));
		if (!r1->empty())
			r1 = to_lexnum(::atoll(r1->c_str()));
	} else
		return Err(Error::Code::InvalidArgument,
			   "unsupported range field {}", field.name);

	if (r0->empty() && r1->empty())
		return Xapian::Query::MatchAll;
	else if (r0->empty() && !r1->empty())
		return Xapian::Query(Xapian::Query::OP_VALUE_LE,
				     field.value_no(), *r1);
	else if (!r0->empty() && r1->empty())
		return Xapian::Query(Xapian::Query::OP_VALUE_GE,
				     field.value_no(), *r0);
	else
		return Xapian::Query(Xapian::Query::OP_VALUE_RANGE,
				     field.value_no(), *r0, *r1);
}
// the names of the logical operators in the Sexp, with the Xapian operator
// each is turned into
using OpPair = std::pair<const std::string_view, Xapian::Query::op>;
static constexpr std::array<OpPair, 4> LogOpPairs = {{
	{ "and", Xapian::Query::OP_AND },
	{ "or",  Xapian::Query::OP_OR },
	{ "xor", Xapian::Query::OP_XOR },
	{ "not", Xapian::Query::OP_AND_NOT }
}};

/**
 * Look up the Xapian operator for a logical-operator name.
 *
 * @return the operator, or Nothing if opname is not in LogOpPairs.
 */
static Option<Xapian::Query::op>
find_log_op(const std::string& opname)
{
	for (const auto& [name, xop]: LogOpPairs) {
		if (name == opname)
			return xop;
	}

	return Nothing;
}
static Result<Xapian::Query> parse(const Store& store, Sexp&& s, Mu::ParserFlags flags);

/**
 * Make a query for a logical operator and its arguments.
 *
 * @param store the message store
 * @param op the Xapian operator
 * @param args the arguments (a non-empty list); each is parsed recursively
 * @param flags parser flags
 *
 * @return a query, or an error if args is not a non-empty list, any of the
 * arguments fails to parse, NOT does not get exactly one argument, or op is
 * not one of the supported operators.
 */
static Result<Xapian::Query>
parse_logop(const Store& store, Xapian::Query::op op, Sexp&& args, Mu::ParserFlags flags)
{
	if (!args.listp() || args.empty())
		return Err(Error::Code::InvalidArgument,
			   "expected non-empty list but got {}", args.to_string());

	std::vector<Xapian::Query> qs;
	for (auto&& elm: args.list()) {
		if (auto&& q{parse(store, std::move(elm), flags)}; !q)
			return Err(std::move(q.error()));
		else
			qs.emplace_back(std::move(*q));
	}

	switch(op) {
	case Xapian::Query::OP_AND_NOT:
		// NOT x is expressed as "everything AND-NOT x"
		if (qs.size() != 1)
			return Err(Error::Code::InvalidArgument,
				   "expected single argument for NOT");
		else
			return Xapian::Query{op, Xapian::Query::MatchAll, qs.at(0)};

	case Xapian::Query::OP_AND:
	case Xapian::Query::OP_OR:
	case Xapian::Query::OP_XOR:
		return Xapian::Query(op, qs.begin(), qs.end());

	default:
		return Err(Error::Code::InvalidArgument, "unexpected xapian op");
	}
}
/**
 * Make a query for a field-matcher, i.e. the (wildcard|range|regex|phrase ...)
 * part of a (field (matcher ...)) expression.
 *
 * @param store the message store
 * @param field the field to match
 * @param match_sym name of the matcher symbol
 * @param args the arguments of the matcher
 *
 * @return a query, or an error for an unknown matcher or for a
 * wildcard / range / regex matcher without a string as its first argument.
 */
static Result<Xapian::Query>
parse_field_matcher(const Store& store, const Field& field,
		    const std::string& match_sym, Sexp&& args)
{
	auto&& first{string_nth(args, 0)};
	const bool has_first{!!first};

	if (has_first && match_sym == wildcard_sym.name)
		return Xapian::Query{Xapian::Query::OP_WILDCARD,
				     field.xapian_term(*first)};

	if (has_first && match_sym == range_sym.name)
		return range(field, std::move(args));

	if (has_first && match_sym == regex_sym.name)
		return regex(store, field, *first);

	if (match_sym == phrase_sym.name)
		return phrase(field, std::move(args));

	return Err(Error::Code::InvalidArgument,
		   "invalid field '{}'/'{}' matcher: {}",
		   field.name, match_sym, args.to_string());
}
/**
 * Make a query for a basic (field "value") expression.
 *
 * @param field the field
 * @param vals the value; must be a string
 * @param flags parser flags; with SupportNgrams, values in an unbroken script
 *              are expanded into ngrams for indexable-term fields
 *
 * @return a query, or an error if the value is not a string or is not a valid
 * flag / priority for the Flags / Priority fields.
 */
static Result<Xapian::Query>
parse_basic(const Field& field, Sexp&& vals, Mu::ParserFlags flags)
{
	// must be evaluated on every call: a function-local static would latch
	// the flags of the very first call for the lifetime of the process.
	const auto ngrams = any_of(flags & ParserFlags::SupportNgrams);

	if (!vals.stringp())
		return Err(Error::Code::InvalidArgument, "expected string");

	auto&& val{vals.string()};

	switch (field.id) {
	case Field::Id::Flags:
		if (auto&& finfo{flag_info(val)}; finfo)
			return Xapian::Query{field.xapian_term(finfo->shortcut_lower())};
		else
			return Err(Error::Code::InvalidArgument,
				   "invalid flag '{}'", val);
	case Field::Id::Priority:
		if (auto&& prio{priority_from_name(val)}; prio)
			return Xapian::Query{field.xapian_term(to_char(*prio))};
		else
			return Err(Error::Code::InvalidArgument,
				   "invalid priority '{}'", val);
	default: {
		auto q{Xapian::Query{field.xapian_term(val)}};
		if (ngrams) { // special case: cjk; see if we can create an expanded query.
			if (field.is_indexable_term() && contains_unbroken_script(val))
				if (auto&& ng{ngram_expand(field, val)}; ng)
					return ng;
		}
		return q;
	}}
}
/**
 * Turn a parsed-query Sexp into a Xapian query.
 *
 * The sexp must be a list starting with a symbol, which is either a logical
 * operator, followed by its arguments, or a field name, followed by a string
 * value or by a field-matcher such as (phrase "a b c").
 *
 * @param store the message store
 * @param s the sexp
 * @param flags parser flags
 *
 * @return a query or an error
 */
static Result<Xapian::Query>
parse(const Store& store, Sexp&& s, Mu::ParserFlags flags)
{
	auto&& headsym{head_symbol(s)};
	if (!headsym)
		return Err(Error::Code::InvalidArgument,
			   "expected (symbol ...) but got {}", s.to_string());

	// ie., something like (or|and| ... ....)
	if (auto&& logop{find_log_op(*headsym)}; logop) {
		auto&& args{tail(std::move(s))};
		if (args)
			return parse_logop(store, *logop, std::move(*args), flags);

		return Err(Error::Code::InvalidArgument,
			   "expected (logop ...) but got {}",
			   s.to_string());
	}

	// otherwise it has to be something like (field ...)
	auto&& field{field_from_name(*headsym)};
	if (!field)
		return Err(Error::Code::InvalidArgument,
			   "unexpected sexp {}", s.to_string());

	auto&& rest{tail(std::move(s))};
	if (!rest || rest->empty())
		return Err(Error::Code::InvalidArgument,
			   "expected field-value or field-matcher");

	auto&& matcher{rest->front()};

	// field-value: (field "value"); ensure "value" is there
	if (matcher.stringp())
		return parse_basic(*field, std::move(matcher), flags);

	// otherwise, we expect a field-matcher, e.g. (field (phrase "a b c"))
	// ensure the matcher is a list starting with a symbol
	auto&& match_sym{head_symbol(matcher)};
	if (!match_sym)
		return Err(Error::Code::InvalidArgument,
			   "expected field-matcher");

	auto&& args{tail(std::move(matcher))};
	if (!args)
		return Err(Error::Code::InvalidArgument, "expected matcher arguments");

	return parse_field_matcher(store, *field, *match_sym, std::move(*args));
}
// parse the way Xapian's internal parser does it; for testing.
//
// Every searchable field gets its Xapian prefix registered under its name,
// its shortcut and (if it has one) its alias; terms are AND-ed by default.
static Xapian::Query
xapian_query_classic(const std::string& expr, Mu::ParserFlags flags)
{
	Xapian::QueryParser xqp;

	// add prefixes
	field_for_each([&](auto&& field){
		if (!field.is_searchable())
			return;

		const std::string prefix(1U, field.xapian_prefix());

		xqp.add_prefix(std::string{field.name}, prefix);
		xqp.add_prefix(std::string(1U, field.shortcut), prefix);
		if (!field.alias.empty())
			xqp.add_prefix(std::string{field.alias}, prefix);
	});

	unsigned xflags = Xapian::QueryParser::FLAG_PHRASE |
		Xapian::QueryParser::FLAG_BOOLEAN |
		Xapian::QueryParser::FLAG_WILDCARD;

	if (any_of(flags & ParserFlags::SupportNgrams)) {
#if HAVE_XAPIAN_FLAG_NGRAMS
		xflags |= Xapian::QueryParser::FLAG_NGRAMS;
#else
		xflags |= Xapian::QueryParser::FLAG_CJK_NGRAM;
#endif
	}

	xqp.set_default_op(Xapian::Query::OP_AND);

	return xqp.parse_query(expr, xflags);
}
/*
 * Make a Xapian query for the expression; with the XapianParser flag this
 * uses Xapian's own query parser, otherwise the expression is parsed into a
 * Sexp (with meta-fields expanded) which is then turned into a query.
 *
 * This function is noexcept, but the code it calls can throw (Xapian's
 * QueryParser on unparseable input, std::logic_error from the element
 * processing); catch everything here and return it as an error, since an
 * escaping exception would terminate the program.
 */
Result<Xapian::Query>
Mu::make_xapian_query(const Store& store, const std::string& expr, Mu::ParserFlags flags) noexcept
{
	try {
		if (any_of(flags & Mu::ParserFlags::XapianParser))
			return xapian_query_classic(expr, flags);

		return parse(store, Mu::parse_query(expr, true/*expand*/), flags);

	} catch (const Xapian::Error& xerr) {
		return Err(Error::Code::InvalidArgument,
			   "failed to parse query '{}': {}", expr, xerr.get_msg());
	} catch (const std::exception& ex) {
		return Err(Error::Code::InvalidArgument,
			   "failed to parse query '{}': {}", expr, ex.what());
	} catch (...) {
		return Err(Error::Code::InvalidArgument,
			   "failed to parse query '{}'", expr);
	}
}
#ifdef BUILD_XAPIANIZE_QUERY
/*
 * Development helper: join all command-line arguments into one query
 * expression, turn it into a Xapian query using the store at the runtime
 * Xapian-database path, and print the description of that query.
 */
int
main (int argc, char *argv[])
{
	if (argc < 2) {
		mu_printerrln("expected: parse-query <query>");
		return 1;
	}

	auto store = Store::make(runtime_path(Mu::RuntimePath::XapianDb));
	if (!store) {
		mu_printerrln("error: {}", store.error());
		return 2;
	}

	// glue the arguments together, each followed by a space
	std::string expr;
	for (int idx{1}; idx != argc; ++idx)
		expr.append(argv[idx]).append(" ");

	auto&& query{make_xapian_query(*store, expr)};
	if (!query) {
		mu_printerrln("error: {}", query.error());
		return 1;
	}

	mu_println("{}", query->get_description());

	return 0;
}
#endif /*BUILD_XAPIANIZE_QUERY*/
#if BUILD_TESTS
/*
* Tests.
*
*/
#include "utils/mu-test-utils.hh"
// a query expression paired with the expected description of its Xapian query
using TestCase = std::pair<std::string, std::string>;

/*
 * check that make_xapian_query() yields the expected Xapian query for some
 * expressions, using a new store in a temporary directory
 */
static void
test_xapian()
{
	auto&& testhome{unwrap(make_temp_dir())};
	auto&& dbpath{runtime_path(RuntimePath::XapianDb, testhome)};
	auto&& store{unwrap(Store::make_new(dbpath, join_paths(testhome, "test-maildir")))};

	const std::vector<TestCase> cases = {
		TestCase{R"(i:87h766tzzz.fsf@gnus.org)", R"(Query(I87h766tzzz.fsf@gnus.org))"},
		TestCase{R"(subject:foo to:bar)", R"(Query((Sfoo AND Tbar)))"},
		TestCase{R"(subject:"cuux*")", R"(Query(WILDCARD SYNONYM Scuux))"},
		TestCase{R"(subject:"hello world")", R"(Query((Shello PHRASE 2 Sworld)))"},
		TestCase{R"(subject:/boo/")", R"(Query())"},
	};

	for (const auto& [expr, expected]: cases) {
		auto&& xq{make_xapian_query(store, expr)};
		assert_valid_result(xq);
		mu_println("'{}' <=> '{}'", xq->get_description(), expected);
		assert_equal(xq->get_description(), expected);
	}

	remove_directory(testhome);
}
/* test entry point (BUILD_TESTS only): register and run the xapianizer test */
int
main(int argc, char* argv[])
{
mu_test_init(&argc, &argv);
// NOTE(review): qp is only used by the commented-out experiments below.
Xapian::QueryParser qp;
// mu_println("{}", qp.parse_query("スポンサーシップ募集").get_description());
// mu_println("{}", qp.parse_query("スポンサーシップ募集", Xapian::QueryParser::FLAG_NGRAMS).get_description());
// mu_println("{}", qp.parse_query("hello world").get_description());
// mu_println("{}", qp.parse_query("hello world", Xapian::QueryParser::FLAG_NGRAMS).get_description());
// register test_xapian under its GLib test path
g_test_add_func("/query-parser/xapianizer", test_xapian);
return g_test_run();
}
#endif /*BUILD_TESTS*/

View File

@ -32,15 +32,17 @@
#include "mu-query-results.hh"
#include "mu-query-match-deciders.hh"
#include "mu-query-threads.hh"
#include <mu-xapian.hh>
#include "mu-xapian-db.hh"
#include "mu-query-parser.hh"
using namespace Mu;
struct Query::Private {
Private(const Store& store) : store_{store}, parser_{store_} {}
// New
// bool calculate_threads (Xapian::Enquire& enq, size maxnum);
Private(const Store& store) :
store_{store},
parser_flags_{any_of(store_.message_options() & Message::Options::SupportNgrams) ?
ParserFlags::SupportNgrams : ParserFlags::None} {}
Xapian::Enquire make_enquire(const std::string& expr, Field::Id sortfield_id,
QueryFlags qflags) const;
@ -61,7 +63,7 @@ struct Query::Private {
Field::Id sortfield_id, QueryFlags qflags,
size_t maxnum) const;
const Store& store_;
const Parser parser_;
const ParserFlags parser_flags_;
};
Query::Query(const Store& store) : priv_{std::make_unique<Private>(store)} {}
@ -79,22 +81,27 @@ sort_enquire(Xapian::Enquire& enq, Field::Id sortfield_id, QueryFlags qflags)
return enq;
}
/*
 * Make a Xapian query for the expression. An empty expression (or just "")
 * matches everything; an expression that cannot be turned into a query is
 * logged as a warning and matches nothing.
 */
static Xapian::Query
make_query(const Store& store, const std::string& expr, ParserFlags parser_flags)
{
	if (expr.empty() || expr == R"("")")
		return Xapian::Query::MatchAll;

	auto&& q{make_xapian_query(store, expr, parser_flags)};
	if (q)
		return q.value();

	mu_warning("error in query '{}': {}", expr, q.error().what());

	return Xapian::Query::MatchNothing;
}
Xapian::Enquire
Query::Private::make_enquire(const std::string& expr,
Field::Id sortfield_id,
QueryFlags qflags) const
{
auto enq{store_.xapian_db().enquire()};
if (expr.empty() || expr == R"("")")
enq.set_query(Xapian::Query::MatchAll);
else {
WarningVec warns;
const auto tree{parser_.parse(expr, warns)};
for (auto&& w : warns)
mu_warning("query warning: {}", to_string(w));
enq.set_query(xapian_query(tree));
}
enq.set_query(make_query(store_, expr, parser_flags_));
sort_enquire(enq, sortfield_id, qflags);
return enq;
@ -122,8 +129,7 @@ Query::Private::make_related_enquire(const StringSet& thread_ids,
struct ThreadKeyMaker : public Xapian::KeyMaker {
ThreadKeyMaker(const QueryMatches& matches) : match_info_(matches) {}
std::string operator()(const Xapian::Document& doc) const override
{
std::string operator()(const Xapian::Document& doc) const override {
const auto it{match_info_.find(doc.get_docid())};
return (it == match_info_.end()) ? "" : it->second.thread_path;
}
@ -288,14 +294,10 @@ Query::count(const std::string& expr) const
std::string
Query::parse(const std::string& expr, bool xapian) const
{
WarningVec warns;
const auto tree{priv_->parser_.parse(expr, warns)};
for (auto&& w : warns)
mu_warning("query warning: {}", to_string(w));
if (xapian)
return xapian_query(tree).get_description();
return make_query(priv_->store_, expr,
priv_->parser_flags_).get_description();
else
return to_string(tree);
return parse_query(expr).to_string();
}
/* LCOV_EXCL_STOP*/

View File

@ -1,129 +0,0 @@
/*
** Copyright (C) 2020 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#include "mu-tokenizer.hh"
#include "utils/mu-utils.hh"
#include <cctype>
#include <iostream>
#include <algorithm>
using namespace Mu;
/**
 * Is this character a separator between tokens?
 *
 * @param c a character
 *
 * @return true if c is a blank (as per isblank()) or one of '(' and ')';
 * false otherwise.
 */
static bool
is_separator(char c)
{
	// <cctype> functions require a value representable as unsigned char;
	// a plain (possibly negative) char would be undefined behavior.
	if (isblank(static_cast<unsigned char>(c)))
		return true;

	return c == '(' || c == ')';
}
/**
 * Make a token for a value: an operator token if the value, lower-cased,
 * spells "and", "or", "xor" or "not", a data token otherwise. The token
 * carries the value as it was given.
 */
static Mu::Token
op_or_value(size_t pos, const std::string& val)
{
	std::string lowered(val.size(), '\0');
	std::transform(val.begin(), val.end(), lowered.begin(), ::tolower);

	auto type{Token::Type::Data};
	if (lowered == "and")
		type = Token::Type::And;
	else if (lowered == "or")
		type = Token::Type::Or;
	else if (lowered == "xor")
		type = Token::Type::Xor;
	else if (lowered == "not")
		type = Token::Type::Not;

	return Token{pos, type, val};
}
/**
 * Push a character back onto the front of the string, and move the position
 * back by one.
 */
static void
unread_char(std::string& food, char kar, size_t& pos)
{
	food.insert(food.begin(), kar);
	pos -= 1;
}
/**
* Remove the next token from the front of the string and return it.
*
* @param[in,out] food the string; the consumed characters are removed
* @param[in,out] pos position; incremented for every consumed character
*
* @return the token: a bracket, an operator, or data
*/
static Mu::Token
eat_token(std::string& food, size_t& pos)
{
bool quoted{};
bool escaped{};
std::string value{};
while (!food.empty()) {
const auto kar = food[0];
food.erase(0, 1);
++pos;
// a backslash toggles the escape state; the escaping backslash itself
// is dropped, while an escaped backslash falls through and is kept
if (kar == '\\') {
escaped = !escaped;
if (escaped)
continue;
}
// a double-quote ends the token when inside a quoted section and
// unescaped; in all other cases it opens a quoted section
// NOTE(review): this includes an escaped quote, which thus is not
// kept as a literal character -- confirm this is intended.
if (kar == '"') {
if (!escaped && quoted)
return Token{pos, Token::Type::Data, value};
else {
quoted = true;
continue;
}
}
if (!quoted && !escaped && is_separator(kar)) {
// a separator ends the value collected so far; put it back so
// the next call sees it
if (!value.empty() && kar != ':') {
unread_char(food, kar, pos);
return op_or_value(pos, value);
}
// blanks between tokens are skipped
if (quoted || isblank(kar))
continue;
switch (kar) {
case '(': return {pos, Token::Type::Open, "("};
case ')': return {pos, Token::Type::Close, ")"};
default: break;
}
}
value += kar;
escaped = false;
}
// input exhausted: whatever was collected (possibly nothing) is data
return {pos, Token::Type::Data, value};
}
/*
 * Tokenize the string: take tokens off the front of the (utf8-cleaned) string
 * until nothing is left. An empty string gives no tokens.
 */
Mu::Tokens
Mu::tokenize(const std::string& s)
{
	if (s.empty())
		return {};

	Tokens tokens{};
	size_t pos{0};
	for (std::string food = utf8_clean(s); !food.empty();)
		tokens.emplace_back(eat_token(food, pos));

	return tokens;
}

View File

@ -1,139 +0,0 @@
/*
** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#ifndef __TOKENIZER_HH__
#define __TOKENIZER_HH__
#include <string>
#include <vector>
#include <deque>
#include <ostream>
#include <stdexcept>
// A simple tokenizer, which turns a string into a deque of tokens
//
// It recognizes '(', ')', '*' 'and', 'or', 'xor', 'not'
//
// Note that even if we recognize those at the lexical level, they might be demoted to mere strings
// when we're creating the parse tree.
//
// Furthermore, we detect ranges ("a..b") and regexps (/../) at the parser level, since we need a
// bit more context to resolve ambiguities.
namespace Mu {
// A token: a piece of the query string with its type and position
struct Token {
enum class Type {
Data, /**< e .g., banana or date:..456 */
// Brackets
Open, /**< ( */
Close, /**< ) */
// Unops
Not, /**< logical not*/
// Binops
And, /**< logical and */
Or, /**< logical or */
Xor, /**< logical xor */
Empty, /**< nothing */
};
size_t pos{}; /**< position in string */
Type type{}; /**< token type */
const std::string str{}; /**< data for this token */
/**
* operator==
*
* @param rhs right-hand side
*
* @return true if rhs is equal to this (same position, type and data);
* false otherwise
*/
bool operator==(const Token& rhs) const
{
return pos == rhs.pos && type == rhs.type && str == rhs.str;
}
};
/**
 * Write the symbolic name of a token type (such as "<data>" or "<and>")
 * to an output stream.
 *
 * @param os an output stream
 * @param t a token type
 *
 * @return the updated output stream
 *
 * Throws std::runtime_error for a value that is not a known token type.
 */
inline std::ostream&
operator<<(std::ostream& os, Token::Type t)
{
	const char* label{};

	switch (t) {
	case Token::Type::Data:  label = "<data>";  break;
	case Token::Type::Open:  label = "<open>";  break;
	case Token::Type::Close: label = "<close>"; break;
	case Token::Type::Not:   label = "<not>";   break;
	case Token::Type::And:   label = "<and>";   break;
	case Token::Type::Or:    label = "<or>";    break;
	case Token::Type::Xor:   label = "<xor>";   break;
	case Token::Type::Empty: label = "<empty>"; break;
	default: // not reachable for valid enumerators; keeps the compiler quiet
		throw std::runtime_error("<<bug>>");
	}

	return os << label;
}
/**
 * Write a token to an output stream as "<pos>: <type>", followed by
 * " [<data>]" when the token carries any data.
 *
 * @param os an output stream
 * @param t a token
 *
 * @return the updated output stream
 */
inline std::ostream&
operator<<(std::ostream& os, const Token& t)
{
	os << t.pos << ": " << t.type;

	// tokens without data are printed without the bracketed part
	if (t.str.empty())
		return os;

	return os << " [" << t.str << "]";
}
/**
* Tokenize a string into a deque of tokens. The tokenization always succeeds, i.e., it ignores
* errors such as a missing end-".
*
* @param s a string
*
* @return a deque of tokens
*/
using Tokens = std::deque<Token>;
Tokens tokenize(const std::string& s);
} // namespace Mu
#endif /* __TOKENIZER_HH__ */

View File

@ -1,162 +0,0 @@
/*
** Copyright (C) 2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#ifndef TREE_HH__
#define TREE_HH__
#include <vector>
#include <string>
#include <string_view>
#include <iostream>
#include <message/mu-fields.hh>
#include <utils/mu-option.hh>
#include <utils/mu-error.hh>
namespace Mu {
/**
 * A field (identified by its id) together with the value(s) to match
 * against it: either a single value, or two values which range()
 * returns as a pair.
 */
struct FieldValue {
	/**
	 * Construct a single-valued FieldValue; val2 is left empty.
	 *
	 * The string is taken by const-reference: a const by-value
	 * parameter could not be moved from, so it only added a copy.
	 *
	 * @param idarg the field id
	 * @param valarg the value
	 */
	FieldValue(Field::Id idarg, const std::string& valarg):
		field_id{idarg}, val1{valarg} {}

	/**
	 * Construct a two-valued FieldValue.
	 *
	 * @param idarg the field id
	 * @param valarg1 the first value (range().first)
	 * @param valarg2 the second value (range().second)
	 */
	FieldValue(Field::Id idarg, const std::string& valarg1, const std::string& valarg2):
		field_id{idarg}, val1{valarg1}, val2{valarg2} {}

	/** @return the Field which field_id refers to */
	const Field&       field() const { return field_from_id(field_id); }
	/** @return the (first) value */
	const std::string& value() const { return val1; }
	/** @return a copy of both values, as a {val1, val2} pair */
	const std::pair<std::string, std::string> range() const { return { val1, val2 }; }

	const Field::Id   field_id; /**< id of the field */
	const std::string val1;     /**< the value, or the first value of a pair */
	const std::string val2;     /**< the second value; empty for single values */
};
/**
 * Write a field-value to an output stream: a space and the quoted field
 * name, followed by either the two quoted range values (for range
 * fields) or the single quoted value, each preceded by a space.
 *
 * @param os an output stream
 * @param fval a field value.
 *
 * @return the updated output stream
 */
inline std::ostream&
operator<<(std::ostream& os, const FieldValue& fval)
{
	const auto& field{fval.field()};

	os << ' ' << quote(std::string{field.name});

	if (!field.is_range())
		return os << ' ' << quote(fval.value());

	const auto bounds{fval.range()};
	return os << ' ' << quote(bounds.first)
		  << ' ' << quote(bounds.second);
}
// A single node of the parse tree: its type plus, optionally, the
// field-value it carries.
struct Node {
	enum class Type {
		Empty, // only for empty trees
		OpAnd,
		OpOr,
		OpXor,
		OpAndNot,
		OpNot,
		Value,
		ValueAtomic,
		Range,
		Invalid
	};

	// a node that carries a field-value
	Node(Type _type, FieldValue&& fval) : type{_type}, field_val{std::move(fval)} {}
	// a node without a field-value
	Node(Type _type) : type{_type} {}
	Node(Node&& rhs) = default;

	Type               type;      // what kind of node this is
	Option<FieldValue> field_val; // the field-value, if any

	// The name of a node type as used when printing a tree; "<error>"
	// for values that are not valid enumerators.
	static constexpr std::string_view type_name(Type t) {
		switch (t) {
		case Type::Empty:       return "";
		case Type::OpAnd:       return "and";
		case Type::OpOr:        return "or";
		case Type::OpXor:       return "xor";
		case Type::OpAndNot:    return "andnot";
		case Type::OpNot:       return "not";
		case Type::Value:       return "value";
		case Type::ValueAtomic: return "value_atomic";
		case Type::Range:       return "range";
		case Type::Invalid:     return "<invalid>";
		default:                return "<error>";
		}
	}

	// Is this one of the binary operators (and, and-not, or, xor)?
	static constexpr bool is_binop(Type t) {
		if (t == Type::OpAnd || t == Type::OpAndNot)
			return true;
		return t == Type::OpOr || t == Type::OpXor;
	}
};
/**
 * Write a node to an output stream: its type name, followed by its
 * field-value if it has one.
 *
 * @param os an output stream
 * @param t a node
 *
 * @return the updated output stream
 */
inline std::ostream&
operator<<(std::ostream& os, const Node& t)
{
	os << Node::type_name(t.type);

	if (!t.field_val)
		return os; // nothing more to print

	return os << t.field_val.value();
}
struct Tree {
Tree(Node&& _node) : node(std::move(_node)) {}
Tree(Tree&& rhs) = default;
void add_child(Tree&& child) { children.emplace_back(std::move(child)); }
bool empty() const { return node.type == Node::Type::Empty; }
Node node;
std::vector<Tree> children;
};
/**
 * Write a tree to an output stream in parenthesized form: '(' followed
 * by the node, then every sub-tree (recursively), then ')'.
 *
 * @param os an output stream
 * @param tree a tree
 *
 * @return the updated output stream
 */
inline std::ostream&
operator<<(std::ostream& os, const Tree& tree)
{
	os << '(' << tree.node;

	for (auto it = tree.children.begin(); it != tree.children.end(); ++it)
		os << *it;

	return os << ')';
}
} // namespace Mu
#endif /* TREE_HH__ */

View File

@ -1,139 +0,0 @@
/*
** Copyright (C) 2017-2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#include <config.h>
#include <xapian.h>
#include "mu-xapian.hh"
#include <utils/mu-error.hh>
using namespace Mu;
/**
 * Turn an operator node and its sub-trees into a Xapian query.
 *
 * A NOT node must have exactly one child and becomes
 * "<all> AND NOT child"; the binary operators map onto the
 * corresponding Xapian operator applied to all children.
 *
 * @param tree a tree whose node is an operator
 *
 * @return the Xapian query
 *
 * Throws std::runtime_error when a NOT node does not have exactly one
 * child, and Mu::Error (Internal) for a non-operator node type.
 */
static Xapian::Query
xapian_query_op(const Mu::Tree& tree)
{
	const auto  ntype{tree.node.type};
	const auto& subtrees{tree.children};

	if (ntype == Node::Type::OpNot) { // OpNot x ::= <all> AND NOT x
		if (subtrees.size() != 1)
			throw std::runtime_error("invalid # of children");

		return Xapian::Query(Xapian::Query::OP_AND_NOT,
				     Xapian::Query::MatchAll,
				     xapian_query(subtrees.front()));
	}

	// map the node type onto the Xapian operator
	const auto op = [ntype] {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wswitch-enum"
		switch (ntype) {
		case Node::Type::OpAnd:    return Xapian::Query::OP_AND;
		case Node::Type::OpOr:     return Xapian::Query::OP_OR;
		case Node::Type::OpXor:    return Xapian::Query::OP_XOR;
		case Node::Type::OpAndNot: return Xapian::Query::OP_AND_NOT;
		case Node::Type::OpNot:
		default:
			throw Mu::Error(Error::Code::Internal, "invalid op"); // bug
		}
#pragma GCC diagnostic pop
	}();

	// convert the children, in order
	std::vector<Xapian::Query> queries;
	queries.reserve(subtrees.size());
	for (const auto& sub : subtrees)
		queries.push_back(xapian_query(sub));

	return Xapian::Query(op, queries.begin(), queries.end());
}
/**
 * Make a Xapian query for a single field-value.
 *
 * When wildcards are allowed and the value is longer than one
 * character and ends in '*', the result is a wildcard query for the
 * value without that trailing '*'; otherwise it is a plain term query.
 *
 * @param fval the field-value
 * @param maybe_wildcard whether a trailing '*' is treated as a wildcard
 *
 * @return the Xapian query
 */
static Xapian::Query
make_query(const FieldValue& fval, bool maybe_wildcard)
{
	const auto& term{fval.value()};
	const auto& field{fval.field()};

	const bool wildcard{maybe_wildcard && term.length() > 1 &&
			    term[term.length() - 1] == '*'};
	if (wildcard)
		return Xapian::Query(Xapian::Query::OP_WILDCARD,
				     field.xapian_term(term.substr(0, term.length() - 1)));

	return Xapian::Query(field.xapian_term(term));
}
/**
 * Turn a value node (Value or ValueAtomic) into a Xapian query.
 *
 * - fields that are not indexable terms: a term query (wildcards allowed)
 * - atomic values: a term query for the whole value (no wildcards)
 * - single-word values: a term query (wildcards allowed)
 * - multi-word values: a phrase query of the space-separated words
 *
 * @param tree a tree whose node carries a field-value
 *
 * @return the Xapian query
 */
static Xapian::Query
xapian_query_value(const Mu::Tree& tree)
{
	const auto& fv{tree.node.field_val.value()};

	// only indexable fields can be used for phrase searches; the
	// others need no further processing.
	if (!fv.field().is_indexable_term())
		return make_query(fv, true /*maybe-wildcard*/);

	const auto words{split(fv.value(), " ")};
	if (words.empty())
		return Xapian::Query::MatchNothing; // shouldn't happen

	if (tree.node.type == Node::Type::ValueAtomic)
		return make_query(fv, false /*maybe-wildcard*/);

	if (words.size() == 1)
		return make_query(fv, true /*maybe-wildcard*/);

	// multiple words: each word becomes a term of the phrase
	std::vector<Xapian::Query> phrase;
	for (const auto& word : words) {
		FieldValue word_fv{fv.field_id, word};
		phrase.emplace_back(make_query(word_fv, false /*no wildcards*/));
	}

	return Xapian::Query(Xapian::Query::OP_PHRASE, phrase.begin(), phrase.end());
}
/**
 * Turn a range node into a Xapian value-range query over the field's
 * value slot, using the two values of the node's field-value as bounds.
 *
 * @param tree a tree whose node carries a two-valued field-value
 *
 * @return the Xapian query
 */
static Xapian::Query
xapian_query_range(const Mu::Tree& tree)
{
	const auto& fv{tree.node.field_val.value()};
	const auto  bounds{fv.range()};

	return Xapian::Query(Xapian::Query::OP_VALUE_RANGE,
			     fv.field().value_no(),
			     bounds.first,
			     bounds.second);
}
/*
 * Transform a parse tree into a Xapian query by dispatching on the type
 * of the tree's node. An empty tree yields a default-constructed query;
 * a node type that is not handled raises Mu::Error (Internal).
 */
Xapian::Query
Mu::xapian_query(const Mu::Tree& tree)
{
	using Type = Node::Type;
	const auto ntype{tree.node.type};

	if (ntype == Type::Empty)
		return Xapian::Query();

	if (ntype == Type::OpNot || ntype == Type::OpAnd || ntype == Type::OpOr ||
	    ntype == Type::OpXor || ntype == Type::OpAndNot)
		return xapian_query_op(tree);

	if (ntype == Type::Value || ntype == Type::ValueAtomic)
		return xapian_query_value(tree);

	if (ntype == Type::Range)
		return xapian_query_range(tree);

	throw Mu::Error(Error::Code::Internal, "invalid query"); // bug
}

View File

@ -1,39 +0,0 @@
/*
** Copyright (C) 2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#ifndef MU_XAPIAN_HH__
#define MU_XAPIAN_HH__
#include <xapian.h>
#include <mu-parser.hh>
namespace Mu {
/**
* Transform a parse-tree into a Xapian query object
*
* @param tree a parse tree
*
* @return a Xapian query object
*/
Xapian::Query xapian_query(const Mu::Tree& tree);
} // namespace Mu
#endif /* MU_XAPIAN_HH__ */

View File

@ -19,42 +19,30 @@
#
test('test-maildir',
executable('test-maildir',
'test-mu-maildir.cc',
install: false,
dependencies: [glib_dep, lib_mu_dep]))
'test-mu-maildir.cc',
install: false,
dependencies: [glib_dep, lib_mu_dep]))
test('test-msg',
executable('test-msg',
'test-mu-msg.cc',
install: false,
dependencies: [glib_dep, lib_mu_dep]))
'test-mu-msg.cc',
install: false,
dependencies: [glib_dep, lib_mu_dep]))
test('test-store',
executable('test-store',
'test-mu-store.cc',
install: false,
dependencies: [glib_dep, lib_mu_dep]))
'test-mu-store.cc',
install: false,
dependencies: [glib_dep, lib_mu_dep]))
test('test-query',
executable('test-query',
'test-query.cc',
install: false,
dependencies: [glib_dep, gmime_dep, lib_mu_dep]))
test('test-tokenizer',
executable('test-tokenizer',
'test-tokenizer.cc',
install: false,
dependencies: [glib_dep, lib_mu_dep]))
test('test-parser',
executable('test-parser',
'test-parser.cc',
install: false,
dependencies: [glib_dep, gmime_dep, lib_mu_dep]))
'test-query.cc',
install: false,
dependencies: [glib_dep, gmime_dep, lib_mu_dep]))
test('test-store-query',
executable('test-store-query',
'test-mu-store-query.cc',
install: false,
dependencies: [glib_dep, gmime_dep, lib_mu_dep]))
'test-mu-store-query.cc',
install: false,
dependencies: [glib_dep, gmime_dep, lib_mu_dep]))
#
# benchmarks
#

View File

@ -1,139 +0,0 @@
/*
** Copyright (C) 2017-2020 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#include <vector>
#include <glib.h>
#include <iostream>
#include <sstream>
#include "utils/mu-test-utils.hh"
#include "mu-parser.hh"
#include "utils/mu-result.hh"
#include "utils/mu-utils.hh"
using namespace Mu;
// A single parser test-case: a query expression and the string form of
// the parse tree it is expected to produce.
struct Case {
const std::string expr; // the query expression handed to the parser
const std::string expected; // the expected printed form of the parse tree
WarningVec warnings{}; // not consulted by test_cases() below
};
using CaseVec = std::vector<Case>;
/*
 * Parse each case's expression with a parser backed by a freshly made
 * store in a random temp dir, and assert that the printed parse tree
 * equals the case's expected string. In verbose mode, the expression,
 * the expectation and the actual result are printed as well.
 */
static void
test_cases(const CaseVec& cases)
{
	char* tmpdir = test_mu_common_get_random_tmpdir();
	g_assert(tmpdir);
	auto dummy_store{Store::make_new(tmpdir, "/tmp")};
	assert_valid_result(dummy_store);
	g_free(tmpdir); // the path is no longer needed once the store exists

	Parser parser{*dummy_store, Parser::Flags::UnitTest};

	for (const auto& tcase : cases) {
		WarningVec warnings;
		std::stringstream out;
		out << parser.parse(tcase.expr, warnings);
		const auto got{out.str()};

		if (g_test_verbose())
			std::cout << "\n"
				  << tcase.expr << std::endl
				  << "exp:" << tcase.expected << std::endl
				  << "got:" << got << std::endl;

		assert_equal(tcase.expected, got);
	}
}
/* Single terms and the explicit binary operators 'or' and 'and'. */
static void
test_basic()
{
	CaseVec cases;

	// disabled: { "", R"#((atom :value ""))#"}
	cases.push_back({"foo", R"#((value "message-id" "foo"))#"});
	cases.push_back({"foo or bar",
			 R"#((or(value "message-id" "foo")(value "message-id" "bar")))#"});
	cases.push_back({"foo and bar",
			 R"#((and(value "message-id" "foo")(value "message-id" "bar")))#"});

	test_cases(cases);
}
/* Operator combinations: precedence, grouping, implicit 'and', and 'not'. */
static void
test_complex()
{
	CaseVec cases;

	// 'and' binds tighter than 'or'
	cases.push_back({"foo and bar or cuux",
			 R"#((or(and(value "message-id" "foo")(value "message-id" "bar")))#" +
			 std::string(R"#((value "message-id" "cuux")))#")});
	cases.push_back({"a and not b",
			 R"#((and(value "message-id" "a")(not(value "message-id" "b"))))#"});
	// chained 'and' nests to the right
	cases.push_back({"a and b and c",
			 R"#((and(value "message-id" "a")(and(value "message-id" "b")(value "message-id" "c"))))#"});
	// explicit grouping
	cases.push_back({"(a or b) and c",
			 R"#((and(or(value "message-id" "a")(value "message-id" "b"))(value "message-id" "c")))#"});
	// implicit and
	cases.push_back({"a b",
			 R"#((and(value "message-id" "a")(value "message-id" "b")))#"});
	// implicit and, followed by not
	cases.push_back({"a not b",
			 R"#((and(value "message-id" "a")(not(value "message-id" "b"))))#"});
	// a lone not
	cases.push_back({"not b",
			 R"#((not(value "message-id" "b")))#"});

	test_cases(cases);
}
/* A range expression; this test is not registered in main(), hence the
 * G_GNUC_UNUSED. */
G_GNUC_UNUSED static void
test_range()
{
	CaseVec cases;

	cases.push_back({"range:a..b", R"#((range "range" "a" "b"))#"});

	test_cases(cases);
}
/* A term with leading space, upper-case and accented characters is
 * expected to come out as plain lower-case ASCII. */
static void
test_flatten()
{
	CaseVec cases;

	cases.push_back({" Mötørhęåđ", R"#((value "message-id" "motorhead"))#"});

	test_cases(cases);
}
/* Register the parser tests with the GLib test framework and run them. */
int
main(int argc, char* argv[])
{
	g_test_init(&argc, &argv, NULL);

	const struct {
		const char* path;
		void (*func)(void);
	} tests[] = {
		{"/parser/basic", test_basic},
		{"/parser/complex", test_complex},
		// disabled: {"/parser/range", test_range},
		{"/parser/flatten", test_flatten},
	};

	for (const auto& test : tests)
		g_test_add_func(test.path, test.func);

	return g_test_run();
}

View File

@ -1,147 +0,0 @@
/*
** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#include <vector>
#include <glib.h>
#include <iostream>
#include <sstream>
#include "mu-tokenizer.hh"
// A single tokenizer test-case: an input string and the tokens it is
// expected to be split into.
struct Case {
const char* str; // the input handed to tokenize()
const Mu::Tokens tokens; // the expected tokens, in order
};
using CaseVec = std::vector<Case>;
using namespace Mu;
using TT = Token::Type;
/*
 * Tokenize each case's input and assert that the result has the
 * expected number of tokens and that each token equals the expected
 * one. In verbose mode, each comparison is printed to stderr.
 */
static void
test_cases(const CaseVec& cases)
{
	for (const auto& casus : cases) {
		const auto tokens = tokenize(casus.str);

		g_assert_cmpuint((guint)tokens.size(), ==, (guint)casus.tokens.size());

		size_t u = 0;
		while (u != tokens.size()) {
			if (g_test_verbose())
				std::cerr << "case " << u << " " << casus.str << std::endl
					  << "exp: '" << casus.tokens[u] << "'" << std::endl
					  << "got: '" << tokens[u] << "'" << std::endl;

			g_assert_true(tokens[u] == casus.tokens[u]);
			++u;
		}
	}
}
static void
test_basic()
{
CaseVec cases = {
{"", {}},
{"foo", Tokens{Token{3, TT::Data, "foo"}}},
{"foo bar cuux",
Tokens{Token{3, TT::Data, "foo"},
Token{7, TT::Data, "bar"},
Token{12, TT::Data, "cuux"}}},
{"\"foo bar\"", Tokens{Token{9, TT::Data, "foo bar"}}},
// ie. ignore missing closing '"'
{"\"foo bar", Tokens{Token{8, TT::Data, "foo bar"}}},
};
test_cases(cases);
}
static void
test_specials()
{
CaseVec cases = {
{")*(",
Tokens{Token{1, TT::Close, ")"}, Token{2, TT::Data, "*"}, Token{3, TT::Open, "("}}},
{"\")*(\"", Tokens{Token{5, TT::Data, ")*("}}},
};
test_cases(cases);
}
static void
test_ops()
{
CaseVec cases = {{"foo and bar oR cuux XoR fnorb",
Tokens{Token{3, TT::Data, "foo"},
Token{7, TT::And, "and"},
Token{11, TT::Data, "bar"},
Token{14, TT::Or, "oR"},
Token{19, TT::Data, "cuux"},
Token{23, TT::Xor, "XoR"},
Token{29, TT::Data, "fnorb"}}},
{"NOT (aap or mies)",
Tokens{Token{3, TT::Not, "NOT"},
Token{5, TT::Open, "("},
Token{8, TT::Data, "aap"},
Token{11, TT::Or, "or"},
Token{16, TT::Data, "mies"},
Token{17, TT::Close, ")"}}}};
test_cases(cases);
}
static void
test_escape()
{
CaseVec cases = {{"foo\"bar\"", Tokens{Token{8, TT::Data, "foobar"}}},
{"\"fnorb\"", Tokens{Token{7, TT::Data, "fnorb"}}},
{"\\\"fnorb\\\"", Tokens{Token{9, TT::Data, "fnorb"}}},
{"foo\\\"bar\\\"", Tokens{Token{10, TT::Data, "foobar"}}}};
test_cases(cases);
}
static void
test_to_string()
{
std::stringstream ss;
for (auto&& t : tokenize("foo and bar xor not cuux or fnorb"))
ss << t << ' ';
g_assert_true(ss.str() == "3: <data> [foo] 7: <and> [and] 11: <data> [bar] "
"15: <xor> [xor] 19: <not> [not] 24: <data> [cuux] "
"27: <or> [or] 33: <data> [fnorb] ");
}
int
main(int argc, char* argv[])
{
g_test_init(&argc, &argv, NULL);
g_test_add_func("/tokens/basic", test_basic);
g_test_add_func("/tokens/specials", test_specials);
g_test_add_func("/tokens/ops", test_ops);
g_test_add_func("/tokens/escape", test_escape);
g_test_add_func("/tokens/to-string", test_to_string);
return g_test_run();
}

View File

@ -1,38 +0,0 @@
/*
** Copyright (C) 2017-2020 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#include <string>
#include <iostream>
#include "mu-tokenizer.hh"
/* Join the command-line arguments (each preceded by a space) into one
 * string, tokenize it and print each token on a line of its own. */
int
main(int argc, char* argv[])
{
	std::string query;
	for (int n = 1; n < argc; ++n) {
		query += " ";
		query += argv[n];
	}

	for (const auto& token : Mu::tokenize(query))
		std::cout << token << std::endl;

	return 0;
}

View File

@ -25,8 +25,8 @@ quote any characters that would otherwise be interpreted by the shell, such as
* TERMS
The basic building blocks of a query are *terms*; these are just normal words like
'banana' or 'hello', or words prefixed with a field-name which make them apply
to just that field. See *mu find* for all the available fields.
'banana' or 'hello', or words prefixed with a field-name which makes them apply
to just that field. See *mu info fields* for all the available fields.
Some example queries:
#+begin_example
@ -60,9 +60,8 @@ mu find subject:\\"hi there\\"
* LOGICAL OPERATORS
We can combine terms with logical operators -- binary ones: *and*, *or*, *xor* and the
unary *not*, with the conventional rules for precedence and association, and are
case-insensitive.
unary *not*, with the conventional rules for precedence and association. The
operators are case-insensitive.
You can also group things with *(* and *)*, so you can do things like:
#+begin_example
@ -86,6 +85,7 @@ Note that a =pure not= - e.g. searching for *not apples* is quite a 'heavy' quer
The language supports matching basic PCRE regular expressions, see *pcre(3)*.
Regular expressions are enclosed in *//*. Some examples:
#+begin_example
subject:/h.llo/ # match hallo, hello, ...
subject:/
@ -96,10 +96,10 @@ matches messages in the '/foo' maildir, while the latter matches all messages in
all maildirs that match 'foo', such as '/foo', '/bar/cuux/foo', '/fooishbar'
etc.
Wildcards are an older mechanism for matching where a term with a rightmost ***
Wildcards are another mechanism for matching where a term with a rightmost ***
(and =only= in that position) matches any term that starts with the part before
the ***; they are supported for backward compatibility and *mu* translates them to
regular expressions internally:
the ***; they are therefore less powerful than regular expressions, but also much
faster:
#+begin_example
foo*
#+end_example
@ -108,8 +108,7 @@ is equivalent to
/foo.*/
#+end_example
As a note of caution, certain wild-cards and regular expression can take quite a
bit longer than 'normal' queries.
Regular expressions can be useful, but are relatively slow.
* FIELDS
@ -143,8 +142,8 @@ full table with all details, including single-char shortcuts, try the command:
| to | | Message recipient |
|------------+-----------+--------------------------------|
(*) The language code for the text-body if found. This works only
if ~mu~ was built with CLD2 support.
(*) The language code for the text-body if found. This works only if ~mu~ was
built with CLD2 support.
There are also the special fields *contact:*, which matches all contact-fields
(=from=, =to=, =cc= and =bcc=), and *recip*, which matches all recipient-fields (=to=, =cc=
@ -167,12 +166,12 @@ separated by *..*. Either lower or upper (but not both) can be omitted to create
an open range.
Dates are expressed in local time and using ISO-8601 format (YYYY-MM-DD
HH:MM:SS); you can leave out the right part, and *mu* adds the rest, depending on
HH:MM:SS); you can leave out the right part and *mu* adds the rest, depending on
whether this is the beginning or end of the range (e.g., as a lower bound,
'2015' would be interpreted as the start of that year; as an upper bound as the
end of the year).
You can use '/' , '.', '-' and 'T' to make dates more human readable.
You can use '/' , '.', '-', ':' and 'T' to make dates more human-readable.
Some examples:
#+begin_example
@ -274,6 +273,9 @@ Note that from the command-line, such queries must be quoted:
mu find 'maildir:"/Sent Items"'
#+end_example
Also note that you should *not* end the maildir with a ~/~, or it can be
misinterpreted as a regular expression term; see the discussion of regular
expressions above.
* MORE EXAMPLES
Here are some simple examples of *mu* queries; you can make many more complicated
@ -321,16 +323,25 @@ Find all messages written in Dutch or German with the word 'hallo':
hallo and (lang:nl or lang:de)
#+end_example
* ANALYZING QUERIES
* CAVEATS
Despite all the documentation, in some cases it can be non-obvious how ~mu~
interprets a certain query. For that, you can ask ~mu~ to analyze the query --
that is, show how ~mu~ interprets the query.
With current Xapian versions, the apostrophe character is considered part of a
word. Thus, you cannot find =D'Artagnan= by searching for =Artagnan=. So, include
the apostrophe in the search or use a regexp search.
This uses the ~--analyze~ option to *mu find*.
#+begin_example
$ mu find subject:wombat AND date:3m.. size:..2000 --analyze
* query:
subject:wombat AND date:3m.. size:..2000
* parsed query:
(and (subject "wombat") (date (range "2023-05-30T06:10:09Z" "")) (size (range "" "2000")))
* Xapian query:
Query((Swombat AND VALUE_GE 4 n64759341 AND VALUE_LE 17 i7d0))
#+end_example
Matching on spaces has changed compared to the old query-parser; this applies
e.g. to Maildirs that have spaces in their name, such as =Sent Items=. See *MAILDIR*
above.
The ~parsed query~ is usually the most interesting one to understand what's
happening.
#+include: "prefooter.inc" :minlevel 1