diff --git a/lib/meson.build b/lib/meson.build index 5ce9009e..226b6dc1 100644 --- a/lib/meson.build +++ b/lib/meson.build @@ -1,4 +1,4 @@ -## Copyright (C) 2021-2022 Dirk-Jan C. Binnema +## Copyright (C) 2021-2023 Dirk-Jan C. Binnema ## ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by @@ -26,16 +26,17 @@ lib_mu=static_library( 'mu-config.cc', 'mu-contacts-cache.cc', 'mu-maildir.cc', - 'mu-parser.cc', 'mu-query-match-deciders.cc', 'mu-query-threads.cc', 'mu-query.cc', 'mu-script.cc', 'mu-server.cc', 'mu-store.cc', - 'mu-tokenizer.cc', - 'mu-xapian.cc', - 'mu-xapian-db.cc' + 'mu-xapian-db.cc', + # query-parser + 'mu-query-processor.cc', + 'mu-query-parser.cc', + 'mu-query-xapianizer.cc' ], dependencies: [ glib_dep, @@ -46,8 +47,7 @@ lib_mu=static_library( config_h_dep, lib_mu_utils_dep, lib_mu_message_dep, - lib_mu_index_dep - ], + lib_mu_index_dep], install: false) @@ -57,14 +57,32 @@ lib_mu_dep = declare_dependency( include_directories: include_directories(['.', '..'])) -# dev helpers -tokenize = executable( - 'tokenize', - [ 'mu-tokenizer.cc', 'tokenize.cc' ], - dependencies: [ lib_mu_utils_dep, glib_dep ], - install: false) +# +# query parser dev helpers +# +process_query = executable('process-query', [ 'mu-query-processor.cc'], + install: false, + cpp_args: ['-DBUILD_PROCESS_QUERY'], + dependencies: [glib_dep, lib_mu_dep]) -# actual tests +parse_query = executable( 'parse-query', [ 'mu-query-parser.cc' ], + install: false, + cpp_args: ['-DBUILD_PARSE_QUERY'], + dependencies: [glib_dep, lib_mu_dep]) + +parse_query_expand = executable( 'parse-query-expand', [ 'mu-query-parser.cc' ], + install: false, + cpp_args: ['-DBUILD_PARSE_QUERY_EXPAND'], + dependencies: [glib_dep, lib_mu_dep]) + +xapian_query = executable('xapianize-query', [ 'mu-query-xapianizer.cc' ], + install: false, + cpp_args: ['-DBUILD_XAPIANIZE_QUERY'], + dependencies: [glib_dep, lib_mu_dep]) + +# +# unit 
tests +# test('test-threads', executable('test-threads', @@ -86,4 +104,25 @@ test('test-config', cpp_args: ['-DBUILD_TESTS'], dependencies: [glib_dep, lib_mu_dep])) +test('test-query-processor', + executable('test-query-processor', + 'mu-query-processor.cc', + install: false, + cpp_args: ['-DBUILD_TESTS'], + dependencies: [lib_mu_dep])) + +test('test-query-parser', + executable('test-query-parser', + 'mu-query-parser.cc', + install: false, + cpp_args: ['-DBUILD_TESTS'], + dependencies: [lib_mu_dep])) + +test('test-query-xapianizer', + executable('test-query-xapianizer', + 'mu-query-xapianizer.cc', + install: false, + cpp_args: ['-DBUILD_TESTS'], + dependencies: [lib_mu_dep])) + subdir('tests') diff --git a/lib/mu-parser.cc b/lib/mu-parser.cc deleted file mode 100644 index bf541ea4..00000000 --- a/lib/mu-parser.cc +++ /dev/null @@ -1,508 +0,0 @@ -/* -** Copyright (C) 2020 Dirk-Jan C. Binnema -** -** This library is free software; you can redistribute it and/or -** modify it under the terms of the GNU Lesser General Public License -** as published by the Free Software Foundation; either version 2.1 -** of the License, or (at your option) any later version. -** -** This library is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -** Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public -** License along with this library; if not, write to the Free -** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA -** 02110-1301, USA. 
-*/ -#include "mu-parser.hh" - -#include -#include - -#include "mu-tokenizer.hh" -#include "utils/mu-utils.hh" -#include "utils/mu-error.hh" -#include "utils/mu-regex.hh" -#include "message/mu-message.hh" - -using namespace Mu; - -// 3 precedence levels: units (NOT,()) > factors (OR) > terms (AND) - -// query -> | ε -// -> | ε -// -> OR|XOR | ε -// -> | ε -// -> [AND]|AND NOT | ε -// -> [NOT] | ( ) | -// -> | | -// -> [field:]value -// -> [field:][lower]..[upper] -// -> [field:]/regex/ - -#define BUG(...) \ - Mu::Error(Error::Code::Internal, "BUG @ line {}", __LINE__); - -/** - * Get the "shortcut"/internal fields for the the given fieldstr or empty if there is none - * - * @param fieldstr a fieldstr, e.g "subject" or "s" for the subject field - * - * @return a vector with "exploded" values, with a code and a fullname. E.g. "s" might map - * to [<"S","subject">], while "recip" could map to [<"to", "T">, <"cc", "C">, <"bcc", "B">] - */ -struct FieldInfo { - const std::string field; - const std::string prefix; - bool supports_phrase; - Field::Id id; -}; -using FieldInfoVec = std::vector; -struct Parser::Private { - Private(const Store& store, Parser::Flags flags) : store_{store}, flags_{flags} {} - - std::vector process_regex(const std::string& field, - const Regex& rx) const; - - Mu::Tree term_1(Mu::Tokens& tokens, WarningVec& warnings) const; - Mu::Tree term_2(Mu::Tokens& tokens, Node::Type& op, WarningVec& warnings) const; - Mu::Tree factor_1(Mu::Tokens& tokens, WarningVec& warnings) const; - Mu::Tree factor_2(Mu::Tokens& tokens, Node::Type& op, WarningVec& warnings) const; - Mu::Tree unit(Mu::Tokens& tokens, WarningVec& warnings) const; - Mu::Tree data(Mu::Tokens& tokens, WarningVec& warnings) const; - Mu::Tree range(const FieldInfoVec& fields, - const std::string& lower, - const std::string& upper, - size_t pos, - WarningVec& warnings) const; - Mu::Tree regex(const FieldInfoVec& fields, - const std::string& v, - size_t pos, - WarningVec& warnings) const; - 
Mu::Tree value(const FieldInfoVec& fields, - const std::string& v, - size_t pos, - WarningVec& warnings) const; - - private: - const Store& store_; - const Parser::Flags flags_; -}; - -static std::string -process_value(const std::string& field, const std::string& value) -{ - const auto id_opt{field_from_name(field)}; - if (id_opt) { -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wswitch-enum" - switch (id_opt->id) { - case Field::Id::Priority: { - if (!value.empty()) - return std::string(1, value[0]); - } break; - case Field::Id::Flags: - if (const auto info{flag_info(value)}; info) - return std::string(1, info->shortcut_lower()); - break; - default: - break; - } -#pragma GCC diagnostic pop - } - - return value; // XXX prio/flags, etc. alias -} - -static void -add_field(std::vector& fields, Field::Id field_id) -{ - const auto field{field_from_id(field_id)}; - if (!field.shortcut) - return; // can't be searched - - fields.emplace_back(FieldInfo{std::string{field.name}, field.xapian_term(), - field.is_indexable_term(), field_id}); -} - -static std::vector -process_field(const std::string& field_str, Parser::Flags flags) -{ - std::vector fields; - if (any_of(flags & Parser::Flags::UnitTest)) { - add_field(fields, Field::Id::MessageId); - return fields; - } - - if (field_str == "contact" || field_str == "recip") { // multi fields - add_field(fields, Field::Id::To); - add_field(fields, Field::Id::Cc); - add_field(fields, Field::Id::Bcc); - if (field_str == "contact") - add_field(fields, Field::Id::From); - } else if (field_str.empty()) { - add_field(fields, Field::Id::To); - add_field(fields, Field::Id::Cc); - add_field(fields, Field::Id::Bcc); - add_field(fields, Field::Id::From); - add_field(fields, Field::Id::Subject); - add_field(fields, Field::Id::BodyText); - } else if (const auto field_opt{field_from_name(field_str)}; field_opt) - add_field(fields, field_opt->id); - - return fields; -} - -static bool -is_range_field(const std::string& field_str) 
-{ - if (const auto field_opt{field_from_name(field_str)}; !field_opt) - return false; - else - return field_opt->is_range(); -} - -struct MyRange { - std::string lower; - std::string upper; -}; - -static MyRange -process_range(const std::string& field_str, - const std::string& lower, const std::string& upper) -{ - const auto field_opt{field_from_name(field_str)}; - if (!field_opt) - return {lower, upper}; - - std::string l2 = lower; - std::string u2 = upper; - constexpr auto upper_limit = std::numeric_limits::max(); - - if (field_opt->id == Field::Id::Date || field_opt->id == Field::Id::Changed) { - l2 = to_lexnum(parse_date_time(lower, true).value_or(0)); - u2 = to_lexnum(parse_date_time(upper, false).value_or(upper_limit)); - } else if (field_opt->id == Field::Id::Size) { - l2 = to_lexnum(parse_size(lower, true).value_or(0)); - u2 = to_lexnum(parse_size(upper, false).value_or(upper_limit)); - } - - return {l2, u2}; -} - -std::vector -Parser::Private::process_regex(const std::string& field_str, - const Regex& rx) const -{ - const auto field_opt{field_from_name(field_str)}; - if (!field_opt) - return {}; - - const auto prefix{field_opt->xapian_term()}; - std::vector terms; - store_.for_each_term(field_opt->id, [&](auto&& str) { - auto val{str.c_str() + 1}; // strip off the Xapian prefix. 
- if (rx.matches(val)) - terms.emplace_back(std::move(val)); - return true; - }); - - return terms; -} - -static Token -look_ahead(const Mu::Tokens& tokens) -{ - return tokens.front(); -} - -static Mu::Tree -empty() -{ - return {{Node::Type::Empty}}; -} - -Mu::Tree -Parser::Private::value(const FieldInfoVec& fields, - const std::string& v, - size_t pos, - WarningVec& warnings) const -{ - auto val = utf8_flatten(v); - - if (fields.empty()) - throw BUG("expected one or more fields"); - - if (fields.size() == 1) { - const auto item = fields.front(); - return Tree({Node::Type::Value, - FieldValue{item.id, process_value(item.field, val)}}); - } - - // a 'multi-field' such as "recip:" - Tree tree(Node{Node::Type::OpOr}); - for (const auto& item : fields) - tree.add_child(Tree({Node::Type::Value, - FieldValue{item.id, - process_value(item.field, val)}})); - return tree; -} - -Mu::Tree -Parser::Private::regex(const FieldInfoVec& fields, - const std::string& v, - size_t pos, - WarningVec& warnings) const -{ - if (v.length() < 2) - throw BUG("expected regexp, got '%s'", v.c_str()); - - const auto rxstr = utf8_flatten(v.substr(1, v.length() - 2)); - - try { - Tree tree(Node{Node::Type::OpOr}); - const auto rx = Regex::make(rxstr, G_REGEX_OPTIMIZE); - if (!rx) - throw rx.error(); - for (const auto& field : fields) { - const auto terms = process_regex(field.field, *rx); - for (const auto& term : terms) { - tree.add_child(Tree({Node::Type::ValueAtomic, - FieldValue{field.id, term}})); - } - } - - if (tree.children.empty()) - return empty(); - else - return tree; - - } catch (...) 
{ - // fallback - warnings.push_back({pos, "invalid regexp"}); - return value(fields, v, pos, warnings); - } -} - -Mu::Tree -Parser::Private::range(const FieldInfoVec& fields, - const std::string& lower, - const std::string& upper, - size_t pos, - WarningVec& warnings) const -{ - if (fields.empty()) - throw BUG("expected field"); - - const auto& field = fields.front(); - if (!is_range_field(field.field)) - return value(fields, lower + ".." + upper, pos, warnings); - - auto prange = process_range(field.field, lower, upper); - if (prange.lower > prange.upper) - prange = process_range(field.field, upper, lower); - - return Tree({Node::Type::Range, - FieldValue{field.id, prange.lower, prange.upper}}); -} - -Mu::Tree -Parser::Private::data(Mu::Tokens& tokens, WarningVec& warnings) const -{ - const auto token = look_ahead(tokens); - if (token.type != Token::Type::Data) - warnings.push_back({token.pos, "expected: value"}); - - tokens.pop_front(); - - std::string field, val; - const auto col = token.str.find(":"); - if (col != 0 && col != std::string::npos && col != token.str.length() - 1) { - field = token.str.substr(0, col); - val = token.str.substr(col + 1); - } else - val = token.str; - - auto fields = process_field(field, flags_); - if (fields.empty()) { // not valid field... - warnings.push_back({token.pos, mu_format("invalid field '{}'", field)}); - fields = process_field("", flags_); - // fallback, treat the whole of foo:bar as a value - return value(fields, field + ":" + val, token.pos, warnings); - } - - // does it look like a regexp? - if (val.length() >= 2) - if (val[0] == '/' && val[val.length() - 1] == '/') - return regex(fields, val, token.pos, warnings); - - // does it look like a range? 
- const auto dotdot = val.find(".."); - if (dotdot != std::string::npos) - return range(fields, - val.substr(0, dotdot), - val.substr(dotdot + 2), - token.pos, - warnings); - else if (is_range_field(fields.front().field)) { - // range field without a range - treat as field:val..val - return range(fields, val, val, token.pos, warnings); - } - - // if nothing else, it's a value. - return value(fields, val, token.pos, warnings); -} - -Mu::Tree -Parser::Private::unit(Mu::Tokens& tokens, WarningVec& warnings) const -{ - if (tokens.empty()) { - warnings.push_back({0, "expected: unit"}); - return empty(); - } - - const auto token = look_ahead(tokens); - - if (token.type == Token::Type::Not) { - tokens.pop_front(); - Tree tree{{Node::Type::OpNot}}; - tree.add_child(unit(tokens, warnings)); - return tree; - } - - if (token.type == Token::Type::Open) { - tokens.pop_front(); - auto tree = term_1(tokens, warnings); - if (tokens.empty()) - warnings.push_back({token.pos, "expected: ')'"}); - else { - const auto token2 = look_ahead(tokens); - if (token2.type == Token::Type::Close) - tokens.pop_front(); - else { - warnings.push_back( - {token2.pos, - std::string("expected: ')' but got ") + token2.str}); - } - } - return tree; - } - - return data(tokens, warnings); -} - -Mu::Tree -Parser::Private::factor_2(Mu::Tokens& tokens, Node::Type& op, WarningVec& warnings) const -{ - if (tokens.empty()) - return empty(); - - const auto token = look_ahead(tokens); - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wswitch-enum" - switch (token.type) { - case Token::Type::And: { - tokens.pop_front(); - op = Node::Type::OpAnd; - } break; - - case Token::Type::Open: - case Token::Type::Data: - case Token::Type::Not: - op = Node::Type::OpAnd; // implicit AND - break; - - default: - return empty(); - } -#pragma GCC diagnostic pop - - return factor_1(tokens, warnings); -} - -Mu::Tree -Parser::Private::factor_1(Mu::Tokens& tokens, WarningVec& warnings) const -{ - Node::Type 
op{Node::Type::Invalid}; - - auto t = unit(tokens, warnings); - auto a2 = factor_2(tokens, op, warnings); - - if (a2.empty()) - return t; - - Tree tree{{op}}; - tree.add_child(std::move(t)); - tree.add_child(std::move(a2)); - - return tree; -} - -Mu::Tree -Parser::Private::term_2(Mu::Tokens& tokens, Node::Type& op, WarningVec& warnings) const -{ - if (tokens.empty()) - return empty(); - - const auto token = look_ahead(tokens); - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wswitch-enum" - switch (token.type) { - case Token::Type::Or: op = Node::Type::OpOr; break; - case Token::Type::Xor: op = Node::Type::OpXor; break; - default: - if (token.type != Token::Type::Close) - warnings.push_back({token.pos, "expected OR|XOR"}); - return empty(); - } -#pragma GCC diagnostic pop - - tokens.pop_front(); - - return term_1(tokens, warnings); -} - -Mu::Tree -Parser::Private::term_1(Mu::Tokens& tokens, WarningVec& warnings) const -{ - Node::Type op{Node::Type::Invalid}; - - auto t = factor_1(tokens, warnings); - auto o2 = term_2(tokens, op, warnings); - - if (o2.empty()) - return t; - else { - Tree tree{{op}}; - tree.add_child(std::move(t)); - tree.add_child(std::move(o2)); - return tree; - } -} - -Mu::Parser::Parser(const Store& store, Parser::Flags flags) : - priv_{std::make_unique(store, flags)} -{ -} - -Mu::Parser::~Parser() = default; - -Mu::Tree -Mu::Parser::parse(const std::string& expr, WarningVec& warnings) const -{ - try { - auto tokens = tokenize(expr); - if (tokens.empty()) - return empty(); - else - return priv_->term_1(tokens, warnings); - - } catch (const std::runtime_error& ex) { - std::cerr << ex.what() << std::endl; - return empty(); - } -} diff --git a/lib/mu-parser.hh b/lib/mu-parser.hh deleted file mode 100644 index 65adc645..00000000 --- a/lib/mu-parser.hh +++ /dev/null @@ -1,106 +0,0 @@ -/* -** Copyright (C) 2017 Dirk-Jan C. 
Binnema -** -** This library is free software; you can redistribute it and/or -** modify it under the terms of the GNU Lesser General Public License -** as published by the Free Software Foundation; either version 2.1 -** of the License, or (at your option) any later version. -** -** This library is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -** Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public -** License along with this library; if not, write to the Free -** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA -** 02110-1301, USA. -*/ - -#ifndef __PARSER_HH__ -#define __PARSER_HH__ - -#include "utils/mu-utils.hh" -#include -#include -#include - -#include -#include - -// A simple recursive-descent parser for queries. Follows the Xapian syntax, -// but better handles non-alphanum; also implements regexp - -namespace Mu { - -/** - * A parser warning - * - */ -struct Warning { - size_t pos{}; /**< pos in string */ - const std::string msg; /**< warning message */ - - /** - * operator== - * - * @param rhs right-hand side - * - * @return true if rhs is equal to this; false otherwise - */ - bool operator==(const Warning& rhs) const { return pos == rhs.pos && msg == rhs.msg; } -}; -using WarningVec = std::vector; - -/** - * operator<< - * - * @param os an output stream - * @param w a warning - * - * @return the updated output stream - */ -inline std::ostream& -operator<<(std::ostream& os, const Warning& w) -{ - os << w.pos << ":" << w.msg; - return os; -} - -class Parser { - public: - enum struct Flags { None = 0, UnitTest = 1 << 0 }; - - /** - * Construct a query parser object - * - * @param store a store object ptr, or none - */ - Parser(const Store& store, Flags = Flags::None); - /** - * DTOR - * - */ - ~Parser(); - - /** - * Parse a query string - 
* - * @param query a query string - * @param warnings vec to receive warnings - * - * @return a parse-tree - */ - - Tree parse(const std::string& query, WarningVec& warnings) const; - - private: - struct Private; - std::unique_ptr priv_; -}; - -MU_ENABLE_BITOPS(Parser::Flags); - -} // namespace Mu - -#endif /* __PARSER_HH__ */ diff --git a/lib/mu-query-parser.cc b/lib/mu-query-parser.cc new file mode 100644 index 00000000..77333d2e --- /dev/null +++ b/lib/mu-query-parser.cc @@ -0,0 +1,428 @@ +/* +** Copyright (C) 2023 Dirk-Jan C. Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +** +*/ + +#include "mu-query-parser.hh" + +#include +#include +#include +#include + +#include "utils/mu-utils.hh" +#include "utils/mu-sexp.hh" +#include "utils/mu-option.hh" +#include +#include "utils/mu-utils-file.hh" + +using namespace Mu; + +// Sexp extensions... 
+static Sexp& +prepend(Sexp& s, Sexp&& e) +{ + s.list().insert(s.list().begin(), std::move(e)); + return s; +} + +static Option +second(Sexp& s) +{ + if (s.listp() && !s.empty() && s.cbegin() + 1 != s.cend()) + return *(s.begin()+1); + else + return Nothing; +} + + +static bool +looks_like_matcher(const Sexp& sexp) +{ + // all the "terminal values" (from the Mu parser's pov) + const std::array value_syms = { + placeholder_sym, phrase_sym, regex_sym, range_sym, wildcard_sym + }; + + if (!sexp.listp() || sexp.empty() || !sexp.front().symbolp()) + return false; + + const auto symbol{sexp.front().symbol()}; + if (seq_some(value_syms, [&](auto &&sym) { return symbol == sym; })) + return true; + else if (!!field_from_name(symbol.name) || field_is_combi(symbol.name)) + return true; + else + return false; +} + +struct ParseContext { + bool expand; + std::vector warnings; +}; + + +/* + * Grammar + * + * query -> factor { (<or> | <xor>) factor } + * factor -> unit { [<and>] unit } + * unit -> matcher | <not> unit | <(> query <)> + * matcher + */ + +static Sexp query(Sexp& tokens, ParseContext& ctx); + +static Sexp +matcher(Sexp& tokens, ParseContext& ctx) +{ + if (tokens.empty()) + return {}; + + auto val{*tokens.head()}; + tokens.pop_front(); + + /* special case: if we find some non-matcher type here, we need to + * second-guess the tokenizer */ + if (!looks_like_matcher(val)) + val = Sexp{placeholder_sym, val.symbol().name}; + + if (ctx.expand) { /* should we expand meta-fields? */ + const auto symbol{val.front().symbol()}; + const auto fields = fields_from_name(symbol == placeholder_sym ? 
"" : symbol.name); + if (!fields.empty()) { + Sexp vals{}; + vals.add(or_sym); + for (auto&& field: fields) + vals.add(Sexp{Sexp::Symbol{field.name}, Sexp{*second(val)}}); + val = std::move(vals); + } + } + + return val; +} + +static Sexp +unit(Sexp& tokens, ParseContext& ctx) +{ + if (tokens.head_symbolp(not_sym)) { /* NOT; binds to the next unit only: 'not a and b' => (and (not a) b) */ + tokens.pop_front(); + Sexp sub{unit(tokens, ctx)}; + + /* special case: interpret "not" as a matcher instead; */ + if (sub.empty()) + return Sexp{placeholder_sym, not_sym.name}; + + /* we try to optimize: double negations are removed */ + if (sub.head_symbolp(not_sym)) + return *second(sub); + else + return Sexp(not_sym, std::move(sub)); + + } else if (tokens.head_symbolp(open_sym)) { /* ( sub) */ + tokens.pop_front(); + Sexp sub{query(tokens, ctx)}; + if (tokens.head_symbolp(close_sym)) + tokens.pop_front(); + else { + //g_warning("expected <)>"); + } + return sub; + } + + /* matcher */ + return matcher(tokens, ctx); +} + + +static Sexp +factor(Sexp& tokens, ParseContext& ctx) +{ + Sexp un = unit(tokens, ctx); + + /* query 'a b' is to be interpreted as 'a AND b'; + * + * we need an implicit AND if the head symbol is a matcher (value), + * a NOT ('a not b') or the start of a sub-expression */ + auto implicit_and = [&]() { + if (tokens.head_symbolp(open_sym) || tokens.head_symbolp(not_sym)) + return true; + else if (auto&& head{tokens.head()}; head) + return looks_like_matcher(*head); + else + return false; + }; + + Sexp uns; + while (true) { + + if (tokens.head_symbolp(and_sym)) + tokens.pop_front(); + else if (!implicit_and()) + break; + + if (auto&& un2 = unit(tokens, ctx); !un2.empty()) + uns.add(std::move(un2)); + else + break; + } + + if (!uns.empty()) { + un = Sexp{and_sym, std::move(un)}; + un.add_list(std::move(uns)); + } + + return un; +} + +static Sexp +query(Sexp& tokens, ParseContext& ctx) +{ + /* note: we flatten (or (or ( or ...)) etc. 
here; + * for optimization (since Xapian likes flat trees) */ + + Sexp fact = factor(tokens, ctx); + Sexp or_factors, xor_factors; + while (true) { + auto factors = std::invoke([&]()->Option { + + if (tokens.head_symbolp(or_sym)) + return or_factors; + else if (tokens.head_symbolp(xor_sym)) + return xor_factors; + else + return Nothing; + }); + + if (!factors) + break; + + tokens.pop_front(); + factors->add(factor(tokens, ctx)); + } + + // a bit clumsy... NOTE(review): when OR and XOR are mixed, the (xor ...) group added below holds only the factors that followed an 'xor' token, not its left-hand operand; confirm that is intended + + if (!or_factors.empty() && xor_factors.empty()) { + fact = Sexp{or_sym, std::move(fact)}; + fact.add_list(std::move(or_factors)); + } else if (or_factors.empty() && !xor_factors.empty()) { + fact = Sexp{xor_sym, std::move(fact)}; + fact.add_list(std::move(xor_factors)); + } else if (!or_factors.empty() && !xor_factors.empty()) { + fact = Sexp{or_sym, std::move(fact)}; + fact.add_list(std::move(or_factors)); + prepend(xor_factors, xor_sym); + fact.add(std::move(xor_factors)); + } + + return fact; +} + +Sexp +Mu::parse_query(const std::string& expr, bool expand) +{ + ParseContext context; + context.expand = expand; + + if (auto&& items = process_query(expr); !items.listp()) + throw std::runtime_error("tokens must be a list-sexp"); + else + return query(items, context); +} + + +#if defined(BUILD_PARSE_QUERY)||defined(BUILD_PARSE_QUERY_EXPAND) +int +main (int argc, char *argv[]) +{ + if (argc < 2) { + mu_printerrln("expected: {} ", argv[0]); + return 1; + } + + std::string expr; + for (auto i = 1; i < argc; ++i) { + expr += argv[i]; + expr += " "; + } + + auto&& sexp = parse_query(expr, +#ifdef BUILD_PARSE_QUERY_EXPAND + true/*expand*/ +#else + false/*don't expand*/ +#endif + ); + mu_println("{}", sexp.to_string()); + return 0; +} +#endif // BUILD_PARSE_QUERY || BUILD_PARSE_QUERY_EXPAND + + + +#if BUILD_TESTS +/* + * Tests. 
+ * + */ + +#include "utils/mu-test-utils.hh" + +using TestCase = std::pair; + +static void +test_parser_basic() +{ + std::vector cases = { + // single term + TestCase{R"(a)", R"((_ "a"))"}, + // a and b + TestCase{R"(a and b)", R"((and (_ "a") (_ "b")))"}, + // a and b and c + TestCase{R"(a and b and c)", R"((and (_ "a") (_ "b") (_ "c")))"}, + // a or b + TestCase{R"(a or b)", R"((or (_ "a") (_ "b")))"}, + // a or b and c + TestCase{R"(a or b and c)", R"((or (_ "a") (and (_ "b") (_ "c"))))"}, + // a and b or c + TestCase{R"(a and b or c)", R"((or (and (_ "a") (_ "b")) (_ "c")))"}, + // not a + TestCase{R"(not a)", R"((not (_ "a")))"}, + // lone not + TestCase{R"(not)", R"((_ "not"))"}, + // a and (b or c) + TestCase{R"(a and (b or c))", R"((and (_ "a") (or (_ "b") (_ "c"))))"}, + // TODO: add more... + }; + + for (auto&& test: cases) { + auto&& sexp{parse_query(test.first)}; + //mu_message ("'{}' <=> '{}'", sexp.to_string(), test.second); + assert_equal(sexp.to_string(), test.second); + } +} + +static void +test_parser_recover() +{ + std::vector cases = { + // implicit AND + TestCase{R"(a b)", R"((and (_ "a") (_ "b")))"}, + // a or and (the trailing operator is to be used as a value) + TestCase{R"(a or and)", R"((or (_ "a") (_ "and")))"}, + // missing end ), empty sub-expression + TestCase{R"(a and ()", R"((_ "a"))"}, + // missing end ), non-empty sub-expression + TestCase{R"(a and (b)", R"((and (_ "a") (_ "b")))"}, + }; + + for (auto&& test: cases) { + auto&& sexp{parse_query(test.first)}; + assert_equal(sexp.to_string(), test.second); + } +} + + +static void +test_parser_fields() +{ + std::vector cases = { + // simple field + TestCase{R"(s:hello)", R"((subject "hello"))"}, + // field, wildcard, regexp + TestCase{R"(subject:a* recip:/b/)", + R"((and (subject (wildcard "a")) (recip (regex "b"))))"}, + TestCase{R"(from:hello or subject:world)", + R"((or (from "hello") (subject "world")))"}, + }; + + for (auto&& test: cases) { + auto&& sexp{parse_query(test.first)}; + assert_equal(sexp.to_string(), test.second); + } +} + +static void 
+test_parser_expand() +{ + std::vector cases = { + // combi-field: expands into its constituent fields + TestCase{R"(recip:a)", R"((or (to "a") (cc "a") (bcc "a")))"}, + // wildcard without a field: the placeholder is expanded + TestCase{R"(a*)", + R"((or (to (wildcard "a")) (cc (wildcard "a")) (bcc (wildcard "a")) (from (wildcard "a")) (subject (wildcard "a")) (body (wildcard "a")) (embed (wildcard "a"))))"}, + TestCase{R"(a xor contact:b)", + R"((xor (or (to "a") (cc "a") (bcc "a") (from "a") (subject "a") (body "a") (embed "a")) (or (to "b") (cc "b") (bcc "b") (from "b"))))"} + }; + + for (auto&& test: cases) { + auto&& sexp{parse_query(test.first, true/*expand*/)}; + assert_equal(sexp.to_string(), test.second); + } +} + + +static void +test_parser_range() +{ + std::vector cases = { + TestCase{R"(size:1)", R"((size (range "1" "1")))"}, + TestCase{R"(size:2..)", R"((size (range "2" "")))"}, + TestCase{R"(size:..1k)", R"((size (range "" "1024")))"}, + TestCase{R"(size:..)", R"((size (range "" "")))"}, + }; + + for (auto&& test: cases) { + auto&& sexp{parse_query(test.first, true/*expand*/)}; + assert_equal(sexp.to_string(), test.second); + } +} + +static void +test_parser_optimize() +{ + std::vector cases = { + TestCase{R"(not a)", R"((not (_ "a")))"}, + TestCase{R"(not not a)", R"((_ "a"))"}, + TestCase{R"(not not not a)", R"((not (_ "a")))"}, + TestCase{R"(not not not not a)", R"((_ "a"))"}, + }; + + + for (auto&& test: cases) { + auto&& sexp{parse_query(test.first)}; + assert_equal(sexp.to_string(), test.second); + } +} + +int +main(int argc, char* argv[]) +{ + mu_test_init(&argc, &argv); + + g_test_add_func("/query-parser/basic", test_parser_basic); + g_test_add_func("/query-parser/recover", test_parser_recover); + g_test_add_func("/query-parser/fields", test_parser_fields); + g_test_add_func("/query-parser/range", test_parser_range); + g_test_add_func("/query-parser/expand", test_parser_expand); + g_test_add_func("/query-parser/optimize", test_parser_optimize); + + return g_test_run(); +} + +#endif /*BUILD_TESTS*/ diff 
--git a/lib/mu-query-parser.hh b/lib/mu-query-parser.hh new file mode 100644 index 00000000..f78011bf --- /dev/null +++ b/lib/mu-query-parser.hh @@ -0,0 +1,116 @@ +/* +** Copyright (C) 2023 Dirk-Jan C. Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +** +*/ +#include +#include +#include + +#include + +#include "utils/mu-sexp.hh" +#include "utils/mu-result.hh" +#include "mu-store.hh" + +namespace Mu { +/* + * Some useful symbol-sexps + */ +static inline const auto placeholder_sym = "_"_sym; +static inline const auto phrase_sym = "phrase"_sym; +static inline const auto regex_sym = "regex"_sym; +static inline const auto range_sym = "range"_sym; +static inline const auto wildcard_sym = "wildcard"_sym; + +static inline const auto open_sym = "("_sym; +static inline const auto close_sym = ")"_sym; + +static inline const auto and_sym = "and"_sym; +static inline const auto or_sym = "or"_sym; +static inline const auto xor_sym = "xor"_sym; +static inline const auto not_sym = "not"_sym; +static inline const auto and_not_sym = "and-not"_sym; + + +/* + * We take a query, then parse it into a human-readable s-expression and then + * turn that s-expression into a Xapian query + * + * some query: + * "from:hello or subject:world" + * + * 1. tokenize-query + * => ((from "hello") or (subject "world")) + * + * 2. 
parse-query + * => (or (from "hello") (subject "world")) + * + * 3. xapian-query + * => Query((Fhello OR Sworld)) + * * + */ + +/** + * Analyze the query expression and express it as a Sexp-list with the sequence + * of elements. + * + * @param expr a search expression + * + * @return Sexp with the sequence of elements + */ +Sexp process_query(const std::string& expr); + +/** + * Parse the query expression and create a parse-tree expressed as an Sexp + * object (tree). + * + * Internally, this processes the stream into elements (see process_query()) and + * processes the tokens into a Sexp. This sexp is meant to be human-readable. + * + * @param expr a search expression + * @param expand whether to expand meta-fields (such as '_', 'recip', 'contact') + * + * @return Sexp with the parse tree + */ +Sexp parse_query(const std::string& expr, bool expand=false); + +/** + * Make a Xapian Query for the given string expression. + * + * This uses parse_query() and turns the S-expression into a Xapian::Query. + * Unlike mere parsing, this uses the information in the store to resolve + * wildcard / regex queries. + * + * @param store the message store + * @param expr a string expression + * @param flag parser flags to use (see ParserFlags) + * + * @return a Xapian query result or an error. + */ +enum struct ParserFlags { + None = 0 << 0, + SupportNgrams = 1 << 0, /**< Support Xapian's Ngrams for CJK etc. handling */ + XapianParser = 1 << 1, /**< For testing only, use Xapian's + * built-in QueryParser; this is not + * fully compatible with mu, only useful + * for debugging. */ +}; +Result make_xapian_query(const Store& store, const std::string& expr, + ParserFlags flag=ParserFlags::None) noexcept; + +MU_ENABLE_BITOPS(ParserFlags); +} // namespace Mu diff --git a/lib/mu-query-processor.cc b/lib/mu-query-processor.cc new file mode 100644 index 00000000..ab461b4f --- /dev/null +++ b/lib/mu-query-processor.cc @@ -0,0 +1,548 @@ +/* +** Copyright (C) 2023 Dirk-Jan C. 
Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +** +*/ + +#include "mu-query-parser.hh" + +#include +#include +#include +#include + +#include "utils/mu-option.hh" +#include +#include "utils/mu-utils-file.hh" + +using namespace Mu; + +/** + * An 'Element' here is a rather rich version of what is traditionally + * considered a (lexical) token. + * + * We try to determine as much as possible during the analysis phase; which is + * quite a bit (given the fairly simple query language), and the parsing phase + * only has to deal with the putting these elements in a tree. + * + * During analysis: + * 1) separate the query into a sequence strings + * 2) for each of these strings + * - Does it look like an Op? ('or', 'and' etc.) --> Op + * - Otherwise: treat as a Basic field ([field]:value) + * - Whitespace in value? -> promote to Phrase + * - otherwise: + * - Is value a regex (in //) -> promote to Regex + * - Is value a wildcard (ends in '*') -> promote to Wildcard + * - is value a range (a..b) -> promote to Range + * + * After analysis, we have the sequence of element as a Sexp, which can then be + * fed to the parser. We attempt to make the Sexp as human-readable as possible. 
+ */ +struct Element { + enum struct Bracket { Open, Close} ; + enum struct Op { And, Or, Xor, Not, AndNot }; + + template + struct FieldValue { + FieldValue(const ValueType& v): field{}, value{v}{} + + template + FieldValue(const StringType& fname, const ValueType& v): + field{std::string{fname}}, value{v}{} + template + FieldValue(const Option& fname, const ValueType& v) { + if (fname) + field = std::string{*fname}; + value = v; + } + + Option field{}; + ValueType value{}; + }; + struct Basic: public FieldValue {using FieldValue::FieldValue;}; + struct Phrase: public FieldValue {using FieldValue::FieldValue;}; + struct Regex: public FieldValue {using FieldValue::FieldValue;}; + struct Wildcard: public FieldValue {using FieldValue::FieldValue;}; + struct Range: public FieldValue> { + using FieldValue::FieldValue; }; + + using ValueType = std::variant< + /* */ + Bracket, + /* op */ + Op, + /* string values */ + std::string, + /* value types */ + Basic, + Phrase, + Regex, + Wildcard, + Range + >; + + // helper + template + struct decay_equiv: + std::is_same::type, U>::type {}; + + Element(Bracket b): value{b} {} + Element(Op op): value{op} {} + + template, T>::value>::type = 0> + Element(const std::string& field, const T& val): value{T{field, val}} {} + + Element(const std::string& val): value{val} {} + + template + Option get_opt() { + if (std::holds_alternative(value)) + return std::get(value); + else + return Nothing; + } + + Sexp sexp() const { + return std::visit([](auto&& arg)->Sexp { + + auto field_sym = [](const Option& field) { + return field ? 
Sexp::Symbol{*field} : placeholder_sym; + }; + + using T = std::decay_t; + + if constexpr (std::is_same_v) { + switch(arg) { + case Bracket::Open: + return open_sym; + case Bracket::Close: + return close_sym; + default: + throw std::logic_error("invalid bracket type"); + } + } else if constexpr (std::is_same_v) { + switch(arg) { + case Op::And: + return and_sym; + case Op::Or: + return or_sym; + case Op::Xor: + return xor_sym; + case Op::Not: + return not_sym; + case Op::AndNot: + return and_not_sym; + default: + throw std::logic_error("invalid op type"); + } + } else if constexpr (std::is_same_v) { + return Sexp { field_sym(arg.field), arg.value }; + } else if constexpr (std::is_same_v) { + return Sexp {field_sym(arg.field), + Sexp{ phrase_sym, arg.value }}; + } else if constexpr (std::is_same_v) { + return Sexp { field_sym(arg.field), Sexp{ regex_sym, arg.value}}; + } else if constexpr (std::is_same_v) { + return Sexp { field_sym(arg.field), Sexp{ wildcard_sym, arg.value}}; + } else if constexpr (std::is_same_v) { + return Sexp {field_sym(arg.field), + Sexp{ range_sym, arg.value.first, arg.value.second }}; + } else if constexpr (std::is_same_v) { + throw std::logic_error("no bare strings should be here"); + } else + throw std::logic_error("uninvited visitor"); + }, value); + } + + ValueType value; +}; + +using Elements = std::vector; + + + +/** + * Remove first character from string and return it. + * + * @param[in,out] str a string + * @param[in,out] pos position in _original_ string + * + * @return a char or 0 if there is none. 
+ */ +static char +read_char(std::string& str, size_t& pos) +{ + if (str.empty()) + return {}; + + auto kar{str.at(0)}; + str.erase(0, 1); + ++pos; + + return kar; +} + +/** + * Restore kar at the beginning of the string + * + * @param[in,out] str a string + * @param[in,out] pos position in _original_ string + * @param kar a character + */ +static void +unread_char(std::string& str, size_t& pos, char kar) +{ + str = kar + str; + --pos; +} + + +/** + * Remove the the next element from the string and return it + * + * @param[in,out] str a string + * @param[in,out] pos position in _original_ string * + * + * @return an Element or Nothing + */ +static Option +next_element(std::string& str, size_t& pos) +{ + bool quoted{}, escaped{}; + std::string value{}; + + auto is_separator = [](char c) { return c == ' '|| c == '(' || c == ')'; }; + + while (!str.empty()) { + + auto kar = read_char(str, pos); + + if (kar == '\\') { + escaped = !escaped; + if (escaped) + continue; + } + + if (kar == '"' && !escaped) { + if (!escaped && quoted) + return Element{value}; + else { + quoted = true; + continue; + } + } + + if (!quoted && !escaped && is_separator(kar)) { + if (!value.empty()) { + unread_char(str, pos, kar); + return Element{value}; + } + + if (quoted || kar == ' ') + continue; + + switch (kar) { + case '(': + return Element{Element::Bracket::Open}; + case ')': + return Element{Element::Bracket::Close}; + default: + break; + } + } + + value += kar; + escaped = false; + } + + if (value.empty()) + return Nothing; + else + return Element{value}; +} + + +static Option +opify(Element&& element) +{ + auto&& str{element.get_opt()}; + if (!str) + return element; + + static const std::unordered_map ops = { + { "and", Element::Op::And }, + { "or", Element::Op::Or}, + { "xor", Element::Op::Xor }, + { "not", Element::Op::Not }, + // AndNot only appears during parsing. 
+ }; + + if (auto&& it = ops.find(utf8_flatten(*str)); it != ops.end()) + element.value = it->second; + + return element; +} + +static Option +basify(Element&& element) +{ + auto&& str{element.get_opt()}; + if (!str) + return element; + + const auto pos = str->find(':'); + if (pos == std::string::npos) { + element.value = Element::Basic{*str}; + return element; + } + + const auto fname{str->substr(0, pos)}; + if (auto&& field{field_from_name(fname)}; field) { + auto val{str->substr(pos + 1)}; + if (field == Field::Id::Flags) { + if (auto&& finfo{flag_info(val)}; finfo) + element.value = Element::Basic{field->name, std::string{finfo->name}}; + else /* unknown flag: keep the whole string as a plain term, as for priorities */ + element.value = Element::Basic{*str}; + } else if (field == Field::Id::Priority) { + if (auto&& prio{priority_from_name(val)}; prio) + element.value = Element::Basic{field->name, + std::string{priority_name(*prio)}}; + else + element.value = Element::Basic{*str}; + } else + element.value = Element::Basic{std::string{field->name}, + str->substr(pos + 1)}; + } else if (field_is_combi(fname)) + element.value = Element::Basic{fname, str->substr(pos +1)}; + else + element.value = Element::Basic{*str}; + + return element; +} + +static Option +phrasify(Element&& element) +{ + auto&& basic{element.get_opt()}; + if (!basic) + return element; + + auto&& val{basic->value}; + if (val.find(' ') != std::string::npos) + element.value = Element::Phrase{basic->field, val}; + + return element; +} + + +static Option +wildcardify(Element&& element) +{ + auto&& basic{element.get_opt()}; + if (!basic) + return element; + + auto&& val{basic->value}; + if (val.size() < 2 || val[val.size()-1] != '*') + return element; + + val.erase(val.size() - 1); + element.value = Element::Wildcard{basic->field, val}; + + return element; +} + +static Option +regexpify(Element&& element) +{ + auto&& str{element.get_opt()}; + if (!str) + return element; + + auto&& val{str->value}; + if (val.size() < 3 || val[0] != '/' || val[val.size()-1] != '/') + return element; + + 
val.erase(val.size() - 1); + val.erase(0, 1); + element.value = Element::Regex{str->field, std::move(val)}; + + return element; +} + +// handle range-fields: Size, Date, Changed +static Option +rangify(Element&& element) +{ + auto&& str{element.get_opt()}; + if (!str) + return element; + + if (!str->field) + return element; + + auto&& field = field_from_name(*str->field); + if (!field || !field->is_range()) + return element; + + /* yes: get the range */ + auto&& range = std::invoke([&]()->std::pair { + const auto val{str->value}; + const auto pos{val.find("..")}; + + if (pos == std::string::npos) + return { val, val }; + else + return {val.substr(0, pos), val.substr(pos + 2)}; + }); + + if (field->id == Field::Id::Size) { + int64_t s1{range.first.empty() ? -1 : + parse_size(range.first, false/*first*/).value_or(-1)}; + int64_t s2{range.second.empty() ? -1 : + parse_size(range.second, true/*last*/).value_or(-1)}; + if (s2 >= 0 && s1 > s2) + std::swap(s1, s2); + element.value = Element::Range{str->field, + {s1 < 0 ? "" : std::to_string(s1), + s2 < 0 ? "" : std::to_string(s2)}}; + + } else if (field->id == Field::Id::Date || field->id == Field::Id::Changed) { + auto tstamp=[](auto&& str, auto&& first)->int64_t { + return str.empty() ? -1 : + parse_date_time(str, first ,false/*local*/).value_or(-1); + }; + int64_t lower{tstamp(range.first, true/*lower*/)}; + int64_t upper{tstamp(range.second, false/*upper*/)}; + if (lower >= 0 && upper >= 0 && lower > upper) { + // can't simply swap due to rounding up/down + lower = tstamp(range.second, true/*lower*/); + upper = tstamp(range.first, false/*upper*/); + } + // use "Zulu" time. + element.value = Element::Range{ + str->field, + {lower < 0 ? "" : + mu_format("{:%FT%TZ}",mu_time(lower, true/*utc*/)), + upper < 0 ? 
"" : + mu_format("{:%FT%TZ}", mu_time(upper, true/*utc*/))}}; + } + + return element; +} + +static Elements +process(const std::string& expr) +{ + Elements elements{}; + size_t offset{0}; + + /* all control chars become SPC; iscntrl is only defined for unsigned char values (or EOF), so cast: plain char may be negative for UTF-8 bytes */ + std::string str{expr}; + for (auto& c: str) + c = ::iscntrl(static_cast<unsigned char>(c)) ? ' ' : c; + + while(!str.empty()) { + auto&& element = next_element(str, offset) + .and_then(opify) + .and_then(basify) + .and_then(regexpify) + .and_then(phrasify) + .and_then(wildcardify) + .and_then(rangify); + if (element) + elements.emplace_back(std::move(element.value())); + } + + return elements; +} + +Sexp +Mu::process_query(const std::string& expr) +{ + const auto& elements{::process(expr)}; + + Sexp sexp{}; + for (auto&& elm: elements) + sexp.add(elm.sexp()); + + return sexp; +} + +#ifdef BUILD_PROCESS_QUERY +int +main (int argc, char *argv[]) +{ + if (argc < 2) { + mu_printerrln("expected: process-query "); + return 1; + } + + std::string expr; + for (auto i = 1; i < argc; ++i) { + expr += argv[i]; + expr += " "; + } + + auto sexp = process_query(expr); + mu_println("{}", sexp.to_string()); + + return 0; +} +#endif /*BUILD_PROCESS_QUERY*/ + +#if BUILD_TESTS +/* + * Tests. + * + */ + +#include "utils/mu-test-utils.hh" + +using TestCase = std::pair; + +static void +test_processor() +{ + std::vector cases = { + TestCase{R"(hello world)", R"(((_ "hello") (_ "world")))"}, + TestCase{R"("hello world")", R"(((_ (phrase "hello world"))))"}, + TestCase{R"(subject:"hello world")", R"(((subject (phrase "hello world"))))"}, + // TODO: add more... 
+ }; + + for (auto&& test: cases) { + auto&& sexp{process_query(test.first)}; + assert_equal(sexp.to_string(), test.second); + } +} + + + +int +main(int argc, char* argv[]) +{ + mu_test_init(&argc, &argv); + + g_test_add_func("/query-parser/processor", test_processor); + + return g_test_run(); +} + +#endif /*BUILD_TESTS*/ diff --git a/lib/mu-query-xapianizer.cc b/lib/mu-query-xapianizer.cc new file mode 100644 index 00000000..62ca18a4 --- /dev/null +++ b/lib/mu-query-xapianizer.cc @@ -0,0 +1,484 @@ +/* +** Copyright (C) 2023 Dirk-Jan C. Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +** +*/ + +#include "config.h" +#include "mu-query-parser.hh" + +#include +#include +#include +#include +#include + +#include "utils/mu-option.hh" +#include +#include "utils/mu-utils-file.hh" + +using namespace Mu; + + + +/** + * Expand terms for scripts without explicit word-breaks (e.g. + * Chinese/Japanese/Korean) in the way that Xapian expects it - + * use Xapian's built-in QueryParser just for that. 
+ */ +static Result +ngram_expand(const Field& field, const std::string& str) +{ + mu_println("ng: '{}'", str); + + Xapian::QueryParser qp; + const auto pfx{std::string(1U, field.xapian_prefix())}; + + qp.set_default_op(Xapian::Query::OP_OR); + + return qp.parse_query( + str, +#if HAVE_XAPIAN_FLAG_NGRAMS + Xapian::QueryParser::FLAG_NGRAMS, +#else + Xapian::QueryParser::FLAG_CJK_NGRAM, +#endif /*HAVE_XAPIAN_FLAG_NGRAMS*/ + pfx); +} + + + +static Option +tail(Sexp&& s) +{ + if (!s.listp() || s.empty()) + return Nothing; + + s.list().erase(s.list().begin(), s.list().begin() + 1); + + return s; +} + +Option +head_symbol(const Sexp& s) +{ + if (!s.listp() || s.empty() || !s.head() || !s.head()->symbolp()) + return Nothing; + + return s.head()->symbol().name; +} + + +Option +string_nth(const Sexp& args, size_t n) +{ + if (!args.listp() || args.size() < n + 1) + return Nothing; + + if (auto&& item{args.list().at(n)}; !item.stringp()) + return Nothing; + else + return item.string(); +} + +static Result +phrase(const Field& field, Sexp&& s) +{ + if (!field.is_indexable_term()) + return Err(Error::Code::InvalidArgument, + "field {} does not support phrases", field.name); + + if (s.size() == 1 && s.front().stringp()) { + auto&& words{split(s.front().string(), " ")}; + std::vector phvec; + phvec.reserve(words.size()); + for(auto&& w: words) + phvec.emplace_back(Xapian::Query{field.xapian_term(std::move(w))}); + return Xapian::Query{Xapian::Query::OP_PHRASE, + phvec.begin(), phvec.end()}; + } else + return Err(Error::Code::InvalidArgument, + "invalid phrase for field {}: '{}'", field.name, s.to_string()); +} + +static Result +regex(const Store& store, const Field& field, const std::string& rx_str) +{ + auto&& str{utf8_flatten(rx_str)}; + auto&& rx{Regex::make(str, G_REGEX_OPTIMIZE)}; + if (!rx) { + mu_warning("invalid regexp: '{}': {}", str, rx.error().what()); + return Xapian::Query::MatchNothing; + } + + std::vector rxvec; + store.for_each_term(field.id, [&](auto&& str) { + 
if (auto&& val{str.data() + 1}; rx->matches(val)) + rxvec.emplace_back(field.xapian_term(std::string_view{val})); + return true; + }); + + return Xapian::Query(Xapian::Query::OP_OR, rxvec.begin(), rxvec.end()); +} + + + +static Result +range(const Field& field, Sexp&& s) +{ + auto&& r0{string_nth(s, 0)}; + auto&& r1{string_nth(s, 1)}; + if (!r0 || !r1) + return Err(Error::Code::InvalidArgument, "expected 2 range values"); + + // in the sexp, we use iso date/time for human readability; now convert to + // time_t + auto iso_to_lexnum=[](const std::string& s)->Option { + if (s.empty()) + return s; + if (auto&& t{parse_date_time(s, true, true/*utc*/)}; !t) + return Nothing; + else + return to_lexnum(*t); + }; + + if (field == Field::Id::Date || field == Field::Id::Changed) { + // iso -> time_t + r0 = iso_to_lexnum(*r0); + r1 = iso_to_lexnum(*r1); + } else if (field == Field::Id::Size) { + if (!r0->empty()) + r0 = to_lexnum(::atoll(r0->c_str())); + if (!r1->empty()) + r1 = to_lexnum(::atoll(r1->c_str())); + } else + return Err(Error::Code::InvalidArgument, + "unsupported range field {}", field.name); + + if (r0->empty() && r1->empty()) + return Xapian::Query::MatchAll; + else if (r0->empty() && !r1->empty()) + return Xapian::Query(Xapian::Query::OP_VALUE_LE, + field.value_no(), *r1); + else if (!r0->empty() && r1->empty()) + return Xapian::Query(Xapian::Query::OP_VALUE_GE, + field.value_no(), *r0); + else + return Xapian::Query(Xapian::Query::OP_VALUE_RANGE, + field.value_no(), *r0, *r1); +} + + + +using OpPair = std::pair; +static constexpr std::array LogOpPairs = {{ + { "and", Xapian::Query::OP_AND }, + { "or", Xapian::Query::OP_OR }, + { "xor", Xapian::Query::OP_XOR }, + { "not", Xapian::Query::OP_AND_NOT } + }}; + +static Option +find_log_op(const std::string& opname) +{ + for (auto&& p: LogOpPairs) + if (p.first == opname) + return p.second; + + return Nothing; +} + +static Result parse(const Store& store, Sexp&& s, Mu::ParserFlags flags); + +static Result 
+parse_logop(const Store& store, Xapian::Query::op op, Sexp&& args, Mu::ParserFlags flags) +{ + if (!args.listp() || args.empty()) + return Err(Error::Code::InvalidArgument, + "expected non-empty list but got", args.to_string()); + + std::vector qs; + for (auto&& elm: args.list()) { + if (auto&& q{parse(store, std::move(elm), flags)}; !q) + return Err(std::move(q.error())); + else + qs.emplace_back(std::move(*q)); + } + + switch(op) { + case Xapian::Query::OP_AND_NOT: + if (qs.size() != 1) + return Err(Error::Code::InvalidArgument, + "expected single argument for NOT"); + else + return Xapian::Query{op, Xapian::Query::MatchAll, qs.at(0)}; + + case Xapian::Query::OP_AND: + case Xapian::Query::OP_OR: + case Xapian::Query::OP_XOR: + return Xapian::Query(op, qs.begin(), qs.end()); + + default: + return Err(Error::Code::InvalidArgument, "unexpected xapian op"); + } +} + + +static Result +parse_field_matcher(const Store& store, const Field& field, + const std::string& match_sym, Sexp&& args) +{ + auto&& str0{string_nth(args, 0)}; + + if (match_sym == wildcard_sym.name && str0) + return Xapian::Query{Xapian::Query::OP_WILDCARD, + field.xapian_term(*str0)}; + else if (match_sym == range_sym.name && !!str0) + return range(field, std::move(args)); + else if (match_sym == regex_sym.name && !!str0) + return regex(store, field, *str0); + else if (match_sym == phrase_sym.name) + return phrase(field, std::move(args)); + + return Err(Error::Code::InvalidArgument, + "invalid field '{}'/'{}' matcher: {}", + field.name, match_sym, args.to_string()); +} + + +static Result +parse_basic(const Field& field, Sexp&& vals, Mu::ParserFlags flags) +{ + static auto ngrams = any_of(flags & ParserFlags::SupportNgrams); + + if (!vals.stringp()) + return Err(Error::Code::InvalidArgument, "expected string"); + + auto&& val{vals.string()}; + + switch (field.id) { + case Field::Id::Flags: + if (auto&& finfo{flag_info(val)}; finfo) + return Xapian::Query{field.xapian_term(finfo->shortcut_lower())}; + 
else + return Err(Error::Code::InvalidArgument, + "invalid flag '{}'", val); + case Field::Id::Priority: + if (auto&& prio{priority_from_name(val)}; prio) + return Xapian::Query{field.xapian_term(to_char(*prio))}; + else + return Err(Error::Code::InvalidArgument, + "invalid priority '{}'", val); + default: { + auto q{Xapian::Query{field.xapian_term(val)}}; + if (ngrams) { // special case: cjk; see if we can create an expanded query. + if (field.is_indexable_term() && contains_unbroken_script(val)) + if (auto&& ng{ngram_expand(field, val)}; ng) + return ng; + } + return q; + }} + +} + +static Result +parse(const Store& store, Sexp&& s, Mu::ParserFlags flags) +{ + auto&& headsym{head_symbol(s)}; + if (!headsym) + return Err(Error::Code::InvalidArgument, + "expected (symbol ...) but got {}", s.to_string()); + + // ie., something like (or|and| ... ....) + if (auto&& logop{find_log_op(*headsym)}; logop) { + if (auto&& args{tail(std::move(s))}; !args) + return Err(Error::Code::InvalidArgument, + "expected (logop ...) but got {}", + s.to_string()); + else + return parse_logop(store, *logop, std::move(*args), flags); + + } + // something like (field ...) + else if (auto&& field{field_from_name(*headsym)}; field) { + + auto&& rest{tail(std::move(s))}; + if (!rest || rest->empty()) + return Err(Error::Code::InvalidArgument, + "expected field-value or field-matcher"); + + auto&& matcher{rest->front()}; + + // field-value: (field "value"); ensure "value" is there + if (matcher.stringp()) + return parse_basic(*field, std::move(matcher), flags); + + // otherwise, we expect a field-matcher, e.g. 
(field (phrase "a b c")) + // ensure the matcher is a list starting with a symbol + auto&& match_sym{head_symbol(matcher)}; + if (!match_sym) + return Err(Error::Code::InvalidArgument, + "expected field-matcher"); + + if (auto&& args{tail(std::move(matcher))}; !args) + return Err(Error::Code::InvalidArgument, "expected matcher arguments"); + else + return parse_field_matcher(store, *field, + *match_sym, std::move(*args)); + } + return Err(Error::Code::InvalidArgument, + "unexpected sexp {}", s.to_string()); +} + + +// parse the way Xapian's internal parser does it; for testing. +static Xapian::Query +xapian_query_classic(const std::string& expr, Mu::ParserFlags flags) +{ + Xapian::QueryParser xqp; + + // add prefixes + field_for_each([&](auto&& field){ + + if (!field.is_searchable()) + return; + + const auto prefix{std::string(1U, field.xapian_prefix())}; + std::vector names = { + std::string{field.name}, + std::string(1U, field.shortcut) + }; + if (!field.alias.empty()) + names.emplace_back(std::string{field.alias}); + + for (auto&& name: names) + xqp.add_prefix(name, prefix); + }); + + const auto xflags = std::invoke([&]() { + unsigned f = Xapian::QueryParser::FLAG_PHRASE | + Xapian::QueryParser::FLAG_BOOLEAN | + Xapian::QueryParser::FLAG_WILDCARD; + if (any_of(flags & ParserFlags::SupportNgrams)) { +#if HAVE_XAPIAN_FLAG_NGRAMS + f |= Xapian::QueryParser::FLAG_NGRAMS; +#else + f |= Xapian::QueryParser::FLAG_CJK_NGRAM; +#endif + } + return f; + }); + + xqp.set_default_op(Xapian::Query::OP_AND); + return xqp.parse_query(expr, xflags); +} + +Result +Mu::make_xapian_query(const Store& store, const std::string& expr, Mu::ParserFlags flags) noexcept +{ + if (any_of(flags & Mu::ParserFlags::XapianParser)) + return xapian_query_classic(expr, flags); + + return parse(store, Mu::parse_query(expr, true/*expand*/), flags); +} + + +#ifdef BUILD_XAPIANIZE_QUERY +int +main (int argc, char *argv[]) +{ + if (argc < 2) { + mu_printerrln("expected: parse-query "); + return 1; + } + 
+ auto store = Store::make(runtime_path(Mu::RuntimePath::XapianDb)); + if (!store) { + mu_printerrln("error: {}", store.error()); + return 2; + } + + std::string expr; + for (auto i = 1; i < argc; ++i) { + expr += argv[i]; + expr += " "; + } + + if (auto&& query{make_xapian_query(*store, expr)}; !query) { + mu_printerrln("error: {}", query.error()); + return 1; + } else { + mu_println("{}", query->get_description()); + return 0; + } +} +#endif /*BUILD_XAPIANIZE_QUERY*/ + +#if BUILD_TESTS +/* + * Tests. + * + */ + +#include "utils/mu-test-utils.hh" + +using TestCase = std::pair; + +static void +test_xapian() +{ + auto&& testhome{unwrap(make_temp_dir())}; + auto&& dbpath{runtime_path(RuntimePath::XapianDb, testhome)}; + auto&& store{unwrap(Store::make_new(dbpath, join_paths(testhome, "test-maildir")))}; + + std::vector cases = { + TestCase{R"(i:87h766tzzz.fsf@gnus.org)", R"(Query(I87h766tzzz.fsf@gnus.org))"}, + TestCase{R"(subject:foo to:bar)", R"(Query((Sfoo AND Tbar)))"}, + TestCase{R"(subject:"cuux*")", R"(Query(WILDCARD SYNONYM Scuux))"}, + TestCase{R"(subject:"hello world")", R"(Query((Shello PHRASE 2 Sworld)))"}, + TestCase{R"(subject:/boo/")", R"(Query())"}, + }; + + for (auto&& test: cases) { + auto&& xq{make_xapian_query(store, test.first)}; + assert_valid_result(xq); + + mu_println("'{}' <=> '{}'", xq->get_description(), test.second); + assert_equal(xq->get_description(), test.second); + } + + remove_directory(testhome); +} + +int +main(int argc, char* argv[]) +{ + mu_test_init(&argc, &argv); + + + Xapian::QueryParser qp; + // mu_println("{}", qp.parse_query("スポンサーシップ募集").get_description()); + // mu_println("{}", qp.parse_query("スポンサーシップ募集", Xapian::QueryParser::FLAG_NGRAMS).get_description()); + + // mu_println("{}", qp.parse_query("hello world").get_description()); + // mu_println("{}", qp.parse_query("hello world", Xapian::QueryParser::FLAG_NGRAMS).get_description()); + + g_test_add_func("/query-parser/xapianizer", test_xapian); + + return g_test_run(); 
+} + +#endif /*BUILD_TESTS*/ diff --git a/lib/mu-query.cc b/lib/mu-query.cc index 97b1b71f..418f2017 100644 --- a/lib/mu-query.cc +++ b/lib/mu-query.cc @@ -32,15 +32,17 @@ #include "mu-query-results.hh" #include "mu-query-match-deciders.hh" #include "mu-query-threads.hh" -#include #include "mu-xapian-db.hh" +#include "mu-query-parser.hh" + using namespace Mu; struct Query::Private { - Private(const Store& store) : store_{store}, parser_{store_} {} - // New - // bool calculate_threads (Xapian::Enquire& enq, size maxnum); + Private(const Store& store) : + store_{store}, + parser_flags_{any_of(store_.message_options() & Message::Options::SupportNgrams) ? + ParserFlags::SupportNgrams : ParserFlags::None} {} Xapian::Enquire make_enquire(const std::string& expr, Field::Id sortfield_id, QueryFlags qflags) const; @@ -61,7 +63,7 @@ struct Query::Private { Field::Id sortfield_id, QueryFlags qflags, size_t maxnum) const; const Store& store_; - const Parser parser_; + const ParserFlags parser_flags_; }; Query::Query(const Store& store) : priv_{std::make_unique(store)} {} @@ -79,22 +81,27 @@ sort_enquire(Xapian::Enquire& enq, Field::Id sortfield_id, QueryFlags qflags) return enq; } +static Xapian::Query +make_query(const Store& store, const std::string& expr, ParserFlags parser_flags) +{ + if (expr.empty() || expr == R"("")") + return Xapian::Query::MatchAll; + else { + if (auto&& q{make_xapian_query(store, expr, parser_flags)}; !q) { + mu_warning("error in query '{}': {}", expr, q.error().what()); + return Xapian::Query::MatchNothing; + } else + return q.value(); + } +} + Xapian::Enquire Query::Private::make_enquire(const std::string& expr, Field::Id sortfield_id, QueryFlags qflags) const { auto enq{store_.xapian_db().enquire()}; - if (expr.empty() || expr == R"("")") - enq.set_query(Xapian::Query::MatchAll); - else { - WarningVec warns; - const auto tree{parser_.parse(expr, warns)}; - for (auto&& w : warns) - mu_warning("query warning: {}", to_string(w)); - 
enq.set_query(xapian_query(tree)); - } - + enq.set_query(make_query(store_, expr, parser_flags_)); sort_enquire(enq, sortfield_id, qflags); return enq; @@ -122,8 +129,7 @@ Query::Private::make_related_enquire(const StringSet& thread_ids, struct ThreadKeyMaker : public Xapian::KeyMaker { ThreadKeyMaker(const QueryMatches& matches) : match_info_(matches) {} - std::string operator()(const Xapian::Document& doc) const override - { + std::string operator()(const Xapian::Document& doc) const override { const auto it{match_info_.find(doc.get_docid())}; return (it == match_info_.end()) ? "" : it->second.thread_path; } @@ -288,14 +294,10 @@ Query::count(const std::string& expr) const std::string Query::parse(const std::string& expr, bool xapian) const { - WarningVec warns; - const auto tree{priv_->parser_.parse(expr, warns)}; - for (auto&& w : warns) - mu_warning("query warning: {}", to_string(w)); - if (xapian) - return xapian_query(tree).get_description(); + return make_query(priv_->store_, expr, + priv_->parser_flags_).get_description(); else - return to_string(tree); + return parse_query(expr).to_string(); } /* LCOV_EXCL_STOP*/ diff --git a/lib/mu-tokenizer.cc b/lib/mu-tokenizer.cc deleted file mode 100644 index 14b318b5..00000000 --- a/lib/mu-tokenizer.cc +++ /dev/null @@ -1,129 +0,0 @@ -/* -** Copyright (C) 2020 Dirk-Jan C. Binnema -** -** This library is free software; you can redistribute it and/or -** modify it under the terms of the GNU Lesser General Public License -** as published by the Free Software Foundation; either version 2.1 -** of the License, or (at your option) any later version. -** -** This library is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -** Lesser General Public License for more details. 
-** -** You should have received a copy of the GNU Lesser General Public -** License along with this library; if not, write to the Free -** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA -** 02110-1301, USA. -*/ - -#include "mu-tokenizer.hh" -#include "utils/mu-utils.hh" - -#include -#include -#include - -using namespace Mu; - -static bool -is_separator(char c) -{ - if (isblank(c)) - return true; - - const auto seps = std::string("()"); - return seps.find(c) != std::string::npos; -} - -static Mu::Token -op_or_value(size_t pos, const std::string& val) -{ - auto s = val; - std::transform(s.begin(), s.end(), s.begin(), ::tolower); - - if (s == "and") - return Token{pos, Token::Type::And, val}; - else if (s == "or") - return Token{pos, Token::Type::Or, val}; - else if (s == "xor") - return Token{pos, Token::Type::Xor, val}; - else if (s == "not") - return Token{pos, Token::Type::Not, val}; - else - return Token{pos, Token::Type::Data, val}; -} - -static void -unread_char(std::string& food, char kar, size_t& pos) -{ - food = kar + food; - --pos; -} - -static Mu::Token -eat_token(std::string& food, size_t& pos) -{ - bool quoted{}; - bool escaped{}; - std::string value{}; - - while (!food.empty()) { - const auto kar = food[0]; - food.erase(0, 1); - ++pos; - - if (kar == '\\') { - escaped = !escaped; - if (escaped) - continue; - } - - if (kar == '"') { - if (!escaped && quoted) - return Token{pos, Token::Type::Data, value}; - else { - quoted = true; - continue; - } - } - - if (!quoted && !escaped && is_separator(kar)) { - if (!value.empty() && kar != ':') { - unread_char(food, kar, pos); - return op_or_value(pos, value); - } - - if (quoted || isblank(kar)) - continue; - - switch (kar) { - case '(': return {pos, Token::Type::Open, "("}; - case ')': return {pos, Token::Type::Close, ")"}; - default: break; - } - } - - value += kar; - escaped = false; - } - - return {pos, Token::Type::Data, value}; -} - -Mu::Tokens -Mu::tokenize(const std::string& s) -{ - 
Tokens tokens{}; - - std::string food = utf8_clean(s); - size_t pos{0}; - - if (s.empty()) - return {}; - - while (!food.empty()) - tokens.emplace_back(eat_token(food, pos)); - - return tokens; -} diff --git a/lib/mu-tokenizer.hh b/lib/mu-tokenizer.hh deleted file mode 100644 index 7016e8b7..00000000 --- a/lib/mu-tokenizer.hh +++ /dev/null @@ -1,139 +0,0 @@ -/* -** Copyright (C) 2017 Dirk-Jan C. Binnema -** -** This library is free software; you can redistribute it and/or -** modify it under the terms of the GNU Lesser General Public License -** as published by the Free Software Foundation; either version 2.1 -** of the License, or (at your option) any later version. -** -** This library is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -** Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public -** License along with this library; if not, write to the Free -** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA -** 02110-1301, USA. -*/ - -#ifndef __TOKENIZER_HH__ -#define __TOKENIZER_HH__ - -#include -#include -#include -#include -#include - -// A simple tokenizer, which turns a string into a deque of tokens -// -// It recognizes '(', ')', '*' 'and', 'or', 'xor', 'not' -// -// Note that even if we recognizes those at the lexical level, they might be demoted to mere strings -// when we're creating the parse tree. -// -// Furthermore, we detect ranges ("a..b") and regexps (/../) at the parser level, since we need a -// bit more context to resolve ambiguities. 
- -namespace Mu { - -// A token -struct Token { - enum class Type { - Data, /**< e .g., banana or date:..456 */ - - // Brackets - Open, /**< ( */ - Close, /**< ) */ - - // Unops - Not, /**< logical not*/ - - // Binops - And, /**< logical and */ - Or, /**< logical not */ - Xor, /**< logical xor */ - - Empty, /**< nothing */ - }; - - size_t pos{}; /**< position in string */ - Type type{}; /**< token type */ - const std::string str{}; /**< data for this token */ - - /** - * operator== - * - * @param rhs right-hand side - * - * @return true if rhs is equal to this; false otherwise - */ - bool operator==(const Token& rhs) const - { - return pos == rhs.pos && type == rhs.type && str == rhs.str; - } -}; - -/** - * operator<< - * - * @param os an output stream - * @param t a token type - * - * @return the updated output stream - */ -inline std::ostream& -operator<<(std::ostream& os, Token::Type t) -{ - switch (t) { - case Token::Type::Data: os << ""; break; - - case Token::Type::Open: os << ""; break; - case Token::Type::Close: os << ""; break; - - case Token::Type::Not: os << ""; break; - case Token::Type::And: os << ""; break; - case Token::Type::Or: os << ""; break; - case Token::Type::Xor: os << ""; break; - case Token::Type::Empty: os << ""; break; - default: // can't happen, but pacify compiler - throw std::runtime_error("<>"); - } - - return os; -} - -/** - * operator<< - * - * @param os an output stream - * @param t a token - * - * @return the updated output stream - */ -inline std::ostream& -operator<<(std::ostream& os, const Token& t) -{ - os << t.pos << ": " << t.type; - - if (!t.str.empty()) - os << " [" << t.str << "]"; - - return os; -} - -/** - * Tokenize a string into a vector of tokens. The tokenization always succeeds, ie., ignoring errors - * such a missing end-". 
- * - * @param s a string - * - * @return a deque of tokens - */ -using Tokens = std::deque; -Tokens tokenize(const std::string& s); - -} // namespace Mu - -#endif /* __TOKENIZER_HH__ */ diff --git a/lib/mu-tree.hh b/lib/mu-tree.hh deleted file mode 100644 index 5c058905..00000000 --- a/lib/mu-tree.hh +++ /dev/null @@ -1,162 +0,0 @@ -/* -** Copyright (C) 2022 Dirk-Jan C. Binnema -** -** This library is free software; you can redistribute it and/or -** modify it under the terms of the GNU Lesser General Public License -** as published by the Free Software Foundation; either version 2.1 -** of the License, or (at your option) any later version. -** -** This library is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -** Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public -** License along with this library; if not, write to the Free -** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA -** 02110-1301, USA. -*/ - -#ifndef TREE_HH__ -#define TREE_HH__ - -#include -#include -#include -#include -#include - -#include -#include - -namespace Mu { - -struct FieldValue { - FieldValue(Field::Id idarg, const std::string valarg): - field_id{idarg}, val1{valarg} {} - FieldValue(Field::Id idarg, const std::string valarg1, const std::string valarg2): - field_id{idarg}, val1{valarg1}, val2{valarg2} {} - - const Field& field() const { return field_from_id(field_id); } - const std::string& value() const { return val1; } - const std::pair range() const { return { val1, val2 }; } - - const Field::Id field_id; - const std::string val1; - const std::string val2; - -}; - - -/** - * operator<< - * - * @param os an output stream - * @param fval a field value. 
- * - * @return the updated output stream - */ -inline std::ostream& -operator<<(std::ostream& os, const FieldValue& fval) -{ - os << ' ' << quote(std::string{fval.field().name}); - - if (fval.field().is_range()) - os << ' ' << quote(fval.range().first) - << ' ' << quote(fval.range().second); - else - os << ' ' << quote(fval.value()); - - return os; -} - -// A node in the parse tree -struct Node { - enum class Type { - Empty, // only for empty trees - OpAnd, - OpOr, - OpXor, - OpAndNot, - OpNot, - Value, - ValueAtomic, - Range, - Invalid - }; - - Node(Type _type, FieldValue&& fval) : type{_type}, field_val{std::move(fval)} {} - Node(Type _type) : type{_type} {} - Node(Node&& rhs) = default; - - Type type; - Option field_val; - - static constexpr std::string_view type_name(Type t) { - switch (t) { - case Type::Empty: - return ""; - case Type::OpAnd: - return "and"; - case Type::OpOr: - return "or"; - case Type::OpXor: - return "xor"; - case Type::OpAndNot: - return "andnot"; - case Type::OpNot: - return "not"; - case Type::Value: - return "value"; - case Type::ValueAtomic: - return "value_atomic"; - case Type::Range: - return "range"; - case Type::Invalid: - return ""; - default: - return ""; - } - } - - static constexpr bool is_binop(Type t) { - return t == Type::OpAnd || t == Type::OpAndNot || t == Type::OpOr || - t == Type::OpXor; - } -}; - -inline std::ostream& -operator<<(std::ostream& os, const Node& t) -{ - os << Node::type_name(t.type); - if (t.field_val) - os << t.field_val.value(); - - return os; -} - -struct Tree { - Tree(Node&& _node) : node(std::move(_node)) {} - Tree(Tree&& rhs) = default; - - void add_child(Tree&& child) { children.emplace_back(std::move(child)); } - bool empty() const { return node.type == Node::Type::Empty; } - - Node node; - std::vector children; -}; - -inline std::ostream& -operator<<(std::ostream& os, const Tree& tree) -{ - os << '(' << tree.node; - for (const auto& subtree : tree.children) - os << subtree; - os << ')'; - - 
return os; -} - -} // namespace Mu - -#endif /* TREE_HH__ */ diff --git a/lib/mu-xapian.cc b/lib/mu-xapian.cc deleted file mode 100644 index 19b3d3c3..00000000 --- a/lib/mu-xapian.cc +++ /dev/null @@ -1,139 +0,0 @@ -/* -** Copyright (C) 2017-2022 Dirk-Jan C. Binnema -** -** This library is free software; you can redistribute it and/or -** modify it under the terms of the GNU Lesser General Public License -** as published by the Free Software Foundation; either version 2.1 -** of the License, or (at your option) any later version. -** -** This library is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -** Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public -** License along with this library; if not, write to the Free -** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA -** 02110-1301, USA. 
-*/ - -#include - -#include -#include "mu-xapian.hh" -#include - -using namespace Mu; - -static Xapian::Query -xapian_query_op(const Mu::Tree& tree) -{ - if (tree.node.type == Node::Type::OpNot) { // OpNot x ::= AND NOT x - if (tree.children.size() != 1) - throw std::runtime_error("invalid # of children"); - return Xapian::Query(Xapian::Query::OP_AND_NOT, - Xapian::Query::MatchAll, - xapian_query(tree.children.front())); - } - - const auto op = std::invoke([](Node::Type ntype) { -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wswitch-enum" - switch (ntype) { - case Node::Type::OpAnd: - return Xapian::Query::OP_AND; - case Node::Type::OpOr: - return Xapian::Query::OP_OR; - case Node::Type::OpXor: - return Xapian::Query::OP_XOR; - case Node::Type::OpAndNot: - return Xapian::Query::OP_AND_NOT; - case Node::Type::OpNot: - default: - throw Mu::Error(Error::Code::Internal, "invalid op"); // bug - } -#pragma GCC diagnostic pop - }, tree.node.type); - - std::vector childvec; - for (const auto& subtree : tree.children) - childvec.emplace_back(xapian_query(subtree)); - - return Xapian::Query(op, childvec.begin(), childvec.end()); -} - -static Xapian::Query -make_query(const FieldValue& fval, bool maybe_wildcard) -{ - const auto vlen{fval.value().length()}; - if (!maybe_wildcard || vlen <= 1 || fval.value()[vlen - 1] != '*') - return Xapian::Query(fval.field().xapian_term(fval.value())); - else - return Xapian::Query(Xapian::Query::OP_WILDCARD, - fval.field().xapian_term(fval.value().substr(0, vlen - 1))); -} - -static Xapian::Query -xapian_query_value(const Mu::Tree& tree) -{ - // indexable field implies it can be use with a phrase search. 
- const auto& field_val{tree.node.field_val.value()}; - if (!field_val.field().is_indexable_term()) { // - /* not an indexable field; no extra magic needed*/ - return make_query(field_val, true /*maybe-wildcard*/); - } - - const bool is_atomic = tree.node.type == Node::Type::ValueAtomic; - - const auto parts{split(field_val.value(), " ")}; - if (parts.empty()) - return Xapian::Query::MatchNothing; // shouldn't happen - else if (parts.size() == 1 && !is_atomic) - return make_query(field_val, true /*maybe-wildcard*/); - else if (is_atomic) - return make_query(field_val, false /*maybe-wildcard*/); - - std::vector phvec; - for (const auto& p : parts) { - FieldValue fv{field_val.field_id, p}; - phvec.emplace_back(make_query(fv, false /*no wildcards*/)); - } - - return Xapian::Query(Xapian::Query::OP_PHRASE, phvec.begin(), phvec.end()); -} - -static Xapian::Query -xapian_query_range(const Mu::Tree& tree) -{ - const auto& field_val{tree.node.field_val.value()}; - - return Xapian::Query(Xapian::Query::OP_VALUE_RANGE, - field_val.field().value_no(), - field_val.range().first, - field_val.range().second); -} - -Xapian::Query -Mu::xapian_query(const Mu::Tree& tree) -{ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wswitch-enum" - switch (tree.node.type) { - case Node::Type::Empty: - return Xapian::Query(); - case Node::Type::OpNot: - case Node::Type::OpAnd: - case Node::Type::OpOr: - case Node::Type::OpXor: - case Node::Type::OpAndNot: - return xapian_query_op(tree); - case Node::Type::Value: - case Node::Type::ValueAtomic: - return xapian_query_value(tree); - case Node::Type::Range: - return xapian_query_range(tree); - default: - throw Mu::Error(Error::Code::Internal, "invalid query"); // bug - } -#pragma GCC diagnostic pop -} diff --git a/lib/mu-xapian.hh b/lib/mu-xapian.hh deleted file mode 100644 index 54ee006d..00000000 --- a/lib/mu-xapian.hh +++ /dev/null @@ -1,39 +0,0 @@ -/* -** Copyright (C) 2022 Dirk-Jan C. 
Binnema -** -** This library is free software; you can redistribute it and/or -** modify it under the terms of the GNU Lesser General Public License -** as published by the Free Software Foundation; either version 2.1 -** of the License, or (at your option) any later version. -** -** This library is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -** Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public -** License along with this library; if not, write to the Free -** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA -** 02110-1301, USA. -*/ - -#ifndef MU_XAPIAN_HH__ -#define MU_XAPIAN_HH__ - -#include -#include - -namespace Mu { - -/** - * Transform a parse-tree into a Xapian query object - * - * @param tree a parse tree - * - * @return a Xapian query object - */ -Xapian::Query xapian_query(const Mu::Tree& tree); - -} // namespace Mu - -#endif /* MU_XAPIAN_H__ */ diff --git a/lib/tests/meson.build b/lib/tests/meson.build index 17a9b726..5427fbb3 100644 --- a/lib/tests/meson.build +++ b/lib/tests/meson.build @@ -19,42 +19,30 @@ # test('test-maildir', executable('test-maildir', - 'test-mu-maildir.cc', - install: false, - dependencies: [glib_dep, lib_mu_dep])) + 'test-mu-maildir.cc', + install: false, + dependencies: [glib_dep, lib_mu_dep])) test('test-msg', executable('test-msg', - 'test-mu-msg.cc', - install: false, - dependencies: [glib_dep, lib_mu_dep])) + 'test-mu-msg.cc', + install: false, + dependencies: [glib_dep, lib_mu_dep])) test('test-store', executable('test-store', - 'test-mu-store.cc', - install: false, - dependencies: [glib_dep, lib_mu_dep])) + 'test-mu-store.cc', + install: false, + dependencies: [glib_dep, lib_mu_dep])) test('test-query', executable('test-query', - 'test-query.cc', - install: false, - dependencies: [glib_dep, 
gmime_dep, lib_mu_dep])) - -test('test-tokenizer', - executable('test-tokenizer', - 'test-tokenizer.cc', - install: false, - dependencies: [glib_dep, lib_mu_dep])) - -test('test-parser', - executable('test-parser', - 'test-parser.cc', - install: false, - dependencies: [glib_dep, gmime_dep, lib_mu_dep])) + 'test-query.cc', + install: false, + dependencies: [glib_dep, gmime_dep, lib_mu_dep])) test('test-store-query', executable('test-store-query', - 'test-mu-store-query.cc', - install: false, - dependencies: [glib_dep, gmime_dep, lib_mu_dep])) + 'test-mu-store-query.cc', + install: false, + dependencies: [glib_dep, gmime_dep, lib_mu_dep])) # # benchmarks # diff --git a/lib/tests/test-parser.cc b/lib/tests/test-parser.cc deleted file mode 100644 index 4590a28f..00000000 --- a/lib/tests/test-parser.cc +++ /dev/null @@ -1,139 +0,0 @@ -/* -** Copyright (C) 2017-2020 Dirk-Jan C. Binnema -** -** This library is free software; you can redistribute it and/or -** modify it under the terms of the GNU Lesser General Public License -** as published by the Free Software Foundation; either version 2.1 -** of the License, or (at your option) any later version. -** -** This library is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -** Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public -** License along with this library; if not, write to the Free -** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA -** 02110-1301, USA. 
-*/ - -#include -#include - -#include -#include - -#include "utils/mu-test-utils.hh" - -#include "mu-parser.hh" -#include "utils/mu-result.hh" -#include "utils/mu-utils.hh" -using namespace Mu; - -struct Case { - const std::string expr; - const std::string expected; - WarningVec warnings{}; -}; - -using CaseVec = std::vector; - -static void -test_cases(const CaseVec& cases) -{ - char* tmpdir = test_mu_common_get_random_tmpdir(); - g_assert(tmpdir); - auto dummy_store{Store::make_new(tmpdir, "/tmp")}; - assert_valid_result(dummy_store); - - g_free(tmpdir); - - Parser parser{*dummy_store, Parser::Flags::UnitTest}; - - for (const auto& casus : cases) { - WarningVec warnings; - const auto tree = parser.parse(casus.expr, warnings); - - std::stringstream ss; - ss << tree; - - if (g_test_verbose()) { - std::cout << "\n"; - std::cout << casus.expr << std::endl; - std::cout << "exp:" << casus.expected << std::endl; - std::cout << "got:" << ss.str() << std::endl; - } - - assert_equal(casus.expected, ss.str()); - } -} - -static void -test_basic() -{ - CaseVec cases = { - //{ "", R"#((atom :value ""))#"}, - { - "foo", - R"#((value "message-id" "foo"))#", - }, - {"foo or bar", R"#((or(value "message-id" "foo")(value "message-id" "bar")))#"}, - {"foo and bar", R"#((and(value "message-id" "foo")(value "message-id" "bar")))#"}, - }; - - test_cases(cases); -} - -static void -test_complex() -{ - CaseVec cases = { - {"foo and bar or cuux", - R"#((or(and(value "message-id" "foo")(value "message-id" "bar")))#" + - std::string(R"#((value "message-id" "cuux")))#")}, - {"a and not b", R"#((and(value "message-id" "a")(not(value "message-id" "b"))))#"}, - {"a and b and c", - R"#((and(value "message-id" "a")(and(value "message-id" "b")(value "message-id" "c"))))#"}, - {"(a or b) and c", - R"#((and(or(value "message-id" "a")(value "message-id" "b"))(value "message-id" "c")))#"}, - {"a b", // implicit and - R"#((and(value "message-id" "a")(value "message-id" "b")))#"}, - {"a not b", // 
implicit and not - R"#((and(value "message-id" "a")(not(value "message-id" "b"))))#"}, - {"not b", // implicit and not - R"#((not(value "message-id" "b")))#"}}; - - test_cases(cases); -} - -G_GNUC_UNUSED static void -test_range() -{ - CaseVec cases = { - {"range:a..b", // implicit and - R"#((range "range" "a" "b"))#"}, - }; - - test_cases(cases); -} - -static void -test_flatten() -{ - CaseVec cases = {{" Mötørhęåđ", R"#((value "message-id" "motorhead"))#"}}; - - test_cases(cases); -} - -int -main(int argc, char* argv[]) -{ - g_test_init(&argc, &argv, NULL); - - g_test_add_func("/parser/basic", test_basic); - g_test_add_func("/parser/complex", test_complex); - // g_test_add_func ("/parser/range", test_range); - g_test_add_func("/parser/flatten", test_flatten); - - return g_test_run(); -} diff --git a/lib/tests/test-tokenizer.cc b/lib/tests/test-tokenizer.cc deleted file mode 100644 index 6e287f0e..00000000 --- a/lib/tests/test-tokenizer.cc +++ /dev/null @@ -1,147 +0,0 @@ -/* -** Copyright (C) 2017 Dirk-Jan C. Binnema -** -** This library is free software; you can redistribute it and/or -** modify it under the terms of the GNU Lesser General Public License -** as published by the Free Software Foundation; either version 2.1 -** of the License, or (at your option) any later version. -** -** This library is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -** Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public -** License along with this library; if not, write to the Free -** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA -** 02110-1301, USA. 
-*/ - -#include -#include -#include -#include - -#include "mu-tokenizer.hh" - -struct Case { - const char* str; - const Mu::Tokens tokens; -}; - -using CaseVec = std::vector; - -using namespace Mu; -using TT = Token::Type; - -static void -test_cases(const CaseVec& cases) -{ - for (const auto& casus : cases) { - const auto tokens = tokenize(casus.str); - - g_assert_cmpuint((guint)tokens.size(), ==, (guint)casus.tokens.size()); - for (size_t u = 0; u != tokens.size(); ++u) { - if (g_test_verbose()) { - std::cerr << "case " << u << " " << casus.str << std::endl; - std::cerr << "exp: '" << casus.tokens[u] << "'" << std::endl; - std::cerr << "got: '" << tokens[u] << "'" << std::endl; - } - g_assert_true(tokens[u] == casus.tokens[u]); - } - } -} - -static void -test_basic() -{ - CaseVec cases = { - {"", {}}, - - {"foo", Tokens{Token{3, TT::Data, "foo"}}}, - - {"foo bar cuux", - Tokens{Token{3, TT::Data, "foo"}, - Token{7, TT::Data, "bar"}, - Token{12, TT::Data, "cuux"}}}, - - {"\"foo bar\"", Tokens{Token{9, TT::Data, "foo bar"}}}, - - // ie. 
ignore missing closing '"' - {"\"foo bar", Tokens{Token{8, TT::Data, "foo bar"}}}, - - }; - - test_cases(cases); -} - -static void -test_specials() -{ - CaseVec cases = { - {")*(", - Tokens{Token{1, TT::Close, ")"}, Token{2, TT::Data, "*"}, Token{3, TT::Open, "("}}}, - {"\")*(\"", Tokens{Token{5, TT::Data, ")*("}}}, - }; - - test_cases(cases); -} - -static void -test_ops() -{ - CaseVec cases = {{"foo and bar oR cuux XoR fnorb", - Tokens{Token{3, TT::Data, "foo"}, - Token{7, TT::And, "and"}, - Token{11, TT::Data, "bar"}, - Token{14, TT::Or, "oR"}, - Token{19, TT::Data, "cuux"}, - Token{23, TT::Xor, "XoR"}, - Token{29, TT::Data, "fnorb"}}}, - {"NOT (aap or mies)", - Tokens{Token{3, TT::Not, "NOT"}, - Token{5, TT::Open, "("}, - Token{8, TT::Data, "aap"}, - Token{11, TT::Or, "or"}, - Token{16, TT::Data, "mies"}, - Token{17, TT::Close, ")"}}}}; - - test_cases(cases); -} - -static void -test_escape() -{ - CaseVec cases = {{"foo\"bar\"", Tokens{Token{8, TT::Data, "foobar"}}}, - {"\"fnorb\"", Tokens{Token{7, TT::Data, "fnorb"}}}, - {"\\\"fnorb\\\"", Tokens{Token{9, TT::Data, "fnorb"}}}, - {"foo\\\"bar\\\"", Tokens{Token{10, TT::Data, "foobar"}}}}; - - test_cases(cases); -} - -static void -test_to_string() -{ - std::stringstream ss; - for (auto&& t : tokenize("foo and bar xor not cuux or fnorb")) - ss << t << ' '; - - g_assert_true(ss.str() == "3: [foo] 7: [and] 11: [bar] " - "15: [xor] 19: [not] 24: [cuux] " - "27: [or] 33: [fnorb] "); -} - -int -main(int argc, char* argv[]) -{ - g_test_init(&argc, &argv, NULL); - - g_test_add_func("/tokens/basic", test_basic); - g_test_add_func("/tokens/specials", test_specials); - g_test_add_func("/tokens/ops", test_ops); - g_test_add_func("/tokens/escape", test_escape); - g_test_add_func("/tokens/to-string", test_to_string); - - return g_test_run(); -} diff --git a/lib/tokenize.cc b/lib/tokenize.cc deleted file mode 100644 index 96a87087..00000000 --- a/lib/tokenize.cc +++ /dev/null @@ -1,38 +0,0 @@ -/* -** Copyright (C) 2017-2020 
Dirk-Jan C. Binnema -** -** This library is free software; you can redistribute it and/or -** modify it under the terms of the GNU Lesser General Public License -** as published by the Free Software Foundation; either version 2.1 -** of the License, or (at your option) any later version. -** -** This library is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -** Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public -** License along with this library; if not, write to the Free -** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA -** 02110-1301, USA. -*/ - -#include -#include - -#include "mu-tokenizer.hh" - -int -main(int argc, char* argv[]) -{ - std::string s; - - for (auto i = 1; i < argc; ++i) - s += " " + std::string(argv[i]); - - const auto tvec = Mu::tokenize(s); - for (const auto& t : tvec) - std::cout << t << std::endl; - - return 0; -} diff --git a/man/mu-query.7.org b/man/mu-query.7.org index ed527ef1..73858a72 100644 --- a/man/mu-query.7.org +++ b/man/mu-query.7.org @@ -25,8 +25,8 @@ quote any characters that would otherwise be interpreted by the shell, such as * TERMS The basic building blocks of a query are *terms*; these are just normal words like -'banana' or 'hello', or words prefixed with a field-name which make them apply -to just that field. See *mu find* for all the available fields. +'banana' or 'hello', or words prefixed with a field-name which makes them apply +to just that field. See *mu info fields* for all the available fields. Some example queries: #+begin_example @@ -60,9 +60,8 @@ mu find subject:\\"hi there\\" * LOGICAL OPERATORS We can combine terms with logical operators -- binary ones: *and*, *or*, *xor* and the -unary *not*, with the conventional rules for precedence and association, and are -case-insensitive. 
- +unary *not*, with the conventional rules for precedence and association. The +operators are case-insensitive. You can also group things with *(* and *)*, so you can do things like: #+begin_example @@ -86,6 +85,7 @@ Note that a =pure not= - e.g. searching for *not apples* is quite a 'heavy' quer The language supports matching basic PCRE regular expressions, see *pcre(3)*. Regular expressions are enclosed in *//*. Some examples: + #+begin_example subject:/h.llo/ # match hallo, hello, ... subject:/ @@ -96,10 +96,10 @@ matches messages in the '/foo' maildir, while the latter matches all messages in all maildirs that match 'foo', such as '/foo', '/bar/cuux/foo', '/fooishbar' etc. -Wildcards are an older mechanism for matching where a term with a rightmost *** +Wildcards are another mechanism for matching where a term with a rightmost *** (and =only= in that position) matches any term that starts with the part before -the ***; they are supported for backward compatibility and *mu* translates them to -regular expressions internally: +the ***; they are therefore less powerful than regular expressions, but also much +faster: #+begin_example foo* #+end_example @@ -108,8 +108,7 @@ is equivalent to /foo.*/ #+end_example -As a note of caution, certain wild-cards and regular expression can take quite a -bit longer than 'normal' queries. +Regular expressions can be useful, but are relatively slow. * FIELDS @@ -143,8 +142,8 @@ full table with all details, including single-char shortcuts, try the command: | to | | Message recipient | |------------+-----------+--------------------------------| -(*) The language code for the text-body if found. This works only -if ~mu~ was built with CLD2 support. +(*) The language code for the text-body if found. This works only if ~mu~ was +built with CLD2 support. 
There are also the special fields *contact:*, which matches all contact-fields (=from=, =to=, =cc= and =bcc=), and *recip*, which matches all recipient-fields (=to=, =cc= @@ -167,12 +166,12 @@ separated by *..*. Either lower or upper (but not both) can be omitted to create an open range. Dates are expressed in local time and using ISO-8601 format (YYYY-MM-DD -HH:MM:SS); you can leave out the right part, and *mu* adds the rest, depending on +HH:MM:SS); you can leave out the right part and *mu* adds the rest, depending on whether this is the beginning or end of the range (e.g., as a lower bound, '2015' would be interpreted as the start of that year; as an upper bound as the end of the year). -You can use '/' , '.', '-' and 'T' to make dates more human readable. +You can use '/' , '.', '-', ':' and 'T' to make dates more human-readable. Some examples: #+begin_example @@ -274,6 +273,9 @@ Note that from the command-line, such queries must be quoted: mu find 'maildir:"/Sent Items"' #+end_example +Also note that you should *not* end the maildir with a ~/~, or it can be +misinterpreted as a regular expression term; see the description of regular expressions above. + * MORE EXAMPLES Here are some simple examples of *mu* queries; you can make many more complicated @@ -321,16 +323,25 @@ Find all messages written in Dutch or German with the word 'hallo': hallo and (lang:nl or lang:de) #+end_example +* ANALYZING QUERIES -* CAVEATS +Despite all the documentation, in some cases it can be non-obvious how ~mu~ +interprets a certain query. For that, you can ask ~mu~ to analyze the query -- +that is, show how ~mu~ interprets the query. -With current Xapian versions, the apostroph character is considered part of a -word. Thus, you cannot find =D'Artagnan= by searching for =Artagnan=. So, include -the apostrophe in search or use a regexp search. +This uses the ~--analyze~ option to *mu find*. +#+begin_example +$ mu find subject:wombat AND date:3m.. size:..2000 --analyze +* query: + subject:wombat AND date:3m..
size:..2000 +* parsed query: + (and (subject "wombat") (date (range "2023-05-30T06:10:09Z" "")) (size (range "" "2000"))) +* Xapian query: + Query((Swombat AND VALUE_GE 4 n64759341 AND VALUE_LE 17 i7d0)) +#+end_example -Matching on spaces has changed compared to the old query-parser; this applies -e.g. to Maildirs that have spaces in their name, such as =Sent Items=. See *MAILDIR* -above. +The ~parsed query~ is usually the most interesting one to understand what's +happening. #+include: "prefooter.inc" :minlevel 1