/*
** Copyright (C) 2020 Dirk-Jan C. Binnema
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/

#include "mu-parser.hh"

#include <algorithm>
#include <iostream>
#include <limits>
#include <regex>

#include "mu-tokenizer.hh"
#include "utils/mu-utils.hh"
#include "utils/mu-error.hh"
#include "message/mu-message.hh"

using namespace Mu;

// 3 precedence levels: units (NOT,()) > factors (AND) > terms (OR|XOR)

// query      -> <term-1> | ε
// <term-1>   -> <factor-1> <term-2> | ε
// <term-2>   -> OR|XOR <term-1> | ε
// <factor-1> -> <unit> <factor-2> | ε
// <factor-2> -> [AND]|AND NOT <factor-1> | ε
// <unit>     -> NOT <unit> | ( <term-1> ) | <data>
// <data>     -> <value> | <range> | <regex>
// <value>    -> [field:]value
// <range>    -> [field:][lower]..[upper]
// <regex>    -> [field:]/regex/

/**
 * Create an internal Mu::Error; the message is prefixed with the source line
 * at which the macro is expanded.
 */
#define BUG(...)                                                                                   \
	Mu::Error(Error::Code::Internal, format("%u: BUG: ", __LINE__) + format(__VA_ARGS__))

/**
 * Description of one searchable field, as filled in by add_field():
 *  - field:           the field's name
 *  - prefix:          the result of the field's xapian_term()
 *  - supports_phrase: the result of the field's is_indexable_term()
 *  - id:              the field's id
 */
struct FieldInfo {
	const std::string field;
	const std::string prefix;
	bool              supports_phrase;
	Field::Id         id;
};
using FieldInfoVec = std::vector<FieldInfo>;

/**
 * Parser implementation: a recursive-descent parser over the token list,
 * with one member function per grammar rule (see the grammar above).
 */
struct Parser::Private {
	Private(const Store& store, Parser::Flags flags) : store_{store}, flags_{flags} {}

	std::vector<std::string> process_regex(const std::string& field,
					       const std::regex&  rx) const;

	Mu::Tree term_1(Mu::Tokens& tokens, WarningVec& warnings) const;
	Mu::Tree term_2(Mu::Tokens& tokens, Node::Type& op, WarningVec& warnings) const;
	Mu::Tree factor_1(Mu::Tokens& tokens, WarningVec& warnings) const;
	Mu::Tree factor_2(Mu::Tokens& tokens, Node::Type& op, WarningVec& warnings) const;
	Mu::Tree unit(Mu::Tokens& tokens, WarningVec& warnings) const;
	Mu::Tree data(Mu::Tokens& tokens, WarningVec& warnings) const;
	Mu::Tree range(const FieldInfoVec& fields,
		       const std::string&  lower,
		       const std::string&  upper,
		       size_t              pos,
		       WarningVec&         warnings) const;
	Mu::Tree regex(const FieldInfoVec& fields,
		       const std::string&  v,
		       size_t              pos,
		       WarningVec&         warnings) const;
	Mu::Tree value(const FieldInfoVec& fields,
		       const std::string&  v,
		       size_t              pos,
		       WarningVec&         warnings) const;

private:
	const Store&        store_;
	const Parser::Flags flags_;
};

/**
 * Normalize a search value for the given field.
 *
 * For the priority field, a non-empty value is reduced to its first
 * character; for the flags field, a value recognized by flag_info() is
 * replaced by that flag's lower-case shortcut character. Anything else is
 * returned unchanged.
 *
 * @param field name of the field
 * @param value the value as given in the query
 *
 * @return the processed value
 */
static std::string
process_value(const std::string& field, const std::string& value)
{
	const auto id_opt{field_from_name(field)};
	if (id_opt) {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wswitch-enum"
		switch (id_opt->id) {
		case Field::Id::Priority: {
			if (!value.empty())
				return std::string(1, value[0]);
		} break;
		case Field::Id::Flags:
			if (const auto info{flag_info(value)}; info)
				return std::string(1, info->shortcut_lower());
			break;
		default:
			break;
		}
#pragma GCC diagnostic pop
	}

	return value; // XXX prio/flags, etc. alias
}

/**
 * Append the FieldInfo for field_id to fields, unless the field has no
 * shortcut (such fields can't be searched).
 */
static void
add_field(std::vector<FieldInfo>& fields, Field::Id field_id)
{
	const auto field{field_from_id(field_id)};
	if (!field.shortcut)
		return; // can't be searched

	fields.emplace_back(FieldInfo{std::string{field.name},
				      field.xapian_term(),
				      field.is_indexable_term(),
				      field_id});
}

/**
 * Get the fields to search for the given fieldstr, or empty if there is none.
 *
 *  - with Parser::Flags::UnitTest set: always the message-id field only
 *  - "recip":   to, cc, bcc
 *  - "contact": to, cc, bcc, from
 *  - "" (no field given): to, cc, bcc, from, subject, body-text
 *  - otherwise: the single field that field_from_name() finds, if any
 *
 * @param field_str a fieldstr, e.g. "subject" or "s" for the subject field
 * @param flags parser flags
 *
 * @return a vector with the matching field infos; empty for an unknown field
 */
static std::vector<FieldInfo>
process_field(const std::string& field_str, Parser::Flags flags)
{
	std::vector<FieldInfo> fields;
	if (any_of(flags & Parser::Flags::UnitTest)) {
		add_field(fields, Field::Id::MessageId);
		return fields;
	}

	if (field_str == "contact" || field_str == "recip") { // multi fields
		add_field(fields, Field::Id::To);
		add_field(fields, Field::Id::Cc);
		add_field(fields, Field::Id::Bcc);
		if (field_str == "contact")
			add_field(fields, Field::Id::From);
	} else if (field_str.empty()) {
		add_field(fields, Field::Id::To);
		add_field(fields, Field::Id::Cc);
		add_field(fields, Field::Id::Bcc);
		add_field(fields, Field::Id::From);
		add_field(fields, Field::Id::Subject);
		add_field(fields, Field::Id::BodyText);
	} else if (const auto field_opt{field_from_name(field_str)}; field_opt)
		add_field(fields, field_opt->id);

	return fields;
}

/**
 * Is field_str the name of a known field for which is_range() holds?
 */
static bool
is_range_field(const std::string& field_str)
{
	if (const auto field_opt{field_from_name(field_str)}; !field_opt)
		return false;
	else
		return field_opt->is_range();
}

/** The (processed) lower and upper bound of a range query. */
struct MyRange {
	std::string lower;
	std::string upper;
};

/**
 * Turn the textual bounds of a range into their searchable form.
 *
 * For the date and changed fields, the bounds are parsed with
 * parse_date_time(); for the size field with parse_size(). In both cases the
 * result is passed through to_lexnum(); a bound that cannot be parsed becomes
 * 0 (lower) or the maximum value (upper). For all other fields, and for
 * unknown fields, the bounds are returned unchanged.
 */
static MyRange
process_range(const std::string& field_str, const std::string& lower, const std::string& upper)
{
	const auto field_opt{field_from_name(field_str)};
	if (!field_opt)
		return {lower, upper};

	std::string l2 = lower;
	std::string u2 = upper;
	// NOTE(review): the template argument was lost in this copy of the file;
	// int64_t is assumed -- confirm against the types returned by
	// parse_date_time() / parse_size().
	constexpr auto upper_limit = std::numeric_limits<int64_t>::max();

	if (field_opt->id == Field::Id::Date || field_opt->id == Field::Id::Changed) {
		l2 = to_lexnum(parse_date_time(lower, true).value_or(0));
		u2 = to_lexnum(parse_date_time(upper, false).value_or(upper_limit));
	} else if (field_opt->id == Field::Id::Size) {
		l2 = to_lexnum(parse_size(lower, true).value_or(0));
		u2 = to_lexnum(parse_size(upper, false).value_or(upper_limit));
	}

	return {l2, u2};
}

/**
 * Collect the terms the store has for the given field that match a regular
 * expression.
 *
 * @param field_str name of the field
 * @param rx the regular expression to search for in each term
 *
 * @return the matching terms with their first character (the Xapian prefix)
 * removed; empty if the field is unknown.
 */
std::vector<std::string>
Parser::Private::process_regex(const std::string& field_str, const std::regex& rx) const
{
	const auto field_opt{field_from_name(field_str)};
	if (!field_opt)
		return {};

	std::vector<std::string> terms;
	store_.for_each_term(field_opt->id, [&](auto&& str) {
		const char* raw{str.c_str()};
		if (*raw == '\0')
			return true; // empty term: no prefix to strip, nothing to match

		const char* val{raw + 1}; // strip off the Xapian prefix.
		if (std::regex_search(val, rx))
			terms.emplace_back(val);

		return true; // presumably: continue with the next term -- TODO confirm
	});

	return terms;
}

/** Get a copy of the first token; the token list must not be empty. */
static Token
look_ahead(const Mu::Tokens& tokens)
{
	return tokens.front();
}

/** Get a tree consisting of a single Empty node. */
static Mu::Tree
empty()
{
	return {{Node::Type::Empty}};
}

/**
 * Build the tree for a plain value.
 *
 * The value is passed through utf8_flatten() and then, per field, through
 * process_value(). For a single field the result is a Value node; for
 * multiple fields, an OpOr node with one Value child per field.
 *
 * @param fields the fields to search; must not be empty
 * @param v the value
 * @param pos position of the token (currently unused)
 * @param warnings warnings (currently unused)
 *
 * @return the tree; throws an internal error (BUG) if fields is empty.
 */
Mu::Tree
Parser::Private::value(const FieldInfoVec& fields,
		       const std::string&  v,
		       size_t              pos,
		       WarningVec&         warnings) const
{
	const auto val = utf8_flatten(v);

	if (fields.empty())
		throw BUG("expected one or more fields");

	if (fields.size() == 1) {
		const auto& item = fields.front();
		return Tree({Node::Type::Value,
			     FieldValue{item.id, process_value(item.field, val)}});
	}

	// a 'multi-field' such as "recip:"
	Tree tree(Node{Node::Type::OpOr});
	for (const auto& item : fields)
		tree.add_child(Tree({Node::Type::Value,
				     FieldValue{item.id, process_value(item.field, val)}}));
	return tree;
}

/**
 * Build the tree for a regular-expression query.
 *
 * The first and last character of v (the delimiters) are dropped and the rest
 * is passed through utf8_flatten() and compiled. Each term that
 * process_regex() finds for each field becomes a Value child of an OpOr node.
 *
 * @param fields the fields to search
 * @param v the delimited regular expression; at least two characters
 * @param pos position of the token, used for warnings
 * @param warnings receives "invalid regexp" when the fallback is taken
 *
 * @return the OpOr tree, or an Empty tree when nothing matched. If anything
 * throws while compiling or matching, a warning is added and v (including its
 * delimiters) is treated as a plain value instead. Throws an internal error
 * (BUG) if v is shorter than two characters.
 */
Mu::Tree
Parser::Private::regex(const FieldInfoVec& fields,
		       const std::string&  v,
		       size_t              pos,
		       WarningVec&         warnings) const
{
	if (v.length() < 2)
		throw BUG("expected regexp, got '%s'", v.c_str());

	const auto rxstr = utf8_flatten(v.substr(1, v.length() - 2));

	try {
		Tree       tree(Node{Node::Type::OpOr});
		const auto rx = std::regex(rxstr);
		for (const auto& field : fields) {
			const auto terms = process_regex(field.field, rx);
			for (const auto& term : terms) {
				tree.add_child(Tree({Node::Type::Value,
						     FieldValue{field.id, term}}));
			}
		}

		if (tree.children.empty())
			return empty();
		else
			return tree;

	} catch (...) {
		// fallback: deliberately best-effort, treat it as a normal value.
		warnings.push_back({pos, "invalid regexp"});
		return value(fields, v, pos, warnings);
	}
}

/**
 * Build the tree for a range query.
 *
 * Only the first of the fields is considered. If that is not a range field,
 * "lower..upper" is treated as a plain value for all fields. Otherwise the
 * bounds are processed with process_range(); when the processed lower bound
 * compares greater than the upper one, the bounds are swapped.
 *
 * @param fields the fields to search; must not be empty
 * @param lower lower bound as given in the query
 * @param upper upper bound as given in the query
 * @param pos position of the token
 * @param warnings warnings
 *
 * @return the tree; throws an internal error (BUG) if fields is empty.
 */
Mu::Tree
Parser::Private::range(const FieldInfoVec& fields,
		       const std::string&  lower,
		       const std::string&  upper,
		       size_t              pos,
		       WarningVec&         warnings) const
{
	if (fields.empty())
		throw BUG("expected field");

	const auto& field = fields.front();
	if (!is_range_field(field.field))
		return value(fields, lower + ".." + upper, pos, warnings);

	auto prange = process_range(field.field, lower, upper);
	if (prange.lower > prange.upper)
		prange = process_range(field.field, upper, lower);

	return Tree({Node::Type::Range, FieldValue{field.id, prange.lower, prange.upper}});
}

/**
 * Parse a <data> element: consume one token and turn it into a value, range
 * or regex tree.
 *
 * The token is split into field and value at the first ':' unless that colon
 * is the first or last character. An unknown field yields a warning, after
 * which the whole "field:value" is searched as a value in the default fields.
 *
 * @param tokens the remaining tokens; must not be empty. The first token is
 * consumed.
 * @param warnings receives warnings for unexpected token types and invalid
 * fields
 *
 * @return the tree for the token
 */
Mu::Tree
Parser::Private::data(Mu::Tokens& tokens, WarningVec& warnings) const
{
	const auto token = look_ahead(tokens);
	if (token.type != Token::Type::Data)
		warnings.push_back({token.pos, "expected: value"});

	tokens.pop_front();

	std::string field, val;
	const auto  col = token.str.find(':');
	if (col != 0 && col != std::string::npos && col != token.str.length() - 1) {
		field = token.str.substr(0, col);
		val   = token.str.substr(col + 1);
	} else
		val = token.str;

	auto fields = process_field(field, flags_);
	if (fields.empty()) { // not valid field...
		warnings.push_back({token.pos, format("invalid field '%s'", field.c_str())});
		fields = process_field("", flags_);
		// fallback, treat the whole of foo:bar as a value
		return value(fields, field + ":" + val, token.pos, warnings);
	}

	// does it look like a regexp?
	if (val.length() >= 2)
		if (val[0] == '/' && val[val.length() - 1] == '/')
			return regex(fields, val, token.pos, warnings);

	// does it look like a range?
	const auto dotdot = val.find("..");
	if (dotdot != std::string::npos)
		return range(fields,
			     val.substr(0, dotdot),
			     val.substr(dotdot + 2),
			     token.pos,
			     warnings);
	else if (is_range_field(fields.front().field)) {
		// range field without a range - treat as field:val..val
		return range(fields, val, val, token.pos, warnings);
	}

	// if nothing else, it's a value.
	return value(fields, val, token.pos, warnings);
}

/**
 * Parse a <unit>: a negated unit, a parenthesized <term-1>, or <data>.
 *
 * @param tokens the remaining tokens; consumed as they are parsed
 * @param warnings receives warnings, e.g. when there are no tokens left or a
 * closing parenthesis is missing
 *
 * @return the tree; an Empty tree when there are no tokens
 */
Mu::Tree
Parser::Private::unit(Mu::Tokens& tokens, WarningVec& warnings) const
{
	if (tokens.empty()) {
		warnings.push_back({0, "expected: unit"});
		return empty();
	}

	const auto token = look_ahead(tokens);

	if (token.type == Token::Type::Not) {
		tokens.pop_front();
		Tree tree{{Node::Type::OpNot}};
		tree.add_child(unit(tokens, warnings));
		return tree;
	}

	if (token.type == Token::Type::Open) {
		tokens.pop_front();
		auto tree = term_1(tokens, warnings);
		if (tokens.empty())
			warnings.push_back({token.pos, "expected: ')'"});
		else {
			const auto token2 = look_ahead(tokens);
			if (token2.type == Token::Type::Close)
				tokens.pop_front();
			else {
				// the unexpected token is left in place, not consumed
				warnings.push_back(
				    {token2.pos,
				     std::string("expected: ')' but got ") + token2.str});
			}
		}
		return tree;
	}

	return data(tokens, warnings);
}

/**
 * Parse a <factor-2>: the optional right-hand side of an AND.
 *
 * An explicit AND token is consumed; an Open, Data or Not token means an
 * implicit AND and is left for factor_1() to parse.
 *
 * @param tokens the remaining tokens
 * @param op set to OpAnd when a right-hand side follows; otherwise untouched
 * @param warnings warnings
 *
 * @return the right-hand side, or an Empty tree if there is none
 */
Mu::Tree
Parser::Private::factor_2(Mu::Tokens& tokens, Node::Type& op, WarningVec& warnings) const
{
	if (tokens.empty())
		return empty();

	const auto token = look_ahead(tokens);

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wswitch-enum"
	switch (token.type) {
	case Token::Type::And: {
		tokens.pop_front();
		op = Node::Type::OpAnd;
	} break;

	case Token::Type::Open:
	case Token::Type::Data:
	case Token::Type::Not:
		op = Node::Type::OpAnd; // implicit AND
		break;

	default:
		return empty();
	}
#pragma GCC diagnostic pop

	return factor_1(tokens, warnings);
}

/**
 * Parse a <factor-1>: a unit, optionally AND-ed with a following factor.
 *
 * @return the unit itself when nothing follows; otherwise a node with the
 * operator factor_2() determined and the two sides as its children
 */
Mu::Tree
Parser::Private::factor_1(Mu::Tokens& tokens, WarningVec& warnings) const
{
	Node::Type op{Node::Type::Invalid};

	auto t  = unit(tokens, warnings);
	auto a2 = factor_2(tokens, op, warnings);

	if (a2.empty())
		return t;

	Tree tree{{op}};
	tree.add_child(std::move(t));
	tree.add_child(std::move(a2));

	return tree;
}

/**
 * Parse a <term-2>: the optional right-hand side of an OR or XOR.
 *
 * @param tokens the remaining tokens; an OR / XOR token is consumed
 * @param op set to OpOr or OpXor when a right-hand side follows; otherwise
 * untouched
 * @param warnings receives "expected OR|XOR" for any other token except a
 * closing parenthesis
 *
 * @return the right-hand side, or an Empty tree if there is none
 */
Mu::Tree
Parser::Private::term_2(Mu::Tokens& tokens, Node::Type& op, WarningVec& warnings) const
{
	if (tokens.empty())
		return empty();

	const auto token = look_ahead(tokens);

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wswitch-enum"
	switch (token.type) {
	case Token::Type::Or:
		op = Node::Type::OpOr;
		break;
	case Token::Type::Xor:
		op = Node::Type::OpXor;
		break;
	default:
		if (token.type != Token::Type::Close)
			warnings.push_back({token.pos, "expected OR|XOR"});
		return empty();
	}
#pragma GCC diagnostic pop

	tokens.pop_front();

	return term_1(tokens, warnings);
}

/**
 * Parse a <term-1>: a factor, optionally OR-ed / XOR-ed with a following
 * term.
 *
 * @return the factor itself when nothing follows; otherwise a node with the
 * operator term_2() determined and the two sides as its children
 */
Mu::Tree
Parser::Private::term_1(Mu::Tokens& tokens, WarningVec& warnings) const
{
	Node::Type op{Node::Type::Invalid};

	auto t  = factor_1(tokens, warnings);
	auto o2 = term_2(tokens, op, warnings);

	if (o2.empty())
		return t;
	else {
		Tree tree{{op}};
		tree.add_child(std::move(t));
		tree.add_child(std::move(o2));
		return tree;
	}
}

Mu::Parser::Parser(const Store& store, Parser::Flags flags)
    : priv_{std::make_unique<Private>(store, flags)}
{
}

Mu::Parser::~Parser() = default;

/**
 * Parse a query expression into a tree.
 *
 * @param expr the expression
 * @param warnings receives the warnings found while parsing
 *
 * @return the parse tree; an Empty tree if the expression yields no tokens or
 * if a std::runtime_error is thrown (its message is written to std::cerr).
 */
Mu::Tree
Mu::Parser::parse(const std::string& expr, WarningVec& warnings) const
{
	try {
		auto tokens = tokenize(expr);
		if (tokens.empty())
			return empty();
		else
			return priv_->term_1(tokens, warnings);

	} catch (const std::runtime_error& ex) {
		std::cerr << ex.what() << std::endl;
		return empty();
	}
}