/* ** Copyright (C) 2023 Dirk-Jan C. Binnema ** ** This program is free software; you can redistribute it and/or modify it ** under the terms of the GNU General Public License as published by the ** Free Software Foundation; either version 3, or (at your option) any ** later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software Foundation, ** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ** */ #include "mu-query-parser.hh" #include #include #include #include #include "utils/mu-option.hh" #include #include "utils/mu-utils-file.hh" using namespace Mu; /** * An 'Element' here is a rather rich version of what is traditionally * considered a (lexical) token. * * We try to determine as much as possible during the analysis phase; which is * quite a bit (given the fairly simple query language), and the parsing phase * only has to deal with the putting these elements in a tree. * * During analysis: * 1) separate the query into a sequence strings * 2) for each of these strings * - Does it look like an Op? ('or', 'and' etc.) --> Op * - Otherwise: treat as a Basic field ([field]:value) * - Whitespace in value? -> promote to Phrase * - otherwise: * - Is value a regex (in //) -> promote to Regex * - Is value a wildcard (ends in '*') -> promote to Wildcard * - is value a range (a..b) -> promote to Range * * After analysis, we have the sequence of element as a Sexp, which can then be * fed to the parser. We attempt to make the Sexp as human-readable as possible. */ struct Element { enum struct Bracket { Open, Close} ; enum struct Op { And, Or, Xor, Not, AndNot }; template struct FieldValue { FieldValue(const ValueType& v): field{}, value{v}{} template FieldValue(const StringType& fname, const ValueType& v): field{std::string{fname}}, value{v}{} template FieldValue(const Option& fname, const ValueType& v) { if (fname) field = std::string{*fname}; value = v; } Option field{}; ValueType value{}; }; struct Basic: public FieldValue {using FieldValue::FieldValue;}; struct Phrase: public FieldValue {using FieldValue::FieldValue;}; struct Regex: public FieldValue {using FieldValue::FieldValue;}; struct Wildcard: public FieldValue {using FieldValue::FieldValue;}; struct Range: public FieldValue> { using FieldValue::FieldValue; }; using ValueType = std::variant< /* */ Bracket, /* op */ Op, /* string values */ std::string, /* value types */ Basic, Phrase, Regex, Wildcard, Range >; // helper template struct decay_equiv: std::is_same::type, U>::type {}; Element(Bracket b): value{b} {} Element(Op op): value{op} {} template, T>::value>::type = 0> Element(const std::string& field, const T& val): value{T{field, val}} {} Element(const std::string& val): value{val} {} template Option get_opt() { if (std::holds_alternative(value)) return std::get(value); else return Nothing; } Sexp sexp() const { return std::visit([](auto&& arg)->Sexp { auto field_sym = [](const Option& field) { return field ? Sexp::Symbol{*field} : placeholder_sym; }; using T = std::decay_t; if constexpr (std::is_same_v) { switch(arg) { case Bracket::Open: return open_sym; case Bracket::Close: return close_sym; default: throw std::logic_error("invalid bracket type"); } } else if constexpr (std::is_same_v) { switch(arg) { case Op::And: return and_sym; case Op::Or: return or_sym; case Op::Xor: return xor_sym; case Op::Not: return not_sym; case Op::AndNot: return and_not_sym; default: throw std::logic_error("invalid op type"); } } else if constexpr (std::is_same_v) { return Sexp { field_sym(arg.field), arg.value }; } else if constexpr (std::is_same_v) { return Sexp {field_sym(arg.field), Sexp{ phrase_sym, arg.value }}; } else if constexpr (std::is_same_v) { return Sexp { field_sym(arg.field), Sexp{ regex_sym, arg.value}}; } else if constexpr (std::is_same_v) { return Sexp { field_sym(arg.field), Sexp{ wildcard_sym, arg.value}}; } else if constexpr (std::is_same_v) { return Sexp {field_sym(arg.field), Sexp{ range_sym, arg.value.first, arg.value.second }}; } else if constexpr (std::is_same_v) { throw std::logic_error("no bare strings should be here"); } else throw std::logic_error("uninvited visitor"); }, value); } ValueType value; }; using Elements = std::vector; /** * Remove first character from string and return it. * * @param[in,out] str a string * @param[in,out] pos position in _original_ string * * @return a char or 0 if there is none. */ static char read_char(std::string& str, size_t& pos) { if (str.empty()) return {}; auto kar{str.at(0)}; str.erase(0, 1); ++pos; return kar; } /** * Restore kar at the beginning of the string * * @param[in,out] str a string * @param[in,out] pos position in _original_ string * @param kar a character */ static void unread_char(std::string& str, size_t& pos, char kar) { str = kar + str; --pos; } /** * Remove the the next element from the string and return it * * @param[in,out] str a string * @param[in,out] pos position in _original_ string * * * @return an Element or Nothing */ static Option next_element(std::string& str, size_t& pos) { bool quoted{}, escaped{}; std::string value{}; auto is_separator = [](char c) { return c == ' '|| c == '(' || c == ')'; }; while (!str.empty()) { auto kar = read_char(str, pos); if (kar == '\\') { escaped = !escaped; if (escaped) continue; } if (kar == '"' && !escaped) { if (!escaped && quoted) return Element{value}; else { quoted = true; continue; } } if (!quoted && !escaped && is_separator(kar)) { if (!value.empty()) { unread_char(str, pos, kar); return Element{value}; } if (quoted || kar == ' ') continue; switch (kar) { case '(': return Element{Element::Bracket::Open}; case ')': return Element{Element::Bracket::Close}; default: break; } } value += kar; escaped = false; } if (value.empty()) return Nothing; else return Element{value}; } static Option opify(Element&& element) { auto&& str{element.get_opt()}; if (!str) return element; static const std::unordered_map ops = { { "and", Element::Op::And }, { "or", Element::Op::Or}, { "xor", Element::Op::Xor }, { "not", Element::Op::Not }, // AndNot only appears during parsing. }; if (auto&& it = ops.find(utf8_flatten(*str)); it != ops.end()) element.value = it->second; return element; } static Option basify(Element&& element) { auto&& str{element.get_opt()}; if (!str) return element; const auto pos = str->find(':'); if (pos == std::string::npos) { element.value = Element::Basic{*str}; return element; } const auto fname{str->substr(0, pos)}; if (auto&& field{field_from_name(fname)}; field) { auto val{str->substr(pos + 1)}; if (field == Field::Id::Flags) { if (auto&& finfo{flag_info(val)}; finfo) element.value = Element::Basic{field->name, std::string{finfo->name}}; else Element::Basic{*str}; } else if (field == Field::Id::Priority) { if (auto&& prio{priority_from_name(val)}; prio) element.value = Element::Basic{field->name, std::string{priority_name(*prio)}}; else element.value = Element::Basic{*str}; } else element.value = Element::Basic{std::string{field->name}, str->substr(pos + 1)}; } else if (field_is_combi(fname)) element.value = Element::Basic{fname, str->substr(pos +1)}; else element.value = Element::Basic{*str}; return element; } static Option phrasify(Element&& element) { auto&& basic{element.get_opt()}; if (!basic) return element; auto&& val{basic->value}; if (val.find(' ') != std::string::npos) element.value = Element::Phrase{basic->field, val}; return element; } static Option wildcardify(Element&& element) { auto&& basic{element.get_opt()}; if (!basic) return element; auto&& val{basic->value}; if (val.size() < 2 || val[val.size()-1] != '*') return element; val.erase(val.size() - 1); element.value = Element::Wildcard{basic->field, val}; return element; } static Option regexpify(Element&& element) { auto&& str{element.get_opt()}; if (!str) return element; auto&& val{str->value}; if (val.size() < 3 || val[0] != '/' || val[val.size()-1] != '/') return element; val.erase(val.size() - 1); val.erase(0, 1); element.value = Element::Regex{str->field, std::move(val)}; return element; } // handle range-fields: Size, Date, Changed static Option rangify(Element&& element) { auto&& str{element.get_opt()}; if (!str) return element; if (!str->field) return element; auto&& field = field_from_name(*str->field); if (!field || !field->is_range()) return element; /* yes: get the range */ auto&& range = std::invoke([&]()->std::pair { const auto val{str->value}; const auto pos{val.find("..")}; if (pos == std::string::npos) return { val, val }; else return {val.substr(0, pos), val.substr(pos + 2)}; }); if (field->id == Field::Id::Size) { int64_t s1{range.first.empty() ? -1 : parse_size(range.first, false/*first*/).value_or(-1)}; int64_t s2{range.second.empty() ? -1 : parse_size(range.second, true/*last*/).value_or(-1)}; if (s2 >= 0 && s1 > s2) std::swap(s1, s2); element.value = Element::Range{str->field, {s1 < 0 ? "" : std::to_string(s1), s2 < 0 ? "" : std::to_string(s2)}}; } else if (field->id == Field::Id::Date || field->id == Field::Id::Changed) { auto tstamp=[](auto&& str, auto&& first)->int64_t { return str.empty() ? -1 : parse_date_time(str, first ,false/*local*/).value_or(-1); }; int64_t lower{tstamp(range.first, true/*lower*/)}; int64_t upper{tstamp(range.second, false/*upper*/)}; if (lower >= 0 && upper >= 0 && lower > upper) { // can't simply swap due to rounding up/down lower = tstamp(range.second, true/*lower*/); upper = tstamp(range.first, false/*upper*/); } // use "Zulu" time. element.value = Element::Range{ str->field, {lower < 0 ? "" : mu_format("{:%FT%TZ}",mu_time(lower, true/*utc*/)), upper < 0 ? "" : mu_format("{:%FT%TZ}", mu_time(upper, true/*utc*/))}}; } return element; } static Elements process(const std::string& expr) { Elements elements{}; size_t offset{0}; /* all control chars become SPC */ std::string str{expr}; for (auto& c: str) c = ::iscntrl(c) ? ' ' : c; while(!str.empty()) { auto&& element = next_element(str, offset) .and_then(opify) .and_then(basify) .and_then(regexpify) .and_then(phrasify) .and_then(wildcardify) .and_then(rangify); if (element) elements.emplace_back(std::move(element.value())); } return elements; } Sexp Mu::process_query(const std::string& expr) { const auto& elements{::process(expr)}; Sexp sexp{}; for (auto&& elm: elements) sexp.add(elm.sexp()); return sexp; } #ifdef BUILD_PROCESS_QUERY int main (int argc, char *argv[]) { if (argc < 2) { mu_printerrln("expected: process-query "); return 1; } std::string expr; for (auto i = 1; i < argc; ++i) { expr += argv[i]; expr += " "; } auto sexp = process_query(expr); mu_println("{}", sexp.to_string()); return 0; } #endif /*BUILD_ANALYZE_QUERY*/ #if BUILD_TESTS /* * Tests. * */ #include "utils/mu-test-utils.hh" using TestCase = std::pair; static void test_processor() { std::vector cases = { TestCase{R"(hello world)", R"(((_ "hello") (_ "world")))"}, TestCase{R"("hello world")", R"(((_ (phrase "hello world"))))"}, TestCase{R"(subject:"hello world")", R"(((subject (phrase "hello world"))))"}, // TODO: add more... }; for (auto&& test: cases) { auto&& sexp{process_query(test.first)}; assert_equal(sexp.to_string(), test.second); } } int main(int argc, char* argv[]) { mu_test_init(&argc, &argv); g_test_add_func("/query-parser/processor", test_processor); return g_test_run(); } #endif /*BUILD_TESTS*/