diff --git a/NEWS.org b/NEWS.org index 4a6b67f7..cc554dec 100644 --- a/NEWS.org +++ b/NEWS.org @@ -40,8 +40,14 @@ rather than 250000; the latter was too high for systems with limited memory. You can of course change that with ~--batch-size=...~ - - restore expansion for path options such as ~--maildir=~/Maildir~ for shells - that don't do that, such as Bash. + - restore expansion for path options such as ~--maildir=~/Maildir~ (to e.g. + ~/home/user/Maildir~) for shells that don't do that, such as Bash. + + - updated query-parser; this is (should be) compatible with the older one, + apart from a number of fixes. There is a new option ~--analyze~ to ~mu find~ + which shows the parsed query in a human-readable s-expression form; this + can be used to debug your queries (this replaces the older + ~--format=mquery|xquery~) *** mu4e diff --git a/lib/index/mu-indexer.cc b/lib/index/mu-indexer.cc index 4d257f7d..6656f82b 100644 --- a/lib/index/mu-indexer.cc +++ b/lib/index/mu-indexer.cc @@ -87,10 +87,11 @@ struct Indexer::Private { was_empty_{store.empty()} { mu_message("created indexer for {} -> " - "{} (batch-size: {}; was-empty: {})", + "{} (batch-size: {}; was-empty: {}; ngrams: {})", store.root_maildir(), store.path(), store.config().get(), - was_empty_); + was_empty_, + store.config().get()); } ~Private() { @@ -238,7 +239,7 @@ Indexer::Private::add_message(const std::string& path) * * std::unique_lock lock{w_lock_}; */ - auto msg{Message::make_from_path(path)}; + auto msg{Message::make_from_path(path, store_.message_options())}; if (!msg) { mu_warning("failed to create message from {}: {}", path, msg.error().what()); return false; diff --git a/lib/meson.build b/lib/meson.build index 5ce9009e..226b6dc1 100644 --- a/lib/meson.build +++ b/lib/meson.build @@ -1,4 +1,4 @@ -## Copyright (C) 2021-2022 Dirk-Jan C. Binnema +## Copyright (C) 2021-2023 Dirk-Jan C. 
Binnema ## ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by @@ -26,16 +26,17 @@ lib_mu=static_library( 'mu-config.cc', 'mu-contacts-cache.cc', 'mu-maildir.cc', - 'mu-parser.cc', 'mu-query-match-deciders.cc', 'mu-query-threads.cc', 'mu-query.cc', 'mu-script.cc', 'mu-server.cc', 'mu-store.cc', - 'mu-tokenizer.cc', - 'mu-xapian.cc', - 'mu-xapian-db.cc' + 'mu-xapian-db.cc', + # query-parser + 'mu-query-processor.cc', + 'mu-query-parser.cc', + 'mu-query-xapianizer.cc' ], dependencies: [ glib_dep, @@ -46,8 +47,7 @@ lib_mu=static_library( config_h_dep, lib_mu_utils_dep, lib_mu_message_dep, - lib_mu_index_dep - ], + lib_mu_index_dep], install: false) @@ -57,14 +57,32 @@ lib_mu_dep = declare_dependency( include_directories: include_directories(['.', '..'])) -# dev helpers -tokenize = executable( - 'tokenize', - [ 'mu-tokenizer.cc', 'tokenize.cc' ], - dependencies: [ lib_mu_utils_dep, glib_dep ], - install: false) +# +# query parser dev helpers +# +process_query = executable('process-query', [ 'mu-query-processor.cc'], + install: false, + cpp_args: ['-DBUILD_PROCESS_QUERY'], + dependencies: [glib_dep, lib_mu_dep]) -# actual tests +parse_query = executable( 'parse-query', [ 'mu-query-parser.cc' ], + install: false, + cpp_args: ['-DBUILD_PARSE_QUERY'], + dependencies: [glib_dep, lib_mu_dep]) + +parse_query_expand = executable( 'parse-query-expand', [ 'mu-query-parser.cc' ], + install: false, + cpp_args: ['-DBUILD_PARSE_QUERY_EXPAND'], + dependencies: [glib_dep, lib_mu_dep]) + +xapian_query = executable('xapianize-query', [ 'mu-query-xapianizer.cc' ], + install: false, + cpp_args: ['-DBUILD_XAPIANIZE_QUERY'], + dependencies: [glib_dep, lib_mu_dep]) + +# +# unit tests +# test('test-threads', executable('test-threads', @@ -86,4 +104,25 @@ test('test-config', cpp_args: ['-DBUILD_TESTS'], dependencies: [glib_dep, lib_mu_dep])) +test('test-query-processor', + 
executable('test-query-processor', + 'mu-query-processor.cc', + install: false, + cpp_args: ['-DBUILD_TESTS'], + dependencies: [lib_mu_dep])) + +test('test-query-parser', + executable('test-query-parser', + 'mu-query-parser.cc', + install: false, + cpp_args: ['-DBUILD_TESTS'], + dependencies: [lib_mu_dep])) + +test('test-query-xapianizer', + executable('test-query-xapianizer', + 'mu-query-xapianizer.cc', + install: false, + cpp_args: ['-DBUILD_TESTS'], + dependencies: [lib_mu_dep])) + subdir('tests') diff --git a/lib/message/meson.build b/lib/message/meson.build index b1bc21b5..4c7bb601 100644 --- a/lib/message/meson.build +++ b/lib/message/meson.build @@ -1,4 +1,4 @@ -## Copyright (C) 2022 Dirk-Jan C. Binnema +## Copyright (C) 2022-2023 Dirk-Jan C. Binnema ## ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by @@ -38,7 +38,7 @@ lib_mu_message=static_library( lib_mu_message_dep = declare_dependency( link_with: lib_mu_message, - dependencies: [ xapian_dep, gmime_dep, lib_mu_utils_dep ], + dependencies: [ xapian_dep, gmime_dep, lib_mu_utils_dep, config_h_dep ], include_directories: include_directories(['.', '..'])) diff --git a/lib/message/mu-document.cc b/lib/message/mu-document.cc index 87ddd683..29af3763 100644 --- a/lib/message/mu-document.cc +++ b/lib/message/mu-document.cc @@ -1,5 +1,5 @@ /* -** Copyright (C) 2022 Dirk-Jan C. Binnema +** Copyright (C) 2022-2023 Dirk-Jan C. Binnema ** ** This program is free software; you can redistribute it and/or modify it ** under the terms of the GNU General Public License as published by the @@ -16,6 +16,7 @@ ** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
** */ +#include "config.h" #include "mu-document.hh" #include "mu-message.hh" @@ -31,9 +32,14 @@ #include #include - using namespace Mu; +// backward compat +#ifndef HAVE_XAPIAN_FLAG_NGRAMS +#define FLAG_NGRAMS FLAG_CJK_NGRAM +#endif /*HAVE_XAPIAN_FLAG_NGRAMS*/ + + const Xapian::Document& Document::xapian_document() const { @@ -58,16 +64,29 @@ Document::put_prop(const Field& field, SexpType&& val) std::forward(val)); } +static Xapian::TermGenerator +make_term_generator(Xapian::Document& doc, Document::Options opts) +{ + Xapian::TermGenerator termgen; + + if (any_of(opts & Document::Options::SupportNgrams)) + termgen.set_flags(Xapian::TermGenerator::FLAG_NGRAMS); + + termgen.set_document(doc); + + return termgen; +} + static void -add_search_term(Xapian::Document& doc, const Field& field, const std::string& val) +add_search_term(Xapian::Document& doc, const Field& field, const std::string& val, + Document::Options opts) { if (field.is_normal_term()) { doc.add_term(field.xapian_term(val)); } else if (field.is_boolean_term()) { doc.add_boolean_term(field.xapian_term(val)); } else if (field.is_indexable_term()) { - Xapian::TermGenerator termgen; - termgen.set_document(doc); + auto&& termgen{make_term_generator(doc, opts)}; termgen.index_text(utf8_flatten(val), 1, field.xapian_term()); /* also add as 'normal' term, so some queries where the indexer * eats special chars also match */ @@ -88,7 +107,7 @@ Document::add(Field::Id id, const std::string& val) xdoc_.add_value(field.value_no(), val); if (field.is_searchable()) - add_search_term(xdoc_, field, val); + add_search_term(xdoc_, field, val, options_); if (field.include_in_sexp()) put_prop(field, val); @@ -107,7 +126,7 @@ Document::add(Field::Id id, const std::vector& vals) if (field.is_searchable()) std::for_each(vals.begin(), vals.end(), [&](const auto& val) { - add_search_term(xdoc_, field, val); }); + add_search_term(xdoc_, field, val, options_); }); if (field.include_in_sexp()) { Sexp elms{}; @@ -149,9 +168,7 @@ 
Document::add(Field::Id id, const Contacts& contacts) std::vector cvec; const std::string sepa2(1, SepaChar2); - - Xapian::TermGenerator termgen; - termgen.set_document(xdoc_); + auto&& termgen{make_term_generator(xdoc_, options_)}; for (auto&& contact: contacts) { diff --git a/lib/message/mu-document.hh b/lib/message/mu-document.hh index 21de4e20..ef045772 100644 --- a/lib/message/mu-document.hh +++ b/lib/message/mu-document.hh @@ -41,17 +41,27 @@ namespace Mu { */ class Document { public: + enum struct Options { + None = 0, + SupportNgrams = 1 << 0, /**< Support ngrams, as used in + * CJK and other languages. */ + }; + /** * Construct a message for a new Xapian Document + * + * @param flags behavioral flags */ - Document() {} + Document(Options opts = Options::None): options_{opts} {} /** * Construct a message document based on an existing Xapian document. * * @param doc + * @param flags behavioral flags */ - Document(const Xapian::Document& doc): xdoc_{doc} {} + Document(const Xapian::Document& doc, Options opts = Options::None): + xdoc_{doc}, options_{opts} {} /** * DTOR @@ -240,11 +250,12 @@ private: return cached_sexp_; } - mutable Xapian::Document xdoc_; + Options options_; mutable Sexp cached_sexp_; mutable bool dirty_sexp_{}; /* xdoc's sexp is outdated */ }; +MU_ENABLE_BITOPS(Document::Options); } // namepace Mu diff --git a/lib/message/mu-fields.hh b/lib/message/mu-fields.hh index 24279de8..abd0e1c8 100644 --- a/lib/message/mu-fields.hh +++ b/lib/message/mu-fields.hh @@ -207,7 +207,10 @@ struct Field { } }; -static inline bool operator==(const Field& f1, const Field& f2) { return f1.id == f2.id; } +// equality +static inline constexpr bool operator==(const Field& f1, const Field& f2) { return f1.id == f2.id; } +static inline constexpr bool operator==(const Field& f1, const Field::Id id) { return f1.id == id; } + MU_ENABLE_BITOPS(Field::Flag); @@ -594,20 +597,5 @@ Option field_from_number(size_t id) } -/** - * Get a fmt-printable representation of Field 
for fmt - * - * @param field a Field - * - * @return a printable representation - */ -static inline constexpr auto format_as(const Field& field) { - return field.name; -} -static inline constexpr auto format_as(const Field::Id id) { - return format_as(field_from_id(id)); -} - - } // namespace Mu #endif /* MU_FIELDS_HH__ */ diff --git a/lib/message/mu-message.cc b/lib/message/mu-message.cc index c026dfcb..3f443d45 100644 --- a/lib/message/mu-message.cc +++ b/lib/message/mu-message.cc @@ -45,9 +45,10 @@ using namespace Mu; struct Message::Private { - Private(Message::Options options): opts{options} {} + Private(Message::Options options): + opts{options}, doc{doc_opts(opts)} {} Private(Message::Options options, Xapian::Document&& xdoc): - opts{options}, doc{std::move(xdoc)} {} + opts{options}, doc{std::move(xdoc), doc_opts(opts)} {} Message::Options opts; Document doc; @@ -70,6 +71,13 @@ struct Message::Private { Option embedded; Option language; /* body ISO language code */ + +private: + Document::Options doc_opts(Message::Options mopts) { + return any_of(opts & Message::Options::SupportNgrams) ? + Document::Options::SupportNgrams : + Document::Options::None; + } }; @@ -176,6 +184,11 @@ Message::document() const return priv_->doc; } +Message::Options +Message::options() const +{ + return priv_->opts; +} unsigned Message::docid() const diff --git a/lib/message/mu-message.hh b/lib/message/mu-message.hh index 09a677b9..8fb9f4d2 100644 --- a/lib/message/mu-message.hh +++ b/lib/message/mu-message.hh @@ -1,5 +1,5 @@ /* -** Copyright (C) 2022 Dirk-Jan C. Binnema +** Copyright (C) 2022-2023 Dirk-Jan C. 
Binnema ** ** This program is free software; you can redistribute it and/or modify it ** under the terms of the GNU General Public License as published by the @@ -49,8 +49,10 @@ public: Decrypt = 1 << 0, /**< Attempt to decrypt */ RetrieveKeys = 1 << 1, /**< Auto-retrieve crypto keys (implies network * access) */ - AllowRelativePath = 1 << 2, /**< Allow relateive paths for filename + AllowRelativePath = 1 << 2, /**< Allow relative paths for filename * in make_from_path */ + SupportNgrams = 1 << 3, /**< Support ngrams, as used in + * CJK and other languages. */ }; /** @@ -60,7 +62,6 @@ public: */ Message(Message&& other) noexcept; - /** * operator= * @@ -147,6 +148,14 @@ public: const Document& document() const; + /** + * The message options for this message + * + * @return message options + */ + Options options() const; + + /** * Get the document-id, or 0 if non-existent. * diff --git a/lib/mu-config.hh b/lib/mu-config.hh index 13b1bcfb..afbfda85 100644 --- a/lib/mu-config.hh +++ b/lib/mu-config.hh @@ -51,6 +51,8 @@ struct Property { PersonalAddresses, /**< List of personal e-mail addresses */ RootMaildir, /**< Root maildir path */ SchemaVersion, /**< Xapian DB schema version */ + SupportNgrams, /**< Support ngrams for indexing & querying + * for e.g. 
CJK languages */ /* */ _count_ /* Number of Ids */ }; @@ -61,12 +63,13 @@ struct Property { enum struct Flags { None = 0, /**< Nothing in particular */ ReadOnly = 1 << 0, /**< Property is read-only for external use - * (but can change from within the store) */ + * (but can change from within the store) */ Configurable = 1 << 1, /**< A user-configurable parameter; name * starts with 'conf-' */ Internal = 1 << 2, /**< Mu-internal field */ }; enum struct Type { + Boolean, /**< Some boolean value */ Number, /**< Some number */ Timestamp, /**< Timestamp number */ Path, /**< Path string */ @@ -176,6 +179,14 @@ public: {}, "Version of the Xapian database schema" }, + { + Id::SupportNgrams, + Type::Boolean, + Flags::Configurable, + "support-ngrams", + {}, + "Support n-grams for working with CJK and other languages" + }, }}; /** @@ -229,6 +240,9 @@ public: }); if constexpr (prop.type == Type::Number) return static_cast(str.empty() ? 0 : std::atoll(str.c_str())); + if constexpr (prop.type == Type::Boolean) + return static_cast(str.empty() ? false : + std::atol(str.c_str()) != 0); else if constexpr (prop.type == Type::Timestamp) return static_cast(str.empty() ? 0 : std::atoll(str.c_str())); else if constexpr (prop.type == Type::Path || prop.type == Type::String) @@ -257,6 +271,8 @@ public: const auto strval = std::invoke([&]{ if constexpr (prop.type == Type::Number || prop.type == Type::Timestamp) return mu_format("{}", static_cast(val)); + if constexpr (prop.type == Type::Boolean) + return val ? "1" : "0"; else if constexpr (prop.type == Type::Path || prop.type == Type::String) return std::string{val}; else if constexpr (prop.type == Type::StringList) diff --git a/lib/mu-parser.cc b/lib/mu-parser.cc deleted file mode 100644 index bf541ea4..00000000 --- a/lib/mu-parser.cc +++ /dev/null @@ -1,508 +0,0 @@ -/* -** Copyright (C) 2020 Dirk-Jan C. 
Binnema -** -** This library is free software; you can redistribute it and/or -** modify it under the terms of the GNU Lesser General Public License -** as published by the Free Software Foundation; either version 2.1 -** of the License, or (at your option) any later version. -** -** This library is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -** Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public -** License along with this library; if not, write to the Free -** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA -** 02110-1301, USA. -*/ -#include "mu-parser.hh" - -#include -#include - -#include "mu-tokenizer.hh" -#include "utils/mu-utils.hh" -#include "utils/mu-error.hh" -#include "utils/mu-regex.hh" -#include "message/mu-message.hh" - -using namespace Mu; - -// 3 precedence levels: units (NOT,()) > factors (OR) > terms (AND) - -// query -> | ε -// -> | ε -// -> OR|XOR | ε -// -> | ε -// -> [AND]|AND NOT | ε -// -> [NOT] | ( ) | -// -> | | -// -> [field:]value -// -> [field:][lower]..[upper] -// -> [field:]/regex/ - -#define BUG(...) \ - Mu::Error(Error::Code::Internal, "BUG @ line {}", __LINE__); - -/** - * Get the "shortcut"/internal fields for the the given fieldstr or empty if there is none - * - * @param fieldstr a fieldstr, e.g "subject" or "s" for the subject field - * - * @return a vector with "exploded" values, with a code and a fullname. E.g. 
"s" might map - * to [<"S","subject">], while "recip" could map to [<"to", "T">, <"cc", "C">, <"bcc", "B">] - */ -struct FieldInfo { - const std::string field; - const std::string prefix; - bool supports_phrase; - Field::Id id; -}; -using FieldInfoVec = std::vector; -struct Parser::Private { - Private(const Store& store, Parser::Flags flags) : store_{store}, flags_{flags} {} - - std::vector process_regex(const std::string& field, - const Regex& rx) const; - - Mu::Tree term_1(Mu::Tokens& tokens, WarningVec& warnings) const; - Mu::Tree term_2(Mu::Tokens& tokens, Node::Type& op, WarningVec& warnings) const; - Mu::Tree factor_1(Mu::Tokens& tokens, WarningVec& warnings) const; - Mu::Tree factor_2(Mu::Tokens& tokens, Node::Type& op, WarningVec& warnings) const; - Mu::Tree unit(Mu::Tokens& tokens, WarningVec& warnings) const; - Mu::Tree data(Mu::Tokens& tokens, WarningVec& warnings) const; - Mu::Tree range(const FieldInfoVec& fields, - const std::string& lower, - const std::string& upper, - size_t pos, - WarningVec& warnings) const; - Mu::Tree regex(const FieldInfoVec& fields, - const std::string& v, - size_t pos, - WarningVec& warnings) const; - Mu::Tree value(const FieldInfoVec& fields, - const std::string& v, - size_t pos, - WarningVec& warnings) const; - - private: - const Store& store_; - const Parser::Flags flags_; -}; - -static std::string -process_value(const std::string& field, const std::string& value) -{ - const auto id_opt{field_from_name(field)}; - if (id_opt) { -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wswitch-enum" - switch (id_opt->id) { - case Field::Id::Priority: { - if (!value.empty()) - return std::string(1, value[0]); - } break; - case Field::Id::Flags: - if (const auto info{flag_info(value)}; info) - return std::string(1, info->shortcut_lower()); - break; - default: - break; - } -#pragma GCC diagnostic pop - } - - return value; // XXX prio/flags, etc. 
alias -} - -static void -add_field(std::vector& fields, Field::Id field_id) -{ - const auto field{field_from_id(field_id)}; - if (!field.shortcut) - return; // can't be searched - - fields.emplace_back(FieldInfo{std::string{field.name}, field.xapian_term(), - field.is_indexable_term(), field_id}); -} - -static std::vector -process_field(const std::string& field_str, Parser::Flags flags) -{ - std::vector fields; - if (any_of(flags & Parser::Flags::UnitTest)) { - add_field(fields, Field::Id::MessageId); - return fields; - } - - if (field_str == "contact" || field_str == "recip") { // multi fields - add_field(fields, Field::Id::To); - add_field(fields, Field::Id::Cc); - add_field(fields, Field::Id::Bcc); - if (field_str == "contact") - add_field(fields, Field::Id::From); - } else if (field_str.empty()) { - add_field(fields, Field::Id::To); - add_field(fields, Field::Id::Cc); - add_field(fields, Field::Id::Bcc); - add_field(fields, Field::Id::From); - add_field(fields, Field::Id::Subject); - add_field(fields, Field::Id::BodyText); - } else if (const auto field_opt{field_from_name(field_str)}; field_opt) - add_field(fields, field_opt->id); - - return fields; -} - -static bool -is_range_field(const std::string& field_str) -{ - if (const auto field_opt{field_from_name(field_str)}; !field_opt) - return false; - else - return field_opt->is_range(); -} - -struct MyRange { - std::string lower; - std::string upper; -}; - -static MyRange -process_range(const std::string& field_str, - const std::string& lower, const std::string& upper) -{ - const auto field_opt{field_from_name(field_str)}; - if (!field_opt) - return {lower, upper}; - - std::string l2 = lower; - std::string u2 = upper; - constexpr auto upper_limit = std::numeric_limits::max(); - - if (field_opt->id == Field::Id::Date || field_opt->id == Field::Id::Changed) { - l2 = to_lexnum(parse_date_time(lower, true).value_or(0)); - u2 = to_lexnum(parse_date_time(upper, false).value_or(upper_limit)); - } else if (field_opt->id 
== Field::Id::Size) { - l2 = to_lexnum(parse_size(lower, true).value_or(0)); - u2 = to_lexnum(parse_size(upper, false).value_or(upper_limit)); - } - - return {l2, u2}; -} - -std::vector -Parser::Private::process_regex(const std::string& field_str, - const Regex& rx) const -{ - const auto field_opt{field_from_name(field_str)}; - if (!field_opt) - return {}; - - const auto prefix{field_opt->xapian_term()}; - std::vector terms; - store_.for_each_term(field_opt->id, [&](auto&& str) { - auto val{str.c_str() + 1}; // strip off the Xapian prefix. - if (rx.matches(val)) - terms.emplace_back(std::move(val)); - return true; - }); - - return terms; -} - -static Token -look_ahead(const Mu::Tokens& tokens) -{ - return tokens.front(); -} - -static Mu::Tree -empty() -{ - return {{Node::Type::Empty}}; -} - -Mu::Tree -Parser::Private::value(const FieldInfoVec& fields, - const std::string& v, - size_t pos, - WarningVec& warnings) const -{ - auto val = utf8_flatten(v); - - if (fields.empty()) - throw BUG("expected one or more fields"); - - if (fields.size() == 1) { - const auto item = fields.front(); - return Tree({Node::Type::Value, - FieldValue{item.id, process_value(item.field, val)}}); - } - - // a 'multi-field' such as "recip:" - Tree tree(Node{Node::Type::OpOr}); - for (const auto& item : fields) - tree.add_child(Tree({Node::Type::Value, - FieldValue{item.id, - process_value(item.field, val)}})); - return tree; -} - -Mu::Tree -Parser::Private::regex(const FieldInfoVec& fields, - const std::string& v, - size_t pos, - WarningVec& warnings) const -{ - if (v.length() < 2) - throw BUG("expected regexp, got '%s'", v.c_str()); - - const auto rxstr = utf8_flatten(v.substr(1, v.length() - 2)); - - try { - Tree tree(Node{Node::Type::OpOr}); - const auto rx = Regex::make(rxstr, G_REGEX_OPTIMIZE); - if (!rx) - throw rx.error(); - for (const auto& field : fields) { - const auto terms = process_regex(field.field, *rx); - for (const auto& term : terms) { - 
tree.add_child(Tree({Node::Type::ValueAtomic, - FieldValue{field.id, term}})); - } - } - - if (tree.children.empty()) - return empty(); - else - return tree; - - } catch (...) { - // fallback - warnings.push_back({pos, "invalid regexp"}); - return value(fields, v, pos, warnings); - } -} - -Mu::Tree -Parser::Private::range(const FieldInfoVec& fields, - const std::string& lower, - const std::string& upper, - size_t pos, - WarningVec& warnings) const -{ - if (fields.empty()) - throw BUG("expected field"); - - const auto& field = fields.front(); - if (!is_range_field(field.field)) - return value(fields, lower + ".." + upper, pos, warnings); - - auto prange = process_range(field.field, lower, upper); - if (prange.lower > prange.upper) - prange = process_range(field.field, upper, lower); - - return Tree({Node::Type::Range, - FieldValue{field.id, prange.lower, prange.upper}}); -} - -Mu::Tree -Parser::Private::data(Mu::Tokens& tokens, WarningVec& warnings) const -{ - const auto token = look_ahead(tokens); - if (token.type != Token::Type::Data) - warnings.push_back({token.pos, "expected: value"}); - - tokens.pop_front(); - - std::string field, val; - const auto col = token.str.find(":"); - if (col != 0 && col != std::string::npos && col != token.str.length() - 1) { - field = token.str.substr(0, col); - val = token.str.substr(col + 1); - } else - val = token.str; - - auto fields = process_field(field, flags_); - if (fields.empty()) { // not valid field... - warnings.push_back({token.pos, mu_format("invalid field '{}'", field)}); - fields = process_field("", flags_); - // fallback, treat the whole of foo:bar as a value - return value(fields, field + ":" + val, token.pos, warnings); - } - - // does it look like a regexp? - if (val.length() >= 2) - if (val[0] == '/' && val[val.length() - 1] == '/') - return regex(fields, val, token.pos, warnings); - - // does it look like a range? 
- const auto dotdot = val.find(".."); - if (dotdot != std::string::npos) - return range(fields, - val.substr(0, dotdot), - val.substr(dotdot + 2), - token.pos, - warnings); - else if (is_range_field(fields.front().field)) { - // range field without a range - treat as field:val..val - return range(fields, val, val, token.pos, warnings); - } - - // if nothing else, it's a value. - return value(fields, val, token.pos, warnings); -} - -Mu::Tree -Parser::Private::unit(Mu::Tokens& tokens, WarningVec& warnings) const -{ - if (tokens.empty()) { - warnings.push_back({0, "expected: unit"}); - return empty(); - } - - const auto token = look_ahead(tokens); - - if (token.type == Token::Type::Not) { - tokens.pop_front(); - Tree tree{{Node::Type::OpNot}}; - tree.add_child(unit(tokens, warnings)); - return tree; - } - - if (token.type == Token::Type::Open) { - tokens.pop_front(); - auto tree = term_1(tokens, warnings); - if (tokens.empty()) - warnings.push_back({token.pos, "expected: ')'"}); - else { - const auto token2 = look_ahead(tokens); - if (token2.type == Token::Type::Close) - tokens.pop_front(); - else { - warnings.push_back( - {token2.pos, - std::string("expected: ')' but got ") + token2.str}); - } - } - return tree; - } - - return data(tokens, warnings); -} - -Mu::Tree -Parser::Private::factor_2(Mu::Tokens& tokens, Node::Type& op, WarningVec& warnings) const -{ - if (tokens.empty()) - return empty(); - - const auto token = look_ahead(tokens); - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wswitch-enum" - switch (token.type) { - case Token::Type::And: { - tokens.pop_front(); - op = Node::Type::OpAnd; - } break; - - case Token::Type::Open: - case Token::Type::Data: - case Token::Type::Not: - op = Node::Type::OpAnd; // implicit AND - break; - - default: - return empty(); - } -#pragma GCC diagnostic pop - - return factor_1(tokens, warnings); -} - -Mu::Tree -Parser::Private::factor_1(Mu::Tokens& tokens, WarningVec& warnings) const -{ - Node::Type 
op{Node::Type::Invalid}; - - auto t = unit(tokens, warnings); - auto a2 = factor_2(tokens, op, warnings); - - if (a2.empty()) - return t; - - Tree tree{{op}}; - tree.add_child(std::move(t)); - tree.add_child(std::move(a2)); - - return tree; -} - -Mu::Tree -Parser::Private::term_2(Mu::Tokens& tokens, Node::Type& op, WarningVec& warnings) const -{ - if (tokens.empty()) - return empty(); - - const auto token = look_ahead(tokens); - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wswitch-enum" - switch (token.type) { - case Token::Type::Or: op = Node::Type::OpOr; break; - case Token::Type::Xor: op = Node::Type::OpXor; break; - default: - if (token.type != Token::Type::Close) - warnings.push_back({token.pos, "expected OR|XOR"}); - return empty(); - } -#pragma GCC diagnostic pop - - tokens.pop_front(); - - return term_1(tokens, warnings); -} - -Mu::Tree -Parser::Private::term_1(Mu::Tokens& tokens, WarningVec& warnings) const -{ - Node::Type op{Node::Type::Invalid}; - - auto t = factor_1(tokens, warnings); - auto o2 = term_2(tokens, op, warnings); - - if (o2.empty()) - return t; - else { - Tree tree{{op}}; - tree.add_child(std::move(t)); - tree.add_child(std::move(o2)); - return tree; - } -} - -Mu::Parser::Parser(const Store& store, Parser::Flags flags) : - priv_{std::make_unique(store, flags)} -{ -} - -Mu::Parser::~Parser() = default; - -Mu::Tree -Mu::Parser::parse(const std::string& expr, WarningVec& warnings) const -{ - try { - auto tokens = tokenize(expr); - if (tokens.empty()) - return empty(); - else - return priv_->term_1(tokens, warnings); - - } catch (const std::runtime_error& ex) { - std::cerr << ex.what() << std::endl; - return empty(); - } -} diff --git a/lib/mu-parser.hh b/lib/mu-parser.hh deleted file mode 100644 index 65adc645..00000000 --- a/lib/mu-parser.hh +++ /dev/null @@ -1,106 +0,0 @@ -/* -** Copyright (C) 2017 Dirk-Jan C. 
Binnema -** -** This library is free software; you can redistribute it and/or -** modify it under the terms of the GNU Lesser General Public License -** as published by the Free Software Foundation; either version 2.1 -** of the License, or (at your option) any later version. -** -** This library is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -** Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public -** License along with this library; if not, write to the Free -** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA -** 02110-1301, USA. -*/ - -#ifndef __PARSER_HH__ -#define __PARSER_HH__ - -#include "utils/mu-utils.hh" -#include -#include -#include - -#include -#include - -// A simple recursive-descent parser for queries. Follows the Xapian syntax, -// but better handles non-alphanum; also implements regexp - -namespace Mu { - -/** - * A parser warning - * - */ -struct Warning { - size_t pos{}; /**< pos in string */ - const std::string msg; /**< warning message */ - - /** - * operator== - * - * @param rhs right-hand side - * - * @return true if rhs is equal to this; false otherwise - */ - bool operator==(const Warning& rhs) const { return pos == rhs.pos && msg == rhs.msg; } -}; -using WarningVec = std::vector; - -/** - * operator<< - * - * @param os an output stream - * @param w a warning - * - * @return the updated output stream - */ -inline std::ostream& -operator<<(std::ostream& os, const Warning& w) -{ - os << w.pos << ":" << w.msg; - return os; -} - -class Parser { - public: - enum struct Flags { None = 0, UnitTest = 1 << 0 }; - - /** - * Construct a query parser object - * - * @param store a store object ptr, or none - */ - Parser(const Store& store, Flags = Flags::None); - /** - * DTOR - * - */ - ~Parser(); - - /** - * Parse a query string - 
* - * @param query a query string - * @param warnings vec to receive warnings - * - * @return a parse-tree - */ - - Tree parse(const std::string& query, WarningVec& warnings) const; - - private: - struct Private; - std::unique_ptr priv_; -}; - -MU_ENABLE_BITOPS(Parser::Flags); - -} // namespace Mu - -#endif /* __PARSER_HH__ */ diff --git a/lib/mu-query-parser.cc b/lib/mu-query-parser.cc new file mode 100644 index 00000000..77333d2e --- /dev/null +++ b/lib/mu-query-parser.cc @@ -0,0 +1,428 @@ +/* +** Copyright (C) 2023 Dirk-Jan C. Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +** +*/ + +#include "mu-query-parser.hh" + +#include +#include +#include +#include + +#include "utils/mu-utils.hh" +#include "utils/mu-sexp.hh" +#include "utils/mu-option.hh" +#include +#include "utils/mu-utils-file.hh" + +using namespace Mu; + +// Sexp extensions... 
+static Sexp& +prepend(Sexp& s, Sexp&& e) +{ + s.list().insert(s.list().begin(), std::move(e)); + return s; +} + +static Option +second(Sexp& s) +{ + if (s.listp() && !s.empty() && s.cbegin() + 1 != s.cend()) + return *(s.begin()+1); + else + return Nothing; +} + + +static bool +looks_like_matcher(const Sexp& sexp) +{ + // all the "terminal values" (from the Mu parser's pov) + const std::array value_syms = { + placeholder_sym, phrase_sym, regex_sym, range_sym, wildcard_sym + }; + + if (!sexp.listp() || sexp.empty() || !sexp.front().symbolp()) + return false; + + const auto symbol{sexp.front().symbol()}; + if (seq_some(value_syms, [&](auto &&sym) { return symbol == sym; })) + return true; + else if (!!field_from_name(symbol.name) || field_is_combi(symbol.name)) + return true; + else + return false; +} + +struct ParseContext { + bool expand; + std::vector warnings; +}; + + +/* + * Grammar + * + * query -> factor { ( | ) factor } + * factor -> unit { [] unit } + * unit -> matcher | query | <(> query <)> + * matcher + */ + +static Sexp query(Sexp& tokens, ParseContext& ctx); + +static Sexp +matcher(Sexp& tokens, ParseContext& ctx) +{ + if (tokens.empty()) + return {}; + + auto val{*tokens.head()}; + tokens.pop_front(); + + /* special case: if we find some non-matcher type here, we need to + * second-guess the tokenizer */ + if (!looks_like_matcher(val)) + val = Sexp{placeholder_sym, val.symbol().name}; + + if (ctx.expand) { /* should we expand meta-fields? */ + const auto symbol{val.front().symbol()}; + const auto fields = fields_from_name(symbol == placeholder_sym ? 
"" : symbol.name); + if (!fields.empty()) { + Sexp vals{}; + vals.add(or_sym); + for (auto&& field: fields) + vals.add(Sexp{Sexp::Symbol{field.name}, Sexp{*second(val)}}); + val = std::move(vals); + } + } + + return val; +} + +static Sexp +unit(Sexp& tokens, ParseContext& ctx) +{ + if (tokens.head_symbolp(not_sym)) { /* NOT */ + tokens.pop_front(); + Sexp sub{query(tokens, ctx)}; + + /* special case: interpret "not" as a matcher instead; */ + if (sub.empty()) + return Sexp{placeholder_sym, not_sym.name}; + + /* we try to optimize: double negations are removed */ + if (sub.head_symbolp(not_sym)) + return *second(sub); + else + return Sexp(not_sym, std::move(sub)); + + } else if (tokens.head_symbolp(open_sym)) { /* ( sub) */ + tokens.pop_front(); + Sexp sub{query(tokens, ctx)}; + if (tokens.head_symbolp(close_sym)) + tokens.pop_front(); + else { + //g_warning("expected <)>"); + } + return sub; + } + + /* matcher */ + return matcher(tokens, ctx); +} + + +static Sexp +factor(Sexp& tokens, ParseContext& ctx) +{ + Sexp un = unit(tokens, ctx); + + /* query 'a b' is to be interpreted as 'a AND b'; + * + * we need an implicit AND if the head symbol is either + * a matcher (value) or the start of a sub-expression */ + auto implicit_and = [&]() { + if (tokens.head_symbolp(open_sym)) + return true; + else if (auto&& head{tokens.head()}; head) + return looks_like_matcher(*head); + else + return false; + }; + + Sexp uns; + while (true) { + + if (tokens.head_symbolp(and_sym)) + tokens.pop_front(); + else if (!implicit_and()) + break; + + if (auto&& un2 = unit(tokens, ctx); !un2.empty()) + uns.add(std::move(un2)); + else + break; + } + + if (!uns.empty()) { + un = Sexp{and_sym, std::move(un)}; + un.add_list(std::move(uns)); + } + + return un; +} + +static Sexp +query(Sexp& tokens, ParseContext& ctx) +{ + /* note: we flatten (or (or ( or ...)) etc. 
here; + * for optimization (since Xapian likes flat trees) */ + + Sexp fact = factor(tokens, ctx); + Sexp or_factors, xor_factors; + while (true) { + auto factors = std::invoke([&]()->Option { + + if (tokens.head_symbolp(or_sym)) + return or_factors; + else if (tokens.head_symbolp(xor_sym)) + return xor_factors; + else + return Nothing; + }); + + if (!factors) + break; + + tokens.pop_front(); + factors->add(factor(tokens, ctx)); + } + + // a bit clumsy... + + if (!or_factors.empty() && xor_factors.empty()) { + fact = Sexp{or_sym, std::move(fact)}; + fact.add_list(std::move(or_factors)); + } else if (or_factors.empty() && !xor_factors.empty()) { + fact = Sexp{xor_sym, std::move(fact)}; + fact.add_list(std::move(xor_factors)); + } else if (!or_factors.empty() && !xor_factors.empty()) { + fact = Sexp{or_sym, std::move(fact)}; + fact.add_list(std::move(or_factors)); + prepend(xor_factors, xor_sym); + fact.add(std::move(xor_factors)); + } + + return fact; +} + +Sexp +Mu::parse_query(const std::string& expr, bool expand) +{ + ParseContext context; + context.expand = expand; + + if (auto&& items = process_query(expr); !items.listp()) + throw std::runtime_error("tokens must be a list-sexp"); + else + return query(items, context); +} + + +#if defined(BUILD_PARSE_QUERY)||defined(BUILD_PARSE_QUERY_EXPAND) +int +main (int argc, char *argv[]) +{ + if (argc < 2) { + mu_printerrln("expected: {} ", argv[0]); + return 1; + } + + std::string expr; + for (auto i = 1; i < argc; ++i) { + expr += argv[i]; + expr += " "; + } + + auto&& sexp = parse_query(expr, +#ifdef BUILD_PARSE_QUERY_EXPAND + true/*expand*/ +#else + false/*don't expand*/ +#endif + ); + mu_println("{}", sexp.to_string()); + return 0; +} +#endif // BUILD_PARSE_QUERY || BUILD_PARSE_QUERY_EXPAND + + + +#if BUILD_TESTS +/* + * Tests. 
+ * + */ + +#include "utils/mu-test-utils.hh" + +using TestCase = std::pair; + +// basic operators, precedence and grouping +static void +test_parser_basic() +{ + std::vector cases = { + // single term + TestCase{R"(a)", R"((_ "a"))"}, + // a and b + TestCase{R"(a and b)", R"((and (_ "a") (_ "b")))"}, + // a and b and c + TestCase{R"(a and b and c)", R"((and (_ "a") (_ "b") (_ "c")))"}, + // a or b + TestCase{R"(a or b)", R"((or (_ "a") (_ "b")))"}, + // a or b and c + TestCase{R"(a or b and c)", R"((or (_ "a") (and (_ "b") (_ "c"))))"}, + // a and b or c + TestCase{R"(a and b or c)", R"((or (and (_ "a") (_ "b")) (_ "c")))"}, + // not a + TestCase{R"(not a)", R"((not (_ "a")))"}, + // lone not + TestCase{R"(not)", R"((_ "not"))"}, + // a and (b or c) + TestCase{R"(a and (b or c))", R"((and (_ "a") (or (_ "b") (_ "c"))))"}, + // TODO: add more... + }; + + for (auto&& test: cases) { + auto&& sexp{parse_query(test.first)}; + //mu_message ("'{}' <=> '{}'", sexp.to_string(), test.second); + assert_equal(sexp.to_string(), test.second); + } +} + +// incomplete queries and operators in odd places must still yield a parse tree +static void +test_parser_recover() +{ + std::vector cases = { + // implicit AND + TestCase{R"(a b)", R"((and (_ "a") (_ "b")))"}, + // a or and (the trailing operator is to be used as a value) + TestCase{R"(a or and)", R"((or (_ "a") (_ "and")))"}, + // empty sub-expression with missing end ) + TestCase{R"(a and ()", R"((_ "a"))"}, + // missing end ) + TestCase{R"(a and (b)", R"((and (_ "a") (_ "b")))"}, + }; + + for (auto&& test: cases) { + auto&& sexp{parse_query(test.first)}; + assert_equal(sexp.to_string(), test.second); + } +} + + +// field prefixes (incl. the shortcut s: for subject) with plain, wildcard and regex values +static void +test_parser_fields() +{ + std::vector cases = { + // simple field + TestCase{R"(s:hello)", R"((subject "hello"))"}, + // field, wildcard, regexp + TestCase{R"(subject:a* recip:/b/)", + R"((and (subject (wildcard "a")) (recip (regex "b"))))"}, + TestCase{R"(from:hello or subject:world)", + R"((or (from "hello") (subject "world")))"}, + }; + + for (auto&& test: cases) { + auto&& sexp{parse_query(test.first)}; + assert_equal(sexp.to_string(), test.second); + } +} + +static void
+test_parser_expand() +{ + std::vector cases = { + // simple field + TestCase{R"(recip:a)", R"((or (to "a") (cc "a") (bcc "a")))"}, + // field, wildcard, regexp + TestCase{R"(a*)", + R"((or (to (wildcard "a")) (cc (wildcard "a")) (bcc (wildcard "a")) (from (wildcard "a")) (subject (wildcard "a")) (body (wildcard "a")) (embed (wildcard "a"))))"}, + TestCase{R"(a xor contact:b)", + R"((xor (or (to "a") (cc "a") (bcc "a") (from "a") (subject "a") (body "a") (embed "a")) (or (to "b") (cc "b") (bcc "b") (from "b"))))"} + }; + + for (auto&& test: cases) { + auto&& sexp{parse_query(test.first, true/*expand*/)}; + assert_equal(sexp.to_string(), test.second); + } +} + + +static void +test_parser_range() +{ + std::vector cases = { + TestCase{R"(size:1)", R"((size (range "1" "1")))"}, + TestCase{R"(size:2..)", R"((size (range "2" "")))"}, + TestCase{R"(size:..1k)", R"((size (range "" "1024")))"}, + TestCase{R"(size:..)", R"((size (range "" "")))"}, + }; + + for (auto&& test: cases) { + auto&& sexp{parse_query(test.first, true/*expand*/)}; + assert_equal(sexp.to_string(), test.second); + } +} + +static void +test_parser_optimize() +{ + std::vector cases = { + TestCase{R"(not a)", R"((not (_ "a")))"}, + TestCase{R"(not not a)", R"((_ "a"))"}, + TestCase{R"(not not not a)", R"((not (_ "a")))"}, + TestCase{R"(not not not not a)", R"((_ "a"))"}, + }; + + + for (auto&& test: cases) { + auto&& sexp{parse_query(test.first)}; + assert_equal(sexp.to_string(), test.second); + } +} + +int +main(int argc, char* argv[]) +{ + mu_test_init(&argc, &argv); + + g_test_add_func("/query-parser/basic", test_parser_basic); + g_test_add_func("/query-parser/recover", test_parser_recover); + g_test_add_func("/query-parser/fields", test_parser_fields); + g_test_add_func("/query-parser/range", test_parser_range); + g_test_add_func("/query-parser/expand", test_parser_expand); + g_test_add_func("/query-parser/optimize", test_parser_optimize); + + return g_test_run(); +} + +#endif /*BUILD_TESTS*/ diff 
--git a/lib/mu-query-parser.hh b/lib/mu-query-parser.hh new file mode 100644 index 00000000..f78011bf --- /dev/null +++ b/lib/mu-query-parser.hh @@ -0,0 +1,116 @@ +/* +** Copyright (C) 2023 Dirk-Jan C. Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +** +*/ +#include +#include +#include + +#include + +#include "utils/mu-sexp.hh" +#include "utils/mu-result.hh" +#include "mu-store.hh" + +namespace Mu { +/* + * Some useful symbol-sexps + */ +static inline const auto placeholder_sym = "_"_sym; +static inline const auto phrase_sym = "phrase"_sym; +static inline const auto regex_sym = "regex"_sym; +static inline const auto range_sym = "range"_sym; +static inline const auto wildcard_sym = "wildcard"_sym; + +static inline const auto open_sym = "("_sym; +static inline const auto close_sym = ")"_sym; + +static inline const auto and_sym = "and"_sym; +static inline const auto or_sym = "or"_sym; +static inline const auto xor_sym = "xor"_sym; +static inline const auto not_sym = "not"_sym; +static inline const auto and_not_sym = "and-not"_sym; + + +/* + * We take a query, then parse it into a human-readable s-expression and then + * turn that s-expression into a Xapian query + * + * some query: + * "from:hello or subject:world" + * + * 1. tokenize-query + * => ((from "hello") or (subject "world")) + * + * 2. 
parse-query + * => (or (from "hello") (subject "world")) + * + * 3. xapian-query + * => Query((Fhello OR Sworld)) + * * + */ + +/** + * Analyze the query expression and express it as a Sexp-list with the sequence + * of elements. + * + * @param expr a search expression + * + * @return Sexp with the sequence of elements + */ +Sexp process_query(const std::string& expr); + +/** + * Parse the query expression and create a parse-tree expressed as an Sexp + * object (tree). + * + * Internally, this processes the expression into elements (see process_query()) + * and processes those elements into a Sexp. This sexp is meant to be + * human-readable. + * + * @param expr a search expression + * @param expand whether to expand meta-fields (such as '_', 'recip', 'contact') + * + * @return Sexp with the parse tree + */ +Sexp parse_query(const std::string& expr, bool expand=false); + +/** + * Flags that influence how make_xapian_query() turns an expression into a + * Xapian query. + */ +enum struct ParserFlags { + None = 0 << 0, + SupportNgrams = 1 << 0, /**< Support Xapian's Ngrams for CJK etc. handling */ + XapianParser = 1 << 1, /**< For testing only, use Xapian's + * built-in QueryParser; this is not + * fully compatible with mu, only useful + * for debugging. */ +}; + +/** + * Make a Xapian Query for the given string expression. + * + * This uses parse_query() and turns the S-expression into a Xapian::Query. + * Unlike mere parsing, this uses the information in the store to resolve + * wildcard / regex queries. + * + * @param store the message store + * @param expr a string expression + * @param flag parser flags, see ParserFlags + * + * @return a Xapian query result or an error. + */ +Result make_xapian_query(const Store& store, const std::string& expr, + ParserFlags flag=ParserFlags::None) noexcept; + +MU_ENABLE_BITOPS(ParserFlags); +} // namespace Mu diff --git a/lib/mu-query-processor.cc b/lib/mu-query-processor.cc new file mode 100644 index 00000000..ab461b4f --- /dev/null +++ b/lib/mu-query-processor.cc @@ -0,0 +1,548 @@ +/* +** Copyright (C) 2023 Dirk-Jan C.
Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +** +*/ + +#include "mu-query-parser.hh" + +#include +#include +#include +#include + +#include "utils/mu-option.hh" +#include +#include "utils/mu-utils-file.hh" + +using namespace Mu; + +/** + * An 'Element' here is a rather rich version of what is traditionally + * considered a (lexical) token. + * + * We try to determine as much as possible during the analysis phase; which is + * quite a bit (given the fairly simple query language), and the parsing phase + * only has to deal with the putting these elements in a tree. + * + * During analysis: + * 1) separate the query into a sequence strings + * 2) for each of these strings + * - Does it look like an Op? ('or', 'and' etc.) --> Op + * - Otherwise: treat as a Basic field ([field]:value) + * - Whitespace in value? -> promote to Phrase + * - otherwise: + * - Is value a regex (in //) -> promote to Regex + * - Is value a wildcard (ends in '*') -> promote to Wildcard + * - is value a range (a..b) -> promote to Range + * + * After analysis, we have the sequence of element as a Sexp, which can then be + * fed to the parser. We attempt to make the Sexp as human-readable as possible. 
+ */ +struct Element { + enum struct Bracket { Open, Close} ; + enum struct Op { And, Or, Xor, Not, AndNot }; + + template + struct FieldValue { + FieldValue(const ValueType& v): field{}, value{v}{} + + template + FieldValue(const StringType& fname, const ValueType& v): + field{std::string{fname}}, value{v}{} + template + FieldValue(const Option& fname, const ValueType& v) { + if (fname) + field = std::string{*fname}; + value = v; + } + + Option field{}; + ValueType value{}; + }; + struct Basic: public FieldValue {using FieldValue::FieldValue;}; + struct Phrase: public FieldValue {using FieldValue::FieldValue;}; + struct Regex: public FieldValue {using FieldValue::FieldValue;}; + struct Wildcard: public FieldValue {using FieldValue::FieldValue;}; + struct Range: public FieldValue> { + using FieldValue::FieldValue; }; + + using ValueType = std::variant< + /* */ + Bracket, + /* op */ + Op, + /* string values */ + std::string, + /* value types */ + Basic, + Phrase, + Regex, + Wildcard, + Range + >; + + // helper + template + struct decay_equiv: + std::is_same::type, U>::type {}; + + Element(Bracket b): value{b} {} + Element(Op op): value{op} {} + + template, T>::value>::type = 0> + Element(const std::string& field, const T& val): value{T{field, val}} {} + + Element(const std::string& val): value{val} {} + + template + Option get_opt() { + if (std::holds_alternative(value)) + return std::get(value); + else + return Nothing; + } + + Sexp sexp() const { + return std::visit([](auto&& arg)->Sexp { + + auto field_sym = [](const Option& field) { + return field ? 
Sexp::Symbol{*field} : placeholder_sym; + }; + + using T = std::decay_t; + + if constexpr (std::is_same_v) { + switch(arg) { + case Bracket::Open: + return open_sym; + case Bracket::Close: + return close_sym; + default: + throw std::logic_error("invalid bracket type"); + } + } else if constexpr (std::is_same_v) { + switch(arg) { + case Op::And: + return and_sym; + case Op::Or: + return or_sym; + case Op::Xor: + return xor_sym; + case Op::Not: + return not_sym; + case Op::AndNot: + return and_not_sym; + default: + throw std::logic_error("invalid op type"); + } + } else if constexpr (std::is_same_v) { + return Sexp { field_sym(arg.field), arg.value }; + } else if constexpr (std::is_same_v) { + return Sexp {field_sym(arg.field), + Sexp{ phrase_sym, arg.value }}; + } else if constexpr (std::is_same_v) { + return Sexp { field_sym(arg.field), Sexp{ regex_sym, arg.value}}; + } else if constexpr (std::is_same_v) { + return Sexp { field_sym(arg.field), Sexp{ wildcard_sym, arg.value}}; + } else if constexpr (std::is_same_v) { + return Sexp {field_sym(arg.field), + Sexp{ range_sym, arg.value.first, arg.value.second }}; + } else if constexpr (std::is_same_v) { + throw std::logic_error("no bare strings should be here"); + } else + throw std::logic_error("uninvited visitor"); + }, value); + } + + ValueType value; +}; + +using Elements = std::vector; + + + +/** + * Remove first character from string and return it. + * + * @param[in,out] str a string + * @param[in,out] pos position in _original_ string + * + * @return a char or 0 if there is none. 
+ */ +static char +read_char(std::string& str, size_t& pos) +{ + if (str.empty()) + return {}; + + auto kar{str.at(0)}; + str.erase(0, 1); + ++pos; + + return kar; +} + +/** + * Restore kar at the beginning of the string + * + * @param[in,out] str a string + * @param[in,out] pos position in _original_ string + * @param kar a character + */ +static void +unread_char(std::string& str, size_t& pos, char kar) +{ + str = kar + str; + --pos; +} + + +/** + * Remove the next element from the string and return it + * + * @param[in,out] str a string + * @param[in,out] pos position in _original_ string + * + * @return an Element or Nothing + */ +static Option +next_element(std::string& str, size_t& pos) +{ + bool quoted{}, escaped{}; + std::string value{}; + + auto is_separator = [](char c) { return c == ' '|| c == '(' || c == ')'; }; + + while (!str.empty()) { + + auto kar = read_char(str, pos); + + /* a backslash toggles the escape state; the backslash that starts + * an escape is dropped, an escaped backslash is kept */ + if (kar == '\\') { + escaped = !escaped; + if (escaped) + continue; + } + + /* an unescaped quote starts a quoted section or, when already in + * one, ends the element */ + if (kar == '"' && !escaped) { + if (!escaped && quoted) + return Element{value}; + else { + quoted = true; + continue; + } + } + + /* outside quotes, an unescaped separator ends a pending value (the + * separator is pushed back for the next call); without a pending + * value, blanks are skipped and brackets become elements */ + if (!quoted && !escaped && is_separator(kar)) { + if (!value.empty()) { + unread_char(str, pos, kar); + return Element{value}; + } + + if (quoted || kar == ' ') + continue; + + switch (kar) { + case '(': + return Element{Element::Bracket::Open}; + case ')': + return Element{Element::Bracket::Close}; + default: + break; + } + } + + value += kar; + escaped = false; + } + + if (value.empty()) + return Nothing; + else + return Element{value}; +} + + +/* + * A bare string that (after utf8_flatten()) names one of the operators below + * becomes an Op element; any other element is passed on unchanged. + */ +static Option +opify(Element&& element) +{ + auto&& str{element.get_opt()}; + if (!str) + return element; + + static const std::unordered_map ops = { + { "and", Element::Op::And }, + { "or", Element::Op::Or}, + { "xor", Element::Op::Xor }, + { "not", Element::Op::Not }, + // AndNot only appears during parsing.
+ }; + + if (auto&& it = ops.find(utf8_flatten(*str)); it != ops.end()) + element.value = it->second; + + return element; +} + +/* + * Turn a bare string into a Basic element. + * + * Without a ':' the whole string is the value of an unnamed field. Otherwise + * the part before the first ':' is tried as a field name: for a known field + * the field's own name is used with the rest as the value (flag and priority + * values are replaced by the names reported by flag_info() / + * priority_name()); a combi-field name is kept as-is. In all other cases -- + * including unknown flag or priority values -- the whole string becomes the + * value of an unnamed field. + */ +static Option +basify(Element&& element) +{ + auto&& str{element.get_opt()}; + if (!str) + return element; + + const auto pos = str->find(':'); + if (pos == std::string::npos) { + element.value = Element::Basic{*str}; + return element; + } + + const auto fname{str->substr(0, pos)}; + if (auto&& field{field_from_name(fname)}; field) { + auto val{str->substr(pos + 1)}; + if (field == Field::Id::Flags) { + if (auto&& finfo{flag_info(val)}; finfo) + element.value = Element::Basic{field->name, std::string{finfo->name}}; + else /* unknown flag: must still become a Basic, a bare string + * left behind makes Element::sexp() throw */ + element.value = Element::Basic{*str}; + } else if (field == Field::Id::Priority) { + if (auto&& prio{priority_from_name(val)}; prio) + element.value = Element::Basic{field->name, + std::string{priority_name(*prio)}}; + else + element.value = Element::Basic{*str}; + } else + element.value = Element::Basic{std::string{field->name}, + str->substr(pos + 1)}; + } else if (field_is_combi(fname)) + element.value = Element::Basic{fname, str->substr(pos +1)}; + else + element.value = Element::Basic{*str}; + + return element; +} + +/* promote a Basic element whose value contains a space to a Phrase */ +static Option +phrasify(Element&& element) +{ + auto&& basic{element.get_opt()}; + if (!basic) + return element; + + auto&& val{basic->value}; + if (val.find(' ') != std::string::npos) + element.value = Element::Phrase{basic->field, val}; + + return element; +} + + +/* promote a Basic element whose value has at least 2 characters and ends in + * '*' to a Wildcard, without the trailing '*' */ +static Option +wildcardify(Element&& element) +{ + auto&& basic{element.get_opt()}; + if (!basic) + return element; + + auto&& val{basic->value}; + if (val.size() < 2 || val[val.size()-1] != '*') + return element; + + val.erase(val.size() - 1); + element.value = Element::Wildcard{basic->field, val}; + + return element; +} + +/* promote a Basic element whose value has at least 3 characters and is + * enclosed in slashes to a Regex, without the slashes */ +static Option +regexpify(Element&& element) +{ + auto&& str{element.get_opt()}; + if (!str) + return element; + + auto&& val{str->value}; + if (val.size() < 3 || val[0] != '/' || val[val.size()-1] != '/') + return element; + +
val.erase(val.size() - 1); + val.erase(0, 1); + element.value = Element::Regex{str->field, std::move(val)}; + + return element; +} + +// handle range-fields: Size, Date, Changed +static Option +rangify(Element&& element) +{ + auto&& str{element.get_opt()}; + if (!str) + return element; + + if (!str->field) + return element; + + auto&& field = field_from_name(*str->field); + if (!field || !field->is_range()) + return element; + + /* yes: get the range */ + auto&& range = std::invoke([&]()->std::pair { + const auto val{str->value}; + const auto pos{val.find("..")}; + + if (pos == std::string::npos) + return { val, val }; + else + return {val.substr(0, pos), val.substr(pos + 2)}; + }); + + if (field->id == Field::Id::Size) { + int64_t s1{range.first.empty() ? -1 : + parse_size(range.first, false/*first*/).value_or(-1)}; + int64_t s2{range.second.empty() ? -1 : + parse_size(range.second, true/*last*/).value_or(-1)}; + if (s2 >= 0 && s1 > s2) + std::swap(s1, s2); + element.value = Element::Range{str->field, + {s1 < 0 ? "" : std::to_string(s1), + s2 < 0 ? "" : std::to_string(s2)}}; + + } else if (field->id == Field::Id::Date || field->id == Field::Id::Changed) { + auto tstamp=[](auto&& str, auto&& first)->int64_t { + return str.empty() ? -1 : + parse_date_time(str, first ,false/*local*/).value_or(-1); + }; + int64_t lower{tstamp(range.first, true/*lower*/)}; + int64_t upper{tstamp(range.second, false/*upper*/)}; + if (lower >= 0 && upper >= 0 && lower > upper) { + // can't simply swap due to rounding up/down + lower = tstamp(range.second, true/*lower*/); + upper = tstamp(range.first, false/*upper*/); + } + // use "Zulu" time. + element.value = Element::Range{ + str->field, + {lower < 0 ? "" : + mu_format("{:%FT%TZ}",mu_time(lower, true/*utc*/)), + upper < 0 ? 
"" : + mu_format("{:%FT%TZ}", mu_time(upper, true/*utc*/))}}; + } + + return element; +} + +static Elements +process(const std::string& expr) +{ + Elements elements{}; + size_t offset{0}; + + /* all control chars become SPC */ + std::string str{expr}; + for (auto& c: str) + c = ::iscntrl(c) ? ' ' : c; + + while(!str.empty()) { + auto&& element = next_element(str, offset) + .and_then(opify) + .and_then(basify) + .and_then(regexpify) + .and_then(phrasify) + .and_then(wildcardify) + .and_then(rangify); + if (element) + elements.emplace_back(std::move(element.value())); + } + + return elements; +} + +Sexp +Mu::process_query(const std::string& expr) +{ + const auto& elements{::process(expr)}; + + Sexp sexp{}; + for (auto&& elm: elements) + sexp.add(elm.sexp()); + + return sexp; +} + +#ifdef BUILD_PROCESS_QUERY +int +main (int argc, char *argv[]) +{ + if (argc < 2) { + mu_printerrln("expected: process-query "); + return 1; + } + + std::string expr; + for (auto i = 1; i < argc; ++i) { + expr += argv[i]; + expr += " "; + } + + auto sexp = process_query(expr); + mu_println("{}", sexp.to_string()); + + return 0; +} +#endif /*BUILD_ANALYZE_QUERY*/ + +#if BUILD_TESTS +/* + * Tests. + * + */ + +#include "utils/mu-test-utils.hh" + +using TestCase = std::pair; + +static void +test_processor() +{ + std::vector cases = { + TestCase{R"(hello world)", R"(((_ "hello") (_ "world")))"}, + TestCase{R"("hello world")", R"(((_ (phrase "hello world"))))"}, + TestCase{R"(subject:"hello world")", R"(((subject (phrase "hello world"))))"}, + // TODO: add more... 
+ }; + + for (auto&& test: cases) { + auto&& sexp{process_query(test.first)}; + assert_equal(sexp.to_string(), test.second); + } +} + + + +int +main(int argc, char* argv[]) +{ + mu_test_init(&argc, &argv); + + g_test_add_func("/query-parser/processor", test_processor); + + return g_test_run(); +} + +#endif /*BUILD_TESTS*/ diff --git a/lib/mu-query-xapianizer.cc b/lib/mu-query-xapianizer.cc new file mode 100644 index 00000000..ff541682 --- /dev/null +++ b/lib/mu-query-xapianizer.cc @@ -0,0 +1,468 @@ +/* +** Copyright (C) 2023 Dirk-Jan C. Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +** +*/ + +#include "config.h" +#include "mu-query-parser.hh" + +#include +#include +#include +#include +#include + +#include "utils/mu-option.hh" +#include +#include "utils/mu-utils-file.hh" + +using namespace Mu; + +// backward compat +#ifndef HAVE_XAPIAN_FLAG_NGRAMS +#define FLAG_NGRAMS FLAG_CJK_NGRAM +#endif /*HAVE_XAPIAN_FLAG_NGRAMS*/ + +/** + * Expand terms for scripts without explicit word-breaks (e.g. + * Chinese/Japanese/Korean) in the way that Xapian expects it - + * use Xapian's built-in QueryParser just for that. 
+ */ +static Result +ngram_expand(const Field& field, const std::string& str) +{ + Xapian::QueryParser qp; + const auto pfx{std::string(1U, field.xapian_prefix())}; + + qp.set_default_op(Xapian::Query::OP_OR); + + return qp.parse_query(str, Xapian::QueryParser::FLAG_NGRAMS, pfx); +} + + +static Option +tail(Sexp&& s) +{ + if (!s.listp() || s.empty()) + return Nothing; + + s.list().erase(s.list().begin(), s.list().begin() + 1); + + return s; +} + +Option +head_symbol(const Sexp& s) +{ + if (!s.listp() || s.empty() || !s.head() || !s.head()->symbolp()) + return Nothing; + + return s.head()->symbol().name; +} + + +Option +string_nth(const Sexp& args, size_t n) +{ + if (!args.listp() || args.size() < n + 1) + return Nothing; + + if (auto&& item{args.list().at(n)}; !item.stringp()) + return Nothing; + else + return item.string(); +} + +static Result +phrase(const Field& field, Sexp&& s) +{ + if (!field.is_indexable_term()) + return Err(Error::Code::InvalidArgument, + "field {} does not support phrases", field.name); + + if (s.size() == 1 && s.front().stringp()) { + auto&& words{split(s.front().string(), " ")}; + std::vector phvec; + phvec.reserve(words.size()); + for(auto&& w: words) + phvec.emplace_back(Xapian::Query{field.xapian_term(std::move(w))}); + return Xapian::Query{Xapian::Query::OP_PHRASE, + phvec.begin(), phvec.end()}; + } else + return Err(Error::Code::InvalidArgument, + "invalid phrase for field {}: '{}'", field.name, s.to_string()); +} + +static Result +regex(const Store& store, const Field& field, const std::string& rx_str) +{ + auto&& str{utf8_flatten(rx_str)}; + auto&& rx{Regex::make(str, G_REGEX_OPTIMIZE)}; + if (!rx) { + mu_warning("invalid regexp: '{}': {}", str, rx.error().what()); + return Xapian::Query::MatchNothing; + } + + std::vector rxvec; + store.for_each_term(field.id, [&](auto&& str) { + if (auto&& val{str.data() + 1}; rx->matches(val)) + rxvec.emplace_back(field.xapian_term(std::string_view{val})); + return true; + }); + + return 
Xapian::Query(Xapian::Query::OP_OR, rxvec.begin(), rxvec.end()); +} + + + +static Result +range(const Field& field, Sexp&& s) +{ + auto&& r0{string_nth(s, 0)}; + auto&& r1{string_nth(s, 1)}; + if (!r0 || !r1) + return Err(Error::Code::InvalidArgument, "expected 2 range values"); + + // in the sexp, we use iso date/time for human readability; now convert to + // time_t + auto iso_to_lexnum=[](const std::string& s)->Option { + if (s.empty()) + return s; + if (auto&& t{parse_date_time(s, true, true/*utc*/)}; !t) + return Nothing; + else + return to_lexnum(*t); + }; + + if (field == Field::Id::Date || field == Field::Id::Changed) { + // iso -> time_t + r0 = iso_to_lexnum(*r0); + r1 = iso_to_lexnum(*r1); + } else if (field == Field::Id::Size) { + if (!r0->empty()) + r0 = to_lexnum(::atoll(r0->c_str())); + if (!r1->empty()) + r1 = to_lexnum(::atoll(r1->c_str())); + } else + return Err(Error::Code::InvalidArgument, + "unsupported range field {}", field.name); + + if (r0->empty() && r1->empty()) + return Xapian::Query::MatchAll; + else if (r0->empty() && !r1->empty()) + return Xapian::Query(Xapian::Query::OP_VALUE_LE, + field.value_no(), *r1); + else if (!r0->empty() && r1->empty()) + return Xapian::Query(Xapian::Query::OP_VALUE_GE, + field.value_no(), *r0); + else + return Xapian::Query(Xapian::Query::OP_VALUE_RANGE, + field.value_no(), *r0, *r1); +} + + + +using OpPair = std::pair; +static constexpr std::array LogOpPairs = {{ + { "and", Xapian::Query::OP_AND }, + { "or", Xapian::Query::OP_OR }, + { "xor", Xapian::Query::OP_XOR }, + { "not", Xapian::Query::OP_AND_NOT } + }}; + +static Option +find_log_op(const std::string& opname) +{ + for (auto&& p: LogOpPairs) + if (p.first == opname) + return p.second; + + return Nothing; +} + +static Result parse(const Store& store, Sexp&& s, Mu::ParserFlags flags); + +static Result +parse_logop(const Store& store, Xapian::Query::op op, Sexp&& args, Mu::ParserFlags flags) +{ + if (!args.listp() || args.empty()) + return 
Err(Error::Code::InvalidArgument, + "expected non-empty list but got", args.to_string()); + + std::vector qs; + for (auto&& elm: args.list()) { + if (auto&& q{parse(store, std::move(elm), flags)}; !q) + return Err(std::move(q.error())); + else + qs.emplace_back(std::move(*q)); + } + + switch(op) { + case Xapian::Query::OP_AND_NOT: + if (qs.size() != 1) + return Err(Error::Code::InvalidArgument, + "expected single argument for NOT"); + else + return Xapian::Query{op, Xapian::Query::MatchAll, qs.at(0)}; + + case Xapian::Query::OP_AND: + case Xapian::Query::OP_OR: + case Xapian::Query::OP_XOR: + return Xapian::Query(op, qs.begin(), qs.end()); + + default: + return Err(Error::Code::InvalidArgument, "unexpected xapian op"); + } +} + + +static Result +parse_field_matcher(const Store& store, const Field& field, + const std::string& match_sym, Sexp&& args) +{ + auto&& str0{string_nth(args, 0)}; + + if (match_sym == wildcard_sym.name && str0) + return Xapian::Query{Xapian::Query::OP_WILDCARD, + field.xapian_term(*str0)}; + else if (match_sym == range_sym.name && !!str0) + return range(field, std::move(args)); + else if (match_sym == regex_sym.name && !!str0) + return regex(store, field, *str0); + else if (match_sym == phrase_sym.name) + return phrase(field, std::move(args)); + + return Err(Error::Code::InvalidArgument, + "invalid field '{}'/'{}' matcher: {}", + field.name, match_sym, args.to_string()); +} + + +static Result parse_basic(const Field &field, Sexp &&vals, + Mu::ParserFlags flags) +{ + auto ngrams = any_of(flags & ParserFlags::SupportNgrams); + if (!vals.stringp()) + return Err(Error::Code::InvalidArgument, "expected string"); + + auto&& val{vals.string()}; + + switch (field.id) { + case Field::Id::Flags: + if (auto&& finfo{flag_info(val)}; finfo) + return Xapian::Query{field.xapian_term(finfo->shortcut_lower())}; + else + return Err(Error::Code::InvalidArgument, + "invalid flag '{}'", val); + case Field::Id::Priority: + if (auto&& prio{priority_from_name(val)}; 
prio) + return Xapian::Query{field.xapian_term(to_char(*prio))}; + else + return Err(Error::Code::InvalidArgument, + "invalid priority '{}'", val); + default: { + auto q{Xapian::Query{field.xapian_term(val)}}; + if (ngrams) { // special case: cjk; see if we can create an expanded query. + if (field.is_indexable_term() && contains_unbroken_script(val)) + if (auto&& ng{ngram_expand(field, val)}; ng) + return ng; + } + return q; + }} + +} + +static Result +parse(const Store& store, Sexp&& s, Mu::ParserFlags flags) +{ + auto&& headsym{head_symbol(s)}; + if (!headsym) + return Err(Error::Code::InvalidArgument, + "expected (symbol ...) but got {}", s.to_string()); + + // ie., something like (or|and| ... ....) + if (auto&& logop{find_log_op(*headsym)}; logop) { + if (auto&& args{tail(std::move(s))}; !args) + return Err(Error::Code::InvalidArgument, + "expected (logop ...) but got {}", + s.to_string()); + else + return parse_logop(store, *logop, std::move(*args), flags); + + } + // something like (field ...) + else if (auto&& field{field_from_name(*headsym)}; field) { + + auto&& rest{tail(std::move(s))}; + if (!rest || rest->empty()) + return Err(Error::Code::InvalidArgument, + "expected field-value or field-matcher"); + + auto&& matcher{rest->front()}; + // field-value: (field "value"); ensure "value" is there + if (matcher.stringp()) + return parse_basic(*field, std::move(matcher), flags); + + // otherwise, we expect a field-matcher, e.g. 
(field (phrase "a b c")) + // ensure the matcher is a list starting with a symbol + auto&& match_sym{head_symbol(matcher)}; + if (!match_sym) + return Err(Error::Code::InvalidArgument, + "expected field-matcher"); + + if (auto&& args{tail(std::move(matcher))}; !args) + return Err(Error::Code::InvalidArgument, "expected matcher arguments"); + else + return parse_field_matcher(store, *field, + *match_sym, std::move(*args)); + } + return Err(Error::Code::InvalidArgument, + "unexpected sexp {}", s.to_string()); +} + + +// parse the way Xapian's internal parser does it; for testing. +static Xapian::Query +xapian_query_classic(const std::string& expr, Mu::ParserFlags flags) +{ + Xapian::QueryParser xqp; + + // add prefixes + field_for_each([&](auto&& field){ + + if (!field.is_searchable()) + return; + + const auto prefix{std::string(1U, field.xapian_prefix())}; + std::vector names = { + std::string{field.name}, + std::string(1U, field.shortcut) + }; + if (!field.alias.empty()) + names.emplace_back(std::string{field.alias}); + + for (auto&& name: names) + xqp.add_prefix(name, prefix); + }); + + const auto xflags = std::invoke([&]() { + unsigned f = Xapian::QueryParser::FLAG_PHRASE | + Xapian::QueryParser::FLAG_BOOLEAN | + Xapian::QueryParser::FLAG_WILDCARD; + if (any_of(flags & ParserFlags::SupportNgrams)) { +#if HAVE_XAPIAN_FLAG_NGRAMS + f |= Xapian::QueryParser::FLAG_NGRAMS; +#else + f |= Xapian::QueryParser::FLAG_CJK_NGRAM; +#endif + } + return f; + }); + + xqp.set_default_op(Xapian::Query::OP_AND); + return xqp.parse_query(expr, xflags); +} + +Result +Mu::make_xapian_query(const Store& store, const std::string& expr, Mu::ParserFlags flags) noexcept +{ + if (any_of(flags & Mu::ParserFlags::XapianParser)) + return xapian_query_classic(expr, flags); + + return parse(store, Mu::parse_query(expr, true/*expand*/), flags); +} + + +#ifdef BUILD_XAPIANIZE_QUERY +int +main (int argc, char *argv[]) +{ + if (argc < 2) { + mu_printerrln("expected: parse-query "); + return 1; + } + 
+ auto store = Store::make(runtime_path(Mu::RuntimePath::XapianDb)); + if (!store) { + mu_printerrln("error: {}", store.error()); + return 2; + } + + std::string expr; + for (auto i = 1; i < argc; ++i) { + expr += argv[i]; + expr += " "; + } + + if (auto&& query{make_xapian_query(*store, expr)}; !query) { + mu_printerrln("error: {}", query.error()); + return 1; + } else { + mu_println("{}", query->get_description()); + return 0; + } +} +#endif /*BUILD_XAPIANIZE_QUERY*/ + +#if BUILD_TESTS +/* + * Tests. + * + */ + +#include "utils/mu-test-utils.hh" + +using TestCase = std::pair; + +static void +test_xapian() +{ + auto&& testhome{unwrap(make_temp_dir())}; + auto&& dbpath{runtime_path(RuntimePath::XapianDb, testhome)}; + auto&& store{unwrap(Store::make_new(dbpath, join_paths(testhome, "test-maildir")))}; + + std::vector cases = { + TestCase{R"(i:87h766tzzz.fsf@gnus.org)", R"(Query(I87h766tzzz.fsf@gnus.org))"}, + TestCase{R"(subject:foo to:bar)", R"(Query((Sfoo AND Tbar)))"}, + TestCase{R"(subject:"cuux*")", R"(Query(WILDCARD SYNONYM Scuux))"}, + TestCase{R"(subject:"hello world")", R"(Query((Shello PHRASE 2 Sworld)))"}, + TestCase{R"(subject:/boo/")", R"(Query())"}, + }; + + for (auto&& test: cases) { + auto&& xq{make_xapian_query(store, test.first)}; + assert_valid_result(xq); + + mu_println("'{}' <=> '{}'", xq->get_description(), test.second); + assert_equal(xq->get_description(), test.second); + } + + remove_directory(testhome); +} + +int +main(int argc, char* argv[]) +{ + mu_test_init(&argc, &argv); + + Xapian::QueryParser qp; + g_test_add_func("/query-parser/xapianizer", test_xapian); + + return g_test_run(); +} + +#endif /*BUILD_TESTS*/ diff --git a/lib/mu-query.cc b/lib/mu-query.cc index 97b1b71f..3518e573 100644 --- a/lib/mu-query.cc +++ b/lib/mu-query.cc @@ -32,15 +32,17 @@ #include "mu-query-results.hh" #include "mu-query-match-deciders.hh" #include "mu-query-threads.hh" -#include #include "mu-xapian-db.hh" +#include "mu-query-parser.hh" + using namespace 
Mu; struct Query::Private { - Private(const Store& store) : store_{store}, parser_{store_} {} - // New - // bool calculate_threads (Xapian::Enquire& enq, size maxnum); + Private(const Store& store) : + store_{store}, + parser_flags_{any_of(store_.message_options() & Message::Options::SupportNgrams) ? + ParserFlags::SupportNgrams : ParserFlags::None} {} Xapian::Enquire make_enquire(const std::string& expr, Field::Id sortfield_id, QueryFlags qflags) const; @@ -61,7 +63,7 @@ struct Query::Private { Field::Id sortfield_id, QueryFlags qflags, size_t maxnum) const; const Store& store_; - const Parser parser_; + const ParserFlags parser_flags_; }; Query::Query(const Store& store) : priv_{std::make_unique(store)} {} @@ -79,22 +81,27 @@ sort_enquire(Xapian::Enquire& enq, Field::Id sortfield_id, QueryFlags qflags) return enq; } +static Xapian::Query +make_query(const Store& store, const std::string& expr, ParserFlags parser_flags) +{ + if (expr.empty() || expr == R"("")") + return Xapian::Query::MatchAll; + else { + if (auto&& q{make_xapian_query(store, expr, parser_flags)}; !q) { + mu_warning("error in query '{}': {}", expr, q.error().what()); + return Xapian::Query::MatchNothing; + } else + return q.value(); + } +} + Xapian::Enquire Query::Private::make_enquire(const std::string& expr, Field::Id sortfield_id, QueryFlags qflags) const { auto enq{store_.xapian_db().enquire()}; - if (expr.empty() || expr == R"("")") - enq.set_query(Xapian::Query::MatchAll); - else { - WarningVec warns; - const auto tree{parser_.parse(expr, warns)}; - for (auto&& w : warns) - mu_warning("query warning: {}", to_string(w)); - enq.set_query(xapian_query(tree)); - } - + enq.set_query(make_query(store_, expr, parser_flags_)); sort_enquire(enq, sortfield_id, qflags); return enq; @@ -122,8 +129,7 @@ Query::Private::make_related_enquire(const StringSet& thread_ids, struct ThreadKeyMaker : public Xapian::KeyMaker { ThreadKeyMaker(const QueryMatches& matches) : match_info_(matches) {} - std::string 
operator()(const Xapian::Document& doc) const override - { + std::string operator()(const Xapian::Document& doc) const override { const auto it{match_info_.find(doc.get_docid())}; return (it == match_info_.end()) ? "" : it->second.thread_path; } @@ -257,10 +263,13 @@ Query::run(const std::string& expr, Field::Id sortfield_id, g_return_val_if_fail(none_of(qflags & QueryFlags::Leader), Err(Error::Code::InvalidArgument, "cannot pass Leader flag")); - StopWatch sw{mu_format( - "ran query '{}'; related: {}; threads: {}; max-size: {}", expr, - any_of(qflags & QueryFlags::IncludeRelated) ? "yes" : "no", - any_of(qflags & QueryFlags::Threading) ? "yes" : "no", maxnum)}; + StopWatch sw{ + mu_format("query: '{}'; (related:{}; threads:{}; ngrams:{}; max-size:{})", + expr, + any_of(qflags & QueryFlags::IncludeRelated) ? "yes" : "no", + any_of(qflags & QueryFlags::Threading) ? "yes" : "no", + any_of(priv_->parser_flags_ & ParserFlags::SupportNgrams) ? "yes" : "no", + maxnum == 0 ? std::string{"∞"} : std::to_string(maxnum))}; return xapian_try_result([&]{ if (auto&& res = priv_->run(expr, sortfield_id, qflags, maxnum); res) @@ -288,14 +297,10 @@ Query::count(const std::string& expr) const std::string Query::parse(const std::string& expr, bool xapian) const { - WarningVec warns; - const auto tree{priv_->parser_.parse(expr, warns)}; - for (auto&& w : warns) - mu_warning("query warning: {}", to_string(w)); - if (xapian) - return xapian_query(tree).get_description(); + return make_query(priv_->store_, expr, + priv_->parser_flags_).get_description(); else - return to_string(tree); + return parse_query(expr).to_string(); } /* LCOV_EXCL_STOP*/ diff --git a/lib/mu-query.hh b/lib/mu-query.hh index ad042e1b..ff216bd1 100644 --- a/lib/mu-query.hh +++ b/lib/mu-query.hh @@ -1,5 +1,5 @@ /* -** Copyright (C) 2008-2021 Dirk-Jan C. Binnema +** Copyright (C) 2008-2023 Dirk-Jan C. 
Binnema ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by diff --git a/lib/mu-store.cc b/lib/mu-store.cc index fe52114b..e5576278 100644 --- a/lib/mu-store.cc +++ b/lib/mu-store.cc @@ -70,7 +70,8 @@ struct Store::Private { : XapianDb::Flavor::Open)}, config_{xapian_db_}, contacts_cache_{config_}, - root_maildir_{remove_slash(config_.get())} + root_maildir_{remove_slash(config_.get())}, + message_opts_{make_message_options(config_)} {} Private(const std::string& path, const std::string& root_maildir, @@ -78,7 +79,8 @@ struct Store::Private { xapian_db_{XapianDb(path, XapianDb::Flavor::CreateOverwrite)}, config_{make_config(xapian_db_, root_maildir, conf)}, contacts_cache_{config_}, - root_maildir_{remove_slash(config_.get())} + root_maildir_{remove_slash(config_.get())}, + message_opts_{make_message_options(config_)} {} ~Private() try { @@ -133,6 +135,13 @@ struct Store::Private { return config; } + Message::Options make_message_options(const Config& conf) { + if (conf.get()) + return Message::Options::SupportNgrams; + else + return Message::Options::None; + } + Option find_message_unlocked(Store::Id docid) const; Store::IdVec find_duplicates_unlocked(const Store& store, const std::string& message_id) const; @@ -150,7 +159,8 @@ struct Store::Private { ContactsCache contacts_cache_; std::unique_ptr indexer_; - const std::string root_maildir_; + const std::string root_maildir_; + const Message::Options message_opts_; size_t transaction_size_{}; std::mutex lock_; @@ -340,6 +350,11 @@ Store::add_message(Message& msg, bool use_transaction, bool is_new) if (auto&& res = msg.set_maildir(mdir.value()); !res) return Err(res.error()); + // we shouldn't mix ngrams/non-ngrams messages. 
+ if (any_of(msg.options() & Message::Options::SupportNgrams) != + any_of(message_options() & Message::Options::SupportNgrams)) + return Err(Error::Code::InvalidArgument, "incompatible message options"); + /* add contacts from this message to cache; this cache * also determines whether those contacts are _personal_, i.e. match * our personal addresses. @@ -371,6 +386,16 @@ Store::add_message(Message& msg, bool use_transaction, bool is_new) return res; } +Result +Store::add_message(const std::string& path, bool use_transaction, bool is_new) +{ + if (auto msg{Message::make_from_path(path, priv_->message_opts_)}; !msg) + return Err(msg.error()); + else + return add_message(msg.value(), use_transaction, is_new); +} + + bool Store::remove_message(const std::string& path) { @@ -649,3 +674,9 @@ Store::maildirs() const return mdirs; } + +Message::Options +Store::message_options() const +{ + return priv_->message_opts_; +} diff --git a/lib/mu-store.hh b/lib/mu-store.hh index b36848ee..e67005e4 100644 --- a/lib/mu-store.hh +++ b/lib/mu-store.hh @@ -207,21 +207,7 @@ public: Result add_message(Message& msg, bool use_transaction = false, bool is_new = false); Result add_message(const std::string& path, bool use_transaction = false, - bool is_new = false) { - if (auto msg{Message::make_from_path(path)}; !msg) - return Err(msg.error()); - else - return add_message(msg.value(), use_transaction, is_new); - } - - /** - * Update a message in the store. - * - * @param msg a message - * @param id the id for this message - * - * @return Ok() or an error. - */ + bool is_new = false); /** * Remove a message from the store. 
It will _not_ remove the message @@ -258,7 +244,6 @@ public: */ Option find_message(Id id) const; - /** * Find the messages for the given ids * @@ -288,7 +273,6 @@ public: */ bool contains_message(const std::string& path) const; - /** * Options for moving * @@ -437,6 +421,15 @@ public: */ std::vector maildirs() const; + + /** + * Compatible message-options for this store + * + * @return message-options. + */ + Message::Options message_options() const; + + /* * _almost_ private */ @@ -474,6 +467,13 @@ private: MU_ENABLE_BITOPS(Store::Options); MU_ENABLE_BITOPS(Store::MoveOptions); +static inline std::string +format_as(const Store& store) +{ + return mu_format("store ({}/{})", format_as(store.xapian_db()), + store.root_maildir()); +} + } // namespace Mu #endif /* __MU_STORE_HH__ */ diff --git a/lib/mu-tokenizer.cc b/lib/mu-tokenizer.cc deleted file mode 100644 index 14b318b5..00000000 --- a/lib/mu-tokenizer.cc +++ /dev/null @@ -1,129 +0,0 @@ -/* -** Copyright (C) 2020 Dirk-Jan C. Binnema -** -** This library is free software; you can redistribute it and/or -** modify it under the terms of the GNU Lesser General Public License -** as published by the Free Software Foundation; either version 2.1 -** of the License, or (at your option) any later version. -** -** This library is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -** Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public -** License along with this library; if not, write to the Free -** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA -** 02110-1301, USA. 
-*/ - -#include "mu-tokenizer.hh" -#include "utils/mu-utils.hh" - -#include -#include -#include - -using namespace Mu; - -static bool -is_separator(char c) -{ - if (isblank(c)) - return true; - - const auto seps = std::string("()"); - return seps.find(c) != std::string::npos; -} - -static Mu::Token -op_or_value(size_t pos, const std::string& val) -{ - auto s = val; - std::transform(s.begin(), s.end(), s.begin(), ::tolower); - - if (s == "and") - return Token{pos, Token::Type::And, val}; - else if (s == "or") - return Token{pos, Token::Type::Or, val}; - else if (s == "xor") - return Token{pos, Token::Type::Xor, val}; - else if (s == "not") - return Token{pos, Token::Type::Not, val}; - else - return Token{pos, Token::Type::Data, val}; -} - -static void -unread_char(std::string& food, char kar, size_t& pos) -{ - food = kar + food; - --pos; -} - -static Mu::Token -eat_token(std::string& food, size_t& pos) -{ - bool quoted{}; - bool escaped{}; - std::string value{}; - - while (!food.empty()) { - const auto kar = food[0]; - food.erase(0, 1); - ++pos; - - if (kar == '\\') { - escaped = !escaped; - if (escaped) - continue; - } - - if (kar == '"') { - if (!escaped && quoted) - return Token{pos, Token::Type::Data, value}; - else { - quoted = true; - continue; - } - } - - if (!quoted && !escaped && is_separator(kar)) { - if (!value.empty() && kar != ':') { - unread_char(food, kar, pos); - return op_or_value(pos, value); - } - - if (quoted || isblank(kar)) - continue; - - switch (kar) { - case '(': return {pos, Token::Type::Open, "("}; - case ')': return {pos, Token::Type::Close, ")"}; - default: break; - } - } - - value += kar; - escaped = false; - } - - return {pos, Token::Type::Data, value}; -} - -Mu::Tokens -Mu::tokenize(const std::string& s) -{ - Tokens tokens{}; - - std::string food = utf8_clean(s); - size_t pos{0}; - - if (s.empty()) - return {}; - - while (!food.empty()) - tokens.emplace_back(eat_token(food, pos)); - - return tokens; -} diff --git a/lib/mu-tokenizer.hh 
b/lib/mu-tokenizer.hh deleted file mode 100644 index 7016e8b7..00000000 --- a/lib/mu-tokenizer.hh +++ /dev/null @@ -1,139 +0,0 @@ -/* -** Copyright (C) 2017 Dirk-Jan C. Binnema -** -** This library is free software; you can redistribute it and/or -** modify it under the terms of the GNU Lesser General Public License -** as published by the Free Software Foundation; either version 2.1 -** of the License, or (at your option) any later version. -** -** This library is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -** Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public -** License along with this library; if not, write to the Free -** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA -** 02110-1301, USA. -*/ - -#ifndef __TOKENIZER_HH__ -#define __TOKENIZER_HH__ - -#include -#include -#include -#include -#include - -// A simple tokenizer, which turns a string into a deque of tokens -// -// It recognizes '(', ')', '*' 'and', 'or', 'xor', 'not' -// -// Note that even if we recognizes those at the lexical level, they might be demoted to mere strings -// when we're creating the parse tree. -// -// Furthermore, we detect ranges ("a..b") and regexps (/../) at the parser level, since we need a -// bit more context to resolve ambiguities. 
- -namespace Mu { - -// A token -struct Token { - enum class Type { - Data, /**< e .g., banana or date:..456 */ - - // Brackets - Open, /**< ( */ - Close, /**< ) */ - - // Unops - Not, /**< logical not*/ - - // Binops - And, /**< logical and */ - Or, /**< logical not */ - Xor, /**< logical xor */ - - Empty, /**< nothing */ - }; - - size_t pos{}; /**< position in string */ - Type type{}; /**< token type */ - const std::string str{}; /**< data for this token */ - - /** - * operator== - * - * @param rhs right-hand side - * - * @return true if rhs is equal to this; false otherwise - */ - bool operator==(const Token& rhs) const - { - return pos == rhs.pos && type == rhs.type && str == rhs.str; - } -}; - -/** - * operator<< - * - * @param os an output stream - * @param t a token type - * - * @return the updated output stream - */ -inline std::ostream& -operator<<(std::ostream& os, Token::Type t) -{ - switch (t) { - case Token::Type::Data: os << ""; break; - - case Token::Type::Open: os << ""; break; - case Token::Type::Close: os << ""; break; - - case Token::Type::Not: os << ""; break; - case Token::Type::And: os << ""; break; - case Token::Type::Or: os << ""; break; - case Token::Type::Xor: os << ""; break; - case Token::Type::Empty: os << ""; break; - default: // can't happen, but pacify compiler - throw std::runtime_error("<>"); - } - - return os; -} - -/** - * operator<< - * - * @param os an output stream - * @param t a token - * - * @return the updated output stream - */ -inline std::ostream& -operator<<(std::ostream& os, const Token& t) -{ - os << t.pos << ": " << t.type; - - if (!t.str.empty()) - os << " [" << t.str << "]"; - - return os; -} - -/** - * Tokenize a string into a vector of tokens. The tokenization always succeeds, ie., ignoring errors - * such a missing end-". 
- * - * @param s a string - * - * @return a deque of tokens - */ -using Tokens = std::deque; -Tokens tokenize(const std::string& s); - -} // namespace Mu - -#endif /* __TOKENIZER_HH__ */ diff --git a/lib/mu-tree.hh b/lib/mu-tree.hh deleted file mode 100644 index 5c058905..00000000 --- a/lib/mu-tree.hh +++ /dev/null @@ -1,162 +0,0 @@ -/* -** Copyright (C) 2022 Dirk-Jan C. Binnema -** -** This library is free software; you can redistribute it and/or -** modify it under the terms of the GNU Lesser General Public License -** as published by the Free Software Foundation; either version 2.1 -** of the License, or (at your option) any later version. -** -** This library is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -** Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public -** License along with this library; if not, write to the Free -** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA -** 02110-1301, USA. -*/ - -#ifndef TREE_HH__ -#define TREE_HH__ - -#include -#include -#include -#include -#include - -#include -#include - -namespace Mu { - -struct FieldValue { - FieldValue(Field::Id idarg, const std::string valarg): - field_id{idarg}, val1{valarg} {} - FieldValue(Field::Id idarg, const std::string valarg1, const std::string valarg2): - field_id{idarg}, val1{valarg1}, val2{valarg2} {} - - const Field& field() const { return field_from_id(field_id); } - const std::string& value() const { return val1; } - const std::pair range() const { return { val1, val2 }; } - - const Field::Id field_id; - const std::string val1; - const std::string val2; - -}; - - -/** - * operator<< - * - * @param os an output stream - * @param fval a field value. 
- * - * @return the updated output stream - */ -inline std::ostream& -operator<<(std::ostream& os, const FieldValue& fval) -{ - os << ' ' << quote(std::string{fval.field().name}); - - if (fval.field().is_range()) - os << ' ' << quote(fval.range().first) - << ' ' << quote(fval.range().second); - else - os << ' ' << quote(fval.value()); - - return os; -} - -// A node in the parse tree -struct Node { - enum class Type { - Empty, // only for empty trees - OpAnd, - OpOr, - OpXor, - OpAndNot, - OpNot, - Value, - ValueAtomic, - Range, - Invalid - }; - - Node(Type _type, FieldValue&& fval) : type{_type}, field_val{std::move(fval)} {} - Node(Type _type) : type{_type} {} - Node(Node&& rhs) = default; - - Type type; - Option field_val; - - static constexpr std::string_view type_name(Type t) { - switch (t) { - case Type::Empty: - return ""; - case Type::OpAnd: - return "and"; - case Type::OpOr: - return "or"; - case Type::OpXor: - return "xor"; - case Type::OpAndNot: - return "andnot"; - case Type::OpNot: - return "not"; - case Type::Value: - return "value"; - case Type::ValueAtomic: - return "value_atomic"; - case Type::Range: - return "range"; - case Type::Invalid: - return ""; - default: - return ""; - } - } - - static constexpr bool is_binop(Type t) { - return t == Type::OpAnd || t == Type::OpAndNot || t == Type::OpOr || - t == Type::OpXor; - } -}; - -inline std::ostream& -operator<<(std::ostream& os, const Node& t) -{ - os << Node::type_name(t.type); - if (t.field_val) - os << t.field_val.value(); - - return os; -} - -struct Tree { - Tree(Node&& _node) : node(std::move(_node)) {} - Tree(Tree&& rhs) = default; - - void add_child(Tree&& child) { children.emplace_back(std::move(child)); } - bool empty() const { return node.type == Node::Type::Empty; } - - Node node; - std::vector children; -}; - -inline std::ostream& -operator<<(std::ostream& os, const Tree& tree) -{ - os << '(' << tree.node; - for (const auto& subtree : tree.children) - os << subtree; - os << ')'; - - 
return os; -} - -} // namespace Mu - -#endif /* TREE_HH__ */ diff --git a/lib/mu-xapian-db.cc b/lib/mu-xapian-db.cc index ccda245b..1d5a864a 100644 --- a/lib/mu-xapian-db.cc +++ b/lib/mu-xapian-db.cc @@ -101,4 +101,6 @@ XapianDb::XapianDb(const std::string& db_path, Flavor flavor) : if (flavor == Flavor::CreateOverwrite) set_timestamp(MetadataIface::created_key); + + mu_debug("created {} / {}", flavor, *this); } diff --git a/lib/mu-xapian-db.hh b/lib/mu-xapian-db.hh index 49b00ed1..64a24f90 100644 --- a/lib/mu-xapian-db.hh +++ b/lib/mu-xapian-db.hh @@ -192,6 +192,16 @@ public: */ const std::string& path() const; + /** + * Get a description of the Xapian database + * + * @return description + */ + const std::string description() const { + return db().get_description(); + } + + /** * Get the number of documents (messages) in the database * @@ -399,6 +409,27 @@ private: DbType db_; }; +constexpr std::string_view +format_as(XapianDb::Flavor flavor) +{ + switch(flavor) { + case XapianDb::Flavor::CreateOverwrite: + return "create-overwrite"; + case XapianDb::Flavor::Open: + return "open"; + case XapianDb::Flavor::ReadOnly: + return "read-only"; + default: + return "??"; + } +} + +static inline std::string +format_as(const XapianDb& db) +{ + return mu_format("{} @ {}", db.description(), db.path()); +} + } // namespace Mu #endif /* MU_XAPIAN_DB_HH__ */ diff --git a/lib/mu-xapian.cc b/lib/mu-xapian.cc deleted file mode 100644 index 19b3d3c3..00000000 --- a/lib/mu-xapian.cc +++ /dev/null @@ -1,139 +0,0 @@ -/* -** Copyright (C) 2017-2022 Dirk-Jan C. Binnema -** -** This library is free software; you can redistribute it and/or -** modify it under the terms of the GNU Lesser General Public License -** as published by the Free Software Foundation; either version 2.1 -** of the License, or (at your option) any later version. 
-** -** This library is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -** Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public -** License along with this library; if not, write to the Free -** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA -** 02110-1301, USA. -*/ - -#include - -#include -#include "mu-xapian.hh" -#include - -using namespace Mu; - -static Xapian::Query -xapian_query_op(const Mu::Tree& tree) -{ - if (tree.node.type == Node::Type::OpNot) { // OpNot x ::= AND NOT x - if (tree.children.size() != 1) - throw std::runtime_error("invalid # of children"); - return Xapian::Query(Xapian::Query::OP_AND_NOT, - Xapian::Query::MatchAll, - xapian_query(tree.children.front())); - } - - const auto op = std::invoke([](Node::Type ntype) { -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wswitch-enum" - switch (ntype) { - case Node::Type::OpAnd: - return Xapian::Query::OP_AND; - case Node::Type::OpOr: - return Xapian::Query::OP_OR; - case Node::Type::OpXor: - return Xapian::Query::OP_XOR; - case Node::Type::OpAndNot: - return Xapian::Query::OP_AND_NOT; - case Node::Type::OpNot: - default: - throw Mu::Error(Error::Code::Internal, "invalid op"); // bug - } -#pragma GCC diagnostic pop - }, tree.node.type); - - std::vector childvec; - for (const auto& subtree : tree.children) - childvec.emplace_back(xapian_query(subtree)); - - return Xapian::Query(op, childvec.begin(), childvec.end()); -} - -static Xapian::Query -make_query(const FieldValue& fval, bool maybe_wildcard) -{ - const auto vlen{fval.value().length()}; - if (!maybe_wildcard || vlen <= 1 || fval.value()[vlen - 1] != '*') - return Xapian::Query(fval.field().xapian_term(fval.value())); - else - return Xapian::Query(Xapian::Query::OP_WILDCARD, - 
fval.field().xapian_term(fval.value().substr(0, vlen - 1))); -} - -static Xapian::Query -xapian_query_value(const Mu::Tree& tree) -{ - // indexable field implies it can be use with a phrase search. - const auto& field_val{tree.node.field_val.value()}; - if (!field_val.field().is_indexable_term()) { // - /* not an indexable field; no extra magic needed*/ - return make_query(field_val, true /*maybe-wildcard*/); - } - - const bool is_atomic = tree.node.type == Node::Type::ValueAtomic; - - const auto parts{split(field_val.value(), " ")}; - if (parts.empty()) - return Xapian::Query::MatchNothing; // shouldn't happen - else if (parts.size() == 1 && !is_atomic) - return make_query(field_val, true /*maybe-wildcard*/); - else if (is_atomic) - return make_query(field_val, false /*maybe-wildcard*/); - - std::vector phvec; - for (const auto& p : parts) { - FieldValue fv{field_val.field_id, p}; - phvec.emplace_back(make_query(fv, false /*no wildcards*/)); - } - - return Xapian::Query(Xapian::Query::OP_PHRASE, phvec.begin(), phvec.end()); -} - -static Xapian::Query -xapian_query_range(const Mu::Tree& tree) -{ - const auto& field_val{tree.node.field_val.value()}; - - return Xapian::Query(Xapian::Query::OP_VALUE_RANGE, - field_val.field().value_no(), - field_val.range().first, - field_val.range().second); -} - -Xapian::Query -Mu::xapian_query(const Mu::Tree& tree) -{ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wswitch-enum" - switch (tree.node.type) { - case Node::Type::Empty: - return Xapian::Query(); - case Node::Type::OpNot: - case Node::Type::OpAnd: - case Node::Type::OpOr: - case Node::Type::OpXor: - case Node::Type::OpAndNot: - return xapian_query_op(tree); - case Node::Type::Value: - case Node::Type::ValueAtomic: - return xapian_query_value(tree); - case Node::Type::Range: - return xapian_query_range(tree); - default: - throw Mu::Error(Error::Code::Internal, "invalid query"); // bug - } -#pragma GCC diagnostic pop -} diff --git a/lib/mu-xapian.hh 
b/lib/mu-xapian.hh deleted file mode 100644 index 54ee006d..00000000 --- a/lib/mu-xapian.hh +++ /dev/null @@ -1,39 +0,0 @@ -/* -** Copyright (C) 2022 Dirk-Jan C. Binnema -** -** This library is free software; you can redistribute it and/or -** modify it under the terms of the GNU Lesser General Public License -** as published by the Free Software Foundation; either version 2.1 -** of the License, or (at your option) any later version. -** -** This library is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -** Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public -** License along with this library; if not, write to the Free -** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA -** 02110-1301, USA. -*/ - -#ifndef MU_XAPIAN_HH__ -#define MU_XAPIAN_HH__ - -#include -#include - -namespace Mu { - -/** - * Transform a parse-tree into a Xapian query object - * - * @param tree a parse tree - * - * @return a Xapian query object - */ -Xapian::Query xapian_query(const Mu::Tree& tree); - -} // namespace Mu - -#endif /* MU_XAPIAN_H__ */ diff --git a/lib/tests/meson.build b/lib/tests/meson.build index 17a9b726..5427fbb3 100644 --- a/lib/tests/meson.build +++ b/lib/tests/meson.build @@ -19,42 +19,30 @@ # test('test-maildir', executable('test-maildir', - 'test-mu-maildir.cc', - install: false, - dependencies: [glib_dep, lib_mu_dep])) + 'test-mu-maildir.cc', + install: false, + dependencies: [glib_dep, lib_mu_dep])) test('test-msg', executable('test-msg', - 'test-mu-msg.cc', - install: false, - dependencies: [glib_dep, lib_mu_dep])) + 'test-mu-msg.cc', + install: false, + dependencies: [glib_dep, lib_mu_dep])) test('test-store', executable('test-store', - 'test-mu-store.cc', - install: false, - dependencies: [glib_dep, lib_mu_dep])) + 'test-mu-store.cc', + install: 
false, + dependencies: [glib_dep, lib_mu_dep])) test('test-query', executable('test-query', - 'test-query.cc', - install: false, - dependencies: [glib_dep, gmime_dep, lib_mu_dep])) - -test('test-tokenizer', - executable('test-tokenizer', - 'test-tokenizer.cc', - install: false, - dependencies: [glib_dep, lib_mu_dep])) - -test('test-parser', - executable('test-parser', - 'test-parser.cc', - install: false, - dependencies: [glib_dep, gmime_dep, lib_mu_dep])) + 'test-query.cc', + install: false, + dependencies: [glib_dep, gmime_dep, lib_mu_dep])) test('test-store-query', executable('test-store-query', - 'test-mu-store-query.cc', - install: false, - dependencies: [glib_dep, gmime_dep, lib_mu_dep])) + 'test-mu-store-query.cc', + install: false, + dependencies: [glib_dep, gmime_dep, lib_mu_dep])) # # benchmarks # diff --git a/lib/tests/test-mu-store-query.cc b/lib/tests/test-mu-store-query.cc index 4a8f632a..3052b03b 100644 --- a/lib/tests/test-mu-store-query.cc +++ b/lib/tests/test-mu-store-query.cc @@ -1,5 +1,5 @@ /* -** Copyright (C) 2022 Dirk-Jan C. Binnema +** Copyright (C) 2022-2023 Dirk-Jan C. 
Binnema ** ** This program is free software; you can redistribute it and/or modify it ** under the terms of the GNU General Public License as published by the @@ -29,9 +29,12 @@ #include #include #include +#include #include #include +#include "mu-query-parser.hh" + using namespace Mu; @@ -40,7 +43,7 @@ using TestMap = std::unordered_map; static Store make_test_store(const std::string& test_path, const TestMap& test_map, - const StringVec &personal_addresses) + Option conf={}) { std::string maildir = test_path + "/Maildir/"; // note the trailing '/' @@ -49,12 +52,11 @@ make_test_store(const std::string& test_path, const TestMap& test_map, /* write messages to disk */ for (auto&& item: test_map) { - const auto msgpath = maildir + "/" + item.first; - /* create the directory for the message */ + const auto msgpath{join_paths(maildir, item.first)}; auto dir = to_string_gchar(g_path_get_dirname(msgpath.c_str())); if (g_test_verbose()) - g_message("create message dir %s", dir.c_str()); + mu_message("create maildir {}", dir.c_str()); g_assert_cmpuint(g_mkdir_with_parents(dir.c_str(), 0700), ==, 0); @@ -65,11 +67,6 @@ make_test_store(const std::string& test_path, const TestMap& test_map, stream.close(); } - /* make the store */ - MemDb mdb; - Config conf{mdb}; - conf.set(personal_addresses); - auto store = Store::make_new(test_path, maildir, conf); assert_valid_result(store); @@ -90,7 +87,6 @@ make_test_store(const std::string& test_path, const TestMap& test_map, return std::move(store.value()); } - static void test_simple() { @@ -161,7 +157,9 @@ I said: "Aujourd'hui!" 
}) { if (g_test_verbose()) - g_message("query: '%s'", expr); + mu_message("query: '{}'\n", expr, + make_xapian_query(store, expr)->get_description()); + auto qr = store.run_query(expr); assert_valid_result(qr); g_assert_false(qr->empty()); @@ -644,7 +642,8 @@ test_term_split() // Note the fancy quote in "foo’s bar" const TestMap test_msgs = {{ "inbox/new/msg", - { R"(Message-Id: + { +R"(Message-Id: From: "Foo Example" Date: Wed, 26 Oct 2022 11:01:54 -0700 To: example@example.com @@ -657,17 +656,57 @@ Boo! TempDir tdir; auto store{make_test_store(tdir.path(), test_msgs, {})}; /* true: match; false: no match */ - const auto cases = std::array, 6>{{ + const auto cases = std::array, 7>{{ {"subject:foo's", true}, {"subject:foo*", true}, {"subject:/foo/", true}, {"subject:/foo’s/", true}, /* <-- breaks before PR #2365 */ {"subject:/foo.*bar/", true}, /* <-- breaks before PR #2365 */ - {"subject:/foo’s bar/", false}, /* <-- no matching yet */ + {"subject:/foo’s bar/", false}, /* <-- no matching, needs quoting */ + {"subject:\"/foo’s bar/\"", true}, /* <-- this works, quote the regex */ }}; for (auto&& test: cases) { - g_debug("query: %s", test.first); + mu_debug("query: '{}'", test.first); + auto qr = store.run_query(test.first); + assert_valid_result(qr); + if (test.second) + g_assert_cmpuint(qr->size(), ==, 1); + else + g_assert_true(qr->empty()); + } +} + +static void +test_subject_kata_containers() +{ + g_test_bug("2167"); + + // Note the fancy quote in "foo’s bar" + const TestMap test_msgs = {{ + "inbox/new/msg", + { +R"(Message-Id: +From: "Foo Example" +Date: Wed, 26 Oct 2022 11:01:54 -0700 +To: example@example.com +Subject: kata-containers + +Boo! 
+)"}, + }}; + + TempDir tdir; + auto store{make_test_store(tdir.path(), test_msgs, {})}; + /* true: match; false: no match */ + const auto cases = std::array, 3>{{ + {"subject:kata", true}, + {"subject:containers", true}, + {"subject:kata-containers", true} + }}; + + for (auto&& test: cases) { + mu_debug("query: '{}'", test.first); auto qr = store.run_query(test.first); assert_valid_result(qr); if (test.second) @@ -776,15 +815,75 @@ html assert_valid_result(qr); g_assert_cmpuint(qr->size(), ==, 1); } - } +static void +test_cjk() +{ + g_test_bug("2167"); + + // Note the fancy quote in "foo’s bar" + const TestMap test_msgs = {{ + "inbox/new/msg", + { +R"(From: "Bob" +Subject: スポンサーシップ募集 +To: "Chase" +Message-Id: 112342343e9dfo.fsf@builder.com + + 中文 + +https://trac.xapian.org/ticket/719 + + サーバがダウンしました +)"}}}; + + MemDb mdb; + Config conf{mdb}; + conf.set(true); + + TempDir tdir; + auto store{make_test_store(tdir.path(), test_msgs, conf)}; + store.commit(); + + /* true: match; false: no match */ + const auto cases = std::vector>{{ + {"body:中文", true}, + {"body:中", true}, + {"body:文", true}, + {"body:し", true}, + {"body:サー", true}, + {"body:サーバがダウンしました", true}, // fail + {"中文", true}, + {"中", true}, + {"文", true}, + {"subject:スポン", true }, + {"subject:スポンサーシップ募集", true }, + {"subject:シップ", true }, // XXX should match + {"サーバがダウンしました", true}, // okay + {"body:サーバがダウンしました", true}, // okay + {"subject:スポンサーシップ募集", true}, // okay + {"subject:シップx", true }, // XXX should match + }}; + + for (auto&& test: cases) { + auto qr = store.run_query(std::string{test.first}); + assert_valid_result(qr); + if (test.second) + g_assert_cmpuint(qr->size(), ==, 1); + else + g_assert_true(qr->empty()); + } +} + int main(int argc, char* argv[]) { mu_test_init(&argc, &argv); + //_test_add_func("/store/query/cjk", test_cjk); + g_test_add_func("/store/query/simple", test_simple); g_test_add_func("/store/query/spam-address-components", test_spam_address_components); @@ -800,9 +899,15 @@ 
main(int argc, char* argv[]) test_duplicate_refresh_rename); g_test_add_func("/store/query/term-split", test_term_split); + g_test_add_func("/store/query/kata_containers", + test_subject_kata_containers); g_test_add_func("/store/query/related-dup-threaded", test_related_dup_threaded); - g_test_add_func("/store/query/html", test_html); + g_test_add_func("/store/query/html", + test_html); + + g_test_add_func("/store/query/cjk-once-more", test_cjk); + return g_test_run(); } diff --git a/lib/tests/test-parser.cc b/lib/tests/test-parser.cc deleted file mode 100644 index 4590a28f..00000000 --- a/lib/tests/test-parser.cc +++ /dev/null @@ -1,139 +0,0 @@ -/* -** Copyright (C) 2017-2020 Dirk-Jan C. Binnema -** -** This library is free software; you can redistribute it and/or -** modify it under the terms of the GNU Lesser General Public License -** as published by the Free Software Foundation; either version 2.1 -** of the License, or (at your option) any later version. -** -** This library is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -** Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public -** License along with this library; if not, write to the Free -** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA -** 02110-1301, USA. 
-*/ - -#include -#include - -#include -#include - -#include "utils/mu-test-utils.hh" - -#include "mu-parser.hh" -#include "utils/mu-result.hh" -#include "utils/mu-utils.hh" -using namespace Mu; - -struct Case { - const std::string expr; - const std::string expected; - WarningVec warnings{}; -}; - -using CaseVec = std::vector; - -static void -test_cases(const CaseVec& cases) -{ - char* tmpdir = test_mu_common_get_random_tmpdir(); - g_assert(tmpdir); - auto dummy_store{Store::make_new(tmpdir, "/tmp")}; - assert_valid_result(dummy_store); - - g_free(tmpdir); - - Parser parser{*dummy_store, Parser::Flags::UnitTest}; - - for (const auto& casus : cases) { - WarningVec warnings; - const auto tree = parser.parse(casus.expr, warnings); - - std::stringstream ss; - ss << tree; - - if (g_test_verbose()) { - std::cout << "\n"; - std::cout << casus.expr << std::endl; - std::cout << "exp:" << casus.expected << std::endl; - std::cout << "got:" << ss.str() << std::endl; - } - - assert_equal(casus.expected, ss.str()); - } -} - -static void -test_basic() -{ - CaseVec cases = { - //{ "", R"#((atom :value ""))#"}, - { - "foo", - R"#((value "message-id" "foo"))#", - }, - {"foo or bar", R"#((or(value "message-id" "foo")(value "message-id" "bar")))#"}, - {"foo and bar", R"#((and(value "message-id" "foo")(value "message-id" "bar")))#"}, - }; - - test_cases(cases); -} - -static void -test_complex() -{ - CaseVec cases = { - {"foo and bar or cuux", - R"#((or(and(value "message-id" "foo")(value "message-id" "bar")))#" + - std::string(R"#((value "message-id" "cuux")))#")}, - {"a and not b", R"#((and(value "message-id" "a")(not(value "message-id" "b"))))#"}, - {"a and b and c", - R"#((and(value "message-id" "a")(and(value "message-id" "b")(value "message-id" "c"))))#"}, - {"(a or b) and c", - R"#((and(or(value "message-id" "a")(value "message-id" "b"))(value "message-id" "c")))#"}, - {"a b", // implicit and - R"#((and(value "message-id" "a")(value "message-id" "b")))#"}, - {"a not b", // 
implicit and not - R"#((and(value "message-id" "a")(not(value "message-id" "b"))))#"}, - {"not b", // implicit and not - R"#((not(value "message-id" "b")))#"}}; - - test_cases(cases); -} - -G_GNUC_UNUSED static void -test_range() -{ - CaseVec cases = { - {"range:a..b", // implicit and - R"#((range "range" "a" "b"))#"}, - }; - - test_cases(cases); -} - -static void -test_flatten() -{ - CaseVec cases = {{" Mötørhęåđ", R"#((value "message-id" "motorhead"))#"}}; - - test_cases(cases); -} - -int -main(int argc, char* argv[]) -{ - g_test_init(&argc, &argv, NULL); - - g_test_add_func("/parser/basic", test_basic); - g_test_add_func("/parser/complex", test_complex); - // g_test_add_func ("/parser/range", test_range); - g_test_add_func("/parser/flatten", test_flatten); - - return g_test_run(); -} diff --git a/lib/tests/test-tokenizer.cc b/lib/tests/test-tokenizer.cc deleted file mode 100644 index 6e287f0e..00000000 --- a/lib/tests/test-tokenizer.cc +++ /dev/null @@ -1,147 +0,0 @@ -/* -** Copyright (C) 2017 Dirk-Jan C. Binnema -** -** This library is free software; you can redistribute it and/or -** modify it under the terms of the GNU Lesser General Public License -** as published by the Free Software Foundation; either version 2.1 -** of the License, or (at your option) any later version. -** -** This library is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -** Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public -** License along with this library; if not, write to the Free -** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA -** 02110-1301, USA. 
-*/ - -#include -#include -#include -#include - -#include "mu-tokenizer.hh" - -struct Case { - const char* str; - const Mu::Tokens tokens; -}; - -using CaseVec = std::vector; - -using namespace Mu; -using TT = Token::Type; - -static void -test_cases(const CaseVec& cases) -{ - for (const auto& casus : cases) { - const auto tokens = tokenize(casus.str); - - g_assert_cmpuint((guint)tokens.size(), ==, (guint)casus.tokens.size()); - for (size_t u = 0; u != tokens.size(); ++u) { - if (g_test_verbose()) { - std::cerr << "case " << u << " " << casus.str << std::endl; - std::cerr << "exp: '" << casus.tokens[u] << "'" << std::endl; - std::cerr << "got: '" << tokens[u] << "'" << std::endl; - } - g_assert_true(tokens[u] == casus.tokens[u]); - } - } -} - -static void -test_basic() -{ - CaseVec cases = { - {"", {}}, - - {"foo", Tokens{Token{3, TT::Data, "foo"}}}, - - {"foo bar cuux", - Tokens{Token{3, TT::Data, "foo"}, - Token{7, TT::Data, "bar"}, - Token{12, TT::Data, "cuux"}}}, - - {"\"foo bar\"", Tokens{Token{9, TT::Data, "foo bar"}}}, - - // ie. 
ignore missing closing '"' - {"\"foo bar", Tokens{Token{8, TT::Data, "foo bar"}}}, - - }; - - test_cases(cases); -} - -static void -test_specials() -{ - CaseVec cases = { - {")*(", - Tokens{Token{1, TT::Close, ")"}, Token{2, TT::Data, "*"}, Token{3, TT::Open, "("}}}, - {"\")*(\"", Tokens{Token{5, TT::Data, ")*("}}}, - }; - - test_cases(cases); -} - -static void -test_ops() -{ - CaseVec cases = {{"foo and bar oR cuux XoR fnorb", - Tokens{Token{3, TT::Data, "foo"}, - Token{7, TT::And, "and"}, - Token{11, TT::Data, "bar"}, - Token{14, TT::Or, "oR"}, - Token{19, TT::Data, "cuux"}, - Token{23, TT::Xor, "XoR"}, - Token{29, TT::Data, "fnorb"}}}, - {"NOT (aap or mies)", - Tokens{Token{3, TT::Not, "NOT"}, - Token{5, TT::Open, "("}, - Token{8, TT::Data, "aap"}, - Token{11, TT::Or, "or"}, - Token{16, TT::Data, "mies"}, - Token{17, TT::Close, ")"}}}}; - - test_cases(cases); -} - -static void -test_escape() -{ - CaseVec cases = {{"foo\"bar\"", Tokens{Token{8, TT::Data, "foobar"}}}, - {"\"fnorb\"", Tokens{Token{7, TT::Data, "fnorb"}}}, - {"\\\"fnorb\\\"", Tokens{Token{9, TT::Data, "fnorb"}}}, - {"foo\\\"bar\\\"", Tokens{Token{10, TT::Data, "foobar"}}}}; - - test_cases(cases); -} - -static void -test_to_string() -{ - std::stringstream ss; - for (auto&& t : tokenize("foo and bar xor not cuux or fnorb")) - ss << t << ' '; - - g_assert_true(ss.str() == "3: [foo] 7: [and] 11: [bar] " - "15: [xor] 19: [not] 24: [cuux] " - "27: [or] 33: [fnorb] "); -} - -int -main(int argc, char* argv[]) -{ - g_test_init(&argc, &argv, NULL); - - g_test_add_func("/tokens/basic", test_basic); - g_test_add_func("/tokens/specials", test_specials); - g_test_add_func("/tokens/ops", test_ops); - g_test_add_func("/tokens/escape", test_escape); - g_test_add_func("/tokens/to-string", test_to_string); - - return g_test_run(); -} diff --git a/lib/tokenize.cc b/lib/tokenize.cc deleted file mode 100644 index 96a87087..00000000 --- a/lib/tokenize.cc +++ /dev/null @@ -1,38 +0,0 @@ -/* -** Copyright (C) 2017-2020 
Dirk-Jan C. Binnema -** -** This library is free software; you can redistribute it and/or -** modify it under the terms of the GNU Lesser General Public License -** as published by the Free Software Foundation; either version 2.1 -** of the License, or (at your option) any later version. -** -** This library is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -** Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public -** License along with this library; if not, write to the Free -** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA -** 02110-1301, USA. -*/ - -#include -#include - -#include "mu-tokenizer.hh" - -int -main(int argc, char* argv[]) -{ - std::string s; - - for (auto i = 1; i < argc; ++i) - s += " " + std::string(argv[i]); - - const auto tvec = Mu::tokenize(s); - for (const auto& t : tvec) - std::cout << t << std::endl; - - return 0; -} diff --git a/lib/utils/mu-test-utils.cc b/lib/utils/mu-test-utils.cc index dd9fe1c7..164e22e0 100644 --- a/lib/utils/mu-test-utils.cc +++ b/lib/utils/mu-test-utils.cc @@ -94,9 +94,12 @@ Mu::mu_test_init(int *argc, char ***argv) { const auto tmpdir{test_random_tmpdir()}; + g_unsetenv("XAPIAN_CJK_NGRAM"); g_setenv("MU_TEST", "yes", TRUE); g_setenv("XDG_CACHE_HOME", tmpdir.c_str(), TRUE); + setlocale(LC_ALL, ""); + g_test_init(argc, argv, NULL); g_test_bug_base("https://github.com/djcb/mu/issues/"); diff --git a/lib/utils/mu-unbroken.hh b/lib/utils/mu-unbroken.hh new file mode 100644 index 00000000..c1b1d6bc --- /dev/null +++ b/lib/utils/mu-unbroken.hh @@ -0,0 +1,127 @@ +// borrowed from Xapian; slightly adapted + +/* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com) + * Copyright (c) 2011 Richard Boulton (richard@tartarus.org) + * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com) + * 
Copyright (c) 2011,2018,2019,2023 Olly Betts + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef MU_UNBROKEN_HH__ +#define MU_UNBROKEN_HH__ + +#include +#include + +/** + * Does unichar p belong to a script without explicit word separators? + * + * @param p + * + * @return true or false + */ +constexpr bool +is_unbroken_script(unsigned p) +{ + // Array containing the last value in each range of codepoints which + // are either all in scripts which are written without explicit word + // breaks, or all not in such scripts. + // + // We only include scripts here which ICU has dictionaries for. The + // same list is currently also used to decide which languages to do + // ngrams for, though perhaps that should use a separate list. 
+ constexpr unsigned splits[] = { + // 0E00..0E7F; Thai, Lanna Tai, Pali + // 0E80..0EFF; Lao + 0x0E00 - 1, 0x0EFF, + // 1000..109F; Myanmar (Burmese) + 0x1000 - 1, 0x109F, + // 1100..11FF; Hangul Jamo + 0x1100 - 1, 0x11FF, + // 1780..17FF; Khmer + 0x1780 - 1, 0x17FF, + // 19E0..19FF; Khmer Symbols + 0x19E0 - 1, 0x19FF, + // 2E80..2EFF; CJK Radicals Supplement + // 2F00..2FDF; Kangxi Radicals + // 2FE0..2FFF; Ideographic Description Characters + // 3000..303F; CJK Symbols and Punctuation + // 3040..309F; Hiragana + // 30A0..30FF; Katakana + // 3100..312F; Bopomofo + // 3130..318F; Hangul Compatibility Jamo + // 3190..319F; Kanbun + // 31A0..31BF; Bopomofo Extended + // 31C0..31EF; CJK Strokes + // 31F0..31FF; Katakana Phonetic Extensions + // 3200..32FF; Enclosed CJK Letters and Months + // 3300..33FF; CJK Compatibility + // 3400..4DBF; CJK Unified Ideographs Extension A + // 4DC0..4DFF; Yijing Hexagram Symbols + // 4E00..9FFF; CJK Unified Ideographs + 0x2E80 - 1, 0x9FFF, + // A700..A71F; Modifier Tone Letters + 0xA700 - 1, 0xA71F, + // A960..A97F; Hangul Jamo Extended-A + 0xA960 - 1, 0xA97F, + // A9E0..A9FF; Myanmar Extended-B (Burmese) + 0xA9E0 - 1, 0xA9FF, + // AA60..AA7F; Myanmar Extended-A (Burmese) + 0xAA60 - 1, 0xAA7F, + // AC00..D7AF; Hangul Syllables + // D7B0..D7FF; Hangul Jamo Extended-B + 0xAC00 - 1, 0xD7FF, + // F900..FAFF; CJK Compatibility Ideographs + 0xF900 - 1, 0xFAFF, + // FE30..FE4F; CJK Compatibility Forms + 0xFE30 - 1, 0xFE4F, + // FF00..FFEF; Halfwidth and Fullwidth Forms + 0xFF00 - 1, 0xFFEF, + // 1AFF0..1AFFF; Kana Extended-B + // 1B000..1B0FF; Kana Supplement + // 1B100..1B12F; Kana Extended-A + // 1B130..1B16F; Small Kana Extension + 0x1AFF0 - 1, 0x1B16F, + // 1F200..1F2FF; Enclosed Ideographic Supplement + 0x1F200 - 1, 0x1F2FF, + // 20000..2A6DF; CJK Unified Ideographs Extension B + 0x20000 - 1, 0x2A6DF, + // 2A700..2B73F; CJK Unified Ideographs Extension C + // 2B740..2B81F; CJK Unified Ideographs Extension D + // 2B820..2CEAF; CJK 
Unified Ideographs Extension E + // 2CEB0..2EBEF; CJK Unified Ideographs Extension F + 0x2A700 - 1, 0x2EBEF, + // 2F800..2FA1F; CJK Compatibility Ideographs Supplement + 0x2F800 - 1, 0x2FA1F, + // 30000..3134F; CJK Unified Ideographs Extension G + // 31350..323AF; CJK Unified Ideographs Extension H + 0x30000 - 1, 0x323AF + }; + // Binary chop to find the first entry which is >= p. If it's an odd + // offset then the codepoint is in a script which needs splitting; if it's + // an even offset then it's not. + auto it = std::lower_bound(std::begin(splits), + std::end(splits), p); + + return ((it - splits) & 1); +} + + +#endif /* MU_UNBROKEN_HH__ */ diff --git a/lib/utils/mu-utils.cc b/lib/utils/mu-utils.cc index 1702ddca..e60b65ff 100644 --- a/lib/utils/mu-utils.cc +++ b/lib/utils/mu-utils.cc @@ -44,6 +44,8 @@ #include #include "mu-utils.hh" +#include "mu-unbroken.hh" + #include "mu-error.hh" #include "mu-option.hh" @@ -112,12 +114,28 @@ gx_utf8_flatten(const gchar* str, gssize len) } // namespace +bool +Mu::contains_unbroken_script(const char *str) +{ + while (str && *str) { + auto uc = g_utf8_get_char(str); + if (is_unbroken_script(uc)) + return true; + str = g_utf8_next_char(str); + } + + return false; +} + std::string // gx_utf8_flatten Mu::utf8_flatten(const char* str) { if (!str) return {}; + if (contains_unbroken_script(str)) + return std::string{str}; + // the pure-ascii case if (g_str_is_ascii(str)) { auto l = g_ascii_strdown(str, -1); diff --git a/lib/utils/mu-utils.hh b/lib/utils/mu-utils.hh index 73f32dfd..6ca0e85f 100644 --- a/lib/utils/mu-utils.hh +++ b/lib/utils/mu-utils.hh @@ -154,7 +154,19 @@ std::tm mu_time(T t={}, bool use_utc=false) { using StringVec = std::vector; /** - * Flatten a string -- downcase and fold diacritics etc. + * Does the string contain script without explicit word separators? 
+ * + * @param str a string + * + * @return true or false + */ +bool contains_unbroken_script(const char* str); +static inline bool contains_unbroken_script(const std::string& str) { + return contains_unbroken_script(str.c_str()); +} + +/** + * Flatten a string -- down-case and fold diacritics. * * @param str a string * diff --git a/lib/utils/tests/test-utils.cc b/lib/utils/tests/test-utils.cc index 56bb95a5..f0e98412 100644 --- a/lib/utils/tests/test-utils.cc +++ b/lib/utils/tests/test-utils.cc @@ -45,14 +45,8 @@ test_cases(const CaseVec& cases, ProcFunc proc) { for (const auto& casus : cases) { const auto res = proc(casus.expr, casus.is_first); - if (g_test_verbose()) { - std::cout << "\n"; - std::cout << casus.expr << ' ' << casus.is_first << std::endl; - std::cout << "exp: '" << casus.expected << "'" << std::endl; - std::cout << "got: '" << res << "'" << std::endl; - } - - g_assert_true(casus.expected == res); + //mu_println("'{}'\n'{}'", casus.expected, res); + assert_equal(casus.expected, res); } } @@ -161,6 +155,8 @@ test_flatten() {"Менделе́ев", true, "менделеев"}, {"", false, ""}, {"Ångström", true, "angstrom"}, + // don't touch combining characters in CJK etc. + {"スポンサーシップ募集",true, "スポンサーシップ募集"} }; test_cases(cases, [](auto s, auto f) { return utf8_flatten(s); }); diff --git a/man/mu-find.1.org b/man/mu-find.1.org index 35ad5380..845db60c 100644 --- a/man/mu-find.1.org +++ b/man/mu-find.1.org @@ -119,7 +119,7 @@ entries are displayed. ** --summary-len= If > 0, use that number of lines of the message to provide a summary. -** --format= +** --format= output results in the specified format: @@ -129,9 +129,7 @@ output results in the specified format: information). - *xml* formats the search results as XML. - *sexp* formats the search results as an s-expression as used in Lisp programming - environments. -- *xquery* shows the Xapian query corresponding to your search terms. This is - meant for for debugging purposes. 
+ environments ** --linksdir= and -c, --clearlinks when using ~-format=links~, output the results as a maildir with symbolic links to @@ -215,6 +213,14 @@ not this may not really be the same message, if the message-id was copied. The algorithm used for determining the threads is based on Jamie Zawinksi's description: http://www.jwz.org/doc/threading.html +** -a,--analyze +instead of executing the query, analyze it by showing the parse-tree s-expression +and a stringified version of the Xapian query. This can help users to determine +how ~mu~ interprets some query. + +The output of this command may differ between versions, but should be helpful +nevertheless. + #+include: "muhome.inc" :minlevel 2 #+include: "common-options.inc" :minlevel 1 diff --git a/man/mu-init.1.org b/man/mu-init.1.org index 2a28700c..653afac5 100644 --- a/man/mu-init.1.org +++ b/man/mu-init.1.org @@ -17,6 +17,7 @@ has completed, you can run *mu index* * INIT OPTIONS ** -m, --maildir= + starts searching at ==. By default, *mu* uses whatever the *MAILDIR* environment variable is set to; if it is not set, it tries =~/Maildir= if it already exists. @@ -54,6 +55,13 @@ number of changes after which they are committed to the database; decreasing this reduces the memory requirements, but make indexing substantially slows (and vice-versa for increasing). Usually, the default of 250000 should be fine. +** --support-ngrams + +whether to enable support for using ngrams in indexing and query parsing; this +can be useful for languages without explicit word-breaks, such as +Chinese/Japanese/Korean. See *NGRAM SUPPORT* below. + + ** --reinit reinitialize the database from an earlier version; that is, create a new empty @@ -62,8 +70,20 @@ options. #+include: "muhome.inc" :minlevel 2 +* NGRAM SUPPORT + +*mu*'s underlying Xapian database supports 'ngrams', which improve searching for +languages/scripts that do not have explicit word breaks, such as Chinese, +Japanese and Korean. 
It is fairly intrusive, and influences both indexing and +query-parsing; it is not enabled by default, and is recommended only if you need +to search in such languages. + +When enabled, *mu* automatically uses ngrams. Xapian environment +variables such as ~XAPIAN_CJK_NGRAM~ are ignored. + #+include: "exit-code.inc" :minlevel 1 + * EXAMPLE #+begin_example $ mu init --maildir=~/Maildir --my-address=alice@example.com --my-address=bob@example.com --ignored-address='/.*reply.*/' diff --git a/man/mu-query.7.org b/man/mu-query.7.org index ed527ef1..73858a72 100644 --- a/man/mu-query.7.org +++ b/man/mu-query.7.org @@ -25,8 +25,8 @@ quote any characters that would otherwise be interpreted by the shell, such as * TERMS The basic building blocks of a query are *terms*; these are just normal words like -'banana' or 'hello', or words prefixed with a field-name which make them apply -to just that field. See *mu find* for all the available fields. +'banana' or 'hello', or words prefixed with a field-name which makes them apply +to just that field. See *mu info fields* for all the available fields. Some example queries: #+begin_example @@ -60,9 +60,8 @@ mu find subject:\\"hi there\\" * LOGICAL OPERATORS We can combine terms with logical operators -- binary ones: *and*, *or*, *xor* and the -unary *not*, with the conventional rules for precedence and association, and are -case-insensitive. - +unary *not*, with the conventional rules for precedence and association. The +operators are case-insensitive. You can also group things with *(* and *)*, so you can do things like: #+begin_example @@ -86,6 +85,7 @@ Note that a =pure not= - e.g. searching for *not apples* is quite a 'heavy' quer The language supports matching basic PCRE regular expressions, see *pcre(3)*. Regular expressions are enclosed in *//*. Some examples: + #+begin_example subject:/h.llo/ # match hallo, hello, ... 
subject:/ @@ -96,10 +96,10 @@ matches messages in the '/foo' maildir, while the latter matches all messages in all maildirs that match 'foo', such as '/foo', '/bar/cuux/foo', '/fooishbar' etc. -Wildcards are an older mechanism for matching where a term with a rightmost *** +Wildcards are another mechanism for matching where a term with a rightmost *** (and =only= in that position) matches any term that starts with the part before -the ***; they are supported for backward compatibility and *mu* translates them to -regular expressions internally: +the ***; they are therefore less powerful than regular expressions, but also much +faster: #+begin_example foo* #+end_example @@ -108,8 +108,7 @@ is equivalent to /foo.*/ #+end_example -As a note of caution, certain wild-cards and regular expression can take quite a -bit longer than 'normal' queries. +Regular expressions can be useful, but are relatively slow. * FIELDS @@ -143,8 +142,8 @@ full table with all details, including single-char shortcuts, try the command: | to | | Message recipient | |------------+-----------+--------------------------------| -(*) The language code for the text-body if found. This works only -if ~mu~ was built with CLD2 support. +(*) The language code for the text-body if found. This works only if ~mu~ was +built with CLD2 support. There are also the special fields *contact:*, which matches all contact-fields (=from=, =to=, =cc= and =bcc=), and *recip*, which matches all recipient-fields (=to=, =cc= @@ -167,12 +166,12 @@ separated by *..*. Either lower or upper (but not both) can be omitted to create an open range. 
Dates are expressed in local time and using ISO-8601 format (YYYY-MM-DD -HH:MM:SS); you can leave out the right part, and *mu* adds the rest, depending on +HH:MM:SS); you can leave out the right part and *mu* adds the rest, depending on whether this is the beginning or end of the range (e.g., as a lower bound, '2015' would be interpreted as the start of that year; as an upper bound as the end of the year). -You can use '/' , '.', '-' and 'T' to make dates more human readable. +You can use '/' , '.', '-', ':' and 'T' to make dates more human-readable. Some examples: #+begin_example @@ -274,6 +273,9 @@ Note that from the command-line, such queries must be quoted: mu find 'maildir:"/Sent Items"' #+end_example +Also note that you should *not* end the maildir with a ~/~, or it can be +misinterpreted as a regular expression term; see the description of regular expressions above. + * MORE EXAMPLES Here are some simple examples of *mu* queries; you can make many more complicated @@ -321,16 +323,25 @@ Find all messages written in Dutch or German with the word 'hallo': hallo and (lang:nl or lang:de) #+end_example +* ANALYZING QUERIES -* CAVEATS +Despite all the documentation, in some cases it can be non-obvious how ~mu~ +interprets a certain query. For that, you can ask ~mu~ to analyze the query -- +that is, show how ~mu~ interprets the query. -With current Xapian versions, the apostroph character is considered part of a -word. Thus, you cannot find =D'Artagnan= by searching for =Artagnan=. So, include -the apostrophe in search or use a regexp search. +This uses the ~--analyze~ option to *mu find*. +#+begin_example +$ mu find subject:wombat AND date:3m.. size:..2000 --analyze +* query: + subject:wombat AND date:3m.. 
size:..2000 +* parsed query: + (and (subject "wombat") (date (range "2023-05-30T06:10:09Z" "")) (size (range "" "2000"))) +* Xapian query: + Query((Swombat AND VALUE_GE 4 n64759341 AND VALUE_LE 17 i7d0)) +#+end_example -Matching on spaces has changed compared to the old query-parser; this applies -e.g. to Maildirs that have spaces in their name, such as =Sent Items=. See *MAILDIR* -above. +The ~parsed query~ is usually the most interesting one to understand what's +happening. #+include: "prefooter.inc" :minlevel 1 diff --git a/meson.build b/meson.build index 5a1dd71b..b54be449 100644 --- a/meson.build +++ b/meson.build @@ -149,9 +149,17 @@ gobject_dep = dependency('gobject-2.0', version: '>= 2.60') gio_dep = dependency('gio-2.0', version: '>= 2.60') gio_unix_dep = dependency('gio-unix-2.0', version: '>= 2.60') gmime_dep = dependency('gmime-3.0', version: '>= 3.2') -xapian_dep = dependency('xapian-core', version:'>= 1.4') thread_dep = dependency('threads') +# we need Xapian 1.4; if we have 1.4.23, we have some newer APIs. +xapian_dep = dependency('xapian-core', version:'>= 1.4.23', required:false) +if xapian_dep.found() + config_h_data.set('HAVE_XAPIAN_FLAG_NGRAMS', 1) +else + xapian_dep = dependency('xapian-core', version:'>= 1.4') + message('Found xapian ' + xapian_dep.version()) +endif + # optionally, use Compact Language Detector2 if we can find it. 
cld2_dep = meson.get_compiler('cpp').find_library('cld2', required: false) if cld2_dep.found() diff --git a/mu/mu-cmd-find.cc b/mu/mu-cmd-find.cc index 12765140..8dc007ec 100644 --- a/mu/mu-cmd-find.cc +++ b/mu/mu-cmd-find.cc @@ -33,6 +33,7 @@ #include "mu-query-match-deciders.hh" #include "mu-query.hh" #include "mu-bookmarks.hh" +#include "mu-query-parser.hh" #include "message/mu-message.hh" #include "utils/mu-option.hh" @@ -61,12 +62,30 @@ using OutputFunc = std::function(const Option& msg, const using Format = Options::Find::Format; static Result -print_internal(const Store& store, - const std::string& expr, - bool xapian, - bool warn) +analyze_query_expr(const Store& store, const std::string& expr, const Options& opts) { - mu_println("{}", store.parse_query(expr, xapian)); + auto print_item=[&](auto&&title, auto&&val) { + const auto blue{opts.nocolor ? "" : MU_COLOR_BLUE}; + const auto green{opts.nocolor ? "" : MU_COLOR_GREEN}; + const auto reset{opts.nocolor ? "" : MU_COLOR_DEFAULT}; + mu_println("* {}{}{}:\n {}{}{}", blue, title, reset, green, val, reset); + }; + + print_item("query", expr); + + const auto pq{parse_query(expr, false/*don't expand*/).to_string()}; + const auto pqx{parse_query(expr, true/*do expand*/).to_string()}; + + print_item("parsed query", pq); + if (pq != pqx) + print_item("parsed query (expanded)", pqx); + + auto xq{make_xapian_query(store, expr)}; + if (!xq) + return Err(std::move(xq.error())); + + print_item("Xapian query", xq->get_description()); + return Ok(); } @@ -473,7 +492,7 @@ output_query_results(const QueryResults& qres, const Options& opts) } static Result -process_query(const Store& store, const std::string& expr, const Options& opts) +process_store_query(const Store& store, const std::string& expr, const Options& opts) { auto qres{run_query(store, expr, opts)}; if (!qres) @@ -492,18 +511,14 @@ Mu::mu_cmd_find(const Store& store, const Options& opts) if (!expr) return Err(expr.error()); - if (opts.find.format == 
Format::XQuery) - return print_internal(store, *expr, true, false); - else if (opts.find.format == Format::MQuery) - return print_internal(store, *expr, false, opts.verbose); + if (opts.find.analyze) + return analyze_query_expr(store, *expr, opts); else - return process_query(store, *expr, opts); + return process_store_query(store, *expr, opts); } - - #ifdef BUILD_TESTS /* * Tests. diff --git a/mu/mu-cmd-info.cc b/mu/mu-cmd-info.cc index dd0f133a..71723d62 100644 --- a/mu/mu-cmd-info.cc +++ b/mu/mu-cmd-info.cc @@ -202,6 +202,8 @@ topic_store(const Mu::Store& store, const Options& opts) info.add_row({"ignored-address", c}); info.add_row({"messages in store", mu_format("{}", store.size())}); + info.add_row({"support-ngrams", conf.get() ? "yes" : "no"}); + info.add_row({"last-change", tstamp(store.statistics().last_change)}); info.add_row({"last-index", tstamp(store.statistics().last_index)}); diff --git a/mu/mu-cmd-init.cc b/mu/mu-cmd-init.cc index 3743c681..2bbbd915 100644 --- a/mu/mu-cmd-init.cc +++ b/mu/mu-cmd-init.cc @@ -55,6 +55,8 @@ Mu::mu_cmd_init(const Options& opts) conf.set(opts.init.my_addresses); if (!opts.init.ignored_addresses.empty()) conf.set(opts.init.ignored_addresses); + if (opts.init.support_ngrams) + conf.set(true); return Store::make_new(opts.runtime_path(RuntimePath::XapianDb), opts.init.maildir, conf); diff --git a/mu/mu-options.cc b/mu/mu-options.cc index d22c4f1d..b603cc0d 100644 --- a/mu/mu-options.cc +++ b/mu/mu-options.cc @@ -337,12 +337,6 @@ sub_find(CLI::App& sub, Options& opts) { Format::Json, {"json", "JSON"} }, - { Format::XQuery, - {"xquery", "Show Xapian query (for debugging)"} - }, - { Format::MQuery, - {"mquery", "Show mu query for (for debugging)"} - }, }}; sub.add_flag("--threads,-t", opts.find.threads, @@ -351,6 +345,8 @@ sub_find(CLI::App& sub, Options& opts) "Show only one of messages with same message-id"); sub.add_flag("--include-related,-r", opts.find.include_related, "Include related messages in results"); + 
sub.add_flag("--analyze,-a", opts.find.analyze, + "Analyze the query"); const auto fhelp = options_help(FormatInfos, Format::Plain); const auto fmap = options_map(FormatInfos); @@ -461,13 +457,16 @@ sub_init(CLI::App& sub, Options& opts) "Maximum allowed message size in bytes"); sub.add_option("--batch-size", opts.init.batch_size, "Maximum size of database transaction"); + sub.add_option("--support-ngrams", opts.init.support_ngrams, + "Support CJK n-grams if for querying/indexing"); sub.add_flag("--reinit", opts.init.reinit, "Re-initialize database with current settings") ->excludes("--maildir") ->excludes("--my-address") ->excludes("--ignored-address") ->excludes("--max-message-size") - ->excludes("--batch-size"); + ->excludes("--batch-size") + ->excludes("--support-ngrams"); } static void diff --git a/mu/mu-options.hh b/mu/mu-options.hh index 58d2ca39..95f9f92a 100644 --- a/mu/mu-options.hh +++ b/mu/mu-options.hh @@ -143,11 +143,12 @@ struct Options { bool reverse; /**< sort in revers order (z->a) */ bool threads; /**< show message threads */ bool clearlinks; /**< clear linksdir first */ - std::string linksdir; /**< directory for links */ + std::string linksdir; /**< directory for links */ OptSize summary_len; /**< max # of lines for summary */ std::string bookmark; /**< use bookmark */ + bool analyze; /**< analyze query */ - enum struct Format { Plain, Links, Xml, Json, Sexp, XQuery, MQuery, Exec }; + enum struct Format { Plain, Links, Xml, Json, Sexp, Exec }; Format format; /**< Output format */ std::string exec; /**< cmd to execute on matches */ bool skip_dups; /**< show only first with msg id */ @@ -184,13 +185,15 @@ struct Options { * Init */ struct Init { - std::string maildir; /**< where the mails are */ - StringVec my_addresses; /**< personal e-mail addresses */ - StringVec ignored_addresses; /**< addresses to be ignored for + std::string maildir; /**< where the mails are */ + StringVec my_addresses; /**< personal e-mail addresses */ + StringVec 
ignored_addresses; /**< addresses to be ignored for * the contacts-cache */ - OptSize max_msg_size; /**< max size for message files */ - OptSize batch_size; /**< db transaction batch size */ - bool reinit; /**< re-initialize */ + OptSize max_msg_size; /**< max size for message files */ + OptSize batch_size; /**< db transaction batch size */ + bool reinit; /**< re-initialize */ + bool support_ngrams; /**< support CJK etc. ngrams */ + } init; /* diff --git a/mu/mu.cc b/mu/mu.cc index 8147c7f5..5e20cafa 100644 --- a/mu/mu.cc +++ b/mu/mu.cc @@ -90,6 +90,14 @@ handle_result(const Result& res, const Mu::Options& opts) int main(int argc, char* argv[]) { + /* + * We handle this through explicit options + */ + g_unsetenv("XAPIAN_CJK_NGRAM"); + + /* + * set up locale + */ setlocale(LC_ALL, ""); /* diff --git a/mu/tests/test-mu-query.cc b/mu/tests/test-mu-query.cc index a377e4d2..2ecc28bd 100644 --- a/mu/tests/test-mu-query.cc +++ b/mu/tests/test-mu-query.cc @@ -578,48 +578,11 @@ test_mu_query_threads_compilation_error(void) 3); } -/* https://github.com/djcb/mu/issues/1428 */ -static void -test_mu_query_cjk(void) -{ - /* XXX: this doesn't pass yet; return for now */ - g_test_skip("skip CJK tests"); - return; - - { - g_unsetenv("XAPIAN_CJK_NGRAM"); - const auto xpath = make_database(MU_TESTMAILDIR_CJK); - g_assert_cmpuint(run_and_count_matches(xpath, - "サーバがダウンしました", - QueryFlags::None), - ==, 1); - g_assert_cmpuint(run_and_count_matches(xpath, - "サーバ", - QueryFlags::None), - ==, 0); - } - - { - g_setenv("XAPIAN_CJK_NGRAM", "1", TRUE); - const auto xpath = make_database(MU_TESTMAILDIR_CJK); - g_assert_cmpuint(run_and_count_matches(xpath, - "サーバがダウンしました", - QueryFlags::None), - ==, 0); - g_assert_cmpuint(run_and_count_matches(xpath, - "サーバ", - QueryFlags::None), - ==, 0); - } -} - int main(int argc, char* argv[]) { int rv; - setlocale(LC_ALL, ""); - mu_test_init(&argc, &argv); DB_PATH1 = make_database(MU_TESTMAILDIR); g_assert_false(DB_PATH1.empty()); @@ -661,8 +624,6 @@ 
main(int argc, char* argv[]) g_test_add_func("/mu-query/test-mu-query-threads-compilation-error", test_mu_query_threads_compilation_error); - g_test_add_func("/mu-query/test-mu-query-cjk", - test_mu_query_cjk); rv = g_test_run(); return rv;