From 264bb092f075e7fdbac4617f9d56d1ca5f9a48ca Mon Sep 17 00:00:00 2001 From: "Dirk-Jan C. Binnema" Date: Sat, 9 Sep 2023 11:57:05 +0300 Subject: [PATCH] support xapian ngrams Xapian supports an "ngrams" option to help with languages/scripts without explicit wordbreaks, such as Chinese / Japanese / Korean. Add some plumbing for supporting this in mu as well. Experimental for now. --- lib/index/mu-indexer.cc | 7 ++++--- lib/message/meson.build | 4 ++-- lib/message/mu-document.cc | 37 +++++++++++++++++++++++++++---------- lib/message/mu-document.hh | 17 ++++++++++++++--- lib/message/mu-message.cc | 17 +++++++++++++++-- lib/message/mu-message.hh | 15 ++++++++++++--- lib/mu-config.hh | 18 +++++++++++++++++- lib/mu-query-xapianizer.cc | 32 ++++++++------------------------ lib/mu-query.cc | 11 +++++++---- lib/mu-query.hh | 2 +- lib/mu-store.cc | 37 ++++++++++++++++++++++++++++++++++--- lib/mu-store.hh | 27 ++++++++++----------------- lib/utils/mu-test-utils.cc | 3 +++ man/mu-init.1.org | 20 ++++++++++++++++++++ meson.build | 10 +++++++++- mu/mu-cmd-info.cc | 2 ++ mu/mu-cmd-init.cc | 2 ++ mu/mu-options.cc | 5 ++++- mu/mu-options.hh | 14 ++++++++------ mu/mu.cc | 8 ++++++++ 20 files changed, 207 insertions(+), 81 deletions(-) diff --git a/lib/index/mu-indexer.cc b/lib/index/mu-indexer.cc index 4d257f7d..6656f82b 100644 --- a/lib/index/mu-indexer.cc +++ b/lib/index/mu-indexer.cc @@ -87,10 +87,11 @@ struct Indexer::Private { was_empty_{store.empty()} { mu_message("created indexer for {} -> " - "{} (batch-size: {}; was-empty: {})", + "{} (batch-size: {}; was-empty: {}; ngrams: {})", store.root_maildir(), store.path(), store.config().get(), - was_empty_); + was_empty_, + store.config().get()); } ~Private() { @@ -238,7 +239,7 @@ Indexer::Private::add_message(const std::string& path) * * std::unique_lock lock{w_lock_}; */ - auto msg{Message::make_from_path(path)}; + auto msg{Message::make_from_path(path, store_.message_options())}; if (!msg) { mu_warning("failed to create message 
from {}: {}", path, msg.error().what()); return false; diff --git a/lib/message/meson.build b/lib/message/meson.build index b1bc21b5..4c7bb601 100644 --- a/lib/message/meson.build +++ b/lib/message/meson.build @@ -1,4 +1,4 @@ -## Copyright (C) 2022 Dirk-Jan C. Binnema +## Copyright (C) 2022-2023 Dirk-Jan C. Binnema ## ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by @@ -38,7 +38,7 @@ lib_mu_message=static_library( lib_mu_message_dep = declare_dependency( link_with: lib_mu_message, - dependencies: [ xapian_dep, gmime_dep, lib_mu_utils_dep ], + dependencies: [ xapian_dep, gmime_dep, lib_mu_utils_dep, config_h_dep ], include_directories: include_directories(['.', '..'])) diff --git a/lib/message/mu-document.cc b/lib/message/mu-document.cc index 87ddd683..29af3763 100644 --- a/lib/message/mu-document.cc +++ b/lib/message/mu-document.cc @@ -1,5 +1,5 @@ /* -** Copyright (C) 2022 Dirk-Jan C. Binnema +** Copyright (C) 2022-2023 Dirk-Jan C. Binnema ** ** This program is free software; you can redistribute it and/or modify it ** under the terms of the GNU General Public License as published by the @@ -16,6 +16,7 @@ ** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
** */ +#include "config.h" #include "mu-document.hh" #include "mu-message.hh" @@ -31,9 +32,14 @@ #include #include - using namespace Mu; +// backward compat +#ifndef HAVE_XAPIAN_FLAG_NGRAMS +#define FLAG_NGRAMS FLAG_CJK_NGRAM +#endif /*HAVE_XAPIAN_FLAG_NGRAMS*/ + + const Xapian::Document& Document::xapian_document() const { @@ -58,16 +64,29 @@ Document::put_prop(const Field& field, SexpType&& val) std::forward(val)); } +static Xapian::TermGenerator +make_term_generator(Xapian::Document& doc, Document::Options opts) +{ + Xapian::TermGenerator termgen; + + if (any_of(opts & Document::Options::SupportNgrams)) + termgen.set_flags(Xapian::TermGenerator::FLAG_NGRAMS); + + termgen.set_document(doc); + + return termgen; +} + static void -add_search_term(Xapian::Document& doc, const Field& field, const std::string& val) +add_search_term(Xapian::Document& doc, const Field& field, const std::string& val, + Document::Options opts) { if (field.is_normal_term()) { doc.add_term(field.xapian_term(val)); } else if (field.is_boolean_term()) { doc.add_boolean_term(field.xapian_term(val)); } else if (field.is_indexable_term()) { - Xapian::TermGenerator termgen; - termgen.set_document(doc); + auto&& termgen{make_term_generator(doc, opts)}; termgen.index_text(utf8_flatten(val), 1, field.xapian_term()); /* also add as 'normal' term, so some queries where the indexer * eats special chars also match */ @@ -88,7 +107,7 @@ Document::add(Field::Id id, const std::string& val) xdoc_.add_value(field.value_no(), val); if (field.is_searchable()) - add_search_term(xdoc_, field, val); + add_search_term(xdoc_, field, val, options_); if (field.include_in_sexp()) put_prop(field, val); @@ -107,7 +126,7 @@ Document::add(Field::Id id, const std::vector& vals) if (field.is_searchable()) std::for_each(vals.begin(), vals.end(), [&](const auto& val) { - add_search_term(xdoc_, field, val); }); + add_search_term(xdoc_, field, val, options_); }); if (field.include_in_sexp()) { Sexp elms{}; @@ -149,9 +168,7 @@ 
Document::add(Field::Id id, const Contacts& contacts) std::vector cvec; const std::string sepa2(1, SepaChar2); - - Xapian::TermGenerator termgen; - termgen.set_document(xdoc_); + auto&& termgen{make_term_generator(xdoc_, options_)}; for (auto&& contact: contacts) { diff --git a/lib/message/mu-document.hh b/lib/message/mu-document.hh index 21de4e20..ef045772 100644 --- a/lib/message/mu-document.hh +++ b/lib/message/mu-document.hh @@ -41,17 +41,27 @@ namespace Mu { */ class Document { public: + enum struct Options { + None = 0, + SupportNgrams = 1 << 0, /**< Support ngrams, as used in + * CJK and other languages. */ + }; + /** * Construct a message for a new Xapian Document + * + * @param opts behavioral options */ - Document() {} + Document(Options opts = Options::None): options_{opts} {} /** * Construct a message document based on an existing Xapian document. * * @param doc + * @param opts behavioral options */ - Document(const Xapian::Document& doc): xdoc_{doc} {} + Document(const Xapian::Document& doc, Options opts = Options::None): + xdoc_{doc}, options_{opts} {} /** * DTOR @@ -240,11 +250,12 @@ private: return cached_sexp_; } - mutable Xapian::Document xdoc_; + Options options_; mutable Sexp cached_sexp_; mutable bool dirty_sexp_{}; /* xdoc's sexp is outdated */ }; +MU_ENABLE_BITOPS(Document::Options); } // namepace Mu diff --git a/lib/message/mu-message.cc b/lib/message/mu-message.cc index c026dfcb..3f443d45 100644 --- a/lib/message/mu-message.cc +++ b/lib/message/mu-message.cc @@ -45,9 +45,10 @@ using namespace Mu; struct Message::Private { - Private(Message::Options options): opts{options} {} + Private(Message::Options options): + opts{options}, doc{doc_opts(opts)} {} Private(Message::Options options, Xapian::Document&& xdoc): - opts{options}, doc{std::move(xdoc)} {} + opts{options}, doc{std::move(xdoc), doc_opts(opts)} {} Message::Options opts; Document doc; @@ -70,6 +71,13 @@ struct Message::Private { Option embedded; Option language; /* body ISO language 
code */ + +private: + Document::Options doc_opts(Message::Options mopts) { + return any_of(opts & Message::Options::SupportNgrams) ? + Document::Options::SupportNgrams : + Document::Options::None; + } }; @@ -176,6 +184,11 @@ Message::document() const return priv_->doc; } +Message::Options +Message::options() const +{ + return priv_->opts; +} unsigned Message::docid() const diff --git a/lib/message/mu-message.hh b/lib/message/mu-message.hh index 09a677b9..8fb9f4d2 100644 --- a/lib/message/mu-message.hh +++ b/lib/message/mu-message.hh @@ -1,5 +1,5 @@ /* -** Copyright (C) 2022 Dirk-Jan C. Binnema +** Copyright (C) 2022-2023 Dirk-Jan C. Binnema ** ** This program is free software; you can redistribute it and/or modify it ** under the terms of the GNU General Public License as published by the @@ -49,8 +49,10 @@ public: Decrypt = 1 << 0, /**< Attempt to decrypt */ RetrieveKeys = 1 << 1, /**< Auto-retrieve crypto keys (implies network * access) */ - AllowRelativePath = 1 << 2, /**< Allow relateive paths for filename + AllowRelativePath = 1 << 2, /**< Allow relative paths for filename * in make_from_path */ + SupportNgrams = 1 << 3, /**< Support ngrams, as used in + * CJK and other languages. */ }; /** @@ -60,7 +62,6 @@ public: */ Message(Message&& other) noexcept; - /** * operator= * @@ -147,6 +148,14 @@ public: const Document& document() const; + /** + * The message options for this message + * + * @return message options + */ + Options options() const; + + /** * Get the document-id, or 0 if non-existent. * diff --git a/lib/mu-config.hh b/lib/mu-config.hh index 13b1bcfb..afbfda85 100644 --- a/lib/mu-config.hh +++ b/lib/mu-config.hh @@ -51,6 +51,8 @@ struct Property { PersonalAddresses, /**< List of personal e-mail addresses */ RootMaildir, /**< Root maildir path */ SchemaVersion, /**< Xapian DB schema version */ + SupportNgrams, /**< Support ngrams for indexing & querying + * for e.g. 
CJK languages */ /* */ _count_ /* Number of Ids */ }; @@ -61,12 +63,13 @@ struct Property { enum struct Flags { None = 0, /**< Nothing in particular */ ReadOnly = 1 << 0, /**< Property is read-only for external use - * (but can change from within the store) */ + * (but can change from within the store) */ Configurable = 1 << 1, /**< A user-configurable parameter; name * starts with 'conf-' */ Internal = 1 << 2, /**< Mu-internal field */ }; enum struct Type { + Boolean, /**< Some boolean value */ Number, /**< Some number */ Timestamp, /**< Timestamp number */ Path, /**< Path string */ @@ -176,6 +179,14 @@ public: {}, "Version of the Xapian database schema" }, + { + Id::SupportNgrams, + Type::Boolean, + Flags::Configurable, + "support-ngrams", + {}, + "Support n-grams for working with CJK and other languages" + }, }}; /** @@ -229,6 +240,9 @@ public: }); if constexpr (prop.type == Type::Number) return static_cast(str.empty() ? 0 : std::atoll(str.c_str())); + if constexpr (prop.type == Type::Boolean) + return static_cast(str.empty() ? false : + std::atol(str.c_str()) != 0); else if constexpr (prop.type == Type::Timestamp) return static_cast(str.empty() ? 0 : std::atoll(str.c_str())); else if constexpr (prop.type == Type::Path || prop.type == Type::String) @@ -257,6 +271,8 @@ public: const auto strval = std::invoke([&]{ if constexpr (prop.type == Type::Number || prop.type == Type::Timestamp) return mu_format("{}", static_cast(val)); + if constexpr (prop.type == Type::Boolean) + return val ? 
"1" : "0"; else if constexpr (prop.type == Type::Path || prop.type == Type::String) return std::string{val}; else if constexpr (prop.type == Type::StringList) diff --git a/lib/mu-query-xapianizer.cc b/lib/mu-query-xapianizer.cc index 62ca18a4..ff541682 100644 --- a/lib/mu-query-xapianizer.cc +++ b/lib/mu-query-xapianizer.cc @@ -32,7 +32,10 @@ using namespace Mu; - +// backward compat +#ifndef HAVE_XAPIAN_FLAG_NGRAMS +#define FLAG_NGRAMS FLAG_CJK_NGRAM +#endif /*HAVE_XAPIAN_FLAG_NGRAMS*/ /** * Expand terms for scripts without explicit word-breaks (e.g. @@ -42,25 +45,15 @@ using namespace Mu; static Result ngram_expand(const Field& field, const std::string& str) { - mu_println("ng: '{}'", str); - Xapian::QueryParser qp; const auto pfx{std::string(1U, field.xapian_prefix())}; qp.set_default_op(Xapian::Query::OP_OR); - return qp.parse_query( - str, -#if HAVE_XAPIAN_FLAG_NGRAMS - Xapian::QueryParser::FLAG_NGRAMS, -#else - Xapian::QueryParser::FLAG_CJK_NGRAM, -#endif /*HAVE_XAPIAN_FLAG_NGRAMS*/ - pfx); + return qp.parse_query(str, Xapian::QueryParser::FLAG_NGRAMS, pfx); } - static Option tail(Sexp&& s) { @@ -259,11 +252,10 @@ parse_field_matcher(const Store& store, const Field& field, } -static Result -parse_basic(const Field& field, Sexp&& vals, Mu::ParserFlags flags) +static Result parse_basic(const Field &field, Sexp &&vals, + Mu::ParserFlags flags) { - static auto ngrams = any_of(flags & ParserFlags::SupportNgrams); - + auto ngrams = any_of(flags & ParserFlags::SupportNgrams); if (!vals.stringp()) return Err(Error::Code::InvalidArgument, "expected string"); @@ -321,7 +313,6 @@ parse(const Store& store, Sexp&& s, Mu::ParserFlags flags) "expected field-value or field-matcher"); auto&& matcher{rest->front()}; - // field-value: (field "value"); ensure "value" is there if (matcher.stringp()) return parse_basic(*field, std::move(matcher), flags); @@ -468,14 +459,7 @@ main(int argc, char* argv[]) { mu_test_init(&argc, &argv); - Xapian::QueryParser qp; - // mu_println("{}", 
qp.parse_query("スポンサーシップ募集").get_description()); - // mu_println("{}", qp.parse_query("スポンサーシップ募集", Xapian::QueryParser::FLAG_NGRAMS).get_description()); - - // mu_println("{}", qp.parse_query("hello world").get_description()); - // mu_println("{}", qp.parse_query("hello world", Xapian::QueryParser::FLAG_NGRAMS).get_description()); - g_test_add_func("/query-parser/xapianizer", test_xapian); return g_test_run(); diff --git a/lib/mu-query.cc b/lib/mu-query.cc index 418f2017..3518e573 100644 --- a/lib/mu-query.cc +++ b/lib/mu-query.cc @@ -263,10 +263,13 @@ Query::run(const std::string& expr, Field::Id sortfield_id, g_return_val_if_fail(none_of(qflags & QueryFlags::Leader), Err(Error::Code::InvalidArgument, "cannot pass Leader flag")); - StopWatch sw{mu_format( - "ran query '{}'; related: {}; threads: {}; max-size: {}", expr, - any_of(qflags & QueryFlags::IncludeRelated) ? "yes" : "no", - any_of(qflags & QueryFlags::Threading) ? "yes" : "no", maxnum)}; + StopWatch sw{ + mu_format("query: '{}'; (related:{}; threads:{}; ngrams:{}; max-size:{})", + expr, + any_of(qflags & QueryFlags::IncludeRelated) ? "yes" : "no", + any_of(qflags & QueryFlags::Threading) ? "yes" : "no", + any_of(priv_->parser_flags_ & ParserFlags::SupportNgrams) ? "yes" : "no", + maxnum == 0 ? std::string{"∞"} : std::to_string(maxnum))}; return xapian_try_result([&]{ if (auto&& res = priv_->run(expr, sortfield_id, qflags, maxnum); res) diff --git a/lib/mu-query.hh b/lib/mu-query.hh index ad042e1b..ff216bd1 100644 --- a/lib/mu-query.hh +++ b/lib/mu-query.hh @@ -1,5 +1,5 @@ /* -** Copyright (C) 2008-2021 Dirk-Jan C. Binnema +** Copyright (C) 2008-2023 Dirk-Jan C. 
Binnema ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by diff --git a/lib/mu-store.cc b/lib/mu-store.cc index fe52114b..e5576278 100644 --- a/lib/mu-store.cc +++ b/lib/mu-store.cc @@ -70,7 +70,8 @@ struct Store::Private { : XapianDb::Flavor::Open)}, config_{xapian_db_}, contacts_cache_{config_}, - root_maildir_{remove_slash(config_.get())} + root_maildir_{remove_slash(config_.get())}, + message_opts_{make_message_options(config_)} {} Private(const std::string& path, const std::string& root_maildir, @@ -78,7 +79,8 @@ struct Store::Private { xapian_db_{XapianDb(path, XapianDb::Flavor::CreateOverwrite)}, config_{make_config(xapian_db_, root_maildir, conf)}, contacts_cache_{config_}, - root_maildir_{remove_slash(config_.get())} + root_maildir_{remove_slash(config_.get())}, + message_opts_{make_message_options(config_)} {} ~Private() try { @@ -133,6 +135,13 @@ struct Store::Private { return config; } + Message::Options make_message_options(const Config& conf) { + if (conf.get()) + return Message::Options::SupportNgrams; + else + return Message::Options::None; + } + Option find_message_unlocked(Store::Id docid) const; Store::IdVec find_duplicates_unlocked(const Store& store, const std::string& message_id) const; @@ -150,7 +159,8 @@ struct Store::Private { ContactsCache contacts_cache_; std::unique_ptr indexer_; - const std::string root_maildir_; + const std::string root_maildir_; + const Message::Options message_opts_; size_t transaction_size_{}; std::mutex lock_; @@ -340,6 +350,11 @@ Store::add_message(Message& msg, bool use_transaction, bool is_new) if (auto&& res = msg.set_maildir(mdir.value()); !res) return Err(res.error()); + // we shouldn't mix ngrams/non-ngrams messages. 
+ if (any_of(msg.options() & Message::Options::SupportNgrams) != + any_of(message_options() & Message::Options::SupportNgrams)) + return Err(Error::Code::InvalidArgument, "incompatible message options"); + /* add contacts from this message to cache; this cache * also determines whether those contacts are _personal_, i.e. match * our personal addresses. @@ -371,6 +386,16 @@ Store::add_message(Message& msg, bool use_transaction, bool is_new) return res; } +Result +Store::add_message(const std::string& path, bool use_transaction, bool is_new) +{ + if (auto msg{Message::make_from_path(path, priv_->message_opts_)}; !msg) + return Err(msg.error()); + else + return add_message(msg.value(), use_transaction, is_new); +} + + bool Store::remove_message(const std::string& path) { @@ -649,3 +674,9 @@ Store::maildirs() const return mdirs; } + +Message::Options +Store::message_options() const +{ + return priv_->message_opts_; +} diff --git a/lib/mu-store.hh b/lib/mu-store.hh index b36848ee..0ae0d2ef 100644 --- a/lib/mu-store.hh +++ b/lib/mu-store.hh @@ -207,21 +207,7 @@ public: Result add_message(Message& msg, bool use_transaction = false, bool is_new = false); Result add_message(const std::string& path, bool use_transaction = false, - bool is_new = false) { - if (auto msg{Message::make_from_path(path)}; !msg) - return Err(msg.error()); - else - return add_message(msg.value(), use_transaction, is_new); - } - - /** - * Update a message in the store. - * - * @param msg a message - * @param id the id for this message - * - * @return Ok() or an error. - */ + bool is_new = false); /** * Remove a message from the store. 
It will _not_ remove the message @@ -258,7 +244,6 @@ public: */ Option find_message(Id id) const; - /** * Find the messages for the given ids * @@ -288,7 +273,6 @@ public: */ bool contains_message(const std::string& path) const; - /** * Options for moving * @@ -437,6 +421,15 @@ public: */ std::vector maildirs() const; + + /** + * Compatible message-options for this store + * + * @return message-options. + */ + Message::Options message_options() const; + + /* * _almost_ private */ diff --git a/lib/utils/mu-test-utils.cc b/lib/utils/mu-test-utils.cc index dd9fe1c7..164e22e0 100644 --- a/lib/utils/mu-test-utils.cc +++ b/lib/utils/mu-test-utils.cc @@ -94,9 +94,12 @@ Mu::mu_test_init(int *argc, char ***argv) { const auto tmpdir{test_random_tmpdir()}; + g_unsetenv("XAPIAN_CJK_NGRAM"); g_setenv("MU_TEST", "yes", TRUE); g_setenv("XDG_CACHE_HOME", tmpdir.c_str(), TRUE); + setlocale(LC_ALL, ""); + g_test_init(argc, argv, NULL); g_test_bug_base("https://github.com/djcb/mu/issues/"); diff --git a/man/mu-init.1.org b/man/mu-init.1.org index 2a28700c..653afac5 100644 --- a/man/mu-init.1.org +++ b/man/mu-init.1.org @@ -17,6 +17,7 @@ has completed, you can run *mu index* * INIT OPTIONS ** -m, --maildir= + starts searching at ==. By default, *mu* uses whatever the *MAILDIR* environment variable is set to; if it is not set, it tries =~/Maildir= if it already exists. @@ -54,6 +55,13 @@ number of changes after which they are committed to the database; decreasing this reduces the memory requirements, but make indexing substantially slows (and vice-versa for increasing). Usually, the default of 250000 should be fine. +** --support-ngrams + +whether to enable support for using ngrams in indexing and query parsing; this +can be useful for languages without explicit word-breaks, such as +Chinese/Japanese/Korean. See *NGRAM SUPPORT* below. + + ** --reinit reinitialize the database from an earlier version; that is, create a new empty @@ -62,8 +70,20 @@ options. 
#+include: "muhome.inc" :minlevel 2 +* NGRAM SUPPORT + +*mu*'s underlying Xapian database supports 'ngrams', which improve searching for +languages/scripts that do not have explicit word breaks, such as Chinese, +Japanese and Korean. It is fairly intrusive, and influences both indexing and +query-parsing; it is not enabled by default, and is recommended only if you need +to search in such languages. + +When enabled, *mu* automatically uses ngrams. Xapian environment +variables such as ~XAPIAN_CJK_NGRAM~ are ignored. + #+include: "exit-code.inc" :minlevel 1 + * EXAMPLE #+begin_example $ mu init --maildir=~/Maildir --my-address=alice@example.com --my-address=bob@example.com --ignored-address='/.*reply.*/' diff --git a/meson.build b/meson.build index 5a1dd71b..b54be449 100644 --- a/meson.build +++ b/meson.build @@ -149,9 +149,17 @@ gobject_dep = dependency('gobject-2.0', version: '>= 2.60') gio_dep = dependency('gio-2.0', version: '>= 2.60') gio_unix_dep = dependency('gio-unix-2.0', version: '>= 2.60') gmime_dep = dependency('gmime-3.0', version: '>= 3.2') -xapian_dep = dependency('xapian-core', version:'>= 1.4') thread_dep = dependency('threads') +# we need Xapian 1.4; if we have 1.4.23, we have some newer APIs. +xapian_dep = dependency('xapian-core', version:'>= 1.4.23', required:false) +if xapian_dep.found() + config_h_data.set('HAVE_XAPIAN_FLAG_NGRAMS', 1) +else + xapian_dep = dependency('xapian-core', version:'>= 1.4') + message('Found xapian ' + xapian_dep.version()) +endif + # optionally, use Compact Language Detector2 if we can find it. 
cld2_dep = meson.get_compiler('cpp').find_library('cld2', required: false) if cld2_dep.found() diff --git a/mu/mu-cmd-info.cc b/mu/mu-cmd-info.cc index dd0f133a..71723d62 100644 --- a/mu/mu-cmd-info.cc +++ b/mu/mu-cmd-info.cc @@ -202,6 +202,8 @@ topic_store(const Mu::Store& store, const Options& opts) info.add_row({"ignored-address", c}); info.add_row({"messages in store", mu_format("{}", store.size())}); + info.add_row({"support-ngrams", conf.get() ? "yes" : "no"}); + info.add_row({"last-change", tstamp(store.statistics().last_change)}); info.add_row({"last-index", tstamp(store.statistics().last_index)}); diff --git a/mu/mu-cmd-init.cc b/mu/mu-cmd-init.cc index 3743c681..2bbbd915 100644 --- a/mu/mu-cmd-init.cc +++ b/mu/mu-cmd-init.cc @@ -55,6 +55,8 @@ Mu::mu_cmd_init(const Options& opts) conf.set(opts.init.my_addresses); if (!opts.init.ignored_addresses.empty()) conf.set(opts.init.ignored_addresses); + if (opts.init.support_ngrams) + conf.set(true); return Store::make_new(opts.runtime_path(RuntimePath::XapianDb), opts.init.maildir, conf); diff --git a/mu/mu-options.cc b/mu/mu-options.cc index dd8edd02..b603cc0d 100644 --- a/mu/mu-options.cc +++ b/mu/mu-options.cc @@ -457,13 +457,16 @@ sub_init(CLI::App& sub, Options& opts) "Maximum allowed message size in bytes"); sub.add_option("--batch-size", opts.init.batch_size, "Maximum size of database transaction"); + sub.add_option("--support-ngrams", opts.init.support_ngrams, + "Support CJK n-grams if for querying/indexing"); sub.add_flag("--reinit", opts.init.reinit, "Re-initialize database with current settings") ->excludes("--maildir") ->excludes("--my-address") ->excludes("--ignored-address") ->excludes("--max-message-size") - ->excludes("--batch-size"); + ->excludes("--batch-size") + ->excludes("--support-ngrams"); } static void diff --git a/mu/mu-options.hh b/mu/mu-options.hh index 8cfdb8bf..95f9f92a 100644 --- a/mu/mu-options.hh +++ b/mu/mu-options.hh @@ -185,13 +185,15 @@ struct Options { * Init */ struct Init { - 
std::string maildir; /**< where the mails are */ - StringVec my_addresses; /**< personal e-mail addresses */ - StringVec ignored_addresses; /**< addresses to be ignored for + std::string maildir; /**< where the mails are */ + StringVec my_addresses; /**< personal e-mail addresses */ + StringVec ignored_addresses; /**< addresses to be ignored for * the contacts-cache */ - OptSize max_msg_size; /**< max size for message files */ - OptSize batch_size; /**< db transaction batch size */ - bool reinit; /**< re-initialize */ + OptSize max_msg_size; /**< max size for message files */ + OptSize batch_size; /**< db transaction batch size */ + bool reinit; /**< re-initialize */ + bool support_ngrams; /**< support CJK etc. ngrams */ + } init; /* diff --git a/mu/mu.cc b/mu/mu.cc index 8147c7f5..5e20cafa 100644 --- a/mu/mu.cc +++ b/mu/mu.cc @@ -90,6 +90,14 @@ handle_result(const Result& res, const Mu::Options& opts) int main(int argc, char* argv[]) { + /* + * We handle this through explicit options + */ + g_unsetenv("XAPIAN_CJK_NGRAM"); + + /* + * set up locale + */ setlocale(LC_ALL, ""); /*