mirror of https://github.com/djcb/mu.git
support xapian ngrams
Xapian supports an "ngrams" option to help with languages/scripts without explicit wordbreaks, such as Chinese / Japanese / Korean. Add some plumbing for supporting this in mu as well. Experimental for now.
This commit is contained in:
parent
f6122ecc9e
commit
264bb092f0
|
@ -87,10 +87,11 @@ struct Indexer::Private {
|
|||
was_empty_{store.empty()} {
|
||||
|
||||
mu_message("created indexer for {} -> "
|
||||
"{} (batch-size: {}; was-empty: {})",
|
||||
"{} (batch-size: {}; was-empty: {}; ngrams: {})",
|
||||
store.root_maildir(), store.path(),
|
||||
store.config().get<Mu::Config::Id::BatchSize>(),
|
||||
was_empty_);
|
||||
was_empty_,
|
||||
store.config().get<Mu::Config::Id::SupportNgrams>());
|
||||
}
|
||||
|
||||
~Private() {
|
||||
|
@ -238,7 +239,7 @@ Indexer::Private::add_message(const std::string& path)
|
|||
*
|
||||
* std::unique_lock lock{w_lock_};
|
||||
*/
|
||||
auto msg{Message::make_from_path(path)};
|
||||
auto msg{Message::make_from_path(path, store_.message_options())};
|
||||
if (!msg) {
|
||||
mu_warning("failed to create message from {}: {}", path, msg.error().what());
|
||||
return false;
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
## Copyright (C) 2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||
## Copyright (C) 2022-2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||
##
|
||||
## This program is free software; you can redistribute it and/or modify
|
||||
## it under the terms of the GNU General Public License as published by
|
||||
|
@ -38,7 +38,7 @@ lib_mu_message=static_library(
|
|||
|
||||
lib_mu_message_dep = declare_dependency(
|
||||
link_with: lib_mu_message,
|
||||
dependencies: [ xapian_dep, gmime_dep, lib_mu_utils_dep ],
|
||||
dependencies: [ xapian_dep, gmime_dep, lib_mu_utils_dep, config_h_dep ],
|
||||
include_directories:
|
||||
include_directories(['.', '..']))
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
** Copyright (C) 2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||
** Copyright (C) 2022-2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||
**
|
||||
** This program is free software; you can redistribute it and/or modify it
|
||||
** under the terms of the GNU General Public License as published by the
|
||||
|
@ -16,6 +16,7 @@
|
|||
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
**
|
||||
*/
|
||||
#include "config.h"
|
||||
|
||||
#include "mu-document.hh"
|
||||
#include "mu-message.hh"
|
||||
|
@ -31,9 +32,14 @@
|
|||
#include <string>
|
||||
#include <utils/mu-utils.hh>
|
||||
|
||||
|
||||
using namespace Mu;
|
||||
|
||||
// backward compat
|
||||
#ifndef HAVE_XAPIAN_FLAG_NGRAMS
|
||||
#define FLAG_NGRAMS FLAG_CJK_NGRAM
|
||||
#endif /*HAVE_XAPIAN_FLAG_NGRAMS*/
|
||||
|
||||
|
||||
const Xapian::Document&
|
||||
Document::xapian_document() const
|
||||
{
|
||||
|
@ -58,16 +64,29 @@ Document::put_prop(const Field& field, SexpType&& val)
|
|||
std::forward<SexpType>(val));
|
||||
}
|
||||
|
||||
/**
 * Create a term generator that is bound to a Xapian document.
 *
 * @param doc the document the generator is attached to
 * @param opts document options; when SupportNgrams is set, the
 * generator gets its ngrams flag enabled before it is attached
 *
 * @return the term generator
 */
static Xapian::TermGenerator
make_term_generator(Xapian::Document& doc, Document::Options opts)
{
	const auto with_ngrams{any_of(opts & Document::Options::SupportNgrams)};

	Xapian::TermGenerator generator;
	if (with_ngrams)
		generator.set_flags(Xapian::TermGenerator::FLAG_NGRAMS);
	generator.set_document(doc);

	return generator;
}
|
||||
|
||||
static void
|
||||
add_search_term(Xapian::Document& doc, const Field& field, const std::string& val)
|
||||
add_search_term(Xapian::Document& doc, const Field& field, const std::string& val,
|
||||
Document::Options opts)
|
||||
{
|
||||
if (field.is_normal_term()) {
|
||||
doc.add_term(field.xapian_term(val));
|
||||
} else if (field.is_boolean_term()) {
|
||||
doc.add_boolean_term(field.xapian_term(val));
|
||||
} else if (field.is_indexable_term()) {
|
||||
Xapian::TermGenerator termgen;
|
||||
termgen.set_document(doc);
|
||||
auto&& termgen{make_term_generator(doc, opts)};
|
||||
termgen.index_text(utf8_flatten(val), 1, field.xapian_term());
|
||||
/* also add as 'normal' term, so some queries where the indexer
|
||||
* eats special chars also match */
|
||||
|
@ -88,7 +107,7 @@ Document::add(Field::Id id, const std::string& val)
|
|||
xdoc_.add_value(field.value_no(), val);
|
||||
|
||||
if (field.is_searchable())
|
||||
add_search_term(xdoc_, field, val);
|
||||
add_search_term(xdoc_, field, val, options_);
|
||||
|
||||
if (field.include_in_sexp())
|
||||
put_prop(field, val);
|
||||
|
@ -107,7 +126,7 @@ Document::add(Field::Id id, const std::vector<std::string>& vals)
|
|||
if (field.is_searchable())
|
||||
std::for_each(vals.begin(), vals.end(),
|
||||
[&](const auto& val) {
|
||||
add_search_term(xdoc_, field, val); });
|
||||
add_search_term(xdoc_, field, val, options_); });
|
||||
|
||||
if (field.include_in_sexp()) {
|
||||
Sexp elms{};
|
||||
|
@ -149,9 +168,7 @@ Document::add(Field::Id id, const Contacts& contacts)
|
|||
std::vector<std::string> cvec;
|
||||
|
||||
const std::string sepa2(1, SepaChar2);
|
||||
|
||||
Xapian::TermGenerator termgen;
|
||||
termgen.set_document(xdoc_);
|
||||
auto&& termgen{make_term_generator(xdoc_, options_)};
|
||||
|
||||
for (auto&& contact: contacts) {
|
||||
|
||||
|
|
|
@ -41,17 +41,27 @@ namespace Mu {
|
|||
*/
|
||||
class Document {
|
||||
public:
|
||||
enum struct Options {
|
||||
None = 0,
|
||||
SupportNgrams = 1 << 0, /**< Support ngrams, as used in
|
||||
* CJK and other languages. */
|
||||
};
|
||||
|
||||
/**
|
||||
* Construct a message for a new Xapian Document
|
||||
*
|
||||
* @param opts behavioral options
|
||||
*/
|
||||
Document() {}
|
||||
Document(Options opts = Options::None): options_{opts} {}
|
||||
|
||||
/**
|
||||
* Construct a message document based on an existing Xapian document.
|
||||
*
|
||||
* @param doc
|
||||
* @param opts behavioral options
|
||||
*/
|
||||
Document(const Xapian::Document& doc): xdoc_{doc} {}
|
||||
Document(const Xapian::Document& doc, Options opts = Options::None):
|
||||
xdoc_{doc}, options_{opts} {}
|
||||
|
||||
/**
|
||||
* DTOR
|
||||
|
@ -240,11 +250,12 @@ private:
|
|||
return cached_sexp_;
|
||||
}
|
||||
|
||||
|
||||
mutable Xapian::Document xdoc_;
|
||||
Options options_;
|
||||
mutable Sexp cached_sexp_;
|
||||
mutable bool dirty_sexp_{}; /* xdoc's sexp is outdated */
|
||||
};
|
||||
MU_ENABLE_BITOPS(Document::Options);
|
||||
|
||||
} // namespace Mu
|
||||
|
||||
|
|
|
@ -45,9 +45,10 @@
|
|||
using namespace Mu;
|
||||
|
||||
struct Message::Private {
|
||||
Private(Message::Options options): opts{options} {}
|
||||
Private(Message::Options options):
|
||||
opts{options}, doc{doc_opts(opts)} {}
|
||||
Private(Message::Options options, Xapian::Document&& xdoc):
|
||||
opts{options}, doc{std::move(xdoc)} {}
|
||||
opts{options}, doc{std::move(xdoc), doc_opts(opts)} {}
|
||||
|
||||
Message::Options opts;
|
||||
Document doc;
|
||||
|
@ -70,6 +71,13 @@ struct Message::Private {
|
|||
Option<std::string> embedded;
|
||||
|
||||
Option<std::string> language; /* body ISO language code */
|
||||
|
||||
private:
|
||||
/**
 * Map message options onto the corresponding document options.
 *
 * Static, and only looks at its argument: it is called from the
 * constructors' member-initializer lists, so it must not depend on
 * which members have already been initialized.
 *
 * @param mopts message options
 *
 * @return Document::Options::SupportNgrams if @p mopts has
 * Message::Options::SupportNgrams set; Document::Options::None
 * otherwise.
 */
static Document::Options doc_opts(Message::Options mopts) {
	return any_of(mopts & Message::Options::SupportNgrams) ?
		Document::Options::SupportNgrams :
		Document::Options::None;
}
|
||||
};
|
||||
|
||||
|
||||
|
@ -176,6 +184,11 @@ Message::document() const
|
|||
return priv_->doc;
|
||||
}
|
||||
|
||||
/* Accessor for the options stored in this message's private data. */
Message::Options
Message::options() const
{
	const auto& priv{*priv_};

	return priv.opts;
}
|
||||
|
||||
unsigned
|
||||
Message::docid() const
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
** Copyright (C) 2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||
** Copyright (C) 2022-2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||
**
|
||||
** This program is free software; you can redistribute it and/or modify it
|
||||
** under the terms of the GNU General Public License as published by the
|
||||
|
@ -49,8 +49,10 @@ public:
|
|||
Decrypt = 1 << 0, /**< Attempt to decrypt */
|
||||
RetrieveKeys = 1 << 1, /**< Auto-retrieve crypto keys (implies network
|
||||
* access) */
|
||||
AllowRelativePath = 1 << 2, /**< Allow relateive paths for filename
|
||||
AllowRelativePath = 1 << 2, /**< Allow relative paths for filename
|
||||
* in make_from_path */
|
||||
SupportNgrams = 1 << 3, /**< Support ngrams, as used in
|
||||
* CJK and other languages. */
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -60,7 +62,6 @@ public:
|
|||
*/
|
||||
Message(Message&& other) noexcept;
|
||||
|
||||
|
||||
/**
|
||||
* operator=
|
||||
*
|
||||
|
@ -147,6 +148,14 @@ public:
|
|||
const Document& document() const;
|
||||
|
||||
|
||||
/**
|
||||
* The message options for this message
|
||||
*
|
||||
* @return message options
|
||||
*/
|
||||
Options options() const;
|
||||
|
||||
|
||||
/**
|
||||
* Get the document-id, or 0 if non-existent.
|
||||
*
|
||||
|
|
|
@ -51,6 +51,8 @@ struct Property {
|
|||
PersonalAddresses, /**< List of personal e-mail addresses */
|
||||
RootMaildir, /**< Root maildir path */
|
||||
SchemaVersion, /**< Xapian DB schema version */
|
||||
SupportNgrams, /**< Support ngrams for indexing & querying
|
||||
* for e.g. CJK languages */
|
||||
/* <private> */
|
||||
_count_ /* Number of Ids */
|
||||
};
|
||||
|
@ -61,12 +63,13 @@ struct Property {
|
|||
enum struct Flags {
|
||||
None = 0, /**< Nothing in particular */
|
||||
ReadOnly = 1 << 0, /**< Property is read-only for external use
|
||||
* (but can change from within the store) */
|
||||
* (but can change from within the store) */
|
||||
Configurable = 1 << 1, /**< A user-configurable parameter; name
|
||||
* starts with 'conf-' */
|
||||
Internal = 1 << 2, /**< Mu-internal field */
|
||||
};
|
||||
enum struct Type {
|
||||
Boolean, /**< Some boolean value */
|
||||
Number, /**< Some number */
|
||||
Timestamp, /**< Timestamp number */
|
||||
Path, /**< Path string */
|
||||
|
@ -176,6 +179,14 @@ public:
|
|||
{},
|
||||
"Version of the Xapian database schema"
|
||||
},
|
||||
{
|
||||
Id::SupportNgrams,
|
||||
Type::Boolean,
|
||||
Flags::Configurable,
|
||||
"support-ngrams",
|
||||
{},
|
||||
"Support n-grams for working with CJK and other languages"
|
||||
},
|
||||
}};
|
||||
|
||||
/**
|
||||
|
@ -229,6 +240,9 @@ public:
|
|||
});
|
||||
if constexpr (prop.type == Type::Number)
|
||||
return static_cast<size_t>(str.empty() ? 0 : std::atoll(str.c_str()));
|
||||
if constexpr (prop.type == Type::Boolean)
|
||||
return static_cast<size_t>(str.empty() ? false :
|
||||
std::atol(str.c_str()) != 0);
|
||||
else if constexpr (prop.type == Type::Timestamp)
|
||||
return static_cast<time_t>(str.empty() ? 0 : std::atoll(str.c_str()));
|
||||
else if constexpr (prop.type == Type::Path || prop.type == Type::String)
|
||||
|
@ -257,6 +271,8 @@ public:
|
|||
const auto strval = std::invoke([&]{
|
||||
if constexpr (prop.type == Type::Number || prop.type == Type::Timestamp)
|
||||
return mu_format("{}", static_cast<int64_t>(val));
|
||||
if constexpr (prop.type == Type::Boolean)
|
||||
return val ? "1" : "0";
|
||||
else if constexpr (prop.type == Type::Path || prop.type == Type::String)
|
||||
return std::string{val};
|
||||
else if constexpr (prop.type == Type::StringList)
|
||||
|
|
|
@ -32,7 +32,10 @@
|
|||
|
||||
using namespace Mu;
|
||||
|
||||
|
||||
// backward compat
|
||||
#ifndef HAVE_XAPIAN_FLAG_NGRAMS
|
||||
#define FLAG_NGRAMS FLAG_CJK_NGRAM
|
||||
#endif /*HAVE_XAPIAN_FLAG_NGRAMS*/
|
||||
|
||||
/**
|
||||
* Expand terms for scripts without explicit word-breaks (e.g.
|
||||
|
@ -42,25 +45,15 @@ using namespace Mu;
|
|||
static Result<Xapian::Query>
|
||||
ngram_expand(const Field& field, const std::string& str)
|
||||
{
|
||||
mu_println("ng: '{}'", str);
|
||||
|
||||
Xapian::QueryParser qp;
|
||||
const auto pfx{std::string(1U, field.xapian_prefix())};
|
||||
|
||||
qp.set_default_op(Xapian::Query::OP_OR);
|
||||
|
||||
return qp.parse_query(
|
||||
str,
|
||||
#if HAVE_XAPIAN_FLAG_NGRAMS
|
||||
Xapian::QueryParser::FLAG_NGRAMS,
|
||||
#else
|
||||
Xapian::QueryParser::FLAG_CJK_NGRAM,
|
||||
#endif /*HAVE_XAPIAN_FLAG_NGRAMS*/
|
||||
pfx);
|
||||
return qp.parse_query(str, Xapian::QueryParser::FLAG_NGRAMS, pfx);
|
||||
}
|
||||
|
||||
|
||||
|
||||
static Option<Sexp>
|
||||
tail(Sexp&& s)
|
||||
{
|
||||
|
@ -259,11 +252,10 @@ parse_field_matcher(const Store& store, const Field& field,
|
|||
}
|
||||
|
||||
|
||||
static Result<Xapian::Query>
|
||||
parse_basic(const Field& field, Sexp&& vals, Mu::ParserFlags flags)
|
||||
static Result<Xapian::Query> parse_basic(const Field &field, Sexp &&vals,
|
||||
Mu::ParserFlags flags)
|
||||
{
|
||||
static auto ngrams = any_of(flags & ParserFlags::SupportNgrams);
|
||||
|
||||
auto ngrams = any_of(flags & ParserFlags::SupportNgrams);
|
||||
if (!vals.stringp())
|
||||
return Err(Error::Code::InvalidArgument, "expected string");
|
||||
|
||||
|
@ -321,7 +313,6 @@ parse(const Store& store, Sexp&& s, Mu::ParserFlags flags)
|
|||
"expected field-value or field-matcher");
|
||||
|
||||
auto&& matcher{rest->front()};
|
||||
|
||||
// field-value: (field "value"); ensure "value" is there
|
||||
if (matcher.stringp())
|
||||
return parse_basic(*field, std::move(matcher), flags);
|
||||
|
@ -468,14 +459,7 @@ main(int argc, char* argv[])
|
|||
{
|
||||
mu_test_init(&argc, &argv);
|
||||
|
||||
|
||||
Xapian::QueryParser qp;
|
||||
// mu_println("{}", qp.parse_query("スポンサーシップ募集").get_description());
|
||||
// mu_println("{}", qp.parse_query("スポンサーシップ募集", Xapian::QueryParser::FLAG_NGRAMS).get_description());
|
||||
|
||||
// mu_println("{}", qp.parse_query("hello world").get_description());
|
||||
// mu_println("{}", qp.parse_query("hello world", Xapian::QueryParser::FLAG_NGRAMS).get_description());
|
||||
|
||||
g_test_add_func("/query-parser/xapianizer", test_xapian);
|
||||
|
||||
return g_test_run();
|
||||
|
|
|
@ -263,10 +263,13 @@ Query::run(const std::string& expr, Field::Id sortfield_id,
|
|||
g_return_val_if_fail(none_of(qflags & QueryFlags::Leader),
|
||||
Err(Error::Code::InvalidArgument, "cannot pass Leader flag"));
|
||||
|
||||
StopWatch sw{mu_format(
|
||||
"ran query '{}'; related: {}; threads: {}; max-size: {}", expr,
|
||||
any_of(qflags & QueryFlags::IncludeRelated) ? "yes" : "no",
|
||||
any_of(qflags & QueryFlags::Threading) ? "yes" : "no", maxnum)};
|
||||
StopWatch sw{
|
||||
mu_format("query: '{}'; (related:{}; threads:{}; ngrams:{}; max-size:{})",
|
||||
expr,
|
||||
any_of(qflags & QueryFlags::IncludeRelated) ? "yes" : "no",
|
||||
any_of(qflags & QueryFlags::Threading) ? "yes" : "no",
|
||||
any_of(priv_->parser_flags_ & ParserFlags::SupportNgrams) ? "yes" : "no",
|
||||
maxnum == 0 ? std::string{"∞"} : std::to_string(maxnum))};
|
||||
|
||||
return xapian_try_result([&]{
|
||||
if (auto&& res = priv_->run(expr, sortfield_id, qflags, maxnum); res)
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
** Copyright (C) 2008-2021 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||
** Copyright (C) 2008-2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||
**
|
||||
** This program is free software; you can redistribute it and/or modify
|
||||
** it under the terms of the GNU General Public License as published by
|
||||
|
|
|
@ -70,7 +70,8 @@ struct Store::Private {
|
|||
: XapianDb::Flavor::Open)},
|
||||
config_{xapian_db_},
|
||||
contacts_cache_{config_},
|
||||
root_maildir_{remove_slash(config_.get<Config::Id::RootMaildir>())}
|
||||
root_maildir_{remove_slash(config_.get<Config::Id::RootMaildir>())},
|
||||
message_opts_{make_message_options(config_)}
|
||||
{}
|
||||
|
||||
Private(const std::string& path, const std::string& root_maildir,
|
||||
|
@ -78,7 +79,8 @@ struct Store::Private {
|
|||
xapian_db_{XapianDb(path, XapianDb::Flavor::CreateOverwrite)},
|
||||
config_{make_config(xapian_db_, root_maildir, conf)},
|
||||
contacts_cache_{config_},
|
||||
root_maildir_{remove_slash(config_.get<Config::Id::RootMaildir>())}
|
||||
root_maildir_{remove_slash(config_.get<Config::Id::RootMaildir>())},
|
||||
message_opts_{make_message_options(config_)}
|
||||
{}
|
||||
|
||||
~Private() try {
|
||||
|
@ -133,6 +135,13 @@ struct Store::Private {
|
|||
return config;
|
||||
}
|
||||
|
||||
/**
 * Determine the message options that match a configuration.
 *
 * @param conf the configuration to inspect
 *
 * @return Message::Options::SupportNgrams when the SupportNgrams
 * property evaluates to true; Message::Options::None otherwise.
 */
Message::Options make_message_options(const Config& conf) {
	return conf.get<Config::Id::SupportNgrams>()
		? Message::Options::SupportNgrams
		: Message::Options::None;
}
|
||||
|
||||
Option<Message> find_message_unlocked(Store::Id docid) const;
|
||||
Store::IdVec find_duplicates_unlocked(const Store& store,
|
||||
const std::string& message_id) const;
|
||||
|
@ -150,7 +159,8 @@ struct Store::Private {
|
|||
ContactsCache contacts_cache_;
|
||||
std::unique_ptr<Indexer> indexer_;
|
||||
|
||||
const std::string root_maildir_;
|
||||
const std::string root_maildir_;
|
||||
const Message::Options message_opts_;
|
||||
|
||||
size_t transaction_size_{};
|
||||
std::mutex lock_;
|
||||
|
@ -340,6 +350,11 @@ Store::add_message(Message& msg, bool use_transaction, bool is_new)
|
|||
if (auto&& res = msg.set_maildir(mdir.value()); !res)
|
||||
return Err(res.error());
|
||||
|
||||
// we shouldn't mix ngrams/non-ngrams messages.
|
||||
if (any_of(msg.options() & Message::Options::SupportNgrams) !=
|
||||
any_of(message_options() & Message::Options::SupportNgrams))
|
||||
return Err(Error::Code::InvalidArgument, "incompatible message options");
|
||||
|
||||
/* add contacts from this message to cache; this cache
|
||||
* also determines whether those contacts are _personal_, i.e. match
|
||||
* our personal addresses.
|
||||
|
@ -371,6 +386,16 @@ Store::add_message(Message& msg, bool use_transaction, bool is_new)
|
|||
return res;
|
||||
}
|
||||
|
||||
/* Create a message from the file at path, using the message options
 * kept in this store's private data, and hand it to the Message&
 * overload. If the message cannot be created, its error is returned. */
Result<Store::Id>
Store::add_message(const std::string& path, bool use_transaction, bool is_new)
{
	auto parsed{Message::make_from_path(path, priv_->message_opts_)};
	if (!parsed)
		return Err(parsed.error());

	return add_message(parsed.value(), use_transaction, is_new);
}
|
||||
|
||||
|
||||
bool
|
||||
Store::remove_message(const std::string& path)
|
||||
{
|
||||
|
@ -649,3 +674,9 @@ Store::maildirs() const
|
|||
|
||||
return mdirs;
|
||||
}
|
||||
|
||||
/* Accessor for the message options kept in this store's private data. */
Message::Options
Store::message_options() const
{
	const auto& priv{*priv_};

	return priv.message_opts_;
}
|
||||
|
|
|
@ -207,21 +207,7 @@ public:
|
|||
Result<Id> add_message(Message& msg, bool use_transaction = false,
|
||||
bool is_new = false);
|
||||
Result<Id> add_message(const std::string& path, bool use_transaction = false,
|
||||
bool is_new = false) {
|
||||
if (auto msg{Message::make_from_path(path)}; !msg)
|
||||
return Err(msg.error());
|
||||
else
|
||||
return add_message(msg.value(), use_transaction, is_new);
|
||||
}
|
||||
|
||||
/**
|
||||
* Update a message in the store.
|
||||
*
|
||||
* @param msg a message
|
||||
* @param id the id for this message
|
||||
*
|
||||
* @return Ok() or an error.
|
||||
*/
|
||||
bool is_new = false);
|
||||
|
||||
/**
|
||||
* Remove a message from the store. It will _not_ remove the message
|
||||
|
@ -258,7 +244,6 @@ public:
|
|||
*/
|
||||
Option<Message> find_message(Id id) const;
|
||||
|
||||
|
||||
/**
|
||||
* Find the messages for the given ids
|
||||
*
|
||||
|
@ -288,7 +273,6 @@ public:
|
|||
*/
|
||||
bool contains_message(const std::string& path) const;
|
||||
|
||||
|
||||
/**
|
||||
* Options for moving
|
||||
*
|
||||
|
@ -437,6 +421,15 @@ public:
|
|||
*/
|
||||
std::vector<std::string> maildirs() const;
|
||||
|
||||
|
||||
/**
|
||||
* Compatible message-options for this store
|
||||
*
|
||||
* @return message-options.
|
||||
*/
|
||||
Message::Options message_options() const;
|
||||
|
||||
|
||||
/*
|
||||
* _almost_ private
|
||||
*/
|
||||
|
|
|
@ -94,9 +94,12 @@ Mu::mu_test_init(int *argc, char ***argv)
|
|||
{
|
||||
const auto tmpdir{test_random_tmpdir()};
|
||||
|
||||
g_unsetenv("XAPIAN_CJK_NGRAM");
|
||||
g_setenv("MU_TEST", "yes", TRUE);
|
||||
g_setenv("XDG_CACHE_HOME", tmpdir.c_str(), TRUE);
|
||||
|
||||
setlocale(LC_ALL, "");
|
||||
|
||||
g_test_init(argc, argv, NULL);
|
||||
|
||||
g_test_bug_base("https://github.com/djcb/mu/issues/");
|
||||
|
|
|
@ -17,6 +17,7 @@ has completed, you can run *mu index*
|
|||
* INIT OPTIONS
|
||||
|
||||
** -m, --maildir=<maildir>
|
||||
|
||||
starts searching at =<maildir>=. By default, *mu* uses whatever the *MAILDIR*
|
||||
environment variable is set to; if it is not set, it tries =~/Maildir= if it
|
||||
already exists.
|
||||
|
@ -54,6 +55,13 @@ number of changes after which they are committed to the database; decreasing
|
|||
this reduces the memory requirements, but makes indexing substantially slower (and
|
||||
vice-versa for increasing). Usually, the default of 250000 should be fine.
|
||||
|
||||
** --support-ngrams
|
||||
|
||||
whether to enable support for using ngrams in indexing and query parsing; this
|
||||
can be useful for languages without explicit word-breaks, such as
|
||||
Chinese/Japanese/Korean. See *NGRAM SUPPORT* below.
|
||||
|
||||
|
||||
** --reinit
|
||||
|
||||
reinitialize the database from an earlier version; that is, create a new empty
|
||||
|
@ -62,8 +70,20 @@ options.
|
|||
|
||||
#+include: "muhome.inc" :minlevel 2
|
||||
|
||||
* NGRAM SUPPORT
|
||||
|
||||
*mu*'s underlying Xapian database supports 'ngrams', which improve searching for
|
||||
languages/scripts that do not have explicit word breaks, such as Chinese,
|
||||
Japanese and Korean. It is fairly intrusive, and influences both indexing and
|
||||
query-parsing; it is not enabled by default, and is recommended only if you need
|
||||
to search in such languages.
|
||||
|
||||
When enabled, *mu* uses ngrams automatically. Xapian environment
|
||||
variables such as ~XAPIAN_CJK_NGRAM~ are ignored.
|
||||
|
||||
#+include: "exit-code.inc" :minlevel 1
|
||||
|
||||
|
||||
* EXAMPLE
|
||||
#+begin_example
|
||||
$ mu init --maildir=~/Maildir --my-address=alice@example.com --my-address=bob@example.com --ignored-address='/.*reply.*/'
|
||||
|
|
10
meson.build
10
meson.build
|
@ -149,9 +149,17 @@ gobject_dep = dependency('gobject-2.0', version: '>= 2.60')
|
|||
gio_dep = dependency('gio-2.0', version: '>= 2.60')
|
||||
gio_unix_dep = dependency('gio-unix-2.0', version: '>= 2.60')
|
||||
gmime_dep = dependency('gmime-3.0', version: '>= 3.2')
|
||||
xapian_dep = dependency('xapian-core', version:'>= 1.4')
|
||||
thread_dep = dependency('threads')
|
||||
|
||||
# we need Xapian 1.4; if we have 1.4.23, we have some newer APIs.
|
||||
xapian_dep = dependency('xapian-core', version:'>= 1.4.23', required:false)
|
||||
if xapian_dep.found()
|
||||
config_h_data.set('HAVE_XAPIAN_FLAG_NGRAMS', 1)
|
||||
else
|
||||
xapian_dep = dependency('xapian-core', version:'>= 1.4')
|
||||
message('Found xapian ' + xapian_dep.version())
|
||||
endif
|
||||
|
||||
# optionally, use Compact Language Detector2 if we can find it.
|
||||
cld2_dep = meson.get_compiler('cpp').find_library('cld2', required: false)
|
||||
if cld2_dep.found()
|
||||
|
|
|
@ -202,6 +202,8 @@ topic_store(const Mu::Store& store, const Options& opts)
|
|||
info.add_row({"ignored-address", c});
|
||||
|
||||
info.add_row({"messages in store", mu_format("{}", store.size())});
|
||||
info.add_row({"support-ngrams", conf.get<Config::Id::SupportNgrams>() ? "yes" : "no"});
|
||||
|
||||
info.add_row({"last-change", tstamp(store.statistics().last_change)});
|
||||
info.add_row({"last-index", tstamp(store.statistics().last_index)});
|
||||
|
||||
|
|
|
@ -55,6 +55,8 @@ Mu::mu_cmd_init(const Options& opts)
|
|||
conf.set<Config::Id::PersonalAddresses>(opts.init.my_addresses);
|
||||
if (!opts.init.ignored_addresses.empty())
|
||||
conf.set<Config::Id::IgnoredAddresses>(opts.init.ignored_addresses);
|
||||
if (opts.init.support_ngrams)
|
||||
conf.set<Config::Id::SupportNgrams>(true);
|
||||
|
||||
return Store::make_new(opts.runtime_path(RuntimePath::XapianDb),
|
||||
opts.init.maildir, conf);
|
||||
|
|
|
@ -457,13 +457,16 @@ sub_init(CLI::App& sub, Options& opts)
|
|||
"Maximum allowed message size in bytes");
|
||||
sub.add_option("--batch-size", opts.init.batch_size,
|
||||
"Maximum size of database transaction");
|
||||
sub.add_option("--support-ngrams", opts.init.support_ngrams,
|
||||
"Support CJK n-grams if for querying/indexing");
|
||||
sub.add_flag("--reinit", opts.init.reinit,
|
||||
"Re-initialize database with current settings")
|
||||
->excludes("--maildir")
|
||||
->excludes("--my-address")
|
||||
->excludes("--ignored-address")
|
||||
->excludes("--max-message-size")
|
||||
->excludes("--batch-size");
|
||||
->excludes("--batch-size")
|
||||
->excludes("--support-ngrams");
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
|
@ -185,13 +185,15 @@ struct Options {
|
|||
* Init
|
||||
*/
|
||||
struct Init {
|
||||
std::string maildir; /**< where the mails are */
|
||||
StringVec my_addresses; /**< personal e-mail addresses */
|
||||
StringVec ignored_addresses; /**< addresses to be ignored for
|
||||
std::string maildir; /**< where the mails are */
|
||||
StringVec my_addresses; /**< personal e-mail addresses */
|
||||
StringVec ignored_addresses; /**< addresses to be ignored for
|
||||
* the contacts-cache */
|
||||
OptSize max_msg_size; /**< max size for message files */
|
||||
OptSize batch_size; /**< db transaction batch size */
|
||||
bool reinit; /**< re-initialize */
|
||||
OptSize max_msg_size; /**< max size for message files */
|
||||
OptSize batch_size; /**< db transaction batch size */
|
||||
bool reinit; /**< re-initialize */
|
||||
bool support_ngrams; /**< support CJK etc. ngrams */
|
||||
|
||||
} init;
|
||||
|
||||
/*
|
||||
|
|
Loading…
Reference in New Issue