support xapian ngrams

Xapian supports an "ngrams" option to help with languages/scripts
without explicit wordbreaks, such as Chinese / Japanese / Korean.

Add some plumbing for supporting this in mu as well. Experimental for
now.
This commit is contained in:
Dirk-Jan C. Binnema 2023-09-09 11:57:05 +03:00
parent f6122ecc9e
commit 264bb092f0
20 changed files with 207 additions and 81 deletions

View File

@ -87,10 +87,11 @@ struct Indexer::Private {
was_empty_{store.empty()} {
mu_message("created indexer for {} -> "
"{} (batch-size: {}; was-empty: {})",
"{} (batch-size: {}; was-empty: {}; ngrams: {})",
store.root_maildir(), store.path(),
store.config().get<Mu::Config::Id::BatchSize>(),
was_empty_);
was_empty_,
store.config().get<Mu::Config::Id::SupportNgrams>());
}
~Private() {
@ -238,7 +239,7 @@ Indexer::Private::add_message(const std::string& path)
*
* std::unique_lock lock{w_lock_};
*/
auto msg{Message::make_from_path(path)};
auto msg{Message::make_from_path(path, store_.message_options())};
if (!msg) {
mu_warning("failed to create message from {}: {}", path, msg.error().what());
return false;

View File

@ -1,4 +1,4 @@
## Copyright (C) 2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
## Copyright (C) 2022-2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
@ -38,7 +38,7 @@ lib_mu_message=static_library(
lib_mu_message_dep = declare_dependency(
link_with: lib_mu_message,
dependencies: [ xapian_dep, gmime_dep, lib_mu_utils_dep ],
dependencies: [ xapian_dep, gmime_dep, lib_mu_utils_dep, config_h_dep ],
include_directories:
include_directories(['.', '..']))

View File

@ -1,5 +1,5 @@
/*
** Copyright (C) 2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
** Copyright (C) 2022-2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
@ -16,6 +16,7 @@
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/
#include "config.h"
#include "mu-document.hh"
#include "mu-message.hh"
@ -31,9 +32,14 @@
#include <string>
#include <utils/mu-utils.hh>
using namespace Mu;
// backward compat
#ifndef HAVE_XAPIAN_FLAG_NGRAMS
#define FLAG_NGRAMS FLAG_CJK_NGRAM
#endif /*HAVE_XAPIAN_FLAG_NGRAMS*/
const Xapian::Document&
Document::xapian_document() const
{
@ -58,16 +64,29 @@ Document::put_prop(const Field& field, SexpType&& val)
std::forward<SexpType>(val));
}
/**
 * Create a term-generator that adds its terms to the given document.
 *
 * @param doc the Xapian document the generator is bound to
 * @param opts document options; when SupportNgrams is set, FLAG_NGRAMS is
 * enabled on the generator
 *
 * @return the configured term-generator
 */
static Xapian::TermGenerator
make_term_generator(Xapian::Document& doc, Document::Options opts)
{
	const auto with_ngrams = any_of(opts & Document::Options::SupportNgrams);

	Xapian::TermGenerator generator;
	if (with_ngrams)
		generator.set_flags(Xapian::TermGenerator::FLAG_NGRAMS);
	generator.set_document(doc);

	return generator;
}
static void
add_search_term(Xapian::Document& doc, const Field& field, const std::string& val)
add_search_term(Xapian::Document& doc, const Field& field, const std::string& val,
Document::Options opts)
{
if (field.is_normal_term()) {
doc.add_term(field.xapian_term(val));
} else if (field.is_boolean_term()) {
doc.add_boolean_term(field.xapian_term(val));
} else if (field.is_indexable_term()) {
Xapian::TermGenerator termgen;
termgen.set_document(doc);
auto&& termgen{make_term_generator(doc, opts)};
termgen.index_text(utf8_flatten(val), 1, field.xapian_term());
/* also add as 'normal' term, so some queries where the indexer
* eats special chars also match */
@ -88,7 +107,7 @@ Document::add(Field::Id id, const std::string& val)
xdoc_.add_value(field.value_no(), val);
if (field.is_searchable())
add_search_term(xdoc_, field, val);
add_search_term(xdoc_, field, val, options_);
if (field.include_in_sexp())
put_prop(field, val);
@ -107,7 +126,7 @@ Document::add(Field::Id id, const std::vector<std::string>& vals)
if (field.is_searchable())
std::for_each(vals.begin(), vals.end(),
[&](const auto& val) {
add_search_term(xdoc_, field, val); });
add_search_term(xdoc_, field, val, options_); });
if (field.include_in_sexp()) {
Sexp elms{};
@ -149,9 +168,7 @@ Document::add(Field::Id id, const Contacts& contacts)
std::vector<std::string> cvec;
const std::string sepa2(1, SepaChar2);
Xapian::TermGenerator termgen;
termgen.set_document(xdoc_);
auto&& termgen{make_term_generator(xdoc_, options_)};
for (auto&& contact: contacts) {

View File

@ -41,17 +41,27 @@ namespace Mu {
*/
class Document {
public:
/**
 * Behavioral options for a Document; the values are bit-flags.
 */
enum struct Options {
None = 0, /**< No special options */
SupportNgrams = 1 << 0, /**< Support ngrams, as used in
* CJK and other languages. */
};
/**
* Construct a message for a new Xapian Document
*
* @param opts behavioral options
*/
Document() {}
Document(Options opts = Options::None): options_{opts} {}
/**
* Construct a message document based on an existing Xapian document.
*
* @param doc
* @param opts behavioral options
*/
Document(const Xapian::Document& doc): xdoc_{doc} {}
Document(const Xapian::Document& doc, Options opts = Options::None):
xdoc_{doc}, options_{opts} {}
/**
* DTOR
@ -240,11 +250,12 @@ private:
return cached_sexp_;
}
mutable Xapian::Document xdoc_;
Options options_;
mutable Sexp cached_sexp_;
mutable bool dirty_sexp_{}; /* xdoc's sexp is outdated */
};
MU_ENABLE_BITOPS(Document::Options);
} // namespace Mu

View File

@ -45,9 +45,10 @@
using namespace Mu;
struct Message::Private {
Private(Message::Options options): opts{options} {}
Private(Message::Options options):
opts{options}, doc{doc_opts(opts)} {}
Private(Message::Options options, Xapian::Document&& xdoc):
opts{options}, doc{std::move(xdoc)} {}
opts{options}, doc{std::move(xdoc), doc_opts(opts)} {}
Message::Options opts;
Document doc;
@ -70,6 +71,13 @@ struct Message::Private {
Option<std::string> embedded;
Option<std::string> language; /* body ISO language code */
private:
/**
 * Translate message options into the corresponding document options.
 *
 * This is static and reads only its parameter, so it can be called from
 * a constructor's member-initializer list without depending on the order
 * in which the members are initialized.
 *
 * @param mopts message options
 *
 * @return Document::Options::SupportNgrams if mopts has SupportNgrams set,
 * Document::Options::None otherwise.
 */
static Document::Options doc_opts(Message::Options mopts) {
	return any_of(mopts & Message::Options::SupportNgrams) ?
		Document::Options::SupportNgrams :
		Document::Options::None;
}
};
@ -176,6 +184,11 @@ Message::document() const
return priv_->doc;
}
/* Get the message options kept in this message's private data. */
Message::Options
Message::options() const
{
	const auto& self{*priv_};

	return self.opts;
}
unsigned
Message::docid() const

View File

@ -1,5 +1,5 @@
/*
** Copyright (C) 2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
** Copyright (C) 2022-2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
@ -49,8 +49,10 @@ public:
Decrypt = 1 << 0, /**< Attempt to decrypt */
RetrieveKeys = 1 << 1, /**< Auto-retrieve crypto keys (implies network
* access) */
AllowRelativePath = 1 << 2, /**< Allow relateive paths for filename
AllowRelativePath = 1 << 2, /**< Allow relative paths for filename
* in make_from_path */
SupportNgrams = 1 << 3, /**< Support ngrams, as used in
* CJK and other languages. */
};
/**
@ -60,7 +62,6 @@ public:
*/
Message(Message&& other) noexcept;
/**
* operator=
*
@ -147,6 +148,14 @@ public:
const Document& document() const;
/**
* The message options for this message
*
* @return message options
*/
Options options() const;
/**
* Get the document-id, or 0 if non-existent.
*

View File

@ -51,6 +51,8 @@ struct Property {
PersonalAddresses, /**< List of personal e-mail addresses */
RootMaildir, /**< Root maildir path */
SchemaVersion, /**< Xapian DB schema version */
SupportNgrams, /**< Support ngrams for indexing & querying
* for e.g. CJK languages */
/* <private> */
_count_ /* Number of Ids */
};
@ -61,12 +63,13 @@ struct Property {
enum struct Flags {
None = 0, /**< Nothing in particular */
ReadOnly = 1 << 0, /**< Property is read-only for external use
* (but can change from within the store) */
* (but can change from within the store) */
Configurable = 1 << 1, /**< A user-configurable parameter; name
* starts with 'conf-' */
Internal = 1 << 2, /**< Mu-internal field */
};
enum struct Type {
Boolean, /**< Some boolean value */
Number, /**< Some number */
Timestamp, /**< Timestamp number */
Path, /**< Path string */
@ -176,6 +179,14 @@ public:
{},
"Version of the Xapian database schema"
},
{
Id::SupportNgrams,
Type::Boolean,
Flags::Configurable,
"support-ngrams",
{},
"Support n-grams for working with CJK and other languages"
},
}};
/**
@ -229,6 +240,9 @@ public:
});
if constexpr (prop.type == Type::Number)
return static_cast<size_t>(str.empty() ? 0 : std::atoll(str.c_str()));
if constexpr (prop.type == Type::Boolean)
return static_cast<size_t>(str.empty() ? false :
std::atol(str.c_str()) != 0);
else if constexpr (prop.type == Type::Timestamp)
return static_cast<time_t>(str.empty() ? 0 : std::atoll(str.c_str()));
else if constexpr (prop.type == Type::Path || prop.type == Type::String)
@ -257,6 +271,8 @@ public:
const auto strval = std::invoke([&]{
if constexpr (prop.type == Type::Number || prop.type == Type::Timestamp)
return mu_format("{}", static_cast<int64_t>(val));
if constexpr (prop.type == Type::Boolean)
return val ? "1" : "0";
else if constexpr (prop.type == Type::Path || prop.type == Type::String)
return std::string{val};
else if constexpr (prop.type == Type::StringList)

View File

@ -32,7 +32,10 @@
using namespace Mu;
// backward compat
#ifndef HAVE_XAPIAN_FLAG_NGRAMS
#define FLAG_NGRAMS FLAG_CJK_NGRAM
#endif /*HAVE_XAPIAN_FLAG_NGRAMS*/
/**
* Expand terms for scripts without explicit word-breaks (e.g.
@ -42,25 +45,15 @@ using namespace Mu;
static Result<Xapian::Query>
ngram_expand(const Field& field, const std::string& str)
{
mu_println("ng: '{}'", str);
Xapian::QueryParser qp;
const auto pfx{std::string(1U, field.xapian_prefix())};
qp.set_default_op(Xapian::Query::OP_OR);
return qp.parse_query(
str,
#if HAVE_XAPIAN_FLAG_NGRAMS
Xapian::QueryParser::FLAG_NGRAMS,
#else
Xapian::QueryParser::FLAG_CJK_NGRAM,
#endif /*HAVE_XAPIAN_FLAG_NGRAMS*/
pfx);
return qp.parse_query(str, Xapian::QueryParser::FLAG_NGRAMS, pfx);
}
static Option<Sexp>
tail(Sexp&& s)
{
@ -259,11 +252,10 @@ parse_field_matcher(const Store& store, const Field& field,
}
static Result<Xapian::Query>
parse_basic(const Field& field, Sexp&& vals, Mu::ParserFlags flags)
static Result<Xapian::Query> parse_basic(const Field &field, Sexp &&vals,
Mu::ParserFlags flags)
{
static auto ngrams = any_of(flags & ParserFlags::SupportNgrams);
auto ngrams = any_of(flags & ParserFlags::SupportNgrams);
if (!vals.stringp())
return Err(Error::Code::InvalidArgument, "expected string");
@ -321,7 +313,6 @@ parse(const Store& store, Sexp&& s, Mu::ParserFlags flags)
"expected field-value or field-matcher");
auto&& matcher{rest->front()};
// field-value: (field "value"); ensure "value" is there
if (matcher.stringp())
return parse_basic(*field, std::move(matcher), flags);
@ -468,14 +459,7 @@ main(int argc, char* argv[])
{
mu_test_init(&argc, &argv);
Xapian::QueryParser qp;
// mu_println("{}", qp.parse_query("スポンサーシップ募集").get_description());
// mu_println("{}", qp.parse_query("スポンサーシップ募集", Xapian::QueryParser::FLAG_NGRAMS).get_description());
// mu_println("{}", qp.parse_query("hello world").get_description());
// mu_println("{}", qp.parse_query("hello world", Xapian::QueryParser::FLAG_NGRAMS).get_description());
g_test_add_func("/query-parser/xapianizer", test_xapian);
return g_test_run();

View File

@ -263,10 +263,13 @@ Query::run(const std::string& expr, Field::Id sortfield_id,
g_return_val_if_fail(none_of(qflags & QueryFlags::Leader),
Err(Error::Code::InvalidArgument, "cannot pass Leader flag"));
StopWatch sw{mu_format(
"ran query '{}'; related: {}; threads: {}; max-size: {}", expr,
any_of(qflags & QueryFlags::IncludeRelated) ? "yes" : "no",
any_of(qflags & QueryFlags::Threading) ? "yes" : "no", maxnum)};
StopWatch sw{
mu_format("query: '{}'; (related:{}; threads:{}; ngrams:{}; max-size:{})",
expr,
any_of(qflags & QueryFlags::IncludeRelated) ? "yes" : "no",
any_of(qflags & QueryFlags::Threading) ? "yes" : "no",
any_of(priv_->parser_flags_ & ParserFlags::SupportNgrams) ? "yes" : "no",
maxnum == 0 ? std::string{""} : std::to_string(maxnum))};
return xapian_try_result([&]{
if (auto&& res = priv_->run(expr, sortfield_id, qflags, maxnum); res)

View File

@ -1,5 +1,5 @@
/*
** Copyright (C) 2008-2021 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
** Copyright (C) 2008-2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by

View File

@ -70,7 +70,8 @@ struct Store::Private {
: XapianDb::Flavor::Open)},
config_{xapian_db_},
contacts_cache_{config_},
root_maildir_{remove_slash(config_.get<Config::Id::RootMaildir>())}
root_maildir_{remove_slash(config_.get<Config::Id::RootMaildir>())},
message_opts_{make_message_options(config_)}
{}
Private(const std::string& path, const std::string& root_maildir,
@ -78,7 +79,8 @@ struct Store::Private {
xapian_db_{XapianDb(path, XapianDb::Flavor::CreateOverwrite)},
config_{make_config(xapian_db_, root_maildir, conf)},
contacts_cache_{config_},
root_maildir_{remove_slash(config_.get<Config::Id::RootMaildir>())}
root_maildir_{remove_slash(config_.get<Config::Id::RootMaildir>())},
message_opts_{make_message_options(config_)}
{}
~Private() try {
@ -133,6 +135,13 @@ struct Store::Private {
return config;
}
/**
 * Determine the message options that follow from a configuration.
 *
 * @param conf the configuration to inspect
 *
 * @return Message::Options::SupportNgrams if the SupportNgrams property
 * is set in conf; Message::Options::None otherwise.
 */
Message::Options make_message_options(const Config& conf) {
	return conf.get<Config::Id::SupportNgrams>() ?
		Message::Options::SupportNgrams :
		Message::Options::None;
}
Option<Message> find_message_unlocked(Store::Id docid) const;
Store::IdVec find_duplicates_unlocked(const Store& store,
const std::string& message_id) const;
@ -150,7 +159,8 @@ struct Store::Private {
ContactsCache contacts_cache_;
std::unique_ptr<Indexer> indexer_;
const std::string root_maildir_;
const std::string root_maildir_;
const Message::Options message_opts_;
size_t transaction_size_{};
std::mutex lock_;
@ -340,6 +350,11 @@ Store::add_message(Message& msg, bool use_transaction, bool is_new)
if (auto&& res = msg.set_maildir(mdir.value()); !res)
return Err(res.error());
// we shouldn't mix ngrams/non-ngrams messages.
if (any_of(msg.options() & Message::Options::SupportNgrams) !=
any_of(message_options() & Message::Options::SupportNgrams))
return Err(Error::Code::InvalidArgument, "incompatible message options");
/* add contacts from this message to cache; this cache
* also determines whether those contacts are _personal_, i.e. match
* our personal addresses.
@ -371,6 +386,16 @@ Store::add_message(Message& msg, bool use_transaction, bool is_new)
return res;
}
/*
 * Create a message from the file at path, using the store's message
 * options, and add it to the store. If the message cannot be created,
 * the error from creating it is returned.
 */
Result<Store::Id>
Store::add_message(const std::string& path, bool use_transaction, bool is_new)
{
	auto parsed{Message::make_from_path(path, priv_->message_opts_)};
	if (parsed)
		return add_message(parsed.value(), use_transaction, is_new);

	return Err(parsed.error());
}
bool
Store::remove_message(const std::string& path)
{
@ -649,3 +674,9 @@ Store::maildirs() const
return mdirs;
}
/* Get the message options kept in the store's private data. */
Message::Options
Store::message_options() const
{
	const auto& self{*priv_};

	return self.message_opts_;
}

View File

@ -207,21 +207,7 @@ public:
Result<Id> add_message(Message& msg, bool use_transaction = false,
bool is_new = false);
Result<Id> add_message(const std::string& path, bool use_transaction = false,
bool is_new = false) {
if (auto msg{Message::make_from_path(path)}; !msg)
return Err(msg.error());
else
return add_message(msg.value(), use_transaction, is_new);
}
/**
* Update a message in the store.
*
* @param msg a message
* @param id the id for this message
*
* @return Ok() or an error.
*/
bool is_new = false);
/**
* Remove a message from the store. It will _not_ remove the message
@ -258,7 +244,6 @@ public:
*/
Option<Message> find_message(Id id) const;
/**
* Find the messages for the given ids
*
@ -288,7 +273,6 @@ public:
*/
bool contains_message(const std::string& path) const;
/**
* Options for moving
*
@ -437,6 +421,15 @@ public:
*/
std::vector<std::string> maildirs() const;
/**
* Compatible message-options for this store
*
* @return message-options.
*/
Message::Options message_options() const;
/*
* _almost_ private
*/

View File

@ -94,9 +94,12 @@ Mu::mu_test_init(int *argc, char ***argv)
{
const auto tmpdir{test_random_tmpdir()};
g_unsetenv("XAPIAN_CJK_NGRAM");
g_setenv("MU_TEST", "yes", TRUE);
g_setenv("XDG_CACHE_HOME", tmpdir.c_str(), TRUE);
setlocale(LC_ALL, "");
g_test_init(argc, argv, NULL);
g_test_bug_base("https://github.com/djcb/mu/issues/");

View File

@ -17,6 +17,7 @@ has completed, you can run *mu index*
* INIT OPTIONS
** -m, --maildir=<maildir>
starts searching at =<maildir>=. By default, *mu* uses whatever the *MAILDIR*
environment variable is set to; if it is not set, it tries =~/Maildir= if it
already exists.
@ -54,6 +55,13 @@ number of changes after which they are committed to the database; decreasing
this reduces the memory requirements, but makes indexing substantially slower (and
vice-versa for increasing). Usually, the default of 250000 should be fine.
** --support-ngrams
whether to enable support for using ngrams in indexing and query parsing; this
can be useful for languages without explicit word-breaks, such as
Chinese/Japanese/Korean. See *NGRAM SUPPORT* below.
** --reinit
reinitialize the database from an earlier version; that is, create a new empty
@ -62,8 +70,20 @@ options.
#+include: "muhome.inc" :minlevel 2
* NGRAM SUPPORT
*mu*'s underlying Xapian database supports 'ngrams', which improve searching for
languages/scripts that do not have explicit word breaks, such as Chinese,
Japanese and Korean. It is fairly intrusive, and influences both indexing and
query-parsing; it is not enabled by default, and is recommended only if you need
to search in such languages.
When enabled, *mu* uses ngrams automatically. Xapian environment
variables such as ~XAPIAN_CJK_NGRAM~ are ignored.
#+include: "exit-code.inc" :minlevel 1
* EXAMPLE
#+begin_example
$ mu init --maildir=~/Maildir --my-address=alice@example.com --my-address=bob@example.com --ignored-address='/.*reply.*/'

View File

@ -149,9 +149,17 @@ gobject_dep = dependency('gobject-2.0', version: '>= 2.60')
gio_dep = dependency('gio-2.0', version: '>= 2.60')
gio_unix_dep = dependency('gio-unix-2.0', version: '>= 2.60')
gmime_dep = dependency('gmime-3.0', version: '>= 3.2')
xapian_dep = dependency('xapian-core', version:'>= 1.4')
thread_dep = dependency('threads')
# we need Xapian 1.4; if we have 1.4.23, we have some newer APIs.
#
# First probe (non-fatally) for >= 1.4.23; if that is found, define
# HAVE_XAPIAN_FLAG_NGRAMS in config.h.
xapian_dep = dependency('xapian-core', version:'>= 1.4.23', required:false)
if xapian_dep.found()
config_h_data.set('HAVE_XAPIAN_FLAG_NGRAMS', 1)
else
# Otherwise fall back to any 1.4 release, leaving HAVE_XAPIAN_FLAG_NGRAMS
# undefined, and report the version that was found.
xapian_dep = dependency('xapian-core', version:'>= 1.4')
message('Found xapian ' + xapian_dep.version())
endif
# optionally, use Compact Language Detector2 if we can find it.
cld2_dep = meson.get_compiler('cpp').find_library('cld2', required: false)
if cld2_dep.found()

View File

@ -202,6 +202,8 @@ topic_store(const Mu::Store& store, const Options& opts)
info.add_row({"ignored-address", c});
info.add_row({"messages in store", mu_format("{}", store.size())});
info.add_row({"support-ngrams", conf.get<Config::Id::SupportNgrams>() ? "yes" : "no"});
info.add_row({"last-change", tstamp(store.statistics().last_change)});
info.add_row({"last-index", tstamp(store.statistics().last_index)});

View File

@ -55,6 +55,8 @@ Mu::mu_cmd_init(const Options& opts)
conf.set<Config::Id::PersonalAddresses>(opts.init.my_addresses);
if (!opts.init.ignored_addresses.empty())
conf.set<Config::Id::IgnoredAddresses>(opts.init.ignored_addresses);
if (opts.init.support_ngrams)
conf.set<Config::Id::SupportNgrams>(true);
return Store::make_new(opts.runtime_path(RuntimePath::XapianDb),
opts.init.maildir, conf);

View File

@ -457,13 +457,16 @@ sub_init(CLI::App& sub, Options& opts)
"Maximum allowed message size in bytes");
sub.add_option("--batch-size", opts.init.batch_size,
"Maximum size of database transaction");
sub.add_option("--support-ngrams", opts.init.support_ngrams,
"Support CJK n-grams if for querying/indexing");
sub.add_flag("--reinit", opts.init.reinit,
"Re-initialize database with current settings")
->excludes("--maildir")
->excludes("--my-address")
->excludes("--ignored-address")
->excludes("--max-message-size")
->excludes("--batch-size");
->excludes("--batch-size")
->excludes("--support-ngrams");
}
static void

View File

@ -185,13 +185,15 @@ struct Options {
* Init
*/
struct Init {
std::string maildir; /**< where the mails are */
StringVec my_addresses; /**< personal e-mail addresses */
StringVec ignored_addresses; /**< addresses to be ignored for
std::string maildir; /**< where the mails are */
StringVec my_addresses; /**< personal e-mail addresses */
StringVec ignored_addresses; /**< addresses to be ignored for
* the contacts-cache */
OptSize max_msg_size; /**< max size for message files */
OptSize batch_size; /**< db transaction batch size */
bool reinit; /**< re-initialize */
OptSize max_msg_size; /**< max size for message files */
OptSize batch_size; /**< db transaction batch size */
bool reinit; /**< re-initialize */
bool support_ngrams; /**< support CJK etc. ngrams */
} init;
/*

View File

@ -90,6 +90,14 @@ handle_result(const Result<void>& res, const Mu::Options& opts)
int
main(int argc, char* argv[])
{
/*
* We handle this through explicit options
*/
g_unsetenv("XAPIAN_CJK_NGRAM");
/*
* set up locale
*/
setlocale(LC_ALL, "");
/*