store: update to use Message; big cleanup

Remove much of the message processing from the store
This commit is contained in:
Dirk-Jan C. Binnema 2022-04-28 22:59:03 +03:00
parent 459f6db476
commit 525fef479a
4 changed files with 257 additions and 90 deletions

View File

@ -69,7 +69,7 @@ struct IndexState {
void change_to(State new_state)
{
g_debug("changing indexer state %s->%s", name((State)state_),
name((State)new_state));
name((State)new_state));
state_ = new_state;
}
@ -80,14 +80,14 @@ private:
struct Indexer::Private {
Private(Mu::Store& store)
: store_{store}, scanner_{store_.properties().root_maildir,
[this](auto&& path, auto&& statbuf, auto&& info) {
[this](auto&& path, auto&& statbuf, auto&& info) {
return handler(path, statbuf, info);
}},
max_message_size_{store_.properties().max_message_size}
{
g_message("created indexer for %s -> %s (batch-size: %zu)",
store.properties().root_maildir.c_str(),
store.properties().database_path.c_str(), store.properties().batch_size);
store.properties().root_maildir.c_str(),
store.properties().database_path.c_str(), store.properties().batch_size);
}
~Private()
@ -135,7 +135,7 @@ struct Indexer::Private {
bool
Indexer::Private::handler(const std::string& fullpath, struct stat* statbuf,
Scanner::HandleType htype)
Scanner::HandleType htype)
{
switch (htype) {
case Scanner::HandleType::EnterDir:
@ -148,8 +148,8 @@ Indexer::Private::handler(const std::string& fullpath, struct stat* statbuf,
if (conf_.lazy_check && dirstamp_ >= statbuf->st_mtime &&
htype == Scanner::HandleType::EnterNewCur) {
g_debug("skip %s (seems up-to-date: %s >= %s)", fullpath.c_str(),
time_to_string("%FT%T", dirstamp_).c_str(),
time_to_string("%FT%T", statbuf->st_mtime).c_str());
time_to_string("%FT%T", dirstamp_).c_str(),
time_to_string("%FT%T", statbuf->st_mtime).c_str());
return false;
}
@ -183,7 +183,7 @@ Indexer::Private::handler(const std::string& fullpath, struct stat* statbuf,
if ((size_t)statbuf->st_size > max_message_size_) {
g_debug("skip %s (too big: %" G_GINT64_FORMAT " bytes)", fullpath.c_str(),
(gint64)statbuf->st_size);
(gint64)statbuf->st_size);
return false;
}
@ -216,6 +216,26 @@ Indexer::Private::maybe_start_worker()
}
}
static bool
add_message(Store& store, const std::string& path)
{
auto msg{Message::make_from_path(path)};
if (!msg) {
g_warning("failed to create message from %s: %s",
path.c_str(), msg.error().what());
return false;
}
auto res = store.add_message(msg.value(), true /*use-transaction*/);
if (!res) {
g_warning("failed to add message @ %s: %s",
path.c_str(), res.error().what());
return false;
}
return true;
}
void
Indexer::Private::item_worker()
{
@ -230,9 +250,8 @@ Indexer::Private::item_worker()
std::lock_guard lock{w_lock_};
switch (item.type) {
case WorkItem::Type::File:
store_.add_message(item.full_path,
true /*use-transaction*/);
++progress_.updated;
if (add_message(store_, item.full_path))
++progress_.updated;
break;
case WorkItem::Type::Dir:
store_.set_dirstamp(item.full_path, ::time(NULL));
@ -243,7 +262,7 @@ Indexer::Private::item_worker()
}
} catch (const Mu::Error& er) {
g_warning("error adding message @ %s: %s",
item.full_path.c_str(), er.what());
item.full_path.c_str(), er.what());
}
maybe_start_worker();
@ -262,7 +281,7 @@ Indexer::Private::cleanup()
++n;
if (::access(path.c_str(), R_OK) != 0) {
g_debug("cannot read %s (id=%u); queueing for removal from store",
path.c_str(), id);
path.c_str(), id);
orphans.emplace_back(id);
}
@ -304,7 +323,7 @@ Indexer::Private::scan_worker()
return workers_.size();
});
g_debug("process %zu remaining message(s) with %zu worker(s)",
todos_.size(), workers_size);
todos_.size(), workers_size);
while (!todos_.empty())
std::this_thread::sleep_for(100ms);
}
@ -328,16 +347,15 @@ Indexer::Private::start(const Indexer::Config& conf)
conf_ = conf;
if (conf_.max_threads == 0) {
/* we're blocked mostly by a) filesystem and b) database;
* so it's not very useful to start many threads */
max_workers_ = std::max(
4U, std::thread::hardware_concurrency());
/* note, most time is spent in the (single) db thread
* but creating messages in parallel still helps a bit */
max_workers_ = std::thread::hardware_concurrency()/2;
} else
max_workers_ = conf.max_threads;
g_debug("starting indexer with <= %zu worker thread(s)", max_workers_);
g_debug("indexing: %s; clean-up: %s", conf_.scan ? "yes" : "no",
conf_.cleanup ? "yes" : "no");
conf_.cleanup ? "yes" : "no");
state_.change_to(IndexState::Scanning);
/* kick off the first worker, which will spawn more if needed. */

View File

@ -69,8 +69,9 @@ struct Store::Private {
: read_only_{readonly}, db_{make_xapian_db(path,
read_only_ ? XapianOpts::ReadOnly
: XapianOpts::Open)},
properties_{make_properties(path)}, contacts_cache_{db().get_metadata(ContactsKey),
properties_.personal_addresses}
properties_{make_properties(path)},
contacts_cache_{db().get_metadata(ContactsKey),
properties_.personal_addresses}
{}
Private(const std::string& path,
@ -204,10 +205,10 @@ struct Store::Private {
}
Store::Properties init_metadata(const Store::Config& conf,
const std::string& path,
const std::string& root_maildir,
const StringVec& personal_addresses)
{
const std::string& path,
const std::string& root_maildir,
const StringVec& personal_addresses)
{
writable_db().set_metadata(SchemaVersionKey, ExpectedSchemaVersion);
writable_db().set_metadata(CreatedKey, Mu::format("%" PRId64, (int64_t)::time({})));
@ -218,7 +219,7 @@ struct Store::Private {
: DefaultMaxMessageSize;
writable_db().set_metadata(MaxMessageSizeKey, Mu::format("%zu", max_msg_size));
writable_db().set_metadata(RootMaildirKey, root_maildir);
writable_db().set_metadata(RootMaildirKey, canonicalize_filename(root_maildir, {}));
std::string addrs;
for (const auto& addr : personal_addresses) { // _very_ minimal check.
@ -318,60 +319,54 @@ Store::empty() const
return size() == 0;
}
static std::string
maildir_from_path(const std::string& root, const std::string& path)
{
if (G_UNLIKELY(root.empty()) || root.length() >= path.length() || path.find(root) != 0)
throw Mu::Error{Error::Code::InvalidArgument,
"root '%s' is not a proper suffix of path '%s'",
root.c_str(),
path.c_str()};
auto mdir{path.substr(root.length())};
auto slash{mdir.rfind('/')};
if (G_UNLIKELY(slash == std::string::npos) || slash < 4)
throw Mu::Error{Error::Code::InvalidArgument, "invalid path: %s", path.c_str()};
mdir.erase(slash);
auto subdir = mdir.data() + slash - 4;
if (G_UNLIKELY(strncmp(subdir, "/cur", 4) != 0 && strncmp(subdir, "/new", 4)))
throw Mu::Error{Error::Code::InvalidArgument,
"cannot find '/new' or '/cur' - invalid path: %s",
path.c_str()};
if (mdir.length() == 4)
return "/";
mdir.erase(mdir.length() - 4);
return mdir;
}
Result<Store::Id>
Store::add_message(const std::string& path, bool use_transaction)
{
const auto maildir{maildir_from_path(properties().root_maildir, path)};
auto msg{Message::make_from_path(Message::Options::None, path, maildir)};
if (G_UNLIKELY(!msg))
if (auto msg{Message::make_from_path(path)}; !msg)
return Err(msg.error());
else
return add_message(msg.value(), use_transaction);
}
Result<Store::Id>
Store::add_message(Message& msg, bool use_transaction)
{
const auto mdir{mu_maildir_from_path(msg.path(),
properties().root_maildir)};
if (!mdir)
return Err(mdir.error());
if (auto&& res = msg.set_maildir(mdir.value()); !res)
return Err(res.error());
std::lock_guard guard{priv_->lock_};
if (use_transaction)
priv_->transaction_inc();
priv_->contacts_cache_.add(msg.all_contacts());
const auto docid = priv_->writable_db().add_document(
msg->document().xapian_document());
const auto docid = xapian_try([&]{
if (use_transaction) /* commit if batch is full */
priv_->transaction_maybe_commit();
if (use_transaction)
priv_->transaction_inc();
const auto docid = priv_->writable_db().add_document(
msg.document().xapian_document());
if (use_transaction) /* commit if batch is full */
priv_->transaction_maybe_commit();
return docid;
}, InvalidId);
if (G_UNLIKELY(docid == InvalidId))
return Err(Error::Code::Message, "failed to add message");
g_debug("added message @ %s; docid = %u", path.c_str(), docid);
g_debug("added message @ %s; docid = %u", msg.path().c_str(), docid);
g_debug("%s", msg.document().xapian_document().get_description().c_str());
return Ok(static_cast<Store::Id>(docid));
}
bool
Store::update_message(const Message& msg, unsigned docid)
{
@ -379,8 +374,8 @@ Store::update_message(const Message& msg, unsigned docid)
[&]{
priv_->writable_db().replace_document(
docid, msg.document().xapian_document());
g_debug("updated message %u @ %s", docid, msg.path().c_str());
g_debug("updated message @ %s; docid = %u",
msg.path().c_str(), docid);
return true;
}, false);
}
@ -462,9 +457,6 @@ Store::move_message(Store::Id id,
return Ok(std::move(msg.value()));
}
std::string
Store::metadata(const std::string& key) const
{
@ -598,14 +590,12 @@ Store::lock() const
return priv_->lock_;
}
Option<QueryResults>
Result<QueryResults>
Store::run_query(const std::string& expr,
Option<Field::Id> sortfield_id,
Field::Id sortfield_id,
QueryFlags flags, size_t maxnum) const
{
return xapian_try([&] {
Query q{*this};
return q.run(expr, sortfield_id, flags, maxnum);}, Nothing);
return Query{*this}.run(expr, sortfield_id, flags, maxnum);
}
size_t

View File

@ -133,7 +133,7 @@ public:
Indexer& indexer();
/**
* Run a query; see the `mu-query` man page for the syntax.
* Run a query; see the `mu-query` man page for the syntax.
*
* Multi-threaded callers must aquire the lock and keep it
* at least as long as the return value.
@ -143,11 +143,11 @@ public:
* @param flags query flags
* @param maxnum maximum number of results to return. 0 for 'no limit'
*
* @return the query-results, or Nothing in case of error.
* @return the query-results or an error.
*/
std::mutex& lock() const;
Option<QueryResults> run_query(const std::string& expr = "",
Option<Field::Id> sortfield_id = {},
Result<QueryResults> run_query(const std::string& expr,
Field::Id sortfield_id = Field::Id::Date,
QueryFlags flags = QueryFlags::None,
size_t maxnum = 0) const;
@ -186,6 +186,19 @@ public:
*/
Result<Id> add_message(const std::string& path, bool use_transaction = false);
/**
* Add a message to the store. When planning to write many messages,
* it's much faster to do so in a transaction. If so, set
* @in_transaction to true. When done with adding messages, call
* commit().
*
* @param msg a message
* @param whether to bundle up to batch_size changes in a transaction
*
* @return the doc id of the added message or an error.
*/
Result<Id> add_message(Message& msg, bool use_transaction = false);
/**
* Update a message in the store.
*

View File

@ -52,31 +52,178 @@ test_store_ctor_dtor()
static void
test_store_add_count_remove()
{
TempDir tempdir;
Mu::Store store{tempdir.path(), MuTestMaildir, {}, {}};
TempDir tempdir{false};
const auto id1 = store.add_message(MuTestMaildir + "/cur/1283599333.1840_11.cthulhu!2,");
Mu::Store store{tempdir.path() + "/xapian", MuTestMaildir, {}, {}};
const auto msgpath{MuTestMaildir + "/cur/1283599333.1840_11.cthulhu!2,"};
const auto id1 = store.add_message(msgpath);
assert_valid_result(id1);
store.commit();
g_assert_cmpuint(store.size(), ==, 1);
g_assert_true(store.contains_message(MuTestMaildir + "/cur/1283599333.1840_11.cthulhu!2,"));
g_assert_true(store.contains_message(msgpath));
g_assert_true(store.contains_message(msgpath));
const auto id2 = store.add_message(MuTestMaildir2 + "/bar/cur/mail3");
assert_valid_result(id2);
g_assert_false(!!id2); // wrong maildir.
store.commit();
const auto msg3path{MuTestMaildir + "/cur/1252168370_3.14675.cthulhu!2,S"};
const auto id3 = store.add_message(msg3path);
assert_valid_result(id3);
g_assert_cmpuint(store.size(), ==, 2);
g_assert_true(store.contains_message(MuTestMaildir2 + "/bar/cur/mail3"));
g_assert_true(store.contains_message(msg3path));
store.remove_message(id1.value());
g_assert_cmpuint(store.size(), ==, 1);
g_assert_false(
store.contains_message(MuTestMaildir + "/cur/1283599333.1840_11.cthulhu!2,"));
store.remove_message(MuTestMaildir2 + "/bar/cur/mail3");
store.remove_message(msg3path);
g_assert_true(store.empty());
g_assert_false(store.contains_message(MuTestMaildir2 + "/bar/cur/mail3"));
g_assert_false(store.contains_message(msg3path));
}
static void
test_message_mailing_list()
{
constexpr const char *test_message_1 =
R"(Return-Path: <sqlite-dev-bounces@sqlite.org>
X-Original-To: xxxx@localhost
Delivered-To: xxxx@localhost
Received: from mindcrime (localhost [127.0.0.1])
by mail.xxxxsoftware.nl (Postfix) with ESMTP id 32F276963F
for <xxxx@localhost>; Mon, 4 Aug 2008 21:49:34 +0300 (EEST)
Message-Id: <83B5AF40-DBFA-4578-A043-04C80276E195@sqlabs.net>
From: anon@example.com
To: sqlite-dev@sqlite.org
Mime-Version: 1.0 (Apple Message framework v926)
Date: Mon, 4 Aug 2008 11:40:49 +0200
X-Mailer: Apple Mail (2.926)
Subject: Capybaras United
Precedence: list
Reply-To: sqlite-dev@sqlite.org
List-Id: <sqlite-dev.sqlite.org>
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Sender: sqlite-dev-bounces@sqlite.org
Content-Length: 639
Inside sqlite3VdbeExec there is a very big switch statement.
In order to increase performance with few modifications to the
original code, why not use this technique ?
http://docs.freebsd.org/info/gcc/gcc.info.Labels_as_Values.html
With a properly defined "instructions" array, instead of the switch
statement you can use something like:
goto * instructions[pOp->opcode];
)";
TempDir tempdir;
Mu::Store store{tempdir.path(), "/home/test/Maildir", {}, {}};
const auto msgpath{"/home/test/Maildir/inbox/cur/1649279256.107710_1.evergrey:2,S"};
auto message{Message::make_from_text(test_message_1, msgpath)};
assert_valid_result(message);
const auto docid = store.add_message(*message);
assert_valid_result(docid);
g_assert_cmpuint(store.size(),==, 1);
auto msg2{store.find_message(*docid)};
g_assert_true(!!msg2);
assert_equal(message->path(), msg2->path());
g_assert_true(store.contains_message(message->path()));
const auto qr = store.run_query("to:sqlite-dev@sqlite.org");
g_assert_true(!!qr);
g_assert_cmpuint(qr->size(), ==, 1);
}
static void
test_message_attachments(void)
{
constexpr const char* msg_text =
R"(Return-Path: <foo@example.com>
Received: from pop.gmail.com [256.85.129.309]
by evergrey with POP3 (fetchmail-6.4.29)
for <djcb@localhost> (single-drop); Thu, 24 Mar 2022 20:12:40 +0200 (EET)
Sender: "Foo, Example" <foo@example.com>
User-agent: mu4e 1.7.11; emacs 29.0.50
From: "Foo Example" <foo@example.com>
To: bar@example.com
Subject: =?utf-8?B?w6R0dMOkY2htZcOxdHM=?=
Date: Thu, 24 Mar 2022 20:04:39 +0200
Organization: ACME Inc.
Message-Id: <3144HPOJ0VC77.3H1XTAG2AMTLH@"@WILSONB.COM>
MIME-Version: 1.0
X-label: @NextActions operation:mindcrime Queensrÿche
Content-Type: multipart/mixed; boundary="=-=-="
--=-=-=
Content-Type: text/plain
Hello,
--=-=-=
Content-Type: image/jpeg
Content-Disposition: attachment; filename=file-01.bin
Content-Transfer-Encoding: base64
AAECAw==
--=-=-=
Content-Type: audio/ogg
Content-Disposition: inline; filename=/tmp/file-02.bin
Content-Transfer-Encoding: base64
BAUGBw==
--=-=-=
Content-Type: message/rfc822
Content-Disposition: attachment;
filename="message.eml"
From: "Fnorb" <fnorb@example.com>
To: Bob <bob@example.com>
Subject: news for you
Date: Mon, 28 Mar 2022 22:53:26 +0300
Attached message!
--=-=-=
Content-Type: text/plain
World!
--=-=-=--
)";
TempDir tempdir;
Mu::Store store{tempdir.path(), "/home/test/Maildir", {}, {}};
auto message{Message::make_from_text(
msg_text,
"/home/test/Maildir/inbox/cur/1649279256.abcde_1.evergrey:2,S")};
assert_valid_result(message);
const auto docid = store.add_message(*message);
assert_valid_result(docid);
store.commit();
auto msg2{store.find_message(*docid)};
g_assert_true(!!msg2);
assert_equal(message->path(), msg2->path());
g_assert_true(store.contains_message(message->path()));
// for (auto&& term = msg2->document().xapian_document().termlist_begin();
// term != msg2->document().xapian_document().termlist_end(); ++term)
// g_message(">>> %s", (*term).c_str());
}
int
main(int argc, char* argv[])
{
@ -85,11 +232,10 @@ main(int argc, char* argv[])
/* mu_runtime_init/uninit */
g_test_add_func("/store/ctor-dtor", test_store_ctor_dtor);
g_test_add_func("/store/add-count-remove", test_store_add_count_remove);
// if (!g_test_verbose())
// g_log_set_handler (NULL,
// G_LOG_LEVEL_MASK | G_LOG_FLAG_FATAL| G_LOG_FLAG_RECURSION,
// (GLogFunc)black_hole, NULL);
g_test_add_func("/store/message/mailing-list",
test_message_mailing_list);
g_test_add_func("/store/message/attachments",
test_message_attachments);
return g_test_run();
}