From 4e6bd7dfdf5d229159c4492129e78110e71c9d58 Mon Sep 17 00:00:00 2001 From: "Dirk-Jan C. Binnema" Date: Sat, 27 Jun 2020 11:39:43 +0300 Subject: [PATCH] lib/index: Implement new indexer Implement a new message indexer consisting of a single-threaded scanner and a multi-threaded indexer. This allows for a number of optimizations as well as background indexing, though this initial version should be behave similar to the old indexer. --- configure.ac | 1 + lib/Makefile.am | 2 +- lib/index/Makefile.am | 45 ++++ lib/index/mu-indexer.cc | 350 ++++++++++++++++++++++++++++ lib/index/mu-indexer.hh | 114 +++++++++ lib/index/mu-scanner.cc | 242 +++++++++++++++++++ lib/index/mu-scanner.hh | 96 ++++++++ lib/index/test-scanner.cc | 68 ++++++ lib/mu-index.c | 476 -------------------------------------- lib/mu-index.h | 193 ---------------- mu/mu-cmd-find.cc | 1 - toys/mug/mug.c | 5 +- 12 files changed, 918 insertions(+), 675 deletions(-) create mode 100644 lib/index/Makefile.am create mode 100644 lib/index/mu-indexer.cc create mode 100644 lib/index/mu-indexer.hh create mode 100644 lib/index/mu-scanner.cc create mode 100644 lib/index/mu-scanner.hh create mode 100644 lib/index/test-scanner.cc delete mode 100644 lib/mu-index.c delete mode 100644 lib/mu-index.h diff --git a/configure.ac b/configure.ac index b1fe215f..6c2aa17f 100644 --- a/configure.ac +++ b/configure.ac @@ -264,6 +264,7 @@ lib/Makefile lib/doxyfile lib/utils/Makefile lib/query/Makefile +lib/index/Makefile mu4e/Makefile mu4e/mu4e-meta.el guile/Makefile diff --git a/lib/Makefile.am b/lib/Makefile.am index 8f2fa27e..76eecb2f 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -18,7 +18,7 @@ # before descending into tests/ include $(top_srcdir)/gtest.mk -SUBDIRS= utils query +SUBDIRS= utils query index if HAVE_JSON_GLIB json_srcs= \ diff --git a/lib/index/Makefile.am b/lib/index/Makefile.am new file mode 100644 index 00000000..602eb4dd --- /dev/null +++ b/lib/index/Makefile.am @@ -0,0 +1,45 @@ +## Copyright (C) 2020 
Dirk-Jan C. Binnema +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software Foundation, +## Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +include $(top_srcdir)/gtest.mk + +AM_CPPFLAGS= \ + $(CODE_COVERAGE_CPPFLAGS) + +AM_CXXFLAGS= \ + $(WARN_CXXFLAGS) \ + $(GLIB_CFLAGS) \ + $(ASAN_CXXFLAGS) \ + $(CODE_COVERAGE_CFLAGS) \ + -I${top_srcdir}/lib + +AM_LDFLAGS= \ + $(ASAN_LDFLAGS) + +noinst_LTLIBRARIES= \ + libmu-index.la + +libmu_index_la_SOURCES= \ + mu-indexer.cc \ + mu-indexer.hh \ + mu-scanner.cc \ + mu-scanner.hh + +libmu_index_la_LIBADD= \ + $(GLIB_LIBS) \ + $(CODE_COVERAGE_LIBS) + +include $(top_srcdir)/aminclude_static.am diff --git a/lib/index/mu-indexer.cc b/lib/index/mu-indexer.cc new file mode 100644 index 00000000..79998dbd --- /dev/null +++ b/lib/index/mu-indexer.cc @@ -0,0 +1,350 @@ +/* +** Copyright (C) 2020 Dirk-Jan C. Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. 
+** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +** +*/ + +#include "mu-indexer.hh" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +using namespace std::chrono_literals; + +#include + +#include "mu-scanner.hh" +#include "utils/mu-async-queue.hh" +#include "utils/mu-error.hh" +#include "../mu-store.hh" + +using namespace Mu; + +struct Indexer::Private { + Private (Mu::Store& store): + store_{store}, + scanner_{store_.metadata().root_maildir, + [this](auto&& path, auto&& statbuf, auto&& info){ + return handler(path, statbuf, info); + }}, + max_message_size_{store_.metadata().max_message_size} { + + g_message ("created indexer for %s -> %s", + store.metadata().root_maildir.c_str(), + store.metadata().database_path.c_str()); + } + + ~Private() { stop(); } + + bool dir_predicate (const std::string& path, const struct dirent* dirent) const; + bool handler (const std::string& fullpath, struct stat *statbuf, + Scanner::HandleType htype); + + void maybe_start_worker(); + void worker(); + + bool cleanup(); + + bool start(const Indexer::Config& conf); + bool stop(); + + Indexer::Config conf_; + Store& store_; + Scanner scanner_; + const size_t max_message_size_; + + time_t dirstamp_{}; + std::atomic scan_done_{true}, clean_done_{true}; + + std::size_t max_workers_; + std::vector workers_; + std::thread scanner_worker_; + + AsyncQueue fq_; + + struct Progress { + void reset() { + processed = updated = removed = 0; + } + std::atomic processed{}; /**< Number of messages processed */ + std::atomic updated{}; /**< Number of messages added/updated to store */ + std::atomic removed{}; /**< Number of message removed from store */ + }; + Progress progress_; + + std::mutex lock_, wlock_; +}; + + +bool +Indexer::Private::handler (const std::string& fullpath, struct stat 
*statbuf, + Scanner::HandleType htype) +{ + switch (htype) { + case Scanner::HandleType::EnterDir: { + + // in lazy-mode, we ignore this dir if its dirstamp suggest it + // is up-to-date (this is _not_ always true; hence we call it + // lazy-mode) + dirstamp_ = store_.dirstamp(fullpath); + if (conf_.lazy_check && dirstamp_ == statbuf->st_mtime) { + g_debug("skip %s (seems up-to-date)", fullpath.c_str()); + return false; + } + + // don't index dirs with '.noindex' + auto noindex = ::access((fullpath + "/.noindex").c_str(), F_OK) == 0; + if (noindex) { + g_debug ("skip %s (has .noindex)", fullpath.c_str()); + return false; // don't descend into this dir. + } + + // don't index dirs with '.noupdate', unless we do a full + // (re)index. + if (!conf_.ignore_noupdate) { + auto noupdate = ::access((fullpath + "/.noupdate").c_str(), F_OK) == 0; + if (noupdate) { + g_debug ("skip %s (has .noupdate)", fullpath.c_str()); + return false; + } + } + + g_debug ("process %s", fullpath.c_str()); + return true; + + } + case Scanner::HandleType::LeaveDir: { + store_.set_dirstamp(fullpath, ::time({})); + return true; + } + + case Scanner::HandleType::File: { + + if ((size_t)statbuf->st_size > max_message_size_) { + g_debug ("skip %s (too big: %zu bytes)", + fullpath.c_str(), statbuf->st_size); + return false; + } + + // if the message is not in the db yet, or not up-to-date, queue + // it for updating/inserting. 
+ if (statbuf->st_mtime <= dirstamp_ && + store_.contains_message (fullpath)) { + //g_debug ("skip %s: already up-to-date"); + return false; + } + + fq_.push(std::string{fullpath}); + return true; + } + default: + g_return_val_if_reached (false); + return false; + } +} + +void +Indexer::Private::maybe_start_worker() +{ + std::lock_guard wlock{wlock_}; + + if (fq_.size() > workers_.size() && workers_.size() < max_workers_) + workers_.emplace_back(std::thread([this]{worker();})); +} + +void +Indexer::Private::worker() +{ + std::string item; + + g_debug ("started worker"); + + while (!scan_done_ || !fq_.empty()) { + + if (!fq_.pop (item, 250ms)) + continue; + + //g_debug ("popped (n=%zu) path %s", fq_.size(), item.c_str()); + ++progress_.processed; + + try { + store_.add_message(item); + ++progress_.updated; + + } catch (const Mu::Error& er) { + g_warning ("error adding message @ %s: %s", + item.c_str(), er.what()); + } + + maybe_start_worker(); + } +} + +bool +Indexer::Private::cleanup() +{ + g_debug ("starting cleanup"); + + std::vector orphans_; // store messages without files. 
+ store_.for_each([&](Store::Id id, const std::string &path) { + + if (clean_done_) + return false; + + if (::access(path.c_str(), F_OK) != 0) { + g_debug ("%s not found; queing id=%u for removal", + path.c_str(), id); + orphans_.emplace_back(id); + } + + return !clean_done_; + }); + + if (orphans_.empty()) { + g_debug("nothing to clean up"); + return true; + } + + store_.remove_messages (orphans_); + g_debug ("removed %zu orphan messages from store", orphans_.size()); + + return true; +} + + +bool +Indexer::Private::start(const Indexer::Config& conf) +{ + stop(); + + conf_ = conf; + if (conf_.max_threads == 0) + max_workers_ = std::thread::hardware_concurrency(); + else + max_workers_ = conf.max_threads; + + g_debug ("starting indexer with up to %zu threads", max_workers_); + + scan_done_ = false; + workers_.emplace_back(std::thread([this]{worker();})); + + scan_done_ = clean_done_ = false; + scanner_worker_ = std::thread([this]{ + + progress_.reset(); + + if (conf_.scan) { + g_debug("starting scanner"); + + if (!scanner_.start()) { + g_warning ("failed to start scanner"); + return; + } + + scan_done_ = true; + g_debug ("scanner finished"); + } + + if (conf_.cleanup) { + g_debug ("starting cleanup"); + cleanup(); + clean_done_ = true; + g_debug ("cleanup finished"); + } + + store_.commit(); + }); + + g_debug ("started indexer"); + + return true; +} + +bool +Indexer::Private::stop() +{ + scanner_.stop(); + scan_done_ = clean_done_ = true; + + const auto w_n = workers_.size(); + + fq_.clear(); + if (scanner_worker_.joinable()) + scanner_worker_.join(); + + for (auto&& w: workers_) + if (w.joinable()) + w.join(); + workers_.clear(); + + if (w_n > 0) + g_debug ("stopped indexer (joined %zu worker(s))", w_n); + + return true; +} + +Indexer::Indexer (Store& store): + priv_{std::make_unique(store)} +{} + +Indexer::~Indexer() = default; + +bool +Indexer::start(const Indexer::Config& conf) +{ + std::lock_guard l(priv_->lock_); + if (is_running()) + return true; + + return 
priv_->start(conf); +} + +bool +Indexer::stop() +{ + std::lock_guard l(priv_->lock_); + + if (!is_running()) + return true; + + g_debug ("stopping indexer"); + return priv_->stop(); +} + +bool +Indexer::is_running() const +{ + return !priv_->scan_done_ || !priv_->clean_done_ || + !priv_->fq_.empty(); +} + +Indexer::Progress +Indexer::progress() const +{ + return Progress{ + is_running(), + priv_->progress_.processed, + priv_->progress_.updated, + priv_->progress_.removed + }; +} diff --git a/lib/index/mu-indexer.hh b/lib/index/mu-indexer.hh new file mode 100644 index 00000000..c39862b7 --- /dev/null +++ b/lib/index/mu-indexer.hh @@ -0,0 +1,114 @@ +/* +** Copyright (C) 2020 Dirk-Jan C. Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +** +*/ + +#ifndef MU_INDEXER_HH__ +#define MU_INDEXER_HH__ + +#include +#include + +namespace Mu { + +struct Store; + +/// An object abstracting the index process. 
+class Indexer { +public: + /** + * Construct an indexer object + * + * @param store the message store to use + */ + Indexer (Store& store); + + /** + * DTOR + */ + ~Indexer(); + + /// A configuration object for the indexer + struct Config { + bool scan{true}; + /**< scan for new messages */ + bool cleanup{true}; + /**< clean messages no longer in the file system */ + size_t max_threads{}; + /**< maximum # of threads to use */ + bool ignore_noupdate{}; + /**< ignore .noupdate files */ + bool lazy_check{}; + /**< whether to skip directories that don't have a changed + * mtime */ + }; + + + /** + * Start indexing. If already underway, do nothing. + * + * @param conf a configuration object + * + * @return true if starting worked or an indexing process was already + * underway; false otherwise. + * + */ + bool start(const Config& conf); + + /** + * Stop indexing. If not indexing, do nothing. + * + * + * @return true if we stopped indexing, or indexing was not underway. + * False otherwise. + */ + bool stop(); + + /** + * Is an indexing process running? + * + * @return true or false. + */ + bool is_running() const; + + + // Object describing current progress + struct Progress { + bool running{}; /**< Is an index operation in progress? */ + size_t processed{}; /**< Number of messages processed */ + size_t updated{}; /**< Number of messages added/updated to store */ + size_t removed{}; /**< Number of messages removed from store */ + }; + + /** + * Get an object describing the current progress. The progress object + * describes the most recent indexing job, and is reset upon a fresh + * start(). + * + * @return a progress object.
+ */ + Progress progress() const; + +private: + struct Private; + std::unique_ptr priv_; +}; + + + +} // namespace Mu +#endif /* MU_INDEXER_HH__ */ diff --git a/lib/index/mu-scanner.cc b/lib/index/mu-scanner.cc new file mode 100644 index 00000000..18641b93 --- /dev/null +++ b/lib/index/mu-scanner.cc @@ -0,0 +1,242 @@ +/* +** Copyright (C) 2020 Dirk-Jan C. Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+** +*/ +#include "mu-scanner.hh" + +#include "config.h" + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "utils/mu-utils.hh" +#include "utils/mu-error.hh" + +using namespace Mu; + +struct Scanner::Private { + Private (const std::string& root_dir, + Scanner::Handler handler): + root_dir_{root_dir}, handler_{handler} { + if (!handler_) + throw Mu::Error{Error::Code::Internal, "missing handler"}; + } + ~Private() { + stop(); + } + + bool start(); + bool stop(); + bool process_dentry (const std::string& path, struct dirent *dentry, bool is_maildir); + bool process_dir (const std::string& path, bool is_maildir); + + const std::string root_dir_; + const Scanner::Handler handler_; + std::atomic running_{}; + std::mutex lock_; +}; + + +static bool +is_special_dir (const struct dirent *dentry) +{ + const auto d_name{dentry->d_name}; + return d_name[0] == '\0' || + (d_name[1] == '\0' && d_name[0] == '.') || + (d_name[2] == '\0' && d_name[0] == '.' && d_name[1] == '.'); +} + +static bool +is_new_cur (const char *dirname) +{ + if (dirname[0] == 'c' && dirname[1] == 'u' && dirname[2] == 'r' && dirname[3] == '\0') + return true; + + if (dirname[0] == 'n' && dirname[1] == 'e' && dirname[2] == 'w' && dirname[3] == '\0') + return true; + + return false; +} + +bool +Scanner::Private::process_dentry (const std::string& path, struct dirent *dentry, + bool is_maildir) +{ + if (is_special_dir (dentry)) + return true; // ignore. 
+ + const auto fullpath{path + "/" + dentry->d_name}; + struct stat statbuf; + if (::stat(fullpath.c_str(), &statbuf) != 0) { + g_warning ("failed to stat %s: %s", fullpath.c_str(), ::strerror(errno)); + return false; + } + + if (S_ISDIR(statbuf.st_mode)) { + + const auto res = handler_(fullpath, &statbuf, Scanner::HandleType::EnterDir); + if (!res) { + //g_debug ("skipping dir %s", fullpath.c_str()); + return true; // skip + } + + process_dir (fullpath, is_new_cur(dentry->d_name)); + + return handler_(fullpath, &statbuf, Scanner::HandleType::LeaveDir); + + } else if (S_ISREG(statbuf.st_mode) && is_maildir) + return handler_(fullpath, &statbuf, Scanner::HandleType::File); + + g_debug ("skip %s (neither maildir-file nor directory)", fullpath.c_str()); + return true; +} + + +bool +Scanner::Private::process_dir (const std::string& path, bool is_maildir) +{ + const auto dir = opendir (path.c_str()); + if (G_UNLIKELY(!dir)) { + g_warning("failed to scan dir %s: %s", path.c_str(), strerror(errno)); + return false; + } + + // TODO: sort dentries by inode order, which makes things faster for extfs. 
+ // see mu-maildir.c + + while (running_) { + errno = 0; + const auto dentry{readdir(dir)}; + + if (G_LIKELY(dentry)) { + process_dentry (path, dentry, is_maildir); + continue; + } + + if (errno != 0) { + g_warning("failed to read %s: %s", path.c_str(), strerror(errno)); + continue; + } + + break; + } + closedir (dir); + + return true; +} + +bool +Scanner::Private::start() +{ + const auto& path{root_dir_}; + if (G_UNLIKELY(path.length() > PATH_MAX)) { + g_warning("path too long"); + return false; + } + + const auto mode{F_OK | R_OK}; + if (G_UNLIKELY(access (path.c_str(), mode) != 0)) { + g_warning("'%s' is not readable: %s", path.c_str(), strerror (errno)); + return false; + } + + struct stat statbuf{}; + if (G_UNLIKELY(stat (path.c_str(), &statbuf) != 0)) { + g_warning("'%s' is not stat'able: %s", path.c_str(), strerror (errno)); + return false; + } + + if (G_UNLIKELY(!S_ISDIR (statbuf.st_mode))) { + g_warning("'%s' is not a directory", path.c_str()); + return false; + } + + running_ = true; + g_debug ("starting scan @ %s", root_dir_.c_str()); + + auto basename{g_path_get_basename(root_dir_.c_str())}; + const auto is_maildir = (g_strcmp0(basename, "cur") == 0 || + g_strcmp0(basename,"new") == 0); + g_free(basename); + + const auto start{std::chrono::steady_clock::now()}; + process_dir(root_dir_, is_maildir); + const auto elapsed = std::chrono::steady_clock::now() - start; + g_debug ("finished scan of %s in %" G_GINT64_FORMAT " ms", root_dir_.c_str(), + to_ms(elapsed)); + running_ = false; + + return true; +} + +bool +Scanner::Private::stop() +{ + if (!running_) + return true; // nothing to do + + g_debug ("stopping scan"); + running_ = false; + + return true; +} + +Scanner::Scanner (const std::string& root_dir, + Scanner::Handler handler): + priv_{std::make_unique(root_dir, handler)} +{} + +Scanner::~Scanner() = default; + +bool +Scanner::start() +{ + { + std::lock_guard l(priv_->lock_); + if (priv_->running_) + return true; //nothing to do + + priv_->running_ = 
true; + } + + const auto res = priv_->start(); + priv_->running_ = false; + + return res; +} + +bool +Scanner::stop() +{ + std::lock_guard l(priv_->lock_); + + return priv_->stop(); +} + +bool +Scanner::is_running() const +{ + return priv_->running_; +} diff --git a/lib/index/mu-scanner.hh b/lib/index/mu-scanner.hh new file mode 100644 index 00000000..ac88038a --- /dev/null +++ b/lib/index/mu-scanner.hh @@ -0,0 +1,96 @@ +/* +** Copyright (C) 2020 Dirk-Jan C. Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +** +*/ + +#ifndef MU_SCANNER_HH__ +#define MU_SCANNER_HH__ + +#include +#include + +#include +#include +#include +#include + +namespace Mu { + +/// @brief Maildir scanner +/// +/// Scans maildir (trees) recursively, and calls the Handler callback for +/// directories & files. +/// +/// It filters out (i.e., does not call the handler for): +/// - files starting with '.' +/// - files that do not live in a cur / new leaf maildir +/// - directories '.' and '..' +/// +class Scanner { +public: + enum struct HandleType { File, EnterDir, LeaveDir }; + + /// Prototype for a handler function + using Handler = std::function; + /** + * Construct a scanner object for scanning a directory, recursively.
+ * If handler is a directory + * + * + * @param root_dir root dir to start scanning + * @param handler handler function for some direntry + */ + Scanner (const std::string& root_dir, Handler handler); + + /** + * DTOR + */ + ~Scanner(); + + /** + * Start the scan; this is a blocking call that runs until + * finished or (from another thread) stop() is called. + * + * @return true if starting worked; false otherwise + */ + bool start(); + + /** + * Stop the scan + * + * @return true if stopping worked; false otherwise + */ + bool stop(); + + /** + * Is a scan currently running? + * + * @return true or false + */ + bool is_running() const; + +private: + struct Private; + std::unique_ptr priv_; +}; + +} // namespace Mu + +#endif /* MU_SCANNER_HH__ */ diff --git a/lib/index/test-scanner.cc b/lib/index/test-scanner.cc new file mode 100644 index 00000000..a5df4e50 --- /dev/null +++ b/lib/index/test-scanner.cc @@ -0,0 +1,68 @@ +/* +** Copyright (C) 2017 Dirk-Jan C. Binnema +** +** This library is free software; you can redistribute it and/or +** modify it under the terms of the GNU Lesser General Public License +** as published by the Free Software Foundation; either version 2.1 +** of the License, or (at your option) any later version. +** +** This library is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +** Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public +** License along with this library; if not, write to the Free +** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA +** 02110-1301, USA.
+*/ + +#include +#include + +#include +#include + +#include "mu-scanner.hh" +#include "mu-utils.hh" + +using namespace Mu; + + +static void +test_scan_maildir () +{ + allow_warnings(); + + Scanner scanner{"/home/djcb/Maildir", + [](const dirent* dentry)->bool { + g_print ("%02x %s\n", dentry->d_type, dentry->d_name); + return true; + }, + [](const std::string& fullpath, const struct stat* statbuf, + auto&& info)->bool { + g_print ("%s %zu\n", fullpath.c_str(), statbuf->st_size); + return true; + } + }; + g_assert_true (scanner.start()); + + while (scanner.is_running()) { + sleep(1); + } +} + +int +main (int argc, char *argv[]) try +{ + g_test_init (&argc, &argv, NULL); + + g_test_add_func ("/utils/scanner/scan-maildir", test_scan_maildir); + + return g_test_run (); + + +} catch (const std::runtime_error& re) { + std::cerr << re.what() << "\n"; + return 1; +} diff --git a/lib/mu-index.c b/lib/mu-index.c deleted file mode 100644 index bb54f3e8..00000000 --- a/lib/mu-index.c +++ /dev/null @@ -1,476 +0,0 @@ -/* -** Copyright (C) 2008-2020 Dirk-Jan C. Binnema -** -** This program is free software; you can redistribute it and/or modify -1** it under the terms of the GNU General Public License as published by -** the Free Software Foundation; either version 3 of the License, or -** (at your option) any later version. -** -** This program is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -** GNU General Public License for more details. -** -** You should have received a copy of the GNU General Public License -** along with this program; if not, write to the Free Software Foundation, -** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
-** -*/ - -#include "config.h" -#include "mu-index.h" - -#include -#include -#include -#include -#include -#include - -#include "mu-maildir.h" - -#define MU_LAST_USED_MAILDIR_KEY "last_used_maildir" -#define MU_INDEX_MAX_FILE_SIZE (500*1000*1000) /* 500 Mb */ -/* apparently, people are getting really big mails, so let us index those (by - * default)*/ - -struct _MuIndex { - MuStore *_store; - gboolean _needs_reindex; - guint _max_filesize; -}; - -MuIndex* -mu_index_new (MuStore *store, GError **err) -{ - MuIndex *index; - unsigned count; - - g_return_val_if_fail (store, NULL); - g_return_val_if_fail (!mu_store_is_read_only(store), NULL); - - index = g_new0 (MuIndex, 1); - - index->_store = mu_store_ref (store); - - /* set the default max file size */ - index->_max_filesize = MU_INDEX_MAX_FILE_SIZE; - - count = mu_store_count (store, err); - if (count == (unsigned)-1) - return NULL; - else if (count == 0) - index->_needs_reindex = TRUE; - - return index; -} - -void -mu_index_destroy (MuIndex *index) -{ - if (!index) - return; - - mu_store_unref (index->_store); - g_free (index); -} - - -struct _MuIndexCallbackData { - MuIndexMsgCallback _idx_msg_cb; - MuIndexDirCallback _idx_dir_cb; - MuStore* _store; - void* _user_data; - MuIndexStats* _stats; - gboolean _reindex; - gboolean _lazy_check; - time_t _dirstamp; - guint _max_filesize; -}; -typedef struct _MuIndexCallbackData MuIndexCallbackData; - - -/* checks to determine if we need to (re)index this message note: - * simply checking timestamps is not good enough because message may - * be moved from other dirs (e.g. from 'new' to 'cur') and the time - * stamps won't change. 
*/ -static inline gboolean -needs_index (MuIndexCallbackData *data, const char *fullpath, - time_t filestamp) -{ - /* unconditionally reindex */ - if (data->_reindex) - return TRUE; - - /* it's not in the database yet */ - if (!mu_store_contains_message (data->_store, fullpath)) - return TRUE; - - /* it's there, but it's not up to date */ - if ((unsigned)filestamp >= (unsigned)data->_dirstamp) - return TRUE; - - return FALSE; /* index not needed */ -} - - -static MuError -insert_or_update_maybe (const char *fullpath, const char *mdir, - time_t filestamp, MuIndexCallbackData *data, - gboolean *updated) -{ - MuMsg *msg; - GError *err; - gboolean rv; - - *updated = FALSE; - if (!needs_index (data, fullpath, filestamp)) - return MU_OK; /* nothing to do for this one */ - - err = NULL; - msg = mu_msg_new_from_file (fullpath, mdir, &err); - if (!msg) { - if (!err) - g_warning ("error creating message object: %s", - fullpath); - else { - g_warning ("%s", err->message); - g_clear_error (&err); - } - /* warn, then simply continue */ - return MU_OK; - } - - /* we got a valid id; scan the message contents as well */ - rv = mu_store_add_msg (data->_store, msg, &err); - mu_msg_unref (msg); - - if (!rv) { - g_warning ("error storing message object: %s", - err ? 
err->message : "cause unknown"); - g_clear_error (&err); - return MU_ERROR; - } - - *updated = TRUE; - return MU_OK; -} - - -static MuError -run_msg_callback_maybe (MuIndexCallbackData *data) -{ - MuError result; - - if (!data || !data->_idx_msg_cb) - return MU_OK; - - result = data->_idx_msg_cb (data->_stats, data->_user_data); - if (G_UNLIKELY(result != MU_OK && result != MU_STOP)) - g_warning ("error in callback"); - - return result; -} - - -static MuError -on_run_maildir_msg (const char *fullpath, const char *mdir, - struct stat *statbuf, MuIndexCallbackData *data) -{ - MuError result; - gboolean updated; - - /* protect against too big messages */ - if (G_UNLIKELY(statbuf->st_size > data->_max_filesize)) { - g_warning ("ignoring because bigger than %u bytes: %s", - data->_max_filesize, fullpath); - return MU_OK; /* not an error */ - } - - result = run_msg_callback_maybe (data); - if (result != MU_OK) - return result; - - /* see if we need to update/insert anything... - * use the ctime, so any status change will be visible (perms, - * filename etc.)*/ - result = insert_or_update_maybe (fullpath, mdir, statbuf->st_ctime, - data, &updated); - - if (result == MU_OK && data && data->_stats) { /* update statistics */ - ++data->_stats->_processed; - updated ? 
++data->_stats->_updated : ++data->_stats->_uptodate; - } - - return result; -} - -static time_t -get_dir_timestamp (const char *path) -{ - struct stat statbuf; - - if (stat (path, &statbuf) != 0) { - g_warning ("failed to stat %s: %s", - path, strerror(errno)); - return 0; - } - - return statbuf.st_ctime; -} - -static MuError -on_run_maildir_dir (const char* fullpath, gboolean enter, - MuIndexCallbackData *data) -{ - GError *err; - - err = NULL; - - /* xapian stores a per-dir timestamp; we use this timestamp to determine - * whether a message is up-to-date - */ - if (enter) { - data->_dirstamp = - mu_store_get_dirstamp (data->_store, fullpath, &err); - /* in 'lazy' mode, we only check the dir timestamp, and if it's - * up to date, we don't bother with this dir. This fails to - * account for messages below this dir that have merely - * _changed_ though */ - if (data->_lazy_check && mu_maildir_is_leaf_dir(fullpath)) { - time_t dirstamp; - dirstamp = get_dir_timestamp (fullpath); - if (dirstamp <= data->_dirstamp) { - g_debug ("ignore %s (up-to-date)", fullpath); - return MU_IGNORE; - } - } - g_debug ("entering %s", fullpath); - } else { - mu_store_set_dirstamp (data->_store, fullpath, - time(NULL), &err); - g_debug ("leaving %s", fullpath); - } - - if (data->_idx_dir_cb) - return data->_idx_dir_cb (fullpath, enter, - data->_user_data); - - if (err) { - g_warning("%s: error handling %s: %s", __func__, - fullpath, err->message); - g_clear_error(&err); - } - - return MU_OK; -} - -static gboolean -check_path (const char *path) -{ - g_return_val_if_fail (path, FALSE); - - if (!g_path_is_absolute (path)) { - g_warning ("%s: not an absolute path: '%s'", __func__, path); - return FALSE; - } - - if (access (path, R_OK) != 0) { - g_warning ("%s: cannot open '%s': %s", - __func__, path, strerror (errno)); - return FALSE; - } - - return TRUE; -} - -static void -init_cb_data (MuIndexCallbackData *cb_data, MuStore *xapian, - gboolean reindex, gboolean lazycheck, - guint 
max_filesize, MuIndexStats *stats, - MuIndexMsgCallback msg_cb, MuIndexDirCallback dir_cb, - void *user_data) -{ - cb_data->_idx_msg_cb = msg_cb; - cb_data->_idx_dir_cb = dir_cb; - - cb_data->_user_data = user_data; - cb_data->_store = xapian; - - cb_data->_reindex = reindex; - cb_data->_lazy_check = lazycheck; - cb_data->_dirstamp = 0; - cb_data->_max_filesize = max_filesize; - - cb_data->_stats = stats; - if (cb_data->_stats) - memset (cb_data->_stats, 0, sizeof(MuIndexStats)); -} - - -void -mu_index_set_max_msg_size (MuIndex *index, guint max_size) -{ - g_return_if_fail (index); - - if (max_size == 0) - index->_max_filesize = MU_INDEX_MAX_FILE_SIZE; - else - index->_max_filesize = max_size; -} - - -MuError -mu_index_run (MuIndex *index, gboolean reindex, gboolean lazycheck, - MuIndexStats *stats, - MuIndexMsgCallback msg_cb, MuIndexDirCallback dir_cb, - void *user_data) -{ - MuIndexCallbackData cb_data; - MuError rv; - const char *path; - - g_return_val_if_fail (index && index->_store, MU_ERROR); - g_return_val_if_fail (msg_cb, MU_ERROR); - - path = mu_store_root_maildir (index->_store); - if (!check_path (path)) - return MU_ERROR; - - if (index->_needs_reindex) - reindex = TRUE; - - init_cb_data (&cb_data, index->_store, reindex, lazycheck, - index->_max_filesize, stats, - msg_cb, dir_cb, user_data); - - rv = mu_maildir_walk (path, - (MuMaildirWalkMsgCallback)on_run_maildir_msg, - (MuMaildirWalkDirCallback)on_run_maildir_dir, - reindex, /* re-index, ie. 
do a full update */ - &cb_data); - - mu_store_flush (index->_store); - - return rv; -} - -static MuError -on_stats_maildir_file (const char *fullpath, const char *mdir, - struct stat *statbuf, - MuIndexCallbackData *cb_data) -{ - MuError result; - - if (cb_data && cb_data->_idx_msg_cb) - result = cb_data->_idx_msg_cb (cb_data->_stats, - cb_data->_user_data); - else - result = MU_OK; - - if (result == MU_OK) { - if (cb_data->_stats) - ++cb_data->_stats->_processed; - return MU_OK; - } - - return result; /* MU_STOP or MU_OK */ -} - - -MuError -mu_index_stats (MuIndex *index, - MuIndexStats *stats, MuIndexMsgCallback cb_msg, - MuIndexDirCallback cb_dir, void *user_data) -{ - const char *path; - MuIndexCallbackData cb_data; - - g_return_val_if_fail (index, MU_ERROR); - g_return_val_if_fail (cb_msg, MU_ERROR); - - path = mu_store_root_maildir (index->_store); - if (!check_path (path)) - return MU_ERROR; - - if (stats) - memset (stats, 0, sizeof(MuIndexStats)); - - cb_data._idx_msg_cb = cb_msg; - cb_data._idx_dir_cb = cb_dir; - - cb_data._stats = stats; - cb_data._user_data = user_data; - - cb_data._dirstamp = 0; - - return mu_maildir_walk (path, - (MuMaildirWalkMsgCallback)on_stats_maildir_file, - NULL, FALSE, &cb_data); -} - -struct _CleanupData { - MuStore *_store; - MuIndexStats *_stats; - MuIndexCleanupDeleteCallback _cb; - void *_user_data; - -}; -typedef struct _CleanupData CleanupData; - - -static MuError -foreach_doc_cb (const char* path, CleanupData *cudata) -{ - if (access (path, R_OK) != 0) { - if (errno != EACCES) - g_debug ("cannot access %s: %s", path, strerror(errno)); - if (!mu_store_remove_path (cudata->_store, path)) - return MU_ERROR; /* something went wrong... 
bail out */ - if (cudata->_stats) - ++cudata->_stats->_cleaned_up; - } - - if (cudata->_stats) - ++cudata->_stats->_processed; - - if (!cudata->_cb) - return MU_OK; - - return cudata->_cb (cudata->_stats, cudata->_user_data); -} - - -MuError -mu_index_cleanup (MuIndex *index, MuIndexStats *stats, - MuIndexCleanupDeleteCallback cb, - void *user_data, GError **err) -{ - MuError rv; - CleanupData cudata; - - g_return_val_if_fail (index, MU_ERROR); - - cudata._store = index->_store; - cudata._stats = stats; - cudata._cb = cb; - cudata._user_data = user_data; - - rv = mu_store_foreach (index->_store, - (MuStoreForeachFunc)foreach_doc_cb, - &cudata, err); - - mu_store_flush (index->_store); - - return rv; -} - -gboolean -mu_index_stats_clear (MuIndexStats *stats) -{ - if (!stats) - return FALSE; - - memset (stats, 0, sizeof(MuIndexStats)); - return TRUE; -} diff --git a/lib/mu-index.h b/lib/mu-index.h deleted file mode 100644 index c0faba83..00000000 --- a/lib/mu-index.h +++ /dev/null @@ -1,193 +0,0 @@ -/* -** Copyright (C) 2008-2020 Dirk-Jan C. Binnema -** -** This program is free software; you can redistribute it and/or modify -** it under the terms of the GNU General Public License as published by -** the Free Software Foundation; either version 3 of the License, or -** (at your option) any later version. -** -** This program is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -** GNU General Public License for more details. -** -** You should have received a copy of the GNU General Public License -** along with this program; if not, write to the Free Software Foundation, -** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
-** -*/ - -#ifndef __MU_INDEX_H__ -#define __MU_INDEX_H__ - -#include -#include -#include -#include - -G_BEGIN_DECLS - -/* opaque structure */ -struct _MuIndex; -typedef struct _MuIndex MuIndex; - -struct _MuIndexStats { - unsigned _processed; /* number of msgs processed or counted */ - unsigned _updated; /* number of msgs new or updated */ - unsigned _cleaned_up; /* number of msgs cleaned up */ - unsigned _uptodate; /* number of msgs already up-to-date */ -}; -typedef struct _MuIndexStats MuIndexStats; - -/** - * create a new MuIndex instance. NOTE: the database does not have - * to exist yet, but the directory must already exist; NOTE(2): before - * doing anything with the returned Index object, make sure you haved - * called mu_msg_init somewhere in your code. - * - * @param store a writable MuStore object - * @param err to receive error or NULL; there are only errors when this - * function returns NULL. Possible errors: see mu-error.h - * - * @return a new MuIndex instance, or NULL in case of error - */ -MuIndex* mu_index_new (MuStore *store, GError **err) - G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT; - - -/** - * destroy the index instance - * - * @param index a MuIndex instance, or NULL - */ -void mu_index_destroy (MuIndex *index); - - -/** - * change the maximum file size that mu-index considers from its - * default (MU_INDEX_MAX_FILE_SIZE). Note that the maximum size is a - * protection against mu (or the libraries it uses) allocating too - * much memory, which can lead to problems - * - * @param index a mu index object - * @param max_size the maximum msg size, or 0 to reset to the default - */ -void mu_index_set_max_msg_size (MuIndex *index, guint max_size); - - -/** - * callback function for mu_index_(run|stats|cleanup), for each message - * - * @param stats pointer to structure to receive statistics data - * @param user_data pointer to user data - * - * @return MU_OK to continue, MU_STOP to stop, or MU_ERROR in - * case of some error. 
- */ -typedef MuError (*MuIndexMsgCallback) (MuIndexStats* stats, void *user_data); - - -/** - * callback function for mu_index_(run|stats|cleanup), for each dir enter/leave - * - * @param path dirpath we just entered / left - * @param enter did we enter (TRUE) or leave(FALSE) the dir? - * @param user_data pointer to user data - * - * @return MU_OK to continue, MU_STOP to stopd or MU_ERROR in - * case of some error. - */ -typedef MuError (*MuIndexDirCallback) (const char* path, gboolean enter, - void *user_data); - -/** - * start the indexing process - * - * @param index a valid MuIndex instance - * @param force if != 0, force re-indexing already index messages; this is - * obviously a lot slower than only indexing new/changed messages - * @param lazycheck whether ignore subdirectoryies that have up-to-date - * timestamps. - * @param stats a structure with some statistics about the results; - * note that this function does *not* reset the struct values to allow - * for cumulative stats from multiple calls. If needed, you can use - * @mu_index_stats_clear before calling this function - * @param cb_msg a callback function called for every msg indexed; - * @param cb_dir a callback function called for every dir entered/left or NULL - * @param user_data a user pointer that will be passed to the callback function - * - * @return MU_OK if the stats gathering was completed successfully, - * MU_STOP if the user stopped or MU_ERROR in - * case of some error. 
- */ -MuError mu_index_run (MuIndex *index, gboolean force, - gboolean lazycheck, MuIndexStats *stats, - MuIndexMsgCallback msg_cb, - MuIndexDirCallback dir_cb, void *user_data); - -/** - * gather some statistics about the Maildir; this is usually much faster than - * mu_index_run, and can thus be used to provide some information to the user - * note though that the statistics may be different from the reality that - * mu_index_run sees, when there are updates in the Maildir - * - * @param index a valid MuIndex instance - * @param stats a structure with some statistics about the results; - * note that this function does *not* reset the struct values to allow - * for cumulative stats from multiple calls. If needed, you can use - * @mu_index_stats_clear before calling this function - * @param msg_cb a callback function which will be called for every msg; - * @param dir_cb a callback function which will be called for every dir or NULL - * @param user_data a user pointer that will be passed to the callback function - * xb - * @return MU_OK if the stats gathering was completed successfully, - * MU_STOP if the user stopped or MU_ERROR in - * case of some error. - */ -MuError mu_index_stats (MuIndex *index, MuIndexStats *stats, - MuIndexMsgCallback msg_cb, MuIndexDirCallback dir_cb, - void *user_data); - -/** - * callback function called for each message - * - * @param MuIndexCleanupCallback - * - * @return a MuResult - */ -typedef MuError (*MuIndexCleanupDeleteCallback) (MuIndexStats *stats, - void *user_data); - -/** - * cleanup the database; ie. remove entries for which no longer a corresponding - * file exists in the maildir - * - * @param index a valid MuIndex instance - * @param stats a structure with some statistics about the results; - * note that this function does *not* reset the struct values to allow - * for cumulative stats from multiple calls. 
If needed, you can use - * @mu_index_stats_clear before calling this function - * @param cb a callback function which will be called for every msg; - * @param user_data a user pointer that will be passed to the callback function - * @param err to receive error info or NULL. err->code is MuError value - * - * @return MU_OK if the stats gathering was completed successfully, - * MU_STOP if the user stopped or MU_ERROR in - * case of some error. - */ -MuError mu_index_cleanup (MuIndex *index, MuIndexStats *stats, - MuIndexCleanupDeleteCallback cb, - void *user_data, GError **err); - -/** - * clear the stats structure - * - * @param stats a MuIndexStats object - * - * @return TRUE if stats != NULL, FALSE otherwise - */ -gboolean mu_index_stats_clear (MuIndexStats *stats); - -G_END_DECLS - -#endif /*__MU_INDEX_H__*/ diff --git a/mu/mu-cmd-find.cc b/mu/mu-cmd-find.cc index 5f516c5c..af968544 100644 --- a/mu/mu-cmd-find.cc +++ b/mu/mu-cmd-find.cc @@ -28,7 +28,6 @@ #include "mu-msg.h" #include "mu-maildir.h" -#include "mu-index.h" #include "mu-query.h" #include "mu-msg-iter.h" #include "mu-bookmarks.h" diff --git a/toys/mug/mug.c b/toys/mug/mug.c index ac8a0b71..fb200133 100644 --- a/toys/mug/mug.c +++ b/toys/mug/mug.c @@ -1,5 +1,5 @@ /* -** Copyright (C) 2010-2017 Dirk-Jan C. Binnema +** Copyright (C) 2010-2020 Dirk-Jan C. Binnema ** ** This program is free software; you can redistribute it and/or modify it ** under the terms of the GNU General Public License as published by the @@ -17,9 +17,7 @@ ** */ -#if HAVE_CONFIG_H #include "config.h" -#endif /*HAVE_CONFIG*/ #include #include @@ -28,7 +26,6 @@ #include #include #include -#include #include "mug-msg-list-view.h" #include "mug-query-bar.h"